def test_withSmiles(self):
    """Salt definitions supplied as SMILES strip every matching fragment."""
    salt_definitions = "[Na+]\nCC(=O)O"
    remover = SaltRemover(defnData=salt_definitions,
                          defnFormat=InputFormat.SMILES)
    # One salt pattern is expected per definition line.
    self.assertEqual(len(remover.salts), 2)

    stripped = remover.StripMol(Chem.MolFromSmiles('CC(=O)O.[Na+]'))
    # Both fragments are listed as salts, so no atoms are expected to remain.
    self.assertEqual(stripped.GetNumAtoms(), 0)
def generateMoleculeHierarchyTask(structure, debug=False):
    """Create or update the molecule hierarchy record for ``structure``.

    The molfile of ``structure`` is parsed and run through a default
    ``SaltRemover``. If stripping leaves the atom count unchanged, the
    molecule is its own parent; otherwise the parent is looked up from the
    stripped structure via ``getParentMolregnoFromBase``. The active
    molecule is always set to the parent.

    Parameters
    ----------
    structure
        Object exposing ``molecule`` and ``molfile`` attributes.
    debug : bool, optional
        When true, attach to a pydevd server on ``localhost:6901`` and print
        (instead of re-raising) an ``IntegrityError`` raised while saving.

    Raises
    ------
    IntegrityError
        If saving the hierarchy fails and ``debug`` is false.
    """
    if debug:
        pydevd.settrace('localhost', port=6901, stdoutToServer=True,
                        stderrToServer=True)

    molecule = structure.molecule
    if not molecule.moleculeHierarchy:
        hierarchy = MoleculeHierarchy(molecule=molecule)
    else:
        hierarchy = molecule.moleculeHierarchy

    saltRemover = SaltRemover()
    mol = Chem.MolFromMolBlock(str(structure.molfile))
    base = saltRemover.StripMol(mol)

    # Same atom count before and after stripping means nothing was removed.
    if mol.GetNumAtoms() == base.GetNumAtoms():
        hierarchy.parent_molecule = molecule
    else:
        hierarchy.parent_molecule = getParentMolregnoFromBase(
            MolToMolBlock(base))
    hierarchy.active_molecule = hierarchy.parent_molecule

    try:
        hierarchy.save()
    except IntegrityError as e:
        if debug:
            # Single-argument print(...) behaves the same on Python 2 and 3.
            print(e)
        else:
            # Bare raise keeps the original traceback intact.
            raise
def remove_salts(mol, dictionary=True, *args, **kwargs):
    """Remove salts from a molecule.

    By default the salts listed in the salts dictionary are stripped one
    definition at a time, and every definition that actually changes the
    molecule is reported at DEBUG level.

    Parameters
    ----------
    mol : rdkit.Chem.Mol
        The molecule to be modified.
    dictionary : bool, optional
        True (default): use the salts dictionary.
        False: use the standard ``StripMol`` functionality provided by
        ``rdkit.Chem.SaltRemover``.
    *args, **kwargs
        Only used when ``dictionary`` is false; forwarded unchanged to the
        ``SaltRemover`` constructor (e.g. ``defnData`` for custom salt
        definitions). Without them ``SaltRemover`` uses its own defaults.

    Returns
    -------
    mol : rdkit.Chem.Mol
        The molecule with salts removed.

    Notes
    -----
    The RDKit logger level is set to ERROR as a side effect of calling this
    function.

    The salts dictionary is a derived version of the ChEMBL salt dictionary,
    created for the standardiser application by Francis Atkinson. The salts
    are stored as a list of (neutral) SMILES.
    """
    lg = RDLogger.logger()
    lg.setLevel(RDLogger.ERROR)

    if not dictionary:
        return SaltRemover(*args, **kwargs).StripMol(mol)

    salts = _extract_row_from_csv(0)
    salt_names = _extract_row_from_csv(1)

    # Track the SMILES of the current molecule so it is only recomputed when
    # a salt has actually been stripped.
    current_smiles = Chem.MolToSmiles(mol)
    for salt, salt_name in zip(salts, salt_names):
        stripped_mol = SaltRemover(defnData=salt).StripMol(mol)
        stripped_smiles = Chem.MolToSmiles(stripped_mol)
        if stripped_smiles != current_smiles:
            logging.debug("Following salt was stripped: %s", salt_name)
            mol = stripped_mol
            current_smiles = stripped_smiles
    return mol
def test_SmilesVsSmarts(self):
    """Compare salt stripping with SMARTS and with SMILES definitions."""
    # SMARTS
    smarts_remover = SaltRemover(defnData="[Cl,Br]")
    stripped = smarts_remover.StripMol(Chem.MolFromSmiles('CN(Br)Cl.Cl'))
    self.assertEqual(stripped.GetNumAtoms(), 4)
    self.assertEqual(Chem.MolToSmiles(stripped), 'CN(Cl)Br')

    stripped, deleted = smarts_remover.StripMolWithDeleted(
        Chem.MolFromSmiles('CN(C)C.Cl.Br'))
    self.assertEqual(Chem.MolToSmiles(stripped), 'CN(C)C')
    # Because we read in SMARTS, we should output as well. Otherwise, we
    # will have mismatches
    deleted_as_smarts = [Chem.MolToSmarts(m) for m in deleted]
    self.assertListEqual(deleted_as_smarts, ['[Cl,Br]'])

    # SMILES
    smiles_remover = SaltRemover(defnData="Cl", defnFormat=InputFormat.SMILES)
    stripped = smiles_remover.StripMol(Chem.MolFromSmiles('CN(Br)Cl.Cl'))
    self.assertEqual(stripped.GetNumAtoms(), 4)
    self.assertEqual(Chem.MolToSmiles(stripped), 'CN(Cl)Br')
def parse_smiles(smiles):
    """Sanity check and normalization for drugs.

    Only the first whitespace-separated token of ``smiles`` is considered.
    It is parsed, stripped with a default ``SaltRemover`` and written back
    out as SMILES.

    This is deliberately best-effort: whenever any step fails, the input is
    returned instead of raising. If the failure happens after the first
    token has been extracted, that token is what is returned.

    Parameters
    ----------
    smiles : str
        SMILES string, optionally followed by whitespace and extra text.

    Returns
    -------
    The salt-stripped SMILES on success, otherwise the (possibly truncated)
    input.
    """
    try:
        # Keep only the first whitespace-delimited token.
        smiles = smiles.split()[0]
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            # Unparseable SMILES: fall back explicitly rather than relying
            # on StripMol(None) raising.
            return smiles
        # Remove salts
        return Chem.MolToSmiles(SaltRemover().StripMol(mol))
    except Exception:
        # Best-effort normalization: hand back what we have on any failure.
        return smiles
def generateCompoundPropertiesTask(structure, debug=False):
    """Compute and store compound properties for ``structure``.

    The molfile of ``structure`` is parsed once. HBD, HBA, rotatable bonds,
    logP, TPSA, full molecular weight and molecular formula are computed on
    the full molecule. ``mw_freebase`` is computed on the salt-stripped
    molecule, and only when stripping leaves at least one atom.

    Parameters
    ----------
    structure
        Object exposing ``molecule`` and ``molfile`` attributes.
    debug : bool, optional
        When true, attach to a pydevd server on ``localhost:6901`` and print
        (instead of re-raising) an ``IntegrityError`` raised while saving.

    Raises
    ------
    IntegrityError
        If saving the properties fails and ``debug`` is false.
    """
    if debug:
        pydevd.settrace('localhost', port=6901, stdoutToServer=True,
                        stderrToServer=True)

    molecule = structure.molecule
    if not molecule.compoundProperty:
        prop = CompoundProperties(molecule=molecule)
    else:
        prop = molecule.compoundProperty

    saltRemover = SaltRemover()
    mol = Chem.MolFromMolBlock(str(structure.molfile))
    base = saltRemover.StripMol(mol)

    prop.hbd = Descriptors.CalcNumHBD(mol)
    prop.hba = Descriptors.CalcNumHBA(mol)
    prop.rtb = Descriptors.CalcNumRotatableBonds(mol)
    prop.alogp = Crippen.MolLogP(mol)
    prop.psa = Descriptors.CalcTPSA(mol)
    prop.full_mwt = NewDescriptors.MolWt(mol)
    # prop.exact_mass = Descriptors.CalcExactMolWt(mol)

    # The stripped molecule can be empty when the whole input is a salt.
    if base.GetNumAtoms():
        prop.mw_freebase = NewDescriptors.MolWt(base)

    prop.full_molformula = Descriptors.CalcMolFormula(mol)

    try:
        prop.save()
    except IntegrityError as e:
        if debug:
            # Single-argument print(...) behaves the same on Python 2 and 3.
            print(e)
        else:
            # Bare raise keeps the original traceback intact.
            raise
def remove_water(m):
    """Return ``m`` after stripping it with a ``SaltRemover`` built on ``[O]``."""
    # Kept as a function-scope import so the module has no import-time
    # dependency on SaltRemover.
    from rdkit.Chem.SaltRemover import SaltRemover

    water_stripper = SaltRemover(defnData="[O]")
    stripped = water_stripper.StripMol(m)
    return stripped
df = pd.read_csv(file_path).set_index('CID') # ## Make 3D optimized versions of the molecules # + # Make basic mol objects mols = {cid: Chem.MolFromSmiles(smi) for cid, smi in df['IsomericSMILES'].items()} # Then optimize them s = SaltRemover() p = ProgressBar(len(df)) for i, (cid, mol) in enumerate(mols.items()): p.animate(i, status=cid) try: mol.SetProp("_Name","%d: %s" % (cid, df.loc[cid, 'IsomericSMILES'])) mol = s.StripMol(mol, dontRemoveEverything=True) mol = Chem.AddHs(mol) AllChem.Compute2DCoords(mol) AllChem.EmbedMolecule(mol) AllChem.UFFOptimizeMolecule(mol) # Is this deterministic? except Exception as e: p.log('Exception for %d: %s' % (cid, e)) mols[cid] = None else: mols[cid] = mol # Remove CIDs without a successful optimization mols = {cid: mol for cid, mol in mols.items() if mol} # - print("%d mol files successfully optimized from %d CIDs" % (len(mols), len(df)))
else: print('\tCongratulations, your dataset has not incorrect smiles.') ############################################################################## ############################ STEP 2: salt elimination ######################## ############################################################################## print('[+] Eliminating salts ') withoutsalts = [] for smi in df_clean_by_sanit['SMILES']: mol = Chem.MolFromSmiles(smi) remover = SaltRemover(defnData='[Na,Cl,K,O,OH,Fe,F,H,Al,Mg,Co,Ti,NH4,Mn,Si,Ca,Au,I,Hg,Mo,Zn,Br,Ag,Sr,Cu,Bi,S,Li,NH3,He,Y,Ar,Ba,La]') mol = remover.StripMol(mol) smiles_new = Chem.MolToSmiles(mol) smiles_new = smiles_new.replace('.[H+]', '').replace('[H+].', '') # because saltremover do not eliminate water withoutsalts.append(smiles_new) df_clean_by_sanit.insert(2,'W/O SALTS',withoutsalts) prompt = [] for smile_with, smile_without in zip(df_clean_by_sanit['SAN_SMILES'],df_clean_by_sanit['W/O SALTS']): if smile_with != smile_without: prompt.append('\t{} --> {}'.format(smile_with,smile_without))
class MolReader(MolIO):
    """
    Read molecules from files and file-like objects. Supports SDF, SMILES,
    and RDKit binary format (via pickle).

    Parameters
    ----------
    f : file, optional
        File-like object.
    mol_format : str, optional
        Molecule file format. Currently supports 'sdf', 'smi', and 'pkl'.
    remove_hydrogens : bool, optional (default False)
        Remove hydrogens from molecules.
    remove_salts : bool, optional (default True)
        Remove salts from molecules. Note that this will remove any
        hydrogens present on the molecule.
    compute_2d_coords : bool, optional (default True)
        Compute 2D coordinates when reading SMILES. If molecules are written
        to SDF without 2D coordinates, stereochemistry information will be
        lost.
    """
    def __init__(self, f=None, mol_format=None, remove_hydrogens=False,
                 remove_salts=True, compute_2d_coords=True):
        if not remove_hydrogens and remove_salts:
            warnings.warn('Compounds with salts will have hydrogens removed')
        super(MolReader, self).__init__(f, mol_format)
        self.remove_hydrogens = remove_hydrogens
        self.remove_salts = remove_salts
        # The remover is only needed (and only created) when salts are
        # going to be stripped.
        if remove_salts:
            self.salt_remover = SaltRemover()
        self.compute_2d_coords = compute_2d_coords

    def __iter__(self):
        """
        Iterate over molecules.
        """
        return self.get_mols()

    def get_mols(self):
        """
        Read molecules from a file-like object.

        Molecule conformers are grouped into a single molecule. Two
        molecules are considered conformers of the same molecule if they:
        * Are contiguous in the file
        * Have identical (canonical isomeric) SMILES strings
        * Have identical compound names (if set)

        Returns
        -------
        A generator yielding (possibly multi-conformer) RDKit Mol objects.
        """
        parent = None
        for mol in self._get_mols():
            if parent is None:
                parent = mol
                continue
            if self.are_same_molecule(parent, mol):
                if mol.GetNumConformers():
                    for conf in mol.GetConformers():
                        parent.AddConformer(conf, assignId=True)
                else:
                    continue  # skip duplicate molecules without conformers
            else:
                # A new molecule starts: flush the one collected so far.
                parent = self.clean_mol(parent)
                if parent is not None:
                    yield parent
                parent = mol
        # Flush the last collected molecule.
        if parent is not None:
            parent = self.clean_mol(parent)
            if parent is not None:
                yield parent

    def _get_mols(self):
        """
        Read molecules from a file-like object.

        This method returns individual conformers from a file and does not
        attempt to combine them into multiconformer Mol objects.

        Returns
        -------
        A generator yielding RDKit Mol objects.

        Raises
        ------
        NotImplementedError
            If ``mol_format`` is not one of 'sdf', 'smi' or 'pkl'.
        """
        if self.mol_format == 'sdf':
            mols = self._get_mols_from_sdf()
        elif self.mol_format == 'smi':
            mols = self._get_mols_from_smiles()
        elif self.mol_format == 'pkl':
            mols = self._get_mols_from_pickle()
        else:
            raise NotImplementedError('Unrecognized molecule format ' +
                                      '"{}"'.format(self.mol_format))

        # skip read errors
        while True:
            try:
                # The next() builtin works on both Python 2 and Python 3,
                # unlike the Python-2-only ``.next()`` method.
                mol = next(mols)
            except StopIteration:
                break
            except Exception:
                warnings.warn('Skipping molecule.')
                continue
            else:
                if mol is not None:
                    yield mol

    def _get_mols_from_sdf(self):
        """
        Read SDF molecules from a file-like object.
        """
        supplier = Chem.ForwardSDMolSupplier(self.f,
                                             removeHs=self.remove_hydrogens)
        for mol in supplier:
            yield mol

    def _get_mols_from_smiles(self):
        """
        Read SMILES molecules from a file-like object.

        Each non-empty line holds a SMILES string, optionally followed by
        whitespace and a compound name. Everything after the first run of
        whitespace is taken as the name, so names may contain spaces. Lines
        that cannot be parsed are skipped with a warning.
        """
        for line in self.f.readlines():
            line = line.strip()
            if not line:
                continue
            # Split only once: unpacking a three-or-more-way split would
            # raise inside this generator, terminate it, and silently drop
            # every remaining line of the file.
            split_line = line.split(None, 1)
            if len(split_line) > 1:
                smiles, name = split_line
            else:
                smiles, = split_line
                name = None

            # hydrogens are removed by default, which triggers sanitization
            try:
                if self.remove_hydrogens:
                    mol = Chem.MolFromSmiles(smiles)
                else:
                    mol = Chem.MolFromSmiles(smiles, sanitize=False)
                    Chem.SanitizeMol(mol)
                if mol is None:
                    # Treat an unparseable SMILES like any other read error
                    # so SetProp below is never called on None.
                    raise ValueError('Could not parse SMILES ' + smiles)
                if self.compute_2d_coords:
                    AllChem.Compute2DCoords(mol)
            except Exception:
                warnings.warn('Skipping ' + line)
                continue
            else:
                if name is not None:
                    mol.SetProp('_Name', name)
                yield mol

    def _get_mols_from_pickle(self):
        """
        Read pickled molecules from a file-like object.

        Files that contain multiple pickles are supported by repeated calls
        to load.
        """
        # NOTE: unpickling can execute arbitrary code; only read pickle
        # files from trusted sources.
        while True:
            try:
                mols = cPickle.load(self.f)
                for mol in np.atleast_1d(mols):
                    yield mol
            except EOFError:
                break

    def are_same_molecule(self, a, b):
        """
        Test whether two molecules are conformers of the same molecule.

        Test for:
        * Identical (canonical isomeric) SMILES strings
        * Identical compound names (if set)

        Parameters
        ----------
        a, b : RDKit Mol
            Molecules to compare.
        """
        # get names, if available
        a_name = self._get_name(a)
        b_name = self._get_name(b)

        # get canonical isomeric SMILES
        a_smiles = self._get_isomeric_smiles(a)
        b_smiles = self._get_isomeric_smiles(b)
        assert a_smiles and b_smiles

        # test for same molecule
        return a_smiles == b_smiles and a_name == b_name

    def _get_name(self, mol):
        """
        Get molecule name, if available.

        Parameters
        ----------
        mol : RDKit Mol
            Molecule.
        """
        if mol.HasProp('_Name'):
            return mol.GetProp('_Name')
        else:
            return None

    def _get_isomeric_smiles(self, mol):
        """
        Get canonical isomeric SMILES for a molecule. Also sets the
        isomericSmiles property to avoid recomputing.

        Note that stereochemistry is not assigned from 3D coordinates; it
        must be explicitly present in the file or it will not show up in the
        SMILES conversion.

        Parameters
        ----------
        mol : RDKit Mol
            Molecule.
        """
        if mol.HasProp('isomericSmiles'):
            return mol.GetProp('isomericSmiles')
        else:
            smiles = Chem.MolToSmiles(mol, isomericSmiles=True,
                                      canonical=True)
            mol.SetProp('isomericSmiles', smiles, computed=True)
            return smiles

    def clean_mol(self, mol):
        """
        Clean a molecule.

        When salt removal is enabled, hydrogens are removed and salts are
        stripped. The stripped molecule replaces the input only if it still
        has atoms and differs from the hydrogen-free input.

        Parameters
        ----------
        mol : RDKit Mol
            Molecule.

        Returns
        -------
        The cleaned molecule, or None if hydrogens could not be removed.
        """
        if self.remove_salts:
            # hydrogens must be removed for pattern matching to work properly
            try:
                mol_no_h = Chem.RemoveHs(mol)
            except ValueError:
                if mol.HasProp('_Name'):
                    name = mol.GetProp('_Name')
                else:
                    name = Chem.MolToSmiles(mol, isomericSmiles=True,
                                            canonical=True)
                warnings.warn('Skipping ' + name)
                return None
            new = self.salt_remover.StripMol(mol_no_h)
            # only keep if it is valid (the molecule may _be_ a salt) and has
            # actually been changed
            if new.GetNumAtoms() and mol_no_h.ToBinary() != new.ToBinary():
                mol = new
        return mol
mol = input_cell if mol is None: stand_mol_list.append( ("Got empty molecule", index, mol, "No", None, None)) continue try: mol = rdMolStandardize.MetalDisconnector().Disconnect( mol) # Disconnect metals except ValueError as e: if len(Chem.GetMolFrags(mol, asMols=True, sanitizeFrags=False)) > 1: mixture = "Yes" stand_mol_list.append( ("Failed at disconnect", index, None, mixture, None, str(e))) continue mol = r.StripMol(mol) # Check if we have multiple fragments present if len(Chem.GetMolFrags(mol, asMols=True, sanitizeFrags=False)) > 1: mixture = "Yes" else: mixture = "No" # Standardize fragments separately for i, frag in enumerate( Chem.GetMolFrags(mol, asMols=True, sanitizeFrags=False)): frag = r.StripMol(frag) if frag.GetNumAtoms() == 0:
#os.mkdir(current_dir) try: mol = MolFromSmiles(smi) #MolToImage(mol).save(os.path.join(current_dir,"smi.jpeg")) except Exception as e: print(i, e) try: stnd_mol = MolFromSmiles(stnd_smi) #MolToImage(stnd_mol).save(os.path.join(current_dir,"stnd_smi.jpeg")) except Exception as e: print(i, e) try: res = remover.StripMol(mol) #MolToImage(res).save(os.path.join(current_dir,"smi_res.jpeg")) res_smi = MolToSmiles(res) res_smi_list.append(res_smi) except Exception as e: res_smi = "-" res_smi_list.append(res_smi) print(i, e) try: stnd_res = remover.StripMol(stnd_mol) #MolToImage(stnd_res).save(os.path.join(current_dir,"stnd_smi_res.jpeg")) res_stnd_smi = MolToSmiles(stnd_res) res_stnd_smi_list.append(res_stnd_smi) except Exception as e: res_stnd_smi = "-"
def main(argv=sys.argv):
    """Strip counterions/solvents from an SDF file and write a new SDF.

    Parameters
    ----------
    argv : list of str, optional
        Command line, program name first. Expected form:
        ``prog [options] input output``. See the usage text printed when
        fewer than two arguments are given for the supported options.

    Notes
    -----
    Exits with status 1 after printing a message when the command line is
    incomplete or does not contain exactly an input and an output path.
    Custom fragments from ``--strip-sdf`` and ``--strip-smarts`` are
    combined (SDF fragments first) and used as the salt definitions; when
    neither is given, the ``SaltRemover`` defaults apply.
    """
    valid_elements = ['H', 'C', 'N', 'O', 'F', 'Si', 'P', 'S', 'Cl', 'Br', 'I']
    valid_atomic_num = [1, 6, 7, 8, 9, 14, 15, 16, 17, 35, 53]

    if len(argv) < 3:
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print("""
  OBJ
    to strip self-defined counterions

  Usage:
    %s [options] input output

  [options]
    --strip : if given, to run stripping salts/solvents
    --strip-sdf file: specify the mols/fragments to be removed
    --strip-smarts file: one SMARTS string per line
    --filter-invalid : if given, to remove molecules containing
                       R group or elements other than
                       %s
    --addh : if given, to add hydrogens
    --make3d : if given, 3D coordinates will be generated

  Attention
    1. rdkit.Chem.SaltRemover.SaltRemover is called
    2. if neither `--strip-sdf` nor `--strip-smarts` is provided,
       stripping salts will be done according to default salts
       defined in `RDConfig.RDDataDir/Salts.txt`
    3. both `input` and `output` are .sdf
    4. whenever `--make3d` is given, please make sure that
       there is no complex or salts/solvents can be stripped.
       Otherwise, maybe there is something wrong with optimized structure.
""" % (argv[0], str(valid_elements)))
        sys.exit(1)

    options, args = getopt(argv[1:], '', [
        'strip-sdf=', 'strip-smarts=', 'strip', 'filter-invalid', 'addh',
        'make3d'
    ])

    filter_invalid = False
    strip = False
    addh = False
    make3d = False
    strip_sdf = None
    strip_smarts = None
    for opt, val in options:
        if opt == '--strip':
            strip = True
        elif opt == "--strip-sdf":
            strip_sdf = val
        elif opt == '--strip-smarts':
            strip_smarts = val
        elif opt == '--filter-invalid':
            filter_invalid = True
        elif opt == '--addh':
            addh = True
        elif opt == '--make3d':
            make3d = True
        else:
            print("Error: invalid option %s" % opt)
            sys.exit(1)

    # Explicit check instead of ``assert``, which is stripped under -O.
    if len(args) != 2:
        print("Error: expected exactly two arguments (input output), got %d"
              % len(args))
        sys.exit(1)
    infile, outfile = args

    # Collect custom fragment definitions, one SMARTS per line.
    smarts = ""
    if strip_sdf is not None:
        print("To load fragments from %s" % strip_sdf)
        count = 0
        for m in Chem.SDMolSupplier(strip_sdf):
            count += 1
            if m is None:
                print("Warning: failed to read %dth molecule in %s" %
                      (count, strip_sdf))
                continue
            smarts += (Chem.MolToSmarts(m) + "\n")
    if strip_smarts is not None:
        print("to load fragments from %s" % strip_smarts)
        # Context manager so the definitions file is always closed.
        with open(strip_smarts, 'r') as smarts_file:
            smarts += smarts_file.read()

    if strip:
        # Use the custom definitions when any were loaded; otherwise fall
        # back to the SaltRemover defaults. (The original test was inverted,
        # so user-supplied fragments were never used.)
        if smarts != "":
            remover = SaltRemover(defnData=smarts)
        else:
            remover = SaltRemover()
    else:
        remover = None

    inf = Chem.SDMolSupplier(infile)
    outf = Chem.SDWriter(outfile)
    try:
        count = 0
        for m in inf:
            count += 1
            if m is None:
                print("Warning: failed to load %dth molecule from %s" %
                      (count, infile))
                continue
            if filter_invalid:
                # Drop molecules containing any atom outside the whitelist.
                if any(a.GetAtomicNum() not in valid_atomic_num
                       for a in m.GetAtoms()):
                    continue
            if strip:
                m = remover.StripMol(m)
                if m.HasProp("_Name"):
                    name = m.GetProp("_Name")
                else:
                    name = "%s_%d" % (infile, count)
                if num_components(m) > 1:
                    print("Warning: %s still has more than one components!"
                          % name)
            if addh:
                m = AllChem.AddHs(m)
            if make3d:
                AllChem.EmbedMolecule(m)
                AllChem.UFFOptimizeMolecule(m)
            outf.write(m)
    finally:
        # Close the writer even if processing a molecule fails.
        outf.close()
def test_github_4550(self):
    """Strip the salt from a molecule that was parsed without sanitization."""
    unsanitized = Chem.MolFromSmiles('Cl.C[N]1=CC=CC=C1', sanitize=False)
    self.assertEqual(unsanitized.GetNumAtoms(), 8)

    # Sanitization is skipped during stripping as well.
    stripped = SaltRemover().StripMol(unsanitized, sanitize=False)
    self.assertEqual(Chem.MolToSmiles(stripped), 'CN1=CC=CC=C1')
def ARGE_function(root_filename_open):
    """Run the iterative R0/substituent analysis on an SDF file.

    Parameters
    ----------
    root_filename_open : str
        Path of the SDF file to read.

    Returns
    -------
    pandas.DataFrame
        Concatenated results of every iteration, with a ``mol_mol`` column
        built from ``mol_smiles`` and with missing values, ``0`` and ``"0"``
        replaced by empty strings.
    """
    # STEP 1: from a dataset of molecules, generate all possible fragmentations depending brics bounds
    # STEP 2: determine best R0 depending on a score
    # STEP 3: when there is a match, characterize molecules from the dataset with best R0 and substituants associated
    # STEP 4: Iterate the process with only molecules undescribed with best R0

    # STEP 0: Dataset of molecules, sdf required. List of smiles created.
    suppl = Chem.SDMolSupplier(root_filename_open)
    list_smiles_n = []
    remover = SaltRemover()
    for record_number, mol in enumerate(suppl, start=1):
        try:
            res = remover.StripMol(mol)
            list_smiles_n.append(Chem.MolToSmiles(res))
        except Exception:
            # Report the record position: concatenating the molecule object
            # itself onto the message raised TypeError inside this handler.
            print("a line of the SDF has been ignored: record " +
                  str(record_number))
    print(
        "STEP 0 succeed: file recognized as sdf, all rows recognized as molecules"
    )

    # STEP 1: df_brics_frag_gen(), return df_brics_frag
    # Generation of all fragments combinations of all molecules depending on brics bonds cuts
    # A number of brics bonds cuts > 1 is used in a combination
    df_brics_frag = df_brics_frag_gen(list_smiles_n)

    # Iterative process, results are compilated in df_final
    dict_0 = {}
    df_final = pd.DataFrame(dict_0)
    num_ite = 1

    while len(list_smiles_n) > 0:
        # STEP 2: R0 ranking depending on a score
        df_unique_frag = df_unique_frag_gen(df_brics_frag)

        # STEP 3: when it is possible, characterize molecules from the dataset with best R0 and substituants associated
        df_subs_r0 = df_subs_r0_gen(df_unique_frag, list_smiles_n)

        # STEP 3-bis: clean the results, attribute R1, R2, Rn labels to substituants
        df_subs_r0 = r0_clean(df_subs_r0, df_unique_frag, num_ite)

        # Concat results in df_final
        df_final = pd.concat([df_final, df_subs_r0], axis=0, sort=False)

        # STEP 4: Prepare df_brics_frag and list_smiles_n of the next iteration
        df_brics_frag = df_brics_frag_ite(df_subs_r0, df_brics_frag)
        list_smiles_n = list_smiles_n_ite(df_subs_r0, list_smiles_n)

        print("num ite: = " + str(num_ite))
        num_ite = num_ite + 1

        if len(df_brics_frag) == 0:
            # Handle molecules in list_smiles_n undescribed in df_brics_frag
            # due to brics bonds number of 0 or 1
            # NOTE(review): there is no break here, so the loop only ends
            # once list_smiles_n_ite returns an empty list -- confirm that
            # this is guaranteed when no fragments remain.
            df_final = pd.concat(
                [df_final, final_undescribed_mol(list_smiles_n)],
                axis=0,
                sort=False)
            print("Iterative process succeed and residual molecules added")

    df_final["mol_mol"] = df_final["mol_smiles"].apply(Chem.MolFromSmiles)
    df_final = df_final.fillna(0)
    # Blank out both the numeric and the string form of zero in one pass.
    for x in df_final:
        df_final[x] = df_final[x].apply(
            lambda n: "" if n == 0 or n == "0" else n)
    return df_final