def generateMoleculeHierarchyTask(structure, debug=False):
    """Create or refresh the ``MoleculeHierarchy`` record of a structure's molecule.

    The molfile stored on ``structure`` is parsed and passed through a default
    ``SaltRemover``.  When stripping removes no atoms the molecule is its own
    parent; otherwise the parent is looked up from the stripped mol block via
    ``getParentMolregnoFromBase``.  The active molecule is set to the parent.

    Parameters
    ----------
    structure
        Object exposing ``.molecule`` and ``.molfile``.
    debug : bool
        When true, attach to a pydevd server on localhost:6901 and print
        (instead of propagating) an ``IntegrityError`` raised while saving.

    Raises
    ------
    IntegrityError
        Propagated from ``hierarchy.save()`` when ``debug`` is false.
    """
    if debug:
        pydevd.settrace('localhost', port=6901, stdoutToServer=True,
                        stderrToServer=True)

    molecule = structure.molecule
    hierarchy = molecule.moleculeHierarchy
    if not hierarchy:
        hierarchy = MoleculeHierarchy(molecule=molecule)

    saltRemover = SaltRemover()
    mol = Chem.MolFromMolBlock(str(structure.molfile))
    base = saltRemover.StripMol(mol)

    # Same atom count before and after stripping means nothing was removed.
    if mol.GetNumAtoms() == base.GetNumAtoms():
        hierarchy.parent_molecule = molecule
    else:
        hierarchy.parent_molecule = getParentMolregnoFromBase(
            MolToMolBlock(base))

    hierarchy.active_molecule = hierarchy.parent_molecule

    try:
        hierarchy.save()
    except IntegrityError as e:
        if debug:
            # Single-argument form is valid on both Python 2 and Python 3.
            print(e.message)
        else:
            # Bare raise keeps the original traceback intact.
            raise
def preprocess_smi(smi):
    """Return a cleaned SMILES string, or ``None`` if ``smi`` cannot be parsed.

    The input goes through four filters in order:

    1. canonicalisation,
    2. salt removal (never removing the whole molecule),
    3. neutralisation of charges,
    4. tautomer canonicalisation.

    Only the first filter is guarded; errors raised by later filters propagate.
    """
    # Filter 1 - convert to canonical SMILES
    try:
        mol = Chem.MolFromSmiles(smi)
        if mol is None:
            # Unparsable input: report it explicitly instead of relying on
            # the conversion below to raise.
            return None
        can_smi = Chem.MolToSmiles(mol, True)
    except Exception:
        return None

    # Filter 2 - remove salt
    remover = SaltRemover()
    mol = Chem.MolFromSmiles(can_smi)
    res, _deleted = remover.StripMolWithDeleted(mol, dontRemoveEverything=True)
    removed_salt_smi = Chem.MolToSmiles(res)

    # Filter 3 - remove charge
    uncharger = rdMolStandardize.Uncharger()
    m = Chem.MolFromSmiles(removed_salt_smi)
    p = uncharger.uncharge(m)
    uncharged_smi = Chem.MolToSmiles(p)

    # Filter 4 - standardize the tautomer
    clean_smi = MolStandardize.canonicalize_tautomer_smiles(uncharged_smi)
    return clean_smi
def test_withSmiles(self):
    """Two SMILES-format definitions are loaded and strip the whole molecule."""
    definitions = "[Na+]\nCC(=O)O"
    salt_remover = SaltRemover(defnData=definitions,
                               defnFormat=InputFormat.SMILES)
    self.assertEqual(len(salt_remover.salts), 2)

    sodium_acetate = Chem.MolFromSmiles('CC(=O)O.[Na+]')
    stripped = salt_remover.StripMol(sodium_acetate)
    self.assertEqual(stripped.GetNumAtoms(), 0)
def standardizeSMILES(smiIn):
    """Normalize a SMILES string and strip salts from it.

    Steps: normalize the parsed molecule (through ``timeFunction``), strip the
    default RDKit salts, strip the project salts in ``LSALT``, then drop
    duplicate fragments and the fragments listed in ``LSMILESREMOVE``.

    Returns
    -------
    str or int
        The prepared SMILES when exactly one fragment remains.  The integer
        ``1`` when nothing usable remains; this now also covers a failed
        normalization, which previously left ``smilesclean`` unbound.
    """
    mol = Chem.MolFromSmiles(smiIn)

    molstandardized = None
    try:
        out = timeFunction(normalize, mol)
        if out == "ERROR":
            print("Normalize SMILES: ERROR DURING THE PROCESS")
        else:
            molstandardized = out
    except Exception:
        print("Normalize SMILES: ERROR INPUT SMI")

    # Start from "nothing prepared" so every failure path ends in the same
    # error return below.
    smilesclean = ""

    if molstandardized is not None:
        smilestandadized = Chem.MolToSmiles(molstandardized)

        # remove salt
        # 1. default
        remover = SaltRemover()
        mol = Chem.MolFromSmiles(smilestandadized)
        molcleandefault = remover(mol)
        # 2. personal remover
        homeremover = SaltRemover(defnData=LSALT)
        molclean = homeremover(molcleandefault)
        smilesclean = Chem.MolToSmiles(molclean)

        # 3. remove other manual salts + fragments -> for fragments keep one
        #    if they are exactly the same compound
        # NOTE(review): nesting under the len() check below was reconstructed
        # from flattened source; confirm against the original file.
        lelem = smilesclean.split(".")
        if len(lelem) > 1:
            # collapse duplicates, case of several identical salts
            lelem = list(set(lelem))
            for smilesdel in LSMILESREMOVE:
                if smilesdel in lelem:
                    lelem.remove(smilesdel)
            if "" in lelem:  # case of bad SMILES
                lelem.remove("")
            if len(lelem) == 1:
                smilesclean = str(lelem[0])
            else:
                # 4. several distinct fragments left: reported for manual check
                print("Fragments after standardization: " + smilesclean + "\n")
                smilesclean = ""

    if smilesclean == "":
        print("SMILES empty after preparation\n")
        return 1

    print("Prepared SMI :" + str(smilesclean) + "\n")
    return smilesclean
def test_withDontRemoveEverything(self):
    """Stripping with dontRemoveEverything deletes nothing from this input."""
    here = os.path.dirname(os.path.abspath(__file__))
    salts_path = os.sep.join([here, 'test_data', 'witch-salts.sdf'])
    salt_remover = SaltRemover(defnFilename=salts_path,
                               defnFormat=InputFormat.MOL)

    toluene = Chem.MolFromSmiles('Cc1ccccc1')
    stripped, removed = salt_remover.StripMolWithDeleted(
        toluene, dontRemoveEverything=True)

    # List should be empty
    self.assertFalse(removed)
    self.assertEqual(toluene, stripped)
def remove_salts(mol, dictionary=True, *args, **kwargs):
    """Removes salts from a molecule.

    This function removes detected salts following a salts dictionary by
    default.

    Parameters
    ----------
    mol: rdkit.Chem.Mol
        The molecule to be modified.
    dictionary: bool, optional
        True (default): Activates the use of the salt dictionary.
        False: Uses the standard StripMol functionality, provided by
        rdkit.Chem.SaltRemover.
    *args, **kwargs
        Forwarded to ``SaltRemover`` when ``dictionary`` is false, e.g.
        ``defnData`` to supply custom definitions.  Without them SaltRemover
        falls back to its own defaults.

    Returns
    -------
    mol: rdkit.Chem.Mol
        A new molecule with salts removed.

    Notes
    -----
    The Salts Dictionary
        The dictionary used is a derived version from the ChEMBL salt
        dictionary, created for the standardiser application by Francis
        Atkinson. The salts are stored as list of (neutral) SMILES.
    """
    # Silence RDKit messages below ERROR; this is a process-wide setting.
    lg = RDLogger.logger()
    lg.setLevel(RDLogger.ERROR)

    if not dictionary:
        return SaltRemover(*args, **kwargs).StripMol(mol)

    salts = _extract_row_from_csv(0)
    salt_names = _extract_row_from_csv(1)

    # Track the canonical SMILES of the current molecule so it is only
    # recomputed when a salt was actually stripped.
    current_smiles = Chem.MolToSmiles(mol)
    for salt, salt_name in zip(salts, salt_names):
        stripped_mol = SaltRemover(defnData=salt).StripMol(mol)
        stripped_smiles = Chem.MolToSmiles(stripped_mol)
        if stripped_smiles != current_smiles:
            logging.debug("Following salt was stripped: %s", salt_name)
            mol = stripped_mol
            current_smiles = stripped_smiles
    return mol
def test_withSdfFile(self):
    """Definitions read from an SDF file strip the sodium counter-ion only."""
    here = os.path.dirname(os.path.abspath(__file__))
    salts_path = os.sep.join([here, 'test_data', 'witch-salts.sdf'])
    salt_remover = SaltRemover(defnFilename=salts_path,
                               defnFormat=InputFormat.MOL)
    self.assertEqual(len(salt_remover.salts), 240)

    sample = Chem.MolFromSmiles(
        "Cc1onc(-c2ccccc2)c1C([O-])=NC1C(=O)N2C1SC(C)(C)C2C(=O)O.O.[Na+]")
    outcome = salt_remover.StripMolWithDeleted(sample)

    self.assertEqual(
        Chem.MolToSmiles(outcome.mol),
        'Cc1onc(-c2ccccc2)c1C([O-])=NC1C(=O)N2C1SC(C)(C)C2C(=O)O.O')
    self.assertEqual(len(outcome.deleted), 1)
    self.assertEqual(Chem.MolToSmiles(outcome.deleted[0]), '[Na+]')
def __init__(self, f=None, mol_format=None, remove_hydrogens=False,
             remove_salts=True, compute_2d_coords=True):
    """Store the reader options and build a salt remover when one is needed.

    A warning is emitted when salts are to be removed while hydrogens are to
    be kept.
    """
    keeps_hydrogens = not remove_hydrogens
    if remove_salts and keeps_hydrogens:
        warnings.warn('Compounds with salts will have hydrogens removed')

    super(MolReader, self).__init__(f, mol_format)

    self.compute_2d_coords = compute_2d_coords
    self.remove_salts = remove_salts
    self.remove_hydrogens = remove_hydrogens
    # The remover attribute only exists when salt removal is requested.
    if self.remove_salts:
        self.salt_remover = SaltRemover()
def parse_smiles(smiles):
    """Sanity check and normalization for drugs.

    Keeps the first whitespace-separated token, strips salts and returns the
    resulting SMILES.  On any failure the most recent value of ``smiles`` is
    returned unchanged (best effort).
    """
    try:
        # Remove salts
        smiles = smiles.split()[0]
        stripped = SaltRemover().StripMol(Chem.MolFromSmiles(smiles))
        return Chem.MolToSmiles(stripped)
    except Exception:
        # Deliberate fallback: hand back whatever we have.
        return smiles
def __init__(self, DataFrame, threshold = None, set_threshold = False, standardise = True, process = True):
    """
    Initialiser.

    Stores the configuration flags, creates a worker pool with one process
    per CPU and, when standardisation is requested, prepares the helper
    objects used for it.

    : DataFrame (str/pd.DataFrame): stored as ``self.DataFrame``; also stored
      as ``self.name`` when both ``standardise`` and ``process`` are set
    : threshold (int): stored as ``self.threshold``
    : set_threshold (bool): stored as ``self.set_threshold``
    : standardise (bool): enables the standardisation set-up below
    : process (bool): enables opening and filtering of the data

    NOTE(review): the nesting of the ``standardise``/``process`` branches was
    reconstructed from flattened source -- confirm against the original file.
    """
    self.threshold = threshold
    self.set_threshold = set_threshold
    self.process = process
    self.standardise = standardise
    self.DataFrame = DataFrame
    # path with stored datasets
    self.path = '/projects/../../datasets/'
    self.pool = mp.Pool(processes = mp.cpu_count())

    if self.standardise:
        if self.process:
            self.name = DataFrame
            # Preparing data for preprocessing
            self.open_file()
            self.filter_data()

        self.standardiser = mv.Standardizer()
        self.salt_remover = SaltRemover()
        self.accepted_atoms = ['H','C','N','O','F','S','Cl','Br']
def prepSMI(SMIin, defnFilename, removeMetal=1):
    """Standardize a SMILES string, strip salts and return the single remaining fragment.

    Parameters
    ----------
    SMIin : str
        Input SMILES.
    defnFilename : str
        Salt definition file handed to ``SaltRemover``; an empty string
        selects the SaltRemover defaults.
    removeMetal : int
        When equal to 1, fragments for which ``is_metalorion`` does not
        return 0 are dropped.

    Returns
    -------
    str
        The cleaned SMILES, or a message starting with ``"Error:"`` when
        standardization fails, several fragments remain, or nothing is left.
    """
    mol = Chem.MolFromSmiles(SMIin)
    s = Standardizer()

    try:
        molstandardized = s.standardize(mol)
        # Conversion kept as a validity check; its result is not needed.
        Chem.MolToSmiles(molstandardized)
    except Exception:
        return "Error: Standardization Fail"

    # remove salt
    # 1. default
    if defnFilename != "":
        remover = SaltRemover(defnFilename=defnFilename)
    else:
        remover = SaltRemover()

    molclean = remover(molstandardized)
    smilesclean = Chem.MolToSmiles(molclean)

    # 2. remove other manual salts + fragments -> for fragments keep one if
    #    they are exactly the same compound
    # collapse duplicates, case of several identical salts; discard() also
    # drops the empty fragment if there is one
    fragments = set(smilesclean.split("."))
    fragments.discard("")
    lelem = list(fragments)

    # remove metal
    if removeMetal == 1:
        lelem = [elem for elem in lelem if is_metalorion(elem) == 0]

    if len(lelem) == 1:
        return str(lelem[0])
    if len(lelem) > 1:
        return "Error: Mixture or fragment ot check: " + smilesclean
    if smilesclean == "":
        return "Error: SMILES empty after preparation"
    return "Error: No identified"
def test_withSmiFile(self):
    """All definitions of the bundled .smi test file are loaded."""
    here = os.path.dirname(os.path.abspath(__file__))
    smi_path = os.sep.join([here, 'test_data', 'c6h6-cdk.smi'])
    salt_remover = SaltRemover(defnFilename=smi_path,
                               defnFormat=InputFormat.SMILES)
    self.assertEqual(len(salt_remover.salts), 216)
def SetupSaltRemover():
    """Set up a salt remover.

    Returns None when the ``SaltsByComponentsMode`` option is set; otherwise
    a ``SaltRemover`` built from the ``SaltsFile`` and ``SaltsSMARTS``
    options, read as SMARTS.
    """
    if OptionsInfo["SaltsByComponentsMode"]:
        return None

    return SaltRemover(defnFilename = OptionsInfo["SaltsFile"],
                       defnData = OptionsInfo["SaltsSMARTS"],
                       defnFormat = InputFormat.SMARTS)
def check_salt(self, molecule: str, subType: str) -> str:
    """
    Checks if the molecule is salt.

    The check runs a default ``SaltRemover`` on ``self.smiles_mol``.
    NOTE(review): the ``molecule`` argument is not read here -- confirm that
    is intended.

    :param molecule: unused by this method
    :param subType: prefix of the returned label
    :return salt: ``"<subType>_salt"`` if at least one fragment was removed,
        otherwise None
    """
    _, removed = SaltRemover().StripMolWithDeleted(self.smiles_mol)
    if len(removed) < 1:
        return None
    return '_'.join([subType, 'salt'])
def NeutraliseCharges_RemoveSalt(smiles, reactions=None):
    """Strip salts from a SMILES and neutralise charged groups.

    ``reactions`` is an iterable of (reactant, product) pattern pairs; when
    omitted, the module-level cache ``_reactions`` is used and filled on
    first use.

    Returns a ``(smiles, changed)`` tuple:

    * ``(None, False)`` when the input cannot be parsed,
    * ``(input smiles, False)`` when no pattern matched,
    * ``(new smiles, True)`` when at least one replacement happened.

    NOTE(review): in the "no pattern matched" case the unmodified input is
    returned, so a stripped salt is not reflected -- confirm this is wanted.
    """
    global _reactions
    if reactions is None:
        if _reactions is None:
            _reactions = _InitialiseNeutralisationReactions()
        reactions = _reactions

    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return (None, False)

    mol, _removed = SaltRemover().StripMolWithDeleted(mol)

    replaced = False
    for reactant, product in reactions:
        # Apply each pattern until it no longer matches.
        while mol.HasSubstructMatch(reactant):
            replaced = True
            mol = AllChem.ReplaceSubstructs(mol, reactant, product)[0]

    if not replaced:
        return (smiles, False)
    return (Chem.MolToSmiles(mol, True), True)
def generateCompoundPropertiesTask(structure, debug=False):
    """Compute and store the ``CompoundProperties`` record of a structure's molecule.

    Descriptors (H-bond donors/acceptors, rotatable bonds, logP, TPSA, full
    molecular weight and formula) are calculated on the molecule parsed from
    ``structure.molfile``.  The free-base weight is calculated on the
    salt-stripped molecule and only set when stripping leaves any atoms.

    Parameters
    ----------
    structure
        Object exposing ``.molecule`` and ``.molfile``.
    debug : bool
        When true, attach to a pydevd server on localhost:6901 and print
        (instead of propagating) an ``IntegrityError`` raised while saving.

    Raises
    ------
    IntegrityError
        Propagated from ``prop.save()`` when ``debug`` is false.
    """
    if debug:
        pydevd.settrace('localhost', port=6901, stdoutToServer=True,
                        stderrToServer=True)

    molecule = structure.molecule
    prop = molecule.compoundProperty
    if not prop:
        prop = CompoundProperties(molecule=molecule)

    saltRemover = SaltRemover()
    mol = Chem.MolFromMolBlock(str(structure.molfile))
    base = saltRemover.StripMol(mol)

    prop.hbd = Descriptors.CalcNumHBD(mol)
    prop.hba = Descriptors.CalcNumHBA(mol)
    prop.rtb = Descriptors.CalcNumRotatableBonds(mol)
    prop.alogp = Crippen.MolLogP(mol)
    prop.psa = Descriptors.CalcTPSA(mol)
    prop.full_mwt = NewDescriptors.MolWt(mol)
    # prop.exact_mass = Descriptors.CalcExactMolWt(mol)

    # Nothing left after stripping means there is no free base to weigh.
    if base.GetNumAtoms():
        prop.mw_freebase = NewDescriptors.MolWt(base)

    prop.full_molformula = Descriptors.CalcMolFormula(mol)

    try:
        prop.save()
    except IntegrityError as e:
        if debug:
            # Single-argument form is valid on both Python 2 and Python 3.
            print(e.message)
        else:
            # Bare raise keeps the original traceback intact.
            raise
def test_SmilesVsSmarts(self):
    """Exercise the same inputs with SMARTS and SMILES salt definitions."""
    # SMARTS
    smarts_remover = SaltRemover(defnData="[Cl,Br]")
    halo_amine = Chem.MolFromSmiles('CN(Br)Cl.Cl')
    stripped = smarts_remover.StripMol(halo_amine)
    self.assertEqual(stripped.GetNumAtoms(), 4)
    self.assertEqual(Chem.MolToSmiles(stripped), 'CN(Cl)Br')

    two_salts = Chem.MolFromSmiles('CN(C)C.Cl.Br')
    stripped, removed = smarts_remover.StripMolWithDeleted(two_salts)
    self.assertEqual(Chem.MolToSmiles(stripped), 'CN(C)C')
    # Because we read in SMARTS, we should output as well. Otherwise, we
    # will have mismatches
    removed_smarts = [Chem.MolToSmarts(pattern) for pattern in removed]
    self.assertListEqual(removed_smarts, ['[Cl,Br]'])

    # SMILES
    smiles_remover = SaltRemover(defnData="Cl",
                                 defnFormat=InputFormat.SMILES)
    halo_amine = Chem.MolFromSmiles('CN(Br)Cl.Cl')
    stripped = smiles_remover.StripMol(halo_amine)
    self.assertEqual(stripped.GetNumAtoms(), 4)
    self.assertEqual(Chem.MolToSmiles(stripped), 'CN(Cl)Br')
# Could allow only H, C, N, O, S, P, F, Cl, Br, I for a in fragment.GetAtoms(): if a.GetAtomicNum() == 6: return False return True def contains_nonorg(fragment): # organic: H, C, N, O, P, S, F, Cl, Br, I for a in fragment.GetAtoms(): if a.GetAtomicNum() not in [1, 6, 7, 8, 15, 16, 9, 17, 35, 53]: return "Yes" return "No" r = SaltRemover() molecule_column = input_table['Molecule'] # Input from KNIME table stand_mol_list = [] errs = [] mixture = "No" for index, input_cell in molecule_column.iteritems( ): # iterate through molecule list mol = input_cell if mol is None: stand_mol_list.append( ("Got empty molecule", index, mol, "No", None, None)) continue try: mol = rdMolStandardize.MetalDisconnector().Disconnect(
"""Functions that can be used to preprocess SMILES sequnces in the form used in the publication.""" import numpy as np import pandas as pd import tensorflow as tf from rdkit.Chem.SaltRemover import SaltRemover from rdkit import Chem from rdkit.Chem import Descriptors REMOVER = SaltRemover() ORGANIC_ATOM_SET = set([5, 6, 7, 8, 9, 15, 16, 17, 35, 53]) def dataframe_to_tfrecord(df, tfrecord_file_name, random_smiles_key=None, canonical_smiles_key=None, inchi_key=None, mol_feature_keys=None, shuffle_first=False): """Function to create a tf-record file to train the tranlation model from a pandas dataframe. Args: df: Dataframe with the sequnce representations of the molecules. tfrecord_file_name: Name/Path of the file to write the tf-record file to. random_smiles_key: header of the dataframe row which holds the randomized SMILES sequnces. canonical_smiles_key: header of the dataframe row which holds the canonicalized SMILES sequnces. inchi_key: header of the dataframe row which holds the InChI sequnces. mol_feature_keys:header of the dataframe row which holds molecualar features. shuffle_first: Defines if dataframe is shuffled first before writing to tf-record file. Returns: None """
import pyrfume
from pyrfume import odorants
from rickpy import ProgressBar
# -

# Load the per-CID property table, indexed by CID.
file_path = os.path.join(pyrfume.DATA, 'all_cids_properties.csv')
df = pd.read_csv(file_path).set_index('CID')

# ## Make 3D optimized versions of the molecules

# +
# Make basic mol objects
mols = {cid: Chem.MolFromSmiles(smi) for cid, smi in df['IsomericSMILES'].items()}

# Then optimize them
s = SaltRemover()
p = ProgressBar(len(df))
for i, (cid, mol) in enumerate(mols.items()):
    p.animate(i, status=cid)
    try:
        # Name each molecule "<cid>: <smiles>", strip salts without ever
        # removing everything, add hydrogens, then embed and optimize.
        mol.SetProp("_Name","%d: %s" % (cid, df.loc[cid, 'IsomericSMILES']))
        mol = s.StripMol(mol, dontRemoveEverything=True)
        mol = Chem.AddHs(mol)
        AllChem.Compute2DCoords(mol)
        AllChem.EmbedMolecule(mol)
        AllChem.UFFOptimizeMolecule(mol) # Is this deterministic?
    except Exception as e:
        # Any failure (including an unparsable SMILES, where mol is None)
        # is logged and the entry is blanked.
        p.log('Exception for %d: %s' % (cid, e))
        mols[cid] = None
    else:
        mols[cid] = mol
print('\t\t',dataset_smiles_y.iloc[11]['SMILES'],'\n') else: print('\tCongratulations, your dataset has not incorrect smiles.') ############################################################################## ############################ STEP 2: salt elimination ######################## ############################################################################## print('[+] Eliminating salts ') withoutsalts = [] for smi in df_clean_by_sanit['SMILES']: mol = Chem.MolFromSmiles(smi) remover = SaltRemover(defnData='[Na,Cl,K,O,OH,Fe,F,H,Al,Mg,Co,Ti,NH4,Mn,Si,Ca,Au,I,Hg,Mo,Zn,Br,Ag,Sr,Cu,Bi,S,Li,NH3,He,Y,Ar,Ba,La]') mol = remover.StripMol(mol) smiles_new = Chem.MolToSmiles(mol) smiles_new = smiles_new.replace('.[H+]', '').replace('[H+].', '') # because saltremover do not eliminate water withoutsalts.append(smiles_new) df_clean_by_sanit.insert(2,'W/O SALTS',withoutsalts) prompt = [] for smile_with, smile_without in zip(df_clean_by_sanit['SAN_SMILES'],df_clean_by_sanit['W/O SALTS']): if smile_with != smile_without:
def main(argv=sys.argv):
    """Strip counter-ions from the molecules of an SDF file and write the result.

    Command line: ``prog [options] input output``; the options are listed in
    the usage text printed when fewer than two arguments are given.

    Salt definitions collected from ``--strip-sdf`` and ``--strip-smarts``
    are concatenated as SMARTS, one per line.  When any were collected they
    are passed to ``SaltRemover``; otherwise the SaltRemover defaults apply.
    (The original test was inverted, so user-supplied definitions were never
    used.)

    NOTE(review): the loop nesting was reconstructed from flattened source;
    in particular the multi-component warning is assumed to run for every
    molecule, not only under ``--strip`` -- confirm.
    """
    valid_elements = ['H', 'C', 'N', 'O', 'F', 'Si', 'P', 'S', 'Cl', 'Br', 'I']
    valid_atomic_num = [1, 6, 7, 8, 9, 14, 15, 16, 17, 35, 53]

    if len(argv) < 3:
        # Adjacent literals reproduce the usage text exactly as it was.
        usage = (
            " OBJ to strip self-defined counterions"
            " Usage: %s [options] input output"
            " [options]"
            " --strip : if given, to run stripping salts/solvents"
            " --strip-sdf file: specify the mols/fragments to be removed"
            " --strip-smarts file: one SMARTS string per line"
            " --filter-invalid : if given, to remove molecules containing"
            " R group or elements other than %s"
            " --addh : if given, to add hydrogens"
            " --make3d : if given, 3D coordinates will be generated"
            " Attention"
            " 1. rdkit.Chem.SaltRemover.SaltRemover is called"
            " 2. if neither `--strip-sdf` nor `--strip-smarts` is provided,"
            " stripping salts will be done according to default salts"
            " defined in `RDConfig.RDDataDir/Salts.txt`"
            " 3. both `input` and `output` are .sdf"
            " 4. whenever `--make3d` is given, please make sure that there"
            " is no complex or salts/solvents can be stripped. Otherwise,"
            " maybe there is something wrong with optimized structure."
            " \n"
        )
        print(usage % (argv[0], str(valid_elements)))
        sys.exit(1)

    options, args = getopt(argv[1:], '', [
        'strip-sdf=', 'strip-smarts=', 'strip', 'filter-invalid', 'addh',
        'make3d'
    ])

    filter_invalid = False
    strip = False
    addh = False
    make3d = False
    strip_sdf = None
    strip_smarts = None

    for opt, val in options:
        if opt == '--strip':
            strip = True
        elif opt == "--strip-sdf":
            strip_sdf = val
        elif opt == '--strip-smarts':
            strip_smarts = val
        elif opt == '--filter-invalid':
            filter_invalid = True
        elif opt == '--addh':
            addh = True
        elif opt == '--make3d':
            make3d = True
        else:
            print("Error: invalid option %s" % opt)
            sys.exit(1)

    assert len(args) == 2
    infile = args[0]
    outfile = args[1]

    smarts = ""
    if strip_sdf is not None:
        print("To load fragments from %s" % strip_sdf)
        for count, m in enumerate(Chem.SDMolSupplier(strip_sdf), 1):
            if m is None:
                print("Warning: failed to read %dth molecule in %s" % (
                    count, strip_sdf))
                continue
            smarts += (Chem.MolToSmarts(m) + "\n")
    if strip_smarts is not None:
        print("to load fragments from %s" % strip_smarts)
        # Context manager closes the definitions file.
        with open(strip_smarts, 'r') as smarts_file:
            smarts += smarts_file.read()

    if strip:
        if smarts != "":
            remover = SaltRemover(defnData=smarts)
        else:
            remover = SaltRemover()
    else:
        remover = None

    inf = Chem.SDMolSupplier(infile)
    outf = Chem.SDWriter(outfile)
    for count, m in enumerate(inf, 1):
        if m is None:
            print("Warning: failed to load %dth molecule from %s" % (
                count, infile))
            continue
        if filter_invalid:
            # Skip molecules containing any element outside the valid set.
            if any(a.GetAtomicNum() not in valid_atomic_num
                   for a in m.GetAtoms()):
                continue
        if strip:
            m = remover.StripMol(m)
        if m.HasProp("_Name"):
            name = m.GetProp("_Name")
        else:
            name = "%s_%d" % (infile, count)
        if num_components(m) > 1:
            print("Warning: %s still has more than one components!" % name)
        if addh:
            m = AllChem.AddHs(m)
        if make3d:
            AllChem.EmbedMolecule(m)
            AllChem.UFFOptimizeMolecule(m)
        outf.write(m)
    outf.close()
def main(filename,remove,select,identity,verbose,check,cpu):
    """ remove or select molecules by using predefined or custom filters

    Molecules are read from ``filename`` (.smi/.smiles/.ism or .sdf/.sd),
    stripped of salts and tested against every filter entry in a worker pool
    of ``cpu`` processes.  Molecules with no vote to remove are written to
    ``<prefix>-selected.<ext>``; the reasons for the others are written to
    ``<prefix>-rejected.csv``.

    With ``check`` set, only ``checkPredefined()`` runs and the process
    exits.  ``identity`` names the property used as molecule id; when empty,
    ``_Name`` or ``guessIdentity`` is used.

    The log file, the molecule writer and the pool are now always closed, and
    an unsupported extension ends with an error message instead of an
    undefined-name failure.

    NOTE(review): the nesting inside the per-molecule loop was reconstructed
    from flattened source -- confirm against the original file.
    """
    if check:
        checkPredefined()
        sys.exit(0)

    select_xml = getxmls(select)
    remove_xml = getxmls(remove)
    showFilters(select_xml, remove_xml)
    FS = readFilters(select_xml, remove_xml)

    b = os.path.basename(filename)
    prefix = b.split(".")[0]
    ext = b.split(".")[-1].lower()

    if ext in ("smi", "smiles", "ism"):
        mols = Chem.SmilesMolSupplier(filename, titleLine=False)
        outfile = prefix + "-selected.smi"
        outWriter = Chem.SmilesWriter(outfile, includeHeader=False,
                                      delimiter=' ')
    elif ext in ("sdf", "sd"):
        mols = Chem.SDMolSupplier(filename)
        outfile = prefix + "-selected.sdf"
        outWriter = Chem.SDWriter(outfile)
    else:
        print("Error: unsupported file extension '%s'" % ext)
        sys.exit(9)

    logfile = prefix + "-rejected.csv"
    f = open(logfile, "w")
    p = Pool(cpu)
    try:
        logWriter = csv.writer(f, delimiter=",", quotechar='"')

        # test molId with the first record
        if identity:
            try:
                Id = mols[0].GetProp(identity)
                assert Id
            except Exception:
                print("Error: cannot define Id by given 'identity'")
                sys.exit(9)
        else:
            if mols[0].GetProp("_Name"):
                identity = "_Name"
            else:
                identity = guessIdentity(mols)

        num = 0
        num_records = 0
        num_remove = 0
        num_select = 0
        done = set()  # ids already used, for O(1) duplicate checks
        remover = SaltRemover()  # built once, reused for every molecule

        for m in mols:
            num_records += 1
            if m is None:
                continue
            Id = m.GetProp(identity)
            if (not Id) or (Id in done):
                Id = default_molId_prefix + "%04d" % num
            m.SetProp("_Name", Id)
            m = remover.StripMol(m, dontRemoveEverything=True)

            workload = []
            for filtername, action, entries in FS:
                workload.extend(
                    [(m, Id, action, filtername, grp, smarts, lb, ub)
                     for grp, smarts, lb, ub in entries])
            results = list(p.map(worker, workload))

            vote_to_remove = 0
            vote_to_select = 0
            reasons = []
            for res in results:
                flag, action, molId, vstr, filtername, entryname, lb, ub = res
                if (flag == True and action == "select") or \
                   (flag == False and action == "remove"):
                    vote_to_select += 1
                else:
                    vote_to_remove += 1
                    reasons.append(res)

            # verdict
            if vote_to_remove > 0:
                num_remove += 1
                for res in reasons:
                    logWriter.writerow(res)  # rejected
                    if verbose:
                        print(" %-10s %s %-30s %10s [%s..%s]" % res[2:])
            else:
                num_select += 1
                outWriter.write(m)  # passed
            done.add(Id)
            num += 1

        # One id is recorded per processed molecule, so num is the count.
        num_done = num
        print("%d/%d done <-- %s" % (num_done, num_records, filename))
        print("%d/%d selected --> %s" % (num_select, num_done, outfile))
        print("%d/%d rejected --> %s" % (num_remove, num_done, logfile))
    finally:
        p.close()
        p.join()
        outWriter.close()
        f.close()
class MolReader(MolIO):
    """
    Read molecules from files and file-like objects. Supports SDF, SMILES, and
    RDKit binary format (via pickle).

    Parameters
    ----------
    f : file, optional
        File-like object.
    mol_format : str, optional
        Molecule file format. Currently supports 'sdf', 'smi', and 'pkl'.
    remove_hydrogens : bool, optional (default False)
        Remove hydrogens from molecules.
    remove_salts : bool, optional (default True)
        Remove salts from molecules. Note that this will remove any hydrogens
        present on the molecule.
    compute_2d_coords : bool, optional (default True)
        Compute 2D coordinates when reading SMILES. If molecules are written
        to SDF without 2D coordinates, stereochemistry information will be
        lost.
    """
    def __init__(self, f=None, mol_format=None, remove_hydrogens=False,
                 remove_salts=True, compute_2d_coords=True):
        if not remove_hydrogens and remove_salts:
            warnings.warn('Compounds with salts will have hydrogens removed')
        super(MolReader, self).__init__(f, mol_format)
        self.remove_hydrogens = remove_hydrogens
        self.remove_salts = remove_salts
        if remove_salts:
            self.salt_remover = SaltRemover()
        self.compute_2d_coords = compute_2d_coords

    def __iter__(self):
        """
        Iterate over molecules.
        """
        return self.get_mols()

    def get_mols(self):
        """
        Read molecules from a file-like object.

        Molecule conformers are grouped into a single molecule. Two molecules
        are considered conformers of the same molecule if they:
        * Are contiguous in the file
        * Have identical (canonical isomeric) SMILES strings
        * Have identical compound names (if set)

        Returns
        -------
        A generator yielding (possibly multi-conformer) RDKit Mol objects.
        """
        parent = None
        for mol in self._get_mols():
            if parent is None:
                parent = mol
                continue
            if self.are_same_molecule(parent, mol):
                if mol.GetNumConformers():
                    for conf in mol.GetConformers():
                        parent.AddConformer(conf, assignId=True)
                else:
                    continue  # skip duplicate molecules without conformers
            else:
                parent = self.clean_mol(parent)
                if parent is not None:
                    yield parent
                parent = mol
        # flush the last group
        if parent is not None:
            parent = self.clean_mol(parent)
            if parent is not None:
                yield parent

    def _get_mols(self):
        """
        Read molecules from a file-like object.

        This method returns individual conformers from a file and does not
        attempt to combine them into multiconformer Mol objects.

        Returns
        -------
        A generator yielding RDKit Mol objects.
        """
        if self.mol_format == 'sdf':
            mols = self._get_mols_from_sdf()
        elif self.mol_format == 'smi':
            mols = self._get_mols_from_smiles()
        elif self.mol_format == 'pkl':
            mols = self._get_mols_from_pickle()
        else:
            raise NotImplementedError('Unrecognized molecule format ' +
                                      '"{}"'.format(self.mol_format))

        # skip read errors
        while True:
            try:
                # next() builtin works on Python 2.6+ and Python 3, unlike
                # the Python-2-only .next() method.
                mol = next(mols)
            except StopIteration:
                break
            except Exception:
                warnings.warn('Skipping molecule.')
                continue
            else:
                if mol is not None:
                    yield mol

    def _get_mols_from_sdf(self):
        """
        Read SDF molecules from a file-like object.
        """
        supplier = Chem.ForwardSDMolSupplier(self.f,
                                             removeHs=self.remove_hydrogens)
        for mol in supplier:
            yield mol

    def _get_mols_from_smiles(self):
        """
        Read SMILES molecules from a file-like object.

        Each non-empty line is ``<smiles>`` optionally followed by whitespace
        and a name; everything after the first whitespace run is the name.
        """
        for line in self.f.readlines():
            line = line.strip()
            if not line:
                continue
            # Split once only: a name containing spaces used to raise while
            # unpacking, which ended this generator and silently dropped the
            # rest of the file.
            split_line = line.split(None, 1)
            if len(split_line) > 1:
                smiles, name = split_line
            else:
                smiles, = split_line
                name = None

            # hydrogens are removed by default, which triggers sanitization
            try:
                if self.remove_hydrogens:
                    mol = Chem.MolFromSmiles(smiles)
                else:
                    mol = Chem.MolFromSmiles(smiles, sanitize=False)
                    Chem.SanitizeMol(mol)
                if self.compute_2d_coords:
                    AllChem.Compute2DCoords(mol)
            except Exception:
                warnings.warn('Skipping ' + line)
                continue
            else:
                if name is not None:
                    mol.SetProp('_Name', name)
                yield mol

    def _get_mols_from_pickle(self):
        """
        Read pickled molecules from a file-like object.

        Files that contain multiple pickles are supported by repeated calls to
        load.
        """
        # NOTE: unpickling executes arbitrary code; only read trusted files.
        while True:
            try:
                mols = cPickle.load(self.f)
                for mol in np.atleast_1d(mols):
                    yield mol
            except EOFError:
                break

    def are_same_molecule(self, a, b):
        """
        Test whether two molecules are conformers of the same molecule.

        Test for:
        * Identical (canonical isomeric) SMILES strings
        * Identical compound names (if set)

        Parameters
        ----------
        a, b : RDKit Mol
            Molecules to compare.
        """
        # get names, if available
        a_name = self._get_name(a)
        b_name = self._get_name(b)

        # get canonical isomeric SMILES
        a_smiles = self._get_isomeric_smiles(a)
        b_smiles = self._get_isomeric_smiles(b)
        assert a_smiles and b_smiles

        # test for same molecule
        return a_smiles == b_smiles and a_name == b_name

    def _get_name(self, mol):
        """
        Get molecule name, if available.

        Parameters
        ----------
        mol : RDKit Mol
            Molecule.
        """
        if mol.HasProp('_Name'):
            return mol.GetProp('_Name')
        else:
            return None

    def _get_isomeric_smiles(self, mol):
        """
        Get canonical isomeric SMILES for a molecule. Also sets the
        isomericSmiles property to avoid recomputing.

        Note that stereochemistry is not assigned from 3D coordinates; it must
        be explicitly present in the file or it will not show up in the SMILES
        conversion.

        Parameters
        ----------
        mol : RDKit Mol
            Molecule.
        """
        if mol.HasProp('isomericSmiles'):
            return mol.GetProp('isomericSmiles')
        else:
            smiles = Chem.MolToSmiles(mol, isomericSmiles=True,
                                      canonical=True)
            mol.SetProp('isomericSmiles', smiles, computed=True)
            return smiles

    def clean_mol(self, mol):
        """
        Clean a molecule.

        When salt removal is enabled, hydrogens are removed and salts are
        stripped. Returns None (after a warning) if hydrogens cannot be
        removed; otherwise returns the stripped molecule when stripping left
        atoms and changed something, else the input molecule.

        Parameters
        ----------
        mol : RDKit Mol
            Molecule.
        """
        if self.remove_salts:
            # hydrogens must be removed for pattern matching to work properly
            try:
                mol_no_h = Chem.RemoveHs(mol)
            except ValueError:
                if mol.HasProp('_Name'):
                    name = mol.GetProp('_Name')
                else:
                    name = Chem.MolToSmiles(mol, isomericSmiles=True,
                                            canonical=True)
                warnings.warn('Skipping ' + name)
                return None
            new = self.salt_remover.StripMol(mol_no_h)
            # only keep if it is valid (the molecule may _be_ a salt) and has
            # actually been changed
            if new.GetNumAtoms() and mol_no_h.ToBinary() != new.ToBinary():
                mol = new
        return mol
df["STND_SMILES"].values)) == len(df), "Standardize Smiles reduced" non_cases = np.squeeze(np.argwhere((df["STND_SMILES"] == "-").values)) df = df.drop(non_cases.tolist()) df = df.reset_index(drop=True) print(df[df.duplicated("STND_SMILES", False)][["Result", "STND_SMILES"]]) # No Contradict df = df.drop_duplicates("STND_SMILES") # Drop first duplicated ones df = df.reset_index(drop=True) # 4. Salt 제거 df = df.reset_index(drop=True) remover = SaltRemover() base_dir = "pic_salt" res_smi_list = [] res_stnd_smi_list = [] for i in range(len(df)): smi, stnd_smi = df.loc[i][["SMILES", "STND_SMILES"]] current_dir = os.path.join(base_dir, str(i)) #os.mkdir(current_dir) try: mol = MolFromSmiles(smi) #MolToImage(mol).save(os.path.join(current_dir,"smi.jpeg")) except Exception as e: print(i, e)
from rdkit import Chem
from rdkit.Chem.SaltRemover import SaltRemover

remover = SaltRemover()


def standardize_smiles(smiles: str) -> str:
    """Return a salt-stripped SMILES with backslash, slash and ``@`` removed.

    The three characters are deleted from the input text before parsing; the
    parsed molecule is then stripped with the module-level ``remover``
    (never removing everything) and written back as SMILES.
    """
    flattened = smiles.translate(str.maketrans('', '', '\\/@'))
    stripped = remover.StripMol(Chem.MolFromSmiles(flattened),
                                dontRemoveEverything=True)
    return Chem.MolToSmiles(stripped)
def prepareChem(self, prSMIclean): psmiclean = prSMIclean + self.name + ".smi" # try if existing if path.exists(psmiclean): psmiclean = prSMIclean + self.name + ".smi" fsmiclean = open(psmiclean, "r") smiclean = fsmiclean.readlines() fsmiclean.close() smiclean = smiclean[0].strip() self.smiclean = smiclean self.mol = Chem.MolFromSmiles(smiclean) self.log = self.log + "Prep SMI :" + str(self.smi) + "\n" self.log = self.log + "Prepared SMI :" + str(self.smiclean) + "\n" else: #self.mol = loader.ReadMolFromSmile(self.smi) s = Standardizer() mol = Chem.MolFromSmiles(self.smi) try: out = toolbox.timeFunction(normalize, mol) if out == "ERROR": self.log = self.log + "Normalize SMILES: ERROR DURING THE PROCESS\n" else: molstandardized = out except: self.log = self.log + "Normalize SMILES: ERROR INPUT SMI\n" if "molstandardized" in locals(): smilestandadized = Chem.MolToSmiles(molstandardized) # remove salt # 1.default remover = SaltRemover(defnFilename="Salts.txt") mol = Chem.MolFromSmiles(smilestandadized) molcleandefault = remover(mol) # 2. Personal remover homeremover = SaltRemover(defnData=LSALT) molclean = homeremover(molcleandefault) smilesclean = Chem.MolToSmiles(molclean) # 3. SMILES remove other manual salts + fragments -> for fragment take one if exactly same compound lelem = smilesclean.split(".") if len(lelem) > 1: # reduce double, case of several salts are included - 255 lelem = list(set(lelem)) for smilesdel in LSMILESREMOVE: if smilesdel in lelem: lelem.remove(smilesdel) try: lelem.remove("") # case of bad smile except: pass if len(lelem) == 1: smilesclean = str(lelem[0]) else: # 4. 
Fragments # Case of fragment -> stock in log file, check after to control self.log = self.log + "Fragments after standardization: " + smilesclean + "\n" smilesclean = "" if smilesclean == "": self.log = self.log + "ERROR SMILES: SMILES empty after preparation\n" else: self.log = self.log + "Prepared SMI :" + str( smilesclean) + "\n" fsmiclean = open(psmiclean, "w") fsmiclean.write(smilesclean) fsmiclean.close() self.smiclean = smilesclean self.psmiclean = psmiclean
from rdkit import DataStructs from rdkit import Chem from rdkit.Chem.MolStandardize import rdMolStandardize from rdkit.Chem.rdmolops import RemoveStereochemistry from rdkit.Chem.Fingerprints import FingerprintMols from rdkit.Chem.Descriptors import HeavyAtomMolWt from rdkit.Chem.SaltRemover import SaltRemover #Putting "O" in here will unify hydrates, like morphine and morphine monohydrate (called morphine by mesh!) remover = SaltRemover(defnData='[Cl,Br,K,I,Na,O]') chems = [] n = 0 with open('smiles.txt','r') as inf, open('simpler.txt','w') as outf: for line in inf: n+=1 if n % 10000 == 0: print(n) x = line.strip().split('\t') cid = x[0] #this set of identifiers causes rdkit to segfault :( # given the number of things in the list, a better strategy than run it till it dies, and try # again is probably advisable #if cid in ['CHEBI:17627', 'CHEBI:50385','CHEBI:18140','CHEBI:38277','CHEBI:50162', # 'CHEBI:29297','CHEBI:29293','CHEBI:133488','CHEBI:30158','CHEBI:51220', ## 'CHEBI:30470','CHEBI:36301','CHEBI:38284','CHEBI:48998','CHEBI:37189', # 'CHEBI:60532','CHEBI:51221','CHEBI:29416', 'CHEBI:36163','CHEBI:29296', # 'CHEBI:51508','CHEBI:30665','CHEBI:29886','CHEBI:85715','CHEBI:49851', # 'CHEBI:30197','CHEBI:30125','CHEBI:37856','CHEBI:38283','CHEBI:10098', # 'CHEBI:132769','CHEBI:133489','CHEBI:134067','CHEBI:141330','CHEBI:15432',
def remove_water(m):
    """Return ``m`` stripped with a SaltRemover whose only definition is ``[O]``."""
    from rdkit.Chem.SaltRemover import SaltRemover

    return SaltRemover(defnData="[O]").StripMol(m)
class DrugNameConverter:
    """
    Class for converting drug names to InChI keys using PubChem API to query
    drug names and RDKit for generating InChI keys. Includes options for
    using isomeric forms and for removing salts from drugs.
    """

    # Shared remover; built once when the class is defined.
    remover = SaltRemover()

    @classmethod
    def to_inchi_keys(cls, name, isomeric=True, strip_salts=True):
        """
        Queries PubChem API for a drug with a given name and returns a set of
        corresponding InChI Keys.

        SMILES that RDKit cannot parse are skipped instead of aborting the
        whole lookup.

        Parameters:
            name (str): name of drug

        Keyword arguments:
            isomeric (bool): if True, returns InChI Keys computed from
                             isomeric SMILES otherwise, returns InChI Keys
                             computed from canonical SMILES
            strip_salts (bool): if True, computed InChI Keys using both the
                                original drug SMILES and also the SMILES
                                where all salts are removed

        Returns:
            inchi_keys (set): set of InChI Keys corresponding to the drug
                              name queried
        """
        inchi_keys = set()
        for smiles in PubChemQuery.name_to_smiles(name, isomeric=isomeric):
            mol = Chem.MolFromSmiles(smiles)
            if mol is None:
                # MolFromSmiles returns None for unparsable input.
                continue
            inchi_keys.add(Chem.MolToInchiKey(mol))
            if strip_salts:
                stripped_mol = cls.remover.StripMol(
                    mol, dontRemoveEverything=True)
                inchi_keys.add(Chem.MolToInchiKey(stripped_mol))
        return inchi_keys

    @classmethod
    def batch_to_inchi_keys_single_thread(cls, names, verbose=0, **kwargs):
        """
        Queries PubChem API for a list of drug names and returns a dictionary
        mapping each name to a set of corresponding InChI Keys.

        Parameters:
            names (list or set): drug names to query

        Keyword arguments:
            verbose (bool): print progess if True
            **kwargs: keyword arguments passed to cls.to_inchi_keys

        Returns:
            all_inchi_keys (dict): dictionary mapping each drug name to a set
                                   of corresponding InChI Keys
        """
        all_inchi_keys = {}
        names = set(names)
        for name in names:
            all_inchi_keys[name] = cls.to_inchi_keys(name, **kwargs)
            if verbose:
                print(
                    f'Completed { len(all_inchi_keys) }/{ len(names) } drugs...',
                    end='\r')
        return all_inchi_keys

    @classmethod
    def batch_to_inchi_keys(cls, names, num_cores=3, verbose=1, **kwargs):
        """
        Queries PubChem API for a list of drug names and returns a dictionary
        mapping each name to a set of corresponding InChI Keys. Uses a worker
        pool to parallelize requests.

        Parameters:
            names (list or set): drug names to query

        Keyword arguments:
            num_cores (int/None): number of workers to use; if None, uses CPU
                                  count (at least 1 and at most 12)
            verbose (bool): show status bar if True
            **kwargs: keyword arguments passed to cls.to_inchi_keys

        Returns:
            all_inchi_keys (dict): dictionary mapping each drug name to a set
                                   of corresponding InChI Keys
        """
        # initial request necessary before pooling (gives status code 400)
        requests.get('https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound')

        if num_cores is None:
            # uses at least 1 core and at most 12
            num_cores = min(max(mp.cpu_count(), 1), 12)

        names = list(set(names))
        lookup = partial(cls.to_inchi_keys, **kwargs)
        with Pool(num_cores) as p:
            if verbose:
                res = list(tqdm(p.imap(lookup, names), total=len(names)))
            else:
                res = p.map(lookup, names)
        return dict(zip(names, res))

    @staticmethod
    def invert_dict(key_to_value_set: dict) -> dict:
        """
        Converts a dictionary with keys mapping to sets of values (e.g. drug
        name to set of InChI keys) into a dictionary with the values as keys,
        mapping to sets of the former keys (e.g. InChI key to drug names).
        """
        assert (isinstance(key_to_value_set, dict))
        value_to_key_set = {}
        for key, values in key_to_value_set.items():
            assert (isinstance(values, set))
            for value in values:
                value_to_key_set.setdefault(value, set()).add(key)
        return value_to_key_set