def std(mol, returnMetals=False):
    """Standardise ``mol`` and return its components keyed by SMILES.

    Each value of the returned dictionary is a 4-tuple
    ``(molecule, is_metal, passed, error_message)``.

    * Single-atom input whose symbol is in ``_metals`` is returned as a
      metal ion when ``returnMetals`` is true; any other single-atom input
      is standardised and only added when standardisation passed.
    * Multi-atom input first has its metals split off with ``disconnect``;
      the metals are added when ``returnMetals`` is true and the remainder
      is standardised.

    :param mol: molecule object exposing ``GetNumAtoms`` / ``GetAtoms``.
    :param returnMetals: when true, metal ions are included in the result.
    :return: dict mapping SMILES to the 4-tuple described above.
    """
    stdD = {}

    # Single-atom compounds: check whether they are metal ions.
    if mol.GetNumAtoms() == 1:
        at = mol.GetAtoms()[0].GetSymbol()
        symbol = '[%s]' % at
        if at in _metals and returnMetals:
            stdD[symbol] = (Chem.MolFromSmiles(symbol), True, True, '')
        else:
            (passed, std_cmpd, errmessage) = standardise.run(mol)
            if passed:
                stdD[symbol] = (std_cmpd, False, passed, errmessage)
        return stdD

    # Extract metal ions from complex compounds.
    comp_mol, metals = disconnect(mol)
    if returnMetals:
        for metal in metals:
            metalmol = Chem.MolFromSmiles(metal)
            metal = '[%s]' % metalmol.GetAtoms()[0].GetSymbol()
            stdD[metal] = (Chem.MolFromSmiles(metal), True, True, '')

    # Standardise the rest of the molecule exactly once, inside the guard,
    # so that a failure is reported through the result instead of raising.
    std_cmpds = None
    try:
        (passed, std_cmpds, errmessage) = standardise.run(comp_mol)
    except Exception:
        passed = False
        errmessage = 'Failed'

    if passed:
        smiles = Chem.MolToSmiles(std_cmpds, isomericSmiles=True)
        stdD[smiles] = (std_cmpds, False, True, '')
    elif errmessage == 'Multiple non-salt/solvate components':
        # Several components came back: de-duplicate them by InChI
        # (the last molecule seen for a given InChI wins).
        unique_by_inchi = {Chem.MolToInchi(cmpd): cmpd for cmpd in std_cmpds}
        for cmpd in unique_by_inchi.values():
            stdD[Chem.MolToSmiles(cmpd, isomericSmiles=True)] = (
                cmpd, False, True, 'Multiple non-salt/solvate components')
    else:
        # Standardisation failed: report the original molecule and the error.
        stdD[Chem.MolToSmiles(mol, isomericSmiles=True)] = (
            mol, False, False, errmessage)
    return stdD
def process_data(type_dataset, dataset, i):
    """Standardise a dataset of SMILES and featurise the entries that pass.

    Every entry of ``dataset`` goes through ``standardise.run``; results of
    length <= 120 get a 1024-bit radius-2 Morgan fingerprint and a one-hot
    encoding from ``ohf.featurize``.  Entries that raise are reported on
    stdout and skipped.

    :param type_dataset: label used as the prefix of the generated ids.
    :param dataset: iterable of input SMILES.
    :param i: offset added to the running counter when numbering the ids.
    :return: ``(ids, fingerprints, smiles, one_hot, next_offset)`` where the
        first three are byte-string conversions of the collected lists,
        ``one_hot`` is a NumPy array and ``next_offset`` is
        ``i`` plus the number of accepted entries.
    """
    new_ids = []
    fingerprints = []
    smiles = []
    mol_oh = []
    n_smiles = 0
    for smile in dataset:
        try:
            mol = standardise.run(smile)
            if len(mol) <= 120:
                fp = AllChem.GetMorganFingerprintAsBitVect(
                    Chem.MolFromSmiles(mol), 2, nBits=1024)
                m_oh = ohf.featurize([mol], 120)
                if str(m_oh) != 'nan':
                    new_ids.append(
                        '{}_{}'.format(type_dataset, n_smiles + 1 + i))
                    fingerprints.append(
                        '[{}]'.format(','.join([str(x) for x in fp])))
                    smiles.append(mol)
                    mol_oh.append(m_oh[0])
                    n_smiles += 1
        except Exception:
            # Best effort: report the offending entry and keep going.
            # ``Exception`` (not a bare except) lets Ctrl-C stop the run.
            print('{} in {}'.format(smile, type_dataset))
    print(len(smiles))
    # ``np.string_`` was an alias of ``np.bytes_`` and was removed in
    # NumPy 2.0; ``np.bytes_`` works on both old and new releases.
    return (np.bytes_(new_ids), np.bytes_(fingerprints), np.bytes_(smiles),
            np.array(mol_oh), n_smiles + i)
def get_rdk_mol(smi, perform_standardisation=False):
    """Parse ``smi`` into a molecule, optionally standardising it.

    :param smi: SMILES string handed to ``Chem.MolFromSmiles``.
    :param perform_standardisation: when true, the parsed molecule is passed
        through ``standardise.run``; if that raises
        ``standardise.StandardiseException`` the unstandardised molecule is
        returned instead.
    :return: the (possibly standardised) molecule.
    :raises ValueError: if ``smi`` cannot be parsed.  ``ValueError`` is a
        subclass of ``Exception``, so existing ``except Exception`` callers
        keep working.
    """
    mol = Chem.MolFromSmiles(smi)
    if mol is None:
        raise ValueError('Could not parse SMILES: {!r}'.format(smi))
    if perform_standardisation:
        try:
            mol = standardise.run(mol)
        except standardise.StandardiseException:
            # Deliberate best effort: keep the unstandardised molecule.
            pass
    return mol
def get_canonical_smile(x):
    """
    Make our smiles canonical.

    :param x: smile (string)
    :return: the result of ``standardise.run(x)``, or the string ``'None'``
        when anything goes wrong while standardising
    """
    canonical = 'None'
    try:
        canonical = standardise.run(x)
    except Exception:
        # Any failure is reported through the 'None' placeholder string.
        pass
    return canonical
def calc_descriptors(row, fp_type, fp_radius, con_desc_list, stdrise=True,
                     hashed=False):
    """Compute a Morgan fingerprint and selected descriptors for one row.

    :param row: mapping with a ``'smiles'`` entry.
    :param fp_type: ``'ecfp'`` for atom-based fingerprints; any other value
        selects feature-based fingerprints (``useFeatures=True``).
    :param fp_radius: Morgan fingerprint radius.
    :param con_desc_list: names of the descriptors to compute; recognised
        names are ``'alogp'``, ``'mw'``, ``'n_h_atoms'``, ``'rtb'``,
        ``'hbd'`` and ``'hba'``.  Descriptors not listed come back as None.
    :param stdrise: when true, the molecule is standardised first.
    :param hashed: when true, use a 2048-bit hashed fingerprint and return
        ``{bit: 1}`` for the bits that are on; otherwise return the non-zero
        elements of the unhashed fingerprint.
    :return: ``(fps, alogp, mw, n_h_atoms, rtb, hbd, hba)``; a tuple of seven
        ``None`` when the SMILES cannot be parsed or standardisation fails.
    """
    failure = (None, None, None, None, None, None, None)

    rdmol = Chem.MolFromSmiles(row['smiles'])
    if not rdmol:
        return failure

    if stdrise:
        try:
            rdmol = standardise.run(rdmol, output_rules_applied=[])
        except Exception:
            # Any standardisation failure discards the row; KeyboardInterrupt
            # and SystemExit are intentionally no longer swallowed.
            return failure

    # The ecfp / non-ecfp branches differ only in this flag.
    use_features = fp_type != 'ecfp'
    if hashed:
        bitvect = AllChem.GetMorganFingerprintAsBitVect(
            rdmol, fp_radius, 2048, useFeatures=use_features)
        fps = {key: 1 for key in bitvect.GetOnBits()}
    else:
        # NB works better with binary features, removing FP feature freq
        # (useCounts=False)
        fps = AllChem.GetMorganFingerprint(
            rdmol, fp_radius, useFeatures=use_features,
            useCounts=False).GetNonzeroElements()

    alogp = Descriptors.MolLogP(rdmol) if 'alogp' in con_desc_list else None
    mw = Descriptors.MolWt(rdmol) if 'mw' in con_desc_list else None
    n_h_atoms = (Descriptors.HeavyAtomCount(rdmol)
                 if 'n_h_atoms' in con_desc_list else None)
    rtb = (Descriptors.NumRotatableBonds(rdmol)
           if 'rtb' in con_desc_list else None)
    hbd = Descriptors.NumHDonors(rdmol) if 'hbd' in con_desc_list else None
    hba = (Descriptors.NumHAcceptors(rdmol)
           if 'hba' in con_desc_list else None)
    return fps, alogp, mw, n_h_atoms, rtb, hbd, hba
def standardize(mol):
    """
    Wrapper to apply the structure normalization protocol provided by
    Francis Atkinson (EBI).

    The molecule is converted to a MolBlock and handed to
    ``standardise.run``.  If no non-salt components can be found in the
    mixture (exception name ``"no_non_salt"``), the MolBlock of the original
    mixture is returned.

    Returns a tuple containing:
        1) True/False: depending on the result of the method
        2) (if True ) The output molecule
           (if False) The error message (the exception's ``name``)
    """
    try:
        standardised = standardise.run(Chem.MolToMolBlock(mol))
    except standardise.StandardiseException as exc:
        if exc.name != "no_non_salt":
            return (False, exc.name)
        # Nothing but salts/solvates: fall back to the untouched input.
        standardised = Chem.MolToMolBlock(mol)
    return (True, standardised)
def preprocessMolecule(inp):
    """Standardise ``inp`` and keep it only if it passes two filters.

    Returns ``False`` for falsy input.  Returns ``None`` when
    standardisation raises ``standardise.StandardiseException``, when the
    standardised molecule is falsy, when it contains an excluded element, or
    when it fails the carbon / molecular-weight filter.  Otherwise returns
    the standardised molecule.
    """
    if not inp:
        return False

    def has_carbon_in_weight_window(candidate):
        # At least one carbon atom and a molecular weight within [100, 1000].
        weight = Descriptors.MolWt(candidate)
        if not 100 <= weight <= 1000:
            return False
        return any(a.GetAtomicNum() == 6 for a in candidate.GetAtoms())

    def only_allowed_elements(candidate):
        # Reject atomic numbers 2, 10, 13, 18, 21-32, 36-52 and >= 54.
        for a in candidate.GetAtoms():
            z = a.GetAtomicNum()
            excluded = (z in (2, 10, 13, 18)
                        or 21 <= z <= 32
                        or 36 <= z <= 52
                        or z >= 54)
            if excluded:
                return False
        return True

    try:
        std_mol = standardise.run(inp)
    except standardise.StandardiseException:
        return None

    if (std_mol
            and only_allowed_elements(std_mol)
            and has_carbon_in_weight_window(std_mol)):
        return std_mol
    return None
def mol_to_standardised_mol(mol, name=None):
    """Standardise mol(s).

    :param mol: molecule to standardise.
    :param name: label used in log messages; defaults to the molecule's
        ``_Name`` property, or ``repr(mol)`` when that property is missing.
    :return: the standardised molecule, of the same kind as the input
        (``PropertyMol`` in, ``PropertyMol`` out), carrying the input's
        ``_Name`` when it has one.  The unstandardised molecule is returned
        when the ``standardiser`` package cannot be imported or when
        standardisation raises ``StandardiseException``.
    """
    try:
        from standardiser import standardise
        from standardiser.utils import StandardiseException
    except ImportError:
        logging.warning(
            "standardiser module unavailable. Using unstandardised mol.")
        return mol

    if name is None:
        try:
            name = mol.GetProp("_Name")
        except KeyError:
            name = repr(mol)

    if isinstance(mol, PropertyMol):
        mol_type = PropertyMol
        mol = rdkit.Chem.Mol(mol)
    else:
        mol_type = rdkit.Chem.Mol

    logging.debug("Standardising {}".format(name))

    # Backwards-compatible with old standardiser releases, which expose
    # ``apply`` instead of ``run``.  The entry point is chosen up front so
    # that a StandardiseException raised by the legacy function is handled
    # by the same fallback below instead of escaping from an except block.
    run_standardiser = getattr(standardise, "run", None)
    if run_standardiser is None:
        run_standardiser = standardise.apply

    try:
        std_mol = run_standardiser(mol)
    except StandardiseException:
        logging.error(
            ("Standardisation of {} failed. Using unstandardised "
             "mol.".format(name)),
            exc_info=True,
        )
        return mol_type(mol)

    std_mol = mol_type(std_mol)
    try:
        std_mol.SetProp("_Name", mol.GetProp("_Name"))
    except KeyError:
        pass
    return std_mol
# NOTE(review): this excerpt starts in the middle of a script.  The first
# statements below look like the tail of a loop body whose header is outside
# the visible source -- confirm their indentation against the full file.
mols_now.append(m)
ys.append(y_now)
mol_ids.append(chembl_ids[i])
activities = ys
chembl_ids = visited
from standardiser import standardise
import logging
# Indices (into mols_now) of the molecules that cannot be standardised;
# used below to drop the matching activities and ids.
incorrect_mols = []  # to remove those that cannot be standardised
mols = []
#standardise.logger.setLevel('DEBUG')
# Standardise every collected molecule; failures are logged and remembered
# by index instead of aborting the run.
for i, m in enumerate(mols_now):
    print "Standardizing molecule: ", i
    parent = None
    try:
        parent = standardise.run(m)
        mols.append(parent)
    except standardise.StandardiseException as e:
        logging.warning(e.message)
        incorrect_mols.append(i)
# Keep only the entries whose molecule was standardised, so that these lists
# stay aligned with ``mols``.
# NOTE(review): ``i not in incorrect_mols`` scans a list for every element;
# a set would avoid the quadratic cost on large inputs.
activities = [ x for i, x in enumerate(activities) if i not in incorrect_mols ]
chembl_ids = [ x for i, x in enumerate(chembl_ids) if i not in incorrect_mols ]
#--------------------------------------------------------
# writing in .sdf format:
#--------------------------------------------------------
def normalize(self, ifile, method):
    '''
    Generates a simplified SDFile with MolBlock and an internal ID for
    further processing.

    Note that this method is applied to every molecule and that mol blocks
    in the input SDFile that do not produce a valid mol are left out of the
    output.

    Also, when 'standardize' is contained in ``method``, applies chemical
    standardization protocols, like the one provided by Francis Atkinson
    (EBI), accessible from: https://github.com/flatkinson/standardiser

    Returns a tuple:
        - on success, ``(success_list, ofile)``: a list of booleans sized by
          ``sdfu.count_mols(ifile)`` and the name of the output file
          (input name with ``_std`` inserted before the extension)
        - if the mol supplier cannot be created, ``(False, error message)``
    '''
    # One flag per molecule counted in the input; flipped to False when the
    # standardiser crashes on a molecule and it is discarded.
    success_list = [True for i in range(sdfu.count_mols(ifile))]
    if not method:
        method = ''
    LOG.info('Starting normalization...')
    try:
        suppl = Chem.SDMolSupplier(ifile)
        LOG.debug(f'mol supplier created from {ifile}')
    except Exception as e:
        LOG.error('Unable to create mol supplier with the exception: '
                  f'{e}')
        return False, 'Error at processing input file for standardizing structures'
    filename, fileext = os.path.splitext(ifile)
    ofile = filename + '_std' + fileext
    LOG.debug(f'writing standarized molecules to {ofile}')
    with open(ofile, 'w') as fo:
        # mcount counts the molecules written so far; it is NOT advanced on
        # either ``continue`` path below.
        # NOTE(review): because of that, ``success_list[mcount]`` may not
        # point at the input position of a discarded molecule once an earlier
        # molecule has been skipped -- confirm against how callers use
        # success_list.
        mcount = 0
        # merror = 0
        for m in suppl:
            # molecule not recognised by RDKit
            if m is None:
                LOG.error('Unable to process molecule'
                          f' #{mcount+1} in {ifile}')
                continue
            name = sdfu.getName(m, count=mcount, field=self.parameters['SDFile_name'], suppl=suppl)
            parent = None
            if 'standardize' in method:
                try:
                    parent = standardise.run(Chem.MolToMolBlock(m))
                except standardise.StandardiseException as e:
                    if e.name == "no_non_salt":
                        # very common warning, use parent mol and proceed
                        LOG.debug(
                            f'"No non salt error" found. Skiped standardize for mol'
                            f' #{mcount} {name}')
                        parent = Chem.MolToMolBlock(m)
                    else:
                        # serious issue, no parent was generated, use original mol
                        # NOTE(review): parent is reset to None right before
                        # the try and the failed call never assigns it, so
                        # this test looks always true here and the ``else``
                        # below appears unreachable.
                        if (parent is None):
                            LOG.error(
                                f'Critical standardize exception: {e}'
                                f' when processing mol #{mcount} {name}. Skipping normalization'
                            )
                            parent = Chem.MolToMolBlock(m)
                        # minor issue, parent was generated, show a warning and proceed
                        else:
                            LOG.info(
                                f'Standardize exception: {e}'
                                f' when processing mol #{mcount} {name}. Normalization applied'
                            )
                    #return False, e.name
                except Exception as e:
                    # this error means an execution error running standardizer
                    # the molecule is discarded and therefore the list of molecules must be updated
                    LOG.error(
                        f'Critical standardize execution exception {e}'
                        f' when processing mol #{mcount} {name}. Discarding molecule'
                    )
                    success_list[mcount] = False
                    continue
            else:
                LOG.info(f'Skipping normalization.')
                parent = Chem.MolToMolBlock(m)
            # in any case, write parent (the flameID field is currently disabled)
            fo.write(parent)
            # *** discarded method to control errors ****
            # flameID = 'fl%0.10d' % mcount
            # fo.write('> <flameID>\n'+flameID+'\n\n')
            mcount += 1
            # terminator
            fo.write('$$$$\n')
    return success_list, ofile
def flatkinsonStandardizer(mol):
    """Run ``standardise.run`` on ``mol`` and return whatever it returns.

    Exceptions raised by the standardiser propagate to the caller.
    """
    standardised = standardise.run(mol)
    return standardised
# NOTE(review): this excerpt starts in the middle of a script.  The first
# statement looks like the tail of a loop body whose header is outside the
# visible source -- confirm its indentation against the full file.
prism_zinc.extend(smile)
# NOTE(review): ``del item`` only unbinds the loop variable; the names
# ``smiles`` and ``zinc_smiles`` still reference their lists afterwards, so
# this loop presumably does not free the memory it is meant to free.
for item in [smiles, zinc_smiles]:
    del item
gc.collect()
prism_zinc = shuffle(prism_zinc)
print("Before standardiser: {}".format(len(prism_zinc)))
# Canonicalise each SMILES with RDKit, then run it through the standardiser;
# entries rejected with StandardiseException are silently dropped.
standard_smiles = []
for i in range(len(prism_zinc)):
    smile = prism_zinc[i]
    try:
        m = Chem.MolToSmiles(Chem.MolFromSmiles(smile), isomericSmiles=True, canonical=True)
        mol = standardise.run(m)
        standard_smiles.append(mol)
    except standardise.StandardiseException:
        # Only standardiser rejections are skipped; any other exception
        # raised inside the try propagates and stops the script.
        pass
print("After standardiser: {}".format(len(standard_smiles)))
del prism_zinc
gc.collect()
# Write one standardised entry per line (no trailing newline).
with open(
        '/hps/research1/icortes/acunha/data/ZINC_PRISM_SMILES/zinc_prism_smiles_processed.smi',
        "w") as f:
    f.write('\n'.join(standard_smiles))
# NOTE(review): this excerpt starts in the middle of a script.  The first
# three statements belong to a loop whose header (and the condition guarding
# ``continue``) is outside the visible source -- restore their indentation
# from the full file.
continue
chembl_help.append(list(chembl[i]))
i = i + 1
#pprint (chembl_help)
#Chembl standardize
# For every ChEMBL entry: rebuild the molecule from its InChI (first column),
# push it through a chain of best-effort normalisation steps, and store the
# InChI of the result back into the same column.
for lig in range(0, len(chembl_help)):
    #print ('Now I do this from Chembl: ' + chembl_help[lig][0])
    mol = inchi.MolFromInchi(chembl_help[lig][0], sanitize=False)
    # Each step below keeps the previous ``mol`` when it fails.
    try:
        rdmolops.RemoveStereochemistry(mol)
    except Exception:
        print("Not able to remove stereochemistry. Chembl.")
    try:
        mol = standardise.run(mol)
    except standardise.StandardiseException as e:
        logging.warn(e.message)
    try:
        mol = s.standardize(mol)
    except Exception:
        print("Not able to standardize. Chembl.")
    try:
        mol = s.tautomer_parent(mol, skip_standardize=True)
    except Exception:
        print("Not able to make tautomer parent. Chembl.")
    # Unlike the steps above, this call and the InChI conversion are not
    # guarded: an exception here stops the script.
    mol = s.stereo_parent(mol, skip_standardize=True)
    chembl_help[lig][0] = inchi.MolToInchi(mol)
#BDB preparing
bdb_help = []
def normalize(inF, outF, singleF, failedF, remove_salts=True,
              keep_nonorganic=False, verbose=False, pH=7.4):
    """Standardise a tab-separated ``CAS<TAB>SMILES`` stream into three files.

    Every input line is parsed, converted to a molecule and standardised.
    Results are written as ``CAS, Component, Original smiles, smiles`` rows:

    * ``outF``    -- standardised components with two or more heavy atoms
    * ``singleF`` -- components with fewer than two heavy atoms (including
      metal ions recovered from salt-only entries)
    * ``failedF`` -- entries that could not be parsed, converted or
      standardised, with the reason in the last column

    A summary of the counters is printed at the end.

    :param inF: iterable of input lines.
    :param outF: writable file object for the main output.
    :param singleF: writable file object for single-heavy-atom components.
    :param failedF: writable file object for rejected entries.
    :param remove_salts: accepted for compatibility; not used here.
    :param keep_nonorganic: forwarded to ``standardise.run``.
    :param verbose: accepted for compatibility; not used here.
    :param pH: accepted for compatibility; the protonation step it was meant
        for is disabled.
    """
    count = 0        # count for the whole dataset
    count_inc = 0    # count for only included molecules
    count_exc = 0    # count for only excluded molecules
    all_salts = 0    # count for entries with only salts / solvent
    fail_sanity = 0  # count for entries that fail sanity check
    fail_mol = 0     # count for entries that fail to create mol object
    fail_prot = 0    # count for entries that fail protonation (step disabled)

    header = '%s\n' % ('\t'.join(['CAS', 'Component', 'Original smiles', 'smiles']))
    fail_header = '%s\n' % ('\t'.join(['CAS', 'Original smiles', 'Error']))
    outF.write(header)
    singleF.write(header)
    failedF.write(fail_header)

    for line in inF:
        count += 1
        try:
            cas, smi = line.rstrip().split('\t')
        except ValueError:
            # Wrong number of tab-separated fields.
            print('Failed parsing line:')
            print(line)
            failedF.write(line.rstrip() + '\tFailed parsing line\n')
            continue

        mol = Chem.MolFromSmiles(smi)
        if mol is None:
            count_exc += 1
            fail_mol += 1
            failedF.write(line.rstrip() + '\tFailed to create molecule object\n')
            continue

        try:
            succ, mol, err = standardise.run(mol, keep_nonorganic=keep_nonorganic)
        except Exception as exc:
            count_exc += 1
            fail_sanity += 1
            failedF.write('{}\t{}\t{}\n'.format(cas, smi, exc))
            continue

        i = 1  # component number within this entry
        if succ:
            count_inc += 1
            target = singleF if mol.GetNumHeavyAtoms() < 2 else outF
            target.write('{}\t{}\t{}\t{}\n'.format(
                cas, i, smi, Chem.MolToSmiles(mol, isomericSmiles=True)))
            continue

        # On failure ``mol`` holds the individual components; de-duplicate
        # them through their SMILES.
        smis = {Chem.MolToSmiles(moli, isomericSmiles=True) for moli in mol}
        if err == 'Multiple non-salt/solvate components':
            for smii in smis:
                moli = Chem.MolFromSmiles(smii)
                target = singleF if moli.GetNumHeavyAtoms() < 2 else outF
                target.write('{}\t{}\t{}\t{}\n'.format(cas, i, smi, smii))
                i += 1
            count_inc += 1
        elif err == 'No non-salt/solvate components':
            # Salt-only entry: keep it only if it contains a metal ion.
            metal = False
            for smii in smis:
                moli = Chem.MolFromSmiles(smii)
                nHA = moli.GetNumHeavyAtoms()
                if nHA == 1 and moli.GetAtomWithIdx(0).GetSymbol() in _metals:
                    singleF.write('{}\t{}\t{}\t{}\n'.format(cas, i, smi, smii))
                    metal = True
                    i += 1
            if metal:
                count_inc += 1
            else:
                count_exc += 1
                all_salts += 1
                failedF.write('{}\t{}\t{}\n'.format(cas, smi, err))

    # Remove the scratch files (previously ``rm`` through a shell); a
    # missing file is not an error.
    for leftover in ('in.sdf', 'out.sdf'):
        try:
            os.remove(leftover)
        except OSError:
            pass

    print('the full dataset = {}'.format(count))
    print('Molecules normalized = {}'.format(count_inc))
    print('Molecules excluded = {}'.format(count_exc))
    print(' Fail RDkit mol object = {}'.format(fail_mol))
    print(' Fail protonation = {}'.format(fail_prot))
    print(' Fail sanity check = {}'.format(fail_sanity))
    print(' Only salts / solvent = {}'.format(all_salts))
def main():
    """Standardise every compound of an SDF or tab-separated SMILES file.

    The input file is taken from the command line.  ``.sdf`` input is written
    to ``standardised.sdf`` / ``errors.sdf``; anything else is read as
    tab-separated ``SMILES<TAB>name`` rows and written to
    ``standardised.smi`` / ``errors.smi``.  With ``-r`` the names of the
    rules applied to each compound are added to the output.  Progress and
    final counts are reported through the logger.
    """
    ######
    # Options, arguments and logging...

    argparser = argparse.ArgumentParser(description="Standardise compounds")
    argparser.add_argument("-V", "--verbose", action="store_true",
                           help="enable verbose logger")
    argparser.add_argument("-r", "--output_rules_applied", action="store_true",
                           help="enable output of rules applied")
    argparser.add_argument("infile", help="Input file (SDF or SMILES)")
    config = argparser.parse_args()

    logger = make_logger.run(__name__)

    ######
    # Initialisation...

    rule_names = ["{:02d} {}".format(x['n'], x['name'])
                  for x in standardise.rules.rule_set]
    counts = Counter({x: 0 for x in list(errors.keys()) + ['read', 'standardised']})
    input_type = os.path.splitext(config.infile)[1]  # sdf or smi

    ######

    logger.info("Input type = '{in_type}'".format(in_type=input_type))

    if input_type == ".sdf":
        # Read/write SDF...
        # ``with`` guarantees the three files are flushed and closed.
        with open(config.infile) as src, \
                open("standardised.sdf", "w") as outfile, \
                open("errors.sdf", "w") as errfile:
            for original in SDF.readFile(src):
                counts["read"] += 1
                logger.info(">>> Starting mol '{name}'...".format(name=original.name))
                try:
                    if config.output_rules_applied:
                        rules_applied = []
                        parent = standardise.run(original.molblock,
                                                 output_rules_applied=rules_applied)
                    else:
                        parent = standardise.run(original.molblock)
                except standardise.StandardiseException as err:
                    logger.warning(">>> {error} for '{name}'".format(
                        error=errors[err.name], name=original.name))
                    counts[err.name] += 1
                    errfile.write("{mol}> <n>\n{nread}\n\n> <error>\n{error}\n\n$$$$\n".format(
                        mol=original.molblock, nread=counts["read"],
                        error=errors[err.name]))
                else:
                    logger.info("Mol '{name}' OK".format(name=original.name))
                    counts["standardised"] += 1
                    # Restore the original name on the first line of the
                    # molblock.  A callable replacement keeps backslashes in
                    # the name from being read as regex escapes.
                    parent = re.sub(r'^\w*\n',
                                    lambda _match: original.name + '\n',
                                    parent)
                    if config.output_rules_applied:
                        rules_applied = ';'.join(
                            rule_names[x - 1] for x in rules_applied) if rules_applied else ''
                        # SDF data headers must start with '>'; the
                        # rules_applied header previously lacked it.
                        outfile.write("{mol}> <n>\n{nread}\n\n> <rules_applied>\n{rules}\n\n$$$$\n".format(
                            mol=parent, nread=counts["read"], rules=rules_applied))
                    else:
                        outfile.write("{mol}> <n>\n{nread}\n\n$$$$\n".format(
                            mol=parent, nread=counts["read"]))
                if counts["read"] % 100 == 0:
                    logger.info("...done: {read} read, {standardised} OK...".format(**counts))
    else:
        # Read/write (tab-separated) SMILES + name...
        with open(config.infile) as src, \
                open("standardised.smi", "w") as out_handle, \
                open("errors.smi", "w") as err_handle:
            infile = csv.reader(src, delimiter="\t")
            outfile = csv.writer(out_handle, delimiter="\t")
            errfile = csv.writer(err_handle, delimiter="\t")
            for original in infile:
                counts["read"] += 1
                smiles, name = original
                logger.info(">>> Starting mol '{name}'...".format(name=name))
                try:
                    if config.output_rules_applied:
                        rules_applied = []
                        parent = standardise.run(smiles,
                                                 output_rules_applied=rules_applied)
                    else:
                        parent = standardise.run(smiles)
                except standardise.StandardiseException as err:
                    logger.warning(">>> {error} for mol '{name}'".format(
                        error=errors[err.name], name=name))
                    counts[err.name] += 1
                    errfile.writerow(original + [err.name])
                else:
                    logger.info("Mol '{name}' OK".format(name=name))
                    counts["standardised"] += 1
                    if config.output_rules_applied:
                        rules_applied = ';'.join(
                            rule_names[x - 1] for x in rules_applied) if rules_applied else ''
                        outfile.writerow([parent, name, smiles, rules_applied])
                    else:
                        outfile.writerow([parent, name])
                if counts["read"] % 100 == 0:
                    logger.info("...done: {read} read, {standardised} OK...".format(**counts))

    logger.info("Finished: {read} read, {standardised} OK in total.".format(**counts))
    logger.info("Counts: " + json.dumps(counts, indent=4))
def main():
    """Standardise every compound of an SDF or tab-separated SMILES file.

    ``-i`` names the input file and ``-o`` the output file.  Rejected
    compounds go to ``<input basename>_errors<input extension>``.  ``.sdf``
    input is processed as SDF; anything else is read as tab-separated
    ``SMILES<TAB>name`` rows.  With ``-r`` the names of the rules applied to
    each compound are added to the output.  Progress and final counts are
    reported through the logger.
    """
    ######
    # Options, arguments and logging...

    argparser = argparse.ArgumentParser(description="Standardise compounds")
    argparser.add_argument("-V", "--verbose", action="store_true",
                           help="enable verbose logger")
    argparser.add_argument("-r", "--output_rules_applied", action="store_true",
                           help="enable output of rules applied")
    # Both paths are needed below; without ``required`` a missing option
    # surfaced as an opaque TypeError instead of a usage message.
    argparser.add_argument("-i", dest="infile", required=True,
                           help="Input file (SDF or SMILES)")
    argparser.add_argument("-o", dest="outfile", required=True,
                           help="Output file")
    config = argparser.parse_args()

    logger = make_logger.run(__name__)

    ######
    # Initialisation...

    rule_names = [
        "{:02d} {}".format(x['n'], x['name'])
        for x in standardise.rules.rule_set
    ]
    counts = Counter(
        {x: 0 for x in list(errors.keys()) + ['read', 'standardised']})
    outfile_basename, outfile_ext = os.path.splitext(config.infile)
    input_type = outfile_ext  # sdf or smi
    # ``splitext`` keeps the leading dot in the extension, so no extra dot is
    # added here (the name used to come out as ``<base>_errors..<ext>``).
    errfile_name = outfile_basename + "_errors" + outfile_ext

    ######

    logger.info("Input type = '{in_type}'".format(in_type=input_type))

    if input_type == ".sdf":
        # Read/write SDF...
        # ``with`` guarantees the three files are flushed and closed.
        with open(config.infile) as src, \
                open(config.outfile, "w") as outfile, \
                open(errfile_name, "w") as errfile:
            for original in SDF.readFile(src):
                counts["read"] += 1
                logger.info(
                    ">>> Starting mol '{name}'...".format(name=original.name))
                try:
                    if config.output_rules_applied:
                        rules_applied = []
                        parent = standardise.run(
                            original.molblock,
                            output_rules_applied=rules_applied)
                    else:
                        parent = standardise.run(original.molblock)
                except standardise.StandardiseException as err:
                    logger.warning(">>> {error} for '{name}'".format(
                        error=errors[err.name], name=original.name))
                    counts[err.name] += 1
                    errfile.write(
                        "{mol}> <n>\n{nread}\n\n> <error>\n{error}\n\n$$$$\n".
                        format(mol=original.molblock,
                               nread=counts["read"],
                               error=errors[err.name]))
                else:
                    logger.info("Mol '{name}' OK".format(name=original.name))
                    counts["standardised"] += 1
                    # Restore the original name on the first line of the
                    # molblock.  A callable replacement keeps backslashes in
                    # the name from being read as regex escapes.
                    parent = re.sub(r'^\w*\n',
                                    lambda _match: original.name + '\n',
                                    parent)
                    if config.output_rules_applied:
                        rules_applied = ';'.join(
                            rule_names[x - 1]
                            for x in rules_applied) if rules_applied else ''
                        # SDF data headers must start with '>'; the
                        # rules_applied header previously lacked it.
                        outfile.write(
                            "{mol}> <n>\n{nread}\n\n> <rules_applied>\n{rules}\n\n$$$$\n"
                            .format(mol=parent,
                                    nread=counts["read"],
                                    rules=rules_applied))
                    else:
                        outfile.write("{mol}> <n>\n{nread}\n\n$$$$\n".format(
                            mol=parent, nread=counts["read"]))
                if counts["read"] % 100 == 0:
                    logger.info(
                        "...done: {read} read, {standardised} OK...".format(
                            **counts))
    else:
        # Read/write (tab-separated) SMILES + name...
        with open(config.infile) as src, \
                open(config.outfile, "w") as out_handle, \
                open(errfile_name, "w") as err_handle:
            infile = csv.reader(src, delimiter="\t")
            outfile = csv.writer(out_handle, delimiter="\t")
            errfile = csv.writer(err_handle, delimiter="\t")
            for original in infile:
                counts["read"] += 1
                smiles, name = original
                logger.info(">>> Starting mol '{name}'...".format(name=name))
                try:
                    if config.output_rules_applied:
                        rules_applied = []
                        parent = standardise.run(
                            smiles, output_rules_applied=rules_applied)
                    else:
                        parent = standardise.run(smiles)
                except standardise.StandardiseException as err:
                    logger.warning(">>> {error} for mol '{name}'".format(
                        error=errors[err.name], name=name))
                    counts[err.name] += 1
                    errfile.writerow(original + [err.name])
                else:
                    logger.info("Mol '{name}' OK".format(name=name))
                    counts["standardised"] += 1
                    if config.output_rules_applied:
                        rules_applied = ';'.join(
                            rule_names[x - 1]
                            for x in rules_applied) if rules_applied else ''
                        outfile.writerow([parent, name, smiles, rules_applied])
                    else:
                        outfile.writerow([parent, name])
                if counts["read"] % 100 == 0:
                    logger.info(
                        "...done: {read} read, {standardised} OK...".format(
                            **counts))

    logger.info(
        "Finished: {read} read, {standardised} OK in total.".format(**counts))
    logger.info("Counts: " + json.dumps(counts, indent=4))