def clean_smiles(smiles_df):
    """
    Helper function which runs the standardization tool on a list of smiles
    strings.

    Each value of the "smiles" column is parsed with ``Chem.MolFromSmiles``,
    reduced with ``Standardizer.fragment_parent`` (standardization is not
    skipped) and written back as a SMILES string. Rows for which any of these
    steps raises are reported on stdout and removed.

    Args:
        smiles_df: DataFrame which contains smiles strings in a column named
            "smiles". The frame is modified in place.

    Returns:
        The original DataFrame, but with the smiles strings in the "smiles"
        column standardized and any rows which contained problematic smiles
        removed
    """
    standard = Standardizer(prefer_organic=True)
    failed = []
    # Walk a snapshot of the column: the frame is written to inside the loop
    # and must not be resized while it is being iterated.
    for index, smiles in list(smiles_df['smiles'].items()):
        try:
            mol = Chem.MolFromSmiles(smiles)
            std_mol = standard.fragment_parent(mol, skip_standardize=False)
            std_smiles = Chem.MolToSmiles(std_mol)
        except Exception:
            print("Error cleaning " + str(index) + " " + str(smiles))
            print(smiles_df.loc[index])
            failed.append(index)
        else:
            # Single-step label assignment; chained indexing
            # (df['smiles'][index] = ...) may write to a temporary copy.
            smiles_df.at[index, 'smiles'] = std_smiles
    if failed:
        # Drop all problematic rows at once, after iteration has finished.
        smiles_df.drop(failed, inplace=True)
    return smiles_df
def filter_salts(in_lines, Verbose=False):
    """Standardize structures and remove salts.

    This should be called before any other filters having to do with
    molecular structures as it affects both the molecular structure and the
    molecular weight of many compounds that come out of ChEMBL.

    Every SMILES in the ``canonical_smiles`` column is parsed, standardized
    and passed through a ``SaltRemover`` configured from
    ``conf_dir + '/Salts.txt'``. Rows whose SMILES still contains a ``.``
    afterwards are removed; all other rows get the non-isomeric desalted
    SMILES.

    Args:
        in_lines: DataFrame with a ``canonical_smiles`` column. It is not
            modified; the result is built on a copy.
        Verbose: if true, print the number of remaining compounds.

    Returns:
        A new DataFrame with a fresh 0..n-1 index.
    """
    s = Standardizer()
    salt_file = conf_dir + '/Salts.txt'
    remover = SaltRemover.SaltRemover(defnFilename=salt_file)

    # Work on a copy so the caller's frame is never left half-updated.
    out_lines = in_lines.copy()
    multi_fragment = []
    # Iterate over the actual index labels instead of range(len(...)), which
    # is only correct when the frame happens to carry a default RangeIndex.
    for label, smiles_in in list(out_lines['canonical_smiles'].items()):
        mol_in = Chem.MolFromSmiles(smiles_in)
        mol_out = s.standardize(mol_in)
        smiles_out = Chem.MolToSmiles(remover(mol_out), isomericSmiles=False)
        if '.' in smiles_out:
            multi_fragment.append(label)
        else:
            # Assign by label; Series.replace would instead substitute every
            # value equal to the label, which is not what is wanted here.
            out_lines.at[label, 'canonical_smiles'] = smiles_out

    # One drop at the end instead of copying the frame once per removed row.
    out_lines = out_lines.drop(multi_fragment)
    if Verbose:
        print('Number of compounds after desalting pass: ', len(out_lines))
    return out_lines.reset_index(drop=True)
def testFragmentLong(self):
    """Check ``Standardizer.fragment_parent`` on the ``dataPCS_fragment100k`` data.

    Skipped unless the module-level ``doLong`` flag is true. A record whose
    processing raises is reported with its line number and SMILES; otherwise
    the resulting SMILES must equal the record's expected value.
    """
    if not doLong:
        raise unittest.SkipTest('long test')
    for data in self.readPCSdata(self.dataPCS_fragment100k):
        try:
            s = Standardizer()
            frag = s.fragment_parent(data.mol)
            ns = Chem.MolToSmiles(frag)
        except Exception as exc:
            # Chain the original error so the report shows why the record
            # failed, not only which record it was.
            raise AssertionError(
                f'Line {data.lineNo}: MolVS normalization failed for SMILES {data.smiles}'
            ) from exc
        self.assertEqual(ns, data.expected)
def testMetalLong(self):
    """Check ``Standardizer.disconnect_metals`` on the ``dataPCS_metal100k`` data.

    Skipped unless the module-level ``doLong`` flag is true. A record whose
    processing raises is reported with its line number and SMILES; otherwise
    the resulting SMILES must equal the record's expected value.
    """
    if not doLong:
        raise unittest.SkipTest('long test')
    for data in self.readPCSdata(self.dataPCS_metal100k):
        try:
            n = Standardizer()
            nm = n.disconnect_metals(data.mol)
            ns = Chem.MolToSmiles(nm)
        except Exception as exc:
            # Chain the original error so the report shows why the record
            # failed, not only which record it was.
            raise AssertionError(
                f'Line {data.lineNo}: MolVS normalization failed for SMILES {data.smiles}'
            ) from exc
        self.assertEqual(ns, data.expected)
def testMetalLong(self):
    """Run ``disconnect_metals`` over the ``dataPCS_metal100k`` records.

    Raises ``unittest.SkipTest`` when ``doLong`` is false. Any exception
    while processing a record becomes an ``AssertionError`` naming the
    record; otherwise the produced SMILES is compared with ``expected``.
    """
    if not doLong:
        raise unittest.SkipTest('long test')
    failure = 'Line {0.lineNo}: MolVS normalization failed for SMILES {0.smiles}'
    for record in self.readPCSdata(self.dataPCS_metal100k):
        try:
            disconnected = Standardizer().disconnect_metals(record.mol)
            smiles = Chem.MolToSmiles(disconnected)
        except Exception:
            raise AssertionError(failure.format(record))
        self.assertEqual(smiles, record.expected)
def testNormalizeLong(self):
    """Run ``normalize`` over the ``dataPCS_nomralized100k`` records.

    Raises ``unittest.SkipTest`` when ``doLong`` is false. Any exception
    while processing a record becomes an ``AssertionError`` naming the
    record; otherwise the produced SMILES is compared with ``expected``.
    """
    if not doLong:
        raise unittest.SkipTest('long test')
    failure = 'Line {0.lineNo}: MolVS normalization failed for SMILES {0.smiles}'
    for record in self.readPCSdata(self.dataPCS_nomralized100k):
        try:
            normalized = Standardizer().normalize(record.mol)
            smiles = Chem.MolToSmiles(normalized)
        except Exception:
            raise AssertionError(failure.format(record))
        self.assertEqual(smiles, record.expected)
def testFragmentLong(self):
    """Run ``fragment_parent`` over the ``dataPCS_fragment100k`` records.

    Raises ``unittest.SkipTest`` when ``doLong`` is false. Any exception
    while processing a record becomes an ``AssertionError`` naming the
    record; otherwise the produced SMILES is compared with ``expected``.
    """
    if not doLong:
        raise unittest.SkipTest('long test')
    failure = 'Line {0.lineNo}: MolVS normalization failed for SMILES {0.smiles}'
    for record in self.readPCSdata(self.dataPCS_fragment100k):
        try:
            parent = Standardizer().fragment_parent(record.mol)
            smiles = Chem.MolToSmiles(parent)
        except Exception:
            raise AssertionError(failure.format(record))
        self.assertEqual(smiles, record.expected)
def split_data(mols, acts, test_percent, split):
    """Split molecules and activities into training and test sets.

    ``train_test_split`` decides which molecules are used for training; the
    entries are then distributed in their original input order, so both
    returned sets keep the ordering of ``mols``/``acts``.

    Args:
        mols: sequence of ``(molecule, name)`` entries.
        acts: sequence of ``(activity, name)`` entries, parallel to ``mols``.
        test_percent: passed to ``train_test_split`` as ``test_size``.
        split: passed to ``train_test_split`` as ``random_state``.

    Returns:
        Tuple ``(standard_mols_train, molnames_train, acts_train,
        standard_mols_test, molnames_test, acts_test)`` where the molecules
        have been passed through ``Standardizer.standardize``.

    Raises:
        AssertionError: if molecule and activity names do not line up.
    """
    # Only the training half of the molecule split is needed: everything
    # that is not a training molecule is treated as test data below.
    m_train, _m_test, _a_train, _a_test = train_test_split(
        mols, acts, test_size=test_percent, random_state=split)

    # A set gives O(1) membership tests; scanning a list for every molecule
    # made the partitioning quadratic.
    # Assumes molecule names are hashable (e.g. strings).
    names_train = {entry[1] for entry in m_train}

    mols_train, molnames_train, acts_train, actnames_train = [], [], [], []
    mols_test, molnames_test, acts_test, actnames_test = [], [], [], []

    for mol_entry, act_entry in zip(mols, acts):
        if mol_entry[1] in names_train:
            mols_train.append(mol_entry[0])
            molnames_train.append(mol_entry[1])
            acts_train.append(act_entry[0])
            actnames_train.append(act_entry[1])
        else:
            # the molecule is in the test set if it isn't in the training set
            mols_test.append(mol_entry[0])
            molnames_test.append(mol_entry[1])
            acts_test.append(act_entry[0])
            actnames_test.append(act_entry[1])

    assert molnames_train == actnames_train
    assert molnames_test == actnames_test

    # Standardize structures of the training set and test set
    s = Standardizer()
    standard_mols_train = [s.standardize(mol) for mol in mols_train]
    standard_mols_test = [s.standardize(mol) for mol in mols_test]

    return (standard_mols_train, molnames_train, acts_train,
            standard_mols_test, molnames_test, acts_test)
def standardize_mol(mol_file):
    """Standardize the molecule stored in a MOL file and overwrite the file.

    Args:
        mol_file: path (``str`` or ``Path``) of the MOL file to rewrite.

    Raises:
        RuntimeError: if ``mol_file`` does not exist.
    """
    if not Path(mol_file).exists():
        raise RuntimeError('File does not exist.')

    # Chem.MolFromMolFile() only works with string, not Path object
    mol_file = str(mol_file)
    mol = Chem.MolFromMolFile(mol_file)
    smol = Standardizer().standardize(mol)

    # Serialize before opening the file: open(..., 'w') truncates it, so a
    # failure in MolToMolBlock would otherwise leave an empty file behind.
    mol_block = Chem.MolToMolBlock(smol)
    with open(mol_file, 'w') as f:
        f.write(mol_block)
def prepSMI(SMIin, defnFilename, removeMetal=1):
    """Standardize a SMILES, strip salts and reduce it to a single component.

    Args:
        SMIin: input SMILES string.
        defnFilename: salt definition file for ``SaltRemover``; an empty
            string selects the remover's defaults.
        removeMetal: when equal to 1, components for which
            ``is_metalorion`` does not return 0 are discarded.

    Returns:
        The cleaned SMILES when exactly one component remains, otherwise a
        message starting with ``"Error:"`` describing why none was chosen.
    """
    mol = Chem.MolFromSmiles(SMIin)
    s = Standardizer()
    try:
        molstandardized = s.standardize(mol)
        # Result intentionally unused: the call stays inside the try block so
        # a molecule that cannot be written as SMILES is still reported as a
        # standardization failure.
        Chem.MolToSmiles(molstandardized)
    except Exception:
        return "Error: Standardization Fail"

    # remove salt
    # 1. default
    if defnFilename != "":
        remover = SaltRemover(defnFilename=defnFilename)
    else:
        remover = SaltRemover()
    molclean = remover(molstandardized)
    smilesclean = Chem.MolToSmiles(molclean)

    # 2. SMILES remove other manual salts + fragments -> for fragment take
    # one if exactly same compound. The set collapses repeated components
    # (several copies of the same salt); empty components are dropped.
    components = set(smilesclean.split("."))
    components.discard("")
    lelem = list(components)

    # remove metal
    if removeMetal == 1:
        lelem = [elem for elem in lelem if is_metalorion(elem) == 0]

    if len(lelem) == 1:
        return str(lelem[0])
    if len(lelem) > 1:
        return "Error: Mixture or fragment ot check: " + smilesclean
    if smilesclean == "":
        return "Error: SMILES empty after preparation"
    return "Error: No identified"
def standardizeSMILES(smiIn):
    """Standardize a SMILES string and strip salts from it.

    The molecule is normalized through ``timeFunction(normalize, mol)``,
    desalted with the default ``SaltRemover`` and with one built from
    ``LSALT``, and finally reduced to a single component after removing the
    entries of ``LSMILESREMOVE``. Progress and errors are printed.

    Args:
        smiIn: input SMILES string.

    Returns:
        The prepared SMILES string, ``1`` when nothing (or more than one
        component) is left after preparation, or ``None`` when the
        normalization step itself failed.
    """
    mol = Chem.MolFromSmiles(smiIn)
    try:
        out = timeFunction(normalize, mol)
    except Exception:
        print("Normalize SMILES: ERROR INPUT SMI")
        return None
    if out == "ERROR":
        print("Normalize SMILES: ERROR DURING THE PROCESS")
        return None
    molstandardized = out

    smilestandadized = Chem.MolToSmiles(molstandardized)

    # remove salt
    # 1. default
    remover = SaltRemover()
    mol = Chem.MolFromSmiles(smilestandadized)
    molcleandefault = remover(mol)
    # 2. Personal remover
    homeremover = SaltRemover(defnData=LSALT)
    molclean = homeremover(molcleandefault)
    smilesclean = Chem.MolToSmiles(molclean)

    # 3. SMILES remove other manual salts + fragments -> for fragment take
    # one if exactly same compound
    lelem = smilesclean.split(".")
    if len(lelem) > 1:
        # reduce double, case of several salts are included - 255
        lelem = list(set(lelem))
        for smilesdel in LSMILESREMOVE:
            if smilesdel in lelem:
                lelem.remove(smilesdel)
        if "" in lelem:  # case of bad smile
            lelem.remove("")
        if len(lelem) == 1:
            smilesclean = str(lelem[0])
        else:
            # 4. Fragments
            # Case of fragment -> stock in log file, check after to control
            print("Fragments after standardization: " + smilesclean + "\n")
            smilesclean = ""

    if smilesclean == "":
        print("SMILES empty after preparation\n")
        return 1
    print("Prepared SMI :" + str(smilesclean) + "\n")
    return smilesclean
def clean_smiles(smi):
    """
    Helper function which runs the standardization tool on the input smiles
    string

    Args:
        smi: Input smiles string

    Returns:
        The standardized version of the input smiles string, or None (after
        printing a message) when any step of the clean-up raises
    """
    s = Standardizer(prefer_organic=True)
    try:
        mol = Chem.MolFromSmiles(smi)
        std_mol = s.fragment_parent(mol, skip_standardize=False)
        return Chem.MolToSmiles(std_mol)
    except Exception:
        # Exception rather than a bare except, so KeyboardInterrupt and
        # SystemExit are not swallowed while cleaning a long list.
        print("Issue with input smiles string. Unable to clean " + str(smi))
        return None
def Tautomerize(mol): try: if mol.GetBoolProp('tautomerized'): return except KeyError: pass smi1 = Chem.MolToSmiles(mol) from molvs import Standardizer s = Standardizer() try: s.standardize(mol) except ValueError as e: MutateFail(mol) return False #from molvs.tautomer import TautomerCanonicalizer #t = TautomerCanonicalizer() #t.canonicalize(mol) mol.SetBoolProp('tautomerized', True) smi2 = Chem.MolToSmiles(mol) if not smi1 == smi2: print "tautomerized:", smi1, 'to:', smi2 return True
def sanitize_smiles_molvs(smiles, largest_fragment=False):
    """Sanitize a SMILES with MolVS

    Parameters
    ----------
    smiles : str
        SMILES string for a molecule.
    largest_fragment : bool
        Whether to select only the largest covalent unit in a molecule with
        multiple fragments. Default to False.

    Returns
    -------
    str
        SMILES string for the sanitized molecule. The input is returned
        unchanged when RDKit cannot parse it; if a clean-up stage raises,
        the molecule from the last successful stage is written out.
    """
    cleaner = Standardizer()
    cleaner.prefer_organic = True

    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return smiles

    # standardize: functional group representations
    # largest_fragment: remove product counterions/salts/etc.
    # uncharge: neutralize, e.g., carboxylic acids
    stage_names = ['standardize', 'uncharge']
    if largest_fragment:
        stage_names.insert(1, 'largest_fragment')

    try:
        for stage_name in stage_names:
            mol = getattr(cleaner, stage_name)(mol)
    except Exception:
        # Best effort: keep whatever the earlier stages produced.
        pass
    return Chem.MolToSmiles(mol)
def Tautomerize(mol, aromatic=aromaticity):
    """Standardize ``mol`` with MolVS and return the resulting molecule.

    A molecule that already carries a true 'tautomerized' property is
    returned untouched. Otherwise it is sanitized, standardized and the
    SMILES before and after are compared:

    * identical -> ``mol`` itself is flagged and returned, so the
      properties it carries are kept;
    * different -> the pair is appended to 'tautomerized.smi', a
      'failedfilter' property is copied over when present, and the new
      molecule is flagged and returned.

    Raises ``MutateFail`` when the standardizer raises ``ValueError``.
    """
    try:
        done = mol.GetBoolProp('tautomerized')
    except KeyError:
        done = False
    if done:
        return mol

    Chem.SanitizeMol(mol)
    if not aromatic and not aromaticity:
        Chem.Kekulize(mol, True)
    smiles_before = Chem.MolToSmiles(mol)

    from molvs import Standardizer
    standardizer = Standardizer()
    try:
        candidate = standardizer.standardize(mol)
    except ValueError:
        raise MutateFail(mol)
    if not aromatic:
        Chem.Kekulize(candidate, True)
    smiles_after = Chem.MolToSmiles(candidate)

    if smiles_before == smiles_after:
        # Nothing changed: hand back the original, which still has its props.
        mol.SetBoolProp('tautomerized', True)
        return mol

    # The new molecule does not inherit properties; carry over the one needed.
    if mol.HasProp('failedfilter'):
        candidate.SetProp('failedfilter', mol.GetProp('failedfilter'))
    with open('tautomerized.smi', 'a') as log:
        log.write("{} {}\n".format(smiles_before, smiles_after))
    candidate.SetBoolProp('tautomerized', True)
    return candidate
def standardizeMolVS(inMol):
    """Pass ``inMol`` through a fixed sequence of MolVS steps.

    Order of the steps: largest fragment choice, uncharging,
    standardization, normalization, tautomer canonicalization. The result
    of the last step is returned.
    """
    outMol = fragment.LargestFragmentChooser().choose(inMol)
    outMol = charge.Uncharger().uncharge(outMol)
    outMol = Standardizer().standardize(outMol)
    outMol = normalize.Normalizer().normalize(outMol)
    outMol = tautomer.TautomerCanonicalizer().canonicalize(outMol)
    # A round trip through InChI (MolToInchi / MolFromInchi) used to be
    # sketched here as an extra step; it is not performed.
    return outMol
def testNormalizeShort(self):
    """Compare ``normalize`` output with the expected SMILES of each record in ``dataPCS_nomralized1k``."""
    for record in self.readPCSdata(self.dataPCS_nomralized1k):
        normalized = Standardizer().normalize(record.mol)
        self.assertEqual(Chem.MolToSmiles(normalized), record.expected)
def standardize_main(args):
    """Read a molecule as described by ``args``, standardize it and write it back out."""
    molecule = _read_mol(args)
    standardized = Standardizer().standardize(molecule)
    _write_mol(standardized, args)
from molvs import Standardizer
from pprint import pprint
import psycopg2
import sys
import re
from io import StringIO
import logging

chem.WrapLogs()
#sio = sys.stderr = StringIO()
#reload(logging)

# Everything from DEBUG upwards goes to logging.log.
logging.basicConfig(
    filename='logging.log',
    level=logging.DEBUG,
    format="[%(asctime)s %(levelname)-8s] %(message)s",
    datefmt="%Y/%b/%d %H:%M:%S",
)

from standardiser import standardise
s = Standardizer()

# Only let RDKit report errors.
from rdkit import RDLogger
lg = RDLogger.logger()
lg.setLevel(RDLogger.ERROR)

dictionary = ["C", "H", "O", "N", "S", "P", "F", "Cl", "Br", "I"]

#CHEMBL preparing
chembl_new = []
chembl_help = []
conn2 = psycopg2.connect('dbname=chembl user=data host=/tmp/')
curs2 = conn2.cursor()
# The adjacent literals below concatenate to one single-line SQL statement.
# NOTE(review): standard_inchi is selected twice -- confirm this is intended.
curs2.execute(
    "select standard_inchi, standard_inchi from "
    "(((((target_dictionary T "
    "INNER JOIN target_components TC on T.tid = TC.tid) "
    "INNER JOIN component_sequences CSEQ on TC.component_id = CSEQ.component_id) "
    "INNER JOIN assays A on T.tid = A.tid) "
    "INNER JOIN activities AC on A.assay_ID = AC.assay_ID) "
    "INNER JOIN docs D on AC.doc_id = D.doc_id) "
    "INNER JOIN compound_structures CS on AC.molregno = CS.molregno "
    "where CSEQ.accession IN ("
    "'P10827', 'P10828', 'P10276', 'P10826', 'P13631', 'Q07869', "
    "'Q03181', 'P37231', 'P20393', 'Q14995', 'P35398', 'Q92753', "
    "'P51449', 'P55055', 'Q13133', 'Q96RI1', 'P11473', 'O75469', "
    "'Q14994', 'P41235', 'Q14541', 'P19793', 'P28702', 'P48443', "
    "'P13056', 'P49116', 'Q9Y466', 'Q9Y5X4', 'P10589', 'P24468', "
    "'P10588', 'P03372', 'Q92731', 'P11474', 'O95718', 'P62508', "
    "'P04150', 'P08235', 'P06401', 'P10275', 'P22736', 'P43354', "
    "'Q92570', 'Q13285', 'O00482', 'Q15406', 'P51843', 'Q15466');"
)
def testFragmentShort(self):
    """Compare ``fragment_parent`` output with the expected SMILES of each record in ``dataPCS_fragmnet1k``."""
    for record in self.readPCSdata(self.dataPCS_fragmnet1k):
        parent = Standardizer().fragment_parent(record.mol)
        self.assertEqual(Chem.MolToSmiles(parent), record.expected)
from rdkit import Chem from copy import copy from pipelines.utils import utils from molvs import enumerate_tautomers_smiles,canonicalize_tautomer_smiles,Standardizer from molvs.charge import Uncharger,Reionizer from standardiser import standardise standardizer = Standardizer() def _spam(n): out=[] for perm in _getPerms(n): elem = [ int(i) for i in list(perm) ] out.append(elem) return out def _getPerms(n): from itertools import permutations for i in _getCandidates(n): for perm in set(permutations(i)): yield ''.join(perm) def _getCandidates(n): for i in range(0, n+1): res = "1" * i + "0" * (n - i) yield res def enumerateTautomers(mol): """
def normalize(mol, lout):
    """Standardize ``mol`` with MolVS and append the result to ``lout``.

    Nothing is returned; the standardized molecule is handed back through
    the ``lout`` list.
    """
    standardized = Standardizer().standardize(mol)
    lout.append(standardized)
def read_mols(mode, method, basename, datadir='Default', modeldir='Default'):
    """Load molecules, optional activities and the pickled model artefacts.

    Molecules are read from ``<datadir>/<basename>.smi`` (SMILES in the first
    column, name in the second) and standardized. For modes starting with
    'class', activities are read from ``<datadir>/<basename>.act`` when that
    file exists (name in the first column, activity in the second). The
    model, index, applicability-domain fingerprints and radius - and, for
    'reg' modes with method 'xgb', the significant bits - are unpickled from
    ``modeldir``.

    Args:
        mode: model mode; only its 'class' / 'reg' prefix is examined.
        method: model method; 'xgb' enables the significant-bits file.
        basename: stem of the ``.smi`` / ``.act`` input files.
        datadir: data directory, or 'Default' for ``<cwd>/data``.
        modeldir: model directory, or 'Default' for ``<cwd>/models``.

    Returns:
        dict with the keys 'molnames', 'molecules', 'model', 'inds',
        'ad_fps', 'ad_radius' plus either 'sigbits' or 'activities' when
        the corresponding data was loaded.

    The process exits with status 2 when an explicitly given directory does
    not exist.
    """
    currworkdir = os.getcwd()
    if datadir == 'Default':
        datadir = os.path.join(currworkdir, 'data')
    elif not os.path.isdir(datadir):
        print("error: ", datadir, " is not a directory. exiting.")
        exit(2)

    if modeldir == 'Default':
        modeldir = os.path.join(currworkdir, 'models')
    elif not os.path.isdir(modeldir):
        print("error: ", modeldir, " is not a directory. exiting.")
        exit(2)
    else:
        print('setting modeldir to ', modeldir, '.')
        print('Have you set the random splits to be correct for the model?')

    moldatafile = os.path.join(datadir, basename + '.smi')
    actdatafile = os.path.join(datadir, basename + '.act')

    # output_ext = "%s_%s_%d_%d" % (mode, method, int(rand_split), int(rand_state))
    # NOTE(review): with the assignment above commented out, output_ext has
    # to be provided by the enclosing module -- confirm.
    model_filename = "model_%s.dat" % output_ext
    index_filename = "indices_%s.dat" % output_ext
    appdom_fp_filename = "training-FPs_%s.dat" % output_ext
    appdom_rad_filename = "AD-radius_%s.dat" % output_ext

    # None means "no activity file was read"; this replaces probing locals().
    activities = None
    if mode.startswith('class'):
        if os.path.isfile(actdatafile):
            activities = []  # array of tuples: (activity, molecule name)
            with open(actdatafile) as actfh:
                for actline in actfh:
                    line = actline.split()
                    act = float(line[1])
                    actname = line[0]
                    activities.append((act, actname))
    elif mode.startswith('reg') and method == 'xgb':
        bits_file = os.path.join(modeldir, "sigbits_%s.dat" % output_ext)
        with open(bits_file, 'rb') as f:
            significant_bits = pickle.load(f)

    # NOTE: pickle executes code embedded in the file; only point modeldir
    # at trusted model files.
    with open(os.path.join(modeldir, model_filename), 'rb') as f:
        loaded_model = pickle.load(f)
    with open(os.path.join(modeldir, index_filename), 'rb') as f:
        indexes = pickle.load(f)
    with open(os.path.join(modeldir, appdom_fp_filename), 'rb') as f:
        appdom_fps = pickle.load(f)
    with open(os.path.join(modeldir, appdom_rad_filename), 'rb') as f:
        appdom_radius = pickle.load(f)

    # Read in molecules from test set
    molecules = []  # array of tuples: (molecule, molecule name)
    with open(moldatafile) as molfh:
        for molline in molfh:
            line = molline.split()
            mol = Chem.MolFromSmiles(line[0])
            molname = line[1]
            molecules.append((mol, molname))

    mols_train = [entry[0] for entry in molecules]
    molnames_train = [entry[1] for entry in molecules]
    if activities is not None:
        # Indexed on purpose: an activity file shorter than the molecule
        # file raises IndexError instead of being silently truncated.
        acts_train = [activities[i][0] for i in range(len(molecules))]

    # Standardize structures
    s = Standardizer()
    standard_mols_train = [s.standardize(mol) for mol in mols_train]

    return_dict = {}
    return_dict['molnames'] = molnames_train
    return_dict['molecules'] = standard_mols_train
    return_dict['model'] = loaded_model
    return_dict['inds'] = indexes
    if mode.startswith('reg') and method == 'xgb':
        return_dict['sigbits'] = significant_bits
    elif activities is not None:
        return_dict['activities'] = acts_train
    return_dict['ad_fps'] = appdom_fps
    return_dict['ad_radius'] = appdom_radius

    return return_dict
def testReionizeShort(self):
    """Compare ``reionize`` output with the expected SMILES of each record in ``dataPCS_reionize1k``."""
    for record in self.readPCSdata(self.dataPCS_reionize1k):
        reionized = Standardizer().reionize(record.mol)
        self.assertEqual(Chem.MolToSmiles(reionized), record.expected)
def testMetalShort(self):
    """Compare ``disconnect_metals`` output with the expected SMILES of each record in ``dataPCS_metal1k``."""
    for record in self.readPCSdata(self.dataPCS_metal1k):
        disconnected = Standardizer().disconnect_metals(record.mol)
        self.assertEqual(Chem.MolToSmiles(disconnected), record.expected)
parser.add_option("--bonds_as_doubles", dest="bonds_as_doubles", default=False)
opts, args = parser.parse_args()

# Input files named on the command line, and the evaluation output that is
# written next to the predictions file.
fpred = open(opts.pred_path)
fgold = open(opts.gold_path)
feval = open(opts.pred_path + '.eval_by_smiles', 'w')

print('## Bond types in output files are doubles? {}'.format(opts.bonds_as_doubles))


def idxfunc(a):
    """Return the atom-map number of atom ``a``."""
    return a.GetAtomMapNum()


bond_types = [
    Chem.rdchem.BondType.SINGLE,
    Chem.rdchem.BondType.DOUBLE,
    Chem.rdchem.BondType.TRIPLE,
    Chem.rdchem.BondType.AROMATIC,
]
bond_types_as_double = {0.0: 0, 1.0: 1, 2.0: 2, 3.0: 3, 1.5: 4}

from molvs import Standardizer
standardizer = Standardizer()
standardizer.prefer_organic = True


def sanitize_smiles(smi, largest_fragment=False):
    """Return a MolVS-cleaned SMILES for ``smi``.

    Unparsable input is returned unchanged. The clean-up (standardize,
    optionally keep the largest fragment, uncharge) is best effort: when a
    step raises, the molecule from the last successful step is written out.
    """
    parsed = Chem.MolFromSmiles(smi)
    if parsed is None:
        return smi
    try:
        # standardize functional group reps
        parsed = standardizer.standardize(parsed)
        if largest_fragment:
            # remove product counterions/salts/etc.
            parsed = standardizer.largest_fragment(parsed)
        # neutralize, e.g., carboxylic acids
        parsed = standardizer.uncharge(parsed)
    except Exception:
        pass
    return Chem.MolToSmiles(parsed)
from pprint import pprint
import psycopg2
import sys
import re
import csv
from io import StringIO
import logging

chem.WrapLogs()
#sio = sys.stderr = StringIO()
#reload(logging)

# Everything from DEBUG upwards goes to logging.log.
logging.basicConfig(
    filename='logging.log',
    level=logging.DEBUG,
    format="[%(asctime)s %(levelname)-8s] %(message)s",
    datefmt="%Y/%b/%d %H:%M:%S",
)

from standardiser import standardise
s = Standardizer()

# Only let RDKit report errors.
from rdkit import RDLogger
lg = RDLogger.logger()
lg.setLevel(RDLogger.ERROR)

dictionary = ["C", "H", "O", "N", "S", "P", "F", "Cl", "Br", "I"]

# Load all rows of receptors.csv into memory as lists of strings.
with open('receptors.csv', 'r', encoding='utf-8') as receptor:
    reader = csv.reader(receptor, delimiter=',')
    final = list(reader)
#pprint (final)
def process(self,
            input: Union[str, list] = "",
            input_file: str = "",
            output_file: str = "",
            output_file_sdf: str = "",
            output_file_cml: str = "",
            sdf_append: bool = False,
            format_output: bool = True,
            opsin_output_format: str = "",
            output_formats: list = None,
            write_header: bool = True,
            dry_run: bool = False,
            csv_delimiter: str = ";",
            standardize_mols: bool = True,
            normalize_plurals: bool = True,
            continue_on_failure: bool = False) -> OrderedDict:
    r"""
    Process the input file with OPSIN.

    Parameters
    ----------
    input : str or list
        | str: String with IUPAC names, one per line.
        | list: List of IUPAC names.
    input_file : str
        Path to file to be processed by OPSIN. One IUPAC name per line.
        Ignored (with a warning) when `input` is also given.
    output_file : str
        File to write output in.
    output_file_sdf : str
        File to write SDF output in.
    output_file_cml : str
        | File to write CML (Chemical Markup Language) output in. `opsin_output_format` must be "cml".
        | Not supported by RDKit so standardization and conversion to other formats cannot be done.
    sdf_append : bool
        If True, append new molecules to existing SDF file or create new one if doesn't exist.
    format_output : bool
        | If True, the value of "content" key of returned dict will be list of OrderedDicts with keys:
        | "iupac", <output formats>, ..., "error"
        | If True and `output_file` is set it will be created as CSV file with columns: "iupac", <output formats>, ..., "error"
        | If False, the value of "content" key of returned dict will be None.
    opsin_output_format : str
        | Output format from OPSIN. Temporarily overrides the option `output_format` set during instantiation (in __init__).
        | Choices: "cml", "smi", "extendedsmi", "inchi", "stdinchi", "stdinchikey"
    output_formats : list
        | If set and `format_output` is True, this specifies which molecule formats will be output.
        | You can specify more than one format, but only one format from OPSIN. This format must be also set
          with `output_format` in __init__ or with `opsin_output_format` here.
        | Default value: ["smiles"]
        | Possible values (value: source - note):

        - smiles: RDKit - canonical
        - smiles_opsin: OPSIN ("smi") - SMILES
        - smiles_extended_opsin: OPSIN ("extendedsmi") - Extended SMILES. Not supported by RDKit.
        - inchi: RDKit - Not every molecule can be converted to InChI (it doesn`t support wildcard characters etc.)
        - inchi_opsin: OPSIN ("inchi") - InChI
        - stdinchi_opsin: OPSIN ("stdinchi") - standard InChI
        - inchikey: RDKit - The same applies as for "inchi". Also molecule cannot be created from InChI-key.
        - stdinchikey_opsin: OPSIN ("stdinchikey") - Standard InChI-key. Cannot be used by RDKit to create molecule.
        - sdf: RDKit - If present, an additional SDF file will be created.
    write_header : bool
        If True and if `output_file` is set and `output_format` is True, write a CSV write_header.
    dry_run : bool
        If True, only return list of commands to be called by subprocess. OPSIN is not executed.
    csv_delimiter : str
        Delimiter for output CSV file.
    standardize_mols : bool
        If True and `format_output` is also True, use molvs (https://github.com/mcs07/MolVS) to standardize molecules.
    normalize_plurals : bool
        | If True, normalize plurals ("nitrates" -> "nitrate"). See OPSIN.PLURAL_PATTERNS for relating plurals. You can
          set your own regex pattern with `plural_patterns` in __init__.
    continue_on_failure : bool
        | If True, continue running even if OPSIN returns non-zero exit code.
        | If False and error occurs, print it and return.

    Returns
    -------
    dict
        Keys:

        - stdout: str ... standard output from OPSIN
        - stderr: str ... standard error output from OPSIN
        - exit_code: int ... exit code from OPSIN
        - content:

          - list of OrderedDicts ... when format_output is True. Fields: "iupac", <output formats>, ..., "error"
          - None ... when format_output is False

        When `dry_run` is True, the command line is returned as a single string instead.

    Raises
    ------
    ValueError
        If neither `input` nor `input_file` is set, or the OPSIN output format is unknown.
    UserWarning
        If the input is empty after preprocessing.
    """
    options_internal = self.options_internal.copy()
    opsin_nonreadable_formats = ["cml", "stdinchikey"]

    if input and input_file:
        input_file = ""
        self.logger.warning(
            "Both 'input' and 'input_file' are set, but 'input' will be prefered."
        )
    elif not input and not input_file:
        raise ValueError("One of 'input' or 'input_file' must be set.")

    # OPSIN output format check
    if opsin_output_format:
        options_internal["output_format"] = opsin_output_format
    else:
        opsin_output_format = options_internal["output_format"]
    opsin_valid_output_formats = {
        "cml": "cml_opsin",
        "smi": "smiles_opsin",
        "extendedsmi": "smiles_extended_opsin",
        "inchi": "inchi_opsin",
        "stdinchi": "stdinchi_opsin",
        "stdinchikey": "stdinchikey_opsin"
    }
    if opsin_output_format not in opsin_valid_output_formats:
        raise ValueError(
            "Unknown OPSIN output format. Possible values: {}".format(
                list(opsin_valid_output_formats.keys())))

    if standardize_mols and opsin_output_format in opsin_nonreadable_formats:
        self.logger.warning(
            "OPSIN output format is \"{}\", which cannot be used by RDKit."
            .format(opsin_output_format))

    # output formats check
    if not output_formats:
        output_formats = ["smiles"]
    elif opsin_output_format == "stdinchikey":
        output_formats = ["stdinchikey_opsin"]
    elif opsin_output_format == "extendedsmi":
        output_formats = ["smiles_extended_opsin"]
    else:
        possible_output_formats = ["smiles", "inchi", "inchikey", "sdf"]
        output_formats = [
            x for x in sorted(set(output_formats))
            if x in possible_output_formats
            or x == opsin_valid_output_formats[opsin_output_format]
        ]

    if normalize_plurals:
        if input_file:
            # The names must be rewritten, so the file content is sent via
            # stdin instead of passing the file path to OPSIN.
            with open(input_file, mode="r", encoding="utf-8") as f:
                input = "\n".join([x.strip() for x in f.readlines()])
            input_file = ""
        input = self.normalize_iupac(input)

    commands, _, _ = self.build_commands(options_internal,
                                         self._OPTIONS_REAL,
                                         self.path_to_binary)

    if input_file:
        # The file to convert is a positional argument of the command.
        commands.append(input_file)
    elif input:
        if isinstance(input, list):
            input = "\n".join([x.strip() for x in input])
    else:
        raise UserWarning("Input is empty.")

    if dry_run:
        # Return before anything is executed.
        return " ".join(commands)

    if input_file:
        stdout, stderr, exit_code = common_subprocess(commands)
    else:
        stdout, stderr, exit_code = common_subprocess(commands, stdin=input)

    to_return = {
        "stdout": stdout,
        "stderr": stderr,
        "exit_code": exit_code,
        "content": None
    }

    if not continue_on_failure and exit_code > 0:
        self.logger.warning("OPSIN error:")
        eprint("\n\t".join("\n{}".format(stderr).splitlines()))
        return to_return

    if output_file_cml and opsin_output_format == "cml":
        with open(output_file_cml, mode="w", encoding="utf-8") as f:
            f.write(stdout)
        return to_return
    elif output_file_cml and opsin_output_format != "cml":
        self.logger.warning(
            "Output file for CML is requested, but OPSIN output format is '{}'"
            .format(opsin_output_format))

    if not format_output:
        if output_file:
            with open(output_file, mode="w", encoding="utf-8") as f:
                f.write(stdout)
        return to_return

    compounds = []
    standardizer = Standardizer()
    empty_cols = OrderedDict([(x, "") for x in output_formats])

    writer = None
    sdf_handle = None
    if output_file_sdf:
        if sdf_append:
            # Mode "a" creates the file when it does not exist yet.
            sdf_handle = open(output_file_sdf, mode="a", encoding="utf-8")
            writer = SDWriter(sdf_handle)
        else:
            writer = SDWriter(output_file_sdf)

    try:
        stdout = stdout.split("\n")
        del stdout[-1]
        # remove first line of stderr because there is OPSIN message
        stderr = [x.strip() for x in stderr.split("\n")[1:] if x]

        if input_file:
            with open(input_file, mode="r", encoding="utf-8") as f:
                lines = f.readlines()
        else:
            lines = input.split("\n")

        mol_output_template = OrderedDict.fromkeys(["iupac"] +
                                                   output_formats +
                                                   ["error"])

        # Index of the next unused stderr message. It is kept under its own
        # name so that no `except ... as` clause can rebind or delete it.
        stderr_index = 0
        for i, line in enumerate(lines):
            line = line.strip()
            # Treat a missing stdout line like an empty (failed) conversion.
            converted = stdout[i].strip() if i < len(stdout) else ""
            mol_output = mol_output_template.copy()

            if converted:
                if opsin_output_format == "stdinchikey":
                    compounds.append(
                        OrderedDict([("iupac", line),
                                     ("stdinchikey_opsin", converted),
                                     ("error", "")]))
                    continue
                elif opsin_output_format == "extendedsmi":
                    compounds.append(
                        OrderedDict([("iupac", line),
                                     ("smiles_extended_opsin", converted),
                                     ("error", "")]))
                    continue

                # Stays None for formats RDKit cannot read (e.g. "cml").
                mol = None
                if opsin_output_format == "smi":
                    mol = MolFromSmiles(converted,
                                        sanitize=not standardize_mols)
                elif opsin_output_format in ["inchi", "stdinchi"]:
                    mol = MolFromInchi(converted,
                                       sanitize=not standardize_mols,
                                       removeHs=not standardize_mols)

                if mol:
                    if standardize_mols:
                        try:
                            mol = standardizer.standardize(mol)
                        except ValueError as err:
                            self.logger.warning(
                                "Cannot standardize '{}': {}".format(
                                    MolToSmiles(mol), str(err)))

                    for fmt in output_formats:
                        if fmt == "smiles":
                            mol_output["smiles"] = MolToSmiles(
                                mol, isomericSmiles=True)
                        elif fmt == "smiles_opsin" and opsin_output_format == "smi":
                            mol_output["smiles_opsin"] = converted
                        elif fmt == "inchi":
                            inchi = MolToInchi(mol)
                            if inchi:
                                mol_output["inchi"] = inchi
                            else:
                                mol_output["inchi"] = ""
                                self.logger.warning(
                                    "Cannot convert to InChI: {}".format(
                                        converted))
                        elif fmt == "inchi_opsin" and opsin_output_format == "inchi":
                            mol_output["inchi_opsin"] = converted
                        elif fmt == "stdinchi_opsin" and opsin_output_format == "stdinchi":
                            mol_output["stdinchi_opsin"] = converted
                        elif fmt == "inchikey":
                            inchi = MolToInchi(mol)
                            if inchi:
                                mol_output["inchikey"] = InchiToInchiKey(inchi)
                            else:
                                mol_output["inchikey"] = ""
                                self.logger.warning(
                                    "Cannot create InChI-key from InChI: {}".
                                    format(converted))
                        elif fmt == "stdinchikey_opsin" and opsin_output_format == "stdinchikey":
                            mol_output["stdinchikey_opsin"] = converted
                        elif fmt == "sdf":
                            mol_output["sdf"] = MolToMolBlock(
                                mol, includeStereo=True)
                            if writer is not None:
                                writer.write(mol)

                    mol_output.update(
                        OrderedDict([("iupac", line), ("error", "")]))
                else:
                    mol_output.update([
                        ("iupac", line),
                        ("error",
                         "Cannot convert to RDKit mol: {}".format(converted))
                    ])
                    mol_output.update(empty_cols)
                    # Log the error of the current record; it has not been
                    # appended to `compounds` yet.
                    self.logger.warning(mol_output["error"])
            else:
                try:
                    error = stderr[stderr_index].strip()
                except IndexError:
                    error = ""
                mol_output.update([("iupac", line), ("error", error)])
                mol_output.update(empty_cols)
                stderr_index += 1

            compounds.append(mol_output)
    finally:
        # Flush and release the SDF output even when a record fails.
        if writer is not None:
            writer.close()
        if sdf_handle is not None:
            sdf_handle.close()

    to_return["content"] = compounds

    if output_file and compounds:
        dict_to_csv(to_return["content"],
                    output_file=output_file,
                    csv_delimiter=csv_delimiter,
                    write_header=write_header)
    elif output_file and not compounds:
        write_empty_file(output_file,
                         csv_delimiter=csv_delimiter,
                         header=list(mol_output_template.keys()),
                         write_header=write_header)

    return to_return
def __init__(self,
             dcompound,
             logfile,
             writecheck=1,
             kSMILES="CANONICAL_SMILES",
             kID="CMPD_CHEMBLID"):
    """
    Standardize and desalt the SMILES of one compound record.

    ``self.compound`` is the very same dict object as `dcompound`, so the
    caller's record is updated in place. On success ``self.log`` is "OK",
    ``dcompound[kSMILES]`` holds the cleaned SMILES and ``self.mol`` holds
    the molecule returned by ``loader.ReadMolFromSmile``. On failure a
    tab-separated line is written to `logfile`, ``self.log`` is set to
    "ERROR" and the constructor returns early (``self.mol`` is not set).

    Args:
        dcompound: dict describing the compound. It must provide `kID` and
            either `kSMILES` or an "sdf" entry from which a SMILES is
            derived.
        logfile: object with a ``write`` method receiving the error lines.
        writecheck: when equal to 1, write the cleaned SMILES (and the SDF
            input when present) under ``pathFolder.PR_COMPOUNDS``.
        kSMILES: key of the SMILES entry in `dcompound`.
        kID: key of the compound identifier in `dcompound`.
    """
    self.compound = dcompound
    loader = pydrug.PyDrug()

    # No SMILES in the record: derive one from the SDF entry.
    if kSMILES not in dcompound:
        try:
            smile = runExternalSoft.babelConvertSDFtoSMILE(
                dcompound["sdf"])
            self.compound[kSMILES] = smile
        except Exception:
            print("ERROR INPUT SDF - l33")
            self.log = "ERROR"
            try:
                logfile.write(self.compound[kID] +
                              "\t---\tERROR-SDF ORIGINAL INPUT\n")
            except Exception:
                # Best effort only: the ID may be missing or the log may
                # not be writable; the error state is already recorded.
                pass
            return

    # Standardize the SMILES string.
    try:
        smilestandadized = standardize_smiles(self.compound[kSMILES])
    except Exception:
        logfile.write(self.compound[kID] + "\t" +
                      str(self.compound[kSMILES]) +
                      "\tERROR-SMILES INPUT\n")
        self.log = "ERROR"
        return

    # Standardize using molvs
    # (http://molvs.readthedocs.io/en/latest/api.html#molvs-fragment)
    s = Standardizer()
    mol = Chem.MolFromSmiles(smilestandadized)
    molstandardized = s.standardize(mol)
    smilestandadized = Chem.MolToSmiles(molstandardized)

    # Remove salts.
    # 1. default remover
    remover = SaltRemover()
    mol = Chem.MolFromSmiles(smilestandadized)
    molcleandefault = remover(mol)
    # 2. personal remover built from LSALT
    homeremover = SaltRemover(defnData=LSALT)
    molclean = homeremover(molcleandefault)
    smilesclean = Chem.MolToSmiles(molclean)

    # 3. Remove the remaining manually listed salts; identical fragments
    #    are collapsed so a duplicated compound counts once.
    lelem = smilesclean.split(".")
    if len(lelem) > 1:
        # reduce doubles, case where several salts are included
        lelem = list(set(lelem))
        for smilesdel in LSMILESREMOVE:
            if smilesdel in lelem:
                lelem.remove(smilesdel)
        if "" in lelem:
            lelem.remove("")  # case of bad smile

        if len(lelem) == 1:
            smilesclean = str(lelem[0])
        elif not lelem:
            # Every fragment was a listed salt: this is the "only salts"
            # case reported by the empty-SMILES check below, not a
            # multi-fragment input.
            smilesclean = ""
        else:
            # 4. Several distinct fragments remain -> stored in the log
            #    file so they can be checked afterwards.
            logfile.write(self.compound[kID] + "\t" +
                          str(self.compound[kSMILES]) +
                          "\tFRAGMENT IN INPUT\n")
            print(".".join(lelem) + "  - FRAGMENTS - l66")
            self.log = "ERROR"
            return

    # Single-argument print calls give the same output on Python 2 and 3.
    print("%s SMILES IN - l25 liganddescriptors" %
          (self.compound[kSMILES],))
    print("%s SMILES without salt and standardized" % (smilesclean,))

    # case where only salts are included
    if smilesclean == "":
        logfile.write(self.compound[kID] + "\t" +
                      str(self.compound[kSMILES]) +
                      "\tEMPTY SMILES AFTER STANDARDIZATION\n")
        print("EMPTY SMILES AFTER STANDARDIZATION - l84")
        self.log = "ERROR"
        return

    self.compound[kSMILES] = smilesclean
    self.log = "OK"

    if writecheck == 1:
        # SMILES code
        pfileSMILES = pathFolder.PR_COMPOUNDS + str(
            dcompound[kID]) + ".smi"
        with open(pfileSMILES, "w") as fileSMILES:
            fileSMILES.write(self.compound[kSMILES])

        # SDF input
        if "sdf" in self.compound:
            pfileSDF = pathFolder.PR_COMPOUNDS + str(
                dcompound[kID]) + ".sdf"
            with open(pfileSDF, "w") as fileSDF:
                fileSDF.write(self.compound["sdf"])

    # read mol
    self.mol = loader.ReadMolFromSmile(self.compound[kSMILES])
def prepareChem(self, prSMIclean):
    """
    Prepare (normalize and desalt) ``self.smi`` and cache the result on disk.

    The prepared SMILES is stored in ``<prSMIclean><self.name>.smi``. When
    that file already exists its first line is reused: ``self.smiclean``
    and ``self.mol`` are set from it. Otherwise the SMILES is normalized,
    stripped of salts and, if a single non-empty fragment remains, written
    to the file; ``self.smiclean`` and ``self.psmiclean`` are then set.
    Every step appends a line to ``self.log``; failures are only reported
    there and leave ``self.smiclean`` untouched.

    Args:
        prSMIclean: prefix (directory path ending with a separator) to
            which ``self.name + ".smi"`` is appended.
    """
    psmiclean = prSMIclean + self.name + ".smi"

    # Reuse a previously prepared SMILES when the file exists.
    if path.exists(psmiclean):
        with open(psmiclean, "r") as fsmiclean:
            cached_lines = fsmiclean.readlines()

        smiclean = cached_lines[0].strip()
        self.smiclean = smiclean
        self.mol = Chem.MolFromSmiles(smiclean)
        self.log = self.log + "Prep SMI :" + str(self.smi) + "\n"
        self.log = self.log + "Prepared SMI :" + str(self.smiclean) + "\n"
        return

    mol = Chem.MolFromSmiles(self.smi)

    # An explicit flag replaces the former `"molstandardized" in locals()`
    # check; it is set under exactly the same condition.
    normalized = False
    molstandardized = None
    try:
        out = toolbox.timeFunction(normalize, mol)
        if out == "ERROR":
            self.log = self.log + "Normalize SMILES: ERROR DURING THE PROCESS\n"
        else:
            molstandardized = out
            normalized = True
    except Exception:
        self.log = self.log + "Normalize SMILES: ERROR INPUT SMI\n"

    if not normalized:
        return

    smilestandadized = Chem.MolToSmiles(molstandardized)

    # Remove salts.
    # 1. default remover
    # NOTE(review): "Salts.txt" is a relative path, so it presumably has to
    # be present in the working directory - confirm.
    remover = SaltRemover(defnFilename="Salts.txt")
    mol = Chem.MolFromSmiles(smilestandadized)
    molcleandefault = remover(mol)
    # 2. personal remover built from LSALT
    homeremover = SaltRemover(defnData=LSALT)
    molclean = homeremover(molcleandefault)
    smilesclean = Chem.MolToSmiles(molclean)

    # 3. Remove the remaining manually listed salts; identical fragments
    #    are collapsed so a duplicated compound counts once.
    lelem = smilesclean.split(".")
    if len(lelem) > 1:
        # reduce doubles, case where several salts are included
        lelem = list(set(lelem))
        for smilesdel in LSMILESREMOVE:
            if smilesdel in lelem:
                lelem.remove(smilesdel)
        if "" in lelem:
            lelem.remove("")  # case of bad smile

        if len(lelem) == 1:
            smilesclean = str(lelem[0])
        else:
            # 4. Fragments -> stored in the log, to be checked afterwards.
            self.log = self.log + "Fragments after standardization: " + smilesclean + "\n"
            smilesclean = ""

    if smilesclean == "":
        self.log = self.log + "ERROR SMILES: SMILES empty after preparation\n"
        return

    self.log = self.log + "Prepared SMI :" + str(smilesclean) + "\n"

    with open(psmiclean, "w") as fsmiclean:
        fsmiclean.write(smilesclean)

    self.smiclean = smilesclean
    self.psmiclean = psmiclean
def process(
        self,
        input_file: str,
        output_file: str = "",
        output_file_sdf: str = "",
        sdf_append: bool = False,
        #images_prefix: str = "",
        format_output: bool = True,
        write_header: bool = True,
        osra_output_format: str = "",
        output_formats: list = None,
        dry_run: bool = False,
        csv_delimiter: str = ";",
        use_gm: bool = True,
        gm_dpi: int = 300,
        gm_trim: bool = True,
        n_jobs: int = -1,
        input_type: str = "",
        standardize_mols: bool = True,
        annotate: bool = True,
        chemspider_token: str = "",
        custom_page: int = 0,
        continue_on_failure: bool = False) -> OrderedDict:
    r"""
    Process the input file with OSRA.

    Parameters
    ----------
    input_file : str
        Path to file to be processed by OSRA.
    output_file : str
        File to write output in.
    output_file_sdf : str
        | File to write SDF output in. "sdf" output format hasn't to be in `output_formats` to write SDF output.
        | If "sdf_osra" output format is requested, suffix "-osra.sdf" will be added.
    sdf_append : bool
        If True, append new molecules to existing SDF file or create new one if doesn't exist.
    NOT IMPLEMENTED | images_prefix : str
        Prefix for images of extracted compounds which will be written.
    format_output : bool
        | If True, the value of "content" key of returned dict will be list of OrderedDicts.
        | If True and `output_file` is set, the CSV file will be written.
        | If False, the value of "content" key of returned dict will be None.
    write_header : bool
        If True and if `output_file` is set and `output_format` is True, write a CSV write_header.
    osra_output_format : str
        | Output format from OSRA. Temporarily overrides the option `output_format` set during instantiation (in __init__).
        | Choices: "smi", "can", "sdf"
        | If "sdf", additional information like coordinates cannot be retrieved (not implemented yet).
    output_formats : list
        | If True and `format_output` is also True, this specifies which molecule formats will be output.
        | You can specify more than one format, but only one format from OSRA. This format must be also set with `output_format` in __init__ or with `osra_output_format` here.
        | When output produces by OSRA is unreadable by RDKit, you can at least have that output from OSRA.
        | Default value: ["smiles"]

        ===============  ============  ==================================================
        Value            Source        Note
        ===============  ============  ==================================================
        smiles           RDKit         canonical
        smiles_osra      OSRA ("smi")  SMILES
        smiles_can_osra  OSRA ("can")  canonical SMILES
        inchi            RDKit         Not every molecule can be converted to InChI (it doesn`t support wildcard characters etc.)
        inchikey         RDKit         The same applies as for "inchi".
        sdf              RDKit         If present, an additional SDF file will be created.
        sdf_osra         OSRA ("sdf")  If present, an additional SDF file will be created.
        ===============  ============  ==================================================
    dry_run : bool
        If True, only return list of commands to be called by subprocess.
    csv_delimiter : str
        Delimiter for output CSV file.
    use_gm : bool
        | If True, use GraphicsMagick to convert PDF to temporary PNG images before processing.
        | If False, OSRA will use its own conversion of PDF to image.
        | Using gm is more reliable since OSRA (v2.1.0) is showing wrong information
          when converting directly from PDF (namely: coordinates, bond length and possibly more ones) and also there are sometimes
          incorrectly recognised structures.
    gm_dpi : int
        How many DPI will temporary PNG images have.
    gm_trim : bool
        If True, gm will trim the temporary PNG images.
    n_jobs : int
        | If `use_gm` and input file is PDF, how many jobs to use for OSRA processing of temporary PNG images.
        | If -1 all CPUs are used.
        | If 1 is given, no parallel computing code is used at all, which is useful for debugging.
        | For n_jobs below -1, (n_cpus + 1 + n_jobs) are used. Thus for n_jobs = -2, all CPUs but one are used.
    input_type : str
        | When empty, input (MIME) type will be determined from magic bytes.
        | Or you can specify "pdf" or "image" and magic bytes check will be skipped.
    standardize_mols : bool
        If True and `format_output` is also True, use molvs (https://github.com/mcs07/MolVS) to standardize molecules.
    annotate : bool
        | If True, try to annotate entities in PubChem and ChemSpider. Compound IDs will be assigned by searching with
          each identifier, separately for SMILES, InChI etc.
        | If entity has InChI key yet, prefer it in searching.
        | If "*" is present in SMILES, skip annotation.
    chemspider_token : str
        Your personal token for accessing the ChemSpider API. Make account there to obtain it.
    custom_page : int
        When `use_gm` is False, this will set the page for all extracted compounds.
    continue_on_failure : bool
        | If True, continue running even if OSRA returns non-zero exit code.
        | If False and error occurs, print it and return.

    Returns
    -------
    dict
        Keys:

        - stdout: list of str ... standard output from OSRA (empty outputs are left out)
        - stderr: list of str ... standard error output from OSRA
        - exit_code: list of int ... exit code from OSRA
        - pages: list ... page of each OSRA run
        - content:

          - list of OrderedDicts ... when `format_output` is True.
          - None ... when `format_output` is False

        | If `osra_output_format` is "sdf", additional information like 'bond_length' cannot be retrieved.
        | If `use_gm` is True then stdout, stderr and exit_code will be lists containing items from each temporary image
          extracted by OSRA.
        | If `dry_run` is True, the command line is returned as a single str instead.

    Raises
    ------
    ValueError
        If `osra_output_format` or an explicitly given `input_type` is not one of the supported values.

    Notes
    -----
    Only with `format_output` set to True you can use molecule standardization and more molecule formats.
    Otherwise you will only get raw stdout from OSRA (which can also be written to file if `output_file` is set).
    """
    options_internal = self.options_internal.copy()
    osra_smiles_outputs = ["smi", "can"]

    # OSRA output format check
    if osra_output_format:
        options_internal["output_format"] = osra_output_format
    else:
        osra_output_format = options_internal["output_format"]

    # OSRA format -> name of the output column carrying OSRA's raw output
    osra_valid_output_formats = {
        "can": "smiles_can_osra",
        "smi": "smiles_osra",
        "sdf": "sdf_osra"
    }

    if osra_output_format not in osra_valid_output_formats:
        # The accepted values are the keys, not the column names.
        raise ValueError(
            "Unknown OSRA output format. Possible values: {}".format(
                sorted(osra_valid_output_formats)))

    if osra_output_format == "sdf":
        self.logger.warning(
            "OSRA's output format is set to \"sdf\" so additional information like coordinates cannot be retrieved."
        )

    # output formats check
    is_output_sdf = False
    is_output_sdf_osra = False
    if not output_formats:
        output_formats = ["smiles"]
    else:
        possible_output_formats = ["smiles", "inchi", "inchikey", "sdf"]
        osra_column = osra_valid_output_formats[osra_output_format]
        output_formats = [
            x for x in sorted(set(output_formats))
            if x in possible_output_formats or x == osra_column
        ]

    if ("sdf" in output_formats
            or "sdf_osra" in output_formats) and not output_file_sdf:
        self.logger.warning(
            "Cannot write SDF output: 'output_file_sdf' is not set.")

    if output_file_sdf:
        is_output_sdf = True

    if ("sdf_osra" in output_formats and osra_output_format == "sdf"
            and output_file_sdf):
        is_output_sdf_osra = True

    if ("smiles_osra" in output_formats or "smiles_can_osra"
            in output_formats) and osra_output_format == "sdf":
        output_formats = [
            x for x in output_formats
            if x not in ("smiles_osra", "smiles_can_osra")
        ]
        self.logger.warning(
            "SMILES or canonical SMILES output from OSRA is requested, but OSRA's output format is \"{}\"."
            .format(osra_output_format))

    # input file type check
    possible_input_types = ["pdf", "image"]
    if not input_type:
        input_type = get_input_file_type(input_file)
        if input_type not in possible_input_types:
            use_gm = False
            self.logger.warning(
                "Input file MIME type ('{}') is not one of {}. You can specify 'input_type' directly (see docstring)."
                .format(input_type, possible_input_types))
    elif input_type not in possible_input_types:
        raise ValueError("Possible 'input_type' values are {}".format(
            possible_input_types))

    if annotate:
        if not chemspider_token:
            self.logger.warning(
                "Cannot perform annotation in ChemSpider: 'chemspider_token' is empty."
            )
        # Annotation searches by these identifiers, so they must be computed.
        output_formats = sorted(
            set(output_formats) | {"smiles", "inchi", "inchikey"})

    commands, _, _ = self.build_commands(options_internal,
                                         self._OPTIONS_REAL,
                                         self.path_to_binary)
    commands.extend(
        ["--bond", "--coordinates", "--page", "--guess", "--print"])

    if dry_run:
        return " ".join(commands)

    osra_output_list = []
    if input_type == "image" or not use_gm:
        osra_output_list.append(
            self._process(input_file,
                          commands,
                          page=custom_page if custom_page else 1))
    elif input_type == "pdf":
        with tempfile.TemporaryDirectory() as temp_dir:
            pdf_to_images(input_file, temp_dir, dpi=gm_dpi, trim=gm_trim)
            osra_output_list = Parallel(n_jobs=n_jobs)(
                delayed(self._process)(temp_image_file, commands, page=page)
                for temp_image_file, page in get_temp_images(temp_dir))

    # summarize OSRA results
    to_return = {
        "stdout": [],
        "stderr": [],
        "exit_code": [],
        "content": None,
        "pages": []
    }
    # Empty outputs are left out of to_return["stdout"] but their page is
    # still recorded in to_return["pages"], so those two lists cannot be
    # zipped; keep each non-empty output paired with its own page here.
    paged_outputs = []

    for result in osra_output_list:
        if result["stdout"]:
            to_return["stdout"].append(result["stdout"])
            paged_outputs.append((result["stdout"], result["page"]))
        to_return["stderr"].append(result["stderr"])
        to_return["exit_code"].append(result["exit_code"])
        to_return["pages"].append(result["page"])

    if not continue_on_failure:
        errors = [(page + 1, error)
                  for page, (exit_code, error) in enumerate(
                      zip(to_return["exit_code"], to_return["stderr"]))
                  if exit_code > 0]
        if errors:
            self.logger.warning("OSRA errors:")
            for page, error in errors:
                eprint("\tError on page {}:".format(page))
                eprint("\n\t\t".join("\n{}".format(error).splitlines()))
            return to_return

    if not format_output:
        if output_file:
            with open(output_file, mode="w", encoding="utf-8") as f:
                f.write("\n".join(to_return["stdout"]))
        return to_return

    # Index of each extra column in a whitespace-split OSRA output line
    # (index 0 is the SMILES itself).
    output_cols = OrderedDict([("bond_length", 1), ("resolution", 2),
                               ("confidence", 3), ("page", 4),
                               ("coordinates", 5)])
    is_smiles_output = osra_output_format in osra_smiles_outputs

    if is_smiles_output:
        compound_template_dict = OrderedDict.fromkeys(
            output_formats + list(output_cols.keys()))
    else:
        compound_template_dict = OrderedDict.fromkeys(["page"] +
                                                      output_formats)

    def page_sort_key(compound):
        # Pages parsed from OSRA's own output are strings; compare them
        # numerically so that page "10" sorts after page "2".
        try:
            return int(compound["page"])
        except (TypeError, ValueError):
            return 0

    if any(to_return["stdout"]):
        standardizer = Standardizer() if standardize_mols else None
        compounds = []
        writer = None
        sdf_handle = None

        try:
            if is_output_sdf:
                if sdf_append:
                    # Mode "a" creates the file when it does not exist.
                    sdf_handle = open(output_file_sdf,
                                      mode="a",
                                      encoding="utf-8")
                    writer = SDWriter(sdf_handle)
                else:
                    writer = SDWriter(output_file_sdf)

            for output, page in paged_outputs:
                if is_smiles_output:
                    lines = [x.strip() for x in output.split("\n") if x]
                else:
                    lines = [x for x in output.split("$$$$") if x.strip()]

                for line in lines:
                    # NOTE: OSRA's --learn option is not supported here; its
                    # output contains spaces ("1,2,2,2 1") and would break
                    # the whitespace split below.
                    if not line:
                        continue

                    if is_smiles_output:
                        line = [x.strip() for x in line.split()]
                        if custom_page:
                            line[output_cols["page"]] = custom_page
                        elif use_gm:
                            line[output_cols["page"]] = page
                        raw_record = line[0]
                        mol = MolFromSmiles(
                            line[0],
                            sanitize=False if standardize_mols else True)
                    else:
                        # osra_output_format == "sdf"
                        raw_record = line.strip()
                        line = "\n" + line.strip()
                        mol = MolFromMolBlock(
                            line,
                            strictParsing=False,
                            sanitize=False if standardize_mols else True,
                            removeHs=False if standardize_mols else True)

                    if not mol:
                        self.logger.warning(
                            "Cannot convert to RDKit mol: " + raw_record)
                        continue

                    compound = compound_template_dict.copy()
                    if standardize_mols:
                        try:
                            mol = standardizer.standardize(mol)
                        except ValueError as e:
                            self.logger.warning(
                                "Cannot standardize '{}': {}".format(
                                    MolToSmiles(mol), str(e)))

                    for fmt in output_formats:
                        if fmt == "smiles":
                            compound["smiles"] = MolToSmiles(
                                mol, isomericSmiles=True)
                        elif fmt == "smiles_osra" and osra_output_format == "smi":
                            compound["smiles_osra"] = line[0]
                        elif fmt == "smiles_can_osra" and osra_output_format == "can":
                            compound["smiles_can_osra"] = line[0]
                        elif fmt == "inchi":
                            inchi = MolToInchi(mol)
                            if inchi:
                                compound["inchi"] = inchi
                            else:
                                compound["inchi"] = ""
                                self.logger.warning(
                                    "Cannot convert to InChI: {}".format(
                                        MolToSmiles(mol)))
                        elif fmt == "inchikey":
                            inchi = MolToInchi(mol)
                            if inchi:
                                compound["inchikey"] = InchiToInchiKey(inchi)
                            else:
                                compound["inchikey"] = ""
                                self.logger.warning(
                                    "Cannot create InChI-key from InChI: {}"
                                    .format(MolToSmiles(mol)))
                        elif fmt == "sdf":
                            compound["sdf"] = MolToMolBlock(
                                mol, includeStereo=True)
                        elif fmt == "sdf_osra":
                            compound["sdf_osra"] = line

                    if writer is not None:
                        writer.write(mol)

                    if is_smiles_output:
                        compound.update(zip(output_cols.keys(), line[1:]))
                    elif use_gm:
                        compound["page"] = page
                    else:
                        compound["page"] = custom_page if custom_page else 1

                    compounds.append(compound)
        finally:
            # Release the SDF writer (and the append handle it wraps) even
            # when parsing fails part-way through.
            if writer is not None:
                writer.close()
            if sdf_handle is not None:
                sdf_handle.close()

        if is_output_sdf_osra:
            with open(output_file_sdf + "-osra.sdf",
                      mode="w",
                      encoding="utf-8") as f:
                f.write("".join(to_return["stdout"]))

        to_return["content"] = sorted(compounds, key=page_sort_key)

        if annotate:
            chemspider = ChemSpider(
                chemspider_token) if chemspider_token else None
            # PubChem lookups are best effort: any of these just leaves the
            # corresponding columns empty.
            pubchem_errors = (BadRequestError, NotFoundError,
                              PubChemHTTPError, ResponseParseError,
                              ServerError, TimeoutError, PubChemPyError)

            for i, ent in enumerate(to_return["content"]):
                self.logger.info("Annotating entity {}/{}...".format(
                    i + 1, len(to_return["content"])))
                ent.update(
                    OrderedDict([("pch_cids_by_inchikey", ""),
                                 ("chs_cids_by_inchikey", ""),
                                 ("pch_cids_by_smiles", ""),
                                 ("chs_cids_by_smiles", ""),
                                 ("pch_cids_by_inchi", ""),
                                 ("chs_cids_by_inchi", ""),
                                 ("pch_iupac_name", ""),
                                 ("chs_common_name", ""),
                                 ("pch_synonyms", "")]))

                # prefer InChI key
                if "inchikey" in ent and ent["inchikey"]:
                    try:
                        results = get_compounds(ent["inchikey"], "inchikey")
                        if results:
                            if len(results) == 1:
                                result = results[0]
                                synonyms = result.synonyms
                                if synonyms:
                                    ent["pch_synonyms"] = "\"{}\"".format(
                                        "\",\"".join(synonyms))
                                ent["pch_iupac_name"] = result.iupac_name
                            ent["pch_cids_by_inchikey"] = "\"{}\"".format(
                                ",".join([str(c.cid) for c in results]))
                    except pubchem_errors:
                        pass

                    results = chemspider.search(
                        ent["inchikey"]) if chemspider_token else []
                    if results:
                        if len(results) == 1:
                            result = results[0]
                            ent["chs_common_name"] = result.common_name
                        ent["chs_cids_by_inchikey"] = "\"{}\"".format(
                            ",".join([str(c.csid) for c in results]))
                else:
                    for search_field, col_pch, col_chs in [
                        ("smiles", "pch_cids_by_smiles",
                         "chs_cids_by_smiles"),
                        ("inchi", "pch_cids_by_inchi", "chs_cids_by_inchi")
                    ]:
                        results_pch = []
                        results_chs = []

                        if (search_field == "smiles" and "smiles" in ent
                                and ent["smiles"]
                                and "*" not in ent["smiles"]):
                            try:
                                results_pch = get_compounds(
                                    ent["smiles"], "smiles")
                            except pubchem_errors:
                                pass
                            results_chs = chemspider.search(
                                ent["smiles"]) if chemspider_token else []
                        elif (search_field == "inchi" and "inchi" in ent
                              and ent["inchi"]):
                            try:
                                results_pch = get_compounds(
                                    ent["inchi"], "inchi")
                            except pubchem_errors:
                                pass
                            results_chs = chemspider.search(
                                ent["inchi"]) if chemspider_token else []

                        if results_pch:
                            ent[col_pch] = "\"{}\"".format(",".join(
                                [str(c.cid) for c in results_pch]))
                        if results_chs:
                            ent[col_chs] = "\"{}\"".format(",".join(
                                [str(c.csid) for c in results_chs]))

                # pause between entities before querying the services again
                sleep(0.5)

        if output_file:
            dict_to_csv(to_return["content"],
                        output_file=output_file,
                        csv_delimiter=csv_delimiter,
                        write_header=write_header)

    elif output_file:
        write_empty_file(output_file,
                         csv_delimiter=csv_delimiter,
                         header=list(compound_template_dict.keys()),
                         write_header=write_header)

    return to_return
#!/usr/env/bin python # -*- coding: utf-8 -*- """ unit testing for MolVS steps for PubChem Substances tests include molvs.standardize_smiles Standardizer().normalize Standardizer().disconnect_metals Standardizer().reionize molvs.standardize.canonicalize_tautomer_smiles molvs.validate.Validator() Standardizer().fragment_parent """ import gzip import os.path import unittest from collections import namedtuple import molvs from molvs import Standardizer, validate from rdkit import Chem, RDConfig doLong = False TestData = namedtuple('TestData', 'lineNo,smiles,mol,expected') class TestCase(unittest.TestCase): dataPCS_standardize_smiles100k = os.path.join(RDConfig.RDBaseDir, 'rdkit', 'Chem', 'MolStandardize', 'test_data', '100kPCS_standardize_sm.csv.gz') dataPCS_standardize_smiles1k = os.path.join(RDConfig.RDBaseDir,'rdkit', 'Chem', 'MolStandardize', 'test_data', '1kPCS_standardize_sm.csv.gz') dataPCS_nomralized1k = os.path.join(RDConfig.RDBaseDir,'rdkit', 'Chem', 'MolStandardize', 'test_data', '1kPCS_normalized.csv.gz') dataPCS_nomralized100k = os.path.join(RDConfig.RDBaseDir, 'rdkit', 'Chem', 'MolStandardize', 'test_data', '100kPCS_normalized.csv.gz') dataPCS_metal100k = os.path.join(RDConfig.RDBaseDir,'rdkit', 'Chem', 'MolStandardize', 'test_data', '100kPCS_metals.csv.gz')