def split_data(mols, acts, test_percent, split):
    """Split molecules and activities into standardized training and test sets.

    ``train_test_split`` decides which entries belong to the training set.
    The entries are then re-collected in their original input order, using
    the molecule name as the membership key, and every molecule is passed
    through a MolVS ``Standardizer``.

    Parameters
    ----------
    mols : sequence
        Entries indexable as ``(molecule, name)``.
    acts : sequence
        Entries indexable as ``(activity, name)``; entry ``i`` is paired with
        ``mols[i]``.
    test_percent
        Forwarded to ``train_test_split`` as ``test_size``.
    split
        Forwarded to ``train_test_split`` as ``random_state``.

    Returns
    -------
    tuple
        ``(standard_mols_train, molnames_train, acts_train,
        standard_mols_test, molnames_test, acts_test)``.

    Raises
    ------
    AssertionError
        If the molecule names and activity names collected for a set differ.
    """
    # Only the training half of the molecule split is needed: membership in
    # the test set is defined as "not in the training set" below.
    m_train, _, _, _ = train_test_split(
        mols, acts, test_size=test_percent, random_state=split)

    # A set gives O(1) membership tests (the names must be hashable, e.g. str).
    names_train = {entry[1] for entry in m_train}

    mols_train, molnames_train, acts_train, actnames_train = [], [], [], []
    mols_test, molnames_test, acts_test, actnames_test = [], [], [], []

    # Walk the inputs in their original order so that the output order does
    # not depend on the shuffling done by train_test_split.
    for mol_entry, act_entry in zip(mols, acts):
        if mol_entry[1] in names_train:
            mols_train.append(mol_entry[0])
            molnames_train.append(mol_entry[1])
            acts_train.append(act_entry[0])
            actnames_train.append(act_entry[1])
        else:
            mols_test.append(mol_entry[0])
            molnames_test.append(mol_entry[1])
            acts_test.append(act_entry[0])
            actnames_test.append(act_entry[1])

    # Sanity check: molecules and activities must be aligned by name.
    assert molnames_train == actnames_train
    assert molnames_test == actnames_test

    # Standardize structures of the training set and test set
    s = Standardizer()
    standard_mols_train = [s.standardize(mol) for mol in mols_train]
    standard_mols_test = [s.standardize(mol) for mol in mols_test]

    return (standard_mols_train, molnames_train, acts_train,
            standard_mols_test, molnames_test, acts_test)
def filter_salts(in_lines, Verbose=False):
    """Standardize structures, strip salts and drop multi-fragment compounds.

    This should be called before any other filter that looks at molecular
    structures, because it changes both the structure and the molecular
    weight of many compounds.

    Parameters
    ----------
    in_lines : pandas.DataFrame
        Must contain a ``'canonical_smiles'`` column.
    Verbose : bool
        If True, print the number of rows that survive the pass.

    Returns
    -------
    pandas.DataFrame
        A new frame with a fresh 0..n-1 index. ``'canonical_smiles'`` holds
        the standardized, desalted, non-isomeric SMILES. Rows whose SMILES
        cannot be parsed, or which still contain more than one fragment
        after salt removal, are removed. The input frame is not modified.
    """
    s = Standardizer()
    salt_file = conf_dir + '/Salts.txt'
    remover = SaltRemover.SaltRemover(defnFilename=salt_file)

    # Work on a copy so the caller's frame is never partially rewritten.
    out_lines = in_lines.copy()
    rejected = []

    # Iterate over the real index labels instead of assuming 0..n-1.
    for label, smiles_in in in_lines['canonical_smiles'].items():
        mol_in = Chem.MolFromSmiles(smiles_in)
        if mol_in is None:
            # Unparseable SMILES cannot be standardized; filter the row out.
            rejected.append(label)
            continue
        mol_out = s.standardize(mol_in)
        smiles_out = Chem.MolToSmiles(remover(mol_out), isomericSmiles=False)
        if '.' in smiles_out:
            # More than one fragment is left after desalting.
            rejected.append(label)
        else:
            out_lines.loc[label, 'canonical_smiles'] = smiles_out

    # One drop for all rejected rows instead of re-allocating per row.
    out_lines = out_lines.drop(rejected)

    if Verbose:
        print('Number of compounds after desalting pass: ', len(out_lines))
    return out_lines.reset_index(drop=True)
def sanitize_smiles_molvs(smiles, largest_fragment=False):
    """Sanitize a SMILES with MolVS

    Parameters
    ----------
    smiles : str
        SMILES string for a molecule.
    largest_fragment : bool
        Whether to keep only the largest covalent unit of a molecule that
        has several fragments. Default to False.

    Returns
    -------
    str
        SMILES string for the sanitized molecule. If the input cannot be
        parsed it is returned unchanged.
    """
    molvs_std = Standardizer()
    molvs_std.prefer_organic = True

    parsed = Chem.MolFromSmiles(smiles)
    if parsed is None:
        return smiles

    # Assemble the pipeline first; each stage is resolved lazily so that any
    # failure (including attribute lookup) happens inside the try below.
    stages = [lambda m: molvs_std.standardize(m)]  # functional group reps
    if largest_fragment:
        # remove product counterions/salts/etc.
        stages.append(lambda m: molvs_std.largest_fragment(m))
    # neutralize, e.g., carboxylic acids
    stages.append(lambda m: molvs_std.uncharge(m))

    try:
        for stage in stages:
            parsed = stage(parsed)
    except Exception:
        # Best effort: keep the result of the last stage that succeeded.
        pass

    return Chem.MolToSmiles(parsed)
def Tautomerize(mol):
    """Run the MolVS standardizer on ``mol`` and flag it as 'tautomerized'.

    Returns None when ``mol`` already carries a true 'tautomerized' property,
    False when standardization raises ValueError, and True otherwise.
    """
    # A missing property raises KeyError, which means "not processed yet".
    try:
        if mol.GetBoolProp('tautomerized'):
            return
    except KeyError:
        pass
    # SMILES before standardization, kept only for the report below.
    smi1 = Chem.MolToSmiles(mol)
    from molvs import Standardizer
    s = Standardizer()
    try:
        # NOTE(review): the return value is discarded. If standardize()
        # returns a new molecule rather than editing `mol` in place, smi2
        # below will always equal smi1 -- confirm against the MolVS docs.
        s.standardize(mol)
    except ValueError as e:
        # NOTE(review): MutateFail is only constructed here, not raised --
        # presumably an exception class; verify whether `raise` was intended.
        MutateFail(mol)
        return False
    #from molvs.tautomer import TautomerCanonicalizer
    #t = TautomerCanonicalizer()
    #t.canonicalize(mol)
    mol.SetBoolProp('tautomerized', True)
    smi2 = Chem.MolToSmiles(mol)
    # Report only when the SMILES actually changed.
    if not smi1 == smi2:
        print "tautomerized:", smi1, 'to:', smi2
    return True
def standardize_mol(mol_file):
    """Standardize the molecule stored in ``mol_file`` and overwrite the file.

    Parameters
    ----------
    mol_file : str or Path
        Path to an existing mol file.

    Raises
    ------
    RuntimeError
        If ``mol_file`` does not exist.
    """
    if not Path(mol_file).exists():
        raise RuntimeError('File does not exist.')

    # Chem.MolFromMolFile() only works with string, not Path object
    path_str = str(mol_file)
    molecule = Chem.MolFromMolFile(path_str)
    standardized = Standardizer().standardize(molecule)

    with open(path_str, 'w') as handle:
        handle.write(Chem.MolToMolBlock(standardized))
def prepSMI(SMIin, defnFilename, removeMetal=1):
    """Standardize a SMILES, strip salts and return the single remaining fragment.

    Parameters
    ----------
    SMIin : str
        Input SMILES.
    defnFilename : str
        Salt definition file passed to ``SaltRemover``; an empty string
        selects the remover's default definitions.
    removeMetal : int
        When equal to 1, fragments for which ``is_metalorion`` does not
        return 0 are discarded.

    Returns
    -------
    str
        The cleaned SMILES when exactly one distinct fragment remains.
        Otherwise a message starting with ``"Error: "`` that describes why
        no single structure could be produced.
    """
    mol = Chem.MolFromSmiles(SMIin)
    s = Standardizer()
    try:
        molstandardized = s.standardize(mol)
        # Converted only to check that the standardized molecule can be
        # written as SMILES; the string itself is not needed.
        Chem.MolToSmiles(molstandardized)
    except Exception:
        # Exception (not a bare except) so KeyboardInterrupt/SystemExit
        # still propagate.
        return "Error: Standardization Fail"

    # remove salt
    # 1. default
    if defnFilename != "":
        remover = SaltRemover(defnFilename=defnFilename)
    else:
        remover = SaltRemover()
    molclean = remover(molstandardized)
    smilesclean = Chem.MolToSmiles(molclean)

    # 2. Split into fragments; the set collapses identical fragments (the
    #    same salt or compound listed several times).
    fragments = set(smilesclean.split("."))
    fragments.discard("")

    # remove metal
    if removeMetal == 1:
        fragments = {frag for frag in fragments if is_metalorion(frag) == 0}

    if len(fragments) == 1:
        return str(next(iter(fragments)))
    if len(fragments) > 1:
        return "Error: Mixture or fragment ot check: " + smilesclean
    if smilesclean == "":
        return "Error: SMILES empty after preparation"
    return "Error: No identified"
def Tautomerize(mol, aromatic=aromaticity):
    """Return a MolVS-standardized version of ``mol`` flagged 'tautomerized'.

    If ``mol`` already carries a true 'tautomerized' property it is returned
    untouched. If standardization leaves the SMILES unchanged, ``mol`` itself
    is flagged and returned so its properties are kept. Otherwise the new
    molecule is returned, the 'failedfilter' property is copied over when
    present, and the before/after SMILES pair is appended to
    'tautomerized.smi'.

    Raises MutateFail when the standardizer raises ValueError.
    """
    try:
        already_done = mol.GetBoolProp('tautomerized')
    except KeyError:
        # property was never set on this molecule
        already_done = False
    if already_done:
        return mol

    Chem.SanitizeMol(mol)
    if not aromatic and not aromaticity:
        Chem.Kekulize(mol, True)
    smiles_before = Chem.MolToSmiles(mol)

    from molvs import Standardizer
    standardizer = Standardizer()
    try:
        standardized = standardizer.standardize(mol)
    except ValueError:
        raise MutateFail(mol)

    if not aromatic:
        Chem.Kekulize(standardized, True)
    smiles_after = Chem.MolToSmiles(standardized)

    if smiles_before != smiles_after:
        # The structure changed: hand back the new molecule. It is a fresh
        # object, so carry over the property that is copied explicitly.
        if mol.HasProp('failedfilter'):
            standardized.SetProp('failedfilter', mol.GetProp('failedfilter'))
        with open('tautomerized.smi', 'a') as log:
            log.write("{} {}\n".format(smiles_before, smiles_after))
        standardized.SetBoolProp('tautomerized', True)
        return standardized

    # Unchanged: return the original because it still holds its properties.
    mol.SetBoolProp('tautomerized', True)
    return mol
def standardizeMolVS(inMol):
    """Run ``inMol`` through a fixed sequence of MolVS steps and return the result.

    Steps, in order: choose the largest fragment, uncharge, standardize,
    normalize, canonicalize the tautomer.
    """
    largest = fragment.LargestFragmentChooser().choose(inMol)
    neutral = charge.Uncharger().uncharge(largest)
    standardized = Standardizer().standardize(neutral)
    normalized = normalize.Normalizer().normalize(standardized)
    return tautomer.TautomerCanonicalizer().canonicalize(normalized)
def normalize(mol, lout):
    """Standardize ``mol`` with a new ``Standardizer`` and append it to ``lout``.

    The result is delivered through ``lout``; the function returns None.
    """
    standardized = Standardizer().standardize(mol)
    lout.append(standardized)
def standardize_main(args):
    """Read the molecule described by ``args``, standardize it and write it back out."""
    molecule = _read_mol(args)
    _write_mol(Standardizer().standardize(molecule), args)
#pprint (chembl_help) #Chembl standardize for lig in range(0, len(chembl_help)): #print ('Now I do this from Chembl: ' + chembl_help[lig][0]) mol = inchi.MolFromInchi(chembl_help[lig][0], sanitize=False) try: rdmolops.RemoveStereochemistry(mol) except Exception: print("Not able to remove stereochemistry. Chembl.") try: mol = standardise.run(mol) except standardise.StandardiseException as e: logging.warn(e.message) try: mol = s.standardize(mol) except Exception: print("Not able to standardize. Chembl.") try: mol = s.tautomer_parent(mol, skip_standardize=True) except Exception: print("Not able to make tautomer parent. Chembl.") mol = s.stereo_parent(mol, skip_standardize=True) chembl_help[lig][0] = inchi.MolToInchi(mol) #BDB preparing bdb_help = [] list_help = [] conn = psycopg2.connect('dbname=bdb user=data host=/tmp/') curs = conn.cursor() curs.execute(
def process(
        self,
        input_file: str,
        output_file: str = "",
        output_file_sdf: str = "",
        sdf_append: bool = False,
        #images_prefix: str = "",
        format_output: bool = True,
        write_header: bool = True,
        osra_output_format: str = "",
        output_formats: list = None,
        dry_run: bool = False,
        csv_delimiter: str = ";",
        use_gm: bool = True,
        gm_dpi: int = 300,
        gm_trim: bool = True,
        n_jobs: int = -1,
        input_type: str = "",
        standardize_mols: bool = True,
        annotate: bool = True,
        chemspider_token: str = "",
        custom_page: int = 0,
        continue_on_failure: bool = False) -> OrderedDict:
    r"""
    Process the input file with OSRA.

    Parameters
    ----------
    input_file : str
        Path to file to be processed by OSRA.
    output_file : str
        File to write output in.
    output_file_sdf : str
        | File to write SDF output in. The "sdf" output format does not have to be in `output_formats` to write SDF output.
        | If "sdf_osra" output format is requested, suffix "-osra.sdf" will be added.
    sdf_append : bool
        If True, append new molecules to an existing SDF file or create a new one if it doesn't exist.
    NOT IMPLEMENTED | images_prefix : str
        Prefix for images of extracted compounds which will be written.
    format_output : bool
        | If True, the value of "content" key of returned dict will be list of OrderedDicts.
        | If True and `output_file` is set, the CSV file will be written.
        | If False, the value of "content" key of returned dict will be None.
    write_header : bool
        If True and if `output_file` is set and `format_output` is True, write a CSV header.
    osra_output_format : str
        | Output format from OSRA. Temporarily overrides the option `output_format` set during instantiation (in __init__).
        | Choices: "smi", "can", "sdf"
        | If "sdf", additional information like coordinates cannot be retrieved (not implemented yet).
    output_formats : list
        | If set and `format_output` is True, this specifies which molecule formats will be output.
        | You can specify more than one format, but only one format from OSRA. This format must be also set with `output_format` in __init__
          or with `osra_output_format` here.
        | When output produced by OSRA is unreadable by RDKit, you can at least have that output from OSRA.
        | Default value: ["smiles"]

        +-----------------+--------------+--------------------------------------------------------------------------------------------+
        |      Value      |    Source    |                                            Note                                            |
        +=================+==============+============================================================================================+
        | smiles          | RDKit        | canonical                                                                                  |
        +-----------------+--------------+--------------------------------------------------------------------------------------------+
        | smiles_osra     | OSRA ("smi") | SMILES                                                                                     |
        +-----------------+--------------+--------------------------------------------------------------------------------------------+
        | smiles_can_osra | OSRA ("can") | canonical SMILES                                                                           |
        +-----------------+--------------+--------------------------------------------------------------------------------------------+
        | inchi           | RDKit        | Not every molecule can be converted to InChI (it doesn`t support wildcard characters etc.) |
        +-----------------+--------------+--------------------------------------------------------------------------------------------+
        | inchikey        | RDKit        | The same applies as for "inchi".                                                           |
        +-----------------+--------------+--------------------------------------------------------------------------------------------+
        | sdf             | RDKit        | If present, an additional SDF file will be created.                                        |
        +-----------------+--------------+--------------------------------------------------------------------------------------------+
        | sdf_osra        | OSRA ("sdf") | If present, an additional SDF file will be created.                                        |
        +-----------------+--------------+--------------------------------------------------------------------------------------------+
    dry_run : bool
        If True, only build the OSRA command and return it as a single space-joined string; nothing is executed.
    csv_delimiter : str
        Delimiter for output CSV file.
    use_gm : bool
        | If True, use GraphicsMagick to convert PDF to temporary PNG images before processing.
        | If False, OSRA will use its own conversion of PDF to image.
        | Using gm is more reliable since OSRA (v2.1.0) is showing wrong information
          when converting directly from PDF (namely: coordinates, bond length and possibly more ones) and also there are sometimes
          incorrectly recognised structures.
    gm_dpi : int
        How many DPI will temporary PNG images have.
    gm_trim : bool
        If True, gm will trim the temporary PNG images.
    n_jobs : int
        | If `use_gm` and input file is PDF, how many jobs to use for OSRA processing of temporary PNG images.
        | If -1 all CPUs are used.
        | If 1 is given, no parallel computing code is used at all, which is useful for debugging.
        | For n_jobs below -1, (n_cpus + 1 + n_jobs) are used. Thus for n_jobs = -2, all CPUs but one are used.
    input_type : str
        | When empty, input (MIME) type will be determined from magic bytes.
        | Or you can specify "pdf" or "image" and magic bytes check will be skipped.
    standardize_mols : bool
        If True and `format_output` is also True, use molvs (https://github.com/mcs07/MolVS) to standardize molecules.
    annotate : bool
        | If True, try to annotate entities in PubChem and ChemSpider. Compound IDs will be assigned by searching with
          each identifier, separately for SMILES, InChI etc.
        | If entity already has an InChI key, prefer it in searching.
        | If "*" is present in SMILES, skip annotation.
    chemspider_token : str
        Your personal token for accessing the ChemSpider API. Make account there to obtain it.
    custom_page : int
        When `use_gm` is False, this will set the page for all extracted compounds.
    continue_on_failure : bool
        | If True, continue running even if OSRA returns non-zero exit code.
        | If False and error occurs, print it and return.

    Returns
    -------
    dict
        Keys:

        - stdout: str ... standard output from OSRA
        - stderr: str ... standard error output from OSRA
        - exit_code: int ... exit code from OSRA
        - content:

          - list of OrderedDicts ... when `format_output` is True.
          - None ... when `format_output` is False

        - pages: page recorded for each collected OSRA result

        | If `osra_output_format` is "sdf", additional information like 'bond_length' cannot be retrieved.
        | If `use_gm` is True then stdout, stderr and exit_code will be lists containing items from each temporary image
          extracted by OSRA.

    Raises
    ------
    ValueError
        If the effective OSRA output format is unknown, or if `input_type` is given but is neither "pdf" nor "image".

    Notes
    -----
    Only with `format_output` set to True you can use molecule standardization and more molecule formats.
    Otherwise you will only get raw stdout from OSRA (which can also be written to file if `output_file` is set).
    """

    # Work on a copy so a per-call override does not leak into the instance.
    options_internal = self.options_internal.copy()
    osra_smiles_outputs = ["smi", "can"]

    # OSRA output format check
    if osra_output_format:
        options_internal["output_format"] = osra_output_format
    else:
        osra_output_format = options_internal["output_format"]

    # Maps OSRA's own format name to the matching entry of `output_formats`.
    osra_valid_output_formats = {
        "can": "smiles_can_osra",
        "smi": "smiles_osra",
        "sdf": "sdf_osra"
    }

    if osra_output_format not in osra_valid_output_formats:
        raise ValueError(
            "Unknown OSRA output format. Possible values: {}".format(
                osra_valid_output_formats.values()))

    if osra_output_format == "sdf":
        self.logger.warning(
            "OSRA's output format is set to \"sdf\" so additional information like coordinates cannot be retrieved."
        )

    # output formats check
    is_output_sdf = False
    is_output_sdf_osra = False
    if not output_formats:
        output_formats = ["smiles"]
    else:
        output_formats = sorted(list(set(output_formats)))

    # Keep the RDKit-derived formats plus the single OSRA-derived format that
    # matches the OSRA output format in effect; everything else is dropped
    # silently.
    possible_output_formats = ["smiles", "inchi", "inchikey", "sdf"]
    output_formats = [
        x for x in output_formats if x in possible_output_formats
        or x == osra_valid_output_formats[osra_output_format]
    ]

    if ("sdf" in output_formats or "sdf_osra" in output_formats) and not output_file_sdf:
        self.logger.warning(
            "Cannot write SDF output: 'output_file_sdf' is not set.")

    # An SDF file is written whenever a target path is given, regardless of
    # whether "sdf" is listed in `output_formats`.
    if output_file_sdf:
        is_output_sdf = True

    if "sdf_osra" in output_formats and osra_output_format == "sdf" and output_file_sdf:
        is_output_sdf_osra = True

    if ("smiles_osra" in output_formats or "smiles_can_osra" in output_formats) and osra_output_format == "sdf":
        try:
            output_formats.remove("smiles_osra")
        except ValueError:
            pass
        try:
            output_formats.remove("smiles_can_osra")
        except ValueError:
            pass
        self.logger.warning(
            "SMILES or canonical SMILES output from OSRA is requested, but OSRA's output format is \"{}\"."
            .format(osra_output_format))

    # input file type check
    possible_input_types = ["pdf", "image"]
    if not input_type:
        input_type = get_input_file_type(input_file)
        if input_type not in possible_input_types:
            # Unknown detected type: skip the gm conversion and hand the
            # file to OSRA directly.
            use_gm = False
            self.logger.warning(
                "Input file MIME type ('{}') is not one of {}. You can specify 'input_type' directly (see docstring)."
                .format(input_type, possible_input_types))
    elif input_type not in possible_input_types:
        raise ValueError("Possible 'input_type' values are {}".format(
            possible_input_types))

    #options = ChainMap({k: v for k, v in {"images_prefix": images_prefix}.items() if v},
    #                   options_internal)

    if annotate:
        if not chemspider_token:
            self.logger.warning(
                "Cannot perform annotation in ChemSpider: 'chemspider_token' is empty."
            )
        # Annotation searches by SMILES, InChI and InChI key, so make sure
        # all three are computed (list comprehension used for its side effect).
        [
            output_formats.append(x)
            for x in ["smiles", "inchi", "inchikey"]
            if x not in output_formats
        ]
        output_formats = sorted(output_formats)

    commands, _, _ = self.build_commands(options_internal,
                                         self._OPTIONS_REAL,
                                         self.path_to_binary)
    commands.extend(
        ["--bond", "--coordinates", "--page", "--guess", "--print"])

    # NOTE(review): with dry_run the return value is a str, not the dict
    # described in the Returns section / return annotation.
    if dry_run:
        return " ".join(commands)

    osra_output_list = []
    if input_type == "image" or not use_gm:
        osra_output_list.append(
            self._process(input_file,
                          commands,
                          page=custom_page if custom_page else 1))
    elif input_type == "pdf":
        # PDF pages are first rendered to temporary images, then each image
        # is processed by OSRA, possibly in parallel.
        with tempfile.TemporaryDirectory() as temp_dir:
            # NOTE(review): stdout, stderr and exit_code of the conversion
            # are not used afterwards.
            stdout, stderr, exit_code = pdf_to_images(input_file,
                                                      temp_dir,
                                                      dpi=gm_dpi,
                                                      trim=gm_trim)
            osra_output_list = Parallel(n_jobs=n_jobs)(
                delayed(self._process)(
                    temp_image_file, commands, page=page)
                for temp_image_file, page in get_temp_images(temp_dir))

    # summarize OSRA results
    to_return = {
        "stdout": [],
        "stderr": [],
        "exit_code": [],
        "content": None,
        "pages": []
    }

    # Only results that produced some stdout are recorded; the four lists
    # stay index-aligned, which the zip over stdout/pages below relies on.
    for result in osra_output_list:
        if result["stdout"]:
            to_return["stdout"].append(result["stdout"])
            to_return["stderr"].append(result["stderr"])
            to_return["exit_code"].append(result["exit_code"])
            to_return["pages"].append(result["page"])

    if not continue_on_failure:
        # NOTE(review): the reported number is the 1-based position in the
        # collected results, not the value stored in to_return["pages"].
        errors = [(page + 1, error)
                  for page, (exit_code, error) in enumerate(
                      zip(to_return["exit_code"], to_return["stderr"]))
                  if exit_code > 0]
        if errors:
            self.logger.warning("OSRA errors:")
            for page, error in errors:
                eprint("\tError on page {}:".format(page))
                eprint("\n\t\t".join("\n{}".format(error).splitlines()))
            return to_return

    if not format_output:
        # Raw mode: optionally dump OSRA's stdout and stop here.
        if output_file:
            with open(output_file, mode="w", encoding="utf-8") as f:
                f.write("\n".join(to_return["stdout"]))
        return to_return

    # Position of each additional column in a whitespace-split OSRA output
    # line (index 0 is the SMILES itself).
    output_cols = OrderedDict([("bond_length", 1), ("resolution", 2),
                               ("confidence", 3), ("page", 4),
                               ("coordinates", 5)])

    if osra_output_format in osra_smiles_outputs:
        compound_template_dict = OrderedDict.fromkeys(
            output_formats + list(output_cols.keys()))
    else:
        compound_template_dict = OrderedDict.fromkeys(["page"] +
                                                      output_formats)

    if any(to_return["stdout"]):
        if standardize_mols:
            standardizer = Standardizer()

        compounds = []

        if is_output_sdf:
            if sdf_append:
                # Create the file first if needed, then append to it.
                if not os.path.isfile(output_file_sdf):
                    open(output_file_sdf, mode="w",
                         encoding="utf-8").close()
                # NOTE(review): this file object is never closed explicitly
                # here; confirm whether SDWriter.close() closes it.
                writer = SDWriter(
                    open(output_file_sdf, mode="a", encoding="utf-8"))
            else:
                writer = SDWriter(output_file_sdf)

        for output, page in zip(to_return["stdout"], to_return["pages"]):
            # SMILES output is one record per line; SDF output is split on
            # the "$$$$" record terminator.
            if osra_output_format in osra_smiles_outputs:
                lines = [x.strip() for x in output.split("\n") if x]
            else:
                lines = [x for x in output.split("$$$$") if x.strip()]

            for line in lines:
                """
                # so much problems with --learn
                # we can't simply split output by " " when --learn is present, because its output is like "1,2,2,2 1"
                if "learn" in filtered_cols:
                    learn_start = filtered_cols.index("learn") + 1  # "smiles" col isn't in output_cols
                    learn_end = filtered_cols.index("learn") + 1 + 3
                    line[learn_start:learn_end] = [" ".join(line[learn_start:learn_end])]
                """

                if not line:
                    continue

                if osra_output_format in osra_smiles_outputs:
                    line = [x.strip() for x in line.split()]
                    # Override the page reported by OSRA: an explicit
                    # custom_page wins, otherwise the page of the temporary
                    # image is used.
                    if custom_page:
                        line[output_cols["page"]] = custom_page
                    elif use_gm:
                        line[output_cols["page"]] = page
                    # Sanitization is skipped here when the standardizer
                    # will be applied afterwards.
                    mol = MolFromSmiles(
                        line[0],
                        sanitize=False if standardize_mols else True)
                elif osra_output_format == "sdf":
                    line = "\n" + line.strip()
                    mol = MolFromMolBlock(
                        line,
                        strictParsing=False,
                        sanitize=False if standardize_mols else True,
                        removeHs=False if standardize_mols else True)

                if mol:
                    compound = compound_template_dict.copy()

                    if standardize_mols:
                        try:
                            mol = standardizer.standardize(mol)
                        except ValueError as e:
                            # The molecule is kept as parsed and processing
                            # continues.
                            self.logger.warning(
                                "Cannot standardize '{}': {}".format(
                                    MolToSmiles(mol), str(e)))

                    for f in output_formats:
                        if f == "smiles":
                            compound["smiles"] = MolToSmiles(
                                mol, isomericSmiles=True)
                        elif f == "smiles_osra" and osra_output_format == "smi":
                            compound["smiles_osra"] = line[0]
                        elif f == "smiles_can_osra" and osra_output_format == "can":
                            compound["smiles_can_osra"] = line[0]
                        elif f == "inchi":
                            inchi = MolToInchi(mol)
                            if inchi:
                                compound["inchi"] = inchi
                            else:
                                compound["inchi"] = ""
                                self.logger.warning(
                                    "Cannot convert to InChI: {}".format(
                                        MolToSmiles(mol)))
                        elif f == "inchikey":
                            inchi = MolToInchi(mol)
                            if inchi:
                                compound["inchikey"] = InchiToInchiKey(
                                    inchi)
                            else:
                                compound["inchikey"] = ""
                                self.logger.warning(
                                    "Cannot create InChI-key from InChI: {}"
                                    .format(MolToSmiles(mol)))
                        elif f == "sdf":
                            compound["sdf"] = MolToMolBlock(
                                mol, includeStereo=True)
                        elif f == "sdf_osra":
                            compound["sdf_osra"] = line

                    if is_output_sdf:
                        writer.write(mol)

                    if osra_output_format in osra_smiles_outputs:
                        # Pair the column names with the values following
                        # the SMILES on the OSRA output line.
                        compound.update([(x[0], x[1]) for x in zip(
                            list(output_cols.keys()), line[1:])])
                    else:
                        compound[
                            "page"] = page if use_gm else custom_page if custom_page else 1

                    compounds.append(compound)
                else:
                    # NOTE(review): in "sdf" mode `line` is a str, so
                    # line[0] is only its first character.
                    self.logger.warning("Cannot convert to RDKit mol: " +
                                        line[0])

        if is_output_sdf_osra:
            # Raw OSRA SDF output goes to a separate "-osra.sdf" file.
            with open(output_file_sdf + "-osra.sdf",
                      mode="w",
                      encoding="utf-8") as f:
                f.write("".join(to_return["stdout"]))

        to_return["content"] = sorted(compounds, key=lambda x: x["page"])

        if annotate:
            chemspider = ChemSpider(
                chemspider_token) if chemspider_token else None

            for i, ent in enumerate(to_return["content"]):
                self.logger.info("Annotating entity {}/{}...".format(
                    i + 1, len(to_return["content"])))
                # Every entity gets all annotation columns, empty by default.
                ent.update(
                    OrderedDict([("pch_cids_by_inchikey", ""),
                                 ("chs_cids_by_inchikey", ""),
                                 ("pch_cids_by_smiles", ""),
                                 ("chs_cids_by_smiles", ""),
                                 ("pch_cids_by_inchi", ""),
                                 ("chs_cids_by_inchi", ""),
                                 ("pch_iupac_name", ""),
                                 ("chs_common_name", ""),
                                 ("pch_synonyms", "")]))

                results = []

                # prefer InChI key
                if "inchikey" in ent and ent["inchikey"]:
                    try:
                        results = get_compounds(ent["inchikey"], "inchikey")
                        if results:
                            # Names are taken only from an unambiguous
                            # (single) hit; the IDs of all hits are stored.
                            if len(results) == 1:
                                result = results[0]
                                synonyms = result.synonyms
                                if synonyms:
                                    ent["pch_synonyms"] = "\"{}\"".format(
                                        "\",\"".join(synonyms))
                                ent["pch_iupac_name"] = result.iupac_name
                            ent["pch_cids_by_inchikey"] = "\"{}\"".format(
                                ",".join([str(c.cid) for c in results]))
                    except (BadRequestError, NotFoundError,
                            PubChemHTTPError, ResponseParseError,
                            ServerError, TimeoutError, PubChemPyError):
                        # A failed PubChem lookup leaves the columns empty.
                        pass

                    results = chemspider.search(
                        ent["inchikey"]) if chemspider_token else []
                    if results:
                        if len(results) == 1:
                            result = results[0]
                            ent["chs_common_name"] = result.common_name
                        ent["chs_cids_by_inchikey"] = "\"{}\"".format(
                            ",".join([str(c.csid) for c in results]))
                else:
                    # No InChI key available: search by SMILES and by InChI.
                    for search_field, col_pch, col_chs in [
                        ("smiles", "pch_cids_by_smiles",
                         "chs_cids_by_smiles"),
                        ("inchi", "pch_cids_by_inchi", "chs_cids_by_inchi")
                    ]:
                        results_pch = []
                        results_chs = []

                        if search_field == "smiles" and "smiles" in ent and ent[
                                "smiles"] and "*" not in ent["smiles"]:
                            try:
                                results_pch = get_compounds(
                                    ent["smiles"], "smiles")
                            except (BadRequestError, NotFoundError,
                                    PubChemHTTPError, ResponseParseError,
                                    ServerError, TimeoutError,
                                    PubChemPyError):
                                pass
                            results_chs = chemspider.search(
                                ent["smiles"]) if chemspider_token else []
                        elif search_field == "inchi" and "inchi" in ent and ent[
                                "inchi"]:
                            try:
                                results_pch = get_compounds(
                                    ent["inchi"], "inchi")
                            except (BadRequestError, NotFoundError,
                                    PubChemHTTPError, ResponseParseError,
                                    ServerError, TimeoutError,
                                    PubChemPyError):
                                pass
                            results_chs = chemspider.search(
                                ent["inchi"]) if chemspider_token else []

                        if results_pch:
                            ent[col_pch] = "\"{}\"".format(",".join(
                                [str(c.cid) for c in results_pch]))
                        if results_chs:
                            ent[col_chs] = "\"{}\"".format(",".join(
                                [str(c.csid) for c in results_chs]))

                # Pause between entities -- presumably to stay below the
                # request-rate limits of the remote services; TODO confirm.
                sleep(0.5)

        if output_file:
            dict_to_csv(to_return["content"],
                        output_file=output_file,
                        csv_delimiter=csv_delimiter,
                        write_header=write_header)

        if is_output_sdf:
            writer.close()
    elif not any(to_return["stdout"]) and output_file:
        # Nothing was extracted: still produce an output file carrying the
        # expected column names.
        write_empty_file(output_file,
                         csv_delimiter=csv_delimiter,
                         header=list(compound_template_dict.keys()),
                         write_header=write_header)

    return to_return
def __init__(self, dcompound, logfile, writecheck=1, kSMILES="CANONICAL_SMILES", kID="CMPD_CHEMBLID"): self.compound = dcompound loader = pydrug.PyDrug() # if SMILES, load using SMILES code if not kSMILES in dcompound.keys(): try: smile = runExternalSoft.babelConvertSDFtoSMILE( dcompound["sdf"]) self.compound[kSMILES] = smile except: print "ERROR INPUT SDF - l33" self.log = "ERROR" try: logfile.write(self.compound[kID] + "\t---\tERROR-SDF ORIGINAL INPUT\n") except: pass return #Standardize smile code try: smilestandadized = standardize_smiles(self.compound[kSMILES]) except: logfile.write(self.compound[kID] + "\t" + str(self.compound[kSMILES]) + "\tERROR-SMILES INPUT" "\n") self.log = "ERROR" return #Standardize using molvs (http://molvs.readthedocs.io/en/latest/api.html#molvs-fragment) s = Standardizer() mol = Chem.MolFromSmiles(smilestandadized) molstandardized = s.standardize(mol) smilestandadized = Chem.MolToSmiles(molstandardized) # remove salt # 1.default remover = SaltRemover() mol = Chem.MolFromSmiles(smilestandadized) molcleandefault = remover(mol) # 2. Personal remover homeremover = SaltRemover(defnData=LSALT) molclean = homeremover(molcleandefault) smilesclean = Chem.MolToSmiles(molclean) # 3. SMILES remove other manual salts + fragments -> for fragment take one if exactly same compound lelem = smilesclean.split(".") if len(lelem) > 1: # reduce double, case of several salts are included - 255 lelem = list(set(lelem)) for smilesdel in LSMILESREMOVE: if smilesdel in lelem: lelem.remove(smilesdel) try: lelem.remove("") # case of bad smile except: pass if len(lelem) == 1: smilesclean = str(lelem[0]) else: # 4. 
Fragments #Case of fragment -> stock in log file, check after to control logfile.write(self.compound[kID] + "\t" + str(self.compound[kSMILES]) + "\tFRAGMENT IN INPUT" "\n") print ".".join(lelem), " - FRAGMENTS - l66" self.log = "ERROR" return else: pass print self.compound[kSMILES], "SMILES IN - l25 liganddescriptors" print smilesclean, "SMILES without salt and standardized" # case where only salt are included if smilesclean == "": logfile.write(self.compound[kID] + "\t" + str(self.compound[kSMILES]) + "\tEMPTY SMILES AFTER " "STANDARDIZATION\n") print "EMPTY SMILES AFTER STANDARDIZATION - l84" self.log = "ERROR" return self.compound[kSMILES] = smilesclean self.log = "OK" if writecheck == 1: # SMILES code pfileSMILES = pathFolder.PR_COMPOUNDS + str( dcompound[kID]) + ".smi" fileSMILES = open(pfileSMILES, "w") fileSMILES.write(self.compound[kSMILES]) fileSMILES.close() # SDF input if "sdf" in self.compound.keys(): pfileSDF = pathFolder.PR_COMPOUNDS + str( dcompound[kID]) + ".sdf" fileSDF = open(pfileSDF, "w") fileSDF.write(self.compound["sdf"]) fileSDF.close() # read mol self.mol = loader.ReadMolFromSmile(self.compound[kSMILES])
def read_mols(mode, method, basename, datadir='Default', modeldir='Default'):
    """Load molecules, optional activities and the pickled model artifacts.

    Parameters
    ----------
    mode : str
        Modes starting with ``'class'`` read activities from
        ``<basename>.act`` when that file exists; modes starting with
        ``'reg'`` combined with ``method == 'xgb'`` load the significant-bits
        pickle.
    method : str
        Modelling method name (only ``'xgb'`` is treated specially here).
    basename : str
        Stem of the ``.smi`` / ``.act`` files inside ``datadir``.
    datadir : str
        Data directory; ``'Default'`` means ``<cwd>/data``.
    modeldir : str
        Model directory; ``'Default'`` means ``<cwd>/models``.

    Returns
    -------
    dict
        Keys ``'molnames'``, ``'molecules'`` (standardized), ``'model'``,
        ``'inds'``, ``'ad_fps'``, ``'ad_radius'``, plus ``'sigbits'`` (reg/xgb)
        or ``'activities'`` (class mode with an activity file).

    Notes
    -----
    Exits the interpreter with status 2 when an explicitly given directory
    does not exist. The pickle files are loaded with ``pickle.load``; only
    point ``modeldir`` at trusted files.
    """
    import sys

    currworkdir = os.getcwd()
    if datadir == 'Default':
        datadir = os.path.join(currworkdir, 'data')
    elif not os.path.isdir(datadir):
        print("error: ", datadir, " is not a directory. exiting.")
        sys.exit(2)

    if modeldir == 'Default':
        modeldir = os.path.join(currworkdir, 'models')
    elif not os.path.isdir(modeldir):
        print("error: ", modeldir, " is not a directory. exiting.")
        sys.exit(2)
    else:
        print('setting modeldir to ', modeldir, '.')
        print(
            'Have you set the random splits to be correct for the model?')

    moldatafile = os.path.join(datadir, basename + '.smi')
    actdatafile = os.path.join(datadir, basename + '.act')

    # output_ext = "%s_%s_%d_%d" % (mode, method, int(rand_split), int(rand_state))
    # NOTE(review): `output_ext` is not assigned in this function; it has to
    # be provided at module level.
    model_filename = "model_%s.dat" % output_ext
    index_filename = "indices_%s.dat" % output_ext
    appdom_fp_filename = "training-FPs_%s.dat" % output_ext
    appdom_rad_filename = "AD-radius_%s.dat" % output_ext

    def _load_pickle(filename):
        # Load one pickled artifact from modeldir, closing the file afterwards.
        with open(os.path.join(modeldir, filename), 'rb') as f:
            return pickle.load(f)

    is_class_mode = mode.startswith('class')
    is_reg_xgb = mode.startswith('reg') and method == 'xgb'

    # None means "no activities were read" (replaces the locals() lookup).
    activities = None
    if is_class_mode:
        if os.path.isfile(actdatafile):
            activities = []  # array of tuples: (activity, molecule name)
            with open(actdatafile) as actfh:
                for actline in actfh:
                    line = actline.split()
                    if not line:
                        continue  # tolerate blank lines
                    activities.append((float(line[1]), line[0]))
    elif is_reg_xgb:
        significant_bits = _load_pickle("sigbits_%s.dat" % output_ext)

    loaded_model = _load_pickle(model_filename)
    indexes = _load_pickle(index_filename)
    appdom_fps = _load_pickle(appdom_fp_filename)
    appdom_radius = _load_pickle(appdom_rad_filename)

    # Read in molecules from test set
    molecules = []  # array of tuples: (molecule, molecule name)
    with open(moldatafile) as molfh:
        for molline in molfh:
            line = molline.split()
            if not line:
                continue  # tolerate blank lines
            molecules.append((Chem.MolFromSmiles(line[0]), line[1]))

    molnames_train = [name for _, name in molecules]

    # Standardize structures
    s = Standardizer()
    standard_mols_train = [s.standardize(mol) for mol, _ in molecules]

    return_dict = {
        'molnames': molnames_train,
        'molecules': standard_mols_train,
        'model': loaded_model,
        'inds': indexes,
    }
    if is_reg_xgb:
        return_dict['sigbits'] = significant_bits
    elif is_class_mode and activities is not None:
        # Indexing (not slicing) so that a shorter activity file still
        # raises IndexError instead of being silently truncated.
        return_dict['activities'] = [
            activities[i][0] for i in range(len(molecules))
        ]
    return_dict['ad_fps'] = appdom_fps
    return_dict['ad_radius'] = appdom_radius

    return return_dict
def process(self,
            input: Union[str, list] = "",
            input_file: str = "",
            output_file: str = "",
            output_file_sdf: str = "",
            output_file_cml: str = "",
            sdf_append: bool = False,
            format_output: bool = True,
            opsin_output_format: str = "",
            output_formats: list = None,
            write_header: bool = True,
            dry_run: bool = False,
            csv_delimiter: str = ";",
            standardize_mols: bool = True,
            normalize_plurals: bool = True,
            continue_on_failure: bool = False) -> OrderedDict:
    r"""
    Process the input file with OPSIN.

    Parameters
    ----------
    input : str or list
        | str: String with IUPAC names, one per line.
        | list: List of IUPAC names.
    input_file : str
        Path to file to be processed by OPSIN. One IUPAC name per line.
        Ignored (with a warning) when `input` is also given.
    output_file : str
        File to write output in.
    output_file_sdf : str
        File to write SDF output in.
    output_file_cml : str
        | File to write CML (Chemical Markup Language) output in. `opsin_output_format` must be "cml".
        | Not supported by RDKit so standardization and conversion to other formats cannot be done.
    sdf_append : bool
        If True, append new molecules to existing SDF file or create new one if doesn't exist.
    format_output : bool
        | If True, the value of "content" key of returned dict will be list of OrderedDicts with keys:
        | "iupac", <output formats>, ..., "error"
        | If True and `output_file` is set it will be created as CSV file with columns: "iupac", <output formats>, ..., "error"
        | If False, the value of "content" key of returned dict will be None.
    opsin_output_format : str
        | Output format from OPSIN. Temporarily overrides the option `output_format` set during instantiation (in __init__).
        | Choices: "cml", "smi", "extendedsmi", "inchi", "stdinchi", "stdinchikey"
    output_formats : list
        | If `format_output` is True, this specifies which molecule formats will be output.
        | You can specify more than one format, but only one format from OPSIN. This format must be also set with `output_format` in __init__
          or with `opsin_output_format` here.
        | Default value: ["smiles"]

        +-----------------------+-----------------------+--------------------------------------------------------------------------------------------+
        |         Value         |         Source        |                                            Note                                            |
        +=======================+=======================+============================================================================================+
        |         smiles        |         RDKit         | canonical                                                                                  |
        +-----------------------+-----------------------+--------------------------------------------------------------------------------------------+
        |      smiles_opsin     |     OPSIN ("smi")     | SMILES                                                                                     |
        +-----------------------+-----------------------+--------------------------------------------------------------------------------------------+
        | smiles_extended_opsin | OPSIN ("extendedsmi") | Extended SMILES. Not supported by RDKit.                                                   |
        +-----------------------+-----------------------+--------------------------------------------------------------------------------------------+
        |         inchi         |         RDKit         | Not every molecule can be converted to InChI (it doesn`t support wildcard characters etc.) |
        +-----------------------+-----------------------+--------------------------------------------------------------------------------------------+
        |      inchi_opsin      |    OPSIN ("inchi")    | InChI                                                                                      |
        +-----------------------+-----------------------+--------------------------------------------------------------------------------------------+
        |     stdinchi_opsin    |   OPSIN ("stdinchi")  | standard InChI                                                                             |
        +-----------------------+-----------------------+--------------------------------------------------------------------------------------------+
        |        inchikey       |         RDKit         | The same applies as for "inchi". Also molecule cannot be created from InChI-key.           |
        +-----------------------+-----------------------+--------------------------------------------------------------------------------------------+
        |   stdinchikey_opsin   | OPSIN ("stdinchikey") | Standard InChI-key. Cannot be used by RDKit to create molecule.                            |
        +-----------------------+-----------------------+--------------------------------------------------------------------------------------------+
        |          sdf          |         RDKit         | If present, an additional SDF file will be created.                                        |
        +-----------------------+-----------------------+--------------------------------------------------------------------------------------------+
    write_header : bool
        If True and if `output_file` is set and `format_output` is True, write a CSV header.
    dry_run : bool
        If True, only return the command (arguments joined by spaces) that would be passed to
        subprocess; OPSIN is not executed.
    csv_delimiter : str
        Delimiter for output CSV file.
    standardize_mols : bool
        If True and `format_output` is also True, use molvs (https://github.com/mcs07/MolVS) to standardize molecules.
    normalize_plurals : bool
        | If True, normalize plurals ("nitrates" -> "nitrate"). See OPSIN.PLURAL_PATTERNS for relating plurals. You can set your own regex pattern with
          `plural_patterns` in __init__.
    continue_on_failure : bool
        | If True, continue running even if OPSIN returns non-zero exit code.
        | If False and error occurs, print it and return.

    Returns
    -------
    dict
        Keys:

        - stdout: str ... standard output from OPSIN
        - stderr: str ... standard error output from OPSIN
        - exit_code: int ... exit code from OPSIN
        - content:

          - list of OrderedDicts ... when format_output is True. Fields: "iupac", <output formats>, ..., "error"
          - None ... when format_output is False

        When `dry_run` is True a str with the command line is returned instead.

    Raises
    ------
    ValueError
        If neither `input` nor `input_file` is set, or the OPSIN output format is unknown.
    UserWarning
        If the input is empty after reading/normalization.
    """

    options_internal = self.options_internal.copy()
    opsin_nonreadable_formats = ["cml", "stdinchikey"]

    if input and input_file:
        input_file = ""
        self.logger.warning(
            "Both 'input' and 'input_file' are set, but 'input' will be prefered."
        )
    elif not input and not input_file:
        raise ValueError("One of 'input' or 'input_file' must be set.")

    # OPSIN output format check
    if opsin_output_format:
        options_internal["output_format"] = opsin_output_format
    else:
        opsin_output_format = options_internal["output_format"]

    opsin_valid_output_formats = {
        "cml": "cml_opsin",
        "smi": "smiles_opsin",
        "extendedsmi": "smiles_extended_opsin",
        "inchi": "inchi_opsin",
        "stdinchi": "stdinchi_opsin",
        "stdinchikey": "stdinchikey_opsin"
    }

    if opsin_output_format not in opsin_valid_output_formats:
        raise ValueError(
            "Unknown OPSIN output format. Possible values: {}".format(
                list(opsin_valid_output_formats.keys())))

    if standardize_mols and opsin_output_format in opsin_nonreadable_formats:
        self.logger.warning(
            "OPSIN output format is \"{}\", which cannot be used by RDKit."
            .format(opsin_output_format))

    # output formats check
    if not output_formats:
        output_formats = ["smiles"]
    elif opsin_output_format == "stdinchikey":
        output_formats = ["stdinchikey_opsin"]
    elif opsin_output_format == "extendedsmi":
        output_formats = ["smiles_extended_opsin"]
    else:
        possible_output_formats = ["smiles", "inchi", "inchikey", "sdf"]
        opsin_column = opsin_valid_output_formats[opsin_output_format]
        output_formats = [
            x for x in sorted(set(output_formats))
            if x in possible_output_formats or x == opsin_column
        ]

    if normalize_plurals:
        if input_file:
            # Names must pass through normalize_iupac, so the file content is
            # read here and later fed to OPSIN through stdin.
            with open(input_file, mode="r", encoding="utf-8") as f:
                input = "\n".join([x.strip() for x in f.readlines()])
            input_file = ""
        input = self.normalize_iupac(input)

    commands, _, _ = self.build_commands(options_internal,
                                         self._OPTIONS_REAL,
                                         self.path_to_binary)

    if input_file:
        # Hand OPSIN the file path (the original appended the empty `input`).
        commands.append(input_file)
    elif input:
        if isinstance(input, list):
            input = "\n".join([x.strip() for x in input])
    else:
        raise UserWarning("Input is empty.")

    # A dry run must not execute OPSIN, so return before the subprocess call.
    if dry_run:
        return " ".join(commands)

    if input_file:
        stdout, stderr, exit_code = common_subprocess(commands)
    else:
        stdout, stderr, exit_code = common_subprocess(commands, stdin=input)

    to_return = {
        "stdout": stdout,
        "stderr": stderr,
        "exit_code": exit_code,
        "content": None
    }

    if not continue_on_failure and exit_code > 0:
        self.logger.warning("OPSIN error:")
        eprint("\n\t".join("\n{}".format(stderr).splitlines()))
        return to_return

    if output_file_cml and opsin_output_format == "cml":
        with open(output_file_cml, mode="w", encoding="utf-8") as f:
            f.write(stdout)
        return to_return
    elif output_file_cml and opsin_output_format != "cml":
        self.logger.warning(
            "Output file for CML is requested, but OPSIN output format is '{}'"
            .format(opsin_output_format))

    if not format_output:
        if output_file:
            with open(output_file, mode="w", encoding="utf-8") as f:
                f.write(stdout)
        return to_return

    compounds = []
    standardizer = Standardizer()
    empty_cols = OrderedDict([(x, "") for x in output_formats])

    stdout_lines = stdout.split("\n")
    del stdout_lines[-1]  # drop the empty element after the trailing newline
    # remove first line of stderr because there is OPSIN message
    stderr_lines = [x.strip() for x in stderr.split("\n")[1:] if x]

    if input_file:
        with open(input_file, mode="r", encoding="utf-8") as f:
            lines = f.readlines()
    else:
        lines = input.split("\n")

    mol_output_template = OrderedDict.fromkeys(["iupac"] + output_formats +
                                               ["error"])

    writer = None
    sdf_handle = None
    if output_file_sdf:
        if sdf_append:
            # Mode "a" creates the file when it does not exist yet.
            sdf_handle = open(output_file_sdf, mode="a", encoding="utf-8")
            writer = SDWriter(sdf_handle)
        else:
            writer = SDWriter(output_file_sdf)

    try:
        # Cursor into stderr_lines: advanced once per name OPSIN did not convert.
        error_index = 0
        for i, line in enumerate(lines):
            line = line.strip()
            # OPSIN may emit fewer lines than were submitted (e.g. after a
            # failure with continue_on_failure); treat those as unconverted.
            converted = stdout_lines[i].strip() if i < len(stdout_lines) else ""
            mol_output = mol_output_template.copy()

            if not converted:
                try:
                    error = stderr_lines[error_index].strip()
                except IndexError:
                    error = ""
                mol_output.update([("iupac", line), ("error", error)])
                mol_output.update(empty_cols)
                error_index += 1
                compounds.append(mol_output)
                continue

            # Formats RDKit cannot read are passed through untouched.
            if opsin_output_format == "stdinchikey":
                compounds.append(
                    OrderedDict([("iupac", line),
                                 ("stdinchikey_opsin", converted),
                                 ("error", "")]))
                continue
            elif opsin_output_format == "extendedsmi":
                compounds.append(
                    OrderedDict([("iupac", line),
                                 ("smiles_extended_opsin", converted),
                                 ("error", "")]))
                continue

            # Stays None for formats without an RDKit parser (e.g. "cml").
            mol = None
            if opsin_output_format == "smi":
                mol = MolFromSmiles(converted, sanitize=not standardize_mols)
            elif opsin_output_format in ["inchi", "stdinchi"]:
                mol = MolFromInchi(converted,
                                   sanitize=not standardize_mols,
                                   removeHs=not standardize_mols)

            if mol:
                if standardize_mols:
                    try:
                        mol = standardizer.standardize(mol)
                    except ValueError as err:
                        # Keep going with the unstandardized molecule.
                        self.logger.warning(
                            "Cannot standardize '{}': {}".format(
                                MolToSmiles(mol), str(err)))

                for fmt in output_formats:
                    if fmt == "smiles":
                        mol_output["smiles"] = MolToSmiles(
                            mol, isomericSmiles=True)
                    elif fmt == "smiles_opsin" and opsin_output_format == "smi":
                        mol_output["smiles_opsin"] = converted
                    elif fmt == "inchi":
                        inchi = MolToInchi(mol)
                        if inchi:
                            mol_output["inchi"] = inchi
                        else:
                            mol_output["inchi"] = ""
                            self.logger.warning(
                                "Cannot convert to InChI: {}".format(
                                    converted))
                    elif fmt == "inchi_opsin" and opsin_output_format == "inchi":
                        mol_output["inchi_opsin"] = converted
                    elif fmt == "stdinchi_opsin" and opsin_output_format == "stdinchi":
                        mol_output["stdinchi_opsin"] = converted
                    elif fmt == "inchikey":
                        inchi = MolToInchi(mol)
                        if inchi:
                            mol_output["inchikey"] = InchiToInchiKey(inchi)
                        else:
                            mol_output["inchikey"] = ""
                            self.logger.warning(
                                "Cannot create InChI-key from InChI: {}".
                                format(converted))
                    elif fmt == "stdinchikey_opsin" and opsin_output_format == "stdinchikey":
                        mol_output["stdinchikey_opsin"] = converted
                    elif fmt == "sdf":
                        mol_output["sdf"] = MolToMolBlock(mol,
                                                          includeStereo=True)

                # NOTE(review): written once per parsed molecule whenever an
                # SDF file was requested -- confirm against intended nesting.
                if writer is not None:
                    writer.write(mol)

                mol_output.update(
                    OrderedDict([("iupac", line), ("error", "")]))
            else:
                mol_output.update([
                    ("iupac", line),
                    ("error",
                     "Cannot convert to RDKit mol: {}".format(converted))
                ])
                mol_output.update(empty_cols)
                self.logger.warning(mol_output["error"])

            compounds.append(mol_output)
    finally:
        # Flush and release the SDF output even if a conversion step raised.
        if writer is not None:
            writer.close()
        if sdf_handle is not None:
            sdf_handle.close()

    to_return["content"] = compounds

    if output_file and compounds:
        dict_to_csv(to_return["content"],
                    output_file=output_file,
                    csv_delimiter=csv_delimiter,
                    write_header=write_header)
    elif output_file and not compounds:
        write_empty_file(output_file,
                         csv_delimiter=csv_delimiter,
                         header=list(mol_output_template.keys()),
                         write_header=write_header)

    return to_return