def test_keep_biggest():
    """Check which fragment ``Filters.keep_biggest`` retains for multi-fragment inputs."""
    # (multi-fragment SMILES, SMILES expected after filtering)
    smiles_cases = (
        ('CCCC.CC', 'CCCC'),
        ('CCCCC.CC.[H].CCC', 'CCCCC'),
    )
    for source, expected in smiles_cases:
        kept = Filters.keep_biggest(MolFromSmiles(source))
        assert MolToSmiles(kept) == expected

    # (multi-component InChI, InChI expected after filtering)
    inchi_cases = (
        (
            'InChI=1S/C5H12N2O2.C4H7NO4/c6-3-1-2-4(7)5(8)9;5-2(4(8)9)1-3(6)7/h4H,1-3,6-7H2,(H,8,9);2H,1,5H2,(H,6,7)(H,8,9)/t4-;2-/m00/s1',
            'InChI=1S/C4H7NO4/c5-2(4(8)9)1-3(6)7/h2H,1,5H2,(H,6,7)(H,8,9)/t2-/m0/s1',
        ),
        ('InChI=1S/Mo.4O/q;;;2*-1', 'InChI=1S/Mo'),
    )
    for source, expected in inchi_cases:
        kept = Filters.keep_biggest(MolFromInchi(source))
        assert MolToInchi(kept) == expected
def test0InchiWritePubChem(self):
    """Compare generated InChI strings with the reference InChI of each dataset molecule.

    For every molecule in ``self.dataset`` the InChI produced by ``MolToInchi``
    is compared with the entry of ``self.dataset_inchi`` keyed by the molecule's
    ``PUBCHEM_COMPOUND_CID`` property. Mismatches are either classified as
    "reasonable" by the checks below or counted in ``diff``; the three counters
    are finally compared with hard-coded expected values.
    """
    for fp, f in self.dataset.items():
        inchi_db = self.dataset_inchi[fp]
        # Counters are reset for every dataset file.
        same, diff, reasonable = 0, 0, 0
        for m in f:
            if m is None:  # pragma: nocover
                continue
            ref_inchi = inchi_db[m.GetProp('PUBCHEM_COMPOUND_CID')]
            x, y = MolToInchi(m), ref_inchi
            if x != y:
                # print("---------------")
                # print(m.GetProp('PUBCHEM_COMPOUND_CID'))
                # print(MolToSmiles(m))
                # print(y)
                # print(x)
                # A generated InChI containing a ClO4 formula fragment is accepted as a known variance.
                if re.search(r'.[1-9]?ClO4', x) is not None:
                    reasonable += 1
                    continue
                SanitizeMol(m)
                # NOTE(review): on Python 3 ``filter(...)`` returns an iterator object,
                # which is always truthy, so this branch is taken for every mismatch
                # that reaches it and the code below is unreachable. ``any(...)`` over
                # the ring sizes looks intended - confirm before changing, because the
                # expected counts asserted at the end may depend on the current behavior.
                if filter(lambda i: i >= 8, [len(r) for r in m.GetRingInfo().AtomRings()]):
                    reasonable += 1
                    continue
                # THERE ARE NO EXAMPLES FOR THE FOLLOWING (no coverage)
                # if it is because RDKit does not think the bond is stereo
                z = MolToInchi(MolFromMolBlock(MolToMolBlock(m)))
                if y != z and inchiDiffPrefix(y, z) == 'b':
                    reasonable += 1
                    continue
                # some warning
                try:
                    MolToInchi(m, treatWarningAsError=True)
                except InchiReadWriteError as inst:
                    _, error = inst.args
                    if 'Metal' in error:
                        reasonable += 1
                        continue

                # Unexplained mismatch: count it and print diagnostics.
                diff += 1
                print('InChI mismatch for PubChem Compound ' + m.GetProp('PUBCHEM_COMPOUND_CID'))
                print(MolToSmiles(m, True))
                print(inchiDiff(x, y))
                print()

            else:
                same += 1

        fmt = "\n{0}InChI write Summary: {1} identical, {2} suffix variance, {3} reasonable{4}"
        print(fmt.format(COLOR_GREEN, same, diff, reasonable, COLOR_RESET))
        self.assertEqual(same, 1162)
        self.assertEqual(diff, 0)
        self.assertEqual(reasonable, 19)
def _convert_depiction(self, idepic, itype='smiles', otype={'inchikey'}): # Import (if needed) if itype == 'smiles': rdmol = MolFromSmiles(idepic, sanitize=True) elif itype == 'inchi': rdmol = MolFromInchi(idepic, sanitize=True) else: raise NotImplementedError( '"{}" is not a valid input type'.format(itype)) if rdmol is None: # Check imprt raise self.DepictionError( 'Import error from depiction "{}" of type "{}"'.format( idepic, itype)) # Export odepic = dict() for item in otype: if item == 'smiles': odepic[item] = MolToSmiles( rdmol ) # MolToSmiles is tricky, one mays want to check the possible options.. elif item == 'inchi': odepic[item] = MolToInchi(rdmol) elif item == 'inchikey': odepic[item] = MolToInchiKey(rdmol) else: raise NotImplementedError( '"{}" is not a valid output type'.format(otype)) return odepic
def calculate(self):
    """Build and optimize a molecule from the stored SMILES and update the bookkeeping.

    The SMILES is assembled from ``p.config['long_prefix']`` followed by
    ``self.smiles.element``. On success the InChI is stored in the SMILES
    properties, the entry is flagged valid and the "good" counter is
    incremented; otherwise the "bad" counter is incremented.

    :return: the RDKit Mol object, or ``None`` when parsing or any later step failed
    """
    try:
        smiles_text = "".join(p.config['long_prefix']) + "".join(self.smiles.element)
        mol = MolFromSmiles(smiles_text)
        self.smiles.properties[p.s_valid] = False
        if mol is not None:
            mol = AddHs(mol)
            AllChem.EmbedMolecule(mol)
            AllChem.UFFOptimizeMolecule(mol)
            self.smiles.properties["InChI"] = MolToInchi(mol)
    except Exception as err:
        # Any failure in the toolkit calls above is reported and treated as an invalid SMILES.
        print("Error rdkit : " + repr(err))
        mol = None

    succeeded = mol is not None
    if succeeded:
        self.smiles.properties[p.s_valid] = True

    # The counter update and the id assignment happen under the shared lock.
    counter_key = p.info_good if succeeded else p.info_bad
    with p.lock_update_data:
        p.tree_info[counter_key] += 1
        self.smiles.properties[p.s_id] = p.tree_info[counter_key]

    return mol
def convert_depiction(idepic, itype='smiles', otype={'inchikey'}):
    """Convert a chemical depiction into other types of depictions.

    :param idepic: string depiction to be converted, str
    :param itype: type of depiction provided as input ('smiles' or 'inchi'), str
    :param otype: types of depiction to be generated among 'smiles', 'inchi'
        and 'inchikey', {"", "", ..} (the default set is only read, never mutated)
    :return odepic: generated depictions, {"otype1": "odepic1", ..}
    :raises NotImplementedError: if ``itype`` or an item of ``otype`` is not supported
    :raises Exception: if the input parser returns ``None``

    Usage example:
    - convert_depiction(idepic='CCO', otype={'inchi', 'smiles', 'inchikey'})
    - convert_depiction(idepic='InChI=1S/C2H6O/c1-2-3/h3H,2H2,1H3', itype='inchi', otype={'inchi', 'smiles', 'inchikey'})
    """
    # Import (if needed)
    if itype == 'smiles':
        rdmol = MolFromSmiles(idepic, sanitize=True)
    elif itype == 'inchi':
        rdmol = MolFromInchi(idepic, sanitize=True)
    else:
        raise NotImplementedError('"{}" is not a valid input type'.format(itype))
    if rdmol is None:  # Check import
        raise Exception('Import error from depiction "{}" of type "{}"'.format(idepic, itype))

    # Export
    odepic = {}
    for item in otype:
        if item == 'smiles':
            # MolToSmiles is tricky, one may want to check the possible options.
            odepic[item] = MolToSmiles(rdmol)
        elif item == 'inchi':
            odepic[item] = MolToInchi(rdmol)
        elif item == 'inchikey':
            odepic[item] = MolToInchiKey(rdmol)
        else:
            # Report the offending item rather than the whole requested
            # collection, so the message identifies what was wrong.
            raise NotImplementedError('"{}" is not a valid output type'.format(item))

    return odepic
def test_sequence_minimal():
    """The default ``Standardizer`` must reproduce the violacein InChI and the expected SMILES."""
    # Violacein
    violacein_inchi = 'InChI=1S/C20H13N3O3/c24-10-5-6-15-12(7-10)14(9-21-15)17-8-13(19(25)23-17)18-11-3-1-2-4-16(11)22-20(18)26/h1-9,21,24H,(H,22,26)(H,23,25)/b18-13+'
    standardized = Standardizer().compute(MolFromInchi(violacein_inchi))
    # The InChI is expected to survive standardization unchanged.
    assert MolToInchi(standardized) == violacein_inchi
    assert MolToSmiles(standardized) == 'OC1=NC(c2c[nH]c3ccc(O)cc23)=C/C1=C1\\C(O)=Nc2ccccc21'
    # L-Lactate
    mol = MolFromInchi('')
def inchikey(self, m):
    """Return the InChIKey of molecule ``m``, or ``None`` when InChI support is unavailable."""
    if not self.hasInchi:
        #plpy.notice('InChi not available')
        return None
    # Imported lazily so that the InChI functions are only required when ``hasInchi`` is set.
    from rdkit.Chem import MolToInchi, InchiToInchiKey
    inchi_string = MolToInchi(m)
    return InchiToInchiKey(inchi_string)
def test_sequence_rr_legacy():
    """The ``sequence_rr_legacy`` sequence must reproduce the violacein InChI and the expected SMILES."""
    # Violacein
    violacein_inchi = 'InChI=1S/C20H13N3O3/c24-10-5-6-15-12(7-10)14(9-21-15)17-8-13(19(25)23-17)18-11-3-1-2-4-16(11)22-20(18)26/h1-9,21,24H,(H,22,26)(H,23,25)/b18-13+'
    expected_smiles = '[H]OC1=NC(C2=C([H])N([H])C3=C2C([H])=C(O[H])C([H])=C3[H])=C([H])/C1=C1\\C(O[H])=NC2=C([H])C([H])=C([H])C([H])=C21'
    standardizer = Standardizer(sequence_fun='sequence_rr_legacy')
    standardized = standardizer.compute(MolFromInchi(violacein_inchi))
    # The InChI is expected to survive standardization unchanged.
    assert MolToInchi(standardized) == violacein_inchi
    assert MolToSmiles(standardized) == expected_smiles
def testPrechloricAcid(self):
    """Each listed SMILES containing a ClO4 group must convert to the paired InChI."""
    # (input SMILES, expected InChI)
    smiles_and_inchi = (
        ('OCl(=O)(=O)=O', 'InChI=1S/ClHO4/c2-1(3,4)5/h(H,2,3,4,5)'),
        (
            'CC1=CC2=NCC(CN2C=C1)C(=O)c3ccc4cc(C)ccc4c3.OCl(=O)(=O)=O',
            'InChI=1S/C21H20N2O.ClHO4/c1-14-3-4-17-11-18(6-5-16(17)9-14)21(24)19-12-22-20-10-15(2)7-8-23(20)13-19;2-1(3,4)5/h3-11,19H,12-13H2,1-2H3;(H,2,3,4,5)',
        ),
        (
            'CNc1ccc2nc3ccccc3[n+](C)c2c1.[O-]Cl(=O)(=O)=O',
            'InChI=1S/C14H13N3.ClHO4/c1-15-10-7-8-12-14(9-10)17(2)13-6-4-3-5-11(13)16-12;2-1(3,4)5/h3-9H,1-2H3;(H,2,3,4,5)',
        ),
    )
    for source_smiles, wanted_inchi in smiles_and_inchi:
        produced = MolToInchi(MolFromSmiles(source_smiles))
        self.assertEqual(produced, wanted_inchi)
def commute_inchi(cls, mol_in):
    """Round-trip an RDKit compound through its InChI.

    The input molecule is written as InChI and read back (without
    sanitization and without removing hydrogens); the properties of the
    input are then copied onto the new molecule.

    :param mol_in: RDKit Mol
    :return mol_out: RDKit Mol rebuilt from the InChI of ``mol_in``
    :raises ValueError: if no molecule could be rebuilt from the InChI
    """
    # The InChI conversion is talkative, hence logLevel=None on both calls.
    round_tripped = MolFromInchi(
        MolToInchi(mol_in, logLevel=None),
        sanitize=False,
        removeHs=False,
        logLevel=None,
        treatWarningAsError=False,
    )
    if round_tripped:
        # Copy the properties
        cls._copy_properties(mol_in, round_tripped)
        return round_tripped
    raise ValueError("Failed InChi validity filter.")
def _convert_depiction(self, idepic, itype='smiles', otype={'inchikey'}): """Convert chemical depiction to others type of depictions Usage example: - convert_depiction(idepic='CCO', otype={'inchi', 'smiles', 'inchikey'}) - convert_depiction(idepic='InChI=1S/C2H6O/c1-2-3/h3H,2H2,1H3', itype='inchi', otype={'inchi', 'smiles', 'inchikey'}) :param idepic: Input string :param itype: The type of input :param otype: Type of output. Valid options: inchi, smiles, inchikey :type idepic: str :type itype: str :type otype: dict :rtype: dict :return: Dictionnary of results """ # Import (if needed) if itype == 'smiles': rdmol = MolFromSmiles(idepic, sanitize=True) elif itype == 'inchi': rdmol = MolFromInchi(idepic, sanitize=True) else: raise NotImplementedError('"{}" is not a valid input type'.format(itype)) if rdmol is None: # Check imprt raise self.DepictionError('Import error from depiction "{}" of type "{}"'.format(idepic, itype)) # Export odepic = dict() for item in otype: if item == 'smiles': odepic[item] = MolToSmiles(rdmol) # MolToSmiles is tricky, one mays want to check the possible options.. elif item == 'inchi': odepic[item] = MolToInchi(rdmol) elif item == 'inchikey': odepic[item] = MolToInchiKey(rdmol) else: raise NotImplementedError('"{}" is not a valid output type'.format(otype)) return odepic
def test2InchiOptions(self):
    """Passing the "/SUU" option must append the '/b4-3?' layer to the default InChI."""
    mol = MolFromSmiles("CC=C(N)C")
    # Compare everything after the first '/' (i.e. without the leading version prefix).
    default_layers = MolToInchi(mol).split('/', 1)[1]
    suu_layers = MolToInchi(mol, "/SUU").split('/', 1)[1]
    expected_layers = default_layers + '/b4-3?'
    self.assertEqual(expected_layers, suu_layers)
def test1InchiReadPubChem(self):
    """Round-trip every dataset molecule through InChI and classify the outcome.

    For each molecule ``m`` the InChI ``x`` is generated, read back, written
    to SMILES, re-parsed and converted to InChI again (``y``). Identical
    results are counted in ``same``; differences are either accepted as
    "reasonable" by the checks below or counted in ``diff``. The counters are
    finally compared with hard-coded expected values.
    """
    for f in self.dataset.values():
        # Counters are reset for every dataset file.
        same, diff, reasonable = 0, 0, 0
        for m in f:
            if m is None:  # pragma: nocover
                continue
            x = MolToInchi(m)
            y = None
            # Error logging is switched off only around the InChI read-back.
            RDLogger.DisableLog('rdApp.error')
            mol = MolFromInchi(x)
            RDLogger.EnableLog('rdApp.error')
            if mol is not None:
                y = MolToInchi(
                    MolFromSmiles(MolToSmiles(mol, isomericSmiles=True)))
            if y is None:
                # metal involved?
                try:
                    MolToInchi(m, treatWarningAsError=True)
                except InchiReadWriteError as inst:
                    _, error = inst.args
                    if 'Metal' in error or \
                            'Charges were rearranged' in error:
                        reasonable += 1
                        continue
                # THERE ARE NO EXAMPLES FOR THE FOLLOWING (no coverage)
                # RDKit does not like the SMILES? use MolBlock instead
                inchiMol = MolFromInchi(x)
                if inchiMol:
                    rdDepictor.Compute2DCoords(inchiMol)
                    z = MolToInchi(MolFromMolBlock(
                        MolToMolBlock(inchiMol)))
                    if x == z:
                        reasonable += 1
                        continue
                # InChI messed up the radical?
                # Compare the radical-electron count weighted by atomic number
                # between the original molecule and the unsanitized read-back.
                unsanitizedInchiMol = MolFromInchi(x, sanitize=False)
                if sum([
                        a.GetNumRadicalElectrons() * a.GetAtomicNum()
                        for a in m.GetAtoms()
                        if a.GetNumRadicalElectrons() != 0
                ]) != sum([
                        a.GetNumRadicalElectrons() * a.GetAtomicNum()
                        for a in unsanitizedInchiMol.GetAtoms()
                        if a.GetNumRadicalElectrons() != 0
                ]):
                    reasonable += 1
                    continue

                diff += 1
                cid = m.GetProp('PUBCHEM_COMPOUND_CID')
                print(COLOR_GREEN + 'Empty mol for PubChem Compound ' +
                      cid + '\n' + COLOR_RESET)
                continue
            if x != y:
                # if there was warning in the first place, then this is
                # tolerable
                try:
                    MolToInchi(m, treatWarningAsError=True)
                    MolFromInchi(x, treatWarningAsError=True)
                except InchiReadWriteError as inst:
                    reasonable += 1
                    continue
                # or if there are big rings
                SanitizeMol(m)
                # NOTE(review): on Python 3 ``filter(...)`` returns an iterator object,
                # which is always truthy, so this branch is taken for every mismatch
                # that reaches it and the code below is unreachable. ``any(...)`` over
                # the ring sizes looks intended - confirm before changing, because the
                # expected counts asserted at the end may depend on the current behavior.
                if filter(lambda i: i >= 8, [len(r) for r in m.GetRingInfo().AtomRings()]):
                    reasonable += 1
                    continue
                # THERE ARE NO EXAMPLES FOR THE FOLLOWING (no coverage)
                # or if RDKit loses bond stereo
                s = MolToSmiles(m, True)
                if MolToSmiles(MolFromSmiles(s), True) != s:
                    reasonable += 1
                    continue
                # or if it is RDKit SMILES writer unhappy about the mol
                inchiMol = MolFromInchi(x)
                rdDepictor.Compute2DCoords(inchiMol)
                z = MolToInchi(MolFromMolBlock(MolToMolBlock(inchiMol)))
                if x == z:
                    reasonable += 1
                    continue

                diff += 1
                # NOTE(review): ``cid`` is only assigned in the ``y is None`` branch
                # above, which always ends with ``continue``; at this point it is
                # either unbound or left over from an earlier molecule - confirm.
                print(COLOR_GREEN + 'Molecule mismatch for PubChem Compound ' +
                      cid + COLOR_RESET)
                print(inchiDiff(x, y))
                print()
            else:
                same += 1
        fmt = "\n{0}InChI read Summary: {1} identical, {2} variance, {3} reasonable variance{4}"
        print(fmt.format(COLOR_GREEN, same, diff, reasonable, COLOR_RESET))
        self.assertEqual(same, 621)
        self.assertEqual(diff, 0)
        self.assertEqual(reasonable, 560)
def process(self,
            input_text: str = "",
            input_file: str = "",
            output_file: str = "",
            output_file_sdf: str = "",
            sdf_append: bool = False,
            input_type: str = "",
            lang: str = "eng",
            paged_text: bool = False,
            format_output: bool = True,
            opsin_types: list = None,
            standardize_mols: bool = True,
            convert_ions: bool = True,
            write_header: bool = True,
            iob_format: bool = False,
            dry_run: bool = False,
            csv_delimiter: str = ";",
            normalize_text: bool = True,
            remove_duplicates: bool = False,
            annotate: bool = True,
            annotation_sleep: int = 2,
            chemspider_token: str = "",
            continue_on_failure: bool = False) -> OrderedDict:
    r"""
    Process the input file with ChemSpot.

    Parameters
    ----------
    input_text : str
        String to be processed by ChemSpot.
    input_file : str
        Path to file to be processed by ChemSpot.
    output_file : str
        File to write output in.
    output_file_sdf : str
        File to write SDF output in. SDF is from OPSIN converted entities.
    sdf_append : bool
        If True, append new molecules to existing SDF file or create new one if doesn't exist.
        SDF is from OPSIN converted entities.
    input_type : str
        | When empty, input (MIME) type will be determined from magic bytes.
        | Or you can specify "pdf", "pdf_scan", "image" or "text" and magic bytes check will be skipped.
    lang : str
        | Language which will Tesseract use for OCR. Available languages: https://github.com/tesseract-ocr/tessdata
        | Multiple languages can be specified with "+" character, i.e. "eng+bul+fra".
    paged_text : bool
        If True and `input_type` is "text" or `input_text` is provided, try to assign pages to chemical entities.
        ASCII control character 12 (Form Feed, '\f') is expected between pages.
    format_output : bool
        | If True, the value of "content" key of returned dict will be list of OrderedDicts.
        | If True and `output_file` is set, the CSV file will be written.
        | If False, the value of "content" key of returned dict will be None.
    opsin_types : list
        | List of ChemSpot entity types. Entities of types in this list will be converted with OPSIN.
          If you don't want to convert entities, pass empty list.
        | OPSIN is designed to convert IUPAC names to linear notation (SMILES etc.) so default value of
          `opsin_types` is ["SYSTEMATIC"] (these should be only IUPAC names).
        | ChemSpot entity types: "SYSTEMATIC", "IDENTIFIER", "FORMULA", "TRIVIAL", "ABBREVIATION", "FAMILY", "MULTIPLE"
    standardize_mols : bool
        If True, use molvs (https://github.com/mcs07/MolVS) to standardize molecules converted by OPSIN.
    convert_ions : bool
        If True, try to convert ion entities (e.g. "Ni(II)") to SMILES. Entities matching ion regex won't be
        converted with OPSIN.
    write_header : bool
        If True and if `output_file` is set and `output_format` is True, write a CSV write_header:
        "smiles", "bond_length", "resolution", "confidence", "learn", "page", "coordinates"
    iob_format : bool
        If True, output will be in IOB format.
    dry_run : bool
        If True, only return the command that would be run, as a single space-joined string.
    csv_delimiter : str
        Delimiter for output CSV file.
    normalize_text : bool
        If True, normalize text before performing NER. It is strongly recommended to do so, because without
        normalization can ChemSpot produce unpredictable results which cannot be parsed.
    remove_duplicates : bool
        If True, remove duplicated chemical entities. Note that some entities-compounds can have different names,
        but same notation (SMILES, InChI etc.). This will only remove entities with same names.
        Not applicable for IOB format.
    annotate : bool
        | If True, try to annotate entities in PubChem and ChemSpider. Compound IDs will be assigned by searching
          with each identifier, separately for entity name, SMILES etc.
        | If entity has InChI key yet, prefer it in searching.
        | If "*" is present in SMILES, skip annotation.
        | If textual entity has single result in DB when searched by name, fill in missing identifiers (SMILES etc.).
    annotation_sleep: int
        How many seconds to sleep between annotation of each entity. It's for preventing overloading of databases.
    chemspider_token : str
        Your personal token for accessing the ChemSpider API (needed for annotation). Make account there to obtain it.
    continue_on_failure : bool
        | If True, continue running even if ChemSpot returns non-zero exit code.
        | If False and error occurs, print it and return.

    Returns
    -------
    dict
        Keys:

        - stdout: str ... standard output from ChemSpot
        - stderr: str ... standard error output from ChemSpot
        - exit_code: int ... exit code from ChemSpot
        - content

          - list of OrderedDicts ... when `format_output` is True
          - None ... when `format_output` is False

        - normalized_text : str

    Raises
    ------
    ValueError
        If neither `input_text` nor `input_file` is set, or the input type is not one of the accepted values.
    UserWarning
        If `normalize_text` is True and the text is empty after normalization.
    RuntimeError
        If "OutOfMemoryError" appears in the standard error output of ChemSpot.
    """
    # NOTE(review): the return annotation says OrderedDict, but the body builds a
    # plain dict and returns a str when `dry_run` is True.

    if opsin_types is None:
        opsin_types = ["SYSTEMATIC"]

    # --- Input validation -------------------------------------------------
    if input_text and input_file:
        input_file = ""
        self.logger.warning("Both 'input_text' and 'input_file' are set, but 'input_text' will be prefered.")
    elif not input_text and not input_file:
        raise ValueError("One of 'input_text' or 'input_file' must be set.")

    if not input_type and not input_text:
        # No explicit type: detect it from the file ("pdf_scan" is not accepted here).
        possible_input_types = ["pdf", "image", "text"]
        input_type = get_input_file_type(input_file)
        if input_type not in possible_input_types:
            raise ValueError("Input file type ({}) is not one of {}".format(input_type, possible_input_types))
    elif input_type and not input_text:
        possible_input_types = ["pdf", "pdf_scan", "image", "text"]
        if input_type not in possible_input_types:
            raise ValueError("Unknown 'input_type'. Possible 'input_type' values are {}".format(possible_input_types))

    if input_type in ["pdf", "pdf_scan", "image"]:
        # Non-text inputs are first turned into text; from here on only `input_text` is used.
        # Raises KeyError when the TESSDATA_PREFIX environment variable is not set.
        input_text, _ = get_text(input_file, input_type, lang=lang, tessdata_prefix=os.environ["TESSDATA_PREFIX"])
        input_file = ""

    if annotate and not chemspider_token:
        self.logger.warning("Cannot perform annotation in ChemSpider: 'chemspider_token' is empty.")

    # --- Command line construction -----------------------------------------
    # Per-call options (only truthy ones) take precedence over the instance options.
    options = ChainMap({k: v for k, v in {"iob_format": iob_format}.items() if v},
                       self.options_internal)
    output_file_temp = None

    commands, _, _ = self.build_commands(options, self._OPTIONS_REAL, self.path_to_binary)
    commands.insert(1, str(self.options_internal["max_memory"]))
    commands.append("-t")

    # NOTE(review): the NamedTemporaryFile objects created below are never closed
    # explicitly; their cleanup is left to garbage collection.
    if normalize_text:
        normalizer = Normalizer(strip=True, collapse=True, hyphens=True, quotes=True, slashes=True, tildes=True, ellipsis=True)

        if input_file:
            with open(input_file, mode="r") as f:
                input_text = f.read()

        input_text = normalizer(input_text)

        if not input_text:
            raise UserWarning("'input_text' is empty after normalization.")

        input_text = self.normalize_text(text=input_text)
        # The normalized text is handed to ChemSpot through a temporary file.
        input_file_normalized = NamedTemporaryFile(mode="w", encoding="utf-8")
        input_file_normalized.write(input_text)
        input_file_normalized.flush()
        input_file = input_file_normalized.name
    else:
        if input_text:
            input_file_temp = NamedTemporaryFile(mode="w", encoding="utf-8")
            input_file_temp.write(input_text)
            input_file_temp.flush()
            input_file = input_file_temp.name

    commands.append(os.path.abspath(input_file))
    commands.append("-o")
    if format_output:
        # ChemSpot writes to a temporary file which is parsed below.
        output_file_temp = NamedTemporaryFile(mode="w", encoding="utf-8")
        commands.append(os.path.abspath(output_file_temp.name))
    else:
        commands.append(os.path.abspath(output_file))

    if dry_run:
        return " ".join(commands)

    # --- Run ChemSpot -------------------------------------------------------
    stdout, stderr, exit_code = common_subprocess(commands)

    if "OutOfMemoryError" in stderr:
        raise RuntimeError("ChemSpot memory error: {}".format(stderr))

    to_return = {"stdout": stdout, "stderr": stderr, "exit_code": exit_code, "content": None,
                 "normalized_text": input_text if normalize_text else None}

    if not continue_on_failure and exit_code > 0:
        self.logger.warning("ChemSpot error:")
        eprint("\n\t".join("\n{}".format(stderr).splitlines()))
        return to_return

    if normalize_text:
        to_return["normalized_text"] = input_text

    if not format_output:
        return to_return
    elif format_output:
        # NOTE(review): the nesting below was reconstructed from a whitespace-collapsed
        # source; verify the placement of the `else` branches against the original file.
        with open(output_file_temp.name, mode="r", encoding="utf-8") as f:
            output_chs = f.read()

        entities = self.parse_chemspot_iob(text=output_chs) if iob_format else self.parse_chemspot(text=output_chs)
        to_return["content"] = entities

        if remove_duplicates and not iob_format:
            # Order-preserving de-duplication by entity name: `seen_add` returns None,
            # so the `or` both tests membership and records the name.
            seen = set()
            seen_add = seen.add
            to_return["content"] = [x for x in to_return["content"] if not (x["entity"] in seen or seen_add(x["entity"]))]

        if input_type in ["pdf", "pdf_scan"] or paged_text:
            # Cumulative end offsets of the non-empty pages (split on form feed);
            # used below to map an entity start offset to a page number.
            page_ends = []
            for i, page in enumerate(input_text.split("\f")):
                if page.strip():
                    try:
                        page_ends.append(page_ends[-1] + len(page) - 1)
                    except IndexError:
                        page_ends.append(len(page) - 1)

        if opsin_types:
            # Entities of the selected types are converted with OPSIN; ions are
            # excluded when they are converted separately below.
            if convert_ions:
                to_convert = [x["entity"] for x in to_return["content"] if x["type"] in opsin_types and not self.re_ion.match(x["entity"])]
            else:
                to_convert = [x["entity"] for x in to_return["content"] if x["type"] in opsin_types]

            if to_convert:
                opsin = OPSIN(verbosity=self.verbosity)
                opsin_converted = opsin.process(input=to_convert, output_formats=["smiles", "inchi", "inchikey"],
                                                standardize_mols=standardize_mols, output_file_sdf=output_file_sdf,
                                                sdf_append=sdf_append)
                # Consumed with next() in the entity loop, in the same order as `to_convert`.
                opsin_converted = iter(opsin_converted["content"])
            else:
                self.logger.info("Nothing to convert with OPSIN.")

        if annotate:
            chemspider = ChemSpider(chemspider_token) if chemspider_token else None

        for i, ent in enumerate(to_return["content"]):
            if input_type in ["pdf", "pdf_scan"] or paged_text:
                ent["page"] = str(bisect.bisect_left(page_ends, int(ent["start"])) + 1)

            if convert_ions:
                match_ion = self.re_ion.match(ent["entity"])
                if match_ion:
                    match_ion = match_ion.groupdict()
                    match_charge = self.re_charge.search(match_ion["charge"])
                    if match_charge:
                        match_charge = match_charge.groupdict()
                        # Build a bracketed single-atom SMILES from the ion symbol and charge.
                        # NOTE(review): if "digit" matched but the charge text contains
                        # neither "+" nor "-", or none of the groups matched, `smiles` is
                        # not assigned for this entity - confirm whether that can happen.
                        if match_charge["roman"]:
                            smiles = "[{}+{}]".format(match_ion["ion"], len(match_charge["roman"]))
                        elif match_charge["digit"]:
                            if "+" in match_ion["charge"]:
                                smiles = "[{}+{}]".format(match_ion["ion"], match_charge["digit"])
                            elif "-" in match_ion["charge"]:
                                smiles = "[{}-{}]".format(match_ion["ion"], match_charge["digit"])
                        elif match_charge["signs"]:
                            smiles = "[{}{}{}]".format(match_ion["ion"], match_charge["signs"][0], len(match_charge["signs"]))

                        mol = MolFromSmiles(smiles)
                        if mol:
                            inchi = MolToInchi(mol)
                            if inchi:
                                ent.update(OrderedDict(
                                    [("smiles", smiles), ("inchi", inchi), ("inchikey", InchiToInchiKey(inchi))]))
                            else:
                                ent.update(OrderedDict([("smiles", smiles), ("inchi", ""), ("inchikey", "")]))
                        else:
                            ent.update(OrderedDict([("smiles", ""), ("inchi", ""), ("inchikey", "")]))
                    else:
                        ent.update(OrderedDict([("smiles", ""), ("inchi", ""), ("inchikey", "")]))

            if opsin_types and to_convert:
                if ent["entity"] in to_convert:
                    ent_opsin = next(opsin_converted)
                    ent.update(OrderedDict([("smiles", ent_opsin["smiles"]), ("inchi", ent_opsin["inchi"]),
                                            ("inchikey", ent_opsin["inchikey"]), ("opsin_error", ent_opsin["error"])]))
                elif convert_ions and self.re_ion.match(ent["entity"]):
                    ent.update(OrderedDict([("opsin_error", "")]))
                elif (convert_ions and not self.re_ion.match(ent["entity"])) or (not convert_ions and ent["entity"] not in to_convert):
                    ent.update(OrderedDict([("smiles", ""), ("inchi", ""), ("inchikey", ""), ("opsin_error", "")]))

            # TODO: this should be simplified...looks like garbage code
            if annotate:
                self.logger.info("Annotating entity {}/{}...".format(i + 1, len(to_return["content"])))
                # Make sure every annotation column exists, even when nothing is found.
                ent.update(OrderedDict([("pch_cids_by_inchikey", ""), ("chs_cids_by_inchikey", ""),
                                        ("pch_cids_by_name", ""), ("chs_cids_by_name", ""),
                                        ("pch_cids_by_smiles", ""), ("chs_cids_by_smiles", ""),
                                        ("pch_cids_by_inchi", ""), ("chs_cids_by_inchi", ""),
                                        ("pch_cids_by_formula", ""),
                                        ("pch_iupac_name", ""), ("chs_common_name", ""),
                                        ("pch_synonyms", "")]))

                # do "double-annotation": some entities can be found in only one DB, updated and then searched in second DB
                found_in_pch = False
                found_in_chs = False
                for _ in range(2):
                    results = []

                    # prefer InChI key
                    if "inchikey" in ent and ent["inchikey"]:
                        try:
                            results = get_compounds(ent["inchikey"], "inchikey")
                            if results:
                                if len(results) == 1:
                                    result = results[0]
                                    synonyms = result.synonyms
                                    if synonyms:
                                        ent["pch_synonyms"] = "\"{}\"".format("\",\"".join(synonyms))
                                    ent["pch_iupac_name"] = result.iupac_name
                                    if not found_in_chs:
                                        ent["smiles"] = result.canonical_smiles or ent["smiles"]
                                        ent["inchi"] = result.inchi or ent["inchi"]
                                        ent["inchikey"] = result.inchikey or ent["inchikey"]
                                ent["pch_cids_by_inchikey"] = "\"{}\"".format(",".join([str(c.cid) for c in results]))
                        except (BadRequestError, NotFoundError, PubChemHTTPError, ResponseParseError, ServerError, TimeoutError, PubChemPyError):
                            # PubChem lookup failures are ignored; annotation is best-effort.
                            pass

                        results = chemspider.search(ent["inchikey"]) if chemspider_token else []
                        if results:
                            if len(results) == 1:
                                result = results[0]
                                ent["chs_common_name"] = result.common_name
                                if not found_in_pch:
                                    ent["smiles"] = result.smiles or ent["smiles"]
                                    ent["inchi"] = result.stdinchi or ent["inchi"]
                                    ent["inchikey"] = result.stdinchikey or ent["inchikey"]
                            ent["chs_cids_by_inchikey"] = "\"{}\"".format(",".join([str(c.csid) for c in results]))
                    else:
                        # No InChI key: search by name first, then by SMILES / InChI / formula.
                        if (not found_in_pch and not found_in_chs) or (not found_in_pch and found_in_chs):
                            try:
                                results = get_compounds(ent["entity"] or ent["abbreviation"], "name")
                                if results:
                                    if len(results) == 1:
                                        found_in_pch = True
                                        result = results[0]
                                        synonyms = result.synonyms
                                        if synonyms:
                                            ent["pch_synonyms"] = "\"{}\"".format("\",\"".join(synonyms))
                                        # only update identifiers if they weren't found in second DB
                                        if not found_in_chs:
                                            ent["smiles"] = result.canonical_smiles or ent["smiles"]
                                            ent["inchi"] = result.inchi or ent["inchi"]
                                            ent["inchikey"] = result.inchikey or ent["inchikey"]
                                        ent["pch_iupac_name"] = result.iupac_name
                                    ent["pch_cids_by_name"] = "\"{}\"".format(",".join([str(c.cid) for c in results]))
                            except (BadRequestError, NotFoundError, PubChemHTTPError, ResponseParseError, ServerError, TimeoutError, PubChemPyError):
                                pass

                        if (not found_in_pch and not found_in_chs) or (found_in_pch and not found_in_chs):
                            results = chemspider.search(ent["entity"] or ent["abbreviation"]) if chemspider_token else []
                            if results:
                                if len(results) == 1:
                                    found_in_chs = True
                                    result = results[0]
                                    if not found_in_pch:
                                        ent["smiles"] = result.smiles or ent["smiles"]
                                        ent["inchi"] = result.stdinchi or ent["inchi"]
                                        ent["inchikey"] = result.stdinchikey or ent["inchikey"]
                                    ent["chs_common_name"] = result.common_name
                                ent["chs_cids_by_name"] = "\"{}\"".format(",".join([str(c.csid) for c in results]))

                        for search_field, col_pch, col_chs in [("smiles", "pch_cids_by_smiles", "chs_cids_by_smiles"),
                                                               ("inchi", "pch_cids_by_inchi", "chs_cids_by_inchi"),
                                                               ("formula", "pch_cids_by_formula", "")]:
                            results_pch = []
                            results_chs = []

                            if search_field == "smiles" and "smiles" in ent and ent["smiles"] and "*" not in ent["smiles"]:
                                if (not found_in_pch and not found_in_chs) or (not found_in_pch and found_in_chs):
                                    try:
                                        results_pch = get_compounds(ent["smiles"], "smiles")
                                    except (BadRequestError, NotFoundError, PubChemHTTPError, ResponseParseError, ServerError, TimeoutError, PubChemPyError):
                                        pass
                                if (not found_in_pch and not found_in_chs) or (found_in_pch and not found_in_chs):
                                    results_chs = chemspider.search(ent["smiles"]) if chemspider_token else []
                            elif search_field == "inchi" and "inchi" in ent and ent["inchi"]:
                                if (not found_in_pch and not found_in_chs) or (not found_in_pch and found_in_chs):
                                    try:
                                        results_pch = get_compounds(ent["inchi"], "inchi")
                                    except (BadRequestError, NotFoundError, PubChemHTTPError, ResponseParseError, ServerError, TimeoutError, PubChemPyError):
                                        pass
                                if (not found_in_pch and not found_in_chs) or (found_in_pch and not found_in_chs):
                                    results_chs = chemspider.search(ent["inchi"]) if chemspider_token else []
                            elif search_field == "formula":
                                if (not found_in_pch and not found_in_chs) or (not found_in_pch and found_in_chs):
                                    try:
                                        results_pch = get_compounds(ent["entity"], "formula")
                                    except (BadRequestError, NotFoundError, PubChemHTTPError, ResponseParseError, ServerError, TimeoutError, PubChemPyError):
                                        pass
                                # ChemSpider doesn't have search field for 'formula'

                            if results_pch:
                                ent[col_pch] = "\"{}\"".format(",".join([str(c.cid) for c in results_pch]))
                            if results_chs:
                                ent[col_chs] = "\"{}\"".format(",".join([str(c.csid) for c in results_chs]))

                            # Short pause between the individual database queries.
                            sleep(0.5)

                    sleep(annotation_sleep)

                    # A second pass is only useful when the first pass found the entity
                    # by name in at least one database.
                    if not found_in_pch and not found_in_chs:
                        break

        if output_file:
            dict_to_csv(to_return["content"], output_file=output_file, csv_delimiter=csv_delimiter, write_header=write_header)

    return to_return
def process(self,
            input: Union[str, list] = "",
            input_file: str = "",
            output_file: str = "",
            output_file_sdf: str = "",
            output_file_cml: str = "",
            sdf_append: bool = False,
            format_output: bool = True,
            opsin_output_format: str = "",
            output_formats: list = None,
            write_header: bool = True,
            dry_run: bool = False,
            csv_delimiter: str = ";",
            standardize_mols: bool = True,
            normalize_plurals: bool = True,
            continue_on_failure: bool = False) -> OrderedDict:
    r"""
    Process the input file with OPSIN.

    Parameters
    ----------
    input : str or list
        | str: String with IUPAC names, one per line.
        | list: List of IUPAC names.
    input_file : str
        Path to file to be processed by OPSIN. One IUPAC name per line.
        Ignored (with a warning) when `input` is also given.
    output_file : str
        File to write output in.
    output_file_sdf : str
        File to write SDF output in.
    output_file_cml : str
        | File to write CML (Chemical Markup Language) output in. `opsin_output_format` must be "cml".
        | Not supported by RDKit so standardization and conversion to other formats cannot be done.
    sdf_append : bool
        If True, append new molecules to existing SDF file or create new one if doesn't exist.
    format_output : bool
        | If True, the value of "content" key of returned dict will be list of OrderedDicts with keys:
        | "iupac", <output formats>, ..., "error"
        | If True and `output_file` is set it will be created as CSV file with columns: "iupac", <output formats>, ..., "error"
        | If False, the value of "content" key of returned dict will be None.
    opsin_output_format : str
        | Output format from OPSIN. Temporarily overrides the option `output_format` set during instantiation (in __init__).
        | Choices: "cml", "smi", "extendedsmi", "inchi", "stdinchi", "stdinchikey"
    output_formats : list
        | If `format_output` is True, this specifies which molecule formats will be output.
        | You can specify more than one format, but only one format from OPSIN. This format must be also set
          with `output_format` in __init__ or with `opsin_output_format` here.
        | Default value: ["smiles"]
        | Possible values (value -- source -- note):

        - ``smiles`` -- RDKit -- canonical
        - ``smiles_opsin`` -- OPSIN ("smi") -- SMILES
        - ``smiles_extended_opsin`` -- OPSIN ("extendedsmi") -- Extended SMILES. Not supported by RDKit.
        - ``inchi`` -- RDKit -- Not every molecule can be converted to InChI (it doesn`t support wildcard characters etc.)
        - ``inchi_opsin`` -- OPSIN ("inchi") -- InChI
        - ``stdinchi_opsin`` -- OPSIN ("stdinchi") -- standard InChI
        - ``inchikey`` -- RDKit -- The same applies as for "inchi". Also molecule cannot be created from InChI-key.
        - ``stdinchikey_opsin`` -- OPSIN ("stdinchikey") -- Standard InChI-key. Cannot be used by RDKit to create molecule.
        - ``sdf`` -- RDKit -- If present, an additional SDF file will be created.
    write_header : bool
        If True and if `output_file` is set and `format_output` is True, write a CSV header.
    dry_run : bool
        If True, only return the command line (as one string) that would be run; OPSIN is not executed.
    csv_delimiter : str
        Delimiter for output CSV file.
    standardize_mols : bool
        If True and `format_output` is also True, use molvs (https://github.com/mcs07/MolVS) to standardize molecules.
    normalize_plurals : bool
        | If True, normalize plurals ("nitrates" -> "nitrate"). See OPSIN.PLURAL_PATTERNS for relating plurals.
          You can set your own regex pattern with `plural_patterns` in __init__.
    continue_on_failure : bool
        | If True, continue running even if OPSIN returns non-zero exit code.
        | If False and error occurs, print it and return.

    Returns
    -------
    dict
        Keys:

        - stdout: str ... standard output from OPSIN
        - stderr: str ... standard error output from OPSIN
        - exit_code: int ... exit code from OPSIN
        - content:

          - list of OrderedDicts ... when format_output is True. Fields: "iupac", <output formats>, ..., "error"
          - None ... when format_output is False

    Raises
    ------
    ValueError
        If neither `input` nor `input_file` is set, or the OPSIN output format is unknown.
    """

    options_internal = self.options_internal.copy()
    opsin_nonreadable_formats = ["cml", "stdinchikey"]

    if input and input_file:
        input_file = ""
        self.logger.warning(
            "Both 'input' and 'input_file' are set, but 'input' will be prefered."
        )
    elif not input and not input_file:
        raise ValueError("One of 'input' or 'input_file' must be set.")

    # OPSIN output format check
    if opsin_output_format:
        options_internal["output_format"] = opsin_output_format
    else:
        opsin_output_format = options_internal["output_format"]

    opsin_valid_output_formats = {
        "cml": "cml_opsin",
        "smi": "smiles_opsin",
        "extendedsmi": "smiles_extended_opsin",
        "inchi": "inchi_opsin",
        "stdinchi": "stdinchi_opsin",
        "stdinchikey": "stdinchikey_opsin"
    }

    if opsin_output_format not in opsin_valid_output_formats:
        raise ValueError(
            "Unknown OPSIN output format. Possible values: {}".format(
                list(opsin_valid_output_formats.keys())))

    if standardize_mols and opsin_output_format in opsin_nonreadable_formats:
        self.logger.warning(
            "OPSIN output format is \"{}\", which cannot be used by RDKit."
            .format(opsin_output_format))

    # output formats check
    if not output_formats:
        output_formats = ["smiles"]
    elif opsin_output_format == "stdinchikey":
        output_formats = ["stdinchikey_opsin"]
    elif opsin_output_format == "extendedsmi":
        output_formats = ["smiles_extended_opsin"]
    else:
        possible_output_formats = ["smiles", "inchi", "inchikey", "sdf"]
        opsin_own_format = opsin_valid_output_formats[opsin_output_format]
        output_formats = [
            x for x in sorted(set(output_formats))
            if x in possible_output_formats or x == opsin_own_format
        ]

    if normalize_plurals:
        # Normalization works on text, so a file input is read into memory
        # and from here on treated exactly like a string input.
        if input_file:
            with open(input_file, mode="r", encoding="utf-8") as f:
                input = "\n".join([x.strip() for x in f.readlines()])
            input_file = ""
        input = self.normalize_iupac(input)

    commands, _, _ = self.build_commands(options_internal,
                                         self._OPTIONS_REAL,
                                         self.path_to_binary)

    if input_file:
        # Pass the file path itself; `input` is empty on this branch.
        commands.append(input_file)
    elif input:
        if isinstance(input, list):
            input = "\n".join([x.strip() for x in input])
    else:
        raise UserWarning("Input is empty.")

    # A dry run must return before anything is executed.
    if dry_run:
        return " ".join(commands)

    if input_file:
        stdout, stderr, exit_code = common_subprocess(commands)
    else:
        stdout, stderr, exit_code = common_subprocess(commands, stdin=input)

    to_return = {
        "stdout": stdout,
        "stderr": stderr,
        "exit_code": exit_code,
        "content": None
    }

    if not continue_on_failure and exit_code > 0:
        self.logger.warning("OPSIN error:")
        eprint("\n\t".join("\n{}".format(stderr).splitlines()))
        return to_return

    if output_file_cml and opsin_output_format == "cml":
        with open(output_file_cml, mode="w", encoding="utf-8") as f:
            f.write(stdout)
        return to_return
    elif output_file_cml and opsin_output_format != "cml":
        self.logger.warning(
            "Output file for CML is requested, but OPSIN output format is '{}'"
            .format(opsin_output_format))

    if not format_output:
        if output_file:
            with open(output_file, mode="w", encoding="utf-8") as f:
                f.write(stdout)
        return to_return

    compounds = []
    standardizer = Standardizer()
    empty_cols = OrderedDict([(x, "") for x in output_formats])

    writer = None
    sdf_handle = None
    if output_file_sdf:
        if sdf_append:
            # Mode "a" creates the file when it does not exist yet.
            sdf_handle = open(output_file_sdf, mode="a", encoding="utf-8")
            writer = SDWriter(sdf_handle)
        else:
            writer = SDWriter(output_file_sdf)

    stdout = stdout.split("\n")
    del stdout[-1]
    # remove first line of stderr because there is OPSIN message
    stderr = [x.strip() for x in stderr.split("\n")[1:] if x]

    if input_file:
        with open(input_file, mode="r", encoding="utf-8") as f:
            lines = f.readlines()
    else:
        lines = input.split("\n")

    mol_output_template = OrderedDict.fromkeys(["iupac"] + output_formats +
                                               ["error"])

    # Index of the next unread stderr line; advances once per name that
    # OPSIN failed to convert.
    error_index = 0

    try:
        for i, line in enumerate(lines):
            line = line.strip()
            converted = stdout[i].strip()
            mol_output = mol_output_template.copy()

            if not converted:
                try:
                    error = stderr[error_index].strip()
                except IndexError:
                    error = ""

                mol_output.update([("iupac", line), ("error", error)])
                mol_output.update(empty_cols)
                error_index += 1
                compounds.append(mol_output)
                continue

            # These two OPSIN formats cannot be read by RDKit, so the raw
            # OPSIN value is the only output.
            if opsin_output_format == "stdinchikey":
                compounds.append(
                    OrderedDict([("iupac", line),
                                 ("stdinchikey_opsin", converted),
                                 ("error", "")]))
                continue
            elif opsin_output_format == "extendedsmi":
                compounds.append(
                    OrderedDict([("iupac", line),
                                 ("smiles_extended_opsin", converted),
                                 ("error", "")]))
                continue

            # Stays None for formats RDKit cannot parse (e.g. "cml"), which
            # is then reported through the "error" column below.
            mol = None
            if opsin_output_format == "smi":
                mol = MolFromSmiles(
                    converted, sanitize=False if standardize_mols else True)
            elif opsin_output_format in ["inchi", "stdinchi"]:
                mol = MolFromInchi(
                    converted,
                    sanitize=False if standardize_mols else True,
                    removeHs=False if standardize_mols else True)

            if mol:
                if standardize_mols:
                    try:
                        mol = standardizer.standardize(mol)
                    except ValueError as err:
                        self.logger.warning(
                            "Cannot standardize '{}': {}".format(
                                MolToSmiles(mol), str(err)))

                for fmt in output_formats:
                    if fmt == "smiles":
                        mol_output["smiles"] = MolToSmiles(
                            mol, isomericSmiles=True)
                    elif fmt == "smiles_opsin" and opsin_output_format == "smi":
                        mol_output["smiles_opsin"] = converted
                    elif fmt == "inchi":
                        inchi = MolToInchi(mol)
                        if inchi:
                            mol_output["inchi"] = inchi
                        else:
                            mol_output["inchi"] = ""
                            self.logger.warning(
                                "Cannot convert to InChI: {}".format(
                                    converted))
                    elif fmt == "inchi_opsin" and opsin_output_format == "inchi":
                        mol_output["inchi_opsin"] = converted
                    elif fmt == "stdinchi_opsin" and opsin_output_format == "stdinchi":
                        mol_output["stdinchi_opsin"] = converted
                    elif fmt == "inchikey":
                        inchi = MolToInchi(mol)
                        if inchi:
                            mol_output["inchikey"] = InchiToInchiKey(inchi)
                        else:
                            mol_output["inchikey"] = ""
                            self.logger.warning(
                                "Cannot create InChI-key from InChI: {}".
                                format(converted))
                    elif fmt == "stdinchikey_opsin" and opsin_output_format == "stdinchikey":
                        mol_output["stdinchikey_opsin"] = converted
                    elif fmt == "sdf":
                        mol_output["sdf"] = MolToMolBlock(mol,
                                                          includeStereo=True)

                if writer is not None:
                    writer.write(mol)

                mol_output.update(
                    OrderedDict([("iupac", line), ("error", "")]))
            else:
                mol_output.update([
                    ("iupac", line),
                    ("error",
                     "Cannot convert to RDKit mol: {}".format(converted))
                ])
                mol_output.update(empty_cols)
                self.logger.warning(mol_output["error"])

            compounds.append(mol_output)
    finally:
        # Flush and release the SDF output even if a conversion raised.
        if writer is not None:
            writer.close()
        if sdf_handle is not None:
            sdf_handle.close()

    to_return["content"] = compounds

    if output_file and compounds:
        dict_to_csv(to_return["content"],
                    output_file=output_file,
                    csv_delimiter=csv_delimiter,
                    write_header=write_header)
    elif output_file and not compounds:
        write_empty_file(output_file,
                         csv_delimiter=csv_delimiter,
                         header=list(mol_output_template.keys()),
                         write_header=write_header)

    return to_return
def test_sequence_tunable():
    """Check the defaults of ``sequence_tunable`` and its effect on two molecules.

    Each case standardizes one input molecule with a given set of parameter
    overrides and compares the resulting InChI and SMILES to fixed strings.
    """
    # Check default arguments
    spec = inspect.getfullargspec(sequence_tunable)
    defaulted_names = spec.args[-len(spec.defaults):]
    assert dict(zip(defaulted_names, spec.defaults)) == {
        'OP_REMOVE_ISOTOPE': True,
        'OP_NEUTRALISE_CHARGE': True,
        'OP_REMOVE_STEREO': False,
        'OP_COMMUTE_INCHI': False,
        'OP_KEEP_BIGGEST': True,
        'OP_ADD_HYDROGEN': True,
        'OP_KEKULIZE': True,
        'OP_NEUTRALISE_CHARGE_LATE': True
    }

    violacein = 'InChI=1S/C20H13N3O3/c24-10-5-6-15-12(7-10)14(9-21-15)17-8-13(19(25)23-17)18-11-3-1-2-4-16(11)22-20(18)26/h1-9,21,24H,(H,22,26)(H,23,25)/b18-13+'
    violacein_no_stereo = 'InChI=1S/C20H13N3O3/c24-10-5-6-15-12(7-10)14(9-21-15)17-8-13(19(25)23-17)18-11-3-1-2-4-16(11)22-20(18)26/h1-9,21,24H,(H,22,26)(H,23,25)'
    lactate = 'C[C@@H](C(=O)[O-])O'
    lactate_inchi = 'InChI=1S/C3H6O3/c1-2(4)3(5)6/h2,4H,1H3,(H,5,6)/t2-/m0/s1'
    lactate_inchi_no_stereo = 'InChI=1S/C3H6O3/c1-2(4)3(5)6/h2,4H,1H3,(H,5,6)'

    # (reader, input depiction, params override or None, expected InChI, expected SMILES)
    cases = [
        # Violacein, default parameter
        (MolFromInchi, violacein, None, violacein,
         '[H]OC1=NC(C2=C([H])N([H])C3=C2C([H])=C(O[H])C([H])=C3[H])=C([H])/C1=C1\\C(O[H])=NC2=C([H])C([H])=C([H])C([H])=C21'),
        # Violacein, strip stereo
        (MolFromInchi, violacein, {'OP_REMOVE_STEREO': True},
         violacein_no_stereo,
         '[H]OC1=C([H])C2=C(C([H])=C1[H])N([H])C([H])=C2C1=C([H])C(=C2C(=O)N([H])C3=C([H])C([H])=C([H])C([H])=C23)C(=O)N1[H]'),
        # Violacein, implicit Hs
        (MolFromInchi, violacein, {'OP_ADD_HYDROGEN': False}, violacein,
         'OC1=CC2=C(C=C1)NC=C2C1=C/C(=C2/C3=CC=CC=C3N=C2O)C(O)=N1'),
        # Violacein, no kekulization
        (MolFromInchi, violacein, {'OP_KEKULIZE': False}, violacein,
         '[H]OC1=NC(c2c([H])n([H])c3c([H])c([H])c(O[H])c([H])c23)=C([H])/C1=C1\\C(O[H])=Nc2c([H])c([H])c([H])c([H])c21'),
        # Violacein, strip stereo & implicit Hs & no kekulization
        (MolFromInchi, violacein,
         {'OP_REMOVE_STEREO': True, 'OP_ADD_HYDROGEN': False, 'OP_KEKULIZE': False},
         violacein_no_stereo,
         'O=C1NC(c2c[nH]c3ccc(O)cc23)=CC1=C1C(=O)Nc2ccccc21'),
        # Lactate, default parameter
        (MolFromSmiles, lactate, None, lactate_inchi,
         '[H]OC(=O)[C@@]([H])(O[H])C([H])([H])[H]'),
        # L-lactate, implicit Hs
        (MolFromSmiles, lactate, {'OP_ADD_HYDROGEN': False}, lactate_inchi,
         'C[C@H](O)C(=O)O'),
        # L-lactate, no stereo
        (MolFromSmiles, lactate, {'OP_REMOVE_STEREO': True},
         lactate_inchi_no_stereo,
         '[H]OC(=O)C([H])(O[H])C([H])([H])[H]'),
        # L-lactate, no charge neutralisation
        (MolFromSmiles, lactate,
         {'OP_NEUTRALISE_CHARGE': False, 'OP_NEUTRALISE_CHARGE_LATE': False},
         'InChI=1S/C3H6O3/c1-2(4)3(5)6/h2,4H,1H3,(H,5,6)/p-1/t2-/m0/s1',
         '[H]O[C@]([H])(C(=O)[O-])C([H])([H])[H]'),
        # L-lactate, implicit Hs & no stereo
        (MolFromSmiles, lactate,
         {'OP_ADD_HYDROGEN': False, 'OP_REMOVE_STEREO': True},
         lactate_inchi_no_stereo,
         'CC(O)C(=O)O'),
    ]

    for read_mol, depiction, overrides, expected_inchi, expected_smiles in cases:
        # Only pass `params` when the case overrides something, so the
        # default cases exercise the constructor's own default.
        extra = {} if overrides is None else {'params': overrides}
        mol = read_mol(depiction)
        ans = Standardizer(sequence_fun='sequence_tunable', **extra).compute(mol)
        assert MolToInchi(ans) == expected_inchi
        assert MolToSmiles(ans) == expected_smiles
def test_commute_inchi():
    """Check that ``Filters.commute_inchi`` leaves this InChI unchanged on round trip."""
    expected = 'InChI=1S/C3H6O3/c1-2(4)3(5)6/h2,4H,1H3,(H,5,6)/p-1'
    parsed = MolFromInchi(expected)
    commuted = Filters.commute_inchi(parsed)
    assert MolToInchi(commuted) == expected
def process(
        self,
        input_file: str,
        output_file: str = "",
        output_file_sdf: str = "",
        sdf_append: bool = False,
        #images_prefix: str = "",
        format_output: bool = True,
        write_header: bool = True,
        osra_output_format: str = "",
        output_formats: list = None,
        dry_run: bool = False,
        csv_delimiter: str = ";",
        use_gm: bool = True,
        gm_dpi: int = 300,
        gm_trim: bool = True,
        n_jobs: int = -1,
        input_type: str = "",
        standardize_mols: bool = True,
        annotate: bool = True,
        chemspider_token: str = "",
        custom_page: int = 0,
        continue_on_failure: bool = False) -> OrderedDict:
    r"""
    Process the input file with OSRA.

    Parameters
    ----------
    input_file : str
        Path to file to be processed by OSRA.
    output_file : str
        File to write output in.
    output_file_sdf : str
        | File to write SDF output in. "sdf" output format hasn't to be in `output_formats` to write SDF output.
        | If "sdf_osra" output format is requested, suffix "-osra.sdf" will be added.
    sdf_append : bool
        If True, append new molecules to existing SDF file or create new one if doesn't exist.
    NOT IMPLEMENTED | images_prefix : str
        Prefix for images of extracted compounds which will be written.
    format_output : bool
        | If True, the value of "content" key of returned dict will be list of OrderedDicts.
        | If True and `output_file` is set, the CSV file will be written.
        | If False, the value of "content" key of returned dict will be None.
    write_header : bool
        If True and if `output_file` is set and `format_output` is True, write a CSV header.
    osra_output_format : str
        | Output format from OSRA. Temporarily overrides the option `output_format` set during instantiation (in __init__).
        | Choices: "smi", "can", "sdf"
        | If "sdf", additional information like coordinates cannot be retrieved (not implemented yet).
    output_formats : list
        | If `format_output` is True, this specifies which molecule formats will be output.
        | You can specify more than one format, but only one format from OSRA. This format must be also set
          with `output_format` in __init__ or with `osra_output_format` here.
        | When output produced by OSRA is unreadable by RDKit, you can at least have that output from OSRA.
        | Default value: ["smiles"]
        | Possible values (value -- source -- note):

        - ``smiles`` -- RDKit -- canonical
        - ``smiles_osra`` -- OSRA ("smi") -- SMILES
        - ``smiles_can_osra`` -- OSRA ("can") -- canonical SMILES
        - ``inchi`` -- RDKit -- Not every molecule can be converted to InChI (it doesn`t support wildcard characters etc.)
        - ``inchikey`` -- RDKit -- The same applies as for "inchi".
        - ``sdf`` -- RDKit -- If present, an additional SDF file will be created.
        - ``sdf_osra`` -- OSRA ("sdf") -- If present, an additional SDF file will be created.
    dry_run : bool
        If True, only return the command line (as one string) that would be run.
    csv_delimiter : str
        Delimiter for output CSV file.
    use_gm : bool
        | If True, use GraphicsMagick to convert PDF to temporary PNG images before processing.
        | If False, OSRA will use it's own conversion of PDF to image.
        | Using gm is more reliable since OSRA (v2.1.0) is showing wrong information
          when converting directly from PDF (namely: coordinates, bond length and possibly more ones) and also there are sometimes
          incorrectly recognised structures.
    gm_dpi : int
        How many DPI will temporary PNG images have.
    gm_trim : bool
        If True, gm will trim the temporary PNG images.
    n_jobs : int
        | If `use_gm` and input file is PDF, how many jobs to use for OSRA processing of temporary PNG images.
        | If -1 all CPUs are used.
        | If 1 is given, no parallel computing code is used at all, which is useful for debugging.
        | For n_jobs below -1, (n_cpus + 1 + n_jobs) are used. Thus for n_jobs = -2, all CPUs but one are used.
    input_type : str
        | When empty, input (MIME) type will be determined from magic bytes.
        | Or you can specify "pdf" or "image" and magic bytes check will be skipped.
    standardize_mols : bool
        If True and `format_output` is also True, use molvs (https://github.com/mcs07/MolVS) to standardize molecules.
    annotate : bool
        | If True, try to annotate entities in PubChem and ChemSpider. Compound IDs will be assigned by searching with
          each identifier, separately for SMILES, InChI etc.
        | If entity has InChI key yet, prefer it in searching.
        | If "*" is present in SMILES, skip annotation.
    chemspider_token : str
        Your personal token for accessing the ChemSpider API. Make account there to obtain it.
    custom_page : int
        When `use_gm` is False, this will set the page for all extracted compounds.
    continue_on_failure : bool
        | If True, continue running even if OSRA returns non-zero exit code.
        | If False and error occurs, print it and return.

    Returns
    -------
    dict
        Keys:

        - stdout: str ... standard output from OSRA
        - stderr: str ... standard error output from OSRA
        - exit_code: int ... exit code from OSRA
        - content:

          - list of OrderedDicts ... when `format_output` is True.
          - None ... when `format_output` is False

        | If `osra_output_format` is "sdf", additional information like 'bond_length' cannot be retrieved.
        | stdout, stderr and exit_code are lists with one item per image processed by OSRA
          (empty stdout items are left out of the stdout list).

    Raises
    ------
    ValueError
        If the OSRA output format or an explicitly given `input_type` is not supported.

    Notes
    -----
    Only with `format_output` set to True you can use molecule standardization and more molecule formats. Otherwise
    you will only get raw stdout from OSRA (which can also be written to file if `output_file` is set).
    """

    options_internal = self.options_internal.copy()
    osra_smiles_outputs = ["smi", "can"]

    # OSRA output format check
    if osra_output_format:
        options_internal["output_format"] = osra_output_format
    else:
        osra_output_format = options_internal["output_format"]

    osra_valid_output_formats = {
        "can": "smiles_can_osra",
        "smi": "smiles_osra",
        "sdf": "sdf_osra"
    }

    if osra_output_format not in osra_valid_output_formats:
        # The keys are the values a caller may pass, so list those.
        raise ValueError(
            "Unknown OSRA output format. Possible values: {}".format(
                list(osra_valid_output_formats.keys())))

    if osra_output_format == "sdf":
        self.logger.warning(
            "OSRA's output format is set to \"sdf\" so additional information like coordinates cannot be retrieved."
        )

    is_smiles_output = osra_output_format in osra_smiles_outputs

    # output formats check
    is_output_sdf = False
    is_output_sdf_osra = False
    if not output_formats:
        output_formats = ["smiles"]
    else:
        possible_output_formats = ["smiles", "inchi", "inchikey", "sdf"]
        osra_own_format = osra_valid_output_formats[osra_output_format]
        output_formats = [
            x for x in sorted(set(output_formats))
            if x in possible_output_formats or x == osra_own_format
        ]

    if ("sdf" in output_formats
            or "sdf_osra" in output_formats) and not output_file_sdf:
        self.logger.warning(
            "Cannot write SDF output: 'output_file_sdf' is not set.")

    if output_file_sdf:
        is_output_sdf = True

    if "sdf_osra" in output_formats and osra_output_format == "sdf" and output_file_sdf:
        is_output_sdf_osra = True

    if ("smiles_osra" in output_formats or "smiles_can_osra"
            in output_formats) and osra_output_format == "sdf":
        output_formats = [
            x for x in output_formats
            if x not in ("smiles_osra", "smiles_can_osra")
        ]
        self.logger.warning(
            "SMILES or canonical SMILES output from OSRA is requested, but OSRA's output format is \"{}\"."
            .format(osra_output_format))

    # input file type check
    possible_input_types = ["pdf", "image"]
    if not input_type:
        input_type = get_input_file_type(input_file)
        if input_type not in possible_input_types:
            # Unknown type: let OSRA try to read the file directly.
            use_gm = False
            self.logger.warning(
                "Input file MIME type ('{}') is not one of {}. You can specify 'input_type' directly (see docstring)."
                .format(input_type, possible_input_types))
    elif input_type not in possible_input_types:
        raise ValueError("Possible 'input_type' values are {}".format(
            possible_input_types))

    #options = ChainMap({k: v for k, v in {"images_prefix": images_prefix}.items() if v},
    #                   options_internal)

    if annotate:
        if not chemspider_token:
            self.logger.warning(
                "Cannot perform annotation in ChemSpider: 'chemspider_token' is empty."
            )
        # Annotation searches by these identifiers, so make sure they are computed.
        for required in ["smiles", "inchi", "inchikey"]:
            if required not in output_formats:
                output_formats.append(required)
        output_formats = sorted(output_formats)

    commands, _, _ = self.build_commands(options_internal,
                                         self._OPTIONS_REAL,
                                         self.path_to_binary)
    commands.extend(
        ["--bond", "--coordinates", "--page", "--guess", "--print"])

    if dry_run:
        return " ".join(commands)

    osra_output_list = []
    if input_type == "image" or not use_gm:
        osra_output_list.append(
            self._process(input_file,
                          commands,
                          page=custom_page if custom_page else 1))
    elif input_type == "pdf":
        with tempfile.TemporaryDirectory() as temp_dir:
            pdf_to_images(input_file, temp_dir, dpi=gm_dpi, trim=gm_trim)
            osra_output_list = Parallel(n_jobs=n_jobs)(
                delayed(self._process)(temp_image_file, commands, page=page)
                for temp_image_file, page in get_temp_images(temp_dir))

    # summarize OSRA results
    to_return = {
        "stdout": [],
        "stderr": [],
        "exit_code": [],
        "content": None,
        "pages": []
    }

    # Empty stdout is left out of to_return["stdout"], so that list can be
    # shorter than to_return["pages"]. Keep each output paired with its own
    # page here instead of zipping the two lists later.
    outputs_with_pages = []
    for result in osra_output_list:
        if result["stdout"]:
            to_return["stdout"].append(result["stdout"])
            outputs_with_pages.append((result["stdout"], result["page"]))
        to_return["stderr"].append(result["stderr"])
        to_return["exit_code"].append(result["exit_code"])
        to_return["pages"].append(result["page"])

    if not continue_on_failure:
        errors = [(page + 1, error)
                  for page, (exit_code, error) in enumerate(
                      zip(to_return["exit_code"], to_return["stderr"]))
                  if exit_code > 0]
        if errors:
            self.logger.warning("OSRA errors:")
            for page, error in errors:
                eprint("\tError on page {}:".format(page))
                eprint("\n\t\t".join("\n{}".format(error).splitlines()))
            return to_return

    if not format_output:
        if output_file:
            with open(output_file, mode="w", encoding="utf-8") as f:
                f.write("\n".join(to_return["stdout"]))
        return to_return

    # Position of each extra column in a whitespace-split OSRA output line
    # (index 0 is the SMILES itself).
    output_cols = OrderedDict([("bond_length", 1), ("resolution", 2),
                               ("confidence", 3), ("page", 4),
                               ("coordinates", 5)])

    if is_smiles_output:
        compound_template_dict = OrderedDict.fromkeys(
            output_formats + list(output_cols.keys()))
    else:
        compound_template_dict = OrderedDict.fromkeys(["page"] +
                                                      output_formats)

    if any(to_return["stdout"]):
        if standardize_mols:
            standardizer = Standardizer()

        compounds = []

        writer = None
        sdf_handle = None
        if is_output_sdf:
            if sdf_append:
                # Mode "a" creates the file when it does not exist yet.
                sdf_handle = open(output_file_sdf,
                                  mode="a",
                                  encoding="utf-8")
                writer = SDWriter(sdf_handle)
            else:
                writer = SDWriter(output_file_sdf)

        try:
            for output, page in outputs_with_pages:
                if is_smiles_output:
                    lines = [x.strip() for x in output.split("\n") if x]
                else:
                    lines = [x for x in output.split("$$$$") if x.strip()]

                for line in lines:
                    # NOTE: OSRA's --learn option is not handled here; its
                    # output contains spaces (e.g. "1,2,2,2 1") and would
                    # break the plain whitespace split below.
                    if not line:
                        continue

                    if is_smiles_output:
                        line = [x.strip() for x in line.split()]
                        if custom_page:
                            line[output_cols["page"]] = custom_page
                        elif use_gm:
                            line[output_cols["page"]] = page
                        mol = MolFromSmiles(
                            line[0],
                            sanitize=False if standardize_mols else True)
                    else:
                        line = "\n" + line.strip()
                        mol = MolFromMolBlock(
                            line,
                            strictParsing=False,
                            sanitize=False if standardize_mols else True,
                            removeHs=False if standardize_mols else True)

                    if not mol:
                        # In SDF mode `line` is a string, so line[0] would
                        # only be the leading newline.
                        unreadable = line[0] if is_smiles_output else line.strip()
                        self.logger.warning("Cannot convert to RDKit mol: " +
                                            unreadable)
                        continue

                    compound = compound_template_dict.copy()

                    if standardize_mols:
                        try:
                            mol = standardizer.standardize(mol)
                        except ValueError as err:
                            self.logger.warning(
                                "Cannot standardize '{}': {}".format(
                                    MolToSmiles(mol), str(err)))

                    for fmt in output_formats:
                        if fmt == "smiles":
                            compound["smiles"] = MolToSmiles(
                                mol, isomericSmiles=True)
                        elif fmt == "smiles_osra" and osra_output_format == "smi":
                            compound["smiles_osra"] = line[0]
                        elif fmt == "smiles_can_osra" and osra_output_format == "can":
                            compound["smiles_can_osra"] = line[0]
                        elif fmt == "inchi":
                            inchi = MolToInchi(mol)
                            if inchi:
                                compound["inchi"] = inchi
                            else:
                                compound["inchi"] = ""
                                self.logger.warning(
                                    "Cannot convert to InChI: {}".format(
                                        MolToSmiles(mol)))
                        elif fmt == "inchikey":
                            inchi = MolToInchi(mol)
                            if inchi:
                                compound["inchikey"] = InchiToInchiKey(inchi)
                            else:
                                compound["inchikey"] = ""
                                self.logger.warning(
                                    "Cannot create InChI-key from InChI: {}"
                                    .format(MolToSmiles(mol)))
                        elif fmt == "sdf":
                            compound["sdf"] = MolToMolBlock(
                                mol, includeStereo=True)
                        elif fmt == "sdf_osra":
                            compound["sdf_osra"] = line

                    if writer is not None:
                        writer.write(mol)

                    if is_smiles_output:
                        compound.update(zip(output_cols, line[1:]))
                    else:
                        compound["page"] = page if use_gm else custom_page if custom_page else 1

                    compounds.append(compound)

            if is_output_sdf_osra:
                with open(output_file_sdf + "-osra.sdf",
                          mode="w",
                          encoding="utf-8") as f:
                    f.write("".join(to_return["stdout"]))

            to_return["content"] = sorted(compounds, key=lambda x: x["page"])

            if annotate:
                chemspider = ChemSpider(
                    chemspider_token) if chemspider_token else None
                # PubChem lookups are best-effort: any of these means
                # "no annotation for this identifier".
                pubchem_errors = (BadRequestError, NotFoundError,
                                  PubChemHTTPError, ResponseParseError,
                                  ServerError, TimeoutError, PubChemPyError)

                for i, ent in enumerate(to_return["content"]):
                    self.logger.info("Annotating entity {}/{}...".format(
                        i + 1, len(to_return["content"])))
                    ent.update(
                        OrderedDict([("pch_cids_by_inchikey", ""),
                                     ("chs_cids_by_inchikey", ""),
                                     ("pch_cids_by_smiles", ""),
                                     ("chs_cids_by_smiles", ""),
                                     ("pch_cids_by_inchi", ""),
                                     ("chs_cids_by_inchi", ""),
                                     ("pch_iupac_name", ""),
                                     ("chs_common_name", ""),
                                     ("pch_synonyms", "")]))

                    results = []

                    # prefer InChI key
                    if "inchikey" in ent and ent["inchikey"]:
                        try:
                            results = get_compounds(ent["inchikey"],
                                                    "inchikey")
                            if results:
                                # Names are only trusted for an unambiguous hit.
                                if len(results) == 1:
                                    result = results[0]
                                    synonyms = result.synonyms
                                    if synonyms:
                                        ent["pch_synonyms"] = "\"{}\"".format(
                                            "\",\"".join(synonyms))
                                    ent["pch_iupac_name"] = result.iupac_name
                                ent["pch_cids_by_inchikey"] = "\"{}\"".format(
                                    ",".join([str(c.cid) for c in results]))
                        except pubchem_errors:
                            pass

                        results = chemspider.search(
                            ent["inchikey"]) if chemspider_token else []
                        if results:
                            if len(results) == 1:
                                result = results[0]
                                ent["chs_common_name"] = result.common_name
                            ent["chs_cids_by_inchikey"] = "\"{}\"".format(
                                ",".join([str(c.csid) for c in results]))
                    else:
                        for search_field, col_pch, col_chs in [
                            ("smiles", "pch_cids_by_smiles",
                             "chs_cids_by_smiles"),
                            ("inchi", "pch_cids_by_inchi",
                             "chs_cids_by_inchi")
                        ]:
                            results_pch = []
                            results_chs = []

                            if search_field == "smiles" and "smiles" in ent and ent[
                                    "smiles"] and "*" not in ent["smiles"]:
                                try:
                                    results_pch = get_compounds(
                                        ent["smiles"], "smiles")
                                except pubchem_errors:
                                    pass
                                results_chs = chemspider.search(
                                    ent["smiles"]) if chemspider_token else []
                            elif search_field == "inchi" and "inchi" in ent and ent[
                                    "inchi"]:
                                try:
                                    results_pch = get_compounds(
                                        ent["inchi"], "inchi")
                                except pubchem_errors:
                                    pass
                                results_chs = chemspider.search(
                                    ent["inchi"]) if chemspider_token else []

                            if results_pch:
                                ent[col_pch] = "\"{}\"".format(",".join(
                                    [str(c.cid) for c in results_pch]))
                            if results_chs:
                                ent[col_chs] = "\"{}\"".format(",".join(
                                    [str(c.csid) for c in results_chs]))

                    # Pause between entities to go easy on the remote services.
                    sleep(0.5)

            if output_file:
                dict_to_csv(to_return["content"],
                            output_file=output_file,
                            csv_delimiter=csv_delimiter,
                            write_header=write_header)
        finally:
            # Flush and release the SDF output even if processing raised.
            if writer is not None:
                writer.close()
            if sdf_handle is not None:
                sdf_handle.close()
    elif not any(to_return["stdout"]) and output_file:
        write_empty_file(output_file,
                         csv_delimiter=csv_delimiter,
                         header=list(compound_template_dict.keys()),
                         write_header=write_header)

    return to_return
import pandas as pd
from tqdm import tqdm
import os.path as osp
from rdkit.Chem import MolFromSmiles, MolToInchi
import torch

# Flags which rows of the combined train/valid/test tables share an InChI
# with a molecule from the lipop or freesolv CSV files.
if __name__ == '__main__':
    dd_csv_folder = "/scratch/projects/yzlab/group/temp_dd/solvation/calculated/"
    train_csv = pd.read_csv(osp.join(dd_csv_folder, "all.csv"))
    valid_csv = pd.read_csv(osp.join(dd_csv_folder, "valid.csv"))
    test_csv = pd.read_csv(osp.join(dd_csv_folder, "test.csv"))

    lipop_csv = pd.read_csv("lipop.csv")
    lipop_inchi = [
        MolToInchi(MolFromSmiles(s)) for s in lipop_csv["cano_smiles"]
    ]

    freesolv_csv = pd.read_csv("freesolv.csv")
    freesolv_inchi = [
        MolToInchi(MolFromSmiles(s)) for s in freesolv_csv["cano_smiles"]
    ]

    # A set gives constant-time membership tests in the loop below, and
    # building it separately leaves `lipop_inchi` itself unmodified.
    all_inchi = set(lipop_inchi)
    all_inchi.update(freesolv_inchi)

    inchi_exist_map = []
    # concatenate them in this order
    concat_csv = pd.concat([train_csv, valid_csv, test_csv],
                           ignore_index=True)
    for inchi in tqdm(concat_csv["InChI"]):
        if inchi in all_inchi:
            inchi_exist_map.append(1)
def test4MolToInchiKey(self):
    """Check that MolToInchiKey matches InchiToInchiKey applied to MolToInchi."""
    molecule = MolFromSmiles("CC=C(N)C")
    key_via_inchi = InchiToInchiKey(MolToInchi(molecule))
    key_direct = MolToInchiKey(molecule)
    self.assertEqual(key_via_inchi, key_direct)