def inchikey(self, m):
    """Return the InChIKey for molecule ``m``.

    The molecule is first converted to an InChI string with RDKit and that
    string is then hashed into its InChIKey.  When ``self.hasInchi`` is
    falsy no conversion is attempted and ``None`` is returned.
    """
    if not self.hasInchi:
        #plpy.notice('InChi not available')
        return None
    # Imported lazily so the module can be loaded without RDKit's InChI support.
    from rdkit.Chem import MolToInchi, InchiToInchiKey
    inchi_text = MolToInchi(m)
    return InchiToInchiKey(inchi_text)
def standardise(self):
    """Make sure the standard InChI and InChIKey are populated.

    Nothing is done when ``canonical_smiles`` is already set.  Otherwise
    ``std_ctab`` falls back to ``ctab``, a missing ``standard_inchi`` is
    generated with ``inchiFromPipe`` and ``standard_inchi_key`` is derived
    from it.

    Raises:
        Exception: with message ``"inchi_error"`` when no InChI is available
            after the generation attempt.
    """
    if self.canonical_smiles:
        return

    if not self.std_ctab:
        self.std_ctab = self.ctab

    if not self.standard_inchi:
        self.standard_inchi = inchiFromPipe(
            self.std_ctab, settings.INCHI_BINARIES_LOCATION['1.02'])

    if not self.standard_inchi:
        raise Exception("inchi_error")

    ascii_inchi = self.standard_inchi.encode("ascii")
    self.standard_inchi_key = InchiToInchiKey(ascii_inchi)
def getStructure(mol):
    """Return structure identifiers computed for the molfile ``mol``.

    When ``settings.OPEN_SOURCE`` is truthy the identifiers are computed
    locally and stored under the keys ``'InChI'``, ``'InChIKey'`` and
    ``'Canonical_Smiles'``.  This path is best-effort: if any step fails the
    failure is logged and the keys collected so far are returned (possibly an
    empty dict), so callers must be prepared for missing keys.

    Otherwise ``mol`` is POSTed to the curation endpoint and the decoded JSON
    response is returned.

    Raises:
        Exception: when the curation endpoint answers with a status other
            than 200.
    """
    import logging

    data = dict()
    if settings.OPEN_SOURCE:
        try:
            inchi = inchiFromPipe(mol, settings.INCHI_BINARIES_LOCATION['1.02'])
            data['InChI'] = inchi
            data['InChIKey'] = InchiToInchiKey(inchi)
            data['Canonical_Smiles'] = smilesFromMol(mol)
        except Exception:
            # Deliberately best-effort, but no longer silent: a bare ``except``
            # also trapped KeyboardInterrupt/SystemExit and hid every failure.
            logging.getLogger(__name__).warning(
                "Could not compute structure identifiers locally", exc_info=True)
    else:
        url = '%scuration' % settings.PIPLINE_PILOT_ENDPOINT
        result = requests.post(url, data=mol, timeout=60)
        status = result.status_code
        if status != 200:
            raise Exception("URL %s has status %s for mol %s" % (url, status, mol))
        data = result.json()
    return data
def test3InchiKey(self):
    """A known InChI string must hash to its known InChIKey."""
    expected_key = 'ODLMAHJVESYWTB-UHFFFAOYSA-N'
    source_inchi = 'InChI=1S/C9H12/c1-2-6-9-7-4-3-5-8-9/h3-5,7-8H,2,6H2,1H3'
    computed_key = InchiToInchiKey(source_inchi)
    self.assertEqual(computed_key, expected_key)
def save(self, force_insert=False, force_update=False, *args, **kwargs):
    """Save the structure record, refreshing its InChIKey and notifying listeners.

    Two code paths exist, selected by ``settings.OPEN_SOURCE``.  The
    open-source path derives ``standard_inchi_key`` from the stored
    ``standard_inchi`` and rewrites ``molfile`` through RDKit; the other path
    obtains InChI, InChIKey and canonical SMILES from ``getStructure``.  When
    anything was flagged as changed, the related ``molecule`` is updated and
    saved and the ``structureChanged`` signal is sent.

    Raises:
        NoStandardInchi: when ``standard_inchi`` is empty at the check.
    """
    # NOTE(review): this block was recovered from a whitespace-collapsed
    # source; the nesting below is a best-effort reconstruction and must be
    # confirmed against the upstream file before relying on it.
    changed = False
    # True when no row with this primary key exists yet.
    new = not bool(CompoundStructures.objects.filter(pk=self.pk).count())
    if settings.OPEN_SOURCE:
        if self.molfile:
            if not new: # The structure already exists and we only want to modify it
                super(CompoundStructures, self).save(force_insert, force_update, *args, **kwargs) # this should trigger CMPD_STR_UPDATE_TRIG, which deletes compound images and properties and nulls standard inchi, key, smiles, and molformula
                changed = True
            # newInchi = inchiFromPipe(self.molfile, settings.INCHI_BINARIES_LOCATION['1.02'])
            #if newInchi != self.standard_inchi:
            #    self.standard_inchi = newInchi
            #    changed = True
            # NOTE(review): ``standard_inchi`` is dereferenced here before the
            # emptiness check below, so a ``None`` value would fail on
            # ``.encode`` rather than raise NoStandardInchi -- confirm intent.
            mol = MolFromInchi(self.standard_inchi.encode("ascii"))
            # NOTE(review): the only statement under ``if mol:`` is commented
            # out, leaving the ``if`` without a body; which following
            # statements (if any) were meant to be guarded cannot be
            # determined from the collapsed source -- TODO confirm.
            if mol:
                # self.canonical_smiles = MolToSmiles(mol)
            if not self.standard_inchi:
                raise NoStandardInchi("for CompundStructure, pk = " + str(self.pk))
            newInchiKey = InchiToInchiKey(self.standard_inchi.encode("ascii"))
            if self.standard_inchi_key != newInchiKey:
                self.standard_inchi_key = newInchiKey
                mol = MolFromInchi(self.standard_inchi.encode("ascii"))
                # self.canonical_smiles = MolToSmiles(mol)
                changed = True
            self.molfile = MolToMolBlock(MolFromMolBlock(str(self.molfile))) # This is how we do kekulisation in RDKit...
        self.clean_fields()
        self.validate_unique()
        super(CompoundStructures, self).save(force_insert, force_update, *args, **kwargs)
    else:
        if self.molfile:
            if not new: # The structure already exists and we only want to modify it
                super(CompoundStructures, self).save(force_insert, force_update, *args, **kwargs) # this should trigger CMPD_STR_UPDATE_TRIG, which deletes compound images and properties and nulls standard inchi, key, smiles, and molformula
                changed = True
            # Identifiers come from getStructure(); its result is indexed by
            # 'InChI', 'InChIKey' and 'Canonical_Smiles'.
            data = getStructure(self.molfile)
            newInchi = data['InChI']
            if newInchi != self.standard_inchi:
                self.standard_inchi = newInchi
                self.standard_inchi_key = data['InChIKey']
                #self.molformula = data['Molecular_Formula']
                self.canonical_smiles = data['Canonical_Smiles']
                changed = True
            if not self.standard_inchi:
                raise NoStandardInchi("for CompundStructure, pk = " + str(self.pk))
            # Fall back to computing the key locally when none was provided.
            if not self.standard_inchi_key:
                self.standard_inchi_key = InchiToInchiKey(self.standard_inchi.encode("ascii"))
        self.clean_fields()
        self.validate_unique()
        super(CompoundStructures, self).save(force_insert, force_update, *args, **kwargs)
    if changed:
        # Propagate the new key to the parent molecule and notify listeners.
        self.molecule.structure_key = self.standard_inchi_key
        self.molecule.structure_type = "MOL"
        self.molecule.molfile_update = datetime.now()
        self.molecule.save()
        structureChanged.send(sender=self.__class__, instance=self)
def process(self,
            input_text: str = "",
            input_file: str = "",
            output_file: str = "",
            output_file_sdf: str = "",
            sdf_append: bool = False,
            input_type: str = "",
            lang: str = "eng",
            paged_text: bool = False,
            format_output: bool = True,
            opsin_types: list = None,
            standardize_mols: bool = True,
            convert_ions: bool = True,
            write_header: bool = True,
            iob_format: bool = False,
            dry_run: bool = False,
            csv_delimiter: str = ";",
            normalize_text: bool = True,
            remove_duplicates: bool = False,
            annotate: bool = True,
            annotation_sleep: int = 2,
            chemspider_token: str = "",
            continue_on_failure: bool = False) -> OrderedDict:
    r"""
    Process the input file with ChemSpot.

    Parameters
    ----------
    input_text : str
        String to be processed by ChemSpot.
    input_file : str
        Path to file to be processed by ChemSpot.
    output_file : str
        File to write output in.
    output_file_sdf : str
        File to write SDF output in. SDF is from OPSIN converted entities.
    sdf_append : bool
        If True, append new molecules to existing SDF file or create new one if doesn't exist. SDF is from OPSIN converted entities.
    input_type : str
        | When empty, input (MIME) type will be determined from magic bytes.
        | Or you can specify "pdf", "pdf_scan", "image" or "text" and magic bytes check will be skipped.
    lang : str
        | Language which will Tesseract use for OCR. Available languages: https://github.com/tesseract-ocr/tessdata
        | Multiple languages can be specified with "+" character, i.e. "eng+bul+fra".
    paged_text : bool
        If True and `input_type` is "text" or `input_text` is provided, try to assign pages to chemical entities.
        ASCII control character 12 (Form Feed, '\f') is expected between pages.
    format_output : bool
        | If True, the value of "content" key of returned dict will be list of OrderedDicts.
        | If True and `output_file` is set, the CSV file will be written.
        | If False, the value of "content" key of returned dict will be None.
    opsin_types : list
        | List of ChemSpot entity types. Entities of types in this list will be converted with OPSIN. If you don't want
          to convert entities, pass empty list.
        | OPSIN is designed to convert IUPAC names to linear notation (SMILES etc.) so default value of `opsin_types`
          is ["SYSTEMATIC"] (these should be only IUPAC names).
        | ChemSpot entity types: "SYSTEMATIC", "IDENTIFIER", "FORMULA", "TRIVIAL", "ABBREVIATION", "FAMILY", "MULTIPLE"
    standardize_mols : bool
        If True, use molvs (https://github.com/mcs07/MolVS) to standardize molecules converted by OPSIN.
    convert_ions : bool
        If True, try to convert ion entities (e.g. "Ni(II)") to SMILES. Entities matching ion regex won't be converted with OPSIN.
    write_header : bool
        If True and if `output_file` is set and `format_output` is True, write a CSV write_header:
        "smiles", "bond_length", "resolution", "confidence", "learn", "page", "coordinates"
    iob_format : bool
        If True, output will be in IOB format.
    dry_run : bool
        If True, only return list of commands to be called by subprocess.
    csv_delimiter : str
        Delimiter for output CSV file.
    normalize_text : bool
        If True, normalize text before performing NER. It is strongly recommended to do so, because without normalization
        can ChemSpot produce unpredictable results which cannot be parsed.
    remove_duplicates : bool
        If True, remove duplicated chemical entities. Note that some entities-compounds can have different names, but
        same notation (SMILES, InChI etc.). This will only remove entities with same names. Not applicable for IOB format.
    annotate : bool
        | If True, try to annotate entities in PubChem and ChemSpider. Compound IDs will be assigned by searching with
          each identifier, separately for entity name, SMILES etc.
        | If entity has InChI key yet, prefer it in searching.
        | If "*" is present in SMILES, skip annotation.
        | If textual entity has single result in DB when searched by name, fill in missing identifiers (SMILES etc.).
    annotation_sleep: int
        How many seconds to sleep between annotation of each entity. It's for preventing overloading of databases.
    chemspider_token : str
        Your personal token for accessing the ChemSpider API (needed for annotation). Make account there to obtain it.
    continue_on_failure : bool
        | If True, continue running even if ChemSpot returns non-zero exit code.
        | If False and error occurs, print it and return.

    Returns
    -------
    dict
        Keys:

        - stdout: str ... standard output from ChemSpot
        - stderr: str ... standard error output from ChemSpot
        - exit_code: int ... exit code from ChemSpot
        - content

          - list of OrderedDicts ... when `format_output` is True
          - None ... when `format_output` is False

        - normalized_text : str

        When `dry_run` is True, the command line joined into a single string is returned instead.
    """

    # NOTE(review): this block was recovered from a whitespace-collapsed
    # source; the nesting below is a best-effort reconstruction and should be
    # confirmed against the upstream file.

    if opsin_types is None:
        opsin_types = ["SYSTEMATIC"]

    # Exactly one input source is used; 'input_text' wins when both are given.
    if input_text and input_file:
        input_file = ""
        self.logger.warning("Both 'input_text' and 'input_file' are set, but 'input_text' will be prefered.")
    elif not input_text and not input_file:
        raise ValueError("One of 'input_text' or 'input_file' must be set.")

    # Detect or validate the input type (only relevant for file input).
    if not input_type and not input_text:
        possible_input_types = ["pdf", "image", "text"]
        input_type = get_input_file_type(input_file)
        if input_type not in possible_input_types:
            raise ValueError("Input file type ({}) is not one of {}".format(input_type, possible_input_types))
    elif input_type and not input_text:
        possible_input_types = ["pdf", "pdf_scan", "image", "text"]
        if input_type not in possible_input_types:
            raise ValueError("Unknown 'input_type'. Possible 'input_type' values are {}".format(possible_input_types))

    # Non-text inputs are converted to text first; from here on the text is the input.
    if input_type in ["pdf", "pdf_scan", "image"]:
        input_text, _ = get_text(input_file, input_type, lang=lang, tessdata_prefix=os.environ["TESSDATA_PREFIX"])
        input_file = ""

    if annotate and not chemspider_token:
        self.logger.warning("Cannot perform annotation in ChemSpider: 'chemspider_token' is empty.")

    # Per-call options are layered over the instance-level options.
    options = ChainMap({k: v for k, v in {"iob_format": iob_format}.items() if v}, self.options_internal)
    output_file_temp = None

    commands, _, _ = self.build_commands(options, self._OPTIONS_REAL, self.path_to_binary)
    commands.insert(1, str(self.options_internal["max_memory"]))
    commands.append("-t")

    # ChemSpot is always given a file path; text input is written to a temporary file.
    if normalize_text:
        normalizer = Normalizer(strip=True, collapse=True, hyphens=True, quotes=True, slashes=True, tildes=True, ellipsis=True)

        if input_file:
            with open(input_file, mode="r") as f:
                input_text = f.read()

        input_text = normalizer(input_text)

        if not input_text:
            raise UserWarning("'input_text' is empty after normalization.")

        input_text = self.normalize_text(text=input_text)
        input_file_normalized = NamedTemporaryFile(mode="w", encoding="utf-8")
        input_file_normalized.write(input_text)
        input_file_normalized.flush()
        input_file = input_file_normalized.name
    else:
        if input_text:
            input_file_temp = NamedTemporaryFile(mode="w", encoding="utf-8")
            input_file_temp.write(input_text)
            input_file_temp.flush()
            input_file = input_file_temp.name

    commands.append(os.path.abspath(input_file))
    commands.append("-o")
    # When the output is post-processed, ChemSpot writes to a temporary file that is parsed below.
    if format_output:
        output_file_temp = NamedTemporaryFile(mode="w", encoding="utf-8")
        commands.append(os.path.abspath(output_file_temp.name))
    else:
        commands.append(os.path.abspath(output_file))

    if dry_run:
        return " ".join(commands)

    stdout, stderr, exit_code = common_subprocess(commands)

    if "OutOfMemoryError" in stderr:
        raise RuntimeError("ChemSpot memory error: {}".format(stderr))

    to_return = {"stdout": stdout, "stderr": stderr, "exit_code": exit_code, "content": None,
                 "normalized_text": input_text if normalize_text else None}

    if not continue_on_failure and exit_code > 0:
        self.logger.warning("ChemSpot error:")
        eprint("\n\t".join("\n{}".format(stderr).splitlines()))
        return to_return

    if normalize_text:
        to_return["normalized_text"] = input_text

    if not format_output:
        return to_return
    elif format_output:
        with open(output_file_temp.name, mode="r", encoding="utf-8") as f:
            output_chs = f.read()

        entities = self.parse_chemspot_iob(text=output_chs) if iob_format else self.parse_chemspot(text=output_chs)
        to_return["content"] = entities

        # Order-preserving de-duplication by entity name; seen_add() returns None, so the
        # 'or' both records the name and keeps the first occurrence.
        if remove_duplicates and not iob_format:
            seen = set()
            seen_add = seen.add
            to_return["content"] = [x for x in to_return["content"] if not (x["entity"] in seen or seen_add(x["entity"]))]

        # Cumulative end offsets of the non-blank, form-feed separated pages; used with
        # bisect below to map an entity start offset to a page number.
        if input_type in ["pdf", "pdf_scan"] or paged_text:
            page_ends = []
            for i, page in enumerate(input_text.split("\f")):
                if page.strip():
                    try:
                        page_ends.append(page_ends[-1] + len(page) - 1)
                    except IndexError:
                        page_ends.append(len(page) - 1)

        if opsin_types:
            # Entities matching the ion regex are handled separately and excluded from OPSIN.
            if convert_ions:
                to_convert = [x["entity"] for x in to_return["content"] if x["type"] in opsin_types and not self.re_ion.match(x["entity"])]
            else:
                to_convert = [x["entity"] for x in to_return["content"] if x["type"] in opsin_types]

            if to_convert:
                opsin = OPSIN(verbosity=self.verbosity)
                opsin_converted = opsin.process(input=to_convert, output_formats=["smiles", "inchi", "inchikey"],
                                                standardize_mols=standardize_mols, output_file_sdf=output_file_sdf,
                                                sdf_append=sdf_append)
                # Consumed with next() in the entity loop, in the same order as to_convert.
                opsin_converted = iter(opsin_converted["content"])
            else:
                self.logger.info("Nothing to convert with OPSIN.")

        if annotate:
            chemspider = ChemSpider(chemspider_token) if chemspider_token else None

        for i, ent in enumerate(to_return["content"]):
            if input_type in ["pdf", "pdf_scan"] or paged_text:
                ent["page"] = str(bisect.bisect_left(page_ends, int(ent["start"])) + 1)

            if convert_ions:
                match_ion = self.re_ion.match(ent["entity"])
                if match_ion:
                    match_ion = match_ion.groupdict()
                    match_charge = self.re_charge.search(match_ion["charge"])
                    if match_charge:
                        match_charge = match_charge.groupdict()
                        # Build a bracketed ion SMILES from whichever charge notation matched.
                        # NOTE(review): 'smiles' stays unassigned if no branch below matches -- confirm
                        # the regex guarantees one of them.
                        if match_charge["roman"]:
                            smiles = "[{}+{}]".format(match_ion["ion"], len(match_charge["roman"]))
                        elif match_charge["digit"]:
                            if "+" in match_ion["charge"]:
                                smiles = "[{}+{}]".format(match_ion["ion"], match_charge["digit"])
                            elif "-" in match_ion["charge"]:
                                smiles = "[{}-{}]".format(match_ion["ion"], match_charge["digit"])
                        elif match_charge["signs"]:
                            smiles = "[{}{}{}]".format(match_ion["ion"], match_charge["signs"][0], len(match_charge["signs"]))

                        mol = MolFromSmiles(smiles)
                        if mol:
                            inchi = MolToInchi(mol)
                            if inchi:
                                ent.update(OrderedDict(
                                    [("smiles", smiles), ("inchi", inchi), ("inchikey", InchiToInchiKey(inchi))]))
                            else:
                                ent.update(OrderedDict([("smiles", smiles), ("inchi", ""), ("inchikey", "")]))
                        else:
                            ent.update(OrderedDict([("smiles", ""), ("inchi", ""), ("inchikey", "")]))
                    else:
                        ent.update(OrderedDict([("smiles", ""), ("inchi", ""), ("inchikey", "")]))

            if opsin_types and to_convert:
                if ent["entity"] in to_convert:
                    ent_opsin = next(opsin_converted)
                    ent.update(OrderedDict([("smiles", ent_opsin["smiles"]), ("inchi", ent_opsin["inchi"]),
                                            ("inchikey", ent_opsin["inchikey"]), ("opsin_error", ent_opsin["error"])]))
                elif convert_ions and self.re_ion.match(ent["entity"]):
                    ent.update(OrderedDict([("opsin_error", "")]))
                elif (convert_ions and not self.re_ion.match(ent["entity"])) or (not convert_ions and ent["entity"] not in to_convert):
                    ent.update(OrderedDict([("smiles", ""), ("inchi", ""), ("inchikey", ""), ("opsin_error", "")]))

            # TODO: this should be simplified...looks like garbage code
            if annotate:
                self.logger.info("Annotating entity {}/{}...".format(i + 1, len(to_return["content"])))
                # Pre-create every annotation column so all rows share the same keys.
                ent.update(OrderedDict([("pch_cids_by_inchikey", ""), ("chs_cids_by_inchikey", ""),
                                        ("pch_cids_by_name", ""), ("chs_cids_by_name", ""),
                                        ("pch_cids_by_smiles", ""), ("chs_cids_by_smiles", ""),
                                        ("pch_cids_by_inchi", ""), ("chs_cids_by_inchi", ""),
                                        ("pch_cids_by_formula", ""),
                                        ("pch_iupac_name", ""), ("chs_common_name", ""),
                                        ("pch_synonyms", "")]))

                # do "double-annotation": some entities can be found in only one DB, updated and then searched in second DB
                found_in_pch = False
                found_in_chs = False

                for _ in range(2):
                    results = []

                    # prefer InChI key
                    if "inchikey" in ent and ent["inchikey"]:
                        try:
                            results = get_compounds(ent["inchikey"], "inchikey")
                            if results:
                                if len(results) == 1:
                                    result = results[0]
                                    synonyms = result.synonyms
                                    if synonyms:
                                        ent["pch_synonyms"] = "\"{}\"".format("\",\"".join(synonyms))
                                    ent["pch_iupac_name"] = result.iupac_name
                                    if not found_in_chs:
                                        ent["smiles"] = result.canonical_smiles or ent["smiles"]
                                        ent["inchi"] = result.inchi or ent["inchi"]
                                        ent["inchikey"] = result.inchikey or ent["inchikey"]
                                ent["pch_cids_by_inchikey"] = "\"{}\"".format(",".join([str(c.cid) for c in results]))
                        except (BadRequestError, NotFoundError, PubChemHTTPError, ResponseParseError, ServerError, TimeoutError, PubChemPyError):
                            pass

                        results = chemspider.search(ent["inchikey"]) if chemspider_token else []
                        if results:
                            if len(results) == 1:
                                result = results[0]
                                ent["chs_common_name"] = result.common_name
                                if not found_in_pch:
                                    ent["smiles"] = result.smiles or ent["smiles"]
                                    ent["inchi"] = result.stdinchi or ent["inchi"]
                                    ent["inchikey"] = result.stdinchikey or ent["inchikey"]
                            ent["chs_cids_by_inchikey"] = "\"{}\"".format(",".join([str(c.csid) for c in results]))
                    else:
                        if (not found_in_pch and not found_in_chs) or (not found_in_pch and found_in_chs):
                            try:
                                results = get_compounds(ent["entity"] or ent["abbreviation"], "name")
                                if results:
                                    if len(results) == 1:
                                        found_in_pch = True
                                        result = results[0]
                                        synonyms = result.synonyms
                                        if synonyms:
                                            ent["pch_synonyms"] = "\"{}\"".format("\",\"".join(synonyms))
                                        # only update identifiers if they weren't found in second DB
                                        if not found_in_chs:
                                            ent["smiles"] = result.canonical_smiles or ent["smiles"]
                                            ent["inchi"] = result.inchi or ent["inchi"]
                                            ent["inchikey"] = result.inchikey or ent["inchikey"]
                                        ent["pch_iupac_name"] = result.iupac_name
                                    ent["pch_cids_by_name"] = "\"{}\"".format(",".join([str(c.cid) for c in results]))
                            except (BadRequestError, NotFoundError, PubChemHTTPError, ResponseParseError, ServerError, TimeoutError, PubChemPyError):
                                pass

                        if (not found_in_pch and not found_in_chs) or (found_in_pch and not found_in_chs):
                            results = chemspider.search(ent["entity"] or ent["abbreviation"]) if chemspider_token else []
                            if results:
                                if len(results) == 1:
                                    found_in_chs = True
                                    result = results[0]
                                    if not found_in_pch:
                                        ent["smiles"] = result.smiles or ent["smiles"]
                                        ent["inchi"] = result.stdinchi or ent["inchi"]
                                        ent["inchikey"] = result.stdinchikey or ent["inchikey"]
                                    ent["chs_common_name"] = result.common_name
                                ent["chs_cids_by_name"] = "\"{}\"".format(",".join([str(c.csid) for c in results]))

                        for search_field, col_pch, col_chs in [("smiles", "pch_cids_by_smiles", "chs_cids_by_smiles"),
                                                               ("inchi", "pch_cids_by_inchi", "chs_cids_by_inchi"),
                                                               ("formula", "pch_cids_by_formula", "")]:
                            results_pch = []
                            results_chs = []

                            if search_field == "smiles" and "smiles" in ent and ent["smiles"] and "*" not in ent["smiles"]:
                                if (not found_in_pch and not found_in_chs) or (not found_in_pch and found_in_chs):
                                    try:
                                        results_pch = get_compounds(ent["smiles"], "smiles")
                                    except (BadRequestError, NotFoundError, PubChemHTTPError, ResponseParseError, ServerError, TimeoutError, PubChemPyError):
                                        pass
                                if (not found_in_pch and not found_in_chs) or (found_in_pch and not found_in_chs):
                                    results_chs = chemspider.search(ent["smiles"]) if chemspider_token else []
                            elif search_field == "inchi" and "inchi" in ent and ent["inchi"]:
                                if (not found_in_pch and not found_in_chs) or (not found_in_pch and found_in_chs):
                                    try:
                                        results_pch = get_compounds(ent["inchi"], "inchi")
                                    except (BadRequestError, NotFoundError, PubChemHTTPError, ResponseParseError, ServerError, TimeoutError, PubChemPyError):
                                        pass
                                if (not found_in_pch and not found_in_chs) or (found_in_pch and not found_in_chs):
                                    results_chs = chemspider.search(ent["inchi"]) if chemspider_token else []
                            elif search_field == "formula":
                                if (not found_in_pch and not found_in_chs) or (not found_in_pch and found_in_chs):
                                    try:
                                        results_pch = get_compounds(ent["entity"], "formula")
                                    except (BadRequestError, NotFoundError, PubChemHTTPError, ResponseParseError, ServerError, TimeoutError, PubChemPyError):
                                        pass
                                # ChemSpider doesn't have search field for 'formula'

                            if results_pch:
                                ent[col_pch] = "\"{}\"".format(",".join([str(c.cid) for c in results_pch]))
                            if results_chs:
                                ent[col_chs] = "\"{}\"".format(",".join([str(c.csid) for c in results_chs]))
                            sleep(0.5)

                    sleep(annotation_sleep)

                    # A second pass is only useful when the first one found the entity by name.
                    if not found_in_pch and not found_in_chs:
                        break

        if output_file:
            dict_to_csv(to_return["content"], output_file=output_file, csv_delimiter=csv_delimiter, write_header=write_header)

    return to_return
def test4MolToInchiKey(self):
    """The direct mol-to-key route must agree with the two-step route via InChI."""
    molecule = MolFromSmiles("CC=C(N)C")
    key_via_inchi = InchiToInchiKey(MolToInchi(molecule))
    key_direct = MolToInchiKey(molecule)
    self.assertEqual(key_via_inchi, key_direct)
def generate_inchikey(inchi):
    """Return the InChIKey that ``InchiToInchiKey`` computes for ``inchi``."""
    inchi_key = InchiToInchiKey(inchi)
    return inchi_key
def process(
        self,
        input_file: str,
        output_file: str = "",
        output_file_sdf: str = "",
        sdf_append: bool = False,
        #images_prefix: str = "",
        format_output: bool = True,
        write_header: bool = True,
        osra_output_format: str = "",
        output_formats: list = None,
        dry_run: bool = False,
        csv_delimiter: str = ";",
        use_gm: bool = True,
        gm_dpi: int = 300,
        gm_trim: bool = True,
        n_jobs: int = -1,
        input_type: str = "",
        standardize_mols: bool = True,
        annotate: bool = True,
        chemspider_token: str = "",
        custom_page: int = 0,
        continue_on_failure: bool = False) -> OrderedDict:
    r"""
    Process the input file with OSRA.

    Parameters
    ----------
    input_file : str
        Path to file to be processed by OSRA.
    output_file : str
        File to write output in.
    output_file_sdf : str
        | File to write SDF output in. "sdf" output format hasn't to be in `output_formats` to write SDF output.
        | If "sdf_osra" output format is requested, suffix "-osra.sdf" will be added.
    sdf_append : bool
        If True, append new molecules to existing SDF file or create new one if doesn't exist.
    NOT IMPLEMENTED | images_prefix : str
        Prefix for images of extracted compounds which will be written.
    format_output : bool
        | If True, the value of "content" key of returned dict will be list of OrderedDicts.
        | If True and `output_file` is set, the CSV file will be written.
        | If False, the value of "content" key of returned dict will be None.
    write_header : bool
        If True and if `output_file` is set and `format_output` is True, write a CSV write_header.
    osra_output_format : str
        | Output format from OSRA. Temporarily overrides the option `output_format` set during instantiation (in __init__).
        | Choices: "smi", "can", "sdf"
        | If "sdf", additional information like coordinates cannot be retrieved (not implemented yet).
    output_formats : list
        | If True and `format_output` is also True, this specifies which molecule formats will be output.
        | You can specify more than one format, but only one format from OSRA. This format must be also set with `output_format` in __init__
          or with `osra_output_format` here.
        | When output produces by OSRA is unreadable by RDKit, you can at least have that output from OSRA.
        | Default value: ["smiles"]

        +-----------------+--------------+--------------------------------------------------------------------------------------------+
        | Value           | Source       | Note                                                                                       |
        +=================+==============+============================================================================================+
        | smiles          | RDKit        | canonical                                                                                  |
        +-----------------+--------------+--------------------------------------------------------------------------------------------+
        | smiles_osra     | OSRA ("smi") | SMILES                                                                                     |
        +-----------------+--------------+--------------------------------------------------------------------------------------------+
        | smiles_can_osra | OSRA ("can") | canonical SMILES                                                                           |
        +-----------------+--------------+--------------------------------------------------------------------------------------------+
        | inchi           | RDKit        | Not every molecule can be converted to InChI (it doesn`t support wildcard characters etc.) |
        +-----------------+--------------+--------------------------------------------------------------------------------------------+
        | inchikey        | RDKit        | The same applies as for "inchi".                                                           |
        +-----------------+--------------+--------------------------------------------------------------------------------------------+
        | sdf             | RDKit        | If present, an additional SDF file will be created.                                        |
        +-----------------+--------------+--------------------------------------------------------------------------------------------+
        | sdf_osra        | OSRA ("sdf") | If present, an additional SDF file will be created.                                        |
        +-----------------+--------------+--------------------------------------------------------------------------------------------+
    dry_run : bool
        If True, only return list of commands to be called by subprocess.
    csv_delimiter : str
        Delimiter for output CSV file.
    use_gm : bool
        | If True, use GraphicsMagick to convert PDF to temporary PNG images before processing.
        | If False, OSRA will use it's own conversion of PDF to image.
        | Using gm is more reliable since OSRA (v2.1.0) is showing wrong information
          when converting directly from PDF (namely: coordinates, bond length and possibly more ones) and also there are sometimes
          incorrectly recognised structures.
    gm_dpi : int
        How many DPI will temporary PNG images have.
    gm_trim : bool
        If True, gm will trim the temporary PNG images.
    n_jobs : int
        | If `use_gm` and input file is PDF, how many jobs to use for OSRA processing of temporary PNG images.
        | If -1 all CPUs are used.
        | If 1 is given, no parallel computing code is used at all, which is useful for debugging.
        | For n_jobs below -1, (n_cpus + 1 + n_jobs) are used. Thus for n_jobs = -2, all CPUs but one are used.
    input_type : str
        | When empty, input (MIME) type will be determined from magic bytes.
        | Or you can specify "pdf" or "image" and magic bytes check will be skipped.
    standardize_mols : bool
        If True and `format_output` is also True, use molvs (https://github.com/mcs07/MolVS) to standardize molecules.
    annotate : bool
        | If True, try to annotate entities in PubChem and ChemSpider. Compound IDs will be assigned by searching with
          each identifier, separately for SMILES, InChI etc.
        | If entity has InChI key yet, prefer it in searching.
        | If "*" is present in SMILES, skip annotation.
    chemspider_token : str
        Your personal token for accessing the ChemSpider API. Make account there to obtain it.
    custom_page : int
        When `use_gm` is False, this will set the page for all extracted compounds.
    continue_on_failure : bool
        | If True, continue running even if OSRA returns non-zero exit code.
        | If False and error occurs, print it and return.

    Returns
    -------
    dict
        Keys:

        - stdout: str ... standard output from OSRA
        - stderr: str ... standard error output from OSRA
        - exit_code: int ... exit code from OSRA
        - content:

          - list of OrderedDicts ... when `format_output` is True.
          - None ... when `format_output` is False

        | If `osra_output_format` is "sdf", additional information like 'bond_length' cannot be retrieved.
        | If `use_gm` is True then stdout, stderr and exit_code will be lists containing items from each temporary image
          extracted by OSRA.

        When `dry_run` is True, the command line joined into a single string is returned instead.

    Notes
    -----
    Only with `format_output` set to True you can use molecule standardization and more molecule formats. Otherwise
    you will only get raw stdout from OSRA (which can also be written to file if `output_file` is set).
    """

    # NOTE(review): this block was recovered from a whitespace-collapsed
    # source; the nesting below is a best-effort reconstruction and should be
    # confirmed against the upstream file.

    # Work on a copy so a per-call format override does not leak into the instance.
    options_internal = self.options_internal.copy()
    osra_smiles_outputs = ["smi", "can"]

    # OSRA output format check
    if osra_output_format:
        options_internal["output_format"] = osra_output_format
    else:
        osra_output_format = options_internal["output_format"]

    # Maps OSRA's own format name to the matching entry allowed in `output_formats`.
    osra_valid_output_formats = {
        "can": "smiles_can_osra",
        "smi": "smiles_osra",
        "sdf": "sdf_osra"
    }
    if osra_output_format not in osra_valid_output_formats:
        raise ValueError(
            "Unknown OSRA output format. Possible values: {}".format(
                osra_valid_output_formats.values()))

    if osra_output_format == "sdf":
        self.logger.warning(
            "OSRA's output format is set to \"sdf\" so additional information like coordinates cannot be retrieved."
        )

    # output formats check
    is_output_sdf = False
    is_output_sdf_osra = False
    if not output_formats:
        output_formats = ["smiles"]
    else:
        # De-duplicate, then keep only RDKit formats plus the one OSRA format in effect.
        output_formats = sorted(list(set(output_formats)))
        possible_output_formats = ["smiles", "inchi", "inchikey", "sdf"]
        output_formats = [
            x for x in output_formats if x in possible_output_formats
            or x == osra_valid_output_formats[osra_output_format]
        ]

    if ("sdf" in output_formats or "sdf_osra" in output_formats) and not output_file_sdf:
        self.logger.warning(
            "Cannot write SDF output: 'output_file_sdf' is not set.")
    if output_file_sdf:
        is_output_sdf = True
    if "sdf_osra" in output_formats and osra_output_format == "sdf" and output_file_sdf:
        is_output_sdf_osra = True
    if ("smiles_osra" in output_formats or "smiles_can_osra" in output_formats) and osra_output_format == "sdf":
        try:
            output_formats.remove("smiles_osra")
        except ValueError:
            pass
        try:
            output_formats.remove("smiles_can_osra")
        except ValueError:
            pass
        self.logger.warning(
            "SMILES or canonical SMILES output from OSRA is requested, but OSRA's output format is \"{}\"."
            .format(osra_output_format))

    # input file type check
    possible_input_types = ["pdf", "image"]
    if not input_type:
        input_type = get_input_file_type(input_file)
        if input_type not in possible_input_types:
            # Unknown detected type: warn and hand the file to OSRA directly.
            use_gm = False
            self.logger.warning(
                "Input file MIME type ('{}') is not one of {}. You can specify 'input_type' directly (see docstring)."
                .format(input_type, possible_input_types))
    elif input_type not in possible_input_types:
        raise ValueError("Possible 'input_type' values are {}".format(
            possible_input_types))

    #options = ChainMap({k: v for k, v in {"images_prefix": images_prefix}.items() if v},
    #                   options_internal)

    if annotate:
        if not chemspider_token:
            self.logger.warning(
                "Cannot perform annotation in ChemSpider: 'chemspider_token' is empty."
            )
        # Annotation searches by these identifiers, so make sure they are produced.
        # (The list comprehension is used only for its append side effect.)
        [
            output_formats.append(x) for x in ["smiles", "inchi", "inchikey"]
            if x not in output_formats
        ]
        output_formats = sorted(output_formats)

    commands, _, _ = self.build_commands(options_internal,
                                         self._OPTIONS_REAL,
                                         self.path_to_binary)
    commands.extend(
        ["--bond", "--coordinates", "--page", "--guess", "--print"])

    if dry_run:
        return " ".join(commands)

    osra_output_list = []
    if input_type == "image" or not use_gm:
        osra_output_list.append(
            self._process(input_file,
                          commands,
                          page=custom_page if custom_page else 1))
    elif input_type == "pdf":
        # Convert the PDF to per-page images and run OSRA on each image in parallel.
        with tempfile.TemporaryDirectory() as temp_dir:
            stdout, stderr, exit_code = pdf_to_images(input_file,
                                                      temp_dir,
                                                      dpi=gm_dpi,
                                                      trim=gm_trim)
            osra_output_list = Parallel(n_jobs=n_jobs)(
                delayed(self._process)(
                    temp_image_file, commands, page=page)
                for temp_image_file, page in get_temp_images(temp_dir))

    # summarize OSRA results
    to_return = {
        "stdout": [],
        "stderr": [],
        "exit_code": [],
        "content": None,
        "pages": []
    }
    for result in osra_output_list:
        if result["stdout"]:
            to_return["stdout"].append(result["stdout"])
            to_return["stderr"].append(result["stderr"])
            to_return["exit_code"].append(result["exit_code"])
            to_return["pages"].append(result["page"])

    if not continue_on_failure:
        # NOTE(review): the reported "page" is the 1-based position in the collected
        # results, not the value stored in to_return["pages"].
        errors = [(page + 1, error)
                  for page, (exit_code, error) in enumerate(
                      zip(to_return["exit_code"], to_return["stderr"]))
                  if exit_code > 0]
        if errors:
            self.logger.warning("OSRA errors:")
            for page, error in errors:
                eprint("\tError on page {}:".format(page))
                eprint("\n\t\t".join("\n{}".format(error).splitlines()))
            return to_return

    if not format_output:
        if output_file:
            with open(output_file, mode="w", encoding="utf-8") as f:
                f.write("\n".join(to_return["stdout"]))
        return to_return

    # Position of each extra column within a whitespace-split OSRA SMILES output line.
    output_cols = OrderedDict([("bond_length", 1), ("resolution", 2),
                               ("confidence", 3), ("page", 4),
                               ("coordinates", 5)])

    if osra_output_format in osra_smiles_outputs:
        compound_template_dict = OrderedDict.fromkeys(
            output_formats + list(output_cols.keys()))
    else:
        compound_template_dict = OrderedDict.fromkeys(["page"] +
                                                      output_formats)

    if any(to_return["stdout"]):
        if standardize_mols:
            standardizer = Standardizer()

        compounds = []

        if is_output_sdf:
            if sdf_append:
                # Create the file first so that it can be opened in append mode.
                if not os.path.isfile(output_file_sdf):
                    open(output_file_sdf, mode="w",
                         encoding="utf-8").close()
                writer = SDWriter(
                    open(output_file_sdf, mode="a", encoding="utf-8"))
            else:
                writer = SDWriter(output_file_sdf)

        for output, page in zip(to_return["stdout"], to_return["pages"]):
            # SMILES output has one record per line; SDF records are separated by "$$$$".
            if osra_output_format in osra_smiles_outputs:
                lines = [x.strip() for x in output.split("\n") if x]
            else:
                lines = [x for x in output.split("$$$$") if x.strip()]

            for line in lines:
                """
                # so much problems with --learn
                # we can't simply split output by " " when --learn is present, because its output is like "1,2,2,2 1"
                if "learn" in filtered_cols:
                    learn_start = filtered_cols.index("learn") + 1  # "smiles" col isn't in output_cols
                    learn_end = filtered_cols.index("learn") + 1 + 3
                    line[learn_start:learn_end] = [" ".join(line[learn_start:learn_end])]
                """

                if not line:
                    continue

                if osra_output_format in osra_smiles_outputs:
                    line = [x.strip() for x in line.split()]
                    if custom_page:
                        line[output_cols["page"]] = custom_page
                    elif use_gm:
                        line[output_cols["page"]] = page
                    # Sanitization is skipped here when the standardizer will run afterwards.
                    mol = MolFromSmiles(
                        line[0],
                        sanitize=False if standardize_mols else True)
                elif osra_output_format == "sdf":
                    line = "\n" + line.strip()
                    mol = MolFromMolBlock(
                        line,
                        strictParsing=False,
                        sanitize=False if standardize_mols else True,
                        removeHs=False if standardize_mols else True)

                if mol:
                    compound = compound_template_dict.copy()

                    if standardize_mols:
                        try:
                            mol = standardizer.standardize(mol)
                        except ValueError as e:
                            self.logger.warning(
                                "Cannot standardize '{}': {}".format(
                                    MolToSmiles(mol), str(e)))

                    for f in output_formats:
                        if f == "smiles":
                            compound["smiles"] = MolToSmiles(
                                mol, isomericSmiles=True)
                        elif f == "smiles_osra" and osra_output_format == "smi":
                            compound["smiles_osra"] = line[0]
                        elif f == "smiles_can_osra" and osra_output_format == "can":
                            compound["smiles_can_osra"] = line[0]
                        elif f == "inchi":
                            inchi = MolToInchi(mol)
                            if inchi:
                                compound["inchi"] = inchi
                            else:
                                compound["inchi"] = ""
                                self.logger.warning(
                                    "Cannot convert to InChI: {}".format(
                                        MolToSmiles(mol)))
                        elif f == "inchikey":
                            inchi = MolToInchi(mol)
                            if inchi:
                                compound["inchikey"] = InchiToInchiKey(
                                    inchi)
                            else:
                                compound["inchikey"] = ""
                                self.logger.warning(
                                    "Cannot create InChI-key from InChI: {}"
                                    .format(MolToSmiles(mol)))
                        elif f == "sdf":
                            compound["sdf"] = MolToMolBlock(
                                mol, includeStereo=True)
                        elif f == "sdf_osra":
                            compound["sdf_osra"] = line

                    if is_output_sdf:
                        writer.write(mol)

                    if osra_output_format in osra_smiles_outputs:
                        # Pair the column names with the values that follow the SMILES.
                        compound.update([(x[0], x[1]) for x in zip(
                            list(output_cols.keys()), line[1:])])
                    else:
                        compound[
                            "page"] = page if use_gm else custom_page if custom_page else 1

                    compounds.append(compound)
                else:
                    self.logger.warning("Cannot convert to RDKit mol: " +
                                        line[0])

        if is_output_sdf_osra:
            with open(output_file_sdf + "-osra.sdf",
                      mode="w",
                      encoding="utf-8") as f:
                f.write("".join(to_return["stdout"]))

        to_return["content"] = sorted(compounds, key=lambda x: x["page"])

        if annotate:
            chemspider = ChemSpider(
                chemspider_token) if chemspider_token else None

            for i, ent in enumerate(to_return["content"]):
                self.logger.info("Annotating entity {}/{}...".format(
                    i + 1, len(to_return["content"])))
                # Pre-create every annotation column so all rows share the same keys.
                ent.update(
                    OrderedDict([("pch_cids_by_inchikey", ""),
                                 ("chs_cids_by_inchikey", ""),
                                 ("pch_cids_by_smiles", ""),
                                 ("chs_cids_by_smiles", ""),
                                 ("pch_cids_by_inchi", ""),
                                 ("chs_cids_by_inchi", ""),
                                 ("pch_iupac_name", ""),
                                 ("chs_common_name", ""),
                                 ("pch_synonyms", "")]))

                results = []

                # prefer InChI key
                if "inchikey" in ent and ent["inchikey"]:
                    try:
                        results = get_compounds(ent["inchikey"],
                                                "inchikey")
                        if results:
                            if len(results) == 1:
                                result = results[0]
                                synonyms = result.synonyms
                                if synonyms:
                                    ent["pch_synonyms"] = "\"{}\"".format(
                                        "\",\"".join(synonyms))
                                ent["pch_iupac_name"] = result.iupac_name
                            ent["pch_cids_by_inchikey"] = "\"{}\"".format(
                                ",".join([str(c.cid) for c in results]))
                    except (BadRequestError, NotFoundError,
                            PubChemHTTPError, ResponseParseError,
                            ServerError, TimeoutError, PubChemPyError):
                        pass

                    results = chemspider.search(
                        ent["inchikey"]) if chemspider_token else []
                    if results:
                        if len(results) == 1:
                            result = results[0]
                            ent["chs_common_name"] = result.common_name
                        ent["chs_cids_by_inchikey"] = "\"{}\"".format(
                            ",".join([str(c.csid) for c in results]))
                else:
                    # No InChIKey: fall back to searching by SMILES and by InChI.
                    for search_field, col_pch, col_chs in [
                        ("smiles", "pch_cids_by_smiles",
                         "chs_cids_by_smiles"),
                        ("inchi", "pch_cids_by_inchi", "chs_cids_by_inchi")
                    ]:
                        results_pch = []
                        results_chs = []

                        if search_field == "smiles" and "smiles" in ent and ent[
                                "smiles"] and "*" not in ent["smiles"]:
                            try:
                                results_pch = get_compounds(
                                    ent["smiles"], "smiles")
                            except (BadRequestError, NotFoundError,
                                    PubChemHTTPError, ResponseParseError,
                                    ServerError, TimeoutError,
                                    PubChemPyError):
                                pass
                            results_chs = chemspider.search(
                                ent["smiles"]) if chemspider_token else []
                        elif search_field == "inchi" and "inchi" in ent and ent[
                                "inchi"]:
                            try:
                                results_pch = get_compounds(
                                    ent["inchi"], "inchi")
                            except (BadRequestError, NotFoundError,
                                    PubChemHTTPError, ResponseParseError,
                                    ServerError, TimeoutError,
                                    PubChemPyError):
                                pass
                            results_chs = chemspider.search(
                                ent["inchi"]) if chemspider_token else []

                        if results_pch:
                            ent[col_pch] = "\"{}\"".format(",".join(
                                [str(c.cid) for c in results_pch]))
                        if results_chs:
                            ent[col_chs] = "\"{}\"".format(",".join(
                                [str(c.csid) for c in results_chs]))
                        sleep(0.5)

        if output_file:
            dict_to_csv(to_return["content"],
                        output_file=output_file,
                        csv_delimiter=csv_delimiter,
                        write_header=write_header)

        if is_output_sdf:
            writer.close()
    elif not any(to_return["stdout"]) and output_file:
        # Nothing was recognised: still produce an output file with just the header.
        write_empty_file(output_file,
                         csv_delimiter=csv_delimiter,
                         header=list(compound_template_dict.keys()),
                         write_header=write_header)

    return to_return
def process(self,
            input: Union[str, list] = "",
            input_file: str = "",
            output_file: str = "",
            output_file_sdf: str = "",
            output_file_cml: str = "",
            sdf_append: bool = False,
            format_output: bool = True,
            opsin_output_format: str = "",
            output_formats: list = None,
            write_header: bool = True,
            dry_run: bool = False,
            csv_delimiter: str = ";",
            standardize_mols: bool = True,
            normalize_plurals: bool = True,
            continue_on_failure: bool = False) -> OrderedDict:
    r"""
    Process the input file with OPSIN.

    Parameters
    ----------
    input : str or list
        | str: String with IUPAC names, one per line.
        | list: List of IUPAC names.
    input_file : str
        Path to file to be processed by OPSIN. One IUPAC name per line.
        Ignored (with a warning) when `input` is also set.
    output_file : str
        File to write output in.
    output_file_sdf : str
        File to write SDF output in.
    output_file_cml : str
        | File to write CML (Chemical Markup Language) output in. `opsin_output_format` must be "cml".
        | Not supported by RDKit so standardization and conversion to other formats cannot be done.
    sdf_append : bool
        If True, append new molecules to existing SDF file or create new one if doesn't exist.
    format_output : bool
        | If True, the value of "content" key of returned dict will be list of OrderedDicts with keys:
        | "iupac", <output formats>, ..., "error"
        | If True and `output_file` is set it will be created as CSV file with columns: "iupac", <output formats>, ..., "error"
        | If False, the value of "content" key of returned dict will be None.
    opsin_output_format : str
        | Output format from OPSIN. Temporarily overrides the option `output_format` set during instantiation (in __init__).
        | Choices: "cml", "smi", "extendedsmi", "inchi", "stdinchi", "stdinchikey"
    output_formats : list
        | If `format_output` is True, this specifies which molecule formats will be output.
        | You can specify more than one format, but only one format from OPSIN. This format must be also set with
          `output_format` in __init__ or with `opsin_output_format` here.
        | Default value: ["smiles"]
        | Possible values (value, source, note):

        - "smiles" (RDKit): canonical
        - "smiles_opsin" (OPSIN "smi"): SMILES
        - "smiles_extended_opsin" (OPSIN "extendedsmi"): Extended SMILES. Not supported by RDKit.
        - "inchi" (RDKit): Not every molecule can be converted to InChI (it doesn`t support wildcard characters etc.)
        - "inchi_opsin" (OPSIN "inchi"): InChI
        - "stdinchi_opsin" (OPSIN "stdinchi"): standard InChI
        - "inchikey" (RDKit): The same applies as for "inchi". Also molecule cannot be created from InChI-key.
        - "stdinchikey_opsin" (OPSIN "stdinchikey"): Standard InChI-key. Cannot be used by RDKit to create molecule.
        - "sdf" (RDKit): If present, an additional SDF file will be created.
    write_header : bool
        If True and if `output_file` is set and `format_output` is True, write a CSV header.
    dry_run : bool
        If True, only return the command that would be called by subprocess; OPSIN is not executed.
    csv_delimiter : str
        Delimiter for output CSV file.
    standardize_mols : bool
        If True and `format_output` is also True, use molvs (https://github.com/mcs07/MolVS) to standardize molecules.
    normalize_plurals : bool
        | If True, normalize plurals ("nitrates" -> "nitrate"). See OPSIN.PLURAL_PATTERNS for relating plurals. You can
          set your own regex pattern with `plural_patterns` in __init__.
    continue_on_failure : bool
        | If True, continue running even if OPSIN returns non-zero exit code.
        | If False and error occurs, print it and return.

    Returns
    -------
    dict
        Keys:

        - stdout: str ... standard output from OPSIN
        - stderr: str ... standard error output from OPSIN
        - exit_code: int ... exit code from OPSIN
        - content:

          - list of OrderedDicts ... when format_output is True. Fields: "iupac", <output formats>, ..., "error"
          - None ... when format_output is False

        When `dry_run` is True, a str with the space-joined command is returned instead.

    Raises
    ------
    ValueError
        If neither `input` nor `input_file` is set, or the OPSIN output format is unknown.
    UserWarning
        If the input is empty after preprocessing.
    """
    options_internal = self.options_internal.copy()
    opsin_nonreadable_formats = ["cml", "stdinchikey"]

    if input and input_file:
        input_file = ""
        self.logger.warning(
            "Both 'input' and 'input_file' are set, but 'input' will be prefered."
        )
    elif not input and not input_file:
        raise ValueError("One of 'input' or 'input_file' must be set.")

    # OPSIN output format check
    if opsin_output_format:
        options_internal["output_format"] = opsin_output_format
    else:
        opsin_output_format = options_internal["output_format"]

    opsin_valid_output_formats = {
        "cml": "cml_opsin",
        "smi": "smiles_opsin",
        "extendedsmi": "smiles_extended_opsin",
        "inchi": "inchi_opsin",
        "stdinchi": "stdinchi_opsin",
        "stdinchikey": "stdinchikey_opsin"
    }

    if opsin_output_format not in opsin_valid_output_formats:
        raise ValueError(
            "Unknown OPSIN output format. Possible values: {}".format(
                list(opsin_valid_output_formats.keys())))

    if standardize_mols and opsin_output_format in opsin_nonreadable_formats:
        self.logger.warning(
            "OPSIN output format is \"{}\", which cannot be used by RDKit."
            .format(opsin_output_format))

    # output formats check
    if not output_formats:
        output_formats = ["smiles"]
    elif opsin_output_format == "stdinchikey":
        output_formats = ["stdinchikey_opsin"]
    elif opsin_output_format == "extendedsmi":
        output_formats = ["smiles_extended_opsin"]
    else:
        # Keep only RDKit-derived formats plus the single OPSIN-native
        # format matching the selected OPSIN output format.
        possible_output_formats = ["smiles", "inchi", "inchikey", "sdf"]
        opsin_native_format = opsin_valid_output_formats[opsin_output_format]
        output_formats = [
            x for x in sorted(set(output_formats))
            if x in possible_output_formats or x == opsin_native_format
        ]

    if normalize_plurals:
        # Normalization works on text, so a file input is read into
        # memory and then fed to OPSIN through stdin.
        if input_file:
            with open(input_file, mode="r", encoding="utf-8") as f:
                input = "\n".join([x.strip() for x in f.readlines()])
            input_file = ""
        input = self.normalize_iupac(input)

    commands, _, _ = self.build_commands(options_internal,
                                         self._OPTIONS_REAL,
                                         self.path_to_binary)

    if input_file:
        # OPSIN takes the input file path as a positional argument.
        commands.append(input_file)
    elif input:
        if isinstance(input, list):
            input = "\n".join([x.strip() for x in input])
    else:
        raise UserWarning("Input is empty.")

    # A dry run must return before anything is executed.
    if dry_run:
        return " ".join(commands)

    if input_file:
        stdout, stderr, exit_code = common_subprocess(commands)
    else:
        stdout, stderr, exit_code = common_subprocess(commands, stdin=input)

    to_return = {
        "stdout": stdout,
        "stderr": stderr,
        "exit_code": exit_code,
        "content": None
    }

    if not continue_on_failure and exit_code > 0:
        self.logger.warning("OPSIN error:")
        eprint("\n\t".join("\n{}".format(stderr).splitlines()))
        return to_return

    if output_file_cml and opsin_output_format == "cml":
        with open(output_file_cml, mode="w", encoding="utf-8") as f:
            f.write(stdout)
        return to_return
    elif output_file_cml and opsin_output_format != "cml":
        self.logger.warning(
            "Output file for CML is requested, but OPSIN output format is '{}'"
            .format(opsin_output_format))

    if not format_output:
        if output_file:
            with open(output_file, mode="w", encoding="utf-8") as f:
                f.write(stdout)
        return to_return

    compounds = []
    standardizer = Standardizer()
    empty_cols = OrderedDict([(x, "") for x in output_formats])

    # One converted structure per line; the element after the final
    # newline is dropped.
    converted_lines = stdout.split("\n")[:-1]
    # The first stderr line is skipped (the original author notes it holds
    # an OPSIN message); the remaining lines are consumed in order, one per
    # name that produced no output.
    stderr_lines = [x.strip() for x in stderr.split("\n")[1:] if x]

    if input_file:
        with open(input_file, mode="r", encoding="utf-8") as f:
            lines = f.readlines()
    else:
        lines = input.split("\n")

    mol_output_template = OrderedDict.fromkeys(["iupac"] + output_formats +
                                               ["error"])

    writer = None
    sdf_handle = None
    # Kept separate from any exception variable: Python 3 unbinds the
    # `except ... as name` target when the handler exits.
    error_idx = 0
    try:
        if output_file_sdf:
            if sdf_append:
                # Append mode creates the file when it does not exist yet.
                sdf_handle = open(output_file_sdf,
                                  mode="a",
                                  encoding="utf-8")
                writer = SDWriter(sdf_handle)
            else:
                writer = SDWriter(output_file_sdf)

        for i, line in enumerate(lines):
            line = line.strip()
            # OPSIN may return fewer lines than the input has (e.g. a
            # trailing empty input line); treat those as not converted.
            converted = (converted_lines[i].strip()
                         if i < len(converted_lines) else "")
            mol_output = mol_output_template.copy()

            if converted:
                # These formats cannot be read by RDKit, so the OPSIN
                # result is passed through untouched.
                if opsin_output_format == "stdinchikey":
                    compounds.append(
                        OrderedDict([("iupac", line),
                                     ("stdinchikey_opsin", converted),
                                     ("error", "")]))
                    continue
                elif opsin_output_format == "extendedsmi":
                    compounds.append(
                        OrderedDict([("iupac", line),
                                     ("smiles_extended_opsin", converted),
                                     ("error", "")]))
                    continue

                # Stays None for formats RDKit cannot parse (e.g. "cml"),
                # which are then reported as a conversion error.
                mol = None
                if opsin_output_format == "smi":
                    mol = MolFromSmiles(converted,
                                        sanitize=not standardize_mols)
                elif opsin_output_format in ["inchi", "stdinchi"]:
                    mol = MolFromInchi(converted,
                                       sanitize=not standardize_mols,
                                       removeHs=not standardize_mols)

                if mol:
                    if standardize_mols:
                        try:
                            mol = standardizer.standardize(mol)
                        except ValueError as err:
                            self.logger.warning(
                                "Cannot standardize '{}': {}".format(
                                    MolToSmiles(mol), str(err)))

                    for f in output_formats:
                        if f == "smiles":
                            mol_output["smiles"] = MolToSmiles(
                                mol, isomericSmiles=True)
                        elif f == "smiles_opsin" and opsin_output_format == "smi":
                            mol_output["smiles_opsin"] = converted
                        elif f == "inchi":
                            inchi = MolToInchi(mol)
                            if inchi:
                                mol_output["inchi"] = inchi
                            else:
                                mol_output["inchi"] = ""
                                self.logger.warning(
                                    "Cannot convert to InChI: {}".format(
                                        converted))
                        elif f == "inchi_opsin" and opsin_output_format == "inchi":
                            mol_output["inchi_opsin"] = converted
                        elif f == "stdinchi_opsin" and opsin_output_format == "stdinchi":
                            mol_output["stdinchi_opsin"] = converted
                        elif f == "inchikey":
                            inchi = MolToInchi(mol)
                            if inchi:
                                mol_output["inchikey"] = InchiToInchiKey(
                                    inchi)
                            else:
                                mol_output["inchikey"] = ""
                                self.logger.warning(
                                    "Cannot create InChI-key from InChI: {}".
                                    format(converted))
                        elif f == "stdinchikey_opsin" and opsin_output_format == "stdinchikey":
                            mol_output["stdinchikey_opsin"] = converted
                        elif f == "sdf":
                            mol_output["sdf"] = MolToMolBlock(
                                mol, includeStereo=True)

                    if writer is not None:
                        writer.write(mol)

                    mol_output.update(
                        OrderedDict([("iupac", line), ("error", "")]))
                else:
                    mol_output.update([
                        ("iupac", line),
                        ("error",
                         "Cannot convert to RDKit mol: {}".format(converted))
                    ])
                    mol_output.update(empty_cols)
                    self.logger.warning(mol_output["error"])
            else:
                # No output for this name: pair it with the next unused
                # stderr line, if any.
                try:
                    error = stderr_lines[error_idx].strip()
                except IndexError:
                    error = ""
                mol_output.update([("iupac", line), ("error", error)])
                mol_output.update(empty_cols)
                error_idx += 1

            compounds.append(mol_output)
    finally:
        # Close the writer first, then the underlying handle we opened.
        if writer is not None:
            writer.close()
        if sdf_handle is not None:
            sdf_handle.close()

    to_return["content"] = compounds

    if output_file and compounds:
        dict_to_csv(to_return["content"],
                    output_file=output_file,
                    csv_delimiter=csv_delimiter,
                    write_header=write_header)
    elif output_file and not compounds:
        write_empty_file(output_file,
                         csv_delimiter=csv_delimiter,
                         header=list(mol_output_template.keys()),
                         write_header=write_header)

    return to_return