Python InchiToInchiKeyの例

プログラミング言語: Python

名前空間/パッケージ名: rdkit.Chem

クラス/型: InchiToInchiKey

hotexamples.comのコード掲載数: 10

Python InchiToInchiKey - 10件のコード例が見つかりました。すべてオープンソースプロジェクトから抽出されたPythonのrdkit.Chem.InchiToInchiKeyの実例で、最も評価が高いものを厳選しています。コード例の評価を行っていただくことで、より質の高いコード例が表示されるようになります。

よく使われるメソッド

表示 / 非表示

InchiToInchiKey(10)

よく使われるメソッド

InchiToInchiKey (10)

コード例 #1

ファイルを表示

 def inchikey(self, m):
     """Return the InChIKey of molecule ``m``, or None if InChI is disabled.

     The molecule is first converted to an InChI string with ``MolToInchi``
     and that string is then turned into a key with ``InchiToInchiKey``.
     ``None`` is returned when ``self.hasInchi`` is falsy.
     """
     if not self.hasInchi:
         # plpy.notice('InChi not available')
         return None

     # Imported only on the InChI-enabled path.
     from rdkit.Chem import MolToInchi, InchiToInchiKey
     inchi_string = MolToInchi(m)
     return InchiToInchiKey(inchi_string)

コード例 #2

ファイルを表示

    def standardise(self):
        """Fill in the standardised structure fields when they are missing.

        Nothing happens if ``canonical_smiles`` is already set. Otherwise
        ``std_ctab`` falls back to ``ctab``, ``standard_inchi`` is generated
        through ``inchiFromPipe`` when absent, and ``standard_inchi_key`` is
        derived from the ASCII-encoded InChI.

        Raises:
            Exception: with the message "inchi_error" when no InChI is
                available after the generation attempt.
        """
        if self.canonical_smiles:
            return

        if not self.std_ctab:
            self.std_ctab = self.ctab

        if not self.standard_inchi:
            inchi_binary = settings.INCHI_BINARIES_LOCATION['1.02']
            self.standard_inchi = inchiFromPipe(self.std_ctab, inchi_binary)

        # Still empty after the generation attempt: give up.
        if not self.standard_inchi:
            raise Exception("inchi_error")

        ascii_inchi = self.standard_inchi.encode("ascii")
        self.standard_inchi_key = InchiToInchiKey(ascii_inchi)

コード例 #3

ファイルを表示

ファイル: utils.py プロジェクト: tsufz/chembiohub_ws

def getStructure(mol):
    """Compute structure identifiers for ``mol``.

    When ``settings.OPEN_SOURCE`` is truthy the identifiers are computed
    locally, in this order: ``'InChI'`` (via ``inchiFromPipe``),
    ``'InChIKey'`` (via ``InchiToInchiKey``) and ``'Canonical_Smiles'``
    (via ``smilesFromMol``). This path is best-effort: if any step fails, the
    keys computed so far are returned and the failure is logged.

    Otherwise ``mol`` is POSTed to the curation endpoint configured in
    ``settings.PIPLINE_PILOT_ENDPOINT`` and the decoded JSON response is
    returned.

    Args:
        mol: structure passed unchanged to the local helpers or sent as the
            request body.

    Returns:
        dict: the identifiers that could be computed (possibly empty or
        partial on the local path), or the remote service's JSON payload.

    Raises:
        Exception: when the remote endpoint answers with a status other
            than 200.
    """
    import logging

    data = dict()
    if settings.OPEN_SOURCE:
        try:
            inchi = inchiFromPipe(mol,
                                  settings.INCHI_BINARIES_LOCATION['1.02'])
            data['InChI'] = inchi
            inchiKey = InchiToInchiKey(inchi)
            data['InChIKey'] = inchiKey
            smiles = smilesFromMol(mol)
            data['Canonical_Smiles'] = smiles
        except Exception:
            # Keep the best-effort contract (return whatever was computed),
            # but no longer swallow KeyboardInterrupt/SystemExit and leave a
            # trace of what went wrong instead of failing silently.
            logging.getLogger(__name__).exception(
                "Structure identifier generation failed; returning partial data")
    else:
        url = '%scuration' % settings.PIPLINE_PILOT_ENDPOINT
        result = requests.post(url, data=mol, timeout=60)
        status = result.status_code

        if status != 200:
            raise Exception("URL %s has status %s for mol %s" %
                            (url, status, mol))
        data = result.json()
    return data

コード例 #4

ファイルを表示

ファイル: UnitTestInchi.py プロジェクト: tlinnet/rdkit

 def test3InchiKey(self):
     """InchiToInchiKey maps a known InChI string to its expected key."""
     known_inchi = 'InChI=1S/C9H12/c1-2-6-9-7-4-3-5-8-9/h3-5,7-8H,2,6H2,1H3'
     expected_key = 'ODLMAHJVESYWTB-UHFFFAOYSA-N'
     computed_key = InchiToInchiKey(known_inchi)
     self.assertEqual(computed_key, expected_key)

コード例 #5

ファイルを表示

    def save(self, force_insert=False, force_update=False, *args, **kwargs):
        """Save the structure, refreshing its InChI key and notifying listeners.

        Two code paths exist, selected by ``settings.OPEN_SOURCE``:

        * open-source path: the InChI key is recomputed from
          ``self.standard_inchi`` with ``InchiToInchiKey``; when the key
          changes, ``self.molfile`` is round-tripped through RDKit's mol-block
          reader/writer.
        * other path: ``getStructure(self.molfile)`` supplies the InChI, the
          InChI key and the canonical SMILES.

        When anything changed, the related ``self.molecule`` record is updated
        and saved, and the ``structureChanged`` signal is sent.

        Raises:
            NoStandardInchi: when ``self.standard_inchi`` is empty at the
                point it is checked.
        """

        changed = False
        # ``new`` is True when no CompoundStructures row with this pk exists yet.
        new  =  not bool(CompoundStructures.objects.filter(pk=self.pk).count())
        if settings.OPEN_SOURCE:
            if self.molfile:
                if not new: # The structure already exists and we only want to modify it
                    super(CompoundStructures, self).save(force_insert, force_update, *args, **kwargs) # this should trigger CMPD_STR_UPDATE_TRIG, which deletes compound images and properties and nulls standard inchi, key, smiles, and molformula
                    changed = True
             #   newInchi = inchiFromPipe(self.molfile, settings.INCHI_BINARIES_LOCATION['1.02'])
                #if newInchi != self.standard_inchi:
                 #   self.standard_inchi = newInchi
                  #  changed = True
            # NOTE(review): ``standard_inchi`` is dereferenced here, before the
            # emptiness check below; a None value would raise AttributeError
            # rather than NoStandardInchi - confirm this is intended.
            mol = MolFromInchi(self.standard_inchi.encode("ascii"))
            # NOTE(review): when ``mol`` is falsy, this branch performs no
            # validation and no final save - confirm this is intended.
            if mol:
            # self.canonical_smiles = MolToSmiles(mol)
                if not self.standard_inchi:
                    raise NoStandardInchi("for CompundStructure, pk = " + str(self.pk))

                newInchiKey = InchiToInchiKey(self.standard_inchi.encode("ascii"))
                if self.standard_inchi_key != newInchiKey:
                    self.standard_inchi_key = newInchiKey
                    # ``mol`` is re-parsed here but not used again in this method.
                    mol = MolFromInchi(self.standard_inchi.encode("ascii"))
                    # self.canonical_smiles = MolToSmiles(mol)
                    changed = True
                    self.molfile = MolToMolBlock(MolFromMolBlock(str(self.molfile))) # This is how we do kekulisation in RDKit...

                # Validate, then persist through the parent class's save().
                self.clean_fields()
                self.validate_unique()
                super(CompoundStructures, self).save(force_insert, force_update, *args, **kwargs)

        else:
            if self.molfile:
                if not new: # The structure already exists and we only want to modify it
                    super(CompoundStructures, self).save(force_insert, force_update, *args, **kwargs) # this should trigger CMPD_STR_UPDATE_TRIG, which deletes compound images and properties and nulls standard inchi, key, smiles, and molformula
                    changed = True

                # Identifiers come from getStructure(); the keys read below must be
                # present in its result or a KeyError is raised.
                data = getStructure(self.molfile)

                newInchi = data['InChI']
                if newInchi != self.standard_inchi:
                    self.standard_inchi = newInchi
                    self.standard_inchi_key = data['InChIKey']
                    #self.molformula = data['Molecular_Formula']
                    self.canonical_smiles = data['Canonical_Smiles']
                    changed = True

            if not self.standard_inchi:
                raise NoStandardInchi("for CompundStructure, pk = " + str(self.pk))

            # Derive the key locally when it is still missing.
            if not self.standard_inchi_key:
                self.standard_inchi_key = InchiToInchiKey(self.standard_inchi.encode("ascii"))

            self.clean_fields()
            self.validate_unique()
            super(CompoundStructures, self).save(force_insert, force_update, *args, **kwargs)

        # Propagate the change to the related molecule record and notify
        # listeners of the structureChanged signal.
        if changed:
            self.molecule.structure_key = self.standard_inchi_key
            self.molecule.structure_type = "MOL"
            self.molecule.molfile_update = datetime.now()
            self.molecule.save()
            structureChanged.send(sender=self.__class__, instance=self)

コード例 #6

ファイルを表示

    def process(self,
                input_text: str = "",
                input_file: str = "",
                output_file: str = "",
                output_file_sdf: str = "",
                sdf_append: bool = False,
                input_type: str = "",
                lang: str = "eng",
                paged_text: bool = False,
                format_output: bool = True,
                opsin_types: list = None,
                standardize_mols: bool = True,
                convert_ions: bool = True,
                write_header: bool = True,
                iob_format: bool = False,
                dry_run: bool = False,
                csv_delimiter: str = ";",
                normalize_text: bool = True,
                remove_duplicates: bool = False,
                annotate: bool = True,
                annotation_sleep: int = 2,
                chemspider_token: str = "",
                continue_on_failure: bool = False) -> OrderedDict:
        r"""
        Process the input text or file with ChemSpot.

        The method builds the ChemSpot command line, runs it, parses the output
        into entities and then optionally converts entities with OPSIN, converts
        ion entities to SMILES, and annotates entities in PubChem and ChemSpider.

        Parameters
        ----------
        input_text : str
            String to be processed by ChemSpot. Takes precedence over `input_file` when both are given.
        input_file : str
            Path to file to be processed by ChemSpot.
        output_file : str
            File to write output in.
        output_file_sdf : str
            File to write SDF output in. SDF is from OPSIN converted entities.
        sdf_append : bool
            If True, append new molecules to existing SDF file or create new one if doesn't exist. SDF is from OPSIN converted entities.
        input_type : str
            | When empty, input (MIME) type will be determined from magic bytes.
            | Or you can specify "pdf", "pdf_scan", "image" or "text" and magic bytes check will be skipped.
        lang : str
            | Language which will Tesseract use for OCR. Available languages: https://github.com/tesseract-ocr/tessdata
            | Multiple languages can be specified with "+" character, i.e. "eng+bul+fra".
        paged_text : bool
            If True and `input_type` is "text" or `input_text` is provided, try to assign pages to chemical entities.
            ASCII control character 12 (Form Feed, '\f') is expected between pages.
        format_output : bool
            | If True, the value of "content" key of returned dict will be list of OrderedDicts.
            | If True and `output_file` is set, the CSV file will be written.
            | If False, the value of "content" key of returned dict will be None.
        opsin_types : list
            | List of ChemSpot entity types. Entities of types in this list will be converted with OPSIN. If you don't want
              to convert entities, pass empty list.
            | OPSIN is designed to convert IUPAC names to linear notation (SMILES etc.) so default value of `opsin_types`
              is ["SYSTEMATIC"] (these should be only IUPAC names).
            | ChemSpot entity types: "SYSTEMATIC", "IDENTIFIER", "FORMULA", "TRIVIAL", "ABBREVIATION", "FAMILY", "MULTIPLE"
        standardize_mols : bool
            If True, use molvs (https://github.com/mcs07/MolVS) to standardize molecules converted by OPSIN.
        convert_ions : bool
            If True, try to convert ion entities (e.g. "Ni(II)") to SMILES. Entities matching ion regex won't be converted
            with OPSIN.
        write_header : bool
            If True and if `output_file` is set and `format_output` is True, write a CSV header.
        iob_format : bool
            If True, output will be in IOB format.
        dry_run : bool
            If True, only return the command line that would be run, as a single space-joined string.
        csv_delimiter : str
            Delimiter for output CSV file.
        normalize_text : bool
            If True, normalize text before performing NER. It is strongly recommended to do so, because without normalization
            ChemSpot can produce unpredictable results which cannot be parsed.
        remove_duplicates : bool
            If True, remove duplicated chemical entities. Note that some entities-compounds can have different names, but
            same notation (SMILES, InChI etc.). This will only remove entities with same names. Not applicable for IOB format.
        annotate : bool
            | If True, try to annotate entities in PubChem and ChemSpider. Compound IDs will be assigned by searching with
              each identifier, separately for entity name, SMILES etc.
            | If entity already has an InChI key, prefer it in searching.
            | If "*" is present in SMILES, skip the SMILES search.
            | If textual entity has single result in DB when searched by name, fill in missing identifiers (SMILES etc.).
        annotation_sleep: int
            How many seconds to sleep between annotation of each entity. It's for preventing overloading of databases.
        chemspider_token : str
            Your personal token for accessing the ChemSpider API (needed for annotation). Make account there to obtain it.
        continue_on_failure : bool
            | If True, continue running even if ChemSpot returns non-zero exit code.
            | If False and error occurs, print it and return.

        Returns
        -------
        dict
            Keys:

            - stdout: str ... standard output from ChemSpot
            - stderr: str ... standard error output from ChemSpot
            - exit_code: int ... exit code from ChemSpot
            - content

              - list of OrderedDicts ... when `format_output` is True
              - None ... when `format_output` is False

            - normalized_text : str ... the normalized input, or None when `normalize_text` is False
        str
            The space-joined command line, when `dry_run` is True.

        Raises
        ------
        ValueError
            When neither `input_text` nor `input_file` is set, or the input type is not supported.
        UserWarning
            When the input text is empty after normalization.
        RuntimeError
            When "OutOfMemoryError" appears in ChemSpot's standard error output.
        """

        # None stands for the default list of entity types.
        if opsin_types is None:
            opsin_types = ["SYSTEMATIC"]

        # Exactly one input source is used; input_text wins over input_file.
        if input_text and input_file:
            input_file = ""
            self.logger.warning("Both 'input_text' and 'input_file' are set, but 'input_text' will be prefered.")
        elif not input_text and not input_file:
            raise ValueError("One of 'input_text' or 'input_file' must be set.")

        # The input type is only detected/validated for file input.
        if not input_type and not input_text:
            possible_input_types = ["pdf", "image", "text"]
            input_type = get_input_file_type(input_file)
            if input_type not in possible_input_types:
                raise ValueError("Input file type ({}) is not one of {}".format(input_type, possible_input_types))
        elif input_type and not input_text:
            possible_input_types = ["pdf", "pdf_scan", "image", "text"]
            if input_type not in possible_input_types:
                raise ValueError("Unknown 'input_type'. Possible 'input_type' values are {}".format(possible_input_types))

        # Non-text inputs are turned into text first; from here on the text is the input.
        # os.environ["TESSDATA_PREFIX"] raises KeyError when the variable is unset.
        if input_type in ["pdf", "pdf_scan", "image"]:
            input_text, _ = get_text(input_file, input_type, lang=lang, tessdata_prefix=os.environ["TESSDATA_PREFIX"])
            input_file = ""

        if annotate and not chemspider_token:
            self.logger.warning("Cannot perform annotation in ChemSpider: 'chemspider_token' is empty.")

        # Per-call options (only when truthy) are layered over the instance's internal options.
        options = ChainMap({k: v for k, v in {"iob_format": iob_format}.items() if v},
                           self.options_internal)
        output_file_temp = None

        commands, _, _ = self.build_commands(options, self._OPTIONS_REAL, self.path_to_binary)
        # The max_memory option is placed right after the first command element.
        commands.insert(1, str(self.options_internal["max_memory"]))
        commands.append("-t")

        if normalize_text:
            normalizer = Normalizer(strip=True, collapse=True, hyphens=True, quotes=True, slashes=True, tildes=True, ellipsis=True)

            if input_file:
                with open(input_file, mode="r") as f:
                    input_text = f.read()

            input_text = normalizer(input_text)

            if not input_text:
                raise UserWarning("'input_text' is empty after normalization.")

            input_text = self.normalize_text(text=input_text)
            # NOTE(review): the temporary file is never closed explicitly; presumably the
            # local reference keeps it on disk while ChemSpot runs - confirm.
            input_file_normalized = NamedTemporaryFile(mode="w", encoding="utf-8")
            input_file_normalized.write(input_text)
            input_file_normalized.flush()
            input_file = input_file_normalized.name
        else:
            if input_text:
                # Raw text is handed to ChemSpot through a temporary file as well.
                input_file_temp = NamedTemporaryFile(mode="w", encoding="utf-8")
                input_file_temp.write(input_text)
                input_file_temp.flush()
                input_file = input_file_temp.name

        commands.append(os.path.abspath(input_file))
        commands.append("-o")
        if format_output:
            # ChemSpot writes to a temporary file which is parsed below.
            output_file_temp = NamedTemporaryFile(mode="w", encoding="utf-8")
            commands.append(os.path.abspath(output_file_temp.name))
        else:
            commands.append(os.path.abspath(output_file))

        # NOTE(review): this returns a str although the signature is annotated -> OrderedDict.
        if dry_run:
            return " ".join(commands)

        stdout, stderr, exit_code = common_subprocess(commands)

        if "OutOfMemoryError" in stderr:
            raise RuntimeError("ChemSpot memory error: {}".format(stderr))

        to_return = {"stdout": stdout, "stderr": stderr, "exit_code": exit_code, "content": None,
                     "normalized_text": input_text if normalize_text else None}

        # On failure (and unless told to continue) report stderr and return without content.
        if not continue_on_failure and exit_code > 0:
            self.logger.warning("ChemSpot error:")
            eprint("\n\t".join("\n{}".format(stderr).splitlines()))
            return to_return

        # NOTE(review): this repeats the value already stored when to_return was built.
        if normalize_text:
            to_return["normalized_text"] = input_text

        if not format_output:
            return to_return
        elif format_output:
            with open(output_file_temp.name, mode="r", encoding="utf-8") as f:
                output_chs = f.read()

            entities = self.parse_chemspot_iob(text=output_chs) if iob_format else self.parse_chemspot(text=output_chs)
            to_return["content"] = entities

            # Keep only the first occurrence of each entity name, preserving order.
            if remove_duplicates and not iob_format:
                seen = set()
                seen_add = seen.add
                to_return["content"] = [x for x in to_return["content"] if not (x["entity"] in seen or seen_add(x["entity"]))]

            # Cumulative end offsets of the non-blank pages (pages are separated by form feed);
            # used further down to map an entity's start offset to a page number.
            if input_type in ["pdf", "pdf_scan"] or paged_text:
                page_ends = []
                for i, page in enumerate(input_text.split("\f")):
                    if page.strip():
                        try:
                            page_ends.append(page_ends[-1] + len(page) - 1)
                        except IndexError:
                            # First non-blank page: there is no previous offset to add to.
                            page_ends.append(len(page) - 1)

            # Collect the entity names to convert with OPSIN; with convert_ions, entities
            # matching the ion regex are left out.
            if opsin_types:
                if convert_ions:
                    to_convert = [x["entity"] for x in to_return["content"] if x["type"] in opsin_types and not self.re_ion.match(x["entity"])]
                else:
                    to_convert = [x["entity"] for x in to_return["content"] if x["type"] in opsin_types]

                if to_convert:
                    opsin = OPSIN(verbosity=self.verbosity)
                    opsin_converted = opsin.process(input=to_convert, output_formats=["smiles", "inchi", "inchikey"],
                                                    standardize_mols=standardize_mols, output_file_sdf=output_file_sdf,
                                                    sdf_append=sdf_append)
                    # Results are consumed one by one with next() in the entity loop below.
                    opsin_converted = iter(opsin_converted["content"])
                else:
                    self.logger.info("Nothing to convert with OPSIN.")

            # Without a token no ChemSpider client is created; searches below yield [] instead.
            if annotate:
                chemspider = ChemSpider(chemspider_token) if chemspider_token else None

            for i, ent in enumerate(to_return["content"]):
                if input_type in ["pdf", "pdf_scan"] or paged_text:
                    # Page number = 1 + index of the first page end at or after the entity start.
                    ent["page"] = str(bisect.bisect_left(page_ends, int(ent["start"])) + 1)

                # Build a SMILES string for ion entities from the matched ion and charge.
                # NOTE(review): `smiles` is not assigned when a digit charge contains neither
                # "+" nor "-", or when none of the three charge groups is set; the code below
                # would then use a stale value from an earlier entity or raise NameError - confirm.
                if convert_ions:
                    match_ion = self.re_ion.match(ent["entity"])
                    if match_ion:
                        match_ion = match_ion.groupdict()
                        match_charge = self.re_charge.search(match_ion["charge"])
                        if match_charge:
                            match_charge = match_charge.groupdict()
                            if match_charge["roman"]:
                                # The charge magnitude is the length of the matched "roman" group.
                                smiles = "[{}+{}]".format(match_ion["ion"], len(match_charge["roman"]))
                            elif match_charge["digit"]:
                                if "+" in match_ion["charge"]:
                                    smiles = "[{}+{}]".format(match_ion["ion"], match_charge["digit"])
                                elif "-" in match_ion["charge"]:
                                    smiles = "[{}-{}]".format(match_ion["ion"], match_charge["digit"])
                            elif match_charge["signs"]:
                                # Sign is the first character, magnitude the number of sign characters.
                                smiles = "[{}{}{}]".format(match_ion["ion"], match_charge["signs"][0],
                                                           len(match_charge["signs"]))

                            mol = MolFromSmiles(smiles)
                            if mol:
                                inchi = MolToInchi(mol)
                                if inchi:
                                    ent.update(OrderedDict(
                                        [("smiles", smiles), ("inchi", inchi), ("inchikey", InchiToInchiKey(inchi))]))
                                else:
                                    ent.update(OrderedDict([("smiles", smiles), ("inchi", ""), ("inchikey", "")]))
                            else:
                                ent.update(OrderedDict([("smiles", ""), ("inchi", ""), ("inchikey", "")]))
                    else:
                        ent.update(OrderedDict([("smiles", ""), ("inchi", ""), ("inchikey", "")]))

                # Merge the OPSIN result for converted entities; other entities get empty
                # identifier columns (ion entities keep the identifiers set above).
                if opsin_types and to_convert:
                    if ent["entity"] in to_convert:
                        ent_opsin = next(opsin_converted)
                        ent.update(OrderedDict([("smiles", ent_opsin["smiles"]), ("inchi", ent_opsin["inchi"]),
                                                ("inchikey", ent_opsin["inchikey"]), ("opsin_error", ent_opsin["error"])]))
                    elif convert_ions and self.re_ion.match(ent["entity"]):
                        ent.update(OrderedDict([("opsin_error", "")]))
                    elif (convert_ions and not self.re_ion.match(ent["entity"])) or (not convert_ions and ent["entity"] not in to_convert):
                        ent.update(OrderedDict([("smiles", ""), ("inchi", ""), ("inchikey", ""), ("opsin_error", "")]))

                # TODO: this should be simplified...looks like garbage code
                if annotate:
                    self.logger.info("Annotating entity {}/{}...".format(i + 1, len(to_return["content"])))
                    # Start with all annotation columns present and empty.
                    ent.update(OrderedDict([("pch_cids_by_inchikey", ""), ("chs_cids_by_inchikey", ""),
                                            ("pch_cids_by_name", ""), ("chs_cids_by_name", ""),
                                            ("pch_cids_by_smiles", ""), ("chs_cids_by_smiles", ""),
                                            ("pch_cids_by_inchi", ""), ("chs_cids_by_inchi", ""),
                                            ("pch_cids_by_formula", ""),
                                            ("pch_iupac_name", ""), ("chs_common_name", ""),
                                            ("pch_synonyms", "")]))

                    # do "double-annotation": some entities can be found in only one DB, updated and then searched in second DB
                    found_in_pch = False
                    found_in_chs = False
                    for _ in range(2):
                        results = []

                        # prefer InChI key
                        if "inchikey" in ent and ent["inchikey"]:
                            try:
                                results = get_compounds(ent["inchikey"], "inchikey")
                                if results:
                                    # Identifiers and names are only taken from an unambiguous (single) hit.
                                    if len(results) == 1:
                                        result = results[0]
                                        synonyms = result.synonyms
                                        if synonyms:
                                            ent["pch_synonyms"] = "\"{}\"".format("\",\"".join(synonyms))
                                        ent["pch_iupac_name"] = result.iupac_name
                                        if not found_in_chs:
                                            ent["smiles"] = result.canonical_smiles or ent["smiles"]
                                            ent["inchi"] = result.inchi or ent["inchi"]
                                            ent["inchikey"] = result.inchikey or ent["inchikey"]
                                    ent["pch_cids_by_inchikey"] = "\"{}\"".format(",".join([str(c.cid) for c in results]))
                            except (BadRequestError, NotFoundError, PubChemHTTPError, ResponseParseError, ServerError, TimeoutError, PubChemPyError):
                                # PubChem lookup failures are ignored; the columns stay as they are.
                                pass

                            results = chemspider.search(ent["inchikey"]) if chemspider_token else []
                            if results:
                                if len(results) == 1:
                                    result = results[0]
                                    ent["chs_common_name"] = result.common_name
                                    if not found_in_pch:
                                        ent["smiles"] = result.smiles or ent["smiles"]
                                        ent["inchi"] = result.stdinchi or ent["inchi"]
                                        ent["inchikey"] = result.stdinchikey or ent["inchikey"]
                                ent["chs_cids_by_inchikey"] = "\"{}\"".format(",".join([str(c.csid) for c in results]))
                        else:
                            # No InChI key: search by name first, then by SMILES, InChI and formula.
                            # The condition below reduces to `not found_in_pch`.
                            if (not found_in_pch and not found_in_chs) or (not found_in_pch and found_in_chs):
                                try:
                                    results = get_compounds(ent["entity"] or ent["abbreviation"], "name")
                                    if results:
                                        if len(results) == 1:
                                            found_in_pch = True
                                            result = results[0]
                                            synonyms = result.synonyms
                                            if synonyms:
                                                ent["pch_synonyms"] = "\"{}\"".format("\",\"".join(synonyms))
                                            # only update identifiers if they weren't found in second DB
                                            if not found_in_chs:
                                                ent["smiles"] = result.canonical_smiles or ent["smiles"]
                                                ent["inchi"] = result.inchi or ent["inchi"]
                                                ent["inchikey"] = result.inchikey or ent["inchikey"]
                                            ent["pch_iupac_name"] = result.iupac_name
                                        ent["pch_cids_by_name"] = "\"{}\"".format(",".join([str(c.cid) for c in results]))
                                except (BadRequestError, NotFoundError, PubChemHTTPError, ResponseParseError, ServerError, TimeoutError, PubChemPyError):
                                    pass

                            # The condition below reduces to `not found_in_chs`.
                            if (not found_in_pch and not found_in_chs) or (found_in_pch and not found_in_chs):
                                results = chemspider.search(ent["entity"] or ent["abbreviation"]) if chemspider_token else []
                                if results:
                                    if len(results) == 1:
                                        found_in_chs = True
                                        result = results[0]
                                        if not found_in_pch:
                                            ent["smiles"] = result.smiles or ent["smiles"]
                                            ent["inchi"] = result.stdinchi or ent["inchi"]
                                            ent["inchikey"] = result.stdinchikey or ent["inchikey"]
                                        ent["chs_common_name"] = result.common_name
                                    ent["chs_cids_by_name"] = "\"{}\"".format(",".join([str(c.csid) for c in results]))

                            # Each tuple: (field to search by, PubChem result column, ChemSpider result column).
                            for search_field, col_pch, col_chs in [("smiles", "pch_cids_by_smiles", "chs_cids_by_smiles"),
                                                                   ("inchi", "pch_cids_by_inchi", "chs_cids_by_inchi"),
                                                                   ("formula", "pch_cids_by_formula", "")]:
                                results_pch = []
                                results_chs = []

                                # SMILES containing "*" are not searched.
                                if search_field == "smiles" and "smiles" in ent and ent["smiles"] and "*" not in ent["smiles"]:
                                    if (not found_in_pch and not found_in_chs) or (not found_in_pch and found_in_chs):
                                        try:
                                            results_pch = get_compounds(ent["smiles"], "smiles")
                                        except (BadRequestError, NotFoundError, PubChemHTTPError, ResponseParseError, ServerError, TimeoutError, PubChemPyError):
                                            pass
                                    if (not found_in_pch and not found_in_chs) or (found_in_pch and not found_in_chs):
                                        results_chs = chemspider.search(ent["smiles"]) if chemspider_token else []
                                elif search_field == "inchi" and "inchi" in ent and ent["inchi"]:
                                    if (not found_in_pch and not found_in_chs) or (not found_in_pch and found_in_chs):
                                        try:
                                            results_pch = get_compounds(ent["inchi"], "inchi")
                                        except (BadRequestError, NotFoundError, PubChemHTTPError, ResponseParseError, ServerError, TimeoutError, PubChemPyError):
                                            pass
                                    if (not found_in_pch and not found_in_chs) or (found_in_pch and not found_in_chs):
                                        results_chs = chemspider.search(ent["inchi"]) if chemspider_token else []
                                elif search_field == "formula":
                                    # The entity text itself is used as the formula query.
                                    if (not found_in_pch and not found_in_chs) or (not found_in_pch and found_in_chs):
                                        try:
                                            results_pch = get_compounds(ent["entity"], "formula")
                                        except (BadRequestError, NotFoundError, PubChemHTTPError, ResponseParseError, ServerError, TimeoutError, PubChemPyError):
                                            pass
                                    # ChemSpider doesn't have search field for 'formula'

                                if results_pch:
                                    ent[col_pch] = "\"{}\"".format(",".join([str(c.cid) for c in results_pch]))
                                if results_chs:
                                    ent[col_chs] = "\"{}\"".format(",".join([str(c.csid) for c in results_chs]))

                                # Short pause between the per-field searches.
                                sleep(0.5)

                        sleep(annotation_sleep)

                        # A second pass only runs when one of the name searches gave a single hit.
                        if not found_in_pch and not found_in_chs:
                            break

            if output_file:
                dict_to_csv(to_return["content"], output_file=output_file, csv_delimiter=csv_delimiter, write_header=write_header)

        return to_return

コード例 #7

ファイルを表示

 def test4MolToInchiKey(self):
     """The key derived via the InChI string equals the key from MolToInchiKey."""
     mol = MolFromSmiles("CC=C(N)C")
     key_via_inchi = InchiToInchiKey(MolToInchi(mol))
     key_direct = MolToInchiKey(mol)
     self.assertEqual(key_via_inchi, key_direct)

コード例 #8

ファイルを表示

ファイル: molecule_properties_tools.py プロジェクト: GPCRmd/GPCRmd

def generate_inchikey(inchi):
    """Return the InChIKey that ``InchiToInchiKey`` produces for *inchi*."""
    inchikey = InchiToInchiKey(inchi)
    return inchikey

コード例 #9

ファイルを表示

ファイル: OSRA.py プロジェクト: abarbarov/FORK-molminer

    def process(
            self,
            input_file: str,
            output_file: str = "",
            output_file_sdf: str = "",
            sdf_append: bool = False,
            #images_prefix: str = "",
            format_output: bool = True,
            write_header: bool = True,
            osra_output_format: str = "",
            output_formats: list = None,
            dry_run: bool = False,
            csv_delimiter: str = ";",
            use_gm: bool = True,
            gm_dpi: int = 300,
            gm_trim: bool = True,
            n_jobs: int = -1,
            input_type: str = "",
            standardize_mols: bool = True,
            annotate: bool = True,
            chemspider_token: str = "",
            custom_page: int = 0,
            continue_on_failure: bool = False) -> OrderedDict:
        r"""
        Process the input file with OSRA.

        Parameters
        ----------
        input_file : str
            Path to file to be processed by OSRA.
        output_file : str
            File to write output in.
        output_file_sdf : str
            | File to write SDF output in. The "sdf" output format does not have to be in `output_formats` to write SDF output.
            | If "sdf_osra" output format is requested, suffix "-osra.sdf" will be added.
        sdf_append : bool
            If True, append new molecules to existing SDF file or create new one if it doesn't exist.
        NOT IMPLEMENTED | images_prefix : str
            Prefix for images of extracted compounds which will be written.
        format_output : bool
            | If True, the value of "content" key of returned dict will be list of OrderedDicts.
            | If True and `output_file` is set, the CSV file will be written.
            | If False, the value of "content" key of returned dict will be None.
        write_header : bool
            If True and if `output_file` is set and `format_output` is True, write a CSV header.
        osra_output_format : str
            | Output format from OSRA. Temporarily overrides the option `output_format` set during instantiation (in __init__).
            | Choices: "smi", "can", "sdf"
            | If "sdf", additional information like coordinates cannot be retrieved (not implemented yet).
        output_formats : list
            | If set and `format_output` is also True, this specifies which molecule formats will be output.
            | You can specify more than one format, but only one format from OSRA. This format must be also set with `output_format` in __init__
              or with `osra_output_format` here.
            | When output produced by OSRA is unreadable by RDKit, you can at least have that output from OSRA.
            | Default value: ["smiles"]

            +-----------------+--------------+--------------------------------------------------------------------------------------------+
            |      Value      |    Source    |                                            Note                                            |
            +=================+==============+============================================================================================+
            |      smiles     |     RDKit    |                                          canonical                                         |
            +-----------------+--------------+--------------------------------------------------------------------------------------------+
            |   smiles_osra   | OSRA ("smi") |                                           SMILES                                           |
            +-----------------+--------------+--------------------------------------------------------------------------------------------+
            | smiles_can_osra | OSRA ("can") |                                      canonical SMILES                                      |
            +-----------------+--------------+--------------------------------------------------------------------------------------------+
            |      inchi      |     RDKit    | Not every molecule can be converted to InChI (it doesn`t support wildcard characters etc.) |
            +-----------------+--------------+--------------------------------------------------------------------------------------------+
            |     inchikey    |     RDKit    |                              The same applies as for "inchi".                              |
            +-----------------+--------------+--------------------------------------------------------------------------------------------+
            |       sdf       |     RDKit    |                     If present, an additional SDF file will be created.                    |
            +-----------------+--------------+--------------------------------------------------------------------------------------------+
            |     sdf_osra    | OSRA ("sdf") |                     If present, an additional SDF file will be created.                    |
            +-----------------+--------------+--------------------------------------------------------------------------------------------+

        dry_run : bool
            If True, only return the command line (a single space-joined string) that would be run.
        csv_delimiter : str
            Delimiter for output CSV file.
        use_gm : bool
            | If True, use GraphicsMagick to convert PDF to temporary PNG images before processing.
            | If False, OSRA will use its own conversion of PDF to image.
            | Using gm is more reliable since OSRA (v2.1.0) is showing wrong information
              when converting directly from PDF (namely: coordinates, bond length and possibly more ones) and also there are sometimes
              incorrectly recognised structures.
        gm_dpi : int
            How many DPI will temporary PNG images have.
        gm_trim : bool
            If True, gm will trim the temporary PNG images.
        n_jobs : int
            | If `use_gm` and input file is PDF, how many jobs to use for OSRA processing of temporary PNG images.
            | If -1 all CPUs are used.
            | If 1 is given, no parallel computing code is used at all, which is useful for debugging.
            | For n_jobs below -1, (n_cpus + 1 + n_jobs) are used. Thus for n_jobs = -2, all CPUs but one are used.
        input_type : str
            | When empty, input (MIME) type will be determined from magic bytes.
            | Or you can specify "pdf" or "image" and magic bytes check will be skipped.
        standardize_mols : bool
            If True and `format_output` is also True, use molvs (https://github.com/mcs07/MolVS) to standardize molecules.
        annotate : bool
            | If True, try to annotate entities in PubChem and ChemSpider. Compound IDs will be assigned by searching with
              each identifier, separately for SMILES, InChI etc.
            | If entity already has an InChI key, prefer it in searching.
            | If "*" is present in SMILES, skip annotation.
        chemspider_token : str
            Your personal token for accessing the ChemSpider API. Make account there to obtain it.
        custom_page : int
            When `use_gm` is False, this will set the page for all extracted compounds.
        continue_on_failure : bool
            | If True, continue running even if OSRA returns non-zero exit code.
            | If False and error occurs, print it and return.

        Returns
        -------
        dict
            Keys:

            - stdout: list of str ... standard output from OSRA, one item per processed image
            - stderr: list of str ... standard error output from OSRA
            - exit_code: list of int ... exit code from OSRA
            - pages: list ... page of each processed image
            - content:

                - list of OrderedDicts ... when `format_output` is True.
                - None ... when `format_output` is False

            | If `osra_output_format` is "sdf", additional information like 'bond_length' cannot be retrieved.
            | If `dry_run` is True, a string with the command line is returned instead of the dict.

        Raises
        ------
        ValueError
            If the OSRA output format or an explicitly given `input_type` is not one of the supported values.

        Notes
        -----
        Only with `format_output` set to True you can use molecule standardization and more molecule formats. Otherwise
        you will only get raw stdout from OSRA (which can also be written to file if `output_file` is set).
        """

        options_internal = self.options_internal.copy()
        osra_smiles_outputs = ["smi", "can"]

        # OSRA output format check
        if osra_output_format:
            options_internal["output_format"] = osra_output_format
        else:
            osra_output_format = options_internal["output_format"]

        # Maps each accepted OSRA format to the output column that carries it.
        osra_valid_output_formats = {
            "can": "smiles_can_osra",
            "smi": "smiles_osra",
            "sdf": "sdf_osra"
        }
        if osra_output_format not in osra_valid_output_formats:
            # Report the accepted choices (the keys), not the column names.
            raise ValueError(
                "Unknown OSRA output format. Possible values: {}".format(
                    list(osra_valid_output_formats)))

        if osra_output_format == "sdf":
            self.logger.warning(
                "OSRA's output format is set to \"sdf\" so additional information like coordinates cannot be retrieved."
            )

        # output formats check
        # NOTE(review): SDF writing is only enabled when `output_formats` is
        # passed explicitly, although the docstring suggests `output_file_sdf`
        # alone should be enough -- confirm the intended behaviour.
        is_output_sdf = False
        is_output_sdf_osra = False
        if not output_formats:
            output_formats = ["smiles"]
        else:
            output_formats = sorted(set(output_formats))
            possible_output_formats = ["smiles", "inchi", "inchikey", "sdf"]
            output_formats = [
                x for x in output_formats if x in possible_output_formats
                or x == osra_valid_output_formats[osra_output_format]
            ]

            if ("sdf" in output_formats
                    or "sdf_osra" in output_formats) and not output_file_sdf:
                self.logger.warning(
                    "Cannot write SDF output: 'output_file_sdf' is not set.")
            if output_file_sdf:
                is_output_sdf = True
            if "sdf_osra" in output_formats and osra_output_format == "sdf" and output_file_sdf:
                is_output_sdf_osra = True
            if ("smiles_osra" in output_formats or "smiles_can_osra"
                    in output_formats) and osra_output_format == "sdf":
                for unavailable in ("smiles_osra", "smiles_can_osra"):
                    if unavailable in output_formats:
                        output_formats.remove(unavailable)
                self.logger.warning(
                    "SMILES or canonical SMILES output from OSRA is requested, but OSRA's output format is \"{}\"."
                    .format(osra_output_format))

        # input file type check
        possible_input_types = ["pdf", "image"]
        if not input_type:
            input_type = get_input_file_type(input_file)
            if input_type not in possible_input_types:
                use_gm = False
                self.logger.warning(
                    "Input file MIME type ('{}') is not one of {}. You can specify 'input_type' directly (see docstring)."
                    .format(input_type, possible_input_types))
        elif input_type not in possible_input_types:
            raise ValueError("Possible 'input_type' values are {}".format(
                possible_input_types))

        #options = ChainMap({k: v for k, v in {"images_prefix": images_prefix}.items() if v},
        #                   options_internal)

        if annotate:
            if not chemspider_token:
                self.logger.warning(
                    "Cannot perform annotation in ChemSpider: 'chemspider_token' is empty."
                )
            # Annotation searches by these identifiers, so make sure they are produced.
            for required_format in ("smiles", "inchi", "inchikey"):
                if required_format not in output_formats:
                    output_formats.append(required_format)
            output_formats = sorted(output_formats)

        commands, _, _ = self.build_commands(options_internal,
                                             self._OPTIONS_REAL,
                                             self.path_to_binary)
        commands.extend(
            ["--bond", "--coordinates", "--page", "--guess", "--print"])

        if dry_run:
            return " ".join(commands)

        osra_output_list = []
        if input_type == "image" or not use_gm:
            osra_output_list.append(
                self._process(input_file,
                              commands,
                              page=custom_page if custom_page else 1))
        elif input_type == "pdf":
            with tempfile.TemporaryDirectory() as temp_dir:
                stdout, stderr, exit_code = pdf_to_images(input_file,
                                                          temp_dir,
                                                          dpi=gm_dpi,
                                                          trim=gm_trim)
                osra_output_list = Parallel(n_jobs=n_jobs)(
                    delayed(self._process)(
                        temp_image_file, commands, page=page)
                    for temp_image_file, page in get_temp_images(temp_dir))

        # summarize OSRA results
        to_return = {
            "stdout": [],
            "stderr": [],
            "exit_code": [],
            "content": None,
            "pages": []
        }
        # NOTE(review): results with empty stdout are dropped together with
        # their stderr/exit code -- confirm that failed runs are not lost here.
        for result in osra_output_list:
            if result["stdout"]:
                to_return["stdout"].append(result["stdout"])
                to_return["stderr"].append(result["stderr"])
                to_return["exit_code"].append(result["exit_code"])
                to_return["pages"].append(result["page"])

        if not continue_on_failure:
            # Use the recorded page of each result; the list position does not
            # match the page once some results have been skipped above.
            errors = [(page, error) for page, exit_code, error in zip(
                to_return["pages"], to_return["exit_code"],
                to_return["stderr"]) if exit_code > 0]
            if errors:
                self.logger.warning("OSRA errors:")
                for page, error in errors:
                    eprint("\tError on page {}:".format(page))
                    eprint("\n\t\t".join("\n{}".format(error).splitlines()))
                return to_return

        if not format_output:
            if output_file:
                with open(output_file, mode="w", encoding="utf-8") as f:
                    f.write("\n".join(to_return["stdout"]))
            return to_return

        # Column name -> index of that column in a whitespace-split OSRA SMILES line.
        output_cols = OrderedDict([("bond_length", 1), ("resolution", 2),
                                   ("confidence", 3), ("page", 4),
                                   ("coordinates", 5)])

        if osra_output_format in osra_smiles_outputs:
            compound_template_dict = OrderedDict.fromkeys(
                output_formats + list(output_cols.keys()))
        else:
            compound_template_dict = OrderedDict.fromkeys(["page"] +
                                                          output_formats)

        if any(to_return["stdout"]):
            if standardize_mols:
                standardizer = Standardizer()

            compounds = []

            if is_output_sdf:
                if sdf_append:
                    if not os.path.isfile(output_file_sdf):
                        open(output_file_sdf, mode="w",
                             encoding="utf-8").close()
                    writer = SDWriter(
                        open(output_file_sdf, mode="a", encoding="utf-8"))
                else:
                    writer = SDWriter(output_file_sdf)

            for output, page in zip(to_return["stdout"], to_return["pages"]):
                if osra_output_format in osra_smiles_outputs:
                    lines = [x.strip() for x in output.split("\n") if x]
                else:
                    lines = [x for x in output.split("$$$$") if x.strip()]

                for line in lines:
                    # OSRA's --learn option is not handled here: its output
                    # looks like "1,2,2,2 1", so the line could not simply be
                    # split on whitespace.

                    if not line:
                        continue

                    if osra_output_format in osra_smiles_outputs:
                        line = [x.strip() for x in line.split()]
                        if custom_page:
                            line[output_cols["page"]] = custom_page
                        elif use_gm:
                            line[output_cols["page"]] = page
                        # Skip sanitization when the standardizer runs afterwards.
                        mol = MolFromSmiles(line[0],
                                            sanitize=not standardize_mols)
                    elif osra_output_format == "sdf":
                        line = "\n" + line.strip()
                        mol = MolFromMolBlock(line,
                                              strictParsing=False,
                                              sanitize=not standardize_mols,
                                              removeHs=not standardize_mols)

                    if mol:
                        compound = compound_template_dict.copy()

                        if standardize_mols:
                            try:
                                mol = standardizer.standardize(mol)
                            except ValueError as e:
                                self.logger.warning(
                                    "Cannot standardize '{}': {}".format(
                                        MolToSmiles(mol), str(e)))

                        for f in output_formats:
                            if f == "smiles":
                                compound["smiles"] = MolToSmiles(
                                    mol, isomericSmiles=True)
                            elif f == "smiles_osra" and osra_output_format == "smi":
                                compound["smiles_osra"] = line[0]
                            elif f == "smiles_can_osra" and osra_output_format == "can":
                                compound["smiles_can_osra"] = line[0]
                            elif f == "inchi":
                                inchi = MolToInchi(mol)
                                if inchi:
                                    compound["inchi"] = inchi
                                else:
                                    compound["inchi"] = ""
                                    self.logger.warning(
                                        "Cannot convert to InChI: {}".format(
                                            MolToSmiles(mol)))
                            elif f == "inchikey":
                                inchi = MolToInchi(mol)
                                if inchi:
                                    compound["inchikey"] = InchiToInchiKey(
                                        inchi)
                                else:
                                    compound["inchikey"] = ""
                                    self.logger.warning(
                                        "Cannot create InChI-key from InChI: {}"
                                        .format(MolToSmiles(mol)))
                            elif f == "sdf":
                                compound["sdf"] = MolToMolBlock(
                                    mol, includeStereo=True)
                            elif f == "sdf_osra":
                                compound["sdf_osra"] = line

                        if is_output_sdf:
                            writer.write(mol)

                        if osra_output_format in osra_smiles_outputs:
                            # Fill bond_length, resolution, ... from the
                            # remaining fields of the OSRA line.
                            compound.update(zip(output_cols, line[1:]))
                        else:
                            compound["page"] = page if use_gm else (
                                custom_page or 1)

                        compounds.append(compound)
                    else:
                        # For SMILES output `line` is a list of fields, for SDF
                        # it is the mol block string itself.
                        if osra_output_format in osra_smiles_outputs:
                            unparsed = line[0]
                        else:
                            unparsed = line.strip()
                        self.logger.warning("Cannot convert to RDKit mol: " +
                                            unparsed)

            if is_output_sdf_osra:
                with open(output_file_sdf + "-osra.sdf",
                          mode="w",
                          encoding="utf-8") as f:
                    f.write("".join(to_return["stdout"]))

            to_return["content"] = sorted(compounds, key=lambda x: x["page"])

            if annotate:
                chemspider = ChemSpider(
                    chemspider_token) if chemspider_token else None

                for i, ent in enumerate(to_return["content"]):
                    self.logger.info("Annotating entity {}/{}...".format(
                        i + 1, len(to_return["content"])))
                    ent.update(
                        OrderedDict([("pch_cids_by_inchikey", ""),
                                     ("chs_cids_by_inchikey", ""),
                                     ("pch_cids_by_smiles", ""),
                                     ("chs_cids_by_smiles", ""),
                                     ("pch_cids_by_inchi", ""),
                                     ("chs_cids_by_inchi", ""),
                                     ("pch_iupac_name", ""),
                                     ("chs_common_name", ""),
                                     ("pch_synonyms", "")]))

                    results = []

                    # prefer InChI key
                    if "inchikey" in ent and ent["inchikey"]:
                        try:
                            results = get_compounds(ent["inchikey"],
                                                    "inchikey")
                            if results:
                                # Names are only recorded for an unambiguous hit.
                                if len(results) == 1:
                                    result = results[0]
                                    synonyms = result.synonyms
                                    if synonyms:
                                        ent["pch_synonyms"] = "\"{}\"".format(
                                            "\",\"".join(synonyms))
                                    ent["pch_iupac_name"] = result.iupac_name
                                ent["pch_cids_by_inchikey"] = "\"{}\"".format(
                                    ",".join([str(c.cid) for c in results]))
                        except (BadRequestError, NotFoundError,
                                PubChemHTTPError, ResponseParseError,
                                ServerError, TimeoutError, PubChemPyError):
                            # Annotation is best-effort: a failed PubChem
                            # lookup leaves the columns empty.
                            pass

                        results = chemspider.search(
                            ent["inchikey"]) if chemspider_token else []
                        if results:
                            if len(results) == 1:
                                result = results[0]
                                ent["chs_common_name"] = result.common_name
                            ent["chs_cids_by_inchikey"] = "\"{}\"".format(
                                ",".join([str(c.csid) for c in results]))
                    else:
                        for search_field, col_pch, col_chs in [
                            ("smiles", "pch_cids_by_smiles",
                             "chs_cids_by_smiles"),
                            ("inchi", "pch_cids_by_inchi", "chs_cids_by_inchi")
                        ]:
                            results_pch = []
                            results_chs = []

                            if search_field == "smiles" and "smiles" in ent and ent[
                                    "smiles"] and "*" not in ent["smiles"]:
                                try:
                                    results_pch = get_compounds(
                                        ent["smiles"], "smiles")
                                except (BadRequestError, NotFoundError,
                                        PubChemHTTPError, ResponseParseError,
                                        ServerError, TimeoutError,
                                        PubChemPyError):
                                    pass
                                results_chs = chemspider.search(
                                    ent["smiles"]) if chemspider_token else []
                            elif search_field == "inchi" and "inchi" in ent and ent[
                                    "inchi"]:
                                try:
                                    results_pch = get_compounds(
                                        ent["inchi"], "inchi")
                                except (BadRequestError, NotFoundError,
                                        PubChemHTTPError, ResponseParseError,
                                        ServerError, TimeoutError,
                                        PubChemPyError):
                                    pass
                                results_chs = chemspider.search(
                                    ent["inchi"]) if chemspider_token else []

                            if results_pch:
                                ent[col_pch] = "\"{}\"".format(",".join(
                                    [str(c.cid) for c in results_pch]))
                            if results_chs:
                                ent[col_chs] = "\"{}\"".format(",".join(
                                    [str(c.csid) for c in results_chs]))

                            # Pause between consecutive remote lookups.
                            sleep(0.5)

            if output_file:
                dict_to_csv(to_return["content"],
                            output_file=output_file,
                            csv_delimiter=csv_delimiter,
                            write_header=write_header)

            if is_output_sdf:
                writer.close()
        elif not any(to_return["stdout"]) and output_file:
            write_empty_file(output_file,
                             csv_delimiter=csv_delimiter,
                             header=list(compound_template_dict.keys()),
                             write_header=write_header)

        return to_return

コード例 #10

ファイルを表示

ファイル: OPSIN.py プロジェクト: abarbarov/FORK-molminer

    def process(self,
                input: Union[str, list] = "",
                input_file: str = "",
                output_file: str = "",
                output_file_sdf: str = "",
                output_file_cml: str = "",
                sdf_append: bool = False,
                format_output: bool = True,
                opsin_output_format: str = "",
                output_formats: list = None,
                write_header: bool = True,
                dry_run: bool = False,
                csv_delimiter: str = ";",
                standardize_mols: bool = True,
                normalize_plurals: bool = True,
                continue_on_failure: bool = False) -> OrderedDict:
        r"""
        Process the input file with OPSIN.

        Parameters
        ----------
        input : str or list
            | str: String with IUPAC names, one per line.
            | list: List of IUPAC names.
        input_file : str
            Path to file to be processed by OPSIN. One IUPAC name per line.
        output_file : str
            File to write output in.
        output_file_sdf : str
            File to write SDF output in.
        output_file_cml : str
            | File to write CML (Chemical Markup Language) output in. `opsin_output_format` must be "cml".
            | Not supported by RDKit so standardization and conversion to other formats cannot be done.
        sdf_append : bool
            If True, append new molecules to existing SDF file or create new one if doesn't exist.
        format_output : bool
            | If True, the value of "content" key of returned dict will be list of OrderedDicts with keys:
            | "iupac", <output formats>, ..., "error"
            | If True and `output_file` is set it will be created as CSV file with columns: "iupac", <output formats>, ..., "error"
            | If False, the value of "content" key of returned dict will be None.
        opsin_output_format : str
            | Output format from OPSIN. Temporarily overrides the option `output_format` set during instantiation (in __init__).
            | Choices: "cml", "smi", "extendedsmi", "inchi", "stdinchi", "stdinchikey"
        output_formats : list
            | If True and `format_output` is also True, this specifies which molecule formats will be output.
            | You can specify more than one format, but only one format from OPSIN. This format must be also set with `output_format` in __init__
              or with `osra_output_format` here.
            | Default value: ["smiles"]

            +-----------------------+-----------------------+--------------------------------------------------------------------------------------------+
            |         Value         |         Source        |                                            Note                                            |
            +=======================+=======================+============================================================================================+
            |         smiles        |         RDKit         |                                          canonical                                         |
            +-----------------------+-----------------------+--------------------------------------------------------------------------------------------+
            |      smiles_opsin     |     OPSIN ("smi")     |                                           SMILES                                           |
            +-----------------------+-----------------------+--------------------------------------------------------------------------------------------+
            | smiles_extended_opsin | OPSIN ("extendedsmi") |                          Extended SMILES. Not supported by RDKit.                          |
            +-----------------------+-----------------------+--------------------------------------------------------------------------------------------+
            |         inchi         |         RDKit         | Not every molecule can be converted to InChI (it doesn`t support wildcard characters etc.) |
            +-----------------------+-----------------------+--------------------------------------------------------------------------------------------+
            |      inchi_opsin      |    OPSIN ("inchi")    |                                            InChI                                           |
            +-----------------------+-----------------------+--------------------------------------------------------------------------------------------+
            |     stdinchi_opsin    |   OPSIN ("stdinchi")  |                                       standard InChI                                       |
            +-----------------------+-----------------------+--------------------------------------------------------------------------------------------+
            |        inchikey       |         RDKit         |      The same applies as for "inchi". Also molecule cannot be created from InChI-key.      |
            +-----------------------+-----------------------+--------------------------------------------------------------------------------------------+
            |   stdinchikey_opsin   | OPSIN ("stdinchikey") |               Standard InChI-key. Cannot be used by RDKit to create molecule.              |
            +-----------------------+-----------------------+--------------------------------------------------------------------------------------------+
            |          sdf          |         RDKit         |                     If present, an additional SDF file will be created.                    |
            +-----------------------+-----------------------+--------------------------------------------------------------------------------------------+

        write_header : bool
            If True and if `output_file` is set and `format_output` is True, write a CSV header.
        dry_run : bool
            If True, only return list of commands to be called by subprocess.
        csv_delimiter : str
            Delimiter for output CSV file.
        standardize_mols : bool
            If True and `format_output` is also True, use molvs (https://github.com/mcs07/MolVS) to standardize molecules.
        normalize_plurals : bool
            | If True, normalize plurals ("nitrates" -> "nitrate"). See OPSIN.PLURAL_PATTERNS for relating plurals. You can
              set your own regex pattern with `plural_patterns` in __init__.
        continue_on_failure : bool
            | If True, continue running even if OPSIN returns non-zero exit code.
            | If False and error occurs, print it and return.

        Returns
        -------
        dict
            Keys:

            - stdout: str ... standard output from OPSIN
            - stderr: str ... standard error output from OPSIN
            - exit_code: int ... exit code from OPSIN
            - content:

              - list of OrderedDicts ... when format_output is True. Fields: "iupac", <output formats>, ..., "error"
              - None ... when format_output is False
        """

        options_internal = self.options_internal.copy()
        opsin_nonreadable_formats = ["cml", "stdinchikey"]

        if input and input_file:
            input_file = ""
            self.logger.warning(
                "Both 'input' and 'input_file' are set, but 'input' will be prefered."
            )
        elif not input and not input_file:
            raise ValueError("One of 'input' or 'input_file' must be set.")

        # OPSIN output format check
        if opsin_output_format:
            options_internal["output_format"] = opsin_output_format
        else:
            opsin_output_format = options_internal["output_format"]

        opsin_valid_output_formats = {
            "cml": "cml_opsin",
            "smi": "smiles_opsin",
            "extendedsmi": "smiles_extended_opsin",
            "inchi": "inchi_opsin",
            "stdinchi": "stdinchi_opsin",
            "stdinchikey": "stdinchikey_opsin"
        }

        if opsin_output_format not in opsin_valid_output_formats:
            raise ValueError(
                "Unknown OPSIN output format. Possible values: {}".format(
                    list(opsin_valid_output_formats.keys())))

        if standardize_mols and opsin_output_format in opsin_nonreadable_formats:
            self.logger.warning(
                "OPSIN output format is \"{}\", which cannot be used by RDKit."
                .format(opsin_output_format))

        # output formats check
        if not output_formats:
            output_formats = ["smiles"]
        else:
            if opsin_output_format == "stdinchikey":
                output_formats = ["stdinchikey_opsin"]
            elif opsin_output_format == "extendedsmi":
                output_formats = ["smiles_extended_opsin"]
            else:
                output_formats = sorted(list(set(output_formats)))
                possible_output_formats = [
                    "smiles", "inchi", "inchikey", "sdf"
                ]
                output_formats = [
                    x for x in output_formats if x in possible_output_formats
                    or x == opsin_valid_output_formats[opsin_output_format]
                ]

        if normalize_plurals:
            if input_file:
                with open(input_file, mode="r", encoding="utf-8") as f:
                    input = "\n".join([x.strip() for x in f.readlines()])
                input_file = ""
            input = self.normalize_iupac(input)

        commands, _, _ = self.build_commands(options_internal,
                                             self._OPTIONS_REAL,
                                             self.path_to_binary)

        if input_file:
            commands.append(input)
            stdout, stderr, exit_code = common_subprocess(commands)
        elif input:
            if isinstance(input, list):
                input = "\n".join([x.strip() for x in input])
            stdout, stderr, exit_code = common_subprocess(commands,
                                                          stdin=input)
        else:
            raise UserWarning("Input is empty.")

        if dry_run:
            return " ".join(commands)

        to_return = {
            "stdout": stdout,
            "stderr": stderr,
            "exit_code": exit_code,
            "content": None
        }

        if not continue_on_failure and exit_code > 0:
            self.logger.warning("OPSIN error:")
            eprint("\n\t".join("\n{}".format(stderr).splitlines()))
            return to_return

        if output_file_cml and opsin_output_format == "cml":
            with open(output_file_cml, mode="w", encoding="utf-8") as f:
                f.write(stdout)
            return to_return
        elif output_file_cml and opsin_output_format != "cml":
            self.logger.warning(
                "Output file for CML is requested, but OPSIN output format is '{}'"
                .format(opsin_output_format))

        if not format_output:
            if output_file:
                with open(output_file, mode="w", encoding="utf-8") as f:
                    f.write(stdout)
            return to_return

        compounds = []
        standardizer = Standardizer()
        empty_cols = OrderedDict([(x, "") for x in output_formats])

        if output_file_sdf:
            if sdf_append:
                if not os.path.isfile(output_file_sdf):
                    open(output_file_sdf, mode="w", encoding="utf-8").close()
                writer = SDWriter(
                    open(output_file_sdf, mode="a", encoding="utf-8"))
            else:
                writer = SDWriter(output_file_sdf)

        stdout = stdout.split("\n")
        del stdout[-1]
        stderr = [
            x.strip() for x in stderr.split("\n")[1:] if x
        ]  # remove first line of stderr because there is OPSIN message (y u du dis...)

        if input_file:
            with open(input_file, mode="r", encoding="utf-8") as f:
                lines = iter(f.readlines())
        else:
            lines = iter(input.split("\n"))

        mol_output_template = OrderedDict.fromkeys(["iupac"] + output_formats +
                                                   ["error"])

        e = 0
        for i, line in enumerate(lines):
            line = line.strip()
            converted = stdout[i].strip()
            mol_output = mol_output_template.copy()

            if converted:
                if opsin_output_format == "stdinchikey":
                    compounds.append(
                        OrderedDict([("iupac", line),
                                     ("stdinchikey_opsin", converted),
                                     ("error", "")]))
                    continue
                elif opsin_output_format == "extendedsmi":
                    compounds.append(
                        OrderedDict([("iupac", line),
                                     ("smiles_extended_opsin", converted),
                                     ("error", "")]))
                    continue

                if opsin_output_format == "smi":
                    mol = MolFromSmiles(
                        converted,
                        sanitize=False if standardize_mols else True)
                elif opsin_output_format in ["inchi", "stdinchi"]:
                    mol = MolFromInchi(
                        converted,
                        sanitize=False if standardize_mols else True,
                        removeHs=False if standardize_mols else True)

                if mol:
                    if standardize_mols:
                        try:
                            mol = standardizer.standardize(mol)
                        except ValueError as e:
                            self.logger.warning(
                                "Cannot standardize '{}': {}".format(
                                    MolToSmiles(mol), str(e)))

                    for f in output_formats:
                        if f == "smiles":
                            mol_output["smiles"] = MolToSmiles(
                                mol, isomericSmiles=True)
                        elif f == "smiles_opsin" and opsin_output_format == "smi":
                            mol_output["smiles_opsin"] = converted
                        elif f == "inchi":
                            inchi = MolToInchi(mol)
                            if inchi:
                                mol_output["inchi"] = inchi
                            else:
                                mol_output["inchi"] = ""
                                self.logger.warning(
                                    "Cannot convert to InChI: {}".format(
                                        converted))
                        elif f == "inchi_opsin" and opsin_output_format == "inchi":
                            mol_output["inchi_opsin"] = converted
                        elif f == "stdinchi_opsin" and opsin_output_format == "stdinchi":
                            mol_output["stdinchi_opsin"] = converted
                        elif f == "inchikey":
                            inchi = MolToInchi(mol)
                            if inchi:
                                mol_output["inchikey"] = InchiToInchiKey(inchi)
                            else:
                                mol_output["inchikey"] = ""
                                self.logger.warning(
                                    "Cannot create InChI-key from InChI: {}".
                                    format(converted))
                        elif f == "stdinchikey_opsin" and opsin_output_format == "stdinchikey":
                            mol_output["stdinchikey_opsin"] = converted
                        elif f == "sdf":
                            mol_output["sdf"] = MolToMolBlock(
                                mol, includeStereo=True)

                    if output_file_sdf:
                        writer.write(mol)

                    mol_output.update(
                        OrderedDict([("iupac", line), ("error", "")]))
                else:
                    mol_output.update([
                        ("iupac", line),
                        ("error",
                         "Cannot convert to RDKit mol: {}".format(converted))
                    ])
                    mol_output.update(empty_cols)
                    self.logger.warning(compounds[-1].error)
            else:
                try:
                    error = stderr[e].strip()
                except IndexError:
                    error = ""

                mol_output.update([("iupac", line), ("error", error)])
                mol_output.update(empty_cols)
                e += 1
            compounds.append(mol_output)

        to_return["content"] = compounds

        if output_file and compounds:
            dict_to_csv(to_return["content"],
                        output_file=output_file,
                        csv_delimiter=csv_delimiter,
                        write_header=write_header)
        elif output_file and not compounds:
            write_empty_file(output_file,
                             csv_delimiter=csv_delimiter,
                             header=list(mol_output_template.keys()),
                             write_header=write_header)

        return to_return