Beispiel #1
0
def test_keep_biggest():
    """Check which fragment ``Filters.keep_biggest`` retains for multi-fragment inputs."""
    # (input SMILES, SMILES expected after filtering)
    smiles_cases = (
        ('CCCC.CC', 'CCCC'),
        ('CCCCC.CC.[H].CCC', 'CCCCC'),
    )
    for source, expected in smiles_cases:
        kept = Filters.keep_biggest(MolFromSmiles(source))
        assert MolToSmiles(kept) == expected

    # (input InChI, InChI expected after filtering)
    inchi_cases = (
        (
            'InChI=1S/C5H12N2O2.C4H7NO4/c6-3-1-2-4(7)5(8)9;5-2(4(8)9)1-3(6)7/h4H,1-3,6-7H2,(H,8,9);2H,1,5H2,(H,6,7)(H,8,9)/t4-;2-/m00/s1',
            'InChI=1S/C4H7NO4/c5-2(4(8)9)1-3(6)7/h2H,1,5H2,(H,6,7)(H,8,9)/t2-/m0/s1',
        ),
        ('InChI=1S/Mo.4O/q;;;2*-1', 'InChI=1S/Mo'),
    )
    for source, expected in inchi_cases:
        kept = Filters.keep_biggest(MolFromInchi(source))
        assert MolToInchi(kept) == expected
Beispiel #2
0
    def test0InchiWritePubChem(self):
        """Compare RDKit-written InChIs with the reference InChIs of each dataset.

        For every molecule in ``self.dataset`` the InChI produced by
        ``MolToInchi`` is compared with the entry of ``self.dataset_inchi``
        keyed by the molecule's ``PUBCHEM_COMPOUND_CID`` property. Mismatches
        that fall into one of the tolerated categories below are counted as
        ``reasonable``; any other mismatch is printed and counted as ``diff``.
        The three counters are asserted against fixed totals per dataset.
        """
        for fp, f in self.dataset.items():
            inchi_db = self.dataset_inchi[fp]
            same, diff, reasonable = 0, 0, 0
            for m in f:
                if m is None:  # pragma: nocover
                    continue
                ref_inchi = inchi_db[m.GetProp('PUBCHEM_COMPOUND_CID')]
                # x is the InChI written by RDKit, y is the reference InChI.
                x, y = MolToInchi(m), ref_inchi
                if x != y:
                    # print("---------------")
                    # print(m.GetProp('PUBCHEM_COMPOUND_CID'))
                    # print(MolToSmiles(m))
                    # print(y)
                    # print(x)
                    # Tolerated: the written InChI contains a ClO4 formula unit.
                    if re.search(r'.[1-9]?ClO4', x) is not None:
                        reasonable += 1
                        continue
                    SanitizeMol(m)
                    # Intended as "tolerated if any ring has 8 or more atoms".
                    # NOTE(review): on Python 3 filter() returns an iterator
                    # object, which is always truthy, so this branch is taken
                    # for every mismatch that reaches it and everything below
                    # is unreachable. The expected counts asserted at the end
                    # may rely on that - confirm before switching to any(...).
                    if filter(lambda i: i >= 8,
                              [len(r) for r in m.GetRingInfo().AtomRings()]):
                        reasonable += 1
                        continue
                    # THERE ARE NO EXAMPLES FOR THE FOLLOWING (no coverage)
                    # if it is because RDKit does not think the bond is stereo
                    # z is the InChI after a round trip through a Mol block.
                    z = MolToInchi(MolFromMolBlock(MolToMolBlock(m)))
                    if y != z and inchiDiffPrefix(y, z) == 'b':
                        reasonable += 1
                        continue
                    # some warning
                    try:
                        MolToInchi(m, treatWarningAsError=True)
                    except InchiReadWriteError as inst:
                        # The second element of the exception args is the
                        # message text that is inspected here.
                        _, error = inst.args
                        if 'Metal' in error:
                            reasonable += 1
                            continue

                    # Not explained by any tolerated category: report it.
                    diff += 1
                    print('InChI mismatch for PubChem Compound ' +
                          m.GetProp('PUBCHEM_COMPOUND_CID'))
                    print(MolToSmiles(m, True))
                    print(inchiDiff(x, y))
                    print()

                else:
                    same += 1

            fmt = "\n{0}InChI write Summary: {1} identical, {2} suffix variance, {3} reasonable{4}"
            print(fmt.format(COLOR_GREEN, same, diff, reasonable, COLOR_RESET))
            # Fixed expectations; these are checked for every dataset entry.
            self.assertEqual(same, 1162)
            self.assertEqual(diff, 0)
            self.assertEqual(reasonable, 19)
Beispiel #3
0
 def _convert_depiction(self, idepic, itype='smiles', otype={'inchikey'}):
     # Import (if needed)
     if itype == 'smiles':
         rdmol = MolFromSmiles(idepic, sanitize=True)
     elif itype == 'inchi':
         rdmol = MolFromInchi(idepic, sanitize=True)
     else:
         raise NotImplementedError(
             '"{}" is not a valid input type'.format(itype))
     if rdmol is None:  # Check imprt
         raise self.DepictionError(
             'Import error from depiction "{}" of type "{}"'.format(
                 idepic, itype))
     # Export
     odepic = dict()
     for item in otype:
         if item == 'smiles':
             odepic[item] = MolToSmiles(
                 rdmol
             )  # MolToSmiles is tricky, one mays want to check the possible options..
         elif item == 'inchi':
             odepic[item] = MolToInchi(rdmol)
         elif item == 'inchikey':
             odepic[item] = MolToInchiKey(rdmol)
         else:
             raise NotImplementedError(
                 '"{}" is not a valid output type'.format(otype))
     return odepic
Beispiel #4
0
    def calculate(self):
        """
        Build an RDKit molecule from the stored SMILES and record the outcome.

        The SMILES text is the configured long prefix followed by the
        elements of ``self.smiles``. When parsing succeeds, hydrogens are
        added, the molecule is embedded and UFF-optimised, and its InChI is
        stored under the "InChI" property. Any exception raised along the way
        is printed and the molecule is treated as invalid.

        The good or bad counter in ``p.tree_info`` is incremented under
        ``p.lock_update_data`` and its new value is stored as the SMILES id.

        :return: RDKit Mol object, or None when the SMILES could not be processed
        """
        try:
            prefix_text = "".join(p.config['long_prefix'])
            element_text = "".join(self.smiles.element)
            m = MolFromSmiles(prefix_text + element_text)
            # Pessimistic default; flipped to True below on success.
            self.smiles.properties[p.s_valid] = False
            if m is not None:
                m = AddHs(m)
                AllChem.EmbedMolecule(m)
                AllChem.UFFOptimizeMolecule(m)
                self.smiles.properties["InChI"] = MolToInchi(m)
        except Exception as e:
            print("Error rdkit : " + repr(e))
            m = None

        if m is None:
            # Failure path: bump the "bad" counter and use it as the id.
            with p.lock_update_data:
                p.tree_info[p.info_bad] += 1
                self.smiles.properties[p.s_id] = p.tree_info[p.info_bad]
            return m

        # Success path: mark valid, bump the "good" counter and use it as the id.
        self.smiles.properties[p.s_valid] = True
        with p.lock_update_data:
            p.tree_info[p.info_good] += 1
            self.smiles.properties[p.s_id] = p.tree_info[p.info_good]
        return m
Beispiel #5
0
def convert_depiction(idepic, itype='smiles', otype={'inchikey'}):
    """Convert a chemical depiction into other types of depiction.

    :param  idepic: string depiction to be converted, str
    :param   itype: type of depiction provided as input ('smiles' or 'inchi'), str
    :param   otype: types of depiction to be generated, {"", "", ..};
                    valid entries are 'smiles', 'inchi' and 'inchikey'.
                    The default set is only iterated, never mutated.
    :return odepic: generated depictions, {"otype1": "odepic1", ..}
    :raises NotImplementedError: if ``itype`` or an entry of ``otype`` is not supported
    :raises Exception: if the input depiction cannot be parsed

    Usage example:
    - convert_depiction(idepic='CCO', otype={'inchi', 'smiles', 'inchikey'})
    - convert_depiction(idepic='InChI=1S/C2H6O/c1-2-3/h3H,2H2,1H3', itype='inchi', otype={'inchi', 'smiles', 'inchikey'})
    """
    # Import (if needed)
    if itype == 'smiles':
        rdmol = MolFromSmiles(idepic, sanitize=True)
    elif itype == 'inchi':
        rdmol = MolFromInchi(idepic, sanitize=True)
    else:
        raise NotImplementedError('"{}" is not a valid input type'.format(itype))
    if rdmol is None:  # Check import
        raise Exception('Import error from depiction "{}" of type "{}"'.format(idepic, itype))

    # Export
    odepic = {}
    for item in otype:
        if item == 'smiles':
            # MolToSmiles has many options; the defaults are used here.
            odepic[item] = MolToSmiles(rdmol)
        elif item == 'inchi':
            odepic[item] = MolToInchi(rdmol)
        elif item == 'inchikey':
            odepic[item] = MolToInchiKey(rdmol)
        else:
            # Report the offending entry, not the whole collection.
            raise NotImplementedError('"{}" is not a valid output type'.format(item))

    return odepic
Beispiel #6
0
def test_sequence_minimal():
    """Check that the default Standardizer leaves the violacein InChI unchanged."""
    # Violacein
    violacein_inchi = 'InChI=1S/C20H13N3O3/c24-10-5-6-15-12(7-10)14(9-21-15)17-8-13(19(25)23-17)18-11-3-1-2-4-16(11)22-20(18)26/h1-9,21,24H,(H,22,26)(H,23,25)/b18-13+'
    expected_smiles = 'OC1=NC(c2c[nH]c3ccc(O)cc23)=C/C1=C1\\C(O)=Nc2ccccc21'
    mol = MolFromInchi(violacein_inchi)
    ans = Standardizer().compute(mol)
    # The InChI must survive standardisation unchanged.
    assert MolToInchi(ans) == violacein_inchi
    assert MolToSmiles(ans) == expected_smiles
    # L-Lactate
    # NOTE(review): this case only parses an empty InChI and asserts nothing;
    # it looks unfinished.
    mol = MolFromInchi('')
Beispiel #7
0
 def inchikey(self, m):
     """Return the InChIKey of molecule ``m``, or None when ``self.hasInchi`` is false."""
     if not self.hasInchi:
         # plpy.notice('InChi not available')
         return None
     # Imported lazily so the module is only needed when InChI is available.
     from rdkit.Chem import MolToInchi, InchiToInchiKey
     inchi_text = MolToInchi(m)
     return InchiToInchiKey(inchi_text)
Beispiel #8
0
def test_sequence_rr_legacy():
    """Check the 'sequence_rr_legacy' Standardizer sequence on violacein."""
    # Violacein
    violacein_inchi = 'InChI=1S/C20H13N3O3/c24-10-5-6-15-12(7-10)14(9-21-15)17-8-13(19(25)23-17)18-11-3-1-2-4-16(11)22-20(18)26/h1-9,21,24H,(H,22,26)(H,23,25)/b18-13+'
    # Expected SMILES carries explicit hydrogens.
    expected_smiles = '[H]OC1=NC(C2=C([H])N([H])C3=C2C([H])=C(O[H])C([H])=C3[H])=C([H])/C1=C1\\C(O[H])=NC2=C([H])C([H])=C([H])C([H])=C21'
    mol = MolFromInchi(violacein_inchi)
    standardizer = Standardizer(sequence_fun='sequence_rr_legacy')
    ans = standardizer.compute(mol)
    # The InChI must survive standardisation unchanged.
    assert MolToInchi(ans) == violacein_inchi
    assert MolToSmiles(ans) == expected_smiles
Beispiel #9
0
 def testPrechloricAcid(self):
     """Check the InChI written for SMILES that contain a perchloric acid / perchlorate unit."""
     # Each entry pairs an input SMILES with the InChI expected from MolToInchi.
     cases = [
         ('OCl(=O)(=O)=O',
          'InChI=1S/ClHO4/c2-1(3,4)5/h(H,2,3,4,5)'),
         ('CC1=CC2=NCC(CN2C=C1)C(=O)c3ccc4cc(C)ccc4c3.OCl(=O)(=O)=O',
          'InChI=1S/C21H20N2O.ClHO4/c1-14-3-4-17-11-18(6-5-16(17)9-14)21(24)19-12-22-20-10-15(2)7-8-23(20)13-19;2-1(3,4)5/h3-11,19H,12-13H2,1-2H3;(H,2,3,4,5)'),
         ('CNc1ccc2nc3ccccc3[n+](C)c2c1.[O-]Cl(=O)(=O)=O',
          'InChI=1S/C14H13N3.ClHO4/c1-15-10-7-8-12-14(9-10)17(2)13-6-4-3-5-11(13)16-12;2-1(3,4)5/h3-9H,1-2H3;(H,2,3,4,5)'),
     ]
     for source_smiles, expected_inchi in cases:
         written_inchi = MolToInchi(MolFromSmiles(source_smiles))
         self.assertEqual(written_inchi, expected_inchi)
Beispiel #10
0
    def commute_inchi(cls, mol_in):
        """Round-trip an RDKit molecule through its InChI string.

        The molecule is written to InChI and read back without sanitisation
        and without removing hydrogens; the properties of the input molecule
        are then copied onto the result.

        :param   mol_in:  RDKit Mol
        :return  mol_out: RDKit Mol rebuilt from the InChI of ``mol_in``
        :raises ValueError: if the InChI cannot be read back into a molecule
        """
        # logLevel=None is passed because the conversion is talkative otherwise.
        inchi_text = MolToInchi(mol_in, logLevel=None)
        read_options = dict(sanitize=False,
                            removeHs=False,
                            logLevel=None,
                            treatWarningAsError=False)
        mol_out = MolFromInchi(inchi_text, **read_options)
        if mol_out:
            # Copy the properties
            cls._copy_properties(mol_in, mol_out)
            return mol_out
        raise ValueError("Failed InChi validity filter.")
Beispiel #11
0
    def _convert_depiction(self, idepic, itype='smiles', otype={'inchikey'}):
        """Convert chemical depiction to others type of depictions

        Usage example:
         - convert_depiction(idepic='CCO', otype={'inchi', 'smiles', 'inchikey'})
         - convert_depiction(idepic='InChI=1S/C2H6O/c1-2-3/h3H,2H2,1H3', itype='inchi', otype={'inchi', 'smiles', 'inchikey'})

        :param idepic: Input string
        :param itype: The type of input
        :param otype: Type of output. Valid options: inchi, smiles, inchikey

        :type idepic: str 
        :type itype: str
        :type otype: dict

        :rtype: dict
        :return: Dictionnary of results
        """
        # Import (if needed)
        if itype == 'smiles':
            rdmol = MolFromSmiles(idepic, sanitize=True)
        elif itype == 'inchi':
            rdmol = MolFromInchi(idepic, sanitize=True)
        else:
            raise NotImplementedError('"{}" is not a valid input type'.format(itype))
        if rdmol is None:  # Check imprt
            raise self.DepictionError('Import error from depiction "{}" of type "{}"'.format(idepic, itype))
        # Export
        odepic = dict()
        for item in otype:
            if item == 'smiles':
                odepic[item] = MolToSmiles(rdmol)  # MolToSmiles is tricky, one mays want to check the possible options..
            elif item == 'inchi':
                odepic[item] = MolToInchi(rdmol)
            elif item == 'inchikey':
                odepic[item] = MolToInchiKey(rdmol)
            else:
                raise NotImplementedError('"{}" is not a valid output type'.format(otype))
        return odepic
Beispiel #12
0
 def test2InchiOptions(self):
     """Check that passing the "/SUU" option appends a '/b4-3?' layer to the InChI."""
     mol = MolFromSmiles("CC=C(N)C")
     # Drop everything up to the first '/' and compare only the remainder.
     default_layers = MolToInchi(mol).split('/', 1)[1]
     suu_layers = MolToInchi(mol, "/SUU").split('/', 1)[1]
     expected_layers = default_layers + '/b4-3?'
     self.assertEqual(expected_layers, suu_layers)
Beispiel #13
0
    def test1InchiReadPubChem(self):
        """Round-trip every dataset molecule through InChI and classify the result.

        For each molecule, ``x`` is its InChI and ``y`` is the InChI obtained
        after reading ``x`` back and passing the result through isomeric
        SMILES. Molecules with ``x == y`` count as ``same``. Failures to read
        the InChI back and mismatches are counted as ``reasonable`` when one
        of the tolerated explanations below applies; otherwise they are
        printed and counted as ``diff``. The three counters are asserted
        against fixed totals per dataset.
        """
        for f in self.dataset.values():
            same, diff, reasonable = 0, 0, 0
            for m in f:
                if m is None:  # pragma: nocover
                    continue
                x = MolToInchi(m)
                y = None
                # Silence RDKit error logging while reading the InChI back;
                # the finally clause re-enables it even if the read raises.
                RDLogger.DisableLog('rdApp.error')
                try:
                    mol = MolFromInchi(x)
                finally:
                    RDLogger.EnableLog('rdApp.error')
                if mol is not None:
                    y = MolToInchi(
                        MolFromSmiles(MolToSmiles(mol, isomericSmiles=True)))
                if y is None:
                    # metal involved?
                    try:
                        MolToInchi(m, treatWarningAsError=True)
                    except InchiReadWriteError as inst:
                        # The second element of the exception args is the
                        # message text that is inspected here.
                        _, error = inst.args
                        if 'Metal' in error or \
                                'Charges were rearranged' in error:
                            reasonable += 1
                            continue
                    # THERE ARE NO EXAMPLES FOR THE FOLLOWING (no coverage)
                    # RDKit does not like the SMILES? use MolBlock instead
                    inchiMol = MolFromInchi(x)
                    if inchiMol:
                        rdDepictor.Compute2DCoords(inchiMol)
                        z = MolToInchi(MolFromMolBlock(
                            MolToMolBlock(inchiMol)))
                        if x == z:
                            reasonable += 1
                            continue
                    # InChI messed up the radical?
                    unsanitizedInchiMol = MolFromInchi(x, sanitize=False)
                    if sum([
                            a.GetNumRadicalElectrons() * a.GetAtomicNum()
                            for a in m.GetAtoms()
                            if a.GetNumRadicalElectrons() != 0
                    ]) != sum([
                            a.GetNumRadicalElectrons() * a.GetAtomicNum()
                            for a in unsanitizedInchiMol.GetAtoms()
                            if a.GetNumRadicalElectrons() != 0
                    ]):
                        reasonable += 1
                        continue

                    diff += 1
                    cid = m.GetProp('PUBCHEM_COMPOUND_CID')
                    print(COLOR_GREEN + 'Empty mol for PubChem Compound ' +
                          cid + '\n' + COLOR_RESET)
                    continue
                if x != y:
                    # if there was warning in the first place, then this is
                    # tolerable
                    try:
                        MolToInchi(m, treatWarningAsError=True)
                        MolFromInchi(x, treatWarningAsError=True)
                    except InchiReadWriteError:
                        reasonable += 1
                        continue
                    # or if there are big rings
                    SanitizeMol(m)
                    # NOTE(review): on Python 3 filter() returns an iterator
                    # object, which is always truthy, so this branch is taken
                    # for every mismatch that reaches it and everything below
                    # is unreachable. Kept as-is because the expected counts
                    # asserted at the end may rely on it - confirm before
                    # switching to any(...).
                    if filter(lambda i: i >= 8,
                              [len(r) for r in m.GetRingInfo().AtomRings()]):
                        reasonable += 1
                        continue
                    # THERE ARE NO EXAMPLES FOR THE FOLLOWING (no coverage)
                    # or if RDKit loses bond stereo
                    s = MolToSmiles(m, True)
                    if MolToSmiles(MolFromSmiles(s), True) != s:
                        reasonable += 1
                        continue
                    # or if it is RDKit SMILES writer unhappy about the mol
                    inchiMol = MolFromInchi(x)
                    rdDepictor.Compute2DCoords(inchiMol)
                    z = MolToInchi(MolFromMolBlock(MolToMolBlock(inchiMol)))
                    if x == z:
                        reasonable += 1
                        continue

                    diff += 1
                    # Look the CID up here: it was previously only assigned in
                    # the "empty mol" branch, so this print raised NameError
                    # or reported a stale CID from an earlier molecule.
                    cid = m.GetProp('PUBCHEM_COMPOUND_CID')
                    print(COLOR_GREEN +
                          'Molecule mismatch for PubChem Compound ' + cid +
                          COLOR_RESET)
                    print(inchiDiff(x, y))
                    print()
                else:
                    same += 1
            fmt = "\n{0}InChI read Summary: {1} identical, {2} variance, {3} reasonable variance{4}"
            print(fmt.format(COLOR_GREEN, same, diff, reasonable, COLOR_RESET))
            # Fixed expectations; these are checked for every dataset entry.
            self.assertEqual(same, 621)
            self.assertEqual(diff, 0)
            self.assertEqual(reasonable, 560)
Beispiel #14
0
    def process(self,
                input_text: str = "",
                input_file: str = "",
                output_file: str = "",
                output_file_sdf: str = "",
                sdf_append: bool = False,
                input_type: str = "",
                lang: str = "eng",
                paged_text: bool = False,
                format_output: bool = True,
                opsin_types: list = None,
                standardize_mols: bool = True,
                convert_ions: bool = True,
                write_header: bool = True,
                iob_format: bool = False,
                dry_run: bool = False,
                csv_delimiter: str = ";",
                normalize_text: bool = True,
                remove_duplicates: bool = False,
                annotate: bool = True,
                annotation_sleep: int = 2,
                chemspider_token: str = "",
                continue_on_failure: bool = False) -> OrderedDict:
        r"""
        Process the input file with ChemSpot.

        Parameters
        ----------
        input_text : str
            String to be processed by ChemSpot.
        input_file : str
            Path to file to be processed by ChemSpot.
        output_file : str
            File to write output in.
        output_file_sdf : str
            File to write SDF output in. SDF is from OPSIN converted entities.
        sdf_append : bool
            If True, append new molecules to existing SDF file or create new one if doesn't exist. SDF is from OPSIN converted entities.
        input_type : str
            | When empty, input (MIME) type will be determined from magic bytes.
            | Or you can specify "pdf", "pdf_scan", "image" or "text" and magic bytes check will be skipped.
        lang : str
            | Language which will Tesseract use for OCR. Available languages: https://github.com/tesseract-ocr/tessdata
            | Multiple languages can be specified with "+" character, i.e. "eng+bul+fra".
        paged_text : bool
            If True and `input_type` is "text" or `input_text` is provided, try to assign pages to chemical entities.
            ASCII control character 12 (Form Feed, '\f') is expected between pages.
        format_output : bool
            | If True, the value of "content" key of returned dict will be list of OrderedDicts.
            | If True and `output_file` is set, the CSV file will be written.
            | If False, the value of "content" key of returned dict will be None.
        opsin_types : list
            | List of ChemSpot entity types. Entities of types in this list will be converted with OPSIN. If you don't want
              to convert entities, pass empty list.
            | OPSIN is designed to convert IUPAC names to linear notation (SMILES etc.) so default value of `opsin_types`
              is ["SYSTEMATIC"] (these should be only IUPAC names).
            | ChemSpot entity types: "SYSTEMATIC", "IDENTIFIER", "FORMULA", "TRIVIAL", "ABBREVIATION", "FAMILY", "MULTIPLE"
        standardize_mols : bool
            If True, use molvs (https://github.com/mcs07/MolVS) to standardize molecules converted by OPSIN.
        convert_ions : bool
            If True, try to convert ion entities (e.g. "Ni(II)") to SMILES. Entities matching ion regex won't be converted
            with OPSIN.
        write_header : bool
            If True and if `output_file` is set and `output_format` is True, write a CSV write_header:
            "smiles", "bond_length", "resolution", "confidence", "learn", "page", "coordinates"
        iob_format : bool
            If True, output will be in IOB format.
        dry_run : bool
            If True, only return list of commands to be called by subprocess.
        csv_delimiter : str
            Delimiter for output CSV file.
        normalize_text : bool
            If True, normalize text before performing NER. It is strongly recommended to do so, because without normalization
            can ChemSpot produce unpredictable results which cannot be parsed.
        remove_duplicates : bool
            If True, remove duplicated chemical entities. Note that some entities-compounds can have different names, but
            same notation (SMILES, InChI etc.). This will only remove entities with same names. Not applicable for IOB format.
        annotate : bool
            | If True, try to annotate entities in PubChem and ChemSpider. Compound IDs will be assigned by searching with
              each identifier, separately for entity name, SMILES etc.
            | If entity has InChI key yet, prefer it in searching.
            | If "*" is present in SMILES, skip annotation.
            | If textual entity has single result in DB when searched by name, fill in missing identifiers (SMILES etc.).
        annotation_sleep: int
            How many seconds to sleep between annotation of each entity. It's for preventing overloading of databases.
        chemspider_token : str
            Your personal token for accessing the ChemSpider API (needed for annotation). Make account there to obtain it.
        continue_on_failure : bool
            | If True, continue running even if ChemSpot returns non-zero exit code.
            | If False and error occurs, print it and return.

        Returns
        -------
        dict
            Keys:

            - stdout: str ... standard output from ChemSpot
            - stderr: str ... standard error output from ChemSpot
            - exit_code: int ... exit code from ChemSpot
            - content

              - list of OrderedDicts ... when `format_output` is True
              - None ... when `format_output` is False

            - normalized_text : str
        """

        if opsin_types is None:
            opsin_types = ["SYSTEMATIC"]

        if input_text and input_file:
            input_file = ""
            self.logger.warning("Both 'input_text' and 'input_file' are set, but 'input_text' will be prefered.")
        elif not input_text and not input_file:
            raise ValueError("One of 'input_text' or 'input_file' must be set.")

        if not input_type and not input_text:
            possible_input_types = ["pdf", "image", "text"]
            input_type = get_input_file_type(input_file)
            if input_type not in possible_input_types:
                raise ValueError("Input file type ({}) is not one of {}".format(input_type, possible_input_types))
        elif input_type and not input_text:
            possible_input_types = ["pdf", "pdf_scan", "image", "text"]
            if input_type not in possible_input_types:
                raise ValueError("Unknown 'input_type'. Possible 'input_type' values are {}".format(possible_input_types))

        if input_type in ["pdf", "pdf_scan", "image"]:
            input_text, _ = get_text(input_file, input_type, lang=lang, tessdata_prefix=os.environ["TESSDATA_PREFIX"])
            input_file = ""

        if annotate and not chemspider_token:
            self.logger.warning("Cannot perform annotation in ChemSpider: 'chemspider_token' is empty.")

        options = ChainMap({k: v for k, v in {"iob_format": iob_format}.items() if v},
                           self.options_internal)
        output_file_temp = None

        commands, _, _ = self.build_commands(options, self._OPTIONS_REAL, self.path_to_binary)
        commands.insert(1, str(self.options_internal["max_memory"]))
        commands.append("-t")

        if normalize_text:
            normalizer = Normalizer(strip=True, collapse=True, hyphens=True, quotes=True, slashes=True, tildes=True, ellipsis=True)

            if input_file:
                with open(input_file, mode="r") as f:
                    input_text = f.read()

            input_text = normalizer(input_text)

            if not input_text:
                raise UserWarning("'input_text' is empty after normalization.")

            input_text = self.normalize_text(text=input_text)
            input_file_normalized = NamedTemporaryFile(mode="w", encoding="utf-8")
            input_file_normalized.write(input_text)
            input_file_normalized.flush()
            input_file = input_file_normalized.name
        else:
            if input_text:
                input_file_temp = NamedTemporaryFile(mode="w", encoding="utf-8")
                input_file_temp.write(input_text)
                input_file_temp.flush()
                input_file = input_file_temp.name

        commands.append(os.path.abspath(input_file))
        commands.append("-o")
        if format_output:
            output_file_temp = NamedTemporaryFile(mode="w", encoding="utf-8")
            commands.append(os.path.abspath(output_file_temp.name))
        else:
            commands.append(os.path.abspath(output_file))

        if dry_run:
            return " ".join(commands)

        stdout, stderr, exit_code = common_subprocess(commands)

        if "OutOfMemoryError" in stderr:
            raise RuntimeError("ChemSpot memory error: {}".format(stderr))

        to_return = {"stdout": stdout, "stderr": stderr, "exit_code": exit_code, "content": None,
                     "normalized_text": input_text if normalize_text else None}

        if not continue_on_failure and exit_code > 0:
            self.logger.warning("ChemSpot error:")
            eprint("\n\t".join("\n{}".format(stderr).splitlines()))
            return to_return

        if normalize_text:
            to_return["normalized_text"] = input_text

        if not format_output:
            return to_return
        elif format_output:
            with open(output_file_temp.name, mode="r", encoding="utf-8") as f:
                output_chs = f.read()

            entities = self.parse_chemspot_iob(text=output_chs) if iob_format else self.parse_chemspot(text=output_chs)
            to_return["content"] = entities

            if remove_duplicates and not iob_format:
                seen = set()
                seen_add = seen.add
                to_return["content"] = [x for x in to_return["content"] if not (x["entity"] in seen or seen_add(x["entity"]))]

            if input_type in ["pdf", "pdf_scan"] or paged_text:
                page_ends = []
                for i, page in enumerate(input_text.split("\f")):
                    if page.strip():
                        try:
                            page_ends.append(page_ends[-1] + len(page) - 1)
                        except IndexError:
                            page_ends.append(len(page) - 1)

            if opsin_types:
                if convert_ions:
                    to_convert = [x["entity"] for x in to_return["content"] if x["type"] in opsin_types and not self.re_ion.match(x["entity"])]
                else:
                    to_convert = [x["entity"] for x in to_return["content"] if x["type"] in opsin_types]

                if to_convert:
                    opsin = OPSIN(verbosity=self.verbosity)
                    opsin_converted = opsin.process(input=to_convert, output_formats=["smiles", "inchi", "inchikey"],
                                                    standardize_mols=standardize_mols, output_file_sdf=output_file_sdf,
                                                    sdf_append=sdf_append)
                    opsin_converted = iter(opsin_converted["content"])
                else:
                    self.logger.info("Nothing to convert with OPSIN.")

            if annotate:
                chemspider = ChemSpider(chemspider_token) if chemspider_token else None

            for i, ent in enumerate(to_return["content"]):
                if input_type in ["pdf", "pdf_scan"] or paged_text:
                    ent["page"] = str(bisect.bisect_left(page_ends, int(ent["start"])) + 1)

                if convert_ions:
                    match_ion = self.re_ion.match(ent["entity"])
                    if match_ion:
                        match_ion = match_ion.groupdict()
                        match_charge = self.re_charge.search(match_ion["charge"])
                        if match_charge:
                            match_charge = match_charge.groupdict()
                            if match_charge["roman"]:
                                smiles = "[{}+{}]".format(match_ion["ion"], len(match_charge["roman"]))
                            elif match_charge["digit"]:
                                if "+" in match_ion["charge"]:
                                    smiles = "[{}+{}]".format(match_ion["ion"], match_charge["digit"])
                                elif "-" in match_ion["charge"]:
                                    smiles = "[{}-{}]".format(match_ion["ion"], match_charge["digit"])
                            elif match_charge["signs"]:
                                smiles = "[{}{}{}]".format(match_ion["ion"], match_charge["signs"][0],
                                                           len(match_charge["signs"]))

                            mol = MolFromSmiles(smiles)
                            if mol:
                                inchi = MolToInchi(mol)
                                if inchi:
                                    ent.update(OrderedDict(
                                        [("smiles", smiles), ("inchi", inchi), ("inchikey", InchiToInchiKey(inchi))]))
                                else:
                                    ent.update(OrderedDict([("smiles", smiles), ("inchi", ""), ("inchikey", "")]))
                            else:
                                ent.update(OrderedDict([("smiles", ""), ("inchi", ""), ("inchikey", "")]))
                    else:
                        ent.update(OrderedDict([("smiles", ""), ("inchi", ""), ("inchikey", "")]))

                if opsin_types and to_convert:
                    if ent["entity"] in to_convert:
                        ent_opsin = next(opsin_converted)
                        ent.update(OrderedDict([("smiles", ent_opsin["smiles"]), ("inchi", ent_opsin["inchi"]),
                                                ("inchikey", ent_opsin["inchikey"]), ("opsin_error", ent_opsin["error"])]))
                    elif convert_ions and self.re_ion.match(ent["entity"]):
                        ent.update(OrderedDict([("opsin_error", "")]))
                    elif (convert_ions and not self.re_ion.match(ent["entity"])) or (not convert_ions and ent["entity"] not in to_convert):
                        ent.update(OrderedDict([("smiles", ""), ("inchi", ""), ("inchikey", ""), ("opsin_error", "")]))

                # TODO: this should be simplified...looks like garbage code
                if annotate:
                    self.logger.info("Annotating entity {}/{}...".format(i + 1, len(to_return["content"])))
                    ent.update(OrderedDict([("pch_cids_by_inchikey", ""), ("chs_cids_by_inchikey", ""),
                                            ("pch_cids_by_name", ""), ("chs_cids_by_name", ""),
                                            ("pch_cids_by_smiles", ""), ("chs_cids_by_smiles", ""),
                                            ("pch_cids_by_inchi", ""), ("chs_cids_by_inchi", ""),
                                            ("pch_cids_by_formula", ""),
                                            ("pch_iupac_name", ""), ("chs_common_name", ""),
                                            ("pch_synonyms", "")]))

                    # do "double-annotation": some entities can be found in only one DB, updated and then searched in second DB
                    found_in_pch = False
                    found_in_chs = False
                    for _ in range(2):
                        results = []

                        # prefer InChI key
                        if "inchikey" in ent and ent["inchikey"]:
                            try:
                                results = get_compounds(ent["inchikey"], "inchikey")
                                if results:
                                    if len(results) == 1:
                                        result = results[0]
                                        synonyms = result.synonyms
                                        if synonyms:
                                            ent["pch_synonyms"] = "\"{}\"".format("\",\"".join(synonyms))
                                        ent["pch_iupac_name"] = result.iupac_name
                                        if not found_in_chs:
                                            ent["smiles"] = result.canonical_smiles or ent["smiles"]
                                            ent["inchi"] = result.inchi or ent["inchi"]
                                            ent["inchikey"] = result.inchikey or ent["inchikey"]
                                    ent["pch_cids_by_inchikey"] = "\"{}\"".format(",".join([str(c.cid) for c in results]))
                            except (BadRequestError, NotFoundError, PubChemHTTPError, ResponseParseError, ServerError, TimeoutError, PubChemPyError):
                                pass

                            results = chemspider.search(ent["inchikey"]) if chemspider_token else []
                            if results:
                                if len(results) == 1:
                                    result = results[0]
                                    ent["chs_common_name"] = result.common_name
                                    if not found_in_pch:
                                        ent["smiles"] = result.smiles or ent["smiles"]
                                        ent["inchi"] = result.stdinchi or ent["inchi"]
                                        ent["inchikey"] = result.stdinchikey or ent["inchikey"]
                                ent["chs_cids_by_inchikey"] = "\"{}\"".format(",".join([str(c.csid) for c in results]))
                        else:
                            if (not found_in_pch and not found_in_chs) or (not found_in_pch and found_in_chs):
                                try:
                                    results = get_compounds(ent["entity"] or ent["abbreviation"], "name")
                                    if results:
                                        if len(results) == 1:
                                            found_in_pch = True
                                            result = results[0]
                                            synonyms = result.synonyms
                                            if synonyms:
                                                ent["pch_synonyms"] = "\"{}\"".format("\",\"".join(synonyms))
                                            # only update identifiers if they weren't found in second DB
                                            if not found_in_chs:
                                                ent["smiles"] = result.canonical_smiles or ent["smiles"]
                                                ent["inchi"] = result.inchi or ent["inchi"]
                                                ent["inchikey"] = result.inchikey or ent["inchikey"]
                                            ent["pch_iupac_name"] = result.iupac_name
                                        ent["pch_cids_by_name"] = "\"{}\"".format(",".join([str(c.cid) for c in results]))
                                except (BadRequestError, NotFoundError, PubChemHTTPError, ResponseParseError, ServerError, TimeoutError, PubChemPyError):
                                    pass

                            if (not found_in_pch and not found_in_chs) or (found_in_pch and not found_in_chs):
                                results = chemspider.search(ent["entity"] or ent["abbreviation"]) if chemspider_token else []
                                if results:
                                    if len(results) == 1:
                                        found_in_chs = True
                                        result = results[0]
                                        if not found_in_pch:
                                            ent["smiles"] = result.smiles or ent["smiles"]
                                            ent["inchi"] = result.stdinchi or ent["inchi"]
                                            ent["inchikey"] = result.stdinchikey or ent["inchikey"]
                                        ent["chs_common_name"] = result.common_name
                                    ent["chs_cids_by_name"] = "\"{}\"".format(",".join([str(c.csid) for c in results]))

                            for search_field, col_pch, col_chs in [("smiles", "pch_cids_by_smiles", "chs_cids_by_smiles"),
                                                                   ("inchi", "pch_cids_by_inchi", "chs_cids_by_inchi"),
                                                                   ("formula", "pch_cids_by_formula", "")]:
                                results_pch = []
                                results_chs = []

                                if search_field == "smiles" and "smiles" in ent and ent["smiles"] and "*" not in ent["smiles"]:
                                    if (not found_in_pch and not found_in_chs) or (not found_in_pch and found_in_chs):
                                        try:
                                            results_pch = get_compounds(ent["smiles"], "smiles")
                                        except (BadRequestError, NotFoundError, PubChemHTTPError, ResponseParseError, ServerError, TimeoutError, PubChemPyError):
                                            pass
                                    if (not found_in_pch and not found_in_chs) or (found_in_pch and not found_in_chs):
                                        results_chs = chemspider.search(ent["smiles"]) if chemspider_token else []
                                elif search_field == "inchi" and "inchi" in ent and ent["inchi"]:
                                    if (not found_in_pch and not found_in_chs) or (not found_in_pch and found_in_chs):
                                        try:
                                            results_pch = get_compounds(ent["inchi"], "inchi")
                                        except (BadRequestError, NotFoundError, PubChemHTTPError, ResponseParseError, ServerError, TimeoutError, PubChemPyError):
                                            pass
                                    if (not found_in_pch and not found_in_chs) or (found_in_pch and not found_in_chs):
                                        results_chs = chemspider.search(ent["inchi"]) if chemspider_token else []
                                elif search_field == "formula":
                                    if (not found_in_pch and not found_in_chs) or (not found_in_pch and found_in_chs):
                                        try:
                                            results_pch = get_compounds(ent["entity"], "formula")
                                        except (BadRequestError, NotFoundError, PubChemHTTPError, ResponseParseError, ServerError, TimeoutError, PubChemPyError):
                                            pass
                                    # ChemSpider doesn't have search field for 'formula'

                                if results_pch:
                                    ent[col_pch] = "\"{}\"".format(",".join([str(c.cid) for c in results_pch]))
                                if results_chs:
                                    ent[col_chs] = "\"{}\"".format(",".join([str(c.csid) for c in results_chs]))

                                sleep(0.5)

                        sleep(annotation_sleep)

                        if not found_in_pch and not found_in_chs:
                            break

            if output_file:
                dict_to_csv(to_return["content"], output_file=output_file, csv_delimiter=csv_delimiter, write_header=write_header)

        return to_return
Beispiel #15
0
    def process(self,
                input: Union[str, list] = "",
                input_file: str = "",
                output_file: str = "",
                output_file_sdf: str = "",
                output_file_cml: str = "",
                sdf_append: bool = False,
                format_output: bool = True,
                opsin_output_format: str = "",
                output_formats: list = None,
                write_header: bool = True,
                dry_run: bool = False,
                csv_delimiter: str = ";",
                standardize_mols: bool = True,
                normalize_plurals: bool = True,
                continue_on_failure: bool = False) -> OrderedDict:
        r"""
        Process the input file with OPSIN.

        Parameters
        ----------
        input : str or list
            | str: String with IUPAC names, one per line.
            | list: List of IUPAC names.
        input_file : str
            Path to file to be processed by OPSIN. One IUPAC name per line.
        output_file : str
            File to write output in.
        output_file_sdf : str
            File to write SDF output in.
        output_file_cml : str
            | File to write CML (Chemical Markup Language) output in. `opsin_output_format` must be "cml".
            | Not supported by RDKit so standardization and conversion to other formats cannot be done.
        sdf_append : bool
            If True, append new molecules to existing SDF file or create new one if doesn't exist.
        format_output : bool
            | If True, the value of "content" key of returned dict will be list of OrderedDicts with keys:
            | "iupac", <output formats>, ..., "error"
            | If True and `output_file` is set it will be created as CSV file with columns: "iupac", <output formats>, ..., "error"
            | If False, the value of "content" key of returned dict will be None.
        opsin_output_format : str
            | Output format from OPSIN. Temporarily overrides the option `output_format` set during instantiation (in __init__).
            | Choices: "cml", "smi", "extendedsmi", "inchi", "stdinchi", "stdinchikey"
        output_formats : list
            | If set and `format_output` is True, this specifies which molecule formats will be output.
            | You can specify more than one format, but only one format from OPSIN. This format must be also set with `output_format` in __init__
              or with `opsin_output_format` here.
            | Default value: ["smiles"]

            +-----------------------+-----------------------+--------------------------------------------------------------------------------------------+
            |         Value         |         Source        |                                            Note                                            |
            +=======================+=======================+============================================================================================+
            |         smiles        |         RDKit         |                                          canonical                                         |
            +-----------------------+-----------------------+--------------------------------------------------------------------------------------------+
            |      smiles_opsin     |     OPSIN ("smi")     |                                           SMILES                                           |
            +-----------------------+-----------------------+--------------------------------------------------------------------------------------------+
            | smiles_extended_opsin | OPSIN ("extendedsmi") |                          Extended SMILES. Not supported by RDKit.                          |
            +-----------------------+-----------------------+--------------------------------------------------------------------------------------------+
            |         inchi         |         RDKit         | Not every molecule can be converted to InChI (it doesn`t support wildcard characters etc.) |
            +-----------------------+-----------------------+--------------------------------------------------------------------------------------------+
            |      inchi_opsin      |    OPSIN ("inchi")    |                                            InChI                                           |
            +-----------------------+-----------------------+--------------------------------------------------------------------------------------------+
            |     stdinchi_opsin    |   OPSIN ("stdinchi")  |                                       standard InChI                                       |
            +-----------------------+-----------------------+--------------------------------------------------------------------------------------------+
            |        inchikey       |         RDKit         |      The same applies as for "inchi". Also molecule cannot be created from InChI-key.      |
            +-----------------------+-----------------------+--------------------------------------------------------------------------------------------+
            |   stdinchikey_opsin   | OPSIN ("stdinchikey") |               Standard InChI-key. Cannot be used by RDKit to create molecule.              |
            +-----------------------+-----------------------+--------------------------------------------------------------------------------------------+
            |          sdf          |         RDKit         |                     If present, an additional SDF file will be created.                    |
            +-----------------------+-----------------------+--------------------------------------------------------------------------------------------+

        write_header : bool
            If True and if `output_file` is set and `format_output` is True, write a CSV header.
        dry_run : bool
            If True, OPSIN is not executed; only the command line that would be run is returned
            (as one space-joined string).
        csv_delimiter : str
            Delimiter for output CSV file.
        standardize_mols : bool
            If True and `format_output` is also True, use molvs (https://github.com/mcs07/MolVS) to standardize molecules.
        normalize_plurals : bool
            | If True, normalize plurals ("nitrates" -> "nitrate"). See OPSIN.PLURAL_PATTERNS for relating plurals. You can
              set your own regex pattern with `plural_patterns` in __init__.
        continue_on_failure : bool
            | If True, continue running even if OPSIN returns non-zero exit code.
            | If False and error occurs, print it and return.

        Returns
        -------
        dict
            Keys:

            - stdout: str ... standard output from OPSIN
            - stderr: str ... standard error output from OPSIN
            - exit_code: int ... exit code from OPSIN
            - content:

              - list of OrderedDicts ... when format_output is True. Fields: "iupac", <output formats>, ..., "error"
              - None ... when format_output is False
        str
            Space-joined command line, only when `dry_run` is True.

        Raises
        ------
        ValueError
            If neither `input` nor `input_file` is set, or the OPSIN output format is unknown.
        UserWarning
            If the input is empty after plural normalization.
        """

        options_internal = self.options_internal.copy()
        opsin_nonreadable_formats = ["cml", "stdinchikey"]

        if input and input_file:
            input_file = ""
            self.logger.warning(
                "Both 'input' and 'input_file' are set, but 'input' will be prefered."
            )
        elif not input and not input_file:
            raise ValueError("One of 'input' or 'input_file' must be set.")

        # OPSIN output format check
        if opsin_output_format:
            options_internal["output_format"] = opsin_output_format
        else:
            opsin_output_format = options_internal["output_format"]

        opsin_valid_output_formats = {
            "cml": "cml_opsin",
            "smi": "smiles_opsin",
            "extendedsmi": "smiles_extended_opsin",
            "inchi": "inchi_opsin",
            "stdinchi": "stdinchi_opsin",
            "stdinchikey": "stdinchikey_opsin"
        }

        if opsin_output_format not in opsin_valid_output_formats:
            raise ValueError(
                "Unknown OPSIN output format. Possible values: {}".format(
                    list(opsin_valid_output_formats.keys())))

        if standardize_mols and opsin_output_format in opsin_nonreadable_formats:
            self.logger.warning(
                "OPSIN output format is \"{}\", which cannot be used by RDKit."
                .format(opsin_output_format))

        # output formats check
        if not output_formats:
            output_formats = ["smiles"]
        elif opsin_output_format == "stdinchikey":
            output_formats = ["stdinchikey_opsin"]
        elif opsin_output_format == "extendedsmi":
            output_formats = ["smiles_extended_opsin"]
        else:
            possible_output_formats = ["smiles", "inchi", "inchikey", "sdf"]
            opsin_column = opsin_valid_output_formats[opsin_output_format]
            output_formats = [
                x for x in sorted(set(output_formats))
                if x in possible_output_formats or x == opsin_column
            ]

        if normalize_plurals:
            if input_file:
                with open(input_file, mode="r", encoding="utf-8") as f:
                    input = "\n".join([x.strip() for x in f.readlines()])
                # the normalized text is fed through stdin instead of the file
                input_file = ""
            input = self.normalize_iupac(input)

        commands, _, _ = self.build_commands(options_internal,
                                             self._OPTIONS_REAL,
                                             self.path_to_binary)

        if input_file:
            # OPSIN reads the names from the file given on the command line
            commands.append(input_file)
        elif input:
            if isinstance(input, list):
                input = "\n".join([x.strip() for x in input])
        else:
            raise UserWarning("Input is empty.")

        # A dry run must return before anything is executed.
        if dry_run:
            return " ".join(commands)

        if input_file:
            stdout, stderr, exit_code = common_subprocess(commands)
        else:
            stdout, stderr, exit_code = common_subprocess(commands,
                                                          stdin=input)

        to_return = {
            "stdout": stdout,
            "stderr": stderr,
            "exit_code": exit_code,
            "content": None
        }

        if not continue_on_failure and exit_code > 0:
            self.logger.warning("OPSIN error:")
            eprint("\n\t".join("\n{}".format(stderr).splitlines()))
            return to_return

        if output_file_cml and opsin_output_format == "cml":
            with open(output_file_cml, mode="w", encoding="utf-8") as f:
                f.write(stdout)
            return to_return
        elif output_file_cml and opsin_output_format != "cml":
            self.logger.warning(
                "Output file for CML is requested, but OPSIN output format is '{}'"
                .format(opsin_output_format))

        if not format_output:
            if output_file:
                with open(output_file, mode="w", encoding="utf-8") as f:
                    f.write(stdout)
            return to_return

        compounds = []
        standardizer = Standardizer()
        empty_cols = OrderedDict([(x, "") for x in output_formats])

        stdout = stdout.split("\n")
        del stdout[-1]
        stderr = [
            x.strip() for x in stderr.split("\n")[1:] if x
        ]  # remove first line of stderr because there is OPSIN message (y u du dis...)

        if input_file:
            with open(input_file, mode="r", encoding="utf-8") as f:
                lines = iter(f.readlines())
        else:
            lines = iter(input.split("\n"))

        mol_output_template = OrderedDict.fromkeys(["iupac"] + output_formats +
                                                   ["error"])

        sdf_handle = None
        writer = None
        if output_file_sdf:
            if sdf_append:
                # mode "a" creates the file when it does not exist yet
                sdf_handle = open(output_file_sdf, mode="a", encoding="utf-8")
                writer = SDWriter(sdf_handle)
            else:
                writer = SDWriter(output_file_sdf)

        # Index of the next unconsumed OPSIN error message. Kept under a name
        # that no `except ... as` clause reuses, because Python unbinds the
        # exception variable when the handler exits.
        error_index = 0
        try:
            for i, line in enumerate(lines):
                line = line.strip()
                # OPSIN may emit fewer lines than it was given (e.g. after a crash)
                converted = stdout[i].strip() if i < len(stdout) else ""
                mol_output = mol_output_template.copy()

                if converted:
                    if opsin_output_format == "stdinchikey":
                        compounds.append(
                            OrderedDict([("iupac", line),
                                         ("stdinchikey_opsin", converted),
                                         ("error", "")]))
                        continue
                    elif opsin_output_format == "extendedsmi":
                        compounds.append(
                            OrderedDict([("iupac", line),
                                         ("smiles_extended_opsin", converted),
                                         ("error", "")]))
                        continue

                    # Formats RDKit cannot read (e.g. "cml") leave mol as None.
                    mol = None
                    if opsin_output_format == "smi":
                        mol = MolFromSmiles(converted,
                                            sanitize=not standardize_mols)
                    elif opsin_output_format in ["inchi", "stdinchi"]:
                        mol = MolFromInchi(converted,
                                           sanitize=not standardize_mols,
                                           removeHs=not standardize_mols)

                    if mol:
                        if standardize_mols:
                            try:
                                mol = standardizer.standardize(mol)
                            except ValueError as exc:
                                self.logger.warning(
                                    "Cannot standardize '{}': {}".format(
                                        MolToSmiles(mol), str(exc)))

                        for f in output_formats:
                            if f == "smiles":
                                mol_output["smiles"] = MolToSmiles(
                                    mol, isomericSmiles=True)
                            elif f == "smiles_opsin" and opsin_output_format == "smi":
                                mol_output["smiles_opsin"] = converted
                            elif f == "inchi":
                                inchi = MolToInchi(mol)
                                if inchi:
                                    mol_output["inchi"] = inchi
                                else:
                                    mol_output["inchi"] = ""
                                    self.logger.warning(
                                        "Cannot convert to InChI: {}".format(
                                            converted))
                            elif f == "inchi_opsin" and opsin_output_format == "inchi":
                                mol_output["inchi_opsin"] = converted
                            elif f == "stdinchi_opsin" and opsin_output_format == "stdinchi":
                                mol_output["stdinchi_opsin"] = converted
                            elif f == "inchikey":
                                inchi = MolToInchi(mol)
                                if inchi:
                                    mol_output["inchikey"] = InchiToInchiKey(inchi)
                                else:
                                    mol_output["inchikey"] = ""
                                    self.logger.warning(
                                        "Cannot create InChI-key from InChI: {}".
                                        format(converted))
                            elif f == "stdinchikey_opsin" and opsin_output_format == "stdinchikey":
                                mol_output["stdinchikey_opsin"] = converted
                            elif f == "sdf":
                                mol_output["sdf"] = MolToMolBlock(
                                    mol, includeStereo=True)

                        if writer is not None:
                            writer.write(mol)

                        mol_output.update(
                            OrderedDict([("iupac", line), ("error", "")]))
                    else:
                        mol_output.update([
                            ("iupac", line),
                            ("error",
                             "Cannot convert to RDKit mol: {}".format(converted))
                        ])
                        mol_output.update(empty_cols)
                        # log the error of the row being built, not a previous one
                        self.logger.warning(mol_output["error"])
                else:
                    # Empty output line: pair it with the next stderr message.
                    try:
                        error = stderr[error_index].strip()
                    except IndexError:
                        error = ""

                    mol_output.update([("iupac", line), ("error", error)])
                    mol_output.update(empty_cols)
                    error_index += 1
                compounds.append(mol_output)
        finally:
            # flush and release the SDF output even if a conversion raised
            if writer is not None:
                writer.close()
            if sdf_handle is not None:
                sdf_handle.close()

        to_return["content"] = compounds

        if output_file and compounds:
            dict_to_csv(to_return["content"],
                        output_file=output_file,
                        csv_delimiter=csv_delimiter,
                        write_header=write_header)
        elif output_file and not compounds:
            write_empty_file(output_file,
                             csv_delimiter=csv_delimiter,
                             header=list(mol_output_template.keys()),
                             write_header=write_header)

        return to_return
Beispiel #16
0
def test_sequence_tunable():
    """Check the default flags of ``sequence_tunable`` and its output on two molecules.

    Every case builds a molecule, runs it through a ``Standardizer`` configured
    with ``sequence_fun='sequence_tunable'`` (optionally overriding some flags
    through ``params``) and compares the resulting InChI and SMILES strings.
    """
    # Default values of the keyword arguments declared by sequence_tunable
    spec = inspect.getfullargspec(sequence_tunable)
    names_with_defaults = spec.args[-len(spec.defaults):]
    default_params = dict(zip(names_with_defaults, spec.defaults))
    assert default_params == {
        'OP_REMOVE_ISOTOPE': True,
        'OP_NEUTRALISE_CHARGE': True,
        'OP_REMOVE_STEREO': False,
        'OP_COMMUTE_INCHI': False,
        'OP_KEEP_BIGGEST': True,
        'OP_ADD_HYDROGEN': True,
        'OP_KEKULIZE': True,
        'OP_NEUTRALISE_CHARGE_LATE': True,
    }

    violacein = 'InChI=1S/C20H13N3O3/c24-10-5-6-15-12(7-10)14(9-21-15)17-8-13(19(25)23-17)18-11-3-1-2-4-16(11)22-20(18)26/h1-9,21,24H,(H,22,26)(H,23,25)/b18-13+'
    violacein_no_stereo = 'InChI=1S/C20H13N3O3/c24-10-5-6-15-12(7-10)14(9-21-15)17-8-13(19(25)23-17)18-11-3-1-2-4-16(11)22-20(18)26/h1-9,21,24H,(H,22,26)(H,23,25)'
    lactate = 'C[C@@H](C(=O)[O-])O'
    lactate_inchi = 'InChI=1S/C3H6O3/c1-2(4)3(5)6/h2,4H,1H3,(H,5,6)/t2-/m0/s1'
    lactate_inchi_no_stereo = 'InChI=1S/C3H6O3/c1-2(4)3(5)6/h2,4H,1H3,(H,5,6)'

    # (molecule loader, input string, params override or None, expected InChI, expected SMILES)
    cases = [
        # Violacein, default parameters
        (MolFromInchi, violacein, None,
         violacein,
         '[H]OC1=NC(C2=C([H])N([H])C3=C2C([H])=C(O[H])C([H])=C3[H])=C([H])/C1=C1\\C(O[H])=NC2=C([H])C([H])=C([H])C([H])=C21'),
        # Violacein, strip stereo
        (MolFromInchi, violacein, {'OP_REMOVE_STEREO': True},
         violacein_no_stereo,
         '[H]OC1=C([H])C2=C(C([H])=C1[H])N([H])C([H])=C2C1=C([H])C(=C2C(=O)N([H])C3=C([H])C([H])=C([H])C([H])=C23)C(=O)N1[H]'),
        # Violacein, implicit Hs
        (MolFromInchi, violacein, {'OP_ADD_HYDROGEN': False},
         violacein,
         'OC1=CC2=C(C=C1)NC=C2C1=C/C(=C2/C3=CC=CC=C3N=C2O)C(O)=N1'),
        # Violacein, no kekulization
        (MolFromInchi, violacein, {'OP_KEKULIZE': False},
         violacein,
         '[H]OC1=NC(c2c([H])n([H])c3c([H])c([H])c(O[H])c([H])c23)=C([H])/C1=C1\\C(O[H])=Nc2c([H])c([H])c([H])c([H])c21'),
        # Violacein, strip stereo & implicit Hs & no kekulization
        (MolFromInchi, violacein,
         {'OP_REMOVE_STEREO': True, 'OP_ADD_HYDROGEN': False, 'OP_KEKULIZE': False},
         violacein_no_stereo,
         'O=C1NC(c2c[nH]c3ccc(O)cc23)=CC1=C1C(=O)Nc2ccccc21'),
        # L-lactate, default parameters
        (MolFromSmiles, lactate, None,
         lactate_inchi,
         '[H]OC(=O)[C@@]([H])(O[H])C([H])([H])[H]'),
        # L-lactate, implicit Hs
        (MolFromSmiles, lactate, {'OP_ADD_HYDROGEN': False},
         lactate_inchi,
         'C[C@H](O)C(=O)O'),
        # L-lactate, no stereo
        (MolFromSmiles, lactate, {'OP_REMOVE_STEREO': True},
         lactate_inchi_no_stereo,
         '[H]OC(=O)C([H])(O[H])C([H])([H])[H]'),
        # L-lactate, no charge neutralisation
        (MolFromSmiles, lactate,
         {'OP_NEUTRALISE_CHARGE': False, 'OP_NEUTRALISE_CHARGE_LATE': False},
         'InChI=1S/C3H6O3/c1-2(4)3(5)6/h2,4H,1H3,(H,5,6)/p-1/t2-/m0/s1',
         '[H]O[C@]([H])(C(=O)[O-])C([H])([H])[H]'),
        # L-lactate, implicit Hs & no stereo
        (MolFromSmiles, lactate, {'OP_ADD_HYDROGEN': False, 'OP_REMOVE_STEREO': True},
         lactate_inchi_no_stereo,
         'CC(O)C(=O)O'),
    ]

    for load, source, overrides, expected_inchi, expected_smiles in cases:
        mol = load(source)
        # Only pass `params` when the case overrides something, so the default
        # cases construct the Standardizer with sequence_fun alone.
        extra = {} if overrides is None else {'params': overrides}
        ans = Standardizer(sequence_fun='sequence_tunable', **extra).compute(mol)
        assert MolToInchi(ans) == expected_inchi
        assert MolToSmiles(ans) == expected_smiles
Beispiel #17
0
def test_commute_inchi():
    """Check that a molecule passed through ``Filters.commute_inchi`` still yields the InChI it was built from."""
    expected = 'InChI=1S/C3H6O3/c1-2(4)3(5)6/h2,4H,1H3,(H,5,6)/p-1'
    commuted = Filters.commute_inchi(MolFromInchi(expected))
    assert MolToInchi(commuted) == expected
Beispiel #18
0
    def process(
            self,
            input_file: str,
            output_file: str = "",
            output_file_sdf: str = "",
            sdf_append: bool = False,
            #images_prefix: str = "",
            format_output: bool = True,
            write_header: bool = True,
            osra_output_format: str = "",
            output_formats: list = None,
            dry_run: bool = False,
            csv_delimiter: str = ";",
            use_gm: bool = True,
            gm_dpi: int = 300,
            gm_trim: bool = True,
            n_jobs: int = -1,
            input_type: str = "",
            standardize_mols: bool = True,
            annotate: bool = True,
            chemspider_token: str = "",
            custom_page: int = 0,
            continue_on_failure: bool = False) -> OrderedDict:
        r"""
        Process the input file with OSRA.

        Parameters
        ----------
        input_file : str
            Path to file to be processed by OSRA.
        output_file : str
            File to write output in.
        output_file_sdf : str
            | File to write SDF output in. The "sdf" output format does not have to be in `output_formats` to write
              SDF output, but `output_formats` itself must be passed (non-empty) for the SDF file to be written.
            | If "sdf_osra" output format is requested, raw OSRA output is additionally written to this path with
              the suffix "-osra.sdf" appended.
        sdf_append : bool
            If True, append new molecules to an existing SDF file or create a new one if it doesn't exist.
        NOT IMPLEMENTED | images_prefix : str
            Prefix for images of extracted compounds which will be written.
        format_output : bool
            | If True, the value of "content" key of returned dict will be list of OrderedDicts.
            | If True and `output_file` is set, the CSV file will be written.
            | If False, the value of "content" key of returned dict will be None.
        write_header : bool
            If True and if `output_file` is set and `format_output` is True, write a CSV header.
        osra_output_format : str
            | Output format from OSRA. Overrides, for this call only, the option `output_format` set during
              instantiation (in __init__).
            | Choices: "smi", "can", "sdf"
            | If "sdf", additional information like coordinates cannot be retrieved (not implemented yet).
        output_formats : list
            | If `format_output` is True, this specifies which molecule formats will be output.
            | You can specify more than one format, but only one format from OSRA. This format must be also set with `output_format` in __init__
              or with `osra_output_format` here; OSRA formats which do not match it are dropped.
            | When the output produced by OSRA is unreadable by RDKit, you can at least have that output from OSRA.
            | Default value: ["smiles"]

            +-----------------+--------------+--------------------------------------------------------------------------------------------+
            |      Value      |    Source    |                                            Note                                            |
            +=================+==============+============================================================================================+
            |      smiles     |     RDKit    |                                          canonical                                         |
            +-----------------+--------------+--------------------------------------------------------------------------------------------+
            |   smiles_osra   | OSRA ("smi") |                                           SMILES                                           |
            +-----------------+--------------+--------------------------------------------------------------------------------------------+
            | smiles_can_osra | OSRA ("can") |                                      canonical SMILES                                      |
            +-----------------+--------------+--------------------------------------------------------------------------------------------+
            |      inchi      |     RDKit    | Not every molecule can be converted to InChI (it doesn't support wildcard characters etc.) |
            +-----------------+--------------+--------------------------------------------------------------------------------------------+
            |     inchikey    |     RDKit    |                              The same applies as for "inchi".                              |
            +-----------------+--------------+--------------------------------------------------------------------------------------------+
            |       sdf       |     RDKit    |                     If present, an additional SDF file will be created.                    |
            +-----------------+--------------+--------------------------------------------------------------------------------------------+
            |     sdf_osra    | OSRA ("sdf") |                     If present, an additional SDF file will be created.                    |
            +-----------------+--------------+--------------------------------------------------------------------------------------------+

        dry_run : bool
            If True, do not run anything and only return the OSRA command line as a single space-joined string.
        csv_delimiter : str
            Delimiter for output CSV file.
        use_gm : bool
            | If True, use GraphicsMagick to convert PDF to temporary PNG images before processing.
            | If False, OSRA will use its own conversion of PDF to image.
            | Using gm is more reliable since OSRA (v2.1.0) is showing wrong information
              when converting directly from PDF (namely: coordinates, bond length and possibly more ones) and also there are sometimes
              incorrectly recognised structures.
            | Forced to False when the input type is detected automatically and is neither "pdf" nor "image".
        gm_dpi : int
            How many DPI will temporary PNG images have.
        gm_trim : bool
            If True, gm will trim the temporary PNG images.
        n_jobs : int
            | If `use_gm` and input file is PDF, how many jobs to use for OSRA processing of temporary PNG images.
            | If -1 all CPUs are used.
            | If 1 is given, no parallel computing code is used at all, which is useful for debugging.
            | For n_jobs below -1, (n_cpus + 1 + n_jobs) are used. Thus for n_jobs = -2, all CPUs but one are used.
        input_type : str
            | When empty, input (MIME) type will be determined from magic bytes.
            | Or you can specify "pdf" or "image" and magic bytes check will be skipped.
        standardize_mols : bool
            If True and `format_output` is also True, use molvs (https://github.com/mcs07/MolVS) to standardize molecules.
        annotate : bool
            | If True, try to annotate entities in PubChem and ChemSpider. Compound IDs will be assigned by searching with
              each identifier, separately for SMILES, InChI etc.
            | "smiles", "inchi" and "inchikey" are added to `output_formats` automatically.
            | If the entity already has an InChI key, prefer it in searching.
            | If "*" is present in SMILES, skip the SMILES search.
        chemspider_token : str
            Your personal token for accessing the ChemSpider API. Make account there to obtain it.
        custom_page : int
            When `use_gm` is False, this will set the page for all extracted compounds.
        continue_on_failure : bool
            | If True, continue running even if OSRA returns non-zero exit code.
            | If False and error occurs, print it and return.

        Returns
        -------
        dict
            Keys:

            - stdout: list of str ... standard output from OSRA, one item per processed image
            - stderr: list of str ... standard error output from OSRA
            - exit_code: list of int ... exit code from OSRA
            - pages: list ... page of each processed image
            - content:

                - list of OrderedDicts ... when `format_output` is True.
                - None ... when `format_output` is False

            | Only images for which OSRA produced some stdout are included in these lists.
            | If `osra_output_format` is "sdf", additional information like 'bond_length' cannot be retrieved.
            | If `dry_run` is True, a str with the command line is returned instead of the dict.

        Raises
        ------
        ValueError
            If the OSRA output format in effect is not one of "can", "smi", "sdf", or if `input_type` is given
            and is not "pdf" or "image".

        Notes
        -----
        Only with `format_output` set to True you can use molecule standardization and more molecule formats. Otherwise
        you will only get raw stdout from OSRA (which can also be written to file if `output_file` is set).
        """

        # Work on a copy so per-call overrides never leak into the instance.
        options_internal = self.options_internal.copy()
        osra_smiles_outputs = ["smi", "can"]

        # OSRA output format check
        if osra_output_format:
            options_internal["output_format"] = osra_output_format
        else:
            osra_output_format = options_internal["output_format"]

        # Maps each OSRA-native format to the output column carrying it.
        osra_valid_output_formats = {
            "can": "smiles_can_osra",
            "smi": "smiles_osra",
            "sdf": "sdf_osra"
        }
        if osra_output_format not in osra_valid_output_formats:
            # Report the accepted input values (the keys), not the derived
            # column names.
            raise ValueError(
                "Unknown OSRA output format. Possible values: {}".format(
                    list(osra_valid_output_formats)))

        if osra_output_format == "sdf":
            self.logger.warning(
                "OSRA's output format is set to \"sdf\" so additional information like coordinates cannot be retrieved."
            )

        # output formats check
        is_output_sdf = False
        is_output_sdf_osra = False
        if not output_formats:
            output_formats = ["smiles"]
        else:
            requested_formats = sorted(set(output_formats))
            possible_output_formats = ["smiles", "inchi", "inchikey", "sdf"]
            # Keep the RDKit-derived formats plus the single OSRA-native
            # format that matches the OSRA output format in effect.
            output_formats = [
                x for x in requested_formats if x in possible_output_formats
                or x == osra_valid_output_formats[osra_output_format]
            ]

            if ("sdf" in output_formats
                    or "sdf_osra" in output_formats) and not output_file_sdf:
                self.logger.warning(
                    "Cannot write SDF output: 'output_file_sdf' is not set.")
            if output_file_sdf:
                is_output_sdf = True
            if "sdf_osra" in output_formats and osra_output_format == "sdf" and output_file_sdf:
                is_output_sdf_osra = True
            # Check the unfiltered request: the filter above has already
            # dropped the OSRA SMILES columns when OSRA emits SDF, so testing
            # the filtered list could never trigger this warning.
            if osra_output_format == "sdf" and (
                    "smiles_osra" in requested_formats
                    or "smiles_can_osra" in requested_formats):
                self.logger.warning(
                    "SMILES or canonical SMILES output from OSRA is requested, but OSRA's output format is \"{}\"."
                    .format(osra_output_format))

        # input file type check
        possible_input_types = ["pdf", "image"]
        if not input_type:
            input_type = get_input_file_type(input_file)
            if input_type not in possible_input_types:
                # Unknown type: hand the file to OSRA directly.
                use_gm = False
                self.logger.warning(
                    "Input file MIME type ('{}') is not one of {}. You can specify 'input_type' directly (see docstring)."
                    .format(input_type, possible_input_types))
        elif input_type not in possible_input_types:
            raise ValueError("Possible 'input_type' values are {}".format(
                possible_input_types))

        #options = ChainMap({k: v for k, v in {"images_prefix": images_prefix}.items() if v},
        #                   options_internal)

        if annotate:
            if not chemspider_token:
                self.logger.warning(
                    "Cannot perform annotation in ChemSpider: 'chemspider_token' is empty."
                )
            # Annotation searches by these identifiers, so make sure all of
            # them are generated.
            for x in ["smiles", "inchi", "inchikey"]:
                if x not in output_formats:
                    output_formats.append(x)
            output_formats = sorted(output_formats)

        commands, _, _ = self.build_commands(options_internal,
                                             self._OPTIONS_REAL,
                                             self.path_to_binary)
        commands.extend(
            ["--bond", "--coordinates", "--page", "--guess", "--print"])

        if dry_run:
            return " ".join(commands)

        osra_output_list = []
        if input_type == "image" or not use_gm:
            osra_output_list.append(
                self._process(input_file, commands, page=custom_page or 1))
        elif input_type == "pdf":
            with tempfile.TemporaryDirectory() as temp_dir:
                # NOTE(review): the conversion's own stdout/stderr/exit code
                # are not inspected here - confirm whether failures of the
                # PDF-to-image step should be reported.
                pdf_to_images(input_file,
                              temp_dir,
                              dpi=gm_dpi,
                              trim=gm_trim)
                osra_output_list = Parallel(n_jobs=n_jobs)(
                    delayed(self._process)(
                        temp_image_file, commands, page=page)
                    for temp_image_file, page in get_temp_images(temp_dir))

        # summarize OSRA results
        to_return = {
            "stdout": [],
            "stderr": [],
            "exit_code": [],
            "content": None,
            "pages": []
        }
        # NOTE(review): results with empty stdout are dropped entirely, so a
        # failed run which printed nothing to stdout is never reported below.
        for result in osra_output_list:
            if result["stdout"]:
                to_return["stdout"].append(result["stdout"])
                to_return["stderr"].append(result["stderr"])
                to_return["exit_code"].append(result["exit_code"])
                to_return["pages"].append(result["page"])

        if not continue_on_failure:
            # Pair every failure with the real page it came from rather than
            # with its position in the (filtered) result list.
            errors = [(page, error)
                      for page, exit_code, error in zip(
                          to_return["pages"], to_return["exit_code"],
                          to_return["stderr"])
                      if exit_code > 0]
            if errors:
                self.logger.warning("OSRA errors:")
                for page, error in errors:
                    eprint("\tError on page {}:".format(page))
                    eprint("\n\t\t".join("\n{}".format(error).splitlines()))
                return to_return

        if not format_output:
            if output_file:
                with open(output_file, mode="w", encoding="utf-8") as f:
                    f.write("\n".join(to_return["stdout"]))
            return to_return

        # Column name -> index of that column in a whitespace-split OSRA
        # SMILES output line (index 0 is the SMILES itself).
        output_cols = OrderedDict([("bond_length", 1), ("resolution", 2),
                                   ("confidence", 3), ("page", 4),
                                   ("coordinates", 5)])

        if osra_output_format in osra_smiles_outputs:
            compound_template_dict = OrderedDict.fromkeys(
                output_formats + list(output_cols.keys()))
        else:
            compound_template_dict = OrderedDict.fromkeys(["page"] +
                                                          output_formats)

        if any(to_return["stdout"]):
            if standardize_mols:
                standardizer = Standardizer()

            compounds = []

            sdf_handle = None
            writer = None
            try:
                if is_output_sdf:
                    if sdf_append:
                        # Mode "a" creates the file when it does not exist.
                        sdf_handle = open(output_file_sdf,
                                          mode="a",
                                          encoding="utf-8")
                        writer = SDWriter(sdf_handle)
                    else:
                        writer = SDWriter(output_file_sdf)

                for output, page in zip(to_return["stdout"],
                                        to_return["pages"]):
                    if osra_output_format in osra_smiles_outputs:
                        lines = [x.strip() for x in output.split("\n") if x]
                    else:
                        lines = [x for x in output.split("$$$$") if x.strip()]

                    for line in lines:
                        # The "--learn" option is deliberately not handled:
                        # its output looks like "1,2,2,2 1", i.e. it contains
                        # spaces, so the whitespace split below would break.

                        if not line:
                            continue

                        if osra_output_format in osra_smiles_outputs:
                            line = line.split()
                            if custom_page:
                                line[output_cols["page"]] = custom_page
                            elif use_gm:
                                line[output_cols["page"]] = page
                            # Sanitization is left to the standardizer when
                            # standardization is requested.
                            mol = MolFromSmiles(
                                line[0], sanitize=not standardize_mols)
                        elif osra_output_format == "sdf":
                            line = "\n" + line.strip()
                            mol = MolFromMolBlock(
                                line,
                                strictParsing=False,
                                sanitize=not standardize_mols,
                                removeHs=not standardize_mols)

                        if mol:
                            compound = compound_template_dict.copy()

                            if standardize_mols:
                                try:
                                    mol = standardizer.standardize(mol)
                                except ValueError as e:
                                    self.logger.warning(
                                        "Cannot standardize '{}': {}".format(
                                            MolToSmiles(mol), str(e)))

                            for f in output_formats:
                                if f == "smiles":
                                    compound["smiles"] = MolToSmiles(
                                        mol, isomericSmiles=True)
                                elif f == "smiles_osra" and osra_output_format == "smi":
                                    compound["smiles_osra"] = line[0]
                                elif f == "smiles_can_osra" and osra_output_format == "can":
                                    compound["smiles_can_osra"] = line[0]
                                elif f == "inchi":
                                    inchi = MolToInchi(mol)
                                    if inchi:
                                        compound["inchi"] = inchi
                                    else:
                                        compound["inchi"] = ""
                                        self.logger.warning(
                                            "Cannot convert to InChI: {}".
                                            format(MolToSmiles(mol)))
                                elif f == "inchikey":
                                    inchi = MolToInchi(mol)
                                    if inchi:
                                        compound[
                                            "inchikey"] = InchiToInchiKey(
                                                inchi)
                                    else:
                                        compound["inchikey"] = ""
                                        self.logger.warning(
                                            "Cannot create InChI-key from InChI: {}"
                                            .format(MolToSmiles(mol)))
                                elif f == "sdf":
                                    compound["sdf"] = MolToMolBlock(
                                        mol, includeStereo=True)
                                elif f == "sdf_osra":
                                    compound["sdf_osra"] = line

                            if writer is not None:
                                writer.write(mol)

                            if osra_output_format in osra_smiles_outputs:
                                # Remaining fields of the line fill the
                                # bond_length ... coordinates columns.
                                compound.update(zip(output_cols, line[1:]))
                            else:
                                compound["page"] = page if use_gm else (
                                    custom_page or 1)

                            compounds.append(compound)
                        else:
                            # In SDF mode `line` is the whole record (a str),
                            # so line[0] would only be its leading newline.
                            if osra_output_format in osra_smiles_outputs:
                                unparsed = line[0]
                            else:
                                unparsed = line.strip()
                            self.logger.warning(
                                "Cannot convert to RDKit mol: " + unparsed)
            finally:
                # Release the SDF output even when parsing fails midway.
                if writer is not None:
                    writer.close()
                if sdf_handle is not None:
                    sdf_handle.close()

            if is_output_sdf_osra:
                with open(output_file_sdf + "-osra.sdf",
                          mode="w",
                          encoding="utf-8") as f:
                    f.write("".join(to_return["stdout"]))

            to_return["content"] = sorted(compounds, key=lambda x: x["page"])

            if annotate:
                chemspider = ChemSpider(
                    chemspider_token) if chemspider_token else None
                pubchem_errors = (BadRequestError, NotFoundError,
                                  PubChemHTTPError, ResponseParseError,
                                  ServerError, TimeoutError, PubChemPyError)

                for i, ent in enumerate(to_return["content"]):
                    self.logger.info("Annotating entity {}/{}...".format(
                        i + 1, len(to_return["content"])))
                    ent.update(
                        OrderedDict([("pch_cids_by_inchikey", ""),
                                     ("chs_cids_by_inchikey", ""),
                                     ("pch_cids_by_smiles", ""),
                                     ("chs_cids_by_smiles", ""),
                                     ("pch_cids_by_inchi", ""),
                                     ("chs_cids_by_inchi", ""),
                                     ("pch_iupac_name", ""),
                                     ("chs_common_name", ""),
                                     ("pch_synonyms", "")]))

                    # prefer InChI key
                    if "inchikey" in ent and ent["inchikey"]:
                        try:
                            results = get_compounds(ent["inchikey"],
                                                    "inchikey")
                            if results:
                                # Names are only trustworthy for a unique hit.
                                if len(results) == 1:
                                    result = results[0]
                                    synonyms = result.synonyms
                                    if synonyms:
                                        ent["pch_synonyms"] = "\"{}\"".format(
                                            "\",\"".join(synonyms))
                                    ent["pch_iupac_name"] = result.iupac_name
                                ent["pch_cids_by_inchikey"] = "\"{}\"".format(
                                    ",".join([str(c.cid) for c in results]))
                        except pubchem_errors as e:
                            # Best effort: keep the empty columns, but leave
                            # a trace of why the lookup produced nothing.
                            self.logger.warning(
                                "PubChem search by InChI key failed for '{}': {}"
                                .format(ent["inchikey"], e))

                        results = chemspider.search(
                            ent["inchikey"]) if chemspider_token else []
                        if results:
                            if len(results) == 1:
                                result = results[0]
                                ent["chs_common_name"] = result.common_name
                            ent["chs_cids_by_inchikey"] = "\"{}\"".format(
                                ",".join([str(c.csid) for c in results]))
                    else:
                        for search_field, col_pch, col_chs in [
                            ("smiles", "pch_cids_by_smiles",
                             "chs_cids_by_smiles"),
                            ("inchi", "pch_cids_by_inchi", "chs_cids_by_inchi")
                        ]:
                            results_pch = []
                            results_chs = []

                            if search_field == "smiles" and "smiles" in ent and ent[
                                    "smiles"] and "*" not in ent["smiles"]:
                                try:
                                    results_pch = get_compounds(
                                        ent["smiles"], "smiles")
                                except pubchem_errors as e:
                                    self.logger.warning(
                                        "PubChem search by SMILES failed for '{}': {}"
                                        .format(ent["smiles"], e))
                                results_chs = chemspider.search(
                                    ent["smiles"]) if chemspider_token else []
                            elif search_field == "inchi" and "inchi" in ent and ent[
                                    "inchi"]:
                                try:
                                    results_pch = get_compounds(
                                        ent["inchi"], "inchi")
                                except pubchem_errors as e:
                                    self.logger.warning(
                                        "PubChem search by InChI failed for '{}': {}"
                                        .format(ent["inchi"], e))
                                results_chs = chemspider.search(
                                    ent["inchi"]) if chemspider_token else []

                            if results_pch:
                                ent[col_pch] = "\"{}\"".format(",".join(
                                    [str(c.cid) for c in results_pch]))
                            if results_chs:
                                ent[col_chs] = "\"{}\"".format(",".join(
                                    [str(c.csid) for c in results_chs]))

                            # Pause between consecutive remote searches.
                            sleep(0.5)

            if output_file:
                dict_to_csv(to_return["content"],
                            output_file=output_file,
                            csv_delimiter=csv_delimiter,
                            write_header=write_header)
        elif output_file:
            # OSRA found nothing: still produce a (header-only) output file.
            write_empty_file(output_file,
                             csv_delimiter=csv_delimiter,
                             header=list(compound_template_dict.keys()),
                             write_header=write_header)

        return to_return
import pandas as pd
from tqdm import tqdm
import os.path as osp
from rdkit.Chem import MolFromSmiles, MolToInchi
import torch

if __name__ == '__main__':
    # Folder the three CSV files below are read from.
    dd_csv_folder = "/scratch/projects/yzlab/group/temp_dd/solvation/calculated/"
    # NOTE(review): the "train" frame is loaded from "all.csv" - confirm this
    # is the intended file.
    train_csv = pd.read_csv(osp.join(dd_csv_folder, "all.csv"))
    valid_csv = pd.read_csv(osp.join(dd_csv_folder, "valid.csv"))
    test_csv = pd.read_csv(osp.join(dd_csv_folder, "test.csv"))

    # Convert the canonical SMILES of both reference sets to InChI strings.
    lipop_csv = pd.read_csv("lipop.csv")
    lipop_inchi = [
        MolToInchi(MolFromSmiles(s)) for s in lipop_csv["cano_smiles"]
    ]
    freesolv_csv = pd.read_csv("freesolv.csv")
    freesolv_inchi = [
        MolToInchi(MolFromSmiles(s)) for s in freesolv_csv["cano_smiles"]
    ]

    # NOTE: all_inchi is the same list object as lipop_inchi, so the extend()
    # below grows lipop_inchi as well.
    all_inchi = lipop_inchi
    all_inchi.extend(freesolv_inchi)
    # Build the lookup once: testing membership against the list would rescan
    # it for every row of the concatenated frame.
    known_inchi = set(all_inchi)

    inchi_exist_map = []

    # concatenate them in this order
    concat_csv = pd.concat([train_csv, valid_csv, test_csv], ignore_index=True)
    for inchi in tqdm(concat_csv["InChI"]):
        if inchi in known_inchi:
            inchi_exist_map.append(1)
Beispiel #20
0
 def test4MolToInchiKey(self):
     """The key from ``MolToInchiKey`` must equal the key built from the molecule's InChI string."""
     mol = MolFromSmiles("CC=C(N)C")
     key_via_inchi = InchiToInchiKey(MolToInchi(mol))
     key_direct = MolToInchiKey(mol)
     self.assertEqual(key_via_inchi, key_direct)