Beispiel #1
0
def split_data(mols, acts, test_percent, split):
    """Split molecules and activities into standardized training and test sets.

    The partition is decided by ``train_test_split``; the original input
    order is then replayed so that the returned training and test lists keep
    the ordering of ``mols`` / ``acts``.

    Parameters
    ----------
    mols : sequence
        Items indexed as ``item[0]`` (molecule) and ``item[1]`` (name).
    acts : sequence
        Items indexed as ``item[0]`` (activity) and ``item[1]`` (name),
        in the same order as ``mols``.
    test_percent
        Forwarded to ``train_test_split`` as ``test_size``.
    split
        Forwarded to ``train_test_split`` as ``random_state``.

    Returns
    -------
    tuple
        ``(standard_mols_train, molnames_train, acts_train,
        standard_mols_test, molnames_test, acts_test)`` where the molecule
        lists have been passed through ``Standardizer.standardize``.

    Raises
    ------
    AssertionError
        If molecule names and activity names do not line up.
    """
    mols_train = []
    mols_test = []
    molnames_train = []
    molnames_test = []
    acts_train = []
    acts_test = []
    actnames_train = []
    actnames_test = []

    # Only the training half of the molecule split is used below; the other
    # three results are unpacked so a malformed return still fails loudly.
    m_train, _m_test, _a_train, _a_test = train_test_split(
        mols, acts, test_size=test_percent, random_state=split)

    # A set gives O(1) membership tests in the loop below (the names are
    # assumed to be hashable, e.g. strings -- TODO confirm).
    names_train = {mol[1] for mol in m_train}

    # Replay the inputs in their original order and route each pair to the
    # training or the test set depending on the molecule's name.
    for mol_entry, act_entry in zip(mols, acts):
        if mol_entry[1] in names_train:
            mols_train.append(mol_entry[0])
            molnames_train.append(mol_entry[1])
            acts_train.append(act_entry[0])
            actnames_train.append(act_entry[1])
        else:
            # Anything that is not in the training set belongs to the test set.
            mols_test.append(mol_entry[0])
            molnames_test.append(mol_entry[1])
            acts_test.append(act_entry[0])
            actnames_test.append(act_entry[1])

    assert molnames_train == actnames_train
    assert molnames_test == actnames_test

    # Standardize structures of the training set and test set
    s = Standardizer()
    standard_mols_train = [s.standardize(mol) for mol in mols_train]
    standard_mols_test = [s.standardize(mol) for mol in mols_test]

    return standard_mols_train, molnames_train, acts_train, standard_mols_test, molnames_test, acts_test
Beispiel #2
0
def filter_salts(in_lines, Verbose=False):
    """Standardize structures and remove salts from ``canonical_smiles``.

    This should be called before any other filter that deals with molecular
    structures, as it affects both the molecular structure and the molecular
    weight of many compounds that come out of ChEMBL.

    Every SMILES in the ``canonical_smiles`` column is parsed, standardized
    and passed through a ``SaltRemover`` configured from ``Salts.txt`` in
    ``conf_dir``. Rows whose desalted SMILES still contains a ``.`` are
    removed; all other rows get their ``canonical_smiles`` replaced by the
    desalted, non-isomeric SMILES.

    Parameters
    ----------
    in_lines : pandas.DataFrame
        Table with a ``canonical_smiles`` column.
    Verbose : bool
        If True, print the number of rows left after the pass.

    Returns
    -------
    pandas.DataFrame
        A new frame with a fresh ``0..n-1`` index. The input frame is left
        untouched.
    """
    s = Standardizer()
    salt_file = conf_dir + '/Salts.txt'
    remover = SaltRemover.SaltRemover(defnFilename=salt_file)

    # Work on a copy so the caller's frame is never partially modified, and
    # collect the labels to remove so the frame is dropped (copied) only once.
    out_lines = in_lines.copy()
    multi_fragment_labels = []

    # Iterating over (label, value) pairs keeps this correct even when the
    # index is not a plain 0..n-1 range.
    for label, smiles in in_lines['canonical_smiles'].items():
        # NOTE(review): an unparsable SMILES is not handled here; whatever
        # the standardizer raises for it propagates to the caller.
        mol_in = Chem.MolFromSmiles(smiles)
        mol_out = s.standardize(mol_in)
        smiles_out = Chem.MolToSmiles(remover(mol_out), isomericSmiles=False)
        if '.' in smiles_out:
            multi_fragment_labels.append(label)
        else:
            # Assign by label; Series.replace would instead substitute every
            # value equal to the first argument, which is not what is wanted.
            out_lines.loc[label, 'canonical_smiles'] = smiles_out

    out_lines = out_lines.drop(multi_fragment_labels)

    if Verbose:
        print('Number of compounds after desalting pass: ', len(out_lines))

    return out_lines.reset_index(drop=True)
Beispiel #3
0
def sanitize_smiles_molvs(smiles, largest_fragment=False):
    """Clean up a SMILES string with MolVS.

    Parameters
    ----------
    smiles : str
        SMILES string for a molecule.
    largest_fragment : bool
        When True, keep only the largest covalent unit of a multi-fragment
        molecule. Defaults to False.

    Returns
    -------
    str
        SMILES of the sanitized molecule. If ``smiles`` cannot be parsed it
        is returned unchanged. If a sanitization step fails, the SMILES of
        the molecule as it stood after the last successful step is returned.
    """
    molvs_standardizer = Standardizer()
    molvs_standardizer.prefer_organic = True

    current = Chem.MolFromSmiles(smiles)
    if current is None:
        return smiles

    # Names of the standardizer callables to apply, in order:
    #   standardize      -> standardize functional group representations
    #   largest_fragment -> remove product counterions/salts/etc. (optional)
    #   uncharge         -> neutralize, e.g., carboxylic acids
    step_names = ['standardize']
    if largest_fragment:
        step_names.append('largest_fragment')
    step_names.append('uncharge')

    try:
        for step_name in step_names:
            current = getattr(molvs_standardizer, step_name)(current)
    except Exception:
        # Best effort: keep whatever was achieved before the failing step.
        pass

    return Chem.MolToSmiles(current)
Beispiel #4
0
def Tautomerize(mol):
    """Run the MolVS standardizer on ``mol`` and flag it as tautomerized.

    Parameters
    ----------
    mol
        Molecule object exposing ``GetBoolProp`` / ``SetBoolProp``.

    Returns
    -------
    None
        If ``mol`` already carries a true ``'tautomerized'`` property.
    False
        If standardization raised ``ValueError``.
    True
        Otherwise; the ``'tautomerized'`` property has been set on ``mol``.
    """
    try:
        if mol.GetBoolProp('tautomerized'):
            return
    except KeyError:
        # Property not set yet: the molecule has not been processed before.
        pass
    smi1 = Chem.MolToSmiles(mol)
    from molvs import Standardizer
    s = Standardizer()
    try:
        # NOTE(review): the return value is discarded here, whereas other
        # call sites use the molecule returned by standardize(). If
        # standardize() does not modify its argument, smi1 and smi2 below
        # can never differ -- confirm the intent.
        s.standardize(mol)
    except ValueError:
        # NOTE(review): MutateFail is called but not raised; presumably it
        # is an exception class -- confirm whether `raise` was intended.
        MutateFail(mol)
        return False
    mol.SetBoolProp('tautomerized', True)
    smi2 = Chem.MolToSmiles(mol)

    if smi1 != smi2:
        # A single pre-formatted argument prints the same text under both
        # Python 2 and Python 3.
        print("tautomerized: %s to: %s" % (smi1, smi2))
    return True
Beispiel #5
0
def standardize_mol(mol_file):
    """Standardize the molecule in a MOL file and overwrite the file.

    Parameters
    ----------
    mol_file : str or Path
        Location of the MOL file to read and rewrite.

    Raises
    ------
    RuntimeError
        If ``mol_file`` does not exist.
    """
    if not Path(mol_file).exists():
        raise RuntimeError('File does not exist.')

    # Chem.MolFromMolFile() only works with string, not Path object
    mol_path = str(mol_file)
    parsed = Chem.MolFromMolFile(mol_path)

    standardized = Standardizer().standardize(parsed)

    # The mol block is generated only after the file has been opened for
    # writing, so the file is truncated before the conversion runs.
    with open(mol_path, 'w') as handle:
        handle.write(Chem.MolToMolBlock(standardized))
Beispiel #6
0
def prepSMI(SMIin, defnFilename, removeMetal=1):
    """Standardize a SMILES, strip salts and reduce it to a single component.

    Parameters
    ----------
    SMIin : str
        Input SMILES string.
    defnFilename : str
        Salt definition file handed to ``SaltRemover``; an empty string
        selects the remover's default definitions.
    removeMetal : int
        When equal to 1, fragments for which ``is_metalorion`` does not
        return 0 are discarded.

    Returns
    -------
    str
        The cleaned SMILES when exactly one distinct fragment remains,
        otherwise a message starting with ``"Error:"``. Failures are reported
        through the return value, not through exceptions.
    """
    mol = Chem.MolFromSmiles(SMIin)
    s = Standardizer()

    try:
        molstandardized = s.standardize(mol)
        # The SMILES itself is not needed; the conversion is kept so that a
        # molecule that cannot be written out is still reported as a failure.
        Chem.MolToSmiles(molstandardized)
    except Exception:
        # `Exception` rather than a bare except, so KeyboardInterrupt and
        # SystemExit are no longer swallowed.
        return "Error: Standardization Fail"

    # remove salt
    # 1. default
    if defnFilename != "":
        remover = SaltRemover(defnFilename=defnFilename)
    else:
        remover = SaltRemover()
    molclean = remover(molstandardized)
    smilesclean = Chem.MolToSmiles(molclean)

    # 2. SMILES remove other manual salts + fragments -> for fragment take
    #    one if exactly same compound.
    # Going through a set collapses duplicated fragments (case of several
    # identical salts being included); the empty fragment is dropped.
    fragments = set(smilesclean.split("."))
    fragments.discard("")
    lelem = list(fragments)

    # remove metal
    if removeMetal == 1:
        lelem = [elem for elem in lelem if is_metalorion(elem) == 0]

    if len(lelem) == 1:
        smilesclean = str(lelem[0])
        return smilesclean
    elif len(lelem) > 1:
        return "Error: Mixture or fragment ot check: " + smilesclean
    elif smilesclean == "":
        return "Error: SMILES empty after preparation"
    else:
        return "Error: No identified"
Beispiel #7
0
def Tautomerize(mol, aromatic=aromaticity):
    """Standardize ``mol`` with MolVS and return the tautomerized molecule.

    Parameters
    ----------
    mol
        Molecule to process. It is sanitized in place and, when neither
        ``aromatic`` nor the module-level ``aromaticity`` is truthy,
        kekulized in place.
    aromatic
        When falsy, the standardized molecule is kekulized before the SMILES
        comparison. Defaults to the module-level ``aromaticity``.

    Returns
    -------
    The input ``mol`` when it was already flagged or when standardization
    left its SMILES unchanged; otherwise the new standardized molecule. In
    both cases the returned molecule has ``'tautomerized'`` set to True.

    Raises
    ------
    MutateFail
        If the standardizer raises ``ValueError``.
    """
    try:
        already_flagged = mol.GetBoolProp('tautomerized')
    except KeyError:
        already_flagged = False
    if already_flagged:
        return mol

    Chem.SanitizeMol(mol)
    if not aromatic and not aromaticity:
        Chem.Kekulize(mol, True)

    smiles_before = Chem.MolToSmiles(mol)
    from molvs import Standardizer
    standardizer = Standardizer()
    try:
        candidate = standardizer.standardize(mol)
    except ValueError:
        raise MutateFail(mol)

    if not aromatic:
        Chem.Kekulize(candidate, True)
    smiles_after = Chem.MolToSmiles(candidate)

    if smiles_before != smiles_after:
        # The new molecule does not carry over properties from `mol`, so the
        # 'failedfilter' marker is copied across by hand.
        if mol.HasProp('failedfilter'):
            candidate.SetProp('failedfilter', mol.GetProp('failedfilter'))
        # Keep a record of every structure that was changed.
        with open('tautomerized.smi', 'a') as record:
            record.write("{} {}\n".format(smiles_before, smiles_after))
        candidate.SetBoolProp('tautomerized', True)
        return candidate

    # Nothing changed: hand back the original, which still holds its
    # properties.
    mol.SetBoolProp('tautomerized', True)
    return mol
def standardizeMolVS(inMol):
    """Run ``inMol`` through a fixed sequence of MolVS clean-up steps.

    The steps are applied in this order, each on the result of the previous
    one: largest-fragment selection, uncharging, standardization,
    normalization and tautomer canonicalization.

    Parameters
    ----------
    inMol
        Molecule to process.

    Returns
    -------
    The molecule produced by the last step.
    """
    # Each helper object is created right before it is used.
    result = fragment.LargestFragmentChooser().choose(inMol)
    result = charge.Uncharger().uncharge(result)
    result = Standardizer().standardize(result)
    result = normalize.Normalizer().normalize(result)
    result = tautomer.TautomerCanonicalizer().canonicalize(result)
    return result
Beispiel #9
0
def normalize(mol, lout):
    """Standardize ``mol`` with MolVS and append the result to ``lout``.

    Parameters
    ----------
    mol
        Molecule to standardize.
    lout : list
        Output list; it is modified in place. Nothing is returned.
    """
    lout.append(Standardizer().standardize(mol))
Beispiel #10
0
def standardize_main(args):
    """Read a molecule, standardize it with MolVS and write it back out.

    Parameters
    ----------
    args
        Passed unchanged to both ``_read_mol`` and ``_write_mol``.
    """
    source_mol = _read_mol(args)
    _write_mol(Standardizer().standardize(source_mol), args)
Beispiel #11
0
#pprint (chembl_help)

#Chembl standardize
# Rewrite the first element of every chembl_help entry: the InChI is parsed,
# stripped of stereochemistry, standardized, reduced to its tautomer and
# stereo parents, and written back as InChI. Most steps are best-effort: a
# failure is reported and the loop carries on with the molecule as it stands.
for lig, entry in enumerate(chembl_help):
    #print ('Now I do this from Chembl: ' + entry[0])
    mol = inchi.MolFromInchi(entry[0], sanitize=False)
    try:
        rdmolops.RemoveStereochemistry(mol)
    except Exception:
        print("Not able to remove stereochemistry. Chembl.")
    try:
        mol = standardise.run(mol)
    except standardise.StandardiseException as e:
        # logging.warning replaces the deprecated logging.warn alias.
        logging.warning(e.message)
    try:
        mol = s.standardize(mol)
    except Exception:
        print("Not able to standardize. Chembl.")
    try:
        mol = s.tautomer_parent(mol, skip_standardize=True)
    except Exception:
        print("Not able to make tautomer parent. Chembl.")
    # NOTE(review): unlike the steps above, these two calls are not guarded,
    # so a failure here aborts the whole loop -- confirm this is intended.
    mol = s.stereo_parent(mol, skip_standardize=True)
    entry[0] = inchi.MolToInchi(mol)

#BDB preparing
bdb_help = []
list_help = []
conn = psycopg2.connect('dbname=bdb user=data host=/tmp/')
curs = conn.cursor()
curs.execute(
Beispiel #12
0
    def process(
            self,
            input_file: str,
            output_file: str = "",
            output_file_sdf: str = "",
            sdf_append: bool = False,
            #images_prefix: str = "",
            format_output: bool = True,
            write_header: bool = True,
            osra_output_format: str = "",
            output_formats: list = None,
            dry_run: bool = False,
            csv_delimiter: str = ";",
            use_gm: bool = True,
            gm_dpi: int = 300,
            gm_trim: bool = True,
            n_jobs: int = -1,
            input_type: str = "",
            standardize_mols: bool = True,
            annotate: bool = True,
            chemspider_token: str = "",
            custom_page: int = 0,
            continue_on_failure: bool = False) -> OrderedDict:
        r"""
        Process the input file with OSRA.

        Parameters
        ----------
        input_file : str
            Path to file to be processed by OSRA.
        output_file : str
            File to write output in.
        output_file_sdf : str
            | File to write SDF output in. "sdf" output format hasn't to be in `output_formats` to write SDF output.
            | If "sdf_osra" output format is requested, suffix "-osra.sdf" will be added.
        sdf_append : bool
            If True, append new molecules to existing SDF file or create new one if doesn't exist.
        NOT IMPLEMENTED | images_prefix : str
            Prefix for images of extracted compounds which will be written.
        format_output : bool
            | If True, the value of "content" key of returned dict will be list of OrderedDicts.
            | If True and `output_file` is set, the CSV file will be written.
            | If False, the value of "content" key of returned dict will be None.
        write_header : bool
            If True and if `output_file` is set and `output_format` is True, write a CSV write_header.
        osra_output_format : str
            | Output format from OSRA. Temporarily overrides the option `output_format` set during instantiation (in __init__).
            | Choices: "smi", "can", "sdf"
            | If "sdf", additional information like coordinates cannot be retrieved (not implemented yet).
        output_formats : list
            | If True and `format_output` is also True, this specifies which molecule formats will be output.
            | You can specify more than one format, but only one format from OSRA. This format must be also set with `output_format` in __init__
              or with `osra_output_format` here.
            | When output produces by OSRA is unreadable by RDKit, you can at least have that output from OSRA.
            | Default value: ["smiles"]

            +-----------------+--------------+--------------------------------------------------------------------------------------------+
            |      Value      |    Source    |                                            Note                                            |
            +=================+==============+============================================================================================+
            |      smiles     |     RDKit    |                                          canonical                                         |
            +-----------------+--------------+--------------------------------------------------------------------------------------------+
            |   smiles_osra   | OSRA ("smi") |                                           SMILES                                           |
            +-----------------+--------------+--------------------------------------------------------------------------------------------+
            | smiles_can_osra | OSRA ("can") |                                      canonical SMILES                                      |
            +-----------------+--------------+--------------------------------------------------------------------------------------------+
            |      inchi      |     RDKit    | Not every molecule can be converted to InChI (it doesn`t support wildcard characters etc.) |
            +-----------------+--------------+--------------------------------------------------------------------------------------------+
            |     inchikey    |     RDKit    |                              The same applies as for "inchi".                              |
            +-----------------+--------------+--------------------------------------------------------------------------------------------+
            |       sdf       |     RDKit    |                     If present, an additional SDF file will be created.                    |
            +-----------------+--------------+--------------------------------------------------------------------------------------------+
            |     sdf_osra    | OSRA ("sdf") |                     If present, an additional SDF file will be created.                    |
            +-----------------+--------------+--------------------------------------------------------------------------------------------+

        dry_run : bool
            If True, only return list of commands to be called by subprocess.
        csv_delimiter : str
            Delimiter for output CSV file.
        use_gm : bool
            | If True, use GraphicsMagick to convert PDF to temporary PNG images before processing.
            | If False, OSRA will use it's own conversion of PDF to image.
            | Using gm is more reliable since OSRA (v2.1.0) is showing wrong information
              when converting directly from PDF (namely: coordinates, bond length and possibly more ones) and also there are sometimes
              incorrectly recognised structures.
        gm_dpi : int
            How many DPI will temporary PNG images have.
        gm_trim : bool
            If True, gm will trim the temporary PNG images.
        n_jobs : int
            | If `use_gm` and input file is PDF, how many jobs to use for OSRA processing of temporary PNG images.
            | If -1 all CPUs are used.
            | If 1 is given, no parallel computing code is used at all, which is useful for debugging.
            | For n_jobs below -1, (n_cpus + 1 + n_jobs) are used. Thus for n_jobs = -2, all CPUs but one are used.
        input_type : str
            | When empty, input (MIME) type will be determined from magic bytes.
            | Or you can specify "pdf" or "image" and magic bytes check will be skipped.
        standardize_mols : bool
            If True and `format_output` is also True, use molvs (https://github.com/mcs07/MolVS) to standardize molecules.
        annotate : bool
            | If True, try to annotate entities in PubChem and ChemSpider. Compound IDs will be assigned by searching with
              each identifier, separately for SMILES, InChI etc.
            | If entity has InChI key yet, prefer it in searching.
            | If "*" is present in SMILES, skip annotation.
        chemspider_token : str
            Your personal token for accessing the ChemSpider API. Make account there to obtain it.
        custom_page : bool
            When `use_gm` is False, this will set the page for all extracted compounds.
        continue_on_failure : bool
            | If True, continue running even if OSRA returns non-zero exit code.
            | If False and error occurs, print it and return.

        Returns
        -------
        dict
            Keys:

            - stdout: str ... standard output from OSRA
            - stderr: str ... standard error output from OSRA
            - exit_code: int ... exit code from OSRA
            - content:

                - list of OrderedDicts ... when `format_output` is True.
                - None ... when `format_output` is False

            | If `osra_output_format` is "sdf", additional information like 'bond_length' cannot be retrieved.
            | If `use_gm` is True then stdout, stderr and exit_code will be lists containing items from each temporary image
              extracted by OSRA.

        Notes
        -----
        Only with `format_output` set to True you can use molecule standardization and more molecule formats. Otherwise
        you will only get raw stdout from OSRA (which can also be written to file if `output_file` is set).
        """

        options_internal = self.options_internal.copy()
        osra_smiles_outputs = ["smi", "can"]

        # OSRA output format check
        if osra_output_format:
            options_internal["output_format"] = osra_output_format
        else:
            osra_output_format = options_internal["output_format"]

        osra_valid_output_formats = {
            "can": "smiles_can_osra",
            "smi": "smiles_osra",
            "sdf": "sdf_osra"
        }
        if osra_output_format not in osra_valid_output_formats:
            raise ValueError(
                "Unknown OSRA output format. Possible values: {}".format(
                    osra_valid_output_formats.values()))

        if osra_output_format == "sdf":
            self.logger.warning(
                "OSRA's output format is set to \"sdf\" so additional information like coordinates cannot be retrieved."
            )

        # output formats check
        is_output_sdf = False
        is_output_sdf_osra = False
        if not output_formats:
            output_formats = ["smiles"]
        else:
            output_formats = sorted(list(set(output_formats)))
            possible_output_formats = ["smiles", "inchi", "inchikey", "sdf"]
            output_formats = [
                x for x in output_formats if x in possible_output_formats
                or x == osra_valid_output_formats[osra_output_format]
            ]

            if ("sdf" in output_formats
                    or "sdf_osra" in output_formats) and not output_file_sdf:
                self.logger.warning(
                    "Cannot write SDF output: 'output_file_sdf' is not set.")
            if output_file_sdf:
                is_output_sdf = True
            if "sdf_osra" in output_formats and osra_output_format == "sdf" and output_file_sdf:
                is_output_sdf_osra = True
            if ("smiles_osra" in output_formats or "smiles_can_osra"
                    in output_formats) and osra_output_format == "sdf":
                try:
                    output_formats.remove("smiles_osra")
                except ValueError:
                    pass
                try:
                    output_formats.remove("smiles_can_osra")
                except ValueError:
                    pass
                self.logger.warning(
                    "SMILES or canonical SMILES output from OSRA is requested, but OSRA's output format is \"{}\"."
                    .format(osra_output_format))

        # input file type check
        possible_input_types = ["pdf", "image"]
        if not input_type:
            input_type = get_input_file_type(input_file)
            if input_type not in possible_input_types:
                use_gm = False
                self.logger.warning(
                    "Input file MIME type ('{}') is not one of {}. You can specify 'input_type' directly (see docstring)."
                    .format(input_type, possible_input_types))
        elif input_type not in possible_input_types:
            raise ValueError("Possible 'input_type' values are {}".format(
                possible_input_types))

        #options = ChainMap({k: v for k, v in {"images_prefix": images_prefix}.items() if v},
        #                   options_internal)

        if annotate:
            if not chemspider_token:
                self.logger.warning(
                    "Cannot perform annotation in ChemSpider: 'chemspider_token' is empty."
                )
            [
                output_formats.append(x)
                for x in ["smiles", "inchi", "inchikey"]
                if x not in output_formats
            ]
            output_formats = sorted(output_formats)

        commands, _, _ = self.build_commands(options_internal,
                                             self._OPTIONS_REAL,
                                             self.path_to_binary)
        commands.extend(
            ["--bond", "--coordinates", "--page", "--guess", "--print"])

        if dry_run:
            return " ".join(commands)

        osra_output_list = []
        if input_type == "image" or not use_gm:
            osra_output_list.append(
                self._process(input_file,
                              commands,
                              page=custom_page if custom_page else 1))
        elif input_type == "pdf":
            with tempfile.TemporaryDirectory() as temp_dir:
                stdout, stderr, exit_code = pdf_to_images(input_file,
                                                          temp_dir,
                                                          dpi=gm_dpi,
                                                          trim=gm_trim)
                osra_output_list = Parallel(n_jobs=n_jobs)(
                    delayed(self._process)(
                        temp_image_file, commands, page=page)
                    for temp_image_file, page in get_temp_images(temp_dir))

        # summarize OSRA results
        to_return = {
            "stdout": [],
            "stderr": [],
            "exit_code": [],
            "content": None,
            "pages": []
        }
        for result in osra_output_list:
            if result["stdout"]:
                to_return["stdout"].append(result["stdout"])
                to_return["stderr"].append(result["stderr"])
                to_return["exit_code"].append(result["exit_code"])
                to_return["pages"].append(result["page"])

        if not continue_on_failure:
            errors = [(page + 1, error)
                      for page, (exit_code, error) in enumerate(
                          zip(to_return["exit_code"], to_return["stderr"]))
                      if exit_code > 0]
            if errors:
                self.logger.warning("OSRA errors:")
                for page, error in errors:
                    eprint("\tError on page {}:".format(page))
                    eprint("\n\t\t".join("\n{}".format(error).splitlines()))
                return to_return

        if not format_output:
            if output_file:
                with open(output_file, mode="w", encoding="utf-8") as f:
                    f.write("\n".join(to_return["stdout"]))
            return to_return

        output_cols = OrderedDict([("bond_length", 1), ("resolution", 2),
                                   ("confidence", 3), ("page", 4),
                                   ("coordinates", 5)])

        if osra_output_format in osra_smiles_outputs:
            compound_template_dict = OrderedDict.fromkeys(
                output_formats + list(output_cols.keys()))
        else:
            compound_template_dict = OrderedDict.fromkeys(["page"] +
                                                          output_formats)

        if any(to_return["stdout"]):
            if standardize_mols:
                standardizer = Standardizer()

            compounds = []

            if is_output_sdf:
                if sdf_append:
                    if not os.path.isfile(output_file_sdf):
                        open(output_file_sdf, mode="w",
                             encoding="utf-8").close()
                    writer = SDWriter(
                        open(output_file_sdf, mode="a", encoding="utf-8"))
                else:
                    writer = SDWriter(output_file_sdf)

            for output, page in zip(to_return["stdout"], to_return["pages"]):
                if osra_output_format in osra_smiles_outputs:
                    lines = [x.strip() for x in output.split("\n") if x]
                else:
                    lines = [x for x in output.split("$$$$") if x.strip()]

                for line in lines:
                    """
                    # so much problems with --learn
                    # we can't simply split output by " " when --learn is present, because its output is like "1,2,2,2 1"
                    if "learn" in filtered_cols:
                        learn_start = filtered_cols.index("learn") + 1 #  "smiles" col isn't in output_cols
                        learn_end = filtered_cols.index("learn") + 1 + 3
                        line[learn_start:learn_end] = [" ".join(line[learn_start:learn_end])]
                    """

                    if not line:
                        continue

                    if osra_output_format in osra_smiles_outputs:
                        line = [x.strip() for x in line.split()]
                        if custom_page:
                            line[output_cols["page"]] = custom_page
                        elif use_gm:
                            line[output_cols["page"]] = page
                        mol = MolFromSmiles(
                            line[0],
                            sanitize=False if standardize_mols else True)
                    elif osra_output_format == "sdf":
                        line = "\n" + line.strip()
                        mol = MolFromMolBlock(
                            line,
                            strictParsing=False,
                            sanitize=False if standardize_mols else True,
                            removeHs=False if standardize_mols else True)

                    if mol:
                        compound = compound_template_dict.copy()

                        if standardize_mols:
                            try:
                                mol = standardizer.standardize(mol)
                            except ValueError as e:
                                self.logger.warning(
                                    "Cannot standardize '{}': {}".format(
                                        MolToSmiles(mol), str(e)))

                        for f in output_formats:
                            if f == "smiles":
                                compound["smiles"] = MolToSmiles(
                                    mol, isomericSmiles=True)
                            elif f == "smiles_osra" and osra_output_format == "smi":
                                compound["smiles_osra"] = line[0]
                            elif f == "smiles_can_osra" and osra_output_format == "can":
                                compound["smiles_can_osra"] = line[0]
                            elif f == "inchi":
                                inchi = MolToInchi(mol)
                                if inchi:
                                    compound["inchi"] = inchi
                                else:
                                    compound["inchi"] = ""
                                    self.logger.warning(
                                        "Cannot convert to InChI: {}".format(
                                            MolToSmiles(mol)))
                            elif f == "inchikey":
                                inchi = MolToInchi(mol)
                                if inchi:
                                    compound["inchikey"] = InchiToInchiKey(
                                        inchi)
                                else:
                                    compound["inchikey"] = ""
                                    self.logger.warning(
                                        "Cannot create InChI-key from InChI: {}"
                                        .format(MolToSmiles(mol)))
                            elif f == "sdf":
                                compound["sdf"] = MolToMolBlock(
                                    mol, includeStereo=True)
                            elif f == "sdf_osra":
                                compound["sdf_osra"] = line

                        if is_output_sdf:
                            writer.write(mol)

                        if osra_output_format in osra_smiles_outputs:
                            compound.update([(x[0], x[1]) for x in zip(
                                list(output_cols.keys()), line[1:])])
                        else:
                            compound[
                                "page"] = page if use_gm else custom_page if custom_page else 1

                        compounds.append(compound)
                    else:
                        self.logger.warning("Cannot convert to RDKit mol: " +
                                            line[0])

            if is_output_sdf_osra:
                with open(output_file_sdf + "-osra.sdf",
                          mode="w",
                          encoding="utf-8") as f:
                    f.write("".join(to_return["stdout"]))

            to_return["content"] = sorted(compounds, key=lambda x: x["page"])

            if annotate:
                chemspider = ChemSpider(
                    chemspider_token) if chemspider_token else None

                for i, ent in enumerate(to_return["content"]):
                    self.logger.info("Annotating entity {}/{}...".format(
                        i + 1, len(to_return["content"])))
                    ent.update(
                        OrderedDict([("pch_cids_by_inchikey", ""),
                                     ("chs_cids_by_inchikey", ""),
                                     ("pch_cids_by_smiles", ""),
                                     ("chs_cids_by_smiles", ""),
                                     ("pch_cids_by_inchi", ""),
                                     ("chs_cids_by_inchi", ""),
                                     ("pch_iupac_name", ""),
                                     ("chs_common_name", ""),
                                     ("pch_synonyms", "")]))

                    results = []

                    # prefer InChI key
                    if "inchikey" in ent and ent["inchikey"]:
                        try:
                            results = get_compounds(ent["inchikey"],
                                                    "inchikey")
                            if results:
                                if len(results) == 1:
                                    result = results[0]
                                    synonyms = result.synonyms
                                    if synonyms:
                                        ent["pch_synonyms"] = "\"{}\"".format(
                                            "\",\"".join(synonyms))
                                    ent["pch_iupac_name"] = result.iupac_name
                                ent["pch_cids_by_inchikey"] = "\"{}\"".format(
                                    ",".join([str(c.cid) for c in results]))
                        except (BadRequestError, NotFoundError,
                                PubChemHTTPError, ResponseParseError,
                                ServerError, TimeoutError, PubChemPyError):
                            pass

                        results = chemspider.search(
                            ent["inchikey"]) if chemspider_token else []
                        if results:
                            if len(results) == 1:
                                result = results[0]
                                ent["chs_common_name"] = result.common_name
                            ent["chs_cids_by_inchikey"] = "\"{}\"".format(
                                ",".join([str(c.csid) for c in results]))
                    else:
                        for search_field, col_pch, col_chs in [
                            ("smiles", "pch_cids_by_smiles",
                             "chs_cids_by_smiles"),
                            ("inchi", "pch_cids_by_inchi", "chs_cids_by_inchi")
                        ]:
                            results_pch = []
                            results_chs = []

                            if search_field == "smiles" and "smiles" in ent and ent[
                                    "smiles"] and "*" not in ent["smiles"]:
                                try:
                                    results_pch = get_compounds(
                                        ent["smiles"], "smiles")
                                except (BadRequestError, NotFoundError,
                                        PubChemHTTPError, ResponseParseError,
                                        ServerError, TimeoutError,
                                        PubChemPyError):
                                    pass
                                results_chs = chemspider.search(
                                    ent["smiles"]) if chemspider_token else []
                            elif search_field == "inchi" and "inchi" in ent and ent[
                                    "inchi"]:
                                try:
                                    results_pch = get_compounds(
                                        ent["inchi"], "inchi")
                                except (BadRequestError, NotFoundError,
                                        PubChemHTTPError, ResponseParseError,
                                        ServerError, TimeoutError,
                                        PubChemPyError):
                                    pass
                                results_chs = chemspider.search(
                                    ent["inchi"]) if chemspider_token else []

                            if results_pch:
                                ent[col_pch] = "\"{}\"".format(",".join(
                                    [str(c.cid) for c in results_pch]))
                            if results_chs:
                                ent[col_chs] = "\"{}\"".format(",".join(
                                    [str(c.csid) for c in results_chs]))

                            sleep(0.5)

            if output_file:
                dict_to_csv(to_return["content"],
                            output_file=output_file,
                            csv_delimiter=csv_delimiter,
                            write_header=write_header)

            if is_output_sdf:
                writer.close()
        elif not any(to_return["stdout"]) and output_file:
            write_empty_file(output_file,
                             csv_delimiter=csv_delimiter,
                             header=list(compound_template_dict.keys()),
                             write_header=write_header)

        return to_return
    def __init__(self,
                 dcompound,
                 logfile,
                 writecheck=1,
                 kSMILES="CANONICAL_SMILES",
                 kID="CMPD_CHEMBLID"):
        """Standardize the compound's SMILES, strip salts and load the molecule.

        Outcome is reported through attributes rather than exceptions:
        ``self.log`` is set to ``"OK"`` on success and ``"ERROR"`` on every
        early-return path; ``self.mol`` is only assigned on success.

        Parameters
        ----------
        dcompound : dict
            Compound record. Must contain ``kID`` and either ``kSMILES`` or an
            ``"sdf"`` entry from which a SMILES is derived. The record is kept
            as ``self.compound`` and its ``kSMILES`` entry is overwritten with
            the cleaned SMILES.
        logfile : file-like
            Object with a ``write`` method; receives one tab-separated line
            for each rejected compound.
        writecheck : int
            When equal to 1, the cleaned SMILES (and the SDF block, if
            present) are written to ``pathFolder.PR_COMPOUNDS + <id>``
            with ``.smi`` / ``.sdf`` extensions.
        kSMILES : str
            Key of the SMILES entry in ``dcompound``.
        kID : str
            Key of the compound identifier in ``dcompound``.
        """
        self.compound = dcompound
        loader = pydrug.PyDrug()

        # No SMILES supplied: derive one from the SDF block.
        if kSMILES not in dcompound:
            try:
                smile = runExternalSoft.babelConvertSDFtoSMILE(
                    dcompound["sdf"])
                self.compound[kSMILES] = smile
            except Exception:
                print("ERROR INPUT SDF - l33")
                self.log = "ERROR"
                try:
                    logfile.write(self.compound[kID] +
                                  "\t---\tERROR-SDF ORIGINAL INPUT\n")
                except Exception:
                    # Logging is best effort here: the compound is rejected
                    # whether or not the log line could be written.
                    pass
                return

        # Standardize SMILES code
        try:
            smilestandadized = standardize_smiles(self.compound[kSMILES])
        except Exception:
            logfile.write(self.compound[kID] + "\t" +
                          str(self.compound[kSMILES]) + "\tERROR-SMILES INPUT"
                          "\n")
            self.log = "ERROR"
            return

        # Standardize using molvs (http://molvs.readthedocs.io/en/latest/api.html#molvs-fragment)
        s = Standardizer()
        mol = Chem.MolFromSmiles(smilestandadized)
        molstandardized = s.standardize(mol)
        smilestandadized = Chem.MolToSmiles(molstandardized)

        # Remove salts
        # 1. default salt definitions
        remover = SaltRemover()
        mol = Chem.MolFromSmiles(smilestandadized)
        molcleandefault = remover(mol)
        # 2. project-specific salt definitions
        homeremover = SaltRemover(defnData=LSALT)
        molclean = homeremover(molcleandefault)
        smilesclean = Chem.MolToSmiles(molclean)
        # 3. drop manually listed salts/fragments on the SMILES level; if
        #    exactly one distinct component is left, keep it
        lelem = smilesclean.split(".")
        if len(lelem) > 1:
            # collapse duplicates (several copies of the same salt/compound)
            lelem = list(set(lelem))
            for smilesdel in LSMILESREMOVE:
                if smilesdel in lelem:
                    lelem.remove(smilesdel)
            # an empty component comes from a malformed SMILES ("a..b")
            if "" in lelem:
                lelem.remove("")
            if len(lelem) == 1:
                smilesclean = str(lelem[0])
            else:
                # 4. several distinct fragments (or none) remain: reject and
                #    record in the log file for manual control
                logfile.write(self.compound[kID] + "\t" +
                              str(self.compound[kSMILES]) +
                              "\tFRAGMENT IN INPUT"
                              "\n")
                print("%s  - FRAGMENTS - l66" % (".".join(lelem),))
                self.log = "ERROR"
                return

        print("%s SMILES IN - l25 liganddescriptors" %
              (self.compound[kSMILES],))
        print("%s SMILES without salt and standardized" % (smilesclean,))

        # Case where the input contained only salts
        if smilesclean == "":
            logfile.write(self.compound[kID] + "\t" +
                          str(self.compound[kSMILES]) + "\tEMPTY SMILES AFTER "
                          "STANDARDIZATION\n")
            print("EMPTY SMILES AFTER STANDARDIZATION - l84")
            self.log = "ERROR"
            return

        self.compound[kSMILES] = smilesclean
        self.log = "OK"

        if writecheck == 1:
            # SMILES code
            pfileSMILES = pathFolder.PR_COMPOUNDS + str(
                dcompound[kID]) + ".smi"
            with open(pfileSMILES, "w") as fileSMILES:
                fileSMILES.write(self.compound[kSMILES])

            # SDF input
            if "sdf" in self.compound:
                pfileSDF = pathFolder.PR_COMPOUNDS + str(
                    dcompound[kID]) + ".sdf"
                with open(pfileSDF, "w") as fileSDF:
                    fileSDF.write(self.compound["sdf"])

        # read mol
        self.mol = loader.ReadMolFromSmile(self.compound[kSMILES])
Beispiel #14
0
def read_mols(mode, method, basename, datadir='Default', modeldir='Default'):
    """Load a pickled model, its auxiliary data and the molecules to run it on.

    Parameters
    ----------
    mode : str
        If it starts with ``'class'``, activities are read from
        ``<basename>.act`` when that file exists. If it starts with ``'reg'``
        and ``method`` is ``'xgb'``, the pickled significant bits are loaded.
    method : str
        Only compared against ``'xgb'`` (see ``mode``).
    basename : str
        Stem of the input files ``<basename>.smi`` and ``<basename>.act``
        inside ``datadir``.
    datadir : str
        Directory with the input files; ``'Default'`` means ``./data``.
    modeldir : str
        Directory with the pickled model files; ``'Default'`` means
        ``./models``.

    Returns
    -------
    dict
        Keys ``molnames``, ``molecules`` (standardized), ``model``, ``inds``,
        ``ad_fps`` and ``ad_radius``; plus ``sigbits`` (regression with xgb)
        or ``activities`` (classification with an activity file).

    Raises
    ------
    SystemExit
        With code 2 if an explicitly given ``datadir`` or ``modeldir`` is not
        a directory.
    IndexError
        If an input line has too few columns, or the activity file has fewer
        entries than the molecule file.
    """
    currworkdir = os.getcwd()
    if datadir == 'Default':
        datadir = os.path.join(currworkdir, 'data')
    elif not os.path.isdir(datadir):
        print("error: ", datadir, " is not a directory. exiting.")
        # Raised directly: the `exit` builtin is only installed by `site`.
        raise SystemExit(2)

    if modeldir == 'Default':
        modeldir = os.path.join(currworkdir, 'models')
    elif not os.path.isdir(modeldir):
        print("error: ", modeldir, " is not a directory. exiting.")
        raise SystemExit(2)
    else:
        print('setting modeldir to ', modeldir, '.')
        print('Have you set the random splits to be correct for the model?')

    moldatafile = os.path.join(datadir, basename + '.smi')
    actdatafile = os.path.join(datadir, basename + '.act')

    # NOTE(review): `output_ext` is never assigned in this function. The only
    # definition that used to be here was commented out
    # ("%s_%s_%d_%d" % (mode, method, int(rand_split), int(rand_state))) and
    # refers to names that are not parameters. Unless it exists as a
    # module-level name, the next line raises NameError -- TODO confirm.
    model_filename = "model_%s.dat" % output_ext
    index_filename = "indices_%s.dat" % output_ext
    appdom_fp_filename = "training-FPs_%s.dat" % output_ext
    appdom_rad_filename = "AD-radius_%s.dat" % output_ext

    def _unpickle(filename):
        # SECURITY: pickle.load can execute arbitrary code; the files in
        # `modeldir` must come from a trusted source.
        with open(os.path.join(modeldir, filename), 'rb') as fh:
            return pickle.load(fh)

    # None means "no activity data"; otherwise a list of
    # (activity, molecule name) tuples.
    activities = None

    if mode.startswith('class'):
        if os.path.isfile(actdatafile):
            activities = []
            with open(actdatafile) as actfh:
                for actline in actfh:
                    fields = actline.split()
                    activities.append((float(fields[1]), fields[0]))
    elif mode.startswith('reg') and method == 'xgb':
        significant_bits = _unpickle("sigbits_%s.dat" % output_ext)

    loaded_model = _unpickle(model_filename)
    indexes = _unpickle(index_filename)
    appdom_fps = _unpickle(appdom_fp_filename)
    appdom_radius = _unpickle(appdom_rad_filename)

    # Read in molecules: list of (molecule, molecule name) tuples
    molecules = []
    with open(moldatafile) as molfh:
        for molline in molfh:
            fields = molline.split()
            molecules.append((Chem.MolFromSmiles(fields[0]), fields[1]))

    molnames_train = [name for _, name in molecules]

    if activities is not None:
        # Activities are paired with molecules purely by position; the names
        # in the two files are not compared.
        if len(activities) < len(molecules):
            raise IndexError(
                "activity file has fewer entries (%d) than molecule file (%d)"
                % (len(activities), len(molecules)))
        acts_train = [act for act, _ in activities[:len(molecules)]]

    # Standardize structures
    s = Standardizer()
    standard_mols_train = [s.standardize(mol) for mol, _ in molecules]

    return_dict = {}

    return_dict['molnames'] = molnames_train
    return_dict['molecules'] = standard_mols_train
    return_dict['model'] = loaded_model
    return_dict['inds'] = indexes
    if mode.startswith('reg') and method == 'xgb':
        return_dict['sigbits'] = significant_bits
    elif mode.startswith('class') and activities is not None:
        return_dict['activities'] = acts_train
    return_dict['ad_fps'] = appdom_fps
    return_dict['ad_radius'] = appdom_radius

    return return_dict
Beispiel #15
0
def standardize_main(args):
    """Read a molecule, standardize it and write the result.

    ``args`` is handed unchanged to ``_read_mol`` and ``_write_mol``; the
    standardization itself is done by a fresh ``Standardizer`` instance.
    """
    raw_mol = _read_mol(args)
    standardized = Standardizer().standardize(raw_mol)
    _write_mol(standardized, args)
Beispiel #16
0
    def process(self,
                input: Union[str, list] = "",
                input_file: str = "",
                output_file: str = "",
                output_file_sdf: str = "",
                output_file_cml: str = "",
                sdf_append: bool = False,
                format_output: bool = True,
                opsin_output_format: str = "",
                output_formats: list = None,
                write_header: bool = True,
                dry_run: bool = False,
                csv_delimiter: str = ";",
                standardize_mols: bool = True,
                normalize_plurals: bool = True,
                continue_on_failure: bool = False) -> OrderedDict:
        r"""
        Process the input file with OPSIN.

        Parameters
        ----------
        input : str or list
            | str: String with IUPAC names, one per line.
            | list: List of IUPAC names.
        input_file : str
            Path to file to be processed by OPSIN. One IUPAC name per line.
            Ignored (with a warning) when `input` is also set.
        output_file : str
            File to write output in.
        output_file_sdf : str
            File to write SDF output in.
        output_file_cml : str
            | File to write CML (Chemical Markup Language) output in. `opsin_output_format` must be "cml".
            | Not supported by RDKit so standardization and conversion to other formats cannot be done.
        sdf_append : bool
            If True, append new molecules to existing SDF file or create new one if doesn't exist.
        format_output : bool
            | If True, the value of "content" key of returned dict will be list of OrderedDicts with keys:
            | "iupac", <output formats>, ..., "error"
            | If True and `output_file` is set it will be created as CSV file with columns: "iupac", <output formats>, ..., "error"
            | If False, the value of "content" key of returned dict will be None.
        opsin_output_format : str
            | Output format from OPSIN. Temporarily overrides the option `output_format` set during instantiation (in __init__).
            | Choices: "cml", "smi", "extendedsmi", "inchi", "stdinchi", "stdinchikey"
        output_formats : list
            | If `format_output` is True, this specifies which molecule formats will be output.
            | You can specify more than one format, but only one format from OPSIN. This format must be also set with `output_format` in __init__
              or with `opsin_output_format` here.
            | Default value: ["smiles"]

            +-----------------------+-----------------------+--------------------------------------------------------------------------------------------+
            |         Value         |         Source        |                                            Note                                            |
            +=======================+=======================+============================================================================================+
            |         smiles        |         RDKit         |                                          canonical                                         |
            +-----------------------+-----------------------+--------------------------------------------------------------------------------------------+
            |      smiles_opsin     |     OPSIN ("smi")     |                                           SMILES                                           |
            +-----------------------+-----------------------+--------------------------------------------------------------------------------------------+
            | smiles_extended_opsin | OPSIN ("extendedsmi") |                          Extended SMILES. Not supported by RDKit.                          |
            +-----------------------+-----------------------+--------------------------------------------------------------------------------------------+
            |         inchi         |         RDKit         | Not every molecule can be converted to InChI (it doesn`t support wildcard characters etc.) |
            +-----------------------+-----------------------+--------------------------------------------------------------------------------------------+
            |      inchi_opsin      |    OPSIN ("inchi")    |                                            InChI                                           |
            +-----------------------+-----------------------+--------------------------------------------------------------------------------------------+
            |     stdinchi_opsin    |   OPSIN ("stdinchi")  |                                       standard InChI                                       |
            +-----------------------+-----------------------+--------------------------------------------------------------------------------------------+
            |        inchikey       |         RDKit         |      The same applies as for "inchi". Also molecule cannot be created from InChI-key.      |
            +-----------------------+-----------------------+--------------------------------------------------------------------------------------------+
            |   stdinchikey_opsin   | OPSIN ("stdinchikey") |               Standard InChI-key. Cannot be used by RDKit to create molecule.              |
            +-----------------------+-----------------------+--------------------------------------------------------------------------------------------+
            |          sdf          |         RDKit         |                     If present, an additional SDF file will be created.                    |
            +-----------------------+-----------------------+--------------------------------------------------------------------------------------------+

        write_header : bool
            If True and if `output_file` is set and `format_output` is True, write a CSV header.
        dry_run : bool
            If True, OPSIN is not executed; only the command that would be run is returned
            as a single space-joined string.
        csv_delimiter : str
            Delimiter for output CSV file.
        standardize_mols : bool
            If True and `format_output` is also True, use molvs (https://github.com/mcs07/MolVS) to standardize molecules.
        normalize_plurals : bool
            | If True, normalize plurals ("nitrates" -> "nitrate"). See OPSIN.PLURAL_PATTERNS for relating plurals. You can
              set your own regex pattern with `plural_patterns` in __init__.
        continue_on_failure : bool
            | If True, continue running even if OPSIN returns non-zero exit code.
            | If False and error occurs, print it and return.

        Returns
        -------
        dict
            Keys:

            - stdout: str ... standard output from OPSIN
            - stderr: str ... standard error output from OPSIN
            - exit_code: int ... exit code from OPSIN
            - content:

              - list of OrderedDicts ... when format_output is True. Fields: "iupac", <output formats>, ..., "error"
              - None ... when format_output is False
        str
            The command line, when `dry_run` is True.

        Raises
        ------
        ValueError
            If neither `input` nor `input_file` is set, or the OPSIN output format is unknown.
        UserWarning
            If the input is empty after plural normalization.
        """

        options_internal = self.options_internal.copy()
        opsin_nonreadable_formats = ["cml", "stdinchikey"]

        if input and input_file:
            input_file = ""
            self.logger.warning(
                "Both 'input' and 'input_file' are set, but 'input' will be prefered."
            )
        elif not input and not input_file:
            raise ValueError("One of 'input' or 'input_file' must be set.")

        # OPSIN output format check
        if opsin_output_format:
            options_internal["output_format"] = opsin_output_format
        else:
            opsin_output_format = options_internal["output_format"]

        opsin_valid_output_formats = {
            "cml": "cml_opsin",
            "smi": "smiles_opsin",
            "extendedsmi": "smiles_extended_opsin",
            "inchi": "inchi_opsin",
            "stdinchi": "stdinchi_opsin",
            "stdinchikey": "stdinchikey_opsin"
        }

        if opsin_output_format not in opsin_valid_output_formats:
            raise ValueError(
                "Unknown OPSIN output format. Possible values: {}".format(
                    list(opsin_valid_output_formats.keys())))

        if standardize_mols and opsin_output_format in opsin_nonreadable_formats:
            self.logger.warning(
                "OPSIN output format is \"{}\", which cannot be used by RDKit."
                .format(opsin_output_format))

        # output formats check
        if not output_formats:
            output_formats = ["smiles"]
        else:
            if opsin_output_format == "stdinchikey":
                output_formats = ["stdinchikey_opsin"]
            elif opsin_output_format == "extendedsmi":
                output_formats = ["smiles_extended_opsin"]
            else:
                output_formats = sorted(list(set(output_formats)))
                possible_output_formats = [
                    "smiles", "inchi", "inchikey", "sdf"
                ]
                output_formats = [
                    x for x in output_formats if x in possible_output_formats
                    or x == opsin_valid_output_formats[opsin_output_format]
                ]

        if normalize_plurals:
            # Normalization works on text, so a file input is read here and
            # from now on handled like a string input.
            if input_file:
                with open(input_file, mode="r", encoding="utf-8") as f:
                    input = "\n".join([x.strip() for x in f.readlines()])
                input_file = ""
            input = self.normalize_iupac(input)

        commands, _, _ = self.build_commands(options_internal,
                                             self._OPTIONS_REAL,
                                             self.path_to_binary)

        if input_file:
            # The file path (not the empty `input`) is the OPSIN argument.
            commands.append(input_file)
        elif input:
            if isinstance(input, list):
                input = "\n".join([x.strip() for x in input])
        else:
            raise UserWarning("Input is empty.")

        # A dry run must return before anything is executed.
        if dry_run:
            return " ".join(commands)

        if input_file:
            stdout, stderr, exit_code = common_subprocess(commands)
        else:
            stdout, stderr, exit_code = common_subprocess(commands,
                                                          stdin=input)

        to_return = {
            "stdout": stdout,
            "stderr": stderr,
            "exit_code": exit_code,
            "content": None
        }

        if not continue_on_failure and exit_code > 0:
            self.logger.warning("OPSIN error:")
            eprint("\n\t".join("\n{}".format(stderr).splitlines()))
            return to_return

        if output_file_cml and opsin_output_format == "cml":
            with open(output_file_cml, mode="w", encoding="utf-8") as f:
                f.write(stdout)
            return to_return
        elif output_file_cml and opsin_output_format != "cml":
            self.logger.warning(
                "Output file for CML is requested, but OPSIN output format is '{}'"
                .format(opsin_output_format))

        if not format_output:
            if output_file:
                with open(output_file, mode="w", encoding="utf-8") as f:
                    f.write(stdout)
            return to_return

        compounds = []
        standardizer = Standardizer()
        empty_cols = OrderedDict([(x, "") for x in output_formats])

        stdout_lines = stdout.split("\n")
        del stdout_lines[-1]
        # The first line of stderr is dropped because it holds an OPSIN
        # message, not a per-name error.
        stderr_lines = [x.strip() for x in stderr.split("\n")[1:] if x]

        if input_file:
            with open(input_file, mode="r", encoding="utf-8") as f:
                lines = f.readlines()
        else:
            lines = input.split("\n")

        mol_output_template = OrderedDict.fromkeys(["iupac"] + output_formats +
                                                   ["error"])

        writer = None
        sdf_handle = None
        if output_file_sdf:
            if sdf_append:
                # Mode "a" creates the file when it does not exist yet.
                sdf_handle = open(output_file_sdf, mode="a", encoding="utf-8")
                writer = SDWriter(sdf_handle)
            else:
                writer = SDWriter(output_file_sdf)

        # Index of the next unread error message in `stderr_lines`; advanced
        # once for every name OPSIN produced no output for.
        error_idx = 0
        try:
            for i, line in enumerate(lines):
                line = line.strip()
                # Input may have more lines than OPSIN printed (e.g. a
                # trailing newline); such lines count as not converted.
                converted = stdout_lines[i].strip() if i < len(
                    stdout_lines) else ""
                mol_output = mol_output_template.copy()

                if not converted:
                    error = stderr_lines[error_idx] if error_idx < len(
                        stderr_lines) else ""
                    mol_output.update([("iupac", line), ("error", error)])
                    mol_output.update(empty_cols)
                    error_idx += 1
                    compounds.append(mol_output)
                    continue

                # These two formats cannot be read by RDKit; pass them through.
                if opsin_output_format == "stdinchikey":
                    compounds.append(
                        OrderedDict([("iupac", line),
                                     ("stdinchikey_opsin", converted),
                                     ("error", "")]))
                    continue
                if opsin_output_format == "extendedsmi":
                    compounds.append(
                        OrderedDict([("iupac", line),
                                     ("smiles_extended_opsin", converted),
                                     ("error", "")]))
                    continue

                # Reset per name so an unreadable format (e.g. "cml") or a
                # failed parse never reuses the previous molecule.
                mol = None
                if opsin_output_format == "smi":
                    mol = MolFromSmiles(converted,
                                        sanitize=not standardize_mols)
                elif opsin_output_format in ["inchi", "stdinchi"]:
                    mol = MolFromInchi(converted,
                                       sanitize=not standardize_mols,
                                       removeHs=not standardize_mols)

                if not mol:
                    error = "Cannot convert to RDKit mol: {}".format(converted)
                    mol_output.update([("iupac", line), ("error", error)])
                    mol_output.update(empty_cols)
                    self.logger.warning(error)
                    compounds.append(mol_output)
                    continue

                if standardize_mols:
                    try:
                        mol = standardizer.standardize(mol)
                    except ValueError as err:
                        # Keep going with the unstandardized molecule.
                        self.logger.warning(
                            "Cannot standardize '{}': {}".format(
                                MolToSmiles(mol), str(err)))

                for fmt in output_formats:
                    if fmt == "smiles":
                        mol_output["smiles"] = MolToSmiles(
                            mol, isomericSmiles=True)
                    elif fmt == "smiles_opsin" and opsin_output_format == "smi":
                        mol_output["smiles_opsin"] = converted
                    elif fmt == "inchi":
                        inchi = MolToInchi(mol)
                        if inchi:
                            mol_output["inchi"] = inchi
                        else:
                            mol_output["inchi"] = ""
                            self.logger.warning(
                                "Cannot convert to InChI: {}".format(
                                    converted))
                    elif fmt == "inchi_opsin" and opsin_output_format == "inchi":
                        mol_output["inchi_opsin"] = converted
                    elif fmt == "stdinchi_opsin" and opsin_output_format == "stdinchi":
                        mol_output["stdinchi_opsin"] = converted
                    elif fmt == "inchikey":
                        inchi = MolToInchi(mol)
                        if inchi:
                            mol_output["inchikey"] = InchiToInchiKey(inchi)
                        else:
                            mol_output["inchikey"] = ""
                            self.logger.warning(
                                "Cannot create InChI-key from InChI: {}".
                                format(converted))
                    elif fmt == "stdinchikey_opsin" and opsin_output_format == "stdinchikey":
                        mol_output["stdinchikey_opsin"] = converted
                    elif fmt == "sdf":
                        mol_output["sdf"] = MolToMolBlock(
                            mol, includeStereo=True)

                if writer is not None:
                    writer.write(mol)

                mol_output.update(
                    OrderedDict([("iupac", line), ("error", "")]))
                compounds.append(mol_output)
        finally:
            # Close the SDF writer (and the append-mode handle it wraps) even
            # when a conversion raises, so written molecules are not lost.
            if writer is not None:
                writer.close()
            if sdf_handle is not None:
                sdf_handle.close()

        to_return["content"] = compounds

        if output_file and compounds:
            dict_to_csv(to_return["content"],
                        output_file=output_file,
                        csv_delimiter=csv_delimiter,
                        write_header=write_header)
        elif output_file and not compounds:
            write_empty_file(output_file,
                             csv_delimiter=csv_delimiter,
                             header=list(mol_output_template.keys()),
                             write_header=write_header)

        return to_return