def process_test_32():
    """Convert the 32-compound test set CSV into two SMILES files.

    Lines starting with "#" are skipped. Every other line is split on
    commas; column 0 is passed to ``canonicalize_smiles`` and column 5 is
    parsed as a float.

    Column layout of the raw file, as noted by the original author:

        SMILES, Interlab.SD, Num.Lit.Sources, Experimental.MP.(*C),
        log.Poct-water.calc.in.RDKit, log.S0.calc.by.GSE

    Two files are written: one holding only the canonical SMILES, and one
    holding "SMILES,value" pairs.
    """
    src = f"{TEST_PATH}/set_32.csv"
    dst_smiles = f"{TEST_PATH}/test_32.smi"
    dst_gse = f"{TEST_PATH}/test_32.with.gse.smi"
    logging.info(f"Processing {src}")

    # Read the whole input first; outputs are only opened afterwards.
    records = []
    with open(src, 'r') as handle:
        for row in handle:
            if row.startswith("#"):
                continue
            fields = row.rstrip('\n').split(',')
            records.append((canonicalize_smiles(fields[0]), float(fields[5])))

    # SMILES-only output.
    with open(dst_smiles, 'w', encoding="ascii") as out:
        out.writelines(f"{smi}\n" for smi, _ in records)

    # SMILES plus the value read from column 5.
    with open(dst_gse, 'w', encoding="ascii") as out:
        out.writelines(f"{smi},{value}\n" for smi, value in records)

    logging.info(f"Saved {dst_smiles}")
def process_POG_2007_JCIM_train():
    """Convert the POG.2007.JCIM training set into a "SMILES,logS" file.

    Lines starting with "#" are skipped. Every other line is split on ";";
    field 1 (with surrounding double quotes stripped) is canonicalized and
    field 4 is parsed as a float. Lines that cannot be converted are
    logged and skipped so a single bad record does not abort the run.
    """
    fname = f"{DATA_PATH}/POG.2007.JCIM.train.txt"
    fout = f"{PROCESSED_PATH}/POG.2007.JCIM.train.smi"
    logging.info(f"Processing {fname}")

    cmpd_list = []
    with open(fname, 'r') as fin:
        for line in fin:
            if line.startswith("#"):
                continue
            pairs = line.rstrip('\n').split(";")
            try:
                canon_smiles = canonicalize_smiles(pairs[1].strip("\""))
                logS = float(pairs[4])
            except Exception:
                # Best-effort conversion: canonicalize_smiles is opaque
                # here, so any ordinary error skips the record. Unlike a
                # bare ``except``, this lets KeyboardInterrupt/SystemExit
                # propagate.
                # A blank/short line has no field 1; report the raw line
                # instead of raising a second IndexError in the handler.
                smiles = pairs[1] if len(pairs) > 1 else line.rstrip('\n')
                logging.info(f"Failed to process {smiles}")
                continue
            cmpd_list.append((canon_smiles, logS))

    with open(fout, 'w', encoding="ascii") as fo:
        for smiles, logS in cmpd_list:
            fo.write(f"{smiles},{logS}\n")

    logging.info(f"Saved {fout}")
def process_LGG_2008_JCIM_32():
    """Convert the LGG.2008.JCIM 32-compound set into a "SMILES,logS" file.

    Lines starting with "#" are skipped. Every other line is split on
    commas; field 0 is canonicalized and field 1 is converted to logS as
    ``log10((value / 1000) / molecular_weight)``. Compounds for which
    ``mol_wt`` raises ``TypeError`` are logged and skipped.
    """
    fname = f"{DATA_PATH}/LGG.2008.JCIM.32.txt"
    fout = f"{PROCESSED_PATH}/LGG.2008.JCIM.32.smi"
    logging.info(f"Processing {fname}")

    cmpd_list = []
    with open(fname, 'r') as fin:
        for line in fin:
            if line.startswith("#"):
                continue
            pairs = line.rstrip('\n').split(',')
            canon_smiles = canonicalize_smiles(pairs[0])
            # NOTE(review): presumably converts mg/L to g/L so that
            # dividing by the molecular weight gives mol/L -- confirm the
            # units of the raw file.
            mg = float(pairs[1]) / 1000
            try:
                mw = mol_wt(canon_smiles)
            except TypeError as e:
                logging.error(f"TypeError for {canon_smiles}: {e}")
                continue
            logS = np.log10(mg / mw)
            cmpd_list.append((canon_smiles, logS))

    with open(fout, 'w', encoding="ascii") as fo:
        for smiles, logS in cmpd_list:
            fo.write(f"{smiles},{logS}\n")

    logging.info(f"Saved {fout}")
def process_HXZ_2004_JCIC_data():
    """Convert the HXZ.2004.JCIC data set into a "SMILES,logS" file.

    Lines starting with "#" are skipped. Every other line is split on
    whitespace; field 0 is canonicalized and field 2 is parsed as a float.
    Lines that cannot be converted are logged and skipped.
    """
    fname = f"{DATA_PATH}/HXZ.2004.JCIC.data_set.txt"
    fout = f"{PROCESSED_PATH}/HXZ.2004.JCIC.data_set.smi"
    logging.info(f"Processing {fname}")

    cmpd_list = []
    with open(fname, 'r') as fin:
        for line in fin:
            if line.startswith("#"):
                continue
            pairs = line.rstrip('\n').split()
            try:
                canon_smiles = canonicalize_smiles(pairs[0])
                logS = float(pairs[2])
            except Exception:
                # Best-effort conversion: skip the record on any ordinary
                # error, but let KeyboardInterrupt/SystemExit propagate.
                # A blank line splits to an empty list, so fall back to
                # the raw line rather than indexing again in the handler.
                smiles = pairs[0] if pairs else line.rstrip('\n')
                logging.info(f"Failed to process {smiles}")
                continue
            cmpd_list.append((canon_smiles, logS))

    with open(fout, 'w', encoding="ascii") as fo:
        for smiles, logS in cmpd_list:
            fo.write(f"{smiles},{logS}\n")

    logging.info(f"Saved {fout}")
def process_H_2000_JCIC_test2():
    """Convert the H.2000.JCIC test2 set into a "SMILES,logS" file.

    The input is read as ISO-8859-1. Lines starting with "#" are skipped.
    Every other line is split on whitespace; field 6 is passed to
    ``canonicalize_smiles`` with ``SLN=True`` (presumably the field holds
    SLN notation -- confirm against the helper) and field 3 is parsed as a
    float. Lines that cannot be converted are logged and skipped.
    """
    fname = f"{DATA_PATH}/H.2000.JCIC.test2.txt"
    fout = f"{PROCESSED_PATH}/H.2000.JCIC.test2.smi"
    logging.info(f"Processing {fname}")

    cmpd_list = []
    with open(fname, 'r', encoding="ISO-8859-1") as fin:
        for line in fin:
            if line.startswith("#"):
                continue
            pairs = line.rstrip('\n').split()
            try:
                canon_smiles = canonicalize_smiles(pairs[6], SLN=True)
                logS = float(pairs[3])
            except Exception:
                # Best-effort conversion: skip the record on any ordinary
                # error, but let KeyboardInterrupt/SystemExit propagate.
                # Short lines have no field 6; report the raw line instead
                # of raising a second IndexError in the handler.
                smiles = pairs[6] if len(pairs) > 6 else line.rstrip('\n')
                logging.info(f"Failed to process {smiles}")
                continue
            cmpd_list.append((canon_smiles, logS))

    with open(fout, 'w', encoding='ascii') as fo:
        for smiles, logS in cmpd_list:
            fo.write(f"{smiles},{logS}\n")

    logging.info(f"Saved {fout}")
def process_WKH_2007_JCIM():
    """Convert the WKH.2007.JCIM solubility SDF into a "SMILES,logS" file.

    Each molecule in the SDF is converted to SMILES, canonicalized, and
    written together with the value of its 'EXPT' property. Records that
    RDKit cannot parse are logged and skipped.
    """
    fname = f"{DATA_PATH}/WKH.2007.JCIM.solubility.sdf"
    fout = f"{PROCESSED_PATH}/WKH.2007.JCIM.smi"
    logging.info(f"Processing {fname}")

    suppl = Chem.SDMolSupplier(fname)
    with open(fout, 'w', encoding='ascii') as fo:
        for idx, mol in enumerate(suppl):
            # SDMolSupplier yields None for records it fails to parse;
            # passing None on to MolToSmiles would raise and abort the run.
            if mol is None:
                logging.info(f"Failed to process record {idx} in {fname}")
                continue
            smiles = canonicalize_smiles(Chem.MolToSmiles(mol))
            logS = str(mol.GetProp('EXPT'))
            fo.write(f'{smiles},{logS}\n')

    logging.info(f"Saved to {fout}")
def process_HXZ_2004_JCIC_test():
    """Convert the HXZ.2004.JCIC test set SDF into a "SMILES,logS" file.

    Each molecule in the SDF is converted to SMILES, canonicalized, and
    written together with the value of its 'logS' property. Records that
    RDKit cannot parse are logged and skipped.
    """
    fname = f"{DATA_PATH}/HXZ.2004.JCIC.test_set1.sdf"
    fout = f"{PROCESSED_PATH}/HXZ.2004.JCIC.test.smi"
    logging.info(f"Processing {fname}")

    suppl = Chem.SDMolSupplier(fname)
    with open(fout, 'w', encoding='ascii') as fo:
        for idx, mol in enumerate(suppl):
            # SDMolSupplier yields None for records it fails to parse;
            # passing None on to MolToSmiles would raise and abort the run.
            if mol is None:
                logging.info(f"Failed to process record {idx} in {fname}")
                continue
            smiles = canonicalize_smiles(Chem.MolToSmiles(mol))
            logS = str(mol.GetProp('logS'))
            fo.write(f'{smiles},{logS}\n')

    logging.info(f"Saved to {fout}")
def process_BOM_2017_JC():
    """Convert the BOM.2017.JC text file into a "SMILES,logS" file.

    Every line of the input is split on commas; field 0 is canonicalized
    and field 1 is parsed as a float. No lines are skipped and no errors
    are caught, so a malformed line aborts the conversion.
    """
    src = f"{DATA_PATH}/BOM.2017.JC.txt"
    dst = f"{PROCESSED_PATH}/BOM.2017.JC.smi"
    logging.info(f"Processing {src}")

    records = []
    with open(src, 'r') as handle:
        for row in handle:
            fields = row.rstrip('\n').split(",")
            records.append((canonicalize_smiles(fields[0]), float(fields[1])))

    with open(dst, 'w', encoding='ascii') as out:
        out.writelines(f"{smi},{value}\n" for smi, value in records)

    logging.info(f"Saved {dst}")
def process_D_2008_JCIC():
    """Convert the D.2008.JCIC solubility file into a "SMILES,logS" file.

    Lines starting with "#" are skipped. Every other line is split on
    commas; the last field is canonicalized and the third-from-last field
    is parsed as a float.
    """
    src = f"{DATA_PATH}/D.2008.JCIC.solubility.v1.txt"
    dst = f"{PROCESSED_PATH}/D.2008.JCIC.smi"
    logging.info(f"Processing {src}")

    records = []
    with open(src, 'r') as handle:
        for row in handle:
            if row.startswith("#"):
                continue
            fields = row.rstrip('\n').split(",")
            # Fields are addressed from the end of the row.
            records.append((canonicalize_smiles(fields[-1]), float(fields[-3])))

    with open(dst, 'w', encoding='ascii') as out:
        out.writelines(f"{smi},{value}\n" for smi, value in records)

    logging.info(f"Saved {dst}")
Example #10
0
def process_ABB_2000_PR():
    """Convert the ABB.2000.PR text file into a "SMILES,logS" file.

    Lines starting with "#" are skipped. Every other line is split on
    commas; field 1 is canonicalized and field 2 is parsed as a float.
    The output file is opened with the platform default encoding.
    """
    src = f"{DATA_PATH}/ABB.2000.PR.txt"
    dst = f"{PROCESSED_PATH}/ABB.2000.PR.smi"
    logging.info(f"Processing {src}")

    records = []
    with open(src, 'r') as handle:
        for row in handle:
            if row.startswith("#"):
                continue
            fields = row.rstrip('\n').split(",")
            records.append((canonicalize_smiles(fields[1]), float(fields[2])))

    with open(dst, 'w') as out:
        out.writelines(f"{smi},{value}\n" for smi, value in records)

    logging.info(f"Saved {dst}")
Example #11
0
def process_A_2019_ADMET_DMPK():
    """Process A.2019.ADMET_DMPK data.

    Two values are given per compound in the raw file, so two output
    files are saved: column 1 goes to the ".SSF.smi" file and column 3 to
    the ".CS.smi" file. Column 0 holds a compound name, which is resolved
    to a structure through a PubChem name lookup; names with no match are
    silently omitted from both outputs.

    The first line of the input is treated as a header and skipped.
    Returns early, writing nothing, when ``pubchempy`` is not installed.
    """
    # Local import, in the same way pubchempy is imported below.
    import csv

    fname = f"{DATA_PATH}/A.2019.ADMET_DMPK.csv"
    fout_1 = f"{PROCESSED_PATH}/A.2019.ADMET_DMPK.SSF.smi"
    fout_2 = f"{PROCESSED_PATH}/A.2019.ADMET_DMPK.CS.smi"
    logging.info(f"Processing {fname}")

    try:
        import pubchempy as pcp
    except ModuleNotFoundError as e:
        logging.error(f"pubchempy is required to process {fname}: {e}")
        return

    with open(fname, 'r') as fin, \
            open(fout_1, 'w') as fout1, \
            open(fout_2, 'w') as fout2:
        fin.readline()  # skip the header row
        for line in fin:
            # Parse with csv so that quoted names containing commas keep
            # the same column positions as unquoted ones. Manual splitting
            # on '"' left a leading empty field and read the wrong columns.
            fields = next(csv.reader([line.rstrip()]), [])
            if len(fields) < 4:
                logging.info(f"Failed to process {line.rstrip()}")
                continue

            name = fields[0].replace('\"', '')
            logS0_SSF = fields[1]
            logS0_CS = fields[3]

            results = pcp.get_compounds(name, 'name')
            if results:
                # Use the first PubChem hit for the name.
                isomeric_smiles = results[0].isomeric_smiles
                canon_smiles = canonicalize_smiles(isomeric_smiles)
                fout1.write(f"{canon_smiles},{logS0_SSF}\n")
                fout2.write(f"{canon_smiles},{logS0_CS}\n")

    logging.info(f"Saved {fout_1}")
    logging.info(f"Saved {fout_2}")
Example #12
0
def process_WHX_2009_JCIM():
    """Convert the five WHX.2009.JCIM CSV sets into "SMILES,logS" files.

    For each input file, lines starting with "#" are skipped. Every other
    line is split on commas; field 1 is passed to ``canonicalize_smiles``
    with ``SLN=True`` (presumably the field holds SLN notation -- confirm
    against the helper) and field 0 is parsed as a float. Lines that
    cannot be converted are logged and skipped.
    """
    files_in = [
        "WHX.2009.JCIM.Set-001.csv", "WHX.2009.JCIM.Set-002.csv",
        "WHX.2009.JCIM.Set-003.csv", "WHX.2009.JCIM.Set-004.csv",
        "WHX.2009.JCIM.Set-005.csv"
    ]

    files_out = [
        "WHX.2009.JCIM.Set-001.smi", "WHX.2009.JCIM.Set-002.smi",
        "WHX.2009.JCIM.Set-003.smi", "WHX.2009.JCIM.Set-004.smi",
        "WHX.2009.JCIM.Set-005.smi"
    ]

    for file_in, file_out in zip(files_in, files_out):
        fname = f"{DATA_PATH}/{file_in}"
        fout = f"{PROCESSED_PATH}/{file_out}"
        logging.info(f"Processing {fname}")

        cmpd_list = []
        with open(fname, 'r') as fin:
            for line in fin:
                if line.startswith("#"):
                    continue
                pairs = line.rstrip('\n').split(",")
                try:
                    canon_smiles = canonicalize_smiles(pairs[1], SLN=True)
                    logS = float(pairs[0])
                except Exception:
                    # Best-effort conversion: skip the record on any
                    # ordinary error, but let KeyboardInterrupt/SystemExit
                    # propagate. A line without a comma has no field 1, so
                    # report the raw line instead of indexing again.
                    smiles = pairs[1] if len(pairs) > 1 else line.rstrip('\n')
                    logging.info(f"Failed to process {smiles}")
                    continue
                cmpd_list.append((canon_smiles, logS))

        with open(fout, 'w', encoding="ascii") as fo:
            for smiles, logS in cmpd_list:
                fo.write(f"{smiles},{logS}\n")

        logging.info(f"Saved {fout}")
Example #13
0
def process_LGG_2008_JCIM_100():
    """Convert the LGG.2008.JCIM 100-compound set into a "SMILES,logS" file.

    Lines starting with "#" are skipped. Every other line is split on
    commas; field 0 is canonicalized and field 2 is parsed as a float.
    No errors are caught, so a malformed line aborts the conversion.
    """
    fname = f"{DATA_PATH}/LGG.2008.JCIM.100.txt"
    fout = f"{PROCESSED_PATH}/LGG.2008.JCIM.100.smi"
    logging.info(f"Processing {fname}")

    cmpd_list = []
    with open(fname, 'r') as fin:
        for line in fin:
            if line.startswith("#"):
                continue
            pairs = line.rstrip('\n').split(',')
            canon_smiles = canonicalize_smiles(pairs[0])
            logS = float(pairs[2])
            cmpd_list.append((canon_smiles, logS))

    with open(fout, 'w', encoding="ascii") as fo:
        for smiles, logS in cmpd_list:
            fo.write(f"{smiles},{logS}\n")

    logging.info(f"Saved {fout}")
Example #14
0
def process_OCHEM():
    """Convert the OCHEM water solubility export into a "SMILES,logS" file.

    Every line of the input is split on commas; field 0 is canonicalized
    and field 1 is parsed as a float. Lines that cannot be converted (for
    example a non-numeric header row) are logged and skipped.
    """
    fname = f"{DATA_PATH}/OCHEM.Water.Solublity.05.27.2019.txt"
    fout = f"{PROCESSED_PATH}/OCHEM.Water.Solublity.05.27.2019.smi"
    logging.info(f"Processing {fname}")

    cmpd_list = []
    with open(fname, 'r') as fin:
        for line in fin:
            # str.split always returns at least one element, so pairs[0]
            # is safe to use in the error message below.
            pairs = line.rstrip('\n').split(",")
            try:
                canon_smiles = canonicalize_smiles(pairs[0])
                logS = float(pairs[1])
            except Exception:
                # Best-effort conversion: skip the record on any ordinary
                # error, but let KeyboardInterrupt/SystemExit propagate.
                smiles = pairs[0]
                logging.info(f"Failed to process {smiles}")
                continue
            cmpd_list.append((canon_smiles, logS))

    with open(fout, 'w', encoding='ascii') as fo:
        for smiles, logS in cmpd_list:
            fo.write(f"{smiles},{logS}\n")

    logging.info(f"Saved {fout}")