def process_test_32():
    """Convert ``set_32.csv`` into two SMILES files.

    The input header lists the columns as::

        SMILES, Interlab.SD, Num.Lit.Sources, Experimental.MP.(*C),
        log.Poct-water.calc.in.RDKit, log.S0.calc.by.GSE

    Only the first column (SMILES) and the sixth column
    (``log.S0.calc.by.GSE``) are read; lines starting with ``#`` are skipped.
    ``test_32.smi`` receives one canonicalized SMILES per line, and
    ``test_32.with.gse.smi`` receives ``smiles,logS`` pairs.
    """
    fname = f"{TEST_PATH}/set_32.csv"
    fout = f"{TEST_PATH}/test_32.smi"
    fout_gse = f"{TEST_PATH}/test_32.with.gse.smi"
    logging.info(f"Processing {fname}")

    # Parse everything first so no output file is touched if parsing fails.
    records = []
    with open(fname, 'r') as fin:
        for raw in fin:
            if raw.startswith("#"):
                continue
            fields = raw.rstrip('\n').split(',')
            records.append((canonicalize_smiles(fields[0]), float(fields[5])))

    # SMILES-only file.
    with open(fout, 'w', encoding="ascii") as fo:
        fo.writelines(f"{smi}\n" for smi, _ in records)

    # SMILES plus the GSE column.
    with open(fout_gse, 'w', encoding="ascii") as fo:
        fo.writelines(f"{smi},{value}\n" for smi, value in records)

    logging.info(f"Saved {fout}")
def process_POG_2007_JCIM_train():
    """Convert ``POG.2007.JCIM.train.txt`` into a ``smiles,logS`` file.

    Each line not starting with ``#`` is split on ``;``.  The second field
    (with surrounding double quotes stripped) is canonicalized as the SMILES
    and the fifth field is parsed as logS.  Lines that cannot be processed
    are logged and skipped rather than aborting the whole run.
    """
    fname = f"{DATA_PATH}/POG.2007.JCIM.train.txt"
    fout = f"{PROCESSED_PATH}/POG.2007.JCIM.train.smi"
    logging.info(f"Processing {fname}")

    cmpd_list = []
    with open(fname, 'r') as fin:
        for line in fin:
            if line.startswith("#"):
                continue
            pairs = line.rstrip('\n').split(";")
            try:
                canon_smiles = canonicalize_smiles(pairs[1].strip("\""))
                logS = float(pairs[4])
            except Exception:
                # A short or blank line has no pairs[1]; fall back to the raw
                # line so that the handler itself can never raise.
                smiles = pairs[1] if len(pairs) > 1 else line.rstrip('\n')
                logging.info(f"Failed to process {smiles}")
                continue
            cmpd_list.append((canon_smiles, logS))

    with open(fout, 'w', encoding="ascii") as fo:
        for smiles, logS in cmpd_list:
            fo.write(f"{smiles},{logS}\n")
    logging.info(f"Saved {fout}")
def process_LGG_2008_JCIM_32():
    """Convert ``LGG.2008.JCIM.32.txt`` into a ``smiles,logS`` file.

    Each line not starting with ``#`` is split on ``,``.  The first field is
    canonicalized as the SMILES.  The second field is divided by 1000 and
    then by the molecular weight returned by ``mol_wt``; logS is the base-10
    logarithm of that ratio.
    NOTE(review): presumably this converts a mass concentration to a molar
    one -- confirm the units of the raw column.

    Compounds for which ``mol_wt`` raises ``TypeError`` are logged and
    skipped.
    """
    fname = f"{DATA_PATH}/LGG.2008.JCIM.32.txt"
    fout = f"{PROCESSED_PATH}/LGG.2008.JCIM.32.smi"
    logging.info(f"Processing {fname}")

    cmpd_list = []
    with open(fname, 'r') as fin:
        for line in fin:
            if line.startswith("#"):
                continue
            pairs = line.rstrip('\n').split(',')
            canon_smiles = canonicalize_smiles(pairs[0])
            mg = float(pairs[1]) / 1000
            try:
                mw = mol_wt(canon_smiles)
            except TypeError as e:
                logging.error(f"TypeError for {canon_smiles}: {e}")
                continue
            logS = np.log10(mg / mw)
            cmpd_list.append((canon_smiles, logS))

    with open(fout, 'w', encoding="ascii") as fo:
        for smiles, logS in cmpd_list:
            fo.write(f"{smiles},{logS}\n")
    logging.info(f"Saved {fout}")
def process_HXZ_2004_JCIC_data():
    """Convert ``HXZ.2004.JCIC.data_set.txt`` into a ``smiles,logS`` file.

    Each line not starting with ``#`` is split on whitespace.  The first
    token is canonicalized as the SMILES and the third token is parsed as
    logS.  Lines that cannot be processed are logged and skipped.
    """
    fname = f"{DATA_PATH}/HXZ.2004.JCIC.data_set.txt"
    fout = f"{PROCESSED_PATH}/HXZ.2004.JCIC.data_set.smi"
    logging.info(f"Processing {fname}")

    cmpd_list = []
    with open(fname, 'r') as fin:
        for line in fin:
            if line.startswith("#"):
                continue
            pairs = line.rstrip('\n').split()
            try:
                canon_smiles = canonicalize_smiles(pairs[0])
                logS = float(pairs[2])
            except Exception:
                # split() yields [] for a blank line, so pairs[0] may not
                # exist; report the raw line instead of raising again.
                smiles = pairs[0] if pairs else line.rstrip('\n')
                logging.info(f"Failed to process {smiles}")
                continue
            cmpd_list.append((canon_smiles, logS))

    with open(fout, 'w', encoding="ascii") as fo:
        for smiles, logS in cmpd_list:
            fo.write(f"{smiles},{logS}\n")
    logging.info(f"Saved {fout}")
def process_H_2000_JCIC_test2():
    """Convert ``H.2000.JCIC.test2.txt`` into a ``smiles,logS`` file.

    The input is read as ISO-8859-1.  Each line not starting with ``#`` is
    split on whitespace; the seventh token is passed to
    ``canonicalize_smiles`` with ``SLN=True`` and the fourth token is parsed
    as logS.  Lines that cannot be processed are logged and skipped.
    NOTE(review): ``SLN=True`` presumably marks the structure column as
    SLN notation rather than SMILES -- confirm against the helper.
    """
    fname = f"{DATA_PATH}/H.2000.JCIC.test2.txt"
    fout = f"{PROCESSED_PATH}/H.2000.JCIC.test2.smi"
    logging.info(f"Processing {fname}")

    cmpd_list = []
    with open(fname, 'r', encoding="ISO-8859-1") as fin:
        for line in fin:
            if line.startswith("#"):
                continue
            pairs = line.rstrip('\n').split()
            try:
                canon_smiles = canonicalize_smiles(pairs[6], SLN=True)
                logS = float(pairs[3])
            except Exception:
                # Lines with fewer than seven tokens have no pairs[6]; report
                # the raw line so the handler itself cannot raise.
                smiles = pairs[6] if len(pairs) > 6 else line.rstrip('\n')
                logging.info(f"Failed to process {smiles}")
                continue
            cmpd_list.append((canon_smiles, logS))

    with open(fout, 'w', encoding='ascii') as fo:
        for smiles, logS in cmpd_list:
            fo.write(f"{smiles},{logS}\n")
    logging.info(f"Saved {fout}")
def process_WKH_2007_JCIM():
    """Convert ``WKH.2007.JCIM.solubility.sdf`` into a ``smiles,logS`` file.

    Every molecule in the SD file is written as its canonicalized SMILES
    followed by the value of its ``EXPT`` property.  Records that RDKit
    cannot parse are logged and skipped.
    """
    fname = f"{DATA_PATH}/WKH.2007.JCIM.solubility.sdf"
    fout = f"{PROCESSED_PATH}/WKH.2007.JCIM.smi"
    logging.info(f"Processing {fname}")

    suppl = Chem.SDMolSupplier(fname)
    with open(fout, 'w', encoding='ascii') as fo:
        for idx, mol in enumerate(suppl):
            # SDMolSupplier yields None for records it fails to parse;
            # passing None on to MolToSmiles/GetProp would abort the run.
            if mol is None:
                logging.info(f"Failed to process record {idx} in {fname}")
                continue
            smiles = canonicalize_smiles(Chem.MolToSmiles(mol))
            logS = str(mol.GetProp('EXPT'))
            fo.write(f'{smiles},{logS}\n')
    logging.info(f"Saved to {fout}")
def process_HXZ_2004_JCIC_test():
    """Convert ``HXZ.2004.JCIC.test_set1.sdf`` into a ``smiles,logS`` file.

    Every molecule in the SD file is written as its canonicalized SMILES
    followed by the value of its ``logS`` property.  Records that RDKit
    cannot parse are logged and skipped.
    """
    fname = f"{DATA_PATH}/HXZ.2004.JCIC.test_set1.sdf"
    fout = f"{PROCESSED_PATH}/HXZ.2004.JCIC.test.smi"
    logging.info(f"Processing {fname}")

    suppl = Chem.SDMolSupplier(fname)
    with open(fout, 'w', encoding='ascii') as fo:
        for idx, mol in enumerate(suppl):
            # SDMolSupplier yields None for records it fails to parse;
            # passing None on to MolToSmiles/GetProp would abort the run.
            if mol is None:
                logging.info(f"Failed to process record {idx} in {fname}")
                continue
            smiles = canonicalize_smiles(Chem.MolToSmiles(mol))
            logS = str(mol.GetProp('logS'))
            fo.write(f'{smiles},{logS}\n')
    logging.info(f"Saved to {fout}")
def process_BOM_2017_JC():
    """Convert ``BOM.2017.JC.txt`` into a ``smiles,logS`` file.

    Every line of the input (no comment filtering is applied) is split on
    ``,``; the first field is canonicalized as the SMILES and the second is
    parsed as a float logS.
    """
    fname = f"{DATA_PATH}/BOM.2017.JC.txt"
    fout = f"{PROCESSED_PATH}/BOM.2017.JC.smi"
    logging.info(f"Processing {fname}")

    # Parse the whole input before opening the output file.
    records = []
    with open(fname, 'r') as fin:
        for raw in fin:
            fields = raw.rstrip('\n').split(",")
            records.append((canonicalize_smiles(fields[0]), float(fields[1])))

    with open(fout, 'w', encoding='ascii') as fo:
        fo.writelines(f"{smi},{value}\n" for smi, value in records)
    logging.info(f"Saved {fout}")
def process_D_2008_JCIC():
    """Convert ``D.2008.JCIC.solubility.v1.txt`` into a ``smiles,logS`` file.

    Lines starting with ``#`` are skipped.  Every other line is split on
    ``,``; the last field is canonicalized as the SMILES and the
    third-from-last field is parsed as a float logS.
    """
    fname = f"{DATA_PATH}/D.2008.JCIC.solubility.v1.txt"
    fout = f"{PROCESSED_PATH}/D.2008.JCIC.smi"
    logging.info(f"Processing {fname}")

    # Parse the whole input before opening the output file.
    records = []
    with open(fname, 'r') as fin:
        for raw in fin:
            if raw.startswith("#"):
                continue
            fields = raw.rstrip('\n').split(",")
            records.append((canonicalize_smiles(fields[-1]), float(fields[-3])))

    with open(fout, 'w', encoding='ascii') as fo:
        fo.writelines(f"{smi},{value}\n" for smi, value in records)
    logging.info(f"Saved {fout}")
def process_ABB_2000_PR():
    """Convert ``ABB.2000.PR.txt`` into a ``smiles,logS`` file.

    Lines starting with ``#`` are skipped.  Every other line is split on
    ``,``; the second field is canonicalized as the SMILES and the third is
    parsed as a float logS.
    """
    fname = f"{DATA_PATH}/ABB.2000.PR.txt"
    fout = f"{PROCESSED_PATH}/ABB.2000.PR.smi"
    logging.info(f"Processing {fname}")

    cmpd_list = []
    with open(fname, 'r') as fin:
        for line in fin:
            if line.startswith("#"):
                continue
            pairs = line.rstrip('\n').split(",")
            canon_smiles = canonicalize_smiles(pairs[1])
            logS = float(pairs[2])
            cmpd_list.append((canon_smiles, logS))

    # Explicit ASCII output, matching the other dataset writers in this file,
    # so the result does not depend on the platform's default encoding.
    with open(fout, 'w', encoding="ascii") as fo:
        for smiles, logS in cmpd_list:
            fo.write(f"{smiles},{logS}\n")
    logging.info(f"Saved {fout}")
def process_A_2019_ADMET_DMPK():
    """Process A.2019.ADMET_DMPK data.

    Two values are given in the raw file, so two output files are saved:
    column 1 of each row is written to the ``.SSF.smi`` file and column 3 to
    the ``.CS.smi`` file.  Column 0 holds the compound name, which is looked
    up by name through ``pubchempy``; the first hit's isomeric SMILES is
    canonicalized and used as the structure.  Names with no hit are skipped.

    The first line of the input is treated as a header.  If ``pubchempy`` is
    not installed, the error is logged and nothing is written.
    """
    import csv

    fname = f"{DATA_PATH}/A.2019.ADMET_DMPK.csv"
    fout_1 = f"{PROCESSED_PATH}/A.2019.ADMET_DMPK.SSF.smi"
    fout_2 = f"{PROCESSED_PATH}/A.2019.ADMET_DMPK.CS.smi"
    logging.info(f"Processing {fname}")

    try:
        import pubchempy as pcp
    except ModuleNotFoundError as e:
        logging.error(e)
        return

    with open(fname, 'r', newline='') as fin, \
            open(fout_1, 'w') as fout1, \
            open(fout_2, 'w') as fout2:
        # csv.reader handles quoted names that contain commas, so quoted and
        # unquoted rows share the same column indices.  (Splitting a quoted
        # row by hand leaves a leading empty field and shifts every index.)
        reader = csv.reader(fin)
        next(reader, None)  # skip the header row
        for row in reader:
            if len(row) < 4:
                # Blank or truncated row: nothing usable to extract.
                continue
            name = row[0].replace('\"', '')
            logS0_SSF = row[1].strip()
            logS0_CS = row[3].strip()

            results = pcp.get_compounds(name, 'name')
            if not results:
                continue
            canon_smiles = canonicalize_smiles(results[0].isomeric_smiles)
            fout1.write(f"{canon_smiles},{logS0_SSF}\n")
            fout2.write(f"{canon_smiles},{logS0_CS}\n")
    logging.info(f"Saved {fout_1} and {fout_2}")
def process_WHX_2009_JCIM():
    """Convert the five ``WHX.2009.JCIM.Set-00N.csv`` files to ``.smi`` files.

    For each set (001 through 005), every line not starting with ``#`` is
    split on ``,``.  The second field is passed to ``canonicalize_smiles``
    with ``SLN=True`` and the first field is parsed as logS.  Lines that
    cannot be processed are logged and skipped.  Each set is written to a
    ``smiles,logS`` file with the same stem under ``PROCESSED_PATH``.
    """
    for set_id in range(1, 6):
        # Input and output share a stem, so derive both from the set number
        # instead of keeping two parallel lists in sync.
        stem = f"WHX.2009.JCIM.Set-{set_id:03d}"
        fname = f"{DATA_PATH}/{stem}.csv"
        fout = f"{PROCESSED_PATH}/{stem}.smi"
        logging.info(f"Processing {fname}")

        cmpd_list = []
        with open(fname, 'r') as fin:
            for line in fin:
                if line.startswith("#"):
                    continue
                pairs = line.rstrip('\n').split(",")
                try:
                    canon_smiles = canonicalize_smiles(pairs[1], SLN=True)
                    logS = float(pairs[0])
                except Exception:
                    # A line without a comma has no pairs[1]; report the raw
                    # line so the handler itself cannot raise.
                    smiles = pairs[1] if len(pairs) > 1 else line.rstrip('\n')
                    logging.info(f"Failed to process {smiles}")
                    continue
                cmpd_list.append((canon_smiles, logS))

        with open(fout, 'w', encoding="ascii") as fo:
            for smiles, logS in cmpd_list:
                fo.write(f"{smiles},{logS}\n")
        logging.info(f"Saved {fout}")
def process_LGG_2008_JCIM_100():
    """Convert ``LGG.2008.JCIM.100.txt`` into a ``smiles,logS`` file.

    Lines starting with ``#`` are skipped.  Every other line is split on
    ``,``; the first field is canonicalized as the SMILES and the third is
    parsed as a float logS.
    """
    fname = f"{DATA_PATH}/LGG.2008.JCIM.100.txt"
    fout = f"{PROCESSED_PATH}/LGG.2008.JCIM.100.smi"
    logging.info(f"Processing {fname}")

    cmpd_list = []
    with open(fname, 'r') as fin:
        for line in fin:
            if line.startswith("#"):
                continue
            pairs = line.rstrip('\n').split(',')
            canon_smiles = canonicalize_smiles(pairs[0])
            logS = float(pairs[2])
            cmpd_list.append((canon_smiles, logS))

    with open(fout, 'w', encoding="ascii") as fo:
        for smiles, logS in cmpd_list:
            fo.write(f"{smiles},{logS}\n")
    logging.info(f"Saved {fout}")
def process_OCHEM():
    """Convert ``OCHEM.Water.Solublity.05.27.2019.txt`` to ``smiles,logS``.

    Every line of the input is split on ``,``; the first field is
    canonicalized as the SMILES and the second is parsed as a float logS.
    Lines that cannot be processed are logged and skipped.
    """
    fname = f"{DATA_PATH}/OCHEM.Water.Solublity.05.27.2019.txt"
    fout = f"{PROCESSED_PATH}/OCHEM.Water.Solublity.05.27.2019.smi"
    logging.info(f"Processing {fname}")

    cmpd_list = []
    with open(fname, 'r') as fin:
        for line in fin:
            pairs = line.rstrip('\n').split(",")
            try:
                canon_smiles = canonicalize_smiles(pairs[0])
                logS = float(pairs[1])
            except Exception:
                # Catch Exception rather than everything, so that Ctrl-C and
                # SystemExit still stop a long conversion run.
                # str.split always returns at least one element, so pairs[0]
                # is safe to use here.
                logging.info(f"Failed to process {pairs[0]}")
                continue
            cmpd_list.append((canon_smiles, logS))

    with open(fout, 'w', encoding='ascii') as fo:
        for smiles, logS in cmpd_list:
            fo.write(f"{smiles},{logS}\n")
    logging.info(f"Saved {fout}")