def main():
    """Desalt, fragment-select and normalise molecules, then write them out.

    Reads the input through ``RDkitRead``, keeps the rows ``start:end``,
    drops rows with missing values, and pushes every molecule through salt
    stripping, largest-fragment selection and normalisation.  SMILES are
    regenerated from the final molecule and the result is written either as
    an SDF or as a space-separated SMILES file, depending on ``args.format``.

    Raises
    ------
    SystemExit
        If ``args.format`` is neither ``'sdf'`` nor ``'smi'``.
    """
    args = UserInput()
    if args.id is None:
        args.id = 'ID'

    # A missing bound means "open-ended".  The previous default of -1 for the
    # end bound produced the slice [start:-1], silently dropping the last
    # molecule of every input file.  Explicit user values are used unchanged.
    first = 0 if args.start is None else int(args.start)
    last = None if args.end is None else int(args.end)

    df = RDkitRead(args.infile, args.id, removeHs=True,
                   add_Hs=False)[first:last].dropna()

    remover = SaltRemover.SaltRemover()
    normzer = rdMolStandardize.Normalizer()
    chooser = rdMolStandardize.LargestFragmentChooser(preferOrganic=True)

    ## remove salts
    print('\033[34m## Desalting moleucles...\033[0m\n')
    df['mol'] = df.MOL.apply(remover.StripMol)

    ## choose largest fragment (most Hs)
    print('\033[34m## Choosing moleucles...\033[0m\n')
    df['mol2'] = df.mol.apply(chooser.choose)

    ## clean molecule (not really relevant?)
    print('\033[34m## Cleaning moleucles...\033[0m\n')
    df['mol3'] = df.mol2.apply(normzer.normalize)

    ## rewrite SMILES with newest mol3
    print('\033[34m## Converting moleucles...\033[0m\n')
    df['smiles'] = df.mol3.apply(Chem.MolToSmiles)

    out_name = args.outpref + '.' + args.format
    if args.format == 'sdf':
        rdpd.WriteSDF(df, out_name, molColName='mol3', idName=args.id,
                      properties=['smiles'])
    elif args.format == 'smi':
        df.to_csv(out_name, index=False, sep=' ',
                  columns=['smiles', args.id], header=True)
    else:
        # Previously an unknown format finished "successfully" without
        # writing any output at all.
        raise SystemExit(
            'ERROR: unsupported output format: {0}'.format(args.format))
def df_to_sdf(compounds_df, file):
    """Write ``compounds_df`` to the path ``file`` in SD format.

    Molecules are taken from the ``ROMol`` column, record titles from the
    ``PID`` column, and every column of the frame is passed as a property.
    """
    with open(file, 'w') as sdf_handle:
        property_names = list(compounds_df.columns)
        PandasTools.WriteSDF(
            compounds_df,
            sdf_handle,
            molColName='ROMol',
            idName='PID',
            properties=property_names,
        )
def main(args: Namespace) -> None:
    """
    Run the dataset preparation pipeline and write the result as SDF.

    The dataset is read from ``args.infile`` and then passed through each
    preparation stage in turn; the remaining number of rows is printed after
    every stage. The final frame is written to ``args.outfile``.

    Parameters
    ----------
    args : Namespace
        Namespace object containing the parsed commandline arguments
    """
    df = read_dataset(args.infile)
    print(f'Initial: {len(df)}')

    # (label used in the progress line, callable applied to the frame)
    stages = (
        ('cleaning', lambda frame: cleaning(frame, args.keep_props)),
        ('filtering', filtering),
        ('temperature control', filter_by_temperature),
        ('QuacPac tautomers', run_oe_tautomers),
        ('unifying dataset', make_dataset_unique),
        ('Marvin pKa', run_marvin_pka),
    )
    for label, stage in stages:
        df = stage(df)
        print(f'After {label}: {len(df)}')

    PandasTools.WriteSDF(df,
                         args.outfile,
                         idName='RowID',
                         properties=df.columns)
def test_all_numeric_with_no_numeric_columns(self):
    # With allNumeric=True and no numeric columns in the fixture, the output
    # must not contain any property tag at all.
    buffer = StringIO()
    PandasTools.WriteSDF(self.df, buffer, allNumeric=True)
    text = buffer.getvalue()
    self.assertFalse(">" in text, text)
    # double-check that the numeric tests don't pass by accident
    for numeric_tail in ("7\n\n", "8\n\n"):
        self.assertNotIn(numeric_tail, text)
def search_pattern(self, inp):
    """Write all molecules of ``self.df`` that match a SMARTS pattern.

    Parameters
    ----------
    inp : tuple
        ``(smarts, prefix)`` pair: the SMARTS query and the output file
        prefix.  The output file is ``<prefix>.<self.ext>``.

    The matching rows get an extra ``SMARTS_Match`` column holding the query
    string.  With ``self.ext == 'sdf'`` an SDF with all columns as
    properties is written; otherwise a SMILES file is written.
    """
    smarts, prefix = inp
    pattern = Chem.MolFromSmarts(smarts)

    # Substructure filter via the ``>=`` operator on the mol column.  Take an
    # explicit copy: adding a column to the filtered slice otherwise triggers
    # pandas' SettingWithCopyWarning (write to a possible view of self.df).
    m_df = self.df[self.df.mol >= pattern].copy()
    m_df['SMARTS_Match'] = smarts

    print('\n > Molecule matching "{0}": {1}\n'.format(smarts, len(m_df)))

    mol_out = prefix + '.' + self.ext
    if self.ext == 'sdf':
        rdpd.WriteSDF(m_df, mol_out, molColName='mol', idName='ID',
                      properties=list(m_df.columns))
    else:
        rdpd.SaveSMILESFromFrame(m_df, mol_out, molCol='mol', NamesCol='ID',
                                 isomericSmiles=True)
def save(self, filename_sdf: str, filename_csv: str) -> None:
    """
    Persist ``self.source`` twice: as an SDF and as a CSV.

    The CSV exists alongside the SDF because it is quicker to load
    than the SDF file.
    """
    frame = self.source
    property_names = list(frame.columns)
    PandasTools.WriteSDF(
        frame,
        filename_sdf,
        molColName='Molecule',
        properties=property_names,
    )
    frame.to_csv(filename_csv)
def test_specify_numeric_column(self):
    # A numeric column that is named explicitly in ``properties`` must be
    # written once per record, with its value.
    frame = self.df
    frame["len2"] = frame["ID"].map(len)

    buffer = StringIO()
    PandasTools.WriteSDF(frame, buffer, properties=["len2"])
    text = buffer.getvalue()

    self.assertEqual(text.count("<len2>"), 2)
    for expected_tail in ("7\n\n", "8\n\n"):
        self.assertIn(expected_tail, text)
def test_all_numeric_with_numeric_columns(self):
    # With allNumeric=True the numeric "len" column must show up as a tag in
    # both records, together with its values.
    frame = self.df
    frame["len"] = frame["ID"].map(len)

    buffer = StringIO()
    PandasTools.WriteSDF(frame, buffer, allNumeric=True)
    text = buffer.getvalue()

    self.assertEqual(text.count("<len>"), 2)
    for expected_tail in ("7\n\n", "8\n\n"):
        self.assertIn(expected_tail, text)
def _write_neversee_subset(subset, out_file):
    """Write one filter subset as SMILES (name contains '.smi') or as SDF."""
    if re.search(r'\.smi', out_file, re.IGNORECASE):
        # Work on a copy and create the column with item assignment.  The
        # former ``subset.smiles = ...`` attribute assignment does not create
        # a new DataFrame column, so ``to_csv(columns=['smiles', ...])``
        # could not find it unless the column already existed.
        subset = subset.copy()
        subset['smiles'] = subset.MOL.apply(
            lambda m: Chem.MolToSmiles(Chem.RemoveHs(m)))
        subset.to_csv(out_file, columns=['smiles', 'ID'], sep=' ',
                      header=False, index=False)
    else:
        rdpd.WriteSDF(subset, out_file, molColName='MOL',
                      properties=list(subset.columns))


def main():
    """Split the input molecules by their ``NeverSee_Groups`` flag.

    Rows flagged ``'Y'`` are written to ``args.nsee_file`` and rows flagged
    ``'N'`` to ``args.pass_file``.  Each output is a headerless,
    space-separated SMILES file when its name contains ``.smi`` (case
    insensitive) and an SDF with all columns as properties otherwise.
    """
    args = UserInput()
    df = RDkitRead(args.infile, args.id, removeHs=False, add_Hs=False)

    nsee_df = df[df['NeverSee_Groups'] == 'Y']
    pass_df = df[df['NeverSee_Groups'] == 'N']

    print('\033[34m Passed NeverSee Filter: \033[32m{0}\033[0m'.format(
        len(pass_df)))
    print('\033[34m Failed NeverSee Filter: \033[31m{0}\033[0m'.format(
        len(nsee_df)))

    _write_neversee_subset(nsee_df, args.nsee_file)
    _write_neversee_subset(pass_df, args.pass_file)
    print('')
def test_write_to_sdf_gz(self):
    # Writing to a ``.sdf.gz`` path must produce a gzip file that contains
    # both records, starting with the title line of the first molecule.
    dirname = tempfile.mkdtemp()
    try:
        filename = os.path.join(dirname, "test.sdf.gz")
        PandasTools.WriteSDF(self.df, filename)
        # gzip.open() defaults to binary mode, so read() returns bytes on
        # Python 3; decode before comparing against str literals, and close
        # the handle deterministically instead of leaking it.
        with gzip.open(filename) as f:
            s = f.read().decode('utf-8')
        # Normalise platform line endings so the assertions hold everywhere.
        s = s.replace(os.linesep, '\n')
        self.assertEqual(s.count("\n$$$$\n"), 2)
        self.assertEqual(s.split("\n", 1)[0], "Methane")
    finally:
        shutil.rmtree(dirname)
def update_sdf():
    """Regenerate ``media/all_data.sdf`` from all ``Compound`` records.

    Loads every ``Compound`` row into a DataFrame, removes the bookkeeping
    columns, builds an ``ROMol`` column from the ``Smiles`` column and writes
    the frame as SDF, using ``PID`` as the record title and all columns as
    properties.
    """
    compounds_df = pd.DataFrame(list(Compound.objects.all().values()))

    # ``compounds_df.isnull`` (without a call) is a bound method and therefore
    # always truthy, so the former ``if not compounds_df.isnull`` never ran
    # the drop.  The guard that makes sense here is "frame is not empty": an
    # empty frame has no columns to drop.
    if not compounds_df.empty:
        compounds_df = compounds_df.drop(['id', 'created_at', 'updated_at'],
                                         axis=1)

    # NOTE(review): with no Compound rows the frame has no 'Smiles' column and
    # the next call will presumably fail, exactly as before this change.
    PandasTools.AddMoleculeColumnToFrame(compounds_df, 'Smiles', 'ROMol',
                                         includeFingerprints=True)

    os.makedirs('media', exist_ok=True)
    with open('media/all_data.sdf', 'w') as fi:
        PandasTools.WriteSDF(compounds_df, fi, molColName='ROMol',
                             idName='PID',
                             properties=list(compounds_df.columns))
def return_files_sdf():
    """Convert the pickled frame to ``temp.sdf`` and send it as attachment.

    The frame is read from ``temp.pickle``; molecules come from its
    ``structures`` column and all columns are written as properties.  If
    sending the file raises, the exception text is returned instead.
    """
    frame = pd.read_pickle('temp.pickle')
    column_names = list(frame.columns)
    PandasTools.WriteSDF(
        frame,
        'temp.sdf',
        molColName='structures',
        properties=column_names,
        allNumeric=False,
    )
    try:
        return send_file('temp.sdf', as_attachment=True)
    except Exception as err:
        return str(err)
def test_write_to_sdf_gz(self):
    # Round-trip through a gzip-compressed SDF in a throw-away directory.
    scratch_dir = tempfile.mkdtemp()
    try:
        gz_path = os.path.join(scratch_dir, "test.sdf.gz")
        PandasTools.WriteSDF(self.df, gz_path)

        with gzip.open(gz_path) as gz_file:
            raw = gz_file.read()
        # bytes -> text, then platform line endings -> "\n"
        text = raw.decode('utf-8').replace(os.linesep, '\n')

        self.assertEqual(text.count("\n$$$$\n"), 2)
        self.assertEqual(text.split("\n", 1)[0], "Methane")
    finally:
        shutil.rmtree(scratch_dir)
def main():
    """Extract the ligands listed in ``args.mol_id`` from the input SDFs.

    The ID list is a whitespace-delimited text file (``#`` starts a comment);
    its first column holds the IDs.  Every file in ``args.infiles`` is read,
    its ``ID`` field is split by ``CheckID`` into ``Name``, ``Rank``,
    ``Score`` and ``Soft`` columns, and rows whose ``id_tag`` column is in
    the ID list are kept.  The selection is optionally sorted by
    ``args.sort_tag`` and written to ``<outpref>.sdf.gz``.
    """
    args = UserInput()
    id_tag = args.id_tag if args.id_tag else 'Name'
    sort_tag = args.sort_tag if args.sort_tag else False

    ###############
    ## Read in the list of selected ligand ID
    # Raw string: '\s' in a normal string literal is an invalid escape.
    n_df = pd.read_csv(args.mol_id, delimiter=r'\s+', header=None,
                       comment='#').dropna()
    keywords = n_df.loc[:, 0].to_list()
    print('\n > Number of items in <{}>: {}\n'.format(args.mol_id,
                                                      len(keywords)))

    ## Extract the selected ligands from the supplied SDFs
    mol_sele = []
    for infile in args.infiles:
        df = RDkitRead(infile, removeHs=False)
        Items = df['ID'].apply(CheckID)
        # Transpose once instead of rebuilding the same list four times.
        fields = list(zip(*Items))
        df['Name'] = fields[0]
        df['Rank'] = fields[1]
        df['Score'] = fields[2]
        df['Soft'] = fields[3]
        mol_sele.append(df[df[id_tag].isin(keywords)])
        # Input SDFs can be large; release each one before reading the next.
        del df
        gc.collect()

    all_df = pd.concat(mol_sele).reset_index(drop=True)

    # Build the lookup set once (it used to be rebuilt for every keyword).
    found_id = set(all_df[id_tag].to_list())
    missed_id = [x for x in keywords if x not in found_id]
    # ``missed_id is False`` was never true for a list, so missing IDs were
    # never reported; test for a non-empty list instead.
    if missed_id:
        print(
            '\033[31m Info: \033[35m{0}\033[31m MOL cannot be found:\033[0m'.
            format(len(missed_id)))
        print(missed_id)

    ## Sort data, if needed
    if sort_tag:
        all_df.sort_values(by=[sort_tag], ascending=True, inplace=True)

    rdpd.WriteSDF(all_df, args.outpref + '.sdf.gz', molColName='mol',
                  properties=list(all_df.columns))
def get_df_download_sdf(df, download_filename, link_label, structure_column):
    """Return an HTML download link that embeds ``df`` as a base64 SDF.

    The frame is serialised to SD format in memory (molecules from
    ``structure_column``, every column as a property) and placed in a
    ``data:`` URI; ``download_filename`` and ``link_label`` are inserted into
    the anchor tag as given.
    """
    sdf_buffer = StringIO()
    PandasTools.WriteSDF(
        df,
        sdf_buffer,
        molColName=structure_column,
        properties=list(df.columns),
        allNumeric=False,
    )
    # str -> bytes for base64, then bytes -> str for embedding in the markup
    sdf_bytes = sdf_buffer.getvalue().encode()
    b64 = base64.b64encode(sdf_bytes).decode()
    return f'<a href="data:file/txt;base64,{b64}" download="{download_filename}">{link_label}</a>'
def df2sdf(df,
           output_sdf_name,
           smiles_field='canonical_smiles',
           id_field='chembl_id',
           selected_batch=None):
    """Pack a ``pd.DataFrame`` into an SDF file.

    Parameters
    ----------
    df : pd.DataFrame
        Frame holding at least ``smiles_field`` and ``id_field`` (and a
        ``label`` column when ``selected_batch`` is given).
    output_sdf_name : str
        Path of the SDF file to write.
    smiles_field : str
        Column with the SMILES used to build the ``ROMol`` column.
    id_field : str
        Column used as record title.
    selected_batch : optional
        When not ``None``, only rows whose ``label`` equals this value are
        written.

    Notes
    -----
    Without ``selected_batch`` the ``ROMol`` column is added to the caller's
    frame in place, as before.
    """
    if selected_batch is not None:
        # Explicit copy: adding the molecule column to a filtered slice
        # raises pandas' SettingWithCopyWarning.
        df = df.loc[df['label'] == selected_batch].copy()

    PandasTools.AddMoleculeColumnToFrame(df, smiles_field, 'ROMol')
    PandasTools.WriteSDF(df,
                         output_sdf_name,
                         idName=id_field,
                         properties=list(df.columns))
def main():
    """Rename docked poses by rank/score and write SDF plus score table.

    Loads ``args.infile``, optionally keeps only the first ``args.top``
    records, converts the score column to float and sets every molecule's
    ``_Name`` to ``<name>::<rank>::<score>::<dock>``.  Output goes to
    ``<outpref>.<dock>_docked.sdf.gz`` and ``<outpref>.<dock>_docked.txt.bz2``
    (tab-separated name/score pairs).
    """
    args = UserInput()
    name = args.name if args.name else 'ID'
    score = args.score if args.score else 'Chemgauss4'
    dock = args.dock if args.dock else 'fred'
    # ``None`` keeps every record.  The former "all" sentinel of -1 turned the
    # slice into [:-1] and silently dropped the last molecule.
    top = int(args.top) if args.top else None

    df = rdpd.LoadSDF(args.infile, removeHs=False, molColName='ROMol',
                      idName='mol_ID')[:top].fillna('')
    print('\033[34m> select mol: \033[32m{0}\033[0m'.format(len(df)))

    df[score] = df[score].apply(float)
    df['Rank'] = df.index

    for _, row in df.iterrows():
        # SetProp mutates the molecule object itself, which the row shares
        # with the frame, so no write-back is needed.
        row['ROMol'].SetProp(
            '_Name',
            '{0}::{1}::{2:.2f}::{3}'.format(row[name], row['Rank'] + 1,
                                            row[score], dock))

    sdf_out = '{0}.{1}_docked.sdf.gz'.format(args.outpref, dock)
    csv_out = '{0}.{1}_docked.txt.bz2'.format(args.outpref, dock)
    rdpd.WriteSDF(df, sdf_out, properties=list(df.columns))
    df.to_csv(csv_out, header=False, index=False, sep='\t',
              columns=[name, score], float_format='%.3f')
def write_sdf(self, data: pd.DataFrame, outfile_name: str, smiles_column: str):
    """
        Writes curated data to ``<outfile_name>.sdf`` through PandasTools.

        The frame is first passed through ``self.prepare_data_for_sdf`` (on
        a copy); the prepared frame is written with its ``ROMol`` column as
        molecule, ``self.identifier`` as record title and all of its columns
        as properties.

        :param data: Dataframe to be written
        :param smiles_column: SMILES column in the dataframe to be processed
        :param outfile_name: output file name (without the ``.sdf`` suffix)
    """
    sdf_path = outfile_name + '.sdf'
    prepared = self.prepare_data_for_sdf(data, smiles_column, copy=True)
    PandasTools.WriteSDF(
        prepared,
        sdf_path,
        molColName='ROMol',
        properties=list(prepared.columns),
        idName=self.identifier,
    )
def main():
    """Build 3D, hydrogen-complete molecules from a SMILES table and write SDF.

    Reads ``args.infile`` as CSV, builds a ``Molecule`` column from
    ``rdkit_smiles``, adds hydrogens, embeds 3D coordinates and stores the
    formal charge in ``i_user_TOTAL_CHARGE``.  The result is written to
    ``args.outfile`` with ``compound_id`` as record title.
    """
    args = getArgs()
    print(args.infile, args.outfile)

    smiles_df = pd.read_csv(args.infile)
    # Copy the column subset so the additions below go to an independent
    # frame instead of a slice of ``smiles_df`` (SettingWithCopyWarning).
    pp = smiles_df[['rdkit_smiles', 'compound_id']].copy()
    PandasTools.AddMoleculeColumnToFrame(pp, 'rdkit_smiles', 'Molecule')

    # Iterate over a snapshot so the frame can be updated inside the loop.
    for index, mol in list(pp['Molecule'].items()):
        mol = Chem.AddHs(mol)
        # NOTE(review): the embedding return code is not checked, as before.
        AllChem.EmbedMolecule(mol)
        # Chem.AddHs returns a NEW molecule.  It used to be assigned to the
        # ``iterrows`` row copy only, so the frame (and the SDF) kept the
        # original molecule without hydrogens and without coordinates.
        pp.at[index, 'Molecule'] = mol
        pp.at[index, 'i_user_TOTAL_CHARGE'] = Chem.rdmolops.GetFormalCharge(
            mol)

    PandasTools.WriteSDF(pp, args.outfile, molColName='Molecule',
                         idName='compound_id', properties=list(pp.columns))
def __init__(self, data, output_name):
    """Build an attachment response that carries ``data`` as an SDF file.

    Parameters
    ----------
    data
        Object whose ``.values()`` yields the compound records; the ``id``
        field is dropped before writing.
    output_name : str
        Download file name without extension; ``.sdf`` is appended.
    """
    output = StringIO()
    compounds_df = pd.DataFrame(list(data.values())).drop('id', axis=1)
    PandasTools.AddMoleculeColumnToFrame(compounds_df, 'Smiles', 'ROMol',
                                         includeFingerprints=True)
    PandasTools.WriteSDF(compounds_df, output, molColName='ROMol',
                         idName='PID',
                         properties=list(compounds_df.columns))

    mimetype = 'text/plain'
    file_ext = 'sdf'
    super(SDFResponse, self).__init__(content=output.getvalue(),
                                      content_type=mimetype)

    # The header value is a quoted string, so embedded double quotes must be
    # backslash-escaped.  The former replacement target '\"' is the same
    # string as '"', which made the escaping a no-op.
    safe_name = output_name.replace('"', '\\"')
    self['Content-Disposition'] = 'attachment;filename="%s.%s"' % \
        (safe_name, file_ext)
def main():
    """Cluster the input molecules by affinity propagation and write an SDF.

    The input type is chosen from the file name in ``args.i``: text/CSV,
    gzip-compressed SD, or plain SD.  Fingerprints are clustered, the
    silhouette score is printed, and the frame is written to ``args.o`` with
    the cluster label in a ``Cluster`` column.

    Raises
    ------
    SystemExit
        If the input file name matches none of the supported types.
    """
    args = parse_args()

    def _mentions(*markers):
        # ``("a" or "b") in name`` only ever tested the first literal, so
        # every alternative has to be checked individually.
        return any(marker in args.i for marker in markers)

    # The compressed check must come before the plain one: ".sd" is also a
    # substring of ".sdf.gz".
    if _mentions(".txt", ".csv"):
        df = parse_text_file(args.i)
    elif _mentions("sd.gz", "sdf.gz"):
        df = parse_sd_file(args.i, tgz=True)
    elif _mentions(".sd", ".sdf"):
        df = parse_sd_file(args.i)
    else:
        # Previously this fell through and failed on an undefined ``df``.
        raise SystemExit(
            'ERROR: unsupported input file type: {0}'.format(args.i))

    FP = fp_from_df(df)
    labels = AffinityPropagation(damping=float(args.damping),
                                 max_iter=int(args.max_iter),
                                 convergence_iter=int(
                                     args.convergence)).fit(FP).labels_
    print(metrics.silhouette_score(FP, labels, metric='euclidean'))

    df['Cluster'] = labels
    PandasTools.WriteSDF(df, args.o, molColName='Molecule', idName="CID",
                         properties=list(df.columns))
def binding_affinity(self,
                     prot_in,
                     lig_in,
                     outpath="results/results_affinity_binding.csv"):
    """Predict binding affinity and binding probability and save the results.

    The feature frame comes from ``self.preprocessing``; its columns from
    position 2 onwards are the model inputs.  A regression model
    (``models/gbdt_regression.joblib``) predicts the affinity and a
    classifier (``models/gbdt_model.joblib``) the probability, which is
    binned into ``high`` (> 0.7), ``low`` (< 0.4) or ``medium``.  The table
    is written to ``outpath`` as CSV and to ``results/affinity_out.sdf``.

    Raises
    ------
    FileNotFoundError
        If either model file is missing.  Previously this was only logged and
        the method then failed on an unassigned prediction variable.
    """
    DF = self.preprocessing(prot_in, lig_in)
    X = DF.iloc[:, 2:]
    print(DF.columns)
    logger.info(X.shape)

    jl_filename = "models/gbdt_regression.joblib"
    cl_filename = "models/gbdt_model.joblib"
    # Both predictions are required below, so fail early and explicitly.
    for model_path in (jl_filename, cl_filename):
        if not os.path.isfile(model_path):
            logger.info("no model available")
            raise FileNotFoundError(
                "model file not found: {0}".format(model_path))

    with open(jl_filename, 'rb') as file:
        regressor = joblib.load(file)
    ya = pd.Series(regressor.predict(X)).rename("predicted_affinity")

    with open(cl_filename, 'rb') as file:
        classifier = joblib.load(file)
    # Second column of predict_proba, taken as the probability of interest.
    yb = pd.Series(classifier.predict_proba(X)[:, 1])

    smiles = DF["smiles"]
    prot = DF["UniProtID"]
    # NOTE(review): ya/yb carry a fresh 0..n-1 index, so this column-wise
    # concat presumably relies on DF having the same default index - confirm.
    final = pd.concat([smiles, prot, ya, yb], axis=1)
    final.columns = ["smiles", "Uniprot ID", "affinity", "probability"]
    final["predicted_label"] = np.where(
        final.probability > 0.7, "high",
        np.where(final.probability < 0.4, "low", "medium"))
    logger.info(final.columns)
    logger.info(final[0:10])
    final.to_csv(outpath)

    pp_out = "results/affinity_out.sdf"
    PandasTools.AddMoleculeColumnToFrame(final, 'smiles', 'Molecule')
    PandasTools.WriteSDF(final, pp_out, molColName='Molecule',
                         properties=list(final.columns))
def mols_to_sdbuffer(df: pd.DataFrame, props: List[str] = None) -> StringIO:
    """
    Serialise a DataFrame to SD format in memory.

    Parameters
    ----------
    df : DataFrame
        Frame to serialise; it must contain the molecule column that
        ``PandasTools.WriteSDF`` uses by default
    props : List[str]
        Names of the columns to emit as SD properties; forwarded unchanged

    Returns
    -------
    StringIO
        The in-memory buffer that received the SD data
    """
    sd_stream = StringIO()
    PandasTools.WriteSDF(df, sd_stream, properties=props)
    return sd_stream
def main(args: Namespace) -> None:
    """
    Prepare an SDF dataset for pKa work and write the result as SDF.

    The SDF in ``args.infile`` is loaded and indexed by its unique ``ID``
    column, then passed through each preparation stage in turn; the number
    of remaining rows is printed after every stage. Finally the columns are
    renamed and the frame is written to ``args.outfile``.

    Parameters
    ----------
    args : Namespace
        Namespace object containing the parsed commandline arguments
    """
    loaded = PandasTools.LoadSDF(args.infile)
    df = loaded.set_index('ID', verify_integrity=True)
    print(f'Initial: {len(df)}')

    # (label used in the progress line, callable applied to the frame)
    stages = (
        ('cleaning', lambda frame: cleaning(frame, args.keep_props)),
        ('filtering', filtering),
        ('QuacPac tautomers', run_oe_tautomers),
        ('Marvin pKa', run_marvin_pka),
        ('removing strong outlier', filter_strong_outlier_by_marvin),
    )
    for label, stage in stages:
        df = stage(df)
        print(f'After {label}: {len(df)}')

    marvin_columns = ['marvin_pKa', 'marvin_atom', 'marvin_pKa_type']
    df.columns = ['ROMol'] + args.keep_props + marvin_columns

    PandasTools.WriteSDF(df,
                         args.outfile,
                         idName='RowID',
                         properties=df.columns)
def test_identifier_from_a_column(self):
    # With idName pointing at a column, the record's title line must be that
    # column's value for the first row.
    buffer = StringIO()
    PandasTools.WriteSDF(self.df, buffer, idName="prop2")
    title_line, _, _ = buffer.getvalue().partition("\n")
    self.assertEqual(title_line, "qwe")
def test_default_write_does_not_include_tags(self):
    # Without ``properties`` no column may be written as a tag, so the column
    # name must not appear anywhere in the output.
    sio = StringIO()
    PandasTools.WriteSDF(self.df, sio)
    s = sio.getvalue()
    # assertNotIn(member, container): the arguments used to be swapped, which
    # asked whether the whole SDF text occurs inside "prop2" and therefore
    # passed regardless of what was written.
    self.assertNotIn("prop2", s)
def main():
    """Run GOLD docking as parallel sub-jobs and merge the results.

    Steps: read the configuration template, point it at the receptor,
    cavity and (decompressed) ligand file, generate one ``gold.conf`` per
    sub-job, run the sub-jobs in a process pool, post-process each sub-job's
    docked SDF, and finally combine everything into one ranked SDF and one
    tab-separated score file (both bzip2-compressed).
    """
    args = UserInput()
    if args.genconf:
        GenerateConfTemplInput()
        sys.exit()

    if args.savetop:
        try:
            savetop = int(args.savetop)
        except (TypeError, ValueError):
            # int('abc') raises ValueError, which the former
            # ``except TypeError`` let through as a traceback.
            sys.exit('\033[31m ERROR: -top must be an integer: \033[0m' +
                     str(args.savetop))
        # Keeping the top N only makes sense on sorted scores.  The flag used
        # to be set to True here (despite the "force sorting" comment), which
        # disabled the sort below.
        args.nosort = False

    ########################
    ## Read input configure file
    settings = ReadConfSettings(args.conffile)
    settings['receptor'] = args.receptor
    settings['cavity'] = args.cavity
    settings['rslt_pref'] = args.rslt_pref
    if args.lig_ref:
        settings['lig_ref'] = args.lig_ref
    if args.constr:
        settings['constr_file'] = args.constr

    ## handle sdf file in gzip/bzip2
    # NOTE(review): the shell commands below interpolate the file name
    # unquoted, so paths containing spaces or shell metacharacters will break.
    if re.search(r'\.gz$', args.ligand):
        ligand = args.ligand.split('/')[-1].split('.gz')[0]
        os.system('gunzip -c {0} > ./{1}'.format(args.ligand, ligand))
        settings['ligand'] = '{0}/{1}'.format(cwd, ligand)
    elif re.search(r'\.bz2$', args.ligand):
        ligand = args.ligand.split('/')[-1].split('.bz2')[0]
        os.system('bunzip2 -c {0} > ./{1}'.format(args.ligand, ligand))
        settings['ligand'] = '{0}/{1}'.format(cwd, ligand)
    else:
        settings['ligand'] = args.ligand

    ## Write a list of gold.conf files
    Confs = GenerateConfFiles(settings)
    print('\033[34m## Generated subjobs: \033[33m{0}\033[0m'.format(
        len(Confs)))

    ## Run GOLD in parallel until all finished
    available = multiprocessing.cpu_count()
    requested = int(args.cpu)
    core = requested if 0 < requested <= available else available

    mpi = multiprocessing.Pool(core)
    # Consume the iterator to drive the jobs (and the progress bar); the
    # individual return values are not used.
    for _ in tqdm(mpi.imap(RunGOLD, Confs), total=len(Confs)):
        pass
    mpi.close()
    mpi.join()

    ############ Post-processing #############
    tmpdsf = settings['tmpdsf']
    findsf = settings['findsf']
    finssf = settings['finssf']
    mol_id = settings['mol_id']
    if settings['gold_funct'] == 'plp':
        score = 'Gold.PLP.Fitness'
    else:
        # ``score`` used to stay undefined here, ending in a NameError below.
        sys.exit('\033[31m ERROR: unsupported gold_funct: \033[0m' +
                 str(settings['gold_funct']))

    ## Modify each subjob docking result, summarize them all into 1 dataframe
    pref_list = [c.split('.conf')[0] for c in Confs]
    dock_list = []
    for pref in pref_list:
        os.chdir(pref)
        in_sdf = '{0}.{1}'.format(pref, tmpdsf)
        out_sdf = '{0}.{1}'.format(pref, findsf)

        ## modify docked sdf file, collect them
        dock_list.append(RescaleRename(in_sdf, out_sdf, mol_id, score))
        os.system('bzip2 *sdf *lst')
        os.chdir(cwd)

    ## combine all subjob data, sort by ranking, output docked sdf and rank
    ## save only top ligands if needed
    # ``filter(None, ...)`` evaluates bool() on every element, which raises
    # for a DataFrame; drop only the sub-jobs that returned nothing.
    xdf = pd.concat([d for d in dock_list if d is not None])
    if not args.nosort:
        xdf.sort_values(by=[score], ascending=True, inplace=True)
    if args.savetop:
        xdf = xdf[:savetop]

    fin_sdf = '{0}.{1}'.format(settings['rslt_pref'], findsf)
    fin_scr = '{0}.{1}'.format(settings['rslt_pref'], finssf)
    rdpd.WriteSDF(xdf, fin_sdf, properties=list(xdf.columns))
    xdf.to_csv(fin_scr, index=False, sep='\t', columns=[mol_id, score],
               header=False, float_format='%.3f')
    os.system('bzip2 {0} {1}'.format(fin_sdf, fin_scr))
print('Loading model...') with open('RF_CV_FMorgan3_pKa.pkl', 'rb') as f: model = pkl.load(f) print('Start preparing dataset...') df = cleaning(df, list(df.columns[df.columns != 'ROMol'])) print(f'After cleaning: {len(df)}') df = filtering(df) print(f'After filtering: {len(df)}') df = run_oe_tautomers(df) print(f'After QuacPac tautomers: {len(df)}') print('Calculating fingerprints...') fmorgan3 = [] for mol in df.ROMol: fmorgan3.append( Chem.GetMorganFingerprintAsBitVect(mol, radius=3, nBits=4096, useFeatures=True)) fmorgan3 = np.array(fmorgan3) print('Predicting...') df['pKa_prediction'] = model.predict(fmorgan3) print('Writing result file...') PandasTools.WriteSDF(df, args.out, properties=df.columns, idName='RowID')
new_row = [ID] + list(calc.CalcDescriptors(row['ROMol'])) values.append(new_row) count += 1 df_result = pd.DataFrame(values, columns=columns) return df_result # In[5]: descriptor_df = gen_descriptors(original_df, calc) # In[22]: merged = descriptor_df.join(original_df, on='ID') merged[['MolWt', 'FW']] # In[38]: PandasTools.WriteSDF(merged, out='data/fulldata.sdf', properties=list(merged.columns)) # In[41]: loaded_df = PandasTools.LoadSDF('data/fulldata.sdf') loaded_df.info() # In[46]: loaded_df[['Kappa2', 'Kappa3', 'LabuteASA', 'PEOE_VSA1', 'PEOE_VSA10']]
help="dataset type", type=str, choices=['regression', 'classification'], required=True) args = parser.parse_args() DATASETS = OrderedDict() DATASETS[args.data_name] = (args.dataset_type, args.data_path) os.makedirs('sdfs', exist_ok=True) for name in DATASETS: print(name) all_smiles = [] with open(DATASETS[name][1], 'r') as rf, open(os.path.join('sdfs', name + '_smiles.csv'), 'w') as wf: rf.readline() for line in rf: smiles = line.strip().split(',')[0] wf.write(smiles + '\n') all_smiles.append(smiles) filename = os.path.join('sdfs', name + '_smiles.csv') pp = pd.DataFrame(all_smiles, columns=['Smiles']) PandasTools.AddMoleculeColumnToFrame( pp, 'Smiles', 'Molecule') # pp = doesn't work for me PandasTools.WriteSDF(pp, os.path.join('sdfs', name + '.sdf'), molColName='Molecule', properties=list(pp.columns))