def generate_data(name, inchi, rt='null'):
    """Build a one-row feature table for a single compound identified by InChI.

    The compound is looked up on PubChem through ``pcp.get_compounds``, a set
    of PubChem properties is collected, an RDKit molecule is built from the
    isomeric SMILES, and the descriptor vector returned by ``fps_plus_mw`` is
    placed side by side with those properties.

    Args:
        name: Label stored in the ``Name`` column of the result.
        inchi: InChI string used for the PubChem lookup.
        rt: Value stored in the ``RT`` column. Defaults to the string
            ``'null'``.

    Returns:
        pandas.DataFrame: A single row (index label 0). The leading columns
        are the descriptor values (integer column labels); the remaining
        columns are the PubChem properties plus ``mol``, ``RT`` and ``Name``.

    Raises:
        IndexError: If the PubChem lookup returns no compound.
    """
    # A single lookup is enough; the previous version issued the identical
    # request twice and threw the first response away.
    cmpd = pcp.get_compounds(inchi, 'inchi')
    if not cmpd:
        # Same exception type that ``cmpd[0]`` would raise, but with a message
        # that says what actually went wrong.
        raise IndexError('no PubChem compound found for InChI: {}'.format(inchi))

    props = cmpd[0].to_dict(properties=[
        'cactvs_fingerprint', 'isomeric_smiles', 'xlogp',
        'rotatable_bond_count', 'charge', 'complexity', 'exact_mass',
        'fingerprint'
    ])
    props['mol'] = Chem.MolFromSmiles(props['isomeric_smiles'])
    props['RT'] = rt
    props['Name'] = name

    # The descriptor vector becomes one row so that it lines up with the
    # property row (both carry index label 0) in the column-wise concat.
    desc = np.array(fps_plus_mw(props['mol']))
    descdf = pd.DataFrame(desc).T
    newdf = pd.DataFrame(props, index=[0])
    return pd.concat([descdf, newdf], axis=1)

def Join_Mummichog_Matches_Molecular_Features(result_df, file_dir):
    """Build a molecular-feature table for the compounds named in ``result_df``.

    Each distinct value of the ``Compound Name`` column is looked up on
    PubChem by name (first occurrence only). For every compound found, the
    descriptor vector from ``fps_plus_mw`` is joined column-wise with the
    PubChem properties, the RDKit molecule, the row's ``rtmin`` value (stored
    as ``RT``), the name, and a constant ``System`` value of ``'xcms'``.

    Compounds whose lookup fails or returns nothing are skipped after
    printing ``'line bypassed'``. Earlier versions fell through and reused
    the previous compound's data under the new name.

    Args:
        result_df: DataFrame with at least the columns ``Compound Name`` and
            ``rtmin``.
        file_dir: Prefix for the output path; the table is pickled to
            ``file_dir + 'RT_Folder\\mummichog_rt_features.p'``.

    Returns:
        pandas.DataFrame: One row per successfully queried compound, indexed
        by the label of the ``result_df`` row where the name first appeared.
        Empty if no compound could be processed.
    """
    properties = [
        'cactvs_fingerprint', 'isomeric_smiles', 'xlogp',
        'rotatable_bond_count', 'charge', 'complexity', 'exact_mass',
        'fingerprint'
    ]
    queried_names = set()
    feature_rows = []

    for index, row in result_df.iterrows():
        name = row['Compound Name']
        if name in queried_names:
            continue
        # Recorded before the lookup so a name that fails once is not retried
        # on later rows (same as the original behaviour).
        queried_names.add(name)

        try:
            cmpd = pcp.get_compounds(name, 'name')
            # IndexError on an empty result list is handled below as well.
            props = cmpd[0].to_dict(properties=properties)
        except Exception:
            # Best-effort: one failed lookup must not abort the whole run,
            # and must not leak stale data into this compound's row.
            print('line bypassed')
            continue

        props['mol'] = Chem.MolFromSmiles(props['isomeric_smiles'])
        props['RT'] = row['rtmin']
        props['Name'] = name
        props['System'] = 'xcms'

        # Give the descriptor row the same index label as the property row so
        # the column-wise concat aligns them into a single row.
        desc = np.array(fps_plus_mw(props['mol']))
        descdf = pd.DataFrame(desc).T
        descdf.index = [index]
        newdf = pd.DataFrame(props, index=[index])
        feature_rows.append(pd.concat([descdf, newdf], axis=1))

    # One concat at the end replaces the per-row DataFrame.append, which was
    # removed in pandas 2.0 and copied the whole table on every call.
    finaldf = pd.concat(feature_rows) if feature_rows else pd.DataFrame()
    finaldf.to_pickle(file_dir + 'RT_Folder\\mummichog_rt_features.p')
    return finaldf

def SMRT_Database_Processor(smrt_file):
    """Extract CID/retention-time pairs from ``smrt_file`` and featurise them.

    The file is scanned line by line: the line after one starting with
    ``> <PUBCHEM_COMPOUND_CID>`` is taken as the CID and the line after one
    starting with ``> <RETENTION_TIME>`` as its retention time. Every CID is
    then looked up on PubChem and turned into a one-row feature table
    (descriptor vector from ``fps_plus_mw`` plus PubChem properties, RDKit
    molecule, IUPAC name, ``System='SMRT DATA'`` and ``RT``).

    Files written to the current working directory:
        * ``compoundsupto<N>.pickle``: feature rows, flushed every 999
          compounds, plus one final file for any remaining rows.
        * ``compounds_skipped.pickle``: CIDs whose lookup failed or returned
          nothing.
        * ``processed.p``: pickled list of CIDs that were featurised.

    Args:
        smrt_file: Path to the text file to parse.

    Returns:
        None. All results are written to disk.

    NOTE(review): the previous text contained the per-compound processing
    section twice and kept using ``cmpd`` after a failed lookup; this version
    assumes each CID is meant to be processed exactly once and failed CIDs
    are meant to be skipped. Confirm against the intended behaviour.
    """
    properties = [
        'cactvs_fingerprint', 'isomeric_smiles', 'xlogp',
        'rotatable_bond_count', 'charge', 'complexity', 'exact_mass',
        'fingerprint'
    ]

    cid_dict = {}
    with open(smrt_file, 'r') as f:
        state = ''
        num = 1
        for line in f:
            if state == 'Log CID':
                # Strip the line terminator so the CID is a clean identifier
                # for the lookup and for the pickled bookkeeping lists.
                key = line.strip()
                state = 'Wait for RT Time'
            if state == 'Log RT':
                cid_dict[key] = float(line)
                state = ''
                print('On Compound {}'.format(str(num)))
                num += 1
            if line.startswith('> <PUBCHEM_COMPOUND_CID>'):
                state = 'Log CID'
            if line.startswith('> <RETENTION_TIME>'):
                state = 'Log RT'

    list_of_df = []
    list_of_unprocessed = []
    list_of_processed = []

    for position, cid in enumerate(cid_dict, start=1):
        print('On compound {}'.format(position))
        try:
            cmpd = pcp.get_compounds(cid, 'cid')
        except Exception:
            cmpd = []
            # Presumably a back-off for service throttling; the 10 s pause is
            # kept from the original.
            time.sleep(10)

        if not cmpd:
            # Record the skip immediately so progress survives a crash, then
            # move on instead of featurising a stale or missing compound.
            list_of_unprocessed.append(cid)
            pd.DataFrame(list_of_unprocessed).to_pickle(
                'compounds_skipped.pickle')
            continue
        list_of_processed.append(cid)

        compound = cmpd[0]
        props = compound.to_dict(properties=properties)
        props['mol'] = Chem.MolFromSmiles(props['isomeric_smiles'])
        props['Name'] = compound.iupac_name
        props['System'] = 'SMRT DATA'
        props['RT'] = cid_dict[cid]

        # Both frames carry index label 0, so the column-wise concat yields
        # a single row.
        desc = np.array(fps_plus_mw(props['mol']))
        descdf = pd.DataFrame(desc).T
        newdf = pd.DataFrame(props, index=[0])
        list_of_df.append(pd.concat([descdf, newdf], axis=1))

        if len(list_of_df) >= 999:
            # File name uses position + 1 to keep the numbering the original
            # checkpoints produced (counter was incremented before saving).
            pd.concat(list_of_df).to_pickle(
                'compoundsupto' + str(position + 1) + '.pickle')
            list_of_df = []

    if list_of_df:
        # Flush the trailing partial batch; previously these rows were
        # dropped when the run ended between checkpoints.
        pd.concat(list_of_df).to_pickle(
            'compoundsupto' + str(len(cid_dict) + 1) + '.pickle')

    with open('processed.p', 'wb') as f:
        pickle.dump(list_of_processed, f)
    pd.DataFrame(list_of_unprocessed).to_pickle('compounds_skipped.pickle')