예제 #1
0
def generate_data(name, inchi, rt='null'):
    """Build a one-row feature table for a compound identified by InChI.

    The compound is looked up with ``pcp.get_compounds(inchi, 'inchi')``
    (``pcp`` is presumably PubChemPy -- confirm against the file's imports),
    and the first match is used.

    Args:
        name: Label stored in the ``Name`` column.
        inchi: InChI string passed to the compound lookup.
        rt: Value stored in the ``RT`` column; defaults to the string
            ``'null'``.

    Returns:
        A single-row ``pandas.DataFrame``: the values returned by
        ``fps_plus_mw`` for the molecule as the leading columns, followed by
        the requested compound properties plus ``mol``, ``RT`` and ``Name``.

    Raises:
        IndexError: If the lookup returns no compounds.
    """
    # One lookup is sufficient; the same request used to be sent twice.
    matches = pcp.get_compounds(inchi, 'inchi')
    if not matches:
        # Same exception type as indexing the empty result, clearer message.
        raise IndexError('no compound found for InChI {!r}'.format(inchi))

    props = matches[0].to_dict(properties=[
        'cactvs_fingerprint', 'isomeric_smiles', 'xlogp',
        'rotatable_bond_count', 'charge', 'complexity',
        'exact_mass', 'fingerprint'
    ])
    props['mol'] = Chem.MolFromSmiles(props['isomeric_smiles'])
    props['RT'] = rt
    props['Name'] = name

    # fps_plus_mw is defined elsewhere; its output is laid out as one row so
    # that it lines up with the single property row (both use index 0).
    descdf = pd.DataFrame(np.array(fps_plus_mw(props['mol']))).T
    newdf = pd.DataFrame(props, index=[0])
    return pd.concat([descdf, newdf], axis=1)
def Join_Mummichog_Matches_Molecular_Features(result_df, file_dir):
    """Attach molecular features to each distinct compound in ``result_df``.

    Every distinct ``'Compound Name'`` is looked up once with
    ``pcp.get_compounds(name, 'name')`` (``pcp`` is presumably PubChemPy --
    confirm against the file's imports). The first match supplies the
    compound properties, which are combined with the output of
    ``fps_plus_mw`` into one row indexed by the source row's label.

    Compounds whose lookup fails or returns no match are skipped with a
    ``'line bypassed'`` message. Previously such a failure silently reused
    the properties of the last successful compound under the new name.

    Args:
        result_df: DataFrame with ``'Compound Name'`` and ``'rtmin'``
            columns.
        file_dir: Directory prefix; the result is pickled to
            ``file_dir + 'RT_Folder\\mummichog_rt_features.p'``.

    Returns:
        DataFrame with one row per successfully resolved compound. It is
        empty when nothing could be resolved.
    """
    wanted_properties = [
        'cactvs_fingerprint', 'isomeric_smiles', 'xlogp',
        'rotatable_bond_count', 'charge', 'complexity',
        'exact_mass', 'fingerprint'
    ]
    seen_names = set()
    feature_rows = []

    for index, row in result_df.iterrows():
        name = row['Compound Name']
        if name in seen_names:
            continue
        # Recorded before the lookup so a failing name is not retried.
        seen_names.add(name)

        try:
            cmpd = pcp.get_compounds(name, 'name')
            props = cmpd[0].to_dict(properties=wanted_properties)
        except Exception:
            # Best-effort: lookup errors and empty results (IndexError) both
            # skip this row instead of falling through with stale data.
            print('line bypassed')
            continue

        props['mol'] = Chem.MolFromSmiles(props['isomeric_smiles'])
        props['RT'] = row['rtmin']
        props['Name'] = name
        props['System'] = 'xcms'

        # Both halves carry the same index label so the column-wise concat
        # aligns them into a single row.
        descdf = pd.DataFrame(np.array(fps_plus_mw(props['mol']))).T
        descdf.index = [index]
        newdf = pd.DataFrame(props, index=[index])
        feature_rows.append(pd.concat([descdf, newdf], axis=1))

    # One concat at the end replaces repeated DataFrame.append calls (removed
    # in pandas 2.0) and no longer depends on a row labelled 0 existing.
    finaldf = pd.concat(feature_rows) if feature_rows else pd.DataFrame()

    # The Windows-style separator is kept so the output location is unchanged.
    finaldf.to_pickle(file_dir + 'RT_Folder\\mummichog_rt_features.p')
    return finaldf
예제 #3
0
def _parse_smrt_retention_times(smrt_file):
    """Map each CID in ``smrt_file`` to the retention time that follows it.

    The line after a ``> <PUBCHEM_COMPOUND_CID>`` tag is taken as the CID and
    the line after a ``> <RETENTION_TIME>`` tag as its retention time.
    """
    cid_dict = {}
    with open(smrt_file, 'r') as f:
        state = ''
        num = 1
        for line in f:
            if state == 'Log CID':
                # Strip the line ending so it does not become part of the key
                # (and of every identifier sent to the lookup service).
                key = line.strip()
                state = 'Wait for RT Time'
            if state == 'Log RT':
                cid_dict[key] = float(line)
                state = ''
                print('On Compound {}'.format(str(num)))
                num += 1
            if line.startswith('> <PUBCHEM_COMPOUND_CID>'):
                state = 'Log CID'
            elif line.startswith('> <RETENTION_TIME>'):
                state = 'Log RT'
    return cid_dict


def _save_smrt_checkpoint(frames, processed_cids, num):
    """Pickle the accumulated feature rows and the list of processed CIDs."""
    pd.concat(frames).to_pickle('compoundsupto' + str(num) + '.pickle')
    with open('processed.p', 'wb') as f:
        pickle.dump(processed_cids, f)


def SMRT_Database_Processor(smrt_file):
    """Build feature tables for every compound listed in an SMRT file.

    CIDs and retention times are read from ``smrt_file``; each CID is looked
    up with ``pcp.get_compounds(cid, 'cid')`` (``pcp`` is presumably
    PubChemPy -- confirm against the file's imports) and turned into a
    one-row DataFrame of ``fps_plus_mw`` output plus compound properties.

    Files written to the current working directory:
        * ``compoundsupto<N>.pickle`` -- a batch of feature rows, written
          after every 999 successful compounds and once more at the end for
          the remaining partial batch (previously that last batch was
          dropped).
        * ``processed.p`` -- pickled list of CIDs whose lookup succeeded,
          refreshed with every batch.
        * ``compounds_skipped.pickle`` -- CIDs that could not be resolved.

    CIDs whose lookup raises or returns nothing are recorded as skipped and
    the loop moves on; previously execution fell through and reused the
    previous compound's data (or crashed).

    Args:
        smrt_file: Path to the SMRT text file to parse.

    Returns:
        None.
    """
    cid_dict = _parse_smrt_retention_times(smrt_file)

    num = 1
    saving_count = 1
    list_of_df = []
    list_of_unprocessed = []
    list_of_processed = []
    for cid in cid_dict:
        print('On compound {}'.format(num))
        # num tracks the position in the input, so it advances for skipped
        # compounds as well.
        num += 1

        try:
            cmpd = pcp.get_compounds(cid, 'cid')
        except Exception:
            list_of_unprocessed.append(cid)
            # Pause before the next request after a failed lookup.
            time.sleep(10)
            pd.DataFrame(list_of_unprocessed).to_pickle(
                'compounds_skipped.pickle')
            continue
        if not cmpd:
            # Nothing matched this CID; record it without waiting.
            list_of_unprocessed.append(cid)
            pd.DataFrame(list_of_unprocessed).to_pickle(
                'compounds_skipped.pickle')
            continue
        list_of_processed.append(cid)

        compound = cmpd[0]
        props = compound.to_dict(properties=[
            'cactvs_fingerprint', 'isomeric_smiles', 'xlogp',
            'rotatable_bond_count', 'charge', 'complexity', 'exact_mass',
            'fingerprint'
        ])
        props['mol'] = Chem.MolFromSmiles(props['isomeric_smiles'])
        props['Name'] = compound.iupac_name
        props['System'] = 'SMRT DATA'
        props['RT'] = cid_dict[cid]

        # Both halves use index 0 so the column-wise concat yields one row.
        descdf = pd.DataFrame(np.array(fps_plus_mw(props['mol']))).T
        newdf = pd.DataFrame(props, index=[0])
        list_of_df.append(pd.concat([descdf, newdf], axis=1))

        saving_count += 1
        if saving_count > 999:
            _save_smrt_checkpoint(list_of_df, list_of_processed, num)
            saving_count = 1
            list_of_df = []

    if list_of_df:
        # Flush the rows gathered since the last full batch.
        _save_smrt_checkpoint(list_of_df, list_of_processed, num)

    pd.DataFrame(list_of_unprocessed).to_pickle('compounds_skipped.pickle')
예제 #4
0
#    props = cmpd[0].to_dict(properties=['cactvs_fingerprint',
#                        'isomeric_smiles', 'xlogp', 'rotatable_bond_count','charge','complexity',
#                        'exact_mass','fingerprint'])
    name = cmpd[0].iupac_name
    rt = cid_dict[cid]
    props = cmpd[0].to_dict(properties=[
        'cactvs_fingerprint', 'isomeric_smiles', 'xlogp',
        'rotatable_bond_count', 'charge', 'complexity', 'exact_mass',
        'fingerprint'
    ])
    smiles = props['isomeric_smiles']
    props['mol'] = Chem.MolFromSmiles(smiles)
    props['Name'] = name
    props['System'] = 'SMRT DATA'
    props['RT'] = rt
    desc = np.array(fps_plus_mw(props['mol']))
    descdf = pd.DataFrame(desc)
    descdf = descdf.T
    descdf.reindex([num])
    newdf = pd.DataFrame(props, index=[0])
    finaldf = pd.concat([descdf, newdf], axis=1)
    list_of_df.append(finaldf)
    num += 1
    saving_count += 1
    if saving_count > 999:
        final_df = pd.concat(list_of_df)
        final_df.to_pickle('compoundsupto' + str(num) + '.pickle')
        saving_count = 1
        list_of_df = []
        with open('processed.p', 'wb') as f:
            pickle.dump(list_of_processed, f)