def updatedatabase(fingerprintdf):
    """Write new descriptor strings into qsar.compound_descriptor_sets.

    fingerprintdf: DataFrame with a 'dsstox_compound_id' column followed by
    the replacement descriptor_string_tsv values (accessed positionally).
    Targets descriptor set 1448.
    """
    username = '******'
    descriptor_set_id = 1448  # fk_descriptor_set_id of the set being updated

    # Query for compound_descriptor_sets.id keyed by dsstox_compound_id
    mysession = SQLSession(Schemas.qsar_schema).get_session()
    id_query = mysession.query(Compounds.dsstox_compound_id, CompoundDescriptorSets.id) \
        .join(CompoundDescriptorSets,
              Compounds.id == CompoundDescriptorSets.efk_dsstox_compound_id) \
        .filter(CompoundDescriptorSets.fk_descriptor_set_id == descriptor_set_id)
    id_df = pd.DataFrame(list(id_query))

    # join query with fingerprintdf so each row carries its target table id
    mytable = pd.merge(id_df, fingerprintdf, on='dsstox_compound_id')

    # # # CODE FOR UPDATING # # #
    for _, row in mytable.iterrows():
        cds_id = row[1]                  # compound_descriptor_sets.id (renamed from `id`: builtin shadowing)
        descriptor_string_tsv = row[2]   # replacement descriptor string
        mysession.query(CompoundDescriptorSets.id,
                        CompoundDescriptorSets.descriptor_string_tsv,
                        CompoundDescriptorSets.updated_by) \
            .filter(CompoundDescriptorSets.id == cds_id) \
            .update({"descriptor_string_tsv": descriptor_string_tsv,
                     "updated_by": username})
        # per-row commit preserved from the original (slow but restart-safe)
        mysession.commit()
def update_toxprint_database(fingerprintdf):
    """Write toxprint strings (descriptor set 1445) into
    qsar.compound_descriptor_sets.

    fingerprintdf: indexed by 'M_NAME', whose tail after 8 characters is the
    numeric dsstox compound id (assumption inherited from the original --
    TODO confirm); remaining columns are individual fingerprint bits, joined
    here into one tab-separated string per compound.
    """
    username = '******'
    descriptor_set_id = 1445  # toxprints

    # Query for compound_descriptor_sets.id keyed by compound id
    mysession = SQLSession(Schemas.qsar_schema).get_session()
    id_query = mysession.query(CompoundDescriptorSets.id,
                               CompoundDescriptorSets.efk_dsstox_compound_id) \
        .filter(CompoundDescriptorSets.fk_descriptor_set_id == descriptor_set_id)
    id_df = pd.DataFrame(list(id_query))

    # join query with fingerprintdf
    fingerprintdf = fingerprintdf.reset_index()
    fingerprintdf = fingerprintdf.rename(
        columns={'M_NAME': 'efk_dsstox_compound_id'})  # not sure this will work
    # strip the 8-char prefix to recover the integer compound id
    fingerprintdf['efk_dsstox_compound_id'] = [
        int(x[8:]) for x in fingerprintdf['efk_dsstox_compound_id']
    ]
    mytable = pd.merge(id_df, fingerprintdf, on='efk_dsstox_compound_id')

    # # # CODE FOR UPDATING # # #
    for _, row in mytable.iterrows():
        cds_id = str(row[0])  # renamed from `id`: builtin shadowing
        descriptor_string_tsv = '\t'.join(str(x) for x in row[2:])
        mysession.query(CompoundDescriptorSets.id,
                        CompoundDescriptorSets.descriptor_string_tsv,
                        CompoundDescriptorSets.updated_by) \
            .filter(CompoundDescriptorSets.id == cds_id) \
            .update({"descriptor_string_tsv": descriptor_string_tsv,
                     "updated_by": username})
        # per-row commit preserved from the original
        mysession.commit()
def mk_special_fingerprints(idrow):
    """Generate combination ("special") toxprints for the compounds in idrow.

    idrow: iterable of dsstox_compound_id values.
    Returns a DataFrame with columns 'dsstox_compound_id' and
    'Special_Toxprints' (a tab-separated string of 0/1 flags).
    """
    # query existing toxprints (descriptor set 1445)
    dsi = 1445
    mysession = SQLSession(Schemas.qsar_schema).get_session()
    query2 = mysession.query(Compounds.dsstox_compound_id,
                             CompoundDescriptorSets.descriptor_string_tsv) \
        .join(CompoundDescriptorSets,
              Compounds.id == CompoundDescriptorSets.efk_dsstox_compound_id) \
        .filter(CompoundDescriptorSets.fk_descriptor_set_id == dsi)
    df2 = pd.DataFrame(list(query2))

    # FILTERING BY A LIST OF >1000 WON'T WORK IN MANY DATABASES
    # (this was breaking the script on full update), so do a full query
    # then merge down to the requested ids afterwards
    df2 = pd.merge(pd.DataFrame(idrow, columns=['dsstox_compound_id']),
                   df2, on='dsstox_compound_id')

    # split the tab-separated string into one column per toxprint bit
    df2 = pd.concat(
        [df2, df2['descriptor_string_tsv'].str.split('\t', expand=True)],
        axis=1)
    df2 = df2.drop('descriptor_string_tsv', axis=1)

    # # # GENERATE SPECIFIC NEW FINGERPRINTS # # #
    df2['Special_Toxprints'] = ""

    # BUG FIX: the original assigned into the `row` Series yielded by
    # iterrows(), which is a copy -- the DataFrame was never updated and the
    # output column stayed empty. Build the flag string locally and write it
    # back with .at instead.
    for index, row in df2.iterrows():
        # have to code each special toxprint this way; columns 480 and 489
        # are the toxprint bit columns being combined
        flags = '1' if (row[480] == '1' and row[489] == '1') else '0'
        # tabs separate the remaining special toxprints
        flags += '\t1' if (row[489] == '1' and row[480] == '0') else '\t0'
        flags += '\t1' if (row[480] == '1' and row[489] == '0') else '\t0'
        df2.at[index, 'Special_Toxprints'] = flags

    # remove everything but fingerprints and DTXCIDs
    output_df = df2[['dsstox_compound_id', 'Special_Toxprints']]
    return output_df
def check(f):
    """Return compounds in invitrodb that lack ToxcastFPs (descriptor set 1449).

    Returns (missing_df, all_df). When nothing is missing: exits if f is
    False, otherwise returns an empty first frame.
    """
    mysession = SQLSession(Schemas.qsar_schema).get_session()
    # pull compound ids in invitrodb
    CID1 = mysession.execute(
        'SELECT compounds.id FROM invitrodb.mc5'
        ' JOIN invitrodb.mc4 ON mc4.m4id = mc5.m4id'
        ' JOIN invitrodb.sample ON sample.spid = mc4.spid'
        ' JOIN ro_stg_dsstox.generic_substances ON generic_substances.id = sample.chid'
        ' JOIN ro_stg_dsstox.generic_substance_compounds ON generic_substance_compounds.fk_generic_substance_id = generic_substances.id'
        ' JOIN ro_stg_dsstox.compounds ON compounds.id = generic_substance_compounds.fk_compound_id'
    )
    CID1 = set([x[0] for x in list(CID1)])

    # pull compound ids in compound_descriptor_sets
    CID2 = mysession.query(CompoundDescriptorSets.efk_dsstox_compound_id) \
        .filter(CompoundDescriptorSets.fk_descriptor_set_id == '1449')
    CID2 = [x[0] for x in list(CID2)]

    # # # COMPOUNDS.ID VALUES THAT DON'T HAVE SPECIAL ToxcastFPs # # #
    CID3 = list(CID1 - set(CID2))

    # # # IF NOTHING IS MISSING, EITHER EXIT OR RETURN AN EMPTY FRAME # # #
    # ROBUSTNESS: pd.DataFrame(<set>) raises in modern pandas -- pass a list
    if not CID3 and f is False:
        print(Fore.RED + 'SQL query is empty: All ToxcastFP available or no DTXCIDs')
        sys.exit(0)
    elif not CID3 and f is True:
        return pd.DataFrame([]), pd.DataFrame(list(CID1))
    else:
        return pd.DataFrame(CID3), pd.DataFrame(list(CID1))
def fillnewenrich(x_aeid):
    """Fetch the dataset stored under the name *x_aeid* and hand it to
    filluc() to build the enrichment table."""
    print(x_aeid)
    # retrieve the latest dataset for the aeid
    session = SQLSession(Schemas.qsar_schema).get_session()
    rows = session.query(Compounds.dsstox_compound_id,
                         Datapoints.measured_value_dn,
                         Datasets.name,
                         Datasets.id) \
        .join(Datapoints, Datapoints.efk_dsstox_compound_id == Compounds.id) \
        .join(DatasetDatapoints, Datapoints.id == DatasetDatapoints.fk_datapoint_id) \
        .join(Datasets, DatasetDatapoints.fk_dataset_id == Datasets.id) \
        .filter(Datasets.name == x_aeid)
    dataset_df = pd.DataFrame(list(rows))
    dataset_df.columns = ['dsstox_compound_id', 'hitc', 'name', 'dataset_id']
    # every row belongs to the same dataset, so take the id from the first
    dataset_id = dataset_df['dataset_id'].iloc[0]
    # make the enrichment table
    filluc(dataset_df, dataset_id)
from database.dsstox.compounds import Compounds
from database.qsar.compound_descriptors import CompoundDescriptors
# BUG FIX: `Schemas` is used below but was never imported in this chunk
from database.database_schemas import Schemas
from database.session import SQLSession
from database.dsstox.generic_substance_compounds import GenericSubstanceCompounds
from database.dsstox.generic_substances import GenericSubstances
import pandas as pd

# Input sheet: the 5th column holds the DTXSIDs of interest
mytable = pd.read_csv('~/Desktop/Katies_data.tsv', sep='\t')
dtxsid = mytable.iloc[:, 4]

# Map DTXSID -> compound id / DTXCID
mysession = SQLSession(Schemas.qsar_schema).get_session()
query = mysession.query(GenericSubstances.dsstox_substance_id,
                        Compounds.id,
                        Compounds.dsstox_compound_id) \
    .join(GenericSubstanceCompounds) \
    .join(Compounds) \
    .filter(GenericSubstances.dsstox_substance_id.in_(dtxsid))
df = pd.DataFrame(list(query))
myids = [int(x) for x in df.iloc[:, 1]]

# Descriptor strings for those compounds (all descriptor sets)
query2 = mysession.query(CompoundDescriptors.efk_dsstox_compound_id,
                         CompoundDescriptors.descriptor_string_tsv,
                         CompoundDescriptors.fk_descriptor_set_id) \
    .filter(CompoundDescriptors.efk_dsstox_compound_id.in_(myids))
df2 = pd.DataFrame(list(query2))
df2 = df2.rename(columns={"efk_dsstox_compound_id": 'id'})
from database.qsar.compound_descriptor_sets import CompoundDescriptorSets
from database.dsstox.compounds import Compounds
from database.database_schemas import Schemas
from database.session import SQLSession

# Ad-hoc check: dump the toxprint (descriptor set 1445) strings per DTXCID.
mysession = SQLSession(Schemas.qsar_schema).get_session()
x_aeid = 926
query3 = mysession.query(Compounds.dsstox_compound_id,
                         CompoundDescriptorSets.descriptor_string_tsv) \
    .join(CompoundDescriptorSets,
          Compounds.id == CompoundDescriptorSets.efk_dsstox_compound_id) \
    .filter(CompoundDescriptorSets.fk_descriptor_set_id == 1445)
print(list(query3))
from database.dsstox.compounds import Compounds
from database.dsstox.generic_substance_compounds import GenericSubstanceCompounds
from database.dsstox.generic_substances import GenericSubstances
from database.invitrodb.mc4 import Mc4
from database.invitrodb.mc5 import Mc5
from database.invitrodb.sample import Sample
from database.session import SQLSession

# GET ALL ASSAYS FROM MC5
# QUERY MC5 data for hitcalls and chemical IDs
# NOTE(review): `Schemas`, `pd` and `os` are used below but not imported in
# this chunk -- presumably supplied elsewhere in the original file.
mysession = SQLSession(Schemas.information_schema).get_session()
# DTXCID + hit call + assay endpoint id, joined through the
# substance/sample linkage tables
query0 = mysession.query(
    Compounds.dsstox_compound_id,
    Mc5.hitc,
    Mc5.aeid,) \
    .join(GenericSubstanceCompounds, Compounds.id == GenericSubstanceCompounds.fk_compound_id) \
    .join(GenericSubstances, GenericSubstances.id == GenericSubstanceCompounds.fk_generic_substance_id) \
    .join(Sample, Sample.chid == GenericSubstances.id) \
    .join(Mc4, Mc4.spid == Sample.spid) \
    .join(Mc5, Mc5.m4id == Mc4.m4id)
mc5_table = pd.DataFrame(list(query0))
# print(mc5_table.shape)
# print( mc5_table[mc5_table['aeid']==1086] )
# sys.exit(1)


# run xgboost for files in test_aeids
def makeXGB(myaeid):
    # NOTE(review): this function is truncated in the visible source -- the
    # `try` body continues beyond this chunk.
    try:
        # output print statements into file
        os.makedirs("/home/rlougee/Desktop/xgb_results/" + str(myaeid))
def filluc(invitrodbdf, mydatasetid):
    """Compute and store univariate enrichment statistics for one dataset.

    invitrodbdf: DataFrame with 'dsstox_compound_id' and 'hitc' columns.
    For each descriptor set (toxprints, MACCS, pubchem, special toxprints)
    this fills fingerprints, runs enrich(), then inserts one
    UnivariateCalculations row per descriptor and one UcStatistics row per
    statistic. Exits the process on any stage failure.
    """
    # set starttime
    starttime = time.time()
    username = '******'
    descriptor_set_id = [1445, 1447, 1446, 1448]
    # descriptor_set_id = [1448] # <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
    # 1445 = toxprints
    # 1446 = MACCS
    # 1447 = pubchem
    # 1448 = Special Toxprints
    for id in descriptor_set_id:
        # add enrichment table to the mysql database
        try:
            filled_table = handle_duplicates(
                invitrodbdf.loc[:, ['dsstox_compound_id', 'hitc']])
        except:
            print("DUPLICATE HANDLER FAILED: {}".format(mydatasetid))
            sys.exit(1)
        try:
            filled_table = fillfp(filled_table, id)
        except:
            print("FILLFP FAILED: {}".format(mydatasetid))
            sys.exit(1)
        # filled_table = pd.DataFrame(filled_table)
        try:
            my_enrichment_table = enrich(filled_table)
        except:
            print("ENRICH FAILED: {}".format(mydatasetid))
            print(filled_table.head())
            sys.exit(1)
        # add fk_descriptor_id ### CHECK THAT THESE ARE MATCHING! ###
        # assumes Descriptors rows come back in the same order as the
        # enrichment table rows -- TODO confirm
        mysession2 = SQLSession(Schemas.qsar_schema).get_session()
        query3 = mysession2.query(Descriptors.id).filter(Descriptors.fk_descriptor_set_id == id)
        query3 = list(query3)
        query3 = [int(i[0]) for i in query3]
        my_enrichment_table.insert(0, 'fk_descriptor_id', query3)
        for index, row in my_enrichment_table.iterrows():
            fk_dataset_id = int(mydatasetid)
            fk_descriptor_id = int(row['fk_descriptor_id'])
            univariate_calc = UnivariateCalculations(fk_dataset_id=fk_dataset_id,
                                                     fk_descriptor_id=fk_descriptor_id,
                                                     updated_by=username,
                                                     created_by=username)
            mysession2.add(univariate_calc)
            mysession2.flush()
            fk_univ_calc_id = int(univariate_calc.id)
            ### NEED TO CHANGE for loop & stat_list IF THE STATISTICS ARE CHANGED IN Enrichment_Table_Generator ###
            count = 0
            for i in row[1:]:
                # map non-finite statistics to NULL / a large sentinel value
                if math.isnan(i):
                    value = None
                elif math.isinf(i):
                    value = 99999999.9
                else:
                    value = float(i)
                # positional map from enrichment-table column -> statistic id
                stat_list = [9, 10, 11, 12, 13, 4, 8, 7, 14, 15]
                fk_statistic_id = int(stat_list[count])
                uc_statistics = UcStatistics(value=value,
                                             fk_univ_calc_id=int(fk_univ_calc_id),
                                             fk_statistic_id=int(fk_statistic_id),
                                             created_by=username,
                                             updated_by=username)
                mysession2.add(uc_statistics)
                count += 1
            # NOTE(review): original indentation was lost; the commit is
            # placed per enrichment row here -- confirm against history.
            mysession2.commit()
    # mysession2.close()
    endtime = time.time()
    print('run time:{}'.format(endtime - starttime))
# Plot one toxprint-enrichment bar chart per assay name in `wowie`.
# NOTE(review): `wowie` (DataFrame with 'name' and 'descriptors_name'
# columns) and `plt` are defined earlier in the original script.
# Hoisted out of the loop: importing per-iteration was wasted work.
from zMisc_Code.DATA_VISUALIZATION.barplot import barplot

for i in wowie['name'].unique():
    bla = wowie[wowie['name'] == i]
    blabla = pd.DataFrame(bla.iloc[:, 1:])
    print(blabla.head(), blabla.shape)

    # get Txp index so bars can be ordered by toxprint number
    mysession = SQLSession(Schemas.qsar_schema).get_session()
    query3 = mysession.query(Descriptors.index_number,
                             Descriptors.descriptors_name) \
        .filter(Descriptors.fk_descriptor_set_id == 1445)
    descriptornames = pd.DataFrame(list(query3))

    # sort by TXP number; renamed from `sorted`, which shadowed the builtin
    sorted_df = pd.merge(descriptornames, blabla, on='descriptors_name')
    sorted_df = sorted_df.drop('index_number', axis=1)

    name = i.split('_')[0]
    barplot(sorted_df.iloc[:, 1], sorted_df.iloc[:, 0], name)
    plt.tight_layout()
    plt.savefig('/home/rlougee/Desktop/images/{}.png'.format(name))
def cli():
    ### HELP DOCUMENTATION ###
    """
    checks for updates for AEIDS (new compounds / new endpoints / new AEIDS)

    if there are any updates qsar.datapoints, qsar.datasets, and qsar.datasetdatapoints are updated to include this information
    """
    ####################################################################################################################
    # QUERY MC5 data for hitcalls and chemical IDs
    mysession = SQLSession(Schemas.information_schema).get_session()
    query0 = mysession.query(Compounds.id, Compounds.dsstox_compound_id, Mc5.hitc, Mc5.aeid) \
        .join(GenericSubstanceCompounds, Compounds.id == GenericSubstanceCompounds.fk_compound_id) \
        .join(GenericSubstances, GenericSubstances.id == GenericSubstanceCompounds.fk_generic_substance_id) \
        .join(Sample, Sample.chid == GenericSubstances.id) \
        .join(Mc4, Mc4.spid == Sample.spid) \
        .join(Mc5, Mc5.m4id == Mc4.m4id)
    mc5_table = pd.DataFrame(list(query0))

    ####################################################################################################################
    def filldatasets(invitrodbdf, fd_aeid):
        """Create a Datasets row named 'aeid:<aeid>_<YYYYMMDD>' and one
        Datapoints / DatasetDatapoints pair per compound hit call."""
        username = '******'
        # create a new datasets name entry
        datasets_name = str('aeid:{}_{}'.format(
            fd_aeid, datetime.datetime.today().strftime("%Y%m%d")))
        description = "The set of hit calls from the toxcast AEID: {} taken on the date:{}" \
            .format(fd_aeid, datetime.datetime.today().strftime("%Y%m%d"))
        datasets = Datasets(name=datasets_name,
                            label=datasets_name,
                            updated_by=username,
                            created_by=username,
                            long_description=description,
                            short_description=description)
        mysession.add(datasets)
        mysession.flush()
        fk_dataset_id = int(datasets.id)
        # add datatable to the mysql database
        for index, row in invitrodbdf.iterrows():
            efk_dsstox_compound_id = row.loc['id']
            efk_chemprop_measured_property_id = None  # leave null -CG  # not nullable
            measured_value_dn = row.loc['hitc']
            datapoints = Datapoints(
                efk_dsstox_compound_id=efk_dsstox_compound_id,
                efk_chemprop_measured_property_id=efk_chemprop_measured_property_id,
                measured_value_dn=measured_value_dn,
                created_by=username,
                updated_by=username)
            mysession.add(datapoints)
            mysession.flush()
            fk_datapoint_id = int(datapoints.id)
            dataset_datapoints = DatasetDatapoints(fk_dataset_id=fk_dataset_id,
                                                   fk_datapoint_id=fk_datapoint_id,
                                                   updated_by=username,
                                                   created_by=username)
            mysession.add(dataset_datapoints)
        mysession.commit()

    ####################################################################################################################
    ### CHECK 1) IF TABLE EXISTS FOR AEID 2) IF THE TABLE HAS CHANGED
    for x_aeid in mc5_table.aeid.unique():
        # query latest dataset names for this aeid
        aeid_query = mysession.query(Datasets.name) \
            .filter(Datasets.name.like("aeid:{}/_%".format(str(x_aeid)), escape='/'))
        aeid_query = [x[0] for x in list(aeid_query)]

        # get the latest values for aeid
        new_df = mc5_table[mc5_table['aeid'].isin([x_aeid])]

        if not aeid_query or aeid_query == ['']:
            print("New AEID, filling mysql database for aeid: {}".format(x_aeid))
            filldatasets(new_df, x_aeid)
        else:
            # find and retrieve the newest dataset name
            aeid_query_dates = [x.split('_')[1] for x in aeid_query]
            # BUG FIX: sorted(...)[0] selected the OLDEST date even though the
            # newest dataset is wanted; take the last element instead
            newest_aeid_date = sorted(aeid_query_dates)[-1]
            newest_aeid = [x for x in aeid_query if str(newest_aeid_date) in x]

            # pull the stored table and compare with the fresh one
            old_df = mysession.query(Datapoints.efk_dsstox_compound_id,
                                     Datapoints.measured_value_dn) \
                .join(DatasetDatapoints, Datapoints.id == DatasetDatapoints.fk_datapoint_id) \
                .join(Datasets, DatasetDatapoints.fk_dataset_id == Datasets.id) \
                .filter(Datasets.name == newest_aeid[0])
            old_df = pd.DataFrame(list(old_df))

            ## FORMAT DFs FOR COMPARING
            old_df.columns = ['id', 'hitc']
            my_new_df = new_df.loc[:, ['id', 'hitc']]
            old_df['hitc'] = old_df['hitc'].astype(int)
            # sort and reindex so equals() compares content, not row order
            my_new_df = my_new_df.sort_values(['id', 'hitc']).reset_index(drop=True)
            old_df = old_df.sort_values(['id', 'hitc']).reset_index(drop=True)

            if my_new_df.equals(old_df):
                print("no change for aeid: {}".format(x_aeid))
            else:
                print("Update, filling mysql database for aeid: {}".format(x_aeid))
                filldatasets(new_df, x_aeid)
# (historical scratch: column renaming + per-dataset filluc() call, removed)

####################################################################################################################
# create a connection pool for multiprocessing with mysqlalchemy
if __name__ == '__main__':
    ### QUERY THE MYSQL DB 4 A COMPLETE LIST OF AEIDS, ENDPOINTS & DTXCIDS ###
    mysession = SQLSession(Schemas.qsar_schema).get_session()

    # dataset ids that already have univariate enrichment rows
    query0 = pd.DataFrame(list(mysession.query(UnivariateCalculations.fk_dataset_id)))
    if not query0.empty:
        query0 = query0.fk_dataset_id.unique()

    # query the full set of data (slow, ~3.5 minutes)
    query1 = mysession.query(Compounds.dsstox_compound_id,
                             Datapoints.measured_value_dn,
                             Datasets.name,
                             Datasets.id) \
        .join(Datapoints, Datapoints.efk_dsstox_compound_id == Compounds.id) \
        .join(DatasetDatapoints, Datapoints.id == DatasetDatapoints.fk_datapoint_id) \
        .join(Datasets, DatasetDatapoints.fk_dataset_id == Datasets.id)
    query1 = pd.DataFrame(list(query1))
def main():
    """CLI entry point: convert a chemical-ID column (<InID>) in a .tsv to
    another ID type (<OutID>) via the dsstox tables.

    Reads the table from <tsv_file> or stdin; writes to --output or stdout.
    """
    args = docopt(__doc__)
    # print(args)
    if args['--version']:
        print('NCCT CLI: Version 0.0.0')
        sys.exit(0)

    # set input arguments and options to variables
    InID = args['<InID>'].lower()
    OutID = args['<OutID>'].lower()
    tsv_input = args['<tsv_file>']
    o = args['--output']
    noerror = args['--noerror']

    # validate the requested ID types
    if InID not in acceptedID:
        print(Fore.RED + 'Invalid InID: {}\n InID must be {}'.format(InID, acceptedID))
        sys.exit(1)
    if OutID not in acceptedID:
        # BUG FIX: this message previously reported "Invalid InID" with the
        # InID value even though it is the OutID check that failed
        print(Fore.RED + 'Invalid OutID: {}\n OutID must be {}'.format(OutID, acceptedID))
        sys.exit(1)

    # creates table of .tsv file; takes stdin if argument is not given
    if not tsv_input:
        tsv_input = sys.stdin.read()
        mytable = pd.read_csv(StringIO(tsv_input), sep="\t")
    else:
        mytable = pd.read_csv(tsv_input, sep="\t")

    # the chemical ID column is assumed to be the first column
    idrow = mytable.iloc[:, 0]
    colname = mytable.columns.values[0]

    # make an SQL query table for relevant SIDs & CIDs
    mysession = SQLSession(Schemas.dsstox_schema).get_session()
    query = mysession.query(GenericSubstances.dsstox_substance_id,
                            GenericSubstances.casrn,
                            Compounds.dsstox_compound_id) \
        .join(GenericSubstanceCompounds,
              GenericSubstanceCompounds.fk_generic_substance_id == GenericSubstances.id) \
        .join(Compounds, Compounds.id == GenericSubstanceCompounds.fk_compound_id)
    df = pd.DataFrame(list(query))
    df.columns = ['dtxsid', 'casrn', 'dtxcid']

    idrow = pd.DataFrame(idrow)
    idrow.columns = [InID]

    # do a join to filter out unwanted IDs
    if InID == 'dtxcid':
        df = pd.merge(idrow, df, on='dtxcid', how='inner')
    elif InID == 'casrn':
        df = pd.merge(idrow, df, on='casrn', how='inner')
    elif InID == 'dtxsid':
        df = pd.merge(idrow, df, on='dtxsid', how='inner')
    df = df.drop_duplicates(InID)

    # if no DTXCIDs returned
    if df.empty and not noerror:
        print(Fore.RED + "Error: No valid {} or no associated {}\n{}".format(
            InID, OutID, list(idrow)))
        sys.exit(1)
    elif df.empty:
        sys.exit(1)

    # creates a list of unconverted IDs (reported at the end)
    noid = list(set(idrow.iloc[:, 0]) - set(list(df.iloc[:, 0])))

    # creates new CID table
    mytable = mytable.rename(columns={colname: InID})
    mytable = pd.merge(df, mytable, on=InID, how='left')
    for i in acceptedID:
        if i != OutID:
            mytable = mytable.drop(columns=i)  # modernized from drop(i, 1)
    outputtable = mytable

    # generates a string with tab separation and line breaks for row ends
    columnnames = mytable.columns.values
    output = ''
    for i in columnnames:
        output += i + '\t'
    output += '\n'
    for rowvals in mytable.values.tolist():
        output += '\t'.join(str(x) for x in rowvals) + '\n'

    # output options: stdout (minus trailing newline) or --output file
    if not o:
        print(output[:-1])  # simplified from output[:int(-1)]
    else:
        outputtable.to_csv(o, sep='\t', index=False)

    # ERROR message; not actual STDERR, this is for the user
    if not noerror:
        print(Fore.RED + "Error: Invalid {} or no associated {}\n{}".format(
            InID, OutID, noid))
def fillfp(mytable, dsi):
    """Append fingerprint columns (descriptor set *dsi*) to a table keyed by
    DTXCID.

    The DTXCID column may be the table index or one of the first two columns.
    Returns the merged table with one named column per descriptor.
    Raises ValueError if no DTXCID column is found.
    """
    # checks the index, and first two columns for DTXCIDs
    # input table should be in the correct format already
    # (later matches deliberately override earlier ones, as in the original)
    idrow = None
    colname = None
    try:
        if mytable.iloc[0, 0][0:6] == 'DTXCID':
            idrow = mytable.iloc[:, 0]
            colname = mytable.columns.values[0]
    except Exception:
        pass
    try:
        if mytable.iloc[0, 1][0:6] == 'DTXCID':
            idrow = mytable.iloc[:, 1]
            # BUG FIX: previously took column 0's name even though the
            # DTXCIDs live in column 1, breaking the rename/merge below
            colname = mytable.columns.values[1]
    except Exception:
        pass
    try:
        if mytable.index.values[0][0:6] == 'DTXCID':
            idrow = mytable.index.values
            mytable.index.name = 'DTXCID'
            colname = mytable.index.name
    except Exception:
        pass
    if idrow is None:
        # ROBUSTNESS: previously this fell through to a NameError below
        raise ValueError('fillfp: no DTXCID column found in index or first two columns')

    dsi = int(dsi)

    ####################################################################################################################
    # DTXCID -> descriptor string for the requested descriptor set
    mysession1 = SQLSession(Schemas.dsstox_schema).get_session()
    query2 = mysession1.query(Compounds.dsstox_compound_id,
                              CompoundDescriptorSets.descriptor_string_tsv) \
        .join(CompoundDescriptorSets,
              Compounds.id == CompoundDescriptorSets.efk_dsstox_compound_id) \
        .filter(CompoundDescriptorSets.fk_descriptor_set_id == dsi)
    df2 = pd.DataFrame(list(query2))

    # keep only the requested compounds
    # ROBUSTNESS: wrap in pd.Series -- when the ids came from the index,
    # idrow is a numpy array, which has no .unique()
    idrow = pd.DataFrame(pd.Series(idrow).unique())
    idrow.columns = ['dsstox_compound_id']
    df2 = pd.merge(idrow, df2, on='dsstox_compound_id', how='inner')

    # split the tab-separated descriptor string into one column per bit
    df2 = pd.concat(
        [df2, df2['descriptor_string_tsv'].str.split('\t', expand=True)],
        axis=1)
    df2 = df2.drop('descriptor_string_tsv', axis=1)

    # name the columns after the descriptors
    query3 = mysession1.query(Descriptors.descriptors_name).filter(
        Descriptors.fk_descriptor_set_id == dsi)
    descriptornames = list(query3)
    for num, name in enumerate(descriptornames, start=0):
        df2 = df2.rename(columns={num: name[0]})

    # creates the final output table
    mytable = mytable.rename(columns={colname: "dsstox_compound_id"})
    mytable = pd.merge(mytable, df2, on='dsstox_compound_id')

    # drop the trailing empty column produced by the final tab in the string
    outputtable = mytable.drop(mytable.columns[-1], axis=1)
    # check for trailing column created by tab and remove
    # NOTE(review): this branch re-derives the same frame as the drop above;
    # kept to preserve the original behavior exactly
    if outputtable[outputtable.columns[-1]][0] == '':
        mytable = mytable.drop(mytable.columns[-1], axis=1)
        outputtable = mytable

    mysession1.close()
    return outputtable
def cli(tsv_input, o, noerror):
    ### HELP DOCUMENTATION ###
    """
    SIDtoCID takes in a .tsv datatable with a dsstox_substance_id column (must be index or first 2 columns).
    The dsstox_substance_id column is converted to dsstox_compound_id.
    Can use a .tsv file as stdin.
    Default output is stdout as .tsv.
    \n\n
    Warning!: column names are needed in the input .tsv! Otherwise first row will be skipped.

    -- EXAMPLE I/O TABLES --

    INPUT: .tsv file

    |DTXSID COLUMN | ENDPOINT COLUMN |\n
    ----------------------------------\n
    | DTXSID123456 |        0        |\n
    ----------------------------------\n
    | DTXSID234567 |        1        |\n
    ----------------------------------\n
    | DTXSID345678 |        0        |\n
    ----------------------------------\n

    EXPORT: .tsv file

    |DTXCID COLUMN | ENDPOINT COLUMN |\n
    ----------------------------------\n
    | DTXCID891011 |        0        |\n
    ----------------------------------\n
    | DTXCID910111 |        1        |\n
    ----------------------------------\n
    | DTXCID101112 |        0        |\n
    ----------------------------------\n
    """
    # creates table of .tsv file; takes stdin if argument is not given
    if not tsv_input:
        tsv_input = sys.stdin.read()
        mytable = pd.read_csv(StringIO(tsv_input), sep="\t")
    else:
        mytable = pd.read_csv(tsv_input, sep="\t")

    # checks the index, and first two columns for DTXSIDs
    # input table should be in the correct format already
    try:
        if mytable.iloc[0, 0][0:6] == 'DTXSID':
            idrow = mytable.iloc[:, 0]
            colname = mytable.columns.values[0]
    except Exception:
        pass
    try:
        if mytable.iloc[0, 1][0:6] == 'DTXSID':
            idrow = mytable.iloc[:, 1]
            # BUG FIX: previously took column 0's name even though the
            # DTXSIDs live in column 1
            colname = mytable.columns.values[1]
    except Exception:
        pass
    try:
        if mytable.index.values[0][0:6] == 'DTXSID':
            idrow = mytable.index.values
            mytable.index.name = 'DTXSID'
            colname = mytable.index.name
    except Exception:
        pass

    # drop empty columns
    mytable = mytable.dropna(axis='columns', how='all')

    # make an SQL query table for relevant SIDs & CIDs
    mysession = SQLSession(Schemas.dsstox_schema).get_session()
    query = mysession.query(GenericSubstances.dsstox_substance_id,
                            Compounds.dsstox_compound_id) \
        .join(GenericSubstanceCompounds) \
        .join(Compounds)
    df = pd.DataFrame(list(query))

    idrow = pd.DataFrame(idrow)
    idrow.columns = ['dsstox_substance_id']
    df = pd.merge(idrow, df, on='dsstox_substance_id', how='inner')

    # if no DTXCIDs returned
    # NOTE(review): the error fires when `noerror` is truthy, which looks
    # inverted relative to the flag name -- preserved as-is; confirm the
    # click option's default before changing
    if df.empty and noerror:
        click.secho("Error: No valid DTXSIDs or no associated DTXCIDs\n{}".format(list(idrow)), fg='red', bold=True)
        sys.exit(1)
    elif df.empty:
        sys.exit(1)

    # creates new CID table
    mytable = mytable.rename(columns={colname: "dsstox_substance_id"})
    mytable = pd.merge(df, mytable, on='dsstox_substance_id')
    mytable = mytable.drop(columns='dsstox_substance_id')  # modernized from drop(..., 1)
    outputtable = mytable

    # generates a string with tab separation and line breaks for row ends
    columnnames = mytable.columns.values
    output = ''
    for i in columnnames:
        output += i + '\t'
    output += '\n'
    for rowvals in mytable.values.tolist():
        output += '\t'.join(str(x) for x in rowvals) + '\n'

    # output options
    if o == '':
        click.echo(output)
    else:
        outputtable.to_csv(o, sep='\t', index=False)

    # get IDs that were not converted
    noid = list(set(idrow.iloc[:, 0]) - set(df.iloc[:, 0]))

    # ERROR message; not actual STDERR, this is for the user
    if noerror:
        click.secho("Error: Invalid DTXSID or no associated DTXCID\n{}".format(noid), fg='red', bold=True)
def main():
    """CLI entry point: replace the DTXSID column of a .tsv with the
    corresponding DTXCIDs (docopt variant of SIDtoCID)."""
    args = docopt(__doc__)
    print(args)
    if args['--version']:
        print('NCCT CLI: Version 0.0.0')
        sys.exit(0)

    tsv_input = args['<tsv_file>']
    o = args['--output']
    noerror = args['--noerror']

    # creates table of .tsv file; takes stdin if argument is not given
    if not tsv_input:
        tsv_input = sys.stdin.read()
        mytable = pd.read_csv(StringIO(tsv_input), sep="\t")
    else:
        mytable = pd.read_csv(tsv_input, sep="\t")

    # checks the index, and first two columns for DTXSIDs
    # input table should be in the correct format already
    try:
        if mytable.iloc[0, 0][0:6] == 'DTXSID':
            idrow = mytable.iloc[:, 0]
            colname = mytable.columns.values[0]
    except Exception:
        pass
    try:
        if mytable.iloc[0, 1][0:6] == 'DTXSID':
            idrow = mytable.iloc[:, 1]
            # BUG FIX: previously took column 0's name even though the
            # DTXSIDs live in column 1
            colname = mytable.columns.values[1]
    except Exception:
        pass
    try:
        if mytable.index.values[0][0:6] == 'DTXSID':
            idrow = mytable.index.values
            mytable.index.name = 'DTXSID'
            colname = mytable.index.name
    except Exception:
        pass

    # make an SQL query table for relevant SIDs & CIDs
    mysession = SQLSession(Schemas.dsstox_schema).get_session()
    query = mysession.query(GenericSubstances.dsstox_substance_id,
                            Compounds.dsstox_compound_id) \
        .join(GenericSubstanceCompounds) \
        .join(Compounds) \
        .filter(GenericSubstances.dsstox_substance_id.in_(idrow))

    # checks if DTXSID didn't exist or has no associated DTXCID
    df = pd.DataFrame(list(query))

    # if no DTXCIDs returned
    if df.empty and not noerror:
        print(Fore.RED + "Error: No valid DTXSIDs or no associated DTXCIDs\n{}".format(list(idrow)))
        sys.exit(1)
    elif df.empty:
        sys.exit(1)

    # IDs that could not be converted (reported at the end)
    noid = list(set(idrow) - set(list(df.iloc[:, 0])))

    # creates new CID table
    mytable = mytable.rename(columns={colname: "dsstox_substance_id"})
    mytable = pd.merge(df, mytable, on='dsstox_substance_id')
    mytable = mytable.drop(columns='dsstox_substance_id')  # modernized from drop(..., 1)
    outputtable = mytable

    # generates a string with tab separation and line breaks for row ends
    columnnames = mytable.columns.values
    output = ''
    for i in columnnames:
        output += i + '\t'
    output += '\n'
    for rowvals in mytable.values.tolist():
        output += '\t'.join(str(x) for x in rowvals) + '\n'

    # output options
    if not o:
        print(output)
    else:
        outputtable.to_csv(o, sep='\t', index=False)

    # ERROR message; not actual STDERR, this is for the user
    if not noerror:
        print(Fore.RED + "Error: Invalid DTXSID or no associated DTXCID\n{}".format(noid))
def cli(cidtable, o, dsi, t):
    ### HELP DOCUMENTATION ###
    """
    Fills rows in a .tsv containing DTXCIDs with their toxprints. DTXCIDs must be in index, or first two columns

    use -o ~/mypath/myfilename.tsv to export a toxprint .tsv file

    for -dsi: toxprints=1445, pubchem=1447, MACCs=1446  a newer descriptor_set_id may be added or current dsis changed
    """
    # takes stdin if argument is not directly given
    if not cidtable:
        cidtable = sys.stdin.read()
        mytable = pd.read_csv(StringIO(cidtable), sep="\t")
    else:
        mytable = pd.read_csv(cidtable, sep="\t")

    # checks the index, and first two columns for DTXCIDs
    # input table should be in the correct format already
    try:
        if mytable.iloc[0, 0][0:6] == 'DTXCID':
            idrow = mytable.iloc[:, 0]
            colname = mytable.columns.values[0]
    except Exception:
        pass
    try:
        if mytable.iloc[0, 1][0:6] == 'DTXCID':
            idrow = mytable.iloc[:, 1]
            # BUG FIX: previously took column 0's name even though the
            # DTXCIDs live in column 1
            colname = mytable.columns.values[1]
    except Exception:
        pass
    try:
        if mytable.index.values[0][0:6] == 'DTXCID':
            idrow = mytable.index.values
            mytable.index.name = 'DTXCID'
            colname = mytable.index.name
    except Exception:
        pass

    ####################################################################################################################
    # DTXCID -> descriptor string for the requested descriptor set
    mysession = SQLSession(Schemas.qsar_schema).get_session()
    query2 = mysession.query(Compounds.dsstox_compound_id,
                             CompoundDescriptorSets.descriptor_string_tsv) \
        .join(CompoundDescriptorSets,
              Compounds.id == CompoundDescriptorSets.efk_dsstox_compound_id) \
        .filter(CompoundDescriptorSets.fk_descriptor_set_id == dsi)
    df2 = pd.DataFrame(list(query2))

    idrow = pd.DataFrame(idrow)
    idrow.columns = ['dsstox_compound_id']
    df2 = pd.merge(idrow, df2, on='dsstox_compound_id', how='inner')

    # something to separate and name fingerprint columns
    df2 = pd.concat(
        [df2, df2['descriptor_string_tsv'].str.split('\t', expand=True)],
        axis=1)
    df2 = df2.drop('descriptor_string_tsv', axis=1)

    # name the columns correctly (t selects descriptor names vs labels)
    if t == 0:
        query3 = mysession.query(Descriptors.descriptors_name).filter(
            Descriptors.fk_descriptor_set_id == dsi)
    elif t == 1:
        query3 = mysession.query(Descriptors.label).filter(
            Descriptors.fk_descriptor_set_id == dsi)
    descriptornames = list(query3)
    for num, name in enumerate(descriptornames, start=0):
        df2 = df2.rename(columns={num: name[0]})

    # creates the final output table
    mytable = mytable.rename(columns={colname: "dsstox_compound_id"})
    mytable = pd.merge(mytable, df2, on='dsstox_compound_id')

    # check for trailing column created by tab and remove
    # (membership test replaces the `== None` comparison)
    if mytable[mytable.columns[-1]][0] in ('', None):
        mytable = mytable.drop(mytable.columns[-1], axis=1)
    outputtable = mytable

    # generates a string with tab separation and line breaks for row ends
    columnnames = mytable.columns.values
    output = ''
    for i in columnnames:
        output += str(i) + '\t'
    output += '\n'
    for rowvals in mytable.values.tolist():
        output += '\t'.join(str(x) for x in rowvals) + '\n'

    # output options
    if o == '':
        click.echo(output)
    else:
        outputtable.to_csv(o, sep='\t', index=False)
    sys.exit(0)
# which fingerprint set do we want? (dsi is the descriptor set index)
# 1445 = toxprints, 1446 = MACCS, 1447 = pubchem
dsi = 1445

### I WANT TO SEE THE TOTAL PREVELANCE FOR CHEMOTYPES ACROSS THE ENTIRE ASSAY SPACE ###
## Get matrix containing DTXCIDS, fingerprints
mysession1 = SQLSession(Schemas.dsstox_schema).get_session()
query2 = mysession1.query(Compounds.dsstox_compound_id,
                          CompoundDescriptorSets.descriptor_string_tsv) \
    .join(CompoundDescriptorSets,
          Compounds.id == CompoundDescriptorSets.efk_dsstox_compound_id) \
    .filter(CompoundDescriptorSets.fk_descriptor_set_id == dsi)
df2 = pd.DataFrame(list(query2))

# split the descriptor string into one column per fingerprint bit
df2 = pd.concat(
    [df2, df2['descriptor_string_tsv'].str.split('\t', expand=True)],
    axis=1)
df2 = df2.drop('descriptor_string_tsv', axis=1)

# name the columns correctly
query3 = mysession1.query(Descriptors.descriptors_name).filter(
    Descriptors.fk_descriptor_set_id == dsi)
descriptornames = list(query3)
def cli(i, datasetname, username):
    ### HELP DOCUMENTATION ###
    """ creates a dataset in the mysql database (qsar.datapoints, qsar.datasets, and qsar.datasetdatapoints) from a .tsv file

    input file can be .tsv, .xls or .xlsx

    if input is STDIN must be .tsv

    format:

    +-----------------+---------------+-----------\n
    | DTXCID1234      | 0             | ...\n
    +-----------------+---------------+-----------\n
    | DTXCID56789     | 1             | ...\n
    +-----------------+---------------+-----------\n

    to remove table header use tail -n +2 myfile.tsv | datatable2mysql
    """
    ####################################################################################################################
    if not i:
        # No input file given: read a tab-separated table from STDIN.
        tsv_input = sys.stdin.read()
        try:
            myinputtable = pd.read_csv(StringIO(tsv_input), sep="\t", header=None)
        # was a bare `except:` — that also swallowed SystemExit/KeyboardInterrupt
        except Exception:
            click.secho('Error: Empty Datatable', fg='red', bold=True)
            sys.exit(1)
    else:
        try:
            click.secho('-- Importing {} --'.format(i), bold=True)
            # `filename` part of the split is unused; only the extension matters
            _, file_extension = os.path.splitext(i)
            if file_extension == '.tsv':
                myinputtable = pd.read_csv(i, sep='\t', header=None)
            elif file_extension in ('.xlsx', '.xls'):
                myinputtable = pd.read_excel(i, header=None)
            else:
                # previously fell through silently and crashed later with a
                # NameError on `myinputtable`; fail fast with a clear message
                click.secho('Error: File Import Failed', fg='red', bold=True)
                sys.exit(1)
        except Exception:
            click.secho('Error: File Import Failed', fg='red', bold=True)
            sys.exit(1)

    # clean input table: keep only (dsstox_compound_id, hit call) columns
    myinputtable = myinputtable.iloc[:, 0:2]
    myinputtable.columns = ['dsstox_compound_id', 'hitc']

    # QUERY database for proper ids (map DTXCIDs -> internal compound ids)
    mysession = SQLSession(Schemas.information_schema).get_session()
    query0 = mysession.query(Compounds.dsstox_compound_id, Compounds.id) \
        .filter(Compounds.dsstox_compound_id.in_(myinputtable.iloc[:, 0]))
    mytable = pd.DataFrame(list(query0))

    # join hitcalls: inner merge keeps only compounds found in the database
    mytable = pd.merge(myinputtable,
                       mytable,
                       how='inner',
                       on=[myinputtable.columns[0], mytable.columns[0]])

    # make sure datatable doesn't already exist?

    def filldatasets(invitrodbdf, fd_aeid, username):
        """Insert the merged table as a new dataset plus its datapoints.

        Uses the enclosing scope's `mysession`; commits once at the end so the
        whole import is a single transaction.
        """
        # create a new datasets name entry, stamped with today's date
        datasets_name = str('Imported_DataTable:{}_{}'.format(
            fd_aeid, datetime.datetime.today().strftime("%Y%m%d")))
        description = "Imported DataTable: {} taken on the date:{}" \
            .format(fd_aeid, datetime.datetime.today().strftime("%Y%m%d"))
        datasets = Datasets(name=datasets_name,
                            label=datasets_name,
                            updated_by=username,
                            created_by=username,
                            long_description=description,
                            short_description=description)
        mysession.add(datasets)
        # flush (not commit) so the autogenerated dataset id becomes available
        mysession.flush()
        fk_dataset_id = int(datasets.id)

        # add datatable to the mysql database, one datapoint per row
        for index, row in invitrodbdf.iterrows():
            efk_dsstox_compound_id = row.loc['id']
            efk_chemprop_measured_property_id = None  # leave null -CG #not nullable
            measured_value_dn = row.loc['hitc']
            created_by = username
            updated_by = username
            datapoints = Datapoints(
                efk_dsstox_compound_id=efk_dsstox_compound_id,
                efk_chemprop_measured_property_id=efk_chemprop_measured_property_id,
                measured_value_dn=measured_value_dn,
                created_by=created_by,
                updated_by=updated_by)
            mysession.add(datapoints)
            mysession.flush()
            fk_datapoint_id = int(datapoints.id)
            # link the datapoint to the new dataset
            dataset_datapoints = DatasetDatapoints(fk_dataset_id=fk_dataset_id,
                                                   fk_datapoint_id=fk_datapoint_id,
                                                   updated_by=username,
                                                   created_by=username)
            mysession.add(dataset_datapoints)
        # single commit after the loop — NOTE(review): source formatting was
        # ambiguous about commit placement; committing once preserves the
        # end state while avoiding a per-row transaction.
        mysession.commit()

    filldatasets(mytable, datasetname, username)
created_by=username) mysession.add(dataset_datapoints) mysession.commit() #################################################################################################################### if __name__ == "__main__": #QUERY MC5 data for hitcalls, chemical IDs, assay subcategories mysession = SQLSession(Schemas.information_schema).get_session() query0 = mysession.query(Compounds.id, Compounds.dsstox_compound_id, Mc5.hitc, Mc5.aeid, AssayComponentEndpoint.assay_component_endpoint_name, AssayComponent.assay_component_desc, AssayComponent.assay_component_target_desc, AssayComponentEndpoint.assay_component_endpoint_desc, AssayComponentEndpoint.assay_function_type, AssayComponentEndpoint.normalized_data_type, AssayComponentEndpoint.analysis_direction, AssayComponentEndpoint.burst_assay, AssayComponentEndpoint.key_positive_control, AssayComponentEndpoint.signal_direction, AssayComponentEndpoint.intended_target_type, AssayComponentEndpoint.intended_target_type_sub, AssayComponentEndpoint.intended_target_family, AssayComponentEndpoint.intended_target_family_sub, AssayComponent.assay_design_type, AssayComponent.assay_design_type_sub, AssayComponent.biological_process_target, AssayComponent.detection_technology_type, AssayComponent.detection_technology_type_sub, AssayComponent.detection_technology, AssayComponent.signal_direction_type, AssayComponent.key_assay_reagent, AssayComponent.key_assay_reagent_type, AssayComponent.technological_target_type, AssayComponent.technological_target_type_sub) \ .join(GenericSubstanceCompounds, Compounds.id == GenericSubstanceCompounds.fk_compound_id) \ .join(GenericSubstances, GenericSubstances.id == GenericSubstanceCompounds.fk_generic_substance_id) \ .join(Sample, Sample.chid == GenericSubstances.id) \ .join(Mc4, Mc4.spid == Sample.spid) \ .join(Mc5, Mc5.m4id == Mc4.m4id) \ .join(AssayComponentEndpoint, AssayComponentEndpoint.aeid == Mc5.aeid) \ .join(AssayComponent, AssayComponent.acid == AssayComponentEndpoint.acid) mytable = 
pd.DataFrame(list(query0)) #################################################################################################################### ### MAKE TABLES FOR SUBCATEGORIES ### #loop through characteristics for x in mytable.columns[9:]: for y in mytable[x].unique(): if pd.isnull(y) == True: #print('\nnull/nan value found!\n')
def getenrichfp(DataSetName, sigtxp, mypath, myfilename, dsi=1445):
    """Get Enrichment data for a combined set of chemotypes.

    Builds a compound x significant-chemotype matrix for the named dataset,
    then computes a single "full model" confusion matrix / Fisher's exact
    statistics row in which a compound counts as predicted-positive when it
    hits ANY significant chemotype.

    :param DataSetName: substring matched (LIKE '%...%') against datasets.name
    :param sigtxp: iterable of significant chemotype descriptor names
    :param mypath: output directory (only used by the commented-out to_csv)
    :param myfilename: label written into the 'Data Table' column
    :param dsi: descriptor set id (default 1445 = toxprints)
    :return: one-row pandas DataFrame of model statistics, or None if sigtxp is empty
    """
    # aborts if no significant chemotypes
    if len(sigtxp) == 0:
        return None

    mysession = SQLSession(Schemas.qsar_schema).get_session()
    # NOTE(review): raw SQL assembled with str.format — fine for trusted
    # internal callers, but vulnerable to SQL injection if DataSetName/dsi
    # ever come from untrusted input; consider bound parameters.
    MyDataSet = mysession.execute(
        'SELECT dsstox_compound_id, measured_value_dn, descriptor_string_tsv FROM sbox_rlougee_qsar.datasets'
        ' JOIN sbox_rlougee_qsar.dataset_datapoints ON sbox_rlougee_qsar.dataset_datapoints.fk_dataset_id = sbox_rlougee_qsar.datasets.id'
        ' JOIN sbox_rlougee_qsar.datapoints ON sbox_rlougee_qsar.datapoints.id = sbox_rlougee_qsar.dataset_datapoints.fk_datapoint_id'
        ' JOIN ro_stg_dsstox.compounds ON sbox_rlougee_qsar.datapoints.efk_dsstox_compound_id = ro_stg_dsstox.compounds.id'
        ' JOIN sbox_rlougee_qsar.compound_descriptor_sets ON ro_stg_dsstox.compounds.id = sbox_rlougee_qsar.compound_descriptor_sets.efk_dsstox_compound_id'
        ' WHERE sbox_rlougee_qsar.datasets.name LIKE \'%{}%\' AND sbox_rlougee_qsar.compound_descriptor_sets.fk_descriptor_set_id = {}'.format(DataSetName, dsi))
    MyDataSet = pd.DataFrame(list(MyDataSet))
    MyDataSet.columns = ['Dsstox_Compound_ID', 'Hit_Call', 'Toxprint']

    # something to separate and name fingerprint columns:
    # split the tab-separated fingerprint string into one column per bit
    MyDataSet = pd.concat([MyDataSet, MyDataSet['Toxprint'].str[:].str.split('\t', expand=True)], axis=1)
    MyDataSet = MyDataSet.drop('Toxprint', axis=1)

    # name the columns correctly: rename the integer split-columns to the
    # descriptor labels (positional order assumed to match — TODO confirm)
    query3 = mysession.query(Descriptors.descriptors_name, Descriptors.label).filter(Descriptors.fk_descriptor_set_id == dsi)
    descriptornames = pd.DataFrame(list(query3))
    for num, name in enumerate(descriptornames['label'], start=0):
        MyDataSet = MyDataSet.rename(columns={num: name})

    # drop columns that are not significant: map significant descriptor names
    # to labels, then keep only those label columns
    sigtxp = pd.DataFrame(sigtxp)
    sigtxp.columns = ['descriptors_name']
    siglabel = pd.merge(sigtxp, descriptornames, on='descriptors_name', how='inner')
    siglabel = list(siglabel['label'])
    for i in MyDataSet.columns[2:]:
        if i in siglabel:
            pass
        else:
            MyDataSet = MyDataSet.drop(i, axis=1)

    # MyDataSet.to_csv('{}{}.tsv'.format(mypath, myfilename), sep='\t', index=False)

    # return overall balanced accuracy calculations
    # can just make a unique confusion matrix for significant toxprints and add to CT-Enriched Stats file
    # print(MyDataSet.head())
    model_row = pd.DataFrame([['Chemotype Full Model Coverage', myfilename, " ".join(sigtxp['descriptors_name']), 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]],
                             columns=['Chemotype ID', 'Data Table', 'Chemotype Label', 'Total Chemotypes', 'True Positives', 'False Positives', 'False Negatives', 'True Negatives', 'Balanced Accuracy', 'Odds Ratio', 'P-Value', 'Inverse Odds Ratio', 'Inverse P-Value'])

    # fill model_row confusion matrix: a compound is "predicted positive"
    # when it hits at least one of the significant chemotype columns
    for index, row in MyDataSet.iterrows():
        rowsum = sum([int(x) for x in row.iloc[2:]])
        if row['Hit_Call'] == 1 and rowsum > 0:
            model_row['True Positives'] += 1
        elif row['Hit_Call'] == 1 and rowsum == 0:
            model_row['False Negatives'] += 1
        elif row['Hit_Call'] == 0 and rowsum > 0:
            model_row['False Positives'] += 1
        elif row['Hit_Call'] == 0 and rowsum == 0:
            model_row['True Negatives'] += 1

    # fill model_row statistics: one-sided Fisher's exact test for enrichment
    oddsratio, pvalue = stats.fisher_exact([
        [int(model_row['True Positives']), int(model_row['False Positives'])],
        [int(model_row['False Negatives']), int(model_row['True Negatives'])]], alternative='greater')
    model_row['P-Value'] = pvalue
    model_row['Odds Ratio'] = oddsratio
    model_row['Total Chemotypes'] = (model_row['True Positives'] + model_row['False Positives'])
    # NOTE(review): divides by (TP+FN) and (TN+FP) — raises/produces inf if a
    # class is empty; upstream data presumably always has both classes.
    BA = (((model_row['True Positives'] / (model_row['True Positives'] + model_row['False Negatives'])) + (model_row['True Negatives'] / (model_row['True Negatives'] + model_row['False Positives']))) / 2)
    model_row['Balanced Accuracy'] = float(BA)
    # inverse test: enrichment of the complement (chemotype-negative) set
    inv_oddsratio, inv_pvalue = stats.fisher_exact([
        [int(model_row['False Positives']), int(model_row['True Positives'])],
        [int(model_row['True Negatives']), int(model_row['False Negatives'])]], alternative='greater')
    model_row['Inverse P-Value'] = inv_pvalue
    model_row['Inverse Odds Ratio'] = inv_oddsratio
    # print(model_row)
    return model_row
from io import StringIO
import re

# ---------------------------------------------------------------------------
# Script chunk: export the assay-category metadata table.
# FIRST NEED TO GET A TABLE index = AEID and columns = Assay Cats, Toxprints Enrichments (discrete hitcalls for each assay)
# going to do this in two parts
# part 1 get assay cats X aeids
mysession = SQLSession(Schemas.information_schema).get_session()
# One row per assay endpoint: aeid plus every descriptive/categorical field
# from assay_component_endpoint and its parent assay_component.
query0 = mysession.query(AssayComponentEndpoint.aeid,
                         AssayComponentEndpoint.assay_component_endpoint_name,
                         AssayComponent.assay_component_desc,
                         AssayComponent.assay_component_target_desc,
                         AssayComponentEndpoint.assay_component_endpoint_desc,
                         AssayComponentEndpoint.assay_function_type,
                         AssayComponentEndpoint.normalized_data_type,
                         AssayComponentEndpoint.analysis_direction,
                         AssayComponentEndpoint.burst_assay,
                         AssayComponentEndpoint.key_positive_control,
                         AssayComponentEndpoint.signal_direction,
                         AssayComponentEndpoint.intended_target_type,
                         AssayComponentEndpoint.intended_target_type_sub,
                         AssayComponentEndpoint.intended_target_family,
                         AssayComponentEndpoint.intended_target_family_sub,
                         AssayComponent.assay_design_type,
                         AssayComponent.assay_design_type_sub,
                         AssayComponent.biological_process_target,
                         AssayComponent.detection_technology_type,
                         AssayComponent.detection_technology_type_sub,
                         AssayComponent.detection_technology,
                         AssayComponent.signal_direction_type,
                         AssayComponent.key_assay_reagent,
                         AssayComponent.key_assay_reagent_type,
                         AssayComponent.technological_target_type,
                         AssayComponent.technological_target_type_sub) \
    .join(AssayComponent, AssayComponent.acid == AssayComponentEndpoint.acid)
mytable = pd.DataFrame(list(query0))
# dump to a TSV on the desktop and stop — part 2 is handled elsewhere
mytable.to_csv("~/Desktop/Assay_Categories/assay_cats_x_aeids_v2.tsv", sep='\t')
sys.exit(0)
def mktoxprints(dsstox_id_list):
    """Generate Toxprint fingerprints for a list of DTXCIDs via CORINA Symphony.

    Looks up SMILES for the requested compounds, writes them to temporary
    .smi batch files (50,000 per file), runs the CORINA Symphony 'moses'
    batch tool over them, then imports the resulting fingerprint table,
    dropping rows that failed or have a malformed DTXCID index.

    :param dsstox_id_list: iterable of DTXCID strings
    :return: pandas DataFrame of fingerprints indexed by DTXCID
    """
    mysession = SQLSession(Schemas.dsstox_schema).get_session()
    mydata = mysession.query(Compounds.dsstox_compound_id, Compounds.smiles)  # .filter(Compounds.dsstox_compound_id.in_(dsstox_id_list))
    df = pd.DataFrame(list(mydata))

    # filter is too big need to merge instead
    idframe = pd.DataFrame(list(set(dsstox_id_list)), columns=['dsstox_compound_id'])
    df = pd.merge(idframe, df, on='dsstox_compound_id', how='left')
    print(len(dsstox_id_list), len(list(df['dsstox_compound_id'])))

    number_per_file = 50000
    files_so_far = 0
    ix = 0
    file_path = "/home/rlougee/Desktop/tmptmp{0}.smi"
    out_file = open(file_path.format(files_so_far), 'w')
    file_list = [file_path.format(files_so_far)]
    for index, row in df.iterrows():
        # handle null values — BUGFIX: the left merge leaves NaN (not None)
        # for ids with no SMILES, and the DB may also hold NULLs; the old
        # `== None` check missed NaN and crashed on string concatenation.
        # pd.isnull covers both None and NaN.
        if pd.isnull(row['smiles']):
            row['smiles'] = ''
        # roll over to a new batch file every `number_per_file` rows
        if ix % number_per_file == 0 and ix > 0:
            out_file.close()
            files_so_far += 1
            out_file = open(file_path.format(files_so_far), 'w')
            file_list.append(file_path.format(files_so_far))
        smile_file = row['smiles'] + "\t" + row['dsstox_compound_id'] + "\n"
        ix += 1
        out_file.write(smile_file)
    out_file.close()

    ## generate fingerprints
    ### FOR DEBUGGING: AN ACTUAL SMILEY FACE ☻ IS USED IN THIS FILE
    ### this is only used in BASH commands and IS necessary
    ### -L is a flag that identifies a file separator and most characters break the command.
    ###     Also, multiple characters in a row don't work
    bashstring = ''
    for file in file_list:
        bashstring += "{0}{1}".format(str(file), str('☻'))
    command = '/opt/CORINA_Symphony/CORINA_Symphony_14698/bin/moses -N -L ☻ symphony batch -i {0} -o /share/home/rlougee/Desktop/results.tsv descriptors -f /opt/CORINA_Symphony/CORINA_Symphony_14698/content/symphony/toxprint_V2.0.xml'.format(bashstring)
    a = subp.Popen(command, shell=True)
    a.communicate()

    # import the toxprint file
    # BUGFIX: pd.DataFrame.from_csv was deprecated and removed in pandas 1.0;
    # read_csv with index_col=0 is the documented replacement (first column,
    # the DTXCID, becomes the index as before).
    toxprintdf = pd.read_csv('/share/home/rlougee/Desktop/results.tsv', sep='\t', index_col=0)

    # handle bad smiles
    # smiles renamed: drop rows with CORINA errors or a malformed DTXCID index
    drop_list = []
    for index, row in toxprintdf.iterrows():
        if row['M_CORINA_SYMPHONY_ERRORS_[STRING]'] != 'No errors':
            drop_list.append(index)
        elif len(index) <= 6:
            drop_list.append(index)
        elif index[0:6] != 'DTXCID':
            drop_list.append(index)
    # drop bad rows
    toxprintdf = toxprintdf.drop(drop_list)

    # remove extra columns
    toxprintdf.drop('M_COMPOUND_HISTORY_[STRING]', axis=1, inplace=True)
    toxprintdf.drop('M_CORINA_SYMPHONY_ERRORS_[STRING]', axis=1, inplace=True)

    # remove temporary files
    b = subp.Popen('rm /share/home/rlougee/Desktop/tmptmp*.smi', shell=True)
    b.communicate()
    c = subp.Popen('rm /share/home/rlougee/Desktop/results.tsv', shell=True)
    c.communicate()

    return toxprintdf