def mk_toxcastfp(idrow):
    """Build a ToxCast fingerprint table (compound id x aeid hit-calls).

    :param idrow: pandas DataFrame whose column 0 holds the dsstox compound
                  ids to keep (inner-merged against the invitrodb query).
    :return: DataFrame indexed by compound id with one column per aeid;
             cells are 1/0 hit-calls, or '' where the compound was untested.
    """
    # descriptor-set id of the ToxcastFP rows in qsar.compound_descriptor_sets
    # (kept for reference; the queries below are hard-coded to invitrodb)
    dsi = 1449
    mysession = SQLSession(Schemas.qsar_schema).get_session()

    # pull compound ids in invitrodb
    CID1 = mysession.execute(
        'SELECT compounds.id, mc5.aeid, compounds.dsstox_compound_id, mc5.hitc FROM invitrodb.mc5'
        ' JOIN invitrodb.mc4 ON mc4.m4id = mc5.m4id'
        ' JOIN invitrodb.sample ON sample.spid = mc4.spid'
        ' JOIN ro_stg_dsstox.generic_substances ON generic_substances.id = sample.chid'
        ' JOIN ro_stg_dsstox.generic_substance_compounds ON generic_substance_compounds.fk_generic_substance_id = generic_substances.id'
        ' JOIN ro_stg_dsstox.compounds ON compounds.id = generic_substance_compounds.fk_compound_id'
    )
    CID1 = pd.DataFrame(list(CID1))

    # filter out rows without a given id
    output_df = pd.merge(idrow, CID1, on=0, how='inner')

    # sort by compound id then aeid
    output_df = output_df.sort_values(
        [output_df.columns.values[0], output_df.columns.values[1]])
    output_df = output_df.reset_index(drop=True)

    # remove -1 values replace with 0 !!!
    output_df = output_df.replace(-1, 0)

    # pivot tables per dtxcid: rows = compound id, cols = aeid, values = hitc
    output_df = pd.pivot_table(output_df, index=0, columns=1, values=3,
                               fill_value='', aggfunc=np.sum)

    # sets correct hitcall
    # if any duplicate is a hit, fingerprint is considered a hit !!!
    # (summed duplicates > 1 are clamped back down to 1)
    output_df = output_df.apply(
        lambda x: [y if y == '' else (int(y) if y <= 1 else 1) for y in x])

    # not all columns are present depending on query
    # pull remaining aeids and add empty columns to output_df dataframe
    full_columns = [
        x[0] for x in mysession.execute(
            'SELECT assay_component_endpoint.aeid FROM invitrodb.assay_component_endpoint'
        )
    ]
    for x in full_columns:
        if x not in output_df.columns.values:
            output_df[x] = ''

    # BUG FIX: DataFrame.reindex_axis was deprecated in pandas 0.21 and
    # removed in pandas 1.0 -- reindex(..., axis=1) is the replacement.
    output_df = output_df.reindex(sorted(output_df.columns), axis=1)
    return output_df
def check(f):
    """Report compounds that still need toxprints (descriptor set 1445).

    :param f: boolean "quiet" flag -- when nothing is missing, False means
              print a message and exit, True means return an empty list.
    :return: (list of dsstox_compound_ids missing toxprints,
              list of all non-blacklisted dsstox_compound_ids).
    """
    mysession = SQLSession(Schemas.qsar_schema).get_session()
    # get all dtxcids
    # remove blacklist here with NOT LIKE '%|%'
    df1 = pd.DataFrame(
        list(
            mysession.execute(
                'SELECT compounds.id, compounds.dsstox_compound_id FROM ro_stg_dsstox.compounds WHERE compounds.smiles NOT LIKE "%|%"'
            )))
    l0 = df1[1].tolist()  # dsstox_compound_id strings
    l1 = df1[0].tolist()  # numeric compound ids

    # get all compound ids with toxprints
    l2 = [
        i[0] for i in list(
            mysession.execute(
                'SELECT efk_dsstox_compound_id FROM sbox_rlougee_qsar.compound_descriptor_sets WHERE fk_descriptor_set_id = 1445'
            ))
    ]

    # find difference between ids (compounds with no toxprints yet)
    l3 = list(set(l1) - set(l2))

    # merge df1 l3 to map missing numeric ids back to dsstox_compound_ids
    df2 = pd.merge(pd.DataFrame(l3), df1, on=[0], how='left')
    l4 = list(df2[1])

    ### IF QSAR.COMPOUND_DESCRIPTOR_SETS IS MISSING EITHER THEN GENERATE TOXPRINTS FOR THIS COMPOUND ###
    # BUG FIX: was `f == False` / `f == True` (PEP8 E712); use identity
    # checks like the sibling check() for descriptor set 1449.
    if not l4 and f is False:
        print('SQL query is empty: All toxprints available or no DTXCIDs')
        sys.exit(0)
    elif not l4 and f is True:
        return [], l0
    else:
        return l4, l0
def check(f):
    """Return (missing compound ids, all invitrodb compound ids).

    Compares the compounds present in invitrodb against the ToxcastFP
    rows in qsar.compound_descriptor_sets (descriptor set 1449).  When
    nothing is missing: exits if f is False, returns an empty frame if
    f is True.  Both return values are pandas DataFrames.
    """
    session = SQLSession(Schemas.qsar_schema).get_session()

    # every compound id reachable from invitrodb mc5 results
    invitro_rows = session.execute(
        'SELECT compounds.id FROM invitrodb.mc5'
        ' JOIN invitrodb.mc4 ON mc4.m4id = mc5.m4id'
        ' JOIN invitrodb.sample ON sample.spid = mc4.spid'
        ' JOIN ro_stg_dsstox.generic_substances ON generic_substances.id = sample.chid'
        ' JOIN ro_stg_dsstox.generic_substance_compounds ON generic_substance_compounds.fk_generic_substance_id = generic_substances.id'
        ' JOIN ro_stg_dsstox.compounds ON compounds.id = generic_substance_compounds.fk_compound_id'
    )
    invitro_ids = {row[0] for row in invitro_rows}

    # compound ids that already have a ToxcastFP (descriptor set 1449)
    fp_rows = session.query(CompoundDescriptorSets.efk_dsstox_compound_id) \
        .filter(CompoundDescriptorSets.fk_descriptor_set_id == '1449')
    fp_ids = {row[0] for row in fp_rows}

    # invitrodb compounds that still need a ToxcastFP generated
    missing = list(invitro_ids - fp_ids)

    if not missing:
        if f is False:
            print(Fore.RED + 'SQL query is empty: All ToxcastFP available or no DTXCIDs')
            sys.exit(0)
        if f is True:
            return pd.DataFrame([]), pd.DataFrame(invitro_ids)
    return pd.DataFrame(missing), pd.DataFrame(invitro_ids)
def main():
    """CLI entry point: print sandbox dataset names matching a substring."""
    args = docopt(__doc__)
    # print(args)
    if args['--version']:
        print('NCCT CLI: Version 0.0.0')
        sys.exit(0)

    # set input arguments and options to variables; fall back to stdin
    InputString = args['<InputString>']
    if not InputString:
        InputString = sys.stdin.read()

    mysession = SQLSession(Schemas.dsstox_schema).get_session()
    # NOTE(review): InputString is interpolated straight into the LIKE
    # pattern -- acceptable for a trusted internal CLI, but parameterize
    # this query if it ever takes untrusted input.
    query = mysession.execute(
        'SELECT datasets.name FROM sbox_rlougee_qsar.datasets WHERE datasets.name LIKE "%{}%"'
        .format(InputString))

    rows = list(query)
    # BUG FIX: an empty result used to raise KeyError: 0 when indexing
    # column 0 of an empty DataFrame; print nothing instead.
    if not rows:
        return
    for i in pd.DataFrame(rows)[0]:
        print(i)
def main():
    """Queue enrichment calculations for every dataset that has none yet."""
    args = docopt(__doc__)
    # print(args)
    if args['--version']:
        print('NCCT CLI: Version 0.0.0')
        sys.exit(0)

    ### QUERY THE MYSQL DB 4 A COMPLETE LIST OF AEIDS, ENDPOINTS & DTXCIDS ###
    session = SQLSession(Schemas.qsar_schema).get_session()

    # datasets with no rows yet in univariate_calculations
    result = session.execute(
        'SELECT DISTINCT(name) FROM sbox_rlougee_qsar.datasets WHERE id NOT IN (SELECT fk_dataset_id FROM sbox_rlougee_qsar.univariate_calculations)'
    )
    pending = [row[0] for row in result]

    # nothing left to compute -> bail out
    if not pending:
        print(Fore.RED + 'Enrichments are up to date')
        sys.exit(0)

    # fan the work out across 20 worker processes
    workers = Pool(20)
    workers.map(fillnewenrich, pending)
# One-off analysis chunk: pull Toxprint enrichment statistics for the
# aeid 9 invitrodbv2 dataset in two layouts (wide and long).
from PIL import Image
from AssaySpaceIdentifier import assayspaceidentifier
from ImageMaker import imagemaker
from fpenrich import getenrichfp
from docopt import docopt
from database.database_schemas import Schemas
from database.session import SQLSession
import sys
import os
import pandas as pd
from colorama import init, Fore

mysession = SQLSession(Schemas.qsar_schema).get_session()

# Wide layout: one row per (dataset, descriptor); GROUP_CONCAT pivots each
# statistic abbreviation (CT-Total, TP, FP, FN, TN, BA, OR, P-Val, ...)
# into its own output column.  fk_descriptor_set_id = 1445 selects the
# Toxprint descriptor set.
enrichment_data_1 = mysession.execute(
    'SELECT datasets.name, descriptors.descriptors_name, descriptors.label, GROUP_CONCAT(CASE WHEN abbreviation = \'CT-Total\' THEN value ELSE NULL END) as \'CT-Tot\',GROUP_CONCAT(CASE WHEN abbreviation = \'TP\' THEN value ELSE NULL END) as \'TP\',GROUP_CONCAT(CASE WHEN abbreviation = \'FP\' THEN value ELSE NULL END) as \'FP\', GROUP_CONCAT(CASE WHEN abbreviation = \'FN\' THEN value ELSE NULL END) as \'FN\', GROUP_CONCAT(CASE WHEN abbreviation = \'TN\' THEN value ELSE NULL END) as \'TN\', GROUP_CONCAT(CASE WHEN abbreviation = \'BA\' THEN value ELSE NULL END) as \'BA\', GROUP_CONCAT(CASE WHEN abbreviation = \'OR\' THEN value ELSE NULL END) as \'OR\', GROUP_CONCAT(CASE WHEN abbreviation = \'P-Val\' THEN value ELSE NULL END) as \'P-Val\', GROUP_CONCAT(CASE WHEN abbreviation = \'Inv OR\' THEN value ELSE NULL END) as \'Inv OR\',GROUP_CONCAT(CASE WHEN abbreviation = \'Inv P-Val\' THEN value ELSE NULL END) as \'Inv P-Val\' FROM sbox_rlougee_qsar.datasets JOIN sbox_rlougee_qsar.univariate_calculations ON sbox_rlougee_qsar.univariate_calculations.fk_dataset_id = sbox_rlougee_qsar.datasets.id JOIN sbox_rlougee_qsar.uc_statistics ON sbox_rlougee_qsar.uc_statistics.fk_univ_calc_id = sbox_rlougee_qsar.univariate_calculations.id JOIN sbox_rlougee_qsar.statistics ON sbox_rlougee_qsar.statistics.id = sbox_rlougee_qsar.uc_statistics.fk_statistic_id JOIN sbox_rlougee_qsar.descriptors ON sbox_rlougee_qsar.descriptors.id = sbox_rlougee_qsar.univariate_calculations.fk_descriptor_id WHERE datasets.name LIKE \'%aeid\_9\_invitrodbv2%\' AND fk_descriptor_set_id = 1445 GROUP BY datasets.name, descriptors.descriptors_name, descriptors.label'
)
ed1 = pd.DataFrame(list(enrichment_data_1))
# column order must match the SELECT list above
ed1.columns = [
    'Data Table', 'Chemotype ID', 'Chemotype Label', 'Total Chemotypes',
    'True Positives', 'False Positives', 'False Negatives', 'True Negatives',
    'Balanced Accuracy', 'Odds Ratio', 'P-Value', 'Inverse Odds Ratio',
    'Inverse P-Value'
]
# print(pd.DataFrame(list(enrichment_data_1)).head())

# Long layout of the same data: one row per
# (dataset, descriptor, statistic abbreviation, value).
enrichment_data_2 = mysession.execute(
    'SELECT datasets.name, descriptors.descriptors_name, descriptors.label, statistics.abbreviation, uc_statistics.value FROM sbox_rlougee_qsar.datasets JOIN sbox_rlougee_qsar.univariate_calculations ON sbox_rlougee_qsar.univariate_calculations.fk_dataset_id = sbox_rlougee_qsar.datasets.id JOIN sbox_rlougee_qsar.uc_statistics ON sbox_rlougee_qsar.uc_statistics.fk_univ_calc_id = sbox_rlougee_qsar.univariate_calculations.id JOIN sbox_rlougee_qsar.statistics ON sbox_rlougee_qsar.statistics.id = sbox_rlougee_qsar.uc_statistics.fk_statistic_id JOIN sbox_rlougee_qsar.descriptors ON sbox_rlougee_qsar.descriptors.id = sbox_rlougee_qsar.univariate_calculations.fk_descriptor_id WHERE datasets.name LIKE \'%aeid\_9\_invitrodbv2%\' AND fk_descriptor_set_id = 1445 GROUP BY datasets.name, descriptors.descriptors_name, descriptors.label, statistics.abbreviation, uc_statistics.value'
)
def main():
    """Pull enrichment stats for a dataset, export significant chemotypes.

    Queries univariate enrichment statistics for <DataSetName>, selects
    chemotypes passing the --OR/--PV/--TP thresholds (and their inverse
    counterparts), then writes stats tables and optional fingerprint,
    image, and assay-space exports into new CTEW_* output folders.
    Exits 1 if the dataset is not found or the output folder exists.
    """
    args = docopt(__doc__)
    # print(args)
    if args['--version']:
        print('NCCT CLI: Version 0.0.0')
        sys.exit(0)

    # set input arguments and options to variables
    DataSetName = args['<DataSetName>'].lower()
    outputpath = args['--output']
    if outputpath is None:  # was `== None` (PEP8 E711)
        outputpath = ''
    image = args['--image']
    allassays = args['--allassays']
    fullenrich = args['--fullenrich']
    fpenrich = args['--fpenrich']
    a = args['--all']
    OR = float(args['--OR'])
    PV = float(args['--PV'])
    TP = float(args['--TP'])
    if a:
        image = True
        allassays = True
        fpenrich = True
    descriptor_set_id = 1445  # Toxprint descriptor set

    # # # QUERY ENRICHMENT DATA FOR YOUR DATASETNAME # # #
    mysession = SQLSession(Schemas.qsar_schema).get_session()
    enrichment_data = mysession.execute(
        'SELECT datasets.name, descriptors.descriptors_name, descriptors.label, statistics.abbreviation, uc_statistics.value FROM sbox_rlougee_qsar.datasets JOIN sbox_rlougee_qsar.univariate_calculations ON sbox_rlougee_qsar.univariate_calculations.fk_dataset_id = sbox_rlougee_qsar.datasets.id JOIN sbox_rlougee_qsar.uc_statistics ON sbox_rlougee_qsar.uc_statistics.fk_univ_calc_id = sbox_rlougee_qsar.univariate_calculations.id JOIN sbox_rlougee_qsar.statistics ON sbox_rlougee_qsar.statistics.id = sbox_rlougee_qsar.uc_statistics.fk_statistic_id JOIN sbox_rlougee_qsar.descriptors ON sbox_rlougee_qsar.descriptors.id = sbox_rlougee_qsar.univariate_calculations.fk_descriptor_id WHERE datasets.name LIKE \'%{}%\' AND fk_descriptor_set_id = {} GROUP BY datasets.name, descriptors.descriptors_name, descriptors.label, statistics.abbreviation, uc_statistics.value'
        .format(DataSetName, descriptor_set_id))
    enrichment_data = pd.DataFrame(list(enrichment_data))
    if enrichment_data.empty:
        print(Fore.RED +
              'ERROR: DataSetName string returned no results {}'.format(
                  DataSetName))
        sys.exit(1)

    # format table, pivot columns, reorder
    # null columns are dropped without dropna=False but this adds empty rows
    enrichment_data = pd.pivot_table(enrichment_data,
                                     values=4,
                                     index=[0, 1, 2],
                                     columns=[3],
                                     dropna=False)
    enrichment_data = enrichment_data.reset_index(level=[0, 1, 2])
    # pivot emits statistic columns in alphabetical order
    enrichment_data.columns = [
        'Data Table', 'Chemotype ID', 'Chemotype Label', 'Balanced Accuracy',
        'Total Chemotypes', 'False Negatives', 'False Positives',
        'Inverse Odds Ratio', 'Inverse P-Value', 'Odds Ratio', 'P-Value',
        'True Negatives', 'True Positives'
    ]
    # drop empty rows
    enrichment_data = enrichment_data.dropna(thresh=4)
    enrichment_data = enrichment_data[[
        'Data Table', 'Chemotype ID', 'Chemotype Label', 'Total Chemotypes',
        'True Positives', 'False Positives', 'False Negatives',
        'True Negatives', 'Balanced Accuracy', 'Odds Ratio', 'P-Value',
        'Inverse Odds Ratio', 'Inverse P-Value'
    ]]

    # FIND THE BASE FOR FILE NAMES
    file_name = enrichment_data['Data Table'][0]
    file_name = file_name.split(':')[-1]

    # CREATE FOLDERS FOR RESULTS AND EDIT PATH
    if not os.path.exists(outputpath + 'CTEW_' + file_name):
        os.makedirs(outputpath + 'CTEW_' + file_name)
        outputpath = outputpath + 'CTEW_' + file_name
        os.makedirs(outputpath + '/CTEW_Results')
        os.makedirs(outputpath + '/CTEW-INV_Results')
        regoutputpath = outputpath + '/CTEW_Results/'
        invoutputpath = outputpath + '/CTEW-INV_Results/'
    else:
        print(Fore.RED + 'ERROR: output folder {} already exists'.format(
            outputpath + 'CTEW_' + file_name))
        sys.exit(1)

    # # # PULL SIGNIFICANT TOXPRINTS # # #
    enrichment_data2 = enrichment_data.dropna()
    enrichment_data2.loc[:, 'Odds Ratio'] = pd.to_numeric(
        enrichment_data2.loc[:, 'Odds Ratio'], errors='coerce')
    enrichment_data2.loc[:, 'P-Value'] = pd.to_numeric(
        enrichment_data2.loc[:, 'P-Value'], errors='coerce')
    enrichment_data2.loc[:, 'True Positives'] = pd.to_numeric(
        enrichment_data2.loc[:, 'True Positives'], errors='coerce')
    sigtxp = []
    for i, x in enrichment_data2.iterrows():
        if x['Odds Ratio'] >= OR and x['P-Value'] <= PV and x[
                'True Positives'] >= TP:
            sigtxp.append(x['Chemotype ID'])

    # # # PULL SIGNIFICANT INVERSE TOXPRINTS # # #
    enrichment_data3 = enrichment_data.dropna()
    enrichment_data3.loc[:, 'Inverse Odds Ratio'] = pd.to_numeric(
        enrichment_data3.loc[:, 'Inverse Odds Ratio'], errors='coerce')
    enrichment_data3.loc[:, 'Inverse P-Value'] = pd.to_numeric(
        enrichment_data3.loc[:, 'Inverse P-Value'], errors='coerce')
    invsigtxp = []
    for i, x in enrichment_data3.iterrows():
        if x['Inverse Odds Ratio'] >= OR and x['Inverse P-Value'] <= PV:
            invsigtxp.append(x['Chemotype ID'])

    # CREATE SIGNIFICANT TOXPRINT EXPORT TABLE AND INVERSE SIGNIFICANT TOXPRINT TABLE
    sigtxp = pd.DataFrame(sigtxp, columns=['Chemotype ID'])
    OutputTable = pd.merge(sigtxp,
                           enrichment_data,
                           on='Chemotype ID',
                           how='inner')
    invsigtxp = pd.DataFrame(invsigtxp, columns=['Chemotype ID'])
    InvOutputTable = pd.merge(invsigtxp,
                              enrichment_data,
                              on='Chemotype ID',
                              how='inner')

    # CREATE FULL ENRICHMENT STATISTICS TABLE
    if fullenrich:
        enrichment_data.to_csv('{}Full_Enrichment_Stats_{}.tsv'.format(
            outputpath, file_name),
                               sep='\t',
                               index=False)

    # # # ENRICHED CHEMOTYPES # # #
    # return a table containing enriched chemotypes X DTXCID
    if fpenrich:
        # BUG FIX: DataFrame.append was deprecated in pandas 1.4 and removed
        # in 2.0 -- pd.concat is the supported replacement.  The excepts are
        # deliberately best-effort but now only swallow Exception, not
        # SystemExit/KeyboardInterrupt.
        try:
            ct_model_row = getenrichfp(DataSetName, sigtxp, regoutputpath,
                                       'CT-Enriched_FP_' + file_name)
            OutputTable = pd.concat([OutputTable, ct_model_row])
        except Exception:
            pass
        try:
            ct_inv_model_row = getenrichfp(DataSetName, invsigtxp,
                                           invoutputpath,
                                           'CT-INV-Enriched_FP_' + file_name)
            InvOutputTable = pd.concat([InvOutputTable, ct_inv_model_row])
        except Exception:
            pass

    # # # CREATE CT IMAGES # # #
    if image:
        imagemaker(sigtxp, 'CT-Enriched_Images_' + file_name, regoutputpath)
        imagemaker(invsigtxp, 'CT-INV-Enriched_Images_' + file_name,
                   invoutputpath)

    # # # CREATE ADDITIONAL ASSAY INFO # # #
    if allassays:
        assayspaceidentifier(sigtxp, 'CT-Enriched_TCAssays_' + file_name,
                             regoutputpath)
        assayspaceidentifier(invsigtxp,
                             'CT-INV-Enriched_TCAssays_' + file_name,
                             invoutputpath)

    # # # EXPORT CT-ENRICHED_STATS & CT-INV-ENRICHED_STATS # # #
    try:
        OutputTable.to_csv('{}CT-Enriched_Stats_{}.tsv'.format(
            regoutputpath, file_name),
                           sep='\t',
                           index=False)
    except Exception:
        pass
    try:
        InvOutputTable.to_csv('{}CT-INV-Enriched_Stats_{}.tsv'.format(
            invoutputpath, file_name),
                              sep='\t',
                              index=False)
    except Exception:
        pass
from database.database_schemas import Schemas
from database.session import SQLSession
from database.qsar.descriptors import Descriptors
import sys
import os
import pandas as pd
import datetime

# THIS CODE IS TO BE USED ONE TIME TO FILL sbox_rlougee_qsar.descriptors table for the fingerprint labels, ids, MySQL_setup
# add Toxcast fingerprint data to MySQL

# query AEID data
mysession = SQLSession(Schemas.qsar_schema).get_session()
aeid_data = mysession.execute(
    'SELECT aeid, assay_component_endpoint_name FROM invitrodb.assay_component_endpoint'
)
aeid_data = pd.DataFrame(list(aeid_data))
# coerce aeid to int so the sort below is numeric, not lexicographic
aeid_data[0] = [int(x) for x in aeid_data[0]]
aeid_data = aeid_data.sort_values(0, axis=0)
aeid_data = aeid_data.reset_index(drop=True)
# print(aeid_data)
# sys.exit(1)

# add ASSAY_COMPONENT_ENDPOINT_NAME as descriptors.label
# use aeid as descriptors_name
for i, row in aeid_data.iterrows():
    username = '******'
    # create a new datasets name entry
    # NOTE(review): the loop body appears truncated here -- the insert that
    # this comment describes is not visible in this chunk; confirm the rest
    # of the loop exists in the full file before running.
# Ad-hoc script: attach aeid / assay endpoint names to the burst and
# non-burst enrichment result tables.
# NOTE(review): pd, SQLSession and Schemas are not imported in this chunk --
# presumably imported earlier in the full file; verify before running.
nonburst = pd.read_csv("/home/rlougee/Desktop/primary_data/Non-Burst_MC_v2_2018_full_enrichment_results.tsv", sep='\t')
burst = pd.read_csv("/home/rlougee/Desktop/primary_data/Burst_MC_v2_2018_full_enrichment_results.tsv", sep='\t')
# non-burst names reduce to a numeric aeid; burst names reduce to the
# assay endpoint name string
nonburst['name'] = nonburst['name'].str.replace('Imported_DataTable:aeid_','').str.replace('_invitrodbv2_20180918', '').apply(int)
burst['name'] = burst['name'].str.replace('Imported_DataTable:','').str.replace('_Burst_MC_v2_20181120', '')
print(nonburst.tail())
print(burst.tail())

# QUERY AEID AND CORRECT ASSAY NAMES
mysession = SQLSession(Schemas.information_schema).get_session()
q0 = mysession.execute('SELECT aeid, assay_component_endpoint_name FROM sbox_rlougee_invitrodb.assay_component_endpoint')
q0 = pd.DataFrame(list(q0))
# q0[1] = q0[1].apply(str)
print('q0 shape', q0.shape)
print('nonburst shape', nonburst.shape)
print('burst shape', burst.shape)

# non-burst joins on the numeric aeid (q0 column 0); burst joins on the
# endpoint name (q0 column 1)
nonburst2 = pd.merge(nonburst, q0, how='inner', left_on=['name'], right_on=[0])
print(nonburst2.shape)
burst2 = pd.merge(burst, q0, how='inner', left_on=['name'], right_on=[1])
print(burst2.shape)
# ok merges look good now
nonburst2.to_csv('/home/rlougee/Desktop/nonburst.tsv', sep='\t')
import glob
from multiprocessing import Pool
# BUG FIX: `subp` was used below but never imported in this chunk,
# which would raise NameError at runtime.
import subprocess as subp


def com(i):
    """Run pullenrichment for one dataset name (best effort, via shell)."""
    print(i)
    command = subp.Popen(
        'pullenrichment "{}%" --fpenrich -o "/share/home/rlougee/Desktop/invitrodb_v2_enrichments/"'.format(i),
        shell=True)
    command.communicate()


# create a tsv with dataset name x TP/TOTAL_CHEM so that we can see the
# coverage of our Toxprint Models
# NOTE(review): SQLSession / Schemas are not imported in this chunk --
# presumably imported earlier in the full file; verify before running.
if __name__ == '__main__':
    # the __main__ guard is required for multiprocessing.Pool under the
    # spawn start method (Windows/macOS); it is a no-op under fork.
    # get all relevant datasets
    mysession = SQLSession(Schemas.qsar_schema).get_session()
    datasets = [x[0] for x in mysession.execute(
        'SELECT datasets.name FROM sbox_rlougee_qsar.datasets'
        ' WHERE sbox_rlougee_qsar.datasets.name LIKE "Imported_DataTable:aeid\_%\_invitrodbv2\_20180918"')]

    # fan the per-dataset runs out over 15 worker processes
    p = Pool(15)
    p.map(com, datasets)
def getenrichfp(DataSetName, sigtxp, mypath, myfilename, dsi=1445):
    """Get Enrichment data for a combined set of chemotypes.

    Builds a compound x significant-chemotype fingerprint table for the
    dataset and returns a one-row "full model coverage" confusion matrix
    with Fisher's-exact statistics.

    :param DataSetName: substring used to locate the dataset by name.
    :param sigtxp: sequence/DataFrame of significant chemotype descriptor names.
    :param mypath: output directory (only used by the commented-out export).
    :param myfilename: label written into the returned row's 'Data Table'.
    :param dsi: descriptor set id (default 1445 = Toxprints).
    :return: single-row stats DataFrame, or None when sigtxp is empty.
    """
    # aborts if no significant chemotypes
    if len(sigtxp) == 0:
        return None
    mysession = SQLSession(Schemas.qsar_schema).get_session()
    MyDataSet = mysession.execute(
        'SELECT dsstox_compound_id, measured_value_dn, descriptor_string_tsv FROM sbox_rlougee_qsar.datasets'
        ' JOIN sbox_rlougee_qsar.dataset_datapoints ON sbox_rlougee_qsar.dataset_datapoints.fk_dataset_id = sbox_rlougee_qsar.datasets.id'
        ' JOIN sbox_rlougee_qsar.datapoints ON sbox_rlougee_qsar.datapoints.id = sbox_rlougee_qsar.dataset_datapoints.fk_datapoint_id'
        ' JOIN ro_stg_dsstox.compounds ON sbox_rlougee_qsar.datapoints.efk_dsstox_compound_id = ro_stg_dsstox.compounds.id'
        ' JOIN sbox_rlougee_qsar.compound_descriptor_sets ON ro_stg_dsstox.compounds.id = sbox_rlougee_qsar.compound_descriptor_sets.efk_dsstox_compound_id'
        ' WHERE sbox_rlougee_qsar.datasets.name LIKE \'%{}%\' AND sbox_rlougee_qsar.compound_descriptor_sets.fk_descriptor_set_id = {}'.format(DataSetName, dsi))
    MyDataSet = pd.DataFrame(list(MyDataSet))
    MyDataSet.columns = ['Dsstox_Compound_ID', 'Hit_Call', 'Toxprint']

    # split the tab-separated fingerprint string into one column per bit
    # (redundant `.str[:]` from the original removed)
    MyDataSet = pd.concat(
        [MyDataSet, MyDataSet['Toxprint'].str.split('\t', expand=True)],
        axis=1)
    MyDataSet = MyDataSet.drop('Toxprint', axis=1)

    # name the columns correctly
    # PERF FIX: one rename call with a full mapping instead of one
    # DataFrame.rename (full copy) per column.
    query3 = mysession.query(Descriptors.descriptors_name, Descriptors.label)\
        .filter(Descriptors.fk_descriptor_set_id == dsi)
    descriptornames = pd.DataFrame(list(query3))
    MyDataSet = MyDataSet.rename(
        columns=dict(enumerate(descriptornames['label'])))

    # drop fingerprint columns that are not significant
    # PERF FIX: single drop call with a set-based membership test instead
    # of one drop (full copy) per insignificant column.
    sigtxp = pd.DataFrame(sigtxp)
    sigtxp.columns = ['descriptors_name']
    siglabel = pd.merge(sigtxp, descriptornames,
                        on='descriptors_name', how='inner')
    siglabel = set(siglabel['label'])
    insignificant = [c for c in MyDataSet.columns[2:] if c not in siglabel]
    MyDataSet = MyDataSet.drop(insignificant, axis=1)
    # MyDataSet.to_csv('{}{}.tsv'.format(mypath, myfilename), sep='\t', index=False)

    # return overall balanced accuracy calculations
    # can just make a unique confusion matrix for significant toxprints and
    # add to CT-Enriched Stats file
    model_row = pd.DataFrame(
        [['Chemotype Full Model Coverage', myfilename,
          " ".join(sigtxp['descriptors_name']),
          0, 0, 0, 0, 0, 0, 0, 0, 0, 0]],
        columns=['Chemotype ID', 'Data Table', 'Chemotype Label',
                 'Total Chemotypes', 'True Positives', 'False Positives',
                 'False Negatives', 'True Negatives', 'Balanced Accuracy',
                 'Odds Ratio', 'P-Value', 'Inverse Odds Ratio',
                 'Inverse P-Value'])

    # fill model_row confusion matrix: a compound counts as "predicted
    # active" when any significant chemotype bit is set
    for index, row in MyDataSet.iterrows():
        rowsum = sum(int(x) for x in row.iloc[2:])
        if row['Hit_Call'] == 1 and rowsum > 0:
            model_row['True Positives'] += 1
        elif row['Hit_Call'] == 1 and rowsum == 0:
            model_row['False Negatives'] += 1
        elif row['Hit_Call'] == 0 and rowsum > 0:
            model_row['False Positives'] += 1
        elif row['Hit_Call'] == 0 and rowsum == 0:
            model_row['True Negatives'] += 1

    # fill model_row statistics
    # NOTE(review): `stats` (scipy.stats) is not imported in this chunk --
    # presumably imported at the top of the full file; verify.
    oddsratio, pvalue = stats.fisher_exact(
        [[int(model_row['True Positives']), int(model_row['False Positives'])],
         [int(model_row['False Negatives']), int(model_row['True Negatives'])]],
        alternative='greater')
    model_row['P-Value'] = pvalue
    model_row['Odds Ratio'] = oddsratio
    model_row['Total Chemotypes'] = (model_row['True Positives'] +
                                     model_row['False Positives'])
    # NOTE(review): if the dataset has no actives (TP+FN == 0) or no
    # inactives (TN+FP == 0) this division yields inf/nan -- confirm that
    # callers never pass such a dataset.
    BA = (((model_row['True Positives'] /
            (model_row['True Positives'] + model_row['False Negatives'])) +
           (model_row['True Negatives'] /
            (model_row['True Negatives'] + model_row['False Positives']))) / 2)
    model_row['Balanced Accuracy'] = float(BA)
    inv_oddsratio, inv_pvalue = stats.fisher_exact(
        [[int(model_row['False Positives']), int(model_row['True Positives'])],
         [int(model_row['True Negatives']), int(model_row['False Negatives'])]],
        alternative='greater')
    model_row['Inverse P-Value'] = inv_pvalue
    model_row['Inverse Odds Ratio'] = inv_oddsratio
    # print(model_row)
    return model_row
import glob
import pandas as pd
from database.database_schemas import Schemas
from database.dsstox.compounds import Compounds
from database.qsar.compound_descriptor_sets import CompoundDescriptorSets
from database.session import SQLSession
import sys

# Re-save each per-aeid assay file under its assay_component_endpoint_name,
# dropping all-NaN columns on the way.
# PERF FIX: the DB session was being recreated inside the loop for every
# file; one session serves all queries.
mysession = SQLSession(Schemas.qsar_schema).get_session()
for f in glob.glob(
        "/home/rlougee/Desktop/invitrodb_v2_burst_splits/clean_assays/*"):
    aeid = f.split('/')[-1]  # file name is the aeid
    a = pd.read_csv(f, sep='\t')
    query0 = mysession.execute(
        'SELECT assay_component_endpoint_name FROM sbox_rlougee_invitrodb.assay_component_endpoint WHERE sbox_rlougee_invitrodb.assay_component_endpoint.aeid = {}'
        .format(aeid))
    # NOTE(review): raises IndexError if the aeid has no endpoint row --
    # presumably every file maps to a known aeid; verify.
    output_name = list(query0)[0][0]
    a = a.dropna(axis=1)
    a.to_csv(
        '/home/rlougee/Desktop/invitrodb_v2_burst_splits/clean_assays2/{}.tsv'.
        format(output_name),
        sep='\t',
        index=False)
from multiprocessing import Pool
# BUG FIX: `subp` was used below but never imported in this chunk,
# which would raise NameError at runtime.
import subprocess as subp


# pull enrichments for the BURST assays
def com(i):
    """Run pullenrichment for one burst dataset name (via shell)."""
    print(i)
    # BUG FIX: the original joined the exports with single '&', which runs
    # them as background jobs in separate subshells so they never affect
    # pullenrichment's environment; '&&' chains them in the same shell.
    # Also OPM_NUM_THREADS was a typo for OpenMP's OMP_NUM_THREADS.
    command = subp.Popen(
        'export OMP_NUM_THREADS=5 && export USE_SIMPLE_THREADED_LEVEL3=1 && pullenrichment "{}%" --fpenrich -o "/share/home/rlougee/Desktop/invitrodb_v2_burst_enrichments/"'.format(i),
        shell=True)
    command.communicate()


# create a tsv with dataset name x TP/TOTAL_CHEM so that we can see the
# coverage of our Toxprint Models
# NOTE(review): SQLSession / Schemas are not imported in this chunk --
# presumably imported earlier in the full file; verify before running.
if __name__ == '__main__':
    # the __main__ guard is required for multiprocessing.Pool under the
    # spawn start method (Windows/macOS); it is a no-op under fork.
    # get all relevant datasets
    mysession = SQLSession(Schemas.qsar_schema).get_session()
    datasets = [x[0] for x in mysession.execute(
        'SELECT datasets.name FROM sbox_rlougee_qsar.datasets'
        ' WHERE sbox_rlougee_qsar.datasets.name LIKE "%\_Burst\_MC\_v2\_2018112%"')]

    # resume from index 384, 5 workers
    p = Pool(5)
    p.map(com, datasets[384:])