Example #1
def mk_toxcastfp(idrow):
    # query existing toxprints
    dsi = 1449  # ToxcastFP descriptor set id (unused below)

    mysession = SQLSession(Schemas.qsar_schema).get_session()

    # pull compound ids in invitrodb
    CID1 = mysession.execute(
        'SELECT compounds.id, mc5.aeid, compounds.dsstox_compound_id, mc5.hitc FROM invitrodb.mc5'
        ' JOIN invitrodb.mc4 ON mc4.m4id = mc5.m4id'
        ' JOIN invitrodb.sample ON sample.spid = mc4.spid'
        ' JOIN ro_stg_dsstox.generic_substances ON generic_substances.id = sample.chid'
        ' JOIN ro_stg_dsstox.generic_substance_compounds ON generic_substance_compounds.fk_generic_substance_id = generic_substances.id'
        ' JOIN ro_stg_dsstox.compounds ON compounds.id = generic_substance_compounds.fk_compound_id'
    )

    CID1 = pd.DataFrame(list(CID1))

    # filter out rows without a given id
    output_df = pd.merge(idrow, CID1, on=0, how='inner')

    # sort
    output_df = output_df.sort_values(
        [output_df.columns.values[0], output_df.columns.values[1]])
    output_df = output_df.reset_index(drop=True)

    # replace -1 values with 0
    output_df = output_df.replace(-1, 0)

    # pivot tables per dtxcid
    output_df = pd.pivot_table(output_df,
                               index=0,
                               columns=1,
                               values=3,
                               fill_value='',
                               aggfunc=np.sum)

    # set the correct hitcall:
    # if any duplicate is a hit, the fingerprint is considered a hit

    output_df = output_df.apply(
        lambda x: [y if y == '' else (int(y) if y <= 1 else 1) for y in x])

    # not all columns are present depending on query
    # pull remaining columns and add empty columns to output_df dataframe
    full_columns = [
        x[0] for x in mysession.execute(
            'SELECT assay_component_endpoint.aeid FROM invitrodb.assay_component_endpoint'
        )
    ]

    for x in full_columns:
        if x not in output_df.columns.values:
            output_df[x] = ''

    # output_df = output_df.sort_values(list(output_df.columns), axis=1, ascending=True)
    output_df = output_df.reindex(sorted(output_df.columns), axis=1)

    return output_df
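# A minimal sanity-check sketch of the pivot/clip logic above (toy data,
# illustrative only, not part of the original script): duplicate
# (compound, aeid) hitcalls are summed by pd.pivot_table and then clipped
# to 1, so any duplicate hit counts as a hit.
import numpy as np
import pandas as pd

toy = pd.DataFrame({0: ['DTXCID1', 'DTXCID1', 'DTXCID2'],   # compound id
                    1: [9, 9, 9],                           # aeid
                    3: [1, 1, 0]})                          # hitc
pivot = pd.pivot_table(toy, index=0, columns=1, values=3,
                       fill_value='', aggfunc=np.sum)
pivot = pivot.apply(lambda col: [v if v == '' else min(int(v), 1) for v in col])
print(pivot)  # DTXCID1 -> 1 (two duplicate hits summed, then clipped), DTXCID2 -> 0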
Example #2
def mk_special_fingerprints(idrow):
    # query existing toxprints
    dsi = 1445
    mysession = SQLSession(Schemas.qsar_schema).get_session()

    query2 = mysession.query(Compounds.dsstox_compound_id, CompoundDescriptorSets.descriptor_string_tsv) \
        .join(CompoundDescriptorSets, Compounds.id == CompoundDescriptorSets.efk_dsstox_compound_id).filter(CompoundDescriptorSets.fk_descriptor_set_id == dsi)
    df2 = pd.DataFrame(list(query2))

    # FILTERING BY A LIST OF >1000 WON'T WORK IN MANY DATABASES (THIS IS BREAKING THE SCRIPT HERE ON FULL UPDATE)
    # doing a full query then a merge after
    df2 = pd.merge(pd.DataFrame(idrow, columns=['dsstox_compound_id']),
                   df2,
                   on='dsstox_compound_id')

    # something to separate and name fingerprint columns
    df2 = pd.concat([
        df2, df2['descriptor_string_tsv'].str[:].str.split('\t', expand=True)
    ],
                    axis=1)
    df2 = df2.drop('descriptor_string_tsv', axis=1)

    # # # GENERATE SPECIFIC NEW FINGERPRINTS # # #
    # create an empty column for the new fingerprint
    df2['Special_Toxprints'] = ""

    # iterate through the datatable and create new Special Toxprints
    # make sure you are looking at the right index when combining Toxprints
    # NOTE: assigning into `row` inside iterrows() only mutates a copy, so the
    # result is written back explicitly with df2.at[index, ...]
    for index, row in df2.iterrows():
        # have to code each special toxprint this way
        if row[480] == '1' and row[489] == '1':
            special = '1'
        else:
            special = '0'

        # make sure to add tabs before the rest of the toxprints
        if row[489] == '1' and row[480] == '0':
            special += '\t1'
        else:
            special += '\t0'

        if row[480] == '1' and row[489] == '0':
            special += '\t1'
        else:
            special += '\t0'

        df2.at[index, 'Special_Toxprints'] = special

    # remove everything but fingerprints and DTXCIDs
    output_df = df2[['dsstox_compound_id', 'Special_Toxprints']]
    return output_df
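# mk_special_fingerprints works around the IN-list limit noted above by pulling
# the whole descriptor table and merging locally. An alternative sketch
# (query_in_chunks is a hypothetical helper, not part of the original script):
# split the id list into chunks small enough for the database's IN limit.
def query_in_chunks(session, dtxcids, dsi, chunk_size=500):
    """Yield (dsstox_compound_id, descriptor_string_tsv) rows chunk by chunk."""
    for start in range(0, len(dtxcids), chunk_size):
        chunk = dtxcids[start:start + chunk_size]
        yield from session.query(Compounds.dsstox_compound_id,
                                 CompoundDescriptorSets.descriptor_string_tsv) \
            .join(CompoundDescriptorSets,
                  Compounds.id == CompoundDescriptorSets.efk_dsstox_compound_id) \
            .filter(CompoundDescriptorSets.fk_descriptor_set_id == dsi) \
            .filter(Compounds.dsstox_compound_id.in_(chunk))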
Example #3
def updatedatabase(fingerprintdf):
    username = '******'
    descriptors_index = 1448

    # Query for compound_descriptor_sets.id
    mysession = SQLSession(Schemas.qsar_schema).get_session()

    query3 = mysession.query(Compounds.dsstox_compound_id, CompoundDescriptorSets.id) \
        .join(CompoundDescriptorSets, Compounds.id == CompoundDescriptorSets.efk_dsstox_compound_id).filter(CompoundDescriptorSets.fk_descriptor_set_id == descriptors_index)

    df3 = pd.DataFrame(list(query3))

    #join query with fingerprintdf

    mytable = pd.merge(df3, fingerprintdf, on='dsstox_compound_id')

    # # # CODE FOR UPDATING # # #
    for index, row in mytable.iterrows():
        id = row[1]
        descriptor_string_tsv = row[2]
        updated_by = username
        mysession.query(CompoundDescriptorSets.id,
                        CompoundDescriptorSets.descriptor_string_tsv,
                        CompoundDescriptorSets.updated_by).filter(
                            CompoundDescriptorSets.id == id).update({
                                "descriptor_string_tsv":
                                descriptor_string_tsv,
                                "updated_by":
                                updated_by
                            })
        mysession.commit()
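# updatedatabase() issues one UPDATE and one COMMIT per row. A sketch of a
# batched alternative under the same assumptions (mytable columns as above,
# CompoundDescriptorSets.id as the primary key); not part of the original code.
def updatedatabase_batched(mysession, mytable, username):
    mappings = [{'id': row[1],
                 'descriptor_string_tsv': row[2],
                 'updated_by': username}
                for _, row in mytable.iterrows()]
    mysession.bulk_update_mappings(CompoundDescriptorSets, mappings)
    mysession.commit()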
def check(f):
    mysession = SQLSession(Schemas.qsar_schema).get_session()

    # pull compound ids in invitrodb
    CID1 = mysession.execute(
        'SELECT compounds.id FROM invitrodb.mc5'
        ' JOIN invitrodb.mc4 ON mc4.m4id = mc5.m4id'
        ' JOIN invitrodb.sample ON sample.spid = mc4.spid'
        ' JOIN ro_stg_dsstox.generic_substances ON generic_substances.id = sample.chid'
        ' JOIN ro_stg_dsstox.generic_substance_compounds ON generic_substance_compounds.fk_generic_substance_id = generic_substances.id'
        ' JOIN ro_stg_dsstox.compounds ON compounds.id = generic_substance_compounds.fk_compound_id'
    )

    CID1 = set([x[0] for x in list(CID1)])

    # pull compound ids in compound_descriptor_sets
    CID2 = mysession.query(CompoundDescriptorSets.efk_dsstox_compound_id) \
        .filter(CompoundDescriptorSets.fk_descriptor_set_id == '1449')

    CID2 = [x[0] for x in list(CID2)]

    # # # CHECKS FOR ID AND ToxcastFPs IN QSAR.COMPOUND_DESCRIPTOR_SETS # # #
    # # # MAKE A LIST OF THE DSSTOX.COMPOUNDS.ID THAT DON'T HAVE SPECIAL ToxcastFPs # # #

    CID3 = list(CID1 - set(CID2))

    # check dataframes
    # print("\n set CID1:", len(set(CID1)))
    # print("\n set CID2:", len(set(CID2)))
    # print("\n CID3", len(CID3), CID3)
    # print("\n CID3df", pd.DataFrame(CID3).head())
    # print("\n CID1df", pd.DataFrame(list(CID1)).head())

    # # # # IF QSAR.COMPOUND_DESCRIPTOR_SETS IS MISSING EITHER THEN GENERATE ToxcastFPs FOR THIS COMPOUND # # #
    if CID3 == [] and f is False:
        print(Fore.RED +
              'SQL query is empty: All ToxcastFP available or no DTXCIDs')
        sys.exit(0)

    elif CID3 == [] and f is True:
        return pd.DataFrame([]), pd.DataFrame(CID1)

    else:
        return pd.DataFrame(CID3), pd.DataFrame(CID1)
Example #5
def main():
    args = docopt(__doc__)
    # print(args)

    if args['--version']:
        print('NCCT CLI: Version 0.0.0')
        sys.exit(0)

    # set input arguments and options to variables
    InputString = args['<InputString>']
    if not InputString:
        InputString = sys.stdin.read()

    mysession = SQLSession(Schemas.dsstox_schema).get_session()
    query = mysession.execute(
        'SELECT datasets.name FROM sbox_rlougee_qsar.datasets WHERE datasets.name LIKE "%{}%"'
        .format(InputString))
    for i in pd.DataFrame(list(query))[0]:
        print(i)
Example #6
def fillnewenrich(x_aeid):
    print(x_aeid)
    # retrieve the latest dataset for the aeid
    mysession = SQLSession(Schemas.qsar_schema).get_session()

    new_df = mysession.query(Compounds.dsstox_compound_id, Datapoints.measured_value_dn, Datasets.name, Datasets.id) \
        .join(Datapoints, Datapoints.efk_dsstox_compound_id == Compounds.id) \
        .join(DatasetDatapoints, Datapoints.id == DatasetDatapoints.fk_datapoint_id) \
        .join(Datasets, DatasetDatapoints.fk_dataset_id == Datasets.id)\
        .filter(Datasets.name == x_aeid)

    new_df = pd.DataFrame(list(new_df))

    # new_df = query1[query1['name'].isin([x_aeid])]
    # rename columns
    new_df.columns = ['dsstox_compound_id', 'hitc', 'name', 'dataset_id']
    my_dataset_id = new_df['dataset_id'].iloc[0]
    # make the enrichment table
    filluc(new_df, my_dataset_id)
Example #7
def filldatabase(yourlist, desc_set_id, prefix):
    username = '******'
    descriptors_index = desc_set_id
    mysession = SQLSession(Schemas.qsar_schema).get_session()

    descriptor_objects = []
    for index, val in enumerate(yourlist):

        fk_descriptor_set_id = descriptors_index
        index_number = index + 1
        # xx = [str(x) for x in list(row)]
        descriptors_name = str(prefix.format(index + 1))
        created_by = username
        updated_by = username
        long_description = ''
        short_description = ''
        label = val
        descriptor = Descriptors(index_number=index_number,
                                 descriptors_name=descriptors_name,
                                 long_description=long_description,
                                 short_description=short_description,
                                 label=label,
                                 created_by=created_by,
                                 fk_descriptor_set_id=fk_descriptor_set_id,
                                 updated_by=updated_by)
        descriptor_objects.append(descriptor)
    mysession.bulk_save_objects(descriptor_objects)
    mysession.commit()
def filldatabase(fingerprintdf):
    mysession = SQLSession(Schemas.qsar_schema).get_session()

    username = '******'
    descriptors_index = 1448

    compound_descriptor_set_objects = []

    # print(fingerprintdf)

    for index, row in fingerprintdf.iterrows():

        efk_dsstox_compound_id = row[0].split('0', 1)[1]
        fk_descriptor_set_id = descriptors_index
        xx = [str(x) for x in list(row[1])]
        descriptor_string_tsv = '\t'.join(xx)
        created_by = username
        updated_by = username
        compound_descriptor_sets = CompoundDescriptorSets(
            efk_dsstox_compound_id=efk_dsstox_compound_id,
            descriptor_string_tsv=descriptor_string_tsv,
            created_by=created_by,
            fk_descriptor_set_id=fk_descriptor_set_id,
            updated_by=updated_by)
        compound_descriptor_set_objects.append(compound_descriptor_sets)
    mysession.bulk_save_objects(compound_descriptor_set_objects)
    mysession.commit()
Example #9
def check(f):
    mysession = SQLSession(Schemas.qsar_schema).get_session()

    # get all dtxcids
    # remove blacklist here with NOT LIKE '%|%'
    df1 = pd.DataFrame(
        list(
            mysession.execute(
                'SELECT compounds.id, compounds.dsstox_compound_id FROM ro_stg_dsstox.compounds WHERE compounds.smiles NOT LIKE "%|%"'
            )))
    l0 = df1[1].tolist()
    l1 = df1[0].tolist()

    # get all compound ids with toxprints
    l2 = [
        i[0] for i in list(
            mysession.execute(
                'SELECT efk_dsstox_compound_id FROM sbox_rlougee_qsar.compound_descriptor_sets WHERE fk_descriptor_set_id = 1445'
            ))
    ]

    # find difference between ids
    l3 = list(set(l1) - set(l2))

    # merge df1 l3
    df2 = pd.merge(pd.DataFrame(l3), df1, on=[0], how='left')
    l4 = list(df2[1])

    ### IF QSAR.COMPOUND_DESCRIPTOR_SETS IS MISSING EITHER THEN GENERATE TOXPRINTS FOR THIS COMPOUND ###
    if not l4 and f is False:
        print('SQL query is empty: All toxprints available or no DTXCIDs')
        sys.exit(0)
    elif not l4 and f is True:
        return [], l0
    else:
        return l4, l0
Example #10
def main():
    args = docopt(__doc__)
    # print(args)

    if args['--version']:
        print('NCCT CLI: Version 0.0.0')
        sys.exit(0)

    ### QUERY THE MYSQL DB 4 A COMPLETE LIST OF AEIDS, ENDPOINTS & DTXCIDS ###
    mysession = SQLSession(Schemas.qsar_schema).get_session()

    ### new stuff ###
    query0 = mysession.execute(
        'SELECT DISTINCT(name) FROM sbox_rlougee_qsar.datasets WHERE id NOT IN (SELECT fk_dataset_id FROM sbox_rlougee_qsar.univariate_calculations)'
    )
    query0 = [x[0] for x in list(query0)]

    # if query1 is empty exit
    if len(query0) == 0:
        print(Fore.RED + 'Enrichments are up to date')
        sys.exit(0)

    p = Pool(20)
    p.map(fillnewenrich, query0)
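# Note on Pool + SQLAlchemy: connections must not be shared across forked
# workers, which is why fillnewenrich() opens its own SQLSession inside the
# worker process. A minimal sketch of that pattern (the worker body is
# illustrative only):
def pool_worker(dataset_name):
    session = SQLSession(Schemas.qsar_schema).get_session()  # per-process session
    try:
        return session.query(Datasets.id).filter(
            Datasets.name == dataset_name).first()
    finally:
        session.close()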
def update_toxprint_database(fingerprintdf):
    username = '******'
    descriptors_index = 1445

    # Query for compound_descriptor_sets.id
    mysession = SQLSession(Schemas.qsar_schema).get_session()

    query3 = mysession.query(CompoundDescriptorSets.id, CompoundDescriptorSets.efk_dsstox_compound_id)\
        .filter(CompoundDescriptorSets.fk_descriptor_set_id == descriptors_index)

    df3 = pd.DataFrame(list(query3))

    #join query with fingerprintdf
    fingerprintdf = fingerprintdf.reset_index()
    fingerprintdf = fingerprintdf.rename(
        columns={'M_NAME':
                 'efk_dsstox_compound_id'})  # not sure this will work
    fingerprintdf['efk_dsstox_compound_id'] = [
        int(x[8:]) for x in fingerprintdf['efk_dsstox_compound_id']
    ]
    mytable = pd.merge(df3, fingerprintdf, on='efk_dsstox_compound_id')

    # # # CODE FOR UPDATING # # #
    for index, row in mytable.iterrows():
        id = str(row[0])
        xx = [str(x) for x in list(row[2:])]
        descriptor_string_tsv = '\t'.join(xx)
        updated_by = username
        mysession.query(CompoundDescriptorSets.id,
                        CompoundDescriptorSets.descriptor_string_tsv,
                        CompoundDescriptorSets.updated_by).filter(
                            CompoundDescriptorSets.id == id).update({
                                "descriptor_string_tsv":
                                descriptor_string_tsv,
                                "updated_by":
                                updated_by
                            })
        mysession.commit()
Example #12
from database.database_schemas import Schemas
from database.dsstox.compounds import Compounds
from database.qsar.compound_descriptors import CompoundDescriptors
from database.session import SQLSession
from database.dsstox.generic_substance_compounds import GenericSubstanceCompounds
from database.dsstox.generic_substances import GenericSubstances


import pandas as pd

mytable = pd.read_csv('~/Desktop/Katies_data.tsv', sep='\t')
dtxsid = mytable.iloc[:,4]

mysession = SQLSession(Schemas.qsar_schema).get_session()

query = mysession.query(GenericSubstances.dsstox_substance_id, Compounds.id, Compounds.dsstox_compound_id).join(
    GenericSubstanceCompounds) \
    .join(Compounds).filter(GenericSubstances.dsstox_substance_id.in_(dtxsid))

df = pd.DataFrame(list(query))

myids = [int(x) for x in df.iloc[:,1]]

# query1 = mysession.query(Compounds.id, Compounds.dsstox_compound_id).filter(Compounds.dsstox_compound_id.in_(dtxsid))
# df1 = pd.DataFrame(list(query1))

query2 = mysession.query(CompoundDescriptors.efk_dsstox_compound_id, CompoundDescriptors.descriptor_string_tsv,
                         CompoundDescriptors.fk_descriptor_set_id).filter(
    CompoundDescriptors.efk_dsstox_compound_id.in_(myids))
df2 = pd.DataFrame(list(query2))
Example #13
from Enrichment_MySQL.enrich_mysql0.duplicate_handler_0 import handle_duplicates
from Enrichment_MySQL.enrich_mysql0.fillfp_0 import fillfp
from database.database_schemas import Schemas
from database.dsstox.compounds import Compounds
from database.dsstox.generic_substance_compounds import GenericSubstanceCompounds
from database.dsstox.generic_substances import GenericSubstances
from database.invitrodb.mc4 import Mc4
from database.invitrodb.mc5 import Mc5
from database.invitrodb.sample import Sample
from database.session import SQLSession
import pandas as pd

# GET ALL ASSAYS FROM MC5
# QUERY MC5 data for hitcalls and chemical IDs

mysession = SQLSession(Schemas.information_schema).get_session()

query0 = mysession.query(Compounds.dsstox_compound_id, Mc5.hitc, Mc5.aeid) \
    .join(GenericSubstanceCompounds, Compounds.id == GenericSubstanceCompounds.fk_compound_id) \
    .join(GenericSubstances, GenericSubstances.id == GenericSubstanceCompounds.fk_generic_substance_id) \
    .join(Sample, Sample.chid == GenericSubstances.id) \
    .join(Mc4, Mc4.spid == Sample.spid) \
    .join(Mc5, Mc5.m4id == Mc4.m4id)

mc5_table = pd.DataFrame(list(query0))
# print(mc5_table.shape)
# print( mc5_table[mc5_table['aeid']==1086] )
# sys.exit(1)

# run xgboost for files in test_aeids
Example #14
from database.qsar.compound_descriptor_sets import CompoundDescriptorSets
from database.dsstox.compounds import Compounds
from database.database_schemas import Schemas
from database.session import SQLSession

mysession = SQLSession(Schemas.qsar_schema).get_session()

x_aeid = 926

query3 = mysession.query(Compounds.dsstox_compound_id, CompoundDescriptorSets.descriptor_string_tsv) \
    .join(CompoundDescriptorSets, Compounds.id == CompoundDescriptorSets.efk_dsstox_compound_id) \
    .filter(CompoundDescriptorSets.fk_descriptor_set_id == 1445)
    # .filter(Compounds.dsstox_compound_id.in_(str(new_df.iloc[0]))) \


print(list(query3))
Example #15
import pandas as pd
import subprocess as subp
import glob
from multiprocessing import Pool
from database.database_schemas import Schemas
from database.session import SQLSession

def com(i):
    print(i)
    command = subp.Popen(
        'pullenrichment "{}%" --fpenrich -o "/share/home/rlougee/Desktop/invitrodb_v2_enrichments/"'.format(i),
        shell=True)
    command.communicate()

# create a tsv with dataset name x TP/TOTAL_CHEM so that we can see the coverage of our Toxprint Models

# get all relevant datasets
mysession = SQLSession(Schemas.qsar_schema).get_session()

datasets = [x[0] for x in mysession.execute('SELECT datasets.name FROM sbox_rlougee_qsar.datasets'
                                 ' WHERE sbox_rlougee_qsar.datasets.name LIKE "Imported_DataTable:aeid\_%\_invitrodbv2\_20180918"')]

# print(len(datasets))
# print(datasets[180:])
# sys.exit(1)
# # get TP stats file for each dataset
# for i in datasets[38:]:
#     print(i)
#     sys.exit(0)
#     command = subp.Popen('pullenrichment "{}%" --fpenrich -o "/share/home/rlougee/Desktop/invitrodb_v2_enrichments/"'.format(i), shell=True)
#     command.communicate()

Example #16
        fk_datapoint_id = int(datapoints.id)

        dataset_datapoints = DatasetDatapoints(fk_dataset_id=fk_dataset_id,
                                               fk_datapoint_id=fk_datapoint_id,
                                               updated_by=username,
                                               created_by=username)
        mysession.add(dataset_datapoints)
    mysession.commit()

    ####################################################################################################################


if __name__ == "__main__":

    #QUERY MC5 data for hitcalls, chemical IDs, assay subcategories
    mysession = SQLSession(Schemas.information_schema).get_session()

    query0 = mysession.query(
        Compounds.id, Compounds.dsstox_compound_id, Mc5.hitc, Mc5.aeid,
        AssayComponentEndpoint.assay_component_endpoint_name,
        AssayComponent.assay_component_desc,
        AssayComponent.assay_component_target_desc,
        AssayComponentEndpoint.assay_component_endpoint_desc,
        AssayComponentEndpoint.assay_function_type,
        AssayComponentEndpoint.normalized_data_type,
        AssayComponentEndpoint.analysis_direction,
        AssayComponentEndpoint.burst_assay,
        AssayComponentEndpoint.key_positive_control,
        AssayComponentEndpoint.signal_direction,
        AssayComponentEndpoint.intended_target_type,
        AssayComponentEndpoint.intended_target_type_sub,
        AssayComponentEndpoint.intended_target_family,
        AssayComponentEndpoint.intended_target_family_sub,
        AssayComponent.assay_design_type,
        AssayComponent.assay_design_type_sub,
        AssayComponent.biological_process_target,
        AssayComponent.detection_technology_type,
        AssayComponent.detection_technology_type_sub,
        AssayComponent.detection_technology,
        AssayComponent.signal_direction_type,
        AssayComponent.key_assay_reagent,
        AssayComponent.key_assay_reagent_type,
        AssayComponent.technological_target_type,
        AssayComponent.technological_target_type_sub) \
        .join(GenericSubstanceCompounds, Compounds.id == GenericSubstanceCompounds.fk_compound_id) \
        .join(GenericSubstances, GenericSubstances.id == GenericSubstanceCompounds.fk_generic_substance_id) \
        .join(Sample, Sample.chid == GenericSubstances.id) \
        .join(Mc4, Mc4.spid == Sample.spid) \
        .join(Mc5, Mc5.m4id == Mc4.m4id) \
        .join(AssayComponentEndpoint, AssayComponentEndpoint.aeid == Mc5.aeid) \
        .join(AssayComponent, AssayComponent.acid == AssayComponentEndpoint.acid)

    mytable = pd.DataFrame(list(query0))

    ####################################################################################################################

    ### MAKE TABLES FOR SUBCATEGORIES ###
#     # retrieve the latest dataset for the aeid
#     new_df = query1[query1['name'].isin([x_aeid])]
#     # rename columns
#     new_df.columns = ['dsstox_compound_id', 'hitc', 'name', 'dataset_id']
#     my_dataset_id = new_df['dataset_id'].iloc[0]
#     # make the enrichment table
#     filluc(new_df, my_dataset_id)

####################################################################################################################

# create a connection pool for multiprocessing with mysqlalchemy
if __name__ == '__main__':

    ### QUERY THE MYSQL DB 4 A COMPLETE LIST OF AEIDS, ENDPOINTS & DTXCIDS ###

    mysession = SQLSession(Schemas.qsar_schema).get_session()

    # query the Unique Enrichment Table IDs
    query0 = mysession.query(UnivariateCalculations.fk_dataset_id)
    query0 = pd.DataFrame(list(query0))

    if query0.empty:
        pass
    else:
        query0 = query0.fk_dataset_id.unique()

    # query the full set of data
    # slow ~3.5 minutes already
    query1 = mysession.query(Compounds.dsstox_compound_id, Datapoints.measured_value_dn, Datasets.name, Datasets.id) \
        .join(Datapoints, Datapoints.efk_dsstox_compound_id == Compounds.id) \
        .join(DatasetDatapoints, Datapoints.id == DatasetDatapoints.fk_datapoint_id) \
Example #18
import glob
import pandas as pd
from database.database_schemas import Schemas
from database.dsstox.compounds import Compounds
from database.qsar.compound_descriptor_sets import CompoundDescriptorSets
from database.session import SQLSession
import sys

for f in glob.glob(
        "/home/rlougee/Desktop/invitrodb_v2_burst_splits/clean_assays/*"):
    aeid = f.split('/')[-1]
    a = pd.read_csv(f, sep='\t')

    mysession = SQLSession(Schemas.qsar_schema).get_session()

    query0 = mysession.execute(
        'SELECT assay_component_endpoint_name FROM sbox_rlougee_invitrodb.assay_component_endpoint WHERE sbox_rlougee_invitrodb.assay_component_endpoint.aeid = {}'
        .format(aeid))

    output_name = list(query0)[0][0]
    a = a.dropna(axis=1)
    a.to_csv(
        '/home/rlougee/Desktop/invitrodb_v2_burst_splits/clean_assays2/{}.tsv'.
        format(output_name),
        sep='\t',
        index=False)
Example #19
def main():
    args = docopt(__doc__)
    # print(args)

    if args['--version']:
        print('NCCT CLI: Version 0.0.0')
        sys.exit(0)

    # set input arguments and options to variables
    InID = args['<InID>'].lower()
    OutID = args['<OutID>'].lower()
    tsv_input = args['<tsv_file>']
    o = args['--output']
    noerror = args['--noerror']

    # filter InID & OutID
    if InID not in acceptedID:
        print(Fore.RED +
              'Invalid InID: {}\n InID must be {}'.format(InID, acceptedID))
        sys.exit(1)
    if OutID not in acceptedID:
        print(Fore.RED +
              'Invalid OutID: {}\n OutID must be {}'.format(OutID, acceptedID))
        sys.exit(1)

    # creates table of .tsv file
    # takes stdin if argument is not directly given
    if not tsv_input:
        tsv_input = sys.stdin.read()
        mytable = pd.read_csv(StringIO(tsv_input), sep="\t")
    elif tsv_input:
        mytable = pd.read_csv(tsv_input, sep="\t")

    # takes the chemical ID column
    idrow = mytable.iloc[:, 0]
    colname = mytable.columns.values[0]

    # make an SQL query table  for relevant SIDs & CIDs
    mysession = SQLSession(Schemas.dsstox_schema).get_session()
    query = mysession.query(GenericSubstances.dsstox_substance_id, GenericSubstances.casrn, Compounds.dsstox_compound_id)\
        .join(GenericSubstanceCompounds, GenericSubstanceCompounds.fk_generic_substance_id == GenericSubstances.id)\
        .join(Compounds, Compounds.id == GenericSubstanceCompounds.fk_compound_id)
    df = pd.DataFrame(list(query))
    df.columns = ['dtxsid', 'casrn', 'dtxcid']

    idrow = pd.DataFrame(idrow)
    idrow.columns = [InID]

    # do a join to filter out unwanted IDs
    if InID == 'dtxcid':
        df = pd.merge(idrow, df, on='dtxcid', how='inner')
    elif InID == 'casrn':
        df = pd.merge(idrow, df, on='casrn', how='inner')
    elif InID == 'dtxsid':
        df = pd.merge(idrow, df, on='dtxsid', how='inner')

    df = df.drop_duplicates(InID)

    # if no DTXCIDs returned
    if df.empty and not noerror:
        print(Fore.RED + "Error: No valid {} or no associated {}\n{}".format(
            InID, OutID, list(idrow)))
        sys.exit(1)
    elif df.empty:
        sys.exit(1)

    #creates a list of unconverted IDs
    noid = list(set(idrow.iloc[:, 0]) - set(list(df.iloc[:, 0])))

    # creates new CID table
    mytable = mytable.rename(columns={colname: InID})
    mytable = pd.merge(df, mytable, on=InID, how='left')
    for i in acceptedID:
        if i != OutID:
            mytable = mytable.drop(i, axis=1)
    outputtable = mytable

    # generates a string with tab separation and line breaks for row ends
    columnnames = mytable.columns.values
    output = ''
    for i in columnnames:
        output += i + '\t'
    output += '\n'
    mytable = mytable.values.tolist()

    for i in mytable:
        a = '\t'.join(str(x) for x in i)
        output += a + '\n'

    # output options
    if not o:
        print(output[:-1])
    else:
        outputtable.to_csv(o, sep='\t', index=False)

    # ERROR message
    # not actual STDERR this is for the user
    if not noerror:
        print(Fore.RED + "Error: Invalid {} or no associated {}\n{}".format(
            InID, OutID, noid))
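# Side note: the tab-separated string assembled in main() above can also be
# produced by pandas directly; to_csv() returns a string when no path is given
# (equivalent output up to the trailing newline):
#     output = outputtable.to_csv(sep='\t', index=False)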
Example #20
from database.database_schemas import Schemas
from database.session import SQLSession
from database.qsar.descriptors import Descriptors
import sys
import os
import pandas as pd
import datetime

# THIS CODE IS TO BE USED ONE TIME TO FILL sbox_rlougee_qsar.descriptors table for the fingerprint labels, ids, MySQL_setup

# add Toxcast fingerprint data to MySQL
# query AEID data
mysession = SQLSession(Schemas.qsar_schema).get_session()
aeid_data = mysession.execute(
    'SELECT aeid, assay_component_endpoint_name FROM invitrodb.assay_component_endpoint'
)

aeid_data = pd.DataFrame(list(aeid_data))
aeid_data[0] = [int(x) for x in aeid_data[0]]
aeid_data = aeid_data.sort_values(0, axis=0)

aeid_data = aeid_data.reset_index(drop=True)
# print(aeid_data)
# sys.exit(1)

# add ASSAY_COMPONENT_ENDPOINT_NAME as descriptors.label
# use aeid as descriptors_name

for i, row in aeid_data.iterrows():
    username = '******'
    # create a new datasets name entry
Example #21
from database.database_schemas import Schemas
from database.dsstox.compounds import Compounds
from database.qsar.compound_descriptor_sets import CompoundDescriptorSets
from database.session import SQLSession
import sys
import pandas as pd
import subprocess as subp
import glob

# create a tsv with dataset name x TP/TOTAL_CHEM so that we can see the coverage of our Toxprint Models

# get all relevant datasets
mysession = SQLSession(Schemas.qsar_schema).get_session()

datasets = [
    x[0] for x in mysession.execute(
        'SELECT datasets.name FROM sbox_rlougee_qsar.datasets'
        ' WHERE sbox_rlougee_qsar.datasets.name LIKE "%Ori\_MC%"')
]

print(len(datasets))

# get TP stats file for each dataset
# for i in datasets[1053:]:
#     print(i)
#     command = subp.Popen('pullenrichment {} --fpenrich -o "/share/home/rlougee/Desktop/ori_mc_percent_coverage/"'.format(i), shell=True)
#     command.communicate()
Example #22

import pandas as pd
from database.database_schemas import Schemas
from database.session import SQLSession

# IMPORT AND CLEAN UP DATATABLES

nonburst = pd.read_csv("/home/rlougee/Desktop/primary_data/Non-Burst_MC_v2_2018_full_enrichment_results.tsv", sep='\t')
burst = pd.read_csv("/home/rlougee/Desktop/primary_data/Burst_MC_v2_2018_full_enrichment_results.tsv", sep='\t')

nonburst['name'] = nonburst['name'].str.replace('Imported_DataTable:aeid_','').str.replace('_invitrodbv2_20180918', '').apply(int)
burst['name'] = burst['name'].str.replace('Imported_DataTable:','').str.replace('_Burst_MC_v2_20181120', '')

print(nonburst.tail())
print(burst.tail())

# QUERY AEID AND CORRECT ASSAY NAMES

mysession = SQLSession(Schemas.information_schema).get_session()


q0 = mysession.execute('SELECT aeid, assay_component_endpoint_name FROM sbox_rlougee_invitrodb.assay_component_endpoint')
q0 = pd.DataFrame(list(q0))
# q0[1] = q0[1].apply(str)
print('q0 shape', q0.shape)
print('nonburst shape', nonburst.shape)
print('burst shape', burst.shape)

nonburst2 = pd.merge(nonburst, q0, how='inner', left_on=['name'], right_on=[0])
print(nonburst2.shape)

burst2 = pd.merge(burst, q0, how='inner', left_on=['name'], right_on=[1])
print(burst2.shape)
Example #23
def main():
    args = docopt(__doc__)
    # print(args)

    if args['--version']:
        print('NCCT CLI: Version 0.0.0')
        sys.exit(0)

    # set input arguments and options to variables
    DataSetName = args['<DataSetName>'].lower()
    outputpath = args['--output']
    if outputpath is None:
        outputpath = ''
    image = args['--image']
    allassays = args['--allassays']
    fullenrich = args['--fullenrich']
    fpenrich = args['--fpenrich']
    a = args['--all']
    OR = float(args['--OR'])
    PV = float(args['--PV'])
    TP = float(args['--TP'])

    if a:
        image = True
        allassays = True
        fpenrich = True

    descriptor_set_id = 1445

    # # # QUERY ENRICHMENT DATA FOR YOUR DATASETNAME # # #

    mysession = SQLSession(Schemas.qsar_schema).get_session()
    enrichment_data = mysession.execute(
        'SELECT datasets.name, descriptors.descriptors_name, descriptors.label, statistics.abbreviation, uc_statistics.value FROM sbox_rlougee_qsar.datasets JOIN sbox_rlougee_qsar.univariate_calculations ON sbox_rlougee_qsar.univariate_calculations.fk_dataset_id = sbox_rlougee_qsar.datasets.id JOIN sbox_rlougee_qsar.uc_statistics ON sbox_rlougee_qsar.uc_statistics.fk_univ_calc_id = sbox_rlougee_qsar.univariate_calculations.id JOIN sbox_rlougee_qsar.statistics ON sbox_rlougee_qsar.statistics.id = sbox_rlougee_qsar.uc_statistics.fk_statistic_id JOIN sbox_rlougee_qsar.descriptors ON sbox_rlougee_qsar.descriptors.id = sbox_rlougee_qsar.univariate_calculations.fk_descriptor_id WHERE datasets.name LIKE \'%{}%\' AND fk_descriptor_set_id = {} GROUP BY datasets.name, descriptors.descriptors_name, descriptors.label, statistics.abbreviation, uc_statistics.value'
        .format(DataSetName, descriptor_set_id))

    enrichment_data = pd.DataFrame(list(enrichment_data))

    if enrichment_data.empty:
        print(Fore.RED +
              'ERROR: DataSetName string returned no results {}'.format(
                  DataSetName))
        sys.exit(1)

    # format table, pivot columns, reorder
    # without dropna=False, all-null columns are dropped; keeping them adds empty rows (dropped below)
    enrichment_data = pd.pivot_table(enrichment_data,
                                     values=4,
                                     index=[0, 1, 2],
                                     columns=[3],
                                     dropna=False)
    enrichment_data = enrichment_data.reset_index(level=[0, 1, 2])
    enrichment_data.columns = [
        'Data Table', 'Chemotype ID', 'Chemotype Label', 'Balanced Accuracy',
        'Total Chemotypes', 'False Negatives', 'False Positives',
        'Inverse Odds Ratio', 'Inverse P-Value', 'Odds Ratio', 'P-Value',
        'True Negatives', 'True Positives'
    ]

    # drop empty rows
    enrichment_data = enrichment_data.dropna(thresh=4)

    enrichment_data = enrichment_data[[
        'Data Table', 'Chemotype ID', 'Chemotype Label', 'Total Chemotypes',
        'True Positives', 'False Positives', 'False Negatives',
        'True Negatives', 'Balanced Accuracy', 'Odds Ratio', 'P-Value',
        'Inverse Odds Ratio', 'Inverse P-Value'
    ]]

    # FIND THE BASE FOR FILE NAMES
    file_name = enrichment_data['Data Table'][0]
    file_name = file_name.split(':')[-1]

    # CREATE FOLDERS FOR RESULTS AND EDIT PATH
    if not os.path.exists(outputpath + 'CTEW_' + file_name):
        os.makedirs(outputpath + 'CTEW_' + file_name)
        outputpath = outputpath + 'CTEW_' + file_name
        os.makedirs(outputpath + '/CTEW_Results')
        os.makedirs(outputpath + '/CTEW-INV_Results')
        regoutputpath = outputpath + '/CTEW_Results/'
        invoutputpath = outputpath + '/CTEW-INV_Results/'
    else:
        print(Fore.RED +
              'ERROR: output folder {} already exists'.format(outputpath +
                                                              'CTEW_' +
                                                              file_name))
        sys.exit(1)

    # # # PULL SIGNIFICANT TOXPRINTS # # #
    enrichment_data2 = enrichment_data.dropna()
    enrichment_data2.loc[:, 'Odds Ratio'] = pd.to_numeric(
        enrichment_data2.loc[:, 'Odds Ratio'], errors='coerce')
    enrichment_data2.loc[:, 'P-Value'] = pd.to_numeric(
        enrichment_data2.loc[:, 'P-Value'], errors='coerce')
    enrichment_data2.loc[:, 'True Positives'] = pd.to_numeric(
        enrichment_data2.loc[:, 'True Positives'], errors='coerce')

    sigtxp = []
    for i, x in enrichment_data2.iterrows():
        if x['Odds Ratio'] >= OR and x['P-Value'] <= PV and x[
                'True Positives'] >= TP:
            sigtxp.append(x['Chemotype ID'])

    # # # PULL SIGNIFICANT INVERSE TOXPRINTS # # #
    enrichment_data3 = enrichment_data.dropna()
    enrichment_data3.loc[:, 'Inverse Odds Ratio'] = pd.to_numeric(
        enrichment_data3.loc[:, 'Inverse Odds Ratio'], errors='coerce')
    enrichment_data3.loc[:, 'Inverse P-Value'] = pd.to_numeric(
        enrichment_data3.loc[:, 'Inverse P-Value'], errors='coerce')

    invsigtxp = []
    for i, x in enrichment_data3.iterrows():
        if x['Inverse Odds Ratio'] >= OR and x['Inverse P-Value'] <= PV:
            invsigtxp.append(x['Chemotype ID'])

    # CREATE SIGNIFICANT TOXPRINT EXPORT TABLE AND INVERSE SIGNIFICANT TOXPRINT TABLE
    sigtxp = pd.DataFrame(sigtxp, columns=['Chemotype ID'])
    # sigtxp.columns = ['Chemotype ID']
    OutputTable = pd.merge(sigtxp,
                           enrichment_data,
                           on='Chemotype ID',
                           how='inner')

    invsigtxp = pd.DataFrame(invsigtxp, columns=['Chemotype ID'])
    # invsigtxp.columns = ['Chemotype ID']
    InvOutputTable = pd.merge(invsigtxp,
                              enrichment_data,
                              on='Chemotype ID',
                              how='inner')

    # CREATE FULL ENRICHMENT STATISTICS TABLE
    if fullenrich:
        enrichment_data.to_csv('{}Full_Enrichment_Stats_{}.tsv'.format(
            outputpath, file_name),
                               sep='\t',
                               index=False)

    # # # ENRICHED CHEMOTYPES # # #
    # return a table containing enriched chemotypes X DTXCID
    if fpenrich:
        try:
            ct_model_row = getenrichfp(DataSetName, sigtxp, regoutputpath,
                                       'CT-Enriched_FP_' + file_name)
            OutputTable = pd.concat([OutputTable, ct_model_row])
        except:
            pass
        try:
            ct_inv_model_row = getenrichfp(DataSetName, invsigtxp,
                                           invoutputpath,
                                           'CT-INV-Enriched_FP_' + file_name)
            InvOutputTable = pd.concat([InvOutputTable, ct_inv_model_row])
        except:
            pass

    # # # CREATE CT IMAGES # # #
    if image:
        imagemaker(sigtxp, 'CT-Enriched_Images_' + file_name, regoutputpath)
        imagemaker(invsigtxp, 'CT-INV-Enriched_Images_' + file_name,
                   invoutputpath)

    # # # CREATE ADDITIONAL ASSAY INFO # # #
    if allassays:
        assayspaceidentifier(sigtxp, 'CT-Enriched_TCAssays_' + file_name,
                             regoutputpath)
        assayspaceidentifier(invsigtxp,
                             'CT-INV-Enriched_TCAssays_' + file_name,
                             invoutputpath)

    # # # EXPORT CT-ENRICHED_STATS & CT-INV-ENRICHED_STATS # # #
    try:
        OutputTable.to_csv('{}CT-Enriched_Stats_{}.tsv'.format(
            regoutputpath, file_name),
                           sep='\t',
                           index=False)
    except:
        pass
    try:
        InvOutputTable.to_csv('{}CT-INV-Enriched_Stats_{}.tsv'.format(
            invoutputpath, file_name),
                              sep='\t',
                              index=False)
    except:
        pass
def filluc(invitrodbdf, mydatasetid):
    #set starttime
    starttime = time.time()

    username = '******'
    descriptor_set_id = [1445, 1447, 1446, 1448]
    # descriptor_set_id = [1448] # <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
    # 1445 = toxprints
    # 1446 = MACCS
    # 1447 = pubchem
    # 1448 = Special Toxprints

    for id in descriptor_set_id:
        # add enrichment table to the mysql database
        try:
            filled_table = handle_duplicates(invitrodbdf.loc[:, ['dsstox_compound_id', 'hitc']])

        except:
            print("DUPLICATE HANDLER FAILED: {}".format(mydatasetid))
            sys.exit(1)

        try:
            filled_table = fillfp(filled_table, id)
        except:
            print("FILLFP FAILED: {}".format(mydatasetid))
            sys.exit(1)

        # filled_table = pd.DataFrame(filled_table)

        try:
            my_enrichment_table = enrich(filled_table)

        except:
            print("ENRICH FAILED: {}".format(mydatasetid))
            print(filled_table.head())
            sys.exit(1)


        # add fk_descriptor_id
        ### CHECK THAT THESE ARE MATCHING! ###
        mysession2 = SQLSession(Schemas.qsar_schema).get_session()
        query3 = mysession2.query(Descriptors.id).filter(Descriptors.fk_descriptor_set_id == id)
        query3 = list(query3)
        query3 = [int(i[0]) for i in query3]
        my_enrichment_table.insert(0, 'fk_descriptor_id', query3)

        for index, row in my_enrichment_table.iterrows():
            fk_dataset_id = int(mydatasetid)
            fk_descriptor_id = int(row['fk_descriptor_id'])

            univariate_calc = UnivariateCalculations(fk_dataset_id=fk_dataset_id,
                                                     fk_descriptor_id=fk_descriptor_id,
                                                     updated_by=username,
                                                     created_by=username)
            mysession2.add(univariate_calc)
            mysession2.flush()

            fk_univ_calc_id = int(univariate_calc.id)

            ### NEED TO CHANGE for loop & stat_list IF THE STATISTICS ARE CHANGED IN Enrichment_Table_Generator ###
            count = 0
            for i in row[1:]:

                if math.isnan(i):
                    value = None
                elif math.isinf(i):
                    value = 99999999.9
                else:
                    value = float(i)

                stat_list = [9, 10, 11, 12, 13, 4, 8, 7, 14, 15]
                fk_statistic_id = int(stat_list[count])

                uc_statistics = UcStatistics(value=value,
                                             fk_univ_calc_id=int(fk_univ_calc_id),
                                             fk_statistic_id=int(fk_statistic_id),
                                             created_by=username,
                                             updated_by=username)

                mysession2.add(uc_statistics)
                count += 1
        mysession2.commit()
        # mysession2.close()
    endtime = time.time()
    print('run time:{}'.format(endtime-starttime))
Example #25
def cli(tsv_input,o,noerror):
    ### HELP DOCUMENTATION ###

    """
    SIDtoCID takes in a .tsv datatable with a dsstox_substance_id column (must be index or first 2 columns).
    The dsstox_substance_id column is converted to dsstox_compound_id.
    Can use a .tsv file as stdin. Default output is stdout as .tsv.
    \n\n
    Warning: column names are required in the input .tsv; otherwise the first row will be skipped.



    -- EXAMPLE I/O TABLES --

    INPUT:  .tsv file

    |DTXSID COLUMN | ENDPOINT COLUMN |\n
    ----------------------------------\n
    | DTXSID123456 |        0        |\n
    ----------------------------------\n
    | DTXSID234567 |        1        |\n
    ----------------------------------\n
    | DTXSID345678 |        0        |\n
    ----------------------------------\n

    EXPORT: .tsv file

    |DTXCID COLUMN | ENDPOINT COLUMN |\n
    ----------------------------------\n
    | DTXCID891011 |        0        |\n
    ----------------------------------\n
    | DTXCID910111 |        1        |\n
    ----------------------------------\n
    | DTXCID101112 |        0        |\n
    ----------------------------------\n



    """
    # creates table of .tsv file
    # takes stdin if argument is not directly given
    if not tsv_input:
        tsv_input = sys.stdin.read()
        mytable = pd.read_csv(StringIO(tsv_input), sep="\t")
    elif tsv_input:
        mytable = pd.read_csv(tsv_input, sep="\t")

    #checks the index, and first two columns for DTXSIDs
    #input table should be in the correct format already
    try:
        if mytable.iloc[0,0][0:6] == 'DTXSID':
            idrow = mytable.iloc[:,0]
            colname = mytable.columns.values[0]

    except:
        pass
    try:
        if mytable.iloc[0,1][0:6] == 'DTXSID':
            idrow = mytable.iloc[:,1]
            colname = mytable.columns.values[1]

    except:
        pass
    try:
        if mytable.index.values[0][0:6] == 'DTXSID':
            idrow = mytable.index.values
            mytable.index.name = 'DTXSID'
            colname = mytable.index.name
    except:
        pass

    # drop empty columns
    mytable = mytable.dropna(axis='columns', how='all')

    # click.echo(mytable.columns.values)
    #make an SQL query table  for relevant SIDs & CIDs
    mysession = SQLSession(Schemas.dsstox_schema).get_session()

    query = mysession.query(GenericSubstances.dsstox_substance_id, Compounds.dsstox_compound_id).join(GenericSubstanceCompounds) \
        .join(Compounds)

    df = pd.DataFrame(list(query))
    idrow = pd.DataFrame(idrow)
    idrow.columns = ['dsstox_substance_id']
    df = pd.merge(idrow, df, on='dsstox_substance_id', how='inner')


    #if no DTXCIDs returned
    if df.empty and noerror:
        click.secho("Error: No valid DTXSIDs or no associated DTXCIDs\n{}".format(list(idrow)), fg='red', bold=True)
        sys.exit(1)
    elif df.empty:
        sys.exit(1)

    #creates new CID table
    mytable = mytable.rename(columns={colname : "dsstox_substance_id"})
    mytable = pd.merge(df, mytable, on='dsstox_substance_id')
    mytable = mytable.drop('dsstox_substance_id', axis=1)
    outputtable = mytable

    # generates a string with tab separation and line breaks for row ends
    columnnames = mytable.columns.values
    output = ''
    for i in columnnames:
        output += i + '\t'
    output += '\n'
    mytable = mytable.values.tolist()

    for i in mytable:
        a = '\t'.join(str(x) for x in i)
        output += a + '\n'

    #output options
    if o == '':
        click.echo(output)
    else:
        outputtable.to_csv(o, sep='\t',index=False)

    # get IDs that were not converted
    noid = list(set(idrow.iloc[:, 0]) - set(df.iloc[:, 0]))

    #ERROR message
    #not actual STDERR this is for the user
    if noerror:
        click.secho("Error: Invalid DTXSID or no associated DTXCID\n{}".format(noid), fg='red', bold=True)
Example #26
# print(wow.head())

for i in wowie['name'].unique():
    bla = wowie[wowie['name'] == i]
    blabla = pd.DataFrame(bla.iloc[:,1:])
    print(blabla.head(), blabla.shape)

    # for idx, row in blabla.iterrows():
        # if row[1] >= 40:
        #     # print(row[1])
        #     blabla.iloc[idx, 1] = 10
        #     # print(blabla.iloc[idx,1])


    # get Txp index
    mysession = SQLSession(Schemas.qsar_schema).get_session()

    query3 = mysession.query(Descriptors.index_number, Descriptors.descriptors_name).filter(Descriptors.fk_descriptor_set_id == 1445)
    descriptornames = pd.DataFrame(list(query3))

    # sort by TXP number
    sorted_txp = pd.merge(descriptornames, blabla, on='descriptors_name')
    sorted_txp = sorted_txp.drop('index_number', axis=1)
    # print(sorted_txp.head())

    name = i.split('_')[0]

    from zMisc_Code.DATA_VISUALIZATION.barplot import barplot
    barplot(sorted_txp.iloc[:, 1], sorted_txp.iloc[:, 0], name)
    plt.tight_layout()
    # plt.show()
Example #27
from PIL import Image
from AssaySpaceIdentifier import assayspaceidentifier
from ImageMaker import imagemaker
from fpenrich import getenrichfp
from docopt import docopt
from database.database_schemas import Schemas
from database.session import SQLSession
import sys
import os
import pandas as pd
from colorama import init, Fore

mysession = SQLSession(Schemas.qsar_schema).get_session()
enrichment_data_1 = mysession.execute(
    'SELECT datasets.name, descriptors.descriptors_name, descriptors.label, GROUP_CONCAT(CASE WHEN abbreviation = \'CT-Total\' THEN value ELSE NULL END) as \'CT-Tot\',GROUP_CONCAT(CASE WHEN abbreviation = \'TP\' THEN value ELSE NULL END) as \'TP\',GROUP_CONCAT(CASE WHEN abbreviation = \'FP\' THEN value ELSE NULL END) as \'FP\', GROUP_CONCAT(CASE WHEN abbreviation = \'FN\' THEN value ELSE NULL END) as \'FN\', GROUP_CONCAT(CASE WHEN abbreviation = \'TN\' THEN value ELSE NULL END) as \'TN\', GROUP_CONCAT(CASE WHEN abbreviation = \'BA\' THEN value ELSE NULL END) as \'BA\', GROUP_CONCAT(CASE WHEN abbreviation = \'OR\' THEN value ELSE NULL END) as \'OR\', GROUP_CONCAT(CASE WHEN abbreviation = \'P-Val\' THEN value ELSE NULL END) as \'P-Val\', GROUP_CONCAT(CASE WHEN abbreviation = \'Inv OR\' THEN value ELSE NULL END) as \'Inv OR\',GROUP_CONCAT(CASE WHEN abbreviation = \'Inv P-Val\' THEN value ELSE NULL END) as \'Inv P-Val\' FROM sbox_rlougee_qsar.datasets JOIN sbox_rlougee_qsar.univariate_calculations ON sbox_rlougee_qsar.univariate_calculations.fk_dataset_id = sbox_rlougee_qsar.datasets.id JOIN sbox_rlougee_qsar.uc_statistics ON sbox_rlougee_qsar.uc_statistics.fk_univ_calc_id = sbox_rlougee_qsar.univariate_calculations.id JOIN sbox_rlougee_qsar.statistics ON sbox_rlougee_qsar.statistics.id = sbox_rlougee_qsar.uc_statistics.fk_statistic_id JOIN sbox_rlougee_qsar.descriptors ON sbox_rlougee_qsar.descriptors.id = sbox_rlougee_qsar.univariate_calculations.fk_descriptor_id WHERE datasets.name LIKE \'%aeid\_9\_invitrodbv2%\' AND fk_descriptor_set_id = 1445 GROUP BY datasets.name, descriptors.descriptors_name, descriptors.label'
)

ed1 = pd.DataFrame(list(enrichment_data_1))

ed1.columns = [
    'Data Table', 'Chemotype ID', 'Chemotype Label', 'Total Chemotypes',
    'True Positives', 'False Positives', 'False Negatives', 'True Negatives',
    'Balanced Accuracy', 'Odds Ratio', 'P-Value', 'Inverse Odds Ratio',
    'Inverse P-Value'
]

# print(pd.DataFrame(list(enrichment_data_1)).head())

enrichment_data_2 = mysession.execute(
    'SELECT datasets.name, descriptors.descriptors_name, descriptors.label, statistics.abbreviation, uc_statistics.value FROM sbox_rlougee_qsar.datasets JOIN sbox_rlougee_qsar.univariate_calculations ON sbox_rlougee_qsar.univariate_calculations.fk_dataset_id = sbox_rlougee_qsar.datasets.id JOIN sbox_rlougee_qsar.uc_statistics ON sbox_rlougee_qsar.uc_statistics.fk_univ_calc_id = sbox_rlougee_qsar.univariate_calculations.id JOIN sbox_rlougee_qsar.statistics ON sbox_rlougee_qsar.statistics.id = sbox_rlougee_qsar.uc_statistics.fk_statistic_id JOIN sbox_rlougee_qsar.descriptors ON sbox_rlougee_qsar.descriptors.id = sbox_rlougee_qsar.univariate_calculations.fk_descriptor_id WHERE datasets.name LIKE \'%aeid\_9\_invitrodbv2%\' AND fk_descriptor_set_id = 1445 GROUP BY datasets.name, descriptors.descriptors_name, descriptors.label, statistics.abbreviation, uc_statistics.value'
)
Example #28
def getenrichfp(DataSetName, sigtxp, mypath, myfilename, dsi=1445):
    """ Get Enrichment data for a combined set of chemotypes """

    # aborts if no significant chemotypes
    if len(sigtxp) == 0:
        return None

    mysession = SQLSession(Schemas.qsar_schema).get_session()
    MyDataSet = mysession.execute(
        'SELECT dsstox_compound_id, measured_value_dn, descriptor_string_tsv FROM sbox_rlougee_qsar.datasets'
        ' JOIN sbox_rlougee_qsar.dataset_datapoints ON sbox_rlougee_qsar.dataset_datapoints.fk_dataset_id = sbox_rlougee_qsar.datasets.id'
        ' JOIN sbox_rlougee_qsar.datapoints ON sbox_rlougee_qsar.datapoints.id = sbox_rlougee_qsar.dataset_datapoints.fk_datapoint_id'
        ' JOIN ro_stg_dsstox.compounds ON sbox_rlougee_qsar.datapoints.efk_dsstox_compound_id = ro_stg_dsstox.compounds.id'
        ' JOIN sbox_rlougee_qsar.compound_descriptor_sets ON ro_stg_dsstox.compounds.id = sbox_rlougee_qsar.compound_descriptor_sets.efk_dsstox_compound_id'
        ' WHERE sbox_rlougee_qsar.datasets.name LIKE \'%{}%\' AND sbox_rlougee_qsar.compound_descriptor_sets.fk_descriptor_set_id = {}'.format(DataSetName, dsi))
    MyDataSet = pd.DataFrame(list(MyDataSet))

    MyDataSet.columns = ['Dsstox_Compound_ID', 'Hit_Call', 'Toxprint']

    #something to separate and name fingerprint columns
    MyDataSet = pd.concat([MyDataSet, MyDataSet['Toxprint'].str[:].str.split('\t', expand=True)], axis=1)
    MyDataSet = MyDataSet.drop('Toxprint', axis=1)

    #name the columns correctly
    query3 = mysession.query(Descriptors.descriptors_name, Descriptors.label).filter(Descriptors.fk_descriptor_set_id == dsi)
    descriptornames = pd.DataFrame(list(query3))

    for num,name in enumerate(descriptornames['label'], start=0):
        MyDataSet = MyDataSet.rename(columns={num:name})

    # drop columns that are not significant
    sigtxp = pd.DataFrame(sigtxp)
    sigtxp.columns = ['descriptors_name']
    siglabel = pd.merge(sigtxp, descriptornames, on='descriptors_name', how='inner')
    siglabel = list(siglabel['label'])

    for i in MyDataSet.columns[2:]:
        if i in siglabel:
            pass
        else:
            MyDataSet = MyDataSet.drop(i, axis=1)

    # MyDataSet.to_csv('{}{}.tsv'.format(mypath, myfilename), sep='\t', index=False)

    # return overall balanced accuracy calculations
    # can just make a unique confusion matrix for significant toxprints and add to CT-Enriched Stats file
    # print(MyDataSet.head())
    model_row = pd.DataFrame(
        [['Chemotype Full Model Coverage', myfilename,
          " ".join(sigtxp['descriptors_name']),
          0, 0, 0, 0, 0, 0, 0, 0, 0, 0]],
        columns=['Chemotype ID', 'Data Table', 'Chemotype Label',
                 'Total Chemotypes', 'True Positives', 'False Positives',
                 'False Negatives', 'True Negatives', 'Balanced Accuracy',
                 'Odds Ratio', 'P-Value', 'Inverse Odds Ratio',
                 'Inverse P-Value'])

    # fill model_row confusion matrix
    for index, row in MyDataSet.iterrows():
        rowsum = sum([int(x) for x in row.iloc[2:]])
        if row['Hit_Call'] == 1 and rowsum > 0:
            model_row['True Positives'] += 1
        elif row['Hit_Call'] == 1 and rowsum == 0:
            model_row['False Negatives'] += 1
        elif row['Hit_Call'] == 0 and rowsum > 0:
            model_row['False Positives'] += 1
        elif row['Hit_Call'] == 0 and rowsum == 0:
            model_row['True Negatives'] += 1

    # fill model_row statistics
    oddsratio, pvalue = stats.fisher_exact(
        [[int(model_row['True Positives']), int(model_row['False Positives'])],
         [int(model_row['False Negatives']), int(model_row['True Negatives'])]],
        alternative='greater')
    model_row['P-Value'] = pvalue
    model_row['Odds Ratio'] = oddsratio
    model_row['Total Chemotypes'] = (model_row['True Positives'] +
                                     model_row['False Positives'])
    BA = ((model_row['True Positives'] /
           (model_row['True Positives'] + model_row['False Negatives'])) +
          (model_row['True Negatives'] /
           (model_row['True Negatives'] + model_row['False Positives']))) / 2
    model_row['Balanced Accuracy'] = float(BA)
    inv_oddsratio, inv_pvalue = stats.fisher_exact(
        [[int(model_row['False Positives']), int(model_row['True Positives'])],
         [int(model_row['True Negatives']), int(model_row['False Negatives'])]],
        alternative='greater')
    model_row['Inverse P-Value'] = inv_pvalue
    model_row['Inverse Odds Ratio'] = inv_oddsratio

    # print(model_row)
    return model_row
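# Quick sanity check of the stats.fisher_exact usage above on a toy confusion
# matrix (TP=8, FP=2, FN=1, TN=9); alternative='greater' tests for enrichment:
from scipy import stats
odds, p = stats.fisher_exact([[8, 2], [1, 9]], alternative='greater')
print(odds, p)  # odds ratio 36.0, one-sided p ~= 0.0027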
from database.qsar.statistics import Statistics
from database.qsar.uc_statistics import UcStatistics
from database.qsar.univariate_calculations import UnivariateCalculations
from database.invitrodb.assay_component import AssayComponent
from database.invitrodb.assay_component_endpoint import AssayComponentEndpoint
from sqlalchemy import or_, and_
import sys
import click
import pandas as pd
from io import StringIO
import re

# FIRST NEED TO GET A TABLE index = AEID and columns = Assay Cats, Toxprints Enrichments (discrete hitcalls for each assay)
# going to do this in two parts
# part 1 get assay cats X aeids
mysession = SQLSession(Schemas.information_schema).get_session()

query0 = mysession.query(AssayComponentEndpoint.aeid,
                         AssayComponentEndpoint.assay_component_endpoint_name, AssayComponent.assay_component_desc,
                         AssayComponent.assay_component_target_desc,
                         AssayComponentEndpoint.assay_component_endpoint_desc,
                         AssayComponentEndpoint.assay_function_type, AssayComponentEndpoint.normalized_data_type,
                         AssayComponentEndpoint.analysis_direction, AssayComponentEndpoint.burst_assay,
                         AssayComponentEndpoint.key_positive_control, AssayComponentEndpoint.signal_direction,
                         AssayComponentEndpoint.intended_target_type, AssayComponentEndpoint.intended_target_type_sub,
                         AssayComponentEndpoint.intended_target_family,
                         AssayComponentEndpoint.intended_target_family_sub, AssayComponent.assay_design_type,
                         AssayComponent.assay_design_type_sub, AssayComponent.biological_process_target,
                         AssayComponent.detection_technology_type, AssayComponent.detection_technology_type_sub,
                         AssayComponent.detection_technology, AssayComponent.signal_direction_type,
                         AssayComponent.key_assay_reagent, AssayComponent.key_assay_reagent_type,
def cli():
    ### HELP DOCUMENTATION ###
    """
    checks for updates for AEIDS (new compounds / new endpoints / new AEIDS)

    if there are any updates qsar.datapoints, qsar.datasets, and qsar.datasetdatapoints are updated to include this information

    """

    ####################################################################################################################

    #QUERY MC5 data for hitcalls and chemical IDs

    mysession = SQLSession(Schemas.information_schema).get_session()

    query0 = mysession.query(Compounds.id, Compounds.dsstox_compound_id, Mc5.hitc, Mc5.aeid) \
        .join(GenericSubstanceCompounds, Compounds.id == GenericSubstanceCompounds.fk_compound_id) \
        .join(GenericSubstances, GenericSubstances.id == GenericSubstanceCompounds.fk_generic_substance_id) \
        .join(Sample, Sample.chid == GenericSubstances.id) \
        .join(Mc4, Mc4.spid == Sample.spid) \
        .join(Mc5, Mc5.m4id == Mc4.m4id)

    mc5_table = pd.DataFrame(list(query0))

    ####################################################################################################################

    ### QUERY DATE WHEN THE TABLE WAS LAST UPDATED ###
    # Not very useful as datasets.name date will always be different/ not worth querying all of the dates

    # query1 = mysession.query(Tables.UPDATE_TIME)\
    #     .filter(or_(Tables.TABLE_SCHEMA == 'invitrodb'), (Tables.TABLE_NAME == 'mc5'))
    #
    # #format last_update query
    # last_update = str(list(query1)[0][0])[:10].replace('-','')
    ####################################################################################################################

    def filldatasets(invitrodbdf, fd_aeid):
        username = '******'
        # create a new datasets name entry
        datasets_name = str('aeid:{}_{}'.format(
            fd_aeid,
            datetime.datetime.today().strftime("%Y%m%d")))
        description = "The set of hit calls from the toxcast AEID: {} taken on the date:{}"\
            .format(fd_aeid, datetime.datetime.today().strftime("%Y%m%d"))
        datasets = Datasets(name=datasets_name,
                            label=datasets_name,
                            updated_by=username,
                            created_by=username,
                            long_description=description,
                            short_description=description)
        mysession.add(datasets)
        mysession.flush()
        fk_dataset_id = int(datasets.id)

        # add datatable to the mysql database
        for index, row in invitrodbdf.iterrows():
            efk_dsstox_compound_id = row.loc['id']
            efk_chemprop_measured_property_id = None  #leave null -CG #not nullable
            measured_value_dn = row.loc['hitc']
            created_by = username
            updated_by = username

            datapoints = Datapoints(
                efk_dsstox_compound_id=efk_dsstox_compound_id,
                efk_chemprop_measured_property_id=
                efk_chemprop_measured_property_id,
                measured_value_dn=measured_value_dn,
                created_by=created_by,
                updated_by=updated_by)

            mysession.add(datapoints)
            mysession.flush()

            fk_datapoint_id = int(datapoints.id)

            dataset_datapoints = DatasetDatapoints(
                fk_dataset_id=fk_dataset_id,
                fk_datapoint_id=fk_datapoint_id,
                updated_by=username,
                created_by=username)
            mysession.add(dataset_datapoints)
        mysession.commit()

    ####################################################################################################################

    ### CHECK 1) IF TABLE EXISTS FOR AEID 2) IF THE TABLE HAS CHANGED

    # begin a for loop for each unique aeid
    for x_aeid in mc5_table.aeid.unique():
        #query latest dataset for this aeid
        aeid_query = mysession.query(Datasets.name) \
            .filter(Datasets.name.like("aeid:{}/_%".format(str(x_aeid)), escape='/'))

        aeid_query = list(aeid_query)
        aeid_query = [x[0] for x in aeid_query]

        #get the latest values for aeid
        new_df = mc5_table[mc5_table['aeid'].isin([x_aeid])]

        if aeid_query == [] or aeid_query == [''] or aeid_query is None:

            print(
                "New AEID, filling mysql database for aeid: {}".format(x_aeid))

            filldatasets(new_df, x_aeid)

        else:
            # find and retrieve the newest dataset name
            aeid_query_dates = [x.split('_')[1] for x in aeid_query]
            newest_aeid_date = sorted(aeid_query_dates)[-1]  # %Y%m%d sorts lexicographically
            newest_aeid = [x for x in aeid_query if str(newest_aeid_date) in x]

            #pull table and compare
            old_df = mysession.query(Datapoints.efk_dsstox_compound_id, Datapoints.measured_value_dn)\
                .join(DatasetDatapoints, Datapoints.id==DatasetDatapoints.fk_datapoint_id)\
                .join(Datasets, DatasetDatapoints.fk_dataset_id==Datasets.id)\
                .filter(Datasets.name==newest_aeid[0])

            old_df = pd.DataFrame(list(old_df))

            ##FORMAT DFs FOR COMPARING
            #rename columns
            old_df.columns = ['id', 'hitc']
            my_new_df = new_df.loc[:, ['id', 'hitc']]
            old_df['hitc'] = old_df['hitc'].astype(int)

            #sort dataframes
            my_new_df = my_new_df.sort_values(['id', 'hitc'])
            old_df = old_df.sort_values(['id', 'hitc'])

            #reset index
            my_new_df = my_new_df.reset_index(drop=True)
            old_df = old_df.reset_index(drop=True)

            if my_new_df.equals(old_df):

                print("no change for aeid: {}".format(x_aeid))

                pass

            else:

                print("Update, filling mysql database for aeid: {}".format(
                    x_aeid))

                filldatasets(new_df, x_aeid)