Example #1
import pandas as pd

from database.database_schemas import Schemas
from database.dsstox.compounds import Compounds
from database.qsar.compound_descriptor_sets import CompoundDescriptorSets
from database.session import SQLSession

def updatedatabase(fingerprintdf):
    username = '******'
    descriptors_index = 1448

    # Query for compound_descriptor_sets.id
    mysession = SQLSession(Schemas.qsar_schema).get_session()

    query3 = mysession.query(Compounds.dsstox_compound_id, CompoundDescriptorSets.id) \
        .join(CompoundDescriptorSets, Compounds.id == CompoundDescriptorSets.efk_dsstox_compound_id).filter(CompoundDescriptorSets.fk_descriptor_set_id == descriptors_index)

    df3 = pd.DataFrame(list(query3))

    #join query with fingerprintdf

    mytable = pd.merge(df3, fingerprintdf, on='dsstox_compound_id')

    # # # CODE FOR UPDATING # # #
    for index, row in mytable.iterrows():
        cds_id = row.iloc[1]                 # compound_descriptor_sets.id
        descriptor_string_tsv = row.iloc[2]  # new fingerprint string
        mysession.query(CompoundDescriptorSets) \
            .filter(CompoundDescriptorSets.id == cds_id) \
            .update({
                "descriptor_string_tsv": descriptor_string_tsv,
                "updated_by": username
            })
        mysession.commit()

def update_toxprint_database(fingerprintdf):
    username = '******'
    descriptors_index = 1445

    # Query for compound_descriptor_sets.id
    mysession = SQLSession(Schemas.qsar_schema).get_session()

    query3 = mysession.query(CompoundDescriptorSets.id, CompoundDescriptorSets.efk_dsstox_compound_id)\
        .filter(CompoundDescriptorSets.fk_descriptor_set_id == descriptors_index)

    df3 = pd.DataFrame(list(query3))

    #join query with fingerprintdf
    fingerprintdf = fingerprintdf.reset_index()
    fingerprintdf = fingerprintdf.rename(
        columns={'M_NAME':
                 'efk_dsstox_compound_id'})  # not sure this will work
    fingerprintdf['efk_dsstox_compound_id'] = [
        int(x[8:]) for x in fingerprintdf['efk_dsstox_compound_id']
    ]
    mytable = pd.merge(df3, fingerprintdf, on='efk_dsstox_compound_id')

    # # # CODE FOR UPDATING # # #
    for index, row in mytable.iterrows():
        cds_id = str(row.iloc[0])  # compound_descriptor_sets.id
        # rebuild the tab-separated fingerprint string from the fingerprint columns
        descriptor_string_tsv = '\t'.join(str(x) for x in row.iloc[2:])
        mysession.query(CompoundDescriptorSets) \
            .filter(CompoundDescriptorSets.id == cds_id) \
            .update({
                "descriptor_string_tsv": descriptor_string_tsv,
                "updated_by": username
            })
        mysession.commit()
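
# A possible speed-up, sketched under the assumption that CompoundDescriptorSets.id
# is the table's primary key: SQLAlchemy's Session.bulk_update_mappings() pushes
# all row updates in one batch instead of one query-and-commit per row.
def update_toxprint_database_bulk(mytable, mysession, username='******'):
    mappings = []
    for _, row in mytable.iterrows():
        mappings.append({
            'id': int(row.iloc[0]),
            'descriptor_string_tsv': '\t'.join(str(x) for x in row.iloc[2:]),
            'updated_by': username,
        })
    mysession.bulk_update_mappings(CompoundDescriptorSets, mappings)
    mysession.commit()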
Example #3
import pandas as pd

from database.database_schemas import Schemas
from database.dsstox.compounds import Compounds
from database.qsar.compound_descriptor_sets import CompoundDescriptorSets
from database.session import SQLSession

def mk_special_fingerprints(idrow):
    # query existing toxprints
    dsi = 1445
    mysession = SQLSession(Schemas.qsar_schema).get_session()

    query2 = mysession.query(Compounds.dsstox_compound_id, CompoundDescriptorSets.descriptor_string_tsv) \
        .join(CompoundDescriptorSets, Compounds.id == CompoundDescriptorSets.efk_dsstox_compound_id).filter(CompoundDescriptorSets.fk_descriptor_set_id == dsi)
    df2 = pd.DataFrame(list(query2))

    # FILTERING BY A LIST OF >1000 WON'T WORK IN MANY DATABASES (THIS IS BREAKING THE SCRIPT HERE ON FULL UPDATE)
    # doing a full query then a merge after
    df2 = pd.merge(pd.DataFrame(idrow, columns=['dsstox_compound_id']),
                   df2,
                   on='dsstox_compound_id')

    # something to separate and name fingerprint columns
    df2 = pd.concat([
        df2, df2['descriptor_string_tsv'].str[:].str.split('\t', expand=True)
    ],
                    axis=1)
    df2 = df2.drop('descriptor_string_tsv', axis=1)

    # # # GENERATE SPECIFIC NEW FINGERPRINTS # # #
    # create an empty column for the new fingerprint
    df2['Special_Toxprints'] = ""

    # iterate through the datatable and create the new Special Toxprints
    # make sure you are looking at the right index when combining Toxprints
    # NOTE: assign via df2.at[...]; writing to the `row` copy that iterrows()
    # yields would not modify df2 itself
    for index, row in df2.iterrows():
        # have to code each special toxprint this way
        if row[480] == '1' and row[489] == '1':
            special = '1'
        else:
            special = '0'

        # make sure to add tabs before the rest of the toxprints
        if row[489] == '1' and row[480] == '0':
            special += '\t1'
        else:
            special += '\t0'

        if row[480] == '1' and row[489] == '0':
            special += '\t1'
        else:
            special += '\t0'

        df2.at[index, 'Special_Toxprints'] = special

    # remove everything but fingerprints and DTXCIDs
    output_df = df2[['dsstox_compound_id', 'Special_Toxprints']]
    return output_df
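
# A vectorized sketch of the same three special toxprints (hypothetical helper,
# not part of the original script). Assumes the split fingerprint columns hold
# the strings '0'/'1' under the integer column labels 480 and 489 used above.
import numpy as np

def mk_special_fingerprints_vectorized(df2):
    a = df2[480] == '1'
    b = df2[489] == '1'
    df2['Special_Toxprints'] = (
        pd.Series(np.where(a & b, '1', '0'), index=df2.index)
        + '\t' + pd.Series(np.where(b & ~a, '1', '0'), index=df2.index)
        + '\t' + pd.Series(np.where(a & ~b, '1', '0'), index=df2.index)
    )
    return df2[['dsstox_compound_id', 'Special_Toxprints']]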
Example #4
import sys

import pandas as pd
from colorama import Fore

from database.database_schemas import Schemas
from database.qsar.compound_descriptor_sets import CompoundDescriptorSets
from database.session import SQLSession

def check(f):
    mysession = SQLSession(Schemas.qsar_schema).get_session()

    # pull compound ids in invitrodb
    CID1 = mysession.execute(
        'SELECT compounds.id FROM invitrodb.mc5'
        ' JOIN invitrodb.mc4 ON mc4.m4id = mc5.m4id'
        ' JOIN invitrodb.sample ON sample.spid = mc4.spid'
        ' JOIN ro_stg_dsstox.generic_substances ON generic_substances.id = sample.chid'
        ' JOIN ro_stg_dsstox.generic_substance_compounds ON generic_substance_compounds.fk_generic_substance_id = generic_substances.id'
        ' JOIN ro_stg_dsstox.compounds ON compounds.id = generic_substance_compounds.fk_compound_id'
    )

    CID1 = set([x[0] for x in list(CID1)])

    # pull compound ids in compound_descriptor_sets
    CID2 = mysession.query(CompoundDescriptorSets.efk_dsstox_compound_id) \
        .filter(CompoundDescriptorSets.fk_descriptor_set_id == '1449')

    CID2 = [x[0] for x in list(CID2)]

    # # # CHECKS FOR ID AND ToxcastFPs IN QSAR.COMPOUND_DESCRIPTOR_SETS # # #
    # # # MAKE A LIST OF THE DSSTOX.COMPOUNDS.ID THAT DON'T HAVE SPECIAL ToxcastFPs # # #

    CID3 = list(CID1 - set(CID2))

    # check dataframes
    # print("\n set CID1:", len(set(CID1)))
    # print("\n set CID2:", len(set(CID2)))
    # print("\n CID3", len(CID3), CID3)
    # print("\n CID3df", pd.DataFrame(CID3).head())
    # print("\n CID1df", pd.DataFrame(list(CID1)).head())

    # # # # IF QSAR.COMPOUND_DESCRIPTOR_SETS IS MISSING EITHER THEN GENERATE ToxcastFPs FOR THIS COMPOUND # # #
    if not CID3 and not f:
        print(Fore.RED +
              'SQL query is empty: All ToxcastFP available or no DTXCIDs')
        sys.exit(0)

    elif not CID3 and f:
        # pd.DataFrame() cannot take a set directly, so convert CID1 to a list
        return pd.DataFrame([]), pd.DataFrame(list(CID1))

    else:
        return pd.DataFrame(CID3), pd.DataFrame(list(CID1))
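
# A sketch of a chunked-IN alternative to the "query everything, then merge"
# workaround noted in mk_special_fingerprints above: slice the ID list into
# pieces small enough for the database's IN-clause limit. Hypothetical helper;
# the chunk size is an assumption.
from database.dsstox.compounds import Compounds

def query_descriptors_in_chunks(mysession, dtxcids, dsi, chunk_size=500):
    frames = []
    for start in range(0, len(dtxcids), chunk_size):
        chunk = dtxcids[start:start + chunk_size]
        q = mysession.query(Compounds.dsstox_compound_id,
                            CompoundDescriptorSets.descriptor_string_tsv) \
            .join(CompoundDescriptorSets,
                  Compounds.id == CompoundDescriptorSets.efk_dsstox_compound_id) \
            .filter(CompoundDescriptorSets.fk_descriptor_set_id == dsi) \
            .filter(Compounds.dsstox_compound_id.in_(chunk))
        frames.append(pd.DataFrame(list(q)))
    return pd.concat(frames, ignore_index=True)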
Example #5
import pandas as pd

from database.database_schemas import Schemas
from database.dsstox.compounds import Compounds
from database.session import SQLSession
# Datapoints, Datasets, DatasetDatapoints and filluc come from their
# database.qsar / project modules (import paths not shown in this excerpt)

def fillnewenrich(x_aeid):
    print(x_aeid)
    # retrieve the latest dataset for the aeid
    mysession = SQLSession(Schemas.qsar_schema).get_session()

    new_df = mysession.query(Compounds.dsstox_compound_id, Datapoints.measured_value_dn, Datasets.name, Datasets.id) \
        .join(Datapoints, Datapoints.efk_dsstox_compound_id == Compounds.id) \
        .join(DatasetDatapoints, Datapoints.id == DatasetDatapoints.fk_datapoint_id) \
        .join(Datasets, DatasetDatapoints.fk_dataset_id == Datasets.id)\
        .filter(Datasets.name == x_aeid)

    new_df = pd.DataFrame(list(new_df))

    # new_df = query1[query1['name'].isin([x_aeid])]
    # rename columns
    new_df.columns = ['dsstox_compound_id', 'hitc', 'name', 'dataset_id']
    my_dataset_id = new_df['dataset_id'].iloc[0]
    # make the enrichment table
    filluc(new_df, my_dataset_id)
Example #6
from database.database_schemas import Schemas
from database.dsstox.compounds import Compounds
from database.qsar.compound_descriptors import CompoundDescriptors
from database.session import SQLSession
from database.dsstox.generic_substance_compounds import GenericSubstanceCompounds
from database.dsstox.generic_substances import GenericSubstances


import pandas as pd

mytable = pd.read_csv('~/Desktop/Katies_data.tsv', sep='\t')
dtxsid = mytable.iloc[:,4]

mysession = SQLSession(Schemas.qsar_schema).get_session()

query = mysession.query(GenericSubstances.dsstox_substance_id, Compounds.id, Compounds.dsstox_compound_id).join(
    GenericSubstanceCompounds) \
    .join(Compounds).filter(GenericSubstances.dsstox_substance_id.in_(dtxsid))

df = pd.DataFrame(list(query))

myids = [int(x) for x in df.iloc[:,1]]

# query1 = mysession.query(Compounds.id, Compounds.dsstox_compound_id).filter(Compounds.dsstox_compound_id.in_(dtxsid))
# df1 = pd.DataFrame(list(query1))

query2 = mysession.query(CompoundDescriptors.efk_dsstox_compound_id, CompoundDescriptors.descriptor_string_tsv,
                         CompoundDescriptors.fk_descriptor_set_id).filter(
    CompoundDescriptors.efk_dsstox_compound_id.in_(myids))
df2 = pd.DataFrame(list(query2))

df2 = df2.rename(columns={"efk_dsstox_compound_id":'id'})
Example #7
from database.qsar.compound_descriptor_sets import CompoundDescriptorSets
from database.dsstox.compounds import Compounds
from database.database_schemas import Schemas
from database.session import SQLSession

mysession = SQLSession(Schemas.qsar_schema).get_session()

x_aeid = 926

query3 = mysession.query(Compounds.dsstox_compound_id, CompoundDescriptorSets.descriptor_string_tsv) \
    .join(CompoundDescriptorSets, Compounds.id == CompoundDescriptorSets.efk_dsstox_compound_id) \
    .filter(CompoundDescriptorSets.fk_descriptor_set_id == 1445)
    # .filter(Compounds.dsstox_compound_id.in_(str(new_df.iloc[0]))) \


print(list(query3))
Example #8
import math
import os
import sys
import time

import pandas as pd

from database.database_schemas import Schemas
from database.dsstox.compounds import Compounds
from database.dsstox.generic_substance_compounds import GenericSubstanceCompounds
from database.dsstox.generic_substances import GenericSubstances
from database.invitrodb.mc4 import Mc4
from database.invitrodb.mc5 import Mc5
from database.invitrodb.sample import Sample
from database.session import SQLSession
# Descriptors, UnivariateCalculations, UcStatistics and the helper functions
# (handle_duplicates, fillfp, enrich) come from their project modules

# GET ALL ASSAYS FROM MC5
# QUERY MC5 data for hitcalls and chemical IDs

mysession = SQLSession(Schemas.information_schema).get_session()

query0 = mysession.query(Compounds.dsstox_compound_id, Mc5.hitc, Mc5.aeid) \
    .join(GenericSubstanceCompounds, Compounds.id == GenericSubstanceCompounds.fk_compound_id) \
    .join(GenericSubstances, GenericSubstances.id == GenericSubstanceCompounds.fk_generic_substance_id) \
    .join(Sample, Sample.chid == GenericSubstances.id) \
    .join(Mc4, Mc4.spid == Sample.spid) \
    .join(Mc5, Mc5.m4id == Mc4.m4id)

mc5_table = pd.DataFrame(list(query0))
# print(mc5_table.shape)
# print( mc5_table[mc5_table['aeid']==1086] )
# sys.exit(1)

# run xgboost for files in test_aeids


def makeXGB(myaeid):
    try:
        # output print statements into file
        os.makedirs("/home/rlougee/Desktop/xgb_results/" + str(myaeid))
    except OSError:
        # the results directory already exists
        pass
    # (the rest of the xgboost model-building code is elided in this excerpt)


def filluc(invitrodbdf, mydatasetid):
    #set starttime
    starttime = time.time()

    username = '******'
    descriptor_set_id = [1445, 1447, 1446, 1448]
    # descriptor_set_id = [1448] # <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
    # 1445 = toxprints
    # 1446 = MACCS
    # 1447 = pubchem
    # 1448 = Special Toxprints

    for dsi in descriptor_set_id:
        # add enrichment table to the mysql database
        try:
            filled_table = handle_duplicates(invitrodbdf.loc[:, ['dsstox_compound_id', 'hitc']])

        except Exception:
            print("DUPLICATE HANDLER FAILED: {}".format(mydatasetid))
            sys.exit(1)

        try:
            filled_table = fillfp(filled_table, dsi)
        except Exception:
            print("FILLFP FAILED: {}".format(mydatasetid))
            sys.exit(1)

        # filled_table = pd.DataFrame(filled_table)

        try:
            my_enrichment_table = enrich(filled_table)

        except Exception:
            print("ENRICH FAILED: {}".format(mydatasetid))
            print(filled_table.head())
            sys.exit(1)


        # add fk_descriptor_id
        ### CHECK THAT THESE ARE MATCHING! ###
        mysession2 = SQLSession(Schemas.qsar_schema).get_session()
        query3 = mysession2.query(Descriptors.id).filter(Descriptors.fk_descriptor_set_id == dsi)
        query3 = list(query3)
        query3 = [int(i[0]) for i in query3]
        my_enrichment_table.insert(0, 'fk_descriptor_id', query3)

        for index, row in my_enrichment_table.iterrows():
            fk_dataset_id = int(mydatasetid)
            fk_descriptor_id = int(row['fk_descriptor_id'])

            univariate_calc = UnivariateCalculations(fk_dataset_id=fk_dataset_id,
                                                     fk_descriptor_id=fk_descriptor_id,
                                                     updated_by=username,
                                                     created_by=username)
            mysession2.add(univariate_calc)
            mysession2.flush()

            fk_univ_calc_id = int(univariate_calc.id)

            ### NEED TO CHANGE stat_list IF THE STATISTICS ARE CHANGED IN Enrichment_Table_Generator ###
            stat_list = [9, 10, 11, 12, 13, 4, 8, 7, 14, 15]
            for count, i in enumerate(row.iloc[1:]):
                if math.isnan(i):
                    value = None
                elif math.isinf(i):
                    value = 99999999.9
                else:
                    value = float(i)

                fk_statistic_id = int(stat_list[count])

                uc_statistics = UcStatistics(value=value,
                                             fk_univ_calc_id=int(fk_univ_calc_id),
                                             fk_statistic_id=int(fk_statistic_id),
                                             created_by=username,
                                             updated_by=username)

                mysession2.add(uc_statistics)
        mysession2.commit()
        # mysession2.close()
    endtime = time.time()
    print('run time:{}'.format(endtime-starttime))
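
# A small sketch of filluc()'s NaN/inf handling as a reusable helper (hypothetical;
# 99999999.9 is the sentinel the original code stores for infinite statistics).
def sanitize_statistic(x):
    if math.isnan(x):
        return None        # stored as NULL
    if math.isinf(x):
        return 99999999.9  # sentinel for infinity
    return float(x)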
Example #10
import datetime

import matplotlib.pyplot as plt
import pandas as pd

from database.database_schemas import Schemas
from database.dsstox.compounds import Compounds
from database.dsstox.generic_substance_compounds import GenericSubstanceCompounds
from database.dsstox.generic_substances import GenericSubstances
from database.invitrodb.mc4 import Mc4
from database.invitrodb.mc5 import Mc5
from database.invitrodb.sample import Sample
from database.session import SQLSession
# Descriptors, Datasets, Datapoints, DatasetDatapoints and UnivariateCalculations
# come from their database.qsar modules; `wowie` is assumed to be a DataFrame with
# 'name' and 'descriptors_name' columns built earlier in the original script

for i in wowie['name'].unique():
    bla = wowie[wowie['name'] == i]
    blabla = pd.DataFrame(bla.iloc[:,1:])
    print(blabla.head(), blabla.shape)

    # for idx, row in blabla.iterrows():
        # if row[1] >= 40:
        #     # print(row[1])
        #     blabla.iloc[idx, 1] = 10
        #     # print(blabla.iloc[idx,1])


    # get Txp index
    mysession = SQLSession(Schemas.qsar_schema).get_session()

    query3 = mysession.query(Descriptors.index_number, Descriptors.descriptors_name).filter(Descriptors.fk_descriptor_set_id == 1445)
    descriptornames = pd.DataFrame(list(query3))

    # sort by TXP number
    sorted_df = pd.merge(descriptornames, blabla, on='descriptors_name')
    sorted_df = sorted_df.drop('index_number', axis=1)
    # print(sorted_df.head())

    name = i.split('_')[0]

    from zMisc_Code.DATA_VISUALIZATION.barplot import barplot
    barplot(sorted_df.iloc[:, 1], sorted_df.iloc[:, 0], name)
    plt.tight_layout()
    # plt.show()
    plt.savefig('/home/rlougee/Desktop/images/{}.png'.format(name))
def cli():
    ### HELP DOCUMENTATION ###
    """
    checks for updates for AEIDS (new compounds / new endpoints / new AEIDS)

    if there are any updates qsar.datapoints, qsar.datasets, and qsar.datasetdatapoints are updated to include this information

    """

    ####################################################################################################################

    #QUERY MC5 data for hitcalls and chemical IDs

    mysession = SQLSession(Schemas.information_schema).get_session()

    query0 = mysession.query(Compounds.id, Compounds.dsstox_compound_id, Mc5.hitc, Mc5.aeid) \
        .join(GenericSubstanceCompounds, Compounds.id == GenericSubstanceCompounds.fk_compound_id) \
        .join(GenericSubstances, GenericSubstances.id == GenericSubstanceCompounds.fk_generic_substance_id) \
        .join(Sample, Sample.chid == GenericSubstances.id) \
        .join(Mc4, Mc4.spid == Sample.spid) \
        .join(Mc5, Mc5.m4id == Mc4.m4id)

    mc5_table = pd.DataFrame(list(query0))

    ####################################################################################################################

    ### QUERY DATE WHEN THE TABLE WAS LAST UPDATED ###
    # Not very useful as datasets.name date will always be different/ not worth querying all of the dates

    # query1 = mysession.query(Tables.UPDATE_TIME)\
    #     .filter(or_(Tables.TABLE_SCHEMA == 'invitrodb'), (Tables.TABLE_NAME == 'mc5'))
    #
    # #format last_update query
    # last_update = str(list(query1)[0][0])[:10].replace('-','')
    ####################################################################################################################

    def filldatasets(invitrodbdf, fd_aeid):
        username = '******'
        # create a new datasets name entry
        datasets_name = str('aeid:{}_{}'.format(
            fd_aeid,
            datetime.datetime.today().strftime("%Y%m%d")))
        description = "The set of hit calls from the toxcast AEID: {} taken on the date:{}"\
            .format(fd_aeid, datetime.datetime.today().strftime("%Y%m%d"))
        datasets = Datasets(name=datasets_name,
                            label=datasets_name,
                            updated_by=username,
                            created_by=username,
                            long_description=description,
                            short_description=description)
        mysession.add(datasets)
        mysession.flush()
        fk_dataset_id = int(datasets.id)

        # add datatable to the mysql database
        for index, row in invitrodbdf.iterrows():
            efk_dsstox_compound_id = row.loc['id']
            efk_chemprop_measured_property_id = None  #leave null -CG #not nullable
            measured_value_dn = row.loc['hitc']
            created_by = username
            updated_by = username

            datapoints = Datapoints(
                efk_dsstox_compound_id=efk_dsstox_compound_id,
                efk_chemprop_measured_property_id=
                efk_chemprop_measured_property_id,
                measured_value_dn=measured_value_dn,
                created_by=created_by,
                updated_by=updated_by)

            mysession.add(datapoints)
            mysession.flush()

            fk_datapoint_id = int(datapoints.id)

            dataset_datapoints = DatasetDatapoints(
                fk_dataset_id=fk_dataset_id,
                fk_datapoint_id=fk_datapoint_id,
                updated_by=username,
                created_by=username)
            mysession.add(dataset_datapoints)
        mysession.commit()

    ####################################################################################################################

    ### CHECK 1) IF TABLE EXISTS FOR AEID 2) IF THE TABLE HAS CHANGED

    # begin a for loop for each unique aeid
    for x_aeid in mc5_table.aeid.unique():
        #query latest dataset for this aeid
        aeid_query = mysession.query(Datasets.name) \
            .filter(Datasets.name.like("aeid:{}/_%".format(str(x_aeid)), escape='/'))

        aeid_query = list(aeid_query)
        aeid_query = [x[0] for x in aeid_query]

        #get the latest values for aeid
        new_df = mc5_table[mc5_table['aeid'].isin([x_aeid])]

        if not aeid_query or aeid_query == ['']:

            print(
                "New AEID, filling mysql database for aeid: {}".format(x_aeid))

            filldatasets(new_df, x_aeid)

        else:
            # find and retrieve the newest dataset name
            # (YYYYMMDD strings sort chronologically, so the last element is newest)
            aeid_query_dates = [x.split('_')[1] for x in aeid_query]
            newest_aeid_date = sorted(aeid_query_dates)[-1]
            newest_aeid = [x for x in aeid_query if str(newest_aeid_date) in x]

            #pull table and compare
            old_df = mysession.query(Datapoints.efk_dsstox_compound_id, Datapoints.measured_value_dn)\
                .join(DatasetDatapoints, Datapoints.id==DatasetDatapoints.fk_datapoint_id)\
                .join(Datasets, DatasetDatapoints.fk_dataset_id==Datasets.id)\
                .filter(Datasets.name==newest_aeid[0])

            old_df = pd.DataFrame(list(old_df))

            ##FORMAT DFs FOR COMPARING
            #rename columns
            old_df.columns = ['id', 'hitc']
            my_new_df = new_df.loc[:, ['id', 'hitc']]
            old_df['hitc'] = old_df['hitc'].astype(int)

            #sort dataframes
            my_new_df = my_new_df.sort_values(['id', 'hitc'])
            old_df = old_df.sort_values(['id', 'hitc'])

            #reset index
            my_new_df = my_new_df.reset_index(drop=True)
            old_df = old_df.reset_index(drop=True)

            if my_new_df.equals(old_df):
                print("no change for aeid: {}".format(x_aeid))
            else:
                print("Update, filling mysql database for aeid: {}".format(
                    x_aeid))
                filldatasets(new_df, x_aeid)

####################################################################################################################
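# A sketch (hypothetical helper, not in the original) of the "newest dataset"
# selection performed inside cli(): dataset names follow aeid:<id>_<YYYYMMDD>,
# and YYYYMMDD strings sort chronologically, so max() over the date suffix is
# enough.
def newest_dataset_name(dataset_names):
    return max(dataset_names, key=lambda n: n.split('_')[1])
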

# create a connection pool for multiprocessing with SQLAlchemy
if __name__ == '__main__':

    ### QUERY THE MYSQL DB 4 A COMPLETE LIST OF AEIDS, ENDPOINTS & DTXCIDS ###

    mysession = SQLSession(Schemas.qsar_schema).get_session()

    # query the Unique Enrichment Table IDs
    query0 = mysession.query(UnivariateCalculations.fk_dataset_id)
    query0 = pd.DataFrame(list(query0))

    if not query0.empty:
        query0 = query0.fk_dataset_id.unique()

    # query the full set of data
    # slow ~3.5 minutes already
    query1 = mysession.query(Compounds.dsstox_compound_id, Datapoints.measured_value_dn, Datasets.name, Datasets.id) \
        .join(Datapoints, Datapoints.efk_dsstox_compound_id == Compounds.id) \
        .join(DatasetDatapoints, Datapoints.id == DatasetDatapoints.fk_datapoint_id) \
        .join(Datasets, DatasetDatapoints.fk_dataset_id == Datasets.id)

    query1 = pd.DataFrame(list(query1))
Example #13
import sys
from io import StringIO

import pandas as pd
from colorama import Fore
from docopt import docopt

from database.database_schemas import Schemas
from database.dsstox.compounds import Compounds
from database.dsstox.generic_substance_compounds import GenericSubstanceCompounds
from database.dsstox.generic_substances import GenericSubstances
from database.session import SQLSession

# accepted ID types (defined elsewhere in the original module; values inferred
# from the conversion logic below)
acceptedID = ['dtxsid', 'casrn', 'dtxcid']

def main():
    args = docopt(__doc__)
    # print(args)

    if args['--version']:
        print('NCCT CLI: Version 0.0.0')
        sys.exit(0)

    # set input arguments and options to variables
    InID = args['<InID>'].lower()
    OutID = args['<OutID>'].lower()
    tsv_input = args['<tsv_file>']
    o = args['--output']
    noerror = args['--noerror']

    # filter InID & OutID
    if InID not in acceptedID:
        print(Fore.RED +
              'Invalid InID: {}\n InID must be {}'.format(InID, acceptedID))
        sys.exit(1)
    if OutID not in acceptedID:
        print(Fore.RED +
              'Invalid OutID: {}\n OutID must be {}'.format(OutID, acceptedID))
        sys.exit(1)

    # creates table of .tsv file
    # takes stdin if argument is not directly given
    if not tsv_input:
        tsv_input = sys.stdin.read()
        mytable = pd.read_csv(StringIO(tsv_input), sep="\t")
    else:
        mytable = pd.read_csv(tsv_input, sep="\t")

    # takes the chemical ID column
    idrow = mytable.iloc[:, 0]
    colname = mytable.columns.values[0]

    # make an SQL query table  for relevant SIDs & CIDs
    mysession = SQLSession(Schemas.dsstox_schema).get_session()
    query = mysession.query(GenericSubstances.dsstox_substance_id, GenericSubstances.casrn, Compounds.dsstox_compound_id)\
        .join(GenericSubstanceCompounds, GenericSubstanceCompounds.fk_generic_substance_id == GenericSubstances.id)\
        .join(Compounds, Compounds.id == GenericSubstanceCompounds.fk_compound_id)
    df = pd.DataFrame(list(query))
    df.columns = ['dtxsid', 'casrn', 'dtxcid']

    idrow = pd.DataFrame(idrow)
    idrow.columns = [InID]

    # do a join to filter out unwanted IDs
    if InID == 'dtxcid':
        df = pd.merge(idrow, df, on='dtxcid', how='inner')
    elif InID == 'casrn':
        df = pd.merge(idrow, df, on='casrn', how='inner')
    elif InID == 'dtxsid':
        df = pd.merge(idrow, df, on='dtxsid', how='inner')

    df = df.drop_duplicates(InID)

    # if no DTXCIDs returned
    if df.empty and not noerror:
        print(Fore.RED + "Error: No valid {} or no associated {}\n{}".format(
            InID, OutID, list(idrow)))
        sys.exit(1)
    elif df.empty:
        sys.exit(1)

    #creates a list of unconverted IDs
    noid = list(set(idrow.iloc[:, 0]) - set(list(df.iloc[:, 0])))

    # creates new CID table
    mytable = mytable.rename(columns={colname: InID})
    mytable = pd.merge(df, mytable, on=InID, how='left')
    for i in acceptedID:
        if i != OutID:
            mytable = mytable.drop(columns=i)
    outputtable = mytable

    # generates a string with tab separation and line breaks for row ends
    columnnames = mytable.columns.values
    output = ''
    for i in columnnames:
        output += i + '\t'
    output += '\n'
    mytable = mytable.values.tolist()

    for i in mytable:
        a = '\t'.join(str(x) for x in i)
        output += a + '\n'

    # output options
    if not o:
        print(output[:-1])
    else:
        outputtable.to_csv(o, sep='\t', index=False)

    # ERROR message
    # not actual STDERR this is for the user
    if not noerror:
        print(Fore.RED + "Error: Invalid {} or no associated {}\n{}".format(
            InID, OutID, noid))
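
# A minimal sketch of the same serialization step used above: DataFrame.to_csv()
# with no path returns a string, so the manual header/row loops can be replaced
# by a single call (sep='\t' and index=False match the loop output).
def table_to_tsv_string(df):
    return df.to_csv(sep='\t', index=False)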
Example #14
import pandas as pd

from database.database_schemas import Schemas
from database.dsstox.compounds import Compounds
from database.qsar.compound_descriptor_sets import CompoundDescriptorSets
from database.session import SQLSession
# Descriptors comes from its database.qsar module

def fillfp(mytable, dsi):

    #checks the index, and first two columns for DTXCIDs
    #input table should be in the correct format already
    try:
        if mytable.iloc[0, 0][0:6] == 'DTXCID':
            idrow = mytable.iloc[:, 0]
            colname = mytable.columns.values[0]
    except Exception:
        pass
    try:
        if mytable.iloc[0, 1][0:6] == 'DTXCID':
            idrow = mytable.iloc[:, 1]
            # name the column the IDs were actually found in
            colname = mytable.columns.values[1]
    except Exception:
        pass
    try:
        if mytable.index.values[0][0:6] == 'DTXCID':
            idrow = mytable.index.values
            mytable.index.name = 'DTXCID'
            colname = mytable.index.name
    except Exception:
        pass

    dsi = int(dsi)
    ########################################################################################################################
    ########################################################################################################################

    mysession1 = SQLSession(Schemas.dsstox_schema).get_session()

    # ### CHECKS FOR DTXCID IN DSSTOX.COMPOUNDS ###
    # query = mysession.query(Compounds.id, Compounds.dsstox_compound_id).filter(Compounds.dsstox_compound_id.in_(idrow))
    # df1 = pd.DataFrame(list(query))
    # df1 = [int(x) for x in df1.iloc[:, 0]]
    #
    # ### CHECKS FOR ID AND TOXPRINTS IN QSAR.COMPOUND_DESCRIPTOR_SETS ###
    # query2 = mysession.query(CompoundDescriptors.efk_dsstox_compound_id, CompoundDescriptors.descriptor_string_tsv)\
    #     .filter(CompoundDescriptors.efk_dsstox_compound_id.in_(df1))\
    #     .filter(CompoundDescriptors.fk_descriptor_set_id == 1445)

    query2 = mysession1.query(Compounds.dsstox_compound_id, CompoundDescriptorSets.descriptor_string_tsv) \
        .join(CompoundDescriptorSets, Compounds.id == CompoundDescriptorSets.efk_dsstox_compound_id) \
        .filter(CompoundDescriptorSets.fk_descriptor_set_id == dsi)

    df2 = pd.DataFrame(list(query2))
    idrow = pd.DataFrame(idrow.unique())
    idrow.columns = ['dsstox_compound_id']
    df2 = pd.merge(idrow, df2, on='dsstox_compound_id', how='inner')

    #something to separate and name fingerprint columns
    df2 = pd.concat([
        df2, df2['descriptor_string_tsv'].str[:].str.split('\t', expand=True)
    ],
                    axis=1)
    df2 = df2.drop('descriptor_string_tsv', axis=1)
    # print(df2)

    #name the columns correctly
    query3 = mysession1.query(Descriptors.descriptors_name).filter(
        Descriptors.fk_descriptor_set_id == dsi)
    descriptornames = list(query3)

    for num, name in enumerate(descriptornames, start=0):
        df2 = df2.rename(columns={num: name[0]})

    #creates the final output table
    mytable = mytable.rename(columns={colname: "dsstox_compound_id"})
    mytable = pd.merge(mytable, df2, on='dsstox_compound_id')
    # mytable = mytable.drop('dsstox_compound_id', 1)

    #check for the trailing empty column created by the trailing tab and remove it
    if mytable[mytable.columns[-1]].iloc[0] == '':
        mytable = mytable.drop(mytable.columns[-1], axis=1)
    outputtable = mytable

    # generates a string with tab separation and line breaks for row ends

    # columnnames = mytable.columns.values
    # output = ''
    # for i in columnnames:
    #     output += str(i) + '\t'
    # output += '\n'
    # mytable = mytable.values.tolist()
    #
    # for i in mytable:
    #     a = '\t'.join(str(x) for x in i)
    #     output += a + '\n'
    mysession1.close()
    return outputtable
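
# Hypothetical usage sketch (the input IDs here are invented for illustration):
# fill a two-column DTXCID/hitcall table with toxprints. Descriptor set ids used
# elsewhere in this codebase: 1445 = toxprints, 1446 = MACCS, 1447 = pubchem,
# 1448 = Special Toxprints.
if __name__ == '__main__':
    demo = pd.DataFrame({'dsstox_compound_id': ['DTXCID505', 'DTXCID506'],
                         'hitc': [1, 0]})
    print(fillfp(demo, 1445).head())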
Example #15
import sys
from io import StringIO

import click
import pandas as pd

from database.database_schemas import Schemas
from database.dsstox.compounds import Compounds
from database.dsstox.generic_substance_compounds import GenericSubstanceCompounds
from database.dsstox.generic_substances import GenericSubstances
from database.session import SQLSession

def cli(tsv_input,o,noerror):
    ### HELP DOCUMENTATION ###

    """
    SIDtoCID takes in a .tsv datatable with a dsstox_substance_id column (must be index or first 2 columns).
    The dsstox_substance_id column is converted to dsstox_compound_id.
    Can use a .tsv file as stdin. Default output is stdout as .tsv.
    \n\n
    Warning!: column names are required in the input .tsv! Otherwise the first row will be read as the header and dropped from the data.



    -- EXAMPLE I/O TABLES --

    INPUT:  .tsv file

    |DTXSID COLUMN | ENDPOINT COLUMN |\n
    ----------------------------------\n
    | DTXSID123456 |        0        |\n
    ----------------------------------\n
    | DTXSID234567 |        1        |\n
    ----------------------------------\n
    | DTXSID345678 |        0        |\n
    ----------------------------------\n

    EXPORT: .tsv file

    |DTXCID COLUMN | ENDPOINT COLUMN |\n
    ----------------------------------\n
    | DTXCID891011 |        0        |\n
    ----------------------------------\n
    | DTXCID910111 |        1        |\n
    ----------------------------------\n
    | DTXCID101112 |        0        |\n
    ----------------------------------\n



    """
    # creates table of .tsv file
    # takes stdin if argument is not directly given
    if not tsv_input:
        tsv_input = sys.stdin.read()
        mytable = pd.read_csv(StringIO(tsv_input), sep="\t")
    else:
        mytable = pd.read_csv(tsv_input, sep="\t")

    #checks the index, and first two columns for DTXSIDs
    #input table should be in the correct format already
    try:
        if mytable.iloc[0, 0][0:6] == 'DTXSID':
            idrow = mytable.iloc[:, 0]
            colname = mytable.columns.values[0]
    except Exception:
        pass
    try:
        if mytable.iloc[0, 1][0:6] == 'DTXSID':
            idrow = mytable.iloc[:, 1]
            # name the column the IDs were actually found in
            colname = mytable.columns.values[1]
    except Exception:
        pass
    try:
        if mytable.index.values[0][0:6] == 'DTXSID':
            idrow = mytable.index.values
            mytable.index.name = 'DTXSID'
            colname = mytable.index.name
    except Exception:
        pass

    # drop empty columns
    mytable = mytable.dropna(axis='columns', how='all')

    # click.echo(mytable.columns.values)
    #make an SQL query table  for relevant SIDs & CIDs
    mysession = SQLSession(Schemas.dsstox_schema).get_session()

    query = mysession.query(GenericSubstances.dsstox_substance_id, Compounds.dsstox_compound_id).join(GenericSubstanceCompounds) \
        .join(Compounds)

    df = pd.DataFrame(list(query))
    idrow = pd.DataFrame(idrow)
    idrow.columns = ['dsstox_substance_id']
    df = pd.merge(idrow, df, on='dsstox_substance_id', how='inner')


    #if no DTXCIDs returned
    if df.empty and not noerror:
        click.secho("Error: No valid DTXSIDs or no associated DTXCIDs\n{}".format(list(idrow)), fg='red', bold=True)
        sys.exit(1)
    elif df.empty:
        sys.exit(1)

    #creates new CID table
    mytable = mytable.rename(columns={colname : "dsstox_substance_id"})
    mytable = pd.merge(df, mytable, on='dsstox_substance_id')
    mytable = mytable.drop(columns='dsstox_substance_id')
    outputtable = mytable

    # generates a string with tab separation and line breaks for row ends
    columnnames = mytable.columns.values
    output = ''
    for i in columnnames:
        output += i + '\t'
    output += '\n'
    mytable = mytable.values.tolist()

    for i in mytable:
        a = '\t'.join(str(x) for x in i)
        output += a + '\n'

    #output options
    if o == '':
        click.echo(output)
    else:
        outputtable.to_csv(o, sep='\t',index=False)

    # get IDs that were not converted
    noid = list(set(idrow.iloc[:, 0]) - set(df.iloc[:, 0]))

    #ERROR message
    #not actual STDERR this is for the user
    if not noerror:
        click.secho("Error: Invalid DTXSID or no associated DTXCID\n{}".format(noid), fg='red', bold=True)
Example #16
import sys
from io import StringIO

import pandas as pd
from colorama import Fore
from docopt import docopt

from database.database_schemas import Schemas
from database.dsstox.compounds import Compounds
from database.dsstox.generic_substance_compounds import GenericSubstanceCompounds
from database.dsstox.generic_substances import GenericSubstances
from database.session import SQLSession

def main():
    args = docopt(__doc__)
    print(args)
    if args['--version']:
        print('NCCT CLI: Version 0.0.0')
        sys.exit(0)
    tsv_input = args['<tsv_file>']
    o = args['--output']
    noerror = args['--noerror']


    # creates table of .tsv file
    # takes stdin if argument is not directly given
    if not tsv_input:
        tsv_input = sys.stdin.read()
        mytable = pd.read_csv(StringIO(tsv_input), sep="\t")
    else:
        mytable = pd.read_csv(tsv_input, sep="\t")

    # checks the index, and first two columns for DTXSIDs
    # input table should be in the correct format already
    try:
        if mytable.iloc[0, 0][0:6] == 'DTXSID':
            idrow = mytable.iloc[:, 0]
            colname = mytable.columns.values[0]
    except Exception:
        pass
    try:
        if mytable.iloc[0, 1][0:6] == 'DTXSID':
            idrow = mytable.iloc[:, 1]
            # name the column the IDs were actually found in
            colname = mytable.columns.values[1]
    except Exception:
        pass
    try:
        if mytable.index.values[0][0:6] == 'DTXSID':
            idrow = mytable.index.values
            mytable.index.name = 'DTXSID'
            colname = mytable.index.name
    except Exception:
        pass

    # make an SQL query table  for relevant SIDs & CIDs
    mysession = SQLSession(Schemas.dsstox_schema).get_session()

    query = mysession.query(GenericSubstances.dsstox_substance_id, Compounds.dsstox_compound_id).join(GenericSubstanceCompounds) \
        .join(Compounds).filter(GenericSubstances.dsstox_substance_id.in_(idrow))

    # checks if DTXSID didn't exist or has no associated DTXCID
    df = pd.DataFrame(list(query))

    # if no DTXCIDs returned
    if df.empty and not noerror:
        print(Fore.RED + "Error: No valid DTXSIDs or no associated DTXCIDs\n{}".format(list(idrow)))
        sys.exit(1)
    elif df.empty:
        sys.exit(1)

    noid = list(set(idrow)-set(list(df.iloc[:, 0])))

    # creates new CID table
    mytable = mytable.rename(columns={colname : "dsstox_substance_id"})
    mytable = pd.merge(df, mytable, on='dsstox_substance_id')
    mytable = mytable.drop(columns='dsstox_substance_id')
    outputtable = mytable

    # generates a string with tab separation and line breaks for row ends
    columnnames = mytable.columns.values
    output = ''
    for i in columnnames:
        output += i + '\t'
    output += '\n'
    mytable = mytable.values.tolist()

    for i in mytable:
        a = '\t'.join(str(x) for x in i)
        output += a + '\n'

    # output options
    if not o:
        print(output)
    else:
        outputtable.to_csv(o, sep='\t', index=False)

    # ERROR message
    # not actual STDERR this is for the user
    if not noerror:
        print(Fore.RED + "Error: Invalid DTXSID or no associated DTXCID\n{}".format(noid))
Example #17
import sys
from io import StringIO

import click
import pandas as pd

from database.database_schemas import Schemas
from database.dsstox.compounds import Compounds
from database.qsar.compound_descriptor_sets import CompoundDescriptorSets
from database.session import SQLSession
# Descriptors comes from its database.qsar module

def cli(cidtable, o, dsi, t):
    ### HELP DOCUMENTATION ###
    """

    Fills rows in a .tsv containing DTXCIDs with their toxprints.
    DTXCIDs must be in index, or first two columns

    use -o ~/mypath/myfilename.tsv to export a toxprint .tsv file

    for -dsi: toxprints=1445, pubchem=1447, MACCs=1446
    a newer descriptor_set_id may be added or current dsis changed
    """

    # takes stdin if argument is not directly given

    if not cidtable:
        cidtable = sys.stdin.read()
        mytable = pd.read_csv(StringIO(cidtable), sep="\t")
    else:
        mytable = pd.read_csv(cidtable, sep="\t")

    #checks the index, and first two columns for DTXCIDs
    #input table should be in the correct format already
    try:
        if mytable.iloc[0, 0][0:6] == 'DTXCID':
            idrow = mytable.iloc[:, 0]
            colname = mytable.columns.values[0]
    except Exception:
        pass
    try:
        if mytable.iloc[0, 1][0:6] == 'DTXCID':
            idrow = mytable.iloc[:, 1]
            # name the column the IDs were actually found in
            colname = mytable.columns.values[1]
    except Exception:
        pass
    try:
        if mytable.index.values[0][0:6] == 'DTXCID':
            idrow = mytable.index.values
            mytable.index.name = 'DTXCID'
            colname = mytable.index.name
    except Exception:
        pass

    # exit if not idrow
    # if not idrow:
    #     click.secho("DTXCID row was not found", color='red')
    # dsi = int(dsi)
########################################################################################################################
########################################################################################################################

    mysession = SQLSession(Schemas.qsar_schema).get_session()

    # ### CHECKS FOR DTXCID IN DSSTOX.COMPOUNDS ###
    # query = mysession.query(Compounds.id, Compounds.dsstox_compound_id).filter(Compounds.dsstox_compound_id.in_(idrow))
    # df1 = pd.DataFrame(list(query))
    # df1 = [int(x) for x in df1.iloc[:, 0]]
    #
    # ### CHECKS FOR ID AND TOXPRINTS IN QSAR.COMPOUND_DESCRIPTOR_SETS ###
    # query2 = mysession.query(CompoundDescriptorSets.efk_dsstox_compound_id, CompoundDescriptorSets.descriptor_string_tsv)\
    #     .filter(CompoundDescriptorSets.efk_dsstox_compound_id.in_(df1))\
    #     .filter(CompoundDescriptorSets.fk_descriptor_set_id == 1445)

    query2 = mysession.query(Compounds.dsstox_compound_id, CompoundDescriptorSets.descriptor_string_tsv) \
        .join(CompoundDescriptorSets, Compounds.id == CompoundDescriptorSets.efk_dsstox_compound_id)\
        .filter(CompoundDescriptorSets.fk_descriptor_set_id == dsi)

    df2 = pd.DataFrame(list(query2))
    idrow = pd.DataFrame(idrow)
    idrow.columns = ['dsstox_compound_id']
    df2 = pd.merge(idrow, df2, on='dsstox_compound_id', how='inner')

    # something to separate and name fingerprint columns
    df2 = pd.concat([
        df2, df2['descriptor_string_tsv'].str[:].str.split('\t', expand=True)
    ],
                    axis=1)
    df2 = df2.drop('descriptor_string_tsv', axis=1)
    # print(df2)

    # name the columns correctly
    if t == 0:
        query3 = mysession.query(Descriptors.descriptors_name).filter(
            Descriptors.fk_descriptor_set_id == dsi)
        descriptornames = list(query3)
    elif t == 1:
        query3 = mysession.query(
            Descriptors.label).filter(Descriptors.fk_descriptor_set_id == dsi)
        descriptornames = list(query3)

    for num, name in enumerate(descriptornames, start=0):
        df2 = df2.rename(columns={num: name[0]})

    # print(df2)
    # creates the final output table
    mytable = mytable.rename(columns={colname: "dsstox_compound_id"})
    mytable = pd.merge(mytable, df2, on='dsstox_compound_id')
    # mytable = mytable.drop('dsstox_compound_id', 1)

    # check for the trailing empty column created by the trailing tab and remove it
    last_col = mytable.columns[-1]
    if mytable[last_col].iloc[0] == '' or mytable[last_col].iloc[0] is None:
        mytable = mytable.drop(last_col, axis=1)
    outputtable = mytable

    # generates a string with tab separation and line breaks for row ends
    columnnames = mytable.columns.values
    output = ''
    for i in columnnames:
        output += str(i) + '\t'
    output += '\n'
    mytable = mytable.values.tolist()

    for i in mytable:
        a = '\t'.join(str(x) for x in i)
        output += a + '\n'

    #output options
    if o == '':
        click.echo(output)
    else:
        outputtable.to_csv(o, sep='\t', index=False)

    sys.exit(0)
Example #18
import pandas as pd

from database.database_schemas import Schemas
from database.dsstox.compounds import Compounds
from database.qsar.compound_descriptor_sets import CompoundDescriptorSets
from database.session import SQLSession
# Descriptors comes from its database.qsar module

# which fingerprint set do we want?
# dsi is descriptor set index
dsi = 1445
# 1445 = toxprints
# 1446 = MACCS
# 1447 = pubchem

### I WANT TO SEE THE TOTAL PREVALENCE FOR CHEMOTYPES ACROSS THE ENTIRE ASSAY SPACE ###

## Get matrix containing DTXCIDS, fingerprints

mysession1 = SQLSession(Schemas.dsstox_schema).get_session()

query2 = mysession1.query(Compounds.dsstox_compound_id, CompoundDescriptorSets.descriptor_string_tsv) \
    .join(CompoundDescriptorSets, Compounds.id == CompoundDescriptorSets.efk_dsstox_compound_id) \
    .filter(CompoundDescriptorSets.fk_descriptor_set_id == dsi)

df2 = pd.DataFrame(list(query2))

# something to separate and name fingerprint columns
df2 = pd.concat(
    [df2, df2['descriptor_string_tsv'].str[:].str.split('\t', expand=True)],
    axis=1)
df2 = df2.drop('descriptor_string_tsv', axis=1)
# print(df2)

# name the columns correctly
query3 = mysession1.query(Descriptors.descriptors_name).filter(
    Descriptors.fk_descriptor_set_id == dsi)
descriptornames = list(query3)
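
# A sketch (not in the original excerpt) of the prevalence count itself: name the
# fingerprint columns, coerce them to numbers, and sum each column.
for num, name in enumerate(descriptornames, start=0):
    df2 = df2.rename(columns={num: name[0]})

prevalence = (df2.drop('dsstox_compound_id', axis=1)
                 .apply(pd.to_numeric, errors='coerce')
                 .sum()
                 .sort_values(ascending=False))
print(prevalence.head())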
Example #19
import datetime
import os
import sys
from io import StringIO

import click
import pandas as pd

from database.database_schemas import Schemas
from database.dsstox.compounds import Compounds
from database.session import SQLSession
# Datasets, Datapoints and DatasetDatapoints come from their database.qsar modules

def cli(i, datasetname, username):
    ### HELP DOCUMENTATION ###
    """
    creates a dataset in the mysql database (qsar.datapoints, qsar.datasets, and qsar.datasetdatapoints) from a .tsv file

    input file can be .tsv, .xls or .xlsx
    if input is STDIN must be .tsv
    
    format:

    +-----------------+---------------+-----------\n
    |   DTXCID1234    |       0       | ...\n
    +-----------------+---------------+-----------\n
    |   DTXCID56789   |       1       | ...\n
    +-----------------+---------------+-----------\n


    to remove table header use tail -n +2 myfile.tsv | datatable2mysql

    """

    ####################################################################################################################

    if not i:
        tsv_input = sys.stdin.read()
        try:
            myinputtable = pd.read_csv(StringIO(tsv_input),
                                       sep="\t",
                                       header=None)
        except Exception:
            click.secho('Error: Empty Datatable', fg='red', bold=True)
            sys.exit(1)
    else:
        try:
            click.secho('-- Importing {} --'.format(i), bold=True)
            filename, file_extension = os.path.splitext(i)

            if file_extension == '.tsv':
                myinputtable = pd.read_csv(i, sep='\t', header=None)
            elif file_extension == '.xlsx' or file_extension == '.xls':
                myinputtable = pd.read_excel(i, header=None)
        except Exception:
            click.secho('Error: File Import Failed', fg='red', bold=True)
            sys.exit(1)

    #clean input table
    myinputtable = myinputtable.iloc[:, 0:2]
    myinputtable.columns = ['dsstox_compound_id', 'hitc']

    #QUERY database for proper ids

    mysession = SQLSession(Schemas.information_schema).get_session()

    query0 = mysession.query(Compounds.dsstox_compound_id, Compounds.id )\
        .filter(Compounds.dsstox_compound_id.in_(myinputtable.iloc[:,0]))

    mytable = pd.DataFrame(list(query0))

    #join hitcalls
    mytable = pd.merge(myinputtable, mytable, how='inner',
                       on='dsstox_compound_id')

    #make sure datatable doesn't already exist?

    def filldatasets(invitrodbdf, fd_aeid, username):
        # create a new datasets name entry
        datasets_name = str('Imported_DataTable:{}_{}'.format(
            fd_aeid,
            datetime.datetime.today().strftime("%Y%m%d")))
        description = "Imported DataTable: {} taken on the date:{}"\
            .format(fd_aeid, datetime.datetime.today().strftime("%Y%m%d"))
        datasets = Datasets(name=datasets_name,
                            label=datasets_name,
                            updated_by=username,
                            created_by=username,
                            long_description=description,
                            short_description=description)
        mysession.add(datasets)
        mysession.flush()
        fk_dataset_id = int(datasets.id)

        # add datatable to the mysql database
        for index, row in invitrodbdf.iterrows():
            efk_dsstox_compound_id = row.loc['id']
            efk_chemprop_measured_property_id = None  #leave null -CG #not nullable
            measured_value_dn = row.loc['hitc']
            created_by = username
            updated_by = username

            datapoints = Datapoints(
                efk_dsstox_compound_id=efk_dsstox_compound_id,
                efk_chemprop_measured_property_id=
                efk_chemprop_measured_property_id,
                measured_value_dn=measured_value_dn,
                created_by=created_by,
                updated_by=updated_by)

            mysession.add(datapoints)
            mysession.flush()

            fk_datapoint_id = int(datapoints.id)

            dataset_datapoints = DatasetDatapoints(
                fk_dataset_id=fk_dataset_id,
                fk_datapoint_id=fk_datapoint_id,
                updated_by=username,
                created_by=username)
            mysession.add(dataset_datapoints)
        mysession.commit()

    filldatasets(mytable, datasetname, username)
Example #20
import pandas as pd

from database.database_schemas import Schemas
from database.dsstox.compounds import Compounds
from database.dsstox.generic_substance_compounds import GenericSubstanceCompounds
from database.dsstox.generic_substances import GenericSubstances
from database.invitrodb.mc4 import Mc4
from database.invitrodb.mc5 import Mc5
from database.invitrodb.sample import Sample
from database.session import SQLSession
# AssayComponent and AssayComponentEndpoint come from their database.invitrodb modules


if __name__ == "__main__":

    #QUERY MC5 data for hitcalls, chemical IDs, assay subcategories
    mysession = SQLSession(Schemas.information_schema).get_session()

    query0 = mysession.query(
        Compounds.id, Compounds.dsstox_compound_id, Mc5.hitc, Mc5.aeid,
        AssayComponentEndpoint.assay_component_endpoint_name,
        AssayComponent.assay_component_desc,
        AssayComponent.assay_component_target_desc,
        AssayComponentEndpoint.assay_component_endpoint_desc,
        AssayComponentEndpoint.assay_function_type,
        AssayComponentEndpoint.normalized_data_type,
        AssayComponentEndpoint.analysis_direction,
        AssayComponentEndpoint.burst_assay,
        AssayComponentEndpoint.key_positive_control,
        AssayComponentEndpoint.signal_direction,
        AssayComponentEndpoint.intended_target_type,
        AssayComponentEndpoint.intended_target_type_sub,
        AssayComponentEndpoint.intended_target_family,
        AssayComponentEndpoint.intended_target_family_sub,
        AssayComponent.assay_design_type,
        AssayComponent.assay_design_type_sub,
        AssayComponent.biological_process_target,
        AssayComponent.detection_technology_type,
        AssayComponent.detection_technology_type_sub,
        AssayComponent.detection_technology,
        AssayComponent.signal_direction_type,
        AssayComponent.key_assay_reagent,
        AssayComponent.key_assay_reagent_type,
        AssayComponent.technological_target_type,
        AssayComponent.technological_target_type_sub) \
        .join(GenericSubstanceCompounds, Compounds.id == GenericSubstanceCompounds.fk_compound_id) \
        .join(GenericSubstances, GenericSubstances.id == GenericSubstanceCompounds.fk_generic_substance_id) \
        .join(Sample, Sample.chid == GenericSubstances.id) \
        .join(Mc4, Mc4.spid == Sample.spid) \
        .join(Mc5, Mc5.m4id == Mc4.m4id) \
        .join(AssayComponentEndpoint, AssayComponentEndpoint.aeid == Mc5.aeid) \
        .join(AssayComponent, AssayComponent.acid == AssayComponentEndpoint.acid)

    mytable = pd.DataFrame(list(query0))

    ####################################################################################################################

    ### MAKE TABLES FOR SUBCATEGORIES ###

    #loop through characteristics
    for x in mytable.columns[9:]:
        for y in mytable[x].unique():
            if pd.isnull(y):
                #print('\nnull/nan value found!\n')
                pass
Example #21
import sys

import pandas as pd
from scipy import stats

from database.database_schemas import Schemas
from database.session import SQLSession
# Descriptors (database.qsar) and AssayComponent / AssayComponentEndpoint
# (database.invitrodb) come from their project modules

def getenrichfp(DataSetName, sigtxp, mypath, myfilename, dsi=1445):
    """ Get Enrichment data for a combined set of chemotypes """

    # aborts if no significant chemotypes
    if len(sigtxp) == 0:
        return None

    mysession = SQLSession(Schemas.qsar_schema).get_session()
    MyDataSet = mysession.execute(
        'SELECT dsstox_compound_id, measured_value_dn, descriptor_string_tsv FROM sbox_rlougee_qsar.datasets'
        ' JOIN sbox_rlougee_qsar.dataset_datapoints ON sbox_rlougee_qsar.dataset_datapoints.fk_dataset_id = sbox_rlougee_qsar.datasets.id'
        ' JOIN sbox_rlougee_qsar.datapoints ON sbox_rlougee_qsar.datapoints.id = sbox_rlougee_qsar.dataset_datapoints.fk_datapoint_id'
        ' JOIN ro_stg_dsstox.compounds ON sbox_rlougee_qsar.datapoints.efk_dsstox_compound_id = ro_stg_dsstox.compounds.id'
        ' JOIN sbox_rlougee_qsar.compound_descriptor_sets ON ro_stg_dsstox.compounds.id = sbox_rlougee_qsar.compound_descriptor_sets.efk_dsstox_compound_id'
        ' WHERE sbox_rlougee_qsar.datasets.name LIKE \'%{}%\' AND sbox_rlougee_qsar.compound_descriptor_sets.fk_descriptor_set_id = {}'.format(DataSetName, dsi))
    MyDataSet = pd.DataFrame(list(MyDataSet))

    MyDataSet.columns = ['Dsstox_Compound_ID', 'Hit_Call', 'Toxprint']

    #something to separate and name fingerprint columns
    MyDataSet = pd.concat([MyDataSet, MyDataSet['Toxprint'].str[:].str.split('\t', expand=True)], axis=1)
    MyDataSet = MyDataSet.drop('Toxprint', axis=1)

    #name the columns correctly
    query3 = mysession.query(Descriptors.descriptors_name, Descriptors.label).filter(Descriptors.fk_descriptor_set_id == dsi)
    descriptornames = pd.DataFrame(list(query3))

    for num,name in enumerate(descriptornames['label'], start=0):
        MyDataSet = MyDataSet.rename(columns={num:name})

    # drop columns that are not significant
    sigtxp = pd.DataFrame(sigtxp)
    sigtxp.columns = ['descriptors_name']
    siglabel = pd.merge(sigtxp, descriptornames, on='descriptors_name', how='inner')
    siglabel = list(siglabel['label'])

    for i in MyDataSet.columns[2:]:
        if i in siglabel:
            pass
        else:
            MyDataSet = MyDataSet.drop(i, axis=1)

    # MyDataSet.to_csv('{}{}.tsv'.format(mypath, myfilename), sep='\t', index=False)

    # return overall balanced accuracy calculations
    # can just make a unique confusion matrix for significant toxprints and add to CT-Enriched Stats file
    # print(MyDataSet.head())
    model_row = pd.DataFrame(
        [['Chemotype Full Model Coverage', myfilename,
          " ".join(sigtxp['descriptors_name']), 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]],
        columns=['Chemotype ID', 'Data Table', 'Chemotype Label',
                 'Total Chemotypes', 'True Positives', 'False Positives',
                 'False Negatives', 'True Negatives', 'Balanced Accuracy',
                 'Odds Ratio', 'P-Value', 'Inverse Odds Ratio', 'Inverse P-Value'])

    # fill model_row confusion matrix
    for index, row in MyDataSet.iterrows():
        rowsum = sum([int(x) for x in row.iloc[2:]])
        if row['Hit_Call'] == 1 and rowsum > 0:
            model_row['True Positives'] += 1
        elif row['Hit_Call'] == 1 and rowsum == 0:
            model_row['False Negatives'] += 1
        elif row['Hit_Call'] == 0 and rowsum > 0:
            model_row['False Positives'] += 1
        elif row['Hit_Call'] == 0 and rowsum == 0:
            model_row['True Negatives'] += 1

    # fill model_row statistics
    oddsratio, pvalue = stats.fisher_exact(
        [[int(model_row['True Positives']), int(model_row['False Positives'])],
         [int(model_row['False Negatives']), int(model_row['True Negatives'])]],
        alternative='greater')
    model_row['P-Value'] = pvalue
    model_row['Odds Ratio'] = oddsratio
    model_row['Total Chemotypes'] = (model_row['True Positives'] +
                                     model_row['False Positives'])
    BA = ((model_row['True Positives'] /
           (model_row['True Positives'] + model_row['False Negatives'])) +
          (model_row['True Negatives'] /
           (model_row['True Negatives'] + model_row['False Positives']))) / 2
    model_row['Balanced Accuracy'] = float(BA)
    inv_oddsratio, inv_pvalue = stats.fisher_exact(
        [[int(model_row['False Positives']), int(model_row['True Positives'])],
         [int(model_row['True Negatives']), int(model_row['False Negatives'])]],
        alternative='greater')
    model_row['Inverse P-Value'] = inv_pvalue
    model_row['Inverse Odds Ratio'] = inv_oddsratio

    # print(model_row)
    return model_row
from io import StringIO
import re

# FIRST NEED TO GET A TABLE index = AEID and columns = Assay Cats, Toxprints Enrichments (discrete hitcalls for each assay)
# going to do this in two parts
# part 1 get assay cats X aeids
mysession = SQLSession(Schemas.information_schema).get_session()

query0 = mysession.query(AssayComponentEndpoint.aeid,
                         AssayComponentEndpoint.assay_component_endpoint_name, AssayComponent.assay_component_desc,
                         AssayComponent.assay_component_target_desc,
                         AssayComponentEndpoint.assay_component_endpoint_desc,
                         AssayComponentEndpoint.assay_function_type, AssayComponentEndpoint.normalized_data_type,
                         AssayComponentEndpoint.analysis_direction, AssayComponentEndpoint.burst_assay,
                         AssayComponentEndpoint.key_positive_control, AssayComponentEndpoint.signal_direction,
                         AssayComponentEndpoint.intended_target_type, AssayComponentEndpoint.intended_target_type_sub,
                         AssayComponentEndpoint.intended_target_family,
                         AssayComponentEndpoint.intended_target_family_sub, AssayComponent.assay_design_type,
                         AssayComponent.assay_design_type_sub, AssayComponent.biological_process_target,
                         AssayComponent.detection_technology_type, AssayComponent.detection_technology_type_sub,
                         AssayComponent.detection_technology, AssayComponent.signal_direction_type,
                         AssayComponent.key_assay_reagent, AssayComponent.key_assay_reagent_type,
                         AssayComponent.technological_target_type, AssayComponent.technological_target_type_sub) \
    .join(AssayComponent, AssayComponent.acid == AssayComponentEndpoint.acid)

mytable = pd.DataFrame(list(query0))

mytable.to_csv("~/Desktop/Assay_Categories/assay_cats_x_aeids_v2.tsv",
               sep='\t')

sys.exit(0)
Example #23
import subprocess as subp

import pandas as pd

from database.database_schemas import Schemas
from database.dsstox.compounds import Compounds
from database.session import SQLSession

def mktoxprints(dsstox_id_list):
    mysession = SQLSession(Schemas.dsstox_schema).get_session()
    mydata = mysession.query(Compounds.dsstox_compound_id, Compounds.smiles)#.filter(Compounds.dsstox_compound_id.in_(dsstox_id_list))
    df = pd.DataFrame(list(mydata))
    # filter is too big need to merge instead
    idframe = pd.DataFrame(list(set(dsstox_id_list)), columns=['dsstox_compound_id'])
    df = pd.merge(idframe, df, on='dsstox_compound_id', how='left')
    print(len(dsstox_id_list), len(list(df['dsstox_compound_id'])))

    number_per_file = 50000
    files_so_far = 0
    ix = 0
    file_path = "/home/rlougee/Desktop/tmptmp{0}.smi"
    out_file = open(file_path.format(files_so_far), 'w')
    file_list = [file_path.format(files_so_far)]
    for index, row in df.iterrows():
        #handle null values
        if row['smiles'] is None:
            row['smiles'] = ''
        if ix % number_per_file == 0 and ix > 0:
            out_file.close()
            files_so_far += 1
            out_file = open(file_path.format(files_so_far), 'w')
            file_list.append(file_path.format(files_so_far))
        smile_file = row['smiles'] + "\t" + row['dsstox_compound_id'] + "\n"
        ix += 1
        out_file.write(smile_file)
    out_file.close()
    ## generate fingerprints
    ### FOR DEBUGGING: AN ACTUAL SMILEY FACE ☻ IS USED IN THIS FILE
    ### this is only used in BASH commands and IS necessary
    ### -L is a flag that identifies a file separator and most characters break the command. Also, multiple characters in a row don't work
    bashstring = ''
    for file in file_list:
        bashstring += "{0}{1}".format(str(file), str('☻'))
    command = '/opt/CORINA_Symphony/CORINA_Symphony_14698/bin/moses -N -L ☻ symphony batch -i {0} -o /share/home/rlougee/Desktop/results.tsv descriptors -f /opt/CORINA_Symphony/CORINA_Symphony_14698/content/symphony/toxprint_V2.0.xml'.format(bashstring)
    a = subp.Popen(command, shell=True)
    a.communicate()
    #import the toxprint file (pd.read_csv replaces the removed DataFrame.from_csv)
    toxprintdf = pd.read_csv('/share/home/rlougee/Desktop/results.tsv', sep='\t', index_col=0)

    #handle bad smiles
    # smiles renamed
    drop_list = []
    for index, row in toxprintdf.iterrows():
        if row['M_CORINA_SYMPHONY_ERRORS_[STRING]'] != 'No errors':
            drop_list.append(index)
        elif len(index) <= 6:
            drop_list.append(index)
        elif index[0:6] != 'DTXCID':
            drop_list.append(index)

    # drop bad rows
    toxprintdf = toxprintdf.drop(drop_list)

    # remove extra columns
    toxprintdf.drop('M_COMPOUND_HISTORY_[STRING]', axis=1, inplace=True)
    toxprintdf.drop('M_CORINA_SYMPHONY_ERRORS_[STRING]', axis=1, inplace=True)

    # remove temporary files
    b = subp.Popen('rm /share/home/rlougee/Desktop/tmptmp*.smi', shell=True)
    b.communicate()
    c = subp.Popen('rm /share/home/rlougee/Desktop/results.tsv', shell=True)
    c.communicate()

    return toxprintdf