Esempio n. 1
0
def ssmd_overview(norm_paths, pert_outfile, pert, threshold=2, exclude_wells=None):
    """Box-plot the SSMD values of all rows and of each row that fails QC.

    Every path in ``norm_paths`` is parsed with ``pe`` and scored with
    ``get_ssmd``; the result is stored as one column (keyed by the file's
    basename) of a combined table. A row "fails" when its median across
    those columns is strictly below ``threshold``. The figure holds one box
    for all values plus one box per failing row, ordered by ascending median,
    and is written to ``<pert_outfile>/<pert>_SSMD_overview.png``.

    Args:
        norm_paths: Indexable sequence of file paths; the first one is also
            parsed to look up the ``name`` row metadata used as box labels.
        pert_outfile: Directory the PNG is written into.
        pert: Label used in the plot title and the output file name.
        threshold: Median SSMD below which a row counts as failed.
        exclude_wells: Passed through to ``get_ssmd``. ``None`` (the default)
            is treated as an empty list.

    Returns:
        The number of failing rows (boxes drawn beyond the "all" box).
    """
    # A literal [] default would be shared between calls; build a fresh one.
    if exclude_wells is None:
        exclude_wells = []

    ssmds = pd.DataFrame()
    for path in norm_paths:
        gct = pe(path)
        ssmds[os.path.basename(path)] = get_ssmd(gct, exclude_wells=exclude_wells)

    # NOTE(review): `invariants` is not defined in this function; it is
    # presumably a module-level collection of invariant row ids -- confirm.
    get_names_gct = pe(norm_paths[0])
    row_meta = get_names_gct.row_metadata_df
    names = row_meta['name'][~row_meta.index.isin(invariants)]

    all_ssmds = ssmds.unstack().tolist()

    # Compute the failure mask once and reuse it for both values and names.
    failed_mask = ssmds.median(axis=1) < threshold
    failure_names = names.loc[failed_mask]
    # Re-bind instead of set_index(inplace=True) so we never mutate a slice.
    failures = ssmds.loc[failed_mask].set_index(failure_names)

    data = [all_ssmds]
    labels = ['All SSMDs \n n={}'.format(len(all_ssmds))]

    # Series.sort() is gone from modern pandas; sort_values() is the
    # supported spelling and returns the sorted Series.
    meds = failures.median(axis=1).sort_values(ascending=True)
    failures2 = failures.loc[meds.index]

    for index, row in failures2.iterrows():
        data.append(row.values)
        labels.append(index)

    plt.boxplot(data, labels=labels)
    plt.gcf().subplots_adjust(bottom=0.15)
    plt.xticks(rotation=70)
    plt.ylim(0, 10)
    plt.ylabel('SSMD Values')

    if len(data) > 1:
        # data[1] is one failing row, so its length is the number of files.
        plt.title('{} SSMD Distribution (n={}) with {} Failed Cell Lines (n={})'.format(
            pert, len(all_ssmds), len(meds), len(data[1])))
    else:
        plt.title('{} SSMD Distribution (n={}) with {} Failed Cell Lines'.format(
            pert, len(all_ssmds), len(meds)))

    plt.savefig(os.path.join(pert_outfile, '{}_SSMD_overview.png'.format(pert)))

    # Clear the current figure so the next call starts from a blank canvas.
    plt.clf()

    return len(data) - 1
Esempio n. 2
0
    def test_modz(self):
        """Run normalize -> zscore -> MODZ on three test plates and spot-check values."""
        # The X2 and X3 plates are parsed here; the first plate is `l`, which
        # is expected to be defined outside this method.
        plate_x2 = pe(
            'functional_tests/test_merino/assemble/test_CS0_X2/test_CS0_X2_MEDIAN.gct'
        )
        plate_x3 = pe(
            'functional_tests/test_merino/assemble/test_CS0_X3/test_CS0_X3_MEDIAN.gct'
        )

        normalized = [norm.normalize(plate) for plate in (l, plate_x2, plate_x3)]
        zscored = [
            zscore.calculate_zscore(plate, plate_control=True)
            for plate in normalized
        ]

        # calculate_modz returns three values; only the first is checked here.
        modz_gct, _, _ = distil.calculate_modz(zscored)

        modz_values = modz_gct.data_df
        checks = (
            ('test_CS0X1:A', 0, -0.5816),
            ('test_CS0X1:B', 2, -0.4991),
            ('test_CS0X1:D', 1, 1.001),
        )
        for column, position, expected in checks:
            assert round(modz_values[column][position], 4) == expected
Esempio n. 3
0
def get_failed_cell_lines(filepaths, exclude_wells=None):
    """Collect, per file, the row names whose SSMD is below 2 and below 1.5.

    Each path is parsed with ``pe`` and scored with ``get_ssmd``. The ``name``
    row metadata of the entries scoring strictly below each cutoff is stored
    under the file's basename.

    Args:
        filepaths: Iterable of file paths to parse.
        exclude_wells: Passed through to ``get_ssmd``. ``None`` (the default)
            is treated as an empty list.

    Returns:
        A 2-tuple ``(below_2, below_1_5)`` of dicts mapping file basename to
        the selected ``name`` values.
    """
    # A literal [] default would be shared between calls; build a fresh one.
    if exclude_wells is None:
        exclude_wells = []

    failed_lines_2_map = {}
    failed_lines_1_5_map = {}

    for path in filepaths:
        gct = pe(path)
        ssmds = get_ssmd(gct, exclude_wells=exclude_wells)
        # Look the metadata column and the dict key up once per file.
        names = gct.row_metadata_df['name']
        plate = os.path.basename(path)
        failed_lines_2_map[plate] = names.loc[ssmds[ssmds < 2].index]
        failed_lines_1_5_map[plate] = names.loc[ssmds[ssmds < 1.5].index]

    return failed_lines_2_map, failed_lines_1_5_map
Esempio n. 4
0
import weave
import merino
import cmapPy.pandasGEXpress.GCToo as GCToo
import cmapPy.pandasGEXpress.parse as pe
import unittest
import merino.setup_logger as setup_logger
import logging
import numpy
import os
import glob
import pandas as pd
import shutil

# Module-level logger, looked up by the name configured in merino.setup_logger.
logger = logging.getLogger(setup_logger.LOGGER_NAME)

# The X1 test plate, parsed once at import time and read by the tests
# (e.g. test_norm). The path is relative, so it presumably has to be resolved
# from the repository root -- confirm against how the tests are launched.
l = pe(
    'functional_tests/test_merino/assemble/test_CS0_X1/test_CS0_X1_MEDIAN.gct')


class TestMerino(unittest.TestCase):
    def test_norm(self):
        """Normalize the shared X1 test plate and spot-check the output."""
        normalized = norm.normalize(l)
        values = normalized.data_df

        # Column 'C' should have been removed due to low median invariant
        assert len(values.columns) == 4

        # Check values: (column id, row position, expected value to 4 places)
        checks = (
            ('test_CS0_X1:A', 0, 1.415),
            ('test_CS0_X1:B', 2, 0.7776),
            ('test_CS0_X1:D', 1, 3.4919),
        )
        for column, position, expected in checks:
            assert round(values[column][position], 4) == expected

    def test_zscore(self):
        #TODO split plate control and vehicle control into two functions
Esempio n. 5
0
import glob
import cmapPy.pandasGEXpress.parse as pe
import os


# Root of the PCAL_T3A work area; every input and output path hangs off it.
BASE_DIR = '/Volumes/cmap_obelix/pod/custom/PCAL/elwork/PCAL_T3A'
HEATMAP_ROOT = os.path.join(BASE_DIR, 'QC', 'pool_heatmaps')

# For every matching GCT, draw one heatmap per pool_id found in its row
# metadata and save it under HEATMAP_ROOT/<pool>/<plate_name>.png.
for path in glob.glob(os.path.join(BASE_DIR, '*ZSPC*', '*', '*.gct')):
    # The plate name is the directory that directly contains the GCT file.
    plate_name = os.path.basename(os.path.dirname(path))
    gct = pe(path)
    for pool in gct.row_metadata_df['pool_id'].unique():

        pool_dir = os.path.join(HEATMAP_ROOT, '{}'.format(pool))
        # makedirs rather than mkdir, so a missing QC/pool_heatmaps parent is
        # created instead of raising.
        if not os.path.isdir(pool_dir):
            os.makedirs(pool_dir)

        # Rows of the data matrix whose metadata places them in this pool.
        pool_rows = gct.row_metadata_df[gct.row_metadata_df['pool_id'] == pool].index
        pool_df = gct.data_df.loc[pool_rows]

        # Keep only the first two ':'-separated fields of each column id,
        # splitting each id once.
        pool_df.columns = [
            parts[0] + ':' + parts[1]
            for parts in (x.split(':') for x in pool_df.columns)
        ]

        # NOTE(review): `map` is not imported in the visible part of this
        # script; it is presumably a project plotting module bound to that
        # name (shadowing the builtin) -- confirm the import.
        map.mk_heatmap(pool_df, 'Heatmap of Median ZSCORE Values for {}'.format(plate_name),
                       os.path.join(pool_dir, '{}.png'.format(plate_name)),
                       lims=[-10, 10], colormap='coolwarm')
Esempio n. 6
0
def longform_table(
    siginfo_path,
    row_meta_path,
    pcl_connect_path,
    pert_connect_path,
    pcl_gmt_path='/Volumes/cmap_obelix/pod/custom/ASG/sas/ASG_C_V3/pcl_connectivity/ASG_PCL_xicon_n106.gmt'
):
    """Build one table per signature combining TAS, SSMD and connectivity.

    Steps, in order:

    1. Read the signature info and row metadata tables.
    2. For every ``cell_id`` and every ``trt_cp`` compound name, compute an
       SSMD over ``distil_tas`` via ``ssmd_an.get_ssmd``.
    3. Build the base table (``tas``, ``pool_id``, ``dose``, ``ssmd``) for
       signatures whose pool is present in the row metadata, skipping DMSO.
    4. Build PCL score/rank rows from ``pcl_connect_path`` using the
       membership read from ``pcl_gmt_path``.
    5. Build compound self-connectivity score/rank rows from
       ``pert_connect_path``.
    6. Join 4 and 5 onto 3 by signature id.

    Args:
        siginfo_path: Delimited text file with at least the columns
            ``sig_id``, ``det_well``, ``cell_id``, ``distil_tas``,
            ``pert_type``, ``pert_iname``, ``pert_id`` and ``pert_idose``.
        row_meta_path: Delimited text file with ``rid``, ``minipool_id`` and
            ``pool_id`` columns.
        pcl_connect_path: File parsed with ``pe``; columns are looked up in
            the signature info by ``sig_id``.
        pert_connect_path: File parsed with ``pe``; same column convention.
        pcl_gmt_path: GMT file read with ``gmt.read``.

    Returns:
        A DataFrame indexed by signature id with the base columns plus
        ``cp_rank``, ``cp_score``, ``pcl_rank``, ``pcl_score`` and ``PCLs``.
    """
    siginfo = pd.read_table(siginfo_path)
    row_meta = pd.read_table(row_meta_path, index_col='rid')
    row_meta.index = [str(x) for x in row_meta.index]
    # 'P' plus the third space-separated token of minipool_id.
    row_meta['minipool'] = [
        'P' + x.split(' ')[2] for x in row_meta['minipool_id']
    ]
    siginfo.set_index('sig_id', inplace=True)
    siginfo['pert_well'] = siginfo['det_well']

    # The compound names do not depend on the cell id, so compute them once.
    compound_names = siginfo[siginfo['pert_type'] ==
                             'trt_cp']['pert_iname'].unique()

    diff_tas_dict = {}
    for x in siginfo['cell_id'].unique():
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print(' ')
        print(x)
        temp_col = siginfo[siginfo['cell_id'] == x]

        # One-row GCToo: the single row is distil_tas, columns are signatures.
        data = GCToo.GCToo(data_df=pd.DataFrame(temp_col['distil_tas']).T,
                           col_metadata_df=temp_col,
                           row_metadata_df=pd.DataFrame(index=['distil_tas']))
        diff_tas_dict[x] = {}
        for y in compound_names:
            ssmd = ssmd_an.get_ssmd(df=data, pos_field='pert_iname', pos_val=y)
            diff_tas_dict[x][y] = ssmd.values[0]

    # Rows are compound names, columns are cell ids.
    diff_tas = pd.DataFrame(diff_tas_dict)

    known_pools = row_meta['pool_id'].values
    base_rows = {}
    for comp in siginfo.index:
        cell_id = siginfo.loc[comp, 'cell_id']
        # The pool id is the second '.'-separated field of cell_id.
        pool_id = cell_id.split('.')[1]
        if pool_id not in known_pools:
            continue
        vals = siginfo.loc[comp, ['pert_id', 'pert_idose']].values
        if vals[0] == 'DMSO':
            continue
        # Normalise the one dose that is spelled in micromolar.
        if vals[1] == '0.1 um':
            vals[1] = '100 nm'
        base_rows[comp] = {
            'tas': siginfo.loc[comp, 'distil_tas'],
            'pool_id': pool_id,
            'dose': vals[1],
            'ssmd': diff_tas.loc[siginfo.loc[comp, 'pert_iname'], cell_id],
        }

    master_table = pd.DataFrame(base_rows).T

    # NOTE(review): gmt.read presumably yields records with 'id' and 'sig'
    # keys, since those are the only fields used below -- confirm.
    pcl_map = gmt.read(pcl_gmt_path)
    pcl_map_df = pd.DataFrame(pcl_map)
    pcl_map_df.set_index('id', inplace=True)
    pcl = pe(pcl_connect_path)
    pert_ps = pe(pert_connect_path)
    # Rank within each column, highest score first, ties share the lowest rank.
    pcl_rankz = pcl.data_df.rank(method='min', ascending=False)
    pert_rankz = pert_ps.data_df.rank(method='min', ascending=False)

    pcl_rows = {}
    for x in pcl.data_df.columns:
        pert_id = siginfo.loc[x, 'pert_id']
        pert_iname = siginfo.loc[x, 'pert_iname']
        dose = siginfo.loc[x, 'pert_idose']
        if pert_id not in pcl_map_df.index.values:
            continue
        lookup = pcl_map_df.loc[pert_id, 'sig']
        for thing in lookup:
            # Temporary "<sig_id> <member>" key; split apart again below.
            ident = x + ' ' + thing
            pcl_rows[ident] = {
                'sig_id': x,
                'pert_id': pert_id,
                'pert_iname': pert_iname,
                'dose': dose,
                'pcl_score': pcl.data_df.loc[thing, x],
                'pcl_rank': pcl_rankz.loc[thing, x],
                'pool_id': siginfo.loc[x, 'cell_id'].split('.')[1],
            }
    pcl_scrank_df = pd.DataFrame(pcl_rows).T
    pcl_scrank_df['PCLs'] = [x.split(' ')[1] for x in pcl_scrank_df.index]
    pcl_scrank_df.index = [x.split(' ')[0] for x in pcl_scrank_df.index]

    pert_rows = {}
    for x in pert_ps.data_df.columns:
        pert_id = siginfo.loc[x, 'pert_id']
        pert_iname = siginfo.loc[x, 'pert_iname']
        dose = siginfo.loc[x, 'pert_idose']
        if pert_id not in pert_ps.data_df.index.values:
            continue
        pert_rows[x] = {
            'pert_id': pert_id,
            'pert_iname': pert_iname,
            'dose': dose,
            'cp_score': pert_ps.data_df.loc[pert_id, x],
            'cp_rank': pert_rankz.loc[pert_id, x],
            'pool_id': siginfo.loc[x, 'cell_id'].split('.')[1],
        }
    pert_scrank_df = pd.DataFrame(pert_rows).T

    # Both joins are left joins on the signature-id index.
    master_table = master_table.join(pert_scrank_df[['cp_rank', 'cp_score']])
    master_table = master_table.join(
        pcl_scrank_df[['pcl_rank', 'pcl_score', 'PCLs']])

    return master_table
Esempio n. 7
0
import glob
import cmapPy.pandasGEXpress.parse as pe
import cmapPy.pandasGEXpress.write_gct as wgct

for path in glob.glob(
        '/Volumes/cmap_obelix/pod/custom/PCAL/elwork/PCAL_T3A/assemble/*/*.gct'
):
    print path
    ztest = pe(path)
    ztest.col_metadata_df['gb_id'] = ztest.col_metadata_df['pert_id']
    ztest.col_metadata_df.loc[ztest.col_metadata_df['pert_iname'] ==
                              'Bortezomib', 'pert_type'] = 'trt_poscon'
    ztest.col_metadata_df.loc[
        ztest.col_metadata_df['pert_type'] == 'trt_poscon',
        'gb_id'] = ztest.col_metadata_df.loc[
            ztest.col_metadata_df['pert_type'] == 'trt_poscon', 'gb_id'] + [
                ':' + str(x) for x in range(
                    1,
                    len(ztest.col_metadata_df.loc[
                        ztest.col_metadata_df['pert_type'] == 'trt_poscon',
                        'gb_id']) + 1)
            ]
    ztest.col_metadata_df.loc[
        ztest.col_metadata_df['pert_type'] == 'ctl_vehicle',
        'gb_id'] = ztest.col_metadata_df.loc[
            ztest.col_metadata_df['pert_type'] == 'ctl_vehicle', 'gb_id'] + [
                ':' + str(x) for x in range(
                    1,
                    len(ztest.col_metadata_df.loc[
                        ztest.col_metadata_df['pert_type'] == 'ctl_vehicle',
                        'gb_id']) + 1)