def ssmd_overview(norm_paths, pert_outfile, pert, threshold=2, exclude_wells=None):
    """Draw a box plot of SSMD values and highlight rows that fall below a threshold.

    For every file in ``norm_paths`` the GCT is parsed with ``pe`` and passed
    to ``get_ssmd``; the results are collected as one column per file (keyed
    by the file's basename).  Rows whose median across files is below
    ``threshold`` are treated as failures and each gets its own box next to
    the box holding all values.  The figure is saved as
    ``<pert_outfile>/<pert>_SSMD_overview.png`` and the current figure is
    cleared afterwards.

    Args:
        norm_paths: indexable sequence of GCT file paths; the first entry is
            parsed a second time to obtain the row ``name`` metadata.
        pert_outfile: directory the PNG is written to.
        pert: label used in the plot title and the output file name.
        threshold: rows with a median below this value count as failures.
        exclude_wells: forwarded to ``get_ssmd``; ``None`` (default) is
            treated as an empty list.

    Returns:
        Number of failed rows (boxes drawn in addition to the overall box).
    """
    # Avoid sharing one mutable default list between calls.
    if exclude_wells is None:
        exclude_wells = []

    ssmds = pd.DataFrame()
    for path in norm_paths:
        ssmds[os.path.basename(path)] = get_ssmd(pe(path), exclude_wells=exclude_wells)

    # Row names come from the first file, minus the ids listed in `invariants`.
    # NOTE(review): `invariants` is defined outside this block -- presumably a
    # module-level collection of row ids; confirm against the rest of the file.
    name_source = pe(norm_paths[0])
    row_meta = name_source.row_metadata_df
    names = row_meta['name'][~row_meta.index.isin(invariants)]

    all_ssmds = ssmds.unstack().tolist()

    # Compute the failure mask once and reuse it for the values and the names.
    below_threshold = ssmds.median(axis=1) < threshold
    failures = ssmds.loc[below_threshold].set_index(names.loc[below_threshold])

    # Series.sort() no longer exists in pandas; sort_values() is the replacement.
    meds = failures.median(axis=1).sort_values(ascending=True)
    failures = failures.loc[meds.index]

    data = [all_ssmds]
    labels = ['All SSMDs \n n={}'.format(len(all_ssmds))]
    for failed_name, row in failures.iterrows():
        data.append(row.values)
        labels.append(failed_name)

    plt.boxplot(data, labels=labels)
    plt.gcf().subplots_adjust(bottom=0.15)
    plt.xticks(rotation=70)
    plt.ylim(0, 10)
    plt.ylabel('SSMD Values')
    if len(data) > 1:
        # len(data[1]) is the number of values behind each failed-row box.
        plt.title('{} SSMD Distribution (n={}) with {} Failed Cell Lines (n={})'.format(
            pert, len(all_ssmds), len(meds), len(data[1])))
    else:
        plt.title('{} SSMD Distribution (n={}) with {} Failed Cell Lines'.format(
            pert, len(all_ssmds), len(meds)))
    plt.savefig(os.path.join(pert_outfile, '{}_SSMD_overview.png'.format(pert)))
    plt.clf()

    return len(data) - 1
def test_modz(self):
    """Run normalize -> zscore -> modz over three replicate plates and spot-check values."""
    # Parse the two extra replicates first; the X1 plate is the module-level `l`.
    plate_x2 = pe(
        'functional_tests/test_merino/assemble/test_CS0_X2/test_CS0_X2_MEDIAN.gct'
    )
    plate_x3 = pe(
        'functional_tests/test_merino/assemble/test_CS0_X3/test_CS0_X3_MEDIAN.gct'
    )
    plates = [l, plate_x2, plate_x3]

    # Normalize every plate before z-scoring any of them.
    normalized = [norm.normalize(plate) for plate in plates]
    zscored = [
        zscore.calculate_zscore(plate, plate_control=True) for plate in normalized
    ]

    modz_gct, _cc_q74, _weights = distil.calculate_modz(zscored)

    # (column id, row position, expected value rounded to 4 places)
    expected_values = (
        ('test_CS0X1:A', 0, -0.5816),
        ('test_CS0X1:B', 2, -0.4991),
        ('test_CS0X1:D', 1, 1.001),
    )
    for column, row, expected in expected_values:
        assert round(modz_gct.data_df[column][row], 4) == expected
def get_failed_cell_lines(filepaths, exclude_wells=None):
    """Collect, per file, the row names whose SSMD is below 2 and below 1.5.

    Each path is parsed with ``pe`` and scored with ``get_ssmd``; the ``name``
    row metadata of the rows under each cut-off is stored under the file's
    basename.

    Args:
        filepaths: iterable of GCT file paths.
        exclude_wells: forwarded to ``get_ssmd``; ``None`` (default) is
            treated as an empty list.

    Returns:
        Tuple ``(failed_below_2, failed_below_1_5)`` of dicts mapping file
        basename to the ``name`` entries of the failing rows.
    """
    # Avoid sharing one mutable default list between calls.
    if exclude_wells is None:
        exclude_wells = []

    strict_cutoff = 2
    lenient_cutoff = 1.5

    failed_lines_2_map = {}
    failed_lines_1_5_map = {}
    for path in filepaths:
        gct = pe(path)
        ssmds = get_ssmd(gct, exclude_wells=exclude_wells)
        key = os.path.basename(path)
        row_meta = gct.row_metadata_df
        # Single .loc lookup (rows, column) instead of chained indexing.
        failed_lines_2_map[key] = row_meta.loc[ssmds[ssmds < strict_cutoff].index, 'name']
        failed_lines_1_5_map[key] = row_meta.loc[ssmds[ssmds < lenient_cutoff].index, 'name']

    return failed_lines_2_map, failed_lines_1_5_map
import weave import merino import cmapPy.pandasGEXpress.GCToo as GCToo import cmapPy.pandasGEXpress.parse as pe import unittest import merino.setup_logger as setup_logger import logging import numpy import os import glob import pandas as pd import shutil logger = logging.getLogger(setup_logger.LOGGER_NAME) l = pe( 'functional_tests/test_merino/assemble/test_CS0_X1/test_CS0_X1_MEDIAN.gct') class TestMerino(unittest.TestCase): def test_norm(self): r = norm.normalize(l) # Column 'C' should have been removed due to low median invariant assert len(r.data_df.columns) == 4 # Check values assert round(r.data_df['test_CS0_X1:A'][0], 4) == 1.415 assert round(r.data_df['test_CS0_X1:B'][2], 4) == 0.7776 assert round(r.data_df['test_CS0_X1:D'][1], 4) == 3.4919 def test_zscore(self): #TODO split plate control and vehicle control into two functions
import glob
import cmapPy.pandasGEXpress.parse as pe
import os

# Project root and derived locations, named once instead of being repeated
# as string literals throughout the loop.
_PROJECT_DIR = '/Volumes/cmap_obelix/pod/custom/PCAL/elwork/PCAL_T3A'
_ZSPC_GCT_GLOB = _PROJECT_DIR + '/*ZSPC*/*/*.gct'
_POOL_HEATMAP_DIR = _PROJECT_DIR + '/QC/pool_heatmaps'

# For every matching GCT, write one heat map per pool_id found in its row
# metadata, grouped into one output directory per pool.
for path in glob.glob(_ZSPC_GCT_GLOB):
    # The plate name is the name of the directory that contains the GCT.
    plate_name = os.path.basename(os.path.dirname(path))
    gct = pe(path)

    for pool in gct.row_metadata_df['pool_id'].unique():
        pool_dir = '{}/{}'.format(_POOL_HEATMAP_DIR, pool)
        if not os.path.exists(pool_dir):
            os.mkdir(pool_dir)

        # Keep only the data rows that belong to this pool.
        pool_rows = gct.row_metadata_df[gct.row_metadata_df['pool_id'] == pool].index
        pool_df = gct.data_df.loc[pool_rows]

        # Shorten each column id to its first two ':'-separated fields.
        short_ids = []
        for column_id in pool_df.columns:
            fields = column_id.split(':')
            short_ids.append(fields[0] + ':' + fields[1])
        pool_df.columns = short_ids

        # NOTE(review): `map` is not imported in the visible code, so this
        # resolves to the builtin unless a module is bound to that name
        # elsewhere -- confirm which module provides mk_heatmap.
        map.mk_heatmap(pool_df,
                       'Heatmap of Median ZSCORE Values for {}'.format(plate_name),
                       '{}/{}.png'.format(pool_dir, plate_name),
                       lims=[-10, 10],
                       colormap='coolwarm')
def _longform_ssmd_by_cell(siginfo):
    """Return a DataFrame of SSMD values: one row per pert_iname, one column per cell_id."""
    # Loop-invariant: names of every 'trt_cp' perturbagen in the table.
    compounds = siginfo[siginfo['pert_type'] == 'trt_cp']['pert_iname'].unique()

    ssmd_by_cell = {}
    for cell in siginfo['cell_id'].unique():
        # Progress output; single-argument print() behaves the same on
        # Python 2 and Python 3.
        print(' ')
        print(cell)
        cell_sigs = siginfo[siginfo['cell_id'] == cell]
        # One-row GCToo holding the distil_tas values of this cell's signatures.
        tas_gct = GCToo.GCToo(data_df=pd.DataFrame(cell_sigs['distil_tas']).T,
                              col_metadata_df=cell_sigs,
                              row_metadata_df=pd.DataFrame(index=['distil_tas']))
        ssmd_by_cell[cell] = {}
        for compound in compounds:
            ssmd = ssmd_an.get_ssmd(df=tas_gct, pos_field='pert_iname',
                                    pos_val=compound)
            ssmd_by_cell[cell][compound] = ssmd.values[0]
    return pd.DataFrame(ssmd_by_cell)


def _longform_signature_table(siginfo, row_meta, diff_tas):
    """Return per-signature tas/pool_id/dose/ssmd rows for non-DMSO signatures in known pools."""
    known_pools = set(row_meta['pool_id'].values)

    rows = {}
    for sig_id in siginfo.index:
        cell_id = siginfo.loc[sig_id, 'cell_id']
        # The pool id is the second '.'-separated token of cell_id.
        pool_id = cell_id.split('.')[1]
        if pool_id not in known_pools:
            continue
        pert_id, dose = siginfo.loc[sig_id, ['pert_id', 'pert_idose']].values
        if dose == '0.1 um':
            dose = '100 nm'
        if pert_id == 'DMSO':
            continue
        rows[sig_id] = {
            'tas': siginfo.loc[sig_id, 'distil_tas'],
            'pool_id': pool_id,
            'dose': dose,
            'ssmd': diff_tas.loc[siginfo.loc[sig_id, 'pert_iname'], cell_id],
        }
    return pd.DataFrame(rows).T


def _longform_pcl_table(siginfo, pcl, pcl_map_df):
    """Return one row per (signature, PCL) pair with the PCL score and its rank."""
    pcl_rankz = pcl.data_df.rank(method='min', ascending=False)

    rows = {}
    for sig_id in pcl.data_df.columns:
        pert_id = siginfo.loc[sig_id, 'pert_id']
        pert_iname = siginfo.loc[sig_id, 'pert_iname']
        dose = siginfo.loc[sig_id, 'pert_idose']
        if pert_id not in pcl_map_df.index.values:
            continue
        pool_id = siginfo.loc[sig_id, 'cell_id'].split('.')[1]
        for pcl_id in pcl_map_df.loc[pert_id, 'sig']:
            # Temporary composite key '<sig_id> <pcl_id>'; split again below.
            rows[sig_id + ' ' + pcl_id] = {
                'sig_id': sig_id,
                'pert_id': pert_id,
                'pert_iname': pert_iname,
                'dose': dose,
                'pcl_score': pcl.data_df.loc[pcl_id, sig_id],
                'pcl_rank': pcl_rankz.loc[pcl_id, sig_id],
                'pool_id': pool_id,
            }

    table = pd.DataFrame(rows).T
    table['PCLs'] = [key.split(' ')[1] for key in table.index]
    table.index = [key.split(' ')[0] for key in table.index]
    return table


def _longform_pert_table(siginfo, pert_ps):
    """Return one row per signature with the score and rank against its own pert_id."""
    pert_rankz = pert_ps.data_df.rank(method='min', ascending=False)

    rows = {}
    for sig_id in pert_ps.data_df.columns:
        pert_id = siginfo.loc[sig_id, 'pert_id']
        pert_iname = siginfo.loc[sig_id, 'pert_iname']
        dose = siginfo.loc[sig_id, 'pert_idose']
        if pert_id not in pert_ps.data_df.index.values:
            continue
        rows[sig_id] = {
            'pert_id': pert_id,
            'pert_iname': pert_iname,
            'dose': dose,
            'cp_score': pert_ps.data_df.loc[pert_id, sig_id],
            'cp_rank': pert_rankz.loc[pert_id, sig_id],
            'pool_id': siginfo.loc[sig_id, 'cell_id'].split('.')[1],
        }
    return pd.DataFrame(rows).T


def longform_table(
        siginfo_path,
        row_meta_path,
        pcl_connect_path,
        pert_connect_path,
        pcl_gmt_path='/Volumes/cmap_obelix/pod/custom/ASG/sas/ASG_C_V3/pcl_connectivity/ASG_PCL_xicon_n106.gmt'
):
    """Build a long-form table joining per-signature stats with connectivity scores.

    The signature table supplies ``tas``, ``pool_id``, ``dose`` and a
    per-(pert_iname, cell_id) ``ssmd`` value; these rows are then joined with
    the compound scores/ranks (``cp_score``, ``cp_rank``) and the PCL
    scores/ranks (``pcl_score``, ``pcl_rank``, ``PCLs``).  Because a signature
    can map to several PCLs, a signature may appear on several rows.

    Args:
        siginfo_path: table read with ``pd.read_table``; must provide the
            columns used here (``sig_id``, ``cell_id``, ``det_well``,
            ``distil_tas``, ``pert_type``, ``pert_iname``, ``pert_id``,
            ``pert_idose``).
        row_meta_path: table read with ``pd.read_table`` and indexed by
            ``rid``; must provide ``minipool_id`` and ``pool_id``.
        pcl_connect_path: GCT parsed with ``pe`` whose columns are signature
            ids and whose rows are looked up by the GMT's ``sig`` entries.
        pert_connect_path: GCT parsed with ``pe`` whose columns are signature
            ids and whose rows are looked up by ``pert_id``.
        pcl_gmt_path: GMT file read with ``gmt.read``; its ``id`` field is
            matched against ``pert_id``.

    Returns:
        pandas DataFrame indexed by signature id.
    """
    siginfo = pd.read_table(siginfo_path)
    row_meta = pd.read_table(row_meta_path, index_col='rid')
    row_meta.index = [str(x) for x in row_meta.index]
    # Not read again in this function; kept so malformed minipool_id values
    # still fail here as they did before.
    row_meta['minipool'] = [
        'P' + x.split(' ')[2] for x in row_meta['minipool_id']
    ]
    siginfo.set_index('sig_id', inplace=True)
    # NOTE(review): pert_well is not read in this function; presumably
    # ssmd_an.get_ssmd expects it in the column metadata -- confirm.
    siginfo['pert_well'] = siginfo['det_well']

    diff_tas = _longform_ssmd_by_cell(siginfo)
    master_table = _longform_signature_table(siginfo, row_meta, diff_tas)

    pcl_map_df = pd.DataFrame(gmt.read(pcl_gmt_path))
    pcl_map_df.set_index('id', inplace=True)
    pcl = pe(pcl_connect_path)
    pert_ps = pe(pert_connect_path)

    pcl_scrank_df = _longform_pcl_table(siginfo, pcl, pcl_map_df)
    pert_scrank_df = _longform_pert_table(siginfo, pert_ps)

    master_table = master_table.join(pert_scrank_df[['cp_rank', 'cp_score']])
    master_table = master_table.join(
        pcl_scrank_df[['pcl_rank', 'pcl_score', 'PCLs']])
    return master_table
import glob import cmapPy.pandasGEXpress.parse as pe import cmapPy.pandasGEXpress.write_gct as wgct for path in glob.glob( '/Volumes/cmap_obelix/pod/custom/PCAL/elwork/PCAL_T3A/assemble/*/*.gct' ): print path ztest = pe(path) ztest.col_metadata_df['gb_id'] = ztest.col_metadata_df['pert_id'] ztest.col_metadata_df.loc[ztest.col_metadata_df['pert_iname'] == 'Bortezomib', 'pert_type'] = 'trt_poscon' ztest.col_metadata_df.loc[ ztest.col_metadata_df['pert_type'] == 'trt_poscon', 'gb_id'] = ztest.col_metadata_df.loc[ ztest.col_metadata_df['pert_type'] == 'trt_poscon', 'gb_id'] + [ ':' + str(x) for x in range( 1, len(ztest.col_metadata_df.loc[ ztest.col_metadata_df['pert_type'] == 'trt_poscon', 'gb_id']) + 1) ] ztest.col_metadata_df.loc[ ztest.col_metadata_df['pert_type'] == 'ctl_vehicle', 'gb_id'] = ztest.col_metadata_df.loc[ ztest.col_metadata_df['pert_type'] == 'ctl_vehicle', 'gb_id'] + [ ':' + str(x) for x in range( 1, len(ztest.col_metadata_df.loc[ ztest.col_metadata_df['pert_type'] == 'ctl_vehicle', 'gb_id']) + 1)