def mk_report(proj_folder, out_folder):
    # Read expected-sensitivity ranks and express them as percentages of a 384-well plate.
    mar_sense = pd.read_table(
        os.path.join(out_folder, 'sense/expected_sensitivity_ranks.txt'),
        index_col='det_plate')
    mar_sense = mar_sense / 384 * 100

    # Collapse replicate columns to a per-prefix median (columns are named '<prefix>_<suffix>').
    for x in pd.Series([y.split('_')[0] for y in mar_sense.columns]).unique():
        mar_sense[x] = mar_sense[[p for p in mar_sense.columns if p.startswith(x)]].median(axis=1)
    mar_sense = mar_sense[pd.Series([y.split('_')[0] for y in mar_sense.columns]).unique()]

    # Concatenate the NORM GCTs and the MEDIAN (MFI) GCTs across plates.
    gct_list = [pe.parse(x) for x in glob.glob(os.path.join(proj_folder, 'card/*/*NORM*'))]
    norm_gct = concat.hstack(gct_list)
    gct_list = [pe.parse(x) for x in glob.glob(os.path.join(proj_folder, 'assemble/*/*MEDIAN*'))]
    mfi_gct = concat.hstack(gct_list)

    n_recovered = []
    invs = []
    beadsets = []
    plate = []
    med_rank = []
    dropouts = []
    for det_plate in mar_sense.index:
        temp = norm_gct.data_df[[x for x in norm_gct.data_df.columns if x.startswith(det_plate)]]
        dropouts.append(384 - temp.shape[1])
        # A signature counts as recovered if its expected-sensitivity rank is in the top half.
        sigs_recovered = mar_sense.loc[det_plate].dropna()[mar_sense.loc[det_plate].dropna() < 50].count()
        median_rank = mar_sense.loc[det_plate].median()
        # Median invariant-analyte (c-661..c-664) MFI for the plate.
        temp = mfi_gct.data_df[[x for x in mfi_gct.data_df.columns if x.startswith(det_plate)]]
        median_inv = temp.loc[['c-661', 'c-662', 'c-663', 'c-664']].median(axis=1).median()
        beadset = det_plate.split('_')[-1].split(':')[0]
        n_recovered.append(sigs_recovered)
        invs.append(median_inv)
        beadsets.append(beadset)
        plate.append(det_plate)
        med_rank.append(median_rank)

    mar_df = pd.concat(
        [pd.Series(plate).rename('det_plate'),
         pd.Series(n_recovered).rename('sigs_recovered_core'),
         pd.Series(med_rank).rename('median_rank_core'),
         pd.Series(invs).rename('median_inv'),
         pd.Series(dropouts).rename('n_dropouts'),
         pd.Series(beadsets).rename('beadset')],
        axis=1)
    mar_df.set_index('det_plate', inplace=True)

    # Count SSMD failures (SSMD < 2) per plate from the NORM files.
    mar_ssmd = ssmd_an.ssmd_matrix(
        norm_paths=glob.glob(os.path.join(proj_folder, 'card/*/*NORM*')))
    mar_df = mar_df.join(mar_ssmd[mar_ssmd < 2].count().rename('ssmd_failures'))
    return mar_df
def run_sensitivities(proj_folder, gmt_path, out_folder):
    # Concatenate all ZSPC GCTs and run the sensitivity analysis,
    # writing results to <out_folder>/sense.
    gct_list = [pe.parse(x) for x in glob.glob(os.path.join(proj_folder, 'card/*/*ZSPC.gct'))]
    fail_gct = concat.hstack(gct_list)
    if not os.path.exists(os.path.join(out_folder, 'sense')):
        os.mkdir(os.path.join(out_folder, 'sense'))
    sense.wtks(gct=fail_gct,
               metadata=fail_gct.col_metadata_df,
               outfolder=os.path.join(out_folder, 'sense'),
               group_col='prism_replicate',
               gmt_path=gmt_path)
def combat_by_group(gct_list, col_group='pert_well', batch_field='pool_id', use_col_group_as_batch=True):
    """
    Applies the ComBat batch-adjustment algorithm to a list of input GCT objects,
    grouped by the specified column grouping.

    The method first concatenates the input list of GCT objects, splits the columns
    by col_group, and applies ComBat to the unwrapped matrix values, using either
    batch_field plus the column identity (if use_col_group_as_batch is True) or
    batch_field alone otherwise.

    Args:
        gct_list: list of GCT objects
        col_group: column metadata field(s) to group columns by
        batch_field: row metadata field used to specify the batches
        use_col_group_as_batch: if True, the column identity is appended to the
            batch vector, so the number of batches for a group equals
            (number of unique batch_field entries) * (number of columns in the group)

    Returns:
        all_ds: concatenated ComBat-adjusted values
        combat_adjusted_gcts: list of GCT objects holding the subsets of all_ds
            that match the entries of gct_list
    """
    # Concatenate replicate datasets by column, dropping plate-specific metadata
    # fields that differ between replicates.
    LOGGER.info("now running ComBat batch adjustment")
    fields_to_remove = [x for x in gct_list[0].row_metadata_df.columns
                        if x in ['det_plate', 'det_plate_scan_time', 'assay_plate_barcode']]
    all_ds = cg.hstack(gct_list, remove_all_metadata_fields=False,
                       error_report_file=None, fields_to_remove=fields_to_remove)

    # Split the data into one chunk per column group and adjust the chunks in parallel.
    pool = mp.Pool(processes=mp.cpu_count())
    chunks = data_splitter(all_ds, col_group, batch_field, use_col_group_as_batch)
    LOGGER.info("data split into chunks, running ComBat workers")
    adjusted_data = pool.map(combat_worker, chunks)
    pool.close()
    LOGGER.info("chunk adjustment complete")

    # Write the adjusted values back into the concatenated dataset.
    for res in adjusted_data:
        all_ds.data_df[res.columns] = res
    LOGGER.info("adjusted values merged")

    # Slice the adjusted dataset back into GCTs matching the input list.
    combat_adjusted_gcts = []
    for input_ds in gct_list:
        this_ds = gct_slice(all_ds,
                            rid=input_ds.data_df.index.tolist(),
                            cid=input_ds.data_df.columns.tolist())
        this_ds.src = input_ds.src
        this_ds.data_df = this_ds.data_df.astype(float)
        combat_adjusted_gcts.append(this_ds)
    LOGGER.info("ComBat adjustment complete")
    return all_ds, combat_adjusted_gcts
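# Illustrative sketch (not part of the pipeline): how the per-value batch vector
# described in combat_by_group's docstring can be assembled for one column group.
# The function name and approach here are hypothetical; the real work happens in
# data_splitter/combat_worker, which are defined elsewhere.
import numpy as np
import pandas as pd

def example_batch_vector(data_df, row_metadata_df, batch_field='pool_id',
                         use_col_group_as_batch=True):
    # Unwrap the analytes x columns matrix column-by-column into one long vector.
    values = pd.Series(data_df.values.ravel(order='F'))
    # Each value inherits the batch label of its analyte's row metadata...
    row_batches = row_metadata_df[batch_field].astype(str).values
    batches = pd.Series(np.tile(row_batches, data_df.shape[1]))
    if use_col_group_as_batch:
        # ...optionally combined with the identity of the column it came from, giving
        # (unique batch_field entries) * (columns in group) batches, as the docstring states.
        col_ids = pd.Series(np.repeat(data_df.columns.values, data_df.shape[0])).astype(str)
        batches = batches + ':' + col_ids
    return values, batches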
def build(search_pattern, outfile, file_suffix, cut=True, check_size=False):
    gct_list = glob.glob(search_pattern)
    old_len = len(gct_list)

    # Optionally drop old lysate plates before concatenation.
    if cut:
        gct_list = cut_to_l2.cut_l1(gct_list)
    new_len = len(gct_list)
    logger.info('Number of old lysate plates removed = {}'.format(old_len - new_len))
    if new_len == 0:
        return

    gcts = []
    failure_list = []
    for gct in gct_list:
        temp = pe.parse(gct)
        gcts.append(temp)
        # Flag plates that lost too many wells (fewer than 350 of 384 columns).
        if check_size and temp.data_df.shape[1] <= 349:
            failure_list.append(os.path.basename(gct).replace('_NORM.gct', ''))

    # Force identical row metadata across plates so hstack does not have to reconcile it.
    for ct in gcts:
        ct.row_metadata_df = gcts[0].row_metadata_df

    fields_to_remove = [x for x in gcts[0].row_metadata_df.columns
                        if x in ['det_plate', 'det_plate_scan_time', 'assay_plate_barcode']]
    concat_gct = cg.hstack(gcts, False, None, fields_to_remove=fields_to_remove)

    # Write a copy without metadata; the filename encodes the matrix dimensions.
    concat_gct_wo_meta = GCToo.GCToo(
        data_df=concat_gct.data_df,
        row_metadata_df=pd.DataFrame(index=concat_gct.data_df.index),
        col_metadata_df=pd.DataFrame(index=concat_gct.col_metadata_df.index))
    logger.debug("gct shape without metadata: {}".format(concat_gct_wo_meta.data_df.shape))
    wgx.write(concat_gct_wo_meta,
              outfile + 'n{}x{}'.format(concat_gct.data_df.shape[1],
                                        concat_gct.data_df.shape[0]) + file_suffix)
    return concat_gct, failure_list
def test_left_right(self):
    left_gct_path = os.path.join(FUNCTIONAL_TESTS_DIR, "test_merge_left.gct")
    right_gct_path = os.path.join(FUNCTIONAL_TESTS_DIR, "test_merge_right.gct")
    expected_gct_path = os.path.join(FUNCTIONAL_TESTS_DIR, "test_merged_left_right.gct")

    left_gct = pg.parse(left_gct_path)
    right_gct = pg.parse(right_gct_path)
    expected_gct = pg.parse(expected_gct_path)

    # Merge left and right
    concated_gct = cg.hstack([left_gct, right_gct], False, None, [], False)

    pd.util.testing.assert_frame_equal(expected_gct.data_df, concated_gct.data_df,
                                       check_names=False)
    pd.util.testing.assert_frame_equal(expected_gct.row_metadata_df, concated_gct.row_metadata_df,
                                       check_names=False)
    pd.util.testing.assert_frame_equal(expected_gct.col_metadata_df, concated_gct.col_metadata_df,
                                       check_names=False)
def cmap_matrix(
    client,
    data_level="level5",
    feature_space="landmark",
    rid=None,
    cid=None,
    verbose=False,
    chunk_size=1000,
    table=None,
    limit=4000,
):
    """
    Query for numerical data for signature-gene level data.

    :param client: BigQuery Client
    :param data_level: Data level requested. IDs from the siginfo file correspond to
        'level5'. IDs from instinfo are available in 'level3' and 'level4'.
        Choices are ['level3', 'level4', 'level5']
    :param rid: Row ids
    :param cid: Column ids
    :param feature_space: Common feature spaces to extract. 'rid' overrides selection.
        Choices: ['landmark', 'bing', 'aig']
            landmark: 978 landmark genes
            bing: Best-inferred set of 10,174 genes
            aig: All inferred genes, including 12,328 genes
        Default is 'landmark'.
    :param chunk_size: Runs queries in stages to avoid the query character limit. Default 1,000
    :param limit: Soft limit for the number of signatures allowed. Default is 4,000
    :param table: Table address to query. Overrides the 'data_level' parameter.
        Generally should not be used.
    :param verbose: Print query and table address
    :return: GCToo object
    """
    config = cfg.get_default_config()

    if table is not None:
        table_id = table
    elif data_level == "level3":
        table_id = config.tables.level3
    elif data_level == "level4":
        table_id = config.tables.level4
    elif data_level == "level5":
        table_id = config.tables.level5
    else:
        print("Unsupported data_level. Select from ['level3', 'level4', 'level5'].\n"
              "Default is 'level5'.")
        sys.exit(1)

    if cid:
        cid = parse_condition(cid)
        assert len(cid) <= limit, "List of cids cannot exceed limit of {}".format(limit)

        # Run the query in chunks to stay under the query character limit.
        cur = 0
        nparts = ceil(len(cid) / chunk_size)
        result_dfs = []
        while cur < nparts:
            start = cur * chunk_size
            end = cur * chunk_size + chunk_size  # slicing past the end is safe; Python truncates
            cur = cur + 1
            print("Running query ... ({}/{})".format(cur, nparts))
            result_dfs.append(
                _build_and_launch_query(
                    client, table_id, rid=rid, cid=cid[start:end],
                    feature_space=feature_space, verbose=verbose))

        # Pivot the chunks to GCT objects in parallel, falling back to serial pivoting.
        try:
            pool = mp.Pool(mp.cpu_count())
            print("Pivoting DataFrames to GCT objects")
            result_gctoos = pool.map(_pivot_result, result_dfs)
            pool.close()
        except Exception:
            if nparts > 1:
                print("Multiprocessing unavailable, pivoting chunks in series...")
            cur = 0
            result_gctoos = []
            for df in result_dfs:
                cur = cur + 1
                print("Pivoting... ({}/{})".format(cur, nparts))
                result_gctoos.append(_pivot_result(df))
        print("Complete")
        return hstack(result_gctoos)
    else:
        print("Provide column ids to extract using the cid= keyword argument")
        sys.exit(1)
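# A minimal usage sketch for cmap_matrix. The client construction is one standard
# way to obtain a BigQuery client; the sig_ids below are hypothetical placeholders
# standing in for ids taken from a siginfo file.
from google.cloud import bigquery

def example_cmap_matrix_query():
    client = bigquery.Client()  # assumes credentials are configured in the environment
    sig_ids = ['sig_id_A', 'sig_id_B']  # hypothetical signature ids
    # Level 5 landmark-space matrix for the requested signatures, returned as a GCToo.
    gct = cmap_matrix(client, data_level='level5', feature_space='landmark', cid=sig_ids)
    print(gct.data_df.shape)  # (978 landmark genes, one column per sig_id)
    return gct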
def calculate_modz(gct_list, group_by=['pert_well'], skip=None):
    '''
    Args:
        gct_list: list of GCT objects for performing modZ
        group_by: list of column metadata fields used to group replicate columns
            into a single modZ signature
        skip: dictionary of col metadata fields and respective values to identify
            columns which you do not want to modZ

    Returns:
        modZ_GCT: GCT object of modZ values
        cc_q75_df: per-signature replicate-correlation and signal-strength statistics
        [all_weights, all_raw_weights]: normalized and raw replicate weights
    '''
    fields_to_remove = [x for x in gct_list[0].row_metadata_df.columns
                        if x in ['det_plate', 'det_plate_scan_time', 'assay_plate_barcode']]
    master_gct = cg.hstack(gct_list, False, None, fields_to_remove=fields_to_remove)

    # TODO Change to replicate set ID when we have it in assemble
    # TODO change prism_replicate to replicate_id in assemble
    ncomponents = len(gct_list[0].col_metadata_df.index[0].split('_'))
    replicate_set_id = gct_list[0].col_metadata_df.index[0].rsplit("_", ncomponents - 3)[0]

    cc_q75_df = pd.DataFrame(columns=[
        'weave_prefix', 'det_well', 'profile_ids', 'cc_ut', 'cc_q75', 'nprofile',
        'ss_ltn3', 'ss_ltn2', 'ss_ltn1', 'cis_ltn3', 'cis_ltn2', 'cis_ltn1'])
    cc_q75_df.index.name = 'sig_id'

    modZ_mat = pd.DataFrame(index=master_gct.data_df.index)
    all_weights = pd.Series()
    all_raw_weights = pd.Series()
    all_corr_values = pd.DataFrame(columns=['weave_prefix', 'cid', 'rid', 'spearman_corr'])

    # Build a single grouping key by joining the requested metadata fields.
    master_gct.col_metadata_df['group_by'] = [
        ' '.join(master_gct.col_metadata_df[group_by].astype(str).loc[x])
        for x in master_gct.col_metadata_df[group_by].index]
    raw_groupby_vals = set(master_gct.col_metadata_df['group_by'])
    groupby_vals = sorted(list(raw_groupby_vals))  # currently unused

    # Columns matching the skip filters are set aside and rejoined after modZ.
    if skip is not None:
        skip_dex = [pd.Series(master_gct.col_metadata_df[
                        master_gct.col_metadata_df[x].isin(skip[x])].index)
                    for x in skip]
        if len(skip_dex) > 1:
            skip_dex = pd.concat(skip_dex)
        else:
            skip_dex = skip_dex[0]
        skip_df = master_gct.col_metadata_df.loc[skip_dex.values]
        master_gct.col_metadata_df.drop(skip_dex, inplace=True)

    # TODO Abstract out a method which just takes the GCT objects. Find a way to
    # get rid of any field that doesn't match up in cg.hstack.
    for gv_val in master_gct.col_metadata_df['group_by'].unique():
        dex = master_gct.col_metadata_df[
            master_gct.col_metadata_df['group_by'] == gv_val].index.tolist()
        mat = master_gct.data_df[dex]
        modz_values, upper_tri_series, raw_weights, weights = modz(mat)
        # A singleton group is its own modZ signature.
        if len(mat.columns) == 1:
            modz_values = mat[mat.columns[0]]

        upper_tri_series['weave_prefix'] = replicate_set_id
        all_corr_values = all_corr_values.append(upper_tri_series)
        all_weights = all_weights.append(weights)
        all_raw_weights = all_raw_weights.append(raw_weights)

        ss1_5, ss1, ss_5 = calculate_sig_strength(modz_values, n_reps=len(upper_tri_series.index))
        q75 = calculate_q75(upper_tri_series['spearman_corr'].round(4))
        cis1_5, cis1, cis_5 = calculate_cis(ss1_5, ss1, ss_5, q75, len(modz_values))

        # Clip modZ values to [-10, 10].
        modz_values[modz_values < -10] = -10
        modz_values[modz_values > 10] = 10

        wells = ''.join(pd.Series([x[-4:] for x in mat.columns]).unique())
        # Project-specific renaming of the replicate set id.
        replicate_set_id = replicate_set_id.replace('COP23', 'KJ100').replace('_X1', '').replace('_X2', '')
        modZ_mat[replicate_set_id + ':' + gv_val] = modz_values
        cc_q75_df.loc[replicate_set_id + ':' + gv_val] = [
            replicate_set_id, wells, ','.join(weights.index.values.tolist()),
            ','.join([str(x) for x in
                      upper_tri_series['spearman_corr'].round(4).values.tolist()]),
            q75, len(raw_weights.index), ss1_5, ss1, ss_5,
            cis1_5.round(4), cis1.round(4), cis_5.round(4)]

    col_meta = master_gct.col_metadata_df.drop_duplicates(subset=group_by, keep="first")

    # Rejoin the skipped columns with placeholder (-666) statistics.
    if skip is not None:
        skip_data = master_gct.data_df[skip_dex.tolist()]
        modZ_mat = modZ_mat.join(skip_data)
        col_meta = col_meta.append(skip_df)
        for dax in skip_dex:
            cc_q75_df.loc[dax] = [replicate_set_id, dax[-3:], dax, '-666', '-666', 1,
                                  '-666', '-666', '-666', '-666', '-666', '-666']

    col_meta = col_meta.loc[[x.split(',')[0] for x in cc_q75_df['profile_ids']]]
    col_meta.index = cc_q75_df.index
    col_meta['data_level'] = 'modZ'
    if 'provenance' in col_meta:
        col_meta['provenance'] = gct_list[0].col_metadata_df['provenance'] + ' | modZ'

    modZ_mat.index = modZ_mat.index.astype(str)
    master_gct.row_metadata_df.index = master_gct.row_metadata_df.index.astype(str)
    modZ_GCT = GCToo.GCToo(data_df=modZ_mat,
                           row_metadata_df=master_gct.row_metadata_df,
                           col_metadata_df=col_meta)

    all_corr_values.set_index(all_corr_values['weave_prefix'], inplace=True)
    del all_corr_values['weave_prefix']
    return modZ_GCT, cc_q75_df, [all_weights, all_raw_weights]
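# calculate_modz delegates the weighting to modz(mat), defined elsewhere. Below is a
# minimal sketch of a standard moderated z-score weighting under assumed conventions:
# each replicate's weight is its (clipped) normalized mean pairwise Spearman
# correlation with the other replicates. The function name and the clipping constant
# are assumptions for illustration, not the pipeline's actual implementation.
import pandas as pd

def example_modz(mat, min_weight=0.01):
    # Pairwise Spearman correlations between replicate columns.
    corr = mat.corr(method='spearman')
    n = corr.shape[0]
    if n == 1:
        return mat.iloc[:, 0], pd.Series([1.0], index=mat.columns)
    # Raw weight of a replicate = mean correlation with the other replicates
    # (drop the self-correlation of 1.0), clipped below so that anti-correlated
    # replicates cannot flip the sign of the combination.
    raw = (corr.sum(axis=1) - 1.0) / (n - 1)
    raw = raw.clip(lower=min_weight)
    weights = raw / raw.sum()
    # Weighted combination of the replicate profiles.
    modz_values = mat.mul(weights, axis=1).sum(axis=1)
    return modz_values, weights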