def ssmd_ecdf(norm_gct, median_gct, title, outfile):
    """Plot the empirical CDFs of per-replicate SSMD values for NORM and MFI data.

    For every unique ``prism_replicate`` value in ``norm_gct``'s column
    metadata, the matching columns of both GCTs are wrapped in a new GCToo and
    passed to ``get_ssmd`` (with ``unlog=True`` for the NORM data only). The
    pooled values are drawn as two ECDF curves on the current matplotlib
    figure, which is then saved as ``SSMD_ECDF.png``.

    Args:
        norm_gct (GCToo object): plotted with the label 'NORM'
        median_gct (GCToo object): plotted with the label 'MFI'; its column
            metadata must also contain ``prism_replicate``
        title (string): plot title
        outfile (string): despite the name, the *directory* that
            ``SSMD_ECDF.png`` is written into

    Returns:
        None
    """
    # Collect the per-replicate results in lists and concatenate once at the
    # end; repeated Series.append is quadratic and no longer exists in
    # pandas >= 2.0.
    # NOTE(review): assumes get_ssmd returns a pandas Series -- confirm.
    norm_parts = []
    median_parts = []

    for rep in norm_gct.col_metadata_df['prism_replicate'].unique():
        # NORM data for this replicate
        norm_temp_col = norm_gct.col_metadata_df.loc[
            norm_gct.col_metadata_df['prism_replicate'] == rep]
        norm_temp = norm_gct.data_df[norm_temp_col.index]
        norm_parts.append(get_ssmd(
            GCToo.GCToo(data_df=norm_temp,
                        col_metadata_df=norm_temp_col,
                        row_metadata_df=norm_gct.row_metadata_df),
            unlog=True))

        # MFI data for the same replicate
        med_temp_col = median_gct.col_metadata_df.loc[
            median_gct.col_metadata_df['prism_replicate'] == rep]
        med_temp = median_gct.data_df[med_temp_col.index]
        median_parts.append(get_ssmd(
            GCToo.GCToo(data_df=med_temp,
                        col_metadata_df=med_temp_col,
                        row_metadata_df=median_gct.row_metadata_df)))

    # pd.concat refuses an empty list, so fall back to an empty Series when
    # there are no replicates at all.
    norm_ssmd = pd.concat(norm_parts) if norm_parts else pd.Series(dtype=float)
    median_ssmd = (pd.concat(median_parts) if median_parts
                   else pd.Series(dtype=float))

    norm_ecdf = ECDF(norm_ssmd)
    med_ecdf = ECDF(median_ssmd)

    plt.plot(norm_ecdf.x, norm_ecdf.y, label='NORM')
    plt.plot(med_ecdf.x, med_ecdf.y, label='MFI')
    plt.xlim(-1, 10)
    plt.xlabel('SSMD Values')
    plt.title(title)
    axes = plt.gca()
    axes.legend(bbox_to_anchor=(.615, 0.81, 0.8, .6), loc=3, borderaxespad=0.)
    plt.savefig(os.path.join(outfile, 'SSMD_ECDF.png'))
def setUpClass(cls):
    """Create the shared fixtures: one symmetric and one asymmetric GCToo,
    plus an ``ig.Graph`` carrying the same ids, metadata and weights for each.
    """
    # ---- Symmetric GCT: rows and columns share ids and metadata ----
    sym_ids = ["a", "b", "c"]
    cls.sym_data_df = pd.DataFrame(
        [[0.9, 0.4, 0.6],
         [0.4, 1.0, -0.3],
         [0.6, -0.3, 1.0]],
        index=sym_ids, columns=sym_ids)
    cls.sym_meta_df = pd.DataFrame(
        [["A375", "great"],
         ["A375", "bad"],
         ["A375", "ok"]],
        index=sym_ids, columns=["cell_id", "pert_type"])
    cls.sym_gct = GCToo.GCToo(cls.sym_data_df, cls.sym_meta_df,
                              cls.sym_meta_df)

    # ---- Asymmetric GCT: 3 rows x 2 columns with separate metadata ----
    asym_rids = ["A", "B", "C"]
    asym_cids = ["o", "k"]
    cls.asym_data_df = pd.DataFrame(
        [[0.1, 0.4],
         [-0.7, -0.1],
         [np.nan, 0.9]],
        index=asym_rids, columns=asym_cids)
    cls.asym_row_meta_df = pd.DataFrame(
        ["3h", "1h", "2h"], index=asym_rids, columns=["pert_time"])
    cls.asym_col_meta_df = pd.DataFrame(
        [["MCF7", "3h"],
         ["A375", "6h"]],
        index=asym_cids, columns=["cell_id", "pert_time"])
    cls.asym_gct = GCToo.GCToo(cls.asym_data_df, cls.asym_row_meta_df,
                               cls.asym_col_meta_df)

    # ---- Graph for the symmetric GCT: one vertex per id ----
    sym_graph = ig.Graph()
    sym_graph.add_vertices(3)
    sym_graph.add_edges([(0, 1), (0, 2), (1, 2)])
    sym_graph.vs["id"] = ["a", "b", "c"]
    sym_graph.vs["cell_id"] = ["A375"] * 3
    sym_graph.vs["pert_type"] = ["great", "bad", "ok"]
    sym_graph.es["weight"] = [0.4, 0.6, -0.3]
    cls.sym_g = sym_graph

    # ---- Graph for the asymmetric GCT: row vertices 0-2, col vertices 3-4 ----
    asym_graph = ig.Graph()
    asym_graph.add_vertices(5)
    asym_graph.add_edges([(row, col) for row in (0, 1, 2) for col in (3, 4)])
    asym_graph.vs["id"] = ["A", "B", "C", "o", "k"]
    asym_graph.vs["type"] = [False] * 3 + [True] * 2
    asym_graph.vs["cell_id"] = [None, None, None, "MCF7", "A375"]
    asym_graph.vs["pert_time"] = ["3h", "1h", "2h", "3h", "6h"]
    asym_graph.es["weight"] = [0.1, 0.4, -0.7, -0.1, np.nan, 0.9]
    cls.asym_g = asym_graph
def test_check_df(self):
    """GCToo construction must fail when cids or row headers are duplicated,
    and the error text must name the offending labels."""
    not_unique_data_df = pd.DataFrame(
        [[1, 2, 3], [4, 5, 6]],
        index=["A", "B"], columns=["a", "b", "a"])
    not_unique_rhd = pd.DataFrame(
        [["rhd_A", "rhd_B"], ["rhd_C", "rhd_D"]],
        index=["A", "B"], columns=["rhd1", "rhd1"])

    # cids in data_df are not unique
    with self.assertRaises(Exception) as context:
        GCToo.GCToo(data_df=not_unique_data_df,
                    row_metadata_df=pd.DataFrame(index=["A", "B"]),
                    col_metadata_df=pd.DataFrame(index=["a", "b", "c"]))
    self.assertIn(str(not_unique_data_df.columns), str(context.exception))

    # rhds are not unique in row_metadata_df
    with self.assertRaises(Exception) as context:
        GCToo.GCToo(data_df=pd.DataFrame([[1, 2, 3], [4, 5, 6]],
                                         index=["A", "B"],
                                         columns=["a", "b", "c"]),
                    row_metadata_df=not_unique_rhd,
                    col_metadata_df=pd.DataFrame(index=["a", "b", "c"]))
    self.assertIn("'rhd1' 'rhd1'", str(context.exception))
def do_steep_and_sip(gct, similarity_metric, connectivity_metric,
                     fields_to_aggregate):
    """ Perform steep and sip on the same GCT. AKA introspect.

    Args:
        gct (GCToo object): its data_df is compared against itself and its
            column metadata annotates both axes of the similarity matrix.
            The input object is not modified.
        similarity_metric: passed to steep.compute_similarity_within_df and
            stored in the metadata under SIMILARITY_METRIC_FIELD
        connectivity_metric: passed to sip.compute_connectivities and stored
            in the connectivity metadata under CONNECTIVITY_METRIC_FIELD
        fields_to_aggregate: metadata fields handed to
            sip.create_aggregated_fields_in_GCTs for the test columns, test
            rows and background alike

    Returns:
        sim_gct (GCToo object): similarity matrix
        conn_gct (GCToo object): signed connectivities

    """
    #----------STEEP--------#

    sim_df = steep.compute_similarity_within_df(gct.data_df, similarity_metric)

    # Row and column metadata are both from gct. Work on a copy so that
    # annotating the similarity metric does not add a column to the
    # caller's gct.col_metadata_df.
    metadata_df = gct.col_metadata_df.copy(deep=True)

    # Append column to metadata_df indicating which similarity_metric was used
    metadata_df[SIMILARITY_METRIC_FIELD] = similarity_metric

    # Assemble similarity gct
    sim_gct = GCToo.GCToo(data_df=sim_df,
                          row_metadata_df=metadata_df,
                          col_metadata_df=metadata_df)

    #----------SIP----------#

    # Check symmetry
    (is_test_df_sym, _) = sip.check_symmetry(sim_gct.data_df, sim_gct.data_df)

    # Create deep copies of sim_gct in order to leave the original GCT untouched
    test_gct = GCToo.GCToo(data_df=sim_df.copy(deep=True),
                           row_metadata_df=metadata_df.copy(deep=True),
                           col_metadata_df=metadata_df.copy(deep=True))
    bg_gct = GCToo.GCToo(data_df=sim_df.copy(deep=True),
                         row_metadata_df=metadata_df.copy(deep=True),
                         col_metadata_df=metadata_df.copy(deep=True))

    # Create an aggregated metadata field for index and columns of sim_gct
    # and sort by that field
    (test_gct, bg_gct) = sip.create_aggregated_fields_in_GCTs(
        test_gct, bg_gct,
        fields_to_aggregate, fields_to_aggregate, fields_to_aggregate,
        QUERY_FIELD_NAME, TARGET_FIELD_NAME, SEPARATOR)

    # Compute connectivity
    (_, signed_conn_gct) = sip.compute_connectivities(
        test_gct, bg_gct, QUERY_FIELD_NAME, TARGET_FIELD_NAME,
        TARGET_FIELD_NAME, connectivity_metric, is_test_df_sym, SEPARATOR)

    # Append to queries a new column saying what connectivity metric was used
    sip.add_connectivity_metric_to_metadata(signed_conn_gct.col_metadata_df,
                                            connectivity_metric,
                                            CONNECTIVITY_METRIC_FIELD)
    sip.add_connectivity_metric_to_metadata(signed_conn_gct.row_metadata_df,
                                            connectivity_metric,
                                            CONNECTIVITY_METRIC_FIELD)

    return sim_gct, signed_conn_gct
def main(args):
    """Compute column-wise similarities for one GCT, or between two GCTs,
    and write the result as .gct or .gctx depending on args.out_name.

    Reads args.in_gct_path, args.in_gct2_path, args.similarity_metric and
    args.out_name. Raises Exception if out_name has any other extension.
    """
    gct1 = parse(args.in_gct_path)

    if args.in_gct2_path is None:
        # Only one gct: similarities between the columns of gct1, whose
        # column metadata then describes both axes of the output
        out_df = compute_similarity_within_df(gct1.data_df,
                                              args.similarity_metric)
        shared_meta = gct1.col_metadata_df

        # Record which similarity_metric was used
        shared_meta[SIMILARITY_METRIC_FIELD] = args.similarity_metric

        out_gct = GCToo.GCToo(out_df, shared_meta, shared_meta)

    else:
        logger.info(
            "in_gct2_path was provided. Will compute pairwise similarities " +
            "between the columns of in_gct and in_gct2.")

        gct2 = parse(args.in_gct2_path)

        out_df = compute_similarity_bw_two_dfs(gct1.data_df, gct2.data_df,
                                               args.similarity_metric)

        # Output rows are annotated with gct1's column metadata, output
        # columns with gct2's
        row_meta = gct1.col_metadata_df
        col_meta = gct2.col_metadata_df

        # Record which similarity_metric was used on both axes
        row_meta[SIMILARITY_METRIC_FIELD] = args.similarity_metric
        col_meta[SIMILARITY_METRIC_FIELD] = args.similarity_metric

        out_gct = GCToo.GCToo(out_df, row_meta, col_meta)

    # Pick the writer from the output file extension
    extension = os.path.splitext(args.out_name)[1]
    if extension == ".gct":
        wg.write(out_gct, args.out_name, data_null="NaN",
                 metadata_null="NA", filler_null="NA")
    elif extension == ".gctx":
        wgx.write(out_gct, args.out_name)
    else:
        message = "out_name must end in .gct or .gctx. out_name: {}".format(
            args.out_name)
        raise Exception(message)
def main(args):
    """Compute column-wise similarities for one GCT, or between two GCTs,
    and write the result to args.out_name as a .gct file.

    Reads args.in_gct_path, args.in_gct2_path, args.similarity_metric and
    args.out_name.
    """
    gct1 = parse(args.in_gct_path, convert_neg_666=False,
                 make_multiindex=True)

    if args.in_gct2_path is None:
        # Only one gct: similarities between the columns of gct1, whose
        # column metadata then describes both axes of the output
        out_df = compute_similarity_within_df(gct1.data_df,
                                              args.similarity_metric)
        shared_meta = gct1.col_metadata_df

        # Record which similarity_metric was used
        shared_meta[SIMILARITY_METRIC_FIELD] = args.similarity_metric

        out_gct = GCToo.GCToo(out_df, shared_meta, shared_meta)

    else:
        logger.info(
            "in_gct2_path was provided. Will compute pairwise similarities " +
            "between the columns of in_gct and in_gct2.")

        gct2 = parse(args.in_gct2_path, convert_neg_666=False,
                     make_multiindex=True)

        out_df = compute_similarity_bw_two_dfs(gct1.data_df, gct2.data_df,
                                               args.similarity_metric)

        # Output rows are annotated with gct1's column metadata, output
        # columns with gct2's
        row_meta = gct1.col_metadata_df
        col_meta = gct2.col_metadata_df

        # Record which similarity_metric was used on both axes
        row_meta[SIMILARITY_METRIC_FIELD] = args.similarity_metric
        col_meta[SIMILARITY_METRIC_FIELD] = args.similarity_metric

        out_gct = GCToo.GCToo(out_df, row_meta, col_meta)

    # Write output gct
    wg.write(out_gct, args.out_name, data_null="NaN", metadata_null="NA",
             filler_null="NA")
def main(args, proj_dir, out_dir, project_name, invar=True):
    """Run the cohort-level QC outputs for one build directory.

    Args:
        args: parsed arguments; only ``args.plate_qc`` is read here
        proj_dir (string): passed to read_build_data; also used to derive
            the project name and the ECDF plot title
        out_dir (string): directory that all outputs are written under
        project_name (string or None): when None, the base name of
            ``os.path.dirname(proj_dir)`` is used
        invar (bool): forwarded to the plate-level QC

    Returns:
        None
    """
    # Read in the data
    data_map, metadata_map = read_build_data(proj_dir=proj_dir)

    # Make folders for different outputs
    mk_folders(out_dir=out_dir,
               folders=['sensitivities', 'distributions', 'heatmaps'])

    # Check if project name arg is filled, if not use base folder name
    if project_name is None:
        project_name = os.path.basename(os.path.dirname(proj_dir))

    # Make distributions and heatmaps of all data at each data level
    mk_distributions(data_map, metadata_map, project_name, out_dir)

    # If expected sensitivities arg is set to true, run expected sensitivities analysis
    # TODO add argument for defining sensitivity cell set

    # Make standard SC plot for whole dataset, signal strength vs correlation
    prism_plots.sc_plot(metadata_map['sig'],
                        os.path.join(out_dir, 'sc_modz.zspc.png'))

    # Make modz distributions split by pert type
    if 'combat_modz' in data_map.keys():
        comp.modz_dist(data_map['combat_modz'], metadata_map['cb_sig'], [],
                       os.path.join(out_dir, 'modz_dist.png'))

    # If running on data with control barcodes, plot monotonicity of curves
    # (currently disabled)
    #if invar is True:
    #    inv.invariant_monotonicity(data_map['mfi'], metadata_map['inst'], out_dir)

    # Calculate median SSMD by pool and output in table
    ssmd.ssmd_by_pool(metadata_map['ssmd'], metadata_map['cell'], out_dir)

    # Get cell metadata without control barcodes for later use
    norm_row_ids = data_map['norm'].row_metadata_df.index
    norm_cell_metadata = metadata_map['cell'].loc[
        [x for x in metadata_map['cell'].index if x in norm_row_ids]]

    # ECDF of SSMD Scores in Norm Data and MFI Data.
    # Both levels must be present. The previous test,
    # `'norm' and 'mfi' in data_map.keys()`, only checked for 'mfi' because
    # the non-empty string 'norm' is always truthy.
    available_levels = data_map.keys()
    if 'norm' in available_levels and 'mfi' in available_levels:
        norm_gct = GCToo.GCToo(
            data_df=data_map['norm'].data_df,
            col_metadata_df=metadata_map['inst'].loc[
                data_map['norm'].data_df.columns],
            row_metadata_df=norm_cell_metadata)
        mfi_gct = GCToo.GCToo(
            data_df=data_map['mfi'].data_df,
            col_metadata_df=metadata_map['inst'].loc[
                data_map['mfi'].data_df.columns],
            row_metadata_df=metadata_map['cell'])
        ssmd.ssmd_ecdf(norm_gct, mfi_gct,
                       'SSMD ECDF for {}'.format(os.path.dirname(proj_dir)),
                       out_dir)

    # Make a bunch of plots at the plate level for each plate in cohort
    if args.plate_qc:
        get_plate_qc_data_map_and_run(data_map, metadata_map,
                                      norm_cell_metadata, project_name,
                                      out_dir, invar)

    #todo: add a check for get_plate_qc_data_map_and_run before running qc_galleries --> dependent
    qc_galleries(out_dir, project_name, metadata_map, data_map)
def calculate_zscore(df, plate_control=False):
    """Calculate level 4 (z-score) data from level 3 data.

    Args:
        df (GCToo object): its column metadata must contain 'provenance',
            and 'pert_type' when plate_control is falsy. The input object
            is not modified.
        plate_control (bool): when falsy, ``zscore`` is called with the
            'ctl_vehicle' columns as its second argument and the result is
            labelled 'ZSVC'; when truthy, ``zscore`` is called on the data
            alone and the result is labelled 'ZSPC'

    Returns:
        zscore_gctoo (GCToo object): values clipped to [-10, 10], rows of
            the data and row metadata sorted by index, and the column
            metadata fields 'data_level' and 'provenance' updated
    """
    # Annotate a copy: writing into df.col_metadata_df would relabel the
    # caller's input, and a second call on the same object would stack
    # provenance suffixes.
    col_metadata_df = df.col_metadata_df.copy()

    # Plain truthiness instead of `== False` / `== True`, which left
    # zscore_data undefined for any other value (e.g. None).
    if plate_control:
        zscore_data = zscore(df.data_df)
        level = 'ZSPC'
    else:
        neg_dex = col_metadata_df[
            col_metadata_df['pert_type'] == 'ctl_vehicle'].index.tolist()
        neg_df = df.data_df[neg_dex]
        zscore_data = zscore(df.data_df, neg_df)
        level = 'ZSVC'

    col_metadata_df['data_level'] = level
    col_metadata_df['provenance'] = [
        x + ' | ' + level for x in col_metadata_df['provenance']]

    # Clip extreme z-scores
    zscore_data[zscore_data < -10] = -10
    zscore_data[zscore_data > 10] = 10

    # Sort data and row metadata the same way. sort_index() without
    # inplace leaves the caller's row metadata in its original order.
    zscore_data.sort_index(inplace=True)
    row_metadata_df = df.row_metadata_df.sort_index()

    zscore_gctoo = GCToo.GCToo(data_df=zscore_data,
                               row_metadata_df=row_metadata_df,
                               col_metadata_df=col_metadata_df)

    return zscore_gctoo
def test_log_transform_if_needed(self):
    """Exercise the no-op path, the transform path and the negative-value
    assertion of dry.log_transform_if_needed."""
    already_logged = ["GR1", "L2X"]
    not_yet_logged = ["GR1"]
    row_ids = ["a", "b", "c"]
    col_ids = ["A", "B", "C"]

    in_df = pd.DataFrame(
        [[10, 3, 1.2],
         [0.45, 0.2, 0],
         [4.5, np.nan, 0.3]],
        index=row_ids, columns=col_ids, dtype=float)
    in_gct = GCToo.GCToo(data_df=in_df,
                         row_metadata_df=pd.DataFrame(index=row_ids),
                         col_metadata_df=pd.DataFrame(index=col_ids))

    # "L2X" is already in the prov code: data and prov code come back as-is
    out_gct, out_prov_code = dry.log_transform_if_needed(
        in_gct, already_logged, "L2X")
    data_unchanged = np.allclose(out_gct.data_df, in_df, atol=1e-3,
                                 equal_nan=True)
    self.assertTrue(data_unchanged)
    self.assertEqual(out_prov_code, already_logged)

    # "L2X" is missing: it must be appended to the prov code
    _, out_prov_code2 = dry.log_transform_if_needed(
        in_gct, not_yet_logged, "L2X")
    self.assertEqual(out_prov_code2, already_logged)

    # A negative value must trip the assertion
    in_gct.data_df.iloc[0, 1] = -3
    with self.assertRaises(AssertionError) as e:
        dry.log_transform_if_needed(in_gct, not_yet_logged, "L2X")
    self.assertIn("data_df should not contain negative", str(e.exception))
def test_assemble_multi_index_df(self):
    """With make_multiindex=True, the ids and the metadata fields must show
    up as index/column levels of multi_index_df, and the values must stay
    addressable through those levels."""
    # TODO: Add test of only row ids present as metadata
    # TODO: Add test of only col ids present as metadata
    data_df = pd.DataFrame(
        {10: range(13, 16), 11: range(16, 19), 12: range(19, 22)},
        index=range(4, 7))
    row_meta = pd.DataFrame({"a": range(3)}, index=range(4, 7))
    col_meta = pd.DataFrame({"b": range(7, 10)}, index=range(10, 13))
    g = GCToo.GCToo(data_df=data_df, row_metadata_df=row_meta,
                    col_metadata_df=col_meta, make_multiindex=True)

    # Level names: row metadata field + rid, col metadata field + cid
    for level in ("a", "rid"):
        assert level in g.multi_index_df.index.names, \
            g.multi_index_df.index.names
    for level in ("b", "cid"):
        assert level in g.multi_index_df.columns.names, \
            g.multi_index_df.columns.names

    # Select the column whose "b" value is 7 (cid 10), then check each row
    r = g.multi_index_df.xs(7, level="b", axis=1)
    logger.debug("r: {}".format(r))

    for rid, expected in zip((4, 5, 6), (13, 14, 15)):
        observed = r.xs(rid, level="rid", axis=0).values[0][0]
        assert observed == expected, observed
def log_transform_if_needed(gct, prov_code, prov_code_entry):
    """Log2-transform the data unless the provenance code says it was done.

    Args:
        gct (GCToo object)
        prov_code (list of strings)
        prov_code_entry (string): entry that marks the transformation

    Returns:
        out_gct (GCToo object): the input gct itself when nothing was done,
            otherwise a new GCToo with transformed data and the same metadata
        updated_prov_code (list of strings): prov_code itself when nothing
            was done, otherwise a new list with prov_code_entry appended

    Raises:
        AssertionError: if a transformation is needed and data_df contains
            negative values
    """
    # Already transformed: hand everything back unchanged
    if prov_code_entry in prov_code:
        logger.info("{} has already occurred.".format(prov_code_entry))
        return gct, prov_code

    num_negative = (gct.data_df < 0).sum().sum()
    assert num_negative == 0, (
        "data_df should not contain negative values. gct.data_df:\n{}".
        format(gct.data_df))

    transformed_df = log_transform(gct.data_df, log_base=2)

    # Wrap the transformed data in a new GCToo that reuses the input metadata
    out_gct = GCToo.GCToo(transformed_df, gct.row_metadata_df,
                          gct.col_metadata_df)

    return out_gct, prov_code + [prov_code_entry]
def test_count_shear(self):
    """norm.remove_low_bead_wells should leave 4 of the 5 wells in this
    count fixture."""
    analytes = ['1', '2', '3', '661', '662']
    wells = ['test_CS0_X1:A', 'test_CS0_X1:B', 'test_CS0_X1:C',
             'test_CS0_X1:D', 'test_CS0_X1:E']

    count_df = pd.DataFrame(
        {
            'test_CS0_X1:A': [40, 50, 30, 20, 10],
            'test_CS0_X1:B': [110, 80, 60, 40, 30],
            'test_CS0_X1:C': [5, 15, 4, 3, 5],
            'test_CS0_X1:D': [60, 90, 70, 8, 8],
            'test_CS0_X1:E': [75, 85, 60, 9, 10]
        },
        index=analytes)
    well_meta = pd.DataFrame(
        {
            'pert_type': ['trt_cp', 'trt_cp', 'trt_cp',
                          'ctl_vehicle', 'ctl_vehicle']
        },
        index=wells)
    count = GCToo.GCToo(data_df=count_df,
                        row_metadata_df=pd.DataFrame(index=analytes),
                        col_metadata_df=well_meta)

    # NOTE(review): `l` is not defined inside this test; presumably a
    # module-level fixture -- confirm.
    shear = norm.remove_low_bead_wells(l, count)

    print(shear.data_df.shape)
    assert len(shear.data_df.columns) == 4
def test_extract_bg_vals_from_non_sym(self):
    """For a non-symmetric background, the values extracted for a target
    are every value in the rows whose "group" equals that target."""
    bg_gct = GCToo.GCToo(
        data_df=pd.DataFrame([[1, 2, 3, 5],
                              [7, 11, 13, 17],
                              [19, 23, 29, 31],
                              [-3, 5, 7, 11]]),
        row_metadata_df=pd.DataFrame({
            "group": ["A", "B", "A", "B"],
            "id": [1, 2, 3, 4]
        }),
        col_metadata_df=pd.DataFrame({
            "group": ["F", "F", "E", "E"],
            "id": [1, 2, 3, 4]
        }))

    # Expected values per target: rows 0 and 2 for A, rows 1 and 3 for B
    expected_by_target = [
        ("A", [1, 2, 3, 5, 19, 23, 29, 31]),
        ("B", [7, 11, 13, 17, -3, 5, 7, 11]),
    ]
    for target, expected_vals in expected_by_target:
        observed_vals = sip.extract_bg_vals_from_non_sym(
            target, "group", bg_gct)
        self.assertItemsEqual(expected_vals, observed_vals)

    # Verify that assert statement works
    with self.assertRaises(AssertionError) as e:
        sip.extract_bg_vals_from_non_sym("D", "group", bg_gct)
    self.assertIn("target D is not in the group metadata", str(e.exception))
def test_extract_bg_vals_from_sym(self):
    """Check the background values extracted per "group" target from a
    symmetric background matrix, and the assertion for an unknown target."""
    shared_meta = pd.DataFrame({
        "group": ["A", "B", "A", "B", "C", "C"],
        "id": [1, 2, 3, 4, 5, 6]
    })
    sym_matrix = pd.DataFrame([[1.0, 0.5, 1.0, -0.4, 1.1, -0.6],
                               [0.5, 1.0, 1.2, -0.8, -0.9, 0.4],
                               [1.0, 1.2, 1.0, 0.1, 0.3, 1.3],
                               [-0.4, -0.8, 0.1, 1.0, 0.5, -0.2],
                               [1.1, -0.9, 0.3, 0.5, 1.0, 0.7],
                               [-0.6, 0.4, 1.3, -0.2, 0.7, 1.0]])
    bg_gct = GCToo.GCToo(data_df=sym_matrix,
                         row_metadata_df=shared_meta,
                         col_metadata_df=shared_meta)

    # Expected values per target (order does not matter)
    expected_by_target = [
        ("A", [0.5, 1.0, -0.4, 1.1, -0.6, 1.2, 0.1, 0.3, 1.3]),
        ("B", [0.5, 1.2, -0.8, -0.9, 0.4, -0.4, 0.1, 0.5, -0.2]),
        ("C", [1.1, -0.9, 0.3, 0.5, 0.7, -0.6, 0.4, 1.3, -0.2]),
    ]
    for target, expected_vals in expected_by_target:
        observed_vals = sip.extract_bg_vals_from_sym(target, "group", bg_gct)
        self.assertItemsEqual(expected_vals, observed_vals)

    # Verify that assert statement works
    with self.assertRaises(AssertionError) as e:
        sip.extract_bg_vals_from_sym("D", "group", bg_gct)
    self.assertIn("D is not in the group metadata", str(e.exception))
def test_insert_offsets_and_prov_code(self):
    """The offsets must land in a new column metadata field and the joined
    prov code must overwrite the given field."""
    row_ids = ["a", "b", "c", "d"]
    col_ids = ["e", "f", "g"]

    in_gct = GCToo.GCToo(
        data_df=pd.DataFrame(
            [[1, 2, 3], [4, 5, 6], [7, 8, 9], [10, 11, 12]],
            index=row_ids, columns=col_ids),
        row_metadata_df=pd.DataFrame(
            [["rm1", "rm2"], ["rm3", "rm4"], ["rm5", "rm6"], ["rm7", "rm8"]],
            index=row_ids, columns=["row_field1", "row_field2"]),
        col_metadata_df=pd.DataFrame(
            [["cm1", "cm2"], ["cm3", "cm4"], ["cm5", "cm6"]],
            index=col_ids, columns=["col_field1", "col_field2"]))

    # Expected: col_field2 replaced by "A+B+C+D", offsets appended as a column
    e_col_meta = pd.DataFrame(
        [["cm1", "A+B+C+D", 3.0],
         ["cm3", "A+B+C+D", 5.0],
         ["cm5", "A+B+C+D", 8.0]],
        index=col_ids, columns=["col_field1", "col_field2", "offsets"])

    out_gct = dry.insert_offsets_and_prov_code(
        in_gct,
        np.array([3.0, 5.0, 8.0]),   # offsets
        "offsets",                   # offsets_field
        ["A", "B", "C", "D"],        # prov_code
        "col_field2",                # prov_code_field
        "+")                         # prov_code_delimiter

    matches = np.array_equal(out_gct.col_metadata_df, e_col_meta)
    self.assertTrue(matches)
def vstack(gctoos, remove_all_metadata_fields=False, error_report_file=None,
           fields_to_remove=None, reset_ids=False):
    """ Vertically concatenate gctoos.

    Args:
        gctoos (list of gctoo objects)
        remove_all_metadata_fields (bool): ignore/strip all common metadata
            when combining gctoos
        error_report_file (string): path to write file containing error report
            indicating problems that occurred during vstack, mainly for
            inconsistencies in common metadata
        fields_to_remove (list of strings): fields to be removed from the
            common metadata because they don't agree across files;
            defaults to no fields
        reset_ids (bool): set to True if row ids are not unique

    Return:
        concated (gctoo object)

    Raises:
        AssertionError: if the assembled data and metadata shapes disagree
    """
    # A fresh list per call instead of a shared mutable default argument
    if fields_to_remove is None:
        fields_to_remove = []

    # Separate each gctoo into its component dfs
    row_meta_dfs = []
    col_meta_dfs = []
    data_dfs = []
    srcs = []
    for g in gctoos:
        row_meta_dfs.append(g.row_metadata_df)
        col_meta_dfs.append(g.col_metadata_df)
        data_dfs.append(g.data_df)
        srcs.append(g.src)

    # Combine the col metadata, which is common to all gctoos
    all_col_metadata_df = assemble_common_meta(
        col_meta_dfs, fields_to_remove, srcs, remove_all_metadata_fields,
        error_report_file)

    # Concatenate row metadata
    all_row_metadata_df = assemble_concatenated_meta(
        row_meta_dfs, remove_all_metadata_fields)

    # Concatenate the data_dfs
    all_data_df = assemble_data(data_dfs, "vert")

    # Make sure df shapes are correct
    assert all_data_df.shape[0] == all_row_metadata_df.shape[0], \
        "Number of rows is incorrect."
    assert all_data_df.shape[1] == all_col_metadata_df.shape[0], \
        "Number of columns is incorrect."

    # If requested, reset the row ids of the stacked result
    if reset_ids:
        do_reset_ids(all_row_metadata_df, all_data_df, "vert")

    logger.info("Build GCToo of all...")
    concated = GCToo.GCToo(row_metadata_df=all_row_metadata_df,
                           col_metadata_df=all_col_metadata_df,
                           data_df=all_data_df)

    return concated
def test_main(self):
    """Round-trip a GCToo through wg.write and pg.parse, and check how
    missing values were spelled in the written file."""
    out_name = os.path.join(FUNCTIONAL_TESTS_PATH, "test_main_out.gct")

    gctoo = GCToo.GCToo(data_df=self.data_df,
                        row_metadata_df=self.row_metadata_df,
                        col_metadata_df=self.col_metadata_df)
    wg.write(gctoo, out_name, data_null="NaN", metadata_null="-666",
             filler_null="-666")

    # Register the cleanup right after the file exists so that it is
    # removed even when one of the assertions below fails
    self.addCleanup(os.remove, out_name)

    # Read in the gct and verify that it's the same as gctoo
    new_gct = pg.parse(out_name)

    pd.util.testing.assert_frame_equal(new_gct.data_df, gctoo.data_df)
    pd.util.testing.assert_frame_equal(new_gct.row_metadata_df,
                                       gctoo.row_metadata_df)
    pd.util.testing.assert_frame_equal(new_gct.col_metadata_df,
                                       gctoo.col_metadata_df)

    # Also check that missing values were written to the file as expected
    in_df = pd.read_csv(out_name, sep="\t", skiprows=2,
                        keep_default_na=False)
    self.assertEqual(in_df.iloc[0, 1], "-666")
    self.assertEqual(in_df.iloc[5, 6], "NaN")
def continuous_renormalization(args):
    """Renormalize a GCT using offsets fitted against well enrichment scores.

    Args:
        args: object with the attributes
            in_gct_path (string or falsy): GCT file to parse; when falsy,
                args.in_gct is used instead
            in_gct (GCToo object): used only when in_gct_path is falsy
            slope_cutoff: passed to is_log_renormed
            write_gct (bool): write the result instead of returning it
            out_name (string): output path, used only when write_gct is set

    Returns:
        GCToo object when args.write_gct is falsy, otherwise None (the
        result is written to args.out_name).
    """
    # Read in GCT, if path provided, and make deep copies of all DataFrames
    if args.in_gct_path:
        gct = parse.parse(args.in_gct_path)
    else:
        gct = args.in_gct

    data_df = gct.data_df.copy(deep=True)
    row_metadata_df = gct.row_metadata_df.copy(deep=True)
    col_metadata_df = gct.col_metadata_df.copy(deep=True)

    # Remove rows that are all NA values
    has_data = data_df.isnull().apply(np.sum, axis=1) < data_df.shape[1]
    data_df = data_df.loc[has_data, :]

    # Drop the same rows from the row metadata (keeping its own order) so
    # that the output metadata still matches the output matrix
    row_metadata_df = row_metadata_df.loc[
        row_metadata_df.index.isin(data_df.index), :].copy()

    # Pull out enrichment scores from column metadata dataframe
    enrichment_scores = col_metadata_df.loc[
        :, "det_well_enrichment_score"].copy(deep=True)

    # Calculate limit as x approaches 1 for non-median normalized data
    pep_y_offsets = calculate_y_offsets(data_df, enrichment_scores)

    # Calculate the fit parameters
    fit_parameters = calculate_fit(data_df, enrichment_scores, pep_y_offsets)

    # Annotate which rows will be renormalized based on slope_cutoff argument (default 0.2)
    row_metadata_df["is_log_renormed"] = is_log_renormed(
        fit_parameters.loc[:, "deg1"].apply(get_slope), args.slope_cutoff)

    # Calculate the offset matrix
    offset_mat = calculate_peptide_sample_offsets(
        data_df, row_metadata_df, enrichment_scores, fit_parameters,
        pep_y_offsets)

    # Calculate the output DataFrame
    out_data_df = calculate_out_matrix(data_df, offset_mat)

    # Add the 'renorm_correction' metadata field with total sample offset values
    col_metadata_df["renorm_correction"] = calculate_total_sample_offsets(
        offset_mat)

    # Output
    out_gct = GCToo.GCToo(data_df=out_data_df,
                          col_metadata_df=col_metadata_df,
                          row_metadata_df=row_metadata_df)

    if args.write_gct:
        write_gct.write(out_gct, args.out_name)
        return None

    return out_gct
def subset_gctoo(gctoo, row_bool=None, col_bool=None, rid=None, cid=None,
                 ridx=None, cidx=None, exclude_rid=None, exclude_cid=None):
    """ Return a new GCToo holding a subset of the rows and columns.

    Rows and columns keep the order they have in the input. At most one
    inclusion selector may be given per axis; the exclusion lists can be
    combined with it.

    Args:
        gctoo (GCToo object)
        row_bool (list of bools): length must equal gctoo.data_df.shape[0]
        col_bool (list of bools): length must equal gctoo.data_df.shape[1]
        rid (list of strings): rids to include
        cid (list of strings): cids to include
        ridx (list of integers): row integer ids to include
        cidx (list of integers): col integer ids to include
        exclude_rid (list of strings): rids to exclude
        exclude_cid (list of strings): cids to exclude

    Returns:
        out_gctoo (GCToo object): gctoo after subsetting

    Raises:
        AssertionError: if more than one inclusion selector is given for an
            axis, or if the subset is empty
    """
    num_row_selectors = sum(
        selector is not None for selector in (rid, row_bool, ridx))
    assert num_row_selectors <= 1, (
        "Only one of rid, row_bool, and ridx can be provided.")

    num_col_selectors = sum(
        selector is not None for selector in (cid, col_bool, cidx))
    assert num_col_selectors <= 1, (
        "Only one of cid, col_bool, and cidx can be provided.")

    # Resolve the selectors to the row and column labels to keep
    rows_to_keep = get_rows_to_keep(gctoo, rid, row_bool, ridx, exclude_rid)
    cols_to_keep = get_cols_to_keep(gctoo, cid, col_bool, cidx, exclude_cid)

    # Boolean masks over the existing index / columns preserve the order
    row_mask = gctoo.data_df.index.isin(rows_to_keep)
    col_mask = gctoo.data_df.columns.isin(cols_to_keep)

    # Make the output gct
    out_gctoo = GCToo.GCToo(
        src=gctoo.src,
        version=gctoo.version,
        data_df=gctoo.data_df.loc[row_mask, col_mask],
        row_metadata_df=gctoo.row_metadata_df.loc[row_mask, :],
        col_metadata_df=gctoo.col_metadata_df.loc[col_mask, :])

    assert out_gctoo.data_df.size > 0, "Subsetting yielded an empty gct!"

    in_rows, in_cols = gctoo.data_df.shape[0], gctoo.data_df.shape[1]
    out_rows = out_gctoo.data_df.shape[0]
    out_cols = out_gctoo.data_df.shape[1]
    logger.info(
        ("Initial GCToo with {} rows and {} columns subsetted down to " +
         "{} rows and {} columns.").format(in_rows, in_cols,
                                           out_rows, out_cols))

    return out_gctoo
def drop_less_than_2_replicates(modZ_GCT, cc_q75_df, cb_modZ_GCT, cb_cc_q75_df):
    """
    Remove every signature whose 'nprofile' value is below 2.

    The same filter is applied to each (GCT, cc_q75 table) pair: matching
    columns are dropped from a new GCToo, while the cc_q75 data frame is
    filtered in place.

    Returns:
        tuple: (modZ_GCT, cc_q75_df, cb_modZ_GCT, cb_cc_q75_df) after
        filtering; the two data frames are the input objects themselves
    """
    filtered = []

    for gct, q75_df in ((modZ_GCT, cc_q75_df), (cb_modZ_GCT, cb_cc_q75_df)):
        # Ids of the signatures built from fewer than 2 profiles
        too_few = q75_df[q75_df['nprofile'] < 2].index

        kept_data = gct.data_df.drop(too_few, axis=1)
        kept_col_meta = gct.col_metadata_df.drop(too_few)
        kept_gct = GCToo.GCToo(data_df=kept_data,
                               col_metadata_df=kept_col_meta,
                               row_metadata_df=gct.row_metadata_df)

        # The table itself is modified in place
        q75_df.drop(too_few, inplace=True)

        filtered.extend([kept_gct, q75_df])

    return tuple(filtered)
def setUpClass(cls):
    """Create the 4 x 3 input GCToo shared by the tests as ``cls.in_gct``."""
    row_ids = ["a", "b", "c", "d"]
    col_ids = ["e", "f", "g"]

    values = pd.DataFrame(
        [[1, 2, 3],
         [5, 7, 11],
         [13, 17, 19],
         [23, 29, 31]],
        index=row_ids, columns=col_ids)
    row_meta = pd.DataFrame(
        [["rm1", "rm2"],
         ["rm3", "rm4"],
         ["rm5", "rm6"],
         ["rm7", "rm8"]],
        index=row_ids, columns=["rhd1", "rh2"])
    col_meta = pd.DataFrame(
        [["cm1", "cm2"],
         ["cm3", "cm4"],
         ["cm5", "cm6"]],
        index=col_ids, columns=["chd1", "chd2"])

    cls.in_gct = GCToo.GCToo(values, row_meta, col_meta)
def create_gctoo_obj(file_path, version, row_metadata_df, col_metadata_df,
                     data_df, make_multiindex):
    """Bundle the given dataframes into a GCToo object.

    Args:
        file_path: stored as the GCToo's ``src``
        version: stored as the GCToo's ``version``
        row_metadata_df, col_metadata_df, data_df: passed through unchanged
        make_multiindex: forwarded to the GCToo constructor

    Returns:
        GCToo object
    """
    return GCToo.GCToo(src=file_path,
                       version=version,
                       row_metadata_df=row_metadata_df,
                       col_metadata_df=col_metadata_df,
                       data_df=data_df,
                       make_multiindex=make_multiindex)
def test_p100_calculate_dists_and_apply_offsets_if_needed(self):
    """Cover the three paths: P100 with optimization, P100 without
    optimization, and GCP (nothing computed)."""
    offset_bounds = (-2, 2)
    prov_code = ["PR1", "L2X", "filtering"]
    e_dists = [14.75, 0.0, 4.75]
    e_offsets = [3.25, 0.0, -2.25]
    e_prov_code = ["PR1", "L2X", "filtering", "LLB"]

    data = pd.DataFrame(
        [[1, 2, 3], [5, 7, 11], [13, 17, 19], [23, 29, 31]],
        index=["a", "b", "c", "d"], columns=["e", "f", "g"], dtype=float)
    row_meta = pd.DataFrame(
        [["rm1", "rm2"], ["rm3", "rm4"], ["rm5", "rm6"], ["rm7", "rm8"]],
        index=["a", "b", "c", "d"], columns=["row_field1", "row_field2"])
    col_meta = pd.DataFrame(
        [["cm1", "cm2"], ["cm3", "cm4"], ["cm5", "cm6"]],
        index=["e", "f", "g"], columns=["col_field1", "col_field2"])
    in_gct = GCToo.GCToo(data_df=data, row_metadata_df=row_meta,
                         col_metadata_df=col_meta)

    # P100 & optim: offsets are computed and the prov code gains "LLB"
    (out_gct, out_dists, out_offsets, out_prov_code) = (
        dry.p100_calculate_dists_and_apply_offsets_if_needed(
            in_gct, "p100", no_optim_bool=False,
            offset_bounds=offset_bounds, prov_code=prov_code,
            prov_code_entry="LLB"))

    self.assertTrue(np.allclose(out_offsets, e_offsets, atol=1e-2), (
        "out_offsets:\n{}\ne_offsets:\n{}".format(out_offsets, e_offsets)))
    self.assertTrue(np.allclose(out_dists, e_dists, atol=1e-2), (
        "out_dists:\n{}\ne_dists:\n{}".format(out_dists, e_dists)))
    self.assertEqual(out_prov_code, e_prov_code)

    # P100 but no optim: distances only, prov code untouched
    e_dists2 = [57, 0, 25]
    (out_gct, out_dists, out_offsets, out_prov_code) = (
        dry.p100_calculate_dists_and_apply_offsets_if_needed(
            in_gct, "p100", no_optim_bool=True,
            offset_bounds=offset_bounds, prov_code=prov_code,
            prov_code_entry="LLB"))

    self.assertTrue(np.allclose(out_dists, e_dists2, atol=1e-2), (
        "out_dists:\n{}\ne_dists2:\n{}".format(out_dists, e_dists2)))
    self.assertIsNone(out_offsets)
    self.assertEqual(out_prov_code, prov_code)

    # GCP: neither distances nor offsets are produced
    (out_gct, out_dists, out_offsets, out_prov_code) = (
        dry.p100_calculate_dists_and_apply_offsets_if_needed(
            in_gct, "gcp", no_optim_bool=True,
            offset_bounds=offset_bounds, prov_code=prov_code,
            prov_code_entry="LLB"))

    self.assertIsNone(out_dists)
    self.assertIsNone(out_offsets)
    self.assertEqual(out_prov_code, prov_code)
def shear(gctoo, bad_wells):
    """Return a new GCToo without the columns whose 'pert_well' is listed
    in bad_wells.

    Args:
        gctoo (GCToo object): column metadata must contain 'pert_well'
        bad_wells: collection of 'pert_well' values to remove

    Returns:
        GCToo object sharing the input's row metadata
    """
    col_meta = gctoo.col_metadata_df

    # Column ids that sit in one of the bad wells
    in_bad_well = col_meta['pert_well'].isin(bad_wells)
    remove = col_meta[in_bad_well].index

    return GCToo.GCToo(data_df=gctoo.data_df.drop(remove, axis=1),
                       col_metadata_df=col_meta.drop(remove),
                       row_metadata_df=gctoo.row_metadata_df)
def get_plate_qc_data_map_and_run(data_map, metadata_map, norm_cell_metadata,
                                  project_name, out_dir, invar):
    """Split every data level by plate and run plate_summary.plate_qc on it.

    For each unique 'prism_replicate' in the inst metadata, a per-plate
    GCToo is built for every key of data_map. 'count' and 'mfi' use all of
    the plate's instances with the full cell metadata. Every other level
    uses only the plate's instances that are columns of that level, with
    norm_cell_metadata as row metadata.

    Args:
        data_map (dict): data level name -> GCToo object
        metadata_map (dict): must contain 'inst', plus 'cell' when
            'count' or 'mfi' is a data level
        norm_cell_metadata (pandas df): row metadata for all other levels
        project_name: not used in this function
        out_dir: forwarded to plate_summary.plate_qc
        invar: forwarded to plate_summary.plate_qc
    """
    inst_df = metadata_map['inst']

    for plate in inst_df['prism_replicate'].unique():
        print(plate)

        on_plate = inst_df['prism_replicate'] == plate
        plate_data_map = {}

        for key in data_map:
            level_df = data_map[key].data_df

            if key in ('count', 'mfi'):
                plate_data_map[key] = GCToo.GCToo(
                    data_df=level_df[inst_df[on_plate].index],
                    col_metadata_df=inst_df[on_plate],
                    row_metadata_df=metadata_map['cell'])
            else:
                # Keep only the plate's instances that exist at this level
                present_ids = [x for x in inst_df[on_plate].index
                               if x in level_df.columns]
                plate_data_map[key] = GCToo.GCToo(
                    data_df=level_df.loc[:, present_ids],
                    col_metadata_df=inst_df.loc[present_ids],
                    row_metadata_df=norm_cell_metadata)

        print(plate_data_map['norm'].data_df.shape)

        plate_summary.plate_qc(out_dir, plate, plate_data_map, invar=invar)
def main(args):
    """Rank every square GCT matched by a wildcard and write one rank GCT
    per input.

    Reads args.in_dir, args.file_wildcard, args.prefix_separator,
    args.do_percentile_rank, args.out_dir and args.output_suffix.

    Raises:
        AssertionError: if fewer than two files match, or if an input
            matrix is not square
    """
    # Find files
    full_path_wildcard = args.in_dir + args.file_wildcard
    gct_paths = glob.glob(full_path_wildcard)
    assert len(gct_paths) > 1, "full_path_wildcard: {}".format(
        full_path_wildcard)

    # The part of each file name before the separator names the output file
    prefixes = []
    for path in gct_paths:
        file_name = os.path.basename(path)
        prefixes.append(file_name.split(args.prefix_separator)[0])

    for path, prefix in zip(gct_paths, prefixes):
        print("path: {}".format(path))
        print("prefix: {}".format(prefix))

    # Import gcts
    gctoos = [parse(gct_path) for gct_path in gct_paths]
    assert len(gctoos) > 1, "gct_paths: {}".format(gct_paths)

    # Compute & save ranks
    for g, prefix in zip(gctoos, prefixes):
        score_df = g.data_df

        # Must be square
        num_rows, num_cols = score_df.shape[0], score_df.shape[1]
        assert num_rows == num_cols, "Input dataframe must be square."

        # Set diagonal to NaN
        np.fill_diagonal(score_df.values, np.nan)

        # Rank the matrix (percentile score or not)
        if not args.do_percentile_rank:
            rank_df = score_df.rank(ascending=False)
        else:
            rank_df = score_df.rank(ascending=False, pct=True) * 100

        # Wrap the ranks in a GCToo that reuses the input metadata
        rank_gctoo = GCToo.GCToo(data_df=rank_df,
                                 row_metadata_df=g.row_metadata_df,
                                 col_metadata_df=g.col_metadata_df)

        # Save the ranks to file
        out_name = args.out_dir + prefix + args.output_suffix
        wg.write(rank_gctoo, out_name, filler_null="NaN", data_null="NaN",
                 metadata_null="NaN")
def test_p100_filter_samples_by_dist(self):
    # Checks dry.p100_filter_samples_by_dist for both assay types: with
    # "p100" the sample "f" is expected to be removed; with "gcp" the input
    # is expected to come back untouched.
    row_ids = ["a", "b", "c", "d"]
    col_ids = ["e", "f", "g"]
    kept_ids = ["e", "g"]
    meta_fields = ["col_field1", "col_field2"]

    # Inputs
    offsets = np.array([4, 3, 7], dtype=float)
    dists = np.array([1, 6, 2], dtype=float)
    dist_sd_cutoff = 1
    prov_code = ["A", "B"]

    data = pd.DataFrame(
        [[1, 2, 3], [4, 5, 6], [7, 8, 9], [10, 11, 12]],
        index=row_ids, columns=col_ids)
    row_meta = pd.DataFrame(
        [["rm1", "rm2"], ["rm3", "rm4"], ["rm5", "rm6"], ["rm7", "rm8"]],
        index=row_ids, columns=["row_field1", "row_field2"])
    col_meta = pd.DataFrame(
        [["cm1", "cm2"], ["cm3", "cm4"], ["cm5", "cm6"]],
        index=col_ids, columns=meta_fields)
    in_gct = GCToo.GCToo(data_df=data,
                         row_metadata_df=row_meta,
                         col_metadata_df=col_meta)

    # Expected P100 outputs
    e_data = pd.DataFrame(
        [[1, 3], [4, 6], [7, 9], [10, 12]],
        index=row_ids, columns=kept_ids)
    e_col_meta = pd.DataFrame(
        [["cm1", "cm2"], ["cm5", "cm6"]],
        index=kept_ids, columns=meta_fields)
    e_offsets = np.array([4, 7], dtype=float)
    e_remaining = ["e", "g"]
    e_prov_code = ["A", "B", "OSF1"]

    # P100
    p100_result = dry.p100_filter_samples_by_dist(
        in_gct, "p100", offsets, dists, dist_sd_cutoff, prov_code, "OSF")
    out_gct, out_offsets, out_remaining, out_prov_code = p100_result

    self.assertTrue(np.allclose(out_gct.data_df, e_data, atol=1e-2))
    self.assertTrue(np.array_equal(out_gct.col_metadata_df, e_col_meta))
    self.assertTrue(np.array_equal(out_gct.row_metadata_df, row_meta))
    self.assertTrue(np.allclose(out_offsets, e_offsets, atol=1e-2))
    self.assertEqual(out_remaining, e_remaining)
    self.assertEqual(out_prov_code, e_prov_code)

    # GCP
    gcp_result = dry.p100_filter_samples_by_dist(
        in_gct, "gcp", None, dists, dist_sd_cutoff, prov_code, "OSF")
    out_gct2, out_offsets2, out_remaining2, out_prov_code2 = gcp_result

    self.assertTrue(np.allclose(out_gct2.data_df, data, atol=1e-2))
    self.assertTrue(np.array_equal(out_gct2.col_metadata_df, col_meta))
    self.assertTrue(np.array_equal(out_gct2.row_metadata_df, row_meta))
    self.assertEqual(out_offsets2, None)
    self.assertEqual(out_remaining2, None)
    self.assertEqual(out_prov_code2, prov_code)
def do_steep_and_sip(external_gct, internal_gct, bg_gct, similarity_metric,
                     connectivity_metric,
                     fields_to_aggregate_for_external_profiles,
                     fields_to_aggregate_for_internal_profiles):
    """Compute similarities (steep) and then connectivities (sip).

    Similarity is computed between the data of ``internal_gct`` and
    ``external_gct``; the resulting similarity GCT is then compared against
    ``bg_gct`` to produce signed connectivities.

    Args:
        external_gct: GCToo of external profiles; its column metadata becomes
            the column metadata of the similarity GCT.
        internal_gct: GCToo of internal profiles; its column metadata becomes
            the row metadata of the similarity GCT.
        bg_gct: background similarity GCToo passed to the sip steps.
        similarity_metric: forwarded to
            ``steep.compute_similarity_bw_two_dfs`` and recorded in the
            similarity GCT metadata under ``SIMILARITY_METRIC_FIELD``.
        connectivity_metric: forwarded to ``sip.compute_connectivities`` and
            recorded in the connectivity GCT metadata under
            ``CONNECTIVITY_METRIC_FIELD``.
        fields_to_aggregate_for_external_profiles: metadata fields passed to
            ``sip.create_aggregated_fields_in_GCTs`` for the external
            profiles.
        fields_to_aggregate_for_internal_profiles: metadata fields passed to
            ``sip.create_aggregated_fields_in_GCTs`` for the internal
            profiles (used for both remaining field arguments).

    Returns:
        Tuple ``(sim_gct, signed_conn_gct)``.
    """
    #----------STEEP----------#

    # Compute similarity between external and internal profiles
    sim_df = steep.compute_similarity_bw_two_dfs(
        internal_gct.data_df, external_gct.data_df, similarity_metric)

    # Row metadata comes from the internal gct, column metadata from the
    # external gct. Work on copies so that annotating the similarity GCT
    # does not add a column to the callers' input GCTs.
    row_metadata_for_sim_df = internal_gct.col_metadata_df.copy()
    col_metadata_for_sim_df = external_gct.col_metadata_df.copy()

    # Append column to both metadata_dfs indicating which similarity_metric
    # was used
    row_metadata_for_sim_df[SIMILARITY_METRIC_FIELD] = similarity_metric
    col_metadata_for_sim_df[SIMILARITY_METRIC_FIELD] = similarity_metric

    # Assemble similarity gct
    sim_gct = GCToo.GCToo(sim_df, row_metadata_for_sim_df,
                          col_metadata_for_sim_df)

    #----------SIP----------#

    # Check symmetry
    (is_test_df_sym, is_bg_df_sym) = sip.check_symmetry(sim_gct.data_df,
                                                        bg_gct.data_df)

    # Create an aggregated metadata field for index and columns of both gcts
    # and sort by that field
    (test_gct, bg_gct) = sip.create_aggregated_fields_in_GCTs(
        sim_gct, bg_gct,
        fields_to_aggregate_for_external_profiles,
        fields_to_aggregate_for_internal_profiles,
        fields_to_aggregate_for_internal_profiles,
        QUERY_FIELD_NAME, TARGET_FIELD_NAME, SEPARATOR)

    # Compute connectivity; only the signed result is used
    (_, signed_conn_gct) = sip.compute_connectivities(
        test_gct, bg_gct, QUERY_FIELD_NAME, TARGET_FIELD_NAME,
        TARGET_FIELD_NAME, connectivity_metric, is_test_df_sym, SEPARATOR)

    # Append to queries a new column saying what connectivity metric was used
    sip.add_connectivity_metric_to_metadata(
        signed_conn_gct.col_metadata_df, connectivity_metric,
        CONNECTIVITY_METRIC_FIELD)
    sip.add_connectivity_metric_to_metadata(
        signed_conn_gct.row_metadata_df, connectivity_metric,
        CONNECTIVITY_METRIC_FIELD)

    return sim_gct, signed_conn_gct
def test_init(self):
    # Exercises GCToo construction without a multi-index, with a
    # multi-index, and from a data frame alone.

    # Create test data
    data_df = pd.DataFrame([[1, 2, 3], [4, 5, 6]],
                           index=["A", "B"], columns=["a", "b", "c"])
    row_metadata_df = pd.DataFrame(
        [["rhd_A", "rhd_B"], ["rhd_C", "rhd_D"]],
        index=["A", "B"], columns=["rhd1", "rhd2"])
    col_metadata_df = pd.DataFrame(["chd_a", "chd_b", "chd_c"],
                                   index=["a", "b", "c"], columns=["chd1"])

    # happy path, no multi-index
    my_gctoo1 = GCToo.GCToo(data_df=data_df,
                            row_metadata_df=row_metadata_df,
                            col_metadata_df=col_metadata_df)
    # Identity check: comparing a DataFrame to None with ``==`` is
    # element-wise and would raise inside assertTrue instead of reporting
    # a clean failure.
    self.assertIsNone(
        my_gctoo1.multi_index_df,
        'Expected no multi-index DataFrame but found {}'.format(
            my_gctoo1.multi_index_df))

    # happy path, with multi-index
    my_gctoo2 = GCToo.GCToo(data_df=data_df,
                            row_metadata_df=row_metadata_df,
                            col_metadata_df=col_metadata_df,
                            make_multiindex=True)
    # pd.MultiIndex is the public name for the class; the private
    # pd.core.index path is not available in newer pandas releases.
    self.assertIsInstance(
        my_gctoo2.multi_index_df.index, pd.MultiIndex,
        "Expected a multi_index DataFrame but instead found {}".format(
            my_gctoo2.multi_index_df))

    # happy path, no metadata provided
    my_gctoo3 = GCToo.GCToo(data_df)
    self.assertIsNotNone(my_gctoo3.row_metadata_df)
    self.assertIsNotNone(my_gctoo3.col_metadata_df)
def remove_outlier_invariants(gctoo, inv_threshold):
    """Return a new GCToo without columns whose invariant median is too low.

    The per-column median is taken over the rows listed in the module-level
    ``invariant_rids``. Columns whose median is strictly below
    ``inv_threshold`` are dropped, as are columns that contain only NaN.

    Args:
        gctoo: input GCToo; only read, never modified here.
        inv_threshold: minimum acceptable median of the invariant rows.

    Returns:
        A new GCToo with the remaining columns, the matching column
        metadata rows, and the input's row metadata.
    """
    invariant_medians = gctoo.data_df.loc[invariant_rids].median()
    low_columns = invariant_medians[invariant_medians < inv_threshold].index

    kept_data = gctoo.data_df.drop(low_columns, axis=1)
    kept_data = kept_data.dropna(axis=1, how='all')
    kept_col_meta = gctoo.col_metadata_df.loc[kept_data.columns]

    return GCToo.GCToo(data_df=kept_data,
                       col_metadata_df=kept_col_meta,
                       row_metadata_df=gctoo.row_metadata_df)