def split_cop_data(data_map):
    """Split every GCToo in data_map into two GCToos by row id.

    Rows whose id contains the substring 'c-' go into one subset and all
    remaining rows go into the other.

    Args:
        data_map (dict): maps a string key to a GCToo object

    Returns:
        split_map (dict): for each input key, two entries: key + '_vi'
            (rows whose id contains 'c-') and key + '_gex' (all other rows)
    """
    split_map = {}
    for name in data_map:
        full_gct = data_map[name]
        row_ids = full_gct.data_df.index.tolist()

        # Partition the row ids on the presence of the 'c-' marker
        marked_rids = [rid for rid in row_ids if 'c-' in rid]
        unmarked_rids = [rid for rid in row_ids if 'c-' not in rid]

        split_map[name + '_vi'] = sub.subset_gctoo(full_gct, rid=marked_rids)
        split_map[name + '_gex'] = sub.subset_gctoo(full_gct, rid=unmarked_rids)
    return split_map
def separate(in_gct, separate_field, row_or_col):
    """Split a GCToo into one GCToo per unique value of a metadata field.

    Args:
        in_gct (GCToo object)
        separate_field (string): name of the metadata column to split on
        row_or_col (string): "row" to split on row metadata, "col" to split
            on column metadata

    Returns:
        gcts (list of GCToo objects): one subset per unique value
        unique_values_in_field (list of strings): the value that each entry
            of gcts corresponds to, in the same order

    Raises:
        AssertionError: if separate_field is not a column of the selected
            metadata dataframe
        Exception: if row_or_col is neither "row" nor "col"
    """
    # Pick the metadata df to split on and the matching subset_gctoo keyword
    if row_or_col == "row":
        meta_df = in_gct.row_metadata_df
        assert separate_field in meta_df.columns, (
            ("separate_field must be in in_gct.row_metadata_df.columns. " +
             "separate_field: {}, in_gct.row_metadata_df.columns: {}").format(
                 separate_field, meta_df.columns.values))
        bool_kwarg = "row_bool"

    elif row_or_col == "col":
        meta_df = in_gct.col_metadata_df
        assert separate_field in meta_df.columns, (
            ("separate_field must be in in_gct.col_metadata_df.columns. " +
             "separate_field: {}, in_gct.col_metadata_df.columns: {}").format(
                 separate_field, meta_df.columns.values))
        bool_kwarg = "col_bool"

    else:
        raise Exception("row or col must be 'row' or 'col'.")

    field_values = meta_df.loc[:, separate_field]
    unique_values_in_field = list(field_values.unique())

    # One boolean mask, and therefore one subset, per unique value
    gcts = [
        sg.subset_gctoo(in_gct, **{bool_kwarg: field_values.values == val})
        for val in unique_values_in_field
    ]

    # Make sure each gct is associated with a value from separate_field
    assert len(gcts) == len(unique_values_in_field), (
        "len(gcts): {}, len(unique_values_in_field): {}".format(
            len(gcts), len(unique_values_in_field)))

    return gcts, unique_values_in_field
def main(args):
    """Drop all-NaN rows from a GCT, optionally impute remaining NaNs, and write it.

    Args:
        args: parsed command-line arguments; this function reads
            args.in_gct_path (path of the input file),
            args.replace_with ("zero", "median" or "mean"; any other value
            leaves the remaining NaNs untouched) and
            args.out_name (path of the output file)

    Raises:
        AssertionError: if args.in_gct_path does not exist
    """
    # Import data
    assert os.path.exists(args.in_gct_path), (
        "in_gct_path could not be found: {}").format(args.in_gct_path)
    in_gct = parse.parse(args.in_gct_path)

    # First, check if any rows are all NaN; if so, remove them. The mask is
    # computed positionally so that it is correct even if row ids repeat.
    bools_of_remaining = in_gct.data_df.notnull().any(axis=1).values
    in_gct = sg.subset_gctoo(in_gct, row_bool=bools_of_remaining)

    if args.replace_with == "zero":
        in_gct.data_df.fillna(0, inplace=True)

    elif args.replace_with in ("median", "mean"):
        # Per-row statistic (NaNs are skipped by pandas by default)
        if args.replace_with == "median":
            row_fill_values = in_gct.data_df.median(axis=1)
        else:
            row_fill_values = in_gct.data_df.mean(axis=1)

        # Fill each NaN with the statistic of its own row. Aligning on the
        # index (axis=0) avoids integer lookups on a label-indexed Series,
        # which depend on deprecated positional fallback.
        filled = in_gct.data_df.where(
            in_gct.data_df.notnull(), row_fill_values, axis=0)

        # Write back in place so the GCToo keeps the same data_df object
        in_gct.data_df.iloc[:, :] = filled.values

    wg.write(in_gct, args.out_name, filler_null="NA")
def test_gctx_parsing(self):
    """Exercise parse.parse on the mini gctx fixture with each supported option.

    Covers: default parsing, convert_neg_666 on and off, subsetting by
    rid/cid and by ridx/cidx, row/col metadata-only parsing, and
    make_multiindex.
    """
    gctx_path = "../functional_tests/mini_gctoo_for_testing.gctx"
    vehicle_id = 'LJP007_MCF7_24H:CTL_VEHICLE:DMSO:-666'

    def check_same_frames(expected, observed):
        # Compare the three component dataframes of two GCToo objects
        pandas_testing.assert_frame_equal(expected.data_df, observed.data_df)
        pandas_testing.assert_frame_equal(
            expected.row_metadata_df, observed.row_metadata_df)
        pandas_testing.assert_frame_equal(
            expected.col_metadata_df, observed.col_metadata_df)

    # parse in gctx, no other arguments
    mg1 = mini_gctoo_for_testing.make()
    mg2 = parse.parse(gctx_path)
    check_same_frames(mg1, mg2)

    # check convert_neg_666 worked correctly
    self.assertTrue(mg2.col_metadata_df["mfc_plate_id"].isnull().all())

    # parse w/o convert_neg_666
    mg2_alt = parse.parse(gctx_path, convert_neg_666=False)
    self.assertFalse(mg2_alt.col_metadata_df["mfc_plate_id"].isnull().all())

    # parsing w/rids & cids specified
    test_rids = ['LJP007_MCF10A_24H:TRT_CP:BRD-K93918653:3.33', vehicle_id]
    test_cids = ['LJP007_MCF7_24H:TRT_POSCON:BRD-A61304759:10']
    mg3 = subset_gctoo.subset_gctoo(mg1, rid=test_rids, cid=test_cids)
    mg4 = parse.parse(gctx_path, rid=test_rids, cid=test_cids)
    check_same_frames(mg3, mg4)

    # parsing w/ridx & cidx specified
    mg5 = subset_gctoo.subset_gctoo(mg1, rid=[vehicle_id], cid=[vehicle_id])
    mg6 = parse.parse(gctx_path, ridx=[4], cidx=[4])
    check_same_frames(mg5, mg6)

    # parsing row metadata only
    mg7 = parse.parse(gctx_path, row_meta_only=True)
    pandas_testing.assert_frame_equal(mg7, mg1.row_metadata_df)

    # parsing col metadata only
    mg8 = parse.parse(gctx_path, col_meta_only=True)
    pandas_testing.assert_frame_equal(mg8, mg1.col_metadata_df)

    # parsing w/multiindex
    mg9 = parse.parse(gctx_path, make_multiindex=True)
    self.assertTrue(mg9.multi_index_df is not None)
def drop_nans(gctoo):
    """Remove the rows and columns of a GCToo that contain only nulls.

    Args:
        gctoo (GCToo object)

    Returns:
        The string 'empty_plate' if every column of gctoo.data_df is entirely
        null; otherwise a new GCToo with the all-null rows and all-null
        columns excluded.
    """
    null_mask = gctoo.data_df.isnull()

    # Boolean Series flagging the columns / rows in which every entry is null
    col_is_all_null = null_mask.all()
    row_is_all_null = null_mask.all(axis=1)

    nan_cols = col_is_all_null[col_is_all_null].index
    nan_rows = row_is_all_null[row_is_all_null].index

    # Nothing would be left after dropping every column
    if len(nan_cols) == gctoo.data_df.shape[1]:
        return 'empty_plate'

    return sub.subset_gctoo(gctoo, exclude_cid=nan_cols, exclude_rid=nan_rows)
def subset_main(args):
    """Subset a GCT or GCTx file according to command-line arguments and write it.

    Kept separate from main() in order to make testing easier and to enable
    command-line access.

    Args:
        args: parsed command-line arguments; this function reads args.rid,
            args.cid, args.exclude_rid, args.exclude_cid, args.in_path,
            args.out_type and args.out_name

    Raises:
        Exception: if the input is not a ".gct" file and exclude_rid or
            exclude_cid was provided
    """
    # Read in each of the command line arguments
    keep_rids = _read_arg(args.rid)
    keep_cids = _read_arg(args.cid)
    drop_rids = _read_arg(args.exclude_rid)
    drop_cids = _read_arg(args.exclude_cid)

    if args.in_path.endswith(".gct"):
        # GCT: parse the whole file, then subset in memory
        out_gct = sg.subset_gctoo(
            parse_gct.parse(args.in_path),
            rid=keep_rids, cid=keep_cids,
            exclude_rid=drop_rids, exclude_cid=drop_cids)
    else:
        # GCTx: let parse_gctx do the subsetting while reading
        if not (drop_rids is None and drop_cids is None):
            raise Exception(
                "exclude_{rid,cid} args not currently supported for parse_gctx.")

        logger.info("Using hyperslab selection functionality of parse_gctx...")
        out_gct = parse_gctx.parse(args.in_path, rid=keep_rids, cid=keep_cids)

    # Write the output gct
    if args.out_type == "gctx":
        wgx.write(out_gct, args.out_name)
        return

    wg.write(out_gct, args.out_name,
             data_null="NaN", metadata_null="NA", filler_null="NA")
def test_subset_gctoo(self):
    """Check subset_gctoo's argument validation and one successful subset."""
    # Error if resulting GCT is empty
    with self.assertRaises(AssertionError) as e:
        sg.subset_gctoo(self.in_gct, rid=["bad"], cid=["x", "y"])
    self.assertIn("Subsetting yielded an", str(e.exception))

    # cid and col_bool should not both be provided
    with self.assertRaises(AssertionError) as e:
        sg.subset_gctoo(self.in_gct, cid=["e", "f", "g"],
                        col_bool=[True, True, False])
    self.assertIn("Only one of cid,", str(e.exception))

    # Providing all 3 row inputs is also bad!
    with self.assertRaises(AssertionError) as e:
        sg.subset_gctoo(self.in_gct, rid="blah", ridx="bloop", row_bool="no!")
    self.assertIn("Only one of rid,", str(e.exception))

    # happy path
    out_g = sg.subset_gctoo(self.in_gct, rid=["d", "a", "b"], cidx=[0],
                            exclude_rid=["a"])

    # pd.util.testing is deprecated and was removed in pandas 2.0;
    # pd.testing is the supported public location of assert_frame_equal.
    pd.testing.assert_frame_equal(
        out_g.data_df, self.in_gct.data_df.iloc[[1, 3], [0]])
def parse(file_path, convert_neg_666=True, rid=None, cid=None,
          ridx=None, cidx=None, row_meta_only=False, col_meta_only=False,
          make_multiindex=False):
    """
    The main method.

    Args:
        - file_path (string): full path to gct(x) file you want to parse
        - convert_neg_666 (bool): whether to convert -666 values to numpy.nan
            (see Note below for more details). Default = True.
        - rid (list of strings): list of row ids to specifically keep from gct.
            Default=None.
        - cid (list of strings): list of col ids to specifically keep from gct.
            Default=None.
        - ridx (list of integers): only keep the rows corresponding to this
            list of integer ids. Default=None.
        - cidx (list of integers): only keep the columns corresponding to this
            list of integer ids. Default=None.
        - row_meta_only (bool): Whether to load data + metadata (if False), or
            just row metadata (if True) as pandas DataFrame. Default = False.
        - col_meta_only (bool): Whether to load data + metadata (if False), or
            just col metadata (if True) as pandas DataFrame. Default = False.
        - make_multiindex (bool): whether to create a multi-index df combining
            the 3 component dfs. Default = False.

    Returns:
        - myGCToo (GCToo object): A GCToo instance containing content of
            parsed gct file ** OR **
        - row_metadata (pandas df) ** OR ** col_metadata (pandas df)

    Raises:
        - AssertionError: if both row_meta_only and col_meta_only are True
        - Exception: if file_path does not exist

    Note: why is convert_neg_666 even a thing?
        In CMap--for somewhat obscure historical reasons--we use "-666" as our
        null value for metadata. However (so that users can take full
        advantage of pandas' methods, including those for filtering nan's etc)
        we provide the option of converting these into numpy.nan values, the
        pandas default.

    Note: the whole file is always read first; any rid/cid/ridx/cidx
        subsetting is applied afterwards, so subsetting gives no speed gains.
    """
    assert sum([row_meta_only, col_meta_only]) <= 1, (
        "row_meta_only and col_meta_only cannot both be requested.")

    # Strings that should be read in as null values
    nan_values = [
        "#N/A", "N/A", "NA", "#NA", "NULL", "NaN", "-NaN",
        "nan", "-nan", "#N/A!", "na", "None", "#VALUE!"]

    # Add "-666" to the list of NaN values
    if convert_neg_666:
        nan_values.append("-666")

    # Verify that the gct path exists
    if not os.path.exists(file_path):
        err_msg = "The given path to the gct file cannot be found. gct_path: {}"
        logger.error(err_msg.format(file_path))
        raise Exception(err_msg.format(file_path))

    logger.info("Reading GCT: {}".format(file_path))

    # Read version and dimensions
    (version, num_data_rows, num_data_cols,
     num_row_metadata, num_col_metadata) = read_version_and_dims(file_path)

    # Read in metadata and data
    (row_metadata, col_metadata, data) = parse_into_3_df(
        file_path, num_data_rows, num_data_cols,
        num_row_metadata, num_col_metadata, nan_values)

    # Create the gctoo object and assemble 3 component dataframes
    # Not the most efficient if only metadata requested (i.e. creating the
    # whole GCToo just to return the metadata df), but simplest
    myGCToo = create_gctoo_obj(file_path, version, row_metadata, col_metadata,
                               data, make_multiindex)

    # Subset if requested
    if (rid is not None) or (ridx is not None) or (cid is not None) or (cidx is not None):
        logger.info("Subsetting GCT... (note that there are no speed gains when subsetting GCTs)")
        myGCToo = sg.subset_gctoo(myGCToo, rid=rid, cid=cid, ridx=ridx, cidx=cidx)

    if row_meta_only:
        return myGCToo.row_metadata_df

    elif col_meta_only:
        return myGCToo.col_metadata_df

    else:
        return myGCToo
# time.clock() was removed in Python 3.8. timeit.default_timer exists on both
# Python 2 and 3 and measures elapsed (wall-clock) time, so it is imported
# here to keep this timing section self-contained.
from timeit import default_timer

# for storing timing results
gct_times = {}
gctx_times = {}

# large input gctx; see notes above for more info about this
big_gctoo = parse.parse("/path/to/large/gctx/file")

# column and row spaces to test writing on
col_spaces = [96, 384, 1536, 3000, 6000, 12000, 24000, 48000, 100000]
row_spaces = [978, 10174]

for c in col_spaces:
    for r in row_spaces:
        curr_gctoo = sg.subset_gctoo(big_gctoo, ridx=range(0, r), cidx=range(0, c))

        # gct writing
        out_fname = "write_test_n" + str(c) + "x" + str(r) + ".gct"
        start = default_timer()
        write_gct.write(curr_gctoo, out_fname)
        end = default_timer()
        elapsed_time = end - start
        gct_times[out_fname] = elapsed_time
        os.remove(out_fname)

        # gctx writing
        out_fname = "write_test_n" + str(c) + "x" + str(r) + ".gctx"
        start = default_timer()
        write_gctx.write(curr_gctoo, out_fname)
        end = default_timer()
        elapsed_time = end - start
        # Record and clean up exactly like the gct branch above; otherwise
        # the measurement is discarded and the output file is left on disk.
        gctx_times[out_fname] = elapsed_time
        os.remove(out_fname)
def test_parse(self):
    """Exercise parse_gctx.parse on the mini gctx fixture.

    Covers: whole-file parsing, subsetting by rid/cid (string ids and
    numeric-looking string ids), by ridx/cidx, mixed id/index subsetting,
    metadata-only parsing, and the sort_row_meta / sort_col_meta flags.
    """
    # parse whole thing
    mg1 = mini_gctoo_for_testing.make()
    mg2 = parse_gctx.parse(
        "cmapPy/pandasGEXpress/tests/functional_tests/mini_gctoo_for_testing.gctx")

    pandas_testing.assert_frame_equal(mg1.data_df, mg2.data_df)
    pandas_testing.assert_frame_equal(mg1.row_metadata_df, mg2.row_metadata_df)
    pandas_testing.assert_frame_equal(mg1.col_metadata_df, mg2.col_metadata_df)

    # test with string rid/cid
    test_rids = [
        'LJP007_MCF10A_24H:TRT_CP:BRD-K93918653:3.33',
        'LJP007_MCF7_24H:CTL_VEHICLE:DMSO:-666'
    ]
    test_cids = ['LJP007_MCF7_24H:TRT_POSCON:BRD-A61304759:10']
    mg3 = subset_gctoo.subset_gctoo(mg1, rid=test_rids, cid=test_cids)
    mg4 = parse_gctx.parse(
        "cmapPy/pandasGEXpress/tests/functional_tests/mini_gctoo_for_testing.gctx",
        rid=test_rids, cid=test_cids)
    pandas_testing.assert_frame_equal(mg3.data_df, mg4.data_df)
    pandas_testing.assert_frame_equal(mg3.row_metadata_df, mg4.row_metadata_df)
    pandas_testing.assert_frame_equal(mg3.col_metadata_df, mg4.col_metadata_df)

    # first, make & write out temp version of mini_gctoo with int rids/cids
    new_mg = mini_gctoo_for_testing.make(convert_neg_666=False)
    int_indexed_data_df = new_mg.data_df.copy()
    int_indexed_data_df.index = [str(i) for i in range(0, 6)]
    int_indexed_data_df.columns = [str(i) for i in range(10, 16)]

    int_indexed_row_meta = new_mg.row_metadata_df.copy()
    int_indexed_row_meta.index = int_indexed_data_df.index

    int_indexed_col_meta = new_mg.col_metadata_df.copy()
    int_indexed_col_meta.index = int_indexed_data_df.columns

    int_indexed_gctoo = GCToo.GCToo(data_df=int_indexed_data_df,
                                    row_metadata_df=int_indexed_row_meta,
                                    col_metadata_df=int_indexed_col_meta)

    write_gctx.write(int_indexed_gctoo, "int_indexed_mini_gctoo.gctx")

    # The temp file is removed even if building the expected GCToo or
    # parsing fails, so a failing run does not leave it behind.
    try:
        # test with numeric (repr as string) rid/cid
        mg5 = GCToo.GCToo(data_df=int_indexed_data_df,
                          row_metadata_df=int_indexed_row_meta,
                          col_metadata_df=int_indexed_col_meta)
        mg5 = subset_gctoo.subset_gctoo(
            mg5,
            row_bool=[True, False, True, False, True, False],
            col_bool=[True, False, False, True, True, True])

        mg5.data_df.index.name = "rid"
        mg5.data_df.columns.name = "cid"

        mg5.row_metadata_df.index.name = "rid"
        mg5.row_metadata_df.columns.name = "rhd"

        mg5.col_metadata_df.index.name = "cid"
        mg5.col_metadata_df.columns.name = "chd"

        mg6 = parse_gctx.parse("int_indexed_mini_gctoo.gctx",
                               rid=["0", "2", "4"],
                               cid=["10", "13", "14", "15"],
                               convert_neg_666=False)
    finally:
        os.remove("int_indexed_mini_gctoo.gctx")

    pandas_testing.assert_frame_equal(mg5.data_df, mg6.data_df)
    pandas_testing.assert_frame_equal(mg5.row_metadata_df, mg6.row_metadata_df)
    pandas_testing.assert_frame_equal(mg5.col_metadata_df, mg6.col_metadata_df)

    # test with ridx/cidx
    mg7 = subset_gctoo.subset_gctoo(
        mg1,
        rid=['LJP007_MCF7_24H:CTL_VEHICLE:DMSO:-666'],
        cid=['LJP007_MCF7_24H:CTL_VEHICLE:DMSO:-666'])
    mg8 = parse_gctx.parse(
        "cmapPy/pandasGEXpress/tests/functional_tests/mini_gctoo_for_testing.gctx",
        ridx=[4], cidx=[4])

    pandas_testing.assert_frame_equal(mg7.data_df, mg8.data_df)
    pandas_testing.assert_frame_equal(mg7.row_metadata_df, mg8.row_metadata_df)
    pandas_testing.assert_frame_equal(mg7.col_metadata_df, mg8.col_metadata_df)

    # test with rid/cidx
    mg9 = parse_gctx.parse(
        "cmapPy/pandasGEXpress/tests/functional_tests/mini_gctoo_for_testing.gctx",
        rid=['LJP007_MCF7_24H:CTL_VEHICLE:DMSO:-666'],
        cidx=[4])

    pandas_testing.assert_frame_equal(mg7.data_df, mg9.data_df)
    pandas_testing.assert_frame_equal(mg7.row_metadata_df, mg9.row_metadata_df)
    pandas_testing.assert_frame_equal(mg7.col_metadata_df, mg9.col_metadata_df)

    # test with ridx/cid
    mg10 = parse_gctx.parse(
        "cmapPy/pandasGEXpress/tests/functional_tests/mini_gctoo_for_testing.gctx",
        ridx=[4],
        cid=['LJP007_MCF7_24H:CTL_VEHICLE:DMSO:-666'])

    pandas_testing.assert_frame_equal(mg7.data_df, mg10.data_df)
    pandas_testing.assert_frame_equal(mg7.row_metadata_df, mg10.row_metadata_df)
    pandas_testing.assert_frame_equal(mg7.col_metadata_df, mg10.col_metadata_df)

    # test with row_meta_only
    mg11 = parse_gctx.parse(
        "cmapPy/pandasGEXpress/tests/functional_tests/mini_gctoo_for_testing.gctx",
        row_meta_only=True)
    pandas_testing.assert_frame_equal(mg11, mg1.row_metadata_df)

    # test with col_meta_only
    mg12 = parse_gctx.parse(
        "cmapPy/pandasGEXpress/tests/functional_tests/mini_gctoo_for_testing.gctx",
        col_meta_only=True)
    pandas_testing.assert_frame_equal(mg12, mg1.col_metadata_df)

    # (An argument-less parse used to sit here under a "sort_row_meta False
    # and ridx" comment; its result was overwritten immediately and it
    # repeated the whole-file parse at the top, so it was dropped.)

    # test with sort_col_meta False and cidx
    mg13 = parse_gctx.parse(
        "cmapPy/pandasGEXpress/tests/functional_tests//mini_gctoo_for_testing.gctx",
        cidx=[4, 1, 3], sort_col_meta=False)

    pandas_testing.assert_frame_equal(mg13.data_df, mg1.data_df.iloc[:, [4, 1, 3]])
    pandas_testing.assert_frame_equal(
        mg13.col_metadata_df, mg1.col_metadata_df.iloc[[4, 1, 3], :])
    pandas_testing.assert_frame_equal(mg13.row_metadata_df, mg1.row_metadata_df)

    # test with sort_row_meta False and ridx
    mg14 = parse_gctx.parse(
        "cmapPy/pandasGEXpress/tests/functional_tests//mini_gctoo_for_testing.gctx",
        ridx=[3, 0, 1], sort_row_meta=False)

    pandas_testing.assert_frame_equal(mg14.data_df, mg1.data_df.iloc[[3, 0, 1], :])
    pandas_testing.assert_frame_equal(mg14.col_metadata_df, mg1.col_metadata_df)
    pandas_testing.assert_frame_equal(
        mg14.row_metadata_df, mg1.row_metadata_df.iloc[[3, 0, 1], :])

    # test with sort_col_meta False and cidx and col_meta_only
    mg15 = parse_gctx.parse(
        "cmapPy/pandasGEXpress/tests/functional_tests//mini_gctoo_for_testing.gctx",
        cidx=[4, 1, 3], sort_col_meta=False, col_meta_only=True)
    pandas_testing.assert_frame_equal(
        mg15, mg1.col_metadata_df.iloc[[4, 1, 3], :])

    # test with sort_row_meta False and ridx and row_meta_only
    mg16 = parse_gctx.parse(
        "cmapPy/pandasGEXpress/tests/functional_tests//mini_gctoo_for_testing.gctx",
        ridx=[3, 0, 1], sort_row_meta=False, row_meta_only=True)
    pandas_testing.assert_frame_equal(
        mg16, mg1.row_metadata_df.iloc[[3, 0, 1], :])

    # test with sort_col_meta False and cid
    cid_unsorted = [
        'LJP007_MCF7_24H:TRT_POSCON:BRD-K81418486:10',
        'LJP007_MCF10A_24H:TRT_CP:BRD-K93918653:3.33'
    ]
    mg17 = parse_gctx.parse(
        "cmapPy/pandasGEXpress/tests/functional_tests//mini_gctoo_for_testing.gctx",
        cid=cid_unsorted, sort_col_meta=False)
    pandas_testing.assert_frame_equal(mg17.data_df, mg1.data_df.iloc[:, [2, 0]])
    pandas_testing.assert_frame_equal(
        mg17.col_metadata_df, mg1.col_metadata_df.iloc[[2, 0], :])
    pandas_testing.assert_frame_equal(mg17.row_metadata_df, mg1.row_metadata_df)

    # test with sort_row_meta False and rid
    rid_unsorted = [
        'LJP007_MCF7_24H:TRT_CP:BRD-K64857848:10',
        'MISC003_A375_24H:TRT_CP:BRD-K93918653:3.33'
    ]
    mg18 = parse_gctx.parse(
        "cmapPy/pandasGEXpress/tests/functional_tests/mini_gctoo_for_testing.gctx",
        rid=rid_unsorted, sort_row_meta=False)
    pandas_testing.assert_frame_equal(mg18.data_df, mg1.data_df.iloc[[5, 1], :])
    pandas_testing.assert_frame_equal(mg18.col_metadata_df, mg1.col_metadata_df)
    pandas_testing.assert_frame_equal(
        mg18.row_metadata_df, mg1.row_metadata_df.iloc[[5, 1], :])