def parse(file_path, convert_neg_666=True, rid=None, cid=None, make_multiindex=False):
    """Read a gct file from disk and return it as a GCToo object.

    Args:
        - file_path (string): full path to the gct file to parse
        - convert_neg_666 (bool): when True, "-666" is added to the list of
            strings handed to parse_into_3_df as NaN markers (see Note).
            Default = True.
        - rid (list of strings): row ids to keep; None keeps all rids
        - cid (list of strings): col ids to keep; None keeps all cids
        - make_multiindex (bool): forwarded to create_gctoo_obj; whether to
            create a multi-index df combining the 3 component dfs

    Returns:
        gctoo_obj (GCToo object)

    Raises:
        Exception: if file_path does not exist on disk.

    Note: why does convert_neg_666 exist?
        In CMap, for historical reasons, "-666" is the null value used in
        metadata. Converting it to numpy.nan (the pandas default) lets users
        rely on pandas' own NaN handling, e.g. for filtering.
    """
    # Strings that should be read as missing values; "-666" is opt-in
    nan_values = [
        "#N/A", "N/A", "NA", "#NA", "NULL", "NaN", "-NaN",
        "nan", "-nan", "#N/A!", "na", "NA", "None",
    ] + (["-666"] if convert_neg_666 else [])

    # Fail early, and log, when the gct path does not exist
    if not os.path.exists(file_path):
        missing_msg = "The given path to the gct file cannot be found. gct_path: {}".format(
            file_path)
        logger.error(missing_msg)
        raise Exception(missing_msg)

    logger.info("Reading GCT: {}".format(file_path))

    # Header: version plus the four dimension counts
    header = read_version_and_dims(file_path)
    (version, num_data_rows, num_data_cols,
     num_row_metadata, num_col_metadata) = header

    # Body: row metadata, column metadata and the data matrix
    frames = parse_into_3_df(
        file_path, num_data_rows, num_data_cols,
        num_row_metadata, num_col_metadata, nan_values)
    (row_metadata, col_metadata, data) = frames

    # Assemble the three component dataframes into one GCToo object
    gctoo_obj = create_gctoo_obj(
        file_path, version, row_metadata, col_metadata, data, make_multiindex)

    # Nothing to slice unless the caller asked for specific rows or columns
    if rid is None and cid is None:
        return gctoo_obj

    return slice_gct.slice_gctoo(gctoo_obj, rid=rid, cid=cid)
def test_slice_cid_and_col_bool(self):
    """slice_gctoo must reject a call that supplies both cid and col_bool."""
    requested_cids = ["e", "f", "g"]
    column_mask = [True, True, False]

    with self.assertRaises(AssertionError) as raised:
        slice_gct.slice_gctoo(IN_GCT, cid=requested_cids, col_bool=column_mask)

    # The assertion message is expected to name the conflicting arguments
    message = str(raised.exception)
    self.assertIn("cid and col_bool", message)
def parse(file_path, convert_neg_666=True, rid=None, cid=None):
    """Read a gct file from disk and return it as a GCToo object.

    Args:
        Mandatory:
        - file_path (str): full path to the gct file you want to parse.

        Optional:
        - convert_neg_666 (bool): whether to convert -666 values to numpy.nan
            or not (see Note below for more details on this). Default = True.
        - rid (list of strings): list of row ids to specifically keep.
            Default=None.
        - cid (list of strings): list of col ids to specifically keep.
            Default=None.

    Returns:
        gctoo_obj: GCToo object

    Raises:
        Exception: if file_path does not exist on disk.

    Note: why is convert_neg_666 even a thing?
        - In CMap--for somewhat obscure historical reasons--we use "-666" as
          our null value for metadata. However (so that users can take full
          advantage of pandas' methods, including those for filtering nan's
          etc) we provide the option of converting these into numpy.nan
          values, the pandas default.
    """
    # Strings that are always read as missing values.
    # NOTE(review): "NA" is listed twice; kept as-is so the list passed on
    # to parse_into_3_df is unchanged.
    nan_values = [
        "#N/A", "N/A", "NA", "#NA", "NULL", "NaN", "-NaN",
        "nan", "-nan", "#N/A!", "na", "NA", "None"
    ]

    # "-666" is only treated as missing when the caller asks for it
    if convert_neg_666:
        nan_values.append("-666")

    # Verify that the gct path exists
    if not os.path.exists(file_path):
        err_msg = "The given path to the gct file cannot be found. gct_path: {}"
        logger.error(err_msg.format(file_path))
        raise Exception(err_msg.format(file_path))

    logger.info("Reading GCT: {}".format(file_path))

    # Read version and dimensions
    (version, num_data_rows, num_data_cols,
     num_row_metadata, num_col_metadata) = read_version_and_dims(file_path)

    # Read in metadata and data
    (row_metadata, col_metadata, data) = parse_into_3_df(
        file_path, num_data_rows, num_data_cols,
        num_row_metadata, num_col_metadata, nan_values)

    # Create the gctoo object and assemble 3 component dataframes
    gctoo_obj = create_gctoo_obj(file_path, version, row_metadata, col_metadata, data)

    # The slicer is always invoked, even when rid and cid are both None;
    # what it returns in that case is decided by slice_gct.slice_gctoo.
    gctoo_obj = slice_gct.slice_gctoo(gctoo_obj, rid=rid, cid=cid)

    return gctoo_obj
def test_slice_and_exclude_rids(self):
    """Slice IN_GCT with both rid and exclude_rid and check all three frames.

    The expected data and row metadata are positional rows 0 and 2 of
    IN_GCT; the column metadata is expected to be unchanged.
    """
    out_gct = slice_gct.slice_gctoo(IN_GCT, rid=["a", "c", "d"], exclude_rid=["d"])

    # pd.testing is the public location of assert_frame_equal;
    # pd.util.testing was deprecated in pandas 1.0 and removed in 2.0.
    pd.testing.assert_frame_equal(
        out_gct.data_df, IN_GCT.data_df.iloc[[0, 2], :])
    pd.testing.assert_frame_equal(
        out_gct.row_metadata_df, IN_GCT.row_metadata_df.iloc[[0, 2], :])

    # No column selection was requested, so column metadata must be untouched
    pd.testing.assert_frame_equal(
        out_gct.col_metadata_df, IN_GCT.col_metadata_df)
def test_slice_bools(self):
    """Slice IN_GCT with boolean row and column masks and check all three frames.

    The masks select positional rows 0 and 2 and positional column 2.
    """
    out_gct = slice_gct.slice_gctoo(
        IN_GCT,
        row_bool=[True, False, True, False],
        col_bool=[False, False, True])

    # pd.testing is the public location of assert_frame_equal;
    # pd.util.testing was deprecated in pandas 1.0 and removed in 2.0.

    # Outputs should be dataframes even if there is only 1 index or column,
    # so the single-column / single-row expectations are wrapped in DataFrame.
    pd.testing.assert_frame_equal(
        out_gct.data_df,
        pd.DataFrame(IN_GCT.data_df.iloc[[0, 2], 2]))
    pd.testing.assert_frame_equal(
        out_gct.row_metadata_df,
        IN_GCT.row_metadata_df.iloc[[0, 2], :])
    pd.testing.assert_frame_equal(
        out_gct.col_metadata_df,
        pd.DataFrame(IN_GCT.col_metadata_df.iloc[2, :]).T)
def test_parse(self):
    """Check parse_gctx.parse against in-memory GCToo objects.

    Covers: parsing the whole file, selecting by string rid/cid, selecting
    by integer rid/cid from a temporary int-indexed gctx file, and
    selecting by ridx/cidx alone or mixed with rid/cid.
    """

    def assert_same_frames(expected, actual):
        # Compare data, row metadata and column metadata, in that order
        assert_frame_equal(expected.data_df, actual.data_df)
        assert_frame_equal(expected.row_metadata_df, actual.row_metadata_df)
        assert_frame_equal(expected.col_metadata_df, actual.col_metadata_df)

    # parse whole thing
    mg1 = mini_gctoo_for_testing.make()
    mg2 = parse_gctx.parse("functional_tests/mini_gctoo_for_testing.gctx")
    assert_same_frames(mg1, mg2)

    # test with string rid/cid
    test_rids = [
        'LJP007_MCF10A_24H:TRT_CP:BRD-K93918653:3.33',
        'LJP007_MCF7_24H:CTL_VEHICLE:DMSO:-666'
    ]
    test_cids = ['LJP007_MCF7_24H:TRT_POSCON:BRD-A61304759:10']
    mg3 = slice_gct.slice_gctoo(mg1, rid=test_rids, cid=test_cids)
    mg4 = parse_gctx.parse("functional_tests/mini_gctoo_for_testing.gctx",
                           rid=test_rids, cid=test_cids)
    assert_same_frames(mg3, mg4)

    # first, make & write out temp version of mini_gctoo with int rids/cids
    new_mg = mini_gctoo_for_testing.make(convert_neg_666=False)
    int_indexed_data_df = new_mg.data_df.copy()
    int_indexed_data_df.index = range(0, 6)
    int_indexed_data_df.columns = range(10, 16)
    int_indexed_row_meta = new_mg.row_metadata_df.copy()
    int_indexed_row_meta.index = range(0, 6)
    int_indexed_col_meta = new_mg.col_metadata_df.copy()
    int_indexed_col_meta.index = range(10, 16)
    int_indexed_gctoo = GCToo.GCToo(data_df=int_indexed_data_df,
                                    row_metadata_df=int_indexed_row_meta,
                                    col_metadata_df=int_indexed_col_meta)
    write_gctx.write(int_indexed_gctoo, "int_indexed_mini_gctoo.gctx")

    # test with numeric (repr as string) rid/cid
    mg5 = GCToo.GCToo(data_df=int_indexed_data_df,
                      row_metadata_df=int_indexed_row_meta,
                      col_metadata_df=int_indexed_col_meta)
    mg5 = slice_gct.slice_gctoo(
        mg5,
        row_bool=[True, False, True, False, True, False],
        col_bool=[True, False, False, True, True, True])

    # Name the axes on the expected object before comparing with the parsed one
    mg5.data_df.index.name = "rid"
    mg5.data_df.columns.name = "cid"
    mg5.row_metadata_df.index.name = "rid"
    mg5.row_metadata_df.columns.name = "rhd"
    mg5.col_metadata_df.index.name = "cid"
    mg5.col_metadata_df.columns.name = "chd"

    # Remove the temporary file even if parsing raises, so a failing run
    # does not leave it behind in the working directory.
    try:
        mg6 = parse_gctx.parse("int_indexed_mini_gctoo.gctx",
                               rid=[0, 2, 4], cid=[10, 13, 14, 15],
                               convert_neg_666=False)
    finally:
        os.remove("int_indexed_mini_gctoo.gctx")

    assert_same_frames(mg5, mg6)

    # test with ridx/cidx
    # NOTE(review): cid is passed as a bare string here rather than a list;
    # confirm slice_gctoo is meant to accept that.
    mg7 = slice_gct.slice_gctoo(
        mg1,
        rid=['LJP007_MCF7_24H:CTL_VEHICLE:DMSO:-666'],
        cid='LJP007_MCF7_24H:CTL_VEHICLE:DMSO:-666')
    mg8 = parse_gctx.parse("functional_tests/mini_gctoo_for_testing.gctx",
                           ridx=[4], cidx=[4])
    assert_same_frames(mg7, mg8)

    # test with rid/cidx
    mg9 = parse_gctx.parse("functional_tests/mini_gctoo_for_testing.gctx",
                           rid=['LJP007_MCF7_24H:CTL_VEHICLE:DMSO:-666'],
                           cidx=[4])
    assert_same_frames(mg7, mg9)

    # test with ridx/cid
    mg10 = parse_gctx.parse("functional_tests/mini_gctoo_for_testing.gctx",
                            ridx=[4],
                            cid=['LJP007_MCF7_24H:CTL_VEHICLE:DMSO:-666'])
    assert_same_frames(mg7, mg10)