Exemple #1
0
def parse(file_path,
          convert_neg_666=True,
          rid=None,
          cid=None,
          make_multiindex=False):
    """Parse a gct file into a GCToo object.

    Args:
        - file_path (string): full path to the gct(x) file to parse
        - convert_neg_666 (bool): when True, "-666" is added to the list of
            strings passed to the parser as null markers (see Note below).
            Default = True.
        - rid (list of strings): row ids to keep; None keeps all rids
        - cid (list of strings): col ids to keep; None keeps all cids
        - make_multiindex (bool): whether to create a multi-index df combining
            the 3 component dfs (forwarded to create_gctoo_obj)

    Returns:
        gctoo_obj (GCToo object)

    Raises:
        Exception: if file_path does not exist on disk

    Note: why does convert_neg_666 exist?
        In CMap--for somewhat obscure historical reasons--"-666" is used as the
        null value for metadata. So that users can take full advantage of
        pandas' methods (including those for filtering nan's), converting these
        into numpy.nan values, the pandas default, is offered as an option.

    """
    # Bail out early when there is nothing to read
    if not os.path.exists(file_path):
        missing_msg = (
            "The given path to the gct file cannot be found. gct_path: {}"
        ).format(file_path)
        logger.error(missing_msg)
        raise Exception(missing_msg)
    logger.info("Reading GCT: {}".format(file_path))

    # Strings treated as null; "-666" joins them only on request
    null_markers = [
        "#N/A", "N/A", "NA", "#NA", "NULL", "NaN", "-NaN", "nan", "-nan",
        "#N/A!", "na", "NA", "None"
    ] + (["-666"] if convert_neg_666 else [])

    # Header: version plus the four dimension counts
    header = read_version_and_dims(file_path)
    version, n_data_rows, n_data_cols, n_row_meta, n_col_meta = header

    # Body: the two metadata frames and the data matrix
    row_meta_df, col_meta_df, data_df = parse_into_3_df(
        file_path, n_data_rows, n_data_cols, n_row_meta, n_col_meta,
        null_markers)

    # Assemble the three component dataframes into one object
    parsed = create_gctoo_obj(file_path, version, row_meta_df, col_meta_df,
                              data_df, make_multiindex)

    # Slice only when the caller asked for specific ids
    if rid is None and cid is None:
        return parsed
    return slice_gct.slice_gctoo(parsed, rid=rid, cid=cid)
 def test_slice_cid_and_col_bool(self):
     """slice_gctoo must raise when cid and col_bool are both supplied."""
     conflicting_selectors = dict(cid=["e", "f", "g"],
                                  col_bool=[True, True, False])

     with self.assertRaises(AssertionError) as raised:
         slice_gct.slice_gctoo(IN_GCT, **conflicting_selectors)

     # The error message should name the two conflicting arguments
     self.assertIn("cid and col_bool", str(raised.exception))
Exemple #3
0
def parse(file_path, convert_neg_666=True, rid=None, cid=None):
    """Parse a gct file into a GCToo object.

    Args:
        Mandatory:
        - file_path (str): full path to the gct file you want to parse.

        Optional:
        - convert_neg_666 (bool): whether to treat "-666" as a null value
            (see Note below for more details on this). Default = True.
        - rid (list of strings): list of row ids to specifically keep.
            Default = None.
        - cid (list of strings): list of col ids to specifically keep.
            Default = None.

    Returns:
        gctoo_obj: GCToo object

    Raises:
        Exception: if file_path does not exist on disk.

    Note: why is convert_neg_666 even a thing?
    - In CMap--for somewhat obscure historical reasons--we use "-666" as our
    null value for metadata. However (so that users can take full advantage of
    pandas' methods, including those for filtering nan's etc) we provide the
    option of converting these into numpy.NaN values, the pandas default.

    """
    # Strings handed to the parser as null markers. The list is built once and
    # "-666" is appended only when requested, instead of duplicating the whole
    # list in two branches.
    nan_values = [
        "#N/A", "N/A", "NA", "#NA", "NULL", "NaN", "-NaN", "nan", "-nan",
        "#N/A!", "na", "NA", "None"
    ]
    if convert_neg_666:
        nan_values.append("-666")

    # Verify that the gct path exists
    if not os.path.exists(file_path):
        err_msg = "The given path to the gct file cannot be found. gct_path: {}"
        logger.error(err_msg.format(file_path))
        raise Exception(err_msg.format(file_path))
    logger.info("Reading GCT: {}".format(file_path))

    # Read version and dimensions
    (version, num_data_rows, num_data_cols, num_row_metadata,
     num_col_metadata) = read_version_and_dims(file_path)

    # Read in metadata and data
    (row_metadata, col_metadata,
     data) = parse_into_3_df(file_path, num_data_rows, num_data_cols,
                             num_row_metadata, num_col_metadata, nan_values)

    # Create the gctoo object and assemble 3 component dataframes
    gctoo_obj = create_gctoo_obj(file_path, version, row_metadata,
                                 col_metadata, data)

    # Slice as specified by the user. slice_gctoo is called unconditionally,
    # i.e. also when rid and cid are both None.
    gctoo_obj = slice_gct.slice_gctoo(gctoo_obj, rid=rid, cid=cid)

    return gctoo_obj
    def test_slice_and_exclude_rids(self):
        """Combine rid selection with exclude_rid and check all three frames."""
        out_gct = slice_gct.slice_gctoo(IN_GCT,
                                        rid=["a", "c", "d"],
                                        exclude_rid=["d"])

        # Expected: only the rows at positions 0 and 2 of IN_GCT survive
        # (presumably rids "a" and "c" -- depends on the IN_GCT fixture);
        # the column metadata is left untouched.
        # pd.testing is the public home of assert_frame_equal;
        # pd.util.testing was deprecated in pandas 1.0 and removed in 2.0.
        pd.testing.assert_frame_equal(out_gct.data_df,
                                      IN_GCT.data_df.iloc[[0, 2], :])
        pd.testing.assert_frame_equal(
            out_gct.row_metadata_df, IN_GCT.row_metadata_df.iloc[[0, 2], :])
        pd.testing.assert_frame_equal(out_gct.col_metadata_df,
                                      IN_GCT.col_metadata_df)
    def test_slice_bools(self):
        """Slice with boolean row/column masks and check all three frames."""
        out_gct = slice_gct.slice_gctoo(IN_GCT,
                                        row_bool=[True, False, True, False],
                                        col_bool=[False, False, True])

        # Outputs should be dataframes even if there is only 1 index or column,
        # so the single-column / single-row expectations are wrapped back into
        # DataFrames before comparing.
        # pd.testing is the public home of assert_frame_equal;
        # pd.util.testing was deprecated in pandas 1.0 and removed in 2.0.
        pd.testing.assert_frame_equal(
            out_gct.data_df, pd.DataFrame(IN_GCT.data_df.iloc[[0, 2], 2]))
        pd.testing.assert_frame_equal(
            out_gct.row_metadata_df, IN_GCT.row_metadata_df.iloc[[0, 2], :])
        pd.testing.assert_frame_equal(
            out_gct.col_metadata_df,
            pd.DataFrame(IN_GCT.col_metadata_df.iloc[2, :]).T)
Exemple #6
0
    def test_parse(self):
        """Check parse_gctx.parse on the whole file and with id/index subsets.

        Each section compares the three frames of a parsed GCToo (data, row
        metadata, column metadata) against an expected GCToo built in memory
        or obtained via slice_gct.slice_gctoo.
        """
        # parse whole thing
        mg1 = mini_gctoo_for_testing.make()
        mg2 = parse_gctx.parse("functional_tests/mini_gctoo_for_testing.gctx")

        assert_frame_equal(mg1.data_df, mg2.data_df)
        assert_frame_equal(mg1.row_metadata_df, mg2.row_metadata_df)
        assert_frame_equal(mg1.col_metadata_df, mg2.col_metadata_df)

        # test with string rid/cid
        test_rids = [
            'LJP007_MCF10A_24H:TRT_CP:BRD-K93918653:3.33',
            'LJP007_MCF7_24H:CTL_VEHICLE:DMSO:-666'
        ]
        test_cids = ['LJP007_MCF7_24H:TRT_POSCON:BRD-A61304759:10']
        mg3 = slice_gct.slice_gctoo(mg1, rid=test_rids, cid=test_cids)
        mg4 = parse_gctx.parse("functional_tests/mini_gctoo_for_testing.gctx",
                               rid=test_rids,
                               cid=test_cids)
        assert_frame_equal(mg3.data_df, mg4.data_df)
        assert_frame_equal(mg3.row_metadata_df, mg4.row_metadata_df)
        assert_frame_equal(mg3.col_metadata_df, mg4.col_metadata_df)

        # first, make & write out temp version of mini_gctoo with int rids/cids
        new_mg = mini_gctoo_for_testing.make(convert_neg_666=False)
        int_indexed_data_df = new_mg.data_df.copy()
        int_indexed_data_df.index = range(0, 6)
        int_indexed_data_df.columns = range(10, 16)

        int_indexed_row_meta = new_mg.row_metadata_df.copy()
        int_indexed_row_meta.index = range(0, 6)

        int_indexed_col_meta = new_mg.col_metadata_df.copy()
        int_indexed_col_meta.index = range(10, 16)

        int_indexed_gctoo = GCToo.GCToo(data_df=int_indexed_data_df,
                                        row_metadata_df=int_indexed_row_meta,
                                        col_metadata_df=int_indexed_col_meta)

        tmp_gctx_path = "int_indexed_mini_gctoo.gctx"
        write_gctx.write(int_indexed_gctoo, tmp_gctx_path)

        # From here on the temp file exists; the finally clause guarantees it
        # is removed even if building the expectation or parsing fails.
        try:
            # test with integer rid/cid: build the expected subset in memory
            mg5 = GCToo.GCToo(data_df=int_indexed_data_df,
                              row_metadata_df=int_indexed_row_meta,
                              col_metadata_df=int_indexed_col_meta)
            mg5 = slice_gct.slice_gctoo(
                mg5,
                row_bool=[True, False, True, False, True, False],
                col_bool=[True, False, False, True, True, True])

            # Name the axes explicitly before comparing against the parsed
            # result (presumably these are the names parse assigns -- the
            # comparison below relies on them matching).
            mg5.data_df.index.name = "rid"
            mg5.data_df.columns.name = "cid"

            mg5.row_metadata_df.index.name = "rid"
            mg5.row_metadata_df.columns.name = "rhd"

            mg5.col_metadata_df.index.name = "cid"
            mg5.col_metadata_df.columns.name = "chd"

            mg6 = parse_gctx.parse(tmp_gctx_path,
                                   rid=[0, 2, 4],
                                   cid=[10, 13, 14, 15],
                                   convert_neg_666=False)
        finally:
            os.remove(tmp_gctx_path)

        assert_frame_equal(mg5.data_df, mg6.data_df)
        assert_frame_equal(mg5.row_metadata_df, mg6.row_metadata_df)
        assert_frame_equal(mg5.col_metadata_df, mg6.col_metadata_df)

        # test with ridx/cidx
        mg7 = slice_gct.slice_gctoo(
            mg1,
            rid=['LJP007_MCF7_24H:CTL_VEHICLE:DMSO:-666'],
            cid='LJP007_MCF7_24H:CTL_VEHICLE:DMSO:-666')
        mg8 = parse_gctx.parse("functional_tests/mini_gctoo_for_testing.gctx",
                               ridx=[4],
                               cidx=[4])

        assert_frame_equal(mg7.data_df, mg8.data_df)
        assert_frame_equal(mg7.row_metadata_df, mg8.row_metadata_df)
        assert_frame_equal(mg7.col_metadata_df, mg8.col_metadata_df)

        # test with rid/cidx
        mg9 = parse_gctx.parse("functional_tests/mini_gctoo_for_testing.gctx",
                               rid=['LJP007_MCF7_24H:CTL_VEHICLE:DMSO:-666'],
                               cidx=[4])

        assert_frame_equal(mg7.data_df, mg9.data_df)
        assert_frame_equal(mg7.row_metadata_df, mg9.row_metadata_df)
        assert_frame_equal(mg7.col_metadata_df, mg9.col_metadata_df)

        # test with ridx/cid
        mg10 = parse_gctx.parse("functional_tests/mini_gctoo_for_testing.gctx",
                                ridx=[4],
                                cid=['LJP007_MCF7_24H:CTL_VEHICLE:DMSO:-666'])

        assert_frame_equal(mg7.data_df, mg10.data_df)
        assert_frame_equal(mg7.row_metadata_df, mg10.row_metadata_df)
        assert_frame_equal(mg7.col_metadata_df, mg10.col_metadata_df)