Ejemplo n.º 1
0
def split_cop_data(data_map):
    """Split every GCT in ``data_map`` into two row subsets.

    Rows whose id contains the substring ``'c-'`` go to the ``_vi`` subset;
    every other row goes to the ``_gex`` subset.

    Args:
        data_map (dict): maps a string key to an object exposing ``data_df``
            that ``sub.subset_gctoo`` can subset.

    Returns:
        dict: for each input ``key``, two entries: ``key + '_vi'`` and
            ``key + '_gex'``.
    """
    split_map = {}
    for name, gct in data_map.items():
        row_ids = gct.data_df.index.tolist()
        vi_rids = [row_id for row_id in row_ids if 'c-' in row_id]
        gex_rids = [row_id for row_id in row_ids if 'c-' not in row_id]

        # Build both subsets before storing either, as a failed subset must
        # not leave a half-populated pair behind.
        vi_subset = sub.subset_gctoo(gct, rid=vi_rids)
        gex_subset = sub.subset_gctoo(gct, rid=gex_rids)

        split_map[name + '_vi'] = vi_subset
        split_map[name + '_gex'] = gex_subset

    return split_map
Ejemplo n.º 2
0
def separate(in_gct, separate_field, row_or_col):
    """ Split a GCT into one GCT per unique value of a metadata field.

    Args:
        in_gct (GCToo object): the GCT to split
        separate_field (string): metadata column whose unique values define
            the groups
        row_or_col (string): "row" to group on row metadata (rows are
            subset), "col" to group on column metadata (columns are subset)

    Returns:
        gcts (list of GCToo objects): one subset per unique value
        unique_values_in_field (list of strings): the value that produced
            the GCT at the same position in gcts

    Raises:
        Exception: if row_or_col is neither "row" nor "col"
        AssertionError: if separate_field is not a column of the selected
            metadata dataframe

    """
    # Pick the metadata table, the subset_gctoo keyword and the error text
    # for the requested axis; everything after this is axis-agnostic.
    if row_or_col == "row":
        metadata_df = in_gct.row_metadata_df
        bool_kwarg = "row_bool"
        missing_field_msg = (
            "separate_field must be in in_gct.row_metadata_df.columns. " +
            "separate_field: {}, in_gct.row_metadata_df.columns: {}")

    elif row_or_col == "col":
        metadata_df = in_gct.col_metadata_df
        bool_kwarg = "col_bool"
        missing_field_msg = (
            "separate_field must be in in_gct.col_metadata_df.columns. " +
            "separate_field: {}, in_gct.col_metadata_df.columns: {}")

    else:
        raise Exception("row or col must be 'row' or 'col'.")

    assert separate_field in metadata_df.columns, missing_field_msg.format(
        separate_field, metadata_df.columns.values)

    field_values = metadata_df.loc[:, separate_field]
    unique_values_in_field = list(field_values.unique())

    # NOTE(review): a NaN among the unique values would presumably give an
    # all-False mask here (NaN != NaN) -- confirm how subset_gctoo reacts.
    gcts = [
        sg.subset_gctoo(in_gct, **{bool_kwarg: field_values.values == val})
        for val in unique_values_in_field
    ]

    # Make sure each gct is associated with a value from separate_field
    assert len(gcts) == len(unique_values_in_field), (
        "len(gcts): {}, len(unique_values_in_field): {}".format(
            len(gcts), len(unique_values_in_field)))

    return gcts, unique_values_in_field
Ejemplo n.º 3
0
def main(args):
    """Impute missing values in a GCT file and write the result.

    Rows that are entirely NaN are removed first. The remaining NaNs are
    then replaced according to ``args.replace_with``:

        - "zero": every NaN becomes 0
        - "median": every NaN becomes the median of its own row
        - "mean": every NaN becomes the mean of its own row

    Any other value leaves the remaining NaNs untouched.

    Args:
        args: object with attributes ``in_gct_path`` (path of the GCT to
            read), ``replace_with`` (see above) and ``out_name`` (path of
            the GCT to write)

    Raises:
        AssertionError: if ``args.in_gct_path`` does not exist
    """
    # Import data
    assert os.path.exists(
        args.in_gct_path), ("in_gct_path could not be found: {}").format(
            args.in_gct_path)
    in_gct = parse.parse(args.in_gct_path)

    # First, check if any rows are all NaN; if so, remove them.
    # The mask is positional (one bool per row) so it stays correct even
    # when row ids are duplicated.
    bools_of_remaining = in_gct.data_df.notnull().any(axis=1).values
    in_gct = sg.subset_gctoo(in_gct, row_bool=bools_of_remaining)

    if args.replace_with == "zero":
        in_gct.data_df.fillna(0, inplace=True)

    elif args.replace_with in ("median", "mean"):
        if args.replace_with == "median":
            row_fill_values = in_gct.data_df.median(axis=1)
        else:
            row_fill_values = in_gct.data_df.mean(axis=1)

        # Walk the fill values positionally (.values) rather than indexing
        # the Series with an integer: the Series is labelled by row id, so
        # integer indexing is label-based on integer ids and a deprecated
        # positional fallback otherwise.
        for row_idx, fill_value in enumerate(row_fill_values.values):
            this_row = in_gct.data_df.iloc[row_idx, :]
            # fillna returns a new Series, so the row is never mutated
            # through a possibly-copied slice of data_df.
            in_gct.data_df.iloc[row_idx, :] = this_row.fillna(fill_value)

    wg.write(in_gct, args.out_name, filler_null="NA")
Ejemplo n.º 4
0
    def test_gctx_parsing(self):
        """Parse the mini GCTX fixture with each supported option and compare
        the result against the in-memory reference GCToo."""
        gctx_path = "../functional_tests/mini_gctoo_for_testing.gctx"
        ctl_vehicle_id = 'LJP007_MCF7_24H:CTL_VEHICLE:DMSO:-666'

        def assert_same_frames(expected, actual):
            # Compare the three component dataframes of two GCToo objects
            pandas_testing.assert_frame_equal(expected.data_df, actual.data_df)
            pandas_testing.assert_frame_equal(expected.row_metadata_df,
                                              actual.row_metadata_df)
            pandas_testing.assert_frame_equal(expected.col_metadata_df,
                                              actual.col_metadata_df)

        # plain parse, no optional arguments
        reference = mini_gctoo_for_testing.make()
        parsed = parse.parse(gctx_path)
        assert_same_frames(reference, parsed)

        # with the default convert_neg_666, this column is entirely null
        self.assertTrue(parsed.col_metadata_df["mfc_plate_id"].isnull().all())

        # with convert_neg_666 switched off it is not
        parsed_raw = parse.parse(gctx_path, convert_neg_666=False)
        self.assertFalse(
            parsed_raw.col_metadata_df["mfc_plate_id"].isnull().all())

        # selection by rid / cid
        test_rids = ['LJP007_MCF10A_24H:TRT_CP:BRD-K93918653:3.33',
                     ctl_vehicle_id]
        test_cids = ['LJP007_MCF7_24H:TRT_POSCON:BRD-A61304759:10']
        expected_by_id = subset_gctoo.subset_gctoo(
            reference, rid=test_rids, cid=test_cids)
        parsed_by_id = parse.parse(gctx_path, rid=test_rids, cid=test_cids)
        assert_same_frames(expected_by_id, parsed_by_id)

        # selection by ridx / cidx
        expected_by_idx = subset_gctoo.subset_gctoo(
            reference, rid=[ctl_vehicle_id], cid=[ctl_vehicle_id])
        parsed_by_idx = parse.parse(gctx_path, ridx=[4], cidx=[4])
        assert_same_frames(expected_by_idx, parsed_by_idx)

        # row metadata only: a bare dataframe is returned
        row_meta = parse.parse(gctx_path, row_meta_only=True)
        pandas_testing.assert_frame_equal(row_meta, reference.row_metadata_df)

        # col metadata only: a bare dataframe is returned
        col_meta = parse.parse(gctx_path, col_meta_only=True)
        pandas_testing.assert_frame_equal(col_meta, reference.col_metadata_df)

        # multi-index requested
        with_multi_index = parse.parse(gctx_path, make_multiindex=True)
        self.assertIsNotNone(with_multi_index.multi_index_df)
Ejemplo n.º 5
0
Archivo: card.py Proyecto: cmap/merino
def drop_nans(gctoo):
    """Remove the all-NaN rows and columns of a GCToo.

    Args:
        gctoo: object exposing ``data_df`` that ``sub.subset_gctoo`` accepts.

    Returns:
        The string ``'empty_plate'`` when every column of ``data_df`` is
        entirely NaN; otherwise the result of ``sub.subset_gctoo`` with the
        all-NaN columns and rows excluded.
    """
    is_null = gctoo.data_df.isnull()

    # One boolean per column / per row: True when every value is NaN
    col_is_all_nan = is_null.all()
    row_is_all_nan = is_null.all(axis=1)

    nan_cols = col_is_all_nan[col_is_all_nan].index
    nan_rows = row_is_all_nan[row_is_all_nan].index

    # Nothing usable is left when every column is empty
    n_cols = gctoo.data_df.shape[1]
    if len(nan_cols) == n_cols:
        return 'empty_plate'

    return sub.subset_gctoo(gctoo,
                            exclude_cid=nan_cols,
                            exclude_rid=nan_rows)
Ejemplo n.º 6
0
def subset_main(args):
    """ Subset the file at args.in_path and write the result.

    Kept separate from main() to make testing easier and to enable
    command-line access.

    A path ending in ".gct" is parsed in full and subset with subset_gctoo;
    any other path is handed to parse_gctx, which does the rid/cid selection
    itself and does not accept the exclude_* arguments.

    Raises:
        Exception: if exclude_rid or exclude_cid is given for a non-".gct"
            input
    """
    # Read in each of the command line arguments (in this fixed order)
    rid, cid, exclude_rid, exclude_cid = (
        _read_arg(getattr(args, arg_name))
        for arg_name in ("rid", "cid", "exclude_rid", "exclude_cid"))

    if not args.in_path.endswith(".gct"):
        # GCTx: selection happens inside parse_gctx
        if not (exclude_rid is None and exclude_cid is None):
            msg = "exclude_{rid,cid} args not currently supported for parse_gctx."
            raise Exception(msg)

        logger.info("Using hyperslab selection functionality of parse_gctx...")
        out_gct = parse_gctx.parse(args.in_path, rid=rid, cid=cid)

    else:
        # GCT: parse everything, then subset in memory
        full_gct = parse_gct.parse(args.in_path)
        out_gct = sg.subset_gctoo(full_gct,
                                  rid=rid,
                                  cid=cid,
                                  exclude_rid=exclude_rid,
                                  exclude_cid=exclude_cid)

    # Write the output in the requested format
    if args.out_type == "gctx":
        wgx.write(out_gct, args.out_name)
        return

    wg.write(out_gct,
             args.out_name,
             data_null="NaN",
             metadata_null="NA",
             filler_null="NA")
Ejemplo n.º 7
0
    def test_subset_gctoo(self):
        """Check subset_gctoo's argument validation and one combined
        selection against the ``self.in_gct`` fixture."""

        # Error if resulting GCT is empty
        with self.assertRaises(AssertionError) as e:
            sg.subset_gctoo(self.in_gct, rid=["bad"], cid=["x", "y"])
        self.assertIn("Subsetting yielded an", str(e.exception))

        # cid and col_bool should not both be provided
        with self.assertRaises(AssertionError) as e:
            sg.subset_gctoo(self.in_gct, cid=["e", "f", "g"], col_bool=[True, True, False])
        self.assertIn("Only one of cid,", str(e.exception))

        # Providing all 3 row inputs is also bad!
        with self.assertRaises(AssertionError) as e:
            sg.subset_gctoo(self.in_gct, rid="blah", ridx="bloop", row_bool="no!")
        self.assertIn("Only one of rid,", str(e.exception))

        # happy path: rid selection, positional cidx and exclude_rid combined;
        # the expected frame is rows 1 and 3, column 0 of the fixture
        out_g = sg.subset_gctoo(self.in_gct, rid=["d", "a", "b"], cidx=[0],
                               exclude_rid=["a"])
        # pd.testing is the public home of assert_frame_equal;
        # pd.util.testing was deprecated in pandas 1.0 and later removed.
        pd.testing.assert_frame_equal(out_g.data_df, self.in_gct.data_df.iloc[[1, 3], [0]])
Ejemplo n.º 8
0
def parse(file_path, convert_neg_666=True, rid=None, cid=None,
          ridx=None, cidx=None, row_meta_only=False, col_meta_only=False, make_multiindex=False):
    """
    The main method.

    Args:
        - file_path (string): full path to gct(x) file you want to parse
        - convert_neg_666 (bool): whether to convert -666 values to numpy.nan
            (see Note below for more details). Default = True.
        - rid (list of strings): list of row ids to specifically keep from gct. Default=None.
        - cid (list of strings): list of col ids to specifically keep from gct. Default=None.
        - ridx (list of integers): only read the rows corresponding to this
            list of integer ids. Default=None.
        - cidx (list of integers): only read the columns corresponding to this
            list of integer ids. Default=None.
        - row_meta_only (bool): Whether to load data + metadata (if False), or
            just row metadata (if True) as pandas DataFrame. Default = False.
        - col_meta_only (bool): Whether to load data + metadata (if False), or
            just col metadata (if True) as pandas DataFrame. Default = False.
        - make_multiindex (bool): whether to create a multi-index df combining
            the 3 component dfs. Default = False.

    Returns:
        - myGCToo (GCToo object): A GCToo instance containing content of
            parsed gct file ** OR **
        - row_metadata (pandas df) ** OR ** col_metadata (pandas df)

    Raises:
        - AssertionError: if both row_meta_only and col_meta_only are True
        - Exception: if file_path does not exist

    Note: why is convert_neg_666 even a thing?
        In CMap--for somewhat obscure historical reasons--we use "-666" as our null value
        for metadata. However (so that users can take full advantage of pandas' methods,
        including those for filtering nan's etc) we provide the option of converting these
        into numpy.nan values, the pandas default.

    """
    assert sum([row_meta_only, col_meta_only]) <= 1, (
        "row_meta_only and col_meta_only cannot both be requested.")

    # Strings that are treated as null when reading the file
    nan_values = [
        "#N/A", "N/A", "NA", "#NA", "NULL", "NaN", "-NaN",
        "nan", "-nan", "#N/A!", "na", "None", "#VALUE!"]

    # Add "-666" to the list of NaN values
    if convert_neg_666:
        nan_values.append("-666")

    # Verify that the gct path exists
    if not os.path.exists(file_path):
        err_msg = "The given path to the gct file cannot be found. gct_path: {}"
        logger.error(err_msg.format(file_path))
        raise Exception(err_msg.format(file_path))
    logger.info("Reading GCT: {}".format(file_path))

    # Read version and dimensions
    (version, num_data_rows, num_data_cols,
     num_row_metadata, num_col_metadata) = read_version_and_dims(file_path)

    # Read in metadata and data
    (row_metadata, col_metadata, data) = parse_into_3_df(
        file_path, num_data_rows, num_data_cols,
        num_row_metadata, num_col_metadata, nan_values)

    # Create the gctoo object and assemble 3 component dataframes
    # Not the most efficient if only metadata requested (i.e. creating the
    # whole GCToo just to return the metadata df), but simplest
    myGCToo = create_gctoo_obj(file_path, version, row_metadata, col_metadata,
                               data, make_multiindex)

    # Subset if requested; the whole file has already been read at this
    # point, so this only trims the in-memory object
    if (rid is not None) or (ridx is not None) or (cid is not None) or (cidx is not None):
        logger.info("Subsetting GCT... (note that there are no speed gains when subsetting GCTs)")
        myGCToo = sg.subset_gctoo(myGCToo, rid=rid, cid=cid, ridx=ridx, cidx=cidx)

    if row_meta_only:
        return myGCToo.row_metadata_df

    elif col_meta_only:
        return myGCToo.col_metadata_df

    else:
        return myGCToo
Ejemplo n.º 9
0
# time.clock() was removed in Python 3.8; timeit.default_timer is the
# standard-library clock intended for benchmarking.
from timeit import default_timer

# for storing timing results, keyed by output file name
gct_times = {}
gctx_times = {}

# large input gctx; see notes above for more info about this
big_gctoo = parse.parse("/path/to/large/gctx/file")

# column and row spaces to test writing on
col_spaces = [96, 384, 1536, 3000, 6000, 12000, 24000, 48000, 100000]
row_spaces = [978, 10174]

for c in col_spaces:
    for r in row_spaces:
        # take the leading r rows and c columns of the large input
        curr_gctoo = sg.subset_gctoo(big_gctoo,
                                     ridx=range(0, r),
                                     cidx=range(0, c))
        # gct writing
        out_fname = "write_test_n" + str(c) + "x" + str(r) + ".gct"
        start = default_timer()
        write_gct.write(curr_gctoo, out_fname)
        end = default_timer()
        elapsed_time = end - start
        gct_times[out_fname] = elapsed_time
        os.remove(out_fname)
        # gctx writing
        out_fname = "write_test_n" + str(c) + "x" + str(r) + ".gctx"
        start = default_timer()
        write_gctx.write(curr_gctoo, out_fname)
        end = default_timer()
        elapsed_time = end - start
        # record and clean up exactly as for the gct case above; otherwise
        # gctx_times stays empty and the test files pile up on disk
        gctx_times[out_fname] = elapsed_time
        os.remove(out_fname)
Ejemplo n.º 10
0
    def test_parse(self):
        """Exercise parse_gctx.parse on the mini GCTX fixture.

        Covers: full parse, selection by rid/cid (string and numeric-looking
        ids), selection by ridx/cidx and mixed id/index selection, the
        metadata-only modes, and the sort_row_meta / sort_col_meta flags.
        """
        # parse whole thing
        mg1 = mini_gctoo_for_testing.make()
        mg2 = parse_gctx.parse(
            "cmapPy/pandasGEXpress/tests/functional_tests/mini_gctoo_for_testing.gctx"
        )

        pandas_testing.assert_frame_equal(mg1.data_df, mg2.data_df)
        pandas_testing.assert_frame_equal(mg1.row_metadata_df,
                                          mg2.row_metadata_df)
        pandas_testing.assert_frame_equal(mg1.col_metadata_df,
                                          mg2.col_metadata_df)

        # test with string rid/cid
        test_rids = [
            'LJP007_MCF10A_24H:TRT_CP:BRD-K93918653:3.33',
            'LJP007_MCF7_24H:CTL_VEHICLE:DMSO:-666'
        ]
        test_cids = ['LJP007_MCF7_24H:TRT_POSCON:BRD-A61304759:10']
        mg3 = subset_gctoo.subset_gctoo(mg1, rid=test_rids, cid=test_cids)
        mg4 = parse_gctx.parse(
            "cmapPy/pandasGEXpress/tests/functional_tests/mini_gctoo_for_testing.gctx",
            rid=test_rids,
            cid=test_cids)
        pandas_testing.assert_frame_equal(mg3.data_df, mg4.data_df)
        pandas_testing.assert_frame_equal(mg3.row_metadata_df,
                                          mg4.row_metadata_df)
        pandas_testing.assert_frame_equal(mg3.col_metadata_df,
                                          mg4.col_metadata_df)

        # first, make temp version of mini_gctoo with int rids/cids
        new_mg = mini_gctoo_for_testing.make(convert_neg_666=False)
        int_indexed_data_df = new_mg.data_df.copy()
        int_indexed_data_df.index = [str(i) for i in range(0, 6)]
        int_indexed_data_df.columns = [str(i) for i in range(10, 16)]

        int_indexed_row_meta = new_mg.row_metadata_df.copy()
        int_indexed_row_meta.index = int_indexed_data_df.index

        int_indexed_col_meta = new_mg.col_metadata_df.copy()
        int_indexed_col_meta.index = int_indexed_data_df.columns

        int_indexed_gctoo = GCToo.GCToo(data_df=int_indexed_data_df,
                                        row_metadata_df=int_indexed_row_meta,
                                        col_metadata_df=int_indexed_col_meta)

        # test with numeric (repr as string) rid/cid:
        # build the expected subset in memory ...
        mg5 = GCToo.GCToo(data_df=int_indexed_data_df,
                          row_metadata_df=int_indexed_row_meta,
                          col_metadata_df=int_indexed_col_meta)
        mg5 = subset_gctoo.subset_gctoo(
            mg5,
            row_bool=[True, False, True, False, True, False],
            col_bool=[True, False, False, True, True, True])

        # name the axes explicitly so the comparison with the parsed frames
        # below also matches on index/column names
        mg5.data_df.index.name = "rid"
        mg5.data_df.columns.name = "cid"

        mg5.row_metadata_df.index.name = "rid"
        mg5.row_metadata_df.columns.name = "rhd"

        mg5.col_metadata_df.index.name = "cid"
        mg5.col_metadata_df.columns.name = "chd"

        # ... then write the temp file and read the same selection back.
        # The file is removed in a finally block so a failing parse does not
        # leave it behind in the working directory.
        write_gctx.write(int_indexed_gctoo, "int_indexed_mini_gctoo.gctx")
        try:
            mg6 = parse_gctx.parse("int_indexed_mini_gctoo.gctx",
                                   rid=["0", "2", "4"],
                                   cid=["10", "13", "14", "15"],
                                   convert_neg_666=False)
        finally:
            os.remove("int_indexed_mini_gctoo.gctx")

        pandas_testing.assert_frame_equal(mg5.data_df, mg6.data_df)
        pandas_testing.assert_frame_equal(mg5.row_metadata_df,
                                          mg6.row_metadata_df)
        pandas_testing.assert_frame_equal(mg5.col_metadata_df,
                                          mg6.col_metadata_df)

        # test with ridx/cidx
        mg7 = subset_gctoo.subset_gctoo(
            mg1,
            rid=['LJP007_MCF7_24H:CTL_VEHICLE:DMSO:-666'],
            cid=['LJP007_MCF7_24H:CTL_VEHICLE:DMSO:-666'])
        mg8 = parse_gctx.parse(
            "cmapPy/pandasGEXpress/tests/functional_tests/mini_gctoo_for_testing.gctx",
            ridx=[4],
            cidx=[4])

        pandas_testing.assert_frame_equal(mg7.data_df, mg8.data_df)
        pandas_testing.assert_frame_equal(mg7.row_metadata_df,
                                          mg8.row_metadata_df)
        pandas_testing.assert_frame_equal(mg7.col_metadata_df,
                                          mg8.col_metadata_df)

        # test with rid/cidx
        mg9 = parse_gctx.parse(
            "cmapPy/pandasGEXpress/tests/functional_tests/mini_gctoo_for_testing.gctx",
            rid=['LJP007_MCF7_24H:CTL_VEHICLE:DMSO:-666'],
            cidx=[4])

        pandas_testing.assert_frame_equal(mg7.data_df, mg9.data_df)
        pandas_testing.assert_frame_equal(mg7.row_metadata_df,
                                          mg9.row_metadata_df)
        pandas_testing.assert_frame_equal(mg7.col_metadata_df,
                                          mg9.col_metadata_df)

        # test with ridx/cid
        mg10 = parse_gctx.parse(
            "cmapPy/pandasGEXpress/tests/functional_tests/mini_gctoo_for_testing.gctx",
            ridx=[4],
            cid=['LJP007_MCF7_24H:CTL_VEHICLE:DMSO:-666'])

        pandas_testing.assert_frame_equal(mg7.data_df, mg10.data_df)
        pandas_testing.assert_frame_equal(mg7.row_metadata_df,
                                          mg10.row_metadata_df)
        pandas_testing.assert_frame_equal(mg7.col_metadata_df,
                                          mg10.col_metadata_df)

        # test with row_meta_only
        mg11 = parse_gctx.parse(
            "cmapPy/pandasGEXpress/tests/functional_tests/mini_gctoo_for_testing.gctx",
            row_meta_only=True)
        pandas_testing.assert_frame_equal(mg11, mg1.row_metadata_df)

        # test with col_meta_only
        mg12 = parse_gctx.parse(
            "cmapPy/pandasGEXpress/tests/functional_tests/mini_gctoo_for_testing.gctx",
            col_meta_only=True)
        pandas_testing.assert_frame_equal(mg12, mg1.col_metadata_df)

        # test with sort_col_meta False and cidx
        mg13 = parse_gctx.parse(
            "cmapPy/pandasGEXpress/tests/functional_tests//mini_gctoo_for_testing.gctx",
            cidx=[4, 1, 3],
            sort_col_meta=False)

        pandas_testing.assert_frame_equal(mg13.data_df,
                                          mg1.data_df.iloc[:, [4, 1, 3]])
        pandas_testing.assert_frame_equal(
            mg13.col_metadata_df, mg1.col_metadata_df.iloc[[4, 1, 3], :])
        pandas_testing.assert_frame_equal(mg13.row_metadata_df,
                                          mg1.row_metadata_df)

        # test with sort_row_meta False and ridx
        mg14 = parse_gctx.parse(
            "cmapPy/pandasGEXpress/tests/functional_tests//mini_gctoo_for_testing.gctx",
            ridx=[3, 0, 1],
            sort_row_meta=False)

        pandas_testing.assert_frame_equal(mg14.data_df,
                                          mg1.data_df.iloc[[3, 0, 1], :])
        pandas_testing.assert_frame_equal(mg14.col_metadata_df,
                                          mg1.col_metadata_df)
        pandas_testing.assert_frame_equal(
            mg14.row_metadata_df, mg1.row_metadata_df.iloc[[3, 0, 1], :])

        # test with sort_col_meta False and cidx and col_meta_only
        mg15 = parse_gctx.parse(
            "cmapPy/pandasGEXpress/tests/functional_tests//mini_gctoo_for_testing.gctx",
            cidx=[4, 1, 3],
            sort_col_meta=False,
            col_meta_only=True)
        pandas_testing.assert_frame_equal(
            mg15, mg1.col_metadata_df.iloc[[4, 1, 3], :])

        # test with sort_row_meta False and ridx and row_meta_only
        mg16 = parse_gctx.parse(
            "cmapPy/pandasGEXpress/tests/functional_tests//mini_gctoo_for_testing.gctx",
            ridx=[3, 0, 1],
            sort_row_meta=False,
            row_meta_only=True)
        pandas_testing.assert_frame_equal(
            mg16, mg1.row_metadata_df.iloc[[3, 0, 1], :])

        # test with sort_col_meta False and cid
        cid_unsorted = [
            'LJP007_MCF7_24H:TRT_POSCON:BRD-K81418486:10',
            'LJP007_MCF10A_24H:TRT_CP:BRD-K93918653:3.33'
        ]
        mg17 = parse_gctx.parse(
            "cmapPy/pandasGEXpress/tests/functional_tests//mini_gctoo_for_testing.gctx",
            cid=cid_unsorted,
            sort_col_meta=False)
        pandas_testing.assert_frame_equal(mg17.data_df,
                                          mg1.data_df.iloc[:, [2, 0]])
        pandas_testing.assert_frame_equal(mg17.col_metadata_df,
                                          mg1.col_metadata_df.iloc[[2, 0], :])
        pandas_testing.assert_frame_equal(mg17.row_metadata_df,
                                          mg1.row_metadata_df)

        # test with sort_row_meta False and rid
        rid_unsorted = [
            'LJP007_MCF7_24H:TRT_CP:BRD-K64857848:10',
            'MISC003_A375_24H:TRT_CP:BRD-K93918653:3.33'
        ]
        mg18 = parse_gctx.parse(
            "cmapPy/pandasGEXpress/tests/functional_tests/mini_gctoo_for_testing.gctx",
            rid=rid_unsorted,
            sort_row_meta=False)
        pandas_testing.assert_frame_equal(mg18.data_df,
                                          mg1.data_df.iloc[[5, 1], :])
        pandas_testing.assert_frame_equal(mg18.col_metadata_df,
                                          mg1.col_metadata_df)
        pandas_testing.assert_frame_equal(mg18.row_metadata_df,
                                          mg1.row_metadata_df.iloc[[5, 1], :])