Example #1
0
	def test_parse_rid_as_entrez_id(self):
		"""Parse the same gctx unfiltered, then with rids given as ints and as strings.

		The unfiltered parse must yield a 5x5 data_df; both filtered parses
		(integer rids, then the same rids as strings) must yield 3x5.
		"""
		gctx_path = "functional_tests/test_parse_gctx_rid_entrez_id.gctx"
		index_template = "g.data_df.index:  {}"

		# whole file, no row selection
		parsed = parse_gctx.parse(gctx_path)
		self.assertEqual((5,5), parsed.data_df.shape)
		logger.debug(index_template.format(parsed.data_df.index))

		# row ids supplied as integers
		int_rids = [5720, 55847, 7416]
		parsed = parse_gctx.parse(gctx_path, rid=int_rids)
		self.assertEqual((3,5), parsed.data_df.shape)
		logger.debug(index_template.format(parsed.data_df.index))

		# the same row ids supplied as strings must select the same number of rows
		str_rids = list(map(str, int_rids))
		logger.debug("using rid as str (mismatched type) - my_rids:  {}".format(str_rids))
		parsed = parse_gctx.parse(gctx_path, rid=str_rids)
		self.assertEqual((3,5), parsed.data_df.shape)
		logger.debug(index_template.format(parsed.data_df.index))
Example #2
0
    def test_with_only_row_metadata(self):
        """Compare the gct and gctx parses of the row-metadata-only example file.

        data_df, row_metadata_df and col_metadata_df are checked in turn: index
        and column labels must match as sets, then every column must agree in
        length, dtype (metadata frames only) and values.
        """
        # locate and parse both versions of the file
        gct_file = FUNCTIONAL_TESTS_PATH + "/row_meta_only_example_n2x1203.gct"
        gctx_file = FUNCTIONAL_TESTS_PATH + "/row_meta_only_example_n2x1203.gctx"
        from_gct = parse_gct.parse(gct_file)
        from_gctx = parse_gctx.parse(gctx_file)

        # message templates shared by the per-column checks below
        length_msg = "Lengths of column {} differ between gct and gctx"
        dtype_msg = "Dtype mismatch between parsed gct & gctx: {} vs {}"

        # ---- data_df ----
        gct_data = from_gct.data_df
        gctx_data = from_gctx.data_df
        self.assertTrue(
            set(gct_data.index) == set(gctx_data.index),
            "Mismatch between data_df index values of gct vs gctx: {} vs {}".format(
                gct_data.index, gctx_data.index))
        self.assertTrue(
            set(gct_data.columns) == set(gctx_data.columns),
            "Mismatch between data_df column values of gct vs gctx: {} vs {}".format(
                gct_data.columns, gctx_data.columns))
        same_labels = set(gct_data.columns) == set(gctx_data.columns)
        logger.debug("c2 gctoo data_df columns equal to gctoox data_df columns? {}".format(same_labels))
        for col in gct_data.columns:
            self.assertTrue(len(list(gct_data[col])) == len(list(gctx_data[col])),
                            length_msg.format(col))
            pandas_testing.assert_series_equal(gct_data[col], gctx_data[col])

        # ---- row_metadata_df ----
        gct_rows = from_gct.row_metadata_df
        gctx_rows = from_gctx.row_metadata_df
        self.assertTrue(
            set(gct_rows.index) == set(gctx_rows.index),
            "Mismatch between row_metadata_df index values of gct vs gctx: {} vs {}".format(
                gct_rows.index, gctx_rows.index))
        self.assertTrue(
            set(gct_rows.columns) == set(gctx_rows.columns),
            "Mismatch between row_metadata_df column values of gct vs gctx: {} vs {}".format(
                gct_rows.columns, gctx_rows.columns))
        same_labels = set(gct_rows.columns) == set(gctx_rows.columns)
        logger.debug(
            "c2 gctoo row_metadata_df columns equal to gctoox row_metadata_df columns? {}".format(same_labels))
        for col in gct_rows.columns:
            self.assertTrue(len(list(gct_rows[col])) == len(list(gctx_rows[col])),
                            length_msg.format(col))
            self.assertTrue(gct_rows[col].dtype == gctx_rows[col].dtype,
                            dtype_msg.format(gct_rows[col].dtype, gctx_rows[col].dtype))
            logger.debug("first couple elems of {} in gctoo: {}".format(col, list(gct_rows[col])[0:3]))
            pandas_testing.assert_series_equal(gct_rows[col], gctx_rows[col])

        # ---- col_metadata_df ----
        gct_cols = from_gct.col_metadata_df
        gctx_cols = from_gctx.col_metadata_df
        self.assertTrue(
            set(gct_cols.index) == set(gctx_cols.index),
            "Mismatch between col_metadata_df index values of gct vs gctx: {} vs {}".format(
                gct_cols.index, gctx_cols.index))
        self.assertTrue(
            set(gct_cols.columns) == set(gctx_cols.columns),
            "Mismatch between col_metadata_df column values of gct vs gctx: {} vs {}".format(
                gct_cols.columns, gctx_cols.columns))
        same_labels = set(gct_cols.columns) == set(gctx_cols.columns)
        logger.debug(
            "c2 gctoo col_metadata_df columns equal to gctoox col_metadata_df columns? {}".format(same_labels))
        for col in gct_cols.columns:
            self.assertTrue(len(list(gct_cols[col])) == len(list(gctx_cols[col])),
                            length_msg.format(col))
            self.assertTrue(gct_cols[col].dtype == gctx_cols[col].dtype,
                            dtype_msg.format(gct_cols[col].dtype, gctx_cols[col].dtype))
            pandas_testing.assert_series_equal(gct_cols[col], gctx_cols[col])
Example #3
0
def parse(file_path,
          convert_neg_666=True,
          rid=None,
          cid=None,
          ridx=None,
          cidx=None,
          row_meta_only=False,
          col_meta_only=False,
          make_multiindex=False):
    """
    Identifies whether file_path corresponds to a .gct or .gctx file and calls the
    correct corresponding parse method.

    Input:
        Mandatory:
        - file_path (str): full path to gct(x) file you want to parse.

        Optional:
        - convert_neg_666 (bool): whether to convert -666 values to numpy.nan or not
            (see Note below for more details on this). Default = True.
        - rid (list of strings): list of row ids to specifically keep from gctx. Default=None.
        - cid (list of strings): list of col ids to specifically keep from gctx. Default=None.
        - ridx: forwarded to parse_gctx.parse only. Default=None.
        - cidx: forwarded to parse_gctx.parse only. Default=None.
        - row_meta_only (bool): Whether to load data + metadata (if False), or just row metadata (if True)
            as pandas DataFrame
        - col_meta_only (bool): Whether to load data + metadata (if False), or just col metadata (if True)
            as pandas DataFrame
        - make_multiindex (bool): whether to create a multi-index df combining
            the 3 component dfs

    Output:
        - whatever the selected parser returns (documented as a GCToo instance)

    Raises:
        - Exception: if file_path ends in neither .gct nor .gctx, or if a
            truthy rid/cid/ridx/cidx is supplied for a .gct file.

    Note: why does convert_neg_666 exist?
        - In CMap--for somewhat obscure historical reasons--we use "-666" as our null value
        for metadata. However (so that users can take full advantage of pandas' methods,
        including those for filtering nan's etc) we provide the option of converting these
        into numpy.NaN values, the pandas default.
    """
    if file_path.endswith(".gct"):
        # These arguments are not passed to parse_gct. Pair each name with its
        # value explicitly rather than eval()-ing the argument name.
        gctx_only_args = (("rid", rid), ("cid", cid), ("ridx", ridx), ("cidx", cidx))
        for arg_name, arg_value in gctx_only_args:
            # Truthiness (not "is not None") is kept on purpose so that empty
            # selections are still tolerated exactly as before.
            if arg_value:
                # Despite the "Ignoring it..." wording, the call is rejected.
                err_msg = "parse_gct does not use the argument {}. Ignoring it...".format(
                    arg_name)
                logger.error(err_msg)
                raise Exception(err_msg)
        curr = parse_gct.parse(file_path, convert_neg_666, row_meta_only,
                               col_meta_only, make_multiindex)
    elif file_path.endswith(".gctx"):
        curr = parse_gctx.parse(file_path, convert_neg_666, rid, cid, ridx,
                                cidx, row_meta_only, col_meta_only,
                                make_multiindex)
    else:
        err_msg = "File to parse must be .gct or .gctx!"
        logger.error(err_msg)
        raise Exception(err_msg)
    return curr
Example #4
0
    def test_gct2gctx_main(self):
        """Run gct2gctx_main with and without annotation files and check the outputs.

        First converts the mini gct to gctx and compares all three frames with
        the parsed input. Then converts the metadata-free gct while supplying
        row/column annotation files and compares the result with the same
        parsed input. Generated files are removed even when a step fails.
        """
        in_name = "../functional_tests/mini_gctoo_for_testing.gct"
        out_name = "../functional_tests/test_gct2gctx_out.gctx"
        added_meta = "../functional_tests/test_gct2gctx_out_annotated.gctx"

        # NOTE(review): pd.util.testing and check_less_precise are deprecated in
        # newer pandas releases -- confirm against the supported pandas version.
        try:
            args_string = "-f {} -o {}".format(in_name, out_name)
            args = gct2gctx.build_parser().parse_args(args_string.split())

            gct2gctx.gct2gctx_main(args)

            # Make sure the input is identical to output
            in_gct = parse_gct.parse(in_name)
            out_gctx = parse_gctx.parse(out_name)

            pd.util.testing.assert_frame_equal(in_gct.data_df, out_gctx.data_df)
            pd.util.testing.assert_frame_equal(in_gct.col_metadata_df,
                                               out_gctx.col_metadata_df)
            pd.util.testing.assert_frame_equal(in_gct.row_metadata_df,
                                               out_gctx.row_metadata_df)

            no_meta = "../functional_tests/mini_gctoo_for_testing_nometa.gct"
            row_meta = "../functional_tests/test_rowmeta_n6.txt"
            col_meta = "../functional_tests/test_colmeta_n6.txt"
            args_string = "-f {} -o {} -row_annot_path {} -col_annot_path {}".format(
                no_meta, added_meta, row_meta, col_meta)
            args = gct2gctx.build_parser().parse_args(args_string.split())

            gct2gctx.gct2gctx_main(args)

            annotated_gctx = parse_gctx.parse(added_meta)

            # Check added annotations are the same as original input GCTX
            pd.util.testing.assert_frame_equal(in_gct.data_df,
                                               annotated_gctx.data_df,
                                               check_less_precise=3)
            pd.util.testing.assert_frame_equal(in_gct.col_metadata_df,
                                               annotated_gctx.col_metadata_df)
            pd.util.testing.assert_frame_equal(in_gct.row_metadata_df,
                                               annotated_gctx.row_metadata_df)
        finally:
            # Clean up whatever was written, without masking an earlier failure
            # when a file was never created.
            for written_path in (out_name, added_meta):
                if os.path.exists(written_path):
                    os.remove(written_path)
Example #5
0
def parse(file_path, convert_neg_666=True, rid=None, cid=None, ridx=None, cidx=None,
          row_meta_only=False, col_meta_only=False, make_multiindex=False):
    """
    Pick the gct or gctx parser from the extension of file_path and delegate to it.

    Input:
        Mandatory:
        - file_path (str): full path to the gct(x) file to parse; must end
            in ".gct" or ".gctx".

        Optional (all forwarded unchanged, by keyword, to the chosen parser):
        - convert_neg_666 (bool): whether to convert -666 values to numpy.nan
            (see Note below). Default = True.
        - rid (list of strings): row ids to keep. Default=None.
        - cid (list of strings): col ids to keep. Default=None.
        - ridx (list of integers): only read the rows at these integer
            positions. Default=None.
        - cidx (list of integers): only read the columns at these integer
            positions. Default=None.
        - row_meta_only (bool): load just the row metadata (True) or
            data + metadata (False). Default = False.
        - col_meta_only (bool): load just the col metadata (True) or
            data + metadata (False). Default = False.
        - make_multiindex (bool): whether to create a multi-index df combining
            the 3 component dfs. Default = False.

    Output:
        - out: the chosen parser's return value (a metadata df when
            row_meta_only or col_meta_only is set, otherwise a GCToo instance)

    Raises:
        - Exception: when file_path has neither extension.

    Note: why does convert_neg_666 exist?
        - In CMap--for somewhat obscure historical reasons--"-666" is used as the
        null value for metadata. Converting these to numpy.NaN lets users rely on
        pandas' own null handling.
    """
    if file_path.endswith(".gct"):
        parser_module = parse_gct
    elif file_path.endswith(".gctx"):
        parser_module = parse_gctx
    else:
        err_msg = "File to parse must be .gct or .gctx!"
        logger.error(err_msg)
        raise Exception(err_msg)

    # Both parsers are called with the identical keyword set.
    return parser_module.parse(file_path,
                               convert_neg_666=convert_neg_666,
                               rid=rid,
                               cid=cid,
                               ridx=ridx,
                               cidx=cidx,
                               row_meta_only=row_meta_only,
                               col_meta_only=col_meta_only,
                               make_multiindex=make_multiindex)
Example #6
0
def main():
    """Command-line entry point: parse a gctx file and hand it to write_gct.write.

    Arguments come from sys.argv via build_parser(). The file is parsed with
    convert_neg_666=False. When no output path is given, the output name is
    the input file's base name with its last extension removed.
    """
    args = build_parser().parse_args(sys.argv[1:])
    setup_logger.setup(verbose=args.verbose)
    in_gctoo = parse_gctx.parse(args.filename, convert_neg_666=False)

    if args.output_filepath is None:
        # splitext leaves a name without any "." intact; splitting on "." and
        # dropping the last piece turned such a name into an empty string.
        # NOTE(review): no ".gct" suffix is appended here -- presumably
        # write_gct.write takes care of that; confirm.
        out_name = os.path.splitext(os.path.basename(args.filename))[0]
    else:
        out_name = args.output_filepath

    write_gct.write(in_gctoo, out_name)
Example #7
0
def gctx2gct_main(args):
    """Parse args.filename as gctx and write it out as a gct.

    Kept separate from main() so it can back the command-line tool. The file
    is parsed with convert_neg_666=False. If args.output_filepath is None the
    output name is the input's base name with its extension replaced by ".gct".
    """
    parsed_gctoo = parse_gctx.parse(args.filename, convert_neg_666=False)

    destination = args.output_filepath
    if destination is None:
        stem, _ext = os.path.splitext(os.path.basename(args.filename))
        destination = stem + ".gct"

    write_gct.write(parsed_gctoo, destination)
Example #8
0
def get_expression(gctx_path, per_dict, gene_dict, sig_info, cell_need,
                   drug_name):
    """Return expression averaged per mapped name for one drug in selected cells.

    Picks the sig_id values of sig_info rows whose pert_iname equals drug_name
    and whose cell_id is in cell_need, parses only those columns from
    gctx_path, relabels the rows through gene_dict (None when unmapped),
    transposes so each row is one parsed column id, labels each row through
    per_dict (None when unmapped) and returns the mean of the remaining
    columns grouped by that label.
    """
    wanted = (sig_info["pert_iname"] == drug_name) & (sig_info["cell_id"].isin(cell_need))
    sig_ids = list(sig_info[wanted]["sig_id"])

    expression = parse(gctx_path, convert_neg_666=True, cid=sig_ids).data_df
    # relabel rows through gene_dict; unmapped ids become None
    expression.index = [gene_dict.get(row_id) for row_id in expression.index]

    by_signature = expression.T
    by_signature['drug_name'] = [per_dict.get(sig) for sig in by_signature.index]

    # every column except the first one labelled 'drug_name' gets averaged
    value_columns = list(by_signature.columns)
    value_columns.remove('drug_name')
    return by_signature.groupby('drug_name')[value_columns].mean()
Example #9
0
    def test_gct2gctx_main(self):
        """Convert the mini gct to gctx via gct2gctx_main and compare with the input.

        The generated gctx is removed even when the conversion or one of the
        comparisons fails.
        """
        in_name = "functional_tests/mini_gctoo_for_testing.gct"
        out_name = "functional_tests/test_gct2gctx_out.gctx"
        args_string = "-f {} -o {}".format(in_name, out_name)
        args = gct2gctx.build_parser().parse_args(args_string.split())

        try:
            gct2gctx.gct2gctx_main(args)

            # Make sure the input is identical to output
            in_gct = parse_gct.parse(in_name)
            out_gctx = parse_gctx.parse(out_name)

            pd.util.testing.assert_frame_equal(in_gct.data_df, out_gctx.data_df)
            pd.util.testing.assert_frame_equal(in_gct.col_metadata_df,
                                               out_gctx.col_metadata_df)
            pd.util.testing.assert_frame_equal(in_gct.row_metadata_df,
                                               out_gctx.row_metadata_df)
        finally:
            # Clean up; the existence check avoids masking an earlier failure
            # when the file was never written.
            if os.path.exists(out_name):
                os.remove(out_name)
Example #10
0
def subset_main(args):
    """Subset a gct or gctx file according to args and write the result.

    Kept separate from main() to ease testing and to back the command-line
    tool. Paths ending in ".gct" are parsed fully and subset with
    sg.subset_gctoo; any other path is read through parse_gctx.parse with
    rid/cid, where exclusion arguments are rejected with an Exception.
    The result is written as gctx when args.out_type == "gctx", otherwise
    as gct.
    """
    # Read in each of the command line arguments
    keep_rows = _read_arg(args.rid)
    keep_cols = _read_arg(args.cid)
    drop_rows = _read_arg(args.exclude_rid)
    drop_cols = _read_arg(args.exclude_cid)

    if not args.in_path.endswith(".gct"):
        # GCTx: the selection happens inside parse_gctx, which has no exclude support
        if not (drop_rows is None and drop_cols is None):
            raise Exception("exclude_{rid,cid} args not currently supported for parse_gctx.")

        logger.info("Using hyperslab selection functionality of parse_gctx...")
        subset = parse_gctx.parse(args.in_path, rid=keep_rows, cid=keep_cols)
    else:
        # GCT: parse everything, then subset in memory
        full_gct = parse_gct.parse(args.in_path)
        subset = sg.subset_gctoo(full_gct,
                                 rid=keep_rows,
                                 cid=keep_cols,
                                 exclude_rid=drop_rows,
                                 exclude_cid=drop_cols)

    # Write the output in the requested format
    if args.out_type == "gctx":
        wgx.write(subset, args.out_name)
        return

    wg.write(subset, args.out_name,
             data_null="NaN", metadata_null="NA", filler_null="NA")
    def test_parse_gctx(self):
        """Parse the high-precision gctx and check shapes, version, dtype and corner values."""
        gctx_filepath = os.path.join(FUNCTIONAL_TESTS_PATH, 'test_l1000_highprecision.gctx')
        parsed = parse_gctx.parse(gctx_filepath)

        data = parsed.data_df
        row_metadata = parsed.row_metadata_df
        col_metadata = parsed.col_metadata_df

        # expected sizes: data rows, data columns, row-metadata fields, col-metadata fields
        n_rows, n_cols, n_row_fields, n_col_fields = 978, 377, 11, 37
        expected_version = 'GCTX1.0'

        # Check shapes of outputs
        self.assertTrue(
            row_metadata.shape == (n_rows, n_row_fields),
            "row_metadata.shape = {} but expected it to be ({}, {})".format(
                row_metadata.shape, n_rows, n_row_fields))
        self.assertTrue(
            col_metadata.shape == (n_cols, n_col_fields),
            "col_metadata.shape = {} but expected it to be ({}, {})".format(
                col_metadata.shape, n_cols, n_col_fields))
        self.assertTrue(
            data.shape == (n_rows, n_cols),
            "data.shape = {} but expected it to be ({}, {})".format(
                data.shape, n_rows, n_cols))

        # Check version
        self.assertEqual(expected_version, parsed.version.decode())

        # Check the type of data
        top_left = data.iloc[0, 0]
        self.assertTrue(isinstance(top_left, np.float32),
                        "The data should be a float32, not {}".format(type(top_left)))

        # Check a few high precision floating values in data
        correct_val = np.float32(0.3473286032676697)
        self.assertTrue(
            top_left == correct_val,
            "The first value in the data matrix should be {} not {}".format(correct_val, top_left))

        correct_val = np.float32(-0.624971330165863)
        bottom_right = data.iloc[n_rows - 1, n_cols - 1]
        # str() concatenation is kept as-is: it may render a float32 differently from format()
        last_value_msg = ("The last value in the data matrix should be " +
                          str(correct_val) + " not {}").format(bottom_right)
        self.assertTrue(bottom_right == correct_val, last_value_msg)
Example #12
0
def _read_annot_table(annot_path):
    """Read a tab-separated annotation file into a DataFrame indexed by its first column."""
    return pd.read_csv(annot_path,
                       sep='\t',
                       index_col=0,
                       header=0,
                       low_memory=False)


def gctx2gct_main(args):
    """Parse a gctx file, optionally replace its metadata, and write it as a gct.

    Separate from main() in order to make command-line tool.

    Reads from args:
        - filename: path of the gctx to parse (with convert_neg_666=False).
        - output_filepath: output path; when None, the input's base name with
            its extension replaced by ".gct" is used.
        - row_annot_path / col_annot_path: optional tab-separated annotation
            tables. When given, the matching metadata df is replaced by the
            table's rows whose ids occur in the data matrix.

    Raises:
        - AssertionError: if an annotation table lacks ids present in the
            data matrix. Raised explicitly (same type and message as the
            former assert statements) so the check survives `python -O`.
    """
    in_gctoo = parse_gctx.parse(args.filename, convert_neg_666=False)

    if args.output_filepath is None:
        basename = os.path.basename(args.filename)
        out_name = os.path.splitext(basename)[0] + ".gct"
    else:
        out_name = args.output_filepath

    # If annotations are supplied, parse table and set metadata_df
    if args.row_annot_path is not None:
        row_metadata = _read_annot_table(args.row_annot_path)
        row_ids = in_gctoo.data_df.index
        if not row_ids.isin(row_metadata.index).all():
            raise AssertionError("Row ids in matrix missing from annotations file")
        # keeps the annotation file's row order
        in_gctoo.row_metadata_df = row_metadata.loc[row_metadata.index.isin(row_ids)]

    if args.col_annot_path is not None:
        col_metadata = _read_annot_table(args.col_annot_path)
        col_ids = in_gctoo.data_df.columns
        if not col_ids.isin(col_metadata.index).all():
            raise AssertionError("Column ids in matrix missing from annotations file")
        # keeps the annotation file's row order
        in_gctoo.col_metadata_df = col_metadata.loc[col_metadata.index.isin(col_ids)]

    write_gct.write(in_gctoo, out_name)
    def test_write_gctx(self):
        """Write a GCToo built from the fixture frames to gctx and read it back unchanged.

        The written file is removed even when writing, parsing or a comparison
        fails.
        """
        out_name = os.path.join(FUNCTIONAL_TESTS_PATH, 'test_write_out_py2py3.gctx')

        try:
            gctoo = GCToo.GCToo(data_df=self.data_df,
                                row_metadata_df=self.row_metadata_df,
                                col_metadata_df=self.col_metadata_df)
            write_gctx.write(gctoo, out_name,
                             convert_back_to_neg_666=True, gzip_compression_level=6,
                             max_chunk_kb=1024, matrix_dtype=np.float32)

            # Read the gctx back in and verify that it's the same as gctoo.
            # The GCToo is rebuilt first because (per the original author's note)
            # write_gctx changes the dtype of one column of col_metadata_df.
            gctoo = GCToo.GCToo(data_df=self.data_df,
                                row_metadata_df=self.row_metadata_df,
                                col_metadata_df=self.col_metadata_df)

            new_gctx = parse_gctx.parse(out_name)

            pd.testing.assert_frame_equal(new_gctx.data_df, gctoo.data_df)
            pd.testing.assert_frame_equal(new_gctx.row_metadata_df, gctoo.row_metadata_df)
            pd.testing.assert_frame_equal(new_gctx.col_metadata_df, gctoo.col_metadata_df)
        finally:
            # Cleanup; the existence check avoids masking an earlier failure
            # when nothing was written.
            if os.path.exists(out_name):
                os.remove(out_name)
Example #14
0
	def test_parse(self):
		"""Exercise parse_gctx.parse on the mini gctx with every kind of row/column selection.

		Covers: the whole file, string rid/cid, integer rid/cid (against a
		temporary int-indexed gctx written and removed by this test),
		ridx/cidx, and the mixed rid/cidx and ridx/cid combinations. Each
		result is compared frame by frame with an in-memory reference.
		"""
		# parse whole thing and compare with the in-memory reference GCToo
		mg1 = mini_gctoo_for_testing.make()
		mg2 = parse_gctx.parse("functional_tests/mini_gctoo_for_testing.gctx")

		assert_frame_equal(mg1.data_df, mg2.data_df)
		assert_frame_equal(mg1.row_metadata_df, mg2.row_metadata_df)
		assert_frame_equal(mg1.col_metadata_df, mg2.col_metadata_df)

		# test with string rid/cid: slicing the reference must match parsing with the same ids
		test_rids = ['LJP007_MCF10A_24H:TRT_CP:BRD-K93918653:3.33','LJP007_MCF7_24H:CTL_VEHICLE:DMSO:-666']
		test_cids = ['LJP007_MCF7_24H:TRT_POSCON:BRD-A61304759:10']
		mg3 = slice_gct.slice_gctoo(mg1, rid=test_rids, cid=test_cids)
		mg4 = parse_gctx.parse("functional_tests/mini_gctoo_for_testing.gctx",
			rid = test_rids, cid = test_cids)
		assert_frame_equal(mg3.data_df, mg4.data_df)
		assert_frame_equal(mg3.row_metadata_df, mg4.row_metadata_df)
		assert_frame_equal(mg3.col_metadata_df, mg4.col_metadata_df)

		# first, make & write out temp version of mini_gctoo with int rids/cids
		# (rows relabelled 0..5, columns relabelled 10..15)
		new_mg = mini_gctoo_for_testing.make(convert_neg_666=False)
		int_indexed_data_df = new_mg.data_df.copy()
		int_indexed_data_df.index = range(0,6)
		int_indexed_data_df.columns = range(10,16)

		int_indexed_row_meta = new_mg.row_metadata_df.copy()
		int_indexed_row_meta.index = range(0,6)

		int_indexed_col_meta = new_mg.col_metadata_df.copy()
		int_indexed_col_meta.index = range(10,16)

		int_indexed_gctoo = GCToo.GCToo(data_df = int_indexed_data_df, row_metadata_df = int_indexed_row_meta,
			col_metadata_df = int_indexed_col_meta)

		# temporary file in the current working directory; removed below right after it is parsed
		write_gctx.write(int_indexed_gctoo, "int_indexed_mini_gctoo.gctx")

		# test with numeric rid/cid: build the expected result by boolean-slicing
		# rows 0, 2, 4 and columns 10, 13, 14, 15 of the int-indexed frames
		mg5 = GCToo.GCToo(data_df = int_indexed_data_df, row_metadata_df = int_indexed_row_meta,
			col_metadata_df = int_indexed_col_meta)
		mg5 = slice_gct.slice_gctoo(mg5, row_bool = [True, False, True, False, True, False],
			col_bool = [True, False, False, True, True, True])

		# name the axes of the expected frames so the frame comparisons below
		# can match (presumably these are the names parse_gctx assigns -- confirm)
		mg5.data_df.index.name = "rid"
		mg5.data_df.columns.name = "cid"

		mg5.row_metadata_df.index.name = "rid"
		mg5.row_metadata_df.columns.name = "rhd"

		mg5.col_metadata_df.index.name = "cid"
		mg5.col_metadata_df.columns.name = "chd"

		mg6 = parse_gctx.parse("int_indexed_mini_gctoo.gctx", rid = [0, 2, 4],
			cid = [10,13,14,15], convert_neg_666=False)

		# remove the temp file before asserting so a failed comparison does not leave it behind
		os.remove("int_indexed_mini_gctoo.gctx")

		assert_frame_equal(mg5.data_df, mg6.data_df)
		assert_frame_equal(mg5.row_metadata_df, mg6.row_metadata_df)
		assert_frame_equal(mg5.col_metadata_df, mg6.col_metadata_df)

		# test with ridx/cidx: position 4 on both axes must match the id-based slice mg7
		# NOTE(review): cid is passed to slice_gctoo as a bare string here, not a
		# list -- confirm slice_gctoo accepts that form.
		mg7 = slice_gct.slice_gctoo(mg1, rid=['LJP007_MCF7_24H:CTL_VEHICLE:DMSO:-666'],
			cid='LJP007_MCF7_24H:CTL_VEHICLE:DMSO:-666')
		mg8 = parse_gctx.parse("functional_tests/mini_gctoo_for_testing.gctx", ridx=[4], cidx=[4])

		assert_frame_equal(mg7.data_df, mg8.data_df)
		assert_frame_equal(mg7.row_metadata_df, mg8.row_metadata_df)
		assert_frame_equal(mg7.col_metadata_df, mg8.col_metadata_df)

		# test with rid/cidx (row by id, column by position)
		mg9 = parse_gctx.parse("functional_tests/mini_gctoo_for_testing.gctx", rid=['LJP007_MCF7_24H:CTL_VEHICLE:DMSO:-666'],
			cidx = [4])

		assert_frame_equal(mg7.data_df, mg9.data_df)
		assert_frame_equal(mg7.row_metadata_df, mg9.row_metadata_df)
		assert_frame_equal(mg7.col_metadata_df, mg9.col_metadata_df)

		# test with ridx/cid (row by position, column by id)
		mg10 = parse_gctx.parse("functional_tests/mini_gctoo_for_testing.gctx", ridx=[4],
			cid = ['LJP007_MCF7_24H:CTL_VEHICLE:DMSO:-666'])

		assert_frame_equal(mg7.data_df, mg10.data_df)
		assert_frame_equal(mg7.row_metadata_df, mg10.row_metadata_df)
		assert_frame_equal(mg7.col_metadata_df, mg10.col_metadata_df)
Example #15
0
    def test_with_both_metadata_fields(self):
        """Compare the gct and gctx parses of the example file that has both metadata frames.

        data_df, row_metadata_df and col_metadata_df are checked in turn: index
        and column labels must match as sets, then every column must agree in
        length, dtype (metadata frames only) and values.
        """
        # locate and parse both versions of the file
        gct_file = FUNCTIONAL_TESTS_PATH + "/both_metadata_example_n1476x978.gct"
        gctx_file = FUNCTIONAL_TESTS_PATH + "/both_metadata_example_n1476x978.gctx"
        from_gct = parse_gct.parse(gct_file)
        from_gctx = parse_gctx.parse(gctx_file)

        # message template shared by the per-column length checks below
        length_msg = "Lengths of column {} differ between gct and gctx"

        # ---- data_df ----
        gct_data = from_gct.data_df
        gctx_data = from_gctx.data_df
        self.assertTrue(
            set(gct_data.index) == set(gctx_data.index),
            "Mismatch between data_df index values of gct vs gctx: {} vs {}".format(
                gct_data.index, gctx_data.index))
        self.assertTrue(
            set(gct_data.columns) == set(gctx_data.columns),
            "Mismatch between data_df column values of gct vs gctx: {} vs {}".format(
                gct_data.columns, gctx_data.columns))
        same_labels = set(gct_data.columns) == set(gctx_data.columns)
        logger.debug("c1 gctoo data_df columns equal to gctoox data_df columns? {}".format(same_labels))
        for col in gct_data.columns:
            self.assertTrue(len(list(gct_data[col])) == len(list(gctx_data[col])),
                            length_msg.format(col))
            pandas_testing.assert_series_equal(gct_data[col], gctx_data[col])

        # ---- row_metadata_df ----
        gct_rows = from_gct.row_metadata_df
        gctx_rows = from_gctx.row_metadata_df
        self.assertTrue(
            set(gct_rows.index) == set(gctx_rows.index),
            "Mismatch between row_metadata_df index values of gct vs gctx: {} vs {}".format(
                gct_rows.index, gctx_rows.index))
        # on failure this message reports only the labels the two frames do not share
        self.assertTrue(
            set(gct_rows.columns) == set(gctx_rows.columns),
            "Mismatch between row_metadata_df column values of gct vs gctx: difference is {}".format(
                set(gct_rows.columns).symmetric_difference(set(gctx_rows.columns))))
        same_labels = set(gct_rows.columns) == set(gctx_rows.columns)
        logger.debug(
            "c1 gctoo row_metadata_df columns equal to gctoox row_metadata_df columns? {}".format(same_labels))
        logger.debug("c1 gctoo dtypes: {}".format(gct_rows.dtypes))
        logger.debug("c1 gctoox dtypes: {}".format(gctx_rows.dtypes))
        for col in gct_rows.columns:
            self.assertTrue(len(list(gct_rows[col])) == len(list(gctx_rows[col])),
                            length_msg.format(col))
            logger.debug("first couple elems of {} in gctoo: {}".format(col, list(gct_rows[col])[0:3]))
            self.assertTrue(
                gct_rows[col].dtype == gctx_rows[col].dtype,
                "Dtype mismatch for {} between parsed gct & gctx: {} vs {}".format(
                    col, gct_rows[col].dtype, gctx_rows[col].dtype))
            pandas_testing.assert_series_equal(gct_rows[col], gctx_rows[col])

        # ---- col_metadata_df ----
        gct_cols = from_gct.col_metadata_df
        gctx_cols = from_gctx.col_metadata_df
        self.assertTrue(
            set(gct_cols.index) == set(gctx_cols.index),
            "Mismatch between col_metadata_df index values of gct vs gctx: {} vs {}".format(
                gct_cols.index, gctx_cols.index))
        self.assertTrue(
            set(gct_cols.columns) == set(gctx_cols.columns),
            "Mismatch between col_metadata_df column values of gct vs gctx: {} vs {}".format(
                gct_cols.columns, gctx_cols.columns))
        same_labels = set(gct_cols.columns) == set(gctx_cols.columns)
        logger.debug(
            "c1 gctoo col_metadata_df columns equal to gctoox col_metadata_df columns? {}".format(same_labels))
        for col in gct_cols.columns:
            self.assertTrue(len(list(gct_cols[col])) == len(list(gctx_cols[col])),
                            length_msg.format(col))
            self.assertTrue(
                gct_cols[col].dtype == gctx_cols[col].dtype,
                "Dtype mismatch between parsed gct & gctx: {} vs {}".format(
                    gct_cols[col].dtype, gctx_cols[col].dtype))

            pandas_testing.assert_series_equal(gct_cols[col], gctx_cols[col])
def load_gctx(gctx_file):
    """Parse the GCTX file at ``gctx_file`` and return the parsed object's ``data_df``."""
    return parse_gctx.parse(gctx_file).data_df
Example #17
0
    def test_parse(self):
        """Exercise parse_gctx.parse against the mini GCToo fixture file.

        Covers a full parse, subsetting by rid/cid, by ridx/cidx and by
        mixes of the two, the row_meta_only / col_meta_only modes, and the
        sort_row_meta / sort_col_meta flags. Expected values are built from
        mini_gctoo_for_testing.make() and subset_gctoo.subset_gctoo.
        """
        # Single definition of the fixture path used by every parse below.
        gctx_path = (
            "cmapPy/pandasGEXpress/tests/functional_tests/mini_gctoo_for_testing.gctx"
        )
        # Scratch file written to (and removed from) the working directory.
        temp_gctx_path = "int_indexed_mini_gctoo.gctx"

        # parse whole thing
        mg1 = mini_gctoo_for_testing.make()
        mg2 = parse_gctx.parse(gctx_path)

        pandas_testing.assert_frame_equal(mg1.data_df, mg2.data_df)
        pandas_testing.assert_frame_equal(mg1.row_metadata_df,
                                          mg2.row_metadata_df)
        pandas_testing.assert_frame_equal(mg1.col_metadata_df,
                                          mg2.col_metadata_df)

        # test with string rid/cid
        test_rids = [
            'LJP007_MCF10A_24H:TRT_CP:BRD-K93918653:3.33',
            'LJP007_MCF7_24H:CTL_VEHICLE:DMSO:-666'
        ]
        test_cids = ['LJP007_MCF7_24H:TRT_POSCON:BRD-A61304759:10']
        mg3 = subset_gctoo.subset_gctoo(mg1, rid=test_rids, cid=test_cids)
        mg4 = parse_gctx.parse(gctx_path, rid=test_rids, cid=test_cids)
        pandas_testing.assert_frame_equal(mg3.data_df, mg4.data_df)
        pandas_testing.assert_frame_equal(mg3.row_metadata_df,
                                          mg4.row_metadata_df)
        pandas_testing.assert_frame_equal(mg3.col_metadata_df,
                                          mg4.col_metadata_df)

        # first, make & write out temp version of mini_gctoo whose rids/cids
        # are integers represented as strings
        new_mg = mini_gctoo_for_testing.make(convert_neg_666=False)
        int_indexed_data_df = new_mg.data_df.copy()
        int_indexed_data_df.index = [str(i) for i in range(0, 6)]
        int_indexed_data_df.columns = [str(i) for i in range(10, 16)]

        int_indexed_row_meta = new_mg.row_metadata_df.copy()
        int_indexed_row_meta.index = int_indexed_data_df.index

        int_indexed_col_meta = new_mg.col_metadata_df.copy()
        int_indexed_col_meta.index = int_indexed_data_df.columns

        int_indexed_gctoo = GCToo.GCToo(data_df=int_indexed_data_df,
                                        row_metadata_df=int_indexed_row_meta,
                                        col_metadata_df=int_indexed_col_meta)

        # Write the temp file, parse it back, and always clean it up so a
        # failure in write/parse does not leave it behind.
        try:
            write_gctx.write(int_indexed_gctoo, temp_gctx_path)
            mg6 = parse_gctx.parse(temp_gctx_path,
                                   rid=["0", "2", "4"],
                                   cid=["10", "13", "14", "15"],
                                   convert_neg_666=False)
        finally:
            # The file may not exist if the write itself failed.
            if os.path.exists(temp_gctx_path):
                os.remove(temp_gctx_path)

        # test with numeric (repr as string) rid/cid: build the expected
        # subset in memory from the same frames that were written out
        mg5 = GCToo.GCToo(data_df=int_indexed_data_df,
                          row_metadata_df=int_indexed_row_meta,
                          col_metadata_df=int_indexed_col_meta)
        mg5 = subset_gctoo.subset_gctoo(
            mg5,
            row_bool=[True, False, True, False, True, False],
            col_bool=[True, False, False, True, True, True])

        # name the axes of the expected frames before comparing with mg6
        mg5.data_df.index.name = "rid"
        mg5.data_df.columns.name = "cid"

        mg5.row_metadata_df.index.name = "rid"
        mg5.row_metadata_df.columns.name = "rhd"

        mg5.col_metadata_df.index.name = "cid"
        mg5.col_metadata_df.columns.name = "chd"

        pandas_testing.assert_frame_equal(mg5.data_df, mg6.data_df)
        pandas_testing.assert_frame_equal(mg5.row_metadata_df,
                                          mg6.row_metadata_df)
        pandas_testing.assert_frame_equal(mg5.col_metadata_df,
                                          mg6.col_metadata_df)

        # test with ridx/cidx
        mg7 = subset_gctoo.subset_gctoo(
            mg1,
            rid=['LJP007_MCF7_24H:CTL_VEHICLE:DMSO:-666'],
            cid=['LJP007_MCF7_24H:CTL_VEHICLE:DMSO:-666'])
        mg8 = parse_gctx.parse(gctx_path, ridx=[4], cidx=[4])

        pandas_testing.assert_frame_equal(mg7.data_df, mg8.data_df)
        pandas_testing.assert_frame_equal(mg7.row_metadata_df,
                                          mg8.row_metadata_df)
        pandas_testing.assert_frame_equal(mg7.col_metadata_df,
                                          mg8.col_metadata_df)

        # test with rid/cidx
        mg9 = parse_gctx.parse(
            gctx_path,
            rid=['LJP007_MCF7_24H:CTL_VEHICLE:DMSO:-666'],
            cidx=[4])

        pandas_testing.assert_frame_equal(mg7.data_df, mg9.data_df)
        pandas_testing.assert_frame_equal(mg7.row_metadata_df,
                                          mg9.row_metadata_df)
        pandas_testing.assert_frame_equal(mg7.col_metadata_df,
                                          mg9.col_metadata_df)

        # test with ridx/cid
        mg10 = parse_gctx.parse(
            gctx_path,
            ridx=[4],
            cid=['LJP007_MCF7_24H:CTL_VEHICLE:DMSO:-666'])

        pandas_testing.assert_frame_equal(mg7.data_df, mg10.data_df)
        pandas_testing.assert_frame_equal(mg7.row_metadata_df,
                                          mg10.row_metadata_df)
        pandas_testing.assert_frame_equal(mg7.col_metadata_df,
                                          mg10.col_metadata_df)

        # test with row_meta_only
        mg11 = parse_gctx.parse(gctx_path, row_meta_only=True)
        pandas_testing.assert_frame_equal(mg11, mg1.row_metadata_df)

        # test with col_meta_only
        mg12 = parse_gctx.parse(gctx_path, col_meta_only=True)
        pandas_testing.assert_frame_equal(mg12, mg1.col_metadata_df)

        # test with sort_col_meta False and cidx
        mg13 = parse_gctx.parse(gctx_path,
                                cidx=[4, 1, 3],
                                sort_col_meta=False)

        pandas_testing.assert_frame_equal(mg13.data_df,
                                          mg1.data_df.iloc[:, [4, 1, 3]])
        pandas_testing.assert_frame_equal(
            mg13.col_metadata_df, mg1.col_metadata_df.iloc[[4, 1, 3], :])
        pandas_testing.assert_frame_equal(mg13.row_metadata_df,
                                          mg1.row_metadata_df)

        # test with sort_row_meta False and ridx
        mg14 = parse_gctx.parse(gctx_path,
                                ridx=[3, 0, 1],
                                sort_row_meta=False)

        pandas_testing.assert_frame_equal(mg14.data_df,
                                          mg1.data_df.iloc[[3, 0, 1], :])
        pandas_testing.assert_frame_equal(mg14.col_metadata_df,
                                          mg1.col_metadata_df)
        pandas_testing.assert_frame_equal(
            mg14.row_metadata_df, mg1.row_metadata_df.iloc[[3, 0, 1], :])

        # test with sort_col_meta False and cidx and col_meta_only
        mg15 = parse_gctx.parse(gctx_path,
                                cidx=[4, 1, 3],
                                sort_col_meta=False,
                                col_meta_only=True)
        pandas_testing.assert_frame_equal(
            mg15, mg1.col_metadata_df.iloc[[4, 1, 3], :])

        # test with sort_row_meta False and ridx and row_meta_only
        mg16 = parse_gctx.parse(gctx_path,
                                ridx=[3, 0, 1],
                                sort_row_meta=False,
                                row_meta_only=True)
        pandas_testing.assert_frame_equal(
            mg16, mg1.row_metadata_df.iloc[[3, 0, 1], :])

        # test with sort_col_meta False and cid
        cid_unsorted = [
            'LJP007_MCF7_24H:TRT_POSCON:BRD-K81418486:10',
            'LJP007_MCF10A_24H:TRT_CP:BRD-K93918653:3.33'
        ]
        mg17 = parse_gctx.parse(gctx_path,
                                cid=cid_unsorted,
                                sort_col_meta=False)
        pandas_testing.assert_frame_equal(mg17.data_df,
                                          mg1.data_df.iloc[:, [2, 0]])
        pandas_testing.assert_frame_equal(mg17.col_metadata_df,
                                          mg1.col_metadata_df.iloc[[2, 0], :])
        pandas_testing.assert_frame_equal(mg17.row_metadata_df,
                                          mg1.row_metadata_df)

        # test with sort_row_meta False and rid
        rid_unsorted = [
            'LJP007_MCF7_24H:TRT_CP:BRD-K64857848:10',
            'MISC003_A375_24H:TRT_CP:BRD-K93918653:3.33'
        ]
        mg18 = parse_gctx.parse(gctx_path,
                                rid=rid_unsorted,
                                sort_row_meta=False)
        pandas_testing.assert_frame_equal(mg18.data_df,
                                          mg1.data_df.iloc[[5, 1], :])
        pandas_testing.assert_frame_equal(mg18.col_metadata_df,
                                          mg1.col_metadata_df)
        pandas_testing.assert_frame_equal(mg18.row_metadata_df,
                                          mg1.row_metadata_df.iloc[[5, 1], :])
Example #18
0
# Phase 1
# Load the signature table and the gene annotation table. gene_info is read
# with every column as str, so pr_is_lm is compared against the string "1".
sig_info = pd.read_csv("GSE92742_Broad_LINCS_sig_info.txt", sep="\t")
gene_info = pd.read_csv("GSE92742_Broad_LINCS_gene_info.txt",
                        sep="\t",
                        dtype=str)
landmark_gene_row_ids = gene_info["pr_gene_id"][gene_info["pr_is_lm"] == "1"]

# Keep only the "trt_cp" signatures, indexed by sig_id. Other pert_type values
# that were previously considered for this mask: "trt_sh", "trt_sh.cgs",
# "trt_lig".
# set_index is chained (returning a new frame) rather than applied in place
# on the filtered slice.
sub_sig_info = sig_info[sig_info["pert_type"] == "trt_cp"].set_index("sig_id")

# Parse only the selected signatures (columns) and landmark genes (rows).
gctoo = pg.parse("GSE92742_Broad_LINCS_Level5_COMPZ.MODZ_n473647x12328.gctx",
                 cid=sub_sig_info.index.tolist(),
                 rid=landmark_gene_row_ids)
gctoo.col_metadata_df = sub_sig_info.copy()

df_data_1 = gctoo.data_df
rids = df_data_1.index.tolist()

# Build the gene id -> gene symbol lookup once instead of scanning gene_info
# for every row id. drop_duplicates keeps the first row per pr_gene_id, which
# matches taking the first match of a boolean-mask lookup. A row id missing
# from gene_info raises KeyError here.
symbol_by_gene_id = (gene_info.drop_duplicates(subset="pr_gene_id")
                     .set_index("pr_gene_id")["pr_gene_symbol"]
                     .to_dict())
symbols = [symbol_by_gene_id[rid] for rid in rids]

# One gene symbol per line, in the same order as the rows of df_data_1.
with open("gene_symbols.csv", 'w') as f:
    f.write('\n'.join(symbols))

df_data_1 = df_data_1.transpose()