Ejemplo n.º 1
0
	def test_parse_rid_as_entrez_id(self):
		"""Check rid filtering on a gctx fixture using integer and string row ids.

		The unfiltered parse is expected to give a 5x5 data_df. Restricting the
		parse to three rids is expected to give a 3x5 data_df, whether those
		rids are supplied as ints or as their string representations.
		"""
		gctx_path = "functional_tests/test_parse_gctx_rid_entrez_id.gctx"

		# baseline: no rid filter, the whole matrix comes back
		parsed = parse_gctx.parse(gctx_path)
		self.assertEqual((5,5), parsed.data_df.shape)
		logger.debug("g.data_df.index:  {}".format(parsed.data_df.index))

		# filter with the rids given as integers
		int_rids = [5720, 55847, 7416]
		parsed = parse_gctx.parse(gctx_path, rid=int_rids)
		self.assertEqual((3,5), parsed.data_df.shape)
		logger.debug("g.data_df.index:  {}".format(parsed.data_df.index))

		# same rids again, this time as strings; the expected shape is unchanged
		str_rids = list(map(str, int_rids))
		logger.debug("using rid as str (mismatched type) - my_rids:  {}".format(str_rids))
		parsed = parse_gctx.parse(gctx_path, rid=str_rids)
		self.assertEqual((3,5), parsed.data_df.shape)
		logger.debug("g.data_df.index:  {}".format(parsed.data_df.index))
Ejemplo n.º 2
0
def main(args):
    """Convert a .gctx file to .gct.

    Args:
        args: parsed command-line arguments. Reads ``args.filename`` (the
            gctx file to parse) and ``args.output_filepath`` (where to write
            the gct; may be None).

    When no output path is given, the output name is derived from the source
    path recorded on the parsed object: the last "/"-separated component,
    cut at its first ".".
    """
    # -666 values are left as-is so the written gct mirrors the input
    in_gctoo = parse_gctx.parse(args.filename, convert_neg_666=False)

    # None is a singleton: test identity, not equality
    if args.output_filepath is None:
        out_name = str.split(in_gctoo.src, "/")[-1].split(".")[0]
    else:
        out_name = args.output_filepath

    write_gct.write(in_gctoo, out_name)
Ejemplo n.º 3
0
def main():
    """Command-line entry point: convert a .gctx file to .gct.

    Parses ``sys.argv[1:]`` with the parser from ``build_parser()``,
    configures logging from ``args.verbose``, parses ``args.filename`` and
    writes the result with ``write_gct.write``.

    When ``args.output_filepath`` is None, the output name is derived from
    the source path recorded on the parsed object: the last "/"-separated
    component, cut at its first ".".
    """
    args = build_parser().parse_args(sys.argv[1:])
    setup_logger.setup(verbose=args.verbose)

    # -666 values are left as-is so the written gct mirrors the input
    in_gctoo = parse_gctx.parse(args.filename, convert_neg_666=False)

    # None is a singleton: test identity, not equality
    if args.output_filepath is None:
        out_name = str.split(in_gctoo.src, "/")[-1].split(".")[0]
    else:
        out_name = args.output_filepath

    write_gct.write(in_gctoo, out_name)
Ejemplo n.º 4
0
	def test_with_both_metadata_fields(self):
		"""Parse one example from both .gct and .gctx and require matching content.

		For each of data_df, row_metadata_df and col_metadata_df the test first
		compares the index and column label sets, then walks the gct-side
		columns and compares length, dtype (metadata frames only) and values.
		"""
		# the same example stored in both formats
		gct_file = FUNCTIONAL_TESTS_PATH + "/both_metadata_example_n1476x978.gct"
		gctx_file = FUNCTIONAL_TESTS_PATH + "/both_metadata_example_n1476x978.gctx"

		from_gct = parse_gct.parse(gct_file)
		from_gctx = parse_gctx.parse(gctx_file)

		# ---- data_df: labels, then per-column length and values ----
		data_gct = from_gct.data_df
		data_gctx = from_gctx.data_df
		self.assertTrue(set(data_gct.index) == set(data_gctx.index),
			"Mismatch between data_df index values of gct vs gctx: {} vs {}".format(data_gct.index, data_gctx.index))
		data_cols_match = set(data_gct.columns) == set(data_gctx.columns)
		self.assertTrue(data_cols_match,
			"Mismatch between data_df column values of gct vs gctx: {} vs {}".format(data_gct.columns, data_gctx.columns))
		logger.debug("c1 gctoo data_df columns equal to gctoox data_df columns? {}".format(data_cols_match))
		for col in data_gct.columns:
			gct_col = data_gct[col]
			gctx_col = data_gctx[col]
			self.assertTrue(len(list(gct_col)) == len(list(gctx_col)),
				"Lengths of column {} differ between gct and gctx".format(col))
			assert_series_equal(gct_col, gctx_col)

		# ---- row_metadata_df: labels, then per-column length, dtype and values ----
		row_gct = from_gct.row_metadata_df
		row_gctx = from_gctx.row_metadata_df
		self.assertTrue(set(row_gct.index) == set(row_gctx.index),
			"Mismatch between row_metadata_df index values of gct vs gctx: {} vs {}".format(row_gct.index, row_gctx.index))
		row_fields_gct = set(row_gct.columns)
		row_fields_gctx = set(row_gctx.columns)
		self.assertTrue(row_fields_gct == row_fields_gctx,
			"Mismatch between row_metadata_df column values of gct vs gctx: difference is {}".format(row_fields_gct.symmetric_difference(row_fields_gctx)))
		logger.debug("c1 gctoo row_metadata_df columns equal to gctoox row_metadata_df columns? {}".format(row_fields_gct == row_fields_gctx))
		logger.debug("c1 gctoo dtypes: {}".format(row_gct.dtypes))
		logger.debug("c1 gctoox dtypes: {}".format(row_gctx.dtypes))
		for col in row_gct.columns:
			gct_col = row_gct[col]
			gctx_col = row_gctx[col]
			self.assertTrue(len(list(gct_col)) == len(list(gctx_col)),
				"Lengths of column {} differ between gct and gctx".format(col))
			logger.debug("first couple elems of {} in gctoo: {}".format(col, list(gct_col)[:3]))
			self.assertTrue(gct_col.dtype == gctx_col.dtype,
				"Dtype mismatch for {} between parsed gct & gctx: {} vs {}".format(col, gct_col.dtype, gctx_col.dtype))
			assert_series_equal(gct_col, gctx_col)

		# ---- col_metadata_df: labels, then per-column length, dtype and values ----
		cmeta_gct = from_gct.col_metadata_df
		cmeta_gctx = from_gctx.col_metadata_df
		self.assertTrue(set(cmeta_gct.index) == set(cmeta_gctx.index),
			"Mismatch between col_metadata_df index values of gct vs gctx: {} vs {}".format(cmeta_gct.index, cmeta_gctx.index))
		cmeta_cols_match = set(cmeta_gct.columns) == set(cmeta_gctx.columns)
		self.assertTrue(cmeta_cols_match,
			"Mismatch between col_metadata_df column values of gct vs gctx: {} vs {}".format(cmeta_gct.columns, cmeta_gctx.columns))
		logger.debug("c1 gctoo col_metadata_df columns equal to gctoox col_metadata_df columns? {}".format(cmeta_cols_match))
		for col in cmeta_gct.columns:
			gct_col = cmeta_gct[col]
			gctx_col = cmeta_gctx[col]
			self.assertTrue(len(list(gct_col)) == len(list(gctx_col)),
				"Lengths of column {} differ between gct and gctx".format(col))
			self.assertTrue(gct_col.dtype == gctx_col.dtype,
				"Dtype mismatch between parsed gct & gctx: {} vs {}".format(gct_col.dtype, gctx_col.dtype))

			assert_series_equal(gct_col, gctx_col)
Ejemplo n.º 5
0
	def test_with_only_row_metadata(self):
		"""Parse the row-metadata-only example from .gct and .gctx and compare them.

		For each of data_df, row_metadata_df and col_metadata_df the test first
		compares the index and column label sets, then walks the gct-side
		columns and compares length, dtype (metadata frames only) and values.
		"""
		# the same example stored in both formats
		gct_file = FUNCTIONAL_TESTS_PATH + "/row_meta_only_example_n2x1203.gct"
		gctx_file = FUNCTIONAL_TESTS_PATH + "/row_meta_only_example_n2x1203.gctx"

		from_gct = parse_gct.parse(gct_file)
		from_gctx = parse_gctx.parse(gctx_file)

		# ---- data_df: labels, then per-column length and values ----
		data_gct = from_gct.data_df
		data_gctx = from_gctx.data_df
		self.assertTrue(set(data_gct.index) == set(data_gctx.index),
			"Mismatch between data_df index values of gct vs gctx: {} vs {}".format(data_gct.index, data_gctx.index))
		data_cols_match = set(data_gct.columns) == set(data_gctx.columns)
		self.assertTrue(data_cols_match,
			"Mismatch between data_df column values of gct vs gctx: {} vs {}".format(data_gct.columns, data_gctx.columns))
		logger.debug("c2 gctoo data_df columns equal to gctoox data_df columns? {}".format(data_cols_match))
		for col in data_gct.columns:
			gct_col = data_gct[col]
			gctx_col = data_gctx[col]
			self.assertTrue(len(list(gct_col)) == len(list(gctx_col)),
				"Lengths of column {} differ between gct and gctx".format(col))
			assert_series_equal(gct_col, gctx_col)

		# ---- row_metadata_df: labels, then per-column length, dtype and values ----
		row_gct = from_gct.row_metadata_df
		row_gctx = from_gctx.row_metadata_df
		self.assertTrue(set(row_gct.index) == set(row_gctx.index),
			"Mismatch between row_metadata_df index values of gct vs gctx: {} vs {}".format(row_gct.index, row_gctx.index))
		row_cols_match = set(row_gct.columns) == set(row_gctx.columns)
		self.assertTrue(row_cols_match,
			"Mismatch between row_metadata_df column values of gct vs gctx: {} vs {}".format(row_gct.columns, row_gctx.columns))
		logger.debug("c2 gctoo row_metadata_df columns equal to gctoox row_metadata_df columns? {}".format(row_cols_match))
		for col in row_gct.columns:
			gct_col = row_gct[col]
			gctx_col = row_gctx[col]
			self.assertTrue(len(list(gct_col)) == len(list(gctx_col)),
				"Lengths of column {} differ between gct and gctx".format(col))
			self.assertTrue(gct_col.dtype == gctx_col.dtype,
				"Dtype mismatch between parsed gct & gctx: {} vs {}".format(gct_col.dtype, gctx_col.dtype))
			logger.debug("first couple elems of {} in gctoo: {}".format(col, list(gct_col)[:3]))
			assert_series_equal(gct_col, gctx_col)

		# ---- col_metadata_df: labels, then per-column length, dtype and values ----
		cmeta_gct = from_gct.col_metadata_df
		cmeta_gctx = from_gctx.col_metadata_df
		self.assertTrue(set(cmeta_gct.index) == set(cmeta_gctx.index),
			"Mismatch between col_metadata_df index values of gct vs gctx: {} vs {}".format(cmeta_gct.index, cmeta_gctx.index))
		cmeta_cols_match = set(cmeta_gct.columns) == set(cmeta_gctx.columns)
		self.assertTrue(cmeta_cols_match,
			"Mismatch between col_metadata_df column values of gct vs gctx: {} vs {}".format(cmeta_gct.columns, cmeta_gctx.columns))
		logger.debug("c2 gctoo col_metadata_df columns equal to gctoox col_metadata_df columns? {}".format(cmeta_cols_match))
		for col in cmeta_gct.columns:
			gct_col = cmeta_gct[col]
			gctx_col = cmeta_gctx[col]
			self.assertTrue(len(list(gct_col)) == len(list(gctx_col)),
				"Lengths of column {} differ between gct and gctx".format(col))
			self.assertTrue(gct_col.dtype == gctx_col.dtype,
				"Dtype mismatch between parsed gct & gctx: {} vs {}".format(gct_col.dtype, gctx_col.dtype))
			assert_series_equal(gct_col, gctx_col)
Ejemplo n.º 6
0
def parse(file_path,
          convert_neg_666=True,
          rid=None,
          cid=None,
          ridx=None,
          cidx=None,
          meta_only=False,
          make_multiindex=False):
    """Parse a .gct or .gctx file by dispatching on the file extension.

    Input:
        Mandatory:
        - file_path (str): full path to the gct(x) file you want to parse.
            Must end in ".gct" or ".gctx" (the check is case-sensitive).

        Optional:
        - convert_neg_666 (bool): whether to convert -666 values to numpy.nan
            or not (see Note below for more details on this). Default = True.
        - rid (list of strings): list of row ids to specifically keep.
            Default = None.
        - cid (list of strings): list of col ids to specifically keep.
            Default = None.
        - ridx: forwarded to the gctx parser only. Default = None.
            NOTE(review): presumably row index positions to keep; confirm
            against parse_gctx.parse.
        - cidx: forwarded to the gctx parser only. Default = None.
            NOTE(review): presumably column index positions to keep; confirm
            against parse_gctx.parse.
        - meta_only: forwarded to the gctx parser only. Default = False.
            NOTE(review): semantics are defined by parse_gctx.parse; confirm
            there.
        - make_multiindex (bool): whether to create a multi-index df
            combining the 3 component dfs. Default = False.

    Output:
        - the object returned by parse_gct.parse or parse_gctx.parse
            (a GCToo, per the original documentation).

    Raises:
        Exception: if file_path ends in neither ".gct" nor ".gctx". The same
            message is logged at error level before raising.

    Note: why does convert_neg_666 exist?
        - In CMap--for somewhat obscure historical reasons--we use "-666" as
        our null value for metadata. However (so that users can take full
        advantage of pandas' methods, including those for filtering nan's
        etc) we provide the option of converting these into numpy.NaN
        values, the pandas default.
    """
    if file_path.endswith(".gct"):
        # ridx, cidx and meta_only are not passed on this path, so they have
        # no effect when parsing a plain .gct file.
        curr = parse_gct.parse(file_path, convert_neg_666, rid, cid,
                               make_multiindex)
    elif file_path.endswith(".gctx"):
        curr = parse_gctx.parse(file_path, convert_neg_666, rid, cid, ridx,
                                cidx, meta_only, make_multiindex)
    else:
        msg = "File to parse must be .gct or .gctx! file_path: {}".format(
            file_path)
        logger.error(msg)
        raise Exception(msg)
    return curr
Ejemplo n.º 7
0
    def test_parse(self):
        """Exercise parse_gctx.parse against the mini gctoo fixture.

        Covers, in order: a full parse; selection by string rid/cid;
        selection by integer rid/cid (using a temporary int-indexed gctx
        written to the working directory); and selection by ridx/cidx,
        rid/cidx and ridx/cid. Each parsed result is compared frame-by-frame
        with an in-memory reference.
        """
        mini_gctx_path = "functional_tests/mini_gctoo_for_testing.gctx"

        # parse whole thing
        mg1 = mini_gctoo_for_testing.make()
        mg2 = parse_gctx.parse(mini_gctx_path)

        assert_frame_equal(mg1.data_df, mg2.data_df)
        assert_frame_equal(mg1.row_metadata_df, mg2.row_metadata_df)
        assert_frame_equal(mg1.col_metadata_df, mg2.col_metadata_df)

        # test with string rid/cid
        test_rids = [
            'LJP007_MCF10A_24H:TRT_CP:BRD-K93918653:3.33',
            'LJP007_MCF7_24H:CTL_VEHICLE:DMSO:-666'
        ]
        test_cids = ['LJP007_MCF7_24H:TRT_POSCON:BRD-A61304759:10']
        mg3 = slice_gct.slice_gctoo(mg1, rid=test_rids, cid=test_cids)
        mg4 = parse_gctx.parse(mini_gctx_path,
                               rid=test_rids,
                               cid=test_cids)
        assert_frame_equal(mg3.data_df, mg4.data_df)
        assert_frame_equal(mg3.row_metadata_df, mg4.row_metadata_df)
        assert_frame_equal(mg3.col_metadata_df, mg4.col_metadata_df)

        # first, make & write out temp version of mini_gctoo with int rids/cids
        new_mg = mini_gctoo_for_testing.make(convert_neg_666=False)
        int_indexed_data_df = new_mg.data_df.copy()
        int_indexed_data_df.index = range(0, 6)
        int_indexed_data_df.columns = range(10, 16)

        int_indexed_row_meta = new_mg.row_metadata_df.copy()
        int_indexed_row_meta.index = range(0, 6)

        int_indexed_col_meta = new_mg.col_metadata_df.copy()
        int_indexed_col_meta.index = range(10, 16)

        int_indexed_gctoo = GCToo.GCToo(data_df=int_indexed_data_df,
                                        row_metadata_df=int_indexed_row_meta,
                                        col_metadata_df=int_indexed_col_meta)

        tmp_gctx_path = "int_indexed_mini_gctoo.gctx"
        try:
            write_gctx.write(int_indexed_gctoo, tmp_gctx_path)

            # test with integer rid/cid; the expected result is built only
            # after the write, exactly as before, in case the writer touches
            # the frames it is given
            mg5 = GCToo.GCToo(data_df=int_indexed_data_df,
                              row_metadata_df=int_indexed_row_meta,
                              col_metadata_df=int_indexed_col_meta)
            mg5 = slice_gct.slice_gctoo(
                mg5,
                row_bool=[True, False, True, False, True, False],
                col_bool=[True, False, False, True, True, True])

            # name the axes so the reference frames can match the parsed ones
            mg5.data_df.index.name = "rid"
            mg5.data_df.columns.name = "cid"

            mg5.row_metadata_df.index.name = "rid"
            mg5.row_metadata_df.columns.name = "rhd"

            mg5.col_metadata_df.index.name = "cid"
            mg5.col_metadata_df.columns.name = "chd"

            mg6 = parse_gctx.parse(tmp_gctx_path,
                                   rid=[0, 2, 4],
                                   cid=[10, 13, 14, 15],
                                   convert_neg_666=False)
        finally:
            # always clean up the temp file, even if the write or parse
            # failed, so a failing run does not leave it behind
            if os.path.exists(tmp_gctx_path):
                os.remove(tmp_gctx_path)

        assert_frame_equal(mg5.data_df, mg6.data_df)
        assert_frame_equal(mg5.row_metadata_df, mg6.row_metadata_df)
        assert_frame_equal(mg5.col_metadata_df, mg6.col_metadata_df)

        # test with ridx/cidx
        mg7 = slice_gct.slice_gctoo(
            mg1,
            rid=['LJP007_MCF7_24H:CTL_VEHICLE:DMSO:-666'],
            cid='LJP007_MCF7_24H:CTL_VEHICLE:DMSO:-666')
        mg8 = parse_gctx.parse(mini_gctx_path,
                               ridx=[4],
                               cidx=[4])

        assert_frame_equal(mg7.data_df, mg8.data_df)
        assert_frame_equal(mg7.row_metadata_df, mg8.row_metadata_df)
        assert_frame_equal(mg7.col_metadata_df, mg8.col_metadata_df)

        # test with rid/cidx
        mg9 = parse_gctx.parse(mini_gctx_path,
                               rid=['LJP007_MCF7_24H:CTL_VEHICLE:DMSO:-666'],
                               cidx=[4])

        assert_frame_equal(mg7.data_df, mg9.data_df)
        assert_frame_equal(mg7.row_metadata_df, mg9.row_metadata_df)
        assert_frame_equal(mg7.col_metadata_df, mg9.col_metadata_df)

        # test with ridx/cid
        mg10 = parse_gctx.parse(mini_gctx_path,
                                ridx=[4],
                                cid=['LJP007_MCF7_24H:CTL_VEHICLE:DMSO:-666'])

        assert_frame_equal(mg7.data_df, mg10.data_df)
        assert_frame_equal(mg7.row_metadata_df, mg10.row_metadata_df)
        assert_frame_equal(mg7.col_metadata_df, mg10.col_metadata_df)