예제 #1
0
    def test_parse_metadata_df(self):
        """Check parse_metadata_df against the mini GCToo fixture and mock datasets.

        Three situations are covered: row metadata parsed with -666
        conversion, column metadata parsed without it, and a dict of mock
        datasets whose ids are digit strings.
        """
        mini_gctoo = mini_gctoo_for_testing.make()
        # expected row metadata: every -666 flavour replaced by np.nan
        mini_row_meta = mini_gctoo.row_metadata_df.replace([-666, "-666", -666.0], [np.nan, np.nan, np.nan])
        logger.debug("mini_row_meta.shape:  {}".format(mini_row_meta.shape))
        logger.debug("mini_row_meta.index:  {}".format(mini_row_meta.index))
        logger.debug("mini_row_meta.columns:  {}".format(mini_row_meta.columns))
        logger.debug("mini_row_meta.dtypes:  {}".format(mini_row_meta.dtypes))

        # The context manager releases the HDF5 handle even when an assertion
        # below fails; the datasets are only used while the file is open.
        with h5py.File("functional_tests/mini_gctoo_for_testing.gctx", "r") as gctx_file:
            row_dset = gctx_file[row_meta_group_node]
            col_dset = gctx_file[col_meta_group_node]

            # with convert_neg_666
            row_df = parse_gctx.parse_metadata_df("row", row_dset, True)
            logger.debug("row_df.dtypes:  {}".format(row_df.dtypes))
            pandas_testing.assert_frame_equal(mini_row_meta, row_df)

            # no convert_neg_666
            mini_gctoo_with_neg_666 = mini_gctoo_for_testing.make(convert_neg_666=False)
            col_df = parse_gctx.parse_metadata_df("col", col_dset, False)
            pandas_testing.assert_frame_equal(mini_gctoo_with_neg_666.col_metadata_df, col_df)

        # test that ID's are not converted to numeric
        expected_rids = [str(i) for i in range(3)]
        row_dset = {"id": MockHdf5Dset(expected_rids, str),
                    "other_meta": MockHdf5Dset(range(3, 6), str)}
        r = parse_gctx.parse_metadata_df("row", row_dset, True)
        logger.debug("test that ID's are not converted to numeric - r:  {}".format(r))
        logger.debug("r.index:  {}".format(r.index))
        self.assertEqual(set(expected_rids), set(r.index))
예제 #2
0
    def test_gct_parsing(self):
        """Parse the mini .gct fixture plainly, without -666 conversion, and subsetted."""
        gct_path = "cmapPy/pandasGEXpress/tests/functional_tests/mini_gctoo_for_testing.gct"
        plate_column = "mfc_plate_id"

        # a plain parse has to reproduce the in-memory fixture frame by frame
        expected = mini_gctoo_for_testing.make()
        parsed = parse.parse(gct_path)
        for frame_name in ("data_df", "row_metadata_df", "col_metadata_df"):
            pandas_testing.assert_frame_equal(getattr(expected, frame_name),
                                              getattr(parsed, frame_name))

        # after the default parse every plate id is expected to be null
        plate_ids = parsed.col_metadata_df[plate_column]
        self.assertTrue(plate_ids.isnull().all())

        # with convert_neg_666 switched off the -666 values are expected as-is
        unconverted = parse.parse(gct_path, convert_neg_666=False)
        raw_plate_ids = unconverted.col_metadata_df[plate_column].values.tolist()
        self.assertCountEqual(raw_plate_ids, [-666] * 6)

        # subsetting: one row selected by id, two columns selected by position
        # (this call keeps the path exactly as the original spelled it)
        my_rid = "LJP007_MCF10A_24H:TRT_CP:BRD-K93918653:3.33"
        subset = parse.parse(
            "cmapPy/pandasGEXpress/tests/functional_tests//mini_gctoo_for_testing.gct",
            cidx=[0, 2],
            rid=[my_rid])

        subset_values = subset.data_df.values.flatten().tolist()
        self.assertEqual(subset.data_df.shape, (1, 2))
        self.assertCountEqual(subset_values, [1., 3.])
        self.assertEqual(subset.row_metadata_df.index[0], my_rid)
예제 #3
0
    def test_get_ordered_idx(self):
        """Exercise get_ordered_idx with id_type None, "id" and "idx"."""
        mg = mini_gctoo_for_testing.make()
        failure_template = "Expected ordered idx to be {} but got {}"

        # case 1: id_type == None -> every position of the row metadata
        every_position = list(range(0, 6))
        got_all = parse_gctx.get_ordered_idx(
            None, [], mg.row_metadata_df, sort_idx=True)
        self.assertEqual(got_all, every_position,
                         failure_template.format(every_position, got_all))

        # case 2: id_type == "id" -> position of the requested column id
        wanted_ids = ['LJP007_MCF7_24H:CTL_VEHICLE:DMSO:-666']
        got_by_id = parse_gctx.get_ordered_idx(
            "id", wanted_ids, mg.col_metadata_df, sort_idx=True)
        self.assertEqual(got_by_id, [4],
                         failure_template.format([4], got_by_id))

        # case 3: id_type == "idx" -> the given positions, sorted
        got_by_idx = parse_gctx.get_ordered_idx(
            "idx", [5, 1, 3], mg.col_metadata_df, sort_idx=True)
        self.assertEqual(got_by_idx, [1, 3, 5],
                         failure_template.format([1, 3, 5], got_by_idx))
예제 #4
0
    def test_make_specified_size_gctoo(self):
        """Cover make_specified_size_gctoo: invalid dim, row/col slices, oversized requests."""
        mini_gctoo = mini_gctoo_for_testing.make()
        logger.debug("mini gctoo data_df shape: {}".format(mini_gctoo.data_df.shape))
        logger.debug("mini gctoo row_meta shape: {}".format(mini_gctoo.row_metadata_df.shape))
        logger.debug("mini gctoo col_meta shape: {}".format(mini_gctoo.col_metadata_df.shape))

        # fragment expected inside the error raised for oversized requests
        oversize_fragment = "number of entries must be smaller than dimension being subsetted "

        # case 1: dim isn't 'row' or 'col'
        with self.assertRaises(AssertionError) as bad_dim:
            random_slice.make_specified_size_gctoo(mini_gctoo, 3, "aaaalll")
        self.assertEqual(str(bad_dim.exception),
                         "dim specified must be either 'row' or 'col'")

        # case 2: row subsetting - happy
        row_subset = random_slice.make_specified_size_gctoo(mini_gctoo, 3, "row")
        shape = row_subset.data_df.shape
        self.assertEqual(
            shape, (3, 6),
            "data_df after row slice is incorrect shape: {} vs (3,6)".format(shape))
        shape = row_subset.row_metadata_df.shape
        self.assertEqual(
            shape, (3, 5),
            "row_metadata_df after row slice is incorrect shape: {} vs (3,5)".format(shape))
        shape = row_subset.col_metadata_df.shape
        self.assertEqual(
            shape, (6, 5),
            "col_metadata_df after row slice is incorrect shape: {} vs (6,5)".format(shape))

        # case 3: row subsetting - sample subset > og # of samples
        with self.assertRaises(AssertionError) as too_many_rows:
            random_slice.make_specified_size_gctoo(mini_gctoo, 30, "row")
        self.assertTrue(oversize_fragment in str(too_many_rows.exception))

        # case 4: col subsetting - happy
        col_subset = random_slice.make_specified_size_gctoo(mini_gctoo, 3, "col")
        shape = col_subset.data_df.shape
        self.assertEqual(
            shape, (6, 3),
            "data_df after col slice is incorrect shape: {} vs (6,3)".format(shape))
        shape = col_subset.row_metadata_df.shape
        self.assertEqual(
            shape, (6, 5),
            "row_metadata_df after col slice is incorrect shape: {} vs (6, 5)".format(shape))
        shape = col_subset.col_metadata_df.shape
        self.assertEqual(
            shape, (3, 5),
            "col_metadata_df after col slice is incorrect shape: {} vs (3,5)".format(shape))

        # case 5: col subsetting - sample subset > og # of samples
        with self.assertRaises(AssertionError) as too_many_cols:
            random_slice.make_specified_size_gctoo(mini_gctoo, 7, "col")
        self.assertTrue(oversize_fragment in str(too_many_cols.exception))
예제 #5
0
	def test_parse_metadata_df(self):
		"""Check parse_metadata_df for row and column metadata of the mini GCToo fixture.

		Row metadata is parsed with -666 conversion and compared with the
		fixture after replacing -666 by np.nan; column metadata is parsed
		without conversion and compared with the unconverted fixture.
		"""
		mini_gctoo = mini_gctoo_for_testing.make()
		# expected row metadata: every -666 flavour replaced by np.nan
		mini_row_meta = mini_gctoo.row_metadata_df.replace([-666, "-666", -666.0], [np.nan, np.nan, np.nan])

		# The context manager releases the HDF5 handle even when an assertion
		# below fails; the datasets are only used while the file is open.
		with h5py.File("functional_tests/mini_gctoo_for_testing.gctx", "r") as gctx_file:
			row_dset = gctx_file[row_meta_group_node]
			col_dset = gctx_file[col_meta_group_node]

			# with convert_neg_666
			row_df = parse_gctx.parse_metadata_df("row", row_dset, True)
			assert_frame_equal(mini_row_meta, row_df)

			# no convert_neg_666
			mini_gctoo_with_neg_666 = mini_gctoo_for_testing.make(convert_neg_666=False)
			col_df = parse_gctx.parse_metadata_df("col", col_dset, False)
			assert_frame_equal(mini_gctoo_with_neg_666.col_metadata_df, col_df)
예제 #6
0
    def test_write_src(self):
        """Check the src attribute stored in files produced by write_gctx.write.

        Case 1 clears ``src`` on the GCToo object and expects the attribute
        to equal the written file name; case 2 leaves ``src`` untouched and
        expects "mini_gctoo.gctx".
        """
        # case 1: gctoo obj doesn't have src
        mini1 = mini_gctoo_for_testing.make()
        mini1.src = None
        try:
            # written without extension; the file is read back as
            # "no_src_example.gctx" below
            write_gctx.write(mini1, "no_src_example")
            # explicit read-only mode; the context manager closes the handle
            with h5py.File("no_src_example.gctx", "r") as hdf5_file:
                hdf5_src1 = hdf5_file.attrs[write_gctx.src_attr]
            self.assertEqual(hdf5_src1, "no_src_example.gctx")
        finally:
            # do not leave the temporary file behind when a check fails
            if os.path.exists("no_src_example.gctx"):
                os.remove("no_src_example.gctx")

        # case 2: gctoo obj does have src
        mini2 = mini_gctoo_for_testing.make()
        try:
            write_gctx.write(mini2, "with_src_example.gctx")
            with h5py.File("with_src_example.gctx", "r") as hdf5_file:
                hdf5_src2 = hdf5_file.attrs[write_gctx.src_attr]
            self.assertEqual(hdf5_src2, "mini_gctoo.gctx")
        finally:
            if os.path.exists("with_src_example.gctx"):
                os.remove("with_src_example.gctx")
예제 #7
0
    def test_write_version(self):
        """Check that written files always carry write_gctx.version_number.

        The version attribute of the written file is expected to equal
        ``write_gctx.version_number`` both when the GCToo object has no
        version and when it carries a custom one.
        """
        # TODO @oana refactor this test so it just calls the write_version method
        # case 1: gctoo obj doesn't have version
        mini1 = mini_gctoo_for_testing.make()
        mini1.version = None
        fn = "no_version_provided_example.gctx"
        try:
            write_gctx.write(mini1, fn)
            # explicit read-only mode; the context manager closes the handle
            with h5py.File(fn, "r") as hdf5_file:
                hdf5_v1 = hdf5_file.attrs[write_gctx.version_attr]
            self.assertEqual(hdf5_v1, write_gctx.version_number)
        finally:
            # do not leave the temporary file behind when a check fails
            if os.path.exists(fn):
                os.remove(fn)

        # case 2: gctoo obj does have version, but it is not used when writing
        mini2 = mini_gctoo_for_testing.make()
        mini2.version = "MY_VERSION"
        fn = "with_version_provided_example.gctx"
        try:
            write_gctx.write(mini2, fn)
            with h5py.File(fn, "r") as hdf5_file:
                hdf5_v2 = hdf5_file.attrs[write_gctx.version_attr]
            self.assertEqual(hdf5_v2, write_gctx.version_number)
        finally:
            if os.path.exists(fn):
                os.remove(fn)
예제 #8
0
	def test_set_metadata_index_and_column_names(self):
		"""Check the axis names set_metadata_index_and_column_names applies per dim."""
		gctoo = mini_gctoo_for_testing.make()
		row_meta = gctoo.row_metadata_df
		col_meta = gctoo.col_metadata_df

		# wipe the existing axis names so the assertions below only pass
		# if the function under test sets them
		for meta_df in (row_meta, col_meta):
			meta_df.index.name = None
			meta_df.columns.name = None

		# case 1: dim == "row"
		parse_gctx.set_metadata_index_and_column_names("row", row_meta)
		self.assertEqual(row_meta.index.name, "rid")
		self.assertEqual(row_meta.columns.name, "rhd")

		# case 2: dim == "col"
		parse_gctx.set_metadata_index_and_column_names("col", col_meta)
		self.assertEqual(col_meta.index.name, "cid")
		self.assertEqual(col_meta.columns.name, "chd")
예제 #9
0
    def test_gct_parsing(self):
        """Parse the mini .gct fixture and check that subsetting arguments are rejected."""
        gct_path = "functional_tests/mini_gctoo_for_testing.gct"
        plate_column = "mfc_plate_id"

        # a plain parse has to reproduce the in-memory fixture frame by frame
        expected = mini_gctoo_for_testing.make()
        parsed = parse.parse(gct_path)
        for frame_name in ("data_df", "row_metadata_df", "col_metadata_df"):
            pandas_testing.assert_frame_equal(getattr(expected, frame_name),
                                              getattr(parsed, frame_name))

        # after the default parse every plate id is expected to be null
        self.assertTrue(parsed.col_metadata_df[plate_column].isnull().all())

        # with convert_neg_666 switched off they must not all be null
        unconverted = parse.parse(gct_path, convert_neg_666=False)
        self.assertFalse(unconverted.col_metadata_df[plate_column].isnull().all())

        # each of rid, cid, ridx and cidx is expected to raise with the same
        # "unused argument" message; checked one at a time, in that order
        unused_argument_sets = (
            {"rid": ["a"]},
            {"cid": ["a"]},
            {"ridx": [0]},
            {"cidx": [0]},
        )
        for unused_kwargs in unused_argument_sets:
            with self.assertRaises(Exception) as context:
                parse.parse(gct_path, **unused_kwargs)
            self.assertTrue(
                "parse_gct does not use the argument" in str(context.exception))
예제 #10
0
    def test_gctx_parsing(self):
        """Run parse.parse on the mini .gctx fixture with the options exercised below."""
        gctx_path = "../functional_tests/mini_gctoo_for_testing.gctx"
        frame_names = ("data_df", "row_metadata_df", "col_metadata_df")
        plate_column = "mfc_plate_id"
        vehicle_id = 'LJP007_MCF7_24H:CTL_VEHICLE:DMSO:-666'

        # plain parse, no other arguments
        fixture = mini_gctoo_for_testing.make()
        parsed = parse.parse(gctx_path)
        for frame_name in frame_names:
            pandas_testing.assert_frame_equal(getattr(fixture, frame_name),
                                              getattr(parsed, frame_name))

        # after the default parse every plate id is expected to be null
        self.assertTrue(parsed.col_metadata_df[plate_column].isnull().all())

        # with convert_neg_666 switched off they must not all be null
        unconverted = parse.parse(gctx_path, convert_neg_666=False)
        self.assertFalse(unconverted.col_metadata_df[plate_column].isnull().all())

        # rids & cids specified: parsing must match subsetting the fixture
        test_rids = ['LJP007_MCF10A_24H:TRT_CP:BRD-K93918653:3.33', vehicle_id]
        test_cids = ['LJP007_MCF7_24H:TRT_POSCON:BRD-A61304759:10']
        subset_by_id = subset_gctoo.subset_gctoo(fixture, rid=test_rids, cid=test_cids)
        parsed_by_id = parse.parse(gctx_path, rid=test_rids, cid=test_cids)
        for frame_name in frame_names:
            pandas_testing.assert_frame_equal(getattr(subset_by_id, frame_name),
                                              getattr(parsed_by_id, frame_name))

        # ridx & cidx specified: position 4 is compared with the vehicle id subset
        subset_vehicle = subset_gctoo.subset_gctoo(fixture, rid=[vehicle_id], cid=[vehicle_id])
        parsed_by_idx = parse.parse(gctx_path, ridx=[4], cidx=[4])
        for frame_name in frame_names:
            pandas_testing.assert_frame_equal(getattr(subset_vehicle, frame_name),
                                              getattr(parsed_by_idx, frame_name))

        # row metadata only: the result is compared directly as a frame
        row_meta_only = parse.parse(gctx_path, row_meta_only=True)
        pandas_testing.assert_frame_equal(row_meta_only, fixture.row_metadata_df)

        # col metadata only
        col_meta_only = parse.parse(gctx_path, col_meta_only=True)
        pandas_testing.assert_frame_equal(col_meta_only, fixture.col_metadata_df)

        # multiindex requested: multi_index_df has to be populated
        with_multiindex = parse.parse(gctx_path, make_multiindex=True)
        self.assertTrue(with_multiindex.multi_index_df is not None)
예제 #11
0
	def test_parse(self):
		"""Compare parse_gctx.parse output with in-memory slices of the mini GCToo fixture.

		Covers a full parse, string rid/cid, integer-valued rid/cid on a
		temporary file written by this test, and the ridx/cidx, rid/cidx and
		ridx/cid combinations.
		"""
		# parse whole thing
		mg1 = mini_gctoo_for_testing.make()
		mg2 = parse_gctx.parse("functional_tests/mini_gctoo_for_testing.gctx")

		assert_frame_equal(mg1.data_df, mg2.data_df)
		assert_frame_equal(mg1.row_metadata_df, mg2.row_metadata_df)
		assert_frame_equal(mg1.col_metadata_df, mg2.col_metadata_df)

		# test with string rid/cid
		test_rids = ['LJP007_MCF10A_24H:TRT_CP:BRD-K93918653:3.33','LJP007_MCF7_24H:CTL_VEHICLE:DMSO:-666']
		test_cids = ['LJP007_MCF7_24H:TRT_POSCON:BRD-A61304759:10']
		mg3 = slice_gct.slice_gctoo(mg1, rid=test_rids, cid=test_cids)
		mg4 = parse_gctx.parse("functional_tests/mini_gctoo_for_testing.gctx",
			rid = test_rids, cid = test_cids)
		assert_frame_equal(mg3.data_df, mg4.data_df)
		assert_frame_equal(mg3.row_metadata_df, mg4.row_metadata_df)
		assert_frame_equal(mg3.col_metadata_df, mg4.col_metadata_df)

		# first, make & write out temp version of mini_gctoo with int rids/cids
		new_mg = mini_gctoo_for_testing.make(convert_neg_666=False)
		int_indexed_data_df = new_mg.data_df.copy()
		int_indexed_data_df.index = range(0,6)
		int_indexed_data_df.columns = range(10,16)

		int_indexed_row_meta = new_mg.row_metadata_df.copy()
		int_indexed_row_meta.index = range(0,6)

		int_indexed_col_meta = new_mg.col_metadata_df.copy()
		int_indexed_col_meta.index = range(10,16)

		int_indexed_gctoo = GCToo.GCToo(data_df = int_indexed_data_df, row_metadata_df = int_indexed_row_meta,
			col_metadata_df = int_indexed_col_meta)

		try:
			write_gctx.write(int_indexed_gctoo, "int_indexed_mini_gctoo.gctx")

			# test with numeric (repr as string) rid/cid
			mg5 = GCToo.GCToo(data_df = int_indexed_data_df, row_metadata_df = int_indexed_row_meta,
				col_metadata_df = int_indexed_col_meta)
			mg5 = slice_gct.slice_gctoo(mg5, row_bool = [True, False, True, False, True, False],
				col_bool = [True, False, False, True, True, True])

			mg5.data_df.index.name = "rid"
			mg5.data_df.columns.name = "cid"

			mg5.row_metadata_df.index.name = "rid"
			mg5.row_metadata_df.columns.name = "rhd"

			mg5.col_metadata_df.index.name = "cid"
			mg5.col_metadata_df.columns.name = "chd"

			mg6 = parse_gctx.parse("int_indexed_mini_gctoo.gctx", rid = [0, 2, 4],
				cid = [10,13,14,15], convert_neg_666=False)
		finally:
			# remove the temporary file even if writing, slicing or parsing failed
			if os.path.exists("int_indexed_mini_gctoo.gctx"):
				os.remove("int_indexed_mini_gctoo.gctx")

		assert_frame_equal(mg5.data_df, mg6.data_df)
		assert_frame_equal(mg5.row_metadata_df, mg6.row_metadata_df)
		assert_frame_equal(mg5.col_metadata_df, mg6.col_metadata_df)

		# test with ridx/cidx
		# cid is passed as a list, like rid here and cid=test_cids above
		mg7 = slice_gct.slice_gctoo(mg1, rid=['LJP007_MCF7_24H:CTL_VEHICLE:DMSO:-666'],
			cid=['LJP007_MCF7_24H:CTL_VEHICLE:DMSO:-666'])
		mg8 = parse_gctx.parse("functional_tests/mini_gctoo_for_testing.gctx", ridx=[4], cidx=[4])

		assert_frame_equal(mg7.data_df, mg8.data_df)
		assert_frame_equal(mg7.row_metadata_df, mg8.row_metadata_df)
		assert_frame_equal(mg7.col_metadata_df, mg8.col_metadata_df)

		# test with rid/cidx
		mg9 = parse_gctx.parse("functional_tests/mini_gctoo_for_testing.gctx", rid=['LJP007_MCF7_24H:CTL_VEHICLE:DMSO:-666'],
			cidx = [4])

		assert_frame_equal(mg7.data_df, mg9.data_df)
		assert_frame_equal(mg7.row_metadata_df, mg9.row_metadata_df)
		assert_frame_equal(mg7.col_metadata_df, mg9.col_metadata_df)

		# test with ridx/cid
		mg10 = parse_gctx.parse("functional_tests/mini_gctoo_for_testing.gctx", ridx=[4],
			cid = ['LJP007_MCF7_24H:CTL_VEHICLE:DMSO:-666'])

		assert_frame_equal(mg7.data_df, mg10.data_df)
		assert_frame_equal(mg7.row_metadata_df, mg10.row_metadata_df)
		assert_frame_equal(mg7.col_metadata_df, mg10.col_metadata_df)
예제 #12
0
    def test_write_metadata(self):
        """Round-trip row and column metadata through write_metadata and the metadata parsers.

        Case 1 writes metadata that contains -666 without conversion. Case 2
        first replaces -666 by NaN and asks write_metadata to convert it back.
        In both cases the file is re-read with convert_neg_666=False and its
        columns, index and per-column value sets are compared with the
        unconverted fixture metadata.
        """
        metadata_path = FUNCTIONAL_TESTS_PATH + "/mini_gctoo_metadata.gctx"

        # CASE 1:
        #   - write metadata (has '-666') to file, do not convert -666
        #   - parse in written metadata, don't convert -666
        mini_gctoo = mini_gctoo_for_testing.make(convert_neg_666=False)
        try:
            # the context manager closes the writer even if a write call raises
            with h5py.File(metadata_path, "w") as hdf5_writer:
                write_gctx.write_metadata(hdf5_writer, "row", mini_gctoo.row_metadata_df, False, 6)
                write_gctx.write_metadata(hdf5_writer, "col", mini_gctoo.col_metadata_df, False, 6)
            logger.debug("Wrote mini_gctoo_metadata.gctx to {}".format(
                os.path.join(FUNCTIONAL_TESTS_PATH, "mini_gctoo_metadata.gctx")))

            # read in written metadata
            mini_gctoo_col_metadata = parse_gctx.get_column_metadata(metadata_path,
                                                                     convert_neg_666=False)
            mini_gctoo_row_metadata = parse_gctx.get_row_metadata(metadata_path,
                                                                  convert_neg_666=False)
        finally:
            # delete the temporary file whether or not writing/parsing succeeded
            if os.path.exists(metadata_path):
                os.remove(metadata_path)

        # check row metadata
        self.assertTrue(set(mini_gctoo.row_metadata_df.columns) == set(mini_gctoo_row_metadata.columns),
                        "Mismatch between expected row metadata columns {} and column values written to file: {}".format(
                            mini_gctoo.row_metadata_df.columns, mini_gctoo_row_metadata.columns))
        # compare against the index read back from the file (not the fixture's col index)
        self.assertTrue(set(mini_gctoo.row_metadata_df.index) == set(mini_gctoo_row_metadata.index),
                        "Mismatch between expect row metadata index {} and index values written to file: {}".format(
                            mini_gctoo.row_metadata_df.index, mini_gctoo_row_metadata.index))
        for c in list(mini_gctoo.row_metadata_df.columns):
            logger.debug("C1: For column name: {}".format(c))
            logger.debug("C1: populated values: {}".format(set(mini_gctoo_row_metadata[c])))
            logger.debug("C1: mini_gctoo values: {}".format(set(mini_gctoo.row_metadata_df[c])))
            self.assertTrue(set(mini_gctoo.row_metadata_df[c]) == set(mini_gctoo_row_metadata[c]),
                            "Values in column {} differ between expected metadata and written row metadata: {} vs {}".format(
                                c, set(mini_gctoo.row_metadata_df[c]), set(mini_gctoo_row_metadata[c])))

        # check col metadata
        self.assertTrue(set(mini_gctoo.col_metadata_df.columns) == set(mini_gctoo_col_metadata.columns),
                        "Mismatch between expected col metadata columns {} and column values written to file: {}".format(
                            mini_gctoo.col_metadata_df.columns, mini_gctoo_col_metadata.columns))
        # compare against the index read back from the file (not the fixture with itself)
        self.assertTrue(set(mini_gctoo.col_metadata_df.index) == set(mini_gctoo_col_metadata.index),
                        "Mismatch between expect col metadata index {} and index values written to file: {}".format(
                            mini_gctoo.col_metadata_df.index, mini_gctoo_col_metadata.index))
        for c in list(mini_gctoo.col_metadata_df.columns):
            self.assertTrue(set(mini_gctoo.col_metadata_df[c]) == set(mini_gctoo_col_metadata[c]),
                            "Values in column {} differ between expected metadata and written col metadata!".format(c))

        # CASE 2:
        #   - write metadata (has NaN, not '-666') to file, do convert NaN back to '-666'
        #   - parse in written metadata, don't convert -666
        # first convert mini_gctoo's row & col metadata dfs -666s to NaN
        converted_row_metadata = mini_gctoo.row_metadata_df.replace([-666, "-666", -666.0],
                                                                    [numpy.nan, numpy.nan, numpy.nan])
        logger.debug("First row of converted_row_metadata: {}".format(converted_row_metadata.iloc[0]))
        converted_col_metadata = mini_gctoo.col_metadata_df.replace([-666, "-666", -666.0],
                                                                    [numpy.nan, numpy.nan, numpy.nan])

        try:
            # write row and col metadata fields from mini_gctoo_for_testing instance to file
            # Note this time does convert back to -666
            with h5py.File(metadata_path, "w") as hdf5_writer:
                write_gctx.write_metadata(hdf5_writer, "row", converted_row_metadata, True, 6)
                write_gctx.write_metadata(hdf5_writer, "col", converted_col_metadata, True, 6)

            # read in written metadata
            mini_gctoo_col_metadata = parse_gctx.get_column_metadata(metadata_path,
                                                                     convert_neg_666=False)
            mini_gctoo_row_metadata = parse_gctx.get_row_metadata(metadata_path,
                                                                  convert_neg_666=False)
        finally:
            # delete the temporary file whether or not writing/parsing succeeded
            if os.path.exists(metadata_path):
                os.remove(metadata_path)

        # check row metadata
        self.assertTrue(set(mini_gctoo.row_metadata_df.columns) == set(mini_gctoo_row_metadata.columns),
                        "Mismatch between expected row metadata columns {} and column values written to file: {}".format(
                            mini_gctoo.row_metadata_df.columns, mini_gctoo_row_metadata.columns))
        # compare against the index read back from the file (not the fixture's col index)
        self.assertTrue(set(mini_gctoo.row_metadata_df.index) == set(mini_gctoo_row_metadata.index),
                        "Mismatch between expect row metadata index {} and index values written to file: {}".format(
                            mini_gctoo.row_metadata_df.index, mini_gctoo_row_metadata.index))
        for c in list(mini_gctoo.row_metadata_df.columns):
            logger.debug("C2: For column name: {}".format(c))
            logger.debug("C2: populated values: {}".format(set(mini_gctoo_row_metadata[c])))
            logger.debug("C2: mini_gctoo values: {}".format(set(mini_gctoo.row_metadata_df[c])))
            self.assertTrue(set(mini_gctoo.row_metadata_df[c]) == set(mini_gctoo_row_metadata[c]),
                            "Values in column {} differ between expected metadata and written row metadata!".format(c))

        # check col metadata
        self.assertTrue(set(mini_gctoo.col_metadata_df.columns) == set(mini_gctoo_col_metadata.columns),
                        "Mismatch between expected col metadata columns {} and column values written to file: {}".format(
                            mini_gctoo.col_metadata_df.columns, mini_gctoo_col_metadata.columns))
        # compare against the index read back from the file (not the fixture with itself)
        self.assertTrue(set(mini_gctoo.col_metadata_df.index) == set(mini_gctoo_col_metadata.index),
                        "Mismatch between expect col metadata index {} and index values written to file: {}".format(
                            mini_gctoo.col_metadata_df.index, mini_gctoo_col_metadata.index))
        for c in list(mini_gctoo.col_metadata_df.columns):
            self.assertTrue(set(mini_gctoo.col_metadata_df[c]) == set(mini_gctoo_col_metadata[c]),
                            "Values in column {} differ between expected metadata and written col metadata!".format(c))
예제 #13
0
    def test_parse(self):
        # parse whole thing
        mg1 = mini_gctoo_for_testing.make()
        mg2 = parse_gctx.parse(
            "cmapPy/pandasGEXpress/tests/functional_tests/mini_gctoo_for_testing.gctx"
        )

        pandas_testing.assert_frame_equal(mg1.data_df, mg2.data_df)
        pandas_testing.assert_frame_equal(mg1.row_metadata_df,
                                          mg2.row_metadata_df)
        pandas_testing.assert_frame_equal(mg1.col_metadata_df,
                                          mg2.col_metadata_df)

        # test with string rid/cid
        test_rids = [
            'LJP007_MCF10A_24H:TRT_CP:BRD-K93918653:3.33',
            'LJP007_MCF7_24H:CTL_VEHICLE:DMSO:-666'
        ]
        test_cids = ['LJP007_MCF7_24H:TRT_POSCON:BRD-A61304759:10']
        mg3 = subset_gctoo.subset_gctoo(mg1, rid=test_rids, cid=test_cids)
        mg4 = parse_gctx.parse(
            "cmapPy/pandasGEXpress/tests/functional_tests/mini_gctoo_for_testing.gctx",
            rid=test_rids,
            cid=test_cids)
        pandas_testing.assert_frame_equal(mg3.data_df, mg4.data_df)
        pandas_testing.assert_frame_equal(mg3.row_metadata_df,
                                          mg4.row_metadata_df)
        pandas_testing.assert_frame_equal(mg3.col_metadata_df,
                                          mg4.col_metadata_df)

        # first, make & write out temp version of mini_gctoo with int rids/cids
        new_mg = mini_gctoo_for_testing.make(convert_neg_666=False)
        int_indexed_data_df = new_mg.data_df.copy()
        int_indexed_data_df.index = [str(i) for i in range(0, 6)]
        int_indexed_data_df.columns = [str(i) for i in range(10, 16)]

        int_indexed_row_meta = new_mg.row_metadata_df.copy()
        int_indexed_row_meta.index = int_indexed_data_df.index

        int_indexed_col_meta = new_mg.col_metadata_df.copy()
        int_indexed_col_meta.index = int_indexed_data_df.columns

        int_indexed_gctoo = GCToo.GCToo(data_df=int_indexed_data_df,
                                        row_metadata_df=int_indexed_row_meta,
                                        col_metadata_df=int_indexed_col_meta)

        write_gctx.write(int_indexed_gctoo, "int_indexed_mini_gctoo.gctx")

        # test with numeric (repr as string) rid/cid
        mg5 = GCToo.GCToo(data_df=int_indexed_data_df,
                          row_metadata_df=int_indexed_row_meta,
                          col_metadata_df=int_indexed_col_meta)
        mg5 = subset_gctoo.subset_gctoo(
            mg5,
            row_bool=[True, False, True, False, True, False],
            col_bool=[True, False, False, True, True, True])

        mg5.data_df.index.name = "rid"
        mg5.data_df.columns.name = "cid"

        mg5.row_metadata_df.index.name = "rid"
        mg5.row_metadata_df.columns.name = "rhd"

        mg5.col_metadata_df.index.name = "cid"
        mg5.col_metadata_df.columns.name = "chd"

        mg6 = parse_gctx.parse("int_indexed_mini_gctoo.gctx",
                               rid=["0", "2", "4"],
                               cid=["10", "13", "14", "15"],
                               convert_neg_666=False)

        os.remove("int_indexed_mini_gctoo.gctx")

        pandas_testing.assert_frame_equal(mg5.data_df, mg6.data_df)
        pandas_testing.assert_frame_equal(mg5.row_metadata_df,
                                          mg6.row_metadata_df)
        pandas_testing.assert_frame_equal(mg5.col_metadata_df,
                                          mg6.col_metadata_df)

        # test with ridx/cidx
        mg7 = subset_gctoo.subset_gctoo(
            mg1,
            rid=['LJP007_MCF7_24H:CTL_VEHICLE:DMSO:-666'],
            cid=['LJP007_MCF7_24H:CTL_VEHICLE:DMSO:-666'])
        mg8 = parse_gctx.parse(
            "cmapPy/pandasGEXpress/tests/functional_tests/mini_gctoo_for_testing.gctx",
            ridx=[4],
            cidx=[4])

        pandas_testing.assert_frame_equal(mg7.data_df, mg8.data_df)
        pandas_testing.assert_frame_equal(mg7.row_metadata_df,
                                          mg8.row_metadata_df)
        pandas_testing.assert_frame_equal(mg7.col_metadata_df,
                                          mg8.col_metadata_df)

        # test with rid/cidx
        mg9 = parse_gctx.parse(
            "cmapPy/pandasGEXpress/tests/functional_tests/mini_gctoo_for_testing.gctx",
            rid=['LJP007_MCF7_24H:CTL_VEHICLE:DMSO:-666'],
            cidx=[4])

        pandas_testing.assert_frame_equal(mg7.data_df, mg9.data_df)
        pandas_testing.assert_frame_equal(mg7.row_metadata_df,
                                          mg9.row_metadata_df)
        pandas_testing.assert_frame_equal(mg7.col_metadata_df,
                                          mg9.col_metadata_df)

        # test with ridx/cid
        mg10 = parse_gctx.parse(
            "cmapPy/pandasGEXpress/tests/functional_tests/mini_gctoo_for_testing.gctx",
            ridx=[4],
            cid=['LJP007_MCF7_24H:CTL_VEHICLE:DMSO:-666'])

        pandas_testing.assert_frame_equal(mg7.data_df, mg10.data_df)
        pandas_testing.assert_frame_equal(mg7.row_metadata_df,
                                          mg10.row_metadata_df)
        pandas_testing.assert_frame_equal(mg7.col_metadata_df,
                                          mg10.col_metadata_df)

        # test with row_meta_only
        mg11 = parse_gctx.parse(
            "cmapPy/pandasGEXpress/tests/functional_tests/mini_gctoo_for_testing.gctx",
            row_meta_only=True)
        pandas_testing.assert_frame_equal(mg11, mg1.row_metadata_df)

        # test with col_meta_only
        mg12 = parse_gctx.parse(
            "cmapPy/pandasGEXpress/tests/functional_tests/mini_gctoo_for_testing.gctx",
            col_meta_only=True)
        pandas_testing.assert_frame_equal(mg12, mg1.col_metadata_df)

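        # parse with the file path only (no ridx / sort_row_meta passed); this
        # result is never asserted on and is overwritten by the next mg13
        # assignment. sort_row_meta=False with ridx is covered by mg14 below.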
        mg13 = parse_gctx.parse(
            "cmapPy/pandasGEXpress/tests/functional_tests/mini_gctoo_for_testing.gctx",
        )

        # test with sort_col_meta False and cidx
        mg13 = parse_gctx.parse(
            "cmapPy/pandasGEXpress/tests/functional_tests//mini_gctoo_for_testing.gctx",
            cidx=[4, 1, 3],
            sort_col_meta=False)

        pandas_testing.assert_frame_equal(mg13.data_df,
                                          mg1.data_df.iloc[:, [4, 1, 3]])
        pandas_testing.assert_frame_equal(
            mg13.col_metadata_df, mg1.col_metadata_df.iloc[[4, 1, 3], :])
        pandas_testing.assert_frame_equal(mg13.row_metadata_df,
                                          mg1.row_metadata_df)

        # test with sort_row_meta False and ridx
        mg14 = parse_gctx.parse(
            "cmapPy/pandasGEXpress/tests/functional_tests//mini_gctoo_for_testing.gctx",
            ridx=[3, 0, 1],
            sort_row_meta=False)

        pandas_testing.assert_frame_equal(mg14.data_df,
                                          mg1.data_df.iloc[[3, 0, 1], :])
        pandas_testing.assert_frame_equal(mg14.col_metadata_df,
                                          mg1.col_metadata_df)
        pandas_testing.assert_frame_equal(
            mg14.row_metadata_df, mg1.row_metadata_df.iloc[[3, 0, 1], :])

        # test with sort_col_meta False and cidx and col_meta_only
        mg15 = parse_gctx.parse(
            "cmapPy/pandasGEXpress/tests/functional_tests//mini_gctoo_for_testing.gctx",
            cidx=[4, 1, 3],
            sort_col_meta=False,
            col_meta_only=True)
        pandas_testing.assert_frame_equal(
            mg15, mg1.col_metadata_df.iloc[[4, 1, 3], :])

        # test with sort_row_meta False and ridx and row_meta_only
        mg16 = parse_gctx.parse(
            "cmapPy/pandasGEXpress/tests/functional_tests//mini_gctoo_for_testing.gctx",
            ridx=[3, 0, 1],
            sort_row_meta=False,
            row_meta_only=True)
        pandas_testing.assert_frame_equal(
            mg16, mg1.row_metadata_df.iloc[[3, 0, 1], :])

        # test with sort_col_meta False and cid
        cid_unsorted = [
            'LJP007_MCF7_24H:TRT_POSCON:BRD-K81418486:10',
            'LJP007_MCF10A_24H:TRT_CP:BRD-K93918653:3.33'
        ]
        mg17 = parse_gctx.parse(
            "cmapPy/pandasGEXpress/tests/functional_tests//mini_gctoo_for_testing.gctx",
            cid=cid_unsorted,
            sort_col_meta=False)
        pandas_testing.assert_frame_equal(mg17.data_df,
                                          mg1.data_df.iloc[:, [2, 0]])
        pandas_testing.assert_frame_equal(mg17.col_metadata_df,
                                          mg1.col_metadata_df.iloc[[2, 0], :])
        pandas_testing.assert_frame_equal(mg17.row_metadata_df,
                                          mg1.row_metadata_df)

        # test with sort_row_meta False and rid
        rid_unsorted = [
            'LJP007_MCF7_24H:TRT_CP:BRD-K64857848:10',
            'MISC003_A375_24H:TRT_CP:BRD-K93918653:3.33'
        ]
        mg18 = parse_gctx.parse(
            "cmapPy/pandasGEXpress/tests/functional_tests/mini_gctoo_for_testing.gctx",
            rid=rid_unsorted,
            sort_row_meta=False)
        pandas_testing.assert_frame_equal(mg18.data_df,
                                          mg1.data_df.iloc[[5, 1], :])
        pandas_testing.assert_frame_equal(mg18.col_metadata_df,
                                          mg1.col_metadata_df)
        pandas_testing.assert_frame_equal(mg18.row_metadata_df,
                                          mg1.row_metadata_df.iloc[[5, 1], :])