Example #1
0
    def test_get_ordered_idx(self):
        """Check parse_gctx.get_ordered_idx for the None, "id" and "idx" id types.

        Each case passes an id type, a list of ids/indexes and one of the
        mini GCToo metadata frames, then compares the returned indexes to the
        expected ordered positions.
        """
        mg = mini_gctoo_for_testing.make()

        # case 1: id_type == None
        # Normalise both sides to lists: on Python 3 a list never compares
        # equal to a range object, so comparing against a bare range would
        # fail (or pass) depending only on the container type returned.
        expected1 = list(range(0, 6))
        case1 = list(
            parse_gctx.get_ordered_idx(None, [], mg.row_metadata_df))
        self.assertEqual(
            case1, expected1,
            "Expected ordered idx to be {} but got {}".format(
                expected1, case1))

        # case 2: id_type == "id"
        case2 = parse_gctx.get_ordered_idx(
            "id", ['LJP007_MCF7_24H:CTL_VEHICLE:DMSO:-666'],
            mg.col_metadata_df)
        self.assertEqual(
            case2, [4],
            "Expected ordered idx to be {} but got {}".format([4], case2))

        # case 3: id_type == "idx"; unsorted input is expected back sorted
        case3 = parse_gctx.get_ordered_idx("idx", [5, 1, 3],
                                           mg.col_metadata_df)
        self.assertEqual(
            case3, [1, 3, 5],
            "Expected ordered idx to be {} but got {}".format([1, 3, 5],
                                                              case3))
	def test_parse_metadata_df(self):
		"""Check parse_gctx.parse_metadata_df with and without -666 conversion.

		Row metadata is parsed with conversion enabled and compared to the mini
		GCToo row metadata with -666 values replaced by NaN. Column metadata is
		parsed with conversion disabled and compared to a mini GCToo built with
		convert_neg_666=False.
		"""
		mini_gctoo = mini_gctoo_for_testing.make()
		# convert row_metadata to np.nan
		mini_row_meta = mini_gctoo.row_metadata_df.replace([-666, "-666", -666.0], 
			[np.nan, np.nan, np.nan])

		# Open the fixture in a context manager so the HDF5 handle is closed
		# even when one of the assertions below fails.
		with h5py.File("functional_tests/mini_gctoo_for_testing.gctx", "r") as gctx_file:
			row_dset = gctx_file[row_meta_group_node]
			col_dset = gctx_file[col_meta_group_node]

			# with convert_neg_666
			row_df = parse_gctx.parse_metadata_df("row", row_dset, True)
			assert_frame_equal(mini_row_meta, row_df)

			# no convert_neg_666
			mini_gctoo_with_neg_666 = mini_gctoo_for_testing.make(convert_neg_666=False)
			col_df = parse_gctx.parse_metadata_df("col", col_dset, False)
			assert_frame_equal(mini_gctoo_with_neg_666.col_metadata_df, col_df)
Example #3
0
    def test_write_src(self):
        """Check the src attribute stored by write_gctx.write.

        Case 1 clears src on the GCToo object and expects the written file
        name to be stored; case 2 keeps the object's src and expects
        "mini_gctoo.gctx" to be stored.
        """
        # case 1: gctoo obj doesn't have src
        mini1 = mini_gctoo_for_testing.make()
        mini1.src = None
        try:
            # The output name is given without an extension; the file is then
            # read back as "no_src_example.gctx".
            write_gctx.write(mini1, "no_src_example")
            # Explicit read-only mode: the default mode differs between h5py
            # versions, and the context manager closes the handle on failure.
            with h5py.File("no_src_example.gctx", "r") as hdf5_file:
                hdf5_src1 = hdf5_file.attrs[write_gctx.src_attr]
            self.assertEqual(hdf5_src1, "no_src_example.gctx")
        finally:
            # Always clean up so a failed run does not leave the file behind.
            if os.path.exists("no_src_example.gctx"):
                os.remove("no_src_example.gctx")

        # case 2: gctoo obj does have src
        mini2 = mini_gctoo_for_testing.make()
        try:
            write_gctx.write(mini2, "with_src_example.gctx")
            with h5py.File("with_src_example.gctx", "r") as hdf5_file:
                hdf5_src2 = hdf5_file.attrs[write_gctx.src_attr]
            self.assertEqual(hdf5_src2, "mini_gctoo.gctx")
        finally:
            if os.path.exists("with_src_example.gctx"):
                os.remove("with_src_example.gctx")
Example #4
0
    def test_write_version(self):
        """Check the version attribute stored by write_gctx.write.

        In both cases (object version cleared, object version set to a custom
        value) the file is expected to carry write_gctx.version_number.
        """
        #TODO @oana refactor this test so it just calls the write_version method
        # case 1: gctoo obj doesn't have version
        mini1 = mini_gctoo_for_testing.make()
        mini1.version = None
        fn = "no_version_provided_example.gctx"
        try:
            write_gctx.write(mini1, fn)
            # Explicit read-only mode: the default mode differs between h5py
            # versions, and the context manager closes the handle on failure.
            with h5py.File(fn, "r") as hdf5_file:
                hdf5_v1 = hdf5_file.attrs[write_gctx.version_attr]
            self.assertEqual(hdf5_v1, write_gctx.version_number)
        finally:
            # Always clean up so a failed run does not leave the file behind.
            if os.path.exists(fn):
                os.remove(fn)

        # case 2: gctoo obj does have version, but it is not used when writing
        mini2 = mini_gctoo_for_testing.make()
        mini2.version = "MY_VERSION"
        fn = "with_version_provided_example.gctx"
        try:
            write_gctx.write(mini2, fn)
            with h5py.File(fn, "r") as hdf5_file:
                hdf5_v2 = hdf5_file.attrs[write_gctx.version_attr]
            self.assertEqual(hdf5_v2, write_gctx.version_number)
        finally:
            if os.path.exists(fn):
                os.remove(fn)
	def test_set_metadata_index_and_column_names(self):
		"""Verify that axis names are restored on cleared metadata frames.

		The index and columns names of both metadata frames are reset to None,
		then set_metadata_index_and_column_names is called per dimension and the
		resulting names are checked ("rid"/"rhd" for rows, "cid"/"chd" for cols).
		"""
		gct = mini_gctoo_for_testing.make()
		row_meta = gct.row_metadata_df
		col_meta = gct.col_metadata_df

		# start from frames whose axis names are unset
		for meta_df in (row_meta, col_meta):
			meta_df.index.name = None
			meta_df.columns.name = None

		# (dim, frame, expected index name, expected columns name)
		expectations = (
			("row", row_meta, "rid", "rhd"),  # case 1: dim == "row"
			("col", col_meta, "cid", "chd"),  # case 2: dim == "col"
		)
		for dim, meta_df, index_name, columns_name in expectations:
			parse_gctx.set_metadata_index_and_column_names(dim, meta_df)
			self.assertEqual(meta_df.index.name, index_name)
			self.assertEqual(meta_df.columns.name, columns_name)
	def test_make_specified_size_gctoo(self):
		"""Exercise random_slice.make_specified_size_gctoo along both dims.

		Covers an invalid dim value, a size-3 subset along rows and along
		columns (checking the shapes of all three frames), and requested sizes
		that exceed the original dimension.
		"""
		full_gctoo = mini_gctoo_for_testing.make()
		data_shape = full_gctoo.data_df.shape
		row_meta_shape = full_gctoo.row_metadata_df.shape
		col_meta_shape = full_gctoo.col_metadata_df.shape
		logger.debug("mini gctoo data_df shape: {}".format(data_shape))
		logger.debug("mini gctoo row_meta shape: {}".format(row_meta_shape))
		logger.debug("mini gctoo col_meta shape: {}".format(col_meta_shape))

		make_subset = random_slice.make_specified_size_gctoo

		def expect_assertion_error(size, dim, message):
			# the call must raise AssertionError carrying exactly `message`
			with self.assertRaises(AssertionError) as caught:
				make_subset(full_gctoo, size, dim)
			self.assertEqual(str(caught.exception), message)

		# case 1: dim isn't 'row' or 'col'
		expect_assertion_error(3, "aaaalll", "dim specified must be either 'row' or 'col'")

		# case 2: row subsetting - happy
		by_row = make_subset(full_gctoo, 3, "row")
		self.assertEqual(
			by_row.data_df.shape, (3,6),
			"data_df after row slice is incorrect shape: {} vs (3,6)".format(by_row.data_df.shape))
		self.assertEqual(
			by_row.row_metadata_df.shape, (3,5),
			"row_metadata_df after row slice is incorrect shape: {} vs (3,5)".format(by_row.row_metadata_df.shape))
		self.assertEqual(
			by_row.col_metadata_df.shape, (6,5),
			"col_metadata_df after row slice is incorrect shape: {} vs (6,5)".format(by_row.col_metadata_df.shape))

		# case 3: row subsetting - sample subset > og # of samples
		expect_assertion_error(30, "row", "number of samples must be subset of original file sample size")

		# case 4: col subsetting - happy
		by_col = make_subset(full_gctoo, 3, "col")
		self.assertEqual(
			by_col.data_df.shape, (6,3),
			"data_df after col slice is incorrect shape: {} vs (6,3)".format(by_col.data_df.shape))
		self.assertEqual(
			by_col.row_metadata_df.shape, (6, 5),
			"row_metadata_df after col slice is incorrect shape: {} vs (6, 5)".format(by_col.row_metadata_df.shape))
		self.assertEqual(
			by_col.col_metadata_df.shape, (3,5),
			"col_metadata_df after col slice is incorrect shape: {} vs (3,5)".format(by_col.col_metadata_df.shape))

		# case 5: col subsetting - sample subset > og # of samples
		expect_assertion_error(7, "col", "number of samples must be subset of original file sample size")
Example #7
0
import h5py
import GCTXAttrInfo
import parse_gctoox
import write_gctoox
import mini_gctoo_for_testing

# Module authorship metadata.
__author__ = "Oana Enache"
__email__ = "*****@*****.**"

# Relative name of the functional tests directory.
FUNCTIONAL_TESTS_PATH = "functional_tests"

# instantiate logger (shared logger, looked up by the name setup_logger defines)
logger = logging.getLogger(setup_logger.LOGGER_NAME)

# instance of mini_gctoo for testing
# NOTE: built once at import time and shared at module level.
mini_gctoo = mini_gctoo_for_testing.make()

# Node names/paths used by the tests.
# NOTE(review): these look like HDF5 locations inside a GCTX file (version
# attribute, row/column ids, data matrix, metadata groups) -- confirm against
# the GCTX writer/parser.
version_node = "version"
rid_node = "/0/META/ROW/id"
cid_node = "/0/META/COL/id"
data_node = "/0/DATA/0/matrix"
row_meta_group_node = "/0/META/ROW"
col_meta_group_node = "/0/META/COL"


class TestParseGCTooX(unittest.TestCase):
    def test_add_gctx_to_out_name(self):
        name1 = "my_cool_file"
        name2 = "my_other_cool_file.gctx"

        # case 1: out file name doesn't end in gctx
Example #8
0
    def test_write_metadata(self):
        """Round-trip row and column metadata through write_metadata and the parsers.

        CASE 1:
            - write metadata (has '-666') to file, do not convert -666
            - parse in written metadata, don't convert -666
        CASE 2:
            - write metadata (has NaN, not '-666') to file, do convert NaN
              back to '-666'
            - parse in written metadata, don't convert -666

        In both cases the parsed-back frames are compared (as sets of column
        labels, index labels and per-column values) against the original
        mini GCToo metadata built with convert_neg_666=False.
        """
        metadata_path = FUNCTIONAL_TESTS_PATH + "/mini_gctoo_metadata.gctx"

        def write_then_read(row_df, col_df, convert_back_to_neg_666):
            # Write both frames to a scratch file, parse them back, and always
            # remove the scratch file, even when writing or parsing fails.
            try:
                with h5py.File(metadata_path, "w") as hdf5_writer:
                    write_gctx.write_metadata(hdf5_writer, "row", row_df,
                                              convert_back_to_neg_666)
                    write_gctx.write_metadata(hdf5_writer, "col", col_df,
                                              convert_back_to_neg_666)
                logger.debug("Wrote mini_gctoo_metadata.gctx to {}".format(
                    os.path.join(FUNCTIONAL_TESTS_PATH,
                                 "mini_gctoo_metadata.gctx")))

                written_col = parse_gctx.get_column_metadata(
                    metadata_path, convert_neg_666=False)
                written_row = parse_gctx.get_row_metadata(
                    metadata_path, convert_neg_666=False)
            finally:
                if os.path.exists(metadata_path):
                    os.remove(metadata_path)
            return written_row, written_col

        def check_labels(dim, expected_df, written_df):
            # Compare column and index labels of the expected frame against the
            # frame parsed back from file (not against another expected frame).
            self.assertTrue(
                set(expected_df.columns) == set(written_df.columns),
                "Mismatch between expected {} metadata columns {} and column values written to file: {}"
                .format(dim, expected_df.columns, written_df.columns))
            self.assertTrue(
                set(expected_df.index) == set(written_df.index),
                "Mismatch between expect {} metadata index {} and index values written to file: {}"
                .format(dim, expected_df.index, written_df.index))

        def check_values(expected_df, written_df, failure_template,
                         log_tag=None):
            # Compare the set of values in every expected column; extra
            # positional arguments are ignored by templates without
            # placeholders for them.
            for c in list(expected_df.columns):
                expected_values = set(expected_df[c])
                written_values = set(written_df[c])
                if log_tag is not None:
                    logger.debug("{}: For column name: {}".format(log_tag, c))
                    logger.debug("{}: populated values: {}".format(
                        log_tag, written_values))
                    logger.debug("{}: mini_gctoo values: {}".format(
                        log_tag, expected_values))
                self.assertTrue(
                    expected_values == written_values,
                    failure_template.format(c, expected_values,
                                            written_values))

        mini_gctoo = mini_gctoo_for_testing.make(convert_neg_666=False)

        # ---- CASE 1: written as-is, no conversion in either direction ----
        mini_gctoo_row_metadata, mini_gctoo_col_metadata = write_then_read(
            mini_gctoo.row_metadata_df, mini_gctoo.col_metadata_df, False)

        # check row metadata
        check_labels("row", mini_gctoo.row_metadata_df,
                     mini_gctoo_row_metadata)
        check_values(
            mini_gctoo.row_metadata_df, mini_gctoo_row_metadata,
            "Values in column {} differ between expected metadata and written row metadata: {} vs {}",
            log_tag="C1")

        # check col metadata
        check_labels("col", mini_gctoo.col_metadata_df,
                     mini_gctoo_col_metadata)
        check_values(
            mini_gctoo.col_metadata_df, mini_gctoo_col_metadata,
            "Values in column {} differ between expected metadata and written col metadata!"
        )

        # ---- CASE 2: NaN written back as -666 ----
        # first convert mini_gctoo's row & col metadata dfs -666s to NaN
        converted_row_metadata = mini_gctoo.row_metadata_df.replace(
            [-666, "-666", -666.0], [numpy.nan, numpy.nan, numpy.nan])
        logger.debug("First row of converted_row_metadata: {}".format(
            converted_row_metadata.iloc[0]))
        converted_col_metadata = mini_gctoo.col_metadata_df.replace(
            [-666, "-666", -666.0], [numpy.nan, numpy.nan, numpy.nan])

        # write row and col metadata fields from mini_gctoo_for_testing instance to file
        # Note this time does convert back to -666
        mini_gctoo_row_metadata, mini_gctoo_col_metadata = write_then_read(
            converted_row_metadata, converted_col_metadata, True)

        # check row metadata
        check_labels("row", mini_gctoo.row_metadata_df,
                     mini_gctoo_row_metadata)
        check_values(
            mini_gctoo.row_metadata_df, mini_gctoo_row_metadata,
            "Values in column {} differ between expected metadata and written row metadata!",
            log_tag="C2")

        # check col metadata
        check_labels("col", mini_gctoo.col_metadata_df,
                     mini_gctoo_col_metadata)
        check_values(
            mini_gctoo.col_metadata_df, mini_gctoo_col_metadata,
            "Values in column {} differ between expected metadata and written col metadata!"
        )
Example #9
0
    def test_parse(self):
        """Check parse_gctx.parse on the whole file and on several subsets.

        Parsed results are compared against the in-memory mini GCToo (or a
        slice of it) for: no subsetting, string rid/cid, integer rid/cid
        (via a temporary int-indexed file), ridx/cidx, rid/cidx and ridx/cid.
        """
        gctx_path = "functional_tests/mini_gctoo_for_testing.gctx"

        # parse whole thing
        mg1 = mini_gctoo_for_testing.make()
        mg2 = parse_gctx.parse(gctx_path)

        assert_frame_equal(mg1.data_df, mg2.data_df)
        assert_frame_equal(mg1.row_metadata_df, mg2.row_metadata_df)
        assert_frame_equal(mg1.col_metadata_df, mg2.col_metadata_df)

        # test with string rid/cid
        test_rids = [
            'LJP007_MCF10A_24H:TRT_CP:BRD-K93918653:3.33',
            'LJP007_MCF7_24H:CTL_VEHICLE:DMSO:-666'
        ]
        test_cids = ['LJP007_MCF7_24H:TRT_POSCON:BRD-A61304759:10']
        mg3 = slice_gct.slice_gctoo(mg1, rid=test_rids, cid=test_cids)
        mg4 = parse_gctx.parse(gctx_path, rid=test_rids, cid=test_cids)
        assert_frame_equal(mg3.data_df, mg4.data_df)
        assert_frame_equal(mg3.row_metadata_df, mg4.row_metadata_df)
        assert_frame_equal(mg3.col_metadata_df, mg4.col_metadata_df)

        # first, make & write out temp version of mini_gctoo with int rids/cids
        new_mg = mini_gctoo_for_testing.make(convert_neg_666=False)
        int_indexed_data_df = new_mg.data_df.copy()
        int_indexed_data_df.index = range(0, 6)
        int_indexed_data_df.columns = range(10, 16)

        int_indexed_row_meta = new_mg.row_metadata_df.copy()
        int_indexed_row_meta.index = range(0, 6)

        int_indexed_col_meta = new_mg.col_metadata_df.copy()
        int_indexed_col_meta.index = range(10, 16)

        int_indexed_gctoo = GCToo.GCToo(data_df=int_indexed_data_df,
                                        row_metadata_df=int_indexed_row_meta,
                                        col_metadata_df=int_indexed_col_meta)

        int_gctx_path = "int_indexed_mini_gctoo.gctx"
        try:
            write_gctx.write(int_indexed_gctoo, int_gctx_path)

            # test with numeric (repr as string) rid/cid
            mg5 = GCToo.GCToo(data_df=int_indexed_data_df,
                              row_metadata_df=int_indexed_row_meta,
                              col_metadata_df=int_indexed_col_meta)
            mg5 = slice_gct.slice_gctoo(
                mg5,
                row_bool=[True, False, True, False, True, False],
                col_bool=[True, False, False, True, True, True])

            mg5.data_df.index.name = "rid"
            mg5.data_df.columns.name = "cid"

            mg5.row_metadata_df.index.name = "rid"
            mg5.row_metadata_df.columns.name = "rhd"

            mg5.col_metadata_df.index.name = "cid"
            mg5.col_metadata_df.columns.name = "chd"

            mg6 = parse_gctx.parse(int_gctx_path,
                                   rid=[0, 2, 4],
                                   cid=[10, 13, 14, 15],
                                   convert_neg_666=False)
        finally:
            # Remove the temporary file even when writing, slicing or parsing
            # fails, so a broken run does not pollute later runs.
            if os.path.exists(int_gctx_path):
                os.remove(int_gctx_path)

        assert_frame_equal(mg5.data_df, mg6.data_df)
        assert_frame_equal(mg5.row_metadata_df, mg6.row_metadata_df)
        assert_frame_equal(mg5.col_metadata_df, mg6.col_metadata_df)

        # test with ridx/cidx
        # cid is passed as a list, like rid and every other call in this test,
        # rather than as a bare string.
        mg7 = slice_gct.slice_gctoo(
            mg1,
            rid=['LJP007_MCF7_24H:CTL_VEHICLE:DMSO:-666'],
            cid=['LJP007_MCF7_24H:CTL_VEHICLE:DMSO:-666'])
        mg8 = parse_gctx.parse(gctx_path, ridx=[4], cidx=[4])

        assert_frame_equal(mg7.data_df, mg8.data_df)
        assert_frame_equal(mg7.row_metadata_df, mg8.row_metadata_df)
        assert_frame_equal(mg7.col_metadata_df, mg8.col_metadata_df)

        # test with rid/cidx
        mg9 = parse_gctx.parse(gctx_path,
                               rid=['LJP007_MCF7_24H:CTL_VEHICLE:DMSO:-666'],
                               cidx=[4])

        assert_frame_equal(mg7.data_df, mg9.data_df)
        assert_frame_equal(mg7.row_metadata_df, mg9.row_metadata_df)
        assert_frame_equal(mg7.col_metadata_df, mg9.col_metadata_df)

        # test with ridx/cid
        mg10 = parse_gctx.parse(gctx_path,
                                ridx=[4],
                                cid=['LJP007_MCF7_24H:CTL_VEHICLE:DMSO:-666'])

        assert_frame_equal(mg7.data_df, mg10.data_df)
        assert_frame_equal(mg7.row_metadata_df, mg10.row_metadata_df)
        assert_frame_equal(mg7.col_metadata_df, mg10.col_metadata_df)