def test_get_ordered_idx(self):
    """Check parse_gctx.get_ordered_idx for id_type None, "id" and "idx".

    Case 1 expects every position (0..5) of the row metadata when no
    id_type is given, case 2 expects a single id to resolve to position 4
    of the column metadata, and case 3 expects explicit indexes to come
    back sorted.
    """
    mg = mini_gctoo_for_testing.make()

    # case 1: id_type == None
    case1 = parse_gctx.get_ordered_idx(None, [], mg.row_metadata_df)
    # Compare as lists: on Python 3 a range object never equals a list, so
    # comparing against a bare range() only works on Python 2.
    expected1 = list(range(0, 6))
    self.assertEqual(
        list(case1), expected1,
        "Expected oredered idx to be {} but got {}".format(expected1, case1))

    # case 2: id_type == "id"
    case2 = parse_gctx.get_ordered_idx(
        "id", ['LJP007_MCF7_24H:CTL_VEHICLE:DMSO:-666'], mg.col_metadata_df)
    self.assertEqual(
        case2, [4],
        "Expected oredered idx to be {} but got {}".format([4], case2))

    # case 3: id_type == "idx"
    case3 = parse_gctx.get_ordered_idx("idx", [5, 1, 3], mg.col_metadata_df)
    self.assertEqual(
        case3, [1, 3, 5],
        "Expected oredered idx to be {} but got {}".format([1, 3, 5], case3))
def test_parse_metadata_df(self):
    """Check parse_gctx.parse_metadata_df with and without -666 conversion.

    Row metadata is parsed with conversion enabled and compared against the
    in-memory fixture whose -666 values were replaced by NaN; column
    metadata is parsed with conversion disabled and compared against a
    fixture built with convert_neg_666=False.
    """
    mini_gctoo = mini_gctoo_for_testing.make()
    # convert row_metadata to np.nan
    mini_row_meta = mini_gctoo.row_metadata_df.replace(
        [-666, "-666", -666.0], [np.nan, np.nan, np.nan])

    # The context manager closes the HDF5 handle even when an assertion
    # fails, so the fixture file is never left open for later tests.
    with h5py.File("functional_tests/mini_gctoo_for_testing.gctx", "r") as gctx_file:
        row_dset = gctx_file[row_meta_group_node]
        col_dset = gctx_file[col_meta_group_node]

        # with convert_neg_666
        row_df = parse_gctx.parse_metadata_df("row", row_dset, True)
        assert_frame_equal(mini_row_meta, row_df)

        # no convert_neg_666
        mini_gctoo_with_neg_666 = mini_gctoo_for_testing.make(convert_neg_666=False)
        col_df = parse_gctx.parse_metadata_df("col", col_dset, False)
        assert_frame_equal(mini_gctoo_with_neg_666.col_metadata_df, col_df)
def test_write_src(self):
    """Check the src attribute stored by write_gctx.write.

    Case 1 clears src on the object and expects the attribute to equal the
    output file name; case 2 keeps the fixture's src and expects
    "mini_gctoo.gctx". Each output file is opened read-only and is removed
    even when a check fails.
    """
    # case 1: gctoo obj doesn't have src
    mini1 = mini_gctoo_for_testing.make()
    mini1.src = None
    no_src_path = "no_src_example.gctx"
    try:
        # Written without the extension; the test expects the file to
        # appear as "no_src_example.gctx".
        write_gctx.write(mini1, "no_src_example")
        with h5py.File(no_src_path, "r") as hdf5_file:
            hdf5_src1 = hdf5_file.attrs[write_gctx.src_attr]
        self.assertEqual(hdf5_src1, "no_src_example.gctx")
    finally:
        # Guarded so a failed write is not masked by a missing-file error.
        if os.path.exists(no_src_path):
            os.remove(no_src_path)

    # case 2: gctoo obj does have src
    mini2 = mini_gctoo_for_testing.make()
    with_src_path = "with_src_example.gctx"
    try:
        write_gctx.write(mini2, with_src_path)
        with h5py.File(with_src_path, "r") as hdf5_file:
            hdf5_src2 = hdf5_file.attrs[write_gctx.src_attr]
        self.assertEqual(hdf5_src2, "mini_gctoo.gctx")
    finally:
        if os.path.exists(with_src_path):
            os.remove(with_src_path)
def test_write_version(self):
    """Check the version attribute stored by write_gctx.write.

    In both cases (version cleared, version set to a custom string) the
    file is expected to carry write_gctx.version_number. Each output file
    is opened read-only and is removed even when a check fails.
    """
    # TODO @oana refactor this test so it just calls the write_version method
    # case 1: gctoo obj doesn't have version
    mini1 = mini_gctoo_for_testing.make()
    mini1.version = None
    fn = "no_version_provided_example.gctx"
    try:
        write_gctx.write(mini1, fn)
        with h5py.File(fn, "r") as hdf5_file:
            hdf5_v1 = hdf5_file.attrs[write_gctx.version_attr]
        self.assertEqual(hdf5_v1, write_gctx.version_number)
    finally:
        # Guarded so a failed write is not masked by a missing-file error.
        if os.path.exists(fn):
            os.remove(fn)

    # case 2: gctoo obj does have version, but it is not used when writing
    mini2 = mini_gctoo_for_testing.make()
    mini2.version = "MY_VERSION"
    fn = "with_version_provided_example.gctx"
    try:
        write_gctx.write(mini2, fn)
        with h5py.File(fn, "r") as hdf5_file:
            hdf5_v2 = hdf5_file.attrs[write_gctx.version_attr]
        self.assertEqual(hdf5_v2, write_gctx.version_number)
    finally:
        if os.path.exists(fn):
            os.remove(fn)
def test_set_metadata_index_and_column_names(self):
    """Check that set_metadata_index_and_column_names labels both axes.

    The index/columns names of the fixture's metadata frames are cleared
    first; afterwards "row" must yield rid/rhd and "col" must yield
    cid/chd.
    """
    gctoo = mini_gctoo_for_testing.make()
    row_meta = gctoo.row_metadata_df
    col_meta = gctoo.col_metadata_df

    # Start from unnamed axes so the assertions below prove the names were
    # set by the function under test.
    for meta_df in (row_meta, col_meta):
        meta_df.index.name = None
        meta_df.columns.name = None

    expectations = (
        # case 1: dim == "row"
        ("row", row_meta, "rid", "rhd"),
        # case 2: dim == "col"
        ("col", col_meta, "cid", "chd"),
    )
    for dim, meta_df, index_name, columns_name in expectations:
        parse_gctx.set_metadata_index_and_column_names(dim, meta_df)
        self.assertEqual(meta_df.index.name, index_name)
        self.assertEqual(meta_df.columns.name, columns_name)
def test_make_specified_size_gctoo(self):
    """Check random_slice.make_specified_size_gctoo on the mini fixture.

    Covers an invalid dim, a 3-row and a 3-column subset (checking the
    shapes of all three frames), and requests larger than the fixture,
    which must raise AssertionError with the expected message.
    """
    gctoo = mini_gctoo_for_testing.make()
    logger.debug("mini gctoo data_df shape: {}".format(gctoo.data_df.shape))
    logger.debug("mini gctoo row_meta shape: {}".format(gctoo.row_metadata_df.shape))
    logger.debug("mini gctoo col_meta shape: {}".format(gctoo.col_metadata_df.shape))

    too_many_msg = "number of samples must be subset of original file sample size"

    # case 1: dim isn't 'row' or 'col'
    with self.assertRaises(AssertionError) as raised:
        random_slice.make_specified_size_gctoo(gctoo, 3, "aaaalll")
    self.assertEqual(str(raised.exception),
                     "dim specified must be either 'row' or 'col'")

    # case 2: row subsetting - happy
    by_row = random_slice.make_specified_size_gctoo(gctoo, 3, "row")
    data_shape = by_row.data_df.shape
    row_meta_shape = by_row.row_metadata_df.shape
    col_meta_shape = by_row.col_metadata_df.shape
    self.assertEqual(
        data_shape, (3, 6),
        "data_df after row slice is incorrect shape: {} vs (3,6)".format(data_shape))
    self.assertEqual(
        row_meta_shape, (3, 5),
        "row_metadata_df after row slice is incorrect shape: {} vs (3,5)".format(row_meta_shape))
    self.assertEqual(
        col_meta_shape, (6, 5),
        "col_metadata_df after row slice is incorrect shape: {} vs (6,5)".format(col_meta_shape))

    # case 3: row subsetting - sample subset > og # of samples
    with self.assertRaises(AssertionError) as raised:
        random_slice.make_specified_size_gctoo(gctoo, 30, "row")
    self.assertEqual(str(raised.exception), too_many_msg)

    # case 4: col subsetting - happy
    by_col = random_slice.make_specified_size_gctoo(gctoo, 3, "col")
    data_shape = by_col.data_df.shape
    row_meta_shape = by_col.row_metadata_df.shape
    col_meta_shape = by_col.col_metadata_df.shape
    self.assertEqual(
        data_shape, (6, 3),
        "data_df after col slice is incorrect shape: {} vs (6,3)".format(data_shape))
    self.assertEqual(
        row_meta_shape, (6, 5),
        "row_metadata_df after col slice is incorrect shape: {} vs (6, 5)".format(row_meta_shape))
    self.assertEqual(
        col_meta_shape, (3, 5),
        "col_metadata_df after col slice is incorrect shape: {} vs (3,5)".format(col_meta_shape))

    # case 5: col subsetting - sample subset > og # of samples
    with self.assertRaises(AssertionError) as raised:
        random_slice.make_specified_size_gctoo(gctoo, 7, "col")
    self.assertEqual(str(raised.exception), too_many_msg)
import h5py import GCTXAttrInfo import parse_gctoox import write_gctoox import mini_gctoo_for_testing __author__ = "Oana Enache" __email__ = "*****@*****.**" FUNCTIONAL_TESTS_PATH = "functional_tests" # instantiate logger logger = logging.getLogger(setup_logger.LOGGER_NAME) # instance of mini_gctoo for testing mini_gctoo = mini_gctoo_for_testing.make() version_node = "version" rid_node = "/0/META/ROW/id" cid_node = "/0/META/COL/id" data_node = "/0/DATA/0/matrix" row_meta_group_node = "/0/META/ROW" col_meta_group_node = "/0/META/COL" class TestParseGCTooX(unittest.TestCase): def test_add_gctx_to_out_name(self): name1 = "my_cool_file" name2 = "my_other_cool_file.gctx" # case 1: out file name doesn't end in gctx
def test_write_metadata(self):
    """Round-trip row/column metadata through write_metadata and the parsers.

    CASE 1 writes metadata that contains -666 without conversion; CASE 2
    writes metadata whose -666 values were replaced by NaN and asks the
    writer to convert them back. Both cases read the file back with
    convert_neg_666=False and expect columns, index and per-column value
    sets to match the -666 fixture.
    """
    metadata_path = os.path.join(FUNCTIONAL_TESTS_PATH, "mini_gctoo_metadata.gctx")

    def round_trip(row_df, col_df, convert_back_to_neg_666):
        """Write both frames to metadata_path, parse them back, delete the file."""
        try:
            # Context manager closes the writer even if write_metadata raises.
            with h5py.File(metadata_path, "w") as hdf5_writer:
                write_gctx.write_metadata(hdf5_writer, "row", row_df,
                                          convert_back_to_neg_666)
                write_gctx.write_metadata(hdf5_writer, "col", col_df,
                                          convert_back_to_neg_666)
            logger.debug("Wrote mini_gctoo_metadata.gctx to {}".format(metadata_path))

            # read in written metadata
            written_col = parse_gctx.get_column_metadata(
                metadata_path, convert_neg_666=False)
            written_row = parse_gctx.get_row_metadata(
                metadata_path, convert_neg_666=False)
        finally:
            # Always remove the scratch file; guarded so a failed write is
            # not masked by a missing-file error.
            if os.path.exists(metadata_path):
                os.remove(metadata_path)
        return written_row, written_col

    def assert_metadata_matches(dim, expected_df, written_df, case_label):
        """Compare columns, index and per-column value sets of two frames."""
        self.assertTrue(
            set(expected_df.columns) == set(written_df.columns),
            "Mismatch between expected {} metadata columns {} and column values written to file: {}"
            .format(dim, expected_df.columns, written_df.columns))
        # Compare against the index actually read back from the file (the
        # previous checks compared the expected frames with each other).
        self.assertTrue(
            set(expected_df.index) == set(written_df.index),
            "Mismatch between expect {} metadata index {} and index values written to file: {}"
            .format(dim, expected_df.index, written_df.index))
        for c in list(expected_df.columns):
            expected_values = set(expected_df[c])
            written_values = set(written_df[c])
            logger.debug("{}: For column name: {}".format(case_label, c))
            logger.debug("{}: populated values: {}".format(case_label, written_values))
            logger.debug("{}: mini_gctoo values: {}".format(case_label, expected_values))
            self.assertTrue(
                expected_values == written_values,
                "Values in column {} differ between expected metadata and written {} metadata: {} vs {}"
                .format(c, dim, expected_values, written_values))

    # CASE 1:
    #   - write metadata (has '-666') to file, do not convert -666
    #   - parse in written metadata, don't convert -666
    mini_gctoo = mini_gctoo_for_testing.make(convert_neg_666=False)
    mini_gctoo_row_metadata, mini_gctoo_col_metadata = round_trip(
        mini_gctoo.row_metadata_df, mini_gctoo.col_metadata_df, False)

    assert_metadata_matches(
        "row", mini_gctoo.row_metadata_df, mini_gctoo_row_metadata, "C1")
    assert_metadata_matches(
        "col", mini_gctoo.col_metadata_df, mini_gctoo_col_metadata, "C1")

    # CASE 2:
    #   - write metadata (has NaN, not '-666') to file, do convert NaN back to '-666'
    #   - parse in written metadata, don't convert -666
    # first convert mini_gctoo's row & col metadata dfs -666s to NaN
    converted_row_metadata = mini_gctoo.row_metadata_df.replace(
        [-666, "-666", -666.0], [numpy.nan, numpy.nan, numpy.nan])
    logger.debug("First row of converted_row_metadata: {}".format(
        converted_row_metadata.iloc[0]))
    converted_col_metadata = mini_gctoo.col_metadata_df.replace(
        [-666, "-666", -666.0], [numpy.nan, numpy.nan, numpy.nan])

    # Note this time the writer does convert back to -666
    mini_gctoo_row_metadata, mini_gctoo_col_metadata = round_trip(
        converted_row_metadata, converted_col_metadata, True)

    assert_metadata_matches(
        "row", mini_gctoo.row_metadata_df, mini_gctoo_row_metadata, "C2")
    assert_metadata_matches(
        "col", mini_gctoo.col_metadata_df, mini_gctoo_col_metadata, "C2")
def test_parse(self):
    """Check parse_gctx.parse on whole files and on id/index subsets.

    Each parsed result is compared frame-by-frame (data, row metadata,
    column metadata) against either the in-memory fixture or a slice of it
    produced by slice_gct.slice_gctoo. Covers string ids, integer ids
    (via a temporary file), ridx/cidx, and mixed id/index selection.
    """
    gctx_path = "functional_tests/mini_gctoo_for_testing.gctx"
    int_indexed_path = "int_indexed_mini_gctoo.gctx"
    vehicle_id = 'LJP007_MCF7_24H:CTL_VEHICLE:DMSO:-666'

    def assert_gctoo_equal(expected, actual):
        """Assert that all three frames of two GCToo objects are equal."""
        assert_frame_equal(expected.data_df, actual.data_df)
        assert_frame_equal(expected.row_metadata_df, actual.row_metadata_df)
        assert_frame_equal(expected.col_metadata_df, actual.col_metadata_df)

    # parse whole thing
    mg1 = mini_gctoo_for_testing.make()
    mg2 = parse_gctx.parse(gctx_path)
    assert_gctoo_equal(mg1, mg2)

    # test with string rid/cid
    test_rids = [
        'LJP007_MCF10A_24H:TRT_CP:BRD-K93918653:3.33',
        vehicle_id,
    ]
    test_cids = ['LJP007_MCF7_24H:TRT_POSCON:BRD-A61304759:10']
    mg3 = slice_gct.slice_gctoo(mg1, rid=test_rids, cid=test_cids)
    mg4 = parse_gctx.parse(gctx_path, rid=test_rids, cid=test_cids)
    assert_gctoo_equal(mg3, mg4)

    # first, make & write out temp version of mini_gctoo with int rids/cids
    new_mg = mini_gctoo_for_testing.make(convert_neg_666=False)
    int_indexed_data_df = new_mg.data_df.copy()
    int_indexed_data_df.index = range(0, 6)
    int_indexed_data_df.columns = range(10, 16)
    int_indexed_row_meta = new_mg.row_metadata_df.copy()
    int_indexed_row_meta.index = range(0, 6)
    int_indexed_col_meta = new_mg.col_metadata_df.copy()
    int_indexed_col_meta.index = range(10, 16)
    int_indexed_gctoo = GCToo.GCToo(data_df=int_indexed_data_df,
                                    row_metadata_df=int_indexed_row_meta,
                                    col_metadata_df=int_indexed_col_meta)

    try:
        write_gctx.write(int_indexed_gctoo, int_indexed_path)

        # test with numeric (repr as string) rid/cid
        mg5 = GCToo.GCToo(data_df=int_indexed_data_df,
                          row_metadata_df=int_indexed_row_meta,
                          col_metadata_df=int_indexed_col_meta)
        mg5 = slice_gct.slice_gctoo(
            mg5,
            row_bool=[True, False, True, False, True, False],
            col_bool=[True, False, False, True, True, True])
        mg5.data_df.index.name = "rid"
        mg5.data_df.columns.name = "cid"
        mg5.row_metadata_df.index.name = "rid"
        mg5.row_metadata_df.columns.name = "rhd"
        mg5.col_metadata_df.index.name = "cid"
        mg5.col_metadata_df.columns.name = "chd"

        mg6 = parse_gctx.parse(int_indexed_path,
                               rid=[0, 2, 4],
                               cid=[10, 13, 14, 15],
                               convert_neg_666=False)
    finally:
        # Remove the temporary file even if writing or parsing fails;
        # guarded so a failed write is not masked by a missing-file error.
        if os.path.exists(int_indexed_path):
            os.remove(int_indexed_path)
    assert_gctoo_equal(mg5, mg6)

    # test with ridx/cidx
    # cid is passed as a one-element list, like rid, rather than as a bare
    # string.
    mg7 = slice_gct.slice_gctoo(mg1, rid=[vehicle_id], cid=[vehicle_id])
    mg8 = parse_gctx.parse(gctx_path, ridx=[4], cidx=[4])
    assert_gctoo_equal(mg7, mg8)

    # test with rid/cidx
    mg9 = parse_gctx.parse(gctx_path, rid=[vehicle_id], cidx=[4])
    assert_gctoo_equal(mg7, mg9)

    # test with ridx/cid
    mg10 = parse_gctx.parse(gctx_path, ridx=[4], cid=[vehicle_id])
    assert_gctoo_equal(mg7, mg10)