def test_parse_data_df(self):
    """Check parse_gctx.parse_data_df with and without row/column subsetting.

    A small expected frame (3 rows x 2 columns, float32, index named "rid"
    and columns named "cid") is built by hand and compared against what
    parse_data_df returns for the data node of the mini GCTX fixture, for
    four row/column index selections.
    """
    gctx_path = "cmapPy/pandasGEXpress/tests/functional_tests//mini_gctx_with_metadata_n2x3.gctx"

    mini_data_df = pd.DataFrame(
        [[-0.283359, 0.011270], [0.304119, 1.921061], [0.398655, -0.144652]],
        index=["200814_at", "218597_s_at", "217140_s_at"],
        columns=["LJP005_A375_24H:DMSO:-666", "LJP005_A375_24H:BRD-K76908866:10"])
    mini_data_df = mini_data_df.astype(np.float32)
    mini_data_df.index.name = "rid"
    mini_data_df.columns.name = "cid"

    # note: checks to 3 decimal places
    # NOTE(review): check_less_precise is deprecated in newer pandas releases
    # (superseded by rtol/atol) -- confirm against the pinned pandas version.
    compare_kwargs = dict(check_exact=False, check_less_precise=True)

    # Open the fixture in a context manager so the handle is released even
    # when one of the assertions below fails.
    with h5py.File(gctx_path, "r") as mini_gctx:
        data_dset = mini_gctx[data_node]

        # get relevant metadata fields
        col_meta = parse_gctx.get_column_metadata(gctx_path)
        row_meta = parse_gctx.get_row_metadata(gctx_path)

        # case 1: no subsetting
        data_df1 = parse_gctx.parse_data_df(data_dset, [0, 1, 2], [0, 1], row_meta, col_meta)
        pandas_testing.assert_frame_equal(mini_data_df, data_df1, **compare_kwargs)

        # case 2: subset; ridx < cidx
        data_df2 = parse_gctx.parse_data_df(data_dset, [0], [0, 1], row_meta, col_meta)
        pandas_testing.assert_frame_equal(mini_data_df.iloc[[0], [0, 1]], data_df2, **compare_kwargs)

        # case 3: subset; ridx == cidx
        data_df3 = parse_gctx.parse_data_df(data_dset, [0], [0], row_meta, col_meta)
        pandas_testing.assert_frame_equal(mini_data_df.iloc[[0], [0]], data_df3, **compare_kwargs)

        # case 4: subset; ridx > cidx
        data_df4 = parse_gctx.parse_data_df(data_dset, [0, 1, 2], [0], row_meta, col_meta)
        pandas_testing.assert_frame_equal(mini_data_df.iloc[[0, 1, 2], [0]], data_df4, **compare_kwargs)
def test_write_metadata(self):
    """Round-trip row/col metadata through write_metadata and the parsers.

    CASE 1:
        - write metadata (has '-666') to file, do not convert -666
        - parse in written metadata, don't convert -666
    CASE 2:
        - write metadata (has NaN, not '-666') to file, do convert NaN back to '-666'
        - parse in written metadata, don't convert -666

    In both cases the metadata read back from the file is compared (column
    names, index values, and per-column value sets) against the metadata of
    the mini_gctoo built with convert_neg_666=False.
    """
    gctx_path = FUNCTIONAL_TESTS_PATH + "/mini_gctoo_metadata.gctx"
    mini_gctoo = mini_gctoo_for_testing.make(convert_neg_666=False)

    def _round_trip(row_df, col_df, convert_back_to_neg_666):
        # Write both metadata frames to a scratch file, read them back, and
        # always remove the scratch file, even if writing or parsing fails.
        try:
            with h5py.File(gctx_path, "w") as hdf5_writer:
                write_gctx.write_metadata(hdf5_writer, "row", row_df, convert_back_to_neg_666, 6)
                write_gctx.write_metadata(hdf5_writer, "col", col_df, convert_back_to_neg_666, 6)
            logger.debug("Wrote mini_gctoo_metadata.gctx to {}".format(
                os.path.join(FUNCTIONAL_TESTS_PATH, "mini_gctoo_metadata.gctx")))

            parsed_col = parse_gctx.get_column_metadata(gctx_path, convert_neg_666=False)
            parsed_row = parse_gctx.get_row_metadata(gctx_path, convert_neg_666=False)
        finally:
            # Guard the removal so a failure to create the file is not
            # masked by a secondary "file not found" error.
            if os.path.exists(gctx_path):
                os.remove(gctx_path)
        return parsed_row, parsed_col

    def _check_metadata(case, dim, expected_df, parsed_df):
        # Compare expected vs. parsed metadata: column names, index values,
        # then the set of values in every column.
        self.assertTrue(
            set(expected_df.columns) == set(parsed_df.columns),
            "Mismatch between expected {} metadata columns {} and column values written to file: {}".format(
                dim, expected_df.columns, parsed_df.columns))
        # The index check must use the frame parsed from the file; comparing
        # the expected frames with each other would never exercise the writer.
        self.assertTrue(
            set(expected_df.index) == set(parsed_df.index),
            "Mismatch between expected {} metadata index {} and index values written to file: {}".format(
                dim, expected_df.index, parsed_df.index))
        for c in list(expected_df.columns):
            expected_values = set(expected_df[c])
            parsed_values = set(parsed_df[c])
            logger.debug("{}: For column name: {}".format(case, c))
            logger.debug("{}: populated values: {}".format(case, parsed_values))
            logger.debug("{}: mini_gctoo values: {}".format(case, expected_values))
            self.assertTrue(
                expected_values == parsed_values,
                "Values in column {} differ between expected metadata and written {} metadata: {} vs {}".format(
                    c, dim, expected_values, parsed_values))

    # CASE 1: metadata containing -666 is written as-is.
    parsed_row, parsed_col = _round_trip(
        mini_gctoo.row_metadata_df, mini_gctoo.col_metadata_df, False)
    _check_metadata("C1", "row", mini_gctoo.row_metadata_df, parsed_row)
    _check_metadata("C1", "col", mini_gctoo.col_metadata_df, parsed_col)

    # CASE 2: first convert mini_gctoo's row & col metadata dfs -666s to NaN,
    # then write with conversion back to -666 enabled.
    converted_row_metadata = mini_gctoo.row_metadata_df.replace(
        [-666, "-666", -666.0], [numpy.nan, numpy.nan, numpy.nan])
    logger.debug("First row of converted_row_metadata: {}".format(converted_row_metadata.iloc[0]))
    converted_col_metadata = mini_gctoo.col_metadata_df.replace(
        [-666, "-666", -666.0], [numpy.nan, numpy.nan, numpy.nan])

    parsed_row, parsed_col = _round_trip(converted_row_metadata, converted_col_metadata, True)
    # Expected values are still the unconverted (-666) metadata, because the
    # writer was asked to convert NaN back.
    _check_metadata("C2", "row", mini_gctoo.row_metadata_df, parsed_row)
    _check_metadata("C2", "col", mini_gctoo.col_metadata_df, parsed_col)