Exemple #1
0
    def test_parse_data_df(self):
        """Check parse_gctx.parse_data_df against a hand-built expected frame.

        The data node of the mini GCTX fixture is read with four different
        row/column index selections (full matrix, fewer rows than columns,
        one row and one column, more rows than columns) and each result is
        compared with the matching ``iloc`` slice of the expected frame.

        The fixture path is relative, so the test has to be run from the
        directory that contains the ``cmapPy`` package.
        """
        gctx_path = "cmapPy/pandasGEXpress/tests/functional_tests//mini_gctx_with_metadata_n2x3.gctx"

        # expected contents of the fixture's data matrix (3 rows x 2 columns)
        mini_data_df = pd.DataFrame(
            [[-0.283359, 0.011270], [0.304119, 1.921061],
             [0.398655, -0.144652]],
            index=["200814_at", "218597_s_at", "217140_s_at"],
            columns=[
                "LJP005_A375_24H:DMSO:-666", "LJP005_A375_24H:BRD-K76908866:10"
            ])
        mini_data_df = mini_data_df.astype(np.float32)
        mini_data_df.index.name = "rid"
        mini_data_df.columns.name = "cid"

        # get relevant metadata fields
        col_meta = parse_gctx.get_column_metadata(gctx_path)
        row_meta = parse_gctx.get_row_metadata(gctx_path)

        # Open the fixture in a context manager so the HDF5 handle is released
        # even when one of the assertions below fails.
        with h5py.File(gctx_path, "r") as mini_gctx:
            data_dset = mini_gctx[data_node]

            # NOTE(review): check_less_precise is deprecated in newer pandas
            # releases in favour of rtol/atol -- confirm against the pandas
            # version this suite targets before switching.

            # case 1: no subsetting
            data_df1 = parse_gctx.parse_data_df(data_dset, [0, 1, 2], [0, 1],
                                                row_meta, col_meta)
            # note: checks to 3 decimal places
            pandas_testing.assert_frame_equal(mini_data_df,
                                              data_df1,
                                              check_exact=False,
                                              check_less_precise=True)

            # case 2: subset; ridx < cidx
            data_df2 = parse_gctx.parse_data_df(data_dset, [0], [0, 1],
                                                row_meta, col_meta)
            pandas_testing.assert_frame_equal(mini_data_df.iloc[[0], [0, 1]],
                                              data_df2,
                                              check_exact=False,
                                              check_less_precise=True)

            # case 3: subset; ridx == cidx
            data_df3 = parse_gctx.parse_data_df(data_dset, [0], [0],
                                                row_meta, col_meta)
            pandas_testing.assert_frame_equal(mini_data_df.iloc[[0], [0]],
                                              data_df3,
                                              check_exact=False,
                                              check_less_precise=True)

            # case 4: subset; ridx > cidx
            data_df4 = parse_gctx.parse_data_df(data_dset, [0, 1, 2], [0],
                                                row_meta, col_meta)
            pandas_testing.assert_frame_equal(mini_data_df.iloc[[0, 1, 2], [0]],
                                              data_df4,
                                              check_exact=False,
                                              check_less_precise=True)
Exemple #2
0
    def test_write_metadata(self):
        """Round-trip row and column metadata through write_gctx.write_metadata.

        CASE 1:
            - write metadata (has '-666') to file, do not convert -666
            - parse in written metadata, don't convert -666
        CASE 2:
            - write metadata (has NaN, not '-666') to file, do convert NaN back to '-666'
            - parse in written metadata, don't convert -666

        In both cases the metadata parsed back from the file is compared with
        the original mini_gctoo metadata: same column names, same index
        values, and the same set of values in every column.
        """
        mini_gctoo = mini_gctoo_for_testing.make(convert_neg_666=False)
        metadata_path = os.path.join(FUNCTIONAL_TESTS_PATH, "mini_gctoo_metadata.gctx")

        # ---- CASE 1 ----
        # The try/finally guarantees the scratch file is removed even if
        # writing or parsing raises; the with-block closes the HDF5 handle.
        try:
            with h5py.File(metadata_path, "w") as hdf5_writer:
                # 4th argument False: leave values as they are (no conversion
                # back to -666). NOTE(review): the trailing 6 is presumably a
                # compression level -- confirm against write_metadata.
                write_gctx.write_metadata(hdf5_writer, "row", mini_gctoo.row_metadata_df, False, 6)
                write_gctx.write_metadata(hdf5_writer, "col", mini_gctoo.col_metadata_df, False, 6)
            logger.debug("Wrote mini_gctoo_metadata.gctx to {}".format(metadata_path))

            # read in written metadata
            mini_gctoo_col_metadata = parse_gctx.get_column_metadata(metadata_path,
                                                                     convert_neg_666=False)
            mini_gctoo_row_metadata = parse_gctx.get_row_metadata(metadata_path,
                                                                  convert_neg_666=False)
        finally:
            if os.path.exists(metadata_path):
                os.remove(metadata_path)

        # check row metadata
        self.assertEqual(set(mini_gctoo.row_metadata_df.columns), set(mini_gctoo_row_metadata.columns),
                         "Mismatch between expected row metadata columns {} and column values written to file: {}".format(
                             mini_gctoo.row_metadata_df.columns, mini_gctoo_row_metadata.columns))
        # compare against the index read back from file (not the expected
        # column index, which made this check meaningless)
        self.assertEqual(set(mini_gctoo.row_metadata_df.index), set(mini_gctoo_row_metadata.index),
                         "Mismatch between expect row metadata index {} and index values written to file: {}".format(
                             mini_gctoo.row_metadata_df.index, mini_gctoo_row_metadata.index))
        for c in list(mini_gctoo.row_metadata_df.columns):
            logger.debug("C1: For column name: {}".format(c))
            logger.debug("C1: populated values: {}".format(set(mini_gctoo_row_metadata[c])))
            logger.debug("C1: mini_gctoo values: {}".format(set(mini_gctoo.row_metadata_df[c])))
            self.assertEqual(set(mini_gctoo.row_metadata_df[c]), set(mini_gctoo_row_metadata[c]),
                             "Values in column {} differ between expected metadata and written row metadata: {} vs {}".format(
                                 c, set(mini_gctoo.row_metadata_df[c]), set(mini_gctoo_row_metadata[c])))

        # check col metadata
        self.assertEqual(set(mini_gctoo.col_metadata_df.columns), set(mini_gctoo_col_metadata.columns),
                         "Mismatch between expected col metadata columns {} and column values written to file: {}".format(
                             mini_gctoo.col_metadata_df.columns, mini_gctoo_col_metadata.columns))
        # compare against the index read back from file (previously the
        # expected index was compared with itself)
        self.assertEqual(set(mini_gctoo.col_metadata_df.index), set(mini_gctoo_col_metadata.index),
                         "Mismatch between expect col metadata index {} and index values written to file: {}".format(
                             mini_gctoo.col_metadata_df.index, mini_gctoo_col_metadata.index))
        for c in list(mini_gctoo.col_metadata_df.columns):
            self.assertEqual(set(mini_gctoo.col_metadata_df[c]), set(mini_gctoo_col_metadata[c]),
                             "Values in column {} differ between expected metadata and written col metadata!".format(c))

        # ---- CASE 2 ----
        # first convert mini_gctoo's row & col metadata dfs -666s to NaN
        converted_row_metadata = mini_gctoo.row_metadata_df.replace([-666, "-666", -666.0],
                                                                    [numpy.nan, numpy.nan, numpy.nan])
        logger.debug("First row of converted_row_metadata: {}".format(converted_row_metadata.iloc[0]))
        converted_col_metadata = mini_gctoo.col_metadata_df.replace([-666, "-666", -666.0],
                                                                    [numpy.nan, numpy.nan, numpy.nan])

        try:
            # write the NaN-converted row and col metadata to file
            # Note this time does convert back to -666
            with h5py.File(metadata_path, "w") as hdf5_writer:
                write_gctx.write_metadata(hdf5_writer, "row", converted_row_metadata, True, 6)
                write_gctx.write_metadata(hdf5_writer, "col", converted_col_metadata, True, 6)

            # read in written metadata
            mini_gctoo_col_metadata = parse_gctx.get_column_metadata(metadata_path,
                                                                     convert_neg_666=False)
            mini_gctoo_row_metadata = parse_gctx.get_row_metadata(metadata_path,
                                                                  convert_neg_666=False)
        finally:
            if os.path.exists(metadata_path):
                os.remove(metadata_path)

        # check row metadata; expectations are the original (-666) frames,
        # because the writer was asked to convert NaN back to -666
        self.assertEqual(set(mini_gctoo.row_metadata_df.columns), set(mini_gctoo_row_metadata.columns),
                         "Mismatch between expected row metadata columns {} and column values written to file: {}".format(
                             mini_gctoo.row_metadata_df.columns, mini_gctoo_row_metadata.columns))
        self.assertEqual(set(mini_gctoo.row_metadata_df.index), set(mini_gctoo_row_metadata.index),
                         "Mismatch between expect row metadata index {} and index values written to file: {}".format(
                             mini_gctoo.row_metadata_df.index, mini_gctoo_row_metadata.index))
        for c in list(mini_gctoo.row_metadata_df.columns):
            logger.debug("C2: For column name: {}".format(c))
            logger.debug("C2: populated values: {}".format(set(mini_gctoo_row_metadata[c])))
            logger.debug("C2: mini_gctoo values: {}".format(set(mini_gctoo.row_metadata_df[c])))
            self.assertEqual(set(mini_gctoo.row_metadata_df[c]), set(mini_gctoo_row_metadata[c]),
                             "Values in column {} differ between expected metadata and written row metadata!".format(c))

        # check col metadata
        self.assertEqual(set(mini_gctoo.col_metadata_df.columns), set(mini_gctoo_col_metadata.columns),
                         "Mismatch between expected col metadata columns {} and column values written to file: {}".format(
                             mini_gctoo.col_metadata_df.columns, mini_gctoo_col_metadata.columns))
        self.assertEqual(set(mini_gctoo.col_metadata_df.index), set(mini_gctoo_col_metadata.index),
                         "Mismatch between expect col metadata index {} and index values written to file: {}".format(
                             mini_gctoo.col_metadata_df.index, mini_gctoo_col_metadata.index))
        for c in list(mini_gctoo.col_metadata_df.columns):
            self.assertEqual(set(mini_gctoo.col_metadata_df[c]), set(mini_gctoo_col_metadata[c]),
                             "Values in column {} differ between expected metadata and written col metadata!".format(c))