예제 #1
0
    def test_build_mismatched_common_meta_report(self):
        """Check the mismatch report built from three overlapping metadata frames.

        The second frame differs from the first only in the rhd3 value of row
        r3 ("r3_33" instead of "r3_3"); the third frame holds the same values
        as the first, with rows and columns listed in a different order.
        No fields are removed when the combined frames are built, and the
        combined index is asserted to be non-unique before the report is made.
        The report is expected to have shape (3, 5), to contain "source_file"
        and "orig_rid" columns in addition to the original headers, and to
        reference only row r3.
        """
        row_ids = ["r1", "r2", "r3"]
        headers = ["rhd1", "rhd2", "rhd3"]

        base_meta = pd.DataFrame(
            data=[
                ["r1_1", "r1_2", "r1_3"],
                ["r2_1", "r2_2", "r2_3"],
                ["r3_1", "r3_2", "r3_3"],
            ],
            index=row_ids,
            columns=headers)
        # Same as base_meta except for the last cell of row r3.
        altered_meta = pd.DataFrame(
            data=[
                ["r1_1", "r1_2", "r1_3"],
                ["r2_1", "r2_2", "r2_3"],
                ["r3_1", "r3_2", "r3_33"],
            ],
            index=row_ids,
            columns=headers)
        # Same values as base_meta, but rows and columns are permuted.
        shuffled_meta = pd.DataFrame(
            data=[
                ["r3_1", "r3_3", "r3_2"],
                ["r1_1", "r1_3", "r1_2"],
                ["r2_1", "r2_3", "r2_2"],
            ],
            index=["r3", "r1", "r2"],
            columns=["rhd1", "rhd3", "rhd2"])

        logger.debug("meta1:\n{}".format(base_meta))
        logger.debug("meta2:\n{}".format(altered_meta))
        logger.debug("meta3:\n{}".format(shuffled_meta))

        source_names = ["my_src1", "my_src2", "my_src3"]
        frames = [base_meta, altered_meta, shuffled_meta]

        combined, combined_with_dups = cg.build_common_all_meta_df(
            frames, [], False)
        # Shapes are collected after the call above, as in the original flow.
        frame_shapes = [frame.shape for frame in frames]

        # Precondition: the mismatching r3 row must leave a duplicated index.
        self.assertFalse(combined.index.is_unique,
                         "during setup expected the index to not be unique")

        report = cg.build_mismatched_common_meta_report(
            frame_shapes, source_names, combined, combined_with_dups)
        logger.debug("r:\n{}".format(report))

        self.assertEqual((3, 5), report.shape)
        report_columns = report.columns
        self.assertIn("source_file", report_columns)
        self.assertIn("orig_rid", report_columns)
        # The original headers must be a proper subset of the report columns.
        self.assertTrue(set(base_meta.columns) < set(report_columns))
        self.assertEqual({"r3"}, set(report.orig_rid))
예제 #2
0
    def test_build_common_all_meta_df(self):
        """Exercise cg.build_common_all_meta_df over several input scenarios.

        Scenarios covered, in order:
          1. an explicitly listed header (rhd3) is removed;
          2. passing True as the third argument yields a frame with no columns
             while the row index is preserved;
          3. headers that are not shared by every input frame are dropped even
             when they are not listed for removal;
          4. frames with rows but no columns are handled;
          5. rows with repeated values but a unique index are all kept.

        Frame comparisons use the public ``pd.testing.assert_frame_equal``
        rather than the deprecated ``pd.util.testing`` location.
        """
        # rhd3 header needs to be removed
        meta1 = pd.DataFrame(
            [["r1_1", "r1_2", "r1_3"], ["r2_1", "r2_2", "r2_3"],
             ["r3_1", "r3_2", "r3_3"]],
            index=["r1", "r2", "r3"],
            columns=["rhd1", "rhd2", "rhd3"])
        # Identical to meta1 except for the rhd3 value of row r3.
        meta2 = pd.DataFrame(
            [["r1_1", "r1_2", "r1_3"], ["r2_1", "r2_2", "r2_3"],
             ["r3_1", "r3_2", "r3_33"]],
            index=["r1", "r2", "r3"],
            columns=["rhd1", "rhd2", "rhd3"])
        # Expected result once the mismatching rhd3 column has been removed.
        e_meta1 = pd.DataFrame(
            [["r1_1", "r1_2"], ["r2_1", "r2_2"], ["r3_1", "r3_2"]],
            index=["r1", "r2", "r3"],
            columns=["rhd1", "rhd2"])

        r_all, r_all_w_dups = cg.build_common_all_meta_df([meta1, meta2],
                                                          ["rhd3"], False)
        logger.debug(
            "rhd3 header needs to be removed - r_all:\n{}".format(r_all))
        logger.debug("r_all_w_dups:\n{}".format(r_all_w_dups))
        self.assertEqual((3, 2), r_all.shape)
        self.assertEqual((6, 2), r_all_w_dups.shape)
        pd.testing.assert_frame_equal(e_meta1, r_all)

        # Remove all metadata fields: only the row index should remain.
        r_all, r_all_w_dups = cg.build_common_all_meta_df([meta1, meta2], [],
                                                          True)
        logger.debug("remove all metadata fields - r_all\n{}".format(r_all))
        logger.debug("r_all_w_dups:\n{}".format(r_all_w_dups))
        self.assertEqual((3, 0), r_all.shape)
        self.assertTrue((e_meta1.index == r_all.index).all())

        meta4 = pd.DataFrame(
            [["r1_1", "r1_22", "r1_5"], ["r4_1", "r4_22", "r4_5"],
             ["r3_1", "r3_22", "r3_5"]],
            index=["r1", "r4", "r3"],
            columns=["rhd1", "rhd2", "rhd5"])
        e_meta3 = pd.DataFrame([["r1_1"], ["r2_1"], ["r3_1"], ["r4_1"]],
                               index=["r1", "r2", "r3", "r4"],
                               columns=["rhd1"])
        logger.debug("meta4:\n{}".format(meta4))
        logger.debug("e_meta3:\n{}".format(e_meta3))

        # rhd3 is absent from meta4 and rhd5 is absent from meta1, so neither
        # header is common to both frames; both should be dropped even though
        # only rhd2 is explicitly listed for removal.
        out_meta3, _ = cg.build_common_all_meta_df([meta1, meta4], ["rhd2"],
                                                   False)
        logger.debug(
            """rhd5 not in meta4 so it should be automatically dropped without being
        explictly listed in fields_to_remove - out_meta3:
        {}""".format(out_meta3))
        pd.testing.assert_frame_equal(out_meta3, e_meta3)

        # Empty metadata: frames that have an index but no columns.
        empty_meta = pd.DataFrame([], index=["a", "b", "c"])
        logger.debug("empty metadata provided - empty_meta.empty: {}".format(
            empty_meta.empty))
        out_meta4, _ = cg.build_common_all_meta_df([empty_meta, empty_meta],
                                                   [], False)
        logger.debug(
            "empty metadata provided - out_meta4:\n{}".format(out_meta4))
        pd.testing.assert_frame_equal(out_meta4, empty_meta)

        # Metadata rows repeat values (rows 0 and 1 are equal) but the index
        # itself is unique, so no row should be discarded.
        meta5 = pd.DataFrame({"rhd1": [0, 0, 1]}, index=range(3))
        meta6 = pd.DataFrame({"rhd1": [0, 0, 1]}, index=range(3))
        out_meta5, _ = cg.build_common_all_meta_df([meta5, meta6], [], False)
        logger.debug(
            "metadata has duplicates but index is unique - out_meta5:\n{}".
            format(out_meta5))
        self.assertEqual(
            (3, 1), out_meta5.shape,
            "metadata contains duplicates but index is unique - should have been kept"
        )