Beispiel #1
0
    def test_assemble_common_meta(self):
        """Check cg.assemble_common_meta on matching and mismatching metadata.

        Scenarios, in order:
          1. Two frames differ in a single "rhd3" value: the call must raise
             MismatchCommonMetadataConcatGctooException naming "r3" and
             write a tab-separated report to the given file. Listing "rhd3"
             as a field to remove makes the same inputs assemble cleanly.
          2. Same content with rows and columns in a different order: the
             result must equal the frame with sorted index and columns.
          3. Frames that share ids but disagree on their values: the call
             must raise, and the message must mention "r1".
        """
        # rhd3 header needs to be removed
        meta1 = pd.DataFrame(
            [["r1_1", "r1_2", "r1_3"], ["r2_1", "r2_2", "r2_3"],
             ["r3_1", "r3_2", "r3_3"]],
            index=["r1", "r2", "r3"],
            columns=["rhd1", "rhd2", "rhd3"])
        meta2 = pd.DataFrame(
            [["r1_1", "r1_2", "r1_3"], ["r2_1", "r2_2", "r2_3"],
             ["r3_1", "r3_2", "r3_33"]],
            index=["r1", "r2", "r3"],
            columns=["rhd1", "rhd2", "rhd3"])
        e_meta1 = pd.DataFrame(
            [["r1_1", "r1_2"], ["r2_1", "r2_2"], ["r3_1", "r3_2"]],
            index=["r1", "r2", "r3"],
            columns=["rhd1", "rhd2"])

        logger.debug("meta1:\n{}".format(meta1))
        logger.debug("meta2:\n{}".format(meta2))
        logger.debug("e_meta:\n{}".format(e_meta1))

        # mkstemp reserves the path for us; NamedTemporaryFile().name would
        # delete the file as soon as the unreferenced object is collected,
        # leaving only a name that another process could reuse.
        report_fd, error_report_file = tempfile.mkstemp()
        os.close(report_fd)
        try:
            logger.debug(
                "rhd3 header needs to be removed - error_report_file:  {}".format(
                    error_report_file))
            with self.assertRaises(
                    cg.MismatchCommonMetadataConcatGctooException) as e:
                cg.assemble_common_meta(
                    [meta1, meta2], [], ["my_src1", "my_src2"], False,
                    error_report_file)
            self.assertIn("r3", str(e.exception))
            logger.debug(
                "rhd3 header needs to be removed - e.exception:  {}".format(
                    e.exception))

            report_df = pd.read_csv(error_report_file, sep="\t")
            self.assertGreater(report_df.shape[0], 0)
            self.assertGreater(report_df.shape[1], 0)
            self.assertIn("source_file", report_df.columns)
            self.assertIn("orig_rid", report_df.columns)
            # The report carries every metadata column plus extra ones,
            # hence the strict-subset comparison.
            self.assertTrue(set(meta1.columns) < set(report_df.columns))
        finally:
            # Clean up even when one of the assertions above fails.
            os.remove(error_report_file)

        out_meta1 = cg.assemble_common_meta([meta1, meta2], ["rhd3"], None,
                                            False, None)
        logger.debug("out_meta1:\n{}".format(out_meta1))
        # pd.testing is the supported home of assert_frame_equal;
        # pd.util.testing was deprecated and later removed from pandas.
        pd.testing.assert_frame_equal(out_meta1, e_meta1)

        # Order of indices and columns are different
        meta3 = pd.DataFrame(
            [["r3_1", "r3_3", "r3_2"], ["r1_1", "r1_3", "r1_2"],
             ["r2_1", "r2_3", "r2_2"]],
            index=["r3", "r1", "r2"],
            columns=["rhd1", "rhd3", "rhd2"])
        e_meta2 = pd.DataFrame(
            [["r1_1", "r1_2", "r1_3"], ["r2_1", "r2_2", "r2_3"],
             ["r3_1", "r3_2", "r3_3"]],
            index=["r1", "r2", "r3"],
            columns=["rhd1", "rhd2", "rhd3"])

        logger.debug("meta3:\n{}".format(meta3))
        logger.debug("e_meta2:\n{}".format(e_meta2))
        out_meta2 = cg.assemble_common_meta([meta1, meta3], [], None, False,
                                            None)
        pd.testing.assert_frame_equal(out_meta2, e_meta2)

        # Some ids not present in both dfs
        meta4 = pd.DataFrame(
            [["r1_1", "r1_22", "r1_5"], ["r4_1", "r4_22", "r4_5"],
             ["r3_1", "r3_22", "r3_5"]],
            index=["r1", "r4", "r3"],
            columns=["rhd1", "rhd2", "rhd5"])
        logger.debug("meta1:\n{}".format(meta1))
        logger.debug("meta4:\n{}".format(meta4))

        # No report file is requested here; only the exception is checked.
        with self.assertRaises(
                cg.MismatchCommonMetadataConcatGctooException) as e:
            cg.assemble_common_meta([meta1, meta4], [], ["my_src1", "my_src4"],
                                    False, None)
        self.assertIn("r1", str(e.exception))
Beispiel #2
0
    def test_assemble_common_meta(self):
        """Check cg.assemble_common_meta on matching and mismatching metadata.

        Scenarios, in order:
          1. Two frames differ in a single "rhd3" value: the call must raise
             AssertionError mentioning "r3"; listing "rhd3" as a field to
             remove yields the two remaining columns.
          2. Same content with rows and columns in a different order: the
             result must equal the frame with sorted index and columns.
          3. Frames whose ids only partly overlap and whose "rhd2" values
             conflict: the call must raise mentioning "r1"; removing "rhd2"
             yields all four ids with only the "rhd1" column.
          4. Frames with an index but no columns: the empty frame comes back.
          5. Repeated values under a unique index: the shape stays (3, 1).

        NOTE(review): copies of the fixtures are passed in most calls,
        presumably because assemble_common_meta may modify its inputs --
        confirm against the implementation.
        """
        # rhd3 header needs to be removed
        meta1 = pd.DataFrame(
            [["r1_1", "r1_2", "r1_3"], ["r2_1", "r2_2", "r2_3"],
             ["r3_1", "r3_2", "r3_3"]],
            index=["r1", "r2", "r3"],
            columns=["rhd1", "rhd2", "rhd3"])
        meta2 = pd.DataFrame(
            [["r1_1", "r1_2", "r1_3"], ["r2_1", "r2_2", "r2_3"],
             ["r3_1", "r3_2", "r3_33"]],
            index=["r1", "r2", "r3"],
            columns=["rhd1", "rhd2", "rhd3"])
        e_meta1 = pd.DataFrame(
            [["r1_1", "r1_2"], ["r2_1", "r2_2"], ["r3_1", "r3_2"]],
            index=["r1", "r2", "r3"],
            columns=["rhd1", "rhd2"])

        logger.debug("meta1:\n{}".format(meta1))
        logger.debug("meta2:\n{}".format(meta2))
        logger.debug("e_meta:\n{}".format(e_meta1))

        with self.assertRaises(AssertionError) as e:
            cg.assemble_common_meta([meta1.copy(), meta2.copy()], [])
        self.assertIn("r3", str(e.exception))
        logger.debug(
            "rhd3 header needs to be removed - e.exception:  {}".format(
                e.exception))

        out_meta1 = cg.assemble_common_meta(
            [meta1.copy(), meta2.copy()], ["rhd3"])
        logger.debug("out_meta1:\n{}".format(out_meta1))
        # pd.testing is the supported home of assert_frame_equal;
        # pd.util.testing was deprecated and later removed from pandas.
        pd.testing.assert_frame_equal(out_meta1, e_meta1)

        # Order of indices and columns are different
        meta3 = pd.DataFrame(
            [["r3_1", "r3_3", "r3_2"], ["r1_1", "r1_3", "r1_2"],
             ["r2_1", "r2_3", "r2_2"]],
            index=["r3", "r1", "r2"],
            columns=["rhd1", "rhd3", "rhd2"])
        e_meta2 = pd.DataFrame(
            [["r1_1", "r1_2", "r1_3"], ["r2_1", "r2_2", "r2_3"],
             ["r3_1", "r3_2", "r3_3"]],
            index=["r1", "r2", "r3"],
            columns=["rhd1", "rhd2", "rhd3"])

        logger.debug("meta3:\n{}".format(meta3))
        logger.debug("e_meta2:\n{}".format(e_meta2))
        out_meta2 = cg.assemble_common_meta([meta1.copy(), meta3.copy()], [])
        pd.testing.assert_frame_equal(out_meta2, e_meta2)

        # Some ids not present in both dfs
        meta4 = pd.DataFrame(
            [["r1_1", "r1_22", "r1_5"], ["r4_1", "r4_22", "r4_5"],
             ["r3_1", "r3_22", "r3_5"]],
            index=["r1", "r4", "r3"],
            columns=["rhd1", "rhd2", "rhd5"])
        e_meta3 = pd.DataFrame([["r1_1"], ["r2_1"], ["r3_1"], ["r4_1"]],
                               index=["r1", "r2", "r3", "r4"],
                               columns=["rhd1"])

        logger.debug("meta1:\n{}".format(meta1))
        logger.debug("meta4:\n{}".format(meta4))
        logger.debug("e_meta3:\n{}".format(e_meta3))

        with self.assertRaises(AssertionError) as e:
            cg.assemble_common_meta([meta1.copy(), meta4.copy()], [])
        self.assertIn("r1", str(e.exception))

        out_meta3 = cg.assemble_common_meta(
            [meta1.copy(), meta4.copy()], ["rhd2"])
        logger.debug("out_meta3:\n{}".format(out_meta3))
        pd.testing.assert_frame_equal(out_meta3, e_meta3)

        # Empty metadata
        empty_meta = pd.DataFrame([], index=["a", "b", "c"])
        logger.debug("empty_meta.empty: {}".format(empty_meta.empty))
        out_meta4 = cg.assemble_common_meta([empty_meta, empty_meta], [])
        pd.testing.assert_frame_equal(out_meta4, empty_meta)

        # Metadata has duplicate values but the index is unique
        meta5 = pd.DataFrame({"rhd1": [0, 0, 1]}, index=range(3))
        meta6 = pd.DataFrame({"rhd1": [0, 0, 1]}, index=range(3))
        out_meta5 = cg.assemble_common_meta([meta5, meta6], [])
        self.assertEqual((3, 1), out_meta5.shape)