def test_assemble_common_meta(self):
    """Check cg.assemble_common_meta on agreeing and disagreeing metadata.

    Scenarios exercised, in order:

    1. Two frames that differ only in column ``rhd3`` of row ``r3``.
       With nothing listed for removal the call is expected to raise
       ``cg.MismatchCommonMetadataConcatGctooException`` mentioning ``r3``
       and to leave a tab-separated report at the supplied path; with
       ``["rhd3"]`` listed for removal the result must equal the frames
       without that column.
    2. Two frames with identical content but shuffled rows and columns;
       the result must come back in the order of ``e_meta2``.
    3. Two frames whose row ids and values only partly agree; the call is
       expected to raise the same exception, mentioning ``r1``.
    """
    # Function-scope import: only the cleanup step below needs it.
    import shutil

    # rhd3 header needs to be removed
    meta1 = pd.DataFrame(
        [["r1_1", "r1_2", "r1_3"],
         ["r2_1", "r2_2", "r2_3"],
         ["r3_1", "r3_2", "r3_3"]],
        index=["r1", "r2", "r3"],
        columns=["rhd1", "rhd2", "rhd3"])
    meta2 = pd.DataFrame(
        [["r1_1", "r1_2", "r1_3"],
         ["r2_1", "r2_2", "r2_3"],
         ["r3_1", "r3_2", "r3_33"]],
        index=["r1", "r2", "r3"],
        columns=["rhd1", "rhd2", "rhd3"])
    e_meta1 = pd.DataFrame(
        [["r1_1", "r1_2"],
         ["r2_1", "r2_2"],
         ["r3_1", "r3_2"]],
        index=["r1", "r2", "r3"],
        columns=["rhd1", "rhd2"])

    logger.debug("meta1:\n{}".format(meta1))
    logger.debug("meta2:\n{}".format(meta2))
    logger.debug("e_meta:\n{}".format(e_meta1))

    # Put the report inside a freshly created private directory. Taking
    # the .name of a discarded NamedTemporaryFile only yields a path that
    # is no longer reserved, so another process could claim it first.
    report_dir = tempfile.mkdtemp()
    error_report_file = os.path.join(report_dir, "error_report.txt")
    logger.debug(
        "rhd3 header needs to be removed - error_report_file: {}".format(
            error_report_file))

    try:
        with self.assertRaises(
                cg.MismatchCommonMetadataConcatGctooException) as e:
            cg.assemble_common_meta([meta1, meta2], [],
                                    ["my_src1", "my_src2"], False,
                                    error_report_file)
        # e.exception is only available once the context has exited.
        self.assertIn("r3", str(e.exception))
        logger.debug(
            "rhd3 header needs to be removed - e.exception: {}".format(
                e.exception))

        report_df = pd.read_csv(error_report_file, sep="\t")
        self.assertGreater(report_df.shape[0], 0)
        self.assertGreater(report_df.shape[1], 0)
        self.assertIn("source_file", report_df.columns)
        self.assertIn("orig_rid", report_df.columns)
        # The report must carry every metadata column plus extra ones.
        self.assertTrue(set(meta1.columns) < set(report_df.columns))
    finally:
        # Clean up even when one of the assertions above fails.
        shutil.rmtree(report_dir, ignore_errors=True)

    out_meta1 = cg.assemble_common_meta([meta1, meta2], ["rhd3"], None,
                                        False, None)
    logger.debug("out_meta1:\n{}".format(out_meta1))
    pd.testing.assert_frame_equal(out_meta1, e_meta1)

    # Order of indices and columns are different
    meta3 = pd.DataFrame(
        [["r3_1", "r3_3", "r3_2"],
         ["r1_1", "r1_3", "r1_2"],
         ["r2_1", "r2_3", "r2_2"]],
        index=["r3", "r1", "r2"],
        columns=["rhd1", "rhd3", "rhd2"])
    e_meta2 = pd.DataFrame(
        [["r1_1", "r1_2", "r1_3"],
         ["r2_1", "r2_2", "r2_3"],
         ["r3_1", "r3_2", "r3_3"]],
        index=["r1", "r2", "r3"],
        columns=["rhd1", "rhd2", "rhd3"])

    logger.debug("meta3:\n{}".format(meta3))
    logger.debug("e_meta2:\n{}".format(e_meta2))

    out_meta2 = cg.assemble_common_meta([meta1, meta3], [], None, False,
                                        None)
    pd.testing.assert_frame_equal(out_meta2, e_meta2)

    # Some ids not present in both dfs
    meta4 = pd.DataFrame(
        [["r1_1", "r1_22", "r1_5"],
         ["r4_1", "r4_22", "r4_5"],
         ["r3_1", "r3_22", "r3_5"]],
        index=["r1", "r4", "r3"],
        columns=["rhd1", "rhd2", "rhd5"])

    logger.debug("meta1:\n{}".format(meta1))
    logger.debug("meta4:\n{}".format(meta4))

    with self.assertRaises(
            cg.MismatchCommonMetadataConcatGctooException) as e:
        cg.assemble_common_meta([meta1, meta4], [],
                                ["my_src1", "my_src4"], False, None)
    self.assertIn("r1", str(e.exception))
def test_assemble_common_meta(self):
    """Check cg.assemble_common_meta on agreeing and disagreeing metadata.

    Scenarios exercised, in order:

    1. Two frames that differ only in column ``rhd3`` of row ``r3``.
       With an empty removal list the call is expected to raise
       ``AssertionError`` mentioning ``r3``; with ``["rhd3"]`` listed
       the result must equal the frames without that column.
    2. Two frames with identical content but shuffled rows and columns;
       the result must come back in the order of ``e_meta2``.
    3. Two frames whose row ids only partly overlap and whose ``rhd2``
       values disagree. An empty removal list is expected to raise
       ``AssertionError`` mentioning ``r1``; removing ``rhd2`` must
       yield the ``rhd1`` column over the union of row ids.
    4. Frames with an index but no columns; the result must equal the
       input.
    5. Frames with repeated values but a unique index; all three rows
       must be kept.

    Copies of the shared fixtures are passed in so later scenarios still
    see the frames as constructed here.
    """
    # rhd3 header needs to be removed
    meta1 = pd.DataFrame(
        [["r1_1", "r1_2", "r1_3"],
         ["r2_1", "r2_2", "r2_3"],
         ["r3_1", "r3_2", "r3_3"]],
        index=["r1", "r2", "r3"],
        columns=["rhd1", "rhd2", "rhd3"])
    meta2 = pd.DataFrame(
        [["r1_1", "r1_2", "r1_3"],
         ["r2_1", "r2_2", "r2_3"],
         ["r3_1", "r3_2", "r3_33"]],
        index=["r1", "r2", "r3"],
        columns=["rhd1", "rhd2", "rhd3"])
    e_meta1 = pd.DataFrame(
        [["r1_1", "r1_2"],
         ["r2_1", "r2_2"],
         ["r3_1", "r3_2"]],
        index=["r1", "r2", "r3"],
        columns=["rhd1", "rhd2"])

    logger.debug("meta1:\n{}".format(meta1))
    logger.debug("meta2:\n{}".format(meta2))
    logger.debug("e_meta:\n{}".format(e_meta1))

    with self.assertRaises(AssertionError) as e:
        cg.assemble_common_meta([meta1.copy(), meta2.copy()], [])
    # e.exception is only available once the context has exited.
    self.assertIn("r3", str(e.exception))
    logger.debug(
        "rhd3 header needs to be removed - e.exception: {}".format(
            e.exception))

    out_meta1 = cg.assemble_common_meta(
        [meta1.copy(), meta2.copy()], ["rhd3"])
    logger.debug("out_meta1:\n{}".format(out_meta1))
    pd.testing.assert_frame_equal(out_meta1, e_meta1)

    # Order of indices and columns are different
    meta3 = pd.DataFrame(
        [["r3_1", "r3_3", "r3_2"],
         ["r1_1", "r1_3", "r1_2"],
         ["r2_1", "r2_3", "r2_2"]],
        index=["r3", "r1", "r2"],
        columns=["rhd1", "rhd3", "rhd2"])
    e_meta2 = pd.DataFrame(
        [["r1_1", "r1_2", "r1_3"],
         ["r2_1", "r2_2", "r2_3"],
         ["r3_1", "r3_2", "r3_3"]],
        index=["r1", "r2", "r3"],
        columns=["rhd1", "rhd2", "rhd3"])

    logger.debug("meta3:\n{}".format(meta3))
    logger.debug("e_meta2:\n{}".format(e_meta2))

    out_meta2 = cg.assemble_common_meta([meta1.copy(), meta3.copy()], [])
    pd.testing.assert_frame_equal(out_meta2, e_meta2)

    # Some ids not present in both dfs
    meta4 = pd.DataFrame(
        [["r1_1", "r1_22", "r1_5"],
         ["r4_1", "r4_22", "r4_5"],
         ["r3_1", "r3_22", "r3_5"]],
        index=["r1", "r4", "r3"],
        columns=["rhd1", "rhd2", "rhd5"])
    e_meta3 = pd.DataFrame(
        [["r1_1"], ["r2_1"], ["r3_1"], ["r4_1"]],
        index=["r1", "r2", "r3", "r4"],
        columns=["rhd1"])

    logger.debug("meta1:\n{}".format(meta1))
    logger.debug("meta4:\n{}".format(meta4))
    logger.debug("e_meta3:\n{}".format(e_meta3))

    with self.assertRaises(AssertionError) as e:
        cg.assemble_common_meta([meta1.copy(), meta4.copy()], [])
    self.assertIn("r1", str(e.exception))

    out_meta3 = cg.assemble_common_meta(
        [meta1.copy(), meta4.copy()], ["rhd2"])
    logger.debug("out_meta3:\n{}".format(out_meta3))
    pd.testing.assert_frame_equal(out_meta3, e_meta3)

    # Empty metadata
    empty_meta = pd.DataFrame([], index=["a", "b", "c"])
    logger.debug("empty_meta.empty: {}".format(empty_meta.empty))
    out_meta4 = cg.assemble_common_meta([empty_meta, empty_meta], [])
    pd.testing.assert_frame_equal(out_meta4, empty_meta)

    # Metadata has duplicates but index is unique
    meta5 = pd.DataFrame({"rhd1": [0, 0, 1]}, index=range(3))
    meta6 = pd.DataFrame({"rhd1": [0, 0, 1]}, index=range(3))
    out_meta5 = cg.assemble_common_meta([meta5, meta6], [])
    self.assertEqual((3, 1), out_meta5.shape)