def test_summarize_big_training_data(self):
    """Exercise ``eda.summarize_big_training_data`` on the CSV fixtures.

    Three scenarios are covered:

    1. ``sample1.csv`` read directly and through each compressed/archived
       variant; every variant must yield the same statistics.
    2. ``sample2.csv`` with ``summary_pkl`` set: the returned statistics are
       checked (the expected values include non-zero null counts) and the
       written pickle must load back via ``eda.load_summary_data`` with
       matching content.
    3. ``sample2.csv`` with ``summary_pkl=None``: no pickle may be written.
    """

    def check_attribute(summary, attribute, expected):
        # Select the summary row for `attribute` once, then compare every
        # expected column so a failure names the attribute and column.
        row = summary[summary["attribute"] == attribute]
        for column, value in expected.items():
            self.assertEqual(
                row[column].values[0],
                value,
                "attribute={!r}, column={!r}".format(attribute, column),
            )

    sample1_variants = [
        "sample1.csv",
        "sample1.csv.zip",
        "sample1.csv.gz",
        "sample1.csv.tar.gz",
        "sample1.csv.tar.bz2",
    ]
    for fname in sample1_variants:
        # subTest reports which input variant failed instead of only the
        # line number shared by all five iterations.
        with self.subTest(fname=fname):
            sample_file_csv = os.path.join(CURDIR, "res", fname)
            summary, n_rows, label_counts = eda.summarize_big_training_data(
                sample_file_csv, y_name="c", summary_pkl=None
            )
            self.assertEqual(n_rows, 8)
            self.assertEqual(len(label_counts), 8)
            self.assertEqual(set(label_counts.values()), {1})
            self.assertEqual(summary.shape[0], 3)
            check_attribute(
                summary, "a", {"min": 1, "max": 11111111, "n_null": 0, "perc_null": 0}
            )
            check_attribute(
                summary, "b", {"min": 2, "max": 22222222, "n_null": 0, "perc_null": 0}
            )
            check_attribute(
                summary, "c", {"min": 3, "max": 33333333, "n_null": 0, "perc_null": 0}
            )

    sample_file_csv = os.path.join(CURDIR, "res", "sample2.csv")
    summary_pkl_file = os.path.join(CURDIR, "res", "foo.pkl")

    # A pickle left behind by an earlier aborted run would let the existence
    # check below pass even if nothing was written, so start clean.
    if os.path.exists(summary_pkl_file):
        os.remove(summary_pkl_file)

    try:
        summary, n_rows, label_counts = eda.summarize_big_training_data(
            sample_file_csv, y_name="c", summary_pkl=summary_pkl_file
        )
        self.assertEqual(n_rows, 10)
        self.assertEqual(len(label_counts), 4)
        self.assertEqual(set(label_counts.values()), {1, 2, 3, 4})
        self.assertEqual(label_counts[""], 3)
        self.assertEqual(label_counts["3"], 1)
        self.assertEqual(label_counts["33"], 2)
        self.assertEqual(label_counts["333"], 4)
        self.assertEqual(summary.shape[0], 3)
        check_attribute(
            summary,
            "a",
            {"min": 11, "max": 1111111111, "n_null": 1, "perc_null": 0.10, "n_uniq": 10},
        )
        check_attribute(
            summary,
            "b",
            {"min": 2, "max": 222222222, "n_null": 2, "perc_null": 0.2, "n_uniq": 9},
        )
        check_attribute(
            summary,
            "c",
            {"min": 3, "max": 333, "n_null": 3, "perc_null": 0.3, "n_uniq": 4},
        )

        # Check that summary pkl file exists
        self.assertTrue(os.path.exists(summary_pkl_file))

        # Check saved values can be loaded and are correct
        summary2, n_rows2, label_counts2 = eda.load_summary_data(summary_pkl_file)
        self.assertTrue(eda.df_isclose(summary, summary2))
        self.assertEqual(n_rows, n_rows2)
        self.assertEqual(set(label_counts.items()), set(label_counts2.items()))

        # Delete file
        os.remove(summary_pkl_file)
        self.assertFalse(os.path.exists(summary_pkl_file))

        # Run again with summary_pkl option set to None; only the absence of
        # the pickle matters here, so the return value is not bound.
        eda.summarize_big_training_data(sample_file_csv, y_name="c", summary_pkl=None)
        self.assertFalse(os.path.exists(summary_pkl_file))
    finally:
        # Never leave the pickle in the fixture directory, even when an
        # assertion above fails before the explicit removal.
        if os.path.exists(summary_pkl_file):
            os.remove(summary_pkl_file)
def test_summarize_training_data(self):
    """Exercise ``eda.summarize_training_data`` on in-memory DataFrames.

    Three scenarios are covered, each loading a fixture with
    ``pd.read_csv``:

    1. ``sample1.csv`` with label column ``c``; expected null counts are 0.
    2. ``sample2.csv`` with label column ``c``; expected values include
       non-zero null counts and per-attribute unique counts.
    3. ``sample3.csv`` with label column ``z``; ``min``/``max`` are expected
       to be null for every attribute, and the summary is additionally
       written to a pickle that must load back via
       ``eda.load_summary_data`` with matching content. A final run with
       ``summary_pkl=None`` must not create the pickle.
    """

    def check_attribute(summary, attribute, expected, null_columns=()):
        # Select the summary row for `attribute` once; first check the
        # columns expected to be null, then the exact-valued columns.
        row = summary[summary["attribute"] == attribute]
        for column in null_columns:
            self.assertTrue(
                pd.isnull(row[column].values[0]),
                "attribute={!r}, column={!r}".format(attribute, column),
            )
        for column, value in expected.items():
            self.assertEqual(
                row[column].values[0],
                value,
                "attribute={!r}, column={!r}".format(attribute, column),
            )

    # Scenario 1: sample1.csv
    sample_file_csv = os.path.join(CURDIR, "res", "sample1.csv")
    df = pd.read_csv(sample_file_csv)
    summary, n_rows, label_counts = eda.summarize_training_data(
        df, y_name="c", summary_pkl=None
    )
    self.assertEqual(n_rows, 8)
    self.assertEqual(len(label_counts), 8)
    self.assertEqual(set(label_counts.values()), {1})
    self.assertEqual(summary.shape[0], 3)
    check_attribute(summary, "a", {"min": 1, "max": 11111111, "n_null": 0, "perc_null": 0})
    check_attribute(summary, "b", {"min": 2, "max": 22222222, "n_null": 0, "perc_null": 0})
    check_attribute(summary, "c", {"min": 3, "max": 33333333, "n_null": 0, "perc_null": 0})

    # Scenario 2: sample2.csv
    sample_file_csv = os.path.join(CURDIR, "res", "sample2.csv")
    df = pd.read_csv(sample_file_csv)
    summary, n_rows, label_counts = eda.summarize_training_data(
        df, y_name="c", summary_pkl=None
    )
    self.assertEqual(n_rows, 10)
    self.assertEqual(len(label_counts), 4)
    self.assertEqual(set(label_counts.values()), {1, 2, 3, 4})
    # NOTE(review): this expects the first key in sorted order to be the one
    # with count 3 (presumably the missing-label key). If the keys include
    # NaN the sort order is not well defined -- confirm against the key
    # types eda.summarize_training_data actually produces.
    self.assertEqual(label_counts[sorted(label_counts.keys())[0]], 3)
    self.assertEqual(label_counts[3], 1)
    self.assertEqual(label_counts[33], 2)
    self.assertEqual(label_counts[333], 4)
    self.assertEqual(summary.shape[0], 3)
    check_attribute(
        summary,
        "a",
        {"min": 11, "max": 1111111111, "n_null": 1, "perc_null": 0.10, "n_uniq": 10},
    )
    check_attribute(
        summary,
        "b",
        {"min": 2, "max": 222222222, "n_null": 2, "perc_null": 0.2, "n_uniq": 9},
    )
    check_attribute(
        summary,
        "c",
        {"min": 3, "max": 333, "n_null": 3, "perc_null": 0.3, "n_uniq": 4},
    )

    # Scenario 3: sample3.csv, with the summary persisted to a pickle
    sample_file_csv = os.path.join(CURDIR, "res", "sample3.csv")
    summary_pkl_file = os.path.join(CURDIR, "res", "foo.pkl")
    df = pd.read_csv(sample_file_csv)

    # A pickle left behind by an earlier aborted run would let the existence
    # check below pass even if nothing was written, so start clean.
    if os.path.exists(summary_pkl_file):
        os.remove(summary_pkl_file)

    try:
        summary, n_rows, label_counts = eda.summarize_training_data(
            df, y_name="z", summary_pkl=summary_pkl_file
        )
        self.assertEqual(n_rows, 10)
        self.assertEqual(len(label_counts), 4)
        self.assertEqual(set(label_counts.values()), {1, 2, 3, 4})
        self.assertEqual(label_counts[np.nan], 3)
        self.assertEqual(label_counts["c"], 1)
        self.assertEqual(label_counts["cc"], 2)
        self.assertEqual(label_counts["ccc"], 4)
        self.assertEqual(summary.shape[0], 3)
        check_attribute(
            summary,
            "x",
            {"n_null": 1, "perc_null": 0.10, "n_uniq": 10},
            null_columns=("min", "max"),
        )
        check_attribute(
            summary,
            "y",
            {"n_null": 2, "perc_null": 0.2, "n_uniq": 9},
            null_columns=("min", "max"),
        )
        check_attribute(
            summary,
            "z",
            {"n_null": 3, "perc_null": 0.3, "n_uniq": 4},
            null_columns=("min", "max"),
        )

        # Check that summary pkl file exists
        self.assertTrue(os.path.exists(summary_pkl_file))

        # Check saved values can be loaded and are correct
        summary2, n_rows2, label_counts2 = eda.load_summary_data(summary_pkl_file)
        self.assertTrue(eda.df_isclose(summary, summary2))
        self.assertEqual(n_rows, n_rows2)
        # Compared through their string form, presumably because the NaN
        # label key does not compare equal to a reloaded NaN.
        self.assertEqual(
            str(list(label_counts.items())), str(list(label_counts2.items()))
        )

        # Delete file
        os.remove(summary_pkl_file)
        self.assertFalse(os.path.exists(summary_pkl_file))

        # Run again with summary_pkl option set to None; only the absence of
        # the pickle matters here, so the return value is not bound.
        eda.summarize_training_data(df, y_name="z", summary_pkl=None)
        self.assertFalse(os.path.exists(summary_pkl_file))
    finally:
        # Never leave the pickle in the fixture directory, even when an
        # assertion above fails before the explicit removal.
        if os.path.exists(summary_pkl_file):
            os.remove(summary_pkl_file)