コード例 #1
0
ファイル: test_dataio.py プロジェクト: jjinking/datsci
    def test_head(self):
        '''
        Test loading the first few lines of a file with dataio.head.

        Covers three things:
        - k=0 on a tab-separated text file is expected to yield a pd.Series
          (stored here as ``colnames``).
        - On the gzipped CSV, asking for k rows yields k rows, capped at 8
          rows when k exceeds that; the k=1 result is expected to have
          3 columns.
        - The k=100 result, once cast to int, is expected to match the
          tab-separated file as loaded by pd.read_csv.
        '''
        sample_file_txt = os.path.join(CURDIR, 'res', 'sample1.txt')
        colnames = dataio.head(sample_file_txt, k=0, sep='\t')
        # assertIsInstance reports the actual type on failure, unlike
        # assertTrue(isinstance(...)) which only says "False is not true".
        self.assertIsInstance(colnames, pd.Series)

        sample_file = os.path.join(CURDIR, 'res', 'sample1.csv.gz')
        df = dataio.head(sample_file, k=1)
        self.assertEqual(df.shape[0], 1)
        self.assertEqual(df.shape[1], 3)

        # (k, extra keyword arguments, expected number of rows); the calls
        # alternate between the default separator and an explicit ','.
        row_count_cases = [
            (7, {}, 7),
            (8, {'sep': ','}, 8),
            (9, {}, 8),
            (100, {'sep': ','}, 8),
        ]
        for k, kwargs, expected_rows in row_count_cases:
            df = dataio.head(sample_file, k=k, **kwargs)
            self.assertEqual(df.shape[0], expected_rows)

        # df now holds the k=100 result.
        # Must change to numeric, since data read from file is string
        for col in df.columns:
            df[col] = df[col].astype(int)
        df2 = pd.read_csv(sample_file_txt, sep='\t')
        self.assertTrue(eda.df_isclose(df, df2))
コード例 #2
0
ファイル: test_eda.py プロジェクト: jjinking/datsci
    def test_df_isclose(self):
        """Run eda.df_isclose over integer, float and NaN-containing frames.

        Each case lists the rows of both frames, the extra keyword
        arguments passed to eda.df_isclose and the expected verdict.
        """
        scenarios = [
            # Test integers
            ([[1, 2], [3, 4]],
             [[1, 2], [3, 4]],
             {'tol': 0}, True),
            ([[1, 2], [3, 5]],
             [[1, 2], [3, 4]],
             {}, False),
            # Test rounding
            ([[1.1234, 2.1234], [3.1234, 4.1234]],
             [[1.1234, 2.1234], [3.1234, 4.1234]],
             {}, True),
            ([[1.1234, 2.1234], [3.1234, 5.1234]],
             [[1.1234, 2.1234], [3.1234, 4.1232]],
             {}, False),
            ([[1.1234, 2.1234], [3.1234, 4.1234]],
             [[1.1234, 2.1234], [3.1234, 4.1232]],
             {'tol': 1e-3}, True),
            ([[np.nan, 2.1234], [3.1234, 5.1234123]],
             [[np.nan, 2.1234], [3.1234, 5.123412]],
             {'tol': 1e-6}, True),
        ]

        for left_rows, right_rows, options, should_match in scenarios:
            left = pd.DataFrame(left_rows)
            right = pd.DataFrame(right_rows)
            verdict = eda.df_isclose(left, right, **options)
            if should_match:
                self.assertTrue(verdict)
            else:
                self.assertFalse(verdict)
コード例 #3
0
ファイル: test_eda.py プロジェクト: jjinking/datsci
    def test_summarize_big_training_data(self):
        """Check eda.summarize_big_training_data on plain and compressed input.

        Part 1 runs the summary on sample1 in five packagings (csv, zip, gz,
        tar.gz, tar.bz2) and expects identical results for each.
        Part 2 runs it on sample2 (which has missing values) with a pickle
        path, checks the numbers, checks that the pickle round-trips through
        eda.load_summary_data, and finally checks that summary_pkl=None
        writes no file.
        """

        def assert_summary_row(summary, attribute, expected):
            # Compare the listed (column, value) pairs for one attribute row;
            # the message names the attribute/column since this runs in a loop.
            row = summary[summary["attribute"] == attribute]
            for column, want in expected:
                got = row[column].values[0]
                self.assertEqual(
                    got, want, "%s/%s: %r != %r" % (attribute, column, got, want)
                )

        # (attribute, min, max) expected for sample1; no nulls in any column.
        sample1_ranges = [("a", 1, 11111111), ("b", 2, 22222222), ("c", 3, 33333333)]
        for fname in ["sample1.csv", "sample1.csv.zip", "sample1.csv.gz", "sample1.csv.tar.gz", "sample1.csv.tar.bz2"]:
            sample_file_csv = os.path.join(CURDIR, "res", fname)
            summary, n_rows, label_counts = eda.summarize_big_training_data(
                sample_file_csv, y_name="c", summary_pkl=None
            )

            self.assertEqual(n_rows, 8)
            self.assertEqual(len(label_counts), 8)
            self.assertEqual(set(label_counts.values()), {1})
            self.assertEqual(summary.shape[0], 3)
            for attribute, lowest, highest in sample1_ranges:
                assert_summary_row(
                    summary,
                    attribute,
                    [("min", lowest), ("max", highest), ("n_null", 0), ("perc_null", 0)],
                )

        sample_file_csv = os.path.join(CURDIR, "res", "sample2.csv")
        summary_pkl_file = os.path.join(CURDIR, "res", "foo.pkl")

        def remove_summary_pkl():
            if os.path.exists(summary_pkl_file):
                os.remove(summary_pkl_file)

        # Drop any leftover from an earlier aborted run so the existence
        # check below is meaningful, and guarantee removal even when an
        # assertion in between fails.
        remove_summary_pkl()
        self.addCleanup(remove_summary_pkl)

        summary, n_rows, label_counts = eda.summarize_big_training_data(
            sample_file_csv, y_name="c", summary_pkl=summary_pkl_file
        )

        self.assertEqual(n_rows, 10)
        self.assertEqual(len(label_counts), 4)
        self.assertEqual(set(label_counts.values()), {1, 2, 3, 4})
        # Labels are looked up as strings here; "" is the missing-label key.
        self.assertEqual(label_counts[""], 3)
        self.assertEqual(label_counts["3"], 1)
        self.assertEqual(label_counts["33"], 2)
        self.assertEqual(label_counts["333"], 4)
        self.assertEqual(summary.shape[0], 3)
        # (attribute, min, max, n_null, perc_null, n_uniq) expected for sample2.
        sample2_rows = [
            ("a", 11, 1111111111, 1, 0.10, 10),
            ("b", 2, 222222222, 2, 0.2, 9),
            ("c", 3, 333, 3, 0.3, 4),
        ]
        for attribute, lowest, highest, n_null, perc_null, n_uniq in sample2_rows:
            assert_summary_row(
                summary,
                attribute,
                [
                    ("min", lowest),
                    ("max", highest),
                    ("n_null", n_null),
                    ("perc_null", perc_null),
                    ("n_uniq", n_uniq),
                ],
            )

        # Check that summary pkl file exists
        self.assertTrue(os.path.exists(summary_pkl_file))

        # Check saved values can be loaded and is correct
        summary2, n_rows2, label_counts2 = eda.load_summary_data(summary_pkl_file)
        self.assertTrue(eda.df_isclose(summary, summary2))
        self.assertEqual(n_rows, n_rows2)
        self.assertEqual(set(label_counts.items()), set(label_counts2.items()))

        # Delete file
        os.remove(summary_pkl_file)
        self.assertFalse(os.path.exists(summary_pkl_file))

        # Run again with summary_pkl option set to None
        summary, n_rows, label_counts = eda.summarize_big_training_data(sample_file_csv, y_name="c", summary_pkl=None)
        self.assertFalse(os.path.exists(summary_pkl_file))
コード例 #4
0
ファイル: test_eda.py プロジェクト: jjinking/datsci
    def test_summarize_training_data(self):
        """Check eda.summarize_training_data on three in-memory sample frames.

        sample1: numeric columns without nulls.
        sample2: numeric columns with missing values.
        sample3: columns x/y/z whose min and max are expected to be null
        and whose labels are strings; this run also writes a summary pickle,
        which is checked for existence, round-tripped through
        eda.load_summary_data and removed. A final run with summary_pkl=None
        must not create the file.
        """

        def assert_summary_row(summary, attribute, expected):
            # Compare the listed (column, value) pairs for one attribute row;
            # the message names the attribute/column since this runs in a loop.
            row = summary[summary["attribute"] == attribute]
            for column, want in expected:
                got = row[column].values[0]
                self.assertEqual(
                    got, want, "%s/%s: %r != %r" % (attribute, column, got, want)
                )

        sample_file_csv = os.path.join(CURDIR, "res", "sample1.csv")
        df = pd.read_csv(sample_file_csv)
        summary, n_rows, label_counts = eda.summarize_training_data(df, y_name="c", summary_pkl=None)
        self.assertEqual(n_rows, 8)
        self.assertEqual(len(label_counts), 8)
        self.assertEqual(set(label_counts.values()), {1})
        self.assertEqual(summary.shape[0], 3)
        # (attribute, min, max) expected for sample1; no nulls in any column.
        for attribute, lowest, highest in [("a", 1, 11111111), ("b", 2, 22222222), ("c", 3, 33333333)]:
            assert_summary_row(
                summary,
                attribute,
                [("min", lowest), ("max", highest), ("n_null", 0), ("perc_null", 0)],
            )

        sample_file_csv = os.path.join(CURDIR, "res", "sample2.csv")
        df = pd.read_csv(sample_file_csv)
        summary, n_rows, label_counts = eda.summarize_training_data(df, y_name="c", summary_pkl=None)
        self.assertEqual(n_rows, 10)
        self.assertEqual(len(label_counts), 4)
        self.assertEqual(set(label_counts.values()), {1, 2, 3, 4})
        # NOTE(review): the first sorted key is presumably the missing-label
        # (NaN) key, fetched this way because a NaN key cannot be looked up
        # by equality -- confirm; sort order with NaN present is fragile.
        self.assertEqual(label_counts[sorted(label_counts.keys())[0]], 3)
        self.assertEqual(label_counts[3], 1)
        self.assertEqual(label_counts[33], 2)
        self.assertEqual(label_counts[333], 4)
        self.assertEqual(summary.shape[0], 3)
        # (attribute, min, max, n_null, perc_null, n_uniq) expected for sample2.
        sample2_rows = [
            ("a", 11, 1111111111, 1, 0.10, 10),
            ("b", 2, 222222222, 2, 0.2, 9),
            ("c", 3, 333, 3, 0.3, 4),
        ]
        for attribute, lowest, highest, n_null, perc_null, n_uniq in sample2_rows:
            assert_summary_row(
                summary,
                attribute,
                [
                    ("min", lowest),
                    ("max", highest),
                    ("n_null", n_null),
                    ("perc_null", perc_null),
                    ("n_uniq", n_uniq),
                ],
            )

        sample_file_csv = os.path.join(CURDIR, "res", "sample3.csv")
        summary_pkl_file = os.path.join(CURDIR, "res", "foo.pkl")

        def remove_summary_pkl():
            if os.path.exists(summary_pkl_file):
                os.remove(summary_pkl_file)

        # Drop any leftover from an earlier aborted run so the existence
        # check below is meaningful, and guarantee removal even when an
        # assertion in between fails.
        remove_summary_pkl()
        self.addCleanup(remove_summary_pkl)

        df = pd.read_csv(sample_file_csv)
        summary, n_rows, label_counts = eda.summarize_training_data(df, y_name="z", summary_pkl=summary_pkl_file)
        self.assertEqual(n_rows, 10)
        self.assertEqual(len(label_counts), 4)
        self.assertEqual(set(label_counts.values()), {1, 2, 3, 4})
        self.assertEqual(label_counts[np.nan], 3)
        self.assertEqual(label_counts["c"], 1)
        self.assertEqual(label_counts["cc"], 2)
        self.assertEqual(label_counts["ccc"], 4)
        self.assertEqual(summary.shape[0], 3)
        # (attribute, n_null, perc_null, n_uniq) expected for sample3;
        # min and max are expected to be null for every attribute.
        sample3_rows = [("x", 1, 0.10, 10), ("y", 2, 0.2, 9), ("z", 3, 0.3, 4)]
        for attribute, n_null, perc_null, n_uniq in sample3_rows:
            row = summary[summary["attribute"] == attribute]
            self.assertTrue(pd.isnull(row["min"].values[0]))
            self.assertTrue(pd.isnull(row["max"].values[0]))
            assert_summary_row(
                summary,
                attribute,
                [("n_null", n_null), ("perc_null", perc_null), ("n_uniq", n_uniq)],
            )

        # Check that summary pkl file exists
        self.assertTrue(os.path.exists(summary_pkl_file))

        # Check saved values can be loaded and is correct
        summary2, n_rows2, label_counts2 = eda.load_summary_data(summary_pkl_file)
        self.assertTrue(eda.df_isclose(summary, summary2))
        self.assertEqual(n_rows, n_rows2)
        # NOTE(review): items are compared via their string form, presumably
        # because the NaN label key does not compare equal to itself -- confirm.
        self.assertEqual(str(list(label_counts.items())), str(list(label_counts2.items())))

        # Delete file
        os.remove(summary_pkl_file)
        self.assertFalse(os.path.exists(summary_pkl_file))

        # Run again with summary_pkl option set to None
        summary, n_rows, label_counts = eda.summarize_training_data(df, y_name="z", summary_pkl=None)
        self.assertFalse(os.path.exists(summary_pkl_file))