Exemple #1
0
    def test_arrange(self):
        """Check ``gr.tf_arrange`` against plain-pandas reference results.

        Three cases are covered:

        * grouped by ``cut``, arranged by ``depth`` (given as a string) with
          ``ascending=False``, keeping the first 5 rows of each group;
        * the same pipeline with the column given as ``X.depth``;
        * an ungrouped arrange using ``gr.desc`` on ``cut`` and ``price``.
        """
        # Reference for the grouped cases, built directly with pandas.
        # NOTE(review): arrange_apply_helperfunc is defined outside this
        # method; presumably it sorts each group by depth (descending) and
        # keeps the first 5 rows -- confirm against its definition.
        df = (
            data.df_diamonds.groupby("cut")
            .apply(arrange_apply_helperfunc)
            .reset_index(drop=True)
        )

        # Column specified by name.
        d = (
            data.df_diamonds
            >> gr.tf_group_by("cut")
            >> gr.tf_arrange("depth", ascending=False)
            >> gr.tf_head(5)
            >> gr.tf_ungroup()
        ).reset_index(drop=True)
        self.assertTrue(df.equals(d))

        # Column specified through the X placeholder.
        d = (
            data.df_diamonds
            >> gr.tf_group_by("cut")
            >> gr.tf_arrange(X.depth, ascending=False)
            >> gr.tf_head(5)
            >> gr.tf_ungroup()
        ).reset_index(drop=True)
        # Use the unittest assertion (not a bare ``assert``) so the check is
        # not stripped under ``python -O`` and matches the rest of the suite.
        self.assertTrue(df.equals(d))

        # Ungrouped, descending on two columns via gr.desc.
        df = data.df_diamonds.sort_values(["cut", "price"], ascending=False)
        d = data.df_diamonds >> gr.tf_arrange(gr.desc(X.cut), gr.desc(X.price))
        self.assertTrue(df.equals(d))
Exemple #2
0
 def test_sd(self):
     """Check ``gr.sd`` inside summarize and mutate, ungrouped and grouped.

     The working frame is the first 3 rows of each ``cut`` group with only
     the ``cut`` and ``x`` columns. Results are compared with hard-coded
     reference values to within an absolute tolerance of 0.00001. A final
     case uses one row per group and expects NaN for every group.
     """
     # Hard-coded reference sd of ``x`` for each cut (first 3 rows per cut).
     group_cuts = ["Fair", "Good", "Ideal", "Premium", "Very Good"]
     group_sds = [1.440417, 0.148436, 0.236925, 0.181934, 0.072342]

     df = (
         data.df_diamonds
         >> gr.tf_group_by(X.cut)
         >> gr.tf_head(3)
         >> gr.tf_select(X.cut, X.x)
         >> gr.tf_ungroup()
     )
     # straight summarize
     t = df >> gr.tf_summarize(s=gr.sd(X.x))
     df_truth = pd.DataFrame({"s": [0.829091]})
     test_vector = abs(t.s - df_truth.s)
     self.assertTrue(all(test_vector < 0.00001))
     # grouped summarize
     t = df >> gr.tf_group_by(X.cut) >> gr.tf_summarize(s=gr.sd(X.x))
     df_truth = pd.DataFrame({"cut": group_cuts, "s": group_sds})
     test_vector = abs(t.s - df_truth.s)
     self.assertTrue(all(test_vector < 0.00001))
     # straight mutate
     t = df >> gr.tf_mutate(s=gr.sd(X.x))
     df_truth = df.copy()
     df_truth["s"] = 0.829091
     test_vector = abs(t.s - df_truth.s)
     self.assertTrue(all(test_vector < 0.00001))
     # grouped mutate
     t = df >> gr.tf_group_by(X.cut) >> gr.tf_mutate(s=gr.sd(X.x))
     # Build the expected value row by row from each row's own ``cut``
     # rather than from a positional list, so the comparison does not depend
     # on the row order of ``t``. ``test_vector`` is recomputed here so this
     # assertion really checks the grouped result instead of re-checking the
     # straight-mutate vector.
     expected_s = t.cut.map(dict(zip(group_cuts, group_sds))).astype(float)
     test_vector = abs(t.s - expected_s)
     # Guard against a vacuous pass on an unexpectedly empty/short result.
     self.assertEqual(len(t), len(df))
     self.assertTrue(all(test_vector < 0.00001))
     # test with single value (sd undefined for one observation -> NaN)
     df = (
         data.df_diamonds
         >> gr.tf_group_by(X.cut)
         >> gr.tf_head(1)
         >> gr.tf_select(X.cut, X.x)
     )
     t = df >> gr.tf_group_by(X.cut) >> gr.tf_summarize(s=gr.sd(X.x))
     df_truth = pd.DataFrame(
         {
             "cut": group_cuts,
             "s": [np.nan, np.nan, np.nan, np.nan, np.nan],
         }
     )
     # DataFrame.equals treats NaNs in matching positions as equal.
     self.assertTrue(t.equals(df_truth))
Exemple #3
0
    def test_var(self):
        """Check ``gr.var`` in summarize and mutate against reference values.

        Uses the first 3 rows of each ``cut`` group (columns ``cut`` and
        ``x``) for the tolerance-based checks, then one row per group to
        confirm that the grouped summary is NaN for every cut.
        """
        tolerance = 1e-5
        cut_levels = ["Fair", "Good", "Ideal", "Premium", "Very Good"]

        by_cut = data.df_diamonds >> gr.tf_group_by(X.cut)
        sample = (
            by_cut >> gr.tf_head(3) >> gr.tf_select(X.cut, X.x) >> gr.tf_ungroup()
        )

        # Ungrouped summarize: a single overall variance.
        result = sample >> gr.tf_summarize(v=gr.var(X.x))
        reference = pd.DataFrame({"v": [0.687392]})
        deviation = abs(result.v - reference.v)
        self.assertTrue(all(deviation < tolerance))

        # Grouped summarize: one variance per cut.
        result = (
            sample >> gr.tf_group_by(X.cut) >> gr.tf_summarize(v=gr.var(X.x))
        )
        reference = pd.DataFrame(
            {
                "cut": cut_levels,
                "v": [2.074800, 0.022033, 0.056133, 0.033100, 0.005233],
            }
        )
        deviation = abs(result.v - reference.v)
        self.assertTrue(all(deviation < tolerance))

        # Ungrouped mutate: the overall variance repeated on every row.
        result = sample >> gr.tf_mutate(v=gr.var(X.x))
        reference = sample.copy()
        reference["v"] = 0.687392
        deviation = abs(result.v - reference.v)
        self.assertTrue(all(deviation < tolerance))

        # NOTE: the grouped-mutate case is not exercised here (it was
        # disabled in the original test). Its expected values would be the
        # per-cut variances above, each repeated for the 3 rows of its group.

        # One observation per group: the grouped summary is expected to be
        # NaN for every cut.
        singles = by_cut >> gr.tf_head(1) >> gr.tf_select(X.cut, X.x)
        result = (
            singles >> gr.tf_group_by(X.cut) >> gr.tf_summarize(v=gr.var(X.x))
        )
        reference = pd.DataFrame({"cut": cut_levels, "v": [np.nan] * 5})
        self.assertTrue(result.equals(reference))
Exemple #4
0
    def test_median(self):
        """Check ``gr.median`` in summarize and mutate against reference values.

        The odd-count cases use the first 3 rows of each ``cut`` group and
        compare with ``DataFrame.equals``; the even-count case uses the
        first 2 rows per group and compares within an absolute tolerance.
        """
        cut_levels = ["Fair", "Good", "Ideal", "Premium", "Very Good"]

        by_cut = data.df_diamonds >> gr.tf_group_by(X.cut)
        sample = (
            by_cut >> gr.tf_head(3) >> gr.tf_select(X.cut, X.x) >> gr.tf_ungroup()
        )

        # Ungrouped summarize: a single overall median.
        result = sample >> gr.tf_summarize(m=gr.median(X.x))
        reference = pd.DataFrame({"m": [4.05]})
        self.assertTrue(result.equals(reference))

        # Grouped summarize: one median per cut.
        result = (
            sample >> gr.tf_group_by(X.cut) >> gr.tf_summarize(m=gr.median(X.x))
        )
        reference = pd.DataFrame(
            {
                "cut": cut_levels,
                "m": [6.27, 4.25, 3.95, 3.89, 3.95],
            }
        )
        self.assertTrue(result.equals(reference))

        # Ungrouped mutate: the overall median repeated on every row.
        result = sample >> gr.tf_mutate(m=gr.median(X.x))
        reference = sample.copy()
        reference["m"] = 4.05
        self.assertTrue(result.equals(reference))

        # NOTE: the grouped-mutate case is not exercised here (it was
        # disabled in the original test). Its expected values would be the
        # per-cut medians above, each repeated for the 3 rows of its group.

        # Even number of rows per group: the expected values are the
        # hard-coded references below, compared within a small tolerance.
        pairs = by_cut >> gr.tf_head(2) >> gr.tf_select(X.cut, X.x)
        result = (
            pairs >> gr.tf_group_by(X.cut) >> gr.tf_summarize(m=gr.median(X.x))
        )
        reference = pd.DataFrame(
            {
                "cut": cut_levels,
                "m": [5.160, 4.195, 3.940, 4.045, 3.945],
            }
        )
        deviation = abs(result.m - reference.m)
        self.assertTrue(all(deviation < 1e-9))
Exemple #5
0
 def test_group_mutate(self):
     """Check a grouped ``gr.tf_mutate`` against a pandas groupby/apply.

     The pipeline adds ``testcol = x * shape[0]`` within each ``cut`` group;
     the reference applies ``group_mutate_helper`` per group with pandas.
     """
     # Reference built on a copy so the shared dataset is left untouched.
     # NOTE(review): group_mutate_helper is defined outside this method;
     # presumably it adds the same ``testcol`` per group -- confirm.
     expected = data.df_diamonds.copy().groupby("cut").apply(group_mutate_helper)

     grouped = data.df_diamonds >> gr.tf_group_by("cut")
     mutated = grouped >> gr.tf_mutate(testcol=X.x * X.shape[0])
     # Sort by index before comparing with the reference frame.
     actual = (mutated >> gr.tf_ungroup()).sort_index()

     self.assertTrue(expected.equals(actual))