Example #1
0
 def test_select_to(self):
     """Check that `columns_to` keeps the columns ahead of the boundary.

     The boundary is given as a column name, as an `X` attribute and as
     an integer; every form must return the `carat` and `cut` columns.
     """
     expected = data.df_diamonds[["carat", "cut"]]

     # Boundary given by column name
     by_name = data.df_diamonds >> gr.tf_select(gr.columns_to("color"))
     self.assertTrue(expected.equals(by_name))

     # Boundary given through the symbolic `X` accessor
     by_symbol = data.df_diamonds >> gr.tf_select(gr.columns_to(X.color))
     self.assertTrue(expected.equals(by_symbol))

     # Boundary given as an integer
     by_position = data.df_diamonds >> gr.tf_select(gr.columns_to(2))
     self.assertTrue(expected.equals(by_position))
Example #2
0
 def test_select_from(self):
     """Check that `columns_from` keeps the boundary column and the rest.

     The boundary is given as a column name, as an `X` attribute and as
     an integer; an integer of 100 must produce a frame with no columns.
     """
     expected = data.df_diamonds[["x", "y", "z"]]

     # Boundary given by column name
     by_name = data.df_diamonds >> gr.tf_select(gr.columns_from("x"))
     self.assertTrue(expected.equals(by_name))

     # Boundary given through the symbolic `X` accessor
     by_symbol = data.df_diamonds >> gr.tf_select(gr.columns_from(X.x))
     self.assertTrue(expected.equals(by_symbol))

     # Boundary given as an integer
     by_position = data.df_diamonds >> gr.tf_select(gr.columns_from(7))
     self.assertTrue(expected.equals(by_position))

     # An integer of 100 is expected to select nothing at all
     no_columns = data.df_diamonds[[]]
     past_the_end = data.df_diamonds >> gr.tf_select(gr.columns_from(100))
     self.assertTrue(no_columns.equals(past_the_end))
Example #3
0
    def test_var(self):
        """Exercise `gr.var` inside summarize and mutate pipelines.

        Results are compared against hard-coded reference values using an
        absolute tolerance. A final check uses one row per cut and expects
        NaN for every group.
        """
        tol = 0.00001

        # First three rows of each cut, reduced to the cut and x columns
        sample = (
            data.df_diamonds
            >> gr.tf_group_by(X.cut)
            >> gr.tf_head(3)
            >> gr.tf_select(X.cut, X.x)
            >> gr.tf_ungroup()
        )

        # Ungrouped summarize
        res = sample >> gr.tf_summarize(v=gr.var(X.x))
        expected = pd.DataFrame({"v": [0.687392]})
        err = (res.v - expected.v).abs()
        self.assertTrue((err < tol).all())

        # Summarize per cut
        by_cut = sample >> gr.tf_group_by(X.cut)
        res = by_cut >> gr.tf_summarize(v=gr.var(X.x))
        expected = pd.DataFrame(
            {
                "cut": ["Fair", "Good", "Ideal", "Premium", "Very Good"],
                "v": [2.074800, 0.022033, 0.056133, 0.033100, 0.005233],
            }
        )
        err = (res.v - expected.v).abs()
        self.assertTrue((err < tol).all())

        # Ungrouped mutate
        res = sample >> gr.tf_mutate(v=gr.var(X.x))
        expected = sample.copy()
        expected["v"] = 0.687392
        err = (res.v - expected.v).abs()
        self.assertTrue((err < tol).all())

        # TODO: the grouped-mutate case is not exercised here. The disabled
        # draft expected each row to carry the variance of its own cut, i.e.
        # the per-cut values listed above repeated for the three rows of
        # each group.

        # One row per cut: the expected result is NaN for every group
        singles = (
            data.df_diamonds
            >> gr.tf_group_by(X.cut)
            >> gr.tf_head(1)
            >> gr.tf_select(X.cut, X.x)
        )
        res = singles >> gr.tf_group_by(X.cut) >> gr.tf_summarize(v=gr.var(X.x))
        expected = pd.DataFrame(
            {
                "cut": ["Fair", "Good", "Ideal", "Premium", "Very Good"],
                "v": [np.nan] * 5,
            }
        )
        self.assertTrue(res.equals(expected))
Example #4
0
 def test_sd(self):
     """Exercise `gr.sd` inside summarize and mutate pipelines.

     Results are compared against hard-coded reference values using an
     absolute tolerance. A final check uses one row per cut and expects
     NaN for every group.
     """
     df = (
         data.df_diamonds
         >> gr.tf_group_by(X.cut)
         >> gr.tf_head(3)
         >> gr.tf_select(X.cut, X.x)
         >> gr.tf_ungroup()
     )
     # Reference standard deviation of x for each cut (three rows per cut)
     sd_by_cut = {
         "Fair": 1.440417,
         "Good": 0.148436,
         "Ideal": 0.236925,
         "Premium": 0.181934,
         "Very Good": 0.072342,
     }
     # straight summarize
     t = df >> gr.tf_summarize(s=gr.sd(X.x))
     df_truth = pd.DataFrame({"s": [0.829091]})
     test_vector = abs(t.s - df_truth.s)
     self.assertTrue(all(test_vector < 0.00001))
     # grouped summarize
     t = df >> gr.tf_group_by(X.cut) >> gr.tf_summarize(s=gr.sd(X.x))
     df_truth = pd.DataFrame(
         {
             "cut": list(sd_by_cut.keys()),
             "s": list(sd_by_cut.values()),
         }
     )
     test_vector = abs(t.s - df_truth.s)
     self.assertTrue(all(test_vector < 0.00001))
     # straight mutate
     t = df >> gr.tf_mutate(s=gr.sd(X.x))
     df_truth = df.copy()
     df_truth["s"] = 0.829091
     test_vector = abs(t.s - df_truth.s)
     self.assertTrue(all(test_vector < 0.00001))
     # grouped mutate: every row should carry the sd of its own cut.
     # The expected column is built by mapping each row's cut to its
     # reference value, so the comparison below is aligned on the row
     # index and does not depend on the row order of the grouped result.
     t = df >> gr.tf_group_by(X.cut) >> gr.tf_mutate(s=gr.sd(X.x))
     df_truth = df.copy()
     df_truth["s"] = df_truth.cut.astype(str).map(sd_by_cut)
     test_vector = abs(t.s - df_truth.s)
     self.assertTrue(all(test_vector < 0.00001))
     # test with single value (sd undefined)
     df = (
         data.df_diamonds
         >> gr.tf_group_by(X.cut)
         >> gr.tf_head(1)
         >> gr.tf_select(X.cut, X.x)
     )
     t = df >> gr.tf_group_by(X.cut) >> gr.tf_summarize(s=gr.sd(X.x))
     df_truth = pd.DataFrame(
         {
             "cut": ["Fair", "Good", "Ideal", "Premium", "Very Good"],
             "s": [np.nan, np.nan, np.nan, np.nan, np.nan],
         }
     )
     self.assertTrue(t.equals(df_truth))
Example #5
0
 def test_select_through(self):
     """Check `columns_to(..., inclusive=True)` keeps the boundary column.

     The boundary is given as a column name, as an `X` attribute and as
     an integer; every form must return `carat`, `cut` and `color`.

     The method was previously named `select_through`; without the
     `test_` prefix the test runner never collected it.
     """
     df = data.df_diamonds[["carat", "cut", "color"]]
     self.assertTrue(
         df.equals(data.df_diamonds >> gr.tf_select(
             gr.columns_to("color", inclusive=True))))
     self.assertTrue(
         df.equals(data.df_diamonds >> gr.tf_select(
             gr.columns_to(X.color, inclusive=True))))
     self.assertTrue(
         df.equals(data.df_diamonds >> gr.tf_select(
             gr.columns_to(2, inclusive=True))))

 # Keep the old attribute name available for any direct callers; it is not
 # collected a second time because it lacks the `test_` prefix.
 select_through = test_select_through
Example #6
0
    def test_median(self):
        """Exercise `gr.median` inside summarize and mutate pipelines.

        Odd-sized groups (three rows per cut) are compared exactly; the
        even-sized case (two rows per cut) is compared with an absolute
        tolerance.
        """
        # First three rows of each cut, reduced to the cut and x columns
        sample = (
            data.df_diamonds
            >> gr.tf_group_by(X.cut)
            >> gr.tf_head(3)
            >> gr.tf_select(X.cut, X.x)
            >> gr.tf_ungroup()
        )

        # Ungrouped summarize
        res = sample >> gr.tf_summarize(m=gr.median(X.x))
        expected = pd.DataFrame({"m": [4.05]})
        self.assertTrue(res.equals(expected))

        # Summarize per cut
        by_cut = sample >> gr.tf_group_by(X.cut)
        res = by_cut >> gr.tf_summarize(m=gr.median(X.x))
        expected = pd.DataFrame(
            {
                "cut": ["Fair", "Good", "Ideal", "Premium", "Very Good"],
                "m": [6.27, 4.25, 3.95, 3.89, 3.95],
            }
        )
        self.assertTrue(res.equals(expected))

        # Ungrouped mutate
        res = sample >> gr.tf_mutate(m=gr.median(X.x))
        expected = sample.copy()
        expected["m"] = 4.05
        self.assertTrue(res.equals(expected))

        # TODO: the grouped-mutate case is not exercised here. The disabled
        # draft expected each row to carry the median of its own cut, i.e.
        # the per-cut values listed above repeated for the three rows of
        # each group.

        # Two rows per cut: groups with an even number of values
        pairs = (
            data.df_diamonds
            >> gr.tf_group_by(X.cut)
            >> gr.tf_head(2)
            >> gr.tf_select(X.cut, X.x)
        )
        res = pairs >> gr.tf_group_by(X.cut) >> gr.tf_summarize(m=gr.median(X.x))
        expected = pd.DataFrame(
            {
                "cut": ["Fair", "Good", "Ideal", "Premium", "Very Good"],
                "m": [5.160, 4.195, 3.940, 4.045, 3.945],
            }
        )
        err = (res.m - expected.m).abs()
        self.assertTrue((err < 0.000000001).all())
Example #7
0
    def test_select_between(self):
        """Check `columns_between` with symbolic, named and integer bounds.

        The first three checks expect `cut`, `color` and `clarity`; the
        last one mixes a name with the integer 20 and expects `x`, `y`
        and `z`.
        """
        df = data.df_diamonds[["cut", "color", "clarity"]]
        self.assertTrue(
            df.equals(data.df_diamonds >> gr.tf_select(
                gr.columns_between(X.cut, X.clarity))))
        self.assertTrue(
            df.equals(data.df_diamonds >> gr.tf_select(
                gr.columns_between("cut", "clarity"))))
        self.assertTrue(
            df.equals(
                data.df_diamonds >> gr.tf_select(gr.columns_between(1, 3))))

        df = data.df_diamonds[["x", "y", "z"]]
        # Use the TestCase assertion (not a bare `assert`) so the check is
        # reported like the others and is not stripped under `python -O`.
        self.assertTrue(
            df.equals(
                data.df_diamonds
                >> gr.tf_select(gr.columns_between("x", 20))))
Example #8
0
 def test_n(self):
     """Exercise `gr.n` inside summarize and mutate pipelines.

     Covers the explicit form `gr.n(X.x)` and the implicit form `gr.n()`
     on the first five rows of the diamonds data.
     """
     df = data.df_diamonds >> gr.tf_select(X.cut, X.x) >> gr.tf_head(5)
     # Expected per-row group sizes, one value per row of `df`. The
     # grouped summarize below fixes the counts (Good 2, Ideal 1,
     # Premium 2); the first row is taken to be the single Ideal row.
     group_sizes = [1, 2, 2, 2, 2]
     # straight summarize
     t = df >> gr.tf_summarize(n=gr.n(X.x))
     df_truth = pd.DataFrame({"n": [5]})
     self.assertTrue(t.equals(df_truth))
     # grouped summarize
     t = df >> gr.tf_group_by(X.cut) >> gr.tf_summarize(n=gr.n(X.x))
     df_truth = pd.DataFrame({
         "cut": ["Good", "Ideal", "Premium"],
         "n": [2, 1, 2]
     })
     self.assertTrue(t.equals(df_truth))
     # straight mutate
     t = df >> gr.tf_mutate(n=gr.n(X.x))
     df_truth = df.copy()
     df_truth["n"] = 5
     self.assertTrue(t.equals(df_truth))
     # grouped mutate
     t = df >> gr.tf_group_by(X.cut) >> gr.tf_mutate(n=gr.n(X.x))
     df_truth["n"] = pd.Series(group_sizes)
     self.assertTrue(t.sort_index().equals(df_truth))
     # Implicit mode summarize
     t = df >> gr.tf_summarize(n=gr.n())
     df_truth = pd.DataFrame({"n": [5]})
     self.assertTrue(t.equals(df_truth))
     # Implicit mode mutate
     t = df >> gr.tf_group_by(X.cut) >> gr.tf_mutate(n=gr.n())
     df_truth = df.copy()
     df_truth["n"] = pd.Series(group_sizes)
     self.assertTrue(t.sort_index().equals(df_truth))
Example #9
0
 def test_first(self):
     """Exercise `gr.first` inside summarize and mutate pipelines.

     Uses the first five rows of the diamonds data, including a
     summarize that orders the values with `order_by`.
     """
     df = data.df_diamonds >> gr.tf_select(X.cut, X.x) >> gr.tf_head(5)
     # straight summarize
     t = df >> gr.tf_summarize(f=gr.first(X.x))
     df_truth = pd.DataFrame({"f": [3.95]})
     self.assertTrue(t.equals(df_truth))
     # grouped summarize
     t = df >> gr.tf_group_by(X.cut) >> gr.tf_summarize(f=gr.first(X.x))
     df_truth = pd.DataFrame({
         "cut": ["Good", "Ideal", "Premium"],
         "f": [4.05, 3.95, 3.89]
     })
     self.assertTrue(t.equals(df_truth))
     # summarize with order_by
     # NOTE(review): two rows share the top-ranked cut ("Premium"), so the
     # expected 3.89 assumes ties keep their original row order -- confirm
     # against the ordering used by `gr.first`.
     t = df >> gr.tf_summarize(f=gr.first(X.x, order_by=gr.desc(X.cut)))
     df_truth = pd.DataFrame({"f": [3.89]})
     self.assertTrue(t.equals(df_truth))
     # straight mutate
     t = df >> gr.tf_mutate(f=gr.first(X.x))
     df_truth = df.copy()
     df_truth["f"] = df_truth.x.iloc[0]
     self.assertTrue(t.equals(df_truth))
     # grouped mutate
     t = df >> gr.tf_group_by(X.cut) >> gr.tf_mutate(f=gr.first(X.x))
     df_truth["f"] = pd.Series([3.95, 3.89, 4.05, 3.89, 4.05])
     self.assertTrue(t.sort_index().equals(df_truth))
Example #10
0
 def test_last(self):
     """Exercise `gr.last` inside summarize and mutate pipelines.

     Uses the first five rows of the diamonds data, including a
     summarize that orders the values with a two-key `order_by`.
     """
     df = data.df_diamonds >> gr.tf_select(X.cut, X.x) >> gr.tf_head(5)
     # straight summarize
     t = df >> gr.tf_summarize(l=gr.last(X.x))
     df_truth = pd.DataFrame({"l": [4.34]})
     self.assertTrue(t.equals(df_truth))
     # grouped summarize
     t = df >> gr.tf_group_by(X.cut) >> gr.tf_summarize(l=gr.last(X.x))
     df_truth = pd.DataFrame({
         "cut": ["Good", "Ideal", "Premium"],
         "l": [4.34, 3.95, 4.20]
     })
     self.assertTrue(t.equals(df_truth))
     # summarize with order_by
     t = df >> gr.tf_summarize(f=gr.last(
         X.x, order_by=[gr.desc(X.cut), gr.desc(X.x)]))
     df_truth = pd.DataFrame({"f": [4.05]})
     # Use the TestCase assertion (not a bare `assert`) so the check is
     # reported like the others and is not stripped under `python -O`.
     self.assertTrue(df_truth.equals(t))
     # straight mutate
     t = df >> gr.tf_mutate(l=gr.last(X.x))
     df_truth = df.copy()
     df_truth["l"] = df_truth.x.iloc[4]
     self.assertTrue(t.equals(df_truth))
     # grouped mutate
     t = df >> gr.tf_group_by(X.cut) >> gr.tf_mutate(l=gr.last(X.x))
     df_truth["l"] = pd.Series([3.95, 4.20, 4.34, 4.20, 4.34])
     self.assertTrue(t.sort_index().equals(df_truth))
Example #11
0
 def test_nth(self):
     """Exercise `gr.nth` inside summarize and mutate pipelines.

     Uses the first ten rows of the diamonds data and covers positive,
     negative and out-of-range positions as well as `order_by`.
     """
     sample = data.df_diamonds >> gr.tf_select(X.cut, X.x) >> gr.tf_head(10)
     by_cut = sample >> gr.tf_group_by(X.cut)

     # Ungrouped summarize, position 1
     res = sample >> gr.tf_summarize(second=gr.nth(X.x, 1))
     expected = pd.DataFrame({"second": [3.89]})
     self.assertTrue(res.equals(expected))

     # Summarize per cut, position 0
     res = by_cut >> gr.tf_summarize(first=gr.nth(X.x, 0))
     expected = pd.DataFrame({
         "cut": ["Fair", "Good", "Ideal", "Premium", "Very Good"],
         "first": [3.87, 4.05, 3.95, 3.89, 3.94],
     })
     self.assertTrue(res.equals(expected))

     # Summarize with an explicit two-key ordering, position -1
     ordering = [gr.desc(X.cut), gr.desc(X.x)]
     res = sample >> gr.tf_summarize(last=gr.nth(X.x, -1, order_by=ordering))
     expected = pd.DataFrame({"last": [3.87]})
     self.assertTrue(expected.equals(res))

     # Ungrouped mutate with a position beyond the data: NaN is expected
     res = sample >> gr.tf_mutate(out_of_range=gr.nth(X.x, 500))
     expected = sample.copy()
     expected["out_of_range"] = np.nan
     self.assertTrue(res.equals(expected))

     # Mutate per cut, position -2
     res = by_cut >> gr.tf_mutate(penultimate=gr.nth(X.x, -2))
     expected = sample.copy()
     expected["penultimate"] = pd.Series(
         [np.nan, 3.89, 4.05, 3.89, 4.05, 4.07, 4.07, 4.07, np.nan, 4.07])
     self.assertTrue(res.sort_index().equals(expected))
Example #12
0
    def test_kmeans(self):
        """Check that a fitted k-means model reproduces a reference cluster.

        A two-cluster model is fitted on the `x` and `y` columns of
        `self.df_cluster` and evaluated on the same data. The reference
        cluster (column `c`) and the predicted cluster (column
        `cluster_id`) that contain the row with the smallest `x` must hold
        the same points.
        """
        ## Fit routine creates usable model
        var = ["x", "y"]
        md_fit = fit.fit_kmeans(self.df_cluster, var=var, n_clusters=2)
        df_res = gr.eval_df(md_fit, self.df_cluster[var])

        ## Check correctness
        # Match clusters by min(x). The label is read by position with
        # `.iloc[0]` so the lookup does not depend on the index labels of
        # the filtered frame.
        id_true = (
            self.df_cluster >> gr.tf_filter(X.x == gr.colmin(X.x))
        ).c.iloc[0]
        id_res = (
            df_res >> gr.tf_filter(X.x == gr.colmin(X.x))
        ).cluster_id.iloc[0]

        df_res1 = (self.df_cluster >> gr.tf_filter(X.c == id_true) >>
                   gr.tf_select(X.x, X.y))
        df_res2 = (df_res >> gr.tf_filter(X.cluster_id == id_res) >>
                   gr.tf_select(X.x, X.y))

        self.assertTrue(gr.df_equal(df_res1, df_res2))
Example #13
0
    def test_desc(self):
        """Check `gr.desc` inside a pipeline and on plain pandas Series.

        On plain Series the result must equal a descending rank computed
        with `method="min"` for numeric, boolean and string values.
        """
        # Pipeline use: `desc` as an ordering key for `nth`. The result was
        # previously computed and discarded; the expected value is the one
        # `test_nth` asserts for this exact pipeline.
        df = data.df_diamonds >> gr.tf_select(X.cut, X.x) >> gr.tf_head(10)
        t = df >> gr.tf_summarize(last=gr.nth(
            X.x, -1, order_by=[gr.desc(X.cut), gr.desc(X.x)]))
        df_truth = pd.DataFrame({"last": [3.87]})
        self.assertTrue(df_truth.equals(t))

        series_num = pd.Series([4, 1, 3, 2])
        series_bool = pd.Series([True, False, True, False])
        series_str = pd.Series(["d", "a", "c", "b"])

        num_truth = series_num.rank(method="min", ascending=False)
        bool_truth = series_bool.rank(method="min", ascending=False)
        str_truth = series_str.rank(method="min", ascending=False)

        self.assertTrue(gr.desc(series_num).equals(num_truth))
        self.assertTrue(gr.desc(series_bool).equals(bool_truth))
        self.assertTrue(gr.desc(series_str).equals(str_truth))
Example #14
0
 def test_select(self):
     """Check the argument forms accepted by `gr.tf_select`.

     Every form below must return the `carat`, `cut` and `price`
     columns of the diamonds data.
     """
     expected = data.df_diamonds[["carat", "cut", "price"]]

     def picked(*columns):
         # Run the selection on the diamonds data with the given arguments
         return data.df_diamonds >> gr.tf_select(*columns)

     # Plain names, plain integers, and a mix of both
     self.assertTrue(expected.equals(picked("carat", "cut", "price")))
     self.assertTrue(expected.equals(picked(0, 1, 6)))
     self.assertTrue(expected.equals(picked(0, 1, "price")))

     # A list mixing an integer and a symbol, followed by a symbol
     self.assertTrue(expected.equals(picked([0, X.cut], X.price)))

     # Symbolic attribute and item access
     self.assertTrue(expected.equals(picked(X.carat, X["cut"], X.price)))
     self.assertTrue(expected.equals(picked(X[["carat", "cut", "price"]])))
     self.assertTrue(expected.equals(picked(X[["carat", "cut"]], X.price)))

     # Symbolic `iloc` / `loc` indexers
     self.assertTrue(expected.equals(picked(X.iloc[:, [0, 1, 6]])))
     self.assertTrue(
         expected.equals(picked([X.loc[:, ["carat", "cut", "price"]]])))
Example #15
0
 def test_max(self):
     """Exercise `gr.max` inside summarize and mutate pipelines.

     Uses the first five rows of the diamonds data, both ungrouped and
     grouped by cut.
     """
     sample = data.df_diamonds >> gr.tf_select(X.cut, X.x) >> gr.tf_head(5)
     by_cut = sample >> gr.tf_group_by(X.cut)

     # Ungrouped summarize
     res = sample >> gr.tf_summarize(m=gr.max(X.x))
     expected = pd.DataFrame({"m": [4.34]})
     self.assertTrue(res.equals(expected))

     # Summarize per cut
     res = by_cut >> gr.tf_summarize(m=gr.max(X.x))
     expected = pd.DataFrame({
         "cut": ["Good", "Ideal", "Premium"],
         "m": [4.34, 3.95, 4.20],
     })
     self.assertTrue(res.equals(expected))

     # Ungrouped mutate
     res = sample >> gr.tf_mutate(m=gr.max(X.x))
     expected = sample.copy()
     expected["m"] = 4.34
     self.assertTrue(res.equals(expected))

     # Mutate per cut; rows are restored to index order before comparing
     res = by_cut >> gr.tf_mutate(m=gr.max(X.x))
     expected["m"] = pd.Series([3.95, 4.20, 4.34, 4.20, 4.34])
     self.assertTrue(res.sort_index().equals(expected))
Example #16
0
 def test_IQR(self):
     """Exercise `gr.IQR` inside summarize and mutate pipelines.

     Uses the first five rows of the diamonds data. Grouped results are
     compared with an absolute tolerance, ungrouped results exactly.
     """
     df = data.df_diamonds >> gr.tf_select(X.cut, X.x) >> gr.tf_head(5)
     # straight summarize
     t = df >> gr.tf_summarize(i=gr.IQR(X.x))
     df_truth = pd.DataFrame({"i": [0.25]})
     self.assertTrue(t.equals(df_truth))
     # grouped summarize
     t = df >> gr.tf_group_by(X.cut) >> gr.tf_summarize(i=gr.IQR(X.x))
     df_truth = pd.DataFrame(
         {"cut": ["Good", "Ideal", "Premium"], "i": [0.145, 0.000, 0.155]}
     )
     test_vector = abs(t.i - df_truth.i)
     # Use the TestCase assertion (not a bare `assert`) so the check is
     # reported like the others and is not stripped under `python -O`.
     self.assertTrue(all(test_vector < 0.000000001))
     # straight mutate
     t = df >> gr.tf_mutate(i=gr.IQR(X.x))
     df_truth = df.copy()
     df_truth["i"] = 0.25
     self.assertTrue(t.equals(df_truth))
     # grouped mutate
     t = df >> gr.tf_group_by(X.cut) >> gr.tf_mutate(i=gr.IQR(X.x))
     df_truth["i"] = pd.Series([0.000, 0.155, 0.145, 0.155, 0.145])
     test_vector = abs(t.i - df_truth.i)
     self.assertTrue(all(test_vector < 0.000000001))
Example #17
0
 def test_select_endswith(self):
     """Check that `ends_with("e")` selects `table` and `price`."""
     df = data.df_diamonds[["table", "price"]]
     # TestCase assertion instead of a bare `assert`, matching the other
     # tests and surviving `python -O`.
     self.assertTrue(
         df.equals(data.df_diamonds >> gr.tf_select(gr.ends_with("e"))))
Example #18
0
 def test_select_inversion(self):
     """Check that `~`-negated columns are dropped from the selection.

     Negating `carat`, `cut` and `color` must leave the columns from
     position 3 onward.
     """
     expected = data.df_diamonds.iloc[:, 3:]
     remaining = data.df_diamonds >> gr.tf_select(
         ~X.carat, ~X.cut, ~X.color)
     self.assertTrue(expected.equals(remaining))
Example #19
0
 def test_select_containing(self):
     """Check that `contains("c")` selects every column with a "c"."""
     df = data.df_diamonds[["carat", "cut", "color", "clarity", "price"]]
     # TestCase assertion instead of a bare `assert`, matching the other
     # tests and surviving `python -O`.
     self.assertTrue(
         df.equals(data.df_diamonds >> gr.tf_select(gr.contains("c"))))
Example #20
0
 def test_select_matches(self):
     """Check that `matches` selects columns by regular expression.

     The pattern `^c[auol]|pri` must pick `carat`, `cut`, `color`,
     `clarity` and `price`.
     """
     df = data.df_diamonds[["carat", "cut", "color", "clarity", "price"]]
     # TestCase assertion instead of a bare `assert`, matching the other
     # tests and surviving `python -O`.
     self.assertTrue(
         df.equals(
             data.df_diamonds >> gr.tf_select(gr.matches("^c[auol]|pri"))))
Example #21
0
 def test_select_startswith(self):
     """Check that `starts_with("c")` selects the columns starting with "c"."""
     df = data.df_diamonds[["carat", "cut", "color", "clarity"]]
     # TestCase assertion instead of a bare `assert`, matching the other
     # tests and surviving `python -O`.
     self.assertTrue(
         df.equals(data.df_diamonds >> gr.tf_select(gr.starts_with("c"))))