Example #1
0
 def test_last(self):
     """Check gr.last() inside tf_summarize() and tf_mutate().

     Runs ungrouped and grouped pipelines on the first five diamond rows,
     plus a summarize that passes an explicit ``order_by``.
     """
     df = data.df_diamonds >> gr.tf_select(X.cut, X.x) >> gr.tf_head(5)
     # straight summarize
     t = df >> gr.tf_summarize(l=gr.last(X.x))
     df_truth = pd.DataFrame({"l": [4.34]})
     self.assertTrue(t.equals(df_truth))
     # grouped summarize
     t = df >> gr.tf_group_by(X.cut) >> gr.tf_summarize(l=gr.last(X.x))
     df_truth = pd.DataFrame({
         "cut": ["Good", "Ideal", "Premium"],
         "l": [4.34, 3.95, 4.20]
     })
     self.assertTrue(t.equals(df_truth))
     # summarize with order_by
     t = df >> gr.tf_summarize(f=gr.last(
         X.x, order_by=[gr.desc(X.cut), gr.desc(X.x)]))
     df_truth = pd.DataFrame({"f": [4.05]})
     # unittest assertion rather than a bare `assert`, which `python -O` strips
     self.assertTrue(df_truth.equals(t))
     # straight mutate
     t = df >> gr.tf_mutate(l=gr.last(X.x))
     df_truth = df.copy()
     df_truth["l"] = df_truth.x.iloc[4]
     self.assertTrue(t.equals(df_truth))
     # grouped mutate
     t = df >> gr.tf_group_by(X.cut) >> gr.tf_mutate(l=gr.last(X.x))
     df_truth["l"] = pd.Series([3.95, 4.20, 4.34, 4.20, 4.34])
     # sort_index() restores the original row order before comparing
     self.assertTrue(t.sort_index().equals(df_truth))
Example #2
0
 def test_n(self):
     """Check gr.n() inside tf_summarize() and tf_mutate().

     Covers the explicit form ``gr.n(X.x)`` and the implicit form ``gr.n()``,
     ungrouped and grouped, on the first five diamond rows.
     """
     df = data.df_diamonds >> gr.tf_select(X.cut, X.x) >> gr.tf_head(5)
     # straight summarize
     t = df >> gr.tf_summarize(n=gr.n(X.x))
     df_truth = pd.DataFrame({"n": [5]})
     self.assertTrue(t.equals(df_truth))
     # grouped summarize
     t = df >> gr.tf_group_by(X.cut) >> gr.tf_summarize(n=gr.n(X.x))
     df_truth = pd.DataFrame({
         "cut": ["Good", "Ideal", "Premium"],
         "n": [2, 1, 2]
     })
     self.assertTrue(t.equals(df_truth))
     # straight mutate
     t = df >> gr.tf_mutate(n=gr.n(X.x))
     df_truth = df.copy()
     df_truth["n"] = 5
     self.assertTrue(t.equals(df_truth))
     # grouped mutate
     t = df >> gr.tf_group_by(X.cut) >> gr.tf_mutate(n=gr.n(X.x))
     # One expected count per row: the frame has exactly five rows, so a
     # sixth value would only be discarded by index alignment on assignment.
     df_truth["n"] = pd.Series([1, 2, 2, 2, 2])
     self.assertTrue(t.sort_index().equals(df_truth))
     # Implicit mode summarize
     t = df >> gr.tf_summarize(n=gr.n())
     df_truth = pd.DataFrame({"n": [5]})
     self.assertTrue(t.equals(df_truth))
     # Implicit mode mutate
     t = df >> gr.tf_group_by(X.cut) >> gr.tf_mutate(n=gr.n())
     df_truth = df.copy()
     df_truth["n"] = pd.Series([1, 2, 2, 2, 2])
     self.assertTrue(t.sort_index().equals(df_truth))
Example #3
0
 def test_first(self):
     """Check gr.first() inside tf_summarize() and tf_mutate().

     Runs ungrouped and grouped pipelines on the first five diamond rows,
     plus a summarize that passes an explicit ``order_by``.
     """
     df = data.df_diamonds >> gr.tf_select(X.cut, X.x) >> gr.tf_head(5)
     # straight summarize
     t = df >> gr.tf_summarize(f=gr.first(X.x))
     df_truth = pd.DataFrame({"f": [3.95]})
     self.assertTrue(t.equals(df_truth))
     # grouped summarize
     t = df >> gr.tf_group_by(X.cut) >> gr.tf_summarize(f=gr.first(X.x))
     df_truth = pd.DataFrame({
         "cut": ["Good", "Ideal", "Premium"],
         "f": [4.05, 3.95, 3.89]
     })
     self.assertTrue(t.equals(df_truth))
     # summarize with order_by
     t = df >> gr.tf_summarize(f=gr.first(X.x, order_by=gr.desc(X.cut)))
     df_truth = pd.DataFrame({"f": [3.89]})
     # This case previously built both frames without comparing them.
     # NOTE(review): two rows tie on `cut`; the expected 3.89 assumes the
     # ordering keeps tied rows in their original order -- confirm.
     self.assertTrue(t.equals(df_truth))
     # straight mutate
     t = df >> gr.tf_mutate(f=gr.first(X.x))
     df_truth = df.copy()
     df_truth["f"] = df_truth.x.iloc[0]
     self.assertTrue(t.equals(df_truth))
     # grouped mutate
     t = df >> gr.tf_group_by(X.cut) >> gr.tf_mutate(f=gr.first(X.x))
     df_truth["f"] = pd.Series([3.95, 3.89, 4.05, 3.89, 4.05])
     # sort_index() restores the original row order before comparing
     self.assertTrue(t.sort_index().equals(df_truth))
Example #4
0
 def test_nth(self):
     """Exercise gr.nth() in summarize and mutate pipelines over ten diamond rows."""
     df = data.df_diamonds >> gr.tf_select(X.cut, X.x) >> gr.tf_head(10)

     # Ungrouped summarize, position 1
     result = df >> gr.tf_summarize(second=gr.nth(X.x, 1))
     expected = pd.DataFrame({"second": [3.89]})
     self.assertTrue(result.equals(expected))

     # Grouped summarize, position 0 within each cut
     grouped = df >> gr.tf_group_by(X.cut)
     result = grouped >> gr.tf_summarize(first=gr.nth(X.x, 0))
     expected = pd.DataFrame({
         "cut": ["Fair", "Good", "Ideal", "Premium", "Very Good"],
         "first": [3.87, 4.05, 3.95, 3.89, 3.94],
     })
     self.assertTrue(result.equals(expected))

     # Ungrouped summarize with an explicit ordering, position -1
     ordering = [gr.desc(X.cut), gr.desc(X.x)]
     result = df >> gr.tf_summarize(last=gr.nth(X.x, -1, order_by=ordering))
     expected = pd.DataFrame({"last": [3.87]})
     self.assertTrue(expected.equals(result))

     # Ungrouped mutate with a position beyond the data: NaN is expected
     result = df >> gr.tf_mutate(out_of_range=gr.nth(X.x, 500))
     expected = df.copy()
     expected["out_of_range"] = np.nan
     self.assertTrue(result.equals(expected))

     # Grouped mutate, position -2 within each cut
     grouped = df >> gr.tf_group_by(X.cut)
     result = grouped >> gr.tf_mutate(penultimate=gr.nth(X.x, -2))
     expected = df.copy()
     expected["penultimate"] = pd.Series(
         [np.nan, 3.89, 4.05, 3.89, 4.05, 4.07, 4.07, 4.07, np.nan, 4.07])
     # sort_index() restores the original row order before comparing
     self.assertTrue(result.sort_index().equals(expected))
Example #5
0
    def test_plot_xbs(self):
        r"""Smoke test: building the Xbar and S chart does not raise."""
        ## Group on the row index floor-divided by 10
        df_grouped = data.df_shewhart >> gr.tf_mutate(idx=DF.index // 10)
        df_grouped >> gr.pt_xbs(group="idx", var="tensile_strength")

        ## Same grouping, passed through gr.as_factor() first
        df_factor = data.df_shewhart >> gr.tf_mutate(
            idx=gr.as_factor(DF.index // 10))
        df_factor >> gr.pt_xbs(group="idx", var="tensile_strength")
Example #6
0
 def test_sd(self):
     """Check gr.sd() inside tf_summarize() and tf_mutate().

     Uses the first three diamond rows of every cut for the main checks and
     the first row of every cut for the single-value (NaN) case.
     """
     df = (
         data.df_diamonds
         >> gr.tf_group_by(X.cut)
         >> gr.tf_head(3)
         >> gr.tf_select(X.cut, X.x)
         >> gr.tf_ungroup()
     )
     # straight summarize
     t = df >> gr.tf_summarize(s=gr.sd(X.x))
     df_truth = pd.DataFrame({"s": [0.829091]})
     test_vector = abs(t.s - df_truth.s)
     self.assertTrue(all(test_vector < 0.00001))
     # grouped summarize
     t = df >> gr.tf_group_by(X.cut) >> gr.tf_summarize(s=gr.sd(X.x))
     df_truth = pd.DataFrame(
         {
             "cut": ["Fair", "Good", "Ideal", "Premium", "Very Good"],
             "s": [1.440417, 0.148436, 0.236925, 0.181934, 0.072342],
         }
     )
     test_vector = abs(t.s - df_truth.s)
     self.assertTrue(all(test_vector < 0.00001))
     # straight mutate
     t = df >> gr.tf_mutate(s=gr.sd(X.x))
     df_truth = df.copy()
     df_truth["s"] = 0.829091
     test_vector = abs(t.s - df_truth.s)
     self.assertTrue(all(test_vector < 0.00001))
     # grouped mutate
     t = df >> gr.tf_group_by(X.cut) >> gr.tf_mutate(s=gr.sd(X.x))
     # Recompute the comparison for this section; previously the assertion
     # re-used `test_vector` from the straight-mutate case, so the grouped
     # result was never checked. The expectation is looked up from each row's
     # own cut, so it does not depend on the row order of `t`.
     # NOTE(review): assumes the grouped mutate keeps df's row index so the
     # subtraction aligns row-for-row -- confirm.
     sd_by_cut = {
         "Fair": 1.440417,
         "Good": 0.148436,
         "Ideal": 0.236925,
         "Premium": 0.181934,
         "Very Good": 0.072342,
     }
     s_truth = df.cut.map(sd_by_cut)
     test_vector = abs(t.s - s_truth)
     self.assertTrue(all(test_vector < 0.00001))
     # test with single value (var undefined)
     df = (
         data.df_diamonds
         >> gr.tf_group_by(X.cut)
         >> gr.tf_head(1)
         >> gr.tf_select(X.cut, X.x)
     )
     t = df >> gr.tf_group_by(X.cut) >> gr.tf_summarize(s=gr.sd(X.x))
     df_truth = pd.DataFrame(
         {
             "cut": ["Fair", "Good", "Ideal", "Premium", "Very Good"],
             "s": [np.nan, np.nan, np.nan, np.nan, np.nan],
         }
     )
     self.assertTrue(t.equals(df_truth))
Example #7
0
 def test_mutate(self):
     """tf_mutate() matches direct column assignment on the diamonds frame."""
     expected = data.df_diamonds.copy()

     # Constant value
     expected["testcol"] = 1
     result = data.df_diamonds >> gr.tf_mutate(testcol=1)
     self.assertTrue(expected.equals(result))

     # Copy of an existing column
     expected["testcol"] = expected["x"]
     result = data.df_diamonds >> gr.tf_mutate(testcol=X.x)
     self.assertTrue(expected.equals(result))

     # Product of two columns
     expected["testcol"] = expected["x"] * expected["y"]
     result = data.df_diamonds >> gr.tf_mutate(testcol=X.x * X.y)
     self.assertTrue(expected.equals(result))

     # Scalar reduction applied to a column
     expected["testcol"] = expected["x"].mean()
     result = data.df_diamonds >> gr.tf_mutate(testcol=np.mean(X.x))
     self.assertTrue(expected.equals(result))
Example #8
0
    def test_if_else(self):
        """gr.if_else() inside tf_mutate() with scalar and with list branches."""
        # Scalar branches: label each value by parity
        df_parity = pd.DataFrame({"a": [1, 2, 3, 4, 5, 6, 7, 8, 9]})
        labels = ["odd", "even"] * 4 + ["odd"]
        result = df_parity >> gr.tf_mutate(
            b=gr.if_else(X.a % 2 == 0, "even", "odd"))
        self.assertTrue(result.equals(df_parity.assign(b=labels)))

        # List branches: one candidate value per row on either side
        df_steps = pd.DataFrame({"a": [0, 0, 0, 1, 1, 1, 2, 2, 2]})
        picked = [5] * 6 + [9] * 3
        result = df_steps >> gr.tf_mutate(
            b=gr.if_else(X.a < 2, [5] * 9, [9] * 9))
        self.assertTrue(result.equals(df_steps.assign(b=picked)))
    def test_stratum_min(self):
        """gr.stratum_min() reproduces the reference column and rejects mismatched inputs."""
        df_test = gr.df_make(
            x=[1, 2, 0, 1, 2, 0, 1, 2],
            y=[0, 0, 1, 1, 1, 2, 2, 2],
            p=[1, 2, 1, 2, 3, 2, 3, 4],
        )

        # Accuracy: the computed column must equal `p` on every row
        df_computed = df_test >> gr.tf_mutate(p_comp=gr.stratum_min(X.x, X.y))
        df_flagged = df_computed >> gr.tf_mutate(flag=X.p == X.p_comp)
        self.assertTrue(df_flagged.flag.all())

        # Inputs of different lengths must raise ValueError
        with self.assertRaises(ValueError):
            gr.stratum_min([1], [1, 2, 3])
Example #10
0
 def test_na_if(self):
     """gr.na_if() blanks one or several target values inside tf_mutate()."""
     df = pd.DataFrame({"a": [1, 2, 3, 4, 5]})
     result = df >> gr.tf_mutate(
         b=gr.na_if(X.a, 3),
         c=gr.na_if(X.a, 1, 2, 3),
     )
     # Fix the column order before the exact comparison
     result = result[["a", "b", "c"]]
     expected = df.assign(
         b=[1, 2, np.nan, 4, 5],
         c=[np.nan, np.nan, np.nan, 4, 5],
     )
     self.assertTrue(expected.equals(result))
Example #11
0
    def test_mean_ci(self):
        """Check gr.mean_lo() / gr.mean_up() against a hand-computed interval.

        The reference bounds are ``0 -/+ (-norm.ppf(0.005)) * 1 / sqrt(5)``
        for a five-point sample with mean 0 and standard deviation 1.
        """
        # Basic functionality
        y = pd.Series([-1, -1, 0, +1, +1])  # sd == 1
        lo_true = 0 - (-norm.ppf(0.005)) * 1 / np.sqrt(5)
        up_true = 0 + (-norm.ppf(0.005)) * 1 / np.sqrt(5)

        # abs() makes every tolerance check two-sided; a signed difference
        # only bounds the error in one direction.
        self.assertTrue(abs(lo_true - gr.mean_lo(y, alpha=0.005)) < 1e-6)
        self.assertTrue(abs(up_true - gr.mean_up(y, alpha=0.005)) < 1e-6)

        # Grouped functionality
        # NOTE(review): no alpha is passed below, so the default level is
        # compared against the 0.005 reference -- confirm the default.
        df = (gr.df_grid(
            y=[-1, -1, 0, +1, +1],
            x=[0, 1],
        ) >> gr.tf_mutate(y=X.y + X.x) >> gr.tf_group_by(X.x) >>
              gr.tf_summarize(
                  mean_lo=gr.mean_lo(X.y),
                  mean_up=gr.mean_up(X.y),
              ))

        self.assertTrue(abs(df[df.x == 0].mean_lo.values[0] - lo_true) < 1e-6)
        self.assertTrue(abs(df[df.x == 0].mean_up.values[0] - up_true) < 1e-6)

        # The x == 1 group is the same sample shifted by +1
        self.assertTrue(abs(df[df.x == 1].mean_lo.values[0] -
                            (lo_true + 1)) < 1e-6)
        self.assertTrue(abs(df[df.x == 1].mean_up.values[0] -
                            (up_true + 1)) < 1e-6)
Example #12
0
 def test_coalesce(self):
     """gr.coalesce() takes the first non-NaN value across columns, row by row."""
     nan = np.nan
     df = pd.DataFrame({
         "a": [1, nan, nan, nan, nan],
         "b": [2, 3, nan, nan, nan],
         "c": [nan, nan, 4, 5, nan],
         "d": [6, 7, 8, 9, nan],
     })
     expected = df.assign(coal=[1, 3, 4, 5, nan])
     result = df >> gr.tf_mutate(coal=gr.coalesce(X.a, X.b, X.c, X.d))
     self.assertTrue(expected.equals(result))
Example #13
0
    def test_var(self):
        """Check gr.var() in summarize and ungrouped mutate, plus the one-row-per-group case."""
        tol = 0.00001
        cuts = ["Fair", "Good", "Ideal", "Premium", "Very Good"]

        # First three rows of every cut, with grouping removed afterwards
        df = (
            data.df_diamonds
            >> gr.tf_group_by(X.cut)
            >> gr.tf_head(3)
            >> gr.tf_select(X.cut, X.x)
            >> gr.tf_ungroup()
        )

        # Ungrouped summarize
        result = df >> gr.tf_summarize(v=gr.var(X.x))
        expected = pd.DataFrame({"v": [0.687392]})
        self.assertTrue(all(abs(result.v - expected.v) < tol))

        # Grouped summarize
        result = df >> gr.tf_group_by(X.cut) >> gr.tf_summarize(v=gr.var(X.x))
        expected = pd.DataFrame({
            "cut": cuts,
            "v": [2.074800, 0.022033, 0.056133, 0.033100, 0.005233],
        })
        self.assertTrue(all(abs(result.v - expected.v) < tol))

        # Ungrouped mutate
        result = df >> gr.tf_mutate(v=gr.var(X.x))
        expected = df.copy()
        expected["v"] = 0.687392
        self.assertTrue(all(abs(result.v - expected.v) < tol))

        # NOTE: grouped mutate is not exercised here; its expected per-row
        # values would repeat the grouped-summarize variances above, three
        # rows per cut.

        # One row per cut: the expected variance is NaN for every group
        df = (
            data.df_diamonds
            >> gr.tf_group_by(X.cut)
            >> gr.tf_head(1)
            >> gr.tf_select(X.cut, X.x)
        )
        result = df >> gr.tf_group_by(X.cut) >> gr.tf_summarize(v=gr.var(X.x))
        expected = pd.DataFrame({
            "cut": cuts,
            "v": [np.nan, np.nan, np.nan, np.nan, np.nan],
        })
        self.assertTrue(result.equals(expected))
Example #14
0
 def test_max(self):
     """Check gr.max() in summarize and mutate pipelines over five diamond rows."""
     df = data.df_diamonds >> gr.tf_select(X.cut, X.x) >> gr.tf_head(5)

     # Ungrouped summarize
     result = df >> gr.tf_summarize(m=gr.max(X.x))
     expected = pd.DataFrame({"m": [4.34]})
     self.assertTrue(result.equals(expected))

     # Grouped summarize
     grouped = df >> gr.tf_group_by(X.cut)
     result = grouped >> gr.tf_summarize(m=gr.max(X.x))
     expected = pd.DataFrame({
         "cut": ["Good", "Ideal", "Premium"],
         "m": [4.34, 3.95, 4.20],
     })
     self.assertTrue(result.equals(expected))

     # Ungrouped mutate
     result = df >> gr.tf_mutate(m=gr.max(X.x))
     expected = df.copy()
     expected["m"] = 4.34
     self.assertTrue(result.equals(expected))

     # Grouped mutate; sort_index() restores the original row order
     grouped = df >> gr.tf_group_by(X.cut)
     result = grouped >> gr.tf_mutate(m=gr.max(X.x))
     expected["m"] = pd.Series([3.95, 4.20, 4.34, 4.20, 4.34])
     self.assertTrue(result.sort_index().equals(expected))
Example #15
0
    def test_tran_polyridge(self):
        """Test the functionality and correctness of tran_polyridge()

        Checks the argument validation, the direction recovered for
        ``f = x - y`` on a three-variable grid, and that a
        two-dimensional, degree-two fit runs.
        """
        ## Setup
        df_test = (gr.df_make(x=range(10)) >> gr.tf_outer(
            gr.df_make(y=range(10))) >> gr.tf_outer(gr.df_make(z=range(10))) >>
                   gr.tf_mutate(f=DF.x - DF.y))

        ## Assertions
        # No `out` column
        with self.assertRaises(ValueError):
            gr.tran_polyridge(df_test)
        # Unrecognized `out` column
        with self.assertRaises(ValueError):
            gr.tran_polyridge(df_test, out="foo")
        # Unrecognized `var` column(s); a valid `out` is supplied so this call
        # differs from the missing-`out` case above only in `var`.
        # NOTE(review): assumes tran_polyridge() validates `var` against the
        # DataFrame columns, as the case label states -- confirm.
        with self.assertRaises(ValueError):
            gr.tran_polyridge(df_test, out="f", var=["foo", "bar"])
        # Invalid degree
        with self.assertRaises(ValueError):
            gr.tran_polyridge(df_test, out="f", n_degree=1, n_dim=2)

        ## Correctness
        df_res = (df_test >> gr.tf_polyridge(
            out="f",
            n_dim=1,
            n_degree=1,
        ))
        # Expected weights: the unit vector along (1, -1, 0)
        df_true = gr.df_make(x=1 / gr.sqrt(2), y=-1 / gr.sqrt(2), z=0)

        self.assertTrue(gr.df_equal(df_res, df_true, close=True))

        ## Higher-dimensional functionality
        # Smoke test only: the call must complete, its result is not inspected
        df_higher = (gr.df_grid(
            x=range(10),
            y=range(10),
            z=range(10),
        ) >> gr.tf_mutate(f=DF.x + DF.y + DF.z))
        gr.tran_polyridge(df_higher, out="f", n_degree=2, n_dim=2)
Example #16
0
 def test_case_when(self):
     """gr.case_when() reproduces a fizzbuzz labelling of 0..30."""

     def fizzbuzz(i):
         # Reference labelling, tested in the same priority order as the
         # case_when() branches below
         if i % 15 == 0:
             return "fizzbuzz"
         if i % 3 == 0:
             return "fizz"
         if i % 5 == 0:
             return "buzz"
         return str(i)

     df = pd.DataFrame({"num": np.arange(31)})
     expected = df.assign(strnum=[fizzbuzz(i) for i in np.arange(31)])
     result = df >> gr.tf_mutate(strnum=gr.case_when(
         [X.num % 15 == 0, "fizzbuzz"],
         [X.num % 3 == 0, "fizz"],
         [X.num % 5 == 0, "buzz"],
         [True, X.num.astype(str)],
     ))
     self.assertTrue(expected.equals(result))
Example #17
0
 def test_IQR(self):
     """Check gr.IQR() inside tf_summarize() and tf_mutate().

     Ungrouped results are compared exactly; grouped results are compared
     within an absolute tolerance of 1e-9.
     """
     df = data.df_diamonds >> gr.tf_select(X.cut, X.x) >> gr.tf_head(5)
     # straight summarize
     t = df >> gr.tf_summarize(i=gr.IQR(X.x))
     df_truth = pd.DataFrame({"i": [0.25]})
     self.assertTrue(t.equals(df_truth))
     # grouped summarize
     t = df >> gr.tf_group_by(X.cut) >> gr.tf_summarize(i=gr.IQR(X.x))
     df_truth = pd.DataFrame(
         {"cut": ["Good", "Ideal", "Premium"], "i": [0.145, 0.000, 0.155]}
     )
     test_vector = abs(t.i - df_truth.i)
     # unittest assertion rather than a bare `assert`, which `python -O` strips
     self.assertTrue(all(test_vector < 0.000000001))
     # straight mutate
     t = df >> gr.tf_mutate(i=gr.IQR(X.x))
     df_truth = df.copy()
     df_truth["i"] = 0.25
     self.assertTrue(t.equals(df_truth))
     # grouped mutate
     t = df >> gr.tf_group_by(X.cut) >> gr.tf_mutate(i=gr.IQR(X.x))
     df_truth["i"] = pd.Series([0.000, 0.155, 0.145, 0.155, 0.145])
     test_vector = abs(t.i - df_truth.i)
     self.assertTrue(all(test_vector < 0.000000001))
Example #18
0
    def test_median(self):
        """Check gr.median() in summarize and ungrouped mutate, for odd and even group sizes."""
        cuts = ["Fair", "Good", "Ideal", "Premium", "Very Good"]

        # First three rows of every cut, with grouping removed afterwards
        df = (
            data.df_diamonds
            >> gr.tf_group_by(X.cut)
            >> gr.tf_head(3)
            >> gr.tf_select(X.cut, X.x)
            >> gr.tf_ungroup()
        )

        # Ungrouped summarize
        result = df >> gr.tf_summarize(m=gr.median(X.x))
        expected = pd.DataFrame({"m": [4.05]})
        self.assertTrue(result.equals(expected))

        # Grouped summarize
        result = df >> gr.tf_group_by(X.cut) >> gr.tf_summarize(m=gr.median(X.x))
        expected = pd.DataFrame({
            "cut": cuts,
            "m": [6.27, 4.25, 3.95, 3.89, 3.95],
        })
        self.assertTrue(result.equals(expected))

        # Ungrouped mutate
        result = df >> gr.tf_mutate(m=gr.median(X.x))
        expected = df.copy()
        expected["m"] = 4.05
        self.assertTrue(result.equals(expected))

        # NOTE: grouped mutate is not exercised here; its expected per-row
        # values would repeat the grouped-summarize medians above, three
        # rows per cut.

        # Two rows per cut: even group sizes, compared within a tolerance
        df = (
            data.df_diamonds
            >> gr.tf_group_by(X.cut)
            >> gr.tf_head(2)
            >> gr.tf_select(X.cut, X.x)
        )
        result = df >> gr.tf_group_by(X.cut) >> gr.tf_summarize(m=gr.median(X.x))
        expected = pd.DataFrame({
            "cut": cuts,
            "m": [5.160, 4.195, 3.940, 4.045, 3.945],
        })
        deviation = abs(result.m - expected.m)
        self.assertTrue(all(deviation < 0.000000001))
Example #19
0
 def test_n_distinct(self):
     """Check gr.n_distinct() in summarize and mutate pipelines on a small frame."""
     df = pd.DataFrame({
         "col_1": ["a", "a", "a", "b", "b", "b", "c", "c"],
         "col_2": [1, 1, 1, 2, 3, 3, 4, 5],
     })

     # Ungrouped summarize
     result = df >> gr.tf_summarize(n=gr.n_distinct(X.col_2))
     expected = pd.DataFrame({"n": [5]})
     self.assertTrue(result.equals(expected))

     # Grouped summarize
     grouped = df >> gr.tf_group_by(X.col_1)
     result = grouped >> gr.tf_summarize(n=gr.n_distinct(X.col_2))
     expected = pd.DataFrame({"col_1": ["a", "b", "c"], "n": [1, 2, 2]})
     self.assertTrue(result.equals(expected))

     # Ungrouped mutate
     result = df >> gr.tf_mutate(n=gr.n_distinct(X.col_2))
     expected = df.copy()
     expected["n"] = 5
     self.assertTrue(result.equals(expected))

     # Grouped mutate; compared without re-sorting the rows
     grouped = df >> gr.tf_group_by(X.col_1)
     result = grouped >> gr.tf_mutate(n=gr.n_distinct(X.col_2))
     expected["n"] = pd.Series([1, 1, 1, 2, 2, 2, 2, 2])
     self.assertTrue(result.equals(expected))
Example #20
0
    def test_fit_polyridge(self):
        """Test the functionality and correctness of fit_polyridge()
        """
        # Full grid over x, y, z with the response f = x - y
        df_xy = gr.df_make(x=range(10)) >> gr.tf_outer(gr.df_make(y=range(10)))
        df_xyz = df_xy >> gr.tf_outer(gr.df_make(z=range(10)))
        df_train = df_xyz >> gr.tf_mutate(f=DF.x - DF.y)

        md_ridge = gr.fit_polyridge(df_train, out="f", n_degree=1, n_dim=1)

        # Evaluate the fitted model at two points and compare with x - y
        df_pred = gr.eval_df(md_ridge, df=gr.df_make(x=[2, 1], y=[1], z=[0]))
        df_expected = gr.df_make(x=[2, 1], y=[1], z=[0], f_mean=[1, 0])

        self.assertTrue(gr.df_equal(df_pred, df_expected, close=True))
Example #21
0
    def test_iocorr(self):
        """tf_iocorr() gives rho of +1 and -1 for y = 0.5 x and z = -0.5 x."""
        df_linear = gr.df_make(x=[1., 2., 3., 4.]) >> gr.tf_mutate(
            y=+0.5 * DF.x,
            z=-0.5 * DF.x,
        )
        df_rho = df_linear >> gr.tf_iocorr(var=["x"], out=["y", "z"])

        df_expected = gr.df_make(
            var=["x", "x"],
            out=["y", "z"],
            rho=[1.0, -1.0],
        )

        ## Check for correct values
        self.assertTrue(gr.df_equal(df_rho, df_expected))
Example #22
0
    def test_corr(self):
        """gr.corr() on exactly linear data, and its handling of NaN entries."""
        tol = 1e-6

        df_data = gr.df_make(x=[1., 2., 3., 4.])
        df_data["y"] = 0.5 * df_data.x
        df_data["z"] = -0.5 * df_data.x

        # Positive and negative slopes: correlation of +1 and -1
        rho_xy = gr.corr(df_data.x, df_data.y)
        self.assertTrue(abs(rho_xy - 1.0) < tol)
        rho_xz = gr.corr(df_data.x, df_data.z)
        self.assertTrue(abs(rho_xz + 1.0) < tol)

        ## Test NaN handling
        df_nan = df_data >> gr.tf_mutate(
            x=gr.if_else(X.x == 1, gr.NaN, X.x),
            y=gr.if_else(X.x == 4, gr.NaN, X.y),
        )

        # NaN entries raise unless nan_drop=True is passed
        with self.assertRaises(ValueError):
            gr.corr(df_nan.x, df_nan.y)
        rho_dropped = gr.corr(df_nan.x, df_nan.y, nan_drop=True)
        self.assertTrue(abs(rho_dropped - 1.0) < tol)
Example #23
0
    def test_tran_reweight(self):
        """Test the functionality of tran_reweight()

        Checks the reweighted estimate against zero, compares its standard
        error with the unweighted one, and exercises three error paths.
        """
        ## Correctness
        # Choose scale based on Owen (2013) Exercise 9.7
        marginal_new = dict(dist="norm", loc=0, scale=sqrt(4 / 5))
        md_new = self.md >> gr.cp_marginals(x=marginal_new)

        df_base = self.md >> gr.ev_sample(n=500, df_det="nom", seed=101)

        df_weighted = df_base >> gr.tf_reweight(md_base=self.md, md_new=md_new)
        df = df_weighted >> gr.tf_summarize(
            mu=gr.mean(DF.y * DF.weight),
            se=gr.sd(DF.y * DF.weight) / gr.sqrt(gr.n(DF.weight)),
            se_orig=gr.sd(DF.y) / gr.sqrt(gr.n(DF.weight)),
        )
        mu = df.mu[0]
        se = df.se[0]
        se_orig = df.se_orig[0]

        # Zero must lie strictly within two standard errors of the estimate
        self.assertTrue(mu - 2 * se < 0 < mu + 2 * se)

        ## Optimized IS should be more precise than ordinary monte carlo
        self.assertTrue(se < se_orig)

        ## Invariants
        # Missing input in data
        with self.assertRaises(ValueError):
            gr.tran_reweight(df_base[["y"]], md_base=self.md, md_new=self.md)
        # Input mismatch
        with self.assertRaises(ValueError):
            gr.tran_reweight(df_base, md_base=self.md, md_new=gr.Model())
        # Weights collision
        with self.assertRaises(ValueError):
            gr.tran_reweight(
                df_base >> gr.tf_mutate(weight=0),
                md_base=self.md,
                md_new=self.md,
            )
Example #24
0
 def test_group_mutate(self):
     """Grouped tf_mutate() agrees with pandas groupby().apply() of the helper."""
     # Reference result built with plain pandas
     expected = data.df_diamonds.copy()
     expected = expected.groupby("cut").apply(group_mutate_helper)

     # Same computation through the pipeline verbs
     grouped = data.df_diamonds >> gr.tf_group_by("cut")
     mutated = grouped >> gr.tf_mutate(testcol=X.x * X.shape[0])
     result = mutated >> gr.tf_ungroup()

     self.assertTrue(expected.equals(result.sort_index()))
Example #25
0
 def test_linspace(self):
     """Smoke test: gr.linspace() and gr.logspace() evaluate inside tf_mutate()."""
     # Works in pipeline
     df_rows = gr.df_make(i=range(10))
     df_rows >> gr.tf_mutate(
         x=gr.linspace(0, 1, gr.n(X.index)),
         l=gr.logspace(0, 1, gr.n(X.index)),
     )
Example #26
0
    def test_nls(self):
        """Exercise gr.ft_nls() / gr.fit_nls() across several fitting scenarios.

        Covers a two-parameter exponential fit, the warning for an
        unidentifiable model, a fixed parameter, the linear trajectory
        model, and selecting which outputs to fit.
        """
        ## Ground-truth model
        c_true = 2
        a_true = 1

        md_true = (
            gr.Model()
            >> gr.cp_function(
                fun=lambda x: a_true * np.exp(x[0] * c_true) + x[1],
                var=["x", "epsilon"],
                out=["y"],
            )
            >> gr.cp_marginals(epsilon=dict(dist="norm", loc=0, scale=0.5))
            >> gr.cp_copula_independence()
        )
        df_data = md_true >> gr.ev_sample(
            n=5,
            seed=101,
            df_det=gr.df_make(x=[0, 1, 2, 3, 4]),
        )

        ## Model to fit
        md_param = (
            gr.Model()
            >> gr.cp_function(
                fun=lambda x: x[2] * np.exp(x[0] * x[1]),
                var=["x", "c", "a"],
                out=["y"],
            )
            >> gr.cp_bounds(c=[0, 4], a=[0.1, 2.0])
        )

        ## Fit the model
        md_fit = df_data >> gr.ft_nls(
            md=md_param,
            verbose=False,
            uq_method="linpool",
        )

        ## Unidentifiable model throws warning
        # -------------------------
        md_unidet = (
            gr.Model()
            >> gr.cp_function(
                fun=lambda x: x[2] / x[3] * np.exp(x[0] * x[1]),
                var=["x", "c", "a", "z"],
                out=["y"],
            )
            >> gr.cp_bounds(c=[0, 4], a=[0.1, 2.0], z=[0, 1])
        )
        with self.assertWarns(RuntimeWarning):
            gr.fit_nls(df_data, md=md_unidet, uq_method="linpool")

        ## True parameters in wide confidence region
        # -------------------------
        alpha = 1e-3
        marginal_c = md_fit.density.marginals["c"]
        self.assertTrue(
            marginal_c.q(alpha / 2) <= c_true <= marginal_c.q(1 - alpha / 2))

        marginal_a = md_fit.density.marginals["a"]
        self.assertTrue(
            marginal_a.q(alpha / 2) <= a_true <= marginal_a.q(1 - alpha / 2))

        ## Model with fixed parameter
        # -------------------------
        # Equal lower and upper bounds pin `a` to a single value
        md_fixed = (
            gr.Model()
            >> gr.cp_function(
                fun=lambda x: x[2] * np.exp(x[0] * x[1]),
                var=["x", "c", "a"],
                out=["y"],
            )
            >> gr.cp_bounds(c=[0, 4], a=[1, 1])
        )
        md_fit_fixed = df_data >> gr.ft_nls(
            md=md_fixed,
            verbose=False,
            uq_method="linpool",
        )

        # Test that fixed model can evaluate successfully
        gr.eval_sample(md_fit_fixed, n=1, df_det="nom")

        ## Trajectory model
        # -------------------------
        md_base = models.make_trajectory_linear()
        md_fit = data.df_trajectory_windowed >> gr.ft_nls(
            md=md_base,
            method="SLSQP",
            tol=1e-3,
        )
        # Smoke test only: the nominal evaluation must run, its result is unused
        md_fit >> gr.ev_nominal(df_det="nom")

        ## Select output for fitting
        # -------------------------
        # Split model has inconsistent "true" parameter value
        md_split = (
            gr.Model("Split")
            >> gr.cp_vec_function(
                fun=lambda df: gr.df_make(
                    f=1 * df.c * df.x,
                    g=2 * df.c * df.x,
                ),
                var=["c", "x"],
                out=["f", "g"],
            )
            >> gr.cp_bounds(
                x=(-1, +1),
                c=(-1, +1),
            )
        )

        df_split = gr.df_make(x=gr.linspace(-1, +1, 100)) >> gr.tf_mutate(
            f=X.x,
            g=X.x,
        )

        # Fitting both outputs: cannot achieve mse ~= 0
        md_both = df_split >> gr.ft_nls(md_split, out=["f", "g"])
        df_both = (
            md_both
            >> gr.ev_df(df_split >> gr.tf_rename(f_t=X.f, g_t=X.g))
            >> gr.tf_summarize(
                mse_f=gr.mse(X.f, X.f_t),
                mse_g=gr.mse(X.g, X.g_t),
            )
        )
        self.assertTrue(df_both.mse_f[0] > 0)
        self.assertTrue(df_both.mse_g[0] > 0)

        # Fitting "f" only
        md_f = df_split >> gr.ft_nls(md_split, out=["f"])
        df_f = (
            md_f
            >> gr.ev_df(df_split >> gr.tf_rename(f_t=X.f, g_t=X.g))
            >> gr.tf_summarize(
                mse_f=gr.mse(X.f, X.f_t),
                mse_g=gr.mse(X.g, X.g_t),
            )
        )
        self.assertTrue(df_f.mse_f[0] < 1e-16)
        self.assertTrue(df_f.mse_g[0] > 0)

        # Fitting "g" only
        md_g = df_split >> gr.ft_nls(md_split, out=["g"])
        df_g = (
            md_g
            >> gr.ev_df(df_split >> gr.tf_rename(f_t=X.f, g_t=X.g))
            >> gr.tf_summarize(
                mse_f=gr.mse(X.f, X.f_t),
                mse_g=gr.mse(X.g, X.g_t),
            )
        )
        self.assertTrue(df_g.mse_f[0] > 0)
        self.assertTrue(df_g.mse_g[0] < 1e-16)