def test_last(self):
    """Check gr.last() inside summarize and mutate, ungrouped and grouped."""
    df = data.df_diamonds >> gr.tf_select(X.cut, X.x) >> gr.tf_head(5)

    # straight summarize
    t = df >> gr.tf_summarize(l=gr.last(X.x))
    df_truth = pd.DataFrame({"l": [4.34]})
    self.assertTrue(t.equals(df_truth))

    # grouped summarize
    t = df >> gr.tf_group_by(X.cut) >> gr.tf_summarize(l=gr.last(X.x))
    df_truth = pd.DataFrame({
        "cut": ["Good", "Ideal", "Premium"],
        "l": [4.34, 3.95, 4.20],
    })
    self.assertTrue(t.equals(df_truth))

    # summarize with order_by
    t = df >> gr.tf_summarize(
        f=gr.last(X.x, order_by=[gr.desc(X.cut), gr.desc(X.x)])
    )
    df_truth = pd.DataFrame({"f": [4.05]})
    # unittest assertion rather than a bare `assert`: it is not stripped
    # under `python -O` and matches the other checks in this test.
    self.assertTrue(df_truth.equals(t))

    # straight mutate: every row receives the x value of the final row
    t = df >> gr.tf_mutate(l=gr.last(X.x))
    df_truth = df.copy()
    df_truth["l"] = df_truth.x.iloc[4]
    self.assertTrue(t.equals(df_truth))

    # grouped mutate: compare after restoring the original row order
    t = df >> gr.tf_group_by(X.cut) >> gr.tf_mutate(l=gr.last(X.x))
    df_truth["l"] = pd.Series([3.95, 4.20, 4.34, 4.20, 4.34])
    self.assertTrue(t.sort_index().equals(df_truth))
def test_n(self):
    """Check gr.n() in explicit (column) and implicit (no-argument) modes."""
    df = data.df_diamonds >> gr.tf_select(X.cut, X.x) >> gr.tf_head(5)

    # straight summarize
    t = df >> gr.tf_summarize(n=gr.n(X.x))
    df_truth = pd.DataFrame({"n": [5]})
    self.assertTrue(t.equals(df_truth))

    # grouped summarize
    t = df >> gr.tf_group_by(X.cut) >> gr.tf_summarize(n=gr.n(X.x))
    df_truth = pd.DataFrame({
        "cut": ["Good", "Ideal", "Premium"],
        "n": [2, 1, 2],
    })
    self.assertTrue(t.equals(df_truth))

    # straight mutate
    t = df >> gr.tf_mutate(n=gr.n(X.x))
    df_truth = df.copy()
    df_truth["n"] = 5
    self.assertTrue(t.equals(df_truth))

    # grouped mutate
    t = df >> gr.tf_group_by(X.cut) >> gr.tf_mutate(n=gr.n(X.x))
    # One expected count per row of the 5-row frame. The previous fixture
    # listed six values; the sixth was silently discarded by pandas index
    # alignment on assignment.
    df_truth["n"] = pd.Series([1, 2, 2, 2, 2])
    self.assertTrue(t.sort_index().equals(df_truth))

    # Implicit mode summarize
    t = df >> gr.tf_summarize(n=gr.n())
    df_truth = pd.DataFrame({"n": [5]})
    self.assertTrue(t.equals(df_truth))

    # Implicit mode mutate
    t = df >> gr.tf_group_by(X.cut) >> gr.tf_mutate(n=gr.n())
    df_truth = df.copy()
    df_truth["n"] = pd.Series([1, 2, 2, 2, 2])
    self.assertTrue(t.sort_index().equals(df_truth))
def test_first(self):
    """Check gr.first() inside summarize and mutate, ungrouped and grouped."""
    df = data.df_diamonds >> gr.tf_select(X.cut, X.x) >> gr.tf_head(5)

    # straight summarize
    t = df >> gr.tf_summarize(f=gr.first(X.x))
    df_truth = pd.DataFrame({"f": [3.95]})
    self.assertTrue(t.equals(df_truth))

    # grouped summarize
    t = df >> gr.tf_group_by(X.cut) >> gr.tf_summarize(f=gr.first(X.x))
    df_truth = pd.DataFrame({
        "cut": ["Good", "Ideal", "Premium"],
        "f": [4.05, 3.95, 3.89],
    })
    self.assertTrue(t.equals(df_truth))

    # summarize with order_by
    t = df >> gr.tf_summarize(f=gr.first(X.x, order_by=gr.desc(X.cut)))
    df_truth = pd.DataFrame({"f": [3.89]})
    # This section previously built `t` and `df_truth` without comparing
    # them, so the order_by path was never verified.
    self.assertTrue(t.equals(df_truth))

    # straight mutate: every row receives the x value of the first row
    t = df >> gr.tf_mutate(f=gr.first(X.x))
    df_truth = df.copy()
    df_truth["f"] = df_truth.x.iloc[0]
    self.assertTrue(t.equals(df_truth))

    # grouped mutate: compare after restoring the original row order
    t = df >> gr.tf_group_by(X.cut) >> gr.tf_mutate(f=gr.first(X.x))
    df_truth["f"] = pd.Series([3.95, 3.89, 4.05, 3.89, 4.05])
    self.assertTrue(t.sort_index().equals(df_truth))
def test_nth(self):
    """Check gr.nth() with positive, negative and out-of-range positions."""
    df_base = (
        data.df_diamonds
        >> gr.tf_select(X.cut, X.x)
        >> gr.tf_head(10)
    )

    # straight summarize
    df_expected = pd.DataFrame({"second": [3.89]})
    df_result = df_base >> gr.tf_summarize(second=gr.nth(X.x, 1))
    self.assertTrue(df_result.equals(df_expected))

    # grouped summarize
    df_expected = pd.DataFrame({
        "cut": ["Fair", "Good", "Ideal", "Premium", "Very Good"],
        "first": [3.87, 4.05, 3.95, 3.89, 3.94],
    })
    df_grouped = df_base >> gr.tf_group_by(X.cut)
    df_result = df_grouped >> gr.tf_summarize(first=gr.nth(X.x, 0))
    self.assertTrue(df_result.equals(df_expected))

    # summarize with order_by
    ordering = [gr.desc(X.cut), gr.desc(X.x)]
    df_expected = pd.DataFrame({"last": [3.87]})
    df_result = df_base >> gr.tf_summarize(
        last=gr.nth(X.x, -1, order_by=ordering)
    )
    self.assertTrue(df_expected.equals(df_result))

    # straight mutate: a position past the end yields NaN in every row
    df_expected = df_base.assign(out_of_range=np.nan)
    df_result = df_base >> gr.tf_mutate(out_of_range=gr.nth(X.x, 500))
    self.assertTrue(df_result.equals(df_expected))

    # grouped mutate: NaN is expected where no second-to-last value exists
    penultimate = pd.Series(
        [np.nan, 3.89, 4.05, 3.89, 4.05, 4.07, 4.07, 4.07, np.nan, 4.07]
    )
    df_expected = df_base.assign(penultimate=penultimate)
    df_result = (
        df_base
        >> gr.tf_group_by(X.cut)
        >> gr.tf_mutate(penultimate=gr.nth(X.x, -2))
    )
    self.assertTrue(df_result.sort_index().equals(df_expected))
def test_plot_xbs(self):
    r"""Smoke test: the Xbar and S chart call runs without raising"""
    ## Basic functionality
    df_numeric = data.df_shewhart >> gr.tf_mutate(idx=DF.index // 10)
    df_numeric >> gr.pt_xbs(group="idx", var="tensile_strength")

    ## Works with discrete group variable
    df_factor = data.df_shewhart >> gr.tf_mutate(
        idx=gr.as_factor(DF.index // 10)
    )
    df_factor >> gr.pt_xbs(group="idx", var="tensile_strength")
def test_sd(self):
    """Check gr.sd() inside summarize and mutate, ungrouped and grouped."""
    df = (
        data.df_diamonds
        >> gr.tf_group_by(X.cut)
        >> gr.tf_head(3)
        >> gr.tf_select(X.cut, X.x)
        >> gr.tf_ungroup()
    )

    # straight summarize
    t = df >> gr.tf_summarize(s=gr.sd(X.x))
    df_truth = pd.DataFrame({"s": [0.829091]})
    test_vector = abs(t.s - df_truth.s)
    self.assertTrue(all(test_vector < 0.00001))

    # grouped summarize
    t = df >> gr.tf_group_by(X.cut) >> gr.tf_summarize(s=gr.sd(X.x))
    df_truth = pd.DataFrame(
        {
            "cut": ["Fair", "Good", "Ideal", "Premium", "Very Good"],
            "s": [1.440417, 0.148436, 0.236925, 0.181934, 0.072342],
        }
    )
    test_vector = abs(t.s - df_truth.s)
    self.assertTrue(all(test_vector < 0.00001))

    # straight mutate
    t = df >> gr.tf_mutate(s=gr.sd(X.x))
    df_truth = df.copy()
    df_truth["s"] = 0.829091
    test_vector = abs(t.s - df_truth.s)
    self.assertTrue(all(test_vector < 0.00001))

    # grouped mutate
    t = df >> gr.tf_group_by(X.cut) >> gr.tf_mutate(s=gr.sd(X.x))
    # The previous version re-asserted the stale `test_vector` from the
    # straight-mutate section, so the grouped result was never inspected.
    # Look up the expected value from each row's own `cut` label so the
    # check does not depend on the row order of the grouped output.
    sd_by_cut = {
        "Fair": 1.440417,
        "Good": 0.148436,
        "Ideal": 0.236925,
        "Premium": 0.181934,
        "Very Good": 0.072342,
    }
    s_expected = t.cut.astype(str).map(sd_by_cut)
    test_vector = abs(t.s - s_expected)
    # Guard against a vacuous pass on an unexpectedly short result
    self.assertEqual(t.shape[0], df.shape[0])
    self.assertTrue(all(test_vector < 0.00001))

    # test with single value (var undefined)
    df = (
        data.df_diamonds
        >> gr.tf_group_by(X.cut)
        >> gr.tf_head(1)
        >> gr.tf_select(X.cut, X.x)
    )
    t = df >> gr.tf_group_by(X.cut) >> gr.tf_summarize(s=gr.sd(X.x))
    df_truth = pd.DataFrame(
        {
            "cut": ["Fair", "Good", "Ideal", "Premium", "Very Good"],
            "s": [np.nan, np.nan, np.nan, np.nan, np.nan],
        }
    )
    self.assertTrue(t.equals(df_truth))
def test_mutate(self):
    """Check gr.tf_mutate() with a constant, a column, a product and a scalar summary."""
    diamonds = data.df_diamonds

    # constant value
    expected = diamonds.assign(testcol=1)
    self.assertTrue(expected.equals(diamonds >> gr.tf_mutate(testcol=1)))

    # copy of an existing column
    expected = diamonds.assign(testcol=diamonds["x"])
    self.assertTrue(expected.equals(diamonds >> gr.tf_mutate(testcol=X.x)))

    # element-wise product of two columns
    expected = diamonds.assign(testcol=diamonds["x"] * diamonds["y"])
    self.assertTrue(
        expected.equals(diamonds >> gr.tf_mutate(testcol=X.x * X.y))
    )

    # scalar summary repeated on every row
    expected = diamonds.assign(testcol=diamonds["x"].mean())
    self.assertTrue(
        expected.equals(diamonds >> gr.tf_mutate(testcol=np.mean(X.x)))
    )
def test_if_else(self):
    """Check gr.if_else() with scalar branches and with list branches."""
    # Scalar branches
    df_parity = pd.DataFrame({"a": [1, 2, 3, 4, 5, 6, 7, 8, 9]})
    parity_truth = [
        "odd", "even", "odd", "even", "odd", "even", "odd", "even", "odd"
    ]
    df_result = df_parity >> gr.tf_mutate(
        b=gr.if_else(X.a % 2 == 0, "even", "odd")
    )
    self.assertTrue(df_result.equals(df_parity.assign(b=parity_truth)))

    # List branches, one entry per row
    df_steps = pd.DataFrame({"a": [0, 0, 0, 1, 1, 1, 2, 2, 2]})
    fives = [5, 5, 5, 5, 5, 5, 5, 5, 5]
    nines = [9, 9, 9, 9, 9, 9, 9, 9, 9]
    steps_truth = [5, 5, 5, 5, 5, 5, 9, 9, 9]
    df_result = df_steps >> gr.tf_mutate(
        b=gr.if_else(X.a < 2, fives, nines)
    )
    self.assertTrue(df_result.equals(df_steps.assign(b=steps_truth)))
def test_stratum_min(self):
    """Check gr.stratum_min() against a reference column and for input validation."""
    df_test = gr.df_make(
        x=[1, 2, 0, 1, 2, 0, 1, 2],
        y=[0, 0, 1, 1, 1, 2, 2, 2],
        p=[1, 2, 1, 2, 3, 2, 3, 4],
    )

    # Test for accuracy: the computed column must equal `p` on every row
    df_flagged = (
        df_test
        >> gr.tf_mutate(p_comp=gr.stratum_min(X.x, X.y))
        >> gr.tf_mutate(flag=X.p == X.p_comp)
    )
    self.assertTrue(df_flagged.flag.all())

    # Check for ValueError on inputs of different lengths
    short, long = [1], [1, 2, 3]
    with self.assertRaises(ValueError):
        gr.stratum_min(short, long)
def test_na_if(self):
    """Check gr.na_if() with one and with several values to blank out."""
    nan = np.nan
    df_in = pd.DataFrame({"a": [1, 2, 3, 4, 5]})

    df_expected = df_in.assign(
        b=[1, 2, nan, 4, 5],
        c=[nan, nan, nan, 4, 5],
    )

    df_out = df_in >> gr.tf_mutate(
        b=gr.na_if(X.a, 3),
        c=gr.na_if(X.a, 1, 2, 3),
    )
    # Fix the column order before comparing
    df_out = df_out[["a", "b", "c"]]

    self.assertTrue(df_expected.equals(df_out))
def test_mean_ci(self):
    """Check gr.mean_lo() / gr.mean_up() against hand-computed bounds.

    All comparisons are two-sided (|error| <= 1e-6). The earlier checks
    compared a signed difference against the tolerance, so an arbitrarily
    large error of the right sign would still have passed.
    """
    tol = 1e-6

    # Basic functionality
    y = pd.Series([-1, -1, 0, +1, +1])  # sd == 1
    lo_true = 0 - (-norm.ppf(0.005)) * 1 / np.sqrt(5)
    up_true = 0 + (-norm.ppf(0.005)) * 1 / np.sqrt(5)

    self.assertAlmostEqual(gr.mean_lo(y, alpha=0.005), lo_true, delta=tol)
    self.assertAlmostEqual(gr.mean_up(y, alpha=0.005), up_true, delta=tol)

    # Grouped functionality: the x == 1 group is the x == 0 group shifted by 1
    df = (
        gr.df_grid(
            y=[-1, -1, 0, +1, +1],
            x=[0, 1],
        )
        >> gr.tf_mutate(y=X.y + X.x)
        >> gr.tf_group_by(X.x)
        >> gr.tf_summarize(
            mean_lo=gr.mean_lo(X.y),
            mean_up=gr.mean_up(X.y),
        )
    )

    self.assertAlmostEqual(
        df[df.x == 0].mean_lo.values[0], lo_true, delta=tol
    )
    self.assertAlmostEqual(
        df[df.x == 0].mean_up.values[0], up_true, delta=tol
    )
    self.assertAlmostEqual(
        df[df.x == 1].mean_lo.values[0], lo_true + 1, delta=tol
    )
    self.assertAlmostEqual(
        df[df.x == 1].mean_up.values[0], up_true + 1, delta=tol
    )
def test_coalesce(self):
    """Check that gr.coalesce() takes the first non-NaN value across columns, row by row."""
    nan = np.nan
    columns = {
        "a": [1, nan, nan, nan, nan],
        "b": [2, 3, nan, nan, nan],
        "c": [nan, nan, 4, 5, nan],
        "d": [6, 7, 8, 9, nan],
    }
    df_in = pd.DataFrame(columns)

    # The final row is NaN in every column, so it stays NaN
    df_expected = df_in.assign(coal=[1, 3, 4, 5, nan])
    df_out = df_in >> gr.tf_mutate(coal=gr.coalesce(X.a, X.b, X.c, X.d))

    self.assertTrue(df_expected.equals(df_out))
def test_var(self):
    """Check gr.var() inside summarize and mutate; grouped mutate is not checked."""
    tol = 0.00001
    cuts = ["Fair", "Good", "Ideal", "Premium", "Very Good"]

    df_base = (
        data.df_diamonds
        >> gr.tf_group_by(X.cut)
        >> gr.tf_head(3)
        >> gr.tf_select(X.cut, X.x)
        >> gr.tf_ungroup()
    )

    # straight summarize
    df_expected = pd.DataFrame({"v": [0.687392]})
    df_result = df_base >> gr.tf_summarize(v=gr.var(X.x))
    err = abs(df_result.v - df_expected.v)
    self.assertTrue((err < tol).all())

    # grouped summarize
    df_expected = pd.DataFrame(
        {
            "cut": cuts,
            "v": [2.074800, 0.022033, 0.056133, 0.033100, 0.005233],
        }
    )
    df_result = (
        df_base >> gr.tf_group_by(X.cut) >> gr.tf_summarize(v=gr.var(X.x))
    )
    err = abs(df_result.v - df_expected.v)
    self.assertTrue((err < tol).all())

    # straight mutate
    df_expected = df_base.assign(v=0.687392)
    df_result = df_base >> gr.tf_mutate(v=gr.var(X.x))
    err = abs(df_result.v - df_expected.v)
    self.assertTrue((err < tol).all())

    # grouped mutate
    # NOTE: this check is disabled. The intended truth repeats each
    # per-group variance from the grouped-summarize table above once per
    # row of that group (three rows per cut).

    # test with single value (var undefined)
    df_single = (
        data.df_diamonds
        >> gr.tf_group_by(X.cut)
        >> gr.tf_head(1)
        >> gr.tf_select(X.cut, X.x)
    )
    df_expected = pd.DataFrame({"cut": cuts, "v": [np.nan] * 5})
    df_result = (
        df_single >> gr.tf_group_by(X.cut) >> gr.tf_summarize(v=gr.var(X.x))
    )
    self.assertTrue(df_result.equals(df_expected))
def test_max(self):
    """Check gr.max() inside summarize and mutate, ungrouped and grouped."""
    df_base = (
        data.df_diamonds
        >> gr.tf_select(X.cut, X.x)
        >> gr.tf_head(5)
    )
    df_grouped = df_base >> gr.tf_group_by(X.cut)

    # straight summarize
    df_expected = pd.DataFrame({"m": [4.34]})
    df_result = df_base >> gr.tf_summarize(m=gr.max(X.x))
    self.assertTrue(df_result.equals(df_expected))

    # grouped summarize
    df_expected = pd.DataFrame({
        "cut": ["Good", "Ideal", "Premium"],
        "m": [4.34, 3.95, 4.20],
    })
    df_result = df_grouped >> gr.tf_summarize(m=gr.max(X.x))
    self.assertTrue(df_result.equals(df_expected))

    # straight mutate
    df_expected = df_base.assign(m=4.34)
    df_result = df_base >> gr.tf_mutate(m=gr.max(X.x))
    self.assertTrue(df_result.equals(df_expected))

    # grouped mutate: compare after restoring the original row order
    df_expected = df_base.assign(
        m=pd.Series([3.95, 4.20, 4.34, 4.20, 4.34])
    )
    df_result = df_grouped >> gr.tf_mutate(m=gr.max(X.x))
    self.assertTrue(df_result.sort_index().equals(df_expected))
def test_tran_polyridge(self):
    """Test the argument checks and correctness of tran_polyridge()
    """
    ## Setup: full outer grid over x, y, z with f = x - y
    df_x = gr.df_make(x=range(10))
    df_y = gr.df_make(y=range(10))
    df_z = gr.df_make(z=range(10))
    df_test = (
        df_x
        >> gr.tf_outer(df_y)
        >> gr.tf_outer(df_z)
        >> gr.tf_mutate(f=DF.x - DF.y)
    )

    ## Assertions: each set of keyword arguments must raise ValueError
    bad_calls = [
        # No `out` column
        {},
        # Unrecognized `out` column
        {"out": "foo"},
        # Unrecognized `var` column(s)
        # NOTE(review): no `out` is given here either, so this call may be
        # rejected for the missing `out` rather than for `var` -- confirm.
        {"var": ["foo", "bar"]},
        # Invalid degree
        {"out": "f", "n_degree": 1, "n_dim": 2},
    ]
    for kwargs in bad_calls:
        with self.assertRaises(ValueError):
            gr.tran_polyridge(df_test, **kwargs)

    ## Correctness
    df_res = df_test >> gr.tf_polyridge(out="f", n_dim=1, n_degree=1)
    df_true = gr.df_make(x=1 / gr.sqrt(2), y=-1 / gr.sqrt(2), z=0)
    self.assertTrue(gr.df_equal(df_res, df_true, close=True))

    ## Higher-dimensional functionality (runs without raising)
    df_grid = gr.df_grid(x=range(10), y=range(10), z=range(10))
    df_higher = df_grid >> gr.tf_mutate(f=DF.x + DF.y + DF.z)
    gr.tran_polyridge(df_higher, out="f", n_degree=2, n_dim=2)
def test_case_when(self):
    """Check gr.case_when() against a plain-Python fizzbuzz over 0..30."""

    def fizzbuzz(i):
        # Same precedence as the case_when clauses below: 15, then 3, then 5
        if i % 15 == 0:
            return "fizzbuzz"
        if i % 3 == 0:
            return "fizz"
        if i % 5 == 0:
            return "buzz"
        return str(i)

    df_in = pd.DataFrame({"num": np.arange(31)})
    df_expected = df_in.assign(
        strnum=[fizzbuzz(i) for i in np.arange(31)]
    )

    labeller = gr.case_when(
        [X.num % 15 == 0, "fizzbuzz"],
        [X.num % 3 == 0, "fizz"],
        [X.num % 5 == 0, "buzz"],
        [True, X.num.astype(str)],
    )
    df_out = df_in >> gr.tf_mutate(strnum=labeller)

    self.assertTrue(df_expected.equals(df_out))
def test_IQR(self):
    """Check gr.IQR() inside summarize and mutate, ungrouped and grouped."""
    df = data.df_diamonds >> gr.tf_select(X.cut, X.x) >> gr.tf_head(5)

    # straight summarize
    t = df >> gr.tf_summarize(i=gr.IQR(X.x))
    df_truth = pd.DataFrame({"i": [0.25]})
    self.assertTrue(t.equals(df_truth))

    # grouped summarize (compared within a tolerance, not exactly)
    t = df >> gr.tf_group_by(X.cut) >> gr.tf_summarize(i=gr.IQR(X.x))
    df_truth = pd.DataFrame(
        {"cut": ["Good", "Ideal", "Premium"], "i": [0.145, 0.000, 0.155]}
    )
    test_vector = abs(t.i - df_truth.i)
    # unittest assertion rather than a bare `assert`: it is not stripped
    # under `python -O` and matches the other checks in this test.
    self.assertTrue(all(test_vector < 0.000000001))

    # straight mutate
    t = df >> gr.tf_mutate(i=gr.IQR(X.x))
    df_truth = df.copy()
    df_truth["i"] = 0.25
    self.assertTrue(t.equals(df_truth))

    # grouped mutate (the subtraction aligns rows on the index)
    t = df >> gr.tf_group_by(X.cut) >> gr.tf_mutate(i=gr.IQR(X.x))
    df_truth["i"] = pd.Series([0.000, 0.155, 0.145, 0.155, 0.145])
    test_vector = abs(t.i - df_truth.i)
    self.assertTrue(all(test_vector < 0.000000001))
def test_median(self):
    """Check gr.median() for odd and even group sizes; grouped mutate is not checked."""
    cuts = ["Fair", "Good", "Ideal", "Premium", "Very Good"]

    df_odd = (
        data.df_diamonds
        >> gr.tf_group_by(X.cut)
        >> gr.tf_head(3)
        >> gr.tf_select(X.cut, X.x)
        >> gr.tf_ungroup()
    )

    # straight summarize
    df_expected = pd.DataFrame({"m": [4.05]})
    df_result = df_odd >> gr.tf_summarize(m=gr.median(X.x))
    self.assertTrue(df_result.equals(df_expected))

    # grouped summarize
    df_expected = pd.DataFrame(
        {"cut": cuts, "m": [6.27, 4.25, 3.95, 3.89, 3.95]}
    )
    df_result = (
        df_odd >> gr.tf_group_by(X.cut) >> gr.tf_summarize(m=gr.median(X.x))
    )
    self.assertTrue(df_result.equals(df_expected))

    # straight mutate
    df_expected = df_odd.assign(m=4.05)
    df_result = df_odd >> gr.tf_mutate(m=gr.median(X.x))
    self.assertTrue(df_result.equals(df_expected))

    # grouped mutate
    # NOTE: this check is disabled. The intended truth repeats each
    # per-group median from the grouped-summarize table above once per
    # row of that group (three rows per cut).

    # make sure it handles case with even counts properly
    df_even = (
        data.df_diamonds
        >> gr.tf_group_by(X.cut)
        >> gr.tf_head(2)
        >> gr.tf_select(X.cut, X.x)
    )
    df_expected = pd.DataFrame(
        {"cut": cuts, "m": [5.160, 4.195, 3.940, 4.045, 3.945]}
    )
    df_result = (
        df_even >> gr.tf_group_by(X.cut) >> gr.tf_summarize(m=gr.median(X.x))
    )
    err = abs(df_result.m - df_expected.m)
    self.assertTrue((err < 0.000000001).all())
def test_n_distinct(self):
    """Check gr.n_distinct() inside summarize and mutate, ungrouped and grouped."""
    df_in = pd.DataFrame({
        "col_1": ["a", "a", "a", "b", "b", "b", "c", "c"],
        "col_2": [1, 1, 1, 2, 3, 3, 4, 5],
    })
    df_grouped = df_in >> gr.tf_group_by(X.col_1)

    # straight summarize
    df_expected = pd.DataFrame({"n": [5]})
    df_result = df_in >> gr.tf_summarize(n=gr.n_distinct(X.col_2))
    self.assertTrue(df_result.equals(df_expected))

    # grouped summarize
    df_expected = pd.DataFrame({"col_1": ["a", "b", "c"], "n": [1, 2, 2]})
    df_result = df_grouped >> gr.tf_summarize(n=gr.n_distinct(X.col_2))
    self.assertTrue(df_result.equals(df_expected))

    # straight mutate
    df_expected = df_in.assign(n=5)
    df_result = df_in >> gr.tf_mutate(n=gr.n_distinct(X.col_2))
    self.assertTrue(df_result.equals(df_expected))

    # grouped mutate (compared as returned, without re-sorting)
    df_expected = df_in.assign(n=pd.Series([1, 1, 1, 2, 2, 2, 2, 2]))
    df_result = df_grouped >> gr.tf_mutate(n=gr.n_distinct(X.col_2))
    self.assertTrue(df_result.equals(df_expected))
def test_fit_polyridge(self):
    """Test that a model from fit_polyridge() reproduces f = x - y
    """
    # Training data: full outer grid over x, y, z with f = x - y
    df_x = gr.df_make(x=range(10))
    df_y = gr.df_make(y=range(10))
    df_z = gr.df_make(z=range(10))
    df_train = (
        df_x
        >> gr.tf_outer(df_y)
        >> gr.tf_outer(df_z)
        >> gr.tf_mutate(f=DF.x - DF.y)
    )

    md_ridge = gr.fit_polyridge(df_train, out="f", n_degree=1, n_dim=1)

    # Evaluate at two points and compare against the hand-computed outputs
    df_query = gr.df_make(x=[2, 1], y=[1], z=[0])
    df_predicted = gr.eval_df(md_ridge, df=df_query)
    df_expected = gr.df_make(x=[2, 1], y=[1], z=[0], f_mean=[1, 0])

    self.assertTrue(gr.df_equal(df_predicted, df_expected, close=True))
def test_iocorr(self):
    """Check gr.tf_iocorr() on outputs that are exact linear functions of the input."""
    # y rises with x and z falls with x
    df_linear = gr.df_make(x=[1., 2., 3., 4.]) >> gr.tf_mutate(
        y=+0.5 * DF.x,
        z=-0.5 * DF.x,
    )
    df_rho = df_linear >> gr.tf_iocorr(var=["x"], out=["y", "z"])

    df_expected = gr.df_make(
        var=["x", "x"],
        out=["y", "z"],
        rho=[1.0, -1.0],
    )

    ## Check for correct values
    self.assertTrue(gr.df_equal(df_rho, df_expected))
def test_corr(self):
    """Check gr.corr() on exactly linear data and its handling of NaN inputs."""
    tol = 1e-6

    df_data = gr.df_make(x=[1., 2., 3., 4.])
    df_data["y"] = 0.5 * df_data.x
    df_data["z"] = -0.5 * df_data.x

    rho_xy = gr.corr(df_data.x, df_data.y)
    self.assertTrue(abs(rho_xy - 1.0) < tol)
    rho_xz = gr.corr(df_data.x, df_data.z)
    self.assertTrue(abs(rho_xz + 1.0) < tol)

    ## Test NaN handling
    df_nan = df_data >> gr.tf_mutate(
        x=gr.if_else(X.x == 1, gr.NaN, X.x),
        y=gr.if_else(X.x == 4, gr.NaN, X.y),
    )

    # NaN entries are rejected by default ...
    with self.assertRaises(ValueError):
        gr.corr(df_nan.x, df_nan.y)

    # ... and the call succeeds when nan_drop=True
    rho_dropped = gr.corr(df_nan.x, df_nan.y, nan_drop=True)
    self.assertTrue(abs(rho_dropped - 1.0) < tol)
def test_tran_reweight(self):
    """Test the correctness and input checks of tran_reweight()
    """
    ## Correctness
    # Choose scale based on Owen (2013) Exercise 9.7
    marginal_new = {"dist": "norm", "loc": 0, "scale": sqrt(4 / 5)}
    md_new = self.md >> gr.cp_marginals(x=marginal_new)

    df_base = self.md >> gr.ev_sample(n=500, df_det="nom", seed=101)

    df_weighted = df_base >> gr.tf_reweight(md_base=self.md, md_new=md_new)
    df_summary = df_weighted >> gr.tf_summarize(
        mu=gr.mean(DF.y * DF.weight),
        se=gr.sd(DF.y * DF.weight) / gr.sqrt(gr.n(DF.weight)),
        se_orig=gr.sd(DF.y) / gr.sqrt(gr.n(DF.weight)),
    )
    mu = df_summary.mu[0]
    se = df_summary.se[0]
    se_orig = df_summary.se_orig[0]

    # Zero must lie strictly inside mean +/- 2 standard errors
    self.assertTrue(mu - se * 2 < 0 < mu + se * 2)

    ## Optimized IS should be more precise than ordinary monte carlo
    self.assertTrue(se < se_orig)

    ## Invariants
    # Missing input in data
    df_outputs_only = df_base[["y"]]
    with self.assertRaises(ValueError):
        gr.tran_reweight(df_outputs_only, md_base=self.md, md_new=self.md)

    # Input mismatch
    with self.assertRaises(ValueError):
        gr.tran_reweight(df_base, md_base=self.md, md_new=gr.Model())

    # Weights collision: the data already has a `weight` column
    with self.assertRaises(ValueError):
        gr.tran_reweight(
            df_base >> gr.tf_mutate(weight=0),
            md_base=self.md,
            md_new=self.md,
        )
def test_group_mutate(self):
    """Check grouped gr.tf_mutate() against a pandas groupby/apply reference."""
    # Reference result built with plain pandas
    df_expected = (
        data.df_diamonds.copy()
        .groupby("cut")
        .apply(group_mutate_helper)
    )

    # Same computation through the pipeline verbs
    df_grouped = data.df_diamonds >> gr.tf_group_by("cut")
    df_mutated = df_grouped >> gr.tf_mutate(testcol=X.x * X.shape[0])
    df_actual = df_mutated >> gr.tf_ungroup()

    # Restore index order before the comparison
    self.assertTrue(df_expected.equals(df_actual.sort_index()))
def test_linspace(self):
    """Smoke test: gr.linspace() and gr.logspace() run inside tf_mutate()."""
    # Works in pipeline
    df_index = gr.df_make(i=range(10))
    df_index >> gr.tf_mutate(
        x=gr.linspace(0, 1, gr.n(X.index)),
        l=gr.logspace(0, 1, gr.n(X.index)),
    )
def test_nls(self):
    """Exercise nonlinear least squares fitting via gr.ft_nls() / gr.fit_nls()

    Sections: fit to sampled data, warning on an unidentifiable model,
    coverage of the true parameters, a fixed-parameter model, a trajectory
    model, and selecting which outputs to fit.
    """
    ## Ground-truth model
    c_true = 2
    a_true = 1
    md_true = (
        gr.Model()
        >> gr.cp_function(
            fun=lambda x: a_true * np.exp(x[0] * c_true) + x[1],
            var=["x", "epsilon"],
            out=["y"],
        )
        >> gr.cp_marginals(epsilon={"dist": "norm", "loc": 0, "scale": 0.5})
        >> gr.cp_copula_independence()
    )
    df_data = md_true >> gr.ev_sample(
        n=5,
        seed=101,
        df_det=gr.df_make(x=[0, 1, 2, 3, 4]),
    )

    ## Model to fit
    md_param = (
        gr.Model()
        >> gr.cp_function(
            fun=lambda x: x[2] * np.exp(x[0] * x[1]),
            var=["x", "c", "a"],
            out=["y"],
        )
        >> gr.cp_bounds(c=[0, 4], a=[0.1, 2.0])
    )

    ## Fit the model
    md_fit_param = df_data >> gr.ft_nls(
        md=md_param,
        verbose=False,
        uq_method="linpool",
    )

    ## Unidentifiable model throws warning
    # -------------------------
    md_unident = (
        gr.Model()
        >> gr.cp_function(
            fun=lambda x: x[2] / x[3] * np.exp(x[0] * x[1]),
            var=["x", "c", "a", "z"],
            out=["y"],
        )
        >> gr.cp_bounds(c=[0, 4], a=[0.1, 2.0], z=[0, 1])
    )
    with self.assertWarns(RuntimeWarning):
        gr.fit_nls(
            df_data,
            md=md_unident,
            uq_method="linpool",
        )

    ## True parameters in wide confidence region
    # -------------------------
    alpha = 1e-3
    for name, truth in (("c", c_true), ("a", a_true)):
        marginal = md_fit_param.density.marginals[name]
        self.assertTrue(
            marginal.q(alpha / 2) <= truth <= marginal.q(1 - alpha / 2)
        )

    ## Model with fixed parameter
    # -------------------------
    md_fixed = (
        gr.Model()
        >> gr.cp_function(
            fun=lambda x: x[2] * np.exp(x[0] * x[1]),
            var=["x", "c", "a"],
            out=["y"],
        )
        >> gr.cp_bounds(c=[0, 4], a=[1, 1])
    )
    md_fit_fixed = df_data >> gr.ft_nls(
        md=md_fixed,
        verbose=False,
        uq_method="linpool",
    )
    # Test that fixed model can evaluate successfully
    gr.eval_sample(md_fit_fixed, n=1, df_det="nom")

    ## Trajectory model
    # -------------------------
    md_base = models.make_trajectory_linear()
    md_fit_traj = data.df_trajectory_windowed >> gr.ft_nls(
        md=md_base,
        method="SLSQP",
        tol=1e-3,
    )
    # Result is unused; this only checks that evaluation runs
    df_tmp = md_fit_traj >> gr.ev_nominal(df_det="nom")

    ## Select output for fitting
    # -------------------------
    # Split model has inconsistent "true" parameter value
    md_split = (
        gr.Model("Split")
        >> gr.cp_vec_function(
            fun=lambda df: gr.df_make(
                f=1 * df.c * df.x,
                g=2 * df.c * df.x,
            ),
            var=["c", "x"],
            out=["f", "g"],
        )
        >> gr.cp_bounds(
            x=(-1, +1),
            c=(-1, +1),
        )
    )
    df_split = gr.df_make(x=gr.linspace(-1, +1, 100)) >> gr.tf_mutate(
        f=X.x,
        g=X.x,
    )

    def fit_and_score(out):
        # Fit the chosen outputs, evaluate on the data with the measured
        # columns renamed to *_t, and report the MSE of each output.
        return (
            df_split
            >> gr.ft_nls(md_split, out=out)
            >> gr.ev_df(df_split >> gr.tf_rename(f_t=X.f, g_t=X.g))
            >> gr.tf_summarize(
                mse_f=gr.mse(X.f, X.f_t),
                mse_g=gr.mse(X.g, X.g_t),
            )
        )

    # Fitting both outputs: cannot achieve mse ~= 0
    df_both = fit_and_score(["f", "g"])
    self.assertTrue(df_both.mse_f[0] > 0)
    self.assertTrue(df_both.mse_g[0] > 0)

    # Fitting "f" only
    df_f = fit_and_score(["f"])
    self.assertTrue(df_f.mse_f[0] < 1e-16)
    self.assertTrue(df_f.mse_g[0] > 0)

    # Fitting "g" only
    df_g = fit_and_score(["g"])
    self.assertTrue(df_g.mse_f[0] > 0)
    self.assertTrue(df_g.mse_g[0] < 1e-16)