def test_arrange(self):
    """Check ``gr.tf_arrange`` against reference frames built with pandas.

    Three comparisons are made on ``data.df_diamonds``:

    1. grouped by ``cut``, arranged on ``"depth"`` (column given by name),
       top 5 rows per group;
    2. the same pipeline with the column given as ``X.depth``;
    3. an ungrouped descending arrange on ``cut`` then ``price`` using
       ``gr.desc``.
    """
    # Reference for the grouped cases: pandas groupby + the module-level
    # helper (defined elsewhere in this file; presumably sorts each group by
    # depth descending and keeps 5 rows -- confirm against its definition).
    df_expected = (
        data.df_diamonds.groupby("cut")
        .apply(arrange_apply_helperfunc)
        .reset_index(drop=True)
    )

    # Column referenced by string name
    df_by_name = (
        data.df_diamonds
        >> gr.tf_group_by("cut")
        >> gr.tf_arrange("depth", ascending=False)
        >> gr.tf_head(5)
        >> gr.tf_ungroup()
    ).reset_index(drop=True)
    self.assertTrue(df_expected.equals(df_by_name))

    # Column referenced through the X placeholder
    df_by_symbol = (
        data.df_diamonds
        >> gr.tf_group_by("cut")
        >> gr.tf_arrange(X.depth, ascending=False)
        >> gr.tf_head(5)
        >> gr.tf_ungroup()
    ).reset_index(drop=True)
    # Use the unittest assertion (not a bare ``assert``) so the check is not
    # stripped under ``python -O`` and matches the other checks in this test.
    self.assertTrue(df_expected.equals(df_by_symbol))

    # Ungrouped: descending on two columns via gr.desc
    df_expected = data.df_diamonds.sort_values(["cut", "price"], ascending=False)
    df_desc = data.df_diamonds >> gr.tf_arrange(gr.desc(X.cut), gr.desc(X.price))
    self.assertTrue(df_expected.equals(df_desc))
def test_sd(self):
    """Check ``gr.sd`` inside ``tf_summarize`` and ``tf_mutate``.

    Uses the first three rows of each ``cut`` group of ``data.df_diamonds``
    (columns ``cut`` and ``x`` only) and compares against hard-coded
    expected values with an absolute tolerance of 1e-5. A final case uses
    one row per group, where the expected result is NaN for every group.
    """
    cuts = ["Fair", "Good", "Ideal", "Premium", "Very Good"]
    sd_per_cut = [1.440417, 0.148436, 0.236925, 0.181934, 0.072342]

    df = (
        data.df_diamonds
        >> gr.tf_group_by(X.cut)
        >> gr.tf_head(3)
        >> gr.tf_select(X.cut, X.x)
        >> gr.tf_ungroup()
    )

    # straight summarize
    t = df >> gr.tf_summarize(s=gr.sd(X.x))
    df_truth = pd.DataFrame({"s": [0.829091]})
    test_vector = abs(t.s - df_truth.s)
    self.assertTrue(all(test_vector < 0.00001))

    # grouped summarize
    t = df >> gr.tf_group_by(X.cut) >> gr.tf_summarize(s=gr.sd(X.x))
    df_truth = pd.DataFrame({"cut": cuts, "s": sd_per_cut})
    test_vector = abs(t.s - df_truth.s)
    self.assertTrue(all(test_vector < 0.00001))

    # straight mutate
    t = df >> gr.tf_mutate(s=gr.sd(X.x))
    df_truth = df.copy()
    df_truth["s"] = 0.829091
    test_vector = abs(t.s - df_truth.s)
    self.assertTrue(all(test_vector < 0.00001))

    # grouped mutate
    # The previous version had its truth computation commented out and
    # re-asserted the stale ``test_vector`` from the straight-mutate case,
    # so this result was never actually checked. Compare each row with the
    # expected value for that row's own cut; both operands come from ``t``,
    # so the check does not depend on row order or index labels.
    t = df >> gr.tf_group_by(X.cut) >> gr.tf_mutate(s=gr.sd(X.x))
    self.assertEqual(t.shape[0], df.shape[0])
    expected_s = t["cut"].map(dict(zip(cuts, sd_per_cut)))
    test_vector = abs(t["s"] - expected_s)
    self.assertTrue(all(test_vector < 0.00001))

    # test with single value (var undefined)
    df = (
        data.df_diamonds
        >> gr.tf_group_by(X.cut)
        >> gr.tf_head(1)
        >> gr.tf_select(X.cut, X.x)
    )
    t = df >> gr.tf_group_by(X.cut) >> gr.tf_summarize(s=gr.sd(X.x))
    df_truth = pd.DataFrame(
        {
            "cut": cuts,
            "s": [np.nan, np.nan, np.nan, np.nan, np.nan],
        }
    )
    self.assertTrue(t.equals(df_truth))
def test_var(self):
    """Check ``gr.var`` inside ``tf_summarize`` and ``tf_mutate``.

    Works on the first three rows per ``cut`` group of ``data.df_diamonds``
    (columns ``cut`` and ``x``), comparing with hard-coded values to an
    absolute tolerance of 1e-5. The last case keeps one row per group and
    expects NaN for every group.
    """
    cut_labels = ["Fair", "Good", "Ideal", "Premium", "Very Good"]
    tolerance = 0.00001

    sample = (
        data.df_diamonds
        >> gr.tf_group_by(X.cut)
        >> gr.tf_head(3)
        >> gr.tf_select(X.cut, X.x)
        >> gr.tf_ungroup()
    )

    # Ungrouped summarize: one overall variance
    result = sample >> gr.tf_summarize(v=gr.var(X.x))
    expected = pd.DataFrame({"v": [0.687392]})
    deviation = abs(result.v - expected.v)
    self.assertTrue(all(deviation < tolerance))

    # Grouped summarize: one variance per cut
    result = sample >> gr.tf_group_by(X.cut) >> gr.tf_summarize(v=gr.var(X.x))
    expected = pd.DataFrame(
        {
            "cut": cut_labels,
            "v": [2.074800, 0.022033, 0.056133, 0.033100, 0.005233],
        }
    )
    deviation = abs(result.v - expected.v)
    self.assertTrue(all(deviation < tolerance))

    # Ungrouped mutate: the overall variance on every row
    result = sample >> gr.tf_mutate(v=gr.var(X.x))
    expected = sample.copy()
    expected["v"] = 0.687392
    deviation = abs(result.v - expected.v)
    self.assertTrue(all(deviation < tolerance))

    # TODO: the grouped-mutate comparison is disabled in this test. The
    # disabled code expected each row to carry its group's variance
    # (2.074800, 0.022033, 0.056133, 0.033100, 0.005233 for the cuts in
    # ``cut_labels`` order, three rows each).

    # One observation per group: variance is undefined, so NaN is expected
    single_rows = (
        data.df_diamonds
        >> gr.tf_group_by(X.cut)
        >> gr.tf_head(1)
        >> gr.tf_select(X.cut, X.x)
    )
    result = (
        single_rows
        >> gr.tf_group_by(X.cut)
        >> gr.tf_summarize(v=gr.var(X.x))
    )
    expected = pd.DataFrame({"cut": cut_labels, "v": [np.nan] * 5})
    self.assertTrue(result.equals(expected))
def test_median(self):
    """Check ``gr.median`` inside ``tf_summarize`` and ``tf_mutate``.

    The odd-count cases use the first three rows per ``cut`` group of
    ``data.df_diamonds`` and require exact equality with the hard-coded
    frames. The even-count case uses two rows per group and compares to an
    absolute tolerance of 1e-9.
    """
    cut_labels = ["Fair", "Good", "Ideal", "Premium", "Very Good"]

    first_three = (
        data.df_diamonds
        >> gr.tf_group_by(X.cut)
        >> gr.tf_head(3)
        >> gr.tf_select(X.cut, X.x)
        >> gr.tf_ungroup()
    )

    # Ungrouped summarize
    result = first_three >> gr.tf_summarize(m=gr.median(X.x))
    expected = pd.DataFrame({"m": [4.05]})
    self.assertTrue(result.equals(expected))

    # Grouped summarize
    result = (
        first_three
        >> gr.tf_group_by(X.cut)
        >> gr.tf_summarize(m=gr.median(X.x))
    )
    expected = pd.DataFrame(
        {
            "cut": cut_labels,
            "m": [6.27, 4.25, 3.95, 3.89, 3.95],
        }
    )
    self.assertTrue(result.equals(expected))

    # Ungrouped mutate: the overall median on every row
    result = first_three >> gr.tf_mutate(m=gr.median(X.x))
    expected = first_three.copy()
    expected["m"] = 4.05
    self.assertTrue(result.equals(expected))

    # TODO: the grouped-mutate comparison is disabled in this test. The
    # disabled code expected each row to carry its group's median
    # (6.27, 4.25, 3.95, 3.89, 3.95 for the cuts in ``cut_labels`` order,
    # three rows each).

    # Even number of rows per group
    first_two = (
        data.df_diamonds
        >> gr.tf_group_by(X.cut)
        >> gr.tf_head(2)
        >> gr.tf_select(X.cut, X.x)
    )
    result = (
        first_two
        >> gr.tf_group_by(X.cut)
        >> gr.tf_summarize(m=gr.median(X.x))
    )
    expected = pd.DataFrame(
        {
            "cut": cut_labels,
            "m": [5.160, 4.195, 3.940, 4.045, 3.945],
        }
    )
    deviation = abs(result.m - expected.m)
    self.assertTrue(all(deviation < 0.000000001))
def test_group_mutate(self):
    """Check a grouped ``tf_mutate`` against a pandas groupby/apply reference.

    The reference applies ``group_mutate_helper`` (defined elsewhere in this
    file) to each ``cut`` group of a copy of ``data.df_diamonds``. The
    pipeline result is sorted by index before the comparison.
    """
    # Reference frame built with plain pandas
    expected = (
        data.df_diamonds.copy()
        .groupby("cut")
        .apply(group_mutate_helper)
    )

    # Same computation through the grouped pipeline
    piped = (
        data.df_diamonds
        >> gr.tf_group_by("cut")
        >> gr.tf_mutate(testcol=X.x * X.shape[0])
        >> gr.tf_ungroup()
    )

    matches = expected.equals(piped.sort_index())
    self.assertTrue(matches)