def test_aggregate_csql():
    """Check AGGREGATE: sum a fact column per group through a link column.

    Facts are linked to Groups on column A via ``new_column``; the values of
    Facts(M) are then summed (plus a zero bias) into Groups(Aggregate).
    Group 'c' has no matching facts and the test expects 0.0 for it.
    """
    ctx = Prosto("My Prosto")

    facts = pd.DataFrame({
        'A': ['a', 'a', 'b', 'b'],
        'M': [1.0, 2.0, 3.0, 4.0],
        'N': [4.0, 3.0, 2.0, 1.0],
    })
    groups = pd.DataFrame({'A': ['a', 'b', 'c']})

    def sum_with_bias(x, bias, **model):
        # The bias comes from the model dictionary given to column_sql.
        return x.sum() + bias

    # Source tables
    ctx.column_sql("TABLE Facts (A, M, N)", lambda **m: facts)
    ctx.column_sql("TABLE Groups (A)", lambda **m: groups)

    # Link column from facts to groups, then the aggregate column itself
    ctx.column_sql("LINK Facts (A) -> new_column -> Groups (A)")
    ctx.column_sql(
        "AGGREGATE Facts (M) -> new_column -> Groups (Aggregate)",
        sum_with_bias,
        {"bias": 0.0},
    )

    # Definitions must be registered before execution
    assert ctx.get_table("Facts")
    assert ctx.get_table("Groups")
    assert ctx.get_column("Facts", "new_column")

    ctx.run()

    aggregated = ctx.get_table("Groups").get_series('Aggregate')
    assert list(aggregated) == [3.0, 7.0, 0.0]
def test_calc_csql():
    """Check CALCULATE with functions given in-query and by reference.

    Both scenarios build a one-column table A = [1, 2, 3] and expect the
    calculated ``new_column`` to hold the same values converted to float.
    """
    expected = [1.0, 2.0, 3.0]

    #
    # Scenario 1: functions are written inside the statement (FUNC clause)
    #
    inline_ctx = Prosto("My Prosto")

    inline_ctx.column_sql(
        "TABLE My_table (A) FUNC lambda **m: pd.DataFrame({'A': [1, 2, 3]})"
    )
    inline_ctx.column_sql(
        "CALCULATE My_table (A) -> new_column FUNC lambda x: float(x)"
    )

    assert inline_ctx.get_table("My_table")
    assert inline_ctx.get_column("My_table", "new_column")

    inline_ctx.run()

    inline_result = inline_ctx.get_table("My_table").get_series('new_column')
    assert list(inline_result) == expected

    #
    # Scenario 2: data and function are passed as Python objects
    #
    ref_ctx = Prosto("My Prosto")

    df = pd.DataFrame({'A': [1, 2, 3]})

    def to_float(x):
        return float(x)

    # The frame is handed over directly. Per the original note, an in-query
    # FUNC "lambda **m: df" is not an option here because df cannot be
    # resolved during population.
    ref_ctx.column_sql("TABLE My_table (A)", df)
    ref_ctx.column_sql("CALCULATE My_table (A) -> new_column", to_float)

    assert ref_ctx.get_table("My_table")
    assert ref_ctx.get_column("My_table", "new_column")

    ref_ctx.run()

    ref_result = ref_ctx.get_table("My_table").get_series('new_column')
    assert list(ref_result) == expected
def test_roll_csql():
    """Check ROLL: a sum over a window of size 2 on column A.

    With A = [1.0, 2.0, 3.0] the test expects [None, 3.0, 5.0].
    """
    ctx = Prosto("My Prosto")

    source = pd.DataFrame({'A': [1.0, 2.0, 3.0]})

    def window_sum(x):
        return x.sum()

    ctx.column_sql("TABLE My_table (A)", lambda **m: source)
    ctx.column_sql("ROLL My_table (A) -> new_column WINDOW 2", window_sum)

    assert ctx.get_table("My_table")
    assert ctx.get_column("My_table", "new_column")

    ctx.run()

    rolled = ctx.get_table("My_table").get_series('new_column')
    # NOTE(review): the first expected element is None; if the engine yields
    # NaN for the incomplete window this comparison would not hold - confirm.
    assert list(rolled) == [None, 3.0, 5.0]
def test_link_csql():
    """Check LINK: each fact row points to the group row with the same A.

    Facts A = ['a', 'a', 'b', 'b'] linked to Groups A = ['a', 'b', 'c'] is
    expected to give the link values [0, 0, 1, 1].
    """
    ctx = Prosto("My Prosto")

    facts = pd.DataFrame({'A': ['a', 'a', 'b', 'b']})
    groups = pd.DataFrame({'A': ['a', 'b', 'c']})

    ctx.column_sql("TABLE Facts (A)", lambda **m: facts)
    ctx.column_sql("TABLE Groups (A)", lambda **m: groups)
    ctx.column_sql("LINK Facts (A) -> new_column -> Groups (A)")

    # Both tables and the link column must be registered before execution
    for table_name in ("Facts", "Groups"):
        assert ctx.get_table(table_name)
    assert ctx.get_column("Facts", "new_column")

    ctx.run()

    link_values = ctx.get_table("Facts").get_series('new_column')
    assert list(link_values) == [0, 0, 1, 1]
def test_csql_project():
    """Check PROJECT: derive a Groups table from the values of Facts(A).

    Facts A = ['a', 'a', 'b', 'b'] is expected to produce a Groups table with
    2 rows and 1 column, and a link column in Facts equal to [0, 0, 1, 1].
    """
    ctx = Prosto("My Prosto")

    facts = pd.DataFrame({'A': ['a', 'a', 'b', 'b']})

    ctx.column_sql("TABLE Facts (A)", lambda **m: facts)
    ctx.column_sql("PROJECT Facts (A) -> new_column -> Groups (A)")

    # The Groups table is only declared by the PROJECT statement
    assert ctx.get_table("Facts")
    assert ctx.get_table("Groups")
    assert ctx.get_column("Facts", "new_column")

    ctx.run()

    groups_data = ctx.get_table("Groups").get_df()
    assert len(groups_data) == 2
    assert len(groups_data.columns) == 1

    link_values = ctx.get_table("Facts").get_series('new_column')
    assert list(link_values) == [0, 0, 1, 1]
def test_product_csql():
    """Check PRODUCT: combine two 3-row tables into one product table.

    The Product table is expected to have 9 rows and exactly the two
    columns "t1" and "t2".
    """
    ctx = Prosto("My Prosto")

    first = pd.DataFrame({'A': [1.0, 2.0, 3.0]})
    second = pd.DataFrame({'B': ['x', 'y', 'z']})

    ctx.column_sql("TABLE Table_1 (A)", lambda **m: first)
    ctx.column_sql("TABLE Table_2 (B)", lambda **m: second)
    ctx.column_sql("PRODUCT Table_1; Table_2 -> t1; t2 -> Product")

    assert ctx.get_table("Product")

    ctx.run()

    product_table = ctx.get_table("Product")
    assert len(product_table.get_df().columns) == 2
    assert len(product_table.get_df()) == 9
    assert product_table.get_df().columns.to_list() == ["t1", "t2"]
def test_filter_csql():
    """Check FILTER with a precomputed boolean column and with a predicate.

    In both scenarios a row is selected when A > 1.5 and B is shorter than
    3 characters. For the data used, the test expects the ``super`` column of
    the Filtered table to be [1].
    """

    def is_selected(x, param):
        # x is indexed by the column names 'A' and 'B'; param is the threshold
        # supplied through the model dictionary.
        return (x['A'] > param) & (len(x['B']) < 3)

    #
    # Scenario 1: filter on an explicitly calculated boolean column
    #
    ctx = Prosto("My Prosto")

    base = pd.DataFrame({'A': [1.0, 2.0, 3.0], 'B': ['x', 'yy', 'zzz']})

    ctx.column_sql("TABLE Base (A, B)", lambda **m: base)
    ctx.column_sql(
        "CALCULATE Base (A, B) -> filter_column",
        is_selected,
        {"param": 1.5},
    )
    ctx.column_sql("FILTER Base (filter_column) -> super -> Filtered")

    assert ctx.get_table("Base")
    assert ctx.get_table("Filtered")

    ctx.run()

    selected = ctx.get_table("Filtered").get_series('super')
    assert list(selected) == [1]

    #
    # Scenario 2: filter with a predicate function, no explicit calculate column
    #
    ctx = Prosto("My Prosto")

    base = pd.DataFrame({'A': [1.0, 2.0, 3.0], 'B': ['x', 'yy', 'zzz']})

    # Here the data frame is passed directly rather than via a function
    ctx.column_sql("TABLE Base (A, B)", base)
    ctx.column_sql(
        "FILTER Base (A, B) -> super -> Filtered",
        is_selected,
        {"param": 1.5},
    )

    assert ctx.get_table("Base")
    assert ctx.get_table("Filtered")

    ctx.run()

    selected = ctx.get_table("Filtered").get_series('super')
    assert list(selected) == [1]
def test_filter_calculate():
    """
    Verify that attributes missing from a filtered table are resolved by
    automatically merging them from the base table.

    Scenario: populate a base table, filter it on A, then calculate a column
    in the filtered table from B, which is only defined in the base table and
    therefore has to be inherited.
    """
    ctx = Prosto("My Prosto")

    base = pd.DataFrame({'A': [1.0, 2.0, 3.0], 'B': ['x', 'yy', 'zzz']})

    def below_three(x):
        return x < 3.0

    def text_length(x):
        return len(x)

    ctx.column_sql("TABLE Base (A, B)", lambda **m: base)
    ctx.column_sql("FILTER Base (A) -> super -> Filtered", below_three)

    # Column B is referenced on Filtered although it is only populated in Base
    ctx.column_sql("CALCULATE Filtered (B) -> filter_column", text_length)

    ctx.run()

    lengths = ctx.get_table("Filtered").get_series('filter_column')
    assert lengths.to_list() == [1, 2]
def test_filter_project():
    """
    Verify that attributes missing from a filtered table are resolved by
    automatically merging them from the base table.

    Scenario: populate a base table, filter it twice (A < 4.0, then A < 3.0),
    and project the second filtered table on B, which is only defined in the
    base table and therefore has to be inherited.
    """
    ctx = Prosto("My Prosto")

    base = pd.DataFrame({
        'A': [1.0, 2.0, 3.0, 4.0],
        'B': ['x', 'x', 'y', 'zzz'],
    })

    def a_below_four(x):
        return x['A'] < 4.0

    def below_three(x):
        return x < 3.0

    ctx.column_sql("TABLE Base(A, B)", lambda **m: base)

    # Two stacked filters: Base -> Filtered -> Filtered_2
    ctx.column_sql("FILTER Base (A, B) -> super -> Filtered", a_below_four)
    ctx.column_sql("FILTER Filtered (A) -> super -> Filtered_2", below_three)

    # Column B is referenced on Filtered_2 although it is only populated in Base
    ctx.column_sql("PROJECT Filtered_2 (B) -> new_column -> Groups(C)")

    ctx.run()

    group_values = ctx.get_table("Groups").get_series('C')
    assert group_values.to_list() == ['x']