def test_product():
    """Build the product of two three-row tables and check its shape and attribute names."""
    engine = Prosto("My Prosto")

    left = engine.populate(
        table_name="Table 1",
        attributes=["A"],
        func="lambda **m: pd.DataFrame({'A': [1.0, 2.0, 3.0]})",
        tables=[],
    )
    right = engine.populate(
        table_name="Table 2",
        attributes=["B"],
        func="lambda **m: pd.DataFrame({'B': ['x', 'y', 'z']})",
        tables=[],
    )
    combined = engine.product(
        table_name="Product",
        attributes=["t1", "t2"],
        tables=["Table 1", "Table 2"],
    )

    # Evaluate in definition order: both source tables first, then the product
    for operation in (left, right, combined):
        operation.evaluate()

    product_df = combined.get_df()
    assert len(product_df.columns) == 2
    assert len(product_df) == 9  # 3 x 3 combinations
    assert product_df.columns.to_list() == ["t1", "t2"]
def test_calculate_value():
    """Calculate a column value-by-value and check both the values and their Python type."""
    engine = Prosto("My Prosto")

    source = engine.populate(
        table_name="My table",
        attributes=["A"],
        func="lambda **m: pd.DataFrame({'A': [1, 2, 3]})",
        tables=[],
    )
    derived = engine.calculate(
        name="My column",
        table=source.id,
        func="lambda x: float(x)",
        columns=["A"],
        model=None,
    )

    source.evaluate()
    derived.evaluate()

    series = source.get_series('My column')
    values = [series[0], series[1], series[2]]

    # Integer inputs 1, 2, 3 are expected to come back as the floats 1.0, 2.0, 3.0
    for value, expected in zip(values, (1.0, 2.0, 3.0)):
        assert np.isclose(value, expected)
    for value in values:
        assert isinstance(value, float)
def test_merge_path2():
    """
    Merge a column reachable over two links, with the path written as a single
    separator-delimited string ("Link::SuperLink::C") instead of a list of
    simple segment names.
    """
    engine = Prosto("My Prosto")

    # Facts
    facts = engine.populate(
        table_name="Facts",
        attributes=["A"],
        func="lambda **m: pd.DataFrame({'A': ['a', 'a', 'b', 'b']})",
        tables=[],
    )

    # Groups
    groups = engine.populate(
        table_name="Groups",
        attributes=["A", "B"],
        func="lambda **m: pd.DataFrame({'A': ['a', 'b', 'c'], 'B': [2.0, 3.0, 3.0]})",
        tables=[],
    )

    # Link: Facts -> Groups (by A)
    engine.link(
        name="Link",
        table=facts.id,
        type=groups.id,
        columns=["A"],
        linked_columns=["A"],
    )

    # SuperGroups
    super_groups = engine.populate(
        table_name="SuperGroups",
        attributes=["B", "C"],
        func="lambda **m: pd.DataFrame({'B': [2.0, 3.0, 4.0], 'C': ['x', 'y', 'z']})",
        tables=[],
    )

    # SuperLink: Groups -> SuperGroups (by B)
    engine.link(
        name="SuperLink",
        table=groups.id,
        type=super_groups.id,
        columns=["B"],
        linked_columns=["B"],
    )

    # Merge: the whole path is given as one string
    engine.merge("Merge", facts.id, ["Link::SuperLink::C"])

    engine.run()

    facts_df = facts.get_df()
    assert len(facts_df) == 4  # Same number of rows
    assert len(facts_df.columns) == 3

    merged = facts.get_series("Merge")
    assert merged.to_list() == ['x', 'x', 'y', 'y']
def test_populate():
    """Populate a table from a function string and check the resulting shape."""
    engine = Prosto("My Prosto")

    table = engine.populate(
        table_name="My table",
        attributes=["A", "B"],
        func="lambda **m: pd.DataFrame({'A': [1.0, 2.0, 3.0], 'B': ['x', 'y', 'z']})",
        tables=[],
        model={"nrows": 3},
    )
    table.evaluate()

    table_df = table.get_df()
    assert len(table_df.columns) == 2
    assert len(table_df) == 3
def test_one_key():
    """Project Facts onto a one-attribute Groups table, link back to it, and check the topology."""
    engine = Prosto("My Prosto")

    # Facts
    facts = engine.populate(
        table_name="Facts",
        attributes=["A"],
        func="lambda **m: pd.DataFrame({'A': ['a', 'a', 'b', 'b']})",
        tables=[],
    )

    # Groups
    groups = engine.project(
        table_name="Groups",
        attributes=["X"],
        tables=["Facts"],
        columns=["A"],
    )

    # Link
    link = engine.link(
        name="Link",
        table=facts.id,
        type=groups.id,
        columns=["A"],
        linked_columns=["X"],
    )

    for operation in (facts, groups, link):
        operation.evaluate()

    groups_df = groups.get_df()
    assert len(groups_df) == 2
    assert len(groups_df.columns) == 1

    # Facts 'a', 'a', 'b', 'b' are expected to reference group rows 0, 0, 1, 1
    link_values = facts.get_series("Link")
    for row, expected in enumerate([0, 0, 1, 1]):
        assert link_values[row] == expected

    #
    # Test topology
    #
    topology = Topology(engine)
    topology.translate()
    layers = topology.elem_layers

    assert len(layers) == 3
    assert {elem.id for elem in layers[0]} == {"Facts"}
    assert {elem.id for elem in layers[1]} == {"Groups"}
    assert {elem.id for elem in layers[2]} == {"Link"}
def test_filter_table():
    """Filter a base table by a computed boolean column and check the topology layers."""
    engine = Prosto("My Prosto")

    base = engine.populate(
        table_name="Base table",
        attributes=["A", "B"],
        func="lambda **m: pd.DataFrame({'A': [1.0, 2.0, 3.0], 'B': ['x', 'yy', 'zzz']})",
        tables=[],
    )

    # Boolean column that drives the filter (the function returns a boolean Series)
    predicate = engine.compute(
        name="filter_column",
        table=base.id,
        func="lambda x, param: (x['A'] > param) & (x['B'].str.len() < 3)",
        columns=["A", "B"],
        model={"param": 1.5},
    )

    base.evaluate()
    predicate.evaluate()

    filtered = engine.filter(
        table_name="Filtered table",
        attributes=["super"],
        func=None,
        tables=["Base table"],
        columns=["filter_column"],
    )
    filtered.evaluate()

    assert len(filtered.get_df().columns) == 1  # Only one link-attribute is created
    assert len(filtered.get_df()) == 1
    assert filtered.get_df()['super'][0] == 1

    #
    # Test topology
    #
    topology = Topology(engine)
    topology.translate()
    layers = topology.elem_layers

    assert len(layers) == 3
    assert {elem.id for elem in layers[0]} == {"Base table"}
    assert {elem.id for elem in layers[1]} == {"filter_column"}
    assert {elem.id for elem in layers[2]} == {"Filtered table"}
def test_product_csql():
    """Define a product of two tables through Column-SQL statements and check the result."""
    engine = Prosto("My Prosto")

    left_df = pd.DataFrame({'A': [1.0, 2.0, 3.0]})
    right_df = pd.DataFrame({'B': ['x', 'y', 'z']})

    engine.column_sql("TABLE Table_1 (A)", lambda **m: left_df)
    engine.column_sql("TABLE Table_2 (B)", lambda **m: right_df)
    engine.column_sql("PRODUCT Table_1; Table_2 -> t1; t2 -> Product")

    # The table definition must exist before anything is executed
    assert engine.get_table("Product")

    engine.run()

    product_df = engine.get_table("Product").get_df()
    assert len(product_df.columns) == 2
    assert len(product_df) == 9
    assert product_df.columns.to_list() == ["t1", "t2"]
def test_integers2():
    """Discretize integers 1..9 with origin 5, step 3, right-closed and right-labelled intervals."""
    engine = Prosto("My Prosto")

    table = engine.populate(
        table_name="My table",
        attributes=["A"],
        func="lambda **m: pd.DataFrame({'A': [1, 2, 3, 4, 5, 6, 7, 8, 9]})",
        tables=[],
    )

    discretize_model = {
        "origin": 5,
        "step": 3,
        "label": "right",
        "closed": "right",
    }
    engine.discretize(
        name="My column",
        table=table.id,
        columns=["A"],
        model=discretize_model,
    )

    engine.run()

    # Expected interval numbers:
    #   1, 2]  ->  -1
    #   3, 4, 5]  ->  0
    #   6, 7, 8]  ->  1
    #   9  ->  2
    expected = [-1, -1, 0, 0, 0, 1, 1, 1, 2]
    assert list(table.get_series('My column')) == expected
def test_two_keys():
    """Link Facts to Groups on a two-column key, via direct evaluation and via a full run."""
    engine = Prosto("My Prosto")

    def check_link(series):
        # Facts rows 0..2 match group rows 0, 1, 1; the last fact ('a', 'a') matches no group
        for row, expected in enumerate([0, 1, 1]):
            assert series[row] == expected
        assert pd.isna(series[3])

    # Facts
    facts = engine.populate(
        table_name="Facts",
        attributes=["A", "B"],
        func="lambda **m: pd.DataFrame({'A': ['a', 'b', 'b', 'a'], 'B': ['b', 'c', 'c', 'a']})",
        tables=[],
    )

    # Groups
    groups = engine.populate(
        table_name="Groups",
        attributes=["A", "B"],
        func="lambda **m: pd.DataFrame({'A': ['a', 'b', 'a'], 'B': ['b', 'c', 'c'], 'C': [1, 2, 3]})",
        tables=[],
    )

    # Link
    link = engine.link(
        name="Link",
        table=facts.id,
        type=groups.id,
        columns=["A", "B"],
        linked_columns=["A", "B"],
    )

    for operation in (facts, groups, link):
        operation.evaluate()

    facts_df = facts.get_df()
    assert len(facts_df) == 4  # Same number of rows
    assert len(facts_df.columns) == 3

    check_link(facts.get_series("Link"))

    #
    # Test topology
    #
    topology = Topology(engine)
    topology.translate()  # All data will be reset
    layers = topology.elem_layers

    assert len(layers) == 2
    assert {elem.id for elem in layers[0]} == {"Facts", "Groups"}
    assert {elem.id for elem in layers[1]} == {"Link"}

    # A full run must reproduce the result of the step-by-step evaluation
    engine.run()

    check_link(facts.get_series("Link"))
def test_groll_single():
    """Roll a sum over a window of 2 within groups defined by column G."""
    engine = Prosto("My Prosto")

    table = engine.populate(
        table_name="My table",
        attributes=["G", "A"],
        func="lambda **m: pd.DataFrame({'G': [1, 2, 1, 2], 'A': [1.0, 2.0, 3.0, 4.0]})",
        tables=[],
    )

    engine.roll(
        name="Roll",
        table=table.id,
        window="2",
        link="G",
        func="lambda x: x.sum()",
        columns=["A"],
        model={},
    )

    engine.run()

    rolled = table.get_series('Roll')

    # The first row of each group has no complete window yet
    for row in (0, 1):
        assert pd.isna(rolled[row])

    # Group 1: 1.0 + 3.0, group 2: 2.0 + 4.0
    assert np.isclose(rolled[2], 4.0)
    assert np.isclose(rolled[3], 6.0)
def test_filter_inheritance():
    """Test topology augmentation.

    A calculate column on the filtered table reads column A, which is defined
    only on the base table; the merge operation that brings it in is expected
    to be added to the topology automatically.
    """
    engine = Prosto("My Prosto")

    base = engine.populate(
        table_name="Base table",
        attributes=["A", "B"],
        func="lambda **m: pd.DataFrame({'A': [1.0, 2.0, 3.0], 'B': ['x', 'yy', 'zzz']})",
        tables=[],
    )

    # Boolean column that drives the filter (the function returns a boolean Series)
    engine.compute(
        name="filter_column",
        table=base.id,
        func="lambda x, param: (x['A'] > param) & (x['B'].str.len() < 3)",
        columns=["A", "B"],
        model={"param": 1.5},
    )

    filtered = engine.filter(
        table_name="Filtered table",
        attributes=["super"],
        func=None,
        tables=["Base table"],
        columns=["filter_column"],
    )

    # This column is defined on the filtered table but reads A from the base table
    engine.calculate(
        name="My column",
        table=filtered.id,
        func="lambda x: x + 1.0",
        columns=["A"],
        model=None,
    )

    engine.run()

    calculated = filtered.get_series('My column')
    assert np.isclose(len(calculated), 1)
    assert np.isclose(calculated[0], 3.0)

    # Column A had to be added automatically by the augmentation procedure:
    # it is inherited from the base table, materialized via a merge operation,
    # and stores the original values of the base column
    inherited = filtered.get_series('A')
    assert np.isclose(inherited[0], 2)
def test_aggregate_with_path():
    """Aggregate a measure given as a column path.

    The measure "Link::B" is not a column of Facts; it is reached through the
    link to Groups, so a merge operation has to be produced automatically
    before the aggregation can run.
    """
    ctx = Prosto("My Prosto")

    # Facts
    f_tbl = ctx.populate(
        table_name="Facts",
        attributes=["A", "M"],
        func="lambda **m: pd.DataFrame({'A': ['a', 'a', 'b', 'b'], 'M': [1.0, 2.0, 3.0, 4.0]})",
        tables=[],
    )

    # Groups
    g_tbl = ctx.populate(
        table_name="Groups",
        attributes=["A", "B"],
        func="lambda **m: pd.DataFrame({'A': ['a', 'b', 'c'], 'B': [3.0, 2.0, 1.0]})",
        tables=[],
    )

    # Link
    ctx.link(
        name="Link",
        table=f_tbl.id,
        type=g_tbl.id,
        columns=["A"],
        linked_columns=["A"],
    )

    # Aggregation over the path column
    ctx.aggregate(
        name="Aggregate",
        table=g_tbl.id,
        tables=["Facts"],
        link="Link",
        func="lambda x, bias, **model: x.sum() + bias",
        columns=["Link::B"],
        model={"bias": 0.0},
    )

    ctx.run()

    a_clm_data = g_tbl.get_series('Aggregate')

    # Group 'a': two facts with B=3.0; group 'b': two facts with B=2.0; group 'c': no facts
    assert a_clm_data[0] == 6.0
    assert a_clm_data[1] == 4.0
    assert a_clm_data[2] == 0.0
def test_product_inheritance():
    """
    Add a calculate column to the product table that reads a column of a base table.
    The system has to resolve the missing column by automatically inserting a new operation.
    """
    engine = Prosto("My Prosto")

    engine.populate(
        table_name="Table 1",
        attributes=["A"],
        func="lambda **m: pd.DataFrame({'A': [1.0, 2.0, 3.0]})",
        tables=[],
    )
    engine.populate(
        table_name="Table 2",
        attributes=["B"],
        func="lambda **m: pd.DataFrame({'B': ['x', 'y', 'z']})",
        tables=[],
    )
    combined = engine.product(
        table_name="Product",
        attributes=["t1", "t2"],
        tables=["Table 1", "Table 2"],
    )

    # Column A is referenced on the product table but actually exists only in a base table
    engine.calculate(
        name="My column",
        table=combined.id,
        func="lambda x: x + 1.0",
        columns=["A"],
        model=None,
    )

    engine.run()

    # Two attributes plus two more columns: one merge (augmented) and one calculate column
    assert len(combined.get_df().columns) == 4

    expected = [2.0, 2.0, 2.0, 3.0, 3.0, 3.0, 4.0, 4.0, 4.0]
    assert combined.get_series('My column').to_list() == expected
def test_calculate_with_path():
    """Test topology augmentation.

    The calculate column reads "Link::B", a column path that goes through the
    link to Groups, so a merge operation has to be produced automatically.
    """
    ctx = Prosto("My Prosto")

    # Facts
    f_tbl = ctx.populate(
        table_name="Facts",
        attributes=["A", "M"],
        func="lambda **m: pd.DataFrame({'A': ['a', 'a', 'b', 'b'], 'M': [1.0, 2.0, 3.0, 4.0]})",
        tables=[],
    )

    # Groups
    g_tbl = ctx.populate(
        table_name="Groups",
        attributes=["A", "B"],
        func="lambda **m: pd.DataFrame({'A': ['a', 'b', 'c'], 'B': [3.0, 2.0, 1.0]})",
        tables=[],
    )

    # Link
    ctx.link(
        name="Link",
        table=f_tbl.id,
        type=g_tbl.id,
        columns=["A"],
        linked_columns=["A"],
    )

    # Calculate: own column M plus the linked column B
    ctx.calculate(
        name="My column",
        table=f_tbl.id,
        func="lambda x: x['M'] + x['Link::B']",
        columns=["M", "Link::B"],
        model=None,
    )

    ctx.run()

    clm_data = f_tbl.get_series('My column')

    # M = 1, 2, 3, 4 and linked B = 3, 3, 2, 2
    assert clm_data[0] == 4.0
    assert clm_data[1] == 5.0
    assert clm_data[2] == 5.0
    assert clm_data[3] == 6.0
def test_roll_multiple():
    """Roll a function over two columns at once, via direct evaluation and via a full run."""
    engine = Prosto("My Prosto")

    def check_roll(series):
        # The first row has no complete window; each later window sums to 8 over A and B
        assert pd.isna(series[0])
        for row in (1, 2):
            assert np.isclose(series[row], 8.0)

    table = engine.populate(
        table_name="My table",
        attributes=["A", "B"],
        func="lambda **m: pd.DataFrame({'A': [1, 2, 3], 'B': [3, 2, 1]})",
        tables=[],
    )

    roll = engine.roll(
        name="Roll",
        table=table.id,
        window="2",
        link=None,
        func="lambda x: x['A'].sum() + x['B'].sum()",
        columns=["A", "B"],
        model={},
    )

    table.evaluate()
    roll.evaluate()

    check_roll(table.get_series('Roll'))

    #
    # Test topology
    #
    topology = Topology(engine)
    topology.translate()  # All data will be reset
    layers = topology.elem_layers

    assert len(layers) == 2
    assert {elem.id for elem in layers[0]} == {"My table"}
    assert {elem.id for elem in layers[1]} == {"Roll"}

    # A full run must reproduce the result of the step-by-step evaluation
    engine.run()

    check_roll(table.get_series('Roll'))
def test_compute():
    """Compute a whole column at once (a shift by -1), via direct evaluation and via a full run."""
    engine = Prosto("My Prosto")

    def check_shifted(series):
        # Shifting [1, 2, 3] by -1 gives [2, 3, <missing>]
        assert np.isclose(series[0], 2.0)
        assert np.isclose(series[1], 3.0)
        assert pd.isna(series[2])

    table = engine.populate(
        table_name="My table",
        attributes=["A"],
        func="lambda **m: pd.DataFrame({'A': [1, 2, 3]})",
        tables=[],
    )

    # The model dict is forwarded to Series.shift() as keyword arguments
    column = engine.compute(
        name="My column",
        table=table.id,
        func="lambda x, **model: x.shift(**model)",
        columns=["A"],
        model={"periods": -1},
    )

    table.evaluate()
    column.evaluate()

    check_shifted(table.get_series('My column'))

    #
    # Test topology
    #
    topology = Topology(engine)
    topology.translate()  # All data will be reset
    layers = topology.elem_layers

    assert len(layers) == 2
    assert {elem.id for elem in layers[0]} == {"My table"}
    assert {elem.id for elem in layers[1]} == {"My column"}

    # A full run must reproduce the result of the step-by-step evaluation
    engine.run()

    check_shifted(table.get_series('My column'))
def test_integers():
    """Discretize integers 1..9 with origin 5 and step 3, using the default interval options."""
    engine = Prosto("My Prosto")

    table = engine.populate(
        table_name="My table",
        attributes=["A"],
        func="lambda **m: pd.DataFrame({'A': [1, 2, 3, 4, 5, 6, 7, 8, 9]})",
        tables=[],
    )

    discretize_model = {
        "origin": 5,
        "step": 3,
    }
    engine.discretize(
        name="My column",
        table=table.id,
        columns=["A"],
        model=discretize_model,
    )

    engine.run()

    # Expected interval numbers:
    #   1  ->  -2
    #   [2, 3, 4  ->  -1
    #   [5, 6, 7  ->  0
    #   [8, 9  ->  1
    expected = [-2, -1, -1, -1, 0, 0, 0, 1, 1]
    assert list(table.get_series('My column')) == expected
def test_calculate_value():
    """Check incremental evaluation of a calculate column.

    Records are added to and removed from an initially empty table; after each
    run the change status (added/removed counters and ranges) must be clean.
    """
    ctx = Prosto("My Prosto")
    ctx.incremental = True

    tbl = ctx.create_table(
        table_name="My table",
        attributes=["A"],
    )

    ctx.calculate(
        name="My column",
        table=tbl.id,
        func="lambda x: float(x)",
        columns=["A"],
        model=None,
    )

    ctx.run()  # Inference on empty data

    tbl.data.add({"A": 1})  # New record is added and marked as added

    # Assert new change status
    assert tbl.data.added_length() == 1

    ctx.run()

    # Assert clean change status and results of inference
    assert tbl.data.added_length() == 0

    tbl.data.add({"A": 2})
    tbl.data.add({"A": 3})

    # Assert new change status
    assert tbl.data.added_length() == 2

    # For debug purposes, modify an old row (one that was not recently added but was evaluated before).
    # A single .loc assignment is used instead of chained indexing (tbl_df['A'][0] = 10), which may
    # write to a temporary object and is a no-op under pandas Copy-on-Write.
    tbl_df = tbl.data.get_df()
    tbl_df.loc[0, 'A'] = 10  # Old value is 1. Prosto does not see this change

    ctx.run()

    # The manual modification is invisible to Prosto, so the row must not be re-computed
    # and the derived column must still hold the old value
    assert tbl_df['My column'][0] == 1

    # Assert clean change status and results of inference
    assert tbl.data.added_length() == 0

    tbl.data.remove(1)  # Remove one oldest record by marking it as removed

    # Assert new change status
    assert tbl.data.removed_length() == 1

    ctx.run()

    # Assert clean change status and results of inference
    assert tbl.data.removed_length() == 0

    tbl.data.remove_all()  # Remove all records by marking them as removed

    ctx.run()

    # Assert clean change status: both ranges are empty and positioned after the three added records
    assert tbl.data.added_range.start == 3
    assert tbl.data.added_range.end == 3
    assert tbl.data.removed_range.start == 3
    assert tbl.data.removed_range.end == 3
def test_merge_path():
    """Merge a column reachable over two links, with the path given as a list of segment names."""
    engine = Prosto("My Prosto")

    expected_merge = ['x', 'x', 'y', 'y']

    # Facts
    facts = engine.populate(
        table_name="Facts",
        attributes=["A"],
        func="lambda **m: pd.DataFrame({'A': ['a', 'a', 'b', 'b']})",
        tables=[],
    )

    # Groups
    groups = engine.populate(
        table_name="Groups",
        attributes=["A", "B"],
        func="lambda **m: pd.DataFrame({'A': ['a', 'b', 'c'], 'B': [2.0, 3.0, 3.0]})",
        tables=[],
    )

    # Link: Facts -> Groups (by A)
    link = engine.link(
        name="Link",
        table=facts.id,
        type=groups.id,
        columns=["A"],
        linked_columns=["A"],
    )

    # SuperGroups
    super_groups = engine.populate(
        table_name="SuperGroups",
        attributes=["B", "C"],
        func="lambda **m: pd.DataFrame({'B': [2.0, 3.0, 4.0], 'C': ['x', 'y', 'z']})",
        tables=[],
    )

    # SuperLink: Groups -> SuperGroups (by B)
    super_link = engine.link(
        name="SuperLink",
        table=groups.id,
        type=super_groups.id,
        columns=["B"],
        linked_columns=["B"],
    )

    # Merge
    merge = engine.merge("Merge", facts.id, ["Link", "SuperLink", "C"])

    # Tables first, then links, then the merge that depends on both links
    for operation in (facts, groups, super_groups, link, super_link, merge):
        operation.evaluate()

    facts_df = facts.get_df()
    assert len(facts_df) == 4  # Same number of rows
    assert len(facts_df.columns) == 3

    assert facts.get_series("Merge").to_list() == expected_merge

    #
    # Test topology
    #
    topology = Topology(engine)
    topology.translate()  # All data will be reset
    layers = topology.elem_layers

    assert len(layers) == 3
    assert {elem.id for elem in layers[0]} == {"Facts", "Groups", "SuperGroups"}
    assert {elem.id for elem in layers[1]} == {"Link", "SuperLink"}
    assert {elem.id for elem in layers[2]} == {"Merge"}

    # A full run must reproduce the result of the step-by-step evaluation
    engine.run()

    assert facts.get_series("Merge").to_list() == expected_merge
def test_filter_project():
    """
    Test resolution of inherited attributes which do not exist in the filtered table
    but must be automatically merged from the base table.

    Scenario: populate, filter twice, then project the second filtered table using
    a column of the base table (which has to be inherited).
    """
    engine = Prosto("My Prosto")

    source_df = pd.DataFrame({
        'A': [1.0, 2.0, 3.0, 4.0],
        'B': ['x', 'x', 'y', 'zzz'],
    })

    engine.column_sql("TABLE Base(A, B)", lambda **m: source_df)
    engine.column_sql("FILTER Base (A, B) -> super -> Filtered", lambda x: x['A'] < 4.0)
    engine.column_sql("FILTER Filtered (A) -> super -> Filtered_2", lambda x: x < 3.0)

    # Column B is used here although it exists only in the base table
    engine.column_sql("PROJECT Filtered_2 (B) -> new_column -> Groups(C)")

    engine.run()

    projected = engine.get_table("Groups").get_series('C')
    assert projected.to_list() == ['x']
def test_filter_calculate():
    """
    Test resolution of inherited attributes which do not exist in the filtered table
    but must be automatically merged from the base table.

    Scenario: populate, filter, then calculate a column in the filtered table from
    a column of the base table (which has to be inherited).
    """
    engine = Prosto("My Prosto")

    source_df = pd.DataFrame({'A': [1.0, 2.0, 3.0], 'B': ['x', 'yy', 'zzz']})

    engine.column_sql("TABLE Base (A, B)", lambda **m: source_df)
    engine.column_sql("FILTER Base (A) -> super -> Filtered", lambda x: x < 3.0)

    # Column B is used here although it exists only in the base table
    engine.column_sql(
        "CALCULATE Filtered (B) -> filter_column",
        lambda x: len(x),
    )

    engine.run()

    calculated = engine.get_table("Filtered").get_series('filter_column')
    assert calculated.to_list() == [1, 2]
def test_filter_csql():
    """Filter through Column-SQL: first via an explicit boolean column, then via a predicate function."""
    #
    # Filter by an explicitly calculated boolean column
    #
    engine = Prosto("My Prosto")

    source_df = pd.DataFrame({'A': [1.0, 2.0, 3.0], 'B': ['x', 'yy', 'zzz']})

    engine.column_sql("TABLE Base (A, B)", lambda **m: source_df)
    engine.column_sql(
        "CALCULATE Base (A, B) -> filter_column",
        lambda x, param: (x['A'] > param) & (len(x['B']) < 3),
        {"param": 1.5},
    )
    engine.column_sql("FILTER Base (filter_column) -> super -> Filtered")

    for table_name in ("Base", "Filtered"):
        assert engine.get_table(table_name)

    engine.run()

    selected = engine.get_table("Filtered").get_series('super')
    assert list(selected) == [1]

    #
    # Filter with a predicate function and no explicit calculate column
    #
    engine = Prosto("My Prosto")

    source_df = pd.DataFrame({'A': [1.0, 2.0, 3.0], 'B': ['x', 'yy', 'zzz']})

    # Here the data frame itself is passed instead of a function returning it
    engine.column_sql("TABLE Base (A, B)", source_df)
    engine.column_sql(
        "FILTER Base (A, B) -> super -> Filtered",
        lambda x, param: (x['A'] > param) & (len(x['B']) < 3),
        {"param": 1.5},
    )

    for table_name in ("Base", "Filtered"):
        assert engine.get_table(table_name)

    engine.run()

    selected = engine.get_table("Filtered").get_series('super')
    assert list(selected) == [1]
def test_aggregate_csql():
    """Link Facts to Groups and aggregate measure M per group, all through Column-SQL."""
    engine = Prosto("My Prosto")

    facts_source = pd.DataFrame({
        'A': ['a', 'a', 'b', 'b'],
        'M': [1.0, 2.0, 3.0, 4.0],
        'N': [4.0, 3.0, 2.0, 1.0],
    })
    groups_source = pd.DataFrame({'A': ['a', 'b', 'c']})

    engine.column_sql("TABLE Facts (A, M, N)", lambda **m: facts_source)
    engine.column_sql("TABLE Groups (A)", lambda **m: groups_source)
    engine.column_sql("LINK Facts (A) -> new_column -> Groups (A)")
    engine.column_sql(
        "AGGREGATE Facts (M) -> new_column -> Groups (Aggregate)",
        lambda x, bias, **model: x.sum() + bias,
        {"bias": 0.0},
    )

    # All definitions must exist before anything is executed
    assert engine.get_table("Facts")
    assert engine.get_table("Groups")
    assert engine.get_column("Facts", "new_column")

    engine.run()

    # Group 'a': 1 + 2, group 'b': 3 + 4, group 'c': no facts
    aggregated = engine.get_table("Groups").get_series('Aggregate')
    assert list(aggregated) == [3.0, 7.0, 0.0]
def test_link_csql():
    """Define a link column from Facts to Groups through Column-SQL and check its values."""
    engine = Prosto("My Prosto")

    facts_source = pd.DataFrame({'A': ['a', 'a', 'b', 'b']})
    groups_source = pd.DataFrame({'A': ['a', 'b', 'c']})

    engine.column_sql("TABLE Facts (A)", lambda **m: facts_source)
    engine.column_sql("TABLE Groups (A)", lambda **m: groups_source)
    engine.column_sql("LINK Facts (A) -> new_column -> Groups (A)")

    # All definitions must exist before anything is executed
    assert engine.get_table("Facts")
    assert engine.get_table("Groups")
    assert engine.get_column("Facts", "new_column")

    engine.run()

    link_values = engine.get_table("Facts").get_series('new_column')
    assert list(link_values) == [0, 0, 1, 1]
def test_aggregate():
    """Aggregate facts per group over a link.

    Covers three stages: step-by-step evaluation, a full run after topology
    translation, and a second aggregate column that reads two measures at once.
    """
    ctx = Prosto("My Prosto")

    # Facts
    f_tbl = ctx.populate(
        table_name="Facts",
        attributes=["A", "M"],
        func="lambda **m: pd.DataFrame({'A': ['a', 'a', 'b', 'b'], 'M': [1.0, 2.0, 3.0, 4.0], 'N': [4.0, 3.0, 2.0, 1.0]})",
        tables=[],
    )

    # Groups
    g_tbl = ctx.populate(
        table_name="Groups",
        attributes=["A"],
        func="lambda **m: pd.DataFrame({'A': ['a', 'b', 'c']})",
        tables=[],
    )

    # Link
    l_clm = ctx.link(
        name="Link",
        table=f_tbl.id,
        type=g_tbl.id,
        columns=["A"],
        linked_columns=["A"],
    )

    # Aggregation
    a_clm = ctx.aggregate(
        name="Aggregate",
        table=g_tbl.id,
        tables=["Facts"],
        link="Link",
        func="lambda x, bias, **model: x.sum() + bias",
        columns=["M"],
        model={"bias": 0.0},
    )

    f_tbl.evaluate()
    g_tbl.evaluate()
    l_clm.evaluate()
    a_clm.evaluate()

    g_tbl_data = g_tbl.get_df()
    assert len(g_tbl_data) == 3  # Same number of rows
    assert len(g_tbl_data.columns) == 2  # Attribute A plus the new aggregate column

    # Group 'a': 1 + 2, group 'b': 3 + 4, group 'c': no facts
    a_clm_data = g_tbl.get_series('Aggregate')
    assert a_clm_data[0] == 3.0
    assert a_clm_data[1] == 7.0
    assert a_clm_data[2] == 0.0

    #
    # Test topology
    #
    topology = Topology(ctx)
    topology.translate()  # All data will be reset
    layers = topology.elem_layers

    assert len(layers) == 3
    assert {x.id for x in layers[0]} == {"Facts", "Groups"}
    assert {x.id for x in layers[1]} == {"Link"}
    assert {x.id for x in layers[2]} == {"Aggregate"}

    ctx.run()

    a_clm_data = g_tbl.get_series('Aggregate')
    assert a_clm_data[0] == 3.0
    assert a_clm_data[1] == 7.0
    assert a_clm_data[2] == 0.0

    #
    # Aggregation of multiple columns
    #
    ctx.aggregate(
        name="Aggregate 2",
        table=g_tbl.id,
        tables=["Facts"],
        link="Link",
        func="lambda x, my_param, **model: x['M'].sum() + x['N'].sum() + my_param",
        columns=["M", "N"],
        model={"my_param": 0.0},
    )

    ctx.translate()  # All data will be reset
    # NOTE: the original author observed that the new column is NOT added to the
    # existing data frame (not clear where it is), hence the full re-run
    ctx.run()

    # Group 'a': (1 + 2) + (4 + 3), group 'b': (3 + 4) + (2 + 1), group 'c': no facts
    a_clm2_data = g_tbl.get_series('Aggregate 2')
    assert a_clm2_data[0] == 10.0
    assert a_clm2_data[1] == 10.0
    assert a_clm2_data[2] == 0.0
def test_csql_project():
    """Project Facts onto a new Groups table through Column-SQL and check the table and the link."""
    engine = Prosto("My Prosto")

    facts_source = pd.DataFrame({'A': ['a', 'a', 'b', 'b']})

    engine.column_sql("TABLE Facts (A)", lambda **m: facts_source)
    engine.column_sql("PROJECT Facts (A) -> new_column -> Groups (A)")

    # All definitions must exist before anything is executed
    assert engine.get_table("Facts")
    assert engine.get_table("Groups")
    assert engine.get_column("Facts", "new_column")

    engine.run()

    # Two distinct values of A give two groups with a single attribute
    assert len(engine.get_table("Groups").get_df()) == 2
    assert len(engine.get_table("Groups").get_df().columns) == 1

    link_values = engine.get_table("Facts").get_series('new_column')
    assert list(link_values) == [0, 0, 1, 1]
def test_calc_csql():
    """Define a calculate column through Column-SQL, with functions given in-query and by reference."""
    expected = [1.0, 2.0, 3.0]

    #
    # Test 1: function in-query
    #
    engine = Prosto("My Prosto")

    engine.column_sql("TABLE My_table (A) FUNC lambda **m: pd.DataFrame({'A': [1, 2, 3]})")
    engine.column_sql("CALCULATE My_table (A) -> new_column FUNC lambda x: float(x)")

    assert engine.get_table("My_table")
    assert engine.get_column("My_table", "new_column")

    engine.run()

    calculated = engine.get_table("My_table").get_series('new_column')
    assert list(calculated) == expected

    #
    # Test 2: function by-reference
    #
    engine = Prosto("My Prosto")

    source_df = pd.DataFrame({'A': [1, 2, 3]})

    # The data frame is passed as an object: an in-query FUNC "lambda **m: df"
    # would not work because df cannot be resolved during population
    engine.column_sql("TABLE My_table (A)", source_df)
    engine.column_sql("CALCULATE My_table (A) -> new_column", lambda x: float(x))

    assert engine.get_table("My_table")
    assert engine.get_column("My_table", "new_column")

    engine.run()

    calculated = engine.get_table("My_table").get_series('new_column')
    assert list(calculated) == expected
def test_roll_csql():
    """Define a rolling sum with a window of 2 through Column-SQL and check its values."""
    ctx = Prosto("My Prosto")

    df = pd.DataFrame({'A': [1.0, 2.0, 3.0]})

    ctx.column_sql("TABLE My_table (A)", lambda **m: df)
    ctx.column_sql("ROLL My_table (A) -> new_column WINDOW 2", lambda x: x.sum())

    # All definitions must exist before anything is executed
    assert ctx.get_table("My_table")
    assert ctx.get_column("My_table", "new_column")

    ctx.run()

    rolled = list(ctx.get_table("My_table").get_series('new_column'))

    # The first row has no complete window. Its missing value is checked with pd.isna()
    # because a float NaN never compares equal to None (nor to itself), so a plain list
    # comparison against [None, 3.0, 5.0] fails whenever the column is float-typed.
    assert len(rolled) == 3
    assert pd.isna(rolled[0])
    assert rolled[1:] == [3.0, 5.0]