def test_merge_path2():
    """Merge a column reached through two links, using a separator-based path.

    Same scenario as ``test_merge_path``, but the complex path is written as
    one string with ``::`` separators instead of a list of simple segment
    names, so only the definition of the merge operation differs.
    """
    prosto = Prosto("My Prosto")

    # Facts: the table that receives the merged column
    facts = prosto.populate(
        table_name="Facts",
        attributes=["A"],
        func="lambda **m: pd.DataFrame({'A': ['a', 'a', 'b', 'b']})",
        tables=[],
    )

    # Groups: first hop of the path
    groups = prosto.populate(
        table_name="Groups",
        attributes=["A", "B"],
        func="lambda **m: pd.DataFrame({'A': ['a', 'b', 'c'], 'B': [2.0, 3.0, 3.0]})",
        tables=[],
    )

    # Link: Facts.A -> Groups.A
    prosto.link(
        name="Link",
        table=facts.id,
        type=groups.id,
        columns=["A"],
        linked_columns=["A"],
    )

    # SuperGroups: second hop of the path
    super_groups = prosto.populate(
        table_name="SuperGroups",
        attributes=["B", "C"],
        func="lambda **m: pd.DataFrame({'B': [2.0, 3.0, 4.0], 'C': ['x', 'y', 'z']})",
        tables=[],
    )

    # SuperLink: Groups.B -> SuperGroups.B
    prosto.link(
        name="SuperLink",
        table=groups.id,
        type=super_groups.id,
        columns=["B"],
        linked_columns=["B"],
    )

    # Merge: the whole path is given as a single separator-delimited string
    prosto.merge("Merge", facts.id, ["Link::SuperLink::C"])

    prosto.run()

    facts_df = facts.get_df()
    assert len(facts_df) == 4  # Same number of rows
    assert len(facts_df.columns) == 3

    merged = facts.get_series("Merge")
    assert merged.to_list() == ['x', 'x', 'y', 'y']
def test_two_keys():
    """Project Facts onto a two-key Groups table and link Facts back to it.

    Groups is produced by projecting the Facts columns ``A`` and ``B`` onto
    the attributes ``X`` and ``Y``. The test checks the projected table and
    the link column after evaluating each element by hand, then checks the
    topology layers and that a full ``ctx.run()`` repopulates Groups after it
    has been emptied.
    """
    # NOTE(review): another function named test_two_keys is visible in this
    # source. If both live in the same module, the later definition shadows
    # the earlier one and only one of them is collected - confirm that they
    # belong to separate test files.
    ctx = Prosto("My Prosto")

    # Facts
    f_tbl = ctx.populate(
        table_name="Facts",
        attributes=["A", "B"],
        func="lambda **m: pd.DataFrame({'A': ['a', 'b', 'b', 'a'], 'B': ['b', 'c', 'c', 'a']})",
        tables=[],
    )

    # Groups
    g_tbl = ctx.project(
        table_name="Groups",
        attributes=["X", "Y"],
        tables=["Facts"],
        columns=["A", "B"],
    )

    # Link
    l_clm = ctx.link(
        name="Link",
        table=f_tbl.id,
        type=g_tbl.id,
        columns=["A", "B"],
        linked_columns=["X", "Y"],
    )

    f_tbl.evaluate()
    g_tbl.evaluate()
    l_clm.evaluate()

    # Facts holds three distinct (A, B) combinations
    g_tbl_data = g_tbl.get_df()
    assert len(g_tbl_data) == 3
    assert len(g_tbl_data.columns) == 2

    l_data = f_tbl.get_series("Link")
    assert l_data[0] == 0
    assert l_data[1] == 1
    assert l_data[2] == 1
    assert l_data[3] == 2

    #
    # Test topology
    #
    topology = Topology(ctx)
    topology.translate()
    layers = topology.elem_layers

    assert len(layers) == 3

    assert {x.id for x in layers[0]} == {"Facts"}
    assert {x.id for x in layers[1]} == {"Groups"}
    assert {x.id for x in layers[2]} == {"Link"}

    # Empty the projected table in place, then expect run() to fill it again
    g_tbl_data = g_tbl.get_df()
    g_tbl_data.drop(g_tbl_data.index, inplace=True)  # Empty

    ctx.run()

    g_tbl_data = g_tbl.get_df()
    assert len(g_tbl_data) == 3
    assert len(g_tbl_data.columns) == 2
def test_two_keys():
    """Link Facts to an independently populated Groups table on two keys.

    Both tables are populated from literal data and linked on the column pair
    ``(A, B)``. The last Facts row ``('a', 'a')`` has no matching Groups row,
    so its link value is expected to be missing. The same expectations are
    checked after evaluating each element by hand and after a full
    ``ctx.run()``.
    """
    # NOTE(review): another function named test_two_keys is visible in this
    # source. If both live in the same module, the later definition shadows
    # the earlier one and only one of them is collected - confirm that they
    # belong to separate test files.
    ctx = Prosto("My Prosto")

    # Facts
    f_tbl = ctx.populate(
        table_name="Facts",
        attributes=["A", "B"],
        func="lambda **m: pd.DataFrame({'A': ['a', 'b', 'b', 'a'], 'B': ['b', 'c', 'c', 'a']})",
        tables=[],
    )

    # Groups
    g_tbl = ctx.populate(
        table_name="Groups",
        attributes=["A", "B"],
        func="lambda **m: pd.DataFrame({'A': ['a', 'b', 'a'], 'B': ['b', 'c', 'c'], 'C': [1, 2, 3]})",
        tables=[],
    )

    # Link
    l_clm = ctx.link(
        name="Link",
        table=f_tbl.id,
        type=g_tbl.id,
        columns=["A", "B"],
        linked_columns=["A", "B"],
    )

    f_tbl.evaluate()
    g_tbl.evaluate()
    l_clm.evaluate()

    f_tbl_data = f_tbl.get_df()
    assert len(f_tbl_data) == 4  # Same number of rows
    assert len(f_tbl_data.columns) == 3

    l_data = f_tbl.get_series("Link")
    assert l_data[0] == 0
    assert l_data[1] == 1
    assert l_data[2] == 1
    assert pd.isna(l_data[3])  # No Groups row matches ('a', 'a')

    #
    # Test topology
    #
    topology = Topology(ctx)
    topology.translate()  # All data will be reset
    layers = topology.elem_layers

    assert len(layers) == 2

    assert {x.id for x in layers[0]} == {"Facts", "Groups"}
    assert {x.id for x in layers[1]} == {"Link"}

    ctx.run()

    l_data = f_tbl.get_series("Link")
    assert l_data[0] == 0
    assert l_data[1] == 1
    assert l_data[2] == 1
    assert pd.isna(l_data[3])
def test_aggregate_with_path():
    """Aggregate a measure that is given as a column path.

    The measure ``Link::B`` is a column path rather than a plain Facts column,
    so a merge operation has to be produced automatically before the
    aggregation can run. Each Facts row contributes the ``B`` value of the
    group it links to; the group ``'c'`` has no facts and is expected to hold
    only the bias (0.0).
    """
    ctx = Prosto("My Prosto")

    # Facts
    f_tbl = ctx.populate(
        table_name="Facts",
        attributes=["A", "M"],
        func="lambda **m: pd.DataFrame({'A': ['a', 'a', 'b', 'b'], 'M': [1.0, 2.0, 3.0, 4.0]})",
        tables=[],
    )

    # Groups
    g_tbl = ctx.populate(
        table_name="Groups",
        attributes=["A", "B"],
        func="lambda **m: pd.DataFrame({'A': ['a', 'b', 'c'], 'B': [3.0, 2.0, 1.0]})",
        tables=[],
    )

    # Link
    ctx.link(
        name="Link",
        table=f_tbl.id,
        type=g_tbl.id,
        columns=["A"],
        linked_columns=["A"],
    )

    # Aggregation
    ctx.aggregate(
        name="Aggregate",
        table=g_tbl.id,
        tables=["Facts"],
        link="Link",
        func="lambda x, bias, **model: x.sum() + bias",
        columns=["Link::B"],
        model={"bias": 0.0},
    )

    ctx.run()

    a_clm_data = g_tbl.get_series('Aggregate')
    assert a_clm_data[0] == 6.0  # two facts linked to 'a', each with B == 3.0
    assert a_clm_data[1] == 4.0  # two facts linked to 'b', each with B == 2.0
    assert a_clm_data[2] == 0.0  # no facts linked to 'c'
def test_calculate_with_path():
    """Test topology augmentation for a calculation that uses a column path.

    The calculated column reads ``Link::B``, a column path rather than a plain
    Facts column, so a merge operation has to be produced automatically before
    the calculation can run. The expected values are ``M`` plus the ``B``
    value of the linked group.
    """
    ctx = Prosto("My Prosto")

    # Facts
    f_tbl = ctx.populate(
        table_name="Facts",
        attributes=["A", "M"],
        func="lambda **m: pd.DataFrame({'A': ['a', 'a', 'b', 'b'], 'M': [1.0, 2.0, 3.0, 4.0]})",
        tables=[],
    )

    # Groups
    g_tbl = ctx.populate(
        table_name="Groups",
        attributes=["A", "B"],
        func="lambda **m: pd.DataFrame({'A': ['a', 'b', 'c'], 'B': [3.0, 2.0, 1.0]})",
        tables=[],
    )

    # Link
    ctx.link(
        name="Link",
        table=f_tbl.id,
        type=g_tbl.id,
        columns=["A"],
        linked_columns=["A"],
    )

    # Calculate
    ctx.calculate(
        name="My column",
        table=f_tbl.id,
        func="lambda x: x['M'] + x['Link::B']",
        columns=["M", "Link::B"],
        model=None,
    )

    ctx.run()

    clm_data = f_tbl.get_series('My column')
    assert clm_data[0] == 4.0  # 1.0 + 3.0
    assert clm_data[1] == 5.0  # 2.0 + 3.0
    assert clm_data[2] == 5.0  # 3.0 + 2.0
    assert clm_data[3] == 6.0  # 4.0 + 2.0
def test_aggregate():
    """Aggregate Facts measures into Groups through a link column.

    Three stages are checked:

    1. Each element is evaluated by hand and the sum of ``M`` per group is
       verified; the group ``'c'`` has no facts and holds only the bias (0.0).
    2. The topology layers are verified, and a full ``ctx.run()`` is expected
       to give the same aggregate values.
    3. A second aggregate over the two columns ``M`` and ``N`` is added and
       verified after translating and running the context again.
    """
    ctx = Prosto("My Prosto")

    # Facts
    f_tbl = ctx.populate(
        table_name="Facts",
        attributes=["A", "M"],
        func="lambda **m: pd.DataFrame({'A': ['a', 'a', 'b', 'b'], 'M': [1.0, 2.0, 3.0, 4.0], 'N': [4.0, 3.0, 2.0, 1.0]})",
        tables=[],
    )

    # Groups
    g_tbl = ctx.populate(
        table_name="Groups",
        attributes=["A"],
        func="lambda **m: pd.DataFrame({'A': ['a', 'b', 'c']})",
        tables=[],
    )

    # Link
    l_clm = ctx.link(
        name="Link",
        table=f_tbl.id,
        type=g_tbl.id,
        columns=["A"],
        linked_columns=["A"],
    )

    # Aggregation
    a_clm = ctx.aggregate(
        name="Aggregate",
        table=g_tbl.id,
        tables=["Facts"],
        link="Link",
        func="lambda x, bias, **model: x.sum() + bias",
        columns=["M"],
        model={"bias": 0.0},
    )

    f_tbl.evaluate()
    g_tbl.evaluate()
    l_clm.evaluate()
    a_clm.evaluate()

    g_tbl_data = g_tbl.get_df()
    assert len(g_tbl_data) == 3  # Same number of rows
    # One aggregate column was added. The original note also mentions a
    # technical "id" column which might be removed in future - TODO confirm
    # whether it is still relevant to this count.
    assert len(g_tbl_data.columns) == 2

    a_clm_data = g_tbl.get_series('Aggregate')
    assert a_clm_data[0] == 3.0
    assert a_clm_data[1] == 7.0
    assert a_clm_data[2] == 0.0

    #
    # Test topology
    #
    topology = Topology(ctx)
    topology.translate()  # All data will be reset
    layers = topology.elem_layers

    assert len(layers) == 3

    assert {x.id for x in layers[0]} == {"Facts", "Groups"}
    assert {x.id for x in layers[1]} == {"Link"}
    assert {x.id for x in layers[2]} == {"Aggregate"}

    ctx.run()

    a_clm_data = g_tbl.get_series('Aggregate')
    assert a_clm_data[0] == 3.0
    assert a_clm_data[1] == 7.0
    assert a_clm_data[2] == 0.0

    #
    # Aggregation of multiple columns
    #

    # Aggregation
    ctx.aggregate(
        name="Aggregate 2",
        table=g_tbl.id,
        tables=["Facts"],
        link="Link",
        func="lambda x, my_param, **model: x['M'].sum() + x['N'].sum() + my_param",
        columns=["M", "N"],
        model={"my_param": 0.0},
    )

    ctx.translate()  # All data will be reset
    # Original author's note: a new column is NOT added to the existing data
    # frame (not clear where it is).
    ctx.run()

    a_clm2_data = g_tbl.get_series('Aggregate 2')
    assert a_clm2_data[0] == 10.0
    assert a_clm2_data[1] == 10.0
    assert a_clm2_data[2] == 0.0
def test_merge_path():
    """Merge a column reached through two links, with the path given as a list.

    Facts links to Groups on ``A`` and Groups links to SuperGroups on ``B``.
    The merge path ``["Link", "SuperLink", "C"]`` brings the SuperGroups
    column ``C`` into Facts. The result is checked after evaluating each
    element by hand, and again after verifying the topology layers and
    running the whole context.
    """
    ctx = Prosto("My Prosto")

    # Facts
    f_tbl = ctx.populate(
        table_name="Facts",
        attributes=["A"],
        func="lambda **m: pd.DataFrame({'A': ['a', 'a', 'b', 'b']})",
        tables=[],
    )

    # Groups
    g_tbl = ctx.populate(
        table_name="Groups",
        attributes=["A", "B"],
        func="lambda **m: pd.DataFrame({'A': ['a', 'b', 'c'], 'B': [2.0, 3.0, 3.0]})",
        tables=[],
    )

    # Link
    l_clm = ctx.link(
        name="Link",
        table=f_tbl.id,
        type=g_tbl.id,
        columns=["A"],
        linked_columns=["A"],
    )

    # SuperGroups
    sg_tbl = ctx.populate(
        table_name="SuperGroups",
        attributes=["B", "C"],
        func="lambda **m: pd.DataFrame({'B': [2.0, 3.0, 4.0], 'C': ['x', 'y', 'z']})",
        tables=[],
    )

    # SuperLink
    sl_clm = ctx.link(
        name="SuperLink",
        table=g_tbl.id,
        type=sg_tbl.id,
        columns=["B"],
        linked_columns=["B"],
    )

    # Merge
    m_clm = ctx.merge("Merge", f_tbl.id, ["Link", "SuperLink", "C"])

    # Evaluate by hand: tables first, then links, then the merge
    f_tbl.evaluate()
    g_tbl.evaluate()
    sg_tbl.evaluate()

    l_clm.evaluate()
    sl_clm.evaluate()
    m_clm.evaluate()

    f_tbl_data = f_tbl.get_df()
    assert len(f_tbl_data) == 4  # Same number of rows
    assert len(f_tbl_data.columns) == 3

    m_data = f_tbl.get_series("Merge")
    assert m_data.to_list() == ['x', 'x', 'y', 'y']

    #
    # Test topology
    #
    topology = Topology(ctx)
    topology.translate()  # All data will be reset
    layers = topology.elem_layers

    assert len(layers) == 3

    assert {x.id for x in layers[0]} == {"Facts", "Groups", "SuperGroups"}
    assert {x.id for x in layers[1]} == {"Link", "SuperLink"}
    assert {x.id for x in layers[2]} == {"Merge"}

    ctx.run()

    m_data = f_tbl.get_series("Merge")
    assert m_data.to_list() == ['x', 'x', 'y', 'y']