Exemple #1
0
def test_product():
    """Product of two 3-row tables has 9 rows and one attribute per source table."""
    prosto = Prosto("My Prosto")

    left = prosto.populate(
        table_name="Table 1",
        attributes=["A"],
        func="lambda **m: pd.DataFrame({'A': [1.0, 2.0, 3.0]})",
        tables=[],
    )
    right = prosto.populate(
        table_name="Table 2",
        attributes=["B"],
        func="lambda **m: pd.DataFrame({'B': ['x', 'y', 'z']})",
        tables=[],
    )
    product = prosto.product(
        table_name="Product",
        attributes=["t1", "t2"],
        tables=["Table 1", "Table 2"],
    )

    # Evaluate in definition order: both sources first, then the product.
    for table in (left, right, product):
        table.evaluate()

    product_df = product.get_df()
    assert len(product_df.columns) == 2
    assert len(product_df) == 9  # 3 x 3 combinations

    assert product_df.columns.to_list() == ["t1", "t2"]
Exemple #2
0
def test_calculate_value():
    """A calculate column applying ``float`` to integer column A yields float values."""
    prosto = Prosto("My Prosto")

    table = prosto.populate(
        table_name="My table",
        attributes=["A"],
        func="lambda **m: pd.DataFrame({'A': [1, 2, 3]})",
        tables=[],
    )
    column = prosto.calculate(
        name="My column",
        table=table.id,
        func="lambda x: float(x)",
        columns=["A"],
        model=None,
    )

    table.evaluate()
    column.evaluate()

    series = table.get_series('My column')
    values = [series[row] for row in range(3)]

    # First the numeric values, then the resulting types.
    for value, expected in zip(values, (1.0, 2.0, 3.0)):
        assert np.isclose(value, expected)

    for value in values:
        assert isinstance(value, float)
Exemple #3
0
def test_merge_path2():
    """
    Merge a column reached through two links, Facts -> Groups -> SuperGroups.

    The column path is given as one separator-delimited string
    ("Link::SuperLink::C") rather than as a list of simple segment names.
    """
    prosto = Prosto("My Prosto")

    facts = prosto.populate(
        table_name="Facts",
        attributes=["A"],
        func="lambda **m: pd.DataFrame({'A': ['a', 'a', 'b', 'b']})",
        tables=[],
    )

    groups = prosto.populate(
        table_name="Groups",
        attributes=["A", "B"],
        func="lambda **m: pd.DataFrame({'A': ['a', 'b', 'c'], 'B': [2.0, 3.0, 3.0]})",
        tables=[],
    )
    # Facts -> Groups
    prosto.link(
        name="Link",
        table=facts.id,
        type=groups.id,
        columns=["A"],
        linked_columns=["A"],
    )

    super_groups = prosto.populate(
        table_name="SuperGroups",
        attributes=["B", "C"],
        func="lambda **m: pd.DataFrame({'B': [2.0, 3.0, 4.0], 'C': ['x', 'y', 'z']})",
        tables=[],
    )
    # Groups -> SuperGroups
    prosto.link(
        name="SuperLink",
        table=groups.id,
        type=super_groups.id,
        columns=["B"],
        linked_columns=["B"],
    )

    # Merge the remote column C into Facts along the two-hop path.
    prosto.merge("Merge", facts.id, ["Link::SuperLink::C"])

    prosto.run()

    facts_df = facts.get_df()
    assert len(facts_df) == 4  # Row count is unchanged
    assert len(facts_df.columns) == 3

    merged = facts.get_series("Merge")
    assert merged.to_list() == ['x', 'x', 'y', 'y']
Exemple #4
0
def test_populate():
    """Populate a two-attribute table from a lambda and check its shape."""
    prosto = Prosto("My Prosto")

    table = prosto.populate(
        table_name="My table",
        attributes=["A", "B"],
        func="lambda **m: pd.DataFrame({'A': [1.0, 2.0, 3.0], 'B': ['x', 'y', 'z']})",
        tables=[],
        model={"nrows": 3},
    )
    table.evaluate()

    table_df = table.get_df()
    assert len(table_df.columns) == 2
    assert len(table_df) == 3
Exemple #5
0
def test_one_key():
    """Project Facts onto Groups by one key column, link back, and check topology layers."""
    prosto = Prosto("My Prosto")

    facts = prosto.populate(
        table_name="Facts",
        attributes=["A"],
        func="lambda **m: pd.DataFrame({'A': ['a', 'a', 'b', 'b']})",
        tables=[],
    )
    groups = prosto.project(
        table_name="Groups",
        attributes=["X"],
        tables=["Facts"],
        columns=["A"],
    )
    link = prosto.link(
        name="Link",
        table=facts.id,
        type=groups.id,
        columns=["A"],
        linked_columns=["X"],
    )

    facts.evaluate()
    groups.evaluate()
    link.evaluate()

    groups_df = groups.get_df()
    assert len(groups_df) == 2
    assert len(groups_df.columns) == 1

    link_values = facts.get_series("Link")
    for row, expected_group in enumerate([0, 0, 1, 1]):
        assert link_values[row] == expected_group

    #
    # Test topology
    #
    topology = Topology(prosto)
    topology.translate()
    layers = topology.elem_layers

    assert len(layers) == 3

    expected_layers = [{"Facts"}, {"Groups"}, {"Link"}]
    for depth, expected_ids in enumerate(expected_layers):
        assert {elem.id for elem in layers[depth]} == expected_ids
Exemple #6
0
def test_filter_table():
    """Filter a base table by a boolean compute column and check topology layers."""
    prosto = Prosto("My Prosto")

    base = prosto.populate(
        table_name="Base table",
        attributes=["A", "B"],
        func="lambda **m: pd.DataFrame({'A': [1.0, 2.0, 3.0], 'B': ['x', 'yy', 'zzz']})",
        tables=[],
    )

    # Boolean column used as the filter criterion (the lambda returns a boolean Series)
    flag = prosto.compute(
        name="filter_column",
        table=base.id,
        func="lambda x, param: (x['A'] > param) & (x['B'].str.len() < 3)",
        columns=["A", "B"],
        model={"param": 1.5},
    )

    base.evaluate()
    flag.evaluate()

    filtered = prosto.filter(
        table_name="Filtered table",
        attributes=["super"],
        func=None,
        tables=["Base table"],
        columns=["filter_column"],
    )
    filtered.evaluate()

    filtered_df = filtered.get_df()
    assert len(filtered_df.columns) == 1  # Only one link-attribute is created
    assert len(filtered_df) == 1
    assert filtered.get_df()['super'][0] == 1

    #
    # Test topology
    #
    topology = Topology(prosto)
    topology.translate()
    layers = topology.elem_layers

    assert len(layers) == 3

    expected_layers = [{"Base table"}, {"filter_column"}, {"Filtered table"}]
    for depth, expected_ids in enumerate(expected_layers):
        assert {elem.id for elem in layers[depth]} == expected_ids
Exemple #7
0
def test_product_csql():
    """Define a product table through column_sql statements and check its shape."""
    prosto = Prosto("My Prosto")

    left_df = pd.DataFrame({'A': [1.0, 2.0, 3.0]})
    right_df = pd.DataFrame({'B': ['x', 'y', 'z']})

    prosto.column_sql("TABLE  Table_1 (A)", lambda **m: left_df)
    prosto.column_sql("TABLE  Table_2 (B)", lambda **m: right_df)
    prosto.column_sql("PRODUCT  Table_1; Table_2 -> t1; t2 -> Product")

    # The table must already be registered before anything is run.
    assert prosto.get_table("Product")

    prosto.run()

    product_df = prosto.get_table("Product").get_df()

    assert len(product_df.columns) == 2
    assert len(product_df) == 9

    assert product_df.columns.to_list() == ["t1", "t2"]
Exemple #8
0
def test_integers2():
    """Discretize integers with step 3 around origin 5 using right-closed, right-labelled bins."""
    prosto = Prosto("My Prosto")

    table = prosto.populate(
        table_name="My table",
        attributes=["A"],
        func="lambda **m: pd.DataFrame({'A': [1, 2, 3, 4, 5, 6, 7, 8, 9]})",
        tables=[],
    )

    bin_settings = {
        "origin": 5,
        "step": 3,
        "label": "right",
        "closed": "right",
    }
    prosto.discretize(
        name="My column",
        table=table.id,
        columns=["A"],
        model=bin_settings,
    )
    # Expected binning (each "]" closes an interval on the right):
    #  1, 2], 3, 4, 5], 6, 7, 8], 9
    # -1 -1         0         1   2

    prosto.run()

    labels = table.get_series('My column')
    assert list(labels) == [-1, -1, 0, 0, 0, 1, 1, 1, 2]
Exemple #9
0
def test_two_keys():
    """Link Facts to Groups on a composite (A, B) key; an unmatched fact gets a missing link."""
    prosto = Prosto("My Prosto")

    facts = prosto.populate(
        table_name="Facts",
        attributes=["A", "B"],
        func="lambda **m: pd.DataFrame({'A': ['a', 'b', 'b', 'a'], 'B': ['b', 'c', 'c', 'a']})",
        tables=[],
    )
    groups = prosto.populate(
        table_name="Groups",
        attributes=["A", "B"],
        func="lambda **m: pd.DataFrame({'A': ['a', 'b', 'a'], 'B': ['b', 'c', 'c'], 'C': [1, 2, 3]})",
        tables=[],
    )
    link = prosto.link(
        name="Link",
        table=facts.id,
        type=groups.id,
        columns=["A", "B"],
        linked_columns=["A", "B"],
    )

    def check_link_values():
        # The last fact ('a', 'a') has no matching group, so its link is missing.
        link_values = facts.get_series("Link")
        assert link_values[0] == 0
        assert link_values[1] == 1
        assert link_values[2] == 1
        assert pd.isna(link_values[3])

    facts.evaluate()
    groups.evaluate()
    link.evaluate()

    facts_df = facts.get_df()
    assert len(facts_df) == 4  # Same number of rows
    assert len(facts_df.columns) == 3

    check_link_values()

    #
    # Test topology
    #
    topology = Topology(prosto)
    topology.translate()  # All data will be reset
    layers = topology.elem_layers

    assert len(layers) == 2

    assert {elem.id for elem in layers[0]} == {"Facts", "Groups"}
    assert {elem.id for elem in layers[1]} == {"Link"}

    prosto.run()

    check_link_values()
Exemple #10
0
def test_groll_single():
    """Rolling sum with window 2 computed separately within each value of column G."""
    prosto = Prosto("My Prosto")

    table = prosto.populate(
        table_name="My table",
        attributes=["G", "A"],
        func="lambda **m: pd.DataFrame({'G': [1, 2, 1, 2], 'A': [1.0, 2.0, 3.0, 4.0]})",
        tables=[],
    )

    prosto.roll(
        name="Roll",
        table=table.id,
        window="2",
        link="G",
        func="lambda x: x.sum()",
        columns=["A"],
        model={},
    )

    prosto.run()

    rolled = table.get_series('Roll')

    # The first row of each group has no full window yet.
    for row in (0, 1):
        assert pd.isna(rolled[row])
    # Group 1: 1.0 + 3.0; group 2: 2.0 + 4.0
    for row, expected in ((2, 4.0), (3, 6.0)):
        assert np.isclose(rolled[row], expected)
Exemple #11
0
def test_filter_inheritance():
    """
    Test topology augmentation.

    A calculate column on the filtered table reads column A, which is defined
    only on the base table, so the merge operation has to be added
    automatically.
    """
    prosto = Prosto("My Prosto")

    base = prosto.populate(
        table_name="Base table",
        attributes=["A", "B"],
        func="lambda **m: pd.DataFrame({'A': [1.0, 2.0, 3.0], 'B': ['x', 'yy', 'zzz']})",
        tables=[],
    )

    # Boolean column used as the filter criterion (the lambda returns a boolean Series)
    prosto.compute(
        name="filter_column",
        table=base.id,
        func="lambda x, param: (x['A'] > param) & (x['B'].str.len() < 3)",
        columns=["A", "B"],
        model={"param": 1.5},
    )

    filtered = prosto.filter(
        table_name="Filtered table",
        attributes=["super"],
        func=None,
        tables=["Base table"],
        columns=["filter_column"],
    )

    # Column A used here is not defined on the filtered table, only on its base table
    prosto.calculate(
        name="My column",
        table=filtered.id,
        func="lambda x: x + 1.0",
        columns=["A"],
        model=None,
    )

    prosto.run()

    calculated = filtered.get_series('My column')

    assert np.isclose(len(calculated), 1)
    assert np.isclose(calculated[0], 3.0)

    # Column A had to be added automatically by the augmentation procedure:
    # it is inherited from the base table, materialized via a merge operation,
    # and stores the original values of the base column.
    inherited = filtered.get_series('A')
    assert np.isclose(inherited[0], 2)
Exemple #12
0
def test_aggregate_with_path():
    """
    Aggregation whose measure is a column path ("Link::B").

    The path refers to a column of the linked Groups table, so a merge
    operation has to be produced automatically before aggregating.
    """
    ctx = Prosto("My Prosto")

    # Facts
    f_tbl = ctx.populate(
        table_name="Facts",
        attributes=["A", "M"],
        func=
        "lambda **m: pd.DataFrame({'A': ['a', 'a', 'b', 'b'], 'M': [1.0, 2.0, 3.0, 4.0]})",
        tables=[])

    # Groups (the data comes from the populate lambda; no local frame is needed)
    g_tbl = ctx.populate(
        table_name="Groups",
        attributes=["A", "B"],
        func=
        "lambda **m: pd.DataFrame({'A': ['a', 'b', 'c'], 'B': [3.0, 2.0, 1.0]})",
        tables=[])

    # Link: Facts -> Groups on A
    ctx.link(name="Link",
             table=f_tbl.id,
             type=g_tbl.id,
             columns=["A"],
             linked_columns=["A"])

    # Aggregation of the path column over the facts of each group
    ctx.aggregate(name="Aggregate",
                  table=g_tbl.id,
                  tables=["Facts"],
                  link="Link",
                  func="lambda x, bias, **model: x.sum() + bias",
                  columns=["Link::B"],
                  model={"bias": 0.0})

    ctx.run()

    a_clm_data = g_tbl.get_series('Aggregate')
    assert a_clm_data[0] == 6.0  # two 'a' facts, each seeing B == 3.0
    assert a_clm_data[1] == 4.0  # two 'b' facts, each seeing B == 2.0
    assert a_clm_data[2] == 0.0  # no facts link to 'c'
Exemple #13
0
def test_product_inheritance():
    """
    Calculate column on a product table that reads a column of a base table.

    Column A exists only in "Table 1", so the system has to insert a new
    operation automatically to resolve the missing column.
    """
    prosto = Prosto("My Prosto")

    prosto.populate(
        table_name="Table 1",
        attributes=["A"],
        func="lambda **m: pd.DataFrame({'A': [1.0, 2.0, 3.0]})",
        tables=[],
    )
    prosto.populate(
        table_name="Table 2",
        attributes=["B"],
        func="lambda **m: pd.DataFrame({'B': ['x', 'y', 'z']})",
        tables=[],
    )
    product = prosto.product(
        table_name="Product",
        attributes=["t1", "t2"],
        tables=["Table 1", "Table 2"],
    )

    # Column A is not defined on the product table, only on one of its base tables
    prosto.calculate(
        name="My column",
        table=product.id,
        func="lambda x: x + 1.0",
        columns=["A"],
        model=None,
    )

    prosto.run()

    # Two attributes plus two more columns: one merge (augmented) and one calculate column
    assert len(product.get_df().columns) == 4

    calculated = product.get_series('My column')

    assert calculated.to_list() == [2.0, 2.0, 2.0, 3.0, 3.0, 3.0, 4.0, 4.0, 4.0]
Exemple #14
0
def test_calculate_with_path():
    """
    Test topology augmentation for a calculate column.

    One of the input columns is a path ("Link::B") into the linked Groups
    table, so a merge operation has to be produced automatically.
    """
    ctx = Prosto("My Prosto")

    # Facts
    f_tbl = ctx.populate(
        table_name="Facts", attributes=["A", "M"],
        func="lambda **m: pd.DataFrame({'A': ['a', 'a', 'b', 'b'], 'M': [1.0, 2.0, 3.0, 4.0]})", tables=[]
    )

    # Groups (the data comes from the populate lambda; no local frame is needed)
    g_tbl = ctx.populate(
        table_name="Groups", attributes=["A", "B"],
        func="lambda **m: pd.DataFrame({'A': ['a', 'b', 'c'], 'B': [3.0, 2.0, 1.0]})", tables=[]
    )

    # Link: Facts -> Groups on A
    ctx.link(
        name="Link", table=f_tbl.id, type=g_tbl.id,
        columns=["A"], linked_columns=["A"]
    )

    # Calculate: local measure M plus B of the linked group
    ctx.calculate(
        name="My column", table=f_tbl.id,
        func="lambda x: x['M'] + x['Link::B']", columns=["M", "Link::B"], model=None
    )

    ctx.run()

    clm_data = f_tbl.get_series('My column')
    assert clm_data[0] == 4.0  # 1.0 + 3.0
    assert clm_data[1] == 5.0  # 2.0 + 3.0
    assert clm_data[2] == 5.0  # 3.0 + 2.0
    assert clm_data[3] == 6.0  # 4.0 + 2.0
Exemple #15
0
def test_roll_multiple():
    """Rolling aggregation with window 2 over two columns at once, plus topology check."""
    prosto = Prosto("My Prosto")

    table = prosto.populate(
        table_name="My table",
        attributes=["A", "B"],
        func="lambda **m: pd.DataFrame({'A': [1, 2, 3], 'B': [3, 2, 1]})",
        tables=[],
    )
    roll = prosto.roll(
        name="Roll",
        table=table.id,
        window="2",
        link=None,
        func="lambda x: x['A'].sum() + x['B'].sum()",
        columns=["A", "B"],
        model={},
    )

    def check_rolled_values():
        rolled = table.get_series('Roll')
        assert pd.isna(rolled[0])  # No full window for the first row
        assert np.isclose(rolled[1], 8.0)
        assert np.isclose(rolled[2], 8.0)

    table.evaluate()
    roll.evaluate()

    check_rolled_values()

    #
    # Test topology
    #
    topology = Topology(prosto)
    topology.translate()  # All data will be reset
    layers = topology.elem_layers

    assert len(layers) == 2

    assert {elem.id for elem in layers[0]} == {"My table"}
    assert {elem.id for elem in layers[1]} == {"Roll"}

    prosto.run()

    check_rolled_values()
Exemple #16
0
def test_compute():
    """Compute column shifting A by -1 via model kwargs, plus topology check."""
    prosto = Prosto("My Prosto")

    table = prosto.populate(
        table_name="My table",
        attributes=["A"],
        func="lambda **m: pd.DataFrame({'A': [1, 2, 3]})",
        tables=[],
    )
    column = prosto.compute(
        name="My column",
        table=table.id,
        func="lambda x, **model: x.shift(**model)",
        columns=["A"],
        model={"periods": -1},
    )

    def check_shifted_values():
        shifted = table.get_series('My column')
        assert np.isclose(shifted[0], 2.0)
        assert np.isclose(shifted[1], 3.0)
        assert pd.isna(shifted[2])  # Nothing to shift into the last row

    table.evaluate()
    column.evaluate()

    check_shifted_values()

    #
    # Test topology
    #
    topology = Topology(prosto)
    topology.translate()  # All data will be reset
    layers = topology.elem_layers

    assert len(layers) == 2

    assert {elem.id for elem in layers[0]} == {"My table"}
    assert {elem.id for elem in layers[1]} == {"My column"}

    prosto.run()

    check_shifted_values()
Exemple #17
0
def test_integers():
    """Discretize integers with step 3 around origin 5 using the default bin settings."""
    prosto = Prosto("My Prosto")

    table = prosto.populate(
        table_name="My table",
        attributes=["A"],
        func="lambda **m: pd.DataFrame({'A': [1, 2, 3, 4, 5, 6, 7, 8, 9]})",
        tables=[],
    )

    bin_settings = {"origin": 5, "step": 3}
    prosto.discretize(
        name="My column",
        table=table.id,
        columns=["A"],
        model=bin_settings,
    )
    # Expected binning (each "[" opens an interval on the left):
    #  1, [2, 3, 4, [5, 6, 7, [8, 9
    # -2  -1         0         1

    prosto.run()

    labels = table.get_series('My column')
    assert list(labels) == [-2, -1, -1, -1, 0, 0, 0, 1, 1]
Exemple #18
0
def test_calculate_value():
    """
    Incremental evaluation of a calculate column while rows are added and removed.

    After every ``ctx.run()`` the pending added/removed counters of the table
    data are expected to be back at zero. A row modified behind Prosto's back
    must keep its previously computed derived value.
    """
    ctx = Prosto("My Prosto")
    ctx.incremental = True

    tbl = ctx.create_table(
        table_name="My table",
        attributes=["A"],
    )

    clm = ctx.calculate(name="My column",
                        table=tbl.id,
                        func="lambda x: float(x)",
                        columns=["A"],
                        model=None)

    ctx.run()  # Inference on empty data

    tbl.data.add({"A": 1})  # New record is added and marked as added

    # Assert new change status
    assert tbl.data.added_length() == 1

    ctx.run()

    # Assert clean change status and results of inference
    assert tbl.data.added_length() == 0

    tbl.data.add({"A": 2})
    tbl.data.add({"A": 3})

    # Assert new change status
    assert tbl.data.added_length() == 2

    # For debug purpose, modify an old row (which has not been recently added but was evaluated before).
    # Write through a single .loc call: the chained form df['A'][0] = ... may
    # assign into a temporary copy and is a no-op under pandas copy-on-write.
    tbl_df = tbl.data.get_df()
    tbl_df.loc[0, 'A'] = 10  # Old value is 1. Prosto does not see this change

    ctx.run()

    # The manual modification is invisible for Prosto and hence it should not be re-computed and the derived column will have to have the old value
    assert tbl_df.loc[0, 'My column'] == 1

    # Assert clean change status and results of inference
    assert tbl.data.added_length() == 0

    tbl.data.remove(1)  # Remove one oldest record by marking it as removed

    # Assert new change status
    assert tbl.data.removed_length() == 1

    ctx.run()

    # Assert clean change status and results of inference
    assert tbl.data.removed_length() == 0

    tbl.data.remove_all()  # Remove all records by marking them as removed

    ctx.run()

    # Assert clean change status: both ranges are empty and positioned after
    # the three rows that were ever added
    assert tbl.data.added_range.start == 3
    assert tbl.data.added_range.end == 3
    assert tbl.data.removed_range.start == 3
    assert tbl.data.removed_range.end == 3
Exemple #19
0
def test_merge_path():
    """
    Merge a column reached through two links, Facts -> Groups -> SuperGroups.

    The column path is given as a list of simple segment names. The result is
    checked after manual evaluation and again after a full run.
    """
    prosto = Prosto("My Prosto")

    facts = prosto.populate(
        table_name="Facts",
        attributes=["A"],
        func="lambda **m: pd.DataFrame({'A': ['a', 'a', 'b', 'b']})",
        tables=[],
    )

    groups = prosto.populate(
        table_name="Groups",
        attributes=["A", "B"],
        func="lambda **m: pd.DataFrame({'A': ['a', 'b', 'c'], 'B': [2.0, 3.0, 3.0]})",
        tables=[],
    )
    # Facts -> Groups
    link = prosto.link(
        name="Link",
        table=facts.id,
        type=groups.id,
        columns=["A"],
        linked_columns=["A"],
    )

    super_groups = prosto.populate(
        table_name="SuperGroups",
        attributes=["B", "C"],
        func="lambda **m: pd.DataFrame({'B': [2.0, 3.0, 4.0], 'C': ['x', 'y', 'z']})",
        tables=[],
    )
    # Groups -> SuperGroups
    super_link = prosto.link(
        name="SuperLink",
        table=groups.id,
        type=super_groups.id,
        columns=["B"],
        linked_columns=["B"],
    )

    # Merge the remote column C into Facts along the two-hop path.
    merge = prosto.merge("Merge", facts.id, ["Link", "SuperLink", "C"])

    def check_merged_values():
        merged = facts.get_series("Merge")
        assert merged.to_list() == ['x', 'x', 'y', 'y']

    # Manual evaluation: tables first, then columns in the same order as the original sequence.
    for element in (facts, groups, super_groups, link, super_link, merge):
        element.evaluate()

    facts_df = facts.get_df()
    assert len(facts_df) == 4  # Same number of rows
    assert len(facts_df.columns) == 3

    check_merged_values()

    #
    # Test topology
    #
    topology = Topology(prosto)
    topology.translate()  # All data will be reset
    layers = topology.elem_layers

    assert len(layers) == 3

    assert {elem.id for elem in layers[0]} == {"Facts", "Groups", "SuperGroups"}
    assert {elem.id for elem in layers[1]} == {"Link", "SuperLink"}
    assert {elem.id for elem in layers[2]} == {"Merge"}

    prosto.run()

    check_merged_values()
Exemple #20
0
def test_filter_project():
    """
    Resolution of inherited attributes through two chained filters.

    Scenario: populate, filter twice, then project the second filtered table
    using column B, which is defined only on the base table and therefore has
    to be inherited (merged in automatically).
    """
    prosto = Prosto("My Prosto")

    source_df = pd.DataFrame({
        'A': [1.0, 2.0, 3.0, 4.0],
        'B': ['x', 'x', 'y', 'zzz'],
    })

    prosto.column_sql("TABLE  Base(A, B)", lambda **m: source_df)
    prosto.column_sql(
        "FILTER Base (A, B) -> super -> Filtered",
        lambda x: x['A'] < 4.0,
    )
    prosto.column_sql(
        "FILTER Filtered (A) -> super -> Filtered_2",
        lambda x: x < 3.0,
    )
    # Column B below exists only in the base table
    prosto.column_sql("PROJECT Filtered_2 (B) -> new_column -> Groups(C)")

    prosto.run()

    group_keys = prosto.get_table("Groups").get_series('C').to_list()
    assert group_keys == ['x']
Exemple #21
0
def test_filter_calculate():
    """
    Resolution of inherited attributes for a calculate column on a filtered table.

    Scenario: populate, filter, then calculate a column in the filtered table
    from column B, which is defined only on the base table and therefore has
    to be inherited (merged in automatically).
    """
    prosto = Prosto("My Prosto")

    source_df = pd.DataFrame({'A': [1.0, 2.0, 3.0], 'B': ['x', 'yy', 'zzz']})

    prosto.column_sql("TABLE  Base (A, B)", lambda **m: source_df)
    prosto.column_sql("FILTER Base (A) -> super -> Filtered", lambda x: x < 3.0)
    # Column B below exists only in the base table
    prosto.column_sql(
        "CALCULATE  Filtered (B) -> filter_column",
        lambda x: len(x),
    )

    prosto.run()

    lengths = prosto.get_table("Filtered").get_series('filter_column').to_list()
    assert lengths == [1, 2]
Exemple #22
0
def test_filter_csql():
    """Filter via column_sql: first through an explicit boolean column, then through a predicate."""
    #
    # Filter on an explicitly calculated boolean column
    #
    prosto = Prosto("My Prosto")

    source_df = pd.DataFrame({'A': [1.0, 2.0, 3.0], 'B': ['x', 'yy', 'zzz']})

    prosto.column_sql("TABLE  Base (A, B)", lambda **m: source_df)
    prosto.column_sql(
        "CALCULATE  Base (A, B) -> filter_column",
        lambda x, param: (x['A'] > param) & (len(x['B']) < 3),
        {"param": 1.5},
    )
    prosto.column_sql("FILTER Base (filter_column) -> super -> Filtered")

    for table_name in ("Base", "Filtered"):
        assert prosto.get_table(table_name)

    prosto.run()

    selected = prosto.get_table("Filtered").get_series('super')
    assert list(selected) == [1]

    #
    # Filter with a predicate function and no explicit calculate column
    #
    prosto = Prosto("My Prosto")

    source_df = pd.DataFrame({'A': [1.0, 2.0, 3.0], 'B': ['x', 'yy', 'zzz']})

    # Here the data frame itself is passed instead of a populate function
    prosto.column_sql("TABLE  Base (A, B)", source_df)
    prosto.column_sql(
        "FILTER Base (A, B) -> super -> Filtered",
        lambda x, param: (x['A'] > param) & (len(x['B']) < 3),
        {"param": 1.5},
    )

    for table_name in ("Base", "Filtered"):
        assert prosto.get_table(table_name)

    prosto.run()

    selected = prosto.get_table("Filtered").get_series('super')
    assert list(selected) == [1]
Exemple #23
0
def test_aggregate_csql():
    """Aggregate measure M of Facts into Groups through a link, all defined via column_sql."""
    prosto = Prosto("My Prosto")

    facts_source = pd.DataFrame({
        'A': ['a', 'a', 'b', 'b'],
        'M': [1.0, 2.0, 3.0, 4.0],
        'N': [4.0, 3.0, 2.0, 1.0],
    })
    groups_source = pd.DataFrame({'A': ['a', 'b', 'c']})

    prosto.column_sql("TABLE  Facts (A, M, N)", lambda **m: facts_source)
    prosto.column_sql("TABLE  Groups (A)", lambda **m: groups_source)

    prosto.column_sql("LINK  Facts (A) -> new_column -> Groups (A)")
    prosto.column_sql(
        "AGGREGATE  Facts (M) -> new_column -> Groups (Aggregate)",
        lambda x, bias, **model: x.sum() + bias,
        {"bias": 0.0},
    )

    # All definitions must be registered before the run.
    assert prosto.get_table("Facts")
    assert prosto.get_table("Groups")
    assert prosto.get_column("Facts", "new_column")

    prosto.run()

    totals = prosto.get_table("Groups").get_series('Aggregate')
    assert list(totals) == [3.0, 7.0, 0.0]
Exemple #24
0
def test_link_csql():
    """Link Facts to Groups on column A using column_sql statements."""
    prosto = Prosto("My Prosto")

    facts_source = pd.DataFrame({'A': ['a', 'a', 'b', 'b']})
    groups_source = pd.DataFrame({'A': ['a', 'b', 'c']})

    prosto.column_sql("TABLE  Facts (A)", lambda **m: facts_source)
    prosto.column_sql("TABLE  Groups (A)", lambda **m: groups_source)
    prosto.column_sql("LINK  Facts (A) -> new_column -> Groups (A)")

    # All definitions must be registered before the run.
    assert prosto.get_table("Facts")
    assert prosto.get_table("Groups")
    assert prosto.get_column("Facts", "new_column")

    prosto.run()

    link_values = prosto.get_table("Facts").get_series('new_column')
    assert list(link_values) == [0, 0, 1, 1]
Exemple #25
0
def test_aggregate():
    """
    Aggregate fact measures into groups through a link column.

    Three stages are covered: explicit per-element evaluation, the layered
    topology built by ``Topology.translate()`` followed by a full run, and a
    second aggregate column over several measures that is added afterwards.
    """
    ctx = Prosto("My Prosto")

    # Facts
    f_tbl = ctx.populate(
        table_name="Facts",
        attributes=["A", "M"],
        func=
        "lambda **m: pd.DataFrame({'A': ['a', 'a', 'b', 'b'], 'M': [1.0, 2.0, 3.0, 4.0], 'N': [4.0, 3.0, 2.0, 1.0]})",
        tables=[])

    # Groups (the data comes from the populate lambda; no local frame is needed)
    g_tbl = ctx.populate(
        table_name="Groups",
        attributes=["A"],
        func="lambda **m: pd.DataFrame({'A': ['a', 'b', 'c']})",
        tables=[])

    # Link
    l_clm = ctx.link(name="Link",
                     table=f_tbl.id,
                     type=g_tbl.id,
                     columns=["A"],
                     linked_columns=["A"])

    # Aggregation
    a_clm = ctx.aggregate(name="Aggregate",
                          table=g_tbl.id,
                          tables=["Facts"],
                          link="Link",
                          func="lambda x, bias, **model: x.sum() + bias",
                          columns=["M"],
                          model={"bias": 0.0})

    f_tbl.evaluate()
    g_tbl.evaluate()

    l_clm.evaluate()
    a_clm.evaluate()

    g_tbl_data = g_tbl.get_df()
    assert len(g_tbl_data) == 3  # Same number of rows
    # Presumably attribute 'A' plus the newly added 'Aggregate' column
    assert len(g_tbl_data.columns) == 2

    a_clm_data = g_tbl.get_series('Aggregate')
    assert a_clm_data[0] == 3.0
    assert a_clm_data[1] == 7.0
    assert a_clm_data[2] == 0.0

    #
    # Test topology
    #
    topology = Topology(ctx)
    topology.translate()  # All data will be reset
    layers = topology.elem_layers

    assert len(layers) == 3

    assert {x.id for x in layers[0]} == {"Facts", "Groups"}
    assert {x.id for x in layers[1]} == {"Link"}
    assert {x.id for x in layers[2]} == {"Aggregate"}

    ctx.run()

    a_clm_data = g_tbl.get_series('Aggregate')
    assert a_clm_data[0] == 3.0
    assert a_clm_data[1] == 7.0
    assert a_clm_data[2] == 0.0

    #
    # Aggregation of multiple columns
    #
    ctx.aggregate(
        name="Aggregate 2",
        table=g_tbl.id,
        tables=["Facts"],
        link="Link",
        func=
        "lambda x, my_param, **model: x['M'].sum() + x['N'].sum() + my_param",
        columns=["M", "N"],
        model={"my_param": 0.0})

    # The new column is evaluated through a fresh translate + run rather than
    # by calling evaluate() on it directly.
    ctx.translate()  # All data will be reset
    ctx.run()

    a_clm2_data = g_tbl.get_series('Aggregate 2')
    assert a_clm2_data[0] == 10.0
    assert a_clm2_data[1] == 10.0
    assert a_clm2_data[2] == 0.0
Exemple #26
0
def test_csql_project():
    """Project Facts onto a Groups table of unique A values using column_sql."""
    prosto = Prosto("My Prosto")

    facts_source = pd.DataFrame({'A': ['a', 'a', 'b', 'b']})

    prosto.column_sql("TABLE  Facts (A)", lambda **m: facts_source)
    prosto.column_sql("PROJECT  Facts (A) -> new_column -> Groups (A)")

    # The statement registers the target table and the link column.
    assert prosto.get_table("Facts")
    assert prosto.get_table("Groups")
    assert prosto.get_column("Facts", "new_column")

    prosto.run()

    groups_df = prosto.get_table("Groups").get_df()
    assert len(groups_df) == 2
    assert len(prosto.get_table("Groups").get_df().columns) == 1

    link_values = prosto.get_table("Facts").get_series('new_column')
    assert list(link_values) == [0, 0, 1, 1]
Exemple #27
0
def test_calc_csql():
    """Calculate column via column_sql, with functions given in-query and by reference."""
    #
    # Test 1: function in-query
    #
    prosto = Prosto("My Prosto")

    prosto.column_sql("TABLE  My_table (A) FUNC lambda **m: pd.DataFrame({'A': [1, 2, 3]})")
    prosto.column_sql("CALCULATE  My_table (A) -> new_column FUNC lambda x: float(x)")

    assert prosto.get_table("My_table")
    assert prosto.get_column("My_table", "new_column")

    prosto.run()

    in_query_values = prosto.get_table("My_table").get_series('new_column')
    assert list(in_query_values) == [1.0, 2.0, 3.0]

    #
    # Test 2: function by-reference
    #
    prosto = Prosto("My Prosto")

    # The frame is passed directly: an in-query FUNC "lambda **m: df" cannot
    # resolve the name df during population.
    source_df = pd.DataFrame({'A': [1, 2, 3]})

    prosto.column_sql("TABLE  My_table (A)", source_df)
    prosto.column_sql("CALCULATE My_table (A) -> new_column", lambda x: float(x))

    assert prosto.get_table("My_table")
    assert prosto.get_column("My_table", "new_column")

    prosto.run()

    by_reference_values = prosto.get_table("My_table").get_series('new_column')
    assert list(by_reference_values) == [1.0, 2.0, 3.0]
Exemple #28
0
def test_roll_csql():
    """Rolling sum with window 2 defined through a column_sql ROLL statement."""
    ctx = Prosto("My Prosto")

    df = pd.DataFrame({'A': [1.0, 2.0, 3.0]})

    ctx.column_sql("TABLE  My_table (A)", lambda **m: df)
    ctx.column_sql("ROLL  My_table (A) -> new_column WINDOW 2",
                   lambda x: x.sum())

    assert ctx.get_table("My_table")
    assert ctx.get_column("My_table", "new_column")

    ctx.run()

    rolled = list(ctx.get_table("My_table").get_series('new_column'))

    # The first row has no full window. Check it with pd.isna so the test
    # accepts both None and NaN: a NaN never compares equal to None, so a
    # plain list comparison would fail for a float-typed result.
    assert len(rolled) == 3
    assert pd.isna(rolled[0])
    assert rolled[1:] == [3.0, 5.0]