Esempio n. 1
0
def test_merge_path2():
    """
    Merge a column reached over two links, writing the path as one separator-delimited string.

    The path "Link::SuperLink::C" is given as a single string instead of a list of simple segment names.
    Link goes from Facts to Groups (matching on A) and SuperLink goes from Groups to SuperGroups
    (matching on B), so the merged column on Facts is expected to carry the C values of SuperGroups.
    """
    prosto = Prosto("My Prosto")

    # Facts: the table that receives the link column and the merged column
    facts = prosto.populate(
        table_name="Facts", attributes=["A"],
        func="lambda **m: pd.DataFrame({'A': ['a', 'a', 'b', 'b']})", tables=[]
    )

    # Groups: first hop, found by A and carrying B
    groups = prosto.populate(
        table_name="Groups", attributes=["A", "B"],
        func="lambda **m: pd.DataFrame({'A': ['a', 'b', 'c'], 'B': [2.0, 3.0, 3.0]})", tables=[]
    )

    # Link: Facts -> Groups (the returned column handle is not needed in this test)
    prosto.link(
        name="Link", table=facts.id, type=groups.id,
        columns=["A"], linked_columns=["A"]
    )

    # SuperGroups: second hop, found by B and carrying C
    super_groups = prosto.populate(
        table_name="SuperGroups", attributes=["B", "C"],
        func="lambda **m: pd.DataFrame({'B': [2.0, 3.0, 4.0], 'C': ['x', 'y', 'z']})", tables=[]
    )

    # SuperLink: Groups -> SuperGroups
    prosto.link(
        name="SuperLink", table=groups.id, type=super_groups.id,
        columns=["B"], linked_columns=["B"]
    )

    # Merge: the whole path is one string with "::" between the segments
    prosto.merge("Merge", facts.id, ["Link::SuperLink::C"])

    prosto.run()

    facts_df = facts.get_df()
    row_count = len(facts_df)
    column_count = len(facts_df.columns)
    assert row_count == 4  # Same number of rows as produced by the populate function
    assert column_count == 3

    # 'a' facts -> B == 2.0 -> 'x'; 'b' facts -> B == 3.0 -> 'y'
    merged_values = facts.get_series("Merge").to_list()
    assert merged_values == ['x', 'x', 'y', 'y']
Esempio n. 2
0
def test_two_keys():
    """
    Link Facts to a projected Groups table using a two-column key.

    Groups is produced by projecting columns A and B of Facts into attributes X and Y. The test first
    evaluates every element by hand, then checks the layers built by Topology, and finally empties the
    Groups data and checks that a full run yields the same Groups shape again.
    """
    prosto = Prosto("My Prosto")

    # Facts
    facts = prosto.populate(
        table_name="Facts",
        attributes=["A", "B"],
        func="lambda **m: pd.DataFrame({'A': ['a', 'b', 'b', 'a'], 'B': ['b', 'c', 'c', 'a']})",
        tables=[],
    )

    # Groups: projection of (A, B) from Facts into (X, Y)
    groups = prosto.project(
        table_name="Groups",
        attributes=["X", "Y"],
        tables=["Facts"],
        columns=["A", "B"],
    )

    # Link: Facts (A, B) -> Groups (X, Y)
    link = prosto.link(
        name="Link",
        table=facts.id,
        type=groups.id,
        columns=["A", "B"],
        linked_columns=["X", "Y"],
    )

    # Manual evaluation: both tables first, then the link column
    for element in (facts, groups, link):
        element.evaluate()

    groups_df = groups.get_df()
    assert len(groups_df) == 3
    assert len(groups_df.columns) == 2

    # Expected link value for each of the four fact rows
    link_values = facts.get_series("Link")
    for position, expected in enumerate((0, 1, 1, 2)):
        assert link_values[position] == expected

    #
    # Test topology
    #
    topology = Topology(prosto)
    topology.translate()
    layers = topology.elem_layers

    expected_layers = ({"Facts"}, {"Groups"}, {"Link"})
    assert len(layers) == 3
    for depth, expected_ids in enumerate(expected_layers):
        assert {elem.id for elem in layers[depth]} == expected_ids

    # Empty the Groups data in place; the checks after run() expect the three rows to be there again
    stale_df = groups.get_df()
    stale_df.drop(stale_df.index, inplace=True)

    prosto.run()

    rebuilt_df = groups.get_df()
    assert len(rebuilt_df) == 3
    assert len(rebuilt_df.columns) == 2
Esempio n. 3
0
def test_two_keys():
    """
    Link Facts to an independently populated Groups table using a two-column key.

    Both tables are populated from literal data frames. The last fact row, ('a', 'a'), has no
    counterpart among the Groups rows, so its link value is expected to be missing. The same
    expectations are checked after manual evaluation and again after a full run.
    """
    prosto = Prosto("My Prosto")

    # Facts
    facts = prosto.populate(
        table_name="Facts",
        attributes=["A", "B"],
        func="lambda **m: pd.DataFrame({'A': ['a', 'b', 'b', 'a'], 'B': ['b', 'c', 'c', 'a']})",
        tables=[],
    )

    # Groups
    groups = prosto.populate(
        table_name="Groups",
        attributes=["A", "B"],
        func="lambda **m: pd.DataFrame({'A': ['a', 'b', 'a'], 'B': ['b', 'c', 'c'], 'C': [1, 2, 3]})",
        tables=[],
    )

    # Link: Facts (A, B) -> Groups (A, B)
    link = prosto.link(
        name="Link",
        table=facts.id,
        type=groups.id,
        columns=["A", "B"],
        linked_columns=["A", "B"],
    )

    # Manual evaluation: both tables first, then the link column
    for element in (facts, groups, link):
        element.evaluate()

    facts_df = facts.get_df()
    assert len(facts_df) == 4  # Same number of rows
    assert len(facts_df.columns) == 3

    matched_links = (0, 1, 1)  # Expected link values of the first three fact rows

    link_values = facts.get_series("Link")
    for position, expected in enumerate(matched_links):
        assert link_values[position] == expected
    assert pd.isna(link_values[3])  # Fourth fact has no matching group

    #
    # Test topology
    #
    topology = Topology(prosto)
    topology.translate()  # All data will be reset
    layers = topology.elem_layers

    expected_layers = ({"Facts", "Groups"}, {"Link"})
    assert len(layers) == 2
    for depth, expected_ids in enumerate(expected_layers):
        assert {elem.id for elem in layers[depth]} == expected_ids

    prosto.run()

    # The full run has to give the same link values as the manual evaluation
    link_values = facts.get_series("Link")
    for position, expected in enumerate(matched_links):
        assert link_values[position] == expected
    assert pd.isna(link_values[3])
Esempio n. 4
0
def test_aggregate_with_path():
    """
    Aggregate over a measure given as a column path.

    The measure "Link::B" is not a plain column of Facts but a path over the Link column, so a merge
    operation has to be produced automatically for the aggregation to work. Only ctx.run() is used
    here; nothing is evaluated by hand.
    """
    ctx = Prosto("My Prosto")

    # Facts
    f_tbl = ctx.populate(
        table_name="Facts",
        attributes=["A", "M"],
        func="lambda **m: pd.DataFrame({'A': ['a', 'a', 'b', 'b'], 'M': [1.0, 2.0, 3.0, 4.0]})",
        tables=[],
    )

    # Groups
    g_tbl = ctx.populate(
        table_name="Groups",
        attributes=["A", "B"],
        func="lambda **m: pd.DataFrame({'A': ['a', 'b', 'c'], 'B': [3.0, 2.0, 1.0]})",
        tables=[],
    )

    # Link: Facts -> Groups on A (the returned column handle is not needed in this test)
    ctx.link(
        name="Link",
        table=f_tbl.id,
        type=g_tbl.id,
        columns=["A"],
        linked_columns=["A"],
    )

    # Aggregation: per group, sum the B value reached through Link for every linked fact
    ctx.aggregate(
        name="Aggregate",
        table=g_tbl.id,
        tables=["Facts"],
        link="Link",
        func="lambda x, bias, **model: x.sum() + bias",
        columns=["Link::B"],
        model={"bias": 0.0},
    )

    ctx.run()

    a_clm_data = g_tbl.get_series('Aggregate')
    assert a_clm_data[0] == 6.0  # group 'a': two facts, B == 3.0 each
    assert a_clm_data[1] == 4.0  # group 'b': two facts, B == 2.0 each
    assert a_clm_data[2] == 0.0  # group 'c': no facts
Esempio n. 5
0
def test_calculate_with_path():
    """
    Test topology augmentation: calculate a column from a column path.

    The calculate function reads "Link::B", a path over the Link column rather than a plain column of
    Facts, so a merge operation has to be produced automatically. Only ctx.run() is used here; nothing
    is evaluated by hand.
    """
    ctx = Prosto("My Prosto")

    # Facts
    f_tbl = ctx.populate(
        table_name="Facts", attributes=["A", "M"],
        func="lambda **m: pd.DataFrame({'A': ['a', 'a', 'b', 'b'], 'M': [1.0, 2.0, 3.0, 4.0]})", tables=[]
    )

    # Groups
    g_tbl = ctx.populate(
        table_name="Groups", attributes=["A", "B"],
        func="lambda **m: pd.DataFrame({'A': ['a', 'b', 'c'], 'B': [3.0, 2.0, 1.0]})", tables=[]
    )

    # Link: Facts -> Groups on A (the returned column handle is not needed in this test)
    ctx.link(
        name="Link", table=f_tbl.id, type=g_tbl.id,
        columns=["A"], linked_columns=["A"]
    )

    # Calculate: M plus the B value reached through Link
    ctx.calculate(
        name="My column", table=f_tbl.id,
        func="lambda x: x['M'] + x['Link::B']", columns=["M", "Link::B"], model=None
    )

    ctx.run()

    clm_data = f_tbl.get_series('My column')
    assert clm_data[0] == 4.0  # 1.0 + 3.0
    assert clm_data[1] == 5.0  # 2.0 + 3.0
    assert clm_data[2] == 5.0  # 3.0 + 2.0
    assert clm_data[3] == 6.0  # 4.0 + 2.0
Esempio n. 6
0
def test_aggregate():
    """
    Aggregate fact measures into a group table through a link column.

    Three stages are checked:
    1. every element is evaluated by hand and the single-column aggregate (sum of M) is verified;
    2. the layers built by Topology are verified and the same aggregate is re-checked after a full run;
    3. a second aggregate over two columns (M and N) is added and verified after translate() and run().
    """
    ctx = Prosto("My Prosto")

    # Facts
    f_tbl = ctx.populate(
        table_name="Facts",
        attributes=["A", "M"],
        func="lambda **m: pd.DataFrame({'A': ['a', 'a', 'b', 'b'], 'M': [1.0, 2.0, 3.0, 4.0], 'N': [4.0, 3.0, 2.0, 1.0]})",
        tables=[],
    )

    # Groups
    g_tbl = ctx.populate(
        table_name="Groups",
        attributes=["A"],
        func="lambda **m: pd.DataFrame({'A': ['a', 'b', 'c']})",
        tables=[],
    )

    # Link: Facts -> Groups on A
    l_clm = ctx.link(
        name="Link",
        table=f_tbl.id,
        type=g_tbl.id,
        columns=["A"],
        linked_columns=["A"],
    )

    # Aggregation: per group, sum of M over the linked facts plus the bias taken from the model
    a_clm = ctx.aggregate(
        name="Aggregate",
        table=g_tbl.id,
        tables=["Facts"],
        link="Link",
        func="lambda x, bias, **model: x.sum() + bias",
        columns=["M"],
        model={"bias": 0.0},
    )

    # Manual evaluation: tables first, then the link, then the aggregate that relies on the link
    f_tbl.evaluate()
    g_tbl.evaluate()

    l_clm.evaluate()
    a_clm.evaluate()

    g_tbl_data = g_tbl.get_df()
    assert len(g_tbl_data) == 3  # Same number of rows
    # One aggregate column was added.
    # NOTE(review): the original remark also mentioned a technical "id" column that "might be removed
    # in future"; a count of 2 would match just A + Aggregate -- confirm which columns are counted.
    assert len(g_tbl_data.columns) == 2

    a_clm_data = g_tbl.get_series('Aggregate')
    assert a_clm_data[0] == 3.0  # group 'a': 1.0 + 2.0
    assert a_clm_data[1] == 7.0  # group 'b': 3.0 + 4.0
    assert a_clm_data[2] == 0.0  # group 'c': no facts

    #
    # Test topology
    #
    topology = Topology(ctx)
    topology.translate()  # All data will be reset
    layers = topology.elem_layers

    assert len(layers) == 3

    assert {x.id for x in layers[0]} == {"Facts", "Groups"}
    assert {x.id for x in layers[1]} == {"Link"}
    assert {x.id for x in layers[2]} == {"Aggregate"}

    ctx.run()

    # The full run has to give the same values as the manual evaluation
    a_clm_data = g_tbl.get_series('Aggregate')
    assert a_clm_data[0] == 3.0
    assert a_clm_data[1] == 7.0
    assert a_clm_data[2] == 0.0

    #
    # Aggregation of multiple columns
    #
    # With two input columns the function indexes them by name (x['M'], x['N']).
    # The returned column handle is not needed because the column is only evaluated through ctx.run().
    ctx.aggregate(
        name="Aggregate 2",
        table=g_tbl.id,
        tables=["Facts"],
        link="Link",
        func="lambda x, my_param, **model: x['M'].sum() + x['N'].sum() + my_param",
        columns=["M", "N"],
        model={"my_param": 0.0},
    )

    ctx.translate()  # All data will be reset
    # NOTE(review): the original author remarked here that the new column is NOT added to the existing
    # data frame ("not clear where it is"), yet the assertions below read it via g_tbl.get_series -- confirm.
    ctx.run()

    a_clm2_data = g_tbl.get_series('Aggregate 2')
    assert a_clm2_data[0] == 10.0  # group 'a': (1.0 + 2.0) + (4.0 + 3.0)
    assert a_clm2_data[1] == 10.0  # group 'b': (3.0 + 4.0) + (2.0 + 1.0)
    assert a_clm2_data[2] == 0.0  # group 'c': no facts
Esempio n. 7
0
def test_merge_path():
    """
    Merge a column reached over two links, writing the path as a list of segment names.

    Link goes from Facts to Groups (matching on A) and SuperLink goes from Groups to SuperGroups
    (matching on B). The merge path ["Link", "SuperLink", "C"] is expected to bring the C values of
    SuperGroups into Facts. The result is checked after manual evaluation, the layers built by
    Topology are checked, and the result is checked again after a full run.
    """
    prosto = Prosto("My Prosto")

    # Facts
    facts = prosto.populate(
        table_name="Facts", attributes=["A"],
        func="lambda **m: pd.DataFrame({'A': ['a', 'a', 'b', 'b']})", tables=[]
    )

    # Groups
    groups = prosto.populate(
        table_name="Groups", attributes=["A", "B"],
        func="lambda **m: pd.DataFrame({'A': ['a', 'b', 'c'], 'B': [2.0, 3.0, 3.0]})", tables=[]
    )

    # Link: Facts -> Groups
    link = prosto.link(
        name="Link", table=facts.id, type=groups.id,
        columns=["A"], linked_columns=["A"]
    )

    # SuperGroups
    super_groups = prosto.populate(
        table_name="SuperGroups", attributes=["B", "C"],
        func="lambda **m: pd.DataFrame({'B': [2.0, 3.0, 4.0], 'C': ['x', 'y', 'z']})", tables=[]
    )

    # SuperLink: Groups -> SuperGroups
    super_link = prosto.link(
        name="SuperLink", table=groups.id, type=super_groups.id,
        columns=["B"], linked_columns=["B"]
    )

    # Merge: one list element per path segment
    merge = prosto.merge("Merge", facts.id, ["Link", "SuperLink", "C"])

    # Manual evaluation: the three tables, then both links, then the merge
    for element in (facts, groups, super_groups, link, super_link, merge):
        element.evaluate()

    facts_df = facts.get_df()
    row_count = len(facts_df)
    column_count = len(facts_df.columns)
    assert row_count == 4  # Same number of rows
    assert column_count == 3

    # 'a' facts -> B == 2.0 -> 'x'; 'b' facts -> B == 3.0 -> 'y'
    expected_merge = ['x', 'x', 'y', 'y']
    assert facts.get_series("Merge").to_list() == expected_merge

    #
    # Test topology
    #
    topology = Topology(prosto)
    topology.translate()  # All data will be reset
    layers = topology.elem_layers

    expected_layers = (
        {"Facts", "Groups", "SuperGroups"},
        {"Link", "SuperLink"},
        {"Merge"},
    )
    assert len(layers) == 3
    for depth, expected_ids in enumerate(expected_layers):
        assert {elem.id for elem in layers[depth]} == expected_ids

    prosto.run()

    # The full run has to give the same merged values as the manual evaluation
    assert facts.get_series("Merge").to_list() == expected_merge