Esempio n. 1
0
def test_basic_spacetime():
    """Aggregate events over 1y/2y/all intervals and check both output tables.

    The ``events_data`` and ``state_data`` fixtures (defined outside this
    function) are loaded into a ``testing.postgresql`` instance, and
    ``outcome::int`` is aggregated with ``sum`` and ``avg`` per ``entity_id``
    at two dates. The test then checks every row of ``events_entity_id`` and
    rows 6 and 7 of ``events_aggregation_imputed``.
    """
    with testing.postgresql.Postgresql() as postgres:
        db = sqlalchemy.create_engine(postgres.url())

        db.execute(
            "create table events (entity_id int, event_date date, outcome bool)"
        )
        for event_row in events_data:
            db.execute("insert into events values (%s, %s, %s::bool)", event_row)

        db.execute("create table states (entity_id int, as_of_date date)")
        for state_row in state_data:
            db.execute("insert into states values (%s, %s)", state_row)

        imputation_rules = {
            "coltype": "aggregate",
            "avg": {"type": "mean"},
            "sum": {"type": "constant", "value": 3},
            "max": {"type": "zero"},
        }
        outcome_aggregate = Aggregate("outcome::int", ["sum", "avg"], imputation_rules)
        aggregation = SpacetimeAggregation(
            aggregates=[outcome_aggregate],
            from_obj="events",
            groups=["entity_id"],
            intervals=["1y", "2y", "all"],
            dates=["2016-01-01", "2015-01-01"],
            state_table="states",
            state_group="entity_id",
            date_column="event_date",
            output_date_column="as_of_date",
        )
        aggregation.execute(db.connect())

        # --- raw (non-imputed) aggregation table -------------------------
        aggregated = list(
            db.execute("select * from events_entity_id order by entity_id, as_of_date")
        )

        # Column order here fixes the order of the values in each tuple below.
        aggregate_columns = (
            "events_entity_id_1y_outcome::int_sum",
            "events_entity_id_1y_outcome::int_avg",
            "events_entity_id_2y_outcome::int_sum",
            "events_entity_id_2y_outcome::int_avg",
            "events_entity_id_all_outcome::int_sum",
            "events_entity_id_all_outcome::int_avg",
        )
        expected_rows = (
            (1, date(2015, 1, 1), (1, 0.5, 1, 0.5, 1, 0.5)),
            (1, date(2016, 1, 1), (1, 0.5, 2, 0.5, 2, 0.5)),
            (2, date(2015, 1, 1), (0, 0, 1, 0.5, 1, 0.5)),
            (2, date(2016, 1, 1), (None, None, 0, 0, 1, 0.5)),
            (3, date(2015, 1, 1), (0, 0, 0, 0, 0, 0)),
            (3, date(2016, 1, 1), (1, 0.5, 1, 0.25, 1, 0.25)),
            # Entity 4 has no 2015-01-01 row in this table (no data for it),
            # so its only row is the 2016-01-01 one.
            (4, date(2016, 1, 1), (0, 0, 0, 0, 0, 0)),
        )
        for position, (entity_id, as_of_date, values) in enumerate(expected_rows):
            actual = aggregated[position]
            assert actual["entity_id"] == entity_id
            assert actual["as_of_date"] == as_of_date
            for column, wanted in zip(aggregate_columns, values):
                if wanted is None:
                    # NULL aggregates must come back as None, not merely falsy.
                    assert actual[column] is None
                else:
                    assert actual[column] == wanted
        assert len(aggregated) == len(expected_rows)

        # --- imputed table: check some imputation results -----------------
        imputed = list(
            db.execute(
                "select * from events_aggregation_imputed order by entity_id, as_of_date"
            )
        )

        # (sum, sum flag, avg, avg flag, avg of the 2015 row rounded to 4 places)
        imputed_columns = (
            (
                "events_entity_id_1y_outcome::int_sum",
                "events_entity_id_1y_outcome::int_sum_imp",
                "events_entity_id_1y_outcome::int_avg",
                "events_entity_id_1y_outcome::int_avg_imp",
                0.1667,
            ),
            (
                "events_entity_id_2y_outcome::int_sum",
                "events_entity_id_2y_outcome::int_sum_imp",
                "events_entity_id_2y_outcome::int_avg",
                "events_entity_id_2y_outcome::int_avg_imp",
                0.3333,
            ),
            (
                "events_entity_id_all_outcome::int_sum",
                "events_entity_id_all_outcome::int_sum_imp",
                "events_entity_id_all_outcome::int_avg",
                "events_entity_id_all_outcome::int_avg_imp",
                0.3333,
            ),
        )

        # Entity 4 at 2015-01-01: every "_imp" flag is 1 and each sum is 3.
        flagged = imputed[6]
        assert flagged["entity_id"] == 4
        assert flagged["as_of_date"] == date(2015, 1, 1)
        for sum_col, sum_flag, avg_col, avg_flag, rounded_avg in imputed_columns:
            assert flagged[sum_col] == 3
            assert flagged[sum_flag] == 1
            assert round(float(flagged[avg_col]), 4) == rounded_avg
            assert flagged[avg_flag] == 1

        # Entity 4 at 2016-01-01: all values and all "_imp" flags are 0.
        unflagged = imputed[7]
        assert unflagged["entity_id"] == 4
        assert unflagged["as_of_date"] == date(2016, 1, 1)
        for sum_col, sum_flag, avg_col, avg_flag, _ in imputed_columns:
            assert unflagged[sum_col] == 0
            assert unflagged[sum_flag] == 0
            assert unflagged[avg_col] == 0
            assert unflagged[avg_flag] == 0
        assert len(imputed) == 8
Esempio n. 2
0
def test_input_min_date():
    """Check aggregation output and validation when ``input_min_date`` is set.

    With ``input_min_date="2015-11-10"`` and the single ``all`` interval, the
    ``events_entity_id`` table is expected to hold exactly two rows (entities
    1 and 4). With ``input_min_date="2014-11-10"``, a ``1y`` interval and an
    additional 2015-01-01 date, both ``validate`` and ``execute`` must raise
    ``ValueError``.
    """
    with testing.postgresql.Postgresql() as postgres:
        db = sqlalchemy.create_engine(postgres.url())

        db.execute("create table events (entity_id int, date date, outcome bool)")
        for event_row in events_data:
            db.execute("insert into events values (%s, %s, %s::bool)", event_row)

        db.execute("create table states (entity_id int, date date)")
        for state_row in state_data:
            db.execute("insert into states values (%s, %s)", state_row)

        imputation_rules = {
            "coltype": "aggregate",
            "avg": {"type": "mean"},
            "sum": {"type": "constant", "value": 3},
            "max": {"type": "zero"},
        }
        outcome_aggregate = Aggregate("outcome::int", ["sum", "avg"], imputation_rules)

        def build_aggregation(intervals, dates, input_min_date):
            # Only these three settings differ between the two scenarios below;
            # a fresh aggregates list is created on every call.
            return SpacetimeAggregation(
                aggregates=[outcome_aggregate],
                from_obj="events",
                groups=["entity_id"],
                intervals=intervals,
                dates=dates,
                state_table="states",
                state_group="entity_id",
                date_column='"date"',
                input_min_date=input_min_date,
            )

        # Scenario 1: the aggregation runs and produces two rows.
        build_aggregation(["all"], ["2016-01-01"], "2015-11-10").execute(db.connect())

        aggregated = list(db.execute("select * from events_entity_id order by entity_id"))

        # (entity_id, expected sum, expected avg) in result order.
        expected_rows = ((1, 1, 1), (4, 0, 0))
        for position, (entity_id, total, mean) in enumerate(expected_rows):
            actual = aggregated[position]
            assert actual["entity_id"] == entity_id
            assert actual["date"] == date(2016, 1, 1)
            assert actual["events_entity_id_all_outcome::int_sum"] == total
            assert actual["events_entity_id_all_outcome::int_avg"] == mean

        assert len(aggregated) == 2

        # Scenario 2: this configuration must be rejected.
        # NOTE(review): presumably the 1y window before 2015-01-01 starts
        # earlier than input_min_date -- confirm against validate().
        rejected = build_aggregation(
            ["1y", "all"], ["2016-01-01", "2015-01-01"], "2014-11-10"
        )
        with pytest.raises(ValueError):
            rejected.validate(db.connect())
        with pytest.raises(ValueError):
            rejected.execute(db.connect())
Esempio n. 3
0
def test_basic_spacetime():
    """Aggregate events over 1y/2y/all intervals and check both output tables.

    Loads the ``events_data`` and ``state_data`` fixtures (defined outside
    this function) into a ``testing.postgresql`` instance, runs a
    ``SpacetimeAggregation`` of ``outcome::int`` (``sum`` and ``avg``) grouped
    by ``entity_id`` at 2015-01-01 and 2016-01-01, then asserts on every row
    of ``events_entity_id`` and on rows 6 and 7 of
    ``events_aggregation_imputed``.
    """
    with testing.postgresql.Postgresql() as psql:
        engine = sqlalchemy.create_engine(psql.url())
        engine.execute(
            "create table events (entity_id int, event_date date, outcome bool)"
        )
        for event in events_data:
            engine.execute("insert into events values (%s, %s, %s::bool)", event)

        engine.execute("create table states (entity_id int, as_of_date date)")
        for state in state_data:
            engine.execute("insert into states values (%s, %s)", state)

        agg = Aggregate(
            "outcome::int",
            ["sum", "avg"],
            {
                "coltype": "aggregate",
                "avg": {"type": "mean"},
                "sum": {"type": "constant", "value": 3},
                "max": {"type": "zero"},
            },
        )
        st = SpacetimeAggregation(
            aggregates=[agg],
            from_obj="events",
            groups=["entity_id"],
            intervals=["1y", "2y", "all"],
            dates=["2016-01-01", "2015-01-01"],
            state_table="states",
            state_group="entity_id",
            date_column="event_date",
            output_date_column="as_of_date",
        )

        st.execute(engine.connect())

        r = engine.execute(
            "select * from events_entity_id order by entity_id, as_of_date"
        )
        rows = list(r)
        assert rows[0]["entity_id"] == 1
        assert rows[0]["as_of_date"] == date(2015, 1, 1)
        assert rows[0]["events_entity_id_1y_outcome::int_sum"] == 1
        assert rows[0]["events_entity_id_1y_outcome::int_avg"] == 0.5
        assert rows[0]["events_entity_id_2y_outcome::int_sum"] == 1
        assert rows[0]["events_entity_id_2y_outcome::int_avg"] == 0.5
        assert rows[0]["events_entity_id_all_outcome::int_sum"] == 1
        assert rows[0]["events_entity_id_all_outcome::int_avg"] == 0.5
        assert rows[1]["entity_id"] == 1
        assert rows[1]["as_of_date"] == date(2016, 1, 1)
        assert rows[1]["events_entity_id_1y_outcome::int_sum"] == 1
        assert rows[1]["events_entity_id_1y_outcome::int_avg"] == 0.5
        assert rows[1]["events_entity_id_2y_outcome::int_sum"] == 2
        assert rows[1]["events_entity_id_2y_outcome::int_avg"] == 0.5
        assert rows[1]["events_entity_id_all_outcome::int_sum"] == 2
        assert rows[1]["events_entity_id_all_outcome::int_avg"] == 0.5

        assert rows[2]["entity_id"] == 2
        assert rows[2]["as_of_date"] == date(2015, 1, 1)
        assert rows[2]["events_entity_id_1y_outcome::int_sum"] == 0
        assert rows[2]["events_entity_id_1y_outcome::int_avg"] == 0
        assert rows[2]["events_entity_id_2y_outcome::int_sum"] == 1
        assert rows[2]["events_entity_id_2y_outcome::int_avg"] == 0.5
        assert rows[2]["events_entity_id_all_outcome::int_sum"] == 1
        assert rows[2]["events_entity_id_all_outcome::int_avg"] == 0.5
        assert rows[3]["entity_id"] == 2
        assert rows[3]["as_of_date"] == date(2016, 1, 1)
        # NULL aggregates must be checked by identity: "== None" can be
        # satisfied by any object with a permissive __eq__.
        assert rows[3]["events_entity_id_1y_outcome::int_sum"] is None
        assert rows[3]["events_entity_id_1y_outcome::int_avg"] is None
        assert rows[3]["events_entity_id_2y_outcome::int_sum"] == 0
        assert rows[3]["events_entity_id_2y_outcome::int_avg"] == 0
        assert rows[3]["events_entity_id_all_outcome::int_sum"] == 1
        assert rows[3]["events_entity_id_all_outcome::int_avg"] == 0.5

        assert rows[4]["entity_id"] == 3
        assert rows[4]["as_of_date"] == date(2015, 1, 1)
        assert rows[4]["events_entity_id_1y_outcome::int_sum"] == 0
        assert rows[4]["events_entity_id_1y_outcome::int_avg"] == 0
        assert rows[4]["events_entity_id_2y_outcome::int_sum"] == 0
        assert rows[4]["events_entity_id_2y_outcome::int_avg"] == 0
        assert rows[4]["events_entity_id_all_outcome::int_sum"] == 0
        assert rows[4]["events_entity_id_all_outcome::int_avg"] == 0
        assert rows[5]["entity_id"] == 3
        assert rows[5]["as_of_date"] == date(2016, 1, 1)
        assert rows[5]["events_entity_id_1y_outcome::int_sum"] == 1
        assert rows[5]["events_entity_id_1y_outcome::int_avg"] == 0.5
        assert rows[5]["events_entity_id_2y_outcome::int_sum"] == 1
        assert rows[5]["events_entity_id_2y_outcome::int_avg"] == 0.25
        assert rows[5]["events_entity_id_all_outcome::int_sum"] == 1
        assert rows[5]["events_entity_id_all_outcome::int_avg"] == 0.25

        assert rows[6]["entity_id"] == 4
        # Entity 4 has no 2015-01-01 row in this table (no data for it), so
        # rows[6] is already its 2016-01-01 row.
        assert rows[6]["as_of_date"] == date(2016, 1, 1)
        assert rows[6]["events_entity_id_1y_outcome::int_sum"] == 0
        assert rows[6]["events_entity_id_1y_outcome::int_avg"] == 0
        assert rows[6]["events_entity_id_2y_outcome::int_sum"] == 0
        assert rows[6]["events_entity_id_2y_outcome::int_avg"] == 0
        assert rows[6]["events_entity_id_all_outcome::int_sum"] == 0
        assert rows[6]["events_entity_id_all_outcome::int_avg"] == 0
        assert len(rows) == 7

        # check some imputation results
        r = engine.execute(
            "select * from events_aggregation_imputed order by entity_id, as_of_date"
        )
        rows = list(r)
        # Entity 4 at 2015-01-01: every "_imp" flag is 1 and each sum is 3.
        assert rows[6]["entity_id"] == 4
        assert rows[6]["as_of_date"] == date(2015, 1, 1)
        assert rows[6]["events_entity_id_1y_outcome::int_sum"] == 3
        assert rows[6]["events_entity_id_1y_outcome::int_sum_imp"] == 1
        assert (
            round(float(rows[6]["events_entity_id_1y_outcome::int_avg"]), 4) == 0.1667
        )
        assert rows[6]["events_entity_id_1y_outcome::int_avg_imp"] == 1
        assert rows[6]["events_entity_id_2y_outcome::int_sum"] == 3
        assert rows[6]["events_entity_id_2y_outcome::int_sum_imp"] == 1
        assert (
            round(float(rows[6]["events_entity_id_2y_outcome::int_avg"]), 4) == 0.3333
        )
        assert rows[6]["events_entity_id_2y_outcome::int_avg_imp"] == 1
        assert rows[6]["events_entity_id_all_outcome::int_sum"] == 3
        assert rows[6]["events_entity_id_all_outcome::int_sum_imp"] == 1
        assert (
            round(float(rows[6]["events_entity_id_all_outcome::int_avg"]), 4) == 0.3333
        )
        assert rows[6]["events_entity_id_all_outcome::int_avg_imp"] == 1
        # Entity 4 at 2016-01-01: all values and all "_imp" flags are 0.
        assert rows[7]["entity_id"] == 4
        assert rows[7]["as_of_date"] == date(2016, 1, 1)
        assert rows[7]["events_entity_id_1y_outcome::int_sum"] == 0
        assert rows[7]["events_entity_id_1y_outcome::int_sum_imp"] == 0
        assert rows[7]["events_entity_id_1y_outcome::int_avg"] == 0
        assert rows[7]["events_entity_id_1y_outcome::int_avg_imp"] == 0
        assert rows[7]["events_entity_id_2y_outcome::int_sum"] == 0
        assert rows[7]["events_entity_id_2y_outcome::int_sum_imp"] == 0
        assert rows[7]["events_entity_id_2y_outcome::int_avg"] == 0
        assert rows[7]["events_entity_id_2y_outcome::int_avg_imp"] == 0
        assert rows[7]["events_entity_id_all_outcome::int_sum"] == 0
        assert rows[7]["events_entity_id_all_outcome::int_sum_imp"] == 0
        assert rows[7]["events_entity_id_all_outcome::int_avg"] == 0
        assert rows[7]["events_entity_id_all_outcome::int_avg_imp"] == 0
        assert len(rows) == 8
Esempio n. 4
0
def test_join_with_cohort_table(db_engine):
    """Check that ``join_with_cohort_table=True`` limits output to the cohort.

    A cohort table containing only entities 1 and 2 is created and passed as
    ``state_table``. After running the aggregation, ``events_entity_id`` must
    contain exactly four rows: entities 1 and 2 at 2015-01-01 and 2016-01-01.

    Args:
        db_engine: Engine-like fixture exposing ``execute`` and ``connect``;
            it is supplied from outside this function.
    """
    # if we specify joining with the cohort table
    # only entity_id/date pairs in the cohort table should show up
    db_engine.execute("create table events (entity_id int, date date, outcome bool)")
    for event in events_data:
        db_engine.execute("insert into events values (%s, %s, %s::bool)", event)

    db_engine.execute("create table cohort (entity_id int, date date)")

    # Build the cohort as every (entity, date) pair from the event data plus
    # 2016-01-01, restricted to entities 1 and 2.
    cohort_entities = {row[0] for row in events_data if row[0] in (1, 2)}
    cohort_dates = {row[1] for row in events_data} | {date(2016, 1, 1)}
    smaller_cohort = sorted(product(cohort_entities, cohort_dates))
    for state in smaller_cohort:
        db_engine.execute("insert into cohort values (%s, %s)", state)

    # create our test aggregation with the important 'join_with_cohort_table' flag
    agg = Aggregate(
        "outcome::int",
        ["sum", "avg"],
        {
            "coltype": "aggregate",
            "avg": {"type": "mean"},
            "sum": {"type": "constant", "value": 3},
            "max": {"type": "zero"},
        },
    )
    st = SpacetimeAggregation(
        aggregates=[agg],
        from_obj="events",
        groups=["entity_id"],
        intervals=["all"],
        dates=["2016-01-01", "2015-01-01"],
        state_table="cohort",
        state_group="entity_id",
        date_column='"date"',
        join_with_cohort_table=True,
    )

    st.execute(db_engine.connect())

    rows = list(
        db_engine.execute("select * from events_entity_id order by entity_id, date")
    )

    # Only the rows for entities 1 and 2 should be present.
    assert len(rows) == 4

    # (entity_id, date, expected sum, expected avg) in result order.
    expected_rows = (
        (1, date(2015, 1, 1), 1, 0.5),
        (1, date(2016, 1, 1), 2, 0.5),
        (2, date(2015, 1, 1), 1, 0.5),
        (2, date(2016, 1, 1), 1, 0.5),
    )
    for row, (entity_id, as_of, total, mean) in zip(rows, expected_rows):
        assert row["entity_id"] == entity_id
        assert row["date"] == as_of
        assert row["events_entity_id_all_outcome::int_sum"] == total
        assert row["events_entity_id_all_outcome::int_avg"] == mean
Esempio n. 5
0
def test_input_min_date():
    """Exercise ``SpacetimeAggregation`` with two ``input_min_date`` values.

    First run: ``input_min_date='2015-11-10'`` with only the ``all`` interval
    at 2016-01-01; ``events_entity_id`` must then contain exactly two rows,
    for entities 1 and 4. Second configuration: ``input_min_date='2014-11-10'``
    with ``1y`` and ``all`` intervals at two dates; ``validate`` and
    ``execute`` are both expected to raise ``ValueError``.
    """
    sum_column = 'events_entity_id_all_outcome::int_sum'
    avg_column = 'events_entity_id_all_outcome::int_avg'

    def expect_row(row, entity_id, total, mean):
        # Each checked row is for the single aggregation date, 2016-01-01.
        assert row['entity_id'] == entity_id
        assert row['date'] == date(2016, 1, 1)
        assert row[sum_column] == total
        assert row[avg_column] == mean

    with testing.postgresql.Postgresql() as server:
        database = sqlalchemy.create_engine(server.url())

        database.execute('create table events (entity_id int, date date, outcome bool)')
        for record in events_data:
            database.execute('insert into events values (%s, %s, %s::bool)', record)

        database.execute('create table states (entity_id int, date date)')
        for record in state_data:
            database.execute('insert into states values (%s, %s)', record)

        rules = {
            "coltype": "aggregate",
            "avg": {"type": "mean"},
            "sum": {"type": "constant", "value": 3},
            "max": {"type": "zero"},
        }
        outcome = Aggregate('outcome::int', ['sum', 'avg'], rules)

        accepted = SpacetimeAggregation(
            aggregates=[outcome],
            from_obj='events',
            groups=['entity_id'],
            intervals=['all'],
            dates=['2016-01-01'],
            state_table='states',
            state_group='entity_id',
            date_column='"date"',
            input_min_date='2015-11-10',
        )
        accepted.execute(database.connect())

        result = database.execute('select * from events_entity_id order by entity_id')
        fetched = list(result)

        expect_row(fetched[0], 1, 1, 1)
        expect_row(fetched[1], 4, 0, 0)

        assert len(fetched) == 2

        # This second configuration must fail both validation and execution.
        rejected = SpacetimeAggregation(
            aggregates=[outcome],
            from_obj='events',
            groups=['entity_id'],
            intervals=['1y', 'all'],
            dates=['2016-01-01', '2015-01-01'],
            state_table='states',
            state_group='entity_id',
            date_column='"date"',
            input_min_date='2014-11-10',
        )
        with pytest.raises(ValueError):
            rejected.validate(database.connect())
        with pytest.raises(ValueError):
            rejected.execute(database.connect())