Example 1
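Aggregate keeps DISTINCT qualifiers in the quantity intact, and the optional when argument becomes a SQL FILTER (WHERE ...) clause on the generated column: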
def test_distinct():
    assert list(map(str, Aggregate("distinct x", "count", {}).get_columns())) == [
        "count(distinct x)"
    ]

    assert list(
        map(
            str,
            Aggregate("distinct x", "count", {}).get_columns(
                when="date < '2012-01-01'"
            ),
        )
    ) == ["count(distinct x) FILTER (WHERE date < '2012-01-01')"]

    assert list(
        map(
            str,
            Aggregate("distinct(x)", "count", {}).get_columns(
                when="date < '2012-01-01'"
            ),
        )
    ) == ["count(distinct (x)) FILTER (WHERE date < '2012-01-01')"]

    assert list(
        map(
            str,
            Aggregate("distinct(x,y)", "count", {}).get_columns(
                when="date < '2012-01-01'"
            ),
        )
    ) == ["count(distinct (x,y)) FILTER (WHERE date < '2012-01-01')"]
Example 2
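Aggregate objects support arithmetic. Division multiplies the numerator by 1.0 to force floating-point division in SQL, and the derived column's name joins the operand names with the operator characters: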
def test_aggregate_arithmetic():
    n = Aggregate("x", "sum", {})
    d = Aggregate("1", "count", {})
    m = Aggregate("y", "avg", {})

    (e,) = (n / d + m).get_columns(prefix="prefix_")
    assert str(e) == "((sum(x)*1.0 / count(1)) + avg(y))"
    assert e.name == "prefix_x_sum/1_count+y_avg"
Example 3
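colname_aggregate_lookup on an Aggregation maps each generated column name (prefix, group, quantity, function) to the aggregation function that produced it: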
def test_Aggregation_colname_aggregate_lookup():
    n = Aggregate("x", "sum", {})
    d = Aggregate("1", "count", {})
    m = Aggregate("y", "avg", {})
    aggregation = Aggregation([n, d, m],
                              groups=['entity_id'],
                              from_obj="source",
                              prefix="mysource",
                              state_table="tbl")
    assert aggregation.colname_aggregate_lookup == {
        'mysource_entity_id_x_sum': 'sum',
        'mysource_entity_id_1_count': 'count',
        'mysource_entity_id_y_avg': 'avg'
    }
Example 4
def test_Aggregation_colname_agg_function():
    n = Aggregate("x", "sum", {})
    d = Aggregate("1", "count", {})
    m = Aggregate("y", "stddev_samp", {})
    aggregation = Aggregation([n, d, m],
                              groups=['entity_id'],
                              from_obj="source",
                              prefix="mysource",
                              state_table="tbl")

    assert aggregation.colname_agg_function(
        'mysource_entity_id_x_sum') == 'sum'
    assert aggregation.colname_agg_function(
        'mysource_entity_id_y_stddev_samp') == 'stddev_samp'
Example 5
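Values passed via format_kwargs are substituted into format placeholders in the quantity before the SQL is rendered: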
def test_aggregate_format_kwargs():
    agg = Aggregate("'{collate_date}' - date", "min", {})
    assert list(
        map(str, agg.get_columns(
            format_kwargs={"collate_date": "2012-01-01"}))) == [
                "min('2012-01-01' - date)"
            ]
Example 6
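groups and intervals may also be given as dicts that map a group alias to a SQL column and to per-group interval lists; Aggregate objects can be combined with +: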
def test_st_explicit_execute():
    agg = Aggregate({"F": "results='Fail'"}, ["count"], IMPUTE_RULES)
    mode = Aggregate("", "mode", IMPUTE_RULES, order="zip")
    st = SpacetimeAggregation(
        [agg, agg + agg, mode],
        from_obj=ex.table("food_inspections"),
        groups={"license": ex.column("license_no"), "zip": ex.column("zip")},
        intervals={"license": ["1 year", "2 years", "all"], "zip": ["1 year"]},
        dates=["2016-08-30", "2015-11-06"],
        state_table="inspection_states",
        state_group="license_no",
        date_column="inspection_date",
        prefix="food_inspections",
    )
    with Postgresql() as postgresql:
        engine = create_engine(postgresql.url())
        st.execute(engine.connect())
Example 7
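imputation_flag_base drops the function suffix for sum and count columns, so a quantity's sum and count share one imputation flag; other functions such as stddev_samp keep the full column name, and unknown columns raise a KeyError: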
def test_Aggregation_imputation_flag_base():
    n = Aggregate("x", ["sum", "count"], {})
    m = Aggregate("y", "stddev_samp", {})
    aggregation = Aggregation([n, m],
                              groups=['entity_id'],
                              from_obj="source",
                              prefix="mysource",
                              state_table="tbl")

    assert aggregation.imputation_flag_base(
        'mysource_entity_id_x_sum') == 'mysource_entity_id_x'
    assert aggregation.imputation_flag_base(
        'mysource_entity_id_x_count') == 'mysource_entity_id_x'
    assert aggregation.imputation_flag_base(
        'mysource_entity_id_y_stddev_samp'
    ) == 'mysource_entity_id_y_stddev_samp'
    with pytest.raises(KeyError):
        aggregation.imputation_flag_base('mysource_entity_id_x_stddev_samp')
Example 8
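A plain Aggregation carries no dates or intervals; it aggregates the from_obj once per group: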
def test_execute():
    agg = Aggregate("results='Fail'", ["count"], IMPUTE_RULES)
    st = Aggregation(
        [agg],
        from_obj='food_inspections',
        groups=['license_no', 'zip'],
        state_table='all_licenses',
        state_group='license_no',
    )
    with Postgresql() as postgresql:
        engine = create_engine(postgresql.url())
        st.execute(engine.connect())
Example 9
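index_column_lookup maps each imputed feature table to its index columns: the output date column plus every grouping column of that aggregation: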
def test_index_column_lookup(test_engine):
    aggregations = [
        SpacetimeAggregation(
            prefix="prefix1",
            aggregates=[
                Categorical(
                    col="cat_one",
                    function="sum",
                    choices=["good", "bad", "inbetween"],
                    impute_rules={"coltype": "categorical", "all": {"type": "zero"}},
                )
            ],
            groups=["entity_id"],
            intervals=["all"],
            date_column="knowledge_date",
            output_date_column="as_of_date",
            dates=["2013-09-30", "2014-09-30"],
            state_table="states",
            state_group="entity_id",
            schema="features",
            from_obj="data",
        ),
        SpacetimeAggregation(
            prefix="prefix2",
            aggregates=[
                Aggregate(
                    quantity="quantity_one",
                    function="count",
                    impute_rules={"coltype": "aggregate", "all": {"type": "zero"}},
                )
            ],
            groups=["entity_id", "zip_code"],
            intervals=["all"],
            date_column="knowledge_date",
            output_date_column="as_of_date",
            dates=["2013-09-30", "2014-09-30"],
            state_table="states",
            state_group="entity_id",
            schema="features",
            from_obj="data",
        ),
    ]

    features_schema_name = "features"
    feature_generator = FeatureGenerator(
        db_engine=test_engine,
        features_schema_name=features_schema_name,
    )
    lookup = feature_generator.index_column_lookup(aggregations)
    assert lookup == {
        "prefix1_aggregation_imputed": ["as_of_date", "entity_id"],
        "prefix2_aggregation_imputed": ["as_of_date", "entity_id", "zip_code"],
    }
Example 10
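A single interval list is broadcast to every group: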
def test_st_execute_broadcast_intervals():
    agg = Aggregate("results='Fail'", ["count"], IMPUTE_RULES)
    st = SpacetimeAggregation([agg],
                              from_obj='food_inspections',
                              groups=['license_no', 'zip'],
                              intervals=["1 year", "2 years", "all"],
                              dates=['2016-08-30', '2015-11-06'],
                              state_table='inspection_states',
                              state_group='license_no',
                              date_column='"inspection_date"')
    with Postgresql() as postgresql:
        engine = create_engine(postgresql.url())
        st.execute(engine.connect())
Example 11
def test_st_lazy_execute():
    agg = Aggregate("results='Fail'", ["count"], IMPUTE_RULES)
    st = SpacetimeAggregation(
        [agg],
        from_obj="food_inspections",
        groups=["license_no", "zip"],
        intervals={"license_no": ["1 year", "2 years", "all"], "zip": ["1 year"]},
        dates=["2016-08-30", "2015-11-06"],
        state_table="inspection_states",
        state_group="license_no",
        date_column='"inspection_date"',
    )
    with Postgresql() as postgresql:
        engine = create_engine(postgresql.url())
        st.execute(engine.connect())
Example 12
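column_imputation_lookup resolves the imputation rule for each generated column: a function-specific rule (sum) takes precedence over the all fallback, and rules for functions that were not requested (max) are ignored: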
def test_aggregate_imputation_lookup_all():
    agg = Aggregate(
        "a",
        ["avg", "sum"],
        {
            "coltype": "aggregate",
            "all": {"type": "zero"},
            "sum": {"type": "constant", "value": 3},
            "max": {"type": "mean"},
        },
    )
    assert agg.column_imputation_lookup()["a_avg"]["type"] == "zero"
    assert agg.column_imputation_lookup()["a_avg"]["coltype"] == "aggregate"
    assert agg.column_imputation_lookup()["a_sum"]["type"] == "constant"
    assert agg.column_imputation_lookup()["a_sum"]["value"] == 3
    assert agg.column_imputation_lookup()["a_sum"]["coltype"] == "aggregate"
Example 13
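This helper, evidently a method of the FeatureGenerator used elsewhere in these examples, assembles a SpacetimeAggregation from one feature-config block; top-level imputation rules act as defaults that individual aggregates, categoricals, and array categoricals may override: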
    def _aggregation(self, aggregation_config, feature_dates, state_table):
        logging.info(
            "Building collate.SpacetimeAggregation for config %s and %s as_of_dates",
            aggregation_config,
            len(feature_dates),
        )

        # read top-level imputation rules from the aggregation config; we'll allow
        # these to be overridden by imputation rules at the individual feature
        # level as those get parsed as well
        agimp = aggregation_config.get("aggregates_imputation", {})
        catimp = aggregation_config.get("categoricals_imputation", {})
        arrcatimp = aggregation_config.get("array_categoricals_imputation", {})

        aggregates = [
            Aggregate(
                aggregate["quantity"],
                aggregate["metrics"],
                dict(agimp, coltype="aggregate", **aggregate.get("imputation", {})),
                coltype=aggregate.get('coltype', None)
            )
            for aggregate in aggregation_config.get("aggregates", [])
        ]
        logging.info("Found %s quantity aggregates", len(aggregates))
        categoricals = self._build_categoricals(
            aggregation_config.get("categoricals", []), catimp
        )
        logging.info("Found %s categorical aggregates", len(categoricals))
        array_categoricals = self._build_array_categoricals(
            aggregation_config.get("array_categoricals", []), arrcatimp
        )
        logging.info("Found %s array categorical aggregates", len(array_categoricals))
        return SpacetimeAggregation(
            aggregates + categoricals + array_categoricals,
            from_obj=aggregation_config["from_obj"],
            intervals=aggregation_config["intervals"],
            groups=aggregation_config["groups"],
            dates=feature_dates,
            state_table=state_table,
            state_group=self.entity_id_column,
            date_column=aggregation_config["knowledge_date_column"],
            output_date_column="as_of_date",
            input_min_date=self.feature_start_time,
            schema=self.features_schema_name,
            prefix=aggregation_config["prefix"],
        )
Example 14
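schema and output_date_column place the output tables in a non-default schema and rename the as-of-date column: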
def test_execute_schema_output_date_column():
    agg = Aggregate("results='Fail'", ["count"], IMPUTE_RULES)
    st = SpacetimeAggregation([agg],
                              from_obj='food_inspections',
                              groups=['license_no', 'zip'],
                              intervals={
                                  'license_no': ["1 year", "2 years", "all"],
                                  'zip': ["1 year"]
                              },
                              dates=['2016-08-30', '2015-11-06'],
                              state_table='inspection_states_diff_colname',
                              state_group='license_no',
                              schema="agg",
                              date_column='"inspection_date"',
                              output_date_column="aggregation_date")
    with Postgresql() as postgresql:
        engine = create_engine(postgresql.url())
        st.execute(engine.connect())
Example 15
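A variant of the _aggregation helper above, without the per-aggregate coltype pass-through: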
    def _aggregation(self, aggregation_config, feature_dates, state_table):
        logging.info(
            'Building collate.SpacetimeAggregation for config %s and as_of_dates %s',
            aggregation_config, feature_dates)

        # read top-level imputation rules from the aggregation config; we'll allow
        # these to be overridden by imputation rules at the individual feature
        # level as those get parsed as well
        agimp = aggregation_config.get('aggregates_imputation', {})
        catimp = aggregation_config.get('categoricals_imputation', {})
        arrcatimp = aggregation_config.get('array_categoricals_imputation', {})

        aggregates = [
            Aggregate(
                aggregate['quantity'], aggregate['metrics'],
                dict(agimp,
                     coltype='aggregate',
                     **aggregate.get('imputation', {})))
            for aggregate in aggregation_config.get('aggregates', [])
        ]
        logging.info('Found %s quantity aggregates', len(aggregates))
        categoricals = self._build_categoricals(
            aggregation_config.get('categoricals', []), catimp)
        logging.info('Found %s categorical aggregates', len(categoricals))
        array_categoricals = self._build_array_categoricals(
            aggregation_config.get('array_categoricals', []), arrcatimp)
        logging.info('Found %s array categorical aggregates',
                     len(array_categoricals))
        return SpacetimeAggregation(
            aggregates + categoricals + array_categoricals,
            from_obj=aggregation_config['from_obj'],
            intervals=aggregation_config['intervals'],
            groups=aggregation_config['groups'],
            dates=feature_dates,
            state_table=state_table,
            state_group=self.entity_id_column,
            date_column=aggregation_config['knowledge_date_column'],
            output_date_column='as_of_date',
            input_min_date=self.feature_start_time,
            schema=self.features_schema_name,
            prefix=aggregation_config['prefix'])
Example 16
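An end-to-end check: events are aggregated per entity over 1-year, 2-year, and unbounded windows at two as-of dates, producing a table named after the from_obj and group (events_entity_id); the imputed table adds _imp flag columns wherever a rule filled in a missing value: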
def test_basic_spacetime():
    with testing.postgresql.Postgresql() as psql:
        engine = sqlalchemy.create_engine(psql.url())
        engine.execute(
            "create table events (entity_id int, event_date date, outcome bool)"
        )
        for event in events_data:
            engine.execute("insert into events values (%s, %s, %s::bool)", event)

        engine.execute("create table states (entity_id int, as_of_date date)")
        for state in state_data:
            engine.execute("insert into states values (%s, %s)", state)

        agg = Aggregate(
            "outcome::int",
            ["sum", "avg"],
            {
                "coltype": "aggregate",
                "avg": {"type": "mean"},
                "sum": {"type": "constant", "value": 3},
                "max": {"type": "zero"},
            },
        )
        st = SpacetimeAggregation(
            aggregates=[agg],
            from_obj="events",
            groups=["entity_id"],
            intervals=["1y", "2y", "all"],
            dates=["2016-01-01", "2015-01-01"],
            state_table="states",
            state_group="entity_id",
            date_column="event_date",
            output_date_column="as_of_date",
        )

        st.execute(engine.connect())

        r = engine.execute(
            "select * from events_entity_id order by entity_id, as_of_date"
        )
        rows = [x for x in r]
        assert rows[0]["entity_id"] == 1
        assert rows[0]["as_of_date"] == date(2015, 1, 1)
        assert rows[0]["events_entity_id_1y_outcome::int_sum"] == 1
        assert rows[0]["events_entity_id_1y_outcome::int_avg"] == 0.5
        assert rows[0]["events_entity_id_2y_outcome::int_sum"] == 1
        assert rows[0]["events_entity_id_2y_outcome::int_avg"] == 0.5
        assert rows[0]["events_entity_id_all_outcome::int_sum"] == 1
        assert rows[0]["events_entity_id_all_outcome::int_avg"] == 0.5
        assert rows[1]["entity_id"] == 1
        assert rows[1]["as_of_date"] == date(2016, 1, 1)
        assert rows[1]["events_entity_id_1y_outcome::int_sum"] == 1
        assert rows[1]["events_entity_id_1y_outcome::int_avg"] == 0.5
        assert rows[1]["events_entity_id_2y_outcome::int_sum"] == 2
        assert rows[1]["events_entity_id_2y_outcome::int_avg"] == 0.5
        assert rows[1]["events_entity_id_all_outcome::int_sum"] == 2
        assert rows[1]["events_entity_id_all_outcome::int_avg"] == 0.5

        assert rows[2]["entity_id"] == 2
        assert rows[2]["as_of_date"] == date(2015, 1, 1)
        assert rows[2]["events_entity_id_1y_outcome::int_sum"] == 0
        assert rows[2]["events_entity_id_1y_outcome::int_avg"] == 0
        assert rows[2]["events_entity_id_2y_outcome::int_sum"] == 1
        assert rows[2]["events_entity_id_2y_outcome::int_avg"] == 0.5
        assert rows[2]["events_entity_id_all_outcome::int_sum"] == 1
        assert rows[2]["events_entity_id_all_outcome::int_avg"] == 0.5
        assert rows[3]["entity_id"] == 2
        assert rows[3]["as_of_date"] == date(2016, 1, 1)
        assert rows[3]["events_entity_id_1y_outcome::int_sum"] is None
        assert rows[3]["events_entity_id_1y_outcome::int_avg"] is None
        assert rows[3]["events_entity_id_2y_outcome::int_sum"] == 0
        assert rows[3]["events_entity_id_2y_outcome::int_avg"] == 0
        assert rows[3]["events_entity_id_all_outcome::int_sum"] == 1
        assert rows[3]["events_entity_id_all_outcome::int_avg"] == 0.5

        assert rows[4]["entity_id"] == 3
        assert rows[4]["as_of_date"] == date(2015, 1, 1)
        assert rows[4]["events_entity_id_1y_outcome::int_sum"] == 0
        assert rows[4]["events_entity_id_1y_outcome::int_avg"] == 0
        assert rows[4]["events_entity_id_2y_outcome::int_sum"] == 0
        assert rows[4]["events_entity_id_2y_outcome::int_avg"] == 0
        assert rows[4]["events_entity_id_all_outcome::int_sum"] == 0
        assert rows[4]["events_entity_id_all_outcome::int_avg"] == 0
        assert rows[5]["entity_id"] == 3
        assert rows[5]["as_of_date"] == date(2016, 1, 1)
        assert rows[5]["events_entity_id_1y_outcome::int_sum"] == 1
        assert rows[5]["events_entity_id_1y_outcome::int_avg"] == 0.5
        assert rows[5]["events_entity_id_2y_outcome::int_sum"] == 1
        assert rows[5]["events_entity_id_2y_outcome::int_avg"] == 0.25
        assert rows[5]["events_entity_id_all_outcome::int_sum"] == 1
        assert rows[5]["events_entity_id_all_outcome::int_avg"] == 0.25

        assert rows[6]["entity_id"] == 4
        # entity 4's row for as_of_date date(2015, 1, 1) is skipped due to no data!
        assert rows[6]["as_of_date"] == date(2016, 1, 1)
        assert rows[6]["events_entity_id_1y_outcome::int_sum"] == 0
        assert rows[6]["events_entity_id_1y_outcome::int_avg"] == 0
        assert rows[6]["events_entity_id_2y_outcome::int_sum"] == 0
        assert rows[6]["events_entity_id_2y_outcome::int_avg"] == 0
        assert rows[6]["events_entity_id_all_outcome::int_sum"] == 0
        assert rows[6]["events_entity_id_all_outcome::int_avg"] == 0
        assert len(rows) == 7

        # check some imputation results
        r = engine.execute(
            "select * from events_aggregation_imputed order by entity_id, as_of_date"
        )
        rows = [x for x in r]
        assert rows[6]["entity_id"] == 4
        assert rows[6]["as_of_date"] == date(2015, 1, 1)
        assert rows[6]["events_entity_id_1y_outcome::int_sum"] == 3
        assert rows[6]["events_entity_id_1y_outcome::int_sum_imp"] == 1
        assert (
            round(float(rows[6]["events_entity_id_1y_outcome::int_avg"]), 4) == 0.1667
        )
        assert rows[6]["events_entity_id_1y_outcome::int_avg_imp"] == 1
        assert rows[6]["events_entity_id_2y_outcome::int_sum"] == 3
        assert rows[6]["events_entity_id_2y_outcome::int_sum_imp"] == 1
        assert (
            round(float(rows[6]["events_entity_id_2y_outcome::int_avg"]), 4) == 0.3333
        )
        assert rows[6]["events_entity_id_2y_outcome::int_avg_imp"] == 1
        assert rows[6]["events_entity_id_all_outcome::int_sum"] == 3
        assert rows[6]["events_entity_id_all_outcome::int_sum_imp"] == 1
        assert (
            round(float(rows[6]["events_entity_id_all_outcome::int_avg"]), 4) == 0.3333
        )
        assert rows[6]["events_entity_id_all_outcome::int_avg_imp"] == 1
        assert rows[7]["entity_id"] == 4
        assert rows[7]["as_of_date"] == date(2016, 1, 1)
        assert rows[7]["events_entity_id_1y_outcome::int_sum"] == 0
        assert rows[7]["events_entity_id_1y_outcome::int_sum_imp"] == 0
        assert rows[7]["events_entity_id_1y_outcome::int_avg"] == 0
        assert rows[7]["events_entity_id_1y_outcome::int_avg_imp"] == 0
        assert rows[7]["events_entity_id_2y_outcome::int_sum"] == 0
        assert rows[7]["events_entity_id_2y_outcome::int_sum_imp"] == 0
        assert rows[7]["events_entity_id_2y_outcome::int_avg"] == 0
        assert rows[7]["events_entity_id_2y_outcome::int_avg_imp"] == 0
        assert rows[7]["events_entity_id_all_outcome::int_sum"] == 0
        assert rows[7]["events_entity_id_all_outcome::int_sum_imp"] == 0
        assert rows[7]["events_entity_id_all_outcome::int_avg"] == 0
        assert rows[7]["events_entity_id_all_outcome::int_avg_imp"] == 0
        assert len(rows) == 8
Example 17
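input_min_date bounds the input data. The second aggregation below fails validation, apparently because its 1-year window ending 2015-01-01 would reach back before the stated minimum: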
def test_input_min_date():
    with testing.postgresql.Postgresql() as psql:
        engine = sqlalchemy.create_engine(psql.url())
        engine.execute("create table events (entity_id int, date date, outcome bool)")
        for event in events_data:
            engine.execute("insert into events values (%s, %s, %s::bool)", event)

        engine.execute("create table states (entity_id int, date date)")
        for state in state_data:
            engine.execute("insert into states values (%s, %s)", state)

        agg = Aggregate(
            "outcome::int",
            ["sum", "avg"],
            {
                "coltype": "aggregate",
                "avg": {"type": "mean"},
                "sum": {"type": "constant", "value": 3},
                "max": {"type": "zero"},
            },
        )
        st = SpacetimeAggregation(
            aggregates=[agg],
            from_obj="events",
            groups=["entity_id"],
            intervals=["all"],
            dates=["2016-01-01"],
            state_table="states",
            state_group="entity_id",
            date_column='"date"',
            input_min_date="2015-11-10",
        )

        st.execute(engine.connect())

        r = engine.execute("select * from events_entity_id order by entity_id")
        rows = [x for x in r]

        assert rows[0]["entity_id"] == 1
        assert rows[0]["date"] == date(2016, 1, 1)
        assert rows[0]["events_entity_id_all_outcome::int_sum"] == 1
        assert rows[0]["events_entity_id_all_outcome::int_avg"] == 1
        assert rows[1]["entity_id"] == 4
        assert rows[1]["date"] == date(2016, 1, 1)
        assert rows[1]["events_entity_id_all_outcome::int_sum"] == 0
        assert rows[1]["events_entity_id_all_outcome::int_avg"] == 0

        assert len(rows) == 2

        st = SpacetimeAggregation(
            aggregates=[agg],
            from_obj="events",
            groups=["entity_id"],
            intervals=["1y", "all"],
            dates=["2016-01-01", "2015-01-01"],
            state_table="states",
            state_group="entity_id",
            date_column='"date"',
            input_min_date="2014-11-10",
        )
        with pytest.raises(ValueError):
            st.validate(engine.connect())
        with pytest.raises(ValueError):
            st.execute(engine.connect())
Example 18
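With join_with_cohort_table=True, output rows are restricted to the entity/date pairs present in the cohort table, so only entities 1 and 2 appear: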
def test_join_with_cohort_table(db_engine):
    # if we specify joining with the cohort table
    # only entity_id/date pairs in the cohort table should show up
    db_engine.execute("create table events (entity_id int, date date, outcome bool)")
    for event in events_data:
        db_engine.execute("insert into events values (%s, %s, %s::bool)", event)

    db_engine.execute("create table cohort (entity_id int, date date)")

    # use the states list from above, but include only entities 1 and 2 in the cohort
    smaller_cohort = sorted(
        product(
            set([l[0] for l in events_data if l[0] == 1 or l[0] == 2]),
            set([l[1] for l in events_data] + [date(2016, 1, 1)]),
        )
    )
    for state in smaller_cohort:
        db_engine.execute("insert into cohort values (%s, %s)", state)

    # create our test aggregation with the important 'join_with_cohort_table' flag
    agg = Aggregate(
        "outcome::int",
        ["sum", "avg"],
        {
            "coltype": "aggregate",
            "avg": {"type": "mean"},
            "sum": {"type": "constant", "value": 3},
            "max": {"type": "zero"},
        },
    )
    st = SpacetimeAggregation(
        aggregates=[agg],
        from_obj="events",
        groups=["entity_id"],
        intervals=["all"],
        dates=["2016-01-01", "2015-01-01"],
        state_table="cohort",
        state_group="entity_id",
        date_column='"date"',
        join_with_cohort_table=True,
    )

    st.execute(db_engine.connect())

    r = db_engine.execute("select * from events_entity_id order by entity_id, date")
    rows = [x for x in r]

    # these rows should be similar to the rows in the basic spacetime test,
    # except only the rows for entities 1 and 2 are present
    assert len(rows) == 4

    assert rows[0]["entity_id"] == 1
    assert rows[0]["date"] == date(2015, 1, 1)
    assert rows[0]["events_entity_id_all_outcome::int_sum"] == 1
    assert rows[0]["events_entity_id_all_outcome::int_avg"] == 0.5
    assert rows[1]["entity_id"] == 1
    assert rows[1]["date"] == date(2016, 1, 1)
    assert rows[1]["events_entity_id_all_outcome::int_sum"] == 2
    assert rows[1]["events_entity_id_all_outcome::int_avg"] == 0.5

    assert rows[2]["entity_id"] == 2
    assert rows[2]["date"] == date(2015, 1, 1)
    assert rows[2]["events_entity_id_all_outcome::int_sum"] == 1
    assert rows[2]["events_entity_id_all_outcome::int_avg"] == 0.5
    assert rows[3]["entity_id"] == 2
    assert rows[3]["date"] == date(2016, 1, 1)
    assert rows[3]["events_entity_id_all_outcome::int_sum"] == 1
    assert rows[3]["events_entity_id_all_outcome::int_avg"] == 0.5
Example 19
def test_aggregate():
    agg = Aggregate("*", "count", {})
    assert list(map(str, agg.get_columns())) == ["count(*)"]
Example 20
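A tuple quantity is rendered as multiple arguments to the aggregation function: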
def test_aggregate_tuple_quantity_when():
    agg = Aggregate(("x", "y"), "corr", {})
    assert list(map(str, agg.get_columns(when="date < '2012-01-01'"))) == [
        "corr(x, y) FILTER (WHERE date < '2012-01-01')"
    ]
Example 21
def test_aggregate_tuple_quantity():
    agg = Aggregate(("x", "y"), "corr", {})
    assert list(map(str, agg.get_columns())) == ["corr(x, y)"]
Example 22
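For ordered-set aggregates such as mode, the order argument becomes a WITHIN GROUP (ORDER BY ...) clause, with any when filter appended after it: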
def test_ordered_aggregate_when():
    agg = Aggregate("", "mode", {}, "x")
    assert list(map(str, agg.get_columns(when="date < '2012-01-01'"))) == [
        "mode() WITHIN GROUP (ORDER BY x) FILTER (WHERE date < '2012-01-01')"
    ]
Example 23
def test_ordered_aggregate():
    agg = Aggregate("", "mode", {}, "x")
    (expression,) = agg.get_columns()
    assert str(expression) == "mode() WITHIN GROUP (ORDER BY x)"
    assert expression.name == "x_mode"
Example 24
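This test drives every available imputation rule except error through find_nulls, get_drop, and get_impute_create, then verifies that the imputed table is null-free and that imputation-flag columns appear only where a non-categorical value was actually imputed (zero_noflag deliberately emits no flag):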
def test_imputation_output(feat_list, exp_imp_cols, feat_table):
    with testing.postgresql.Postgresql() as psql:
        engine = sqlalchemy.create_engine(psql.url())

        engine.execute("create table states (entity_id int, as_of_date date)")
        for state in states_table:
            engine.execute("insert into states values (%s, %s)", state)

        feat_sql = "\n".join(
            [", prefix_entity_id_1y_%s_max int" % f for f in feat_list])
        engine.execute("""create table prefix_aggregation (
                entity_id int
                , as_of_date date
                %s
                )""" % feat_sql)
        ins_sql = ("insert into prefix_aggregation values (%s, %s" +
                   (", %s" * len(feat_list)) + ")")
        for rec in feat_table:
            engine.execute(ins_sql, rec)

        for imp in available_imputations.keys():
            # skip error imputation
            if imp == "error":
                continue

            for coltype in ["aggregate", "categorical"]:
                # only consider column types this imputation rule supports
                if not imputation_values[imp][coltype]["avail"]:
                    continue

                impargs = imputation_values[imp][coltype]["kwargs"]
                aggs = [
                    Aggregate(
                        feat,
                        ["max"],
                        {
                            "coltype": coltype,
                            "all": dict(type=imp, **impargs)
                        },
                    ) for feat in feat_list
                ]
                st = SpacetimeAggregation(
                    aggregates=aggs,
                    from_obj="prefix_events",
                    prefix="prefix",
                    groups=["entity_id"],
                    intervals=["1y"],
                    dates=["2016-01-01", "2016-02-03", "2016-03-14"],
                    state_table="states",
                    state_group="entity_id",
                    date_column="as_of_date",
                    input_min_date="2000-01-01",
                    output_date_column="as_of_date",
                )

                conn = engine.connect()

                trans = conn.begin()

                # execute a query to find columns with null values and create lists of columns
                # that do and do not need imputation when creating the imputation table
                res = conn.execute(st.find_nulls())
                null_counts = list(zip(res.keys(), res.fetchone()))
                impute_cols = [col for col, val in null_counts if val > 0]
                nonimpute_cols = [col for col, val in null_counts if val == 0]

                # sql to drop and create the imputation table
                drop_imp = st.get_drop(imputed=True)
                create_imp = st.get_impute_create(
                    impute_cols=impute_cols, nonimpute_cols=nonimpute_cols)

                # create the imputation table
                conn.execute(drop_imp)
                conn.execute(create_imp)

                trans.commit()

                # check the results
                df = pd.read_sql("SELECT * FROM prefix_aggregation_imputed",
                                 engine)

                # we should have a record for every entity/date combo
                assert df.shape[0] == len(states_table)

                for feat in feat_list:
                    # all of the input columns should be in the result and be null-free
                    assert "prefix_entity_id_1y_%s_max" % feat in df.columns.values
                    assert df["prefix_entity_id_1y_%s_max" %
                              feat].isnull().sum() == 0

                    # for non-categoricals, should add an "imputed" column and be non-null
                    # (categoricals are expected to be handled through the null category)
                    # zero_noflag imputation should not generate a flag either
                    if (feat in exp_imp_cols and coltype != "categorical"
                            and imp != "zero_noflag"):
                        assert ("prefix_entity_id_1y_%s_imp" % feat
                                in df.columns.values)
                        assert (df["prefix_entity_id_1y_%s_imp" %
                                   feat].isnull().sum() == 0)
                    else:
                        # should not generate an imputed column when not needed
                        assert ("prefix_entity_id_1y_%s_imp" % feat
                                not in df.columns.values)
Example 25
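A variant of the same test in which the imputation flag keeps the full column name, function suffix included (_max_imp rather than _imp):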
def test_imputation_output(feat_list, exp_imp_cols, feat_table):
    with testing.postgresql.Postgresql() as psql:
        engine = sqlalchemy.create_engine(psql.url())

        engine.execute('create table states (entity_id int, as_of_date date)')
        for state in states_table:
            engine.execute('insert into states values (%s, %s)', state)

        feat_sql = '\n'.join(
            [', prefix_entity_id_1y_%s_max int' % f for f in feat_list])
        engine.execute('''create table prefix_aggregation (
                entity_id int
                , as_of_date date
                %s
                )''' % feat_sql)
        ins_sql = ('insert into prefix_aggregation values (%s, %s' +
                   (', %s' * len(feat_list)) + ')')
        for rec in feat_table:
            engine.execute(ins_sql, rec)

        for imp in available_imputations.keys():
            # skip error imputation
            if imp == 'error':
                continue

            for coltype in ['aggregate', 'categorical']:
                # only consider column types this imputation rule supports
                if not imputation_values[imp][coltype]['avail']:
                    continue

                impargs = imputation_values[imp][coltype]['kwargs']
                aggs = [
                    Aggregate(feat, ['max'], {
                        'coltype': coltype,
                        'all': dict(type=imp, **impargs)
                    }) for feat in feat_list
                ]
                st = SpacetimeAggregation(
                    aggregates=aggs,
                    from_obj='prefix_events',
                    prefix='prefix',
                    groups=['entity_id'],
                    intervals=['1y'],
                    dates=['2016-01-01', '2016-02-03', '2016-03-14'],
                    state_table='states',
                    state_group='entity_id',
                    date_column='as_of_date',
                    input_min_date='2000-01-01',
                    output_date_column='as_of_date')

                conn = engine.connect()

                trans = conn.begin()

                # execute a query to find columns with null values and create lists of columns
                # that do and do not need imputation when creating the imputation table
                res = conn.execute(st.find_nulls())
                null_counts = list(zip(res.keys(), res.fetchone()))
                impute_cols = [col for col, val in null_counts if val > 0]
                nonimpute_cols = [col for col, val in null_counts if val == 0]

                # sql to drop and create the imputation table
                drop_imp = st.get_drop(imputed=True)
                create_imp = st.get_impute_create(
                    impute_cols=impute_cols, nonimpute_cols=nonimpute_cols)

                # create the imputation table
                conn.execute(drop_imp)
                conn.execute(create_imp)

                trans.commit()

                # check the results
                df = pd.read_sql('SELECT * FROM prefix_aggregation_imputed',
                                 engine)

                # we should have a record for every entity/date combo
                assert df.shape[0] == len(states_table)

                for feat in feat_list:
                    # all of the input columns should be in the result and be null-free
                    assert 'prefix_entity_id_1y_%s_max' % feat in df.columns.values
                    assert df['prefix_entity_id_1y_%s_max' %
                              feat].isnull().sum() == 0

                    # for non-categoricals, should add an "imputed" column and be non-null
                    # (categoricals are expected to be handled through the null category)
                    # zero_noflag imputation should not generate a flag either
                    if feat in exp_imp_cols and coltype != 'categorical' and imp != 'zero_noflag':
                        assert 'prefix_entity_id_1y_%s_max_imp' % feat in df.columns.values
                        assert df['prefix_entity_id_1y_%s_max_imp' %
                                  feat].isnull().sum() == 0
                    else:
                        # should not generate an imputed column when not needed
                        assert 'prefix_entity_id_1y_%s_max_imp' % feat not in df.columns.values
Example 26
def test_aggregate_when():
    agg = Aggregate("1", "count", {})
    assert list(map(str, agg.get_columns(when="date < '2012-01-01'"))) == [
        "count(1) FILTER (WHERE date < '2012-01-01')"
    ]
Example 27
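FeatureGenerator turns each aggregation into a per-table task dict with prepare (drop/create), inserts, and finalize (index) steps, for both the aggregation and the imputation pass: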
def test_generate_table_tasks(test_engine):
    test_engine.execute('create schema features')
    aggregations = [
        SpacetimeAggregation(
            prefix="prefix1",
            aggregates=[
                Categorical(
                    col="cat_one",
                    function="sum",
                    choices=["good", "bad", "inbetween"],
                    impute_rules={"coltype": "categorical", "all": {"type": "zero"}},
                )
            ],
            groups=["entity_id"],
            intervals=["all"],
            date_column="knowledge_date",
            output_date_column="as_of_date",
            dates=["2013-09-30", "2014-09-30"],
            state_table="states",
            state_group="entity_id",
            schema="features",
            from_obj="data",
        ),
        SpacetimeAggregation(
            prefix="prefix2",
            aggregates=[
                Aggregate(
                    quantity="quantity_one",
                    function="count",
                    impute_rules={"coltype": "aggregate", "all": {"type": "zero"}},
                )
            ],
            groups=["entity_id"],
            intervals=["all"],
            date_column="knowledge_date",
            output_date_column="as_of_date",
            dates=["2013-09-30", "2014-09-30"],
            state_table="states",
            state_group="entity_id",
            schema="features",
            from_obj="data",
        ),
    ]
    features_schema_name = "features"

    table_tasks = FeatureGenerator(
        db_engine=test_engine,
        features_schema_name=features_schema_name,
    ).generate_all_table_tasks(aggregations, task_type="aggregation")
    for table_name, task in table_tasks.items():
        assert "DROP TABLE" in task["prepare"][0]
        assert "CREATE TABLE" in str(task["prepare"][1])
        assert "CREATE INDEX" in task["finalize"][0]
        assert isinstance(task["inserts"], list)

    # build the aggregation tables to check the imputation tasks
    FeatureGenerator(
        db_engine=test_engine,
        features_schema_name=features_schema_name,
    ).process_table_tasks(table_tasks)

    table_tasks = FeatureGenerator(
        db_engine=test_engine,
        features_schema_name=features_schema_name,
    ).generate_all_table_tasks(aggregations, task_type="imputation")

    for table_name, task in table_tasks.items():
        assert "DROP TABLE" in task["prepare"][0]
        assert "CREATE TABLE" in str(task["prepare"][1])
        assert "CREATE INDEX" in task["finalize"][0]
        assert isinstance(task["inserts"], list)
Example 28
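A coltype passed to Aggregate is appended to the generated column as a SQL cast: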
def test_aggregate_when_cast():
    agg = Aggregate("", "mode", {}, "x", coltype="SMALLINT")
    assert list(map(str, agg.get_columns(when="date < '2012-01-01'"))) == [
        "mode() WITHIN GROUP (ORDER BY x) FILTER (WHERE date < '2012-01-01')::SMALLINT"
    ]
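
Taken together, the examples above suggest the following minimal workflow. This is a sketch, not a verified recipe: the import paths assume the standalone collate package (inside Triage the same classes live under triage.component.collate), and every table and column name is illustrative.

import sqlalchemy
import testing.postgresql
from collate.collate import Aggregate
from collate.spacetime import SpacetimeAggregation

# count of failed inspections, imputed to zero where a group has no data
fail_count = Aggregate(
    "results='Fail'",                 # SQL expression to aggregate
    ["count"],                        # one or more aggregation functions
    {"coltype": "aggregate", "all": {"type": "zero"}},
)

st = SpacetimeAggregation(
    aggregates=[fail_count],
    from_obj="food_inspections",      # source table or SQLAlchemy selectable
    groups=["license_no"],            # one output table per group
    intervals=["1 year", "all"],      # look-back windows per as-of date
    dates=["2016-08-30"],             # as-of dates
    state_table="inspection_states",  # cohort of entity/date pairs to cover
    state_group="license_no",
    date_column="inspection_date",
)

with testing.postgresql.Postgresql() as psql:
    engine = sqlalchemy.create_engine(psql.url())
    # ... create and populate food_inspections and inspection_states here ...
    st.execute(engine.connect())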