Ejemplo n.º 1
0
def test_categorical_cast():
    """Columns of a Categorical built with ``coltype="SMALLINT"`` carry a SMALLINT cast."""
    categorical = Categorical("c", ['A', 'B', 'C'], "sum", {}, coltype="SMALLINT")
    # Render every generated column to its string form before comparing.
    rendered = [str(column) for column in categorical.get_columns()]
    expected = [
        "sum((c = 'A')::INT)::SMALLINT",
        "sum((c = 'B')::INT)::SMALLINT",
        "sum((c = 'C')::INT)::SMALLINT",
    ]
    assert rendered == expected
Ejemplo n.º 2
0
def test_categorical_nones():
    """Check that Categorical treats a ``None`` choice like Compare with ``include_null=True``.

    Two ways of declaring the choices are exercised: a name -> value dict
    with a ``'_NULL': None`` entry, and a plain list that contains ``None``.
    """
    d1 = Categorical('col', {
        'vala': 'a',
        'valb': 'b',
        'valc': 'c',
        '_NULL': None
    }, [], {}).quantities
    d2 = Compare('col',
                 '=', {
                     'vala': 'a',
                     'valb': 'b',
                     'valc': 'c'
                 }, [], {},
                 op_in_name=False,
                 include_null=True).quantities
    assert d1 == d2
    d3 = Categorical('col', ['a', 'b', 'c', None], [], {}).quantities
    # Only the values are compared for the list form. This assertion used to
    # re-compare d1 with d2, which left d3 completely unchecked.
    assert sorted(d1.values()) == sorted(d3.values())
Ejemplo n.º 3
0
def test_categorical_same_as_compare():
    """Categorical quantities should line up with an ``=`` Compare over the same choices."""
    choices = {"vala": "a", "valb": "b", "valc": "c"}

    # Every constructor gets its own copy of the mapping.
    categorical_default = Categorical("col", dict(choices), [], {}).quantities
    compare_equal = Compare("col", "=", dict(choices), [], {}).quantities
    # Without op_in_name only the values are expected to agree.
    assert sorted(categorical_default.values()) == sorted(compare_equal.values())

    # With op_in_name=True the full mapping must equal the Compare one.
    categorical_named = Categorical(
        "col", dict(choices), [], {}, op_in_name=True
    ).quantities
    assert compare_equal == categorical_named
Ejemplo n.º 4
0
def test_categorical_same_as_compare():
    """Compare with ``'='`` and Categorical must describe the same quantities."""

    def fresh_choices():
        # A new dict per call so no constructor shares state with another.
        return {'vala': 'a', 'valb': 'b', 'valc': 'c'}

    plain = Categorical('col', fresh_choices(), [], {}).quantities
    reference = Compare('col', '=', fresh_choices(), [], {}).quantities
    # Default Categorical: compare values only.
    assert sorted(plain.values()) == sorted(reference.values())

    # op_in_name=True: the whole quantities mapping has to match.
    with_op_name = Categorical('col', fresh_choices(), [], {},
                               op_in_name=True).quantities
    assert reference == with_op_name
Ejemplo n.º 5
0
def test_index_column_lookup(test_engine):
    """index_column_lookup should list, per imputed table, the date column then the groups."""

    def build_aggregation(prefix, aggregates, groups):
        # Only prefix, aggregates and groups differ between the two aggregations.
        return SpacetimeAggregation(
            prefix=prefix,
            aggregates=aggregates,
            groups=groups,
            intervals=["all"],
            date_column="knowledge_date",
            output_date_column="as_of_date",
            dates=["2013-09-30", "2014-09-30"],
            state_table="states",
            state_group="entity_id",
            schema="features",
            from_obj="data",
        )

    aggregations = [
        build_aggregation(
            "prefix1",
            [
                Categorical(
                    col="cat_one",
                    function="sum",
                    choices=["good", "bad", "inbetween"],
                    impute_rules={"coltype": "categorical", "all": {"type": "zero"}},
                )
            ],
            ["entity_id"],
        ),
        build_aggregation(
            "prefix2",
            [
                Aggregate(
                    quantity="quantity_one",
                    function="count",
                    impute_rules={"coltype": "aggregate", "all": {"type": "zero"}},
                )
            ],
            ["entity_id", "zip_code"],
        ),
    ]

    generator = FeatureGenerator(
        db_engine=test_engine,
        features_schema_name="features",
    )
    lookup = generator.index_column_lookup(aggregations)

    expected = {
        "prefix1_aggregation_imputed": ["as_of_date", "entity_id"],
        "prefix2_aggregation_imputed": ["as_of_date", "entity_id", "zip_code"],
    }
    assert lookup == expected
Ejemplo n.º 6
0
 def _build_categoricals(self, categorical_config, impute_rules):
     # TODO: only include null flag where necessary
     return [
         Categorical(col=categorical['column'],
                     choices=self._build_choices(categorical),
                     function=categorical['metrics'],
                     impute_rules=dict(impute_rules,
                                       coltype='categorical',
                                       **categorical.get('imputation', {})),
                     include_null=True)
         for categorical in categorical_config
     ]
Ejemplo n.º 7
0
 def _build_categoricals(self, categorical_config, impute_rules):
     # TODO: only include null flag where necessary
     return [
         Categorical(
             col=categorical["column"],
             choices=self._build_choices(categorical),
             function=categorical["metrics"],
             impute_rules=dict(impute_rules,
                               coltype="categorical",
                               **categorical.get("imputation", {})),
             include_null=True,
         ) for categorical in categorical_config
     ]
Ejemplo n.º 8
0
def test_categorical_nones():
    """A ``None`` choice in Categorical should behave like Compare with ``include_null=True``."""
    named_choices = {"vala": "a", "valb": "b", "valc": "c"}

    # Dict form: the null bucket is declared explicitly under "_NULL".
    from_dict = Categorical(
        "col", dict(named_choices, _NULL=None), [], {}
    ).quantities
    from_compare = Compare(
        "col", "=", dict(named_choices), [], {},
        op_in_name=False, include_null=True,
    ).quantities
    assert from_dict == from_compare

    # List form: only the values are compared against the dict form.
    from_list = Categorical("col", ["a", "b", "c", None], [], {}).quantities
    assert sorted(from_dict.values()) == sorted(from_list.values())
Ejemplo n.º 9
0
def test_generate_table_tasks(test_engine):
    """Aggregation and imputation table tasks should each expose prepare/inserts/finalize steps."""
    test_engine.execute('create schema features')

    def build_aggregation(prefix, aggregates):
        # Both aggregations share everything except their prefix and aggregates.
        return SpacetimeAggregation(
            prefix=prefix,
            aggregates=aggregates,
            groups=["entity_id"],
            intervals=["all"],
            date_column="knowledge_date",
            output_date_column="as_of_date",
            dates=["2013-09-30", "2014-09-30"],
            state_table="states",
            state_group="entity_id",
            schema="features",
            from_obj="data",
        )

    aggregations = [
        build_aggregation(
            "prefix1",
            [
                Categorical(
                    col="cat_one",
                    function="sum",
                    choices=["good", "bad", "inbetween"],
                    impute_rules={"coltype": "categorical", "all": {"type": "zero"}},
                )
            ],
        ),
        build_aggregation(
            "prefix2",
            [
                Aggregate(
                    quantity="quantity_one",
                    function="count",
                    impute_rules={"coltype": "aggregate", "all": {"type": "zero"}},
                )
            ],
        ),
    ]
    features_schema_name = "features"

    def new_generator():
        # A fresh generator is used for every step.
        return FeatureGenerator(
            db_engine=test_engine,
            features_schema_name=features_schema_name,
        )

    def check_tasks(tasks):
        # Every task: drop + create in prepare, an index in finalize, a list of inserts.
        for task in tasks.values():
            assert "DROP TABLE" in task["prepare"][0]
            assert "CREATE TABLE" in str(task["prepare"][1])
            assert "CREATE INDEX" in task["finalize"][0]
            assert isinstance(task["inserts"], list)

    aggregation_tasks = new_generator().generate_all_table_tasks(
        aggregations, task_type="aggregation"
    )
    check_tasks(aggregation_tasks)

    # build the aggregation tables to check the imputation tasks
    new_generator().process_table_tasks(aggregation_tasks)

    imputation_tasks = new_generator().generate_all_table_tasks(
        aggregations, task_type="imputation"
    )
    check_tasks(imputation_tasks)
Ejemplo n.º 10
0
def test_generate_table_tasks():
    """Table tasks for both task types should contain drop/create, inserts and an index step.

    Runs against a ``testing.postgresql.Postgresql()`` instance prepared
    with ``setup_db``.
    """

    def build_aggregation(prefix, aggregates):
        # Settings common to both aggregations live here.
        return SpacetimeAggregation(
            prefix=prefix,
            aggregates=aggregates,
            groups=['entity_id'],
            intervals=['all'],
            date_column='knowledge_date',
            output_date_column='as_of_date',
            dates=['2013-09-30', '2014-09-30'],
            state_table='states',
            state_group='entity_id',
            schema='features',
            from_obj='data',
        )

    aggregations = [
        build_aggregation('prefix1', [
            Categorical(
                col='cat_one',
                function='sum',
                choices=['good', 'bad', 'inbetween'],
                impute_rules={'coltype': 'categorical', 'all': {'type': 'zero'}},
            )
        ]),
        build_aggregation('prefix2', [
            Aggregate(
                quantity='quantity_one',
                function='count',
                impute_rules={'coltype': 'aggregate', 'all': {'type': 'zero'}},
            )
        ]),
    ]

    def check_tasks(tasks):
        # Shape check shared by the aggregation and imputation passes.
        for task in tasks.values():
            assert 'DROP TABLE' in task['prepare'][0]
            assert 'CREATE TABLE' in str(task['prepare'][1])
            assert 'CREATE INDEX' in task['finalize'][0]
            assert isinstance(task['inserts'], list)

    with testing.postgresql.Postgresql() as postgresql:
        engine = create_engine(postgresql.url())
        setup_db(engine)

        features_schema_name = 'features'

        def new_generator():
            return FeatureGenerator(db_engine=engine,
                                    features_schema_name=features_schema_name)

        aggregation_tasks = new_generator().generate_all_table_tasks(
            aggregations, task_type='aggregation')
        check_tasks(aggregation_tasks)

        # build the aggregation tables to check the imputation tasks
        new_generator().process_table_tasks(aggregation_tasks)

        imputation_tasks = new_generator().generate_all_table_tasks(
            aggregations, task_type='imputation')
        check_tasks(imputation_tasks)
Ejemplo n.º 11
0
def test_index_column_lookup():
    """index_column_lookup should map each imputed table name to its index columns.

    Runs against a ``testing.postgresql.Postgresql()`` instance prepared
    with ``setup_db``.
    """

    def build_aggregation(prefix, aggregates, groups):
        # The two aggregations differ only in prefix, aggregates and groups.
        return SpacetimeAggregation(
            prefix=prefix,
            aggregates=aggregates,
            groups=groups,
            intervals=['all'],
            date_column='knowledge_date',
            output_date_column='as_of_date',
            dates=['2013-09-30', '2014-09-30'],
            state_table='states',
            state_group='entity_id',
            schema='features',
            from_obj='data',
        )

    aggregations = [
        build_aggregation(
            'prefix1',
            [
                Categorical(
                    col='cat_one',
                    function='sum',
                    choices=['good', 'bad', 'inbetween'],
                    impute_rules={'coltype': 'categorical', 'all': {'type': 'zero'}},
                )
            ],
            ['entity_id'],
        ),
        build_aggregation(
            'prefix2',
            [
                Aggregate(
                    quantity='quantity_one',
                    function='count',
                    impute_rules={'coltype': 'aggregate', 'all': {'type': 'zero'}},
                )
            ],
            ['entity_id', 'zip_code'],
        ),
    ]

    expected = {
        'prefix1_aggregation_imputed': ['as_of_date', 'entity_id'],
        'prefix2_aggregation_imputed': ['as_of_date', 'entity_id', 'zip_code'],
    }

    with testing.postgresql.Postgresql() as postgresql:
        engine = create_engine(postgresql.url())
        setup_db(engine)

        generator = FeatureGenerator(db_engine=engine,
                                     features_schema_name='features')
        lookup = generator.index_column_lookup(aggregations)
        assert lookup == expected