    def test_creating_transaction(self):

        with testing.postgresql.Postgresql() as postgresql:
            print("url={}".format(postgresql.url()))
            print("data directory={}".format(postgresql.get_data_directory()))

            engine = create_engine(postgresql.url())
            Session = sessionmaker(bind=engine)

            # Create schema
            Base.metadata.create_all(engine)

            # Create session
            session = Session()
            print("session started")

            # Add transactions
            for sub_id in range(20):
                session.add(build_message(submission_id=sub_id, collection_data={'json_id': sub_id, 'two': 2}))
            session.commit()
            print("added transaction")

            # test standard query
            all_transactions = session.query(Transaction).filter(Transaction.status == TransactionStatus.ready).all()
            self.assertEqual(20, len(all_transactions))

            # test json field query
            json_query = session.query(Transaction).filter(Transaction.collection_data['json_id'].astext.cast(Integer) >= 10).all()
            self.assertEqual(10, len(json_query))

            session.close()
def test_training_label_generation():
    with testing.postgresql.Postgresql() as postgresql:
        engine = create_engine(postgresql.url())
        create_events(engine)

        labels_table_name = 'labels'

        label_generator = BinaryLabelGenerator(
            events_table='events',
            db_engine=engine,
        )
        label_generator._create_labels_table(labels_table_name)
        label_generator.generate(
            start_date='2014-09-30',
            label_window='6month',
            labels_table=labels_table_name
        )

        result = engine.execute(
            'select * from {} order by entity_id, as_of_date'.format(labels_table_name)
        )
        records = [row for row in result]

        expected = [
            # entity_id, as_of_date, label_window, name, type, label
            (1, date(2014, 9, 30), timedelta(180), 'outcome', 'binary', False),
            (3, date(2014, 9, 30), timedelta(180), 'outcome', 'binary', True),
            (4, date(2014, 9, 30), timedelta(180), 'outcome', 'binary', False),
        ]

        assert records == expected
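# Presumably used as a pytest fixture (the decorator is not shown in this snippet): it yields a
# SQLAlchemy engine bound to a throwaway PostgreSQL server and disposes of it on teardown.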
def pg_engine():
    with testing.postgresql.Postgresql() as postgresql:
        engine = sa.create_engine(postgresql.url())
        models.Base.metadata.create_all(bind=engine)

        yield engine

        engine.dispose()
    def test_train_matrix(self):
        with testing.postgresql.Postgresql() as postgresql:
            # create an engine and generate a table with fake feature data
            engine = create_engine(postgresql.url())
            create_schemas(
                engine=engine,
                features_tables=features_tables,
                labels=labels,
                states=states
            )

            dates = [datetime.datetime(2016, 1, 1, 0, 0),
                     datetime.datetime(2016, 2, 1, 0, 0),
                     datetime.datetime(2016, 3, 1, 0, 0)]

            with TemporaryDirectory() as temp_dir:
                planner = Planner(
                    beginning_of_time = datetime.datetime(2010, 1, 1, 0, 0),
                    label_names = ['booking'],
                    label_types = ['binary'],
                    states = ['state_one AND state_two'],
                    db_config = db_config,
                    matrix_directory = temp_dir,
                    user_metadata = {},
                    engine = engine
                )
                feature_dictionary = {
                    'features0': ['f1', 'f2'],
                    'features1': ['f3', 'f4'],
                }
                matrix_metadata = {
                    'matrix_id': 'hi',
                    'state': 'state_one AND state_two',
                    'label_name': 'booking',
                    'end_time': datetime.datetime(2016, 3, 1, 0, 0),
                    'beginning_of_time': datetime.datetime(2016, 1, 1, 0, 0),
                    'label_window': '1 month'
                }
                uuid = metta.generate_uuid(matrix_metadata)
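                # the UUID is derived from the matrix metadata and is reused below as the matrix CSV filename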
                planner.build_matrix(
                    as_of_times = dates,
                    label_name = 'booking',
                    label_type = 'binary',
                    feature_dictionary = feature_dictionary,
                    matrix_directory = temp_dir,
                    matrix_metadata = matrix_metadata,
                    matrix_uuid = uuid,
                    matrix_type = 'train'
                )

                matrix_filename = os.path.join(
                    temp_dir,
                    '{}.csv'.format(uuid)
                )
                with open(matrix_filename, 'r') as f:
                    reader = csv.reader(f)
                    assert(len([row for row in reader]) == 6)
def test_replace():
    aggregate_config = [{
        'prefix': 'aprefix',
        'aggregates': [
            {'quantity': 'quantity_one', 'metrics': ['sum', 'count']},
        ],
        'categoricals': [
            {
                'column': 'cat_one',
                'choices': ['good', 'bad'],
                'metrics': ['sum']
            },
        ],
        'groups': ['entity_id'],
        'intervals': ['all'],
        'knowledge_date_column': 'knowledge_date',
        'from_obj': 'data'
    }]

    with testing.postgresql.Postgresql() as postgresql:
        engine = create_engine(postgresql.url())
        setup_db(engine)

        features_schema_name = 'features'
        feature_tables = FeatureGenerator(
            db_engine=engine,
            features_schema_name=features_schema_name,
            replace=False
        ).create_all_tables(
            feature_dates=['2013-09-30', '2014-09-30'],
            feature_aggregation_config=aggregate_config,
        )

        assert len(feature_tables) == 1
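        # A second FeatureGenerator run with replace=False (below) should detect the
        # existing tables and produce no new insert tasks for them.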

        table_tasks = FeatureGenerator(
            db_engine=engine,
            features_schema_name=features_schema_name,
            replace=False
        ).generate_all_table_tasks(
            feature_dates=['2013-09-30', '2014-09-30'],
            feature_aggregation_config=aggregate_config,
        )

        assert len(table_tasks['aprefix_entity_id'].keys()) == 0
Example #6
def app(request):
    # Create PostgreSQL server on the fly
    postgresql = testing.postgresql.Postgresql()

    # And override the database URL
    app = factory('passgen.config.development')
    app.config['SQLALCHEMY_DATABASE_URI'] = postgresql.url()

    # Set up schema
    with app.app_context():
        migrate_extension.upgrade(revision='head')

    def fin():
        postgresql.stop()

    request.addfinalizer(fin)

    return app
def test_generate_table_tasks():
    aggregate_config = [{
        'prefix': 'prefix1',
        'categoricals': [
            {
                'column': 'cat_one',
                'choice_query': 'select distinct(cat_one) from data',
                'metrics': ['sum']
            },
        ],
        'groups': ['entity_id'],
        'intervals': ['all'],
        'knowledge_date_column': 'knowledge_date',
        'from_obj': 'data'
    }, {
        'prefix': 'prefix2',
        'aggregates': [
            {'quantity': 'quantity_one', 'metrics': ['count']},
        ],
        'groups': ['entity_id'],
        'intervals': ['all'],
        'knowledge_date_column': 'knowledge_date',
        'from_obj': 'data'
    }]
    with testing.postgresql.Postgresql() as postgresql:
        engine = create_engine(postgresql.url())
        setup_db(engine)

        features_schema_name = 'features'

        table_tasks = FeatureGenerator(
            db_engine=engine,
            features_schema_name=features_schema_name
        ).generate_all_table_tasks(
            feature_dates=['2013-09-30', '2014-09-30'],
            feature_aggregation_config=aggregate_config,
        )
        for task in table_tasks.values():
            assert 'DROP TABLE' in task['prepare'][0]
            assert 'CREATE TABLE' in str(task['prepare'][1])
            assert 'CREATE INDEX' in task['finalize'][0]
            assert isinstance(task['inserts'], list)
def test_sparse_state_table_generator():
    input_data = [
        (5, 'permitted', datetime(2016, 1, 1), datetime(2016, 6, 1)),
        (6, 'permitted', datetime(2016, 2, 5), datetime(2016, 5, 5)),
        (1, 'injail', datetime(2014, 7, 7), datetime(2014, 7, 15)),
        (1, 'injail', datetime(2016, 3, 7), datetime(2016, 4, 2)),
    ]

    with testing.postgresql.Postgresql() as postgresql:
        engine = create_engine(postgresql.url())
        create_dense_state_table(engine, 'states', input_data)
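        # The dense 'states' table holds state intervals (entity_id, state, start, end);
        # the sparse table built below has one row per (entity_id, as_of_date) with a
        # boolean column per state.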

        table_generator = StateTableGenerator(engine, 'exp_hash')
        as_of_dates = [
            datetime(2016, 1, 1),
            datetime(2016, 2, 1),
            datetime(2016, 3, 1),
            datetime(2016, 4, 1),
            datetime(2016, 5, 1),
            datetime(2016, 6, 1),
        ]
        table_generator.generate_sparse_table('states', as_of_dates)
        results = [row for row in engine.execute(
            'select entity_id, as_of_date, injail, permitted from {} order by entity_id, as_of_date'.format(
                table_generator.sparse_table_name
            ))]
        expected_output = [
            # entity_id, as_of_date, injail, permitted
            (1, datetime(2016, 4, 1), True, False),
            (5, datetime(2016, 1, 1), False, True),
            (5, datetime(2016, 2, 1), False, True),
            (5, datetime(2016, 3, 1), False, True),
            (5, datetime(2016, 4, 1), False, True),
            (5, datetime(2016, 5, 1), False, True),
            (6, datetime(2016, 3, 1), False, True),
            (6, datetime(2016, 4, 1), False, True),
            (6, datetime(2016, 5, 1), False, True),
        ]
        assert results == expected_output
        assert_index(engine, table_generator.sparse_table_name, 'entity_id')
        assert_index(engine, table_generator.sparse_table_name, 'as_of_date')
def test_write_to_csv():
    """ Test the write_to_csv function by checking whether the csv contains the
    correct number of lines.
    """
    with testing.postgresql.Postgresql() as postgresql:
        # create an engine and generate a table with fake feature data
        engine = create_engine(postgresql.url())
        create_schemas(
            engine=engine,
            features_tables=features_tables,
            labels=labels,
            states=states
        )

        with TemporaryDirectory() as temp_dir:
            planner = Planner(
                beginning_of_time = datetime.datetime(2010, 1, 1, 0, 0),
                label_names = ['booking'],
                label_types = ['binary'],
                states = ['state_one AND state_two'],
                db_config = db_config,
                matrix_directory = temp_dir,
                user_metadata = {},
                engine = engine,
                builder_class = builders.LowMemoryCSVBuilder
            )

            # for each table, check that corresponding csv has the correct # of rows
            for table in features_tables:
                with NamedTempFile() as f:
                    planner.builder.write_to_csv(
                        '''
                            select * 
                            from features.features{}
                        '''.format(features_tables.index(table)),
                        f.name
                    )
                    f.seek(0)
                    reader = csv.reader(f)
                    assert(len([row for row in reader]) == len(table) + 1)
def test_feature_dictionary_creator():
    with testing.postgresql.Postgresql() as postgresql:
        engine = create_engine(postgresql.url())
        engine.execute('create schema features')
        engine.execute('''
            create table features.feature_table_one (
                entity_id int,
                as_of_date date,
                feature_one float,
                feature_two float
            )
        ''')
        engine.execute('''
            create table features.feature_table_two (
                entity_id int,
                as_of_date date,
                feature_three float,
                feature_four float
            )
        ''')
        engine.execute('''
            create table features.random_other_table (
                another_column float
            )
        ''')

        creator = FeatureDictionaryCreator(
            features_schema_name='features',
            db_engine=engine
        )
        feature_dictionary = creator.feature_dictionary(
            ['feature_table_one', 'feature_table_two']
        )
        assert feature_dictionary == {
            'feature_table_one': ['feature_one', 'feature_two'],
            'feature_table_two': ['feature_three', 'feature_four'],
        }
def test_generate_all_labels():
    with testing.postgresql.Postgresql() as postgresql:
        engine = create_engine(postgresql.url())
        create_events(engine)

        labels_table_name = 'labels'

        label_generator = BinaryLabelGenerator(
            events_table='events',
            db_engine=engine,
        )
        label_generator.generate_all_labels(
            labels_table=labels_table_name,
            as_of_dates=['2014-09-30', '2015-03-30'],
            label_windows=['6month', '3month'],
        )

        result = engine.execute('''
            select * from {}
            order by entity_id, as_of_date, label_window desc
        '''.format(labels_table_name)
        )
        records = [row for row in result]

        expected = [
            # entity_id, as_of_date, label_window, name, type, label
            (1, date(2014, 9, 30), timedelta(180), 'outcome', 'binary', False),
            (1, date(2014, 9, 30), timedelta(90), 'outcome', 'binary', False),
            (2, date(2015, 3, 30), timedelta(180), 'outcome', 'binary', False),
            (2, date(2015, 3, 30), timedelta(90), 'outcome', 'binary', False),
            (3, date(2014, 9, 30), timedelta(180), 'outcome', 'binary', True),
            (3, date(2015, 3, 30), timedelta(180), 'outcome', 'binary', False),
            (4, date(2014, 9, 30), timedelta(180), 'outcome', 'binary', False),
            (4, date(2014, 9, 30), timedelta(90), 'outcome', 'binary', False),
        ]
        assert records == expected
Example #12
import testing.postgresql
import settings


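# PostgresqlFactory builds disposable PostgreSQL servers; cache_initialized_db=False
# skips caching of the initialized data directory, so each instance runs initdb afresh.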
Postgresql = testing.postgresql.PostgresqlFactory(cache_initialized_db=False)
postgresql = Postgresql()
settings.DB_URI = postgresql.url()
    def end_to_end_run(self, format_name):
        config = """
{
    "format": {
        "name": "{{format_name}}",
        "version": 1
    },
    "queue": "on-us"
}
        """
        config = config.replace('{{format_name}}', format_name)

        # transaction template
        message_template = {
            'source': 'RBA',
            'status': TransactionStatus.ready,
            'collection_format_name': 'direct_entry',
            'collection_format_version': 1,
            'collection_data': {},
            'collection_datetime': datetime.today().date(),
            'queue': 'default',
        }

        # direct entry data
        de_data = {
            'record_type': '1',
            'reel_seq_num': '01',
            'name_fin_inst': 'SUN',
            'user_name': 'DE USER NAME',
            'user_num': '654321',
            'file_desc': 'DE FILE DESC',
            'date_for_process': '011216',
            'bsb_number': '484-799',
            'account_number': '111111111',
            'indicator': ' ',
            'tran_code': '53',
            'amount': '0000012345',  # $123.45 (amount is in cents)
            'account_title': 'DE ACCT TITLE',
            'lodgement_ref': 'DE LODGE REF',
            'trace_bsb_number': '484-799',
            'trace_account_number': '222222222',
            'name_of_remitter': 'DE REMITTER NAME',
            'withholding_tax_amount': '00000000',
        }

        # json payment data
        json_data = {
            'from_account': '987654321',
            'from_routing': '484-799',
            'to_description': 'JSON TO DESC',
            'from_name': 'JSON FROM NAME',
            'tran_type': 'cr',
            'to_name': 'JSON TO NAME',
            'to_account': '333333333',
            'to_routing': '484-799',
            'amount': 54321,
            'post_date': date(2016, 12, 2)
        }

        tran_list = []
        for tran_id in range(0, 100, 2):
            tran_list.append(
                build_message(
                    submission_id=str(tran_id),
                    collection_data=de_data,
                    template=message_template,
                    queue='on-us'
                )
            )
            tran_list.append(
                build_message(
                    submission_id=str(tran_id+1),
                    collection_data=json_data,
                    collection_format_name='json',
                    template=message_template,
                    queue='on-us'
                )
            )
        tran_list.append(
            SystemControl(system_id=1, effective_date=date(2000, 1, 1))
        )

        with testing.postgresql.Postgresql() as postgresql:
            # setup test database
            LOGGER.debug('Creating postgresql instance for testing')
            LOGGER.debug('  url={}'.format(postgresql.url()))
            LOGGER.debug('  data directory={}'.format(postgresql.get_data_directory()))

            engine = create_engine(postgresql.url(), json_serializer=dumps)
            alembic_cfg = Config("alembic.ini")

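            # apply the project's Alembic migrations to the throwaway database to build the schema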
            with engine.begin() as connection:
                alembic_cfg.attributes['connection'] = connection
                command.upgrade(alembic_cfg, "head")

            Session = sessionmaker(bind=engine)
            session = Session()
            session.add_all(tran_list)
            session.commit()
            session.close()

            # run the job
            runner = CliRunner()
            with runner.isolated_filesystem() as fs:
                with open('test.json', 'w') as fp:
                    fp.write(config)
                start_time = time.perf_counter()
                result = runner.invoke(
                    pr_file_distribution,
                    ['test.json', 'out-json.txt', '--db-url', postgresql.url()],
                    catch_exceptions=False
                )
                duration = time.perf_counter() - start_time
                divider = '.'*20
                LOGGER.debug('output:\n%s', result.output)
                LOGGER.debug('exception:\n%s', result.exception)
                outfile = os.path.join(fs, 'out-json.txt')
                print(divider)
                with open(outfile, 'r') as out_fh:
                    record_count = 0
                    for line in out_fh:
                        print(line.rstrip())
                        record_count += 1
                    print("Processed {} lines".format(record_count))
                print("{}\nRun completed in {} seconds\n{}".format(divider, duration, divider))
        super(TestConsumer, self).setUp()

    def tearDown(self):
        self.cur.execute("ROLLBACK;")
        self.cur.close()
        super(TestConsumer, self).tearDown()

    def test_insert_query(self):
        insert_query = """
            INSERT INTO tweets_by_topic 
            (topic, kafka_offset, user_id, tweet, sentiment)
            values ( %s, %s, %s, %s, %s)
        """
        self.cur.execute(insert_query, (
            "Quotes", "500", "1234567890",
            "Someone's sitting in the shade today because someone planted a tree a long time ago.",
            1))
        consumer.conn.commit()
        self.cur.execute(
            'SELECT topic, kafka_offset, user_id, tweet, sentiment FROM tweets_by_topic'
        )
        self.assertEqual(self.cur.fetchone(), (
            'Quotes', 500, "1234567890",
            "Someone's sitting in the shade today because someone planted a tree a long time ago.",
            1))


if __name__ == "__main__":
    consumer.conn = postgresql.url()
    unittest.main()
Example #15
from sqlalchemy import create_engine
import testing.postgresql

import settings
from sequences import create_sequences

# Launch new PostgreSQL server
Postgres = testing.postgresql.PostgresqlFactory(cache_initialized_db=False)
postgresql = Postgres()
settings.DB_URL = postgresql.url()
engine = create_engine(postgresql.url())
create_sequences(engine)
Example #16
def test_load_labels_data_include_missing_labels_as_false():
    """ Test the load_labels_data function by checking whether the query
    produces the correct labels
    """
    # set up labeling config variables
    dates = [
        datetime.datetime(2016, 1, 1, 0, 0),
        datetime.datetime(2016, 2, 1, 0, 0),
        datetime.datetime(2016, 6, 1, 0, 0)
    ]

    # same as the other load_labels_data test, except we include an extra date, 2016-06-01
    # this date does have entity 0 included via the states table, but no labels

    # make a dataframe of labels to test against
    labels_df = pd.DataFrame(labels,
                             columns=[
                                 'entity_id', 'as_of_date', 'label_timespan',
                                 'label_name', 'label_type', 'label'
                             ])

    labels_df['as_of_date'] = convert_string_column_to_date(
        labels_df['as_of_date'])
    labels_df = labels_df.set_index(['entity_id', 'as_of_date'])

    # create an engine and generate a table with fake feature data
    with testing.postgresql.Postgresql() as postgresql:
        engine = create_engine(postgresql.url())
        create_schemas(engine, features_tables, labels, states)
        with get_matrix_storage_engine() as matrix_storage_engine:
            builder = MatrixBuilder(
                db_config=db_config,
                matrix_storage_engine=matrix_storage_engine,
                engine=engine,
                include_missing_labels_in_train_as=False,
            )

            # make the entity-date table
            entity_date_table_name = builder.make_entity_date_table(
                as_of_times=dates,
                label_type='binary',
                label_name='booking',
                state='state_one AND state_two',
                matrix_type='train',
                matrix_uuid='my_uuid',
                label_timespan='1 month')

            result = builder.load_labels_data(
                label_name=label_name,
                label_type=label_type,
                label_timespan='1 month',
                matrix_uuid='my_uuid',
                entity_date_table_name=entity_date_table_name,
            )
            df = pd.DataFrame.from_dict({
                'entity_id': [0, 2, 3, 4, 4],
                'as_of_date':
                [dates[2], dates[1], dates[1], dates[0], dates[1]],
                'booking': [0, 0, 0, 1, 0],
            }).set_index(['entity_id', 'as_of_date'])
            # the first row would not be here if we had not configured the Builder
            # to include missing labels as false

            test = (result == df)
            assert (test.all().all())
def test_array_categoricals():
    aggregate_config = [{
        'prefix': 'aprefix',
        'array_categoricals': [
            {
                'column': 'cat_one',
                'choices': ['good', 'bad', 'inbetween'],
                'metrics': ['sum']
            },
        ],
        'groups': ['entity_id'],
        'intervals': ['all'],
        'knowledge_date_column': 'knowledge_date',
        'from_obj': 'data'
    }]
    expected_output = {
        'aprefix_entity_id': [
            {
                'entity_id': 3,
                'as_of_date': date(2013, 9, 30),
                'aprefix_entity_id_all_cat_one_good_sum': 0,
                'aprefix_entity_id_all_cat_one_inbetween_sum': 0,
                'aprefix_entity_id_all_cat_one_bad_sum': 1
            },
            {
                'entity_id': 1,
                'as_of_date': date(2014, 9, 30),
                'aprefix_entity_id_all_cat_one_good_sum': 1,
                'aprefix_entity_id_all_cat_one_inbetween_sum': 0,
                'aprefix_entity_id_all_cat_one_bad_sum': 0
            },
            {
                'entity_id': 3,
                'as_of_date': date(2014, 9, 30),
                'aprefix_entity_id_all_cat_one_good_sum': 0,
                'aprefix_entity_id_all_cat_one_inbetween_sum': 0,
                'aprefix_entity_id_all_cat_one_bad_sum': 1
            },
            {
                'entity_id': 4,
                'as_of_date': date(2014, 9, 30),
                'aprefix_entity_id_all_cat_one_good_sum': 0,
                'aprefix_entity_id_all_cat_one_inbetween_sum': 0,
                'aprefix_entity_id_all_cat_one_bad_sum': 1
            },
        ]

    }

    with testing.postgresql.Postgresql() as postgresql:
        engine = create_engine(postgresql.url())
        input_data = [
            # entity_id, knowledge_date, cat_one, quantity_one
            (1, date(2014, 1, 1), ['good', 'good'], 10000),
            (1, date(2014, 10, 11), ['good'], None),
            (3, date(2012, 6, 8), ['bad'], 342),
            (3, date(2014, 12, 21), ['inbetween'], 600),
            (4, date(2014, 4, 4), ['bad'], 1236)
        ]

        engine.execute("""
            create table data (
                entity_id int,
                knowledge_date date,
                cat_one varchar[],
                quantity_one float
            )
        """)
        for row in input_data:
            engine.execute(
                'insert into data values (%s, %s, %s, %s)',
                row
            )

        features_schema_name = 'features'

        output_tables = FeatureGenerator(
            db_engine=engine,
            features_schema_name=features_schema_name
        ).create_all_tables(
            feature_dates=['2013-09-30', '2014-09-30'],
            feature_aggregation_config=aggregate_config,
        )

        for output_table in output_tables:
            records = pandas.read_sql(
                'select * from {}.{} order by as_of_date, entity_id'.format(
                    features_schema_name, output_table
                ),
                engine
            ).to_dict('records')
            assert records == expected_output[output_table]
Example #18
def test_simple_experiment(experiment_class):
    with testing.postgresql.Postgresql() as postgresql:
        db_engine = create_engine(postgresql.url())
        populate_source_data(db_engine)
        with TemporaryDirectory() as temp_dir:
            experiment_class(
                config=sample_config(),
                db_engine=db_engine,
                project_path=os.path.join(temp_dir, "inspections"),
                cleanup=True,
            ).run()

        # assert
        # 1. that model groups entries are present
        num_mgs = len([
            row for row in db_engine.execute(
                "select * from triage_metadata.model_groups")
        ])
        assert num_mgs > 0

        # 2. that model entries are present, and linked to model groups
        num_models = len([
            row for row in db_engine.execute("""
                select * from triage_metadata.model_groups
                join triage_metadata.models using (model_group_id)
                where model_comment = 'test2-final-final'
            """)
        ])
        assert num_models > 0

        # 3. predictions, linked to models for both training and testing predictions
        for set_type in ("train", "test"):
            num_predictions = len([
                row for row in db_engine.execute("""
                    select * from {}_results.predictions
                    join triage_metadata.models using (model_id)""".format(
                    set_type))
            ])
            assert num_predictions > 0

        # 4. evaluations linked to predictions linked to models, for training and testing

        for set_type in ("train", "test"):
            num_evaluations = len([
                row for row in db_engine.execute("""
                    select * from {}_results.evaluations e
                    join triage_metadata.models using (model_id)
                    join {}_results.predictions p on (
                        e.model_id = p.model_id and
                        e.evaluation_start_time <= p.as_of_date and
                        e.evaluation_end_time >= p.as_of_date)
                """.format(set_type, set_type, set_type))
            ])
            assert num_evaluations > 0

        # 5. subset evaluations linked to subsets and predictions linked to
        #    models, for training and testing
        for set_type in ("train", "test"):
            num_evaluations = len([
                row for row in db_engine.execute("""
                        select e.model_id, e.subset_hash from {}_results.evaluations e
                        join triage_metadata.models using (model_id)
                        join triage_metadata.subsets using (subset_hash)
                        join {}_results.predictions p on (
                            e.model_id = p.model_id and
                            e.evaluation_start_time <= p.as_of_date and
                            e.evaluation_end_time >= p.as_of_date)
                        group by e.model_id, e.subset_hash
                        """.format(set_type, set_type))
            ])
            # 4 model groups trained/tested on 2 splits, with 1 metric + parameter
            assert num_evaluations == 8

        # 6. experiment
        num_experiments = len([
            row for row in db_engine.execute(
                "select * from triage_metadata.experiments")
        ])
        assert num_experiments == 1

        # 7. that models are linked to experiments
        num_models_with_experiment = len([
            row for row in db_engine.execute("""
                select * from triage_metadata.experiments
                join triage_metadata.experiment_models using (experiment_hash)
                join triage_metadata.models using (model_hash)
            """)
        ])
        assert num_models == num_models_with_experiment

        # 8. that models have the train end date and label timespan
        results = [(model["train_end_time"], model["training_label_timespan"])
                   for model in db_engine.execute(
                       "select * from triage_metadata.models")]
        assert sorted(set(results)) == [
            (datetime(2012, 6, 1), timedelta(180)),
            (datetime(2013, 6, 1), timedelta(180)),
        ]

        # 9. that the right number of individual importances are present
        individual_importances = [
            row for row in db_engine.execute("""
            select * from test_results.individual_importances
            join triage_metadata.models using (model_id)
        """)
        ]
        assert len(
            individual_importances) == num_predictions * 2  # only 2 features

        # 10. Checking the proper matrices created and stored
        matrices = [
            row for row in db_engine.execute("""
            select matrix_type, num_observations from triage_metadata.matrices"""
                                             )
        ]
        types = [i[0] for i in matrices]
        counts = [i[1] for i in matrices]
        assert types.count("train") == 2
        assert types.count("test") == 2
        for i in counts:
            assert i > 0
        assert len(matrices) == 4

        # 11. Checking that all matrices are associated with the experiment
        linked_matrices = list(
            db_engine.execute("""select * from triage_metadata.matrices
            join triage_metadata.experiment_matrices using (matrix_uuid)
            join triage_metadata.experiments using (experiment_hash)"""))
        assert len(linked_matrices) == len(matrices)
Example #19
def test_make_entity_date_table_include_missing_labels():
    """ Test that the make_entity_date_table function contains the correct
    values.
    """
    dates = [
        datetime.datetime(2016, 1, 1, 0, 0),
        datetime.datetime(2016, 2, 1, 0, 0),
        datetime.datetime(2016, 3, 1, 0, 0),
        datetime.datetime(2016, 6, 1, 0, 0)
    ]

    # same as the other make_entity_date_label test except there is an extra date, 2016-06-01
    # entity 0 is included in this date via the states table, but has no label

    # make a dataframe of entity ids and dates to test against
    ids_dates = create_entity_date_df(labels=labels,
                                      states=states,
                                      as_of_dates=dates,
                                      state_one=True,
                                      state_two=True,
                                      label_name='booking',
                                      label_type='binary',
                                      label_timespan='1 month')
    # this line adds the new entity-date combo as an expected one
    ids_dates = ids_dates.append(
        {
            'entity_id': 0,
            'as_of_date': datetime.date(2016, 6, 1)
        },
        ignore_index=True)

    with testing.postgresql.Postgresql() as postgresql:
        # create an engine and generate a table with fake feature data
        engine = create_engine(postgresql.url())
        create_schemas(engine=engine,
                       features_tables=features_tables,
                       labels=labels,
                       states=states)

        with get_matrix_storage_engine() as matrix_storage_engine:
            builder = MatrixBuilder(
                db_config=db_config,
                matrix_storage_engine=matrix_storage_engine,
                include_missing_labels_in_train_as=False,
                engine=engine)
            engine.execute(
                'CREATE TABLE features.tmp_entity_date (a int, b date);')
            # call the function to test the creation of the table
            entity_date_table_name = builder.make_entity_date_table(
                as_of_times=dates,
                label_type='binary',
                label_name='booking',
                state='state_one AND state_two',
                matrix_uuid='my_uuid',
                matrix_type='train',
                label_timespan='1 month')

            # read in the table
            result = pd.read_sql(
                "select * from features.{} order by entity_id, as_of_date".
                format(entity_date_table_name), engine)

            # compare the table to the test dataframe
            assert sorted(result.values.tolist()) == sorted(
                ids_dates.values.tolist())
Example #20
def test_model_scoring_inspections():
    with testing.postgresql.Postgresql() as postgresql:
        db_engine = create_engine(postgresql.url())
        ensure_db(db_engine)
        metric_groups = [{
            'metrics': ['precision@', 'recall@', 'fpr@'],
            'thresholds': {'percentiles': [50.0], 'top_n': [3]}
        }, {
            # ensure we test a non-thresholded metric as well
            'metrics': ['accuracy'],
        }]
        training_metric_groups = [{'metrics': ['accuracy'], 'thresholds': {'percentiles': [50.0]}}]

        model_evaluator = ModelEvaluator(metric_groups, training_metric_groups, db_engine)

        testing_labels = numpy.array([True, False, numpy.nan, True, False])
        testing_prediction_probas = numpy.array([0.56, 0.4, 0.55, 0.5, 0.3])

        training_labels = numpy.array([False, False, True, True, True, False, True, True])
        training_prediction_probas = numpy.array([0.6, 0.4, 0.55, 0.70, 0.3, 0.2, 0.8, 0.6])

        fake_train_matrix_store = MockMatrixStore('train', 'efgh', 5, db_engine, training_labels)
        fake_test_matrix_store = MockMatrixStore('test', '1234', 5, db_engine, testing_labels)

        trained_model, model_id = fake_trained_model(
            'myproject',
            InMemoryModelStorageEngine('myproject'),
            db_engine
        )

        # Evaluate testing matrix and test the results
        model_evaluator.evaluate(
            testing_prediction_probas,
            fake_test_matrix_store,
            model_id,
        )
        for record in db_engine.execute(
            '''select * from test_results.test_evaluations
            where model_id = %s and evaluation_start_time = %s
            order by 1''',
            (model_id, fake_test_matrix_store.as_of_dates[0])
        ):
            assert record['num_labeled_examples'] == 4
            assert record['num_positive_labels'] == 2
            if record['parameter'] == '':
                assert record['num_labeled_above_threshold'] == 4
            elif 'pct' in record['parameter']:
                assert record['num_labeled_above_threshold'] == 1
            else:
                assert record['num_labeled_above_threshold'] == 2

        # Evaluate the training matrix and test the results
        model_evaluator.evaluate(
                    training_prediction_probas,
                    fake_train_matrix_store,
                    model_id,
        )
        for record in db_engine.execute(
            '''select * from train_results.train_evaluations
            where model_id = %s and evaluation_start_time = %s
            order by 1''',
            (model_id, fake_train_matrix_store.as_of_dates[0])
        ):
            assert record['num_labeled_examples'] == 8
            assert record['num_positive_labels'] == 5
            assert record['value'] == 0.625
Example #21
def test_evaluating_early_warning():
    with testing.postgresql.Postgresql() as postgresql:
        db_engine = create_engine(postgresql.url())
        ensure_db(db_engine)
        metric_groups = [{
            'metrics': ['precision@',
                        'recall@',
                        'true positives@',
                        'true negatives@',
                        'false positives@',
                        'false negatives@'],
            'thresholds': {
                'percentiles': [5.0, 10.0],
                'top_n': [5, 10]
            }
        }, {
            'metrics': ['f1',
                        'mediocre',
                        'accuracy',
                        'roc_auc',
                        'average precision score'],
        }, {
            'metrics': ['fbeta@'],
            'parameters': [{'beta': 0.75}, {'beta': 1.25}]
        }]

        training_metric_groups = [{'metrics': ['accuracy', 'roc_auc']}]

        custom_metrics = {'mediocre': always_half}

        model_evaluator = ModelEvaluator(metric_groups, training_metric_groups, db_engine,
            custom_metrics=custom_metrics
        )

        labels = fake_labels(5)
        fake_train_matrix_store = MockMatrixStore('train', 'efgh', 5, db_engine, labels)
        fake_test_matrix_store = MockMatrixStore('test', '1234', 5, db_engine, labels)

        trained_model, model_id = fake_trained_model(
            'myproject',
            InMemoryModelStorageEngine('myproject'),
            db_engine
        )

        # Evaluate the testing metrics and test for all of them.
        model_evaluator.evaluate(
            trained_model.predict_proba(labels)[:, 1],
            fake_test_matrix_store,
            model_id,
        )
        records = [
            row[0] for row in
            db_engine.execute(
                '''select distinct(metric || parameter)
                from test_results.test_evaluations
                where model_id = %s and
                evaluation_start_time = %s
                order by 1''',
                (model_id, fake_test_matrix_store.as_of_dates[0])
            )
        ]
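        # Metric names concatenate metric and parameter: percentile thresholds are
        # suffixed '_pct', top-n thresholds '_abs', and fbeta parameters '_beta'.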
        assert records == [
            'accuracy',
            'average precision score',
            'f1',
            'false negatives@10.0_pct',
            'false negatives@10_abs',
            'false negatives@5.0_pct',
            'false negatives@5_abs',
            'false positives@10.0_pct',
            'false positives@10_abs',
            'false positives@5.0_pct',
            'false positives@5_abs',
            'fbeta@0.75_beta',
            'fbeta@1.25_beta',
            'mediocre',
            'precision@10.0_pct',
            'precision@10_abs',
            'precision@5.0_pct',
            'precision@5_abs',
            'recall@10.0_pct',
            'recall@10_abs',
            'recall@5.0_pct',
            'recall@5_abs',
            'roc_auc',
            'true negatives@10.0_pct',
            'true negatives@10_abs',
            'true negatives@5.0_pct',
            'true negatives@5_abs',
            'true positives@10.0_pct',
            'true positives@10_abs',
            'true positives@5.0_pct',
            'true positives@5_abs'
        ]

        # Evaluate the training metrics and test
        model_evaluator.evaluate(
            trained_model.predict_proba(labels)[:, 1],
            fake_train_matrix_store,
            model_id,
        )
        records = [
            row[0] for row in
            db_engine.execute(
                '''select distinct(metric || parameter)
                from train_results.train_evaluations
                where model_id = %s and
                evaluation_start_time = %s
                order by 1''',
                (model_id, fake_train_matrix_store.as_of_dates[0])
            )
        ]
        assert records == ['accuracy', 'roc_auc']
Example #22
def test_entity_date_table_generator_replace():
    input_data = [
        (1, datetime(2016, 1, 1), True),
        (1, datetime(2016, 4, 1), False),
        (1, datetime(2016, 3, 1), True),
        (2, datetime(2016, 1, 1), False),
        (2, datetime(2016, 1, 1), True),
        (3, datetime(2016, 1, 1), True),
        (5, datetime(2016, 3, 1), True),
        (5, datetime(2016, 4, 1), True),
    ]
    with testing.postgresql.Postgresql() as postgresql:
        engine = create_engine(postgresql.url())
        utils.create_binary_outcome_events(engine, "events", input_data)
        table_generator = EntityDateTableGenerator(
            query=
            "select entity_id from events where outcome_date < '{as_of_date}'::date",
            db_engine=engine,
            entity_date_table_name="exp_hash_entity_date",
            replace=True)
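        # With replace=True each generate call rebuilds the entity-date table from scratch,
        # so re-running it at the end of this test should leave the contents unchanged.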
        as_of_dates = [
            datetime(2016, 1, 1),
            datetime(2016, 2, 1),
            datetime(2016, 3, 1),
            datetime(2016, 4, 1),
            datetime(2016, 5, 1),
            datetime(2016, 6, 1),
        ]
        table_generator.generate_entity_date_table(as_of_dates)
        expected_output = [
            (1, datetime(2016, 2, 1), True),
            (1, datetime(2016, 3, 1), True),
            (1, datetime(2016, 4, 1), True),
            (1, datetime(2016, 5, 1), True),
            (1, datetime(2016, 6, 1), True),
            (2, datetime(2016, 2, 1), True),
            (2, datetime(2016, 3, 1), True),
            (2, datetime(2016, 4, 1), True),
            (2, datetime(2016, 5, 1), True),
            (2, datetime(2016, 6, 1), True),
            (3, datetime(2016, 2, 1), True),
            (3, datetime(2016, 3, 1), True),
            (3, datetime(2016, 4, 1), True),
            (3, datetime(2016, 5, 1), True),
            (3, datetime(2016, 6, 1), True),
            (5, datetime(2016, 4, 1), True),
            (5, datetime(2016, 5, 1), True),
            (5, datetime(2016, 6, 1), True),
        ]
        results = list(
            engine.execute(f"""
                select entity_id, as_of_date, active from {table_generator.entity_date_table_name}
                order by entity_id, as_of_date
            """))
        assert results == expected_output
        utils.assert_index(engine, table_generator.entity_date_table_name,
                           "entity_id")
        utils.assert_index(engine, table_generator.entity_date_table_name,
                           "as_of_date")

        table_generator.generate_entity_date_table(as_of_dates)
        assert results == expected_output
Example #23
def test_entity_date_table_generator_noreplace():
    input_data = [
        (1, datetime(2016, 1, 1), True),
        (1, datetime(2016, 4, 1), False),
        (1, datetime(2016, 3, 1), True),
        (2, datetime(2016, 1, 1), False),
        (2, datetime(2016, 1, 1), True),
        (3, datetime(2016, 1, 1), True),
        (5, datetime(2016, 3, 1), True),
        (5, datetime(2016, 4, 1), True),
    ]
    with testing.postgresql.Postgresql() as postgresql:
        engine = create_engine(postgresql.url())
        utils.create_binary_outcome_events(engine, "events", input_data)
        table_generator = EntityDateTableGenerator(
            query=
            "select entity_id from events where outcome_date < '{as_of_date}'::date",
            db_engine=engine,
            entity_date_table_name="exp_hash_entity_date",
            replace=False)
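        # With replace=False existing rows are kept; only cohort rows for as-of-dates
        # not already present are inserted (no double-inserts for overlapping dates).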

        # 1. generate a cohort for a subset of as-of-dates
        as_of_dates = [
            datetime(2016, 1, 1),
            datetime(2016, 2, 1),
            datetime(2016, 3, 1),
        ]
        table_generator.generate_entity_date_table(as_of_dates)
        expected_output = [
            (1, datetime(2016, 2, 1), True),
            (1, datetime(2016, 3, 1), True),
            (2, datetime(2016, 2, 1), True),
            (2, datetime(2016, 3, 1), True),
            (3, datetime(2016, 2, 1), True),
            (3, datetime(2016, 3, 1), True),
        ]
        results = list(
            engine.execute(f"""
                select entity_id, as_of_date, active from {table_generator.entity_date_table_name}
                order by entity_id, as_of_date
            """))
        assert results == expected_output
        utils.assert_index(engine, table_generator.entity_date_table_name,
                           "entity_id")
        utils.assert_index(engine, table_generator.entity_date_table_name,
                           "as_of_date")

        table_generator.generate_entity_date_table(as_of_dates)
        assert results == expected_output

        # 2. generate a cohort for a different subset of as-of-dates,
        # actually including an overlap to make sure that it doesn't double-insert anything
        as_of_dates = [
            datetime(2016, 3, 1),
            datetime(2016, 4, 1),
            datetime(2016, 5, 1),
            datetime(2016, 6, 1),
        ]
        table_generator.generate_entity_date_table(as_of_dates)
        expected_output = [
            (1, datetime(2016, 2, 1), True),
            (1, datetime(2016, 3, 1), True),
            (1, datetime(2016, 4, 1), True),
            (1, datetime(2016, 5, 1), True),
            (1, datetime(2016, 6, 1), True),
            (2, datetime(2016, 2, 1), True),
            (2, datetime(2016, 3, 1), True),
            (2, datetime(2016, 4, 1), True),
            (2, datetime(2016, 5, 1), True),
            (2, datetime(2016, 6, 1), True),
            (3, datetime(2016, 2, 1), True),
            (3, datetime(2016, 3, 1), True),
            (3, datetime(2016, 4, 1), True),
            (3, datetime(2016, 5, 1), True),
            (3, datetime(2016, 6, 1), True),
            (5, datetime(2016, 4, 1), True),
            (5, datetime(2016, 5, 1), True),
            (5, datetime(2016, 6, 1), True),
        ]
        results = list(
            engine.execute(f"""
                select entity_id, as_of_date, active from {table_generator.entity_date_table_name}
                order by entity_id, as_of_date
            """))
        assert results == expected_output
Example #24
def test_model_trainer():
    with testing.postgresql.Postgresql() as postgresql:
        engine = create_engine(postgresql.url())
        ensure_db(engine)

        grid_config = {
            'sklearn.linear_model.LogisticRegression': {
                'C': [0.00001, 0.0001],
                'penalty': ['l1', 'l2'],
                'random_state': [2193]
            }
        }

        with mock_s3():
            s3_conn = boto3.resource('s3')
            s3_conn.create_bucket(Bucket='econ-dev')

            # create training set
            matrix = pandas.DataFrame.from_dict({
                'entity_id': [1, 2],
                'feature_one': [3, 4],
                'feature_two': [5, 6],
                'label': ['good', 'bad']
            })
            metadata = {
                'beginning_of_time': datetime.date(2012, 12, 20),
                'end_time': datetime.date(2016, 12, 20),
                'label_name': 'label',
                'label_window': '1y',
                'metta-uuid': '1234',
                'feature_names': ['ft1', 'ft2']
            }
            project_path = 'econ-dev/inspections'
            model_storage_engine = S3ModelStorageEngine(s3_conn, project_path)
            trainer = ModelTrainer(
                project_path=project_path,
                experiment_hash=None,
                model_storage_engine=model_storage_engine,
                db_engine=engine,
                model_group_keys=['label_name', 'label_window'])
            matrix_store = InMemoryMatrixStore(matrix, metadata)
            model_ids = trainer.train_models(grid_config=grid_config,
                                             misc_db_parameters=dict(),
                                             matrix_store=matrix_store)

            # assert
            # 1. that the models and feature importances table entries are present
            records = [
                row for row in engine.execute(
                    'select * from results.feature_importances')
            ]
            assert len(records) == 4 * 3  # maybe exclude entity_id?

            records = [
                row for row in engine.execute(
                    'select model_hash from results.models')
            ]
            assert len(records) == 4

            cache_keys = [
                model_cache_key(project_path, model_row[0], s3_conn)
                for model_row in records
            ]

            # 2. that the model groups are distinct
            records = [
                row for row in engine.execute(
                    'select distinct model_group_id from results.models')
            ]
            assert len(records) == 4

            # 3. that all four models are cached
            model_pickles = [
                pickle.loads(cache_key.get()['Body'].read())
                for cache_key in cache_keys
            ]
            assert len(model_pickles) == 4
            assert len([x for x in model_pickles if x is not None]) == 4

            # 4. that their results can have predictions made on it
            test_matrix = pandas.DataFrame.from_dict({
                'entity_id': [3, 4],
                'feature_one': [4, 4],
                'feature_two': [6, 5],
            })
            for model_pickle in model_pickles:
                predictions = model_pickle.predict(test_matrix)
                assert len(predictions) == 2

            # 5. when run again, same models are returned
            new_model_ids = trainer.train_models(grid_config=grid_config,
                                                 misc_db_parameters=dict(),
                                                 matrix_store=matrix_store)
            assert len([
                row for row in engine.execute(
                    'select model_hash from results.models')
            ]) == 4
            assert model_ids == new_model_ids

            # 6. if metadata is deleted but the cache is still there,
            # retrains that one and replaces the feature importance records
            engine.execute(
                'delete from results.feature_importances where model_id = 3')
            engine.execute('delete from results.models where model_id = 3')
            new_model_ids = trainer.train_models(grid_config=grid_config,
                                                 misc_db_parameters=dict(),
                                                 matrix_store=matrix_store)
            expected_model_ids = [1, 2, 4, 5]
            assert expected_model_ids == sorted(new_model_ids)
            assert [
                row['model_id'] for row in engine.execute(
                    'select model_id from results.models order by 1 asc')
            ] == expected_model_ids

            records = [
                row for row in engine.execute(
                    'select * from results.feature_importances')
            ]
            assert len(records) == 4 * 3  # maybe exclude entity_id?

            # 7. if the cache is missing but the metadata is still there, reuse the metadata
            for row in engine.execute('select model_hash from results.models'):
                model_storage_engine.get_store(row[0]).delete()
            expected_model_ids = [1, 2, 4, 5]
            new_model_ids = trainer.train_models(grid_config=grid_config,
                                                 misc_db_parameters=dict(),
                                                 matrix_store=matrix_store)
            assert expected_model_ids == sorted(new_model_ids)

            # 8. that the generator interface works the same way
            new_model_ids = trainer.generate_trained_models(
                grid_config=grid_config,
                misc_db_parameters=dict(),
                matrix_store=matrix_store)
            assert expected_model_ids == \
                sorted([model_id for model_id in new_model_ids])
def test_sparse_table_generator_from_entities():
    input_data = [
        (1, datetime(2016, 1, 1), True),
        (1, datetime(2016, 4, 1), False),
        (1, datetime(2016, 3, 1), True),
        (2, datetime(2016, 1, 1), False),
        (2, datetime(2016, 1, 1), True),
        (3, datetime(2016, 1, 1), True),
        (5, datetime(2016, 1, 1), True),
        (5, datetime(2016, 1, 1), True),
    ]
    with testing.postgresql.Postgresql() as postgresql:
        engine = create_engine(postgresql.url())
        utils.create_binary_outcome_events(engine, "events", input_data)
        table_generator = StateTableGeneratorFromEntities(
            entities_table="events",
            db_engine=engine,
            experiment_hash="exp_hash")
        as_of_dates = [
            datetime(2016, 1, 1),
            datetime(2016, 2, 1),
            datetime(2016, 3, 1),
            datetime(2016, 4, 1),
            datetime(2016, 5, 1),
            datetime(2016, 6, 1),
        ]
        table_generator.generate_sparse_table(as_of_dates)
        expected_output = [
            (1, datetime(2016, 1, 1), True),
            (1, datetime(2016, 2, 1), True),
            (1, datetime(2016, 3, 1), True),
            (1, datetime(2016, 4, 1), True),
            (1, datetime(2016, 5, 1), True),
            (1, datetime(2016, 6, 1), True),
            (2, datetime(2016, 1, 1), True),
            (2, datetime(2016, 2, 1), True),
            (2, datetime(2016, 3, 1), True),
            (2, datetime(2016, 4, 1), True),
            (2, datetime(2016, 5, 1), True),
            (2, datetime(2016, 6, 1), True),
            (3, datetime(2016, 1, 1), True),
            (3, datetime(2016, 2, 1), True),
            (3, datetime(2016, 3, 1), True),
            (3, datetime(2016, 4, 1), True),
            (3, datetime(2016, 5, 1), True),
            (3, datetime(2016, 6, 1), True),
            (5, datetime(2016, 1, 1), True),
            (5, datetime(2016, 2, 1), True),
            (5, datetime(2016, 3, 1), True),
            (5, datetime(2016, 4, 1), True),
            (5, datetime(2016, 5, 1), True),
            (5, datetime(2016, 6, 1), True),
        ]
        results = [
            row for row in engine.execute("""
                select entity_id, as_of_date, active from {}
                order by entity_id, as_of_date
            """.format(table_generator.sparse_table_name))
        ]
        assert results == expected_output
        utils.assert_index(engine, table_generator.sparse_table_name,
                           "entity_id")
        utils.assert_index(engine, table_generator.sparse_table_name,
                           "as_of_date")
Example #26
def test_engine():
    with Postgresql() as postgresql:
        engine = create_engine(postgresql.url())
        ((result,),) = engine.execute("SELECT COUNT(*) FROM food_inspections")
    assert result == 966
Example #27
def test_predictor():
    with testing.postgresql.Postgresql() as postgresql:
        db_engine = create_engine(postgresql.url())
        ensure_db(db_engine)

        with mock_s3():
            s3_conn = boto3.resource('s3')
            s3_conn.create_bucket(Bucket='econ-dev')
            project_path = 'econ-dev/inspections'
            model_storage_engine = S3ModelStorageEngine(s3_conn, project_path)
            _, model_id = \
                fake_trained_model(project_path, model_storage_engine, db_engine)
            predictor = Predictor(project_path, model_storage_engine,
                                  db_engine)
            # create prediction set
            matrix = pandas.DataFrame.from_dict({
                'entity_id': [1, 2],
                'feature_one': [3, 4],
                'feature_two': [5, 6],
                'label': [7, 8]
            }).set_index('entity_id')
            metadata = {
                'label_name': 'label',
                'end_time': AS_OF_DATE,
                'label_timespan': '3month',
                'metta-uuid': '1234',
                'indices': ['entity_id'],
            }

            matrix_store = InMemoryMatrixStore(matrix, metadata)
            train_matrix_columns = ['feature_one', 'feature_two']
            predict_proba = predictor.predict(
                model_id,
                matrix_store,
                misc_db_parameters=dict(),
                train_matrix_columns=train_matrix_columns)

            # assert
            # 1. that the returned predictions are of the desired length
            assert len(predict_proba) == 2

            # 2. that the predictions table entries are present and
            # can be linked to the original models
            records = [
                row
                for row in db_engine.execute('''select entity_id, as_of_date
                from results.predictions
                join results.models using (model_id)''')
            ]
            assert len(records) == 2

            # 3. that the contained as_of_dates match what we sent in
            for record in records:
                assert record[1].date() == AS_OF_DATE

            # 4. that the entity ids match the given dataset
            assert sorted([record[0] for record in records]) == [1, 2]

            # 5. that predicting with the same model_id but a different
            # as_of_date, and then again with the original as_of_date,
            # only replaces records that share the same date
            new_matrix = pandas.DataFrame.from_dict({
                'entity_id': [1, 2],
                'feature_one': [3, 4],
                'feature_two': [5, 6],
                'label': [7, 8]
            }).set_index('entity_id')
            new_metadata = {
                'label_name': 'label',
                'end_time': AS_OF_DATE + datetime.timedelta(days=1),
                'label_timespan': '3month',
                'metta-uuid': '1234',
                'indices': ['entity_id'],
            }
            new_matrix_store = InMemoryMatrixStore(new_matrix, new_metadata)
            predictor.predict(model_id,
                              new_matrix_store,
                              misc_db_parameters=dict(),
                              train_matrix_columns=train_matrix_columns)
            predictor.predict(model_id,
                              matrix_store,
                              misc_db_parameters=dict(),
                              train_matrix_columns=train_matrix_columns)
            records = [
                row
                for row in db_engine.execute('''select entity_id, as_of_date
                from results.predictions
                join results.models using (model_id)''')
            ]
            assert len(records) == 4

            # 6. that we can delete the model when we are done predicting with it
            predictor.delete_model(model_id)
            assert predictor.load_model(model_id) is None
    def test_replace(self):
        with testing.postgresql.Postgresql() as postgresql:
            # create an engine and generate a table with fake feature data
            engine = create_engine(postgresql.url())
            ensure_db(engine)
            create_schemas(engine=engine,
                           features_tables=features_tables,
                           labels=labels,
                           states=states)

            dates = [
                datetime.datetime(2016, 1, 1, 0, 0),
                datetime.datetime(2016, 2, 1, 0, 0),
                datetime.datetime(2016, 3, 1, 0, 0)
            ]

            with TemporaryDirectory() as temp_dir:
                builder = builders.HighMemoryCSVBuilder(
                    db_config=db_config,
                    matrix_directory=temp_dir,
                    engine=engine,
                    replace=False)

                feature_dictionary = {
                    'features0': ['f1', 'f2'],
                    'features1': ['f3', 'f4'],
                }
                matrix_metadata = {
                    'matrix_id': 'hi',
                    'state': 'state_one AND state_two',
                    'label_name': 'booking',
                    'end_time': datetime.datetime(2016, 3, 1, 0, 0),
                    'feature_start_time': datetime.datetime(2016, 1, 1, 0, 0),
                    'label_timespan': '1 month',
                    'test_duration': '1 month'
                }
                uuid = metta.generate_uuid(matrix_metadata)
                builder.build_matrix(as_of_times=dates,
                                     label_name='booking',
                                     label_type='binary',
                                     feature_dictionary=feature_dictionary,
                                     matrix_directory=temp_dir,
                                     matrix_metadata=matrix_metadata,
                                     matrix_uuid=uuid,
                                     matrix_type='test')

                matrix_filename = os.path.join(temp_dir, '{}.csv'.format(uuid))

                with open(matrix_filename, 'r') as f:
                    reader = csv.reader(f)
                    assert (len([row for row in reader]) == 6)

                # rerun
                builder.make_entity_date_table = Mock()
                builder.build_matrix(as_of_times=dates,
                                     label_name='booking',
                                     label_type='binary',
                                     feature_dictionary=feature_dictionary,
                                     matrix_directory=temp_dir,
                                     matrix_metadata=matrix_metadata,
                                     matrix_uuid=uuid,
                                     matrix_type='test')
                assert not builder.make_entity_date_table.called
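# A note on the rerun above: with replace=False the builder sees the existing
# '{uuid}.csv' on disk and skips the rebuild, which is why the mocked
# make_entity_date_table is never called. Passing replace=True would, based on
# this test's setup, be expected to force the matrix to be rebuilt.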
def test_load_features_data():
    dates = [
        datetime.datetime(2016, 1, 1, 0, 0),
        datetime.datetime(2016, 2, 1, 0, 0)
    ]

    # make dataframe for entity ids and dates
    ids_dates = create_entity_date_df(labels=labels,
                                      states=states,
                                      as_of_dates=dates,
                                      state_one=True,
                                      state_two=True,
                                      label_name='booking',
                                      label_type='binary',
                                      label_timespan='1 month')

    features = [['f1', 'f2'], ['f3', 'f4']]
    # make dataframes of features to test against
    features_dfs = []
    for i, table in enumerate(features_tables):
        cols = ['entity_id', 'as_of_date'] + features[i]
        temp_df = pd.DataFrame(table, columns=cols)
        temp_df['as_of_date'] = convert_string_column_to_date(
            temp_df['as_of_date'])
        features_dfs.append(
            ids_dates.merge(right=temp_df,
                            how='left',
                            on=['entity_id', 'as_of_date'
                                ]).set_index(['entity_id', 'as_of_date']))

    # create an engine and generate a table with fake feature data
    with testing.postgresql.Postgresql() as postgresql:
        engine = create_engine(postgresql.url())
        create_schemas(engine=engine,
                       features_tables=features_tables,
                       labels=labels,
                       states=states)

        with get_matrix_storage_engine() as matrix_storage_engine:
            builder = MatrixBuilder(
                db_config=db_config,
                matrix_storage_engine=matrix_storage_engine,
                engine=engine,
            )

            # make the entity-date table
            entity_date_table_name = builder.make_entity_date_table(
                as_of_times=dates,
                label_type='binary',
                label_name='booking',
                state='state_one AND state_two',
                matrix_type='train',
                matrix_uuid='my_uuid',
                label_timespan='1 month')

            feature_dictionary = dict(
                ('features{}'.format(i), feature_list)
                for i, feature_list in enumerate(features))

            returned_features_dfs = builder.load_features_data(
                as_of_times=dates,
                feature_dictionary=feature_dictionary,
                entity_date_table_name=entity_date_table_name,
                matrix_uuid='my_uuid')

            # compare each returned dataframe against the expected one
            for result, df in zip(returned_features_dfs, features_dfs):
                test = (result == df)
                assert (test.all().all())
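# A caveat on the elementwise comparison above: `(result == df).all().all()`
# treats any NaN cell as a mismatch, since NaN != NaN. If the feature frames
# can contain NaNs, a sketch of a more forgiving check (using pandas' own test
# helper; this is an alternative, not what the original test does) is:
#
#   from pandas.testing import assert_frame_equal
#   assert_frame_equal(result, df)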
def test_sparse_table_generator_from_events():
    input_data = [
        (1, datetime(2016, 1, 1), True),
        (1, datetime(2016, 4, 1), False),
        (1, datetime(2016, 3, 1), True),
        (2, datetime(2016, 1, 1), False),
        (2, datetime(2016, 1, 1), True),
        (3, datetime(2016, 1, 1), True),
        (5, datetime(2016, 1, 1), True),
        (5, datetime(2016, 1, 1), True),
    ]
    with testing.postgresql.Postgresql() as postgresql:
        engine = create_engine(postgresql.url())
        utils.create_binary_outcome_events(engine, 'events', input_data)
        table_generator = StateTableGenerator(
            engine,
            'exp_hash',
            events_table='events'
        )
        as_of_dates = [
            datetime(2016, 1, 1),
            datetime(2016, 2, 1),
            datetime(2016, 3, 1),
            datetime(2016, 4, 1),
            datetime(2016, 5, 1),
            datetime(2016, 6, 1),
        ]
        table_generator.generate_sparse_table(as_of_dates)
        expected_output = [
            (1, datetime(2016, 1, 1), True),
            (1, datetime(2016, 2, 1), True),
            (1, datetime(2016, 3, 1), True),
            (1, datetime(2016, 4, 1), True),
            (1, datetime(2016, 5, 1), True),
            (1, datetime(2016, 6, 1), True),
            (2, datetime(2016, 1, 1), True),
            (2, datetime(2016, 2, 1), True),
            (2, datetime(2016, 3, 1), True),
            (2, datetime(2016, 4, 1), True),
            (2, datetime(2016, 5, 1), True),
            (2, datetime(2016, 6, 1), True),
            (3, datetime(2016, 1, 1), True),
            (3, datetime(2016, 2, 1), True),
            (3, datetime(2016, 3, 1), True),
            (3, datetime(2016, 4, 1), True),
            (3, datetime(2016, 5, 1), True),
            (3, datetime(2016, 6, 1), True),
            (5, datetime(2016, 1, 1), True),
            (5, datetime(2016, 2, 1), True),
            (5, datetime(2016, 3, 1), True),
            (5, datetime(2016, 4, 1), True),
            (5, datetime(2016, 5, 1), True),
            (5, datetime(2016, 6, 1), True),
        ]
        results = [row for row in engine.execute(
            '''
                select entity_id, as_of_date, active from {}
                order by entity_id, as_of_date
            '''.format(
                table_generator.sparse_table_name
            )
        )]
        assert results == expected_output
        utils.assert_index(engine, table_generator.sparse_table_name, 'entity_id')
        utils.assert_index(engine, table_generator.sparse_table_name, 'as_of_date')
def basic_integration_test(
    state_filters,
    feature_group_create_rules,
    feature_group_mix_rules,
    expected_matrix_multiplier,
    expected_group_lists,
):
    with testing.postgresql.Postgresql() as postgresql:
        db_engine = create_engine(postgresql.url())
        Base.metadata.create_all(db_engine)
        populate_source_data(db_engine)

        with TemporaryDirectory() as temp_dir:
            chopper = Timechop(
                feature_start_time=datetime(2010, 1, 1),
                feature_end_time=datetime(2014, 1, 1),
                label_start_time=datetime(2011, 1, 1),
                label_end_time=datetime(2014, 1, 1),
                model_update_frequency="1year",
                training_label_timespans=["6months"],
                test_label_timespans=["6months"],
                training_as_of_date_frequencies="1day",
                test_as_of_date_frequencies="3months",
                max_training_histories=["1months"],
                test_durations=["1months"],
            )

            state_table_generator = StateTableGeneratorFromDense(
                db_engine=db_engine,
                experiment_hash="abcd",
                dense_state_table="states")

            label_generator = LabelGenerator(
                db_engine=db_engine,
                query=sample_config()["label_config"]["query"])

            feature_generator = FeatureGenerator(
                db_engine=db_engine,
                features_schema_name="features",
                replace=True)

            feature_dictionary_creator = FeatureDictionaryCreator(
                db_engine=db_engine, features_schema_name="features")

            feature_group_creator = FeatureGroupCreator(
                feature_group_create_rules)

            feature_group_mixer = FeatureGroupMixer(feature_group_mix_rules)
            project_storage = ProjectStorage(temp_dir)
            planner = Planner(
                feature_start_time=datetime(2010, 1, 1),
                label_names=["outcome"],
                label_types=["binary"],
                states=state_filters,
                user_metadata={},
            )

            builder = MatrixBuilder(
                engine=db_engine,
                db_config={
                    "features_schema_name": "features",
                    "labels_schema_name": "public",
                    "labels_table_name": "labels",
                    "sparse_state_table_name": "tmp_sparse_states_abcd",
                },
                experiment_hash=None,
                matrix_storage_engine=project_storage.matrix_storage_engine(),
                replace=True,
            )

            # chop time
            split_definitions = chopper.chop_time()
            num_split_matrices = sum(1 + len(split["test_matrices"])
                                     for split in split_definitions)

            # generate as_of_times for feature/label/state generation
            all_as_of_times = []
            for split in split_definitions:
                all_as_of_times.extend(split["train_matrix"]["as_of_times"])
                for test_matrix in split["test_matrices"]:
                    all_as_of_times.extend(test_matrix["as_of_times"])
            all_as_of_times = list(set(all_as_of_times))

            # generate sparse state table
            state_table_generator.generate_sparse_table(
                as_of_dates=all_as_of_times)

            # create labels table
            label_generator.generate_all_labels(
                labels_table="labels",
                as_of_dates=all_as_of_times,
                label_timespans=["6months"],
            )

            # create feature table tasks
            # we would use FeatureGenerator#create_all_tables but want to use
            # the tasks dict directly to create a feature dict
            aggregations = feature_generator.aggregations(
                feature_aggregation_config=[
                    {
                        "prefix":
                        "cat",
                        "from_obj":
                        "cat_complaints",
                        "knowledge_date_column":
                        "as_of_date",
                        "aggregates": [{
                            "quantity": "cat_sightings",
                            "metrics": ["count", "avg"],
                            "imputation": {
                                "all": {
                                    "type": "mean"
                                }
                            },
                        }],
                        "intervals": ["1y"],
                        "groups": ["entity_id"],
                    },
                    {
                        "prefix":
                        "dog",
                        "from_obj":
                        "dog_complaints",
                        "knowledge_date_column":
                        "as_of_date",
                        "aggregates_imputation": {
                            "count": {
                                "type": "constant",
                                "value": 7
                            },
                            "sum": {
                                "type": "mean"
                            },
                            "avg": {
                                "type": "zero"
                            },
                        },
                        "aggregates": [{
                            "quantity": "dog_sightings",
                            "metrics": ["count", "avg"]
                        }],
                        "intervals": ["1y"],
                        "groups": ["entity_id"],
                    },
                ],
                feature_dates=all_as_of_times,
                state_table=state_table_generator.sparse_table_name,
            )
            feature_table_agg_tasks = feature_generator.generate_all_table_tasks(
                aggregations, task_type="aggregation")

            # create feature aggregation tables
            feature_generator.process_table_tasks(feature_table_agg_tasks)

            feature_table_imp_tasks = feature_generator.generate_all_table_tasks(
                aggregations, task_type="imputation")

            # create feature imputation tables
            feature_generator.process_table_tasks(feature_table_imp_tasks)

            # build feature dictionaries from feature tables and
            # subsetting config
            master_feature_dict = feature_dictionary_creator.feature_dictionary(
                feature_table_names=feature_table_imp_tasks.keys(),
                index_column_lookup=feature_generator.index_column_lookup(
                    aggregations),
            )

            feature_dicts = feature_group_mixer.generate(
                feature_group_creator.subsets(master_feature_dict))

            # figure out what matrices need to be built
            _, matrix_build_tasks = planner.generate_plans(
                split_definitions, feature_dicts)

            # go and build the matrices
            builder.build_all_matrices(matrix_build_tasks)

            # super basic assertion: did the matrices we expect get created?
            matrices_records = list(
                db_engine.execute(
                    """select matrix_uuid, num_observations, matrix_type
                    from model_metadata.matrices
                    """))
            matrix_directory = os.path.join(temp_dir, "matrices")
            matrices = [
                path for path in os.listdir(matrix_directory) if ".csv" in path
            ]
            metadatas = [
                path for path in os.listdir(matrix_directory)
                if ".yaml" in path
            ]
            assert len(
                matrices) == num_split_matrices * expected_matrix_multiplier
            assert len(
                metadatas) == num_split_matrices * expected_matrix_multiplier
            assert len(matrices) == len(matrices_records)
            feature_group_name_lists = []
            for metadata_path in metadatas:
                with open(os.path.join(matrix_directory, metadata_path)) as f:
                    metadata = yaml.safe_load(f)
                    feature_group_name_lists.append(metadata["feature_groups"])

            for matrix_uuid, num_observations, matrix_type in matrices_records:
                assert matrix_uuid in matrix_build_tasks  # the hashes of the matrices
                assert type(num_observations) is int
                assert matrix_type == matrix_build_tasks[matrix_uuid][
                    "matrix_type"]

            def deep_unique_tuple(l):
                return set([tuple(i) for i in l])

            assert deep_unique_tuple(
                feature_group_name_lists) == deep_unique_tuple(
                    expected_group_lists)
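# basic_integration_test above is a parametrized helper rather than a test in
# its own right. A hypothetical caller (rule dictionaries and expected counts
# are illustrative guesses, not values from the original test module) might
# look like:
#
# def test_basic_integration():
#     basic_integration_test(
#         state_filters=["state_one AND state_two"],
#         feature_group_create_rules={"all": [True]},
#         feature_group_mix_rules=["all"],
#         expected_matrix_multiplier=1,
#         expected_group_lists=[["all: True"]],
#     )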
def init_postgres():
    postgresql = testing.postgresql.Postgresql()
    print("postgresql up and running at {}".format(postgresql.url()))
    return postgresql
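# init_postgres() hands back a live server without a context manager, so the
# caller owns teardown. A minimal pytest-style fixture built on it (fixture
# name is illustrative) might look like:
#
# @pytest.fixture
# def postgres_server():
#     postgresql = init_postgres()
#     try:
#         yield postgresql
#     finally:
#         postgresql.stop()  # stops the temporary server and removes its data directory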
def setup_static_fetch(postgresql):
    subprocess.check_output(
        ['psql', postgresql.url(), '-q', '-f', 'create_schema.sql'])
def test_write_features_data():
    dates = [datetime.datetime(2016, 1, 1, 0, 0),
             datetime.datetime(2016, 2, 1, 0, 0)]

    # make dataframe for entity ids and dates
    ids_dates = create_entity_date_df(
        labels=labels,
        states=states,
        as_of_dates=dates,
        state_one=True,
        state_two=True,
        label_name='booking',
        label_type='binary',
        label_window='1 month'
    )

    features = [['f1', 'f2'], ['f3', 'f4']]
    # make dataframes of features to test against
    features_dfs = []
    for i, table in enumerate(features_tables):
        cols = ['entity_id', 'as_of_date'] + features[i]
        temp_df = pd.DataFrame(
            table,
            columns = cols
        )
        temp_df['as_of_date'] = convert_string_column_to_date(temp_df['as_of_date'])
        features_dfs.append(
            ids_dates.merge(
                right = temp_df,
                how = 'left',
                on = ['entity_id', 'as_of_date']
            )
        )

    # create an engine and generate a table with fake feature data
    with testing.postgresql.Postgresql() as postgresql:
        engine = create_engine(postgresql.url())
        create_schemas(
            engine=engine,
            features_tables=features_tables,
            labels=labels,
            states=states
        )

        with TemporaryDirectory() as temp_dir:
            planner = Planner(
                beginning_of_time = datetime.datetime(2010, 1, 1, 0, 0),
                label_names = ['booking'],
                label_types = ['binary'],
                states = ['state_one AND state_two'],
                db_config = db_config,
                matrix_directory = temp_dir,
                user_metadata = {},
                engine = engine,
                builder_class=builders.LowMemoryCSVBuilder
            )

            # make the entity-date table
            entity_date_table_name = planner.builder.make_entity_date_table(
                as_of_times=dates,
                label_type='binary',
                label_name='booking',
                state = 'state_one AND state_two',
                matrix_type='train',
                matrix_uuid='my_uuid',
                label_window='1 month'
            )

            feature_dictionary = dict(
                ('features{}'.format(i), feature_list) for i, feature_list in enumerate(features)
            )

            print(feature_dictionary)
            features_csv_names = planner.builder.write_features_data(
                as_of_times=dates,
                feature_dictionary=feature_dictionary,
                entity_date_table_name=entity_date_table_name,
                matrix_uuid='my_uuid'
            )

            # compare each feature CSV against the expected dataframe
            for feature_csv_name, df in zip(sorted(features_csv_names), features_dfs):
                df = df.fillna(0)
                df = df.reset_index()

                result = pd.read_csv(feature_csv_name).reset_index()
                result['as_of_date'] = convert_string_column_to_date(result['as_of_date'])
                test = (result == df)
                assert(test.all().all())
def test_feature_generation():
    aggregate_config = [{
        'prefix': 'aprefix',
        'aggregates': [
            {'quantity': 'quantity_one', 'metrics': ['sum', 'count']},
        ],
        'categoricals': [
            {
                'column': 'cat_one',
                'choices': ['good', 'bad'],
                'metrics': ['sum']
            },
        ],
        'groups': ['entity_id'],
        'intervals': ['all'],
        'knowledge_date_column': 'knowledge_date',
        'from_obj': 'data'
    }]

    expected_output = {
        'aprefix_entity_id': [
            {
                'entity_id': 3,
                'as_of_date': date(2013, 9, 30),
                'aprefix_entity_id_all_quantity_one_sum': 342,
                'aprefix_entity_id_all_quantity_one_count': 1,
                'aprefix_entity_id_all_cat_one_good_sum': 0,
                'aprefix_entity_id_all_cat_one_bad_sum': 1
            },
            {
                'entity_id': 1,
                'as_of_date': date(2014, 9, 30),
                'aprefix_entity_id_all_quantity_one_sum': 10000,
                'aprefix_entity_id_all_quantity_one_count': 1,
                'aprefix_entity_id_all_cat_one_good_sum': 1,
                'aprefix_entity_id_all_cat_one_bad_sum': 0
            },
            {
                'entity_id': 3,
                'as_of_date': date(2014, 9, 30),
                'aprefix_entity_id_all_quantity_one_sum': 342,
                'aprefix_entity_id_all_quantity_one_count': 1,
                'aprefix_entity_id_all_cat_one_good_sum': 0,
                'aprefix_entity_id_all_cat_one_bad_sum': 1
            },
            {
                'entity_id': 4,
                'as_of_date': date(2014, 9, 30),
                'aprefix_entity_id_all_quantity_one_sum': 1236,
                'aprefix_entity_id_all_quantity_one_count': 1,
                'aprefix_entity_id_all_cat_one_good_sum': 0,
                'aprefix_entity_id_all_cat_one_bad_sum': 1
            },
        ]

    }

    with testing.postgresql.Postgresql() as postgresql:
        engine = create_engine(postgresql.url())
        setup_db(engine)

        features_schema_name = 'features'
        output_tables = FeatureGenerator(
            db_engine=engine,
            features_schema_name=features_schema_name
        ).create_all_tables(
            feature_dates=['2013-09-30', '2014-09-30'],
            feature_aggregation_config=aggregate_config,
        )

        for output_table in output_tables:
            records = pandas.read_sql(
                'select * from {}.{} order by as_of_date, entity_id'
                .format(features_schema_name, output_table),
                engine
            ).to_dict('records')
            assert records == expected_output[output_table]
 def setUpClass(cls):
     consumer.conn = psycopg2.connect(postgresql.url().replace(
         "test", "twitter_sentiment"))
     consumer.conn.set_isolation_level(
         psycopg2.extensions.ISOLATION_LEVEL_AUTOCOMMIT)
def basic_integration_test(
    state_filters,
    feature_group_create_rules,
    feature_group_mix_rules,
    expected_num_matrices
):
    with testing.postgresql.Postgresql() as postgresql:
        db_engine = create_engine(postgresql.url())
        Base.metadata.create_all(db_engine)
        populate_source_data(db_engine)

        with TemporaryDirectory() as temp_dir:
            chopper = Timechop(
                beginning_of_time=datetime(2010, 1, 1),
                modeling_start_time=datetime(2011, 1, 1),
                modeling_end_time=datetime(2014, 1, 1),
                update_window='1y',
                train_label_windows=['6months'],
                test_label_windows=['6months'],
                train_example_frequency='1day',
                test_example_frequency='3months',
                train_durations=['1months'],
                test_durations=['1months'],
            )

            state_table_generator = StateTableGenerator(
                db_engine=db_engine,
                experiment_hash='abcd'
            )

            label_generator = BinaryLabelGenerator(
                db_engine=db_engine,
                events_table='events'
            )

            feature_generator = FeatureGenerator(
                db_engine=db_engine,
                features_schema_name='features',
                replace=True,
            )

            feature_dictionary_creator = FeatureDictionaryCreator(
                db_engine=db_engine,
                features_schema_name='features'
            )

            feature_group_creator = FeatureGroupCreator(feature_group_create_rules)

            feature_group_mixer = FeatureGroupMixer(feature_group_mix_rules)

            planner = Planner(
                engine=db_engine,
                beginning_of_time=datetime(2010, 1, 1),
                label_names=['outcome'],
                label_types=['binary'],
                db_config={
                    'features_schema_name': 'features',
                    'labels_schema_name': 'public',
                    'labels_table_name': 'labels',
                    'sparse_state_table_name': 'tmp_sparse_states_abcd',
                },
                matrix_directory=os.path.join(temp_dir, 'matrices'),
                states=state_filters,
                user_metadata={},
                replace=True
            )

            # chop time
            split_definitions = chopper.chop_time()

            # generate as_of_times for feature/label/state generation
            all_as_of_times = []
            for split in split_definitions:
                all_as_of_times.extend(split['train_matrix']['as_of_times'])
                for test_matrix in split['test_matrices']:
                    all_as_of_times.extend(test_matrix['as_of_times'])
            all_as_of_times = list(set(all_as_of_times))

            # generate sparse state table
            state_table_generator.generate_sparse_table(
                dense_state_table='states',
                as_of_dates=all_as_of_times
            )

            # create labels table
            label_generator.generate_all_labels(
                labels_table='labels',
                as_of_dates=all_as_of_times,
                label_windows=['6months']
            )

            # create feature table tasks
            # we would use FeatureGenerator#create_all_tables but want to use
            # the tasks dict directly to create a feature dict
            feature_table_tasks = feature_generator.generate_all_table_tasks(
                feature_aggregation_config=[{
                    'prefix': 'cat',
                    'from_obj': 'cat_complaints',
                    'knowledge_date_column': 'as_of_date',
                    'aggregates': [{
                        'quantity': 'cat_sightings',
                        'metrics': ['count', 'avg'],
                    }],
                    'intervals': ['1y'],
                    'groups': ['entity_id']
                }, {
                    'prefix': 'dog',
                    'from_obj': 'dog_complaints',
                    'knowledge_date_column': 'as_of_date',
                    'aggregates': [{
                        'quantity': 'dog_sightings',
                        'metrics': ['count', 'avg'],
                    }],
                    'intervals': ['1y'],
                    'groups': ['entity_id']
                }],
                feature_dates=all_as_of_times,
            )

            # create feature tables
            feature_generator.process_table_tasks(feature_table_tasks)

            # build feature dictionaries from feature tables and
            # subsetting config
            master_feature_dict = feature_dictionary_creator\
                .feature_dictionary(feature_table_tasks.keys())

            feature_dicts = feature_group_mixer.generate(
                feature_group_creator.subsets(master_feature_dict)
            )

            # figure out what matrices need to be built
            _, matrix_build_tasks =\
                planner.generate_plans(
                    split_definitions,
                    feature_dicts
                )

            # go and build the matrices
            planner.build_all_matrices(matrix_build_tasks)

            # super basic assertion: did the matrices we expect get created?
            matrix_directory = os.path.join(temp_dir, 'matrices')
            matrices = [path for path in os.listdir(matrix_directory) if '.csv' in path]
            metadatas = [path for path in os.listdir(matrix_directory) if '.yaml' in path]
            assert len(matrices) == expected_num_matrices
            assert len(metadatas) == expected_num_matrices
def run_import(url, **kwargs):
    db = Database(url=postgresql.url())
    db.create()
    database_load(os.path.join(root_dir, 'data/performance-dataset.zip'), db_url=url, **kwargs)
    db.drop_all()

def unzip_file(file_path):
    temp_dir = tempfile.mkdtemp()
    with closing(zipfile.ZipFile(file_path)) as z:
        z.extractall(temp_dir)
    return temp_dir


with testing.postgresql.Postgresql() as postgresql:
    db = Database(url=postgresql.url())
    db.engine.execute('create extension postgis;')
    db.engine.execute('create extension postgis_topology;')

    run_x = 3
    start_time = time.time()

    batch_size = config

    #batch_sizes = [1000, 5000, 10000]
    batch_sizes = [10000]
    db_threads = [1]#,5,10]

    result_list = []
    for batch_size in batch_sizes:
        for db_th in db_threads:
 def test_thresholder_2014_threshold(self):
     with testing.postgresql.Postgresql() as postgresql:
         engine = create_engine(postgresql.url())
         thresholder = self.setup_data(engine)
         assert thresholder.model_groups_past_threshold(
             self.dataframe_as_of(thresholder, "2014-01-01")) == set([1])
async def test_allocate_parking_lot_to_user_fail(event_loop):
    with Postgresql() as postgresql:
        db = await DbAccess.create(postgresql.url(),
                                   event_loop,
                                   reset_tables=True)
        assert await db.allocate_parking_lot("test_user", 1) is False
 def test_thresholder_2015_close(self):
     with testing.postgresql.Postgresql() as postgresql:
         engine = create_engine(postgresql.url())
         thresholder = self.setup_data(engine)
         assert thresholder.model_groups_close_to_best_case(
             self.dataframe_as_of(thresholder, "2015-01-01")) == set([2])
def test_integration():
    with testing.postgresql.Postgresql() as postgresql:
        db_engine = create_engine(postgresql.url())
        ensure_db(db_engine)
        init_engine(db_engine)

        with mock_s3():
            s3_conn = boto3.resource('s3')
            s3_conn.create_bucket(Bucket='econ-dev')
            project_path = 'econ-dev/inspections'

            # create train and test matrices
            train_matrix = pandas.DataFrame.from_dict({
                'entity_id': [1, 2],
                'feature_one': [3, 4],
                'feature_two': [5, 6],
                'label': [7, 8]
            }).set_index('entity_id')
            train_metadata = {
                'feature_start_time': datetime.date(2012, 12, 20),
                'end_time': datetime.date(2016, 12, 20),
                'label_name': 'label',
                'label_timespan': '1y',
                'feature_names': ['ft1', 'ft2'],
                'metta-uuid': '1234',
                'indices': ['entity_id'],
                'matrix_type': 'train'
            }
            # Creates a matrix entry in the matrices table with uuid from train_metadata
            MatrixFactory(matrix_uuid = "1234")
            session.commit()

            train_store = InMemoryMatrixStore(train_matrix, sample_metadata())

            as_of_dates = [
                datetime.date(2016, 12, 21),
                datetime.date(2017, 1, 21)
            ]

            test_stores = [
                InMemoryMatrixStore(
                    pandas.DataFrame.from_dict({
                        'entity_id': [3],
                        'feature_one': [8],
                        'feature_two': [5],
                        'label': [5]
                    }),
                    {
                        'label_name': 'label',
                        'label_timespan': '1y',
                        'end_time': as_of_date,
                        'metta-uuid': '1234',
                        'indices': ['entity_id'],
                        'matrix_type': 'test',
                        'as_of_date_frequency': '1month'
                    }
                )
                for as_of_date in as_of_dates
            ]

            model_storage_engine = S3ModelStorageEngine(project_path)

            experiment_hash = save_experiment_and_get_hash({}, db_engine)
            # instantiate pipeline objects
            trainer = ModelTrainer(
                project_path=project_path,
                experiment_hash=experiment_hash,
                model_storage_engine=model_storage_engine,
                db_engine=db_engine,
            )
            predictor = Predictor(
                project_path,
                model_storage_engine,
                db_engine
            )
            model_evaluator = ModelEvaluator(
                [{'metrics': ['precision@'], 'thresholds': {'top_n': [5]}}],
                [{}],
                db_engine
            )

            # run the pipeline
            grid_config = {
                'sklearn.linear_model.LogisticRegression': {
                    'C': [0.00001, 0.0001],
                    'penalty': ['l1', 'l2'],
                    'random_state': [2193]
                }
            }
            model_ids = trainer.train_models(
                grid_config=grid_config,
                misc_db_parameters=dict(),
                matrix_store=train_store
            )

            for model_id in model_ids:
                for as_of_date, test_store in zip(as_of_dates, test_stores):
                    predictions_proba = predictor.predict(
                        model_id,
                        test_store,
                        misc_db_parameters=dict(),
                        train_matrix_columns=['feature_one', 'feature_two']
                    )

                    model_evaluator.evaluate(
                        predictions_proba,
                        test_store,
                        model_id,
                    )

            # assert
            # 1. that the predictions table entries are present and
            # can be linked to the original models
            records = [
                row for row in
                db_engine.execute('''select entity_id, model_id, as_of_date
                from test_results.predictions
                join model_metadata.models using (model_id)
                order by 3, 2''')
            ]
            assert records == [
                (3, 1, datetime.datetime(2016, 12, 21)),
                (3, 2, datetime.datetime(2016, 12, 21)),
                (3, 3, datetime.datetime(2016, 12, 21)),
                (3, 4, datetime.datetime(2016, 12, 21)),
                (3, 1, datetime.datetime(2017, 1, 21)),
                (3, 2, datetime.datetime(2017, 1, 21)),
                (3, 3, datetime.datetime(2017, 1, 21)),
                (3, 4, datetime.datetime(2017, 1, 21)),
            ]

            # that evaluations are there
            records = [
                row for row in
                db_engine.execute('''
                    select model_id, evaluation_start_time, metric, parameter
                    from test_results.evaluations order by 2, 1''')
            ]
            assert records == [
                (1, datetime.datetime(2016, 12, 21), 'precision@', '5_abs'),
                (2, datetime.datetime(2016, 12, 21), 'precision@', '5_abs'),
                (3, datetime.datetime(2016, 12, 21), 'precision@', '5_abs'),
                (4, datetime.datetime(2016, 12, 21), 'precision@', '5_abs'),
                (1, datetime.datetime(2017, 1, 21), 'precision@', '5_abs'),
                (2, datetime.datetime(2017, 1, 21), 'precision@', '5_abs'),
                (3, datetime.datetime(2017, 1, 21), 'precision@', '5_abs'),
                (4, datetime.datetime(2017, 1, 21), 'precision@', '5_abs'),
            ]
def test_model_scoring_early_warning():
    with testing.postgresql.Postgresql() as postgresql:
        db_engine = create_engine(postgresql.url())
        ensure_db(db_engine)
        metric_groups = [{
            'metrics': [
                'precision@', 'recall@', 'true positives@', 'true negatives@',
                'false positives@', 'false negatives@'
            ],
            'thresholds': {
                'percentiles': [5.0, 10.0],
                'top_n': [5, 10]
            }
        }, {
            'metrics': [
                'f1', 'mediocre', 'accuracy', 'roc_auc',
                'average precision score'
            ],
        }, {
            'metrics': ['fbeta@'],
            'parameters': [{
                'beta': 0.75
            }, {
                'beta': 1.25
            }]
        }]

        custom_metrics = {'mediocre': always_half}

        model_scorer = ModelScorer(metric_groups,
                                   db_engine,
                                   custom_metrics=custom_metrics)

        trained_model, model_id = fake_trained_model(
            'myproject', InMemoryModelStorageEngine('myproject'), db_engine)

        labels = fake_labels(5)
        as_of_date = datetime.date(2016, 5, 5)
        model_scorer.score(
            trained_model.predict_proba(labels)[:, 1], labels, model_id,
            as_of_date, as_of_date, '1y')

        # assert
        # that all of the records are there
        records = [
            row[0] for row in db_engine.execute(
                '''select distinct(metric || parameter)
                from results.evaluations
                where model_id = %s and
                evaluation_start_time = %s order by 1''', (model_id,
                                                           as_of_date))
        ]
        assert records == [
            'accuracy', 'average precision score', 'f1',
            'false negatives@10.0_pct', 'false negatives@10_abs',
            'false negatives@5.0_pct', 'false negatives@5_abs',
            'false positives@10.0_pct', 'false positives@10_abs',
            'false positives@5.0_pct', 'false positives@5_abs',
            'fbeta@0.75_beta', 'fbeta@1.25_beta', 'mediocre',
            'precision@10.0_pct', 'precision@10_abs', 'precision@5.0_pct',
            'precision@5_abs', 'recall@10.0_pct', 'recall@10_abs',
            'recall@5.0_pct', 'recall@5_abs', 'roc_auc',
            'true negatives@10.0_pct', 'true negatives@10_abs',
            'true negatives@5.0_pct', 'true negatives@5_abs',
            'true positives@10.0_pct', 'true positives@10_abs',
            'true positives@5.0_pct', 'true positives@5_abs'
        ]
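# `always_half` above is the custom metric registered under the name
# 'mediocre'; it is defined elsewhere in the test module. The intent is a
# metric that returns 0.5 regardless of input so it is easy to spot in the
# evaluations table. A sketch of such a stub (the argument list is an
# assumption about the scorer's custom-metric signature, not confirmed by
# this snippet):
#
# def always_half(predictions_proba, predictions_binary, labels, parameters):
#     return 0.5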
def test_predictor_retrieve():
    with testing.postgresql.Postgresql() as postgresql:
        db_engine = create_engine(postgresql.url())
        ensure_db(db_engine)
        project_path = 'econ-dev/inspections'
        model_storage_engine = InMemoryModelStorageEngine(project_path)
        _, model_id = \
            fake_trained_model(project_path, model_storage_engine, db_engine)
        predictor = Predictor(project_path,
                              model_storage_engine,
                              db_engine,
                              replace=False)
        dayone = datetime.date(2011, 1,
                               1).strftime(predictor.expected_matrix_ts_format)
        daytwo = datetime.date(2011, 1,
                               2).strftime(predictor.expected_matrix_ts_format)
        # create prediction set
        matrix_data = {
            'entity_id': [1, 2, 1, 2],
            'as_of_date': [dayone, dayone, daytwo, daytwo],
            'feature_one': [3, 4, 5, 6],
            'feature_two': [5, 6, 7, 8],
            'label': [7, 8, 8, 7]
        }
        matrix = pandas.DataFrame.from_dict(matrix_data)\
            .set_index(['entity_id', 'as_of_date'])
        metadata = {
            'label_name': 'label',
            'end_time': AS_OF_DATE,
            'label_timespan': '3month',
            'metta-uuid': '1234',
            'indices': ['entity_id'],
        }
        matrix_store = InMemoryMatrixStore(matrix, metadata)
        predict_proba = predictor.predict(
            model_id,
            matrix_store,
            misc_db_parameters=dict(),
            train_matrix_columns=['feature_one', 'feature_two'])

        # When run again, the predictions retrieved from the database
        # should match.
        #
        # Some trickiness here. Let's explain:
        #
        # If we are not careful, retrieving predictions from the database and
        # presenting them as a numpy array can result in a bad ordering,
        # since the given matrix may not be 'ordered' by some criteria
        # that can be easily represented by an ORDER BY clause.
        #
        # It will sometimes work, because without ORDER BY you will get
        # it back in the table's physical order, which unless something has
        # happened to the table will be the order you inserted it,
        # which could very well be the order in the matrix.
        # So it's not a bug that would necessarily immediately show itself,
        # but when it does go wrong your scores will be garbage.
        #
        # So we simulate a table order mutation that can happen over time:
        # Remove the first row and put it at the end.
        # If the Predictor doesn't explicitly reorder the results, this will fail
        session = sessionmaker(bind=db_engine)()
        obj = session.query(Prediction).first()
        session.delete(obj)
        session.commit()

        make_transient(obj)
        session = sessionmaker(bind=db_engine)()
        session.add(obj)
        session.commit()

        predictor.load_model = Mock()
        new_predict_proba = predictor.predict(
            model_id,
            matrix_store,
            misc_db_parameters=dict(),
            train_matrix_columns=['feature_one', 'feature_two'])
        assert_array_equal(new_predict_proba, predict_proba)
        assert not predictor.load_model.called
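# The long comment in test_predictor_retrieve above is the heart of that test:
# cached predictions must be re-aligned to the matrix index rather than
# trusted to come back in insertion order. A sketch of an order-insensitive
# retrieval (illustrative query, not the Predictor's internal one; the 'score'
# column name is an assumption):
#
#   cached = pandas.read_sql(
#       '''select entity_id, as_of_date, score
#          from results.predictions
#          where model_id = %(model_id)s
#          order by entity_id, as_of_date''',
#       db_engine, params={'model_id': model_id})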
def test_make_entity_date_table():
    """ Test that the make_entity_date_table function contains the correct
    values.
    """
    dates = [datetime.datetime(2016, 1, 1, 0, 0),
             datetime.datetime(2016, 2, 1, 0, 0),
             datetime.datetime(2016, 3, 1, 0, 0)]

    # make a dataframe of entity ids and dates to test against
    ids_dates = create_entity_date_df(
        labels=labels,
        states=states,
        as_of_dates=dates,
        state_one=True,
        state_two=True,
        label_name='booking',
        label_type='binary',
        label_window='1 month'
    )

    with testing.postgresql.Postgresql() as postgresql:
        # create an engine and generate a table with fake feature data
        engine = create_engine(postgresql.url())
        create_schemas(
            engine=engine,
            features_tables=features_tables,
            labels=labels,
            states=states
        )

        with TemporaryDirectory() as temp_dir:
            planner = Planner(
                beginning_of_time = datetime.datetime(2010, 1, 1, 0, 0),
                label_names = ['booking'],
                label_types = ['binary'],
                states = ['state_one AND state_two'],
                db_config = db_config,
                matrix_directory = temp_dir,
                user_metadata = {},
                engine = engine
            )
            engine.execute(
                'CREATE TABLE features.tmp_entity_date (a int, b date);'
            )
            # call the function to test the creation of the table
            entity_date_table_name = planner.builder.make_entity_date_table(
                as_of_times=dates,
                label_type='binary',
                label_name='booking',
                state='state_one AND state_two',
                matrix_uuid='my_uuid',
                matrix_type='train',
                label_window='1 month'
            )

            # read in the table
            result = pd.read_sql(
                "select * from features.{} order by entity_id, as_of_date".format(entity_date_table_name),
                engine
            )
            labels_df = pd.read_sql('select * from labels.labels', engine)

            # compare the table to the test dataframe
            print("ids_dates")
            for i, row in ids_dates.iterrows():
                print(row.values)
            print("result")
            for i, row in result.iterrows():
                print(row.values)
            test = (result == ids_dates)
            print(test)
            assert(test.all().all())
def test_feature_dictionary_creator():
    with testing.postgresql.Postgresql() as postgresql:
        engine = create_engine(postgresql.url())
        engine.execute('create schema features')
        engine.execute('''
            create table features.prefix1_entity_id (
                entity_id int,
                as_of_date date,
                feature_one float,
                feature_two float
            )
        ''')
        engine.execute('''
            create table features.prefix1_zipcode (
                zipcode text,
                as_of_date date,
                feature_three float,
                feature_four float
            )
        ''')
        engine.execute('''
            create table features.prefix1_aggregation (
                entity_id int,
                as_of_date date,
                zipcode text,
                feature_one float,
                feature_two float,
                feature_three float,
                feature_four float
            )
        ''')
        engine.execute('''
            create table features.prefix1_aggregation_imputed (
                entity_id int,
                as_of_date date,
                zipcode text,
                feature_one float,
                feature_two float,
                feature_three float,
                feature_three_imp int,
                feature_four float
            )
        ''')
        engine.execute('''
            create table features.random_other_table (
                another_column float
            )
        ''')

        creator = FeatureDictionaryCreator(
            features_schema_name='features',
            db_engine=engine
        )
        feature_dictionary = creator.feature_dictionary(
            feature_table_names=[
                'prefix1_entity_id', 'prefix1_zipcode',
                'prefix1_aggregation', 'prefix1_aggregation_imputed'
                ],
            index_column_lookup={
                'prefix1_aggregation_imputed': ['entity_id', 'zipcode', 'as_of_date']
            }
        )
        assert feature_dictionary == {
            'prefix1_aggregation_imputed': [
                'feature_one', 'feature_two', 'feature_three', 'feature_three_imp', 'feature_four'
                ],
        }
def test_write_labels_data():
    """ Test the write_labels_data function by checking whether the query
    produces the correct labels
    """
    # set up labeling config variables
    dates = [datetime.datetime(2016, 1, 1, 0, 0),
             datetime.datetime(2016, 2, 1, 0, 0)]
    label_name = 'booking'
    label_type = 'binary'


    # make a dataframe of labels to test against
    labels_df = pd.DataFrame(
        labels,
        columns = [
            'entity_id',
            'as_of_date',
            'label_window',
            'label_name',
            'label_type',
            'label'
        ]
    )

    labels_df['as_of_date'] = convert_string_column_to_date(labels_df['as_of_date'])
    labels_df = labels_df.set_index(['entity_id', 'as_of_date'])

    # create an engine and generate a table with fake feature data
    with testing.postgresql.Postgresql() as postgresql:
        engine = create_engine(postgresql.url())
        create_schemas(
            engine,
            features_tables,
            labels,
            states
        )
        with TemporaryDirectory() as temp_dir:
            planner = Planner(
                beginning_of_time = datetime.datetime(2010, 1, 1, 0, 0),
                label_names = ['booking'],
                label_types = ['binary'],
                states = ['state_one AND state_two'],
                db_config = db_config,
                matrix_directory = temp_dir,
                user_metadata = {},
                engine = engine,
                builder_class=builders.LowMemoryCSVBuilder
            )

            # make the entity-date table
            entity_date_table_name = planner.builder.make_entity_date_table(
                as_of_times=dates,
                label_type='binary',
                label_name='booking',
                state = 'state_one AND state_two',
                matrix_type='train',
                matrix_uuid='my_uuid',
                label_window='1 month'
            )

            csv_filename = planner.builder.write_labels_data(
                label_name=label_name,
                label_type=label_type,
                label_window='1 month',
                matrix_uuid='my_uuid',
                entity_date_table_name=entity_date_table_name,
            )
            df = pd.DataFrame.from_dict({
                'entity_id': [2, 3, 4, 4],
                'as_of_date': ['2016-02-01', '2016-02-01', '2016-01-01', '2016-02-01'],
                'booking': [0, 0, 1, 0],
            }).set_index(['entity_id', 'as_of_date'])

            result = pd.read_csv(csv_filename).set_index(['entity_id', 'as_of_date'])
            test = (result == df)
            assert(test.all().all())
def test_baselines_with_missing_features(experiment_class):
    with testing.postgresql.Postgresql() as postgresql:
        db_engine = create_engine(postgresql.url())
        ensure_db(db_engine)
        populate_source_data(db_engine)

        # set up the config with the baseline model and feature group mixing
        config = sample_config()
        config['grid_config'] = {
            'triage.component.catwalk.baselines.rankers.PercentileRankOneFeature':
            {
                'feature':
                ['entity_features_entity_id_1year_cat_sightings_count']
            }
        }
        config['feature_group_definition'] = {
            'tables': [
                'entity_features_aggregation_imputed',
                'zip_code_features_aggregation_imputed'
            ]
        }
        config['feature_group_strategies'] = ['leave-one-in']
        with TemporaryDirectory() as temp_dir:
            experiment_class(config=config,
                             db_engine=db_engine,
                             model_storage_class=FSModelStorageEngine,
                             project_path=os.path.join(temp_dir,
                                                       'inspections')).run()

        # assert
        # 1. that model groups entries are present
        num_mgs = len([
            row for row in db_engine.execute(
                'select * from model_metadata.model_groups')
        ])
        assert num_mgs > 0

        # 2. that model entries are present, and linked to model groups
        num_models = len([
            row for row in db_engine.execute('''
                select * from model_metadata.model_groups
                join model_metadata.models using (model_group_id)
                where model_comment = 'test2-final-final'
            ''')
        ])
        assert num_models > 0

        # 3. predictions, linked to models
        num_predictions = len([
            row for row in db_engine.execute('''
                select * from test_results.predictions
                join model_metadata.models using (model_id)''')
        ])
        assert num_predictions > 0

        # 4. evaluations linked to predictions linked to models
        num_evaluations = len([
            row for row in db_engine.execute('''
                select * from test_results.evaluations e
                join model_metadata.models using (model_id)
                join test_results.predictions p on (
                    e.model_id = p.model_id and
                    e.evaluation_start_time <= p.as_of_date and
                    e.evaluation_end_time >= p.as_of_date)
            ''')
        ])
        assert num_evaluations > 0

        # 5. experiment
        num_experiments = len([
            row for row in db_engine.execute(
                'select * from model_metadata.experiments')
        ])
        assert num_experiments == 1

        # 6. that models are linked to experiments
        num_models_with_experiment = len([
            row for row in db_engine.execute('''
                select * from model_metadata.experiments
                join model_metadata.models using (experiment_hash)
            ''')
        ])
        assert num_models == num_models_with_experiment

        # 7. that models have the train end date and label timespan
        results = [(model['train_end_time'], model['training_label_timespan'])
                   for model in db_engine.execute(
                       'select * from model_metadata.models')]
        assert sorted(set(results)) == [
            (datetime(2012, 6, 1), timedelta(180)),
            (datetime(2013, 6, 1), timedelta(180)),
        ]

        # 8. that the right number of individual importances are present
        individual_importances = [
            row for row in db_engine.execute('''
            select * from test_results.individual_importances
            join model_metadata.models using (model_id)
        ''')
        ]
        assert len(individual_importances) == num_predictions * 2  # only 2 features
def test_simple_experiment(experiment_class):
    with testing.postgresql.Postgresql() as postgresql:
        db_engine = create_engine(postgresql.url())
        ensure_db(db_engine)
        populate_source_data(db_engine)
        with TemporaryDirectory() as temp_dir:
            experiment_class(
                config=sample_config(),
                db_engine=db_engine,
                model_storage_class=FSModelStorageEngine,
                project_path=os.path.join(temp_dir, 'inspections'),
                cleanup=True,
            ).run()

        # assert
        # 1. that model groups entries are present
        num_mgs = len([
            row for row in db_engine.execute(
                'select * from model_metadata.model_groups')
        ])
        assert num_mgs > 0

        # 2. that model entries are present, and linked to model groups
        num_models = len([
            row for row in db_engine.execute('''
                select * from model_metadata.model_groups
                join model_metadata.models using (model_group_id)
                where model_comment = 'test2-final-final'
            ''')
        ])
        assert num_models > 0

        # 3. predictions, linked to models, for both the train and test sets
        for set_type in ("train", "test"):
            num_predictions = len([
                row for row in db_engine.execute('''
                    select * from {}_results.predictions
                    join model_metadata.models using (model_id)'''.format(set_type))
            ])
            assert num_predictions > 0

        # 4. evaluations linked to predictions linked to models, for training and testing
        for set_type in ("train", "test"):
            num_evaluations = len([
                row for row in db_engine.execute('''
                    select * from {}_results.evaluations e
                    join model_metadata.models using (model_id)
                    join {}_results.predictions p on (
                        e.model_id = p.model_id and
                        e.evaluation_start_time <= p.as_of_date and
                        e.evaluation_end_time >= p.as_of_date)
                '''.format(set_type, set_type))
            ])
            assert num_evaluations > 0

        # 5. experiment
        num_experiments = len([
            row for row in db_engine.execute(
                'select * from model_metadata.experiments')
        ])
        assert num_experiments == 1

        # 6. that models are linked to experiments
        num_models_with_experiment = len([
            row for row in db_engine.execute('''
                select * from model_metadata.experiments
                join model_metadata.models using (experiment_hash)
            ''')
        ])
        assert num_models == num_models_with_experiment

        # 7. that models have the train end date and label timespan
        results = [(model['train_end_time'], model['training_label_timespan'])
                   for model in db_engine.execute(
                       'select * from model_metadata.models')]
        assert sorted(set(results)) == [
            (datetime(2012, 6, 1), timedelta(180)),
            (datetime(2013, 6, 1), timedelta(180)),
        ]

        # 8. that the right number of individual importances are present
        # (num_predictions is the test-set count left over from the loop above)
        individual_importances = [
            row for row in db_engine.execute('''
            select * from test_results.individual_importances
            join model_metadata.models using (model_id)
        ''')
        ]
        assert len(individual_importances) == num_predictions * 2  # only 2 features

        # 9. that the proper matrices were created and stored
        matrices = [
            row for row in db_engine.execute(
                'select matrix_type, num_observations from model_metadata.matrices')
        ]
        types = [i[0] for i in matrices]
        counts = [i[1] for i in matrices]
        assert types.count('train') == 2
        assert types.count('test') == 2
        for i in counts:
            assert i > 0
        assert len(matrices) == 4
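        # Two train and two test matrices presumably correspond to the two
        # train_end_times asserted in step 7: one train/test pair per split.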
def test_simple_model_trainer():
    with testing.postgresql.Postgresql() as postgresql:
        engine = create_engine(postgresql.url())
        ensure_db(engine)

        model_config = {
            'sklearn.linear_model.LogisticRegression': {
                'C': [0.00001, 0.0001],
                'penalty': ['l1', 'l2'],
                'random_state': [2193]
            }
        }

        with mock_s3():
            s3_conn = boto3.resource('s3')
            s3_conn.create_bucket(Bucket='econ-dev')

            # create training set
            with fake_metta({
                'entity_id': [1, 2],
                'feature_one': [3, 4],
                'feature_two': [5, 6],
                'label': ['good', 'bad']
            }, {'label_name': 'label'}) as (matrix_path, metadata_path):

                trainer = SimpleModelTrainer(
                    training_set_path=matrix_path,
                    training_metadata_path=metadata_path,
                    model_config=model_config,
                    project_path='econ-dev/inspections',
                    s3_conn=s3_conn,
                    db_engine=engine
                )
                cache_keys = trainer.train_models()

                # assert
                # 1. that all four models are cached
                model_pickles = [
                    pickle.loads(cache_key.get()['Body'].read())
                    for cache_key in cache_keys
                ]
                assert len(model_pickles) == 4
                assert len([x for x in model_pickles if x is not None]) == 4

                # 2. that their results can have predictions made on it
                test_matrix = pandas.DataFrame.from_dict({
                    'entity_id': [3, 4],
                    'feature_one': [4, 4],
                    'feature_two': [6, 5],
                })
                for model_pickle in model_pickles:
                    predictions = model_pickle.predict(test_matrix)
                    assert len(predictions) == 2

                # 3. that the models table entries are present
                records = [
                    row for row in
                    engine.execute('select * from results.models')
                ]
                assert len(records) == 4

                records = [
                    row for row in
                    engine.execute('select * from results.feature_importances')
                ]
                assert len(records) == 4 * 3  # maybe exclude entity_id?
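# Note on the example above: mock_s3 presumably comes from the moto library
# (which fakes the boto3 S3 API in-process), and fake_metta is a project-specific
# helper that writes a temporary matrix/metadata pair to disk.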
def run_import(url, **kwargs):
    # create the schema at the given database URL, load the fixture dataset,
    # then drop everything again
    db = Database(url=url)
    db.create()
    database_load(os.path.join(root_dir, 'data/performance-dataset.zip'), db_url=url, **kwargs)
    db.drop_all()
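# A minimal usage sketch (hypothetical, not part of the original code): run_import
# needs the URL of a live PostgreSQL instance, so a throwaway server from
# testing.postgresql fits the pattern used by the other examples here.
def test_run_import():
    with testing.postgresql.Postgresql() as postgresql:
        run_import(postgresql.url())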
def test_replace():
    aggregate_config = [{
        "prefix": "aprefix",
        "aggregates_imputation": {
            "all": {"type": "mean"}
        },
        "aggregates": [{
            "quantity": "quantity_one",
            "metrics": ["sum", "count"]
        }],
        "categoricals": [{
            "column": "cat_one",
            "choices": ["good", "bad"],
            "metrics": ["sum"],
            "imputation": {
                "all": {"type": "null_category"}
            },
        }],
        "groups": ["entity_id"],
        "intervals": ["all"],
        "knowledge_date_column": "knowledge_date",
        "from_obj": "data",
    }]

    with testing.postgresql.Postgresql() as postgresql:
        engine = create_engine(postgresql.url())
        setup_db(engine)

        features_schema_name = "features"
        feature_tables = FeatureGenerator(
            db_engine=engine,
            features_schema_name=features_schema_name,
            replace=False).create_all_tables(
                feature_dates=["2013-09-30", "2014-09-30"],
                feature_aggregation_config=aggregate_config,
                state_table="states",
            )

        assert len(feature_tables) == 1
        assert list(feature_tables)[0] == "aprefix_aggregation_imputed"

        feature_generator = FeatureGenerator(
            db_engine=engine,
            features_schema_name=features_schema_name,
            replace=False)
        aggregations = feature_generator.aggregations(
            feature_dates=["2013-09-30", "2014-09-30"],
            feature_aggregation_config=aggregate_config,
            state_table="states",
        )
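
        # With replace=False and the feature tables already populated by the first
        # create_all_tables run above, generate_all_table_tasks is expected to
        # return empty task dicts for both aggregation and imputation.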
        table_tasks = feature_generator.generate_all_table_tasks(
            aggregations, task_type="aggregation")

        assert len(table_tasks["aprefix_entity_id"].keys()) == 0

        imp_tasks = feature_generator.generate_all_table_tasks(
            aggregations, task_type="imputation")

        assert len(imp_tasks["aprefix_aggregation_imputed"].keys()) == 0

        engine.dispose()