def test_n_jobs_not_new_model():
    grid_config = {
        'sklearn.ensemble.AdaBoostClassifier': {
            'n_estimators': [10, 100, 1000]
        },
        'sklearn.ensemble.RandomForestClassifier': {
            'n_estimators': [10, 100],
            'max_features': ['sqrt', 'log2'],
            'max_depth': [5, 10, 15, 20],
            'criterion': ['gini', 'entropy'],
            'n_jobs': [12, 24],
        }
    }
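    # n_jobs is a runtime setting rather than a modeling hyperparameter, so the
    # trainer should keep it in each task's parameters but leave it out when
    # deciding what counts as a distinct model group (asserted below)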

    with testing.postgresql.Postgresql() as postgresql:
        engine = create_engine(postgresql.url())
        ensure_db(engine)
        with mock_s3():
            s3_conn = boto3.resource('s3')
            s3_conn.create_bucket(Bucket='econ-dev')
            trainer = ModelTrainer(
                project_path='econ-dev/inspections',
                experiment_hash=None,
                model_storage_engine=S3ModelStorageEngine(s3_conn, 'econ-dev/inspections'),
                db_engine=engine,
                model_group_keys=['label_name', 'label_timespan']
            )

            matrix = pandas.DataFrame.from_dict({
                'entity_id': [1, 2],
                'feature_one': [3, 4],
                'feature_two': [5, 6],
                'label': ['good', 'bad']
            })
            train_tasks = trainer.generate_train_tasks(
                grid_config,
                dict(),
                InMemoryMatrixStore(matrix, {
                    'label_timespan': '1d',
                    'end_time': datetime.datetime.now(),
                    'feature_start_time': datetime.date(2012, 12, 20),
                    'label_name': 'label',
                    'metta-uuid': '1234',
                    'feature_names': ['ft1', 'ft2'],
                    'indices': ['entity_id'],
                })
            )
            assert len(train_tasks) == 35  # 32 RF combos (2*2*4*2) + 3 AdaBoost; would be (32*2)+3 if duplicate n_jobs variants weren't removed
            assert len([
                task for task in train_tasks
                if 'n_jobs' in task['parameters']
            ]) == 32

            for train_task in train_tasks:
                trainer.process_train_task(**train_task)

            for row in engine.execute(
                'select model_parameters from results.model_groups'
            ):
                assert 'n_jobs' not in row[0]
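
# NOTE: the examples on this page omit their import headers. A representative
# preamble is sketched below. The third-party imports follow directly from
# usage in these tests; the project-local paths at the end are assumptions
# based on the catwalk/audition project layout and should be verified against
# the actual repository.
import datetime
import pickle
import tempfile

import boto3
import factory
import numpy
import pandas
import sqlalchemy
import testing.postgresql
import yaml
from mock import Mock, patch
from moto import mock_s3
from numpy.testing import assert_array_equal
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
from sqlalchemy.orm.session import make_transient

# project-local imports (module paths are assumptions, not confirmed):
# from catwalk.db import ensure_db
# from catwalk.storage import (InMemoryMatrixStore, InMemoryModelStorageEngine,
#                              S3ModelStorageEngine)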
Example #2
    def test_retry_recovery(self):
        grid_config = {
            'sklearn.ensemble.AdaBoostClassifier': {
                'n_estimators': [10]
            },
        }

        engine = None
        trainer = None
        port = None
        with testing.postgresql.Postgresql() as postgresql:
            port = postgresql.settings['port']
            engine = create_engine(postgresql.url())
            ensure_db(engine)
            trainer = ModelTrainer(
                project_path='econ-dev/inspections',
                experiment_hash=None,
                model_storage_engine=InMemoryModelStorageEngine(
                    project_path=''),
                db_engine=engine,
                model_group_keys=['label_name', 'label_window'])

            matrix = pandas.DataFrame.from_dict({
                'entity_id': [1, 2],
                'feature_one': [3, 4],
                'feature_two': [5, 6],
                'label': ['good', 'bad']
            })
            matrix_store = InMemoryMatrixStore(
                matrix, {
                    'label_window': '1d',
                    'end_time': datetime.datetime.now(),
                    'beginning_of_time': datetime.date(2012, 12, 20),
                    'label_name': 'label',
                    'metta-uuid': '1234',
                    'feature_names': ['ft1', 'ft2']
                })

        # the database server from the with-block above has shut down by now,
        # so train_models will fail and retry; the patched time.sleep below
        # brings a replacement server back up on the same port after the first
        # retry sleep. keep a reference on self so the new server doesn't go
        # out of scope too early and shut down.
        self.new_server = None

        def replace_db(arg):
            self.new_server = testing.postgresql.Postgresql(port=port)
            engine = create_engine(self.new_server.url())
            ensure_db(engine)

        with patch('time.sleep') as time_mock:
            time_mock.side_effect = replace_db
            try:
                trainer.train_models(grid_config, dict(), matrix_store)
            finally:
                if self.new_server is not None:
                    self.new_server.stop()
            assert len(time_mock.mock_calls) == 1  # exactly one retry sleep before the restored server let training succeed
Example #3
def test_save_experiment_and_get_hash():
    # no reason to make assertions on the config itself, use a basic dict
    experiment_config = {'one': 'two'}
    with testing.postgresql.Postgresql() as postgresql:
        engine = create_engine(postgresql.url())
        ensure_db(engine)
        exp_hash = save_experiment_and_get_hash(experiment_config, engine)
        assert isinstance(exp_hash, str)
        new_hash = save_experiment_and_get_hash(experiment_config, engine)
        assert new_hash == exp_hash
Example #4
    def test_retry_max(self):
        grid_config = {
            'sklearn.ensemble.AdaBoostClassifier': {
                'n_estimators': [10]
            },
        }

        engine = None
        trainer = None
        # set up a basic model training run
        # TODO abstract the setup of a basic model training run where
        # we don't worry about the specific values used? it would make
        # tests like this require a bit less noise to read past
        with testing.postgresql.Postgresql() as postgresql:
            engine = create_engine(postgresql.url())
            ensure_db(engine)
            trainer = ModelTrainer(
                project_path='econ-dev/inspections',
                experiment_hash=None,
                model_storage_engine=InMemoryModelStorageEngine(project_path=''),
                db_engine=engine,
                model_group_keys=['label_name', 'label_timespan']
            )

            matrix = pandas.DataFrame.from_dict({
                'entity_id': [1, 2],
                'feature_one': [3, 4],
                'feature_two': [5, 6],
                'label': ['good', 'bad']
            })
            matrix_store = InMemoryMatrixStore(matrix, {
                'label_timespan': '1d',
                'end_time': datetime.datetime.now(),
                'feature_start_time': datetime.date(2012, 12, 20),
                'label_name': 'label',
                'metta-uuid': '1234',
                'feature_names': ['ft1', 'ft2'],
                'indices': ['entity_id'],
            })
        # the postgres server goes out of scope here and thus no longer exists
        with patch('time.sleep') as time_mock:
            with self.assertRaises(sqlalchemy.exc.OperationalError):
                trainer.train_models(grid_config, dict(), matrix_store)
            # we want to make sure that we are using the retrying module sanely
            # as opposed to matching the exact # of calls specified by the code
            assert len(time_mock.mock_calls) > 5
Example #5
def test_predictor_composite_index():
    with testing.postgresql.Postgresql() as postgresql:
        db_engine = create_engine(postgresql.url())
        ensure_db(db_engine)
        project_path = 'econ-dev/inspections'
        model_storage_engine = InMemoryModelStorageEngine(project_path)
        _, model_id = \
            fake_trained_model(project_path, model_storage_engine, db_engine)
        predictor = Predictor(project_path, model_storage_engine, db_engine)
        dayone = datetime.datetime(2011, 1, 1)
        daytwo = datetime.datetime(2011, 1, 2)
        # create prediction set
        matrix = pandas.DataFrame.from_dict({
            'entity_id': [1, 2, 1, 2],
            'as_of_date': [dayone, dayone, daytwo, daytwo],
            'feature_one': [3, 4, 5, 6],
            'feature_two': [5, 6, 7, 8],
            'label': [7, 8, 8, 7]
        }).set_index(['entity_id', 'as_of_date'])
        metadata = {
            'label_name': 'label',
            'end_time': AS_OF_DATE,
            'label_timespan': '3month',
            'metta-uuid': '1234',
            'indices': ['entity_id'],
        }
        matrix_store = InMemoryMatrixStore(matrix, metadata)
        predict_proba = predictor.predict(
            model_id,
            matrix_store,
            misc_db_parameters=dict(),
            train_matrix_columns=['feature_one', 'feature_two'])

        # assert
        # 1. that the returned predictions are of the desired length
        assert len(predict_proba) == 4

        # 2. that the predictions table entries are present and
        # can be linked to the original models
        records = [
            row for row in db_engine.execute('''select entity_id, as_of_date
            from results.predictions
            join results.models using (model_id)''')
        ]
        assert len(records) == 4
Example #6
def test_calculate_and_save():
    with testing.postgresql.Postgresql() as postgresql:
        db_engine = create_engine(postgresql.url())
        ensure_db(db_engine)
        project_path = 'econ-dev/inspections'
        with tempfile.TemporaryDirectory() as temp_dir:
            train_store, test_store = sample_metta_csv_diff_order(temp_dir)
            model_storage_engine = InMemoryModelStorageEngine(project_path)
            calculator = IndividualImportanceCalculator(db_engine,
                                                        methods=['sample'],
                                                        replace=False)
            # given a trained model
            # and a test matrix
            _, model_id = \
                fake_trained_model(
                    project_path,
                    model_storage_engine,
                    db_engine,
                    train_matrix_uuid=train_store.uuid
                )
            # i expect to be able to call calculate and save
            calculator.calculate_and_save_all_methods_and_dates(
                model_id, test_store)
            # and find individual importances in the results schema afterwards
            records = [
                row
                for row in db_engine.execute('''select entity_id, as_of_date
                from results.individual_importances
                join results.models using (model_id)''')
            ]
            assert len(records) > 0
            # and that when run again, has the same result
            calculator.calculate_and_save_all_methods_and_dates(
                model_id, test_store)
            new_records = [
                row
                for row in db_engine.execute('''select entity_id, as_of_date
                from results.individual_importances
                join results.models using (model_id)''')
            ]
            assert len(records) == len(new_records)
            assert records == new_records
Example #7
def test_uniform_distribution_entity_date_index():
    with testing.postgresql.Postgresql() as postgresql:
        db_engine = create_engine(postgresql.url())
        ensure_db(db_engine)
        init_engine(db_engine)
        model = ModelFactory()
        feature_importances = [
            FeatureImportanceFactory(model_rel=model,
                                     feature='feature_{}'.format(i))
            for i in range(0, 10)
        ]
        data_dict = {
            'entity_id': [1, 1],
            'as_of_date': ['2016-01-01', '2017-01-01']
        }
        for imp in feature_importances:
            data_dict[imp.feature] = [0.5, 0.5]
        test_store = InMemoryMatrixStore(
            matrix=pandas.DataFrame.from_dict(data_dict).set_index(
                ['entity_id', 'as_of_date']),
            metadata=sample_metadata())
        session.commit()
        results = uniform_distribution(db_engine,
                                       model_id=model.model_id,
                                       as_of_date='2016-01-01',
                                       test_matrix_store=test_store,
                                       n_ranks=5)

        assert len(results) == 5  # n_ranks=5 rows for the single entity at this as_of_date
        for result in results:
            assert 'entity_id' in result
            assert 'feature_name' in result
            assert 'score' in result
            assert 'feature_value' in result
            assert result['feature_value'] == 0.5
            assert result['score'] >= 0
            assert result['score'] <= 1
            assert isinstance(result['feature_name'], str)
            assert result['entity_id'] in [1, 2]
Example #8
def test_predictor_get_train_columns():
    with testing.postgresql.Postgresql() as postgresql:
        db_engine = create_engine(postgresql.url())
        ensure_db(db_engine)
        project_path = 'econ-dev/inspections'
        with tempfile.TemporaryDirectory() as temp_dir:
            train_store, test_store = sample_metta_csv_diff_order(temp_dir)

            model_storage_engine = InMemoryModelStorageEngine(project_path)
            _, model_id = \
                fake_trained_model(
                    project_path,
                    model_storage_engine,
                    db_engine,
                    train_matrix_uuid=train_store.uuid
                )
            predictor = Predictor(project_path, model_storage_engine,
                                  db_engine)

            predict_proba = predictor.predict(
                model_id,
                test_store,
                misc_db_parameters=dict(),
                train_matrix_columns=train_store.columns())
            # assert
            # 1. that we calculated predictions
            assert len(predict_proba) > 0

            # 2. that the predictions table entries are present and
            # can be linked to the original models
            records = [
                row
                for row in db_engine.execute('''select entity_id, as_of_date
                from results.predictions
                join results.models using (model_id)''')
            ]
            assert len(records) > 0
Example #9
def test_model_scoring_inspections():
    with testing.postgresql.Postgresql() as postgresql:
        db_engine = create_engine(postgresql.url())
        ensure_db(db_engine)
        metric_groups = [{
            'metrics': ['precision@', 'recall@', 'fpr@'],
            'thresholds': {
                'percentiles': [50.0],
                'top_n': [3]
            }
        }]

        model_evaluator = ModelEvaluator(metric_groups, db_engine)

        _, model_id = fake_trained_model(
            'myproject', InMemoryModelStorageEngine('myproject'), db_engine)

        labels = numpy.array([True, False, numpy.nan, True, False])
        prediction_probas = numpy.array([0.56, 0.4, 0.55, 0.5, 0.3])
        evaluation_start = datetime.datetime(2016, 4, 1)
        evaluation_end = datetime.datetime(2016, 7, 1)
        example_frequency = '1d'
        model_evaluator.evaluate(prediction_probas, labels, model_id,
                                 evaluation_start, evaluation_end,
                                 example_frequency)

        for record in db_engine.execute(
                '''select * from results.evaluations
            where model_id = %s and evaluation_start_time = %s order by 1''',
            (model_id, evaluation_start)):
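            # one of the five labels is NaN and is excluded from the labeled
            # counts, leaving 4 labeled examples, 2 of them positive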
            assert record['num_labeled_examples'] == 4
            assert record['num_positive_labels'] == 2
            if 'pct' in record['parameter']:
                assert record['num_labeled_above_threshold'] == 1
            else:
                assert record['num_labeled_above_threshold'] == 2
Example #10
def test_evaluating_early_warning():
    with testing.postgresql.Postgresql() as postgresql:
        db_engine = create_engine(postgresql.url())
        ensure_db(db_engine)
        metric_groups = [{
            'metrics': [
                'precision@', 'recall@', 'true positives@', 'true negatives@',
                'false positives@', 'false negatives@'
            ],
            'thresholds': {
                'percentiles': [5.0, 10.0],
                'top_n': [5, 10]
            }
        }, {
            'metrics': [
                'f1', 'mediocre', 'accuracy', 'roc_auc',
                'average precision score'
            ],
        }, {
            'metrics': ['fbeta@'],
            'parameters': [{
                'beta': 0.75
            }, {
                'beta': 1.25
            }]
        }]

        custom_metrics = {'mediocre': always_half}

        model_evaluator = ModelEvaluator(metric_groups,
                                         db_engine,
                                         custom_metrics=custom_metrics)

        trained_model, model_id = fake_trained_model(
            'myproject', InMemoryModelStorageEngine('myproject'), db_engine)

        labels = fake_labels(5)
        as_of_date = datetime.date(2016, 5, 5)
        model_evaluator.evaluate(
            trained_model.predict_proba(labels)[:, 1], labels, model_id,
            as_of_date, as_of_date, '1y')

        # assert
        # that all of the records are there
        records = [
            row[0] for row in db_engine.execute(
                '''select distinct(metric || parameter)
                from results.evaluations
                where model_id = %s and
                evaluation_start_time = %s order by 1''', (model_id,
                                                           as_of_date))
        ]
        assert records == [
            'accuracy', 'average precision score', 'f1',
            'false negatives@10.0_pct', 'false negatives@10_abs',
            'false negatives@5.0_pct', 'false negatives@5_abs',
            'false positives@10.0_pct', 'false positives@10_abs',
            'false positives@5.0_pct', 'false positives@5_abs',
            'fbeta@0.75_beta', 'fbeta@1.25_beta', 'mediocre',
            'precision@10.0_pct', 'precision@10_abs', 'precision@5.0_pct',
            'precision@5_abs', 'recall@10.0_pct', 'recall@10_abs',
            'recall@5.0_pct', 'recall@5_abs', 'roc_auc',
            'true negatives@10.0_pct', 'true negatives@10_abs',
            'true negatives@5.0_pct', 'true negatives@5_abs',
            'true positives@10.0_pct', 'true positives@10_abs',
            'true positives@5.0_pct', 'true positives@5_abs'
        ]
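        # (each entry is metric || parameter, e.g. 'precision@' + '10.0_pct';
        # metrics that take no threshold, like 'accuracy', appear bare)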
Example #11
def test_DistanceFromBestTable():
    with testing.postgresql.Postgresql() as postgresql:
        engine = create_engine(postgresql.url())
        ensure_db(engine)
        init_engine(engine)
        model_groups = {
            'stable': ModelGroupFactory(model_type='myStableClassifier'),
            'bad': ModelGroupFactory(model_type='myBadClassifier'),
            'spiky': ModelGroupFactory(model_type='mySpikeClassifier'),
        }

        class StableModelFactory(ModelFactory):
            model_group_rel = model_groups['stable']

        class BadModelFactory(ModelFactory):
            model_group_rel = model_groups['bad']

        class SpikyModelFactory(ModelFactory):
            model_group_rel = model_groups['spiky']

        models = {
            'stable_3y_ago': StableModelFactory(train_end_time='2014-01-01'),
            'stable_2y_ago': StableModelFactory(train_end_time='2015-01-01'),
            'stable_1y_ago': StableModelFactory(train_end_time='2016-01-01'),
            'bad_3y_ago': BadModelFactory(train_end_time='2014-01-01'),
            'bad_2y_ago': BadModelFactory(train_end_time='2015-01-01'),
            'bad_1y_ago': BadModelFactory(train_end_time='2016-01-01'),
            'spiky_3y_ago': SpikyModelFactory(train_end_time='2014-01-01'),
            'spiky_2y_ago': SpikyModelFactory(train_end_time='2015-01-01'),
            'spiky_1y_ago': SpikyModelFactory(train_end_time='2016-01-01'),
        }

        class ImmediateEvalFactory(EvaluationFactory):
            evaluation_start_time = factory.LazyAttribute(
                lambda o: o.model_rel.train_end_time)
            evaluation_end_time = factory.LazyAttribute(
                lambda o: _sql_add_days(o.model_rel.train_end_time, 1))

        class MonthOutEvalFactory(EvaluationFactory):
            evaluation_start_time = factory.LazyAttribute(
                lambda o: _sql_add_days(o.model_rel.train_end_time, 31))
            evaluation_end_time = factory.LazyAttribute(
                lambda o: _sql_add_days(o.model_rel.train_end_time, 32))

        class Precision100Factory(ImmediateEvalFactory):
            metric = 'precision@'
            parameter = '100_abs'

        class Precision100FactoryMonthOut(MonthOutEvalFactory):
            metric = 'precision@'
            parameter = '100_abs'

        class Recall100Factory(ImmediateEvalFactory):
            metric = 'recall@'
            parameter = '100_abs'

        class Recall100FactoryMonthOut(MonthOutEvalFactory):
            metric = 'recall@'
            parameter = '100_abs'

        for add_val, PrecFac, RecFac in [
            (0, Precision100Factory, Recall100Factory),
            (-0.15, Precision100FactoryMonthOut, Recall100FactoryMonthOut)
        ]:
            PrecFac(model_rel=models['stable_3y_ago'], value=0.6 + add_val)
            PrecFac(model_rel=models['stable_2y_ago'], value=0.57 + add_val)
            PrecFac(model_rel=models['stable_1y_ago'], value=0.59 + add_val)
            PrecFac(model_rel=models['bad_3y_ago'], value=0.4 + add_val)
            PrecFac(model_rel=models['bad_2y_ago'], value=0.39 + add_val)
            PrecFac(model_rel=models['bad_1y_ago'], value=0.43 + add_val)
            PrecFac(model_rel=models['spiky_3y_ago'], value=0.8 + add_val)
            PrecFac(model_rel=models['spiky_2y_ago'], value=0.4 + add_val)
            PrecFac(model_rel=models['spiky_1y_ago'], value=0.4 + add_val)
            RecFac(model_rel=models['stable_3y_ago'], value=0.55 + add_val)
            RecFac(model_rel=models['stable_2y_ago'], value=0.56 + add_val)
            RecFac(model_rel=models['stable_1y_ago'], value=0.55 + add_val)
            RecFac(model_rel=models['bad_3y_ago'], value=0.35 + add_val)
            RecFac(model_rel=models['bad_2y_ago'], value=0.34 + add_val)
            RecFac(model_rel=models['bad_1y_ago'], value=0.36 + add_val)
            RecFac(model_rel=models['spiky_3y_ago'], value=0.35 + add_val)
            RecFac(model_rel=models['spiky_2y_ago'], value=0.8 + add_val)
            RecFac(model_rel=models['spiky_1y_ago'], value=0.36 + add_val)
        session.commit()
        distance_table = DistanceFromBestTable(db_engine=engine,
                                               models_table='models',
                                               distance_table='dist_table')
        metrics = [{
            'metric': 'precision@',
            'parameter': '100_abs'
        }, {
            'metric': 'recall@',
            'parameter': '100_abs'
        }]
        model_group_ids = [mg.model_group_id for mg in model_groups.values()]
        distance_table.create_and_populate(
            model_group_ids, ['2014-01-01', '2015-01-01', '2016-01-01'],
            metrics)

        # get an ordered list of the models/groups for a particular metric/time
        query = '''
            select model_id, raw_value, dist_from_best_case, dist_from_best_case_next_time
            from dist_table where metric = %s and parameter = %s and train_end_time = %s
            order by dist_from_best_case
        '''

        prec_3y_ago = engine.execute(query,
                                     ('precision@', '100_abs', '2014-01-01'))
        assert [row for row in prec_3y_ago] == [
            (models['spiky_3y_ago'].model_id, 0.8, 0, 0.17),
            (models['stable_3y_ago'].model_id, 0.6, 0.2, 0),
            (models['bad_3y_ago'].model_id, 0.4, 0.4, 0.18),
        ]
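
        # for reference: dist_from_best_case is the gap to the best raw_value at
        # that train_end_time, e.g. the best precision@100_abs on 2014-01-01 is
        # spiky's 0.8, so stable's distance is 0.8 - 0.6 = 0.2.
        # dist_from_best_case_next_time looks one train_end_time ahead: on
        # 2015-01-01 the best precision is stable's 0.57, so spiky trails by 0.17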

        recall_2y_ago = engine.execute(query,
                                       ('recall@', '100_abs', '2015-01-01'))
        assert [row for row in recall_2y_ago] == [
            (models['spiky_2y_ago'].model_id, 0.8, 0, 0.19),
            (models['stable_2y_ago'].model_id, 0.56, 0.24, 0),
            (models['bad_2y_ago'].model_id, 0.34, 0.46, 0.19),
        ]

        assert distance_table.observed_bounds == {
            ('precision@', '100_abs'): (0.39, 0.8),
            ('recall@', '100_abs'): (0.34, 0.8),
        }
Example #12
def test_integration():
    with testing.postgresql.Postgresql() as postgresql:
        db_engine = create_engine(postgresql.url())
        ensure_db(db_engine)

        with mock_s3():
            s3_conn = boto3.resource('s3')
            s3_conn.create_bucket(Bucket='econ-dev')
            project_path = 'econ-dev/inspections'

            # create train and test matrices
            train_matrix = pandas.DataFrame.from_dict({
                'entity_id': [1, 2],
                'feature_one': [3, 4],
                'feature_two': [5, 6],
                'label': [7, 8]
            }).set_index('entity_id')
            train_metadata = {
                'beginning_of_time': datetime.date(2012, 12, 20),
                'end_time': datetime.date(2016, 12, 20),
                'label_name': 'label',
                'label_window': '1y',
                'feature_names': ['ft1', 'ft2'],
                'metta-uuid': '1234',
            }

            train_store = InMemoryMatrixStore(train_matrix, train_metadata)

            as_of_dates = [
                datetime.date(2016, 12, 21),
                datetime.date(2017, 1, 21)
            ]

            test_stores = [
                InMemoryMatrixStore(
                    pandas.DataFrame.from_dict({
                        'entity_id': [3],
                        'feature_one': [8],
                        'feature_two': [5],
                        'label': [5]
                    }).set_index('entity_id'), {
                        'label_name': 'label',
                        'label_window': '1y',
                        'end_time': as_of_date,
                        'metta-uuid': '1234',
                    }) for as_of_date in as_of_dates
            ]

            model_storage_engine = S3ModelStorageEngine(s3_conn, project_path)

            experiment_hash = save_experiment_and_get_hash({}, db_engine)
            # instantiate pipeline objects
            trainer = ModelTrainer(
                project_path=project_path,
                experiment_hash=experiment_hash,
                model_storage_engine=model_storage_engine,
                db_engine=db_engine,
                model_group_keys=['label_name', 'label_window'])
            predictor = Predictor(project_path, model_storage_engine,
                                  db_engine)
            model_evaluator = ModelEvaluator([{
                'metrics': ['precision@'],
                'thresholds': {
                    'top_n': [5]
                }
            }], db_engine)

            # run the pipeline
            grid_config = {
                'sklearn.linear_model.LogisticRegression': {
                    'C': [0.00001, 0.0001],
                    'penalty': ['l1', 'l2'],
                    'random_state': [2193]
                }
            }
            model_ids = trainer.train_models(grid_config=grid_config,
                                             misc_db_parameters=dict(),
                                             matrix_store=train_store)

            for model_id in model_ids:
                for as_of_date, test_store in zip(as_of_dates, test_stores):
                    predictions_proba = predictor.predict(
                        model_id,
                        test_store,
                        misc_db_parameters=dict(),
                        train_matrix_columns=['feature_one', 'feature_two'])

                    model_evaluator.evaluate(predictions_proba,
                                             test_store.labels(), model_id,
                                             as_of_date, as_of_date, '6month')

            # assert
            # 1. that the predictions table entries are present and
            # can be linked to the original models
            records = [
                row for row in db_engine.execute(
                    '''select entity_id, model_id, as_of_date
                from results.predictions
                join results.models using (model_id)
                order by 3, 2''')
            ]
            assert records == [
                (3, 1, datetime.datetime(2016, 12, 21)),
                (3, 2, datetime.datetime(2016, 12, 21)),
                (3, 3, datetime.datetime(2016, 12, 21)),
                (3, 4, datetime.datetime(2016, 12, 21)),
                (3, 1, datetime.datetime(2017, 1, 21)),
                (3, 2, datetime.datetime(2017, 1, 21)),
                (3, 3, datetime.datetime(2017, 1, 21)),
                (3, 4, datetime.datetime(2017, 1, 21)),
            ]

            # that evaluations are there
            records = [
                row for row in db_engine.execute('''
                    select model_id, evaluation_start_time, metric, parameter
                    from results.evaluations order by 2, 1''')
            ]
            assert records == [
                (1, datetime.datetime(2016, 12, 21), 'precision@', '5_abs'),
                (2, datetime.datetime(2016, 12, 21), 'precision@', '5_abs'),
                (3, datetime.datetime(2016, 12, 21), 'precision@', '5_abs'),
                (4, datetime.datetime(2016, 12, 21), 'precision@', '5_abs'),
                (1, datetime.datetime(2017, 1, 21), 'precision@', '5_abs'),
                (2, datetime.datetime(2017, 1, 21), 'precision@', '5_abs'),
                (3, datetime.datetime(2017, 1, 21), 'precision@', '5_abs'),
                (4, datetime.datetime(2017, 1, 21), 'precision@', '5_abs'),
            ]
Example #13
def create_sample_distance_table(engine):
    ensure_db(engine)
    init_engine(engine)
    model_groups = {
        'stable': ModelGroupFactory(model_type='myStableClassifier'),
        'spiky': ModelGroupFactory(model_type='mySpikeClassifier'),
    }

    class StableModelFactory(ModelFactory):
        model_group_rel = model_groups['stable']

    class SpikyModelFactory(ModelFactory):
        model_group_rel = model_groups['spiky']

    models = {
        'stable_3y_ago': StableModelFactory(train_end_time='2014-01-01'),
        'stable_2y_ago': StableModelFactory(train_end_time='2015-01-01'),
        'stable_1y_ago': StableModelFactory(train_end_time='2016-01-01'),
        'spiky_3y_ago': SpikyModelFactory(train_end_time='2014-01-01'),
        'spiky_2y_ago': SpikyModelFactory(train_end_time='2015-01-01'),
        'spiky_1y_ago': SpikyModelFactory(train_end_time='2016-01-01'),
    }
    session.commit()
    distance_table = DistanceFromBestTable(
        db_engine=engine,
        models_table='models',
        distance_table='dist_table'
    )
    distance_table._create()
    stable_grp = model_groups['stable'].model_group_id
    spiky_grp = model_groups['spiky'].model_group_id
    stable_3y_id = models['stable_3y_ago'].model_id
    stable_3y_end = models['stable_3y_ago'].train_end_time
    stable_2y_id = models['stable_2y_ago'].model_id
    stable_2y_end = models['stable_2y_ago'].train_end_time
    stable_1y_id = models['stable_1y_ago'].model_id
    stable_1y_end = models['stable_1y_ago'].train_end_time
    spiky_3y_id = models['spiky_3y_ago'].model_id
    spiky_3y_end = models['spiky_3y_ago'].train_end_time
    spiky_2y_id = models['spiky_2y_ago'].model_id
    spiky_2y_end = models['spiky_2y_ago'].train_end_time
    spiky_1y_id = models['spiky_1y_ago'].model_id
    spiky_1y_end = models['spiky_1y_ago'].train_end_time
    distance_rows = [
        (stable_grp, stable_3y_id, stable_3y_end, 'precision@', '100_abs', 0.5, 0.6, 0.1, 0.5, 0.15),
        (stable_grp, stable_2y_id, stable_2y_end, 'precision@', '100_abs', 0.5, 0.84, 0.34, 0.5, 0.18),
        (stable_grp, stable_1y_id, stable_1y_end, 'precision@', '100_abs', 0.46, 0.67, 0.21, 0.5, 0.11),
        (spiky_grp, spiky_3y_id, spiky_3y_end, 'precision@', '100_abs', 0.45, 0.6, 0.15, 0.5, 0.19),
        (spiky_grp, spiky_2y_id, spiky_2y_end, 'precision@', '100_abs', 0.84, 0.84, 0.0, 0.5, 0.3),
        (spiky_grp, spiky_1y_id, spiky_1y_end, 'precision@', '100_abs', 0.45, 0.67, 0.22, 0.5, 0.12),
        (stable_grp, stable_3y_id, stable_3y_end, 'recall@', '100_abs', 0.4, 0.4, 0.0, 0.4, 0.0),
        (stable_grp, stable_2y_id, stable_2y_end, 'recall@', '100_abs', 0.5, 0.5, 0.0, 0.5, 0.0),
        (stable_grp, stable_1y_id, stable_1y_end, 'recall@', '100_abs', 0.6, 0.6, 0.0, 0.6, 0.0),
        (spiky_grp, spiky_3y_id, spiky_3y_end, 'recall@', '100_abs', 0.65, 0.65, 0.0, 0.65, 0.0),
        (spiky_grp, spiky_2y_id, spiky_2y_end, 'recall@', '100_abs', 0.55, 0.55, 0.0, 0.55, 0.0),
        (spiky_grp, spiky_1y_id, spiky_1y_end, 'recall@', '100_abs', 0.45, 0.45, 0.0, 0.45, 0.0),
    ]
    for dist_row in distance_rows:
        engine.execute(
            'insert into dist_table values (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)',
            dist_row
        )
    return distance_table, model_groups
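
# a minimal usage sketch for the helper above, assuming the same
# testing.postgresql setup used throughout these examples:
#
#     with testing.postgresql.Postgresql() as postgresql:
#         engine = create_engine(postgresql.url())
#         distance_table, model_groups = create_sample_distance_table(engine)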
Example #14
def test_predictor():
    with testing.postgresql.Postgresql() as postgresql:
        db_engine = create_engine(postgresql.url())
        ensure_db(db_engine)

        with mock_s3():
            s3_conn = boto3.resource('s3')
            s3_conn.create_bucket(Bucket='econ-dev')
            project_path = 'econ-dev/inspections'
            model_storage_engine = S3ModelStorageEngine(s3_conn, project_path)
            _, model_id = \
                fake_trained_model(project_path, model_storage_engine, db_engine)
            predictor = Predictor(project_path, model_storage_engine,
                                  db_engine)
            # create prediction set
            matrix = pandas.DataFrame.from_dict({
                'entity_id': [1, 2],
                'feature_one': [3, 4],
                'feature_two': [5, 6],
                'label': [7, 8]
            }).set_index('entity_id')
            metadata = {
                'label_name': 'label',
                'end_time': AS_OF_DATE,
                'label_window': '3month',
                'metta-uuid': '1234',
            }

            matrix_store = InMemoryMatrixStore(matrix, metadata)
            train_matrix_columns = ['feature_one', 'feature_two']
            predict_proba = predictor.predict(
                model_id,
                matrix_store,
                misc_db_parameters=dict(),
                train_matrix_columns=train_matrix_columns)

            # assert
            # 1. that the returned predictions are of the desired length
            assert len(predict_proba) == 2

            # 2. that the predictions table entries are present and
            # can be linked to the original models
            records = [
                row
                for row in db_engine.execute('''select entity_id, as_of_date
                from results.predictions
                join results.models using (model_id)''')
            ]
            assert len(records) == 2

            # 3. that the contained as_of_dates match what we sent in
            for record in records:
                assert record[1].date() == AS_OF_DATE

            # 4. that the entity ids match the given dataset
            assert sorted([record[0] for record in records]) == [1, 2]

            # 5. running with the same model_id and a different as-of date,
            # then again with the original as-of date, should replace only
            # the records sharing that date
            new_matrix = pandas.DataFrame.from_dict({
                'entity_id': [1, 2],
                'feature_one': [3, 4],
                'feature_two': [5, 6],
                'label': [7, 8]
            }).set_index('entity_id')
            new_metadata = {
                'label_name': 'label',
                'end_time': AS_OF_DATE + datetime.timedelta(days=1),
                'label_window': '3month',
                'metta-uuid': '1234',
            }
            new_matrix_store = InMemoryMatrixStore(new_matrix, new_metadata)
            predictor.predict(model_id,
                              new_matrix_store,
                              misc_db_parameters=dict(),
                              train_matrix_columns=train_matrix_columns)
            predictor.predict(model_id,
                              matrix_store,
                              misc_db_parameters=dict(),
                              train_matrix_columns=train_matrix_columns)
            records = [
                row
                for row in db_engine.execute('''select entity_id, as_of_date
                from results.predictions
                join results.models using (model_id)''')
            ]
            assert len(records) == 4

            # 6. that we can delete the model once we're done predicting with it
            predictor.delete_model(model_id)
            assert predictor.load_model(model_id) is None
Example #15
def test_Auditioner():
    with testing.postgresql.Postgresql() as postgresql:
        db_engine = create_engine(postgresql.url())
        ensure_db(db_engine)
        init_engine(db_engine)
        # set up data, randomly generated by the factories but conforming
        # generally to what we expect results schema data to look like
        num_model_groups = 10
        model_types = [
            'classifier type {}'.format(i) for i in range(0, num_model_groups)
        ]
        model_groups = [
            ModelGroupFactory(model_type=model_type)
            for model_type in model_types
        ]
        train_end_times = [
            datetime(2013, 1, 1),
            datetime(2014, 1, 1),
            datetime(2015, 1, 1),
            datetime(2016, 1, 1),
        ]
        models = [
            ModelFactory(model_group_rel=model_group,
                         train_end_time=train_end_time)
            for model_group in model_groups
            for train_end_time in train_end_times
        ]
        metrics = [
            ('precision@', '100_abs'),
            ('recall@', '100_abs'),
            ('precision@', '50_abs'),
            ('recall@', '50_abs'),
            ('fpr@', '10_pct'),
        ]

        class ImmediateEvalFactory(EvaluationFactory):
            evaluation_start_time = factory.LazyAttribute(
                lambda o: o.model_rel.train_end_time)

        _ = [
            ImmediateEvalFactory(model_rel=model,
                                 metric=metric,
                                 parameter=parameter)
            for metric, parameter in metrics for model in models
        ]
        session.commit()

        # define a very loose filtering that should admit all model groups
        no_filtering = [{
            'metric': 'precision@',
            'parameter': '100_abs',
            'max_from_best': 1.0,
            'threshold_value': 0.0
        }, {
            'metric': 'recall@',
            'parameter': '100_abs',
            'max_from_best': 1.0,
            'threshold_value': 0.0
        }]
        model_group_ids = [mg.model_group_id for mg in model_groups]
        auditioner = Auditioner(
            db_engine,
            model_group_ids,
            train_end_times,
            no_filtering,
        )
        assert len(auditioner.thresholded_model_group_ids) == num_model_groups
        auditioner.plot_model_groups()

        # here, we pick thresholding rules that should definitely remove
        # all model groups from contention because they are too strict.
        remove_all = [{
            'metric': 'precision@',
            'parameter': '100_abs',
            'max_from_best': 0.0,
            'threshold_value': 1.1
        }, {
            'metric': 'recall@',
            'parameter': '100_abs',
            'max_from_best': 0.0,
            'threshold_value': 1.1
        }]

        auditioner.update_metric_filters(remove_all)
        assert len(auditioner.thresholded_model_group_ids) == 0

        # one potential place for bugs is loosening filters that proved too
        # restrictive. we want to make sure the original model group list is
        # always used for thresholding, or else such a loosening would be
        # impossible
        auditioner.update_metric_filters(no_filtering)
        assert len(auditioner.thresholded_model_group_ids) == num_model_groups

        # now, we want to take this partially thresholded list and run it through
        # a grid of selection rules, meant to pick winners by a variety of user-defined
        # criteria
        rule_grid = [{
            'shared_parameters': [
                {
                    'metric': 'precision@',
                    'parameter': '100_abs'
                },
                {
                    'metric': 'recall@',
                    'parameter': '100_abs'
                },
            ],
            'selection_rules': [{
                'name': 'most_frequent_best_dist',
                'dist_from_best_case': [0.1, 0.2, 0.3]
            }, {
                'name': 'best_current_value'
            }]
        }, {
            'shared_parameters': [
                {
                    'metric1': 'precision@',
                    'parameter1': '100_abs'
                },
            ],
            'selection_rules': [
                {
                    'name': 'best_average_two_metrics',
                    'metric2': ['recall@'],
                    'parameter2': ['100_abs'],
                    'metric1_weight': [0.4, 0.5, 0.6]
                },
            ]
        }]
        auditioner.register_selection_rule_grid(rule_grid, plot=False)
        final_model_group_ids = auditioner.selection_rule_model_group_ids

        # we expect the result to be a mapping of selection rule name to model group id
        assert isinstance(final_model_group_ids, dict)

        # we expect that there is one winner for each selection rule
        assert sorted(final_model_group_ids.keys()) == \
            sorted([rule.descriptive_name for rule in auditioner.selection_rules])

        # we expect the results written to the yaml file to be the chosen
        # model groups and their rules. because the source data is randomly
        # generated, the winning groups could differ between runs, so we only
        # compare rule names rather than introduce non-determinism into the
        # test
        with tempfile.NamedTemporaryFile() as tf:
            auditioner.write_tyra_config(tf.name)
            assert sorted(yaml.safe_load(tf)['selection_rule_model_groups'].keys()) == \
                sorted(final_model_group_ids.keys())
def test_model_trainer():
    with testing.postgresql.Postgresql() as postgresql:
        engine = create_engine(postgresql.url())
        ensure_db(engine)

        grid_config = {
            'sklearn.linear_model.LogisticRegression': {
                'C': [0.00001, 0.0001],
                'penalty': ['l1', 'l2'],
                'random_state': [2193]
            }
        }

        with mock_s3():
            s3_conn = boto3.resource('s3')
            s3_conn.create_bucket(Bucket='econ-dev')

            # create training set
            matrix = pandas.DataFrame.from_dict({
                'entity_id': [1, 2],
                'feature_one': [3, 4],
                'feature_two': [5, 6],
                'label': ['good', 'bad']
            })
            metadata = {
                'feature_start_time': datetime.date(2012, 12, 20),
                'end_time': datetime.date(2016, 12, 20),
                'label_name': 'label',
                'label_timespan': '1y',
                'metta-uuid': '1234',
                'feature_names': ['ft1', 'ft2'],
                'indices': ['entity_id'],
            }
            project_path = 'econ-dev/inspections'
            model_storage_engine = S3ModelStorageEngine(s3_conn, project_path)
            trainer = ModelTrainer(
                project_path=project_path,
                experiment_hash=None,
                model_storage_engine=model_storage_engine,
                db_engine=engine,
                model_group_keys=['label_name', 'label_timespan']
            )
            matrix_store = InMemoryMatrixStore(matrix, metadata)
            model_ids = trainer.train_models(
                grid_config=grid_config,
                misc_db_parameters=dict(),
                matrix_store=matrix_store
            )

            # assert
            # 1. that the models and feature importances table entries are present
            records = [
                row for row in
                engine.execute('select * from results.feature_importances')
            ]
            assert len(records) == 4 * 2  # four models, two features each

            records = [
                row for row in
                engine.execute('select model_hash from results.models')
            ]
            assert len(records) == 4

            cache_keys = [
                model_cache_key(project_path, model_row[0], s3_conn)
                for model_row in records
            ]

            # 2. that the model groups are distinct
            records = [
                row for row in
                engine.execute('select distinct model_group_id from results.models')
            ]
            assert len(records) == 4

            # 3. that all four models are cached
            model_pickles = [
                pickle.loads(cache_key.get()['Body'].read())
                for cache_key in cache_keys
            ]
            assert len(model_pickles) == 4
            assert len([x for x in model_pickles if x is not None]) == 4

            # 4. that their results can have predictions made on it
            test_matrix = pandas.DataFrame.from_dict({
                'entity_id': [3, 4],
                'feature_one': [4, 4],
                'feature_two': [6, 5],
            })

            test_matrix = InMemoryMatrixStore(matrix=test_matrix, metadata=metadata).matrix

            for model_pickle in model_pickles:
                predictions = model_pickle.predict(test_matrix)
                assert len(predictions) == 2

            # 5. when run again, same models are returned
            new_model_ids = trainer.train_models(
                grid_config=grid_config,
                misc_db_parameters=dict(),
                matrix_store=matrix_store
            )
            assert len([
                row for row in
                engine.execute('select model_hash from results.models')
            ]) == 4
            assert model_ids == new_model_ids

            # 6. if replace is set, update non-unique attributes and feature importances
            max_batch_run_time = [
                row[0] for row in
                engine.execute('select max(batch_run_time) from results.models')
            ][0]
            trainer = ModelTrainer(
                project_path=project_path,
                experiment_hash=None,
                model_storage_engine=model_storage_engine,
                db_engine=engine,
                model_group_keys=['label_name', 'label_timespan'],
                replace=True
            )
            new_model_ids = trainer.train_models(
                grid_config=grid_config,
                misc_db_parameters=dict(),
                matrix_store=matrix_store,
            )
            assert model_ids == new_model_ids
            assert [
                row['model_id'] for row in
                engine.execute('select model_id from results.models order by 1 asc')
            ] == model_ids
            new_max_batch_run_time = [
                row[0] for row in
                engine.execute('select max(batch_run_time) from results.models')
            ][0]
            assert new_max_batch_run_time > max_batch_run_time

            records = [
                row for row in
                engine.execute('select * from results.feature_importances')
            ]
            assert len(records) == 4 * 2  # four models, two features each

            # 7. if the cache is missing but the metadata is still there, reuse the metadata
            for row in engine.execute('select model_hash from results.models'):
                model_storage_engine.get_store(row[0]).delete()
            new_model_ids = trainer.train_models(
                grid_config=grid_config,
                misc_db_parameters=dict(),
                matrix_store=matrix_store
            )
            assert model_ids == sorted(new_model_ids)

            # 8. that the generator interface works the same way
            new_model_ids = trainer.generate_trained_models(
                grid_config=grid_config,
                misc_db_parameters=dict(),
                matrix_store=matrix_store
            )
            assert model_ids == \
                sorted([model_id for model_id in new_model_ids])

    def setup_data(self, engine):
        ensure_db(engine)
        init_engine(engine)
        ModelGroupFactory(model_group_id=1, model_type='modelType1')
        ModelGroupFactory(model_group_id=2, model_type='modelType2')
        ModelGroupFactory(model_group_id=3, model_type='modelType3')
        ModelGroupFactory(model_group_id=4, model_type='modelType4')
        ModelGroupFactory(model_group_id=5, model_type='modelType5')
        session.commit()
        distance_table = DistanceFromBestTable(db_engine=engine,
                                               models_table='models',
                                               distance_table='dist_table')
        distance_table._create()
        distance_rows = [
            # 2014: model group 1 should pass both close and min checks
            (1, 1, '2014-01-01', 'precision@', '100_abs', 0.5, 0.5, 0.0, 0.38),
            (1, 1, '2014-01-01', 'recall@', '100_abs', 0.5, 0.5, 0.0, 0.38),
            (1, 1, '2014-01-01', 'false positives@', '100_abs', 40, 30, 10, 10),
            # 2015: model group 1 should not pass close check
            (1, 2, '2015-01-01', 'precision@', '100_abs', 0.5, 0.88, 0.38, 0.0),
            (1, 2, '2015-01-01', 'recall@', '100_abs', 0.5, 0.88, 0.38, 0.0),
            (1, 2, '2015-01-01', 'false positives@', '100_abs', 40, 30, 10, 10),
            (1, 3, '2016-01-01', 'precision@', '100_abs', 0.46, 0.46, 0.0, 0.11),
            (1, 3, '2016-01-01', 'recall@', '100_abs', 0.46, 0.46, 0.0, 0.11),
            (1, 3, '2016-01-01', 'false positives@', '100_abs', 40, 30, 10, 10),
            # 2014: model group 2 should not pass min check
            (2, 4, '2014-01-01', 'precision@', '100_abs', 0.39, 0.5, 0.11, 0.5),
            (2, 4, '2014-01-01', 'recall@', '100_abs', 0.5, 0.5, 0.0, 0.38),
            (2, 4, '2014-01-01', 'false positives@', '100_abs', 40, 30, 10, 10),
            # 2015: model group 2 should pass both checks
            (2, 5, '2015-01-01', 'precision@', '100_abs', 0.69, 0.88, 0.19, 0.12),
            (2, 5, '2015-01-01', 'recall@', '100_abs', 0.69, 0.88, 0.19, 0.0),
            (2, 5, '2015-01-01', 'false positives@', '100_abs', 40, 30, 10, 10),
            (2, 6, '2016-01-01', 'precision@', '100_abs', 0.34, 0.46, 0.12, 0.11),
            (2, 6, '2016-01-01', 'recall@', '100_abs', 0.46, 0.46, 0.0, 0.11),
            (2, 6, '2016-01-01', 'false positives@', '100_abs', 40, 30, 10, 10),
            # model group 3 not included in this round
            (3, 7, '2014-01-01', 'precision@', '100_abs', 0.28, 0.5, 0.22, 0.0),
            (3, 7, '2014-01-01', 'recall@', '100_abs', 0.5, 0.5, 0.0, 0.38),
            (3, 7, '2014-01-01', 'false positives@', '100_abs', 40, 30, 10, 10),
            (3, 8, '2015-01-01', 'precision@', '100_abs', 0.88, 0.88, 0.0, 0.02),
            (3, 8, '2015-01-01', 'recall@', '100_abs', 0.5, 0.88, 0.38, 0.0),
            (3, 8, '2015-01-01', 'false positives@', '100_abs', 40, 30, 10, 10),
            (3, 9, '2016-01-01', 'precision@', '100_abs', 0.44, 0.46, 0.02, 0.11),
            (3, 9, '2016-01-01', 'recall@', '100_abs', 0.46, 0.46, 0.0, 0.11),
            (3, 9, '2016-01-01', 'false positives@', '100_abs', 40, 30, 10, 10),
            # 2014: model group 4 should not pass any checks
            (4, 10, '2014-01-01', 'precision@', '100_abs', 0.29, 0.5, 0.21, 0.21),
            (4, 10, '2014-01-01', 'recall@', '100_abs', 0.5, 0.5, 0.0, 0.38),
            (4, 10, '2014-01-01', 'false positives@', '100_abs', 40, 30, 10, 10),
            # 2015: model group 4 should not pass close check
            (4, 11, '2015-01-01', 'precision@', '100_abs', 0.67, 0.88, 0.21, 0.21),
            (4, 11, '2015-01-01', 'recall@', '100_abs', 0.5, 0.88, 0.38, 0.0),
            (4, 11, '2015-01-01', 'false positives@', '100_abs', 40, 30, 10, 10),
            (4, 12, '2016-01-01', 'precision@', '100_abs', 0.25, 0.46, 0.21, 0.21),
            (4, 12, '2016-01-01', 'recall@', '100_abs', 0.46, 0.46, 0.0, 0.11),
            (4, 12, '2016-01-01', 'false positives@', '100_abs', 40, 30, 10, 10),
            # 2014: model group 5 should not pass because precision is good but not recall
            (5, 13, '2014-01-01', 'precision@', '100_abs', 0.5, 0.38, 0.0, 0.38),
            (5, 13, '2014-01-01', 'recall@', '100_abs', 0.3, 0.5, 0.2, 0.38),
            (5, 13, '2014-01-01', 'false positives@', '100_abs', 40, 30, 10, 10),
            # 2015: model group 5 should not pass because precision is good but not recall
            (5, 14, '2015-01-01', 'precision@', '100_abs', 0.5, 0.88, 0.38, 0.0),
            (5, 14, '2015-01-01', 'recall@', '100_abs', 0.3, 0.88, 0.58, 0.0),
            (5, 14, '2015-01-01', 'false positives@', '100_abs', 40, 30, 10, 10),
            (5, 15, '2016-01-01', 'precision@', '100_abs', 0.46, 0.46, 0.0, 0.11),
            (5, 15, '2016-01-01', 'recall@', '100_abs', 0.3, 0.46, 0.16, 0.11),
            (5, 15, '2016-01-01', 'false positives@', '100_abs', 40, 30, 10, 10),
            # 2014: model group 6 is failed by false positives
            (6, 16, '2014-01-01', 'precision@', '100_abs', 0.5, 0.5, 0.0, 0.38),
            (6, 16, '2014-01-01', 'recall@', '100_abs', 0.5, 0.5, 0.0, 0.38),
            (6, 16, '2014-01-01', 'false positives@', '100_abs', 60, 30, 30, 10),
            # 2015: model group 6 is failed by false positives
            (6, 17, '2015-01-01', 'precision@', '100_abs', 0.5, 0.88, 0.38, 0.0),
            (6, 17, '2015-01-01', 'recall@', '100_abs', 0.5, 0.38, 0.0, 0.38),
            (6, 17, '2015-01-01', 'false positives@', '100_abs', 60, 30, 30, 10),
            (6, 18, '2016-01-01', 'precision@', '100_abs', 0.46, 0.46, 0.0, 0.11),
            (6, 18, '2016-01-01', 'recall@', '100_abs', 0.5, 0.5, 0.0, 0.38),
            (6, 18, '2016-01-01', 'false positives@', '100_abs', 40, 30, 10, 10),
        ]
        for dist_row in distance_rows:
            engine.execute(
                'insert into dist_table values (%s, %s, %s, %s, %s, %s, %s, %s, %s)',
                dist_row)
        thresholder = ModelGroupThresholder(
            distance_from_best_table=distance_table,
            train_end_times=['2014-01-01', '2015-01-01'],
            initial_model_group_ids=[1, 2, 4, 5, 6],
            initial_metric_filters=self.metric_filters)
        return thresholder
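    # note: self.metric_filters is referenced above but defined elsewhere on
    # the test class that owns this helper; it is not part of this excerpt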
Example #19
def test_predictor_retrieve():
    with testing.postgresql.Postgresql() as postgresql:
        db_engine = create_engine(postgresql.url())
        ensure_db(db_engine)
        project_path = 'econ-dev/inspections'
        model_storage_engine = InMemoryModelStorageEngine(project_path)
        _, model_id = \
            fake_trained_model(project_path, model_storage_engine, db_engine)
        predictor = Predictor(project_path,
                              model_storage_engine,
                              db_engine,
                              replace=False)
        dayone = datetime.date(2011, 1, 1).strftime(
            predictor.expected_matrix_ts_format)
        daytwo = datetime.date(2011, 1, 2).strftime(
            predictor.expected_matrix_ts_format)
        # create prediction set
        matrix_data = {
            'entity_id': [1, 2, 1, 2],
            'as_of_date': [dayone, dayone, daytwo, daytwo],
            'feature_one': [3, 4, 5, 6],
            'feature_two': [5, 6, 7, 8],
            'label': [7, 8, 8, 7]
        }
        matrix = pandas.DataFrame.from_dict(matrix_data)\
            .set_index(['entity_id', 'as_of_date'])
        metadata = {
            'label_name': 'label',
            'end_time': AS_OF_DATE,
            'label_window': '3month',
            'metta-uuid': '1234',
        }
        matrix_store = InMemoryMatrixStore(matrix, metadata)
        predict_proba = predictor.predict(
            model_id,
            matrix_store,
            misc_db_parameters=dict(),
            train_matrix_columns=['feature_one', 'feature_two'])

        # When run again, the predictions retrieved from the database
        # should match.
        #
        # Some trickiness here. Let's explain:
        #
        # If we are not careful, retrieving predictions from the database and
        # presenting them as a numpy array can result in a bad ordering,
        # since the given matrix may not be 'ordered' by some criteria
        # that can be easily represented by an ORDER BY clause.
        #
        # It will sometimes work, because without ORDER BY you will get
        # it back in the table's physical order, which unless something has
        # happened to the table will be the order you inserted it,
        # which could very well be the order in the matrix.
        # So it's not a bug that would necessarily immediately show itself,
        # but when it does go wrong your scores will be garbage.
        #
        # So we simulate a table order mutation that can happen over time:
        # Remove the first row and put it at the end.
        # If the Predictor doesn't explicitly reorder the results, this will fail
        session = sessionmaker(bind=db_engine)()
        obj = session.query(Prediction).first()
        session.delete(obj)
        session.commit()

        make_transient(obj)
        session = sessionmaker(bind=db_engine)()
        session.add(obj)
        session.commit()

        predictor.load_model = Mock()
        new_predict_proba = predictor.predict(
            model_id,
            matrix_store,
            misc_db_parameters=dict(),
            train_matrix_columns=['feature_one', 'feature_two'])
        assert_array_equal(new_predict_proba, predict_proba)
        assert not predictor.load_model.called  # predictions were served from the database, not recomputed