Example 1
    def test_retry_recovery(self):
        db_engine = None
        trainer = None
        port = None
        with rig_engines() as (db_engine, project_storage):
            port = db_engine.url.port
            trainer = ModelTrainer(
                experiment_hash=None,
                model_storage_engine=project_storage.model_storage_engine(),
                db_engine=db_engine,
                model_grouper=ModelGrouper(),
            )
            matrix_store = get_matrix_store(project_storage)

        # start without a database server
        # then bring it back up after the first sleep
        # use self so it doesn't go out of scope too early and shut down
        self.new_server = None

        def replace_db(arg):
            self.new_server = testing.postgresql.Postgresql(port=port)
            db_engine = create_engine(self.new_server.url())
            ensure_db(db_engine)
            init_engine(db_engine)
            get_matrix_store(project_storage)

        with patch("time.sleep") as time_mock:
            time_mock.side_effect = replace_db
            try:
                trainer.train_models(grid_config(), dict(), matrix_store)
            finally:
                if self.new_server is not None:
                    self.new_server.stop()
            assert len(time_mock.mock_calls) == 1
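The trick above generalizes: patching time.sleep and giving the mock a side_effect lets a test run recovery logic at exactly the moment the code under test backs off. A minimal self-contained sketch of the same pattern, using only the standard library (flaky and bring_back_up are illustrative names, not triage code):

import time
from unittest.mock import patch

def flaky():
    # stands in for a call that hits a restarting database
    if not getattr(flaky, "recovered", False):
        time.sleep(1)  # the back-off wait the test intercepts
        raise RuntimeError("still down")
    return "ok"

def bring_back_up(_seconds):
    # runs in place of the sleep: "restart the server" mid-retry
    flaky.recovered = True

with patch("time.sleep") as time_mock:
    time_mock.side_effect = bring_back_up
    try:
        flaky()
    except RuntimeError:
        pass
    assert flaky() == "ok"
    assert len(time_mock.mock_calls) == 1  # exactly one back-off occurred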
Example 2
def test_uniform_distribution_entity_id_index():
    with rig_engines() as (db_engine, project_storage):
        model = ModelFactory()
        feature_importances = [
            FeatureImportanceFactory(model_rel=model,
                                     feature='feature_{}'.format(i))
            for i in range(0, 10)
        ]
        data_dict = {'entity_id': [1, 2]}
        for imp in feature_importances:
            data_dict[imp.feature] = [0.5, 0.5]
        metadata = matrix_metadata_creator(indices='entity_id')
        test_store = get_matrix_store(
            project_storage,
            pandas.DataFrame.from_dict(data_dict).set_index(
                metadata['indices']), metadata)
        results = uniform_distribution(db_engine,
                                       model_id=model.model_id,
                                       as_of_date='2016-01-01',
                                       test_matrix_store=test_store,
                                       n_ranks=5)

        assert len(results) == 10  # 5 features x 2 entities
        for result in results:
            assert 'entity_id' in result
            assert 'feature_name' in result
            assert 'score' in result
            assert 'feature_value' in result
            assert result['feature_value'] == 0.5
            assert result['score'] >= 0
            assert result['score'] <= 1
            assert isinstance(result['feature_name'], str)
            assert result['entity_id'] in [1, 2]
Example 3
def test_uniform_distribution():
    with rig_engines() as (db_engine, project_storage):
        model = ModelFactory()
        feature_importances = [
            FeatureImportanceFactory(model_rel=model, feature="feature_{}".format(i))
            for i in range(0, 10)
        ]
        data_dict = {
            "entity_id": [1, 1],
            "as_of_date": ["2016-01-01", "2017-01-01"],
            "label": [0, 1],
        }
        for imp in feature_importances:
            data_dict[imp.feature] = [0.5, 0.5]
        metadata = matrix_metadata_creator()
        test_store = get_matrix_store(
            project_storage,
            pandas.DataFrame.from_dict(data_dict),
            metadata,
        )
        results = uniform_distribution(
            db_engine,
            model_id=model.model_id,
            as_of_date=datetime.date(2016, 1, 1),
            test_matrix_store=test_store,
            n_ranks=5,
        )

        assert len(results) == 5  # 5 features x 1 entity for this as_of_date
        for result in results:
            assert "entity_id" in result
            assert "feature_name" in result
            assert "score" in result
            assert "feature_value" in result
            assert result["feature_value"] == 0.5
            assert result["score"] >= 0
            assert result["score"] <= 1
            assert isinstance(result["feature_name"], str)
            assert result["entity_id"] in [1, 2]
Example 4
def prepare():
    with rig_engines() as (db_engine, project_storage):
        train_matrix_uuid = '1234'
        session = sessionmaker(db_engine)()
        session.add(Matrix(matrix_uuid=train_matrix_uuid))

        # Create the fake trained model and store in db
        trained_model = MockTrainedModel()
        model_hash = 'abcd'
        project_storage.model_storage_engine().write(trained_model, model_hash)
        db_model = Model(model_hash=model_hash,
                         train_matrix_uuid=train_matrix_uuid)
        session.add(db_model)
        session.commit()
        yield project_storage, db_engine, db_model.model_id
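prepare() is a generator, so it is presumably consumed as a pytest fixture or wrapped in contextlib.contextmanager rather than called directly; a sketch of the latter wiring under that assumption (the test body is illustrative):

import contextlib

# Hypothetical wiring: turn the generator into a context manager so a test
# can unpack what it yields; rig_engines' cleanup runs when the block exits.
prepared = contextlib.contextmanager(prepare)

def test_uses_prepared_model():
    with prepared() as (project_storage, db_engine, model_id):
        assert model_id is not None  # illustrative assertion only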
Example 5
    def test_retry_max(self):
        db_engine = None
        trainer = None
        # set up a basic model training run
        with rig_engines() as (db_engine, project_storage):
            trainer = ModelTrainer(
                experiment_hash=None,
                model_storage_engine=project_storage.model_storage_engine(),
                db_engine=db_engine,
                model_grouper=ModelGrouper())
            matrix_store = get_matrix_store(project_storage)

        # the postgres server goes out of scope here and thus no longer exists
        with patch('time.sleep') as time_mock:
            with self.assertRaises(sqlalchemy.exc.OperationalError):
                trainer.train_models(grid_config(), dict(), matrix_store)
            # check that the retrying module kept retrying, rather than pin
            # the exact number of calls the implementation happens to make
            assert len(time_mock.mock_calls) > 5
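The comment refers to the retrying package; under that assumption, the database write being exercised is decorated roughly like this (the wait and stop values are illustrative, not triage's actual settings):

from retrying import retry

@retry(wait_fixed=1000, stop_max_attempt_number=10)  # illustrative values
def write_results(db_engine, rows):
    # an exception raised here triggers another attempt, with a
    # time.sleep of wait_fixed milliseconds between tries -- which is
    # why the patched time.sleep above is called more than 5 times
    ...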
Example 6
def test_calculate_and_save():
    with rig_engines() as (db_engine, project_storage):
        train_store = get_matrix_store(
            project_storage,
            matrix_creator(),
            matrix_metadata_creator(matrix_type='train'),
        )
        test_store = get_matrix_store(
            project_storage,
            matrix_creator(),
            matrix_metadata_creator(matrix_type='test'),
        )
        calculator = IndividualImportanceCalculator(db_engine,
                                                    methods=['sample'],
                                                    replace=False)
        # given a trained model
        # and a test matrix
        _, model_id = fake_trained_model(
            db_engine, train_matrix_uuid=train_store.uuid)
        # I expect to be able to call calculate and save
        calculator.calculate_and_save_all_methods_and_dates(
            model_id, test_store)
        # and find individual importances in the results schema afterwards
        records = [
            row for row in db_engine.execute('''select entity_id, as_of_date
            from test_results.individual_importances
            join model_metadata.models using (model_id)''')
        ]
        assert len(records) > 0
        # and that running it again yields the same result
        calculator.calculate_and_save_all_methods_and_dates(
            model_id, test_store)
        new_records = [
            row for row in db_engine.execute('''select entity_id, as_of_date
            from test_results.individual_importances
            join model_metadata.models using (model_id)''')
        ]
        assert len(records) == len(new_records)
        assert records == new_records
Example 7
def prepare():
    with rig_engines() as (db_engine, project_storage):
        train_matrix_uuid = "1234"
        try:
            session = sessionmaker(db_engine)()
            session.add(Matrix(matrix_uuid=train_matrix_uuid))

            # Create the fake trained model and store in db
            trained_model = MockTrainedModel()
            model_hash = "abcd"
            project_storage.model_storage_engine().write(
                trained_model, model_hash)
            db_model = Model(model_hash=model_hash,
                             train_matrix_uuid=train_matrix_uuid,
                             random_seed=MODEL_RANDOM_SEED)
            session.add(db_model)
            session.commit()
            yield project_storage, db_engine, db_model.model_id
        finally:
            session.close()
Example 8
def test_custom_groups(grid_config):
    with rig_engines() as (db_engine, project_storage):
        # create training set
        model_storage_engine = project_storage.model_storage_engine()
        trainer = ModelTrainer(
            experiment_hash=None,
            model_storage_engine=model_storage_engine,
            model_grouper=ModelGrouper(['class_path']),
            db_engine=db_engine,
        )
        model_ids = trainer.train_models(
            grid_config=grid_config,
            misc_db_parameters=dict(),
            matrix_store=get_matrix_store(project_storage))
        # expect only one model group now
        records = [
            row[0] for row in db_engine.execute(
                'select distinct model_group_id from model_metadata.models')
        ]
        assert len(records) == 1
        assert records[0] == model_ids[0]
Example 9
def test_baseline_exception_handling():
    grid_config = {
        'triage.component.catwalk.baselines.rankers.PercentileRankOneFeature':
        {
            'feature': ['feature_one', 'feature_three']
        }
    }
    with rig_engines() as (db_engine, project_storage):
        trainer = ModelTrainer(
            experiment_hash=None,
            model_storage_engine=project_storage.model_storage_engine(),
            db_engine=db_engine,
            model_grouper=ModelGrouper())

        train_tasks = trainer.generate_train_tasks(
            grid_config, dict(), get_matrix_store(project_storage))

        model_ids = []
        for train_task in train_tasks:
            model_ids.append(trainer.process_train_task(**train_task))
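        # the first task trains normally; the second presumably fails inside
        # the baseline (feature_three is absent from the matrix) and yields None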
        assert model_ids == [1, None]
Example 10
def test_n_jobs_not_new_model():
    grid_config = {
        'sklearn.ensemble.AdaBoostClassifier': {
            'n_estimators': [10, 100, 1000]
        },
        'sklearn.ensemble.RandomForestClassifier': {
            'n_estimators': [10, 100],
            'max_features': ['sqrt', 'log2'],
            'max_depth': [5, 10, 15, 20],
            'criterion': ['gini', 'entropy'],
            'n_jobs': [12, 24],
        }
    }

    with rig_engines() as (db_engine, project_storage):
        model_storage_engine = project_storage.model_storage_engine()
        trainer = ModelTrainer(experiment_hash=None,
                               model_storage_engine=model_storage_engine,
                               db_engine=db_engine,
                               model_grouper=ModelGrouper())

        train_tasks = trainer.generate_train_tasks(
            grid_config,
            dict(),
            get_matrix_store(project_storage),
        )

        # 3 AdaBoost tasks + 32 RandomForest tasks; it would be 3 + 64 if the
        # two n_jobs values weren't collapsed into a single task per model
        assert len(train_tasks) == 35
        assert len([
            task for task in train_tasks if 'n_jobs' in task['parameters']
        ]) == 32

        for train_task in train_tasks:
            trainer.process_train_task(**train_task)

        for row in db_engine.execute(
                'select hyperparameters from model_metadata.model_groups'):
            assert 'n_jobs' not in row[0]
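A minimal sketch of the collapsing behavior these asserts exercise, assuming grouping simply ignores hyperparameters that cannot change the fitted model (group_key and NON_MODEL_PARAMS are illustrative, not triage's API):

# Hypothetical: strip parameters like n_jobs before computing the identity
# that decides whether two configurations count as the same model.
NON_MODEL_PARAMS = {"n_jobs"}

def group_key(class_path, parameters):
    relevant = {k: v for k, v in parameters.items()
                if k not in NON_MODEL_PARAMS}
    return (class_path, tuple(sorted(relevant.items())))

# two configurations differing only in n_jobs collapse to one key
assert (group_key("RF", {"n_estimators": 10, "n_jobs": 12})
        == group_key("RF", {"n_estimators": 10, "n_jobs": 24}))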
Example 13
def test_integration():
    with rig_engines() as (db_engine, project_storage):
        train_store = get_matrix_store(
            project_storage, matrix_creator(),
            matrix_metadata_creator(matrix_type='train'))
        as_of_dates = [datetime.date(2016, 12, 21), datetime.date(2017, 1, 21)]

        test_stores = []
        for as_of_date in as_of_dates:
            matrix_store = get_matrix_store(
                project_storage,
                pandas.DataFrame.from_dict({
                    'entity_id': [3],
                    'feature_one': [8],
                    'feature_two': [5],
                    'label': [0]
                }).set_index('entity_id'),
                matrix_metadata_creator(end_time=as_of_date,
                                        indices=['entity_id']))
            test_stores.append(matrix_store)

        model_storage_engine = ModelStorageEngine(project_storage)

        experiment_hash = save_experiment_and_get_hash({}, db_engine)
        # instantiate pipeline objects
        trainer = ModelTrainer(
            experiment_hash=experiment_hash,
            model_storage_engine=model_storage_engine,
            db_engine=db_engine,
        )
        predictor = Predictor(model_storage_engine, db_engine)
        model_evaluator = ModelEvaluator([{
            'metrics': ['precision@'],
            'thresholds': {
                'top_n': [5]
            }
        }], [{}], db_engine)

        # run the pipeline
        grid_config = {
            'sklearn.linear_model.LogisticRegression': {
                'C': [0.00001, 0.0001],
                'penalty': ['l1', 'l2'],
                'random_state': [2193]
            }
        }
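        # 2 values of C x 2 penalties = 4 logistic regression models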
        model_ids = trainer.train_models(grid_config=grid_config,
                                         misc_db_parameters=dict(),
                                         matrix_store=train_store)

        for model_id in model_ids:
            for as_of_date, test_store in zip(as_of_dates, test_stores):
                predictions_proba = predictor.predict(
                    model_id,
                    test_store,
                    misc_db_parameters=dict(),
                    train_matrix_columns=['feature_one', 'feature_two'])

                model_evaluator.evaluate(
                    predictions_proba,
                    test_store,
                    model_id,
                )

        # assert
        # 1. that the predictions table entries are present and
        # can be linked to the original models
        records = [
            row for row in db_engine.execute(
                '''select entity_id, model_id, as_of_date
            from test_results.predictions
            join model_metadata.models using (model_id)
            order by 3, 2''')
        ]
        assert records == [
            (3, 1, datetime.datetime(2016, 12, 21)),
            (3, 2, datetime.datetime(2016, 12, 21)),
            (3, 3, datetime.datetime(2016, 12, 21)),
            (3, 4, datetime.datetime(2016, 12, 21)),
            (3, 1, datetime.datetime(2017, 1, 21)),
            (3, 2, datetime.datetime(2017, 1, 21)),
            (3, 3, datetime.datetime(2017, 1, 21)),
            (3, 4, datetime.datetime(2017, 1, 21)),
        ]

        # that evaluations are there
        records = [
            row for row in db_engine.execute('''
                select model_id, evaluation_start_time, metric, parameter
                from test_results.evaluations order by 2, 1''')
        ]
        assert records == [
            (1, datetime.datetime(2016, 12, 21), 'precision@', '5_abs'),
            (2, datetime.datetime(2016, 12, 21), 'precision@', '5_abs'),
            (3, datetime.datetime(2016, 12, 21), 'precision@', '5_abs'),
            (4, datetime.datetime(2016, 12, 21), 'precision@', '5_abs'),
            (1, datetime.datetime(2017, 1, 21), 'precision@', '5_abs'),
            (2, datetime.datetime(2017, 1, 21), 'precision@', '5_abs'),
            (3, datetime.datetime(2017, 1, 21), 'precision@', '5_abs'),
            (4, datetime.datetime(2017, 1, 21), 'precision@', '5_abs'),
        ]
Example 14
def test_model_trainer(grid_config):
    with rig_engines() as (db_engine, project_storage):
        # get_matrix_store() registers a matrix entry in the matrices table
        model_storage_engine = project_storage.model_storage_engine()
        trainer = ModelTrainer(
            experiment_hash=None,
            model_storage_engine=model_storage_engine,
            model_grouper=ModelGrouper(),
            db_engine=db_engine,
        )
        model_ids = trainer.train_models(
            grid_config=grid_config,
            misc_db_parameters=dict(),
            matrix_store=get_matrix_store(project_storage),
        )

        # assert
        # 1. that the models and feature importances table entries are present
        records = [
            row for row in db_engine.execute(
                "select * from train_results.feature_importances")
        ]
        assert len(records) == 4 * 2  # 4 models x 2 features

        records = [
            row for row in db_engine.execute(
                "select model_hash from model_metadata.models")
        ]
        assert len(records) == 4
        hashes = [row[0] for row in records]

        # 2. that the model groups are distinct
        records = [
            row for row in db_engine.execute(
                "select distinct model_group_id from model_metadata.models")
        ]
        assert len(records) == 4

        # 3. that the model sizes are saved in the table and all are < 1 kB
        records = [
            row for row in db_engine.execute(
                "select model_size from model_metadata.models")
        ]
        assert len(records) == 4
        for i in records:
            size = i[0]
            assert size < 1

        # 4. that all four models are cached
        model_pickles = [
            model_storage_engine.load(model_hash) for model_hash in hashes
        ]
        assert len(model_pickles) == 4
        assert len([x for x in model_pickles if x is not None]) == 4

        # 5. that the stored models can make predictions
        test_matrix = pandas.DataFrame.from_dict({
            "entity_id": [3, 4],
            "feature_one": [4, 4],
            "feature_two": [6, 5]
        }).set_index("entity_id")

        for model_pickle in model_pickles:
            predictions = model_pickle.predict(test_matrix)
            assert len(predictions) == 2

        # 6. when run again, same models are returned
        new_model_ids = trainer.train_models(
            grid_config=grid_config,
            misc_db_parameters=dict(),
            matrix_store=get_matrix_store(project_storage),
        )
        assert (len([
            row for row in db_engine.execute(
                "select model_hash from model_metadata.models")
        ]) == 4)
        assert model_ids == new_model_ids

        # 7. if replace is set, update non-unique attributes and feature importances
        max_batch_run_time = [
            row[0] for row in db_engine.execute(
                "select max(batch_run_time) from model_metadata.models")
        ][0]
        trainer = ModelTrainer(
            experiment_hash=None,
            model_storage_engine=model_storage_engine,
            model_grouper=ModelGrouper(
                model_group_keys=["label_name", "label_timespan"]),
            db_engine=db_engine,
            replace=True,
        )
        new_model_ids = trainer.train_models(
            grid_config=grid_config,
            misc_db_parameters=dict(),
            matrix_store=get_matrix_store(project_storage),
        )
        assert model_ids == new_model_ids
        assert [
            row["model_id"] for row in db_engine.execute(
                "select model_id from model_metadata.models order by 1 asc")
        ] == model_ids
        new_max_batch_run_time = [
            row[0] for row in db_engine.execute(
                "select max(batch_run_time) from model_metadata.models")
        ][0]
        assert new_max_batch_run_time > max_batch_run_time

        records = [
            row for row in db_engine.execute(
                "select * from train_results.feature_importances")
        ]
        assert len(records) == 4 * 2  # 4 models x 2 features

        # 8. if the cache is missing but the metadata is still there, reuse the metadata
        for row in db_engine.execute(
                "select model_hash from model_metadata.models"):
            model_storage_engine.delete(row[0])
        new_model_ids = trainer.train_models(
            grid_config=grid_config,
            misc_db_parameters=dict(),
            matrix_store=get_matrix_store(project_storage),
        )
        assert model_ids == sorted(new_model_ids)

        # 9. that the generator interface works the same way
        new_model_ids = trainer.generate_trained_models(
            grid_config=grid_config,
            misc_db_parameters=dict(),
            matrix_store=get_matrix_store(project_storage),
        )
        assert model_ids == sorted([model_id for model_id in new_model_ids])