Example #1
def test_uniform_distribution():
    with rig_engines() as (db_engine, project_storage):
        model = ModelFactory()
        feature_importances = [
            FeatureImportanceFactory(model_rel=model, feature="feature_{}".format(i))
            for i in range(0, 10)
        ]
        data_dict = {"entity_id": [1, 1], "as_of_date": ["2016-01-01", "2017-01-01"], "label": [0, 1]}
        for imp in feature_importances:
            data_dict[imp.feature] = [0.5, 0.5]
        metadata = matrix_metadata_creator()
        test_store = get_matrix_store(
            project_storage,
            pandas.DataFrame.from_dict(data_dict),
            metadata,
        )
        results = uniform_distribution(
            db_engine,
            model_id=model.model_id,
            as_of_date=datetime.date(2016, 1, 1),
            test_matrix_store=test_store,
            n_ranks=5,
        )

        assert len(results) == 5  # top n_ranks (5) features x 1 entity at this as_of_date
        for result in results:
            assert "entity_id" in result
            assert "feature_name" in result
            assert "score" in result
            assert "feature_value" in result
            assert result["feature_value"] == 0.5
            assert result["score"] >= 0
            assert result["score"] <= 1
            assert isinstance(result["feature_name"], str)
            assert result["entity_id"] in [1, 2]
Example #2
def test_predictor_save_predictions(matrix_type, predict_setup_args):
    (project_storage, db_engine, model_id) = predict_setup_args
    # if save_predictions is set to False, don't save predictions
    predictor = Predictor(project_storage.model_storage_engine(),
                          db_engine,
                          save_predictions=False)

    matrix = matrix_creator(index="entity_id")
    metadata = matrix_metadata_creator(end_time=AS_OF_DATE,
                                       matrix_type=matrix_type,
                                       indices=["entity_id"])

    matrix_store = get_matrix_store(project_storage, matrix, metadata)
    train_matrix_columns = matrix.columns[0:-1].tolist()

    predict_proba = predictor.predict(
        model_id,
        matrix_store,
        misc_db_parameters=dict(),
        train_matrix_columns=train_matrix_columns,
    )

    # assert
    # 1. that the returned predictions are of the desired length
    assert len(predict_proba) == 2

    # 2. that no entries were written to the predictions table,
    # since save_predictions was False
    assert not table_has_data(f"{matrix_type}_predictions", db_engine)
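
For reference, table_has_data is a small helper from the project's test utilities. A minimal sketch of what such a helper can look like, assuming a SQLAlchemy 1.x engine and that shorthand names like "train_predictions" resolve to the schema-qualified table (the real helper may differ):

def table_has_data(table_name, db_engine):
    # Sketch only: assumes table_name is trusted test input, since it is
    # interpolated directly into the SQL string.
    schema_table = table_name.replace("_predictions", "_results.predictions")
    result = db_engine.execute("select 1 from {} limit 1".format(schema_table))
    return result.first() is not None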
Example #3
def test_uniform_distribution_entity_id_index():
    with rig_engines() as (db_engine, project_storage):
        model = ModelFactory()
        feature_importances = [
            FeatureImportanceFactory(model_rel=model,
                                     feature='feature_{}'.format(i))
            for i in range(0, 10)
        ]
        data_dict = {'entity_id': [1, 2]}
        for imp in feature_importances:
            data_dict[imp.feature] = [0.5, 0.5]
        metadata = matrix_metadata_creator(indices='entity_id')
        test_store = get_matrix_store(
            project_storage,
            pandas.DataFrame.from_dict(data_dict).set_index(
                metadata['indices']), metadata)
        results = uniform_distribution(db_engine,
                                       model_id=model.model_id,
                                       as_of_date='2016-01-01',
                                       test_matrix_store=test_store,
                                       n_ranks=5)

        assert len(results) == 10  # top n_ranks (5) features x 2 entities
        for result in results:
            assert 'entity_id' in result
            assert 'feature_name' in result
            assert 'score' in result
            assert 'feature_value' in result
            assert result['feature_value'] == 0.5
            assert result['score'] >= 0
            assert result['score'] <= 1
            assert isinstance(result['feature_name'], str)
            assert result['entity_id'] in [1, 2]
Example #4
def test_predictor_get_train_columns():
    with prepare() as (project_storage, db_engine, model_id):
        predictor = Predictor(project_storage.model_storage_engine(),
                              db_engine)
        train_store = get_matrix_store(
            project_storage=project_storage,
            matrix=matrix_creator(),
            metadata=matrix_metadata_creator(matrix_type="train"),
        )

        # flip the order of some feature columns in the test matrix
        other_order_matrix = matrix_creator()
        order = other_order_matrix.columns.tolist()
        order[0], order[1] = order[1], order[0]
        other_order_matrix = other_order_matrix[order]
        test_store = get_matrix_store(
            project_storage=project_storage,
            matrix=other_order_matrix,
            metadata=matrix_metadata_creator(matrix_type="test"),
        )

        # Runs the same test for training and testing predictions
        for store, mat_type in zip((train_store, test_store),
                                   ("train", "test")):
            predict_proba = predictor.predict(
                model_id,
                store,
                misc_db_parameters=dict(),
                train_matrix_columns=train_store.columns(),
            )
            # assert
            # 1. that we calculated predictions
            assert len(predict_proba) > 0

            # 2. that the predictions table entries are present and
            # can be linked to the original models
            records = [
                row
                for row in db_engine.execute("""select entity_id, as_of_date
                from {}_results.predictions
                join model_metadata.models using (model_id)""".format(
                    mat_type))
            ]
            assert len(records) > 0
Example #5
def test_predictor_retrieve():
    with prepare() as (project_storage, db_engine, model_id):
        predictor = Predictor(project_storage.model_storage_engine(),
                              db_engine,
                              replace=False)

        # create prediction set
        matrix = matrix_creator()
        metadata = matrix_metadata_creator()
        matrix_store = get_matrix_store(project_storage, matrix, metadata)

        predict_proba = predictor.predict(
            model_id,
            matrix_store,
            misc_db_parameters=dict(),
            train_matrix_columns=matrix.columns[0:-1].tolist())

        # When run again, the predictions retrieved from the database
        # should match.
        #
        # Some trickiness here. Let's explain:
        #
        # If we are not careful, retrieving predictions from the database and
        # presenting them as a numpy array can result in a bad ordering,
        # since the given matrix may not be 'ordered' by some criteria
        # that can be easily represented by an ORDER BY clause.
        #
        # It will sometimes work, because without ORDER BY you will get
        # it back in the table's physical order, which unless something has
        # happened to the table will be the order you inserted it,
        # which could very well be the order in the matrix.
        # So it's not a bug that would necessarily immediately show itself,
        # but when it does go wrong your scores will be garbage.
        #
        # So we simulate a table order mutation that can happen over time:
        # Remove the first row and put it at the end.
        # If the Predictor doesn't explicitly reorder the results, this will fail.
        # Only running on TestPrediction because TrainPrediction behaves exactly
        # the same way. (A sketch of the reordering idea follows this example.)
        reorder_session = sessionmaker(bind=db_engine)()
        obj = reorder_session.query(TestPrediction).first()
        reorder_session.delete(obj)
        reorder_session.commit()

        make_transient(obj)
        reorder_session = sessionmaker(bind=db_engine)()
        reorder_session.add(obj)
        reorder_session.commit()

        predictor.load_model = Mock()
        new_predict_proba = predictor.predict(
            model_id,
            matrix_store,
            misc_db_parameters=dict(),
            train_matrix_columns=matrix.columns[0:-1].tolist())
        assert_array_equal(new_predict_proba, predict_proba)
        assert not predictor.load_model.called
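
The long comment above is the heart of this test: rows fetched without an ORDER BY come back in physical table order, which is not stable. A minimal sketch of the reordering idea, assuming the predictions table exposes entity_id, as_of_date, and score columns and that the matrix is indexed by (entity_id, as_of_date); alignment happens by key, never by position:

import pandas as pd

def reordered_scores(db_engine, model_id, matrix_store, mat_type="test"):
    # Fetch stored scores; physical row order is not guaranteed.
    rows = db_engine.execute(
        """select entity_id, as_of_date, score
        from {}_results.predictions
        where model_id = %(model_id)s""".format(mat_type),
        {"model_id": model_id},
    )
    scores = pd.DataFrame(
        rows.fetchall(), columns=["entity_id", "as_of_date", "score"]
    ).set_index(["entity_id", "as_of_date"])["score"]
    # Reindex to the matrix's own index so every score lands on its row.
    return scores.reindex(matrix_store.design_matrix.index).values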
Example #6
    def test_replace_true_rerun(self):
        with testing.postgresql.Postgresql() as postgresql:
            # create an engine and generate a table with fake feature data
            engine = create_engine(postgresql.url())
            ensure_db(engine)
            create_schemas(
                engine=engine,
                features_tables=features_tables,
                labels=labels,
                states=states,
            )
            matrix_metadata = matrix_metadata_creator(state="active",
                                                      test_duration="1month",
                                                      label_name="booking")

            dates = [
                datetime.datetime(2016, 1, 1, 0, 0),
                datetime.datetime(2016, 2, 1, 0, 0),
                datetime.datetime(2016, 3, 1, 0, 0),
            ]

            feature_dictionary = {
                "features0": ["f1", "f2"],
                "features1": ["f3", "f4"]
            }
            uuid = filename_friendly_hash(matrix_metadata)
            build_args = dict(
                as_of_times=dates,
                label_name="booking",
                label_type="binary",
                feature_dictionary=feature_dictionary,
                matrix_metadata=matrix_metadata,
                matrix_uuid=uuid,
                matrix_type="test",
            )

            with get_matrix_storage_engine() as matrix_storage_engine:
                builder = MatrixBuilder(
                    db_config=db_config,
                    matrix_storage_engine=matrix_storage_engine,
                    experiment_hash=experiment_hash,
                    engine=engine,
                    replace=True,
                )

                builder.build_matrix(**build_args)

                assert len(
                    matrix_storage_engine.get_store(uuid).design_matrix) == 5
                assert builder.sessionmaker().query(Matrix).get(uuid)
                # rerun
                builder.build_matrix(**build_args)
                assert len(
                    matrix_storage_engine.get_store(uuid).design_matrix) == 5
                assert builder.sessionmaker().query(Matrix).get(uuid)
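
replace=True forces a rebuild even though a matrix with the same uuid already exists; replace=False would short-circuit instead. A sketch of the kind of guard that implies, assuming the store exposes an exists() check (name assumed; this is an illustration, not the library's code):

def build_matrix_if_needed(builder, matrix_storage_engine, build_args):
    uuid = build_args["matrix_uuid"]
    store = matrix_storage_engine.get_store(uuid)
    if not builder.replace and store.exists():
        # Already built and replace not requested: reuse the stored matrix.
        return store
    builder.build_matrix(**build_args)
    return matrix_storage_engine.get_store(uuid)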
Example #7
def setup_model_train_tester(project_storage,
                             replace,
                             additional_bigtrain_classnames=None):
    matrix_storage_engine = MatrixStorageEngine(project_storage)
    train_matrix_store = get_matrix_store(
        project_storage,
        metadata=matrix_metadata_creator(matrix_type="train"),
        write_to_db=False)
    test_matrix_store = get_matrix_store(
        project_storage,
        metadata=matrix_metadata_creator(matrix_type="test"),
        write_to_db=False)
    sample_train_kwargs = {
        'matrix_store': train_matrix_store,
        'class_path': None,
        'parameters': {},
        'model_hash': None,
        'misc_db_parameters': {}
    }
    train_test_task = {
        'train_kwargs': sample_train_kwargs,
        'train_store': train_matrix_store,
        'test_store': test_matrix_store
    }

    predictor = MagicMock(spec_set=Predictor)
    trainer = MagicMock(spec_set=ModelTrainer)
    evaluator = MagicMock(spec_set=ModelEvaluator)
    individual_importance_calculator = MagicMock(
        spec_set=IndividualImportanceCalculator)
    protected_groups_generator = MagicMock(spec_set=ProtectedGroupsGenerator)
    train_tester = ModelTrainTester(
        matrix_storage_engine=matrix_storage_engine,
        model_trainer=trainer,
        model_evaluator=evaluator,
        individual_importance_calculator=individual_importance_calculator,
        predictor=predictor,
        subsets=[],
        replace=replace,
        protected_groups_generator=protected_groups_generator,
        additional_bigtrain_classnames=additional_bigtrain_classnames)
    return train_tester, train_test_task
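
A hedged usage sketch for this helper: the task dict's keys (train_kwargs, train_store, test_store) line up with keyword arguments, so the natural call is to expand the dict into the tester's task-processing method (process_task is assumed here to be that entry point):

def run_single_task(project_storage):
    train_tester, train_test_task = setup_model_train_tester(
        project_storage, replace=True
    )
    # Expand the task dict into keyword arguments; with MagicMock
    # collaborators this exercises only the orchestration logic.
    train_tester.process_task(**train_test_task)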
Example #8
def test_calculate_and_save():
    with rig_engines() as (db_engine, project_storage):
        train_store = get_matrix_store(
            project_storage,
            matrix_creator(),
            matrix_metadata_creator(matrix_type='train'),
        )
        test_store = get_matrix_store(
            project_storage,
            matrix_creator(),
            matrix_metadata_creator(matrix_type='test'),
        )
        calculator = IndividualImportanceCalculator(db_engine,
                                                    methods=['sample'],
                                                    replace=False)
        # given a trained model
        # and a test matrix
        _, model_id = fake_trained_model(
            db_engine, train_matrix_uuid=train_store.uuid
        )
        # I expect to be able to call calculate and save
        calculator.calculate_and_save_all_methods_and_dates(
            model_id, test_store)
        # and find individual importances in the results schema afterwards
        records = [
            row for row in db_engine.execute('''select entity_id, as_of_date
            from test_results.individual_importances
            join model_metadata.models using (model_id)''')
        ]
        assert len(records) > 0
        # and that, when run again, it yields the same result
        calculator.calculate_and_save_all_methods_and_dates(
            model_id, test_store)
        new_records = [
            row for row in db_engine.execute('''select entity_id, as_of_date
            from test_results.individual_importances
            join model_metadata.models using (model_id)''')
        ]
        assert len(records) == len(new_records)
        assert records == new_records
Example #9
    def test_replace_true_rerun(self):
        with testing.postgresql.Postgresql() as postgresql:
            # create an engine and generate a table with fake feature data
            engine = create_engine(postgresql.url())
            ensure_db(engine)
            create_schemas(engine=engine,
                           features_tables=features_tables,
                           labels=labels,
                           states=states)
            matrix_metadata = matrix_metadata_creator(
                state='state_one and state_two', test_duration='1month')

            dates = [
                datetime.datetime(2016, 1, 1, 0, 0),
                datetime.datetime(2016, 2, 1, 0, 0),
                datetime.datetime(2016, 3, 1, 0, 0)
            ]

            feature_dictionary = {
                'features0': ['f1', 'f2'],
                'features1': ['f3', 'f4'],
            }
            uuid = metta.generate_uuid(matrix_metadata)
            build_args = dict(as_of_times=dates,
                              label_name='booking',
                              label_type='binary',
                              feature_dictionary=feature_dictionary,
                              matrix_metadata=matrix_metadata,
                              matrix_uuid=uuid,
                              matrix_type='test')

            with get_matrix_storage_engine() as matrix_storage_engine:
                builder = MatrixBuilder(
                    db_config=db_config,
                    matrix_storage_engine=matrix_storage_engine,
                    engine=engine,
                    replace=True)

                builder.build_matrix(**build_args)

                assert len(matrix_storage_engine.get_store(uuid).matrix) == 5
                assert builder.sessionmaker().query(Matrix).get(uuid)
                # rerun
                builder.build_matrix(**build_args)
                assert len(matrix_storage_engine.get_store(uuid).matrix) == 5
                assert builder.sessionmaker().query(Matrix).get(uuid)
Example #10
def test_predictor_composite_index():
    with prepare() as (project_storage, db_engine, model_id):
        predictor = Predictor(project_storage.model_storage_engine(),
                              db_engine)

        dayone = datetime.datetime(2011, 1, 1)
        daytwo = datetime.datetime(2011, 1, 2)
        source_dict = {
            "entity_id": [1, 2, 1, 2],
            "as_of_date": [dayone, dayone, daytwo, daytwo],
            "feature_one": [3, 4, 5, 6],
            "feature_two": [5, 6, 7, 8],
            "label": [7, 8, 8, 7],
        }

        # Runs the same test for training and testing predictions
        for mat_type in ("train", "test"):

            matrix = pandas.DataFrame.from_dict(source_dict).set_index(
                ["entity_id", "as_of_date"])
            metadata = matrix_metadata_creator(matrix_type=mat_type)
            matrix_store = get_matrix_store(project_storage, matrix, metadata)

            predict_proba = predictor.predict(
                model_id,
                matrix_store,
                misc_db_parameters=dict(),
                train_matrix_columns=["feature_one", "feature_two"],
            )

            # assert
            # 1. that the returned predictions are of the desired length
            assert len(predict_proba) == 4

            # 2. that the predictions table entries are present and
            # can be linked to the original models
            records = [
                row
                for row in db_engine.execute("""select entity_id, as_of_date
                from {}_results.predictions
                join model_metadata.models using (model_id)""".format(
                    mat_type))
            ]
            assert len(records) == 4
Example #11
def test_predictor_needs_predictions(matrix_type, predict_setup_args):
    """Test that the logic that figures out if predictions are needed for a given model/matrix"""
    (project_storage, db_engine, model_id) = predict_setup_args
    # if not all of the predictions for the given model id and matrix are
    # present in the db, needs_predictions should return True; otherwise False
    predictor = Predictor(project_storage.model_storage_engine(), db_engine,
                          'worst')

    metadata = matrix_metadata_creator(matrix_type=matrix_type)
    matrix_store = get_matrix_store(project_storage, metadata=metadata)
    train_matrix_columns = matrix_store.columns()

    # we haven't done anything yet, this should definitely need predictions
    assert predictor.needs_predictions(matrix_store, model_id)
    predictor.predict(
        model_id,
        matrix_store,
        misc_db_parameters=dict(),
        train_matrix_columns=train_matrix_columns,
    )
    # now that predictions have been made, this should no longer need predictions
    assert not predictor.needs_predictions(matrix_store, model_id)
Example #12
def test_uniform_distribution_entity_id_index():
    with rig_engines() as (db_engine, project_storage):
        model = ModelFactory()
        feature_importances = [
            FeatureImportanceFactory(model_rel=model,
                                     feature="feature_{}".format(i))
            for i in range(0, 10)
        ]
        data_dict = {"entity_id": [1, 2]}
        for imp in feature_importances:
            data_dict[imp.feature] = [0.5, 0.5]
        metadata = matrix_metadata_creator(indices="entity_id")
        test_store = get_matrix_store(
            project_storage,
            pandas.DataFrame.from_dict(data_dict).set_index(
                metadata["indices"]),
            metadata,
        )
        results = uniform_distribution(
            db_engine,
            model_id=model.model_id,
            as_of_date="2016-01-01",
            test_matrix_store=test_store,
            n_ranks=5,
        )

        assert len(results) == 10  # top n_ranks (5) features x 2 entities
        for result in results:
            assert "entity_id" in result
            assert "feature_name" in result
            assert "score" in result
            assert "feature_value" in result
            assert result["feature_value"] == 0.5
            assert result["score"] >= 0
            assert result["score"] <= 1
            assert isinstance(result["feature_name"], str)
            assert result["entity_id"] in [1, 2]
Example #13
def test_predictor_needs_predictions(matrix_type, predict_setup_args):
    (project_storage, db_engine, model_id) = predict_setup_args
    # if not all of the predictions for the given model id and matrix are
    # present in the db, needs_predictions should return True; otherwise False
    predictor = Predictor(project_storage.model_storage_engine(), db_engine)

    matrix = matrix_creator(index="entity_id")
    metadata = matrix_metadata_creator(end_time=AS_OF_DATE,
                                       matrix_type=matrix_type,
                                       indices=["entity_id"])

    matrix_store = get_matrix_store(project_storage, matrix, metadata)
    train_matrix_columns = matrix.columns[0:-1].tolist()

    # we haven't done anything yet, this should definitely need predictions
    assert predictor.needs_predictions(matrix_store, model_id)
    predictor.predict(
        model_id,
        matrix_store,
        misc_db_parameters=dict(),
        train_matrix_columns=train_matrix_columns,
    )
    # now that predictions have been made, this should no longer need predictions
    assert not predictor.needs_predictions(matrix_store, model_id)
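
Both needs_predictions tests exercise the same contract: it stays True until every row of the matrix has a stored prediction for the model, then flips to False. A minimal sketch of that kind of check, with table and column names assumed for illustration:

def needs_predictions_sketch(db_engine, matrix_store, model_id, mat_type="test"):
    (count,) = db_engine.execute(
        """select count(*) from {}_results.predictions
        where model_id = %(model_id)s and matrix_uuid = %(uuid)s""".format(mat_type),
        {"model_id": model_id, "uuid": matrix_store.uuid},
    ).first()
    # Predictions are needed whenever any matrix row lacks a stored score.
    return count < len(matrix_store.design_matrix)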
Example #14
def prediction_results(matrix_type, predictor, predict_setup_args):
    (project_storage, db_engine, model_id) = predict_setup_args

    dayone = datetime.datetime(2011, 1, 1)
    daytwo = datetime.datetime(2011, 1, 2)
    source_dict = {
        "entity_id": [1, 2, 3, 1, 2, 3],
        "as_of_date": [dayone, dayone, dayone, daytwo, daytwo, daytwo],
        "feature_one": [3] * 6,
        "feature_two": [5] * 6,
        "label": [True, False] * 3
    }

    matrix = pd.DataFrame.from_dict(source_dict)
    metadata = matrix_metadata_creator(matrix_type=matrix_type)
    matrix_store = get_matrix_store(project_storage, matrix, metadata)

    predict_proba = predictor.predict(
        model_id,
        matrix_store,
        misc_db_parameters=dict(),
        train_matrix_columns=["feature_one", "feature_two"],
    )
    return predict_proba
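
A usage sketch for the prediction_results helper, mirroring how the surrounding tests build their Predictor (names assumed to match the fixtures above):

def test_prediction_lengths(predict_setup_args):
    (project_storage, db_engine, model_id) = predict_setup_args
    predictor = Predictor(project_storage.model_storage_engine(), db_engine)
    for matrix_type in ("train", "test"):
        predict_proba = prediction_results(matrix_type, predictor,
                                           predict_setup_args)
        # six rows in the source dict -> six scores
        assert len(predict_proba) == 6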
Example #15
def test_predictor_entity_index():
    with prepare() as (project_storage, db_engine, model_id):
        predictor = Predictor(project_storage.model_storage_engine(),
                              db_engine)

        # Runs the same test for training and testing predictions
        for mat_type in ("train", "test"):
            matrix = matrix_creator(index="entity_id")
            metadata = matrix_metadata_creator(end_time=AS_OF_DATE,
                                               matrix_type=mat_type,
                                               indices=["entity_id"])

            matrix_store = get_matrix_store(project_storage, matrix, metadata)
            train_matrix_columns = matrix.columns[0:-1].tolist()

            predict_proba = predictor.predict(
                model_id,
                matrix_store,
                misc_db_parameters=dict(),
                train_matrix_columns=train_matrix_columns,
            )

            # assert
            # 1. that the returned predictions are of the desired length
            assert len(predict_proba) == 2

            # 2. that the predictions table entries are present and
            # can be linked to the original models
            records = [
                row
                for row in db_engine.execute("""select entity_id, as_of_date
                from {}_results.predictions
                join model_metadata.models using (model_id)""".format(
                    mat_type))
            ]
            assert len(records) == 2

            # 3. that the contained as_of_dates match what we sent in
            for record in records:
                assert record[1].date() == AS_OF_DATE

            # 4. that the entity ids match the given dataset
            assert sorted([record[0] for record in records]) == [1, 2]

        # 5. running with the same model_id at a different as_of_date,
        # then again at the original as_of_date, replaces only the
        # records that share that date

        # Runs the same test for training and testing predictions
        for mat_type in ("train", "test"):
            new_matrix = matrix_creator(index="entity_id")
            new_metadata = matrix_metadata_creator(
                end_time=AS_OF_DATE + datetime.timedelta(days=1),
                matrix_type=mat_type,
                indices=["entity_id"],
            )
            new_matrix_store = get_matrix_store(project_storage, new_matrix,
                                                new_metadata)

            predictor.predict(
                model_id,
                new_matrix_store,
                misc_db_parameters=dict(),
                train_matrix_columns=train_matrix_columns,
            )
            predictor.predict(
                model_id,
                matrix_store,
                misc_db_parameters=dict(),
                train_matrix_columns=train_matrix_columns,
            )
            records = [
                row
                for row in db_engine.execute("""select entity_id, as_of_date
                from {}_results.predictions
                join model_metadata.models using (model_id)""".format(
                    mat_type))
            ]
            assert len(records) == 4

        # 6. that we can delete the model when we are done predicting with it
        predictor.delete_model(model_id)
        assert predictor.load_model(model_id) is None
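
Point 5 above depends on replacement being scoped to the as_of_date(s) of the matrix being predicted on, not to the whole model. One plausible shape for such a scoped delete (an assumption for illustration; the library's actual statement may differ):

def delete_predictions_for_dates(db_engine, model_id, as_of_dates, mat_type="test"):
    # Remove rows only for this model at the matrix's own dates, so
    # predictions made at other as_of_dates survive a re-run.
    db_engine.execute(
        """delete from {}_results.predictions
        where model_id = %(model_id)s
        and as_of_date = any(%(dates)s)""".format(mat_type),
        {"model_id": model_id, "dates": list(as_of_dates)},
    )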
Example #16
def test_integration():
    with rig_engines() as (db_engine, project_storage):
        train_store = get_matrix_store(
            project_storage, matrix_creator(),
            matrix_metadata_creator(matrix_type='train'))
        as_of_dates = [datetime.date(2016, 12, 21), datetime.date(2017, 1, 21)]

        test_stores = []
        for as_of_date in as_of_dates:
            matrix_store = get_matrix_store(
                project_storage,
                pandas.DataFrame.from_dict({
                    'entity_id': [3],
                    'feature_one': [8],
                    'feature_two': [5],
                    'label': [0]
                }).set_index('entity_id'),
                matrix_metadata_creator(end_time=as_of_date,
                                        indices=['entity_id']))
            test_stores.append(matrix_store)

        model_storage_engine = ModelStorageEngine(project_storage)

        experiment_hash = save_experiment_and_get_hash({}, db_engine)
        # instantiate pipeline objects
        trainer = ModelTrainer(
            experiment_hash=experiment_hash,
            model_storage_engine=model_storage_engine,
            db_engine=db_engine,
        )
        predictor = Predictor(model_storage_engine, db_engine)
        model_evaluator = ModelEvaluator([{
            'metrics': ['precision@'],
            'thresholds': {
                'top_n': [5]
            }
        }], [{}], db_engine)

        # run the pipeline
        grid_config = {
            'sklearn.linear_model.LogisticRegression': {
                'C': [0.00001, 0.0001],
                'penalty': ['l1', 'l2'],
                'random_state': [2193]
            }
        }
        model_ids = trainer.train_models(grid_config=grid_config,
                                         misc_db_parameters=dict(),
                                         matrix_store=train_store)

        for model_id in model_ids:
            for as_of_date, test_store in zip(as_of_dates, test_stores):
                predictions_proba = predictor.predict(
                    model_id,
                    test_store,
                    misc_db_parameters=dict(),
                    train_matrix_columns=['feature_one', 'feature_two'])

                model_evaluator.evaluate(
                    predictions_proba,
                    test_store,
                    model_id,
                )

        # assert
        # 1. that the predictions table entries are present and
        # can be linked to the original models
        records = [
            row for row in db_engine.execute(
                '''select entity_id, model_id, as_of_date
            from test_results.predictions
            join model_metadata.models using (model_id)
            order by 3, 2''')
        ]
        assert records == [
            (3, 1, datetime.datetime(2016, 12, 21)),
            (3, 2, datetime.datetime(2016, 12, 21)),
            (3, 3, datetime.datetime(2016, 12, 21)),
            (3, 4, datetime.datetime(2016, 12, 21)),
            (3, 1, datetime.datetime(2017, 1, 21)),
            (3, 2, datetime.datetime(2017, 1, 21)),
            (3, 3, datetime.datetime(2017, 1, 21)),
            (3, 4, datetime.datetime(2017, 1, 21)),
        ]

        # that evaluations are there
        records = [
            row for row in db_engine.execute('''
                select model_id, evaluation_start_time, metric, parameter
                from test_results.evaluations order by 2, 1''')
        ]
        assert records == [
            (1, datetime.datetime(2016, 12, 21), 'precision@', '5_abs'),
            (2, datetime.datetime(2016, 12, 21), 'precision@', '5_abs'),
            (3, datetime.datetime(2016, 12, 21), 'precision@', '5_abs'),
            (4, datetime.datetime(2016, 12, 21), 'precision@', '5_abs'),
            (1, datetime.datetime(2017, 1, 21), 'precision@', '5_abs'),
            (2, datetime.datetime(2017, 1, 21), 'precision@', '5_abs'),
            (3, datetime.datetime(2017, 1, 21), 'precision@', '5_abs'),
            (4, datetime.datetime(2017, 1, 21), 'precision@', '5_abs'),
        ]
Example #17
def test_integration():
    with rig_engines() as (db_engine, project_storage):
        train_store = get_matrix_store(
            project_storage,
            matrix_creator(),
            matrix_metadata_creator(matrix_type="train"),
        )
        as_of_dates = [datetime.date(2016, 12, 21), datetime.date(2017, 1, 21)]

        test_stores = []
        for as_of_date in as_of_dates:
            matrix_store = get_matrix_store(
                project_storage,
                pandas.DataFrame.from_dict({
                    "entity_id": [3],
                    "feature_one": [8],
                    "feature_two": [5],
                    "label": [0],
                }).set_index("entity_id"),
                matrix_metadata_creator(end_time=as_of_date,
                                        indices=["entity_id"]),
            )
            test_stores.append(matrix_store)

        model_storage_engine = ModelStorageEngine(project_storage)

        experiment_hash = save_experiment_and_get_hash({}, db_engine)
        # instantiate pipeline objects
        trainer = ModelTrainer(
            experiment_hash=experiment_hash,
            model_storage_engine=model_storage_engine,
            db_engine=db_engine,
        )
        predictor = Predictor(model_storage_engine, db_engine)
        model_evaluator = ModelEvaluator([{
            "metrics": ["precision@"],
            "thresholds": {
                "top_n": [5]
            }
        }], [{}], db_engine)

        # run the pipeline
        grid_config = {
            "sklearn.linear_model.LogisticRegression": {
                "C": [0.00001, 0.0001],
                "penalty": ["l1", "l2"],
                "random_state": [2193],
            }
        }
        model_ids = trainer.train_models(grid_config=grid_config,
                                         misc_db_parameters=dict(),
                                         matrix_store=train_store)

        for model_id in model_ids:
            for as_of_date, test_store in zip(as_of_dates, test_stores):
                predictions_proba = predictor.predict(
                    model_id,
                    test_store,
                    misc_db_parameters=dict(),
                    train_matrix_columns=["feature_one", "feature_two"],
                )

                model_evaluator.evaluate(predictions_proba, test_store,
                                         model_id)

        # assert
        # 1. that the predictions table entries are present and
        # can be linked to the original models
        records = [
            row for row in db_engine.execute(
                """select entity_id, model_id, as_of_date
            from test_results.predictions
            join model_metadata.models using (model_id)
            order by 3, 2""")
        ]
        assert records == [
            (3, 1, datetime.datetime(2016, 12, 21)),
            (3, 2, datetime.datetime(2016, 12, 21)),
            (3, 3, datetime.datetime(2016, 12, 21)),
            (3, 4, datetime.datetime(2016, 12, 21)),
            (3, 1, datetime.datetime(2017, 1, 21)),
            (3, 2, datetime.datetime(2017, 1, 21)),
            (3, 3, datetime.datetime(2017, 1, 21)),
            (3, 4, datetime.datetime(2017, 1, 21)),
        ]

        # that evaluations are there
        records = [
            row for row in db_engine.execute("""
                select model_id, evaluation_start_time, metric, parameter
                from test_results.evaluations order by 2, 1""")
        ]
        assert records == [
            (1, datetime.datetime(2016, 12, 21), "precision@", "5_abs"),
            (2, datetime.datetime(2016, 12, 21), "precision@", "5_abs"),
            (3, datetime.datetime(2016, 12, 21), "precision@", "5_abs"),
            (4, datetime.datetime(2016, 12, 21), "precision@", "5_abs"),
            (1, datetime.datetime(2017, 1, 21), "precision@", "5_abs"),
            (2, datetime.datetime(2017, 1, 21), "precision@", "5_abs"),
            (3, datetime.datetime(2017, 1, 21), "precision@", "5_abs"),
            (4, datetime.datetime(2017, 1, 21), "precision@", "5_abs"),
        ]