Example #1
def test_n_jobs_not_new_model():
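    # n_jobs controls training parallelism rather than model behavior, so
    # varying it should yield separate train tasks but not separate model groups.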
    grid_config = {
        'sklearn.ensemble.AdaBoostClassifier': {
            'n_estimators': [10, 100, 1000]
        },
        'sklearn.ensemble.RandomForestClassifier': {
            'n_estimators': [10, 100],
            'max_features': ['sqrt', 'log2'],
            'max_depth': [5, 10, 15, 20],
            'criterion': ['gini', 'entropy'],
            'n_jobs': [12, 24],
        }
    }

    with testing.postgresql.Postgresql() as postgresql:
        engine = create_engine(postgresql.url())
        ensure_db(engine)
        with mock_s3():
            s3_conn = boto3.resource('s3')
            s3_conn.create_bucket(Bucket='econ-dev')
            trainer = ModelTrainer(
                project_path='econ-dev/inspections',
                experiment_hash=None,
                model_storage_engine=S3ModelStorageEngine('econ-dev/inspections'),
                db_engine=engine,
                model_group_keys=['label_name', 'label_timespan']
            )

            matrix = pandas.DataFrame.from_dict({
                'entity_id': [1, 2],
                'feature_one': [3, 4],
                'feature_two': [5, 6],
                'label': ['good', 'bad']
            })
            train_tasks = trainer.generate_train_tasks(
                grid_config,
                dict(),
                InMemoryMatrixStore(matrix, {
                    'label_timespan': '1d',
                    'end_time': datetime.datetime.now(),
                    'feature_start_time': datetime.date(2012, 12, 20),
                    'label_name': 'label',
                    'metta-uuid': '1234',
                    'feature_names': ['ft1', 'ft2'],
                    'indices': ['entity_id'],
                })
            )
            assert len(train_tasks) == 35  # 32 + 3; would be (32*2) + 3 if n_jobs weren't removed
            assert len([
                task for task in train_tasks
                if 'n_jobs' in task['parameters']
            ]) == 32

            for train_task in train_tasks:
                trainer.process_train_task(**train_task)

            for row in engine.execute(
                'select model_parameters from results.model_groups'
            ):
                assert 'n_jobs' not in row[0]
Example #2
def test_uniform_distribution_entity_id_index():
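    # uniform_distribution should emit one row per entity for each of the
    # top n_ranks features.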
    with testing.postgresql.Postgresql() as postgresql:
        db_engine = create_engine(postgresql.url())
        ensure_db(db_engine)
        init_engine(db_engine)
        model = ModelFactory()
        feature_importances = [
            FeatureImportanceFactory(model_rel=model,
                                     feature='feature_{}'.format(i))
            for i in range(0, 10)
        ]
        data_dict = {'entity_id': [1, 2]}
        for imp in feature_importances:
            data_dict[imp.feature] = [0.5, 0.5]
        test_store = InMemoryMatrixStore(
            matrix=pandas.DataFrame.from_dict(data_dict),
            metadata=sample_metadata())
        session.commit()
        results = uniform_distribution(db_engine,
                                       model_id=model.model_id,
                                       as_of_date='2016-01-01',
                                       test_matrix_store=test_store,
                                       n_ranks=5)

        assert len(results) == 10  # top 5 features (n_ranks) x 2 entities
        for result in results:
            assert 'entity_id' in result
            assert 'feature_name' in result
            assert 'score' in result
            assert 'feature_value' in result
            assert result['feature_value'] == 0.5
            assert result['score'] >= 0
            assert result['score'] <= 1
            assert isinstance(result['feature_name'], str)
            assert result['entity_id'] in [1, 2]
Example #3
    def matrix_store(self):
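        # Build the same matrix as in-memory, CSV, and HDF stores and check
        # that all three back-ends agree before returning them together.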
        data_dict = OrderedDict([('entity_id', [1, 2]),
                                 ('k_feature', [0.5, 0.4]),
                                 ('m_feature', [0.4, 0.5]), ('label', [0, 1])])
        df = pd.DataFrame.from_dict(data_dict)
        metadata = {
            'label_name': 'label',
            'indices': ['entity_id'],
        }

        inmemory = InMemoryMatrixStore(matrix=df, metadata=metadata)

        with tempfile.TemporaryDirectory() as tmpdir:
            tmpcsv = os.path.join(tmpdir, 'df.csv')
            tmpyaml = os.path.join(tmpdir, 'metadata.yaml')
            tmphdf = os.path.join(tmpdir, 'df.h5')
            with open(tmpyaml, 'w') as outfile:
                yaml.dump(metadata, outfile, default_flow_style=False)
                df.to_csv(tmpcsv)
                df.to_hdf(tmphdf, 'matrix')
                csv = CSVMatrixStore(matrix_path=tmpcsv, metadata_path=tmpyaml)
                hdf = HDFMatrixStore(matrix_path=tmphdf, metadata_path=tmpyaml)

                assert csv.matrix.to_dict() == inmemory.matrix.to_dict()
                assert hdf.matrix.to_dict() == inmemory.matrix.to_dict()

                assert csv.metadata == inmemory.metadata
                assert hdf.metadata == inmemory.metadata

                assert csv.head_of_matrix.to_dict() == inmemory.head_of_matrix.to_dict()
                assert hdf.head_of_matrix.to_dict() == inmemory.head_of_matrix.to_dict()

                assert csv.empty == inmemory.empty
                assert hdf.empty == inmemory.empty

                assert csv.labels().to_dict() == inmemory.labels().to_dict()
                assert hdf.labels().to_dict() == inmemory.labels().to_dict()

        matrix_store = [inmemory, csv, hdf]

        return matrix_store
Example #4
    def test_retry_recovery(self):
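        # The trainer's retry logic should recover once the database server
        # comes back up on the same port.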
        grid_config = {
            'sklearn.ensemble.AdaBoostClassifier': {
                'n_estimators': [10]
            },
        }

        engine = None
        trainer = None
        port = None
        with testing.postgresql.Postgresql() as postgresql:
            port = postgresql.settings['port']
            engine = create_engine(postgresql.url())
            ensure_db(engine)
            trainer = ModelTrainer(
                project_path='econ-dev/inspections',
                experiment_hash=None,
                model_storage_engine=InMemoryModelStorageEngine(project_path=''),
                db_engine=engine,
                model_group_keys=['label_name', 'label_timespan']
            )

            matrix = pandas.DataFrame.from_dict({
                'entity_id': [1, 2],
                'feature_one': [3, 4],
                'feature_two': [5, 6],
                'label': ['good', 'bad']
            })
            matrix_store = InMemoryMatrixStore(matrix, {
                'label_timespan': '1d',
                'end_time': datetime.datetime.now(),
                'feature_start_time': datetime.date(2012, 12, 20),
                'label_name': 'label',
                'metta-uuid': '1234',
                'feature_names': ['ft1', 'ft2'],
                'indices': ['entity_id'],
            })

        # start without a database server
        # then bring it back up after the first sleep
        # use self so it doesn't go out of scope too early and shut down
        self.new_server = None
        def replace_db(arg):
            self.new_server = testing.postgresql.Postgresql(port=port)
            engine = create_engine(self.new_server.url())
            ensure_db(engine)
        with patch('time.sleep') as time_mock:
            time_mock.side_effect = replace_db
            try:
                trainer.train_models(grid_config, dict(), matrix_store)
            finally:
                if self.new_server is not None:
                    self.new_server.stop()
            assert len(time_mock.mock_calls) == 1
Example #5
    def test_as_of_dates_entity_date_index(self):
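        # With a composite (entity_id, as_of_date) index, as_of_dates should
        # return the distinct dates in the matrix.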
        data = {
            'entity_id': [1, 2, 1, 2],
            'feature_one': [0.5, 0.6, 0.5, 0.6],
            'feature_two': [0.5, 0.6, 0.5, 0.6],
            'as_of_date':
            ['2016-01-01', '2016-01-01', '2017-01-01', '2017-01-01']
        }
        matrix = InMemoryMatrixStore(
            matrix=pd.DataFrame.from_dict(data),
            metadata={'indices': ['entity_id', 'as_of_date']})

        self.assertEqual(matrix.as_of_dates, ['2016-01-01', '2017-01-01'])
Example #6
def test_predictor_composite_index():
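    # Predicting on a matrix indexed by (entity_id, as_of_date) should yield
    # one prediction row per index entry.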
    with testing.postgresql.Postgresql() as postgresql:
        db_engine = create_engine(postgresql.url())
        ensure_db(db_engine)
        project_path = 'econ-dev/inspections'
        model_storage_engine = InMemoryModelStorageEngine(project_path)
        _, model_id = \
            fake_trained_model(project_path, model_storage_engine, db_engine)
        predictor = Predictor(project_path, model_storage_engine, db_engine)
        dayone = datetime.datetime(2011, 1, 1)
        daytwo = datetime.datetime(2011, 1, 2)
        # create prediction set
        matrix = pandas.DataFrame.from_dict({
            'entity_id': [1, 2, 1, 2],
            'as_of_date': [dayone, dayone, daytwo, daytwo],
            'feature_one': [3, 4, 5, 6],
            'feature_two': [5, 6, 7, 8],
            'label': [7, 8, 8, 7]
        }).set_index(['entity_id', 'as_of_date'])
        metadata = {
            'label_name': 'label',
            'end_time': AS_OF_DATE,
            'label_timespan': '3month',
            'metta-uuid': '1234',
            'indices': ['entity_id', 'as_of_date'],
        }
        matrix_store = InMemoryMatrixStore(matrix, metadata)
        predict_proba = predictor.predict(
            model_id,
            matrix_store,
            misc_db_parameters=dict(),
            train_matrix_columns=['feature_one', 'feature_two']
        )

        # assert
        # 1. that the returned predictions are of the desired length
        assert len(predict_proba) == 4

        # 2. that the predictions table entries are present and
        # can be linked to the original models
        records = [
            row for row in
            db_engine.execute('''select entity_id, as_of_date
            from results.predictions
            join results.models using (model_id)''')
        ]
        assert len(records) == 4
Example #7
    def test_retry_max(self):
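        # With the database permanently gone, train_models should exhaust
        # its retries and surface the OperationalError.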
        grid_config = {
            'sklearn.ensemble.AdaBoostClassifier': {
                'n_estimators': [10]
            },
        }

        engine = None
        trainer = None
        # set up a basic model training run
        # TODO abstract the setup of a basic model training run where
        # we don't worry about the specific values used? it would make
        # tests like this require a bit less noise to read past
        with testing.postgresql.Postgresql() as postgresql:
            engine = create_engine(postgresql.url())
            ensure_db(engine)
            trainer = ModelTrainer(
                project_path='econ-dev/inspections',
                experiment_hash=None,
                model_storage_engine=InMemoryModelStorageEngine(project_path=''),
                db_engine=engine,
                model_group_keys=['label_name', 'label_timespan']
            )

            matrix = pandas.DataFrame.from_dict({
                'entity_id': [1, 2],
                'feature_one': [3, 4],
                'feature_two': [5, 6],
                'label': ['good', 'bad']
            })
            matrix_store = InMemoryMatrixStore(matrix, {
                'label_timespan': '1d',
                'end_time': datetime.datetime.now(),
                'feature_start_time': datetime.date(2012, 12, 20),
                'label_name': 'label',
                'metta-uuid': '1234',
                'feature_names': ['ft1', 'ft2'],
                'indices': ['entity_id'],
            })
        # the postgres server goes out of scope here and thus no longer exists
        with patch('time.sleep') as time_mock:
            with self.assertRaises(sqlalchemy.exc.OperationalError):
                trainer.train_models(grid_config, dict(), matrix_store)
            # we want to make sure that we are using the retrying module sanely,
            # as opposed to matching the exact number of calls specified by the code
            assert len(time_mock.mock_calls) > 5
Example #8
def test_baseline_exception_handling():
    grid_config = {
        'triage.component.catwalk.baselines.rankers.PercentileRankOneFeature': {
            'feature': ['feature_one', 'feature_three']
        }
    }
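    # 'feature_three' is absent from the training matrix, so the second
    # baseline task should fail gracefully instead of producing a model.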
    with testing.postgresql.Postgresql() as postgresql:
        engine = create_engine(postgresql.url())
        project_path = 'econ-dev/inspections'
        model_storage_engine = S3ModelStorageEngine(project_path)
        ensure_db(engine)
        with mock_s3():
            s3_conn = boto3.resource('s3')
            s3_conn.create_bucket(Bucket='econ-dev')
            trainer = ModelTrainer(
                project_path='econ-dev/inspections',
                experiment_hash=None,
                model_storage_engine=model_storage_engine,
                db_engine=engine,
                model_group_keys=['label_name', 'label_timespan']
            )

            matrix = pandas.DataFrame.from_dict({
                'entity_id': [1, 2],
                'feature_one': [3, 4],
                'feature_two': [5, 6],
                'label': ['good', 'bad']
            })
            train_tasks = trainer.generate_train_tasks(
                grid_config,
                dict(),
                InMemoryMatrixStore(matrix, {
                    'label_timespan': '1d',
                    'end_time': datetime.datetime.now(),
                    'feature_start_time': datetime.date(2012, 12, 20),
                    'label_name': 'label',
                    'metta-uuid': '1234',
                    'feature_names': ['ft1', 'ft2'],
                    'indices': ['entity_id'],
                })
            )
            model_ids = []
            for train_task in train_tasks:
                model_ids.append(trainer.process_train_task(**train_task))
            assert model_ids == [1, None]
Example #9
def test_model_trainer(sample_matrix_store, grid_config):
    with testing.postgresql.Postgresql() as postgresql:
        db_engine = create_engine(postgresql.url())
        ensure_db(db_engine)
        init_engine(db_engine)

        with mock_s3():
            s3_conn = boto3.resource('s3')
            s3_conn.create_bucket(Bucket='econ-dev')

            # Creates a matrix entry in the matrices table with uuid from metadata above
            MatrixFactory(matrix_uuid="1234")
            session.commit()
            project_path = 'econ-dev/inspections'
            model_storage_engine = S3ModelStorageEngine(project_path)
            trainer = ModelTrainer(
                project_path=project_path,
                experiment_hash=None,
                model_storage_engine=model_storage_engine,
                model_grouper=ModelGrouper(),
                db_engine=db_engine,
            )
            model_ids = trainer.train_models(grid_config=grid_config,
                                             misc_db_parameters=dict(),
                                             matrix_store=sample_matrix_store)

            # assert
            # 1. that the models and feature importances table entries are present
            records = [
                row for row in db_engine.execute(
                    'select * from train_results.feature_importances')
            ]
            assert len(records) == 4 * 2  # 4 models x 2 features (entity_id excluded)

            records = [
                row for row in db_engine.execute(
                    'select model_hash from model_metadata.models')
            ]
            assert len(records) == 4
            hashes = [row[0] for row in records]

            # 2. that the model groups are distinct
            records = [
                row for row in db_engine.execute(
                    'select distinct model_group_id from model_metadata.models'
                )
            ]
            assert len(records) == 4

            # 3. that the model sizes are saved in the table and all are < 1 kB
            records = [
                row for row in db_engine.execute(
                    'select model_size from model_metadata.models')
            ]
            assert len(records) == 4
            for i in records:
                size = i[0]
                assert size < 1

            # 4. that all four models are cached
            model_pickles = [
                model_storage_engine.get_store(model_hash).load()
                for model_hash in hashes
            ]
            assert len(model_pickles) == 4
            assert len([x for x in model_pickles if x is not None]) == 4

            # 5. that their results can have predictions made on it
            test_matrix = pandas.DataFrame.from_dict({
                'entity_id': [3, 4],
                'feature_one': [4, 4],
                'feature_two': [6, 5],
            })

            test_matrix = InMemoryMatrixStore(matrix=test_matrix, metadata=sample_metadata())\
                .matrix

            for model_pickle in model_pickles:
                predictions = model_pickle.predict(test_matrix)
                assert len(predictions) == 2

            # 6. when run again, same models are returned
            new_model_ids = trainer.train_models(
                grid_config=grid_config,
                misc_db_parameters=dict(),
                matrix_store=sample_matrix_store)
            assert len([
                row for row in db_engine.execute(
                    'select model_hash from model_metadata.models')
            ]) == 4
            assert model_ids == new_model_ids

            # 7. if replace is set, update non-unique attributes and feature importances
            max_batch_run_time = [
                row[0] for row in db_engine.execute(
                    'select max(batch_run_time) from model_metadata.models')
            ][0]
            trainer = ModelTrainer(
                project_path=project_path,
                experiment_hash=None,
                model_storage_engine=model_storage_engine,
                model_grouper=ModelGrouper(
                    model_group_keys=['label_name', 'label_timespan']),
                db_engine=db_engine,
                replace=True)
            new_model_ids = trainer.train_models(
                grid_config=grid_config,
                misc_db_parameters=dict(),
                matrix_store=sample_matrix_store,
            )
            assert model_ids == new_model_ids
            assert [
                row['model_id'] for row in db_engine.execute(
                    'select model_id from model_metadata.models order by 1 asc'
                )
            ] == model_ids
            new_max_batch_run_time = [
                row[0] for row in db_engine.execute(
                    'select max(batch_run_time) from model_metadata.models')
            ][0]
            assert new_max_batch_run_time > max_batch_run_time

            records = [
                row for row in db_engine.execute(
                    'select * from train_results.feature_importances')
            ]
            assert len(records) == 4 * 2  # 4 models x 2 features (entity_id excluded)

            # 8. if the cache is missing but the metadata is still there, reuse the metadata
            for row in db_engine.execute(
                    'select model_hash from model_metadata.models'):
                model_storage_engine.get_store(row[0]).delete()
            new_model_ids = trainer.train_models(
                grid_config=grid_config,
                misc_db_parameters=dict(),
                matrix_store=sample_matrix_store)
            assert model_ids == sorted(new_model_ids)

            # 9. that the generator interface works the same way
            new_model_ids = trainer.generate_trained_models(
                grid_config=grid_config,
                misc_db_parameters=dict(),
                matrix_store=sample_matrix_store)
            assert model_ids == \
                sorted([model_id for model_id in new_model_ids])
Example #10
def test_predictor_retrieve():
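    # With replace=False, a repeat predict call should reuse predictions
    # already stored in the database instead of reloading the model.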
    with testing.postgresql.Postgresql() as postgresql:
        db_engine = create_engine(postgresql.url())
        ensure_db(db_engine)
        init_engine(db_engine)

        project_path = 'econ-dev/inspections'
        model_storage_engine = InMemoryModelStorageEngine(project_path)

        _, model_id = \
            fake_trained_model(project_path, model_storage_engine, db_engine, train_matrix_uuid='1234')

        predictor = Predictor(project_path,
                              model_storage_engine,
                              db_engine,
                              replace=False)

        dayone = datetime.date(2011, 1,
                               1).strftime(predictor.expected_matrix_ts_format)
        daytwo = datetime.date(2011, 1,
                               2).strftime(predictor.expected_matrix_ts_format)

        # create prediction set
        matrix_data = {
            'entity_id': [1, 2, 1, 2],
            'as_of_date': [dayone, dayone, daytwo, daytwo],
            'feature_one': [3, 4, 5, 6],
            'feature_two': [5, 6, 7, 8],
            'label': [7, 8, 8, 7]
        }
        matrix = pandas.DataFrame.from_dict(matrix_data)\
            .set_index(['entity_id', 'as_of_date'])
        metadata = {
            'label_name': 'label',
            'end_time': AS_OF_DATE,
            'label_timespan': '3month',
            'metta-uuid': '1234',
            'indices': ['entity_id', 'as_of_date'],
            'matrix_type': 'test'
        }

        matrix_store = InMemoryMatrixStore(matrix, metadata)

        predict_proba = predictor.predict(
            model_id,
            matrix_store,
            misc_db_parameters=dict(),
            train_matrix_columns=['feature_one', 'feature_two'])

        # When run again, the predictions retrieved from the database
        # should match.
        #
        # Some trickiness here. Let's explain:
        #
        # If we are not careful, retrieving predictions from the database and
        # presenting them as a numpy array can result in a bad ordering,
        # since the given matrix may not be 'ordered' by some criteria
        # that can be easily represented by an ORDER BY clause.
        #
        # It will sometimes work, because without ORDER BY you get rows back
        # in the table's physical order which, unless something has happened
        # to the table, will be the order in which they were inserted,
        # which could very well be the order in the matrix.
        # So it's not a bug that would necessarily show itself immediately,
        # but when it does go wrong your scores will be garbage.
        #
        # So we simulate a table order mutation that can happen over time:
        # remove the first row and put it at the end.
        # If the Predictor doesn't explicitly reorder the results, this will fail.
        # Only run on TestPrediction, because TrainPrediction behaves exactly the same way.
        reorder_session = sessionmaker(bind=db_engine)()
        obj = reorder_session.query(TestPrediction).first()
        reorder_session.delete(obj)
        reorder_session.commit()

        make_transient(obj)
        reorder_session = sessionmaker(bind=db_engine)()
        reorder_session.add(obj)
        reorder_session.commit()

        predictor.load_model = Mock()
        new_predict_proba = predictor.predict(
            model_id,
            matrix_store,
            misc_db_parameters=dict(),
            train_matrix_columns=['feature_one', 'feature_two'])
        assert_array_equal(new_predict_proba, predict_proba)
        assert not predictor.load_model.called
Example #11
def test_predictor():
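    # Exercise predict() for both train and test matrix types and verify
    # the persisted prediction records.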
    with testing.postgresql.Postgresql() as postgresql:
        db_engine = create_engine(postgresql.url())
        ensure_db(db_engine)
        init_engine(db_engine)

        with mock_s3():
            s3_conn = boto3.resource('s3')
            s3_conn.create_bucket(Bucket='econ-dev')
            project_path = 'econ-dev/inspections'
            model_storage_engine = S3ModelStorageEngine(project_path)

            _, model_id = \
                fake_trained_model(project_path, model_storage_engine, db_engine, train_matrix_uuid='1234')

            predictor = Predictor(project_path, model_storage_engine,
                                  db_engine)

            # create prediction set
            matrix = pandas.DataFrame.from_dict({
                'entity_id': [1, 2],
                'feature_one': [3, 4],
                'feature_two': [5, 6],
                'label': [7, 8]
            }).set_index('entity_id')

            metadata = {
                'label_name': 'label',
                'end_time': AS_OF_DATE,
                'label_timespan': '3month',
                'metta-uuid': '1234',
                'indices': ['entity_id'],
            }

            train_matrix_columns = ['feature_one', 'feature_two']

            # Runs the same test for training and testing predictions
            for mat_type in ("train", "test"):
                # Create the matrix to be tested and store in db
                metadata['matrix_type'] = mat_type

                matrix_store = InMemoryMatrixStore(matrix, metadata)

                # Note: the first time 'matrix' is used, the label column is popped.
                # It must be added back into 'matrix' to create another matrix_store.
                matrix['label'] = [7, 8]

                predict_proba = predictor.predict(
                    model_id,
                    matrix_store,
                    misc_db_parameters=dict(),
                    train_matrix_columns=train_matrix_columns)

                # assert
                # 1. that the returned predictions are of the desired length
                assert len(predict_proba) == 2

                # 2. that the predictions table entries are present and
                # can be linked to the original models
                records = [
                    row for row in db_engine.execute(
                        '''select entity_id, as_of_date
                    from {}_results.{}_predictions
                    join model_metadata.models using (model_id)'''.format(
                            mat_type, mat_type))
                ]
                assert len(records) == 2

                # 3. that the contained as_of_dates match what we sent in
                for record in records:
                    assert record[1].date() == AS_OF_DATE

                # 4. that the entity ids match the given dataset
                assert sorted([record[0] for record in records]) == [1, 2]

            # 5. running with same model_id, different as of date
            # then with same as of date only replaces the records
            # with the same date
            new_matrix = pandas.DataFrame.from_dict({
                'entity_id': [1, 2],
                'feature_one': [3, 4],
                'feature_two': [5, 6],
                'label': [7, 8]
            }).set_index('entity_id')
            new_metadata = {
                'label_name': 'label',
                'end_time': AS_OF_DATE + datetime.timedelta(days=1),
                'label_timespan': '3month',
                'metta-uuid': '1234',
                'indices': ['entity_id'],
            }

            # Runs the same test for training and testing predictions
            for mat_type in ("train", "test"):

                # Create the matrix to be tested and store in db
                new_metadata['matrix_type'] = mat_type

                new_matrix_store = InMemoryMatrixStore(new_matrix,
                                                       new_metadata)

                # Adding 'label' column back into new_matrix
                new_matrix['label'] = [7, 8]

                predictor.predict(model_id,
                                  new_matrix_store,
                                  misc_db_parameters=dict(),
                                  train_matrix_columns=train_matrix_columns)
                predictor.predict(model_id,
                                  matrix_store,
                                  misc_db_parameters=dict(),
                                  train_matrix_columns=train_matrix_columns)
                records = [
                    row for row in db_engine.execute(
                        '''select entity_id, as_of_date
                    from {}_results.{}_predictions
                    join model_metadata.models using (model_id)'''.format(
                            mat_type, mat_type))
                ]
                assert len(records) == 4

            # 6. that we can delete the model once we're done predicting with it
            predictor.delete_model(model_id)
            assert predictor.load_model(model_id) is None
Example #12
def test_integration():
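    # End-to-end run: train a small grid, predict on two test matrices,
    # evaluate, and check the persisted prediction and evaluation records.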
    with testing.postgresql.Postgresql() as postgresql:
        db_engine = create_engine(postgresql.url())
        ensure_db(db_engine)
        init_engine(db_engine)

        with mock_s3():
            s3_conn = boto3.resource('s3')
            s3_conn.create_bucket(Bucket='econ-dev')
            project_path = 'econ-dev/inspections'

            # create train and test matrices
            train_matrix = pandas.DataFrame.from_dict({
                'entity_id': [1, 2],
                'feature_one': [3, 4],
                'feature_two': [5, 6],
                'label': [7, 8]
            }).set_index('entity_id')
            train_metadata = {
                'feature_start_time': datetime.date(2012, 12, 20),
                'end_time': datetime.date(2016, 12, 20),
                'label_name': 'label',
                'label_timespan': '1y',
                'feature_names': ['ft1', 'ft2'],
                'metta-uuid': '1234',
                'indices': ['entity_id'],
                'matrix_type': 'train'
            }
            # Creates a matrix entry in the matrices table with uuid from train_metadata
            MatrixFactory(matrix_uuid="1234")
            session.commit()

            train_store = InMemoryMatrixStore(train_matrix, sample_metadata())

            as_of_dates = [
                datetime.date(2016, 12, 21),
                datetime.date(2017, 1, 21)
            ]

            test_stores = [
                InMemoryMatrixStore(
                    pandas.DataFrame.from_dict({
                        'entity_id': [3],
                        'feature_one': [8],
                        'feature_two': [5],
                        'label': [5]
                    }), {
                        'label_name': 'label',
                        'label_timespan': '1y',
                        'end_time': as_of_date,
                        'metta-uuid': '1234',
                        'indices': ['entity_id'],
                        'matrix_type': 'test',
                        'as_of_date_frequency': '1month'
                    }) for as_of_date in as_of_dates
            ]

            model_storage_engine = S3ModelStorageEngine(project_path)

            experiment_hash = save_experiment_and_get_hash({}, db_engine)
            # instantiate pipeline objects
            trainer = ModelTrainer(
                project_path=project_path,
                experiment_hash=experiment_hash,
                model_storage_engine=model_storage_engine,
                db_engine=db_engine,
            )
            predictor = Predictor(project_path, model_storage_engine,
                                  db_engine)
            model_evaluator = ModelEvaluator([{
                'metrics': ['precision@'],
                'thresholds': {
                    'top_n': [5]
                }
            }], [{}], db_engine)

            # run the pipeline
            grid_config = {
                'sklearn.linear_model.LogisticRegression': {
                    'C': [0.00001, 0.0001],
                    'penalty': ['l1', 'l2'],
                    'random_state': [2193]
                }
            }
            model_ids = trainer.train_models(grid_config=grid_config,
                                             misc_db_parameters=dict(),
                                             matrix_store=train_store)

            for model_id in model_ids:
                for as_of_date, test_store in zip(as_of_dates, test_stores):
                    predictions_proba = predictor.predict(
                        model_id,
                        test_store,
                        misc_db_parameters=dict(),
                        train_matrix_columns=['feature_one', 'feature_two'])

                    model_evaluator.evaluate(
                        predictions_proba,
                        test_store,
                        model_id,
                    )

            # assert
            # 1. that the predictions table entries are present and
            # can be linked to the original models
            records = [
                row for row in db_engine.execute(
                    '''select entity_id, model_id, as_of_date
                from test_results.test_predictions
                join model_metadata.models using (model_id)
                order by 3, 2''')
            ]
            assert records == [
                (3, 1, datetime.datetime(2016, 12, 21)),
                (3, 2, datetime.datetime(2016, 12, 21)),
                (3, 3, datetime.datetime(2016, 12, 21)),
                (3, 4, datetime.datetime(2016, 12, 21)),
                (3, 1, datetime.datetime(2017, 1, 21)),
                (3, 2, datetime.datetime(2017, 1, 21)),
                (3, 3, datetime.datetime(2017, 1, 21)),
                (3, 4, datetime.datetime(2017, 1, 21)),
            ]

            # that evaluations are there
            records = [
                row for row in db_engine.execute('''
                    select model_id, evaluation_start_time, metric, parameter
                    from test_results.test_evaluations order by 2, 1''')
            ]
            assert records == [
                (1, datetime.datetime(2016, 12, 21), 'precision@', '5_abs'),
                (2, datetime.datetime(2016, 12, 21), 'precision@', '5_abs'),
                (3, datetime.datetime(2016, 12, 21), 'precision@', '5_abs'),
                (4, datetime.datetime(2016, 12, 21), 'precision@', '5_abs'),
                (1, datetime.datetime(2017, 1, 21), 'precision@', '5_abs'),
                (2, datetime.datetime(2017, 1, 21), 'precision@', '5_abs'),
                (3, datetime.datetime(2017, 1, 21), 'precision@', '5_abs'),
                (4, datetime.datetime(2017, 1, 21), 'precision@', '5_abs'),
            ]
Example #13
def sample_matrix_store():
    return InMemoryMatrixStore(sample_df(), sample_metadata())
Example #14
def test_model_trainer():
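    # Variant of the trainer test above that writes to the 'results' schema.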
    with testing.postgresql.Postgresql() as postgresql:
        engine = create_engine(postgresql.url())
        ensure_db(engine)

        grid_config = {
            'sklearn.linear_model.LogisticRegression': {
                'C': [0.00001, 0.0001],
                'penalty': ['l1', 'l2'],
                'random_state': [2193]
            }
        }

        with mock_s3():
            s3_conn = boto3.resource('s3')
            s3_conn.create_bucket(Bucket='econ-dev')

            # create training set
            matrix = pandas.DataFrame.from_dict({
                'entity_id': [1, 2],
                'feature_one': [3, 4],
                'feature_two': [5, 6],
                'label': ['good', 'bad']
            })
            metadata = {
                'feature_start_time': datetime.date(2012, 12, 20),
                'end_time': datetime.date(2016, 12, 20),
                'label_name': 'label',
                'label_timespan': '1y',
                'metta-uuid': '1234',
                'feature_names': ['ft1', 'ft2'],
                'indices': ['entity_id'],
            }
            project_path = 'econ-dev/inspections'
            model_storage_engine = S3ModelStorageEngine(project_path)
            trainer = ModelTrainer(
                project_path=project_path,
                experiment_hash=None,
                model_storage_engine=model_storage_engine,
                db_engine=engine,
                model_group_keys=['label_name', 'label_timespan']
            )
            matrix_store = InMemoryMatrixStore(matrix, metadata)
            model_ids = trainer.train_models(
                grid_config=grid_config,
                misc_db_parameters=dict(),
                matrix_store=matrix_store
            )

            # assert
            # 1. that the models and feature importances table entries are present
            records = [
                row for row in
                engine.execute('select * from results.feature_importances')
            ]
            assert len(records) == 4 * 2  # 4 models x 2 features (entity_id excluded)

            records = [
                row for row in
                engine.execute('select model_hash from results.models')
            ]
            assert len(records) == 4
            hashes = [row[0] for row in records]

            # 2. that the model groups are distinct
            records = [
                row for row in
                engine.execute('select distinct model_group_id from results.models')
            ]
            assert len(records) == 4

            # 3. that all four models are cached
            model_pickles = [
                model_storage_engine.get_store(model_hash).load()
                for model_hash in hashes
            ]
            assert len(model_pickles) == 4
            assert len([x for x in model_pickles if x is not None]) == 4

            # 4. that their results can have predictions made on it
            test_matrix = pandas.DataFrame.from_dict({
                'entity_id': [3, 4],
                'feature_one': [4, 4],
                'feature_two': [6, 5],
            })

            test_matrix = InMemoryMatrixStore(matrix=test_matrix, metadata=metadata).matrix

            for model_pickle in model_pickles:
                predictions = model_pickle.predict(test_matrix)
                assert len(predictions) == 2

            # 5. when run again, same models are returned
            new_model_ids = trainer.train_models(
                grid_config=grid_config,
                misc_db_parameters=dict(),
                matrix_store=matrix_store
            )
            assert len([
                row for row in
                engine.execute('select model_hash from results.models')
            ]) == 4
            assert model_ids == new_model_ids

            # 6. if replace is set, update non-unique attributes and feature importances
            max_batch_run_time = [
                row[0] for row in
                engine.execute('select max(batch_run_time) from results.models')
            ][0]
            trainer = ModelTrainer(
                project_path=project_path,
                experiment_hash=None,
                model_storage_engine=model_storage_engine,
                db_engine=engine,
                model_group_keys=['label_name', 'label_timespan'],
                replace=True
            )
            new_model_ids = trainer.train_models(
                grid_config=grid_config,
                misc_db_parameters=dict(),
                matrix_store=matrix_store,
            )
            assert model_ids == new_model_ids
            assert [
                row['model_id'] for row in
                engine.execute('select model_id from results.models order by 1 asc')
            ] == model_ids
            new_max_batch_run_time = [
                row[0] for row in
                engine.execute('select max(batch_run_time) from results.models')
            ][0]
            assert new_max_batch_run_time > max_batch_run_time

            records = [
                row for row in
                engine.execute('select * from results.feature_importances')
            ]
            assert len(records) == 4 * 2  # 4 models x 2 features (entity_id excluded)

            # 7. if the cache is missing but the metadata is still there, reuse the metadata
            for row in engine.execute('select model_hash from results.models'):
                model_storage_engine.get_store(row[0]).delete()
            new_model_ids = trainer.train_models(
                grid_config=grid_config,
                misc_db_parameters=dict(),
                matrix_store=matrix_store
            )
            assert model_ids == sorted(new_model_ids)

            # 8. that the generator interface works the same way
            new_model_ids = trainer.generate_trained_models(
                grid_config=grid_config,
                misc_db_parameters=dict(),
                matrix_store=matrix_store
            )
            assert model_ids == \
                sorted([model_id for model_id in new_model_ids])