def __init__(self, config, db_engine, model_storage_class=None,
             project_path=None, replace=True):
    self.config = config
    self.db_engine = db_engine
    # model storage and the matrices directory are optional here;
    # callers that only inspect the database can skip them
    if model_storage_class:
        self.model_storage_engine = \
            model_storage_class(project_path=project_path)
    self.project_path = project_path
    self.replace = replace
    ensure_db(self.db_engine)
    self.labels_table_name = 'labels'
    self.features_schema_name = 'features'
    if project_path:
        self.matrices_directory = os.path.join(self.project_path, 'matrices')
        if not os.path.exists(self.matrices_directory):
            os.makedirs(self.matrices_directory)
    # saving the config up front yields a hash that later links models
    # back to this experiment run
    self.experiment_hash = save_experiment_and_get_hash(self.config,
                                                        self.db_engine)
    self._split_definitions = None
    self._matrix_build_tasks = None
    self._feature_table_tasks = None
    self._all_as_of_times = None
    self.initialize_factories()
    self.initialize_components()

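# A typical construction, mirroring how the pipeline tests later in this
# file instantiate the class. SerialPipeline stands in for a concrete
# subclass, and the config/engine/path values are placeholders, not
# anything fixed by this file:

def build_pipeline(experiment_config):
    return SerialPipeline(
        config=experiment_config,
        db_engine=create_engine('postgresql://localhost/mydb'),
        model_storage_class=FSModelStorageEngine,
        project_path='/tmp/inspections',
        replace=False,  # reuse existing tables and models where possible
    )
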
def test_model_scoring_inspections():
    with testing.postgresql.Postgresql() as postgresql:
        db_engine = create_engine(postgresql.url())
        ensure_db(db_engine)
        metric_groups = [{
            'metrics': ['precision@', 'recall@', 'fpr@'],
            'thresholds': {'percentiles': [50.0], 'top_n': [3]}
        }]
        model_scorer = ModelScorer(metric_groups, db_engine)
        _, model_id = fake_trained_model(
            'myproject', InMemoryModelStorageEngine('myproject'), db_engine)
        labels = numpy.array([True, False, numpy.nan, True, False])
        prediction_probas = numpy.array([0.56, 0.4, 0.55, 0.5, 0.3])
        evaluation_start = datetime.datetime(2016, 4, 1)
        evaluation_end = datetime.datetime(2016, 7, 1)
        example_frequency = '1d'
        model_scorer.score(prediction_probas, labels, model_id,
                           evaluation_start, evaluation_end,
                           example_frequency)
        for record in db_engine.execute(
            '''select * from results.evaluations
            where model_id = %s and evaluation_start_time = %s
            order by 1''',
            (model_id, evaluation_start)
        ):
            assert record['num_labeled_examples'] == 4
            assert record['num_positive_labels'] == 2
            if 'pct' in record['parameter']:
                assert record['num_labeled_above_threshold'] == 1
            else:
                assert record['num_labeled_above_threshold'] == 2

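# The counts asserted above rely on the scorer masking out NaN labels
# before computing metrics. A minimal sketch of that masking, assuming a
# float array with NaN marking unlabeled examples (hypothetical helper,
# not the scorer's actual implementation):

import numpy

def mask_unlabeled(prediction_probas, labels):
    """Keep only the examples that actually have labels."""
    labeled = ~numpy.isnan(labels.astype(float))
    return prediction_probas[labeled], labels[labeled].astype(bool)

# With the five-example array above this leaves four labeled examples,
# two of them positive -- matching the asserted counts.
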
def test_n_jobs_not_new_model():
    grid_config = {
        'sklearn.ensemble.AdaBoostClassifier': {
            'n_estimators': [10, 100, 1000]
        },
        'sklearn.ensemble.RandomForestClassifier': {
            'n_estimators': [10, 100],
            'max_features': ['sqrt', 'log2'],
            'max_depth': [5, 10, 15, 20],
            'criterion': ['gini', 'entropy'],
            'n_jobs': [12, 24],
        }
    }
    with testing.postgresql.Postgresql() as postgresql:
        engine = create_engine(postgresql.url())
        ensure_db(engine)
        with mock_s3():
            s3_conn = boto3.resource('s3')
            s3_conn.create_bucket(Bucket='econ-dev')
            trainer = ModelTrainer(
                project_path='econ-dev/inspections',
                experiment_hash=None,
                model_storage_engine=S3ModelStorageEngine(
                    s3_conn, 'econ-dev/inspections'),
                db_engine=engine,
                model_group_keys=['label_name', 'label_window'])
            matrix = pandas.DataFrame.from_dict({
                'entity_id': [1, 2],
                'feature_one': [3, 4],
                'feature_two': [5, 6],
                'label': ['good', 'bad']
            })
            train_tasks = trainer.generate_train_tasks(
                grid_config,
                dict(),
                InMemoryMatrixStore(matrix, {
                    'label_window': '1d',
                    'end_time': datetime.datetime.now(),
                    'beginning_of_time': datetime.date(2012, 12, 20),
                    'label_name': 'label',
                    'metta-uuid': '1234',
                    'feature_names': ['ft1', 'ft2']
                }))
            # 32 random forest combinations + 3 adaboost;
            # would be (32 * 2) + 3 if n_jobs variants were not removed
            assert len(train_tasks) == 35
            assert len([
                task for task in train_tasks
                if 'n_jobs' in task['parameters']
            ]) == 32

            for train_task in train_tasks:
                trainer.process_train_task(**train_task)

            for row in engine.execute(
                    'select model_parameters from results.model_groups'):
                assert 'n_jobs' not in row[0]

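# The dedup above works because n_jobs changes how a model trains (worker
# count), not what it learns, so it should not mint a new model group.
# A hypothetical sketch of collapsing parameter sets that differ only in
# such keys (illustrative helper, not the trainer's actual code):

def collapse_training_only_params(parameter_sets, ignored_keys=('n_jobs',)):
    """Keep one representative per combination of the keys that matter."""
    seen, unique = set(), []
    for params in parameter_sets:
        key = tuple(sorted(
            (k, v) for k, v in params.items() if k not in ignored_keys))
        if key not in seen:
            seen.add(key)
            unique.append(params)
    return unique
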
def test_retry_recovery(self):
    grid_config = {
        'sklearn.ensemble.AdaBoostClassifier': {'n_estimators': [10]},
    }
    engine = None
    trainer = None
    port = None
    with testing.postgresql.Postgresql() as postgresql:
        port = postgresql.settings['port']
        engine = create_engine(postgresql.url())
        ensure_db(engine)
        trainer = ModelTrainer(
            project_path='econ-dev/inspections',
            experiment_hash=None,
            model_storage_engine=InMemoryModelStorageEngine(project_path=''),
            db_engine=engine,
            model_group_keys=['label_name', 'label_window'])
        matrix = pandas.DataFrame.from_dict({
            'entity_id': [1, 2],
            'feature_one': [3, 4],
            'feature_two': [5, 6],
            'label': ['good', 'bad']
        })
        matrix_store = InMemoryMatrixStore(matrix, {
            'label_window': '1d',
            'end_time': datetime.datetime.now(),
            'beginning_of_time': datetime.date(2012, 12, 20),
            'label_name': 'label',
            'metta-uuid': '1234',
            'feature_names': ['ft1', 'ft2']
        })

    # start without a database server,
    # then bring it back up after the first sleep.
    # use self so it doesn't go out of scope too early and shut down
    self.new_server = None

    def replace_db(arg):
        self.new_server = testing.postgresql.Postgresql(port=port)
        engine = create_engine(self.new_server.url())
        ensure_db(engine)

    with patch('time.sleep') as time_mock:
        time_mock.side_effect = replace_db
        try:
            trainer.train_models(grid_config, dict(), matrix_store)
        finally:
            if self.new_server is not None:
                self.new_server.stop()
        assert len(time_mock.mock_calls) == 1

def test_save_experiment_and_get_hash():
    # no reason to make assertions on the config itself, use a basic dict
    experiment_config = {'one': 'two'}
    with testing.postgresql.Postgresql() as postgresql:
        engine = create_engine(postgresql.url())
        ensure_db(engine)
        exp_hash = save_experiment_and_get_hash(experiment_config, engine)
        assert isinstance(exp_hash, str)
        new_hash = save_experiment_and_get_hash(experiment_config, engine)
        assert new_hash == exp_hash

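# The test only requires that hashing be deterministic: the same config
# always maps to the same hash, so re-running an experiment finds its
# earlier record. A minimal sketch of one way to get that property
# (hypothetical; the real function also saves the config to the
# results.experiments table):

import hashlib
import json

def deterministic_config_hash(config):
    """Serialize with sorted keys so dict ordering cannot change the hash."""
    canonical = json.dumps(config, sort_keys=True)
    return hashlib.md5(canonical.encode('utf-8')).hexdigest()
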
def __init__(self, config, db_engine, model_storage_class, project_path):
    self.config = config
    self.db_engine = db_engine
    self.model_storage_engine = \
        model_storage_class(project_path=project_path)
    self.project_path = project_path
    ensure_db(self.db_engine)
    self.labels_table_name = 'labels'
    self.features_schema_name = 'features'
    self.matrices_directory = os.path.join(self.project_path, 'matrices')
    if not os.path.exists(self.matrices_directory):
        os.makedirs(self.matrices_directory)
    self.initialize_factories()
    self.initialize_components()

def test_retry_max(self):
    grid_config = {
        'sklearn.ensemble.AdaBoostClassifier': {'n_estimators': [10]},
    }
    engine = None
    trainer = None
    # set up a basic model training run
    # TODO: abstract the setup of a basic model training run where
    # we don't worry about the specific values used? it would make
    # tests like this require a bit less noise to read past
    with testing.postgresql.Postgresql() as postgresql:
        engine = create_engine(postgresql.url())
        ensure_db(engine)
        trainer = ModelTrainer(
            project_path='econ-dev/inspections',
            experiment_hash=None,
            model_storage_engine=InMemoryModelStorageEngine(project_path=''),
            db_engine=engine,
            model_group_keys=['label_name', 'label_window'])
        matrix = pandas.DataFrame.from_dict({
            'entity_id': [1, 2],
            'feature_one': [3, 4],
            'feature_two': [5, 6],
            'label': ['good', 'bad']
        })
        matrix_store = InMemoryMatrixStore(matrix, {
            'label_window': '1d',
            'end_time': datetime.datetime.now(),
            'beginning_of_time': datetime.date(2012, 12, 20),
            'label_name': 'label',
            'metta-uuid': '1234',
            'feature_names': ['ft1', 'ft2']
        })

    # the postgres server goes out of scope here and thus no longer exists
    with patch('time.sleep') as time_mock:
        with self.assertRaises(sqlalchemy.exc.OperationalError):
            trainer.train_models(grid_config, dict(), matrix_store)
        # we want to make sure that we are using the retrying module sanely,
        # as opposed to matching the exact number of calls specified
        # by the code
        assert len(time_mock.mock_calls) > 5

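# Both retry tests exercise the `retrying` package: every failed attempt
# sleeps (which the tests intercept via patch('time.sleep')) and retries
# until an attempt cap is hit. A sketch of the kind of decorator the
# trainer plausibly wraps its database writes in; the parameter values
# are illustrative assumptions, not the project's actual settings:

import sqlalchemy.exc
from retrying import retry

def retry_if_operational_error(exception):
    """Retry only on connection-level failures, not on logic errors."""
    return isinstance(exception, sqlalchemy.exc.OperationalError)

@retry(retry_on_exception=retry_if_operational_error,
       wait_exponential_multiplier=1000,  # back off 1s, 2s, 4s, ...
       stop_max_attempt_number=10)
def write_with_retry(db_engine, statement):
    """Each failed attempt calls time.sleep via retrying, which is what
    the tests above intercept."""
    db_engine.execute(statement)
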
def test_model_scoring_inspections():
    with testing.postgresql.Postgresql() as postgresql:
        db_engine = create_engine(postgresql.url())
        ensure_db(db_engine)
        metric_groups = [{
            'metrics': ['precision@', 'recall@'],
            'thresholds': {'percentiles': [5.0, 10.0], 'top_n': [5, 10]}
        }]
        model_scorer = ModelScorer(metric_groups, db_engine)
        trained_model, model_id = fake_trained_model(
            'myproject', InMemoryModelStorageEngine('myproject'), db_engine)
        labels = fake_labels(5)
        as_of_date = datetime.date(2016, 5, 5)
        evaluation_start = datetime.datetime(2016, 4, 1)
        evaluation_end = datetime.datetime(2016, 7, 1)
        prediction_frequency = '1d'
        model_scorer.score(
            trained_model.predict_proba(labels)[:, 1],
            trained_model.predict(labels),
            labels,
            model_id,
            evaluation_start,
            evaluation_end,
            prediction_frequency)

        # assert that all of the records are there
        results = db_engine.execute(
            '''select distinct(metric || parameter)
            from results.evaluations
            where model_id = %s and evaluation_start_time = %s
            order by 1''',
            (model_id, evaluation_start))
        records = [row[0] for row in results]
        assert records == [
            '[email protected]_pct',
            'precision@10_abs',
            '[email protected]_pct',
            'precision@5_abs',
            '[email protected]_pct',
            'recall@10_abs',
            '[email protected]_pct',
            'recall@5_abs',
        ]

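# The expected rows follow one naming convention: the metric name is
# concatenated with a parameter suffix, '<n>_abs' for top-n thresholds
# and '<p>_pct' for percentile thresholds. A hypothetical sketch of
# generating those suffixes from a thresholds config:

def parameter_strings(thresholds):
    """{'percentiles': [5.0], 'top_n': [5]} -> ['5.0_pct', '5_abs']"""
    suffixes = ['%s_pct' % pct for pct in thresholds.get('percentiles', [])]
    suffixes += ['%s_abs' % n for n in thresholds.get('top_n', [])]
    return suffixes
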
def test_predictor_composite_index():
    with testing.postgresql.Postgresql() as postgresql:
        db_engine = create_engine(postgresql.url())
        ensure_db(db_engine)
        project_path = 'econ-dev/inspections'
        model_storage_engine = InMemoryModelStorageEngine(project_path)
        _, model_id = fake_trained_model(
            project_path, model_storage_engine, db_engine)
        predictor = Predictor(project_path, model_storage_engine, db_engine)
        dayone = datetime.datetime(2011, 1, 1)
        daytwo = datetime.datetime(2011, 1, 2)
        # create prediction set
        matrix = pandas.DataFrame.from_dict({
            'entity_id': [1, 2, 1, 2],
            'as_of_date': [dayone, dayone, daytwo, daytwo],
            'feature_one': [3, 4, 5, 6],
            'feature_two': [5, 6, 7, 8],
            'label': [7, 8, 8, 7]
        }).set_index(['entity_id', 'as_of_date'])
        metadata = {
            'label_name': 'label',
            'end_time': AS_OF_DATE,
            'label_window': '3month',
            'metta-uuid': '1234',
        }
        matrix_store = InMemoryMatrixStore(matrix, metadata)
        predict_proba = predictor.predict(
            model_id, matrix_store, misc_db_parameters=dict())

        # assert
        # 1. that the returned predictions are of the desired length
        assert len(predict_proba) == 4

        # 2. that the predictions table entries are present and
        # can be linked to the original models
        records = [
            row for row in db_engine.execute(
                '''select entity_id, as_of_date
                from results.predictions
                join results.models using (model_id)''')
        ]
        assert len(records) == 4

def test_model_trainer():
    with testing.postgresql.Postgresql() as postgresql:
        engine = create_engine(postgresql.url())
        ensure_db(engine)
        grid_config = {
            'sklearn.linear_model.LogisticRegression': {
                'C': [0.00001, 0.0001],
                'penalty': ['l1', 'l2'],
                'random_state': [2193]
            }
        }
        with mock_s3():
            s3_conn = boto3.resource('s3')
            s3_conn.create_bucket(Bucket='econ-dev')

            # create training set
            matrix = pandas.DataFrame.from_dict({
                'entity_id': [1, 2],
                'feature_one': [3, 4],
                'feature_two': [5, 6],
                'label': ['good', 'bad']
            })
            metadata = {
                'beginning_of_time': datetime.date(2012, 12, 20),
                'end_time': datetime.date(2016, 12, 20),
                'label_name': 'label',
                'label_window': '1y',
                'metta-uuid': '1234',
                'feature_names': ['ft1', 'ft2']
            }
            project_path = 'econ-dev/inspections'
            model_storage_engine = S3ModelStorageEngine(s3_conn, project_path)
            trainer = ModelTrainer(
                project_path=project_path,
                experiment_hash=None,
                model_storage_engine=model_storage_engine,
                db_engine=engine,
                model_group_keys=['label_name', 'label_window'])
            matrix_store = InMemoryMatrixStore(matrix, metadata)
            model_ids = trainer.train_models(
                grid_config=grid_config,
                misc_db_parameters=dict(),
                matrix_store=matrix_store)

            # assert
            # 1. that the models and feature importances table entries
            # are present
            records = [
                row for row in engine.execute(
                    'select * from results.feature_importances')
            ]
            assert len(records) == 4 * 3  # maybe exclude entity_id?

            records = [
                row for row in engine.execute(
                    'select model_hash from results.models')
            ]
            assert len(records) == 4
            cache_keys = [
                model_cache_key(project_path, model_row[0], s3_conn)
                for model_row in records
            ]

            # 2. that the model groups are distinct
            records = [
                row for row in engine.execute(
                    'select distinct model_group_id from results.models')
            ]
            assert len(records) == 4

            # 3. that all four models are cached
            model_pickles = [
                pickle.loads(cache_key.get()['Body'].read())
                for cache_key in cache_keys
            ]
            assert len(model_pickles) == 4
            assert len([x for x in model_pickles if x is not None]) == 4

            # 4. that the unpickled models can make predictions
            test_matrix = pandas.DataFrame.from_dict({
                'entity_id': [3, 4],
                'feature_one': [4, 4],
                'feature_two': [6, 5],
            })
            for model_pickle in model_pickles:
                predictions = model_pickle.predict(test_matrix)
                assert len(predictions) == 2

            # 5. when run again, same models are returned
            new_model_ids = trainer.train_models(
                grid_config=grid_config,
                misc_db_parameters=dict(),
                matrix_store=matrix_store)
            assert len([
                row for row in engine.execute(
                    'select model_hash from results.models')
            ]) == 4
            assert model_ids == new_model_ids

            # 6. if metadata is deleted but the cache is still there,
            # retrains that one and replaces the feature importance records
            engine.execute(
                'delete from results.feature_importances where model_id = 3')
            engine.execute('delete from results.models where model_id = 3')
            new_model_ids = trainer.train_models(
                grid_config=grid_config,
                misc_db_parameters=dict(),
                matrix_store=matrix_store)
            expected_model_ids = [1, 2, 4, 5]
            assert expected_model_ids == sorted(new_model_ids)
            assert [
                row['model_id'] for row in engine.execute(
                    'select model_id from results.models order by 1 asc')
            ] == expected_model_ids
            records = [
                row for row in engine.execute(
                    'select * from results.feature_importances')
            ]
            assert len(records) == 4 * 3  # maybe exclude entity_id?
            # 7. if the cache is missing but the metadata is still there,
            # reuse the metadata
            for row in engine.execute(
                    'select model_hash from results.models'):
                model_storage_engine.get_store(row[0]).delete()
            expected_model_ids = [1, 2, 4, 5]
            new_model_ids = trainer.train_models(
                grid_config=grid_config,
                misc_db_parameters=dict(),
                matrix_store=matrix_store)
            assert expected_model_ids == sorted(new_model_ids)

            # 8. that the generator interface works the same way
            new_model_ids = trainer.generate_trained_models(
                grid_config=grid_config,
                misc_db_parameters=dict(),
                matrix_store=matrix_store)
            assert expected_model_ids == \
                sorted([model_id for model_id in new_model_ids])

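# Points 5-8 above pin down the trainer's reuse rules. A hypothetical
# sketch of that decision logic; the model_store interface and the
# register_fn callback are stand-ins, not the trainer's real internals:

def resolve_model(existing_model_id, model_store, train_fn, register_fn):
    """Assumed rules, inferred from the assertions above:
    - metadata row and cached pickle both present: skip training entirely
    - metadata row deleted: train and register a brand-new model_id
    - pickle missing but row present: rebuild the pickle, keep the model_id
    """
    if existing_model_id is not None and model_store.exists():
        return existing_model_id
    model_store.write(train_fn())
    if existing_model_id is not None:
        return existing_model_id
    return register_fn()
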
def reuse_pipeline_test(pipeline_class):
    with testing.postgresql.Postgresql() as postgresql:
        db_engine = create_engine(postgresql.url())
        ensure_db(db_engine)
        populate_source_data(db_engine)
        temporal_config = {
            'beginning_of_time': '2010-01-01',
            'modeling_start_time': '2011-01-01',
            'modeling_end_time': '2014-01-01',
            'update_window': '1y',
            'train_label_windows': ['6months'],
            'test_label_windows': ['6months'],
            'train_example_frequency': '1day',
            'test_example_frequency': '3months',
            'train_durations': ['6months'],
            'test_durations': ['1months'],
        }
        scoring_config = {
            'metric_groups': [{
                'metrics': ['precision@'],
                'thresholds': {'top_n': [2]}
            }],
            'sort_seed': 12345
        }
        grid_config = {
            'sklearn.linear_model.LogisticRegression': {
                'C': [0.00001, 0.0001],
                'penalty': ['l1', 'l2'],
                'random_state': [2193]
            }
        }
        feature_config = [{
            'prefix': 'test_features',
            'from_obj': 'cat_complaints',
            'knowledge_date_column': 'as_of_date',
            'aggregates': [{
                'quantity': 'cat_sightings',
                'metrics': ['count', 'avg'],
            }],
            'intervals': ['1y'],
            'groups': ['entity_id']
        }]
        experiment_config = {
            'events_table': 'events',
            'entity_column_name': 'entity_id',
            'model_comment': 'test2-final-final',
            'model_group_keys': ['label_name', 'label_type'],
            'feature_aggregations': feature_config,
            'temporal_config': temporal_config,
            'grid_config': grid_config,
            'scoring': scoring_config,
        }
        temp_dir = TemporaryDirectory()
        try:
            pipeline = pipeline_class(
                config=experiment_config,
                db_engine=db_engine,
                model_storage_class=FSModelStorageEngine,
                project_path=os.path.join(temp_dir.name, 'inspections'),
            )
            pipeline.run()
            evaluations = num_linked_evaluations(db_engine)
            assert evaluations > 0

            pipeline = pipeline_class(
                config=experiment_config,
                db_engine=db_engine,
                model_storage_class=FSModelStorageEngine,
                project_path=os.path.join(temp_dir.name, 'inspections'),
                replace=False)
            pipeline.make_entity_date_table = Mock()
            pipeline.run()
            assert not pipeline.make_entity_date_table.called
        finally:
            temp_dir.cleanup()

def simple_pipeline_test(pipeline_class):
    with testing.postgresql.Postgresql() as postgresql:
        db_engine = create_engine(postgresql.url())
        ensure_db(db_engine)
        populate_source_data(db_engine)
        temporal_config = {
            'beginning_of_time': '2010-01-01',
            'modeling_start_time': '2011-01-01',
            'modeling_end_time': '2014-01-01',
            'update_window': '1y',
            'train_label_windows': ['6months'],
            'test_label_windows': ['6months'],
            'train_example_frequency': '1day',
            'test_example_frequency': '3months',
            'train_durations': ['6months'],
            'test_durations': ['1months'],
        }
        scoring_config = {
            'metric_groups': [{
                'metrics': ['precision@'],
                'thresholds': {'top_n': [2]}
            }]
        }
        grid_config = {
            'sklearn.linear_model.LogisticRegression': {
                'C': [0.00001, 0.0001],
                'penalty': ['l1', 'l2'],
                'random_state': [2193]
            }
        }
        feature_config = [{
            'prefix': 'test_features',
            'from_obj': 'cat_complaints',
            'knowledge_date_column': 'as_of_date',
            'aggregates': [{
                'quantity': 'cat_sightings',
                'metrics': ['count', 'avg'],
            }],
            'intervals': ['1y'],
            'groups': ['entity_id']
        }]
        experiment_config = {
            'events_table': 'events',
            'entity_column_name': 'entity_id',
            'model_comment': 'test2-final-final',
            'model_group_keys': ['label_name', 'label_type'],
            'feature_aggregations': feature_config,
            'temporal_config': temporal_config,
            'grid_config': grid_config,
            'scoring': scoring_config,
        }
        with TemporaryDirectory() as temp_dir:
            pipeline_class(
                config=experiment_config,
                db_engine=db_engine,
                model_storage_class=FSModelStorageEngine,
                project_path=os.path.join(temp_dir, 'inspections')).run()

        # assert
        # 1. that model groups entries are present
        num_mgs = len([
            row for row in
            db_engine.execute('select * from results.model_groups')
        ])
        assert num_mgs > 0

        # 2. that model entries are present, and linked to model groups
        num_models = len([
            row for row in db_engine.execute('''
                select * from results.model_groups
                join results.models using (model_group_id)
                where model_comment = 'test2-final-final'
            ''')
        ])
        assert num_models > 0

        # 3. predictions, linked to models
        num_predictions = len([
            row for row in db_engine.execute('''
                select * from results.predictions
                join results.models using (model_id)''')
        ])
        assert num_predictions > 0

        # 4. evaluations linked to predictions linked to models
        num_evaluations = len([
            row for row in db_engine.execute('''
                select * from results.evaluations e
                join results.models using (model_id)
                join results.predictions p on (
                    e.model_id = p.model_id and
                    e.evaluation_start_time <= p.as_of_date and
                    e.evaluation_end_time > p.as_of_date)
            ''')
        ])
        assert num_evaluations > 0

        # 5. experiment
        num_experiments = len([
            row for row in
            db_engine.execute('select * from results.experiments')
        ])
        assert num_experiments == 1

        # 6. that models are linked to experiments
        num_models_with_experiment = len([
            row for row in db_engine.execute('''
                select * from results.experiments
                join results.models using (experiment_hash)
            ''')
        ])
        assert num_models == num_models_with_experiment

        # 7. that models have the train end date and label window
        results = [
            (model['train_end_time'], model['train_label_window'])
            for model in db_engine.execute('select * from results.models')
        ]
        assert sorted(set(results)) == [
            (datetime(2012, 1, 1), timedelta(180)),
            (datetime(2013, 1, 1), timedelta(180))
        ]

def test_model_scoring_early_warning():
    with testing.postgresql.Postgresql() as postgresql:
        db_engine = create_engine(postgresql.url())
        ensure_db(db_engine)
        metric_groups = [{
            'metrics': [
                'precision@', 'recall@', 'true positives@',
                'true negatives@', 'false positives@', 'false negatives@'
            ],
            'thresholds': {'percentiles': [5.0, 10.0], 'top_n': [5, 10]}
        }, {
            'metrics': [
                'f1', 'mediocre', 'accuracy', 'roc_auc',
                'average precision score'
            ],
        }, {
            'metrics': ['fbeta@'],
            'parameters': [{'beta': 0.75}, {'beta': 1.25}]
        }]
        custom_metrics = {'mediocre': always_half}
        model_scorer = ModelScorer(metric_groups, db_engine, custom_metrics)
        trained_model, model_id = fake_trained_model(
            'myproject', InMemoryModelStorageEngine('myproject'), db_engine)
        labels = fake_labels(5)
        as_of_date = datetime.date(2016, 5, 5)
        model_scorer.score(
            trained_model.predict_proba(labels)[:, 1],
            trained_model.predict(labels),
            labels,
            model_id,
            as_of_date,
            as_of_date,
            '1y')

        # assert that all of the records are there
        records = [
            row[0] for row in db_engine.execute(
                '''select distinct(metric || parameter)
                from results.evaluations
                where model_id = %s and evaluation_start_time = %s
                order by 1''',
                (model_id, as_of_date))
        ]
        assert records == [
            'accuracy',
            'average precision score',
            'f1',
            'false [email protected]_pct',
            'false negatives@10_abs',
            'false [email protected]_pct',
            'false negatives@5_abs',
            'false [email protected]_pct',
            'false positives@10_abs',
            'false [email protected]_pct',
            'false positives@5_abs',
            '[email protected]_beta',
            '[email protected]_beta',
            'mediocre',
            '[email protected]_pct',
            'precision@10_abs',
            '[email protected]_pct',
            'precision@5_abs',
            '[email protected]_pct',
            'recall@10_abs',
            '[email protected]_pct',
            'recall@5_abs',
            'roc_auc',
            'true [email protected]_pct',
            'true negatives@10_abs',
            'true [email protected]_pct',
            'true negatives@5_abs',
            'true [email protected]_pct',
            'true positives@10_abs',
            'true [email protected]_pct',
            'true positives@5_abs'
        ]

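# custom_metrics maps a name to a callable that the scorer invokes like a
# built-in metric. A hedged sketch of what always_half plausibly looks
# like; the exact signature (and whether a registration decorator is
# required) is an assumption inferred from how score() is called above:

def always_half(predictions_proba, predictions_binary, labels, parameters):
    """Constant 'mediocre' metric: ignores its inputs and returns 0.5."""
    return 0.5
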
def generic_pipeline_test(pipeline_class):
    with testing.postgresql.Postgresql() as postgresql:
        db_engine = create_engine(postgresql.url())
        ensure_db(db_engine)
        populate_source_data(db_engine)
        temporal_config = {
            'beginning_of_time': '2010-01-01',
            'modeling_start_time': '2011-01-01',
            'modeling_end_time': '2014-01-01',
            'update_window': '1y',
            'prediction_window': '6m',
            'look_back_durations': ['6m'],
            'test_durations': ['1m'],
            'prediction_frequency': '1d'
        }
        scoring_config = [
            {'metrics': ['precision@'], 'thresholds': {'top_n': [2]}}
        ]
        grid_config = {
            'sklearn.linear_model.LogisticRegression': {
                'C': [0.00001, 0.0001],
                'penalty': ['l1', 'l2'],
                'random_state': [2193]
            }
        }
        feature_config = [{
            'prefix': 'test_features',
            'from_obj': 'cat_complaints',
            'knowledge_date_column': 'as_of_date',
            'aggregates': [{
                'quantity': 'cat_sightings',
                'metrics': ['count', 'avg'],
            }],
            'intervals': ['1y'],
            'groups': ['entity_id']
        }]
        experiment_config = {
            'events_table': 'events',
            'entity_column_name': 'entity_id',
            'model_comment': 'test2-final-final',
            'feature_aggregations': feature_config,
            'temporal_config': temporal_config,
            'grid_config': grid_config,
            'scoring': scoring_config,
        }
        with TemporaryDirectory() as temp_dir:
            pipeline_class(
                config=experiment_config,
                db_engine=db_engine,
                model_storage_class=InMemoryModelStorageEngine,
                project_path=os.path.join(temp_dir, 'inspections')
            ).run()

        # assert
        # 1. that model groups entries are present
        num_mgs = len([
            row for row in
            db_engine.execute('select * from results.model_groups')
        ])
        assert num_mgs > 0

        # 2. that model entries are present, and linked to model groups
        num_models = len([
            row for row in db_engine.execute('''
                select * from results.model_groups
                join results.models using (model_group_id)
                where model_comment = 'test2-final-final'
            ''')
        ])
        assert num_models > 0

        # 3. predictions, linked to models
        num_predictions = len([
            row for row in db_engine.execute('''
                select * from results.predictions
                join results.models using (model_id)''')
        ])
        assert num_predictions > 0

        # 4. evaluations linked to predictions linked to models
        num_evaluations = len([
            row for row in db_engine.execute('''
                select * from results.evaluations e
                join results.models using (model_id)
                join results.predictions p on (
                    e.model_id = p.model_id and
                    e.evaluation_start_time <= p.as_of_date and
                    e.evaluation_end_time > p.as_of_date)
            ''')
        ])
        assert num_evaluations > 0

def test_predictor():
    with testing.postgresql.Postgresql() as postgresql:
        db_engine = create_engine(postgresql.url())
        ensure_db(db_engine)
        with mock_s3():
            s3_conn = boto3.resource('s3')
            s3_conn.create_bucket(Bucket='econ-dev')
            project_path = 'econ-dev/inspections'
            model_storage_engine = S3ModelStorageEngine(s3_conn, project_path)
            _, model_id = fake_trained_model(
                project_path, model_storage_engine, db_engine)
            predictor = Predictor(
                project_path, model_storage_engine, db_engine)

            # create prediction set
            matrix = pandas.DataFrame.from_dict({
                'entity_id': [1, 2],
                'feature_one': [3, 4],
                'feature_two': [5, 6],
                'label': [7, 8]
            }).set_index('entity_id')
            metadata = {
                'label_name': 'label',
                'end_time': AS_OF_DATE,
                'label_window': '3month',
                'metta-uuid': '1234',
            }
            matrix_store = InMemoryMatrixStore(matrix, metadata)
            predict_proba = predictor.predict(
                model_id, matrix_store, misc_db_parameters=dict())

            # assert
            # 1. that the returned predictions are of the desired length
            assert len(predict_proba) == 2

            # 2. that the predictions table entries are present and
            # can be linked to the original models
            records = [
                row for row in db_engine.execute(
                    '''select entity_id, as_of_date
                    from results.predictions
                    join results.models using (model_id)''')
            ]
            assert len(records) == 2

            # 3. that the contained as_of_dates match what we sent in
            for record in records:
                assert record[1].date() == AS_OF_DATE

            # 4. that the entity ids match the given dataset
            assert sorted([record[0] for record in records]) == [1, 2]

            # 5. running with the same model_id and a different as-of date,
            # then again with the original as-of date, only replaces the
            # records with the same date
            new_matrix = pandas.DataFrame.from_dict({
                'entity_id': [1, 2],
                'feature_one': [3, 4],
                'feature_two': [5, 6],
                'label': [7, 8]
            }).set_index('entity_id')
            new_metadata = {
                'label_name': 'label',
                'end_time': AS_OF_DATE + datetime.timedelta(days=1),
                'label_window': '3month',
                'metta-uuid': '1234',
            }
            new_matrix_store = InMemoryMatrixStore(new_matrix, new_metadata)
            predictor.predict(
                model_id, new_matrix_store, misc_db_parameters=dict())
            predictor.predict(
                model_id, matrix_store, misc_db_parameters=dict())
            records = [
                row for row in db_engine.execute(
                    '''select entity_id, as_of_date
                    from results.predictions
                    join results.models using (model_id)''')
            ]
            assert len(records) == 4

            # 6. that we can delete the model when we're done predicting
            # with it
            predictor.delete_model(model_id)
            assert predictor.load_model(model_id) is None

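# Point 5 above encodes replace-by-date semantics: writing predictions for
# a (model_id, as_of_date) pair replaces only the rows with that date,
# leaving other dates untouched. A hedged sketch of that behavior; the
# results.predictions column list here is an assumption, not the schema:

def save_predictions(db_engine, model_id, as_of_date, scored_rows):
    """Delete-then-insert keyed on (model_id, as_of_date)."""
    db_engine.execute(
        '''delete from results.predictions
        where model_id = %s and as_of_date = %s''',
        (model_id, as_of_date))
    for entity_id, score in scored_rows:
        db_engine.execute(
            '''insert into results.predictions
            (model_id, entity_id, as_of_date, score)
            values (%s, %s, %s, %s)''',
            (model_id, entity_id, as_of_date, score))
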
def test_simple_model_trainer():
    with testing.postgresql.Postgresql() as postgresql:
        engine = create_engine(postgresql.url())
        ensure_db(engine)
        model_config = {
            'sklearn.linear_model.LogisticRegression': {
                'C': [0.00001, 0.0001],
                'penalty': ['l1', 'l2'],
                'random_state': [2193]
            }
        }
        with mock_s3():
            s3_conn = boto3.resource('s3')
            s3_conn.create_bucket(Bucket='econ-dev')

            # create training set
            with fake_metta({
                'entity_id': [1, 2],
                'feature_one': [3, 4],
                'feature_two': [5, 6],
                'label': ['good', 'bad']
            }, {'label_name': 'label'}) as (matrix_path, metadata_path):
                trainer = SimpleModelTrainer(
                    training_set_path=matrix_path,
                    training_metadata_path=metadata_path,
                    model_config=model_config,
                    project_path='econ-dev/inspections',
                    s3_conn=s3_conn,
                    db_engine=engine
                )
                cache_keys = trainer.train_models()

                # assert
                # 1. that all four models are cached
                model_pickles = [
                    pickle.loads(cache_key.get()['Body'].read())
                    for cache_key in cache_keys
                ]
                assert len(model_pickles) == 4
                assert len([x for x in model_pickles if x is not None]) == 4

                # 2. that the unpickled models can make predictions
                test_matrix = pandas.DataFrame.from_dict({
                    'entity_id': [3, 4],
                    'feature_one': [4, 4],
                    'feature_two': [6, 5],
                })
                for model_pickle in model_pickles:
                    predictions = model_pickle.predict(test_matrix)
                    assert len(predictions) == 2

                # 3. that the models table entries are present
                records = [
                    row for row in
                    engine.execute('select * from results.models')
                ]
                assert len(records) == 4

                records = [
                    row for row in engine.execute(
                        'select * from results.feature_importances')
                ]
                assert len(records) == 4 * 3  # maybe exclude entity_id?

def test_integration():
    with testing.postgresql.Postgresql() as postgresql:
        db_engine = create_engine(postgresql.url())
        ensure_db(db_engine)
        with mock_s3():
            s3_conn = boto3.resource('s3')
            s3_conn.create_bucket(Bucket='econ-dev')
            project_path = 'econ-dev/inspections'

            # create train and test matrices
            train_matrix = pandas.DataFrame.from_dict({
                'entity_id': [1, 2],
                'feature_one': [3, 4],
                'feature_two': [5, 6],
                'label': [7, 8]
            }).set_index('entity_id')
            train_metadata = {
                'start_time': datetime.date(2012, 12, 20),
                'end_time': datetime.date(2016, 12, 20),
                'label_name': 'label',
                'prediction_window': '1y',
                'feature_names': ['ft1', 'ft2']
            }
            train_store = InMemoryMatrixStore(train_matrix, train_metadata)
            as_of_dates = [
                datetime.date(2016, 12, 21),
                datetime.date(2017, 1, 21)
            ]
            test_stores = [
                InMemoryMatrixStore(
                    pandas.DataFrame.from_dict({
                        'entity_id': [3],
                        'feature_one': [8],
                        'feature_two': [5],
                        'label': [5]
                    }).set_index('entity_id'),
                    {'label_name': 'label', 'end_time': as_of_date}
                )
                for as_of_date in as_of_dates
            ]
            model_storage_engine = S3ModelStorageEngine(s3_conn, project_path)

            # instantiate pipeline objects
            trainer = ModelTrainer(
                project_path=project_path,
                model_storage_engine=model_storage_engine,
                matrix_store=None,
                db_engine=db_engine,
            )
            predictor = Predictor(
                project_path, model_storage_engine, db_engine)
            model_scorer = ModelScorer(
                [{'metrics': ['precision@'], 'thresholds': {'top_n': [5]}}],
                db_engine)

            # run the pipeline
            grid_config = {
                'sklearn.linear_model.LogisticRegression': {
                    'C': [0.00001, 0.0001],
                    'penalty': ['l1', 'l2'],
                    'random_state': [2193]
                }
            }
            model_ids = trainer.train_models(
                grid_config=grid_config,
                misc_db_parameters=dict(),
                matrix_store=train_store)

            for model_id in model_ids:
                for as_of_date, test_store in zip(as_of_dates, test_stores):
                    predictions, predictions_proba = predictor.predict(
                        model_id, test_store, misc_db_parameters=dict())
                    model_scorer.score(
                        predictions_proba,
                        predictions,
                        test_store.labels(),
                        model_id,
                        as_of_date,
                        as_of_date,
                        '6month')

            # assert
            # 1. that the predictions table entries are present and
            # can be linked to the original models
            records = [
                row for row in db_engine.execute(
                    '''select entity_id, model_id, as_of_date
                    from results.predictions
                    join results.models using (model_id)
                    order by 3, 2''')
            ]
            assert records == [
                (3, 1, datetime.datetime(2016, 12, 21)),
                (3, 2, datetime.datetime(2016, 12, 21)),
                (3, 3, datetime.datetime(2016, 12, 21)),
                (3, 4, datetime.datetime(2016, 12, 21)),
                (3, 1, datetime.datetime(2017, 1, 21)),
                (3, 2, datetime.datetime(2017, 1, 21)),
                (3, 3, datetime.datetime(2017, 1, 21)),
                (3, 4, datetime.datetime(2017, 1, 21)),
            ]

            # 2. that evaluations are there
            records = [
                row for row in db_engine.execute('''
                    select model_id, evaluation_start_time, metric, parameter
                    from results.evaluations order by 2, 1''')
            ]
            assert records == [
                (1, datetime.datetime(2016, 12, 21), 'precision@', '5_abs'),
                (2, datetime.datetime(2016, 12, 21), 'precision@', '5_abs'),
                (3, datetime.datetime(2016, 12, 21), 'precision@', '5_abs'),
                (4, datetime.datetime(2016, 12, 21), 'precision@', '5_abs'),
                (1, datetime.datetime(2017, 1, 21), 'precision@', '5_abs'),
                (2, datetime.datetime(2017, 1, 21), 'precision@', '5_abs'),
                (3, datetime.datetime(2017, 1, 21), 'precision@', '5_abs'),
                (4, datetime.datetime(2017, 1, 21), 'precision@', '5_abs'),
            ]

def test_predictor_retrieve():
    with testing.postgresql.Postgresql() as postgresql:
        db_engine = create_engine(postgresql.url())
        ensure_db(db_engine)
        project_path = 'econ-dev/inspections'
        model_storage_engine = InMemoryModelStorageEngine(project_path)
        _, model_id = fake_trained_model(
            project_path, model_storage_engine, db_engine)
        predictor = Predictor(
            project_path, model_storage_engine, db_engine, replace=False)
        dayone = datetime.date(2011, 1, 1).isoformat()
        daytwo = datetime.date(2011, 1, 2).isoformat()
        # create prediction set
        matrix_data = {
            'entity_id': [1, 2, 1, 2],
            'as_of_date': [dayone, dayone, daytwo, daytwo],
            'feature_one': [3, 4, 5, 6],
            'feature_two': [5, 6, 7, 8],
            'label': [7, 8, 8, 7]
        }
        matrix = pandas.DataFrame.from_dict(matrix_data)\
            .set_index(['entity_id', 'as_of_date'])
        metadata = {
            'label_name': 'label',
            'end_time': AS_OF_DATE,
            'label_window': '3month',
            'metta-uuid': '1234',
        }
        matrix_store = InMemoryMatrixStore(matrix, metadata)
        predict_proba = predictor.predict(
            model_id, matrix_store, misc_db_parameters=dict())

        # When run again, the predictions retrieved from the database
        # should match.
        #
        # Some trickiness here. Let's explain:
        #
        # If we are not careful, retrieving predictions from the database
        # and presenting them as a numpy array can produce a bad ordering,
        # since the given matrix may not be ordered by any criteria that
        # can be easily expressed in an ORDER BY clause.
        #
        # It will sometimes work anyway, because without ORDER BY you get
        # rows back in the table's physical order, which (unless something
        # has happened to the table) is the insertion order, which could
        # very well match the order in the matrix. So this is not a bug
        # that would necessarily show itself immediately, but when it does
        # go wrong your scores will be garbage.
        #
        # So we simulate a table-order mutation that can happen over time:
        # remove the first row and put it at the end. If the Predictor
        # doesn't explicitly reorder the results, this test will fail.
        session = sessionmaker(bind=db_engine)()
        obj = session.query(Prediction).first()
        session.delete(obj)
        session.commit()

        make_transient(obj)
        session = sessionmaker(bind=db_engine)()
        session.add(obj)
        session.commit()

        predictor.load_model = Mock()
        new_predict_proba = predictor.predict(
            model_id, matrix_store, misc_db_parameters=dict())
        assert_array_equal(new_predict_proba, predict_proba)
        assert not predictor.load_model.called

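# A minimal sketch of the explicit reordering the comment above demands,
# assuming retrieved predictions come back as a DataFrame indexed by the
# same (entity_id, as_of_date) pairs as the matrix; 'score' as the column
# name is an assumption. reindex aligns rows to the matrix's own order no
# matter what order the table returned them in:

import pandas

def scores_in_matrix_order(matrix, retrieved):
    """Align database rows to the matrix index before making an array."""
    return retrieved.reindex(matrix.index)['score'].values
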