def test_custom_groups(sample_matrix_store, grid_config):
    """Training with a ModelGrouper keyed only on ``class_path`` should
    collapse the entire grid into a single model group.
    """
    with testing.postgresql.Postgresql() as postgresql:
        engine = create_engine(postgresql.url())
        ensure_db(engine)
        init_engine(engine)
        with mock_s3():
            bucket_conn = boto3.resource('s3')
            bucket_conn.create_bucket(Bucket='econ-dev')
            # seed a matrices row with the uuid the trainer expects
            MatrixFactory(matrix_uuid="1234")
            session.commit()
            # create training set
            project_path = 'econ-dev/inspections'
            storage_engine = S3ModelStorageEngine(project_path)
            trainer = ModelTrainer(
                project_path=project_path,
                experiment_hash=None,
                model_storage_engine=storage_engine,
                model_grouper=ModelGrouper(['class_path']),
                db_engine=engine,
            )
            model_ids = trainer.train_models(
                grid_config=grid_config,
                misc_db_parameters=dict(),
                matrix_store=sample_matrix_store,
            )
            # expect only one model group now
            group_ids = []
            for row in engine.execute(
                    'select distinct model_group_id from model_metadata.models'):
                group_ids.append(row[0])
            assert len(group_ids) == 1
            assert group_ids[0] == model_ids[0]
def get_matrix_store(project_storage, matrix=None, metadata=None, write_to_db=True):
    """Return a matrix store associated with the given project storage.
    Also adds an entry in the matrices table if it doesn't exist already

    Args:
        project_storage (triage.component.catwalk.storage.ProjectStorage) A project's storage
        matrix (dataframe, optional): A matrix to store. Defaults to the output of matrix_creator()
        metadata (dict, optional): matrix metadata. defaults to the output of matrix_metadata_creator()
        write_to_db (bool, optional): whether to ensure a row for this matrix's
            uuid exists in the matrices table. Defaults to True.

    Returns:
        the populated, saved MatrixStore
    """
    if matrix is None:
        matrix = matrix_creator()
    if not metadata:
        metadata = matrix_metadata_creator()
    # NOTE(review): these two statements mutate the caller's dataframe in place
    # (timestamp conversion + set_index) — confirm callers don't reuse `matrix`.
    matrix["as_of_date"] = matrix["as_of_date"].apply(pd.Timestamp)
    matrix.set_index(MatrixStore.indices, inplace=True)
    # the store's identifier is a deterministic hash of the metadata dict
    matrix_store = project_storage.matrix_storage_engine().get_store(
        filename_friendly_hash(metadata))
    matrix_store.metadata = metadata
    # pop the label column off a copy so matrix and labels are stored separately
    new_matrix = matrix.copy()
    labels = new_matrix.pop(matrix_store.label_column_name)
    matrix_store.matrix_label_tuple = new_matrix, labels
    matrix_store.save()
    matrix_store.clear_cache()
    if write_to_db:
        # only insert (and commit) if no matrices row exists for this uuid yet
        if (session.query(Matrix).filter(
                Matrix.matrix_uuid == matrix_store.uuid).count() == 0):
            MatrixFactory(matrix_uuid=matrix_store.uuid)
            session.commit()
    return matrix_store
def test_baseline_exception_handling(sample_matrix_store):
    """A baseline task that fails (here, presumably the one referencing
    'feature_three') should produce None rather than raising, while the
    valid task still trains a model — hence the expected [1, None].
    """
    grid_config = {
        'triage.component.catwalk.baselines.rankers.PercentileRankOneFeature': {
            'feature': ['feature_one', 'feature_three']
        }
    }
    with testing.postgresql.Postgresql() as postgresql:
        db_engine = create_engine(postgresql.url())
        project_path = 'econ-dev/inspections'
        model_storage_engine = S3ModelStorageEngine(project_path)
        ensure_db(db_engine)
        init_engine(db_engine)
        with mock_s3():
            boto3.resource('s3').create_bucket(Bucket='econ-dev')
            trainer = ModelTrainer(
                project_path='econ-dev/inspections',
                experiment_hash=None,
                model_storage_engine=model_storage_engine,
                db_engine=db_engine,
                model_grouper=ModelGrouper(),
            )
            train_tasks = trainer.generate_train_tasks(
                grid_config, dict(), sample_matrix_store)
            # Creates a matrix entry in the matrices table with uuid from train_metadata
            MatrixFactory(matrix_uuid="1234")
            session.commit()
            model_ids = [
                trainer.process_train_task(**task) for task in train_tasks
            ]
            assert model_ids == [1, None]
def replace_db(arg):
    # Stand up a replacement postgres server on the same port and re-seed it.
    # NOTE(review): `self` and `port` come from the enclosing scope (this is a
    # callback); `arg` is accepted but unused — presumably required by the
    # caller's signature. Confirm against the call site.
    self.new_server = testing.postgresql.Postgresql(port=port)
    engine = create_engine(self.new_server.url())
    ensure_db(engine)
    init_engine(engine)
    # Creates a matrix entry in the matrices table with uuid from train_metadata
    MatrixFactory(matrix_uuid="1234")
    session.commit()
def update_ranks_test(predictor, entities_scores_labels, rank_col,
                      expected_result, model_random_seed=12345,
                      need_seed_data=True):
    """Not a test in itself but rather a utility called by many of the ranking tests

    Seeds a model/matrix/prediction set (optionally), asks the predictor to
    write ranks, then checks both the ranked predictions and the
    prediction_metadata bookkeeping.

    Args:
        predictor (Predictor): predictor wired to a live test db engine
        entities_scores_labels (iterable): (entity_id, score, label) triples to seed
        rank_col (str): name of the rank column under test
        expected_result (tuple): expected (entity_id, rank) rows in rank order
        model_random_seed (int): random seed recorded on the seeded model
        need_seed_data (bool): whether to create the seed rows (skip on re-runs)
    """
    ensure_db(predictor.db_engine)
    init_engine(predictor.db_engine)
    model_id = 5
    matrix_uuid = "4567"
    matrix_type = "test"
    as_of_date = datetime.datetime(2012, 1, 1)
    if need_seed_data:
        matrix = MatrixFactory(matrix_uuid=matrix_uuid)
        model = ModelFactory(model_id=model_id, random_seed=model_random_seed)
        for entity_id, score, label in entities_scores_labels:
            PredictionFactory(
                model_rel=model,
                matrix_rel=matrix,
                as_of_date=as_of_date,
                entity_id=entity_id,
                score=score,
                label_value=int(label))
        factory_session.commit()
    predictor.update_db_with_ranks(
        model_id=model_id,
        matrix_uuid=matrix_uuid,
        matrix_type=TestMatrixType,
    )
    ranks = tuple(row for row in predictor.db_engine.execute(
        f'''
        select entity_id, {rank_col}::float
        from {matrix_type}_results.predictions
        where as_of_date = %s and model_id = %s and matrix_uuid = %s
        order by {rank_col} asc''',
        (as_of_date, model_id, matrix_uuid)))
    assert ranks == expected_result

    # Test that the predictions metadata table is populated
    metadata_records = [
        row for row in predictor.db_engine.execute(
            f"""select tiebreaker_ordering, prediction_metadata.random_seed, models.random_seed
            from {matrix_type}_results.prediction_metadata
            join triage_metadata.models using (model_id)
            join triage_metadata.matrices using (matrix_uuid)
            """)
    ]
    assert len(metadata_records) == 1
    tiebreaker_ordering, random_seed, received_model_random_seed = metadata_records[0]
    if tiebreaker_ordering == 'random':
        # BUGFIX: was `random_seed is model_random_seed`. Identity comparison on
        # ints relies on CPython small-int interning and can spuriously fail for
        # arbitrary seed values; compare by value instead.
        assert random_seed == model_random_seed
    else:
        # non-random orderings should not record a seed
        assert not random_seed
    assert tiebreaker_ordering == predictor.rank_order
    assert received_model_random_seed == model_random_seed
def test_n_jobs_not_new_model(sample_matrix_store):
    """Varying n_jobs (a runtime knob, not a hyperparameter) must not create
    extra train tasks, and must never appear in stored model-group parameters.
    """
    grid_config = {
        'sklearn.ensemble.AdaBoostClassifier': {
            'n_estimators': [10, 100, 1000]
        },
        'sklearn.ensemble.RandomForestClassifier': {
            'n_estimators': [10, 100],
            'max_features': ['sqrt', 'log2'],
            'max_depth': [5, 10, 15, 20],
            'criterion': ['gini', 'entropy'],
            'n_jobs': [12, 24],
        }
    }
    with testing.postgresql.Postgresql() as postgresql:
        db_engine = create_engine(postgresql.url())
        ensure_db(db_engine)
        init_engine(db_engine)
        with mock_s3():
            boto3.resource('s3').create_bucket(Bucket='econ-dev')
            trainer = ModelTrainer(
                project_path='econ-dev/inspections',
                experiment_hash=None,
                model_storage_engine=S3ModelStorageEngine('econ-dev/inspections'),
                db_engine=db_engine,
                model_grouper=ModelGrouper(),
            )
            tasks = trainer.generate_train_tasks(
                grid_config,
                dict(),
                sample_matrix_store,
            )
            # Creates a matrix entry in the matrices table with uuid from train_metadata
            MatrixFactory(matrix_uuid="1234")
            session.commit()
            # 32+3, would be (32*2)+3 if we didn't remove
            assert len(tasks) == 35
            with_n_jobs = sum(1 for task in tasks if 'n_jobs' in task['parameters'])
            assert with_n_jobs == 32
            for task in tasks:
                trainer.process_train_task(**task)
            for row in db_engine.execute(
                    'select model_parameters from model_metadata.model_groups'):
                assert 'n_jobs' not in row[0]
def test_prediction_ranks_multiple_dates(project_storage, db_engine):
    """Ranks must be assigned correctly when one matrix spans several as-of-dates.

    Within-date ties are deliberately avoided so the tiebreaker logic stays out
    of the picture; the data is still adversarial for a naive ranking method:
    the two dates order their entities differently, and each date contains an
    entity the other lacks.
    """
    ensure_db(db_engine)
    init_engine(db_engine)
    predictor = Predictor(project_storage.model_storage_engine(), db_engine, 'worst')
    model_id = 5
    matrix_uuid = "4567"
    matrix_type = "test"
    seed_rows = (
        (23, datetime.datetime(2012, 1, 1), 0.95),
        (34, datetime.datetime(2012, 1, 1), 0.94),
        (45, datetime.datetime(2013, 1, 1), 0.92),
        (23, datetime.datetime(2013, 1, 1), 0.45),
    )
    expected_ranks = (
        (23, datetime.datetime(2012, 1, 1), 1),
        (34, datetime.datetime(2012, 1, 1), 2),
        (45, datetime.datetime(2013, 1, 1), 3),
        (23, datetime.datetime(2013, 1, 1), 4),
    )
    matrix = MatrixFactory(matrix_uuid=matrix_uuid)
    model = ModelFactory(model_id=model_id)
    for entity_id, as_of_date, score in seed_rows:
        PredictionFactory(
            model_rel=model,
            matrix_rel=matrix,
            as_of_date=as_of_date,
            entity_id=entity_id,
            score=score,
        )
    factory_session.commit()
    predictor.update_db_with_ranks(
        model_id=model_id,
        matrix_uuid=matrix_uuid,
        matrix_type=TestMatrixType,
    )
    observed = tuple(row for row in predictor.db_engine.execute(
        f'''
        select entity_id, as_of_date, rank_abs_no_ties
        from {matrix_type}_results.predictions
        where model_id = %s and matrix_uuid = %s
        order by rank_abs_no_ties''',
        (model_id, matrix_uuid)))
    assert observed == expected_ranks
def test_predictor_get_train_columns():
    """Prediction should work for both train and test matrices when the test
    matrix's columns are in a different order than the train matrix's."""
    with testing.postgresql.Postgresql() as postgresql:
        db_engine = create_engine(postgresql.url())
        ensure_db(db_engine)
        init_engine(db_engine)
        project_path = 'econ-dev/inspections'
        with tempfile.TemporaryDirectory() as temp_dir:
            train_store, test_store = sample_metta_csv_diff_order(temp_dir)
            model_storage_engine = InMemoryModelStorageEngine(project_path)
            _, model_id = fake_trained_model(
                project_path,
                model_storage_engine,
                db_engine,
                train_matrix_uuid=train_store.uuid,
            )
            predictor = Predictor(project_path, model_storage_engine, db_engine)
            # The train_store uuid is stored in fake_trained_model. Storing the other
            MatrixFactory(matrix_uuid=test_store.uuid)
            session.commit()
            # Runs the same test for training and testing predictions
            for store, mat_type in zip((train_store, test_store), ("train", "test")):
                predict_proba = predictor.predict(
                    model_id,
                    store,
                    misc_db_parameters=dict(),
                    train_matrix_columns=train_store.columns(),
                )
                # assert
                # 1. that we calculated predictions
                assert len(predict_proba) > 0
                # 2. that the predictions table entries are present and
                # can be linked to the original models
                records = list(db_engine.execute(
                    '''select entity_id, as_of_date
                    from {}_results.{}_predictions
                    join model_metadata.models using (model_id)'''.format(
                        mat_type, mat_type)))
                assert len(records) > 0
def get_matrix_store(project_storage, matrix=None, metadata=None):
    """Build (or fetch) a matrix store under the given project storage and make
    sure the matrices table knows about its uuid.

    Args:
        project_storage (triage.component.catwalk.storage.ProjectStorage) A project's storage
        matrix (dataframe, optional): matrix to store; defaults to matrix_creator()
        metadata (dict, optional): matrix metadata; defaults to matrix_metadata_creator()
    """
    if matrix is None:
        matrix = matrix_creator()
    metadata = metadata or matrix_metadata_creator()
    store = project_storage.matrix_storage_engine().get_store(metadata['metta-uuid'])
    store.matrix = matrix
    store.metadata = metadata
    store.save()
    already_recorded = session.query(Matrix).filter(
        Matrix.matrix_uuid == store.uuid).count()
    if already_recorded == 0:
        MatrixFactory(matrix_uuid=store.uuid)
        session.commit()
    return store
def test_model_trainer(sample_matrix_store, grid_config):
    # End-to-end exercise of ModelTrainer against a throwaway postgres and a
    # mocked S3 bucket: trains a grid, then checks persisted metadata, caching,
    # idempotent re-training, replace mode, cache regeneration, and the
    # generator interface. The numbered comments below mark each phase.
    with testing.postgresql.Postgresql() as postgresql:
        db_engine = create_engine(postgresql.url())
        ensure_db(db_engine)
        init_engine(db_engine)
        with mock_s3():
            s3_conn = boto3.resource('s3')
            s3_conn.create_bucket(Bucket='econ-dev')

            # Creates a matrix entry in the matrices table with uuid from metadata above
            MatrixFactory(matrix_uuid="1234")
            session.commit()

            project_path = 'econ-dev/inspections'
            model_storage_engine = S3ModelStorageEngine(project_path)
            trainer = ModelTrainer(
                project_path=project_path,
                experiment_hash=None,
                model_storage_engine=model_storage_engine,
                model_grouper=ModelGrouper(),
                db_engine=db_engine,
            )
            model_ids = trainer.train_models(
                grid_config=grid_config,
                misc_db_parameters=dict(),
                matrix_store=sample_matrix_store)

            # assert
            # 1. that the models and feature importances table entries are present
            # (assumes the fixture grid yields 4 models over 2 features — TODO confirm)
            records = [
                row for row in db_engine.execute(
                    'select * from train_results.feature_importances')
            ]
            assert len(records) == 4 * 2  # maybe exclude entity_id? yes

            records = [
                row for row in db_engine.execute(
                    'select model_hash from model_metadata.models')
            ]
            assert len(records) == 4
            hashes = [row[0] for row in records]

            # 2. that the model groups are distinct
            records = [
                row for row in db_engine.execute(
                    'select distinct model_group_id from model_metadata.models'
                )
            ]
            assert len(records) == 4

            # 3. that the model sizes are saved in the table and all are < 1 kB
            records = [
                row for row in db_engine.execute(
                    'select model_size from model_metadata.models')
            ]
            assert len(records) == 4
            for i in records:
                size = i[0]
                assert size < 1

            # 4. that all four models are cached
            model_pickles = [
                model_storage_engine.get_store(model_hash).load()
                for model_hash in hashes
            ]
            assert len(model_pickles) == 4
            assert len([x for x in model_pickles if x is not None]) == 4

            # 5. that their results can have predictions made on it
            test_matrix = pandas.DataFrame.from_dict({
                'entity_id': [3, 4],
                'feature_one': [4, 4],
                'feature_two': [6, 5],
            })

            test_matrix = InMemoryMatrixStore(
                matrix=test_matrix, metadata=sample_metadata())\
                .matrix

            for model_pickle in model_pickles:
                predictions = model_pickle.predict(test_matrix)
                assert len(predictions) == 2

            # 6. when run again, same models are returned
            new_model_ids = trainer.train_models(
                grid_config=grid_config,
                misc_db_parameters=dict(),
                matrix_store=sample_matrix_store)
            assert len([
                row for row in db_engine.execute(
                    'select model_hash from model_metadata.models')
            ]) == 4
            assert model_ids == new_model_ids

            # 7. if replace is set, update non-unique attributes and feature importances
            max_batch_run_time = [
                row[0] for row in db_engine.execute(
                    'select max(batch_run_time) from model_metadata.models')
            ][0]
            trainer = ModelTrainer(
                project_path=project_path,
                experiment_hash=None,
                model_storage_engine=model_storage_engine,
                model_grouper=ModelGrouper(
                    model_group_keys=['label_name', 'label_timespan']),
                db_engine=db_engine,
                replace=True)
            new_model_ids = trainer.train_models(
                grid_config=grid_config,
                misc_db_parameters=dict(),
                matrix_store=sample_matrix_store,
            )
            # replace mode must reuse the same model ids...
            assert model_ids == new_model_ids
            assert [
                row['model_id'] for row in db_engine.execute(
                    'select model_id from model_metadata.models order by 1 asc'
                )
            ] == model_ids
            # ...while refreshing non-unique attributes like batch_run_time
            new_max_batch_run_time = [
                row[0] for row in db_engine.execute(
                    'select max(batch_run_time) from model_metadata.models')
            ][0]
            assert new_max_batch_run_time > max_batch_run_time

            records = [
                row for row in db_engine.execute(
                    'select * from train_results.feature_importances')
            ]
            assert len(records) == 4 * 2  # maybe exclude entity_id? yes

            # 8. if the cache is missing but the metadata is still there, reuse the metadata
            for row in db_engine.execute(
                    'select model_hash from model_metadata.models'):
                model_storage_engine.get_store(row[0]).delete()
            new_model_ids = trainer.train_models(
                grid_config=grid_config,
                misc_db_parameters=dict(),
                matrix_store=sample_matrix_store)
            assert model_ids == sorted(new_model_ids)

            # 9. that the generator interface works the same way
            new_model_ids = trainer.generate_trained_models(
                grid_config=grid_config,
                misc_db_parameters=dict(),
                matrix_store=sample_matrix_store)
            assert model_ids == \
                sorted([model_id for model_id in new_model_ids])
def test_integration():
    # Full train -> predict -> evaluate pipeline against a throwaway postgres
    # and a mocked S3 bucket: 4 logistic-regression models, 2 test as-of-dates,
    # then checks the predictions and evaluations tables row-by-row.
    with testing.postgresql.Postgresql() as postgresql:
        db_engine = create_engine(postgresql.url())
        ensure_db(db_engine)
        init_engine(db_engine)
        with mock_s3():
            s3_conn = boto3.resource('s3')
            s3_conn.create_bucket(Bucket='econ-dev')
            project_path = 'econ-dev/inspections'

            # create train and test matrices
            train_matrix = pandas.DataFrame.from_dict({
                'entity_id': [1, 2],
                'feature_one': [3, 4],
                'feature_two': [5, 6],
                'label': [7, 8]
            }).set_index('entity_id')
            # NOTE(review): train_metadata is defined but never used below —
            # train_store is built from sample_metadata() instead. Confirm intent.
            train_metadata = {
                'feature_start_time': datetime.date(2012, 12, 20),
                'end_time': datetime.date(2016, 12, 20),
                'label_name': 'label',
                'label_timespan': '1y',
                'feature_names': ['ft1', 'ft2'],
                'metta-uuid': '1234',
                'indices': ['entity_id'],
                'matrix_type': 'train'
            }
            # Creates a matrix entry in the matrices table with uuid from train_metadata
            MatrixFactory(matrix_uuid="1234")
            session.commit()

            train_store = InMemoryMatrixStore(train_matrix, sample_metadata())

            as_of_dates = [
                datetime.date(2016, 12, 21),
                datetime.date(2017, 1, 21)
            ]
            # one single-entity test matrix per as-of-date
            test_stores = [
                InMemoryMatrixStore(
                    pandas.DataFrame.from_dict({
                        'entity_id': [3],
                        'feature_one': [8],
                        'feature_two': [5],
                        'label': [5]
                    }), {
                        'label_name': 'label',
                        'label_timespan': '1y',
                        'end_time': as_of_date,
                        'metta-uuid': '1234',
                        'indices': ['entity_id'],
                        'matrix_type': 'test',
                        'as_of_date_frequency': '1month'
                    }) for as_of_date in as_of_dates
            ]

            model_storage_engine = S3ModelStorageEngine(project_path)

            experiment_hash = save_experiment_and_get_hash({}, db_engine)
            # instantiate pipeline objects
            trainer = ModelTrainer(
                project_path=project_path,
                experiment_hash=experiment_hash,
                model_storage_engine=model_storage_engine,
                db_engine=db_engine,
            )
            predictor = Predictor(project_path, model_storage_engine, db_engine)
            model_evaluator = ModelEvaluator([{
                'metrics': ['precision@'],
                'thresholds': {
                    'top_n': [5]
                }
            }], [{}], db_engine)

            # run the pipeline
            # 2 C values x 2 penalties = 4 models
            grid_config = {
                'sklearn.linear_model.LogisticRegression': {
                    'C': [0.00001, 0.0001],
                    'penalty': ['l1', 'l2'],
                    'random_state': [2193]
                }
            }
            model_ids = trainer.train_models(
                grid_config=grid_config,
                misc_db_parameters=dict(),
                matrix_store=train_store)

            for model_id in model_ids:
                for as_of_date, test_store in zip(as_of_dates, test_stores):
                    predictions_proba = predictor.predict(
                        model_id,
                        test_store,
                        misc_db_parameters=dict(),
                        train_matrix_columns=['feature_one', 'feature_two'])

                    model_evaluator.evaluate(
                        predictions_proba,
                        test_store,
                        model_id,
                    )

            # assert
            # 1. that the predictions table entries are present and
            # can be linked to the original models
            records = [
                row for row in db_engine.execute(
                    '''select entity_id, model_id, as_of_date
                    from test_results.test_predictions
                    join model_metadata.models using (model_id)
                    order by 3, 2''')
            ]
            # one row per (model, as-of-date), all for entity 3
            assert records == [
                (3, 1, datetime.datetime(2016, 12, 21)),
                (3, 2, datetime.datetime(2016, 12, 21)),
                (3, 3, datetime.datetime(2016, 12, 21)),
                (3, 4, datetime.datetime(2016, 12, 21)),
                (3, 1, datetime.datetime(2017, 1, 21)),
                (3, 2, datetime.datetime(2017, 1, 21)),
                (3, 3, datetime.datetime(2017, 1, 21)),
                (3, 4, datetime.datetime(2017, 1, 21)),
            ]

            # that evaluations are there
            records = [
                row for row in db_engine.execute('''
                    select model_id, evaluation_start_time, metric, parameter
                    from test_results.test_evaluations order by 2, 1''')
            ]
            assert records == [
                (1, datetime.datetime(2016, 12, 21), 'precision@', '5_abs'),
                (2, datetime.datetime(2016, 12, 21), 'precision@', '5_abs'),
                (3, datetime.datetime(2016, 12, 21), 'precision@', '5_abs'),
                (4, datetime.datetime(2016, 12, 21), 'precision@', '5_abs'),
                (1, datetime.datetime(2017, 1, 21), 'precision@', '5_abs'),
                (2, datetime.datetime(2017, 1, 21), 'precision@', '5_abs'),
                (3, datetime.datetime(2017, 1, 21), 'precision@', '5_abs'),
                (4, datetime.datetime(2017, 1, 21), 'precision@', '5_abs'),
            ]