def test_hdf_matrix(self):
    """Build a test matrix using the HDF storage class and verify its row count."""
    with testing.postgresql.Postgresql() as postgresql:
        # Throwaway database seeded with fake feature/label/state tables.
        engine = create_engine(postgresql.url())
        ensure_db(engine)
        create_schemas(
            engine=engine,
            features_tables=features_tables,
            labels=labels,
            states=states,
        )
        with get_matrix_storage_engine() as matrix_storage_engine:
            # Force HDF-backed matrix storage for this run.
            matrix_storage_engine.matrix_storage_class = HDFMatrixStore
            builder = MatrixBuilder(
                db_config=db_config,
                matrix_storage_engine=matrix_storage_engine,
                engine=engine,
            )
            uuid = metta.generate_uuid(self.good_metadata)
            builder.build_matrix(
                as_of_times=self.good_dates,
                label_name='booking',
                label_type='binary',
                feature_dictionary=self.good_feature_dictionary,
                matrix_metadata=self.good_metadata,
                matrix_uuid=uuid,
                matrix_type='test',
            )
            assert len(matrix_storage_engine.get_store(uuid).matrix) == 5
def test_retry_max(self):
    """Training against a dead database retries several times, then raises."""
    db_engine = None
    trainer = None
    # set up a basic model training run
    # TODO abstract the setup of a basic model training run where
    # we don't worry about the specific values used? it would make
    # tests like this require a bit less noise to read past
    with testing.postgresql.Postgresql() as postgresql:
        db_engine = create_engine(postgresql.url())
        ensure_db(db_engine)
        init_engine(db_engine)
        trainer = ModelTrainer(
            project_path='econ-dev/inspections',
            experiment_hash=None,
            model_storage_engine=InMemoryModelStorageEngine(project_path=''),
            db_engine=db_engine,
            model_grouper=ModelGrouper(),
        )
    # the postgres server goes out of scope here and thus no longer exists
    with patch('time.sleep') as time_mock:
        with self.assertRaises(sqlalchemy.exc.OperationalError):
            trainer.train_models(grid_config(), dict(), sample_matrix_store())
        # we want to make sure that we are using the retrying module sanely
        # as opposed to matching the exact # of calls specified by the code
        assert len(time_mock.mock_calls) > 5
def test_model_grouping_default_config(sample_metadata):
    """Default ModelGrouper: classifier path and default metadata keys drive grouping."""
    with testing.postgresql.Postgresql() as postgresql:
        engine = create_engine(postgresql.url())
        ensure_db(engine)
        model_grouper = ModelGrouper()

        def group_id(class_path, metadata):
            # helper keeps each grouping scenario below to one readable line
            return model_grouper.get_model_group_id(
                class_path, {"param1": "val1"}, metadata, engine)

        # get the basic first model group with our default matrix
        assert group_id("module.Classifier", sample_metadata) == 1

        # the end time is not by default a model group key so changing it
        # should still get us the same group
        metadata_new_end_time = copy(sample_metadata)
        metadata_new_end_time["end_time"] = datetime.date(2017, 3, 20)
        assert group_id("module.Classifier", metadata_new_end_time) == 1

        # max_training_history is a default key, so it should trigger a new group
        metadata_train_history = copy(sample_metadata)
        metadata_train_history["max_training_history"] = "3y"
        assert group_id("module.Classifier", metadata_train_history) == 2

        # classifier is of course a default key as well
        assert group_id("module.OtherClassifier", sample_metadata) == 3
def test_n_jobs_not_new_model():
    """n_jobs must be stripped from stored model group parameters."""
    grid_config = {
        'sklearn.ensemble.AdaBoostClassifier': {
            'n_estimators': [10, 100, 1000]
        },
        'sklearn.ensemble.RandomForestClassifier': {
            'n_estimators': [10, 100],
            'max_features': ['sqrt', 'log2'],
            'max_depth': [5, 10, 15, 20],
            'criterion': ['gini', 'entropy'],
            'n_jobs': [12, 24],
        }
    }
    with testing.postgresql.Postgresql() as postgresql:
        engine = create_engine(postgresql.url())
        ensure_db(engine)
        with mock_s3():
            s3_conn = boto3.resource('s3')
            s3_conn.create_bucket(Bucket='econ-dev')
            trainer = ModelTrainer(
                project_path='econ-dev/inspections',
                experiment_hash=None,
                model_storage_engine=S3ModelStorageEngine('econ-dev/inspections'),
                db_engine=engine,
                model_group_keys=['label_name', 'label_timespan'],
            )
            matrix = pandas.DataFrame.from_dict({
                'entity_id': [1, 2],
                'feature_one': [3, 4],
                'feature_two': [5, 6],
                'label': ['good', 'bad'],
            })
            train_tasks = trainer.generate_train_tasks(
                grid_config,
                dict(),
                InMemoryMatrixStore(matrix, {
                    'label_timespan': '1d',
                    'end_time': datetime.datetime.now(),
                    'feature_start_time': datetime.date(2012, 12, 20),
                    'label_name': 'label',
                    'metta-uuid': '1234',
                    'feature_names': ['ft1', 'ft2'],
                    'indices': ['entity_id'],
                }),
            )

            assert len(train_tasks) == 35  # 32+3, would be (32*2)+3 if we didn't remove
            tasks_with_n_jobs = [
                task for task in train_tasks if 'n_jobs' in task['parameters']
            ]
            assert len(tasks_with_n_jobs) == 32

            for train_task in train_tasks:
                trainer.process_train_task(**train_task)

            # n_jobs never makes it into the persisted group parameters
            for row in engine.execute(
                'select model_parameters from results.model_groups'
            ):
                assert 'n_jobs' not in row[0]
def test_uniform_distribution_entity_id_index():
    """uniform_distribution returns a well-formed record per (top-ranked feature, entity)."""
    with testing.postgresql.Postgresql() as postgresql:
        db_engine = create_engine(postgresql.url())
        ensure_db(db_engine)
        init_engine(db_engine)
        model = ModelFactory()
        feature_importances = [
            FeatureImportanceFactory(model_rel=model,
                                     feature='feature_{}'.format(i))
            for i in range(0, 10)
        ]
        # every feature gets the same value for both entities
        data_dict = {'entity_id': [1, 2]}
        for imp in feature_importances:
            data_dict[imp.feature] = [0.5, 0.5]
        test_store = InMemoryMatrixStore(
            matrix=pandas.DataFrame.from_dict(data_dict),
            metadata=sample_metadata())
        session.commit()
        results = uniform_distribution(
            db_engine,
            model_id=model.model_id,
            as_of_date='2016-01-01',
            test_matrix_store=test_store,
            n_ranks=5)

        # n_ranks=5 caps the 10 seeded features: 5 top-ranked features x 2 entities
        assert len(results) == 10
        for result in results:
            assert 'entity_id' in result
            assert 'feature_name' in result
            assert 'score' in result
            assert 'feature_value' in result
            assert result['feature_value'] == 0.5
            assert 0 <= result['score'] <= 1
            assert isinstance(result['feature_name'], str)
            assert result['entity_id'] in [1, 2]
def test_custom_groups(sample_matrix_store, grid_config):
    """Grouping only on class_path collapses all trained models into one group."""
    with testing.postgresql.Postgresql() as postgresql:
        engine = create_engine(postgresql.url())
        ensure_db(engine)
        init_engine(engine)
        with mock_s3():
            s3_conn = boto3.resource('s3')
            s3_conn.create_bucket(Bucket='econ-dev')
            MatrixFactory(matrix_uuid="1234")
            session.commit()
            # create training set
            project_path = 'econ-dev/inspections'
            model_storage_engine = S3ModelStorageEngine(project_path)
            trainer = ModelTrainer(
                project_path=project_path,
                experiment_hash=None,
                model_storage_engine=model_storage_engine,
                model_grouper=ModelGrouper(['class_path']),
                db_engine=engine,
            )
            model_ids = trainer.train_models(
                grid_config=grid_config,
                misc_db_parameters=dict(),
                matrix_store=sample_matrix_store)
            # expect only one model group now
            records = [
                row[0]
                for row in engine.execute(
                    'select distinct model_group_id from model_metadata.models'
                )
            ]
            assert len(records) == 1
            assert records[0] == model_ids[0]
def test_test_matrix(self):
    """Build a 'test'-type matrix end to end and verify its row count."""
    with testing.postgresql.Postgresql() as postgresql:
        # Throwaway database seeded with fake feature/label/state tables.
        engine = create_engine(postgresql.url())
        ensure_db(engine)
        create_schemas(
            engine=engine,
            features_tables=features_tables,
            labels=labels,
            states=states,
        )
        with get_matrix_storage_engine() as matrix_storage_engine:
            builder = MatrixBuilder(
                db_config=db_config,
                matrix_storage_engine=matrix_storage_engine,
                experiment_hash=experiment_hash,
                engine=engine,
            )
            uuid = filename_friendly_hash(self.good_metadata)
            builder.build_matrix(
                as_of_times=self.good_dates,
                label_name="booking",
                label_type="binary",
                feature_dictionary=self.good_feature_dictionary,
                matrix_metadata=self.good_metadata,
                matrix_uuid=uuid,
                matrix_type="test",
            )
            assert len(matrix_storage_engine.get_store(uuid).design_matrix) == 5
def test_restart_experiment(experiment_class):
    """A rerun with replace=False must not rebuild the entity-date table."""
    with testing.postgresql.Postgresql() as postgresql:
        db_engine = create_engine(postgresql.url())
        ensure_db(db_engine)
        populate_source_data(db_engine)
        with TemporaryDirectory() as temp_dir:
            project_path = os.path.join(temp_dir, 'inspections')
            experiment = experiment_class(
                config=sample_config(),
                db_engine=db_engine,
                model_storage_class=FSModelStorageEngine,
                project_path=project_path,
            )
            experiment.run()
            evaluations = num_linked_evaluations(db_engine)
            assert evaluations > 0

            # second run over the same project path, asking not to replace
            experiment = experiment_class(
                config=sample_config(),
                db_engine=db_engine,
                model_storage_class=FSModelStorageEngine,
                project_path=project_path,
                replace=False,
            )
            experiment.make_entity_date_table = mock.Mock()
            experiment.run()
            assert not experiment.make_entity_date_table.called
def test_model_grouping_custom_config(sample_metadata):
    """Custom model_group_keys replace the defaults (classifier no longer a key)."""
    with testing.postgresql.Postgresql() as postgresql:
        engine = create_engine(postgresql.url())
        ensure_db(engine)
        model_grouper = ModelGrouper(
            model_group_keys=["feature_names", "as_of_date_frequency"])

        def group_id(class_path, metadata):
            # helper keeps each grouping scenario below to one readable line
            return model_grouper.get_model_group_id(
                class_path, {"param1": "val1"}, metadata, engine)

        # get the basic first model group with our default matrix
        assert group_id("module.Classifier", sample_metadata) == 1

        # classifier is now not a key, so changing it should not get a new id
        assert group_id("module.OtherClassifier", sample_metadata) == 1

        # as_of_date_frequency is a key, so it should trigger a new group
        metadata_frequency = copy(sample_metadata)
        metadata_frequency["as_of_date_frequency"] = "2w"
        assert group_id("module.Classifier", metadata_frequency) == 2

        # testing feature names may seem redundant but it is on a separate
        # code path so make sure its logic works
        metadata_features = copy(sample_metadata)
        metadata_features["feature_names"] = ["ft1", "ft3"]
        assert group_id("module.Classifier", metadata_features) == 3
def filter_same_train_end_times(self, engine):
    """Seed model groups with varying train_end_time coverage, run the filter.

    Groups 1 and 3 cover all four years; group 2 has only one timestamp and
    group 4 only two, so neither should survive the filter.
    """
    ensure_db(engine)
    init_engine(engine)
    mg1 = ModelGroupFactory(model_group_id=1, model_type='modelType1')
    mg2 = ModelGroupFactory(model_group_id=2, model_type='modelType2')
    mg3 = ModelGroupFactory(model_group_id=3, model_type='modelType3')
    mg4 = ModelGroupFactory(model_group_id=4, model_type='modelType4')
    # model group 1: complete coverage
    ModelFactory(model_group_rel=mg1, train_end_time=datetime(2014, 1, 1))
    ModelFactory(model_group_rel=mg1, train_end_time=datetime(2015, 1, 1))
    ModelFactory(model_group_rel=mg1, train_end_time=datetime(2016, 1, 1))
    ModelFactory(model_group_rel=mg1, train_end_time=datetime(2017, 1, 1))
    # model group 2: only one timestamp, should not pass
    ModelFactory(model_group_rel=mg2, train_end_time=datetime(2014, 1, 1))
    # model group 3: complete coverage
    ModelFactory(model_group_rel=mg3, train_end_time=datetime(2014, 1, 1))
    ModelFactory(model_group_rel=mg3, train_end_time=datetime(2015, 1, 1))
    ModelFactory(model_group_rel=mg3, train_end_time=datetime(2016, 1, 1))
    ModelFactory(model_group_rel=mg3, train_end_time=datetime(2017, 1, 1))
    # model group 4: only two timestamps, should not pass
    ModelFactory(model_group_rel=mg4, train_end_time=datetime(2015, 1, 1))
    ModelFactory(model_group_rel=mg4, train_end_time=datetime(2016, 1, 1))
    session.commit()

    train_end_times = [
        '2014-01-01', '2015-01-01', '2016-01-01', '2017-01-01'
    ]
    model_groups = [1, 2, 3, 4]
    model_group_ids = model_groups_filter(
        train_end_times=train_end_times,
        initial_model_group_ids=model_groups,
        models_table='models',
        db_engine=engine)
    return model_group_ids
def test_baseline_exception_handling(sample_matrix_store):
    """A baseline referencing a missing feature yields None rather than raising."""
    grid_config = {
        'triage.component.catwalk.baselines.rankers.PercentileRankOneFeature': {
            'feature': ['feature_one', 'feature_three']
        }
    }
    with testing.postgresql.Postgresql() as postgresql:
        db_engine = create_engine(postgresql.url())
        project_path = 'econ-dev/inspections'
        model_storage_engine = S3ModelStorageEngine(project_path)
        ensure_db(db_engine)
        init_engine(db_engine)
        with mock_s3():
            s3_conn = boto3.resource('s3')
            s3_conn.create_bucket(Bucket='econ-dev')
            trainer = ModelTrainer(
                project_path='econ-dev/inspections',
                experiment_hash=None,
                model_storage_engine=model_storage_engine,
                db_engine=db_engine,
                model_grouper=ModelGrouper())
            train_tasks = trainer.generate_train_tasks(
                grid_config, dict(), sample_matrix_store)
            # Creates a matrix entry in the matrices table with uuid from train_metadata
            MatrixFactory(matrix_uuid="1234")
            session.commit()
            # feature_three doesn't exist, so its task should come back as None
            model_ids = [
                trainer.process_train_task(**train_task)
                for train_task in train_tasks
            ]
            assert model_ids == [1, None]
def test_build_error_cleanup_timeout(_clean_up_mock, experiment_class):
    """A build error followed by a cleanup timeout surfaces as TimeoutError."""
    with testing.postgresql.Postgresql() as postgresql:
        db_engine = create_engine(postgresql.url())
        ensure_db(db_engine)
        with TemporaryDirectory() as temp_dir:
            experiment = experiment_class(
                config=sample_config(),
                db_engine=db_engine,
                model_storage_class=FSModelStorageEngine,
                project_path=os.path.join(temp_dir, 'inspections'),
                cleanup=True,
                cleanup_timeout=0.02,  # Set short timeout
            )
            with mock.patch.object(experiment, 'generate_matrices') as build_mock:
                build_mock.side_effect = RuntimeError('boom!')
                with pytest.raises(TimeoutError) as exc_info:
                    experiment()
                # Last exception is TimeoutError, but earlier error is preserved in
                # __context__, and will be noted as well in any standard traceback:
                assert exc_info.value.__context__ is build_mock.side_effect
def test_model_grouping_custom_config(sample_metadata):
    """Custom model_group_keys replace the defaults (classifier no longer a key)."""
    with testing.postgresql.Postgresql() as postgresql:
        engine = create_engine(postgresql.url())
        ensure_db(engine)
        model_grouper = ModelGrouper(
            model_group_keys=['feature_names', 'as_of_date_frequency'])

        def group_id(class_path, metadata):
            # helper keeps each grouping scenario below to one readable line
            return model_grouper.get_model_group_id(
                class_path, {'param1': 'val1'}, metadata, engine)

        # get the basic first model group with our default matrix
        assert group_id('module.Classifier', sample_metadata) == 1

        # classifier is now not a key, so changing it should not get a new id
        assert group_id('module.OtherClassifier', sample_metadata) == 1

        # as_of_date_frequency is a key, so it should trigger a new group
        metadata_frequency = copy(sample_metadata)
        metadata_frequency['as_of_date_frequency'] = '2w'
        assert group_id('module.Classifier', metadata_frequency) == 2

        # testing feature names may seem redundant but it is on a separate
        # code path so make sure its logic works
        metadata_features = copy(sample_metadata)
        metadata_features['feature_names'] = ['ft1', 'ft3']
        assert group_id('module.Classifier', metadata_features) == 3
def test_replace_false_rerun(self):
    """With replace=False, a second identical build skips the entity-date table."""
    with testing.postgresql.Postgresql() as postgresql:
        # create an engine and generate a table with fake feature data
        engine = create_engine(postgresql.url())
        ensure_db(engine)
        create_schemas(engine=engine,
                       features_tables=features_tables,
                       labels=labels,
                       states=states)
        dates = [
            datetime.datetime(2016, 1, 1, 0, 0),
            datetime.datetime(2016, 2, 1, 0, 0),
            datetime.datetime(2016, 3, 1, 0, 0),
        ]
        with get_matrix_storage_engine() as matrix_storage_engine:
            builder = MatrixBuilder(
                db_config=db_config,
                matrix_storage_engine=matrix_storage_engine,
                engine=engine,
                replace=False)
            feature_dictionary = {
                'features0': ['f1', 'f2'],
                'features1': ['f3', 'f4'],
            }
            matrix_metadata = {
                'matrix_id': 'hi',
                'state': 'state_one AND state_two',
                'label_name': 'booking',
                'end_time': datetime.datetime(2016, 3, 1, 0, 0),
                'feature_start_time': datetime.datetime(2016, 1, 1, 0, 0),
                'label_timespan': '1 month',
                'test_duration': '1 month',
                'indices': ['entity_id', 'as_of_date'],
            }
            uuid = metta.generate_uuid(matrix_metadata)
            # identical kwargs for both the first build and the rerun
            build_args = dict(
                as_of_times=dates,
                label_name='booking',
                label_type='binary',
                feature_dictionary=feature_dictionary,
                matrix_metadata=matrix_metadata,
                matrix_uuid=uuid,
                matrix_type='test')
            builder.build_matrix(**build_args)
            assert len(matrix_storage_engine.get_store(uuid).matrix) == 5
            # rerun
            builder.make_entity_date_table = Mock()
            builder.build_matrix(**build_args)
            assert not builder.make_entity_date_table.called
def test_experiment_validator():
    """Both query-based and filepath-based sample configs should validate."""
    with testing.postgresql.Postgresql() as postgresql:
        db_engine = create_engine(postgresql.url())
        ensure_db(db_engine)
        populate_source_data(db_engine)
        # patch open so the validator reads fixture configs rather than disk
        with mock.patch("triage.util.conf.open",
                        side_effect=open_side_effect) as mock_file:
            for config_source in ("query", "filepath"):
                ExperimentValidator(db_engine).run(sample_config(config_source))
def replace_db(arg):
    # Bring a fresh postgres server up on the same port, simulating a database
    # that comes back after an outage. `arg` is ignored (signature matches the
    # mock side_effect callback convention).
    # NOTE(review): relies on `self` and `port` as free variables — this reads
    # like a closure lifted out of a test method; confirm the enclosing scope.
    # Stored on self so the server isn't garbage-collected and shut down early.
    self.new_server = testing.postgresql.Postgresql(port=port)
    db_engine = create_engine(self.new_server.url())
    ensure_db(db_engine)
    init_engine(db_engine)
    # Creates a matrix entry in the matrices table with uuid from train_metadata
    MatrixFactory(matrix_uuid="1234")
    session.commit()
def test_save_experiment_and_get_hash():
    """Saving the same experiment config twice yields the same string hash."""
    # no reason to make assertions on the config itself, use a basic dict
    experiment_config = {"one": "two"}
    with testing.postgresql.Postgresql() as postgresql:
        engine = create_engine(postgresql.url())
        ensure_db(engine)
        exp_hash = save_experiment_and_get_hash(experiment_config, engine)
        assert isinstance(exp_hash, str)
        # idempotent: saving again returns the identical hash
        new_hash = save_experiment_and_get_hash(experiment_config, engine)
        assert new_hash == exp_hash
def test_replace_true_rerun(self):
    """With replace=True, a rerun rebuilds the matrix and keeps the DB record."""
    with testing.postgresql.Postgresql() as postgresql:
        # create an engine and generate a table with fake feature data
        engine = create_engine(postgresql.url())
        ensure_db(engine)
        create_schemas(
            engine=engine,
            features_tables=features_tables,
            labels=labels,
            states=states,
        )
        matrix_metadata = matrix_metadata_creator(state="active",
                                                  test_duration="1month",
                                                  label_name="booking")
        dates = [
            datetime.datetime(2016, 1, 1, 0, 0),
            datetime.datetime(2016, 2, 1, 0, 0),
            datetime.datetime(2016, 3, 1, 0, 0),
        ]
        feature_dictionary = {
            "features0": ["f1", "f2"],
            "features1": ["f3", "f4"]
        }
        uuid = filename_friendly_hash(matrix_metadata)
        # identical kwargs for the first build and the rerun
        build_args = dict(
            as_of_times=dates,
            label_name="booking",
            label_type="binary",
            feature_dictionary=feature_dictionary,
            matrix_metadata=matrix_metadata,
            matrix_uuid=uuid,
            matrix_type="test",
        )
        with get_matrix_storage_engine() as matrix_storage_engine:
            builder = MatrixBuilder(
                db_config=db_config,
                matrix_storage_engine=matrix_storage_engine,
                experiment_hash=experiment_hash,
                engine=engine,
                replace=True,
            )
            builder.build_matrix(**build_args)
            assert len(matrix_storage_engine.get_store(uuid).design_matrix) == 5
            assert builder.sessionmaker().query(Matrix).get(uuid)
            # rerun
            builder.build_matrix(**build_args)
            assert len(matrix_storage_engine.get_store(uuid).design_matrix) == 5
            assert builder.sessionmaker().query(Matrix).get(uuid)
def update_ranks_test(predictor, entities_scores_labels, rank_col,
                      expected_result, model_random_seed=12345,
                      need_seed_data=True):
    """Not a test in itself but rather a utility called by many of the ranking tests

    Seeds predictions (optionally), runs Predictor.update_db_with_ranks, and
    asserts both the resulting ranking order and the prediction_metadata row.

    Args:
        predictor: a Predictor instance wired to a live test database
        entities_scores_labels: iterable of (entity_id, score, label) tuples
        rank_col (str): name of the rank column to check, e.g. 'rank_abs_no_ties'
        expected_result (tuple): expected (entity_id, rank) rows, in rank order
        model_random_seed (int): seed recorded on the seeded model
        need_seed_data (bool): skip factory seeding when the caller already did it
    """
    ensure_db(predictor.db_engine)
    init_engine(predictor.db_engine)
    model_id = 5
    matrix_uuid = "4567"
    matrix_type = "test"
    as_of_date = datetime.datetime(2012, 1, 1)
    if need_seed_data:
        matrix = MatrixFactory(matrix_uuid=matrix_uuid)
        model = ModelFactory(model_id=model_id, random_seed=model_random_seed)
        for entity_id, score, label in entities_scores_labels:
            PredictionFactory(model_rel=model,
                              matrix_rel=matrix,
                              as_of_date=as_of_date,
                              entity_id=entity_id,
                              score=score,
                              label_value=int(label))
        factory_session.commit()

    predictor.update_db_with_ranks(
        model_id=model_id,
        matrix_uuid=matrix_uuid,
        matrix_type=TestMatrixType,
    )
    ranks = tuple(row for row in predictor.db_engine.execute(
        f'''
        select entity_id, {rank_col}::float
        from {matrix_type}_results.predictions
        where as_of_date = %s and model_id = %s and matrix_uuid = %s
        order by {rank_col} asc''', (as_of_date, model_id, matrix_uuid)))
    assert ranks == expected_result

    # Test that the predictions metadata table is populated
    metadata_records = [
        row for row in predictor.db_engine.execute(
            f"""select tiebreaker_ordering,
            prediction_metadata.random_seed,
            models.random_seed
            from {matrix_type}_results.prediction_metadata
            join triage_metadata.models using (model_id)
            join triage_metadata.matrices using (matrix_uuid)
            """)
    ]
    assert len(metadata_records) == 1
    tiebreaker_ordering, random_seed, received_model_random_seed = metadata_records[0]
    if tiebreaker_ordering == 'random':
        # BUGFIX: was `random_seed is model_random_seed` — `is` compares object
        # identity, and equal ints from the DB driver are generally distinct
        # objects outside CPython's small-int cache, so the assert could fail
        # (or pass) spuriously. Equality is what is meant here, matching the
        # `received_model_random_seed == model_random_seed` check below.
        assert random_seed == model_random_seed
    else:
        assert not random_seed
    assert tiebreaker_ordering == predictor.rank_order
    assert received_model_random_seed == model_random_seed
def test_Audition():
    """Seed models/evaluations for 10 groups and run a full AuditionRunner pass."""
    with testing.postgresql.Postgresql() as postgresql:
        db_engine = create_engine(postgresql.url())
        ensure_db(db_engine)
        init_engine(db_engine)
        num_model_groups = 10
        model_types = [
            "classifier type {}".format(i) for i in range(0, num_model_groups)
        ]
        model_groups = [
            ModelGroupFactory(model_type=model_type)
            for model_type in model_types
        ]
        train_end_times = [
            datetime(2013, 1, 1),
            datetime(2014, 1, 1),
            datetime(2015, 1, 1),
            datetime(2016, 1, 1),
        ]
        # one model per (group, train end time)
        models = [
            ModelFactory(model_group_rel=model_group,
                         train_end_time=train_end_time)
            for model_group in model_groups
            for train_end_time in train_end_times
        ]
        metrics = [
            ("precision@", "100_abs"),
            ("recall@", "100_abs"),
            ("precision@", "50_abs"),
            ("recall@", "50_abs"),
            ("fpr@", "10_pct"),
        ]

        class ImmediateEvalFactory(EvaluationFactory):
            # evaluations land at each model's own train_end_time
            evaluation_start_time = factory.LazyAttribute(
                lambda o: o.model_rel.train_end_time)

        for model in models:
            for (metric, parameter) in metrics:
                ImmediateEvalFactory(model_rel=model,
                                     metric=metric,
                                     parameter=parameter)
        session.commit()
        with tempfile.TemporaryDirectory() as td:
            with mock.patch('os.getcwd') as mock_getcwd:
                mock_getcwd.return_value = td
                AuditionRunner(config_dict=config,
                               db_engine=db_engine,
                               directory=td).run()
                assert len(os.listdir(os.getcwd())) == 6
def test_retry_recovery(self):
    """Training succeeds after exactly one retry once the database comes back."""
    grid_config = {
        'sklearn.ensemble.AdaBoostClassifier': {
            'n_estimators': [10]
        },
    }
    engine = None
    trainer = None
    port = None
    with testing.postgresql.Postgresql() as postgresql:
        port = postgresql.settings['port']
        engine = create_engine(postgresql.url())
        ensure_db(engine)
        trainer = ModelTrainer(
            project_path='econ-dev/inspections',
            experiment_hash=None,
            model_storage_engine=InMemoryModelStorageEngine(project_path=''),
            db_engine=engine,
            model_group_keys=['label_name', 'label_timespan'],
        )
        matrix = pandas.DataFrame.from_dict({
            'entity_id': [1, 2],
            'feature_one': [3, 4],
            'feature_two': [5, 6],
            'label': ['good', 'bad'],
        })
        matrix_store = InMemoryMatrixStore(matrix, {
            'label_timespan': '1d',
            'end_time': datetime.datetime.now(),
            'feature_start_time': datetime.date(2012, 12, 20),
            'label_name': 'label',
            'metta-uuid': '1234',
            'feature_names': ['ft1', 'ft2'],
            'indices': ['entity_id'],
        })
    # start without a database server
    # then bring it back up after the first sleep
    # use self so it doesn't go out of scope too early and shut down
    self.new_server = None

    def replace_db(arg):
        # restart postgres on the original port during the retry sleep
        self.new_server = testing.postgresql.Postgresql(port=port)
        engine = create_engine(self.new_server.url())
        ensure_db(engine)

    with patch('time.sleep') as time_mock:
        time_mock.side_effect = replace_db
        try:
            trainer.train_models(grid_config, dict(), matrix_store)
        finally:
            if self.new_server is not None:
                self.new_server.stop()
        # exactly one sleep means exactly one retry was needed
        assert len(time_mock.mock_calls) == 1
def rig_engines():
    """Set up a db engine and project storage engine

    Yields (tuple) (database engine, project storage engine)
    """
    with testing.postgresql.Postgresql() as postgresql:
        db_engine = create_engine(postgresql.url())
        ensure_db(db_engine)
        init_engine(db_engine)
        # project storage lives in a temp dir that is torn down with the db
        with tempfile.TemporaryDirectory() as temp_dir:
            yield db_engine, ProjectStorage(temp_dir)
def __init__(
    self,
    config,
    db_engine,
    model_storage_class=FSModelStorageEngine,
    project_path=None,
    replace=True,
    cleanup=False,
    cleanup_timeout=None,
):
    """Set up an experiment run: validate config, wire engines/storage, and
    register the experiment hash in the database.

    Args:
        config (dict): experiment configuration; version-checked first
        db_engine: SQLAlchemy engine (or anything duck-typed like one)
        model_storage_class: storage engine class for trained models
        project_path (str): root path for matrices and models
        replace (bool): whether later stages overwrite existing artifacts
        cleanup (bool): drop intermediate labels/states tables when done
        cleanup_timeout: seconds to allow cleanup; None keeps the default
    """
    self._check_config_version(config)
    self.config = config
    if isinstance(db_engine, Engine):
        # A live Engine can't be serialized to worker processes, so rebuild
        # from its URL; engine-level options beyond the URL are dropped.
        logging.warning(
            'Raw, unserializable SQLAlchemy engine passed. URL will be used, other options may be lost in multi-process environments'
        )
        self.db_engine = create_engine(db_engine.url)
    else:
        self.db_engine = db_engine
    if model_storage_class:
        self.model_storage_engine = model_storage_class(
            project_path=project_path)
    # can't be configurable until Architect obeys
    self.matrix_store_class = CSVMatrixStore
    self.project_path = project_path
    self.replace = replace
    ensure_db(self.db_engine)
    self.features_schema_name = 'features'
    if project_path:
        # make sure the matrices directory exists before any stage writes to it
        self.matrices_directory = os.path.join(self.project_path, 'matrices')
        if not os.path.exists(self.matrices_directory):
            os.makedirs(self.matrices_directory)
    # hash doubles as a stable identifier for labels table naming below
    self.experiment_hash = save_experiment_and_get_hash(
        self.config, self.db_engine)
    self.labels_table_name = 'labels_{}'.format(self.experiment_hash)
    self.initialize_components()
    self.cleanup = cleanup
    if self.cleanup:
        logging.info(
            'cleanup is set to True, so intermediate tables (labels and states) will be removed after matrix creation'
        )
    else:
        logging.info(
            'cleanup is set to False, so intermediate tables (labels and states) will not be removed after matrix creation'
        )
    # NOTE(review): when cleanup_timeout is None this reads self.cleanup_timeout
    # before any instance assignment, which only works if the enclosing class
    # declares a class-level cleanup_timeout default — confirm it does,
    # otherwise default construction raises AttributeError.
    self.cleanup_timeout = (self.cleanup_timeout
                            if cleanup_timeout is None else cleanup_timeout)
def test_train_matrix(self):
    """Build a 'train'-type CSV matrix and verify the file has header + 5 rows."""
    with testing.postgresql.Postgresql() as postgresql:
        # create an engine and generate a table with fake feature data
        engine = create_engine(postgresql.url())
        ensure_db(engine)
        create_schemas(engine=engine,
                       features_tables=features_tables,
                       labels=labels,
                       states=states)
        dates = [
            datetime.datetime(2016, 1, 1, 0, 0),
            datetime.datetime(2016, 2, 1, 0, 0),
            datetime.datetime(2016, 3, 1, 0, 0),
        ]
        with TemporaryDirectory() as temp_dir:
            builder = builders.HighMemoryCSVBuilder(
                db_config=db_config,
                matrix_directory=temp_dir,
                engine=engine)
            feature_dictionary = FeatureGroup(
                name='mygroup',
                features_by_table={
                    'features0': ['f1', 'f2'],
                    'features1': ['f3', 'f4'],
                })
            matrix_metadata = {
                'matrix_id': 'hi',
                'state': 'state_one AND state_two',
                'label_name': 'booking',
                'end_time': datetime.datetime(2016, 3, 1, 0, 0),
                'feature_start_time': datetime.datetime(2016, 1, 1, 0, 0),
                'label_timespan': '1 month',
                'max_training_history': '1 month'
            }
            uuid = metta.generate_uuid(matrix_metadata)
            builder.build_matrix(
                as_of_times=dates,
                label_name='booking',
                label_type='binary',
                feature_dictionary=feature_dictionary,
                matrix_directory=temp_dir,
                matrix_metadata=matrix_metadata,
                matrix_uuid=uuid,
                matrix_type='train')

            matrix_filename = os.path.join(temp_dir, '{}.csv'.format(uuid))
            with open(matrix_filename, 'r') as f:
                # 5 data rows plus the header line
                assert len(list(csv.reader(f))) == 6
def test_n_jobs_not_new_model(sample_matrix_store):
    """n_jobs must be stripped from stored model group parameters."""
    grid_config = {
        'sklearn.ensemble.AdaBoostClassifier': {
            'n_estimators': [10, 100, 1000]
        },
        'sklearn.ensemble.RandomForestClassifier': {
            'n_estimators': [10, 100],
            'max_features': ['sqrt', 'log2'],
            'max_depth': [5, 10, 15, 20],
            'criterion': ['gini', 'entropy'],
            'n_jobs': [12, 24],
        }
    }
    with testing.postgresql.Postgresql() as postgresql:
        db_engine = create_engine(postgresql.url())
        ensure_db(db_engine)
        init_engine(db_engine)
        with mock_s3():
            s3_conn = boto3.resource('s3')
            s3_conn.create_bucket(Bucket='econ-dev')
            trainer = ModelTrainer(
                project_path='econ-dev/inspections',
                experiment_hash=None,
                model_storage_engine=S3ModelStorageEngine('econ-dev/inspections'),
                db_engine=db_engine,
                model_grouper=ModelGrouper())
            train_tasks = trainer.generate_train_tasks(
                grid_config,
                dict(),
                sample_matrix_store,
            )
            # Creates a matrix entry in the matrices table with uuid from train_metadata
            MatrixFactory(matrix_uuid="1234")
            session.commit()

            assert len(train_tasks) == 35  # 32+3, would be (32*2)+3 if we didn't remove
            tasks_with_n_jobs = [
                task for task in train_tasks if 'n_jobs' in task['parameters']
            ]
            assert len(tasks_with_n_jobs) == 32

            for train_task in train_tasks:
                trainer.process_train_task(**train_task)

            # n_jobs never makes it into the persisted group parameters
            for row in db_engine.execute(
                'select model_parameters from model_metadata.model_groups'
            ):
                assert 'n_jobs' not in row[0]
def prepare_experiment(config):
    """Yield a SingleThreadedExperiment wired to a throwaway db and temp dir."""
    with testing.postgresql.Postgresql() as postgresql:
        db_engine = create_engine(postgresql.url())
        ensure_db(db_engine)
        populate_source_data(db_engine)
        with TemporaryDirectory() as temp_dir:
            # cleanup=False so callers can inspect intermediate tables
            yield SingleThreadedExperiment(
                config=config,
                db_engine=db_engine,
                model_storage_class=FSModelStorageEngine,
                project_path=os.path.join(temp_dir, 'inspections'),
                cleanup=False,
            )
def test_load_if_right_version(self):
    """Construction succeeds when the config declares the current version."""
    experiment_config = sample_config()
    experiment_config['config_version'] = CONFIG_VERSION
    with testing.postgresql.Postgresql() as postgresql:
        db_engine = create_engine(postgresql.url())
        ensure_db(db_engine)
        with TemporaryDirectory() as temp_dir:
            experiment = SingleThreadedExperiment(
                config=experiment_config,
                db_engine=db_engine,
                model_storage_class=FSModelStorageEngine,
                project_path=os.path.join(temp_dir, 'inspections'),
            )
            assert isinstance(experiment, SingleThreadedExperiment)
def test_cleanup_timeout(_clean_up_mock, experiment_class):
    """A cleanup that outlasts cleanup_timeout surfaces as TimeoutError."""
    with testing.postgresql.Postgresql() as postgresql:
        db_engine = create_engine(postgresql.url())
        ensure_db(db_engine)
        populate_source_data(db_engine)
        with TemporaryDirectory() as temp_dir:
            experiment = experiment_class(
                config=sample_config(),
                db_engine=db_engine,
                model_storage_class=FSModelStorageEngine,
                project_path=os.path.join(temp_dir, 'inspections'),
                cleanup_timeout=0.02,  # Set short timeout
            )
            with pytest.raises(TimeoutError):
                experiment()
def test_custom_label_name(experiment_class):
    """A custom label name propagates to the label generator and the planner."""
    with testing.postgresql.Postgresql() as postgresql:
        db_engine = create_engine(postgresql.url())
        ensure_db(db_engine)
        config = sample_config()
        config['label_config']['name'] = 'custom_label_name'
        with TemporaryDirectory() as temp_dir:
            experiment = experiment_class(
                config=config,
                db_engine=db_engine,
                model_storage_class=FSModelStorageEngine,
                project_path=os.path.join(temp_dir, 'inspections'),
            )
            assert experiment.label_generator.label_name == 'custom_label_name'
            assert experiment.planner.label_names == ['custom_label_name']
def test_prediction_ranks_multiple_dates(project_storage, db_engine):
    """make sure that multiple as-of-dates in a single matrix are handled correctly.
    keep the other variables simple by making no within-date ties that would
    end up testing the tiebreaker logic, just data for two dates with data
    that could theoretically confound a bad ranking method:
    - a different order for entities in both dates
    - each date has some not in the other
    """
    ensure_db(db_engine)
    init_engine(db_engine)
    predictor = Predictor(project_storage.model_storage_engine(),
                          db_engine, 'worst')
    model_id = 5
    matrix_uuid = "4567"
    matrix_type = "test"
    entities_dates_and_scores = (
        (23, datetime.datetime(2012, 1, 1), 0.95),
        (34, datetime.datetime(2012, 1, 1), 0.94),
        (45, datetime.datetime(2013, 1, 1), 0.92),
        (23, datetime.datetime(2013, 1, 1), 0.45),
    )
    # rank_abs_no_ties should order strictly by score across both dates
    expected_result = (
        (23, datetime.datetime(2012, 1, 1), 1),
        (34, datetime.datetime(2012, 1, 1), 2),
        (45, datetime.datetime(2013, 1, 1), 3),
        (23, datetime.datetime(2013, 1, 1), 4),
    )
    matrix = MatrixFactory(matrix_uuid=matrix_uuid)
    model = ModelFactory(model_id=model_id)
    for entity_id, as_of_date, score in entities_dates_and_scores:
        PredictionFactory(model_rel=model,
                          matrix_rel=matrix,
                          as_of_date=as_of_date,
                          entity_id=entity_id,
                          score=score)
    factory_session.commit()

    predictor.update_db_with_ranks(
        model_id=model_id,
        matrix_uuid=matrix_uuid,
        matrix_type=TestMatrixType,
    )
    ranks = tuple(row for row in predictor.db_engine.execute(
        f'''
        select entity_id, as_of_date, rank_abs_no_ties
        from {matrix_type}_results.predictions
        where model_id = %s and matrix_uuid = %s
        order by rank_abs_no_ties''', (
        model_id, matrix_uuid)))
    assert ranks == expected_result