def test_n_jobs_not_new_model(): grid_config = { 'sklearn.ensemble.AdaBoostClassifier': { 'n_estimators': [10, 100, 1000] }, 'sklearn.ensemble.RandomForestClassifier': { 'n_estimators': [10, 100], 'max_features': ['sqrt', 'log2'], 'max_depth': [5, 10, 15, 20], 'criterion': ['gini', 'entropy'], 'n_jobs': [12, 24], } } with testing.postgresql.Postgresql() as postgresql: engine = create_engine(postgresql.url()) ensure_db(engine) with mock_s3(): s3_conn = boto3.resource('s3') s3_conn.create_bucket(Bucket='econ-dev') trainer = ModelTrainer( project_path='econ-dev/inspections', experiment_hash=None, model_storage_engine=S3ModelStorageEngine('econ-dev/inspections'), db_engine=engine, model_group_keys=['label_name', 'label_timespan'] ) matrix = pandas.DataFrame.from_dict({ 'entity_id': [1, 2], 'feature_one': [3, 4], 'feature_two': [5, 6], 'label': ['good', 'bad'] }) train_tasks = trainer.generate_train_tasks( grid_config, dict(), InMemoryMatrixStore(matrix, { 'label_timespan': '1d', 'end_time': datetime.datetime.now(), 'feature_start_time': datetime.date(2012, 12, 20), 'label_name': 'label', 'metta-uuid': '1234', 'feature_names': ['ft1', 'ft2'], 'indices': ['entity_id'], }) ) assert len(train_tasks) == 35 # 32+3, would be (32*2)+3 if we didn't remove assert len([ task for task in train_tasks if 'n_jobs' in task['parameters'] ]) == 32 for train_task in train_tasks: trainer.process_train_task(**train_task) for row in engine.execute( 'select model_parameters from results.model_groups' ): assert 'n_jobs' not in row[0]
def test_retry_recovery(self):
    db_engine = None
    trainer = None
    port = None
    with rig_engines() as (db_engine, project_storage):
        port = db_engine.url.port
        trainer = ModelTrainer(
            experiment_hash=None,
            model_storage_engine=project_storage.model_storage_engine(),
            db_engine=db_engine,
            model_grouper=ModelGrouper(),
        )
        matrix_store = get_matrix_store(project_storage)

    # start without a database server
    # then bring it back up after the first sleep
    # use self so it doesn't go out of scope too early and shut down
    self.new_server = None

    def replace_db(arg):
        self.new_server = testing.postgresql.Postgresql(port=port)
        db_engine = create_engine(self.new_server.url())
        ensure_db(db_engine)
        init_engine(db_engine)
        get_matrix_store(project_storage)

    with patch("time.sleep") as time_mock:
        time_mock.side_effect = replace_db
        try:
            trainer.train_models(grid_config(), dict(), matrix_store)
        finally:
            if self.new_server is not None:
                self.new_server.stop()
        assert len(time_mock.mock_calls) == 1
def test_baseline_exception_handling(sample_matrix_store): grid_config = { 'triage.component.catwalk.baselines.rankers.PercentileRankOneFeature': { 'feature': ['feature_one', 'feature_three'] } } with testing.postgresql.Postgresql() as postgresql: db_engine = create_engine(postgresql.url()) project_path = 'econ-dev/inspections' model_storage_engine = S3ModelStorageEngine(project_path) ensure_db(db_engine) init_engine(db_engine) with mock_s3(): s3_conn = boto3.resource('s3') s3_conn.create_bucket(Bucket='econ-dev') trainer = ModelTrainer(project_path='econ-dev/inspections', experiment_hash=None, model_storage_engine=model_storage_engine, db_engine=db_engine, model_grouper=ModelGrouper()) train_tasks = trainer.generate_train_tasks(grid_config, dict(), sample_matrix_store) # Creates a matrix entry in the matrices table with uuid from train_metadata MatrixFactory(matrix_uuid="1234") session.commit() model_ids = [] for train_task in train_tasks: model_ids.append(trainer.process_train_task(**train_task)) assert model_ids == [1, None]
def test_custom_groups(sample_matrix_store, grid_config): with testing.postgresql.Postgresql() as postgresql: engine = create_engine(postgresql.url()) ensure_db(engine) init_engine(engine) with mock_s3(): s3_conn = boto3.resource('s3') s3_conn.create_bucket(Bucket='econ-dev') MatrixFactory(matrix_uuid="1234") session.commit() # create training set project_path = 'econ-dev/inspections' model_storage_engine = S3ModelStorageEngine(project_path) trainer = ModelTrainer( project_path=project_path, experiment_hash=None, model_storage_engine=model_storage_engine, model_grouper=ModelGrouper(['class_path']), db_engine=engine, ) model_ids = trainer.train_models(grid_config=grid_config, misc_db_parameters=dict(), matrix_store=sample_matrix_store) # expect only one model group now records = [ row[0] for row in engine.execute( 'select distinct model_group_id from model_metadata.models' ) ] assert len(records) == 1 assert records[0] == model_ids[0]
def test_retry_max(self): db_engine = None trainer = None # set up a basic model training run # TODO abstract the setup of a basic model training run where # we don't worry about the specific values used? it would make # tests like this require a bit less noise to read past with testing.postgresql.Postgresql() as postgresql: db_engine = create_engine(postgresql.url()) ensure_db(db_engine) init_engine(db_engine) trainer = ModelTrainer( project_path='econ-dev/inspections', experiment_hash=None, model_storage_engine=InMemoryModelStorageEngine( project_path=''), db_engine=db_engine, model_grouper=ModelGrouper()) # the postgres server goes out of scope here and thus no longer exists with patch('time.sleep') as time_mock: with self.assertRaises(sqlalchemy.exc.OperationalError): trainer.train_models(grid_config(), dict(), sample_matrix_store()) # we want to make sure that we are using the retrying module sanely # as opposed to matching the exact # of calls specified by the code assert len(time_mock.mock_calls) > 5
def test_custom_groups(grid_config, db_engine_with_results_schema, project_storage):
    model_storage_engine = project_storage.model_storage_engine()
    experiment_hash = save_experiment_and_get_hash(
        config={'foo': 'bar'},
        db_engine=db_engine_with_results_schema,
    )
    run_id = initialize_tracking_and_get_run_id(
        experiment_hash,
        experiment_class_path="",
        random_seed=5,
        experiment_kwargs={},
        db_engine=db_engine_with_results_schema,
    )
    trainer = ModelTrainer(
        experiment_hash=experiment_hash,
        model_storage_engine=model_storage_engine,
        model_grouper=ModelGrouper(["class_path"]),
        db_engine=db_engine_with_results_schema,
        run_id=run_id,
    )
    # create training set
    model_ids = trainer.train_models(
        grid_config=grid_config,
        misc_db_parameters=dict(),
        matrix_store=get_matrix_store(project_storage),
    )
    # expect only one model group now
    records = [
        row[0]
        for row in db_engine_with_results_schema.execute(
            "select distinct model_group_id from triage_metadata.models"
        )
    ]
    assert len(records) == 1
    assert records[0] == model_ids[0]
def test_retry_recovery(self): grid_config = { 'sklearn.ensemble.AdaBoostClassifier': { 'n_estimators': [10] }, } engine = None trainer = None port = None with testing.postgresql.Postgresql() as postgresql: port = postgresql.settings['port'] engine = create_engine(postgresql.url()) ensure_db(engine) trainer = ModelTrainer( project_path='econ-dev/inspections', experiment_hash=None, model_storage_engine=InMemoryModelStorageEngine(project_path=''), db_engine=engine, model_group_keys=['label_name', 'label_timespan'] ) matrix = pandas.DataFrame.from_dict({ 'entity_id': [1, 2], 'feature_one': [3, 4], 'feature_two': [5, 6], 'label': ['good', 'bad'] }) matrix_store = InMemoryMatrixStore(matrix, { 'label_timespan': '1d', 'end_time': datetime.datetime.now(), 'feature_start_time': datetime.date(2012, 12, 20), 'label_name': 'label', 'metta-uuid': '1234', 'feature_names': ['ft1', 'ft2'], 'indices': ['entity_id'], }) # start without a database server # then bring it back up after the first sleep # use self so it doesn't go out of scope too early and shut down self.new_server = None def replace_db(arg): self.new_server = testing.postgresql.Postgresql(port=port) engine = create_engine(self.new_server.url()) ensure_db(engine) with patch('time.sleep') as time_mock: time_mock.side_effect = replace_db try: trainer.train_models(grid_config, dict(), matrix_store) finally: if self.new_server is not None: self.new_server.stop() assert len(time_mock.mock_calls) == 1
def test_n_jobs_not_new_model(sample_matrix_store): grid_config = { 'sklearn.ensemble.AdaBoostClassifier': { 'n_estimators': [10, 100, 1000] }, 'sklearn.ensemble.RandomForestClassifier': { 'n_estimators': [10, 100], 'max_features': ['sqrt', 'log2'], 'max_depth': [5, 10, 15, 20], 'criterion': ['gini', 'entropy'], 'n_jobs': [12, 24], } } with testing.postgresql.Postgresql() as postgresql: db_engine = create_engine(postgresql.url()) ensure_db(db_engine) init_engine(db_engine) with mock_s3(): s3_conn = boto3.resource('s3') s3_conn.create_bucket(Bucket='econ-dev') trainer = ModelTrainer(project_path='econ-dev/inspections', experiment_hash=None, model_storage_engine=S3ModelStorageEngine( 'econ-dev/inspections'), db_engine=db_engine, model_grouper=ModelGrouper()) train_tasks = trainer.generate_train_tasks( grid_config, dict(), sample_matrix_store, ) # Creates a matrix entry in the matrices table with uuid from train_metadata MatrixFactory(matrix_uuid="1234") session.commit() assert len(train_tasks ) == 35 # 32+3, would be (32*2)+3 if we didn't remove assert len([ task for task in train_tasks if 'n_jobs' in task['parameters'] ]) == 32 for train_task in train_tasks: trainer.process_train_task(**train_task) for row in db_engine.execute( 'select model_parameters from model_metadata.model_groups' ): assert 'n_jobs' not in row[0]
def test_retry_max(self): grid_config = { 'sklearn.ensemble.AdaBoostClassifier': { 'n_estimators': [10] }, } engine = None trainer = None # set up a basic model training run # TODO abstract the setup of a basic model training run where # we don't worry about the specific values used? it would make # tests like this require a bit less noise to read past with testing.postgresql.Postgresql() as postgresql: engine = create_engine(postgresql.url()) ensure_db(engine) trainer = ModelTrainer( project_path='econ-dev/inspections', experiment_hash=None, model_storage_engine=InMemoryModelStorageEngine(project_path=''), db_engine=engine, model_group_keys=['label_name', 'label_timespan'] ) matrix = pandas.DataFrame.from_dict({ 'entity_id': [1, 2], 'feature_one': [3, 4], 'feature_two': [5, 6], 'label': ['good', 'bad'] }) matrix_store = InMemoryMatrixStore(matrix, { 'label_timespan': '1d', 'end_time': datetime.datetime.now(), 'feature_start_time': datetime.date(2012, 12, 20), 'label_name': 'label', 'metta-uuid': '1234', 'feature_names': ['ft1', 'ft2'], 'indices': ['entity_id'], }) # the postgres server goes out of scope here and thus no longer exists with patch('time.sleep') as time_mock: with self.assertRaises(sqlalchemy.exc.OperationalError): trainer.train_models(grid_config, dict(), matrix_store) # we want to make sure that we are using the retrying module sanely # as opposed to matching the exact # of calls specified by the code assert len(time_mock.mock_calls) > 5
def test_baseline_exception_handling(): grid_config = { 'triage.component.catwalk.baselines.rankers.PercentileRankOneFeature': { 'feature': ['feature_one', 'feature_three'] } } with testing.postgresql.Postgresql() as postgresql: engine = create_engine(postgresql.url()) project_path = 'econ-dev/inspections' model_storage_engine = S3ModelStorageEngine(project_path) ensure_db(engine) with mock_s3(): s3_conn = boto3.resource('s3') s3_conn.create_bucket(Bucket='econ-dev') trainer = ModelTrainer( project_path='econ-dev/inspections', experiment_hash=None, model_storage_engine = model_storage_engine, db_engine=engine, model_group_keys=['label_name', 'label_timespan'] ) matrix = pandas.DataFrame.from_dict({ 'entity_id': [1, 2], 'feature_one': [3, 4], 'feature_two': [5, 6], 'label': ['good', 'bad'] }) train_tasks = trainer.generate_train_tasks( grid_config, dict(), InMemoryMatrixStore(matrix, { 'label_timespan': '1d', 'end_time': datetime.datetime.now(), 'feature_start_time': datetime.date(2012, 12, 20), 'label_name': 'label', 'metta-uuid': '1234', 'feature_names': ['ft1', 'ft2'], 'indices': ['entity_id'], }) ) model_ids = [] for train_task in train_tasks: model_ids.append(trainer.process_train_task(**train_task)) assert model_ids == [1, None]
def default_model_trainer(db_engine_with_results_schema, project_storage):
    model_storage_engine = project_storage.model_storage_engine()
    trainer = ModelTrainer(
        experiment_hash=None,
        model_storage_engine=model_storage_engine,
        db_engine=db_engine_with_results_schema,
        model_grouper=ModelGrouper(),
    )
    yield trainer
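# A minimal usage sketch (not part of the original suite) showing how a test
# might consume the fixture above. The grid_config fixture and the
# get_matrix_store helper are assumed from elsewhere in this module; the
# "model_hash" and "parameters" keys are the ones the trainer's tasks carry in
# the tests below.
def test_generate_train_tasks_smoke_sketch(default_model_trainer, grid_config, project_storage):
    tasks = default_model_trainer.generate_train_tasks(
        grid_config, dict(), get_matrix_store(project_storage)
    )
    assert all("model_hash" in task and "parameters" in task for task in tasks)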
def test_retry_max(self):
    db_engine = None
    trainer = None
    # set up a basic model training run
    with rig_engines() as (db_engine, project_storage):
        trainer = ModelTrainer(
            experiment_hash=None,
            model_storage_engine=project_storage.model_storage_engine(),
            db_engine=db_engine,
            model_grouper=ModelGrouper(),
        )
        matrix_store = get_matrix_store(project_storage)

    # the postgres server goes out of scope here and thus no longer exists
    with patch('time.sleep') as time_mock:
        with self.assertRaises(sqlalchemy.exc.OperationalError):
            trainer.train_models(grid_config(), dict(), matrix_store)
        # we want to make sure that we are using the retrying module sanely
        # as opposed to matching the exact # of calls specified by the code
        assert len(time_mock.mock_calls) > 5
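# The retry tests above patch time.sleep and count its calls rather than pin an
# exact retry count. A minimal sketch, assuming the `retrying` package rather
# than the actual catwalk decorator, of the kind of wrapper such save logic
# typically uses: transient OperationalErrors trigger a bounded number of
# sleeps before the exception finally propagates.
from retrying import retry
import sqlalchemy.exc


def _is_operational_error(exception):
    # Only retry on database connectivity problems.
    return isinstance(exception, sqlalchemy.exc.OperationalError)


@retry(
    retry_on_exception=_is_operational_error,
    stop_max_attempt_number=10,   # more than 5 sleeps, matching the assertion above
    wait_fixed=1000,              # each wait goes through time.sleep
)
def save_with_retries_sketch(db_engine, statement):
    with db_engine.begin() as conn:
        conn.execute(statement)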
def test_retry_recovery(self): db_engine = None trainer = None port = None with testing.postgresql.Postgresql() as postgresql: port = postgresql.settings['port'] db_engine = create_engine(postgresql.url()) ensure_db(db_engine) init_engine(db_engine) trainer = ModelTrainer( project_path='econ-dev/inspections', experiment_hash=None, model_storage_engine=InMemoryModelStorageEngine( project_path=''), db_engine=db_engine, model_grouper=ModelGrouper()) # start without a database server # then bring it back up after the first sleep # use self so it doesn't go out of scope too early and shut down self.new_server = None def replace_db(arg): self.new_server = testing.postgresql.Postgresql(port=port) db_engine = create_engine(self.new_server.url()) ensure_db(db_engine) init_engine(db_engine) # Creates a matrix entry in the matrices table with uuid from train_metadata MatrixFactory(matrix_uuid="1234") session.commit() with patch('time.sleep') as time_mock: time_mock.side_effect = replace_db try: trainer.train_models(grid_config(), dict(), sample_matrix_store()) finally: if self.new_server is not None: self.new_server.stop() assert len(time_mock.mock_calls) == 1
def test_custom_groups(grid_config):
    with rig_engines() as (db_engine, project_storage):
        # create training set
        model_storage_engine = project_storage.model_storage_engine()
        trainer = ModelTrainer(
            experiment_hash=None,
            model_storage_engine=model_storage_engine,
            model_grouper=ModelGrouper(['class_path']),
            db_engine=db_engine,
        )
        model_ids = trainer.train_models(
            grid_config=grid_config,
            misc_db_parameters=dict(),
            matrix_store=get_matrix_store(project_storage))
        # expect only one model group now
        records = [
            row[0]
            for row in db_engine.execute(
                'select distinct model_group_id from model_metadata.models')
        ]
        assert len(records) == 1
        assert records[0] == model_ids[0]
def test_baseline_exception_handling():
    grid_config = {
        'triage.component.catwalk.baselines.rankers.PercentileRankOneFeature': {
            'feature': ['feature_one', 'feature_three']
        }
    }
    with rig_engines() as (db_engine, project_storage):
        trainer = ModelTrainer(
            experiment_hash=None,
            model_storage_engine=project_storage.model_storage_engine(),
            db_engine=db_engine,
            model_grouper=ModelGrouper(),
        )
        train_tasks = trainer.generate_train_tasks(
            grid_config, dict(), get_matrix_store(project_storage))

        model_ids = []
        for train_task in train_tasks:
            model_ids.append(trainer.process_train_task(**train_task))
        assert model_ids == [1, None]
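# Why the test expects [1, None]: the grid asks PercentileRankOneFeature to
# rank on 'feature_one' (present in the matrix) and 'feature_three' (absent),
# so the first task trains normally and the second raises. A hedged sketch of
# the behaviour being exercised -- hypothetical names, not the real
# process_train_task body -- in which a baseline that blows up is logged and
# reported as None instead of aborting the rest of the grid:
import logging


def process_train_task_sketch(fit_model, **train_task):
    try:
        return fit_model(**train_task)  # returns a model_id on success
    except Exception as exc:  # e.g. the missing 'feature_three' column
        logging.warning("Baseline task failed, recording None: %s", exc)
        return None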
def test_n_jobs_not_new_model(): grid_config = { 'sklearn.ensemble.AdaBoostClassifier': { 'n_estimators': [10, 100, 1000] }, 'sklearn.ensemble.RandomForestClassifier': { 'n_estimators': [10, 100], 'max_features': ['sqrt', 'log2'], 'max_depth': [5, 10, 15, 20], 'criterion': ['gini', 'entropy'], 'n_jobs': [12, 24], } } with rig_engines() as (db_engine, project_storage): model_storage_engine = project_storage.model_storage_engine() trainer = ModelTrainer(experiment_hash=None, model_storage_engine=model_storage_engine, db_engine=db_engine, model_grouper=ModelGrouper()) train_tasks = trainer.generate_train_tasks( grid_config, dict(), get_matrix_store(project_storage), ) assert len( train_tasks) == 35 # 32+3, would be (32*2)+3 if we didn't remove assert len([ task for task in train_tasks if 'n_jobs' in task['parameters'] ]) == 32 for train_task in train_tasks: trainer.process_train_task(**train_task) for row in db_engine.execute( 'select hyperparameters from model_metadata.model_groups'): assert 'n_jobs' not in row[0]
def test_custom_groups(grid_config, db_engine_with_results_schema, project_storage):
    model_storage_engine = project_storage.model_storage_engine()
    trainer = ModelTrainer(
        experiment_hash=None,
        model_storage_engine=model_storage_engine,
        model_grouper=ModelGrouper(["class_path"]),
        db_engine=db_engine_with_results_schema,
    )
    # create training set
    model_ids = trainer.train_models(
        grid_config=grid_config,
        misc_db_parameters=dict(),
        matrix_store=get_matrix_store(project_storage),
    )
    # expect only one model group now
    records = [
        row[0]
        for row in db_engine_with_results_schema.execute(
            "select distinct model_group_id from triage_metadata.models")
    ]
    assert len(records) == 1
    assert records[0] == model_ids[0]
def test_n_jobs_not_new_model():
    grid_config = {
        "sklearn.ensemble.AdaBoostClassifier": {
            "n_estimators": [10, 100, 1000]
        },
        "sklearn.ensemble.RandomForestClassifier": {
            "n_estimators": [10, 100],
            "max_features": ["sqrt", "log2"],
            "max_depth": [5, 10, 15, 20],
            "criterion": ["gini", "entropy"],
            "n_jobs": [12, 24],
        },
    }
    with rig_engines() as (db_engine, project_storage):
        model_storage_engine = project_storage.model_storage_engine()
        trainer = ModelTrainer(
            experiment_hash=None,
            model_storage_engine=model_storage_engine,
            db_engine=db_engine,
            model_grouper=ModelGrouper(),
        )
        train_tasks = trainer.generate_train_tasks(
            grid_config, dict(), get_matrix_store(project_storage))

        assert len(train_tasks) == 35  # 32+3, would be (32*2)+3 if we didn't remove
        assert len([
            task for task in train_tasks if "n_jobs" in task["parameters"]
        ]) == 32

        for train_task in train_tasks:
            trainer.process_train_task(**train_task)

        for row in db_engine.execute(
                "select hyperparameters from model_metadata.model_groups"):
            assert "n_jobs" not in row[0]
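# How the 35 comes about: AdaBoost contributes 3 combinations and RandomForest
# 2 * 2 * 4 * 2 = 32; the two n_jobs values would double the RF count to 64 if
# n_jobs were treated as a real hyperparameter. A hedged sketch -- hypothetical
# helper, not the actual catwalk grid flattening -- of collapsing a grid so
# that parameter sets differing only in n_jobs map to a single task and a
# single model group:
from itertools import product


def flatten_grid_without_n_jobs_sketch(grid_config):
    seen = set()
    tasks = []
    for class_path, param_grid in grid_config.items():
        names = sorted(param_grid)
        for values in product(*(param_grid[name] for name in names)):
            parameters = dict(zip(names, values))
            group_key = (class_path, tuple(sorted(
                (k, v) for k, v in parameters.items() if k != "n_jobs"
            )))
            if group_key in seen:
                continue  # differs only by n_jobs: same model, same group
            seen.add(group_key)
            tasks.append({"class_path": class_path, "parameters": parameters})
    return tasks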
def default_model_trainer(db_engine_with_results_schema, project_storage):
    model_storage_engine = project_storage.model_storage_engine()
    experiment_hash = save_experiment_and_get_hash(
        config={'foo': 'bar'},
        db_engine=db_engine_with_results_schema,
    )
    run_id = initialize_tracking_and_get_run_id(
        experiment_hash,
        experiment_class_path="",
        random_seed=5,
        experiment_kwargs={},
        db_engine=db_engine_with_results_schema,
    )
    trainer = ModelTrainer(
        experiment_hash=experiment_hash,
        model_storage_engine=model_storage_engine,
        db_engine=db_engine_with_results_schema,
        model_grouper=ModelGrouper(),
        run_id=run_id,
    )
    yield trainer
def initialize_components(self): split_config = self.config['temporal_config'] self.chopper = Timechop( feature_start_time=dt_from_str(split_config['feature_start_time']), feature_end_time=dt_from_str(split_config['feature_end_time']), label_start_time=dt_from_str(split_config['label_start_time']), label_end_time=dt_from_str(split_config['label_end_time']), model_update_frequency=split_config['model_update_frequency'], training_label_timespans=split_config['training_label_timespans'], test_label_timespans=split_config['test_label_timespans'], training_as_of_date_frequencies=split_config[ 'training_as_of_date_frequencies'], test_as_of_date_frequencies=split_config[ 'test_as_of_date_frequencies'], max_training_histories=split_config['max_training_histories'], test_durations=split_config['test_durations'], ) cohort_config = self.config.get('cohort_config', {}) if 'query' in cohort_config: self.state_table_generator = StateTableGeneratorFromQuery( experiment_hash=self.experiment_hash, db_engine=self.db_engine, query=cohort_config['query']) elif 'entities_table' in cohort_config: self.state_table_generator = StateTableGeneratorFromEntities( experiment_hash=self.experiment_hash, db_engine=self.db_engine, entities_table=cohort_config['entities_table']) elif 'dense_states' in cohort_config: self.state_table_generator = StateTableGeneratorFromDense( experiment_hash=self.experiment_hash, db_engine=self.db_engine, dense_state_table=cohort_config['dense_states']['table_name']) else: raise ValueError('Cohort config missing or unrecognized') self.label_generator = LabelGenerator( label_name=self.config['label_config'].get('name', None), query=self.config['label_config']['query'], db_engine=self.db_engine, ) self.feature_dictionary_creator = FeatureDictionaryCreator( features_schema_name=self.features_schema_name, db_engine=self.db_engine, ) self.feature_generator = FeatureGenerator( features_schema_name=self.features_schema_name, replace=self.replace, db_engine=self.db_engine, feature_start_time=split_config['feature_start_time']) self.feature_group_creator = FeatureGroupCreator( self.config.get('feature_group_definition', {'all': [True]})) self.feature_group_mixer = FeatureGroupMixer( self.config.get('feature_group_strategies', ['all'])) self.planner = Planner( feature_start_time=dt_from_str(split_config['feature_start_time']), label_names=[ self.config.get('label_config', {}).get('name', DEFAULT_LABEL_NAME) ], label_types=['binary'], matrix_directory=self.matrices_directory, cohort_name=self.config.get('cohort_config', {}).get('name', None), states=self.config.get('cohort_config', {}).get('dense_states', {}).get('state_filters', []), user_metadata=self.config.get('user_metadata', {}), ) self.matrix_builder = HighMemoryCSVBuilder( db_config={ 'features_schema_name': self.features_schema_name, 'labels_schema_name': 'public', 'labels_table_name': self.labels_table_name, # TODO: have planner/builder take state table later on, so we # can grab it from the StateTableGenerator instead of # duplicating it here 'sparse_state_table_name': 'tmp_sparse_states_{}'.format(self.experiment_hash), }, matrix_directory=self.matrices_directory, include_missing_labels_in_train_as=self.config['label_config'].get( 'include_missing_labels_in_train_as', None), engine=self.db_engine, replace=self.replace) self.trainer = ModelTrainer( project_path=self.project_path, experiment_hash=self.experiment_hash, model_storage_engine=self.model_storage_engine, model_grouper=ModelGrouper(self.config.get('model_group_keys', [])), 
db_engine=self.db_engine, replace=self.replace) self.tester = ModelTester( model_storage_engine=self.model_storage_engine, project_path=self.project_path, replace=self.replace, db_engine=self.db_engine, individual_importance_config=self.config.get( 'individual_importance', {}), evaluator_config=self.config.get('scoring', {}))
def test_model_trainer(sample_matrix_store, grid_config): with testing.postgresql.Postgresql() as postgresql: db_engine = create_engine(postgresql.url()) ensure_db(db_engine) init_engine(db_engine) with mock_s3(): s3_conn = boto3.resource('s3') s3_conn.create_bucket(Bucket='econ-dev') # Creates a matrix entry in the matrices table with uuid from metadata above MatrixFactory(matrix_uuid="1234") session.commit() project_path = 'econ-dev/inspections' model_storage_engine = S3ModelStorageEngine(project_path) trainer = ModelTrainer( project_path=project_path, experiment_hash=None, model_storage_engine=model_storage_engine, model_grouper=ModelGrouper(), db_engine=db_engine, ) model_ids = trainer.train_models(grid_config=grid_config, misc_db_parameters=dict(), matrix_store=sample_matrix_store) # assert # 1. that the models and feature importances table entries are present records = [ row for row in db_engine.execute( 'select * from train_results.feature_importances') ] assert len(records) == 4 * 2 # maybe exclude entity_id? yes records = [ row for row in db_engine.execute( 'select model_hash from model_metadata.models') ] assert len(records) == 4 hashes = [row[0] for row in records] # 2. that the model groups are distinct records = [ row for row in db_engine.execute( 'select distinct model_group_id from model_metadata.models' ) ] assert len(records) == 4 # 3. that the model sizes are saved in the table and all are < 1 kB records = [ row for row in db_engine.execute( 'select model_size from model_metadata.models') ] assert len(records) == 4 for i in records: size = i[0] assert size < 1 # 4. that all four models are cached model_pickles = [ model_storage_engine.get_store(model_hash).load() for model_hash in hashes ] assert len(model_pickles) == 4 assert len([x for x in model_pickles if x is not None]) == 4 # 5. that their results can have predictions made on it test_matrix = pandas.DataFrame.from_dict({ 'entity_id': [3, 4], 'feature_one': [4, 4], 'feature_two': [6, 5], }) test_matrix = InMemoryMatrixStore(matrix=test_matrix, metadata=sample_metadata())\ .matrix for model_pickle in model_pickles: predictions = model_pickle.predict(test_matrix) assert len(predictions) == 2 # 6. when run again, same models are returned new_model_ids = trainer.train_models( grid_config=grid_config, misc_db_parameters=dict(), matrix_store=sample_matrix_store) assert len([ row for row in db_engine.execute( 'select model_hash from model_metadata.models') ]) == 4 assert model_ids == new_model_ids # 7. 
if replace is set, update non-unique attributes and feature importances max_batch_run_time = [ row[0] for row in db_engine.execute( 'select max(batch_run_time) from model_metadata.models') ][0] trainer = ModelTrainer( project_path=project_path, experiment_hash=None, model_storage_engine=model_storage_engine, model_grouper=ModelGrouper( model_group_keys=['label_name', 'label_timespan']), db_engine=db_engine, replace=True) new_model_ids = trainer.train_models( grid_config=grid_config, misc_db_parameters=dict(), matrix_store=sample_matrix_store, ) assert model_ids == new_model_ids assert [ row['model_id'] for row in db_engine.execute( 'select model_id from model_metadata.models order by 1 asc' ) ] == model_ids new_max_batch_run_time = [ row[0] for row in db_engine.execute( 'select max(batch_run_time) from model_metadata.models') ][0] assert new_max_batch_run_time > max_batch_run_time records = [ row for row in db_engine.execute( 'select * from train_results.feature_importances') ] assert len(records) == 4 * 2 # maybe exclude entity_id? yes # 8. if the cache is missing but the metadata is still there, reuse the metadata for row in db_engine.execute( 'select model_hash from model_metadata.models'): model_storage_engine.get_store(row[0]).delete() new_model_ids = trainer.train_models( grid_config=grid_config, misc_db_parameters=dict(), matrix_store=sample_matrix_store) assert model_ids == sorted(new_model_ids) # 9. that the generator interface works the same way new_model_ids = trainer.generate_trained_models( grid_config=grid_config, misc_db_parameters=dict(), matrix_store=sample_matrix_store) assert model_ids == \ sorted([model_id for model_id in new_model_ids])
class ExperimentBase(ABC): """The base class for all Experiments. Subclasses must implement the following four methods: process_query_tasks process_matrix_build_tasks process_train_tasks process_model_test_tasks Look at singlethreaded.py for reference implementation of each. Args: config (dict) db_engine (triage.util.db.SerializableDbEngine or sqlalchemy.engine.Engine) project_path (string) replace (bool) cleanup_timeout (int) materialize_subquery_fromobjs (bool, default True) Whether or not to create and index tables for feature "from objects" that are subqueries. Can speed up performance when building features for many as-of-dates. profile (bool) """ cleanup_timeout = 60 # seconds def __init__( self, config, db_engine, project_path=None, matrix_storage_class=CSVMatrixStore, replace=True, cleanup=False, cleanup_timeout=None, materialize_subquery_fromobjs=True, profile=False, ): self._check_config_version(config) self.config = config self.project_storage = ProjectStorage(project_path) self.model_storage_engine = ModelStorageEngine(self.project_storage) self.matrix_storage_engine = MatrixStorageEngine( self.project_storage, matrix_storage_class ) self.project_path = project_path self.replace = replace self.db_engine = db_engine upgrade_db(db_engine=self.db_engine) self.features_schema_name = "features" self.materialize_subquery_fromobjs = materialize_subquery_fromobjs self.experiment_hash = save_experiment_and_get_hash(self.config, self.db_engine) self.labels_table_name = "labels_{}".format(self.experiment_hash) self.cohort_table_name = "cohort_{}".format(self.experiment_hash) self.initialize_components() self.cleanup = cleanup if self.cleanup: logging.info( "cleanup is set to True, so intermediate tables (labels and states) " "will be removed after matrix creation" ) else: logging.info( "cleanup is set to False, so intermediate tables (labels and states) " "will not be removed after matrix creation" ) self.cleanup_timeout = ( self.cleanup_timeout if cleanup_timeout is None else cleanup_timeout ) self.profile = profile logging.info("Generate profiling stats? (profile option): %s", self.profile) def _check_config_version(self, config): if "config_version" in config: config_version = config["config_version"] else: logging.warning( "config_version key not found in experiment config. " "Assuming v1, which may not be correct" ) config_version = "v1" if config_version != CONFIG_VERSION: raise ValueError( "Experiment config '{}' " "does not match current version '{}'. " "Will not run experiment.".format(config_version, CONFIG_VERSION) ) def initialize_components(self): split_config = self.config["temporal_config"] self.chopper = Timechop(**split_config) cohort_config = self.config.get("cohort_config", {}) if "query" in cohort_config: self.cohort_table_generator = CohortTableGenerator( cohort_table_name=self.cohort_table_name, db_engine=self.db_engine, query=cohort_config["query"], replace=self.replace ) else: logging.warning( "cohort_config missing or unrecognized. Without a cohort, " "you will not be able to make matrices or perform feature imputation." ) self.cohort_table_generator = CohortTableGeneratorNoOp() if "label_config" in self.config: self.label_generator = LabelGenerator( label_name=self.config["label_config"].get("name", None), query=self.config["label_config"]["query"], replace=self.replace, db_engine=self.db_engine, ) else: self.label_generator = LabelGeneratorNoOp() logging.warning( "label_config missing or unrecognized. Without labels, " "you will not be able to make matrices." 
) self.feature_dictionary_creator = FeatureDictionaryCreator( features_schema_name=self.features_schema_name, db_engine=self.db_engine ) self.feature_generator = FeatureGenerator( features_schema_name=self.features_schema_name, replace=self.replace, db_engine=self.db_engine, feature_start_time=split_config["feature_start_time"], materialize_subquery_fromobjs=self.materialize_subquery_fromobjs ) self.feature_group_creator = FeatureGroupCreator( self.config.get("feature_group_definition", {"all": [True]}) ) self.feature_group_mixer = FeatureGroupMixer( self.config.get("feature_group_strategies", ["all"]) ) self.planner = Planner( feature_start_time=dt_from_str(split_config["feature_start_time"]), label_names=[ self.config.get("label_config", {}).get("name", DEFAULT_LABEL_NAME) ], label_types=["binary"], cohort_names=[self.config.get("cohort_config", {}).get("name", None)], user_metadata=self.config.get("user_metadata", {}), ) self.matrix_builder = MatrixBuilder( db_config={ "features_schema_name": self.features_schema_name, "labels_schema_name": "public", "labels_table_name": self.labels_table_name, "cohort_table_name": self.cohort_table_name, }, matrix_storage_engine=self.matrix_storage_engine, experiment_hash=self.experiment_hash, include_missing_labels_in_train_as=self.config.get("label_config", {}).get( "include_missing_labels_in_train_as", None ), engine=self.db_engine, replace=self.replace, ) self.trainer = ModelTrainer( experiment_hash=self.experiment_hash, model_storage_engine=self.model_storage_engine, model_grouper=ModelGrouper(self.config.get("model_group_keys", [])), db_engine=self.db_engine, replace=self.replace, ) self.tester = ModelTester( model_storage_engine=self.model_storage_engine, matrix_storage_engine=self.matrix_storage_engine, replace=self.replace, db_engine=self.db_engine, individual_importance_config=self.config.get("individual_importance", {}), evaluator_config=self.config.get("scoring", {}), ) @cachedproperty def split_definitions(self): """Temporal splits based on the experiment's configuration Returns: (dict) temporal splits Example: ``` { 'feature_start_time': {datetime}, 'feature_end_time': {datetime}, 'label_start_time': {datetime}, 'label_end_time': {datetime}, 'train_matrix': { 'first_as_of_time': {datetime}, 'last_as_of_time': {datetime}, 'matrix_info_end_time': {datetime}, 'training_label_timespan': {str}, 'training_as_of_date_frequency': {str}, 'max_training_history': {str}, 'as_of_times': [list of {datetime}s] }, 'test_matrices': [list of matrix defs similar to train_matrix] } ``` (When updating/setting split definitions, matrices should have UUIDs.) """ split_definitions = self.chopper.chop_time() logging.info("Computed and stored split definitions: %s", split_definitions) logging.info("\n----TIME SPLIT SUMMARY----\n") logging.info("Number of time splits: {}".format(len(split_definitions))) for split_index, split in enumerate(split_definitions): train_times = split["train_matrix"]["as_of_times"] test_times = [ as_of_time for test_matrix in split["test_matrices"] for as_of_time in test_matrix["as_of_times"] ] logging.info( """Split index {}: Training as_of_time_range: {} to {} ({} total) Testing as_of_time range: {} to {} ({} total)\n\n""".format( split_index, min(train_times), max(train_times), len(train_times), min(test_times), max(test_times), len(test_times), ) ) return split_definitions @cachedproperty def all_as_of_times(self): """All 'as of times' in experiment config Used for label and feature generation. 
Returns: (list) of datetimes """ all_as_of_times = [] for split in self.split_definitions: all_as_of_times.extend(split["train_matrix"]["as_of_times"]) logging.debug( "Adding as_of_times from train matrix: %s", split["train_matrix"]["as_of_times"], ) for test_matrix in split["test_matrices"]: logging.debug( "Adding as_of_times from test matrix: %s", test_matrix["as_of_times"], ) all_as_of_times.extend(test_matrix["as_of_times"]) logging.info( "Computed %s total as_of_times for label and feature generation", len(all_as_of_times), ) distinct_as_of_times = list(set(all_as_of_times)) logging.info( "Computed %s distinct as_of_times for label and feature generation", len(distinct_as_of_times), ) logging.info( "You can view all as_of_times by inspecting `.all_as_of_times` on this Experiment" ) return distinct_as_of_times @cachedproperty def collate_aggregations(self): """Collation of ``Aggregation`` objects used by this experiment. Returns: (list) of ``collate.Aggregation`` objects """ logging.info("Creating collate aggregations") if "feature_aggregations" not in self.config: logging.warning("No feature_aggregation config is available") return [] return self.feature_generator.aggregations( feature_aggregation_config=self.config["feature_aggregations"], feature_dates=self.all_as_of_times, state_table=self.cohort_table_name, ) @cachedproperty def feature_aggregation_table_tasks(self): """All feature table query tasks specified by this ``Experiment``. Returns: (dict) keys are group table names, values are themselves dicts, each with keys for different stages of table creation (prepare, inserts, finalize) and with values being lists of SQL commands """ logging.info( "Calculating feature tasks for %s as_of_times", len(self.all_as_of_times) ) return self.feature_generator.generate_all_table_tasks( self.collate_aggregations, task_type="aggregation" ) @cachedproperty def feature_imputation_table_tasks(self): """All feature imputation query tasks specified by this ``Experiment``. Returns: (dict) keys are group table names, values are themselves dicts, each with keys for different stages of table creation (prepare, inserts, finalize) and with values being lists of SQL commands """ logging.info( "Calculating feature tasks for %s as_of_times", len(self.all_as_of_times) ) return self.feature_generator.generate_all_table_tasks( self.collate_aggregations, task_type="imputation" ) @cachedproperty def master_feature_dictionary(self): """All possible features found in the database. Not all features will necessarily end up in matrices Returns: (list) of dicts, keys being feature table names and values being lists of feature names """ result = self.feature_dictionary_creator.feature_dictionary( feature_table_names=self.feature_imputation_table_tasks.keys(), index_column_lookup=self.feature_generator.index_column_lookup( self.collate_aggregations ), ) logging.info("Computed master feature dictionary: %s", result) return result @property def feature_dicts(self): """Feature dictionaries, representing the feature tables and columns configured in this experiment after computing feature groups. Returns: (list) of dicts, keys being feature table names and values being lists of feature names """ return self.feature_group_mixer.generate( self.feature_group_creator.subsets(self.master_feature_dictionary) ) @cachedproperty def matrix_build_tasks(self): """Tasks for all matrices that need to be built as a part of this Experiment. Each task contains arguments understood by ``Architect.build_matrix``. 
Returns: (list) of dicts """ if not table_has_data(self.cohort_table_name, self.db_engine): logging.warning("cohort table is not populated, cannot build any matrices") return {} if not table_has_data(self.labels_table_name, self.db_engine): logging.warning("labels table is not populated, cannot build any matrices") return {} (updated_split_definitions, matrix_build_tasks) = self.planner.generate_plans( self.split_definitions, self.feature_dicts ) self.full_matrix_definitions = updated_split_definitions return matrix_build_tasks @cachedproperty def full_matrix_definitions(self): """Full matrix definitions Returns: (list) temporal and feature information for each matrix """ (updated_split_definitions, matrix_build_tasks) = self.planner.generate_plans( self.split_definitions, self.feature_dicts ) self.matrix_build_tasks = matrix_build_tasks return updated_split_definitions @property def all_label_timespans(self): """All train and test label timespans Returns: (list) label timespans, in string form as they appeared in the experiment config """ return list( set( self.config["temporal_config"]["training_label_timespans"] + self.config["temporal_config"]["test_label_timespans"] ) ) def generate_labels(self): """Generate labels based on experiment configuration Results are stored in the database, not returned """ self.label_generator.generate_all_labels( self.labels_table_name, self.all_as_of_times, self.all_label_timespans ) def generate_cohort(self): self.cohort_table_generator.generate_cohort_table( as_of_dates=self.all_as_of_times ) def log_split(self, split_num, split): logging.info( "Starting train/test for %s out of %s: train range: %s to %s", split_num + 1, len(self.full_matrix_definitions), split["train_matrix"]["first_as_of_time"], split["train_matrix"]["matrix_info_end_time"], ) @abstractmethod def process_train_tasks(self, train_tasks): pass @abstractmethod def process_query_tasks(self, query_tasks): pass @abstractmethod def process_matrix_build_tasks(self, matrix_build_tasks): pass def generate_preimputation_features(self): self.process_query_tasks(self.feature_aggregation_table_tasks) logging.info( "Finished running preimputation feature queries. The final results are in tables: %s", ",".join(agg.get_table_name() for agg in self.collate_aggregations), ) def impute_missing_features(self): self.process_query_tasks(self.feature_imputation_table_tasks) logging.info( "Finished running postimputation feature queries. The final results are in tables: %s", ",".join( agg.get_table_name(imputed=True) for agg in self.collate_aggregations ), ) def build_matrices(self): associate_matrices_with_experiment( self.experiment_hash, self.matrix_build_tasks.keys(), self.db_engine ) self.process_matrix_build_tasks(self.matrix_build_tasks) def generate_matrices(self): logging.info("Creating cohort") self.generate_cohort() logging.info("Creating labels") self.generate_labels() logging.info("Creating feature aggregation tables") self.generate_preimputation_features() logging.info("Creating feature imputation tables") self.impute_missing_features() logging.info("Building all matrices") self.build_matrices() def train_and_test_models(self): if "grid_config" not in self.config: logging.warning( "No grid_config was passed in the experiment config. 
No models will be trained" ) return for split_num, split in enumerate(self.full_matrix_definitions): self.log_split(split_num, split) train_store = self.matrix_storage_engine.get_store(split["train_uuid"]) if train_store.empty: logging.warning( """Train matrix for split %s was empty, no point in training this model. Skipping """, split["train_uuid"], ) continue if len(train_store.labels().unique()) == 1: logging.warning( """Train Matrix for split %s had only one unique value, no point in training this model. Skipping """, split["train_uuid"], ) continue logging.info("Training models") train_tasks = self.trainer.generate_train_tasks( grid_config=self.config["grid_config"], misc_db_parameters=dict( test=False, model_comment=self.config.get("model_comment", None) ), matrix_store=train_store, ) associate_models_with_experiment( self.experiment_hash, [train_task['model_hash'] for train_task in train_tasks], self.db_engine ) model_ids = self.process_train_tasks(train_tasks) logging.info("Done training models for split %s", split_num) test_tasks = self.tester.generate_model_test_tasks( split=split, train_store=train_store, model_ids=model_ids ) logging.info( "Found %s non-empty test matrices for split %s", len(test_tasks), split_num, ) self.process_model_test_tasks(test_tasks) def validate(self, strict=True): ExperimentValidator(self.db_engine, strict=strict).run(self.config) def _run(self): try: logging.info("Generating matrices") self.generate_matrices() finally: if self.cleanup: self.clean_up_tables() self.train_and_test_models() logging.info("Experiment complete") self._log_end_of_run_report() def _log_end_of_run_report(self): missing_models = missing_model_hashes(self.experiment_hash, self.db_engine) if len(missing_models) > 0: logging.info("Found %s missing model hashes." "This means that they were supposed to either be trained or reused" "by this experiment but are not present in the models table." "Inspect the logs for any training errors. Full list: %s", len(missing_models), missing_models ) else: logging.info("All models that were supposed to be trained were trained. Awesome!") missing_matrices = missing_matrix_uuids(self.experiment_hash, self.db_engine) if len(missing_matrices) > 0: logging.info("Found %s missing matrix uuids." "This means that they were supposed to either be build or reused" "by this experiment but are not present in the matrices table." "Inspect the logs for any matrix building errors. Full list: %s", len(missing_matrices), missing_matrices ) else: logging.info("All matrices that were supposed to be build were built. Awesome!") def clean_up_tables(self): logging.info("Cleaning up state and labels tables") with timeout(self.cleanup_timeout): self.cohort_table_generator.clean_up() self.label_generator.clean_up(self.labels_table_name) def _run_profile(self): cp = cProfile.Profile() cp.runcall(self._run) store = self.project_storage.get_store( ["profiling_stats"], f"{int(time.time())}.profile" ) with store.open('wb') as fd: cp.create_stats() marshal.dump(cp.stats, fd) logging.info("Profiling stats of this Triage run calculated and written to %s" "in cProfile format.", store) def run(self): try: if self.profile: self._run_profile() else: self._run() except Exception: logging.exception("Run interrupted by uncaught exception") raise __call__ = run
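# The class above leaves the four process_* hooks abstract and points at
# singlethreaded.py for a reference implementation. A minimal sketch of what a
# serial subclass can look like: process_train_task is the per-task entry point
# used by the tests in this file, while the other hooks delegate to
# hypothetical component helpers (names assumed, not verified against the real
# SingleThreadedExperiment).
class SerialExperimentSketch(ExperimentBase):
    def process_query_tasks(self, query_tasks):
        # assumed helper: run each batch of feature SQL in order
        self.feature_generator.process_table_tasks(query_tasks)

    def process_matrix_build_tasks(self, matrix_build_tasks):
        # assumed helper: build matrices one at a time
        self.matrix_builder.build_all_matrices(matrix_build_tasks)

    def process_train_tasks(self, train_tasks):
        # grounded in this file: tests call process_train_task(**task) directly
        return [
            self.trainer.process_train_task(**task) for task in train_tasks
        ]

    def process_model_test_tasks(self, test_tasks):
        # assumed helper: evaluate each trained model against its test matrix
        for task in test_tasks:
            self.tester.process_model_test_task(**task)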
def initialize_components(self): split_config = self.config["temporal_config"] self.chopper = Timechop(**split_config) cohort_config = self.config.get("cohort_config", {}) if "query" in cohort_config: self.cohort_table_generator = CohortTableGenerator( cohort_table_name=self.cohort_table_name, db_engine=self.db_engine, query=cohort_config["query"], replace=self.replace ) else: logging.warning( "cohort_config missing or unrecognized. Without a cohort, " "you will not be able to make matrices or perform feature imputation." ) self.cohort_table_generator = CohortTableGeneratorNoOp() if "label_config" in self.config: self.label_generator = LabelGenerator( label_name=self.config["label_config"].get("name", None), query=self.config["label_config"]["query"], replace=self.replace, db_engine=self.db_engine, ) else: self.label_generator = LabelGeneratorNoOp() logging.warning( "label_config missing or unrecognized. Without labels, " "you will not be able to make matrices." ) self.feature_dictionary_creator = FeatureDictionaryCreator( features_schema_name=self.features_schema_name, db_engine=self.db_engine ) self.feature_generator = FeatureGenerator( features_schema_name=self.features_schema_name, replace=self.replace, db_engine=self.db_engine, feature_start_time=split_config["feature_start_time"], materialize_subquery_fromobjs=self.materialize_subquery_fromobjs ) self.feature_group_creator = FeatureGroupCreator( self.config.get("feature_group_definition", {"all": [True]}) ) self.feature_group_mixer = FeatureGroupMixer( self.config.get("feature_group_strategies", ["all"]) ) self.planner = Planner( feature_start_time=dt_from_str(split_config["feature_start_time"]), label_names=[ self.config.get("label_config", {}).get("name", DEFAULT_LABEL_NAME) ], label_types=["binary"], cohort_names=[self.config.get("cohort_config", {}).get("name", None)], user_metadata=self.config.get("user_metadata", {}), ) self.matrix_builder = MatrixBuilder( db_config={ "features_schema_name": self.features_schema_name, "labels_schema_name": "public", "labels_table_name": self.labels_table_name, "cohort_table_name": self.cohort_table_name, }, matrix_storage_engine=self.matrix_storage_engine, experiment_hash=self.experiment_hash, include_missing_labels_in_train_as=self.config.get("label_config", {}).get( "include_missing_labels_in_train_as", None ), engine=self.db_engine, replace=self.replace, ) self.trainer = ModelTrainer( experiment_hash=self.experiment_hash, model_storage_engine=self.model_storage_engine, model_grouper=ModelGrouper(self.config.get("model_group_keys", [])), db_engine=self.db_engine, replace=self.replace, ) self.tester = ModelTester( model_storage_engine=self.model_storage_engine, matrix_storage_engine=self.matrix_storage_engine, replace=self.replace, db_engine=self.db_engine, individual_importance_config=self.config.get("individual_importance", {}), evaluator_config=self.config.get("scoring", {}), )
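# A hedged sketch of the minimal experiment config shape that
# initialize_components reads. Only keys actually referenced above are shown,
# and every concrete value (dates, queries, the model grid) is a placeholder
# rather than a recommended configuration.
EXPERIMENT_CONFIG_SKETCH = {
    "config_version": CONFIG_VERSION,  # checked by _check_config_version
    "temporal_config": {
        "feature_start_time": "2012-01-01",
        "feature_end_time": "2017-01-01",
        "label_start_time": "2014-01-01",
        "label_end_time": "2017-01-01",
        "model_update_frequency": "6month",
        "training_label_timespans": ["6month"],
        "test_label_timespans": ["6month"],
        "training_as_of_date_frequencies": ["1day"],
        "test_as_of_date_frequencies": ["3month"],
        "max_training_histories": ["1year"],
        "test_durations": ["3month"],
    },
    "cohort_config": {"name": "active", "query": "select entity_id ..."},  # placeholder query
    "label_config": {"name": "any_outcome", "query": "select entity_id ..."},  # placeholder query
    "feature_aggregations": [],  # collate aggregation definitions
    "feature_group_definition": {"all": [True]},
    "feature_group_strategies": ["all"],
    "model_group_keys": ["label_name", "label_timespan"],
    "grid_config": {"sklearn.tree.DecisionTreeClassifier": {"max_depth": [3, 5]}},
    "scoring": {},  # evaluator config
    "individual_importance": {},
    "user_metadata": {},
}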
def test_model_trainer(grid_config, default_model_trainer): trainer = default_model_trainer db_engine = trainer.db_engine project_storage = trainer.model_storage_engine.project_storage model_storage_engine = trainer.model_storage_engine def set_test_seed(): random.seed(5) set_test_seed() model_ids = trainer.train_models( grid_config=grid_config, misc_db_parameters=dict(), matrix_store=get_matrix_store(project_storage), ) # assert # 1. that the models and feature importances table entries are present records = [ row for row in db_engine.execute( "select * from train_results.feature_importances") ] assert len(records) == 4 * 2 # maybe exclude entity_id? yes records = [ row for row in db_engine.execute( "select model_hash from triage_metadata.models") ] assert len(records) == 4 hashes = [row[0] for row in records] # 2. that the model groups are distinct records = [ row for row in db_engine.execute( "select distinct model_group_id from triage_metadata.models") ] assert len(records) == 4 # 2. that the random seeds are distinct records = [ row for row in db_engine.execute( "select distinct random_seed from triage_metadata.models") ] assert len(records) == 4 # 3. that the model sizes are saved in the table and all are < 1 kB records = [ row for row in db_engine.execute( "select model_size from triage_metadata.models") ] assert len(records) == 4 for i in records: size = i[0] assert size < 1 # 4. that all four models are cached model_pickles = [ model_storage_engine.load(model_hash) for model_hash in hashes ] assert len(model_pickles) == 4 assert len([x for x in model_pickles if x is not None]) == 4 # 5. that their results can have predictions made on it test_matrix = pd.DataFrame.from_dict({ "entity_id": [3, 4], "feature_one": [4, 4], "feature_two": [6, 5] }).set_index("entity_id") for model_pickle in model_pickles: predictions = model_pickle.predict(test_matrix) assert len(predictions) == 2 # 6. when run again with the same starting seed, same models are returned set_test_seed() new_model_ids = trainer.train_models( grid_config=grid_config, misc_db_parameters=dict(), matrix_store=get_matrix_store(project_storage), ) assert (len([ row for row in db_engine.execute( "select model_hash from triage_metadata.models") ]) == 4) assert model_ids == new_model_ids # 7. if replace is set, update non-unique attributes and feature importances max_batch_run_time = [ row[0] for row in db_engine.execute( "select max(batch_run_time) from triage_metadata.models") ][0] trainer = ModelTrainer( experiment_hash=None, model_storage_engine=model_storage_engine, model_grouper=ModelGrouper( model_group_keys=["label_name", "label_timespan"]), db_engine=db_engine, replace=True, ) set_test_seed() new_model_ids = trainer.train_models( grid_config=grid_config, misc_db_parameters=dict(), matrix_store=get_matrix_store(project_storage), ) assert model_ids == new_model_ids assert [ row["model_id"] for row in db_engine.execute( "select model_id from triage_metadata.models order by 1 asc") ] == model_ids new_max_batch_run_time = [ row[0] for row in db_engine.execute( "select max(batch_run_time) from triage_metadata.models") ][0] assert new_max_batch_run_time > max_batch_run_time records = [ row for row in db_engine.execute( "select * from train_results.feature_importances") ] assert len(records) == 4 * 2 # maybe exclude entity_id? yes # 8. 
if the cache is missing but the metadata is still there, reuse the metadata set_test_seed() for row in db_engine.execute( "select model_hash from triage_metadata.models"): model_storage_engine.delete(row[0]) new_model_ids = trainer.train_models( grid_config=grid_config, misc_db_parameters=dict(), matrix_store=get_matrix_store(project_storage), ) assert model_ids == sorted(new_model_ids) # 9. that the generator interface works the same way set_test_seed() new_model_ids = trainer.generate_trained_models( grid_config=grid_config, misc_db_parameters=dict(), matrix_store=get_matrix_store(project_storage), ) assert model_ids == sorted([model_id for model_id in new_model_ids])
def test_integration(): with rig_engines() as (db_engine, project_storage): train_store = get_matrix_store( project_storage, matrix_creator(), matrix_metadata_creator(matrix_type='train')) as_of_dates = [datetime.date(2016, 12, 21), datetime.date(2017, 1, 21)] test_stores = [] for as_of_date in as_of_dates: matrix_store = get_matrix_store( project_storage, pandas.DataFrame.from_dict({ 'entity_id': [3], 'feature_one': [8], 'feature_two': [5], 'label': [0] }).set_index('entity_id'), matrix_metadata_creator(end_time=as_of_date, indices=['entity_id'])) test_stores.append(matrix_store) model_storage_engine = ModelStorageEngine(project_storage) experiment_hash = save_experiment_and_get_hash({}, db_engine) # instantiate pipeline objects trainer = ModelTrainer( experiment_hash=experiment_hash, model_storage_engine=model_storage_engine, db_engine=db_engine, ) predictor = Predictor(model_storage_engine, db_engine) model_evaluator = ModelEvaluator([{ 'metrics': ['precision@'], 'thresholds': { 'top_n': [5] } }], [{}], db_engine) # run the pipeline grid_config = { 'sklearn.linear_model.LogisticRegression': { 'C': [0.00001, 0.0001], 'penalty': ['l1', 'l2'], 'random_state': [2193] } } model_ids = trainer.train_models(grid_config=grid_config, misc_db_parameters=dict(), matrix_store=train_store) for model_id in model_ids: for as_of_date, test_store in zip(as_of_dates, test_stores): predictions_proba = predictor.predict( model_id, test_store, misc_db_parameters=dict(), train_matrix_columns=['feature_one', 'feature_two']) model_evaluator.evaluate( predictions_proba, test_store, model_id, ) # assert # 1. that the predictions table entries are present and # can be linked to the original models records = [ row for row in db_engine.execute( '''select entity_id, model_id, as_of_date from test_results.predictions join model_metadata.models using (model_id) order by 3, 2''') ] assert records == [ (3, 1, datetime.datetime(2016, 12, 21)), (3, 2, datetime.datetime(2016, 12, 21)), (3, 3, datetime.datetime(2016, 12, 21)), (3, 4, datetime.datetime(2016, 12, 21)), (3, 1, datetime.datetime(2017, 1, 21)), (3, 2, datetime.datetime(2017, 1, 21)), (3, 3, datetime.datetime(2017, 1, 21)), (3, 4, datetime.datetime(2017, 1, 21)), ] # that evaluations are there records = [ row for row in db_engine.execute(''' select model_id, evaluation_start_time, metric, parameter from test_results.evaluations order by 2, 1''') ] assert records == [ (1, datetime.datetime(2016, 12, 21), 'precision@', '5_abs'), (2, datetime.datetime(2016, 12, 21), 'precision@', '5_abs'), (3, datetime.datetime(2016, 12, 21), 'precision@', '5_abs'), (4, datetime.datetime(2016, 12, 21), 'precision@', '5_abs'), (1, datetime.datetime(2017, 1, 21), 'precision@', '5_abs'), (2, datetime.datetime(2017, 1, 21), 'precision@', '5_abs'), (3, datetime.datetime(2017, 1, 21), 'precision@', '5_abs'), (4, datetime.datetime(2017, 1, 21), 'precision@', '5_abs'), ]
def test_model_trainer(grid_config): with rig_engines() as (db_engine, project_storage): # Creates a matrix entry in the matrices table with uuid from metadata above model_storage_engine = project_storage.model_storage_engine() trainer = ModelTrainer( experiment_hash=None, model_storage_engine=model_storage_engine, model_grouper=ModelGrouper(), db_engine=db_engine, ) model_ids = trainer.train_models( grid_config=grid_config, misc_db_parameters=dict(), matrix_store=get_matrix_store(project_storage), ) # assert # 1. that the models and feature importances table entries are present records = [ row for row in db_engine.execute( 'select * from train_results.feature_importances') ] assert len(records) == 4 * 2 # maybe exclude entity_id? yes records = [ row for row in db_engine.execute( 'select model_hash from model_metadata.models') ] assert len(records) == 4 hashes = [row[0] for row in records] # 2. that the model groups are distinct records = [ row for row in db_engine.execute( 'select distinct model_group_id from model_metadata.models') ] assert len(records) == 4 # 3. that the model sizes are saved in the table and all are < 1 kB records = [ row for row in db_engine.execute( 'select model_size from model_metadata.models') ] assert len(records) == 4 for i in records: size = i[0] assert size < 1 # 4. that all four models are cached model_pickles = [ model_storage_engine.load(model_hash) for model_hash in hashes ] assert len(model_pickles) == 4 assert len([x for x in model_pickles if x is not None]) == 4 # 5. that their results can have predictions made on it test_matrix = pandas.DataFrame.from_dict({ 'entity_id': [3, 4], 'feature_one': [4, 4], 'feature_two': [6, 5], }).set_index('entity_id') for model_pickle in model_pickles: predictions = model_pickle.predict(test_matrix) assert len(predictions) == 2 # 6. when run again, same models are returned new_model_ids = trainer.train_models( grid_config=grid_config, misc_db_parameters=dict(), matrix_store=get_matrix_store(project_storage)) assert len([ row for row in db_engine.execute( 'select model_hash from model_metadata.models') ]) == 4 assert model_ids == new_model_ids # 7. if replace is set, update non-unique attributes and feature importances max_batch_run_time = [ row[0] for row in db_engine.execute( 'select max(batch_run_time) from model_metadata.models') ][0] trainer = ModelTrainer( experiment_hash=None, model_storage_engine=model_storage_engine, model_grouper=ModelGrouper( model_group_keys=['label_name', 'label_timespan']), db_engine=db_engine, replace=True) new_model_ids = trainer.train_models( grid_config=grid_config, misc_db_parameters=dict(), matrix_store=get_matrix_store(project_storage)) assert model_ids == new_model_ids assert [ row['model_id'] for row in db_engine.execute( 'select model_id from model_metadata.models order by 1 asc') ] == model_ids new_max_batch_run_time = [ row[0] for row in db_engine.execute( 'select max(batch_run_time) from model_metadata.models') ][0] assert new_max_batch_run_time > max_batch_run_time records = [ row for row in db_engine.execute( 'select * from train_results.feature_importances') ] assert len(records) == 4 * 2 # maybe exclude entity_id? yes # 8. 
if the cache is missing but the metadata is still there, reuse the metadata for row in db_engine.execute( 'select model_hash from model_metadata.models'): model_storage_engine.delete(row[0]) new_model_ids = trainer.train_models( grid_config=grid_config, misc_db_parameters=dict(), matrix_store=get_matrix_store(project_storage)) assert model_ids == sorted(new_model_ids) # 9. that the generator interface works the same way new_model_ids = trainer.generate_trained_models( grid_config=grid_config, misc_db_parameters=dict(), matrix_store=get_matrix_store(project_storage)) assert model_ids == \ sorted([model_id for model_id in new_model_ids])
def test_model_trainer(): with testing.postgresql.Postgresql() as postgresql: engine = create_engine(postgresql.url()) ensure_db(engine) grid_config = { 'sklearn.linear_model.LogisticRegression': { 'C': [0.00001, 0.0001], 'penalty': ['l1', 'l2'], 'random_state': [2193] } } with mock_s3(): s3_conn = boto3.resource('s3') s3_conn.create_bucket(Bucket='econ-dev') # create training set matrix = pandas.DataFrame.from_dict({ 'entity_id': [1, 2], 'feature_one': [3, 4], 'feature_two': [5, 6], 'label': ['good', 'bad'] }) metadata = { 'feature_start_time': datetime.date(2012, 12, 20), 'end_time': datetime.date(2016, 12, 20), 'label_name': 'label', 'label_timespan': '1y', 'metta-uuid': '1234', 'feature_names': ['ft1', 'ft2'], 'indices': ['entity_id'], } project_path = 'econ-dev/inspections' model_storage_engine = S3ModelStorageEngine(project_path) trainer = ModelTrainer( project_path=project_path, experiment_hash=None, model_storage_engine=model_storage_engine, db_engine=engine, model_group_keys=['label_name', 'label_timespan'] ) matrix_store = InMemoryMatrixStore(matrix, metadata) model_ids = trainer.train_models( grid_config=grid_config, misc_db_parameters=dict(), matrix_store=matrix_store ) # assert # 1. that the models and feature importances table entries are present records = [ row for row in engine.execute('select * from results.feature_importances') ] assert len(records) == 4 * 2 # maybe exclude entity_id? yes records = [ row for row in engine.execute('select model_hash from results.models') ] assert len(records) == 4 hashes = [row[0] for row in records] # 2. that the model groups are distinct records = [ row for row in engine.execute('select distinct model_group_id from results.models') ] assert len(records) == 4 # 3. that all four models are cached model_pickles = [ model_storage_engine.get_store(model_hash).load() for model_hash in hashes ] assert len(model_pickles) == 4 assert len([x for x in model_pickles if x is not None]) == 4 # 4. that their results can have predictions made on it test_matrix = pandas.DataFrame.from_dict({ 'entity_id': [3, 4], 'feature_one': [4, 4], 'feature_two': [6, 5], }) test_matrix = InMemoryMatrixStore(matrix=test_matrix, metadata=metadata).matrix for model_pickle in model_pickles: predictions = model_pickle.predict(test_matrix) assert len(predictions) == 2 # 5. when run again, same models are returned new_model_ids = trainer.train_models( grid_config=grid_config, misc_db_parameters=dict(), matrix_store=matrix_store ) assert len([ row for row in engine.execute('select model_hash from results.models') ]) == 4 assert model_ids == new_model_ids # 6. 
if replace is set, update non-unique attributes and feature importances max_batch_run_time = [ row[0] for row in engine.execute('select max(batch_run_time) from results.models') ][0] trainer = ModelTrainer( project_path=project_path, experiment_hash=None, model_storage_engine=model_storage_engine, db_engine=engine, model_group_keys=['label_name', 'label_timespan'], replace=True ) new_model_ids = trainer.train_models( grid_config=grid_config, misc_db_parameters=dict(), matrix_store=matrix_store, ) assert model_ids == new_model_ids assert [ row['model_id'] for row in engine.execute('select model_id from results.models order by 1 asc') ] == model_ids new_max_batch_run_time = [ row[0] for row in engine.execute('select max(batch_run_time) from results.models') ][0] assert new_max_batch_run_time > max_batch_run_time records = [ row for row in engine.execute('select * from results.feature_importances') ] assert len(records) == 4 * 2 # maybe exclude entity_id? yes # 7. if the cache is missing but the metadata is still there, reuse the metadata for row in engine.execute('select model_hash from results.models'): model_storage_engine.get_store(row[0]).delete() new_model_ids = trainer.train_models( grid_config=grid_config, misc_db_parameters=dict(), matrix_store=matrix_store ) assert model_ids == sorted(new_model_ids) # 8. that the generator interface works the same way new_model_ids = trainer.generate_trained_models( grid_config=grid_config, misc_db_parameters=dict(), matrix_store=matrix_store ) assert model_ids == \ sorted([model_id for model_id in new_model_ids])
def test_integration(): with testing.postgresql.Postgresql() as postgresql: db_engine = create_engine(postgresql.url()) ensure_db(db_engine) init_engine(db_engine) with mock_s3(): s3_conn = boto3.resource('s3') s3_conn.create_bucket(Bucket='econ-dev') project_path = 'econ-dev/inspections' # create train and test matrices train_matrix = pandas.DataFrame.from_dict({ 'entity_id': [1, 2], 'feature_one': [3, 4], 'feature_two': [5, 6], 'label': [7, 8] }).set_index('entity_id') train_metadata = { 'feature_start_time': datetime.date(2012, 12, 20), 'end_time': datetime.date(2016, 12, 20), 'label_name': 'label', 'label_timespan': '1y', 'feature_names': ['ft1', 'ft2'], 'metta-uuid': '1234', 'indices': ['entity_id'], 'matrix_type': 'train' } # Creates a matrix entry in the matrices table with uuid from train_metadata MatrixFactory(matrix_uuid="1234") session.commit() train_store = InMemoryMatrixStore(train_matrix, sample_metadata()) as_of_dates = [ datetime.date(2016, 12, 21), datetime.date(2017, 1, 21) ] test_stores = [ InMemoryMatrixStore( pandas.DataFrame.from_dict({ 'entity_id': [3], 'feature_one': [8], 'feature_two': [5], 'label': [5] }), { 'label_name': 'label', 'label_timespan': '1y', 'end_time': as_of_date, 'metta-uuid': '1234', 'indices': ['entity_id'], 'matrix_type': 'test', 'as_of_date_frequency': '1month' }) for as_of_date in as_of_dates ] model_storage_engine = S3ModelStorageEngine(project_path) experiment_hash = save_experiment_and_get_hash({}, db_engine) # instantiate pipeline objects trainer = ModelTrainer( project_path=project_path, experiment_hash=experiment_hash, model_storage_engine=model_storage_engine, db_engine=db_engine, ) predictor = Predictor(project_path, model_storage_engine, db_engine) model_evaluator = ModelEvaluator([{ 'metrics': ['precision@'], 'thresholds': { 'top_n': [5] } }], [{}], db_engine) # run the pipeline grid_config = { 'sklearn.linear_model.LogisticRegression': { 'C': [0.00001, 0.0001], 'penalty': ['l1', 'l2'], 'random_state': [2193] } } model_ids = trainer.train_models(grid_config=grid_config, misc_db_parameters=dict(), matrix_store=train_store) for model_id in model_ids: for as_of_date, test_store in zip(as_of_dates, test_stores): predictions_proba = predictor.predict( model_id, test_store, misc_db_parameters=dict(), train_matrix_columns=['feature_one', 'feature_two']) model_evaluator.evaluate( predictions_proba, test_store, model_id, ) # assert # 1. 
that the predictions table entries are present and # can be linked to the original models records = [ row for row in db_engine.execute( '''select entity_id, model_id, as_of_date from test_results.test_predictions join model_metadata.models using (model_id) order by 3, 2''') ] assert records == [ (3, 1, datetime.datetime(2016, 12, 21)), (3, 2, datetime.datetime(2016, 12, 21)), (3, 3, datetime.datetime(2016, 12, 21)), (3, 4, datetime.datetime(2016, 12, 21)), (3, 1, datetime.datetime(2017, 1, 21)), (3, 2, datetime.datetime(2017, 1, 21)), (3, 3, datetime.datetime(2017, 1, 21)), (3, 4, datetime.datetime(2017, 1, 21)), ] # that evaluations are there records = [ row for row in db_engine.execute(''' select model_id, evaluation_start_time, metric, parameter from test_results.test_evaluations order by 2, 1''') ] assert records == [ (1, datetime.datetime(2016, 12, 21), 'precision@', '5_abs'), (2, datetime.datetime(2016, 12, 21), 'precision@', '5_abs'), (3, datetime.datetime(2016, 12, 21), 'precision@', '5_abs'), (4, datetime.datetime(2016, 12, 21), 'precision@', '5_abs'), (1, datetime.datetime(2017, 1, 21), 'precision@', '5_abs'), (2, datetime.datetime(2017, 1, 21), 'precision@', '5_abs'), (3, datetime.datetime(2017, 1, 21), 'precision@', '5_abs'), (4, datetime.datetime(2017, 1, 21), 'precision@', '5_abs'), ]
class ExperimentBase(ABC): """The base class for all Experiments. Subclasses must implement the following four methods: process_query_tasks process_matrix_build_tasks process_train_tasks process_model_test_tasks Look at singlethreaded.py for reference implementation of each. Args: config (dict) db_engine (triage.util.db.SerializableDbEngine or sqlalchemy.engine.Engine) project_path (string) replace (bool) cleanup_timeout (int) """ cleanup_timeout = 60 # seconds def __init__( self, config, db_engine, project_path=None, matrix_storage_class=CSVMatrixStore, replace=True, cleanup=False, cleanup_timeout=None, ): self._check_config_version(config) self.config = config if isinstance(db_engine, Engine): logging.warning('Raw, unserializable SQLAlchemy engine passed. URL will be used, other options may be lost in multi-process environments') self.db_engine = create_engine(db_engine.url) else: self.db_engine = db_engine self.project_storage = ProjectStorage(project_path) self.model_storage_engine = ModelStorageEngine(self.project_storage) self.matrix_storage_engine = MatrixStorageEngine(self.project_storage, matrix_storage_class) self.project_path = project_path self.replace = replace upgrade_db(db_engine=self.db_engine) self.features_schema_name = 'features' self.experiment_hash = save_experiment_and_get_hash(self.config, self.db_engine) self.labels_table_name = 'labels_{}'.format(self.experiment_hash) self.initialize_components() self.cleanup = cleanup if self.cleanup: logging.info('cleanup is set to True, so intermediate tables (labels and states) will be removed after matrix creation') else: logging.info('cleanup is set to False, so intermediate tables (labels and states) will not be removed after matrix creation') self.cleanup_timeout = (self.cleanup_timeout if cleanup_timeout is None else cleanup_timeout) def _check_config_version(self, config): if 'config_version' in config: config_version = config['config_version'] else: logging.warning('config_version key not found in experiment config. ' 'Assuming v1, which may not be correct') config_version = 'v1' if config_version != CONFIG_VERSION: raise ValueError( "Experiment config '{}' " "does not match current version '{}'. " "Will not run experiment." .format(config_version, CONFIG_VERSION) ) def initialize_components(self): split_config = self.config['temporal_config'] self.chopper = Timechop(**split_config) cohort_config = self.config.get('cohort_config', {}) if 'query' in cohort_config: self.state_table_generator = StateTableGeneratorFromQuery( experiment_hash=self.experiment_hash, db_engine=self.db_engine, query=cohort_config['query'] ) elif 'entities_table' in cohort_config: self.state_table_generator = StateTableGeneratorFromEntities( experiment_hash=self.experiment_hash, db_engine=self.db_engine, entities_table=cohort_config['entities_table'] ) elif 'dense_states' in cohort_config: self.state_table_generator = StateTableGeneratorFromDense( experiment_hash=self.experiment_hash, db_engine=self.db_engine, dense_state_table=cohort_config['dense_states']['table_name'] ) else: logging.warning('cohort_config missing or unrecognized. 
Without a cohort, you will not be able to make matrices or perform feature imputation.') self.state_table_generator = StateTableGeneratorNoOp() if 'label_config' in self.config: self.label_generator = LabelGenerator( label_name=self.config['label_config'].get('name', None), query=self.config['label_config']['query'], db_engine=self.db_engine, ) else: self.label_generator = LabelGeneratorNoOp() logging.warning('label_config missing or unrecognized. Without labels, you will not be able to make matrices.') self.feature_dictionary_creator = FeatureDictionaryCreator( features_schema_name=self.features_schema_name, db_engine=self.db_engine, ) self.feature_generator = FeatureGenerator( features_schema_name=self.features_schema_name, replace=self.replace, db_engine=self.db_engine, feature_start_time=split_config['feature_start_time'] ) self.feature_group_creator = FeatureGroupCreator( self.config.get('feature_group_definition', {'all': [True]}) ) self.feature_group_mixer = FeatureGroupMixer( self.config.get('feature_group_strategies', ['all']) ) self.planner = Planner( feature_start_time=dt_from_str(split_config['feature_start_time']), label_names=[self.config.get('label_config', {}).get('name', DEFAULT_LABEL_NAME)], label_types=['binary'], cohort_name=self.config.get('cohort_config', {}).get('name', None), states=self.config.get('cohort_config', {}).get('dense_states', {}) .get('state_filters', []), user_metadata=self.config.get('user_metadata', {}), ) self.matrix_builder = MatrixBuilder( db_config={ 'features_schema_name': self.features_schema_name, 'labels_schema_name': 'public', 'labels_table_name': self.labels_table_name, # TODO: have planner/builder take state table later on, so we # can grab it from the StateTableGenerator instead of # duplicating it here 'sparse_state_table_name': self.sparse_states_table_name, }, matrix_storage_engine=self.matrix_storage_engine, include_missing_labels_in_train_as=self.config.get('label_config', {}) .get('include_missing_labels_in_train_as', None), engine=self.db_engine, replace=self.replace ) self.trainer = ModelTrainer( experiment_hash=self.experiment_hash, model_storage_engine=self.model_storage_engine, model_grouper=ModelGrouper(self.config.get('model_group_keys', [])), db_engine=self.db_engine, replace=self.replace ) self.tester = ModelTester( model_storage_engine=self.model_storage_engine, matrix_storage_engine=self.matrix_storage_engine, replace=self.replace, db_engine=self.db_engine, individual_importance_config=self.config.get('individual_importance', {}), evaluator_config=self.config.get('scoring', {}) ) @property def sparse_states_table_name(self): return 'tmp_sparse_states_{}'.format(self.experiment_hash) @cachedproperty def split_definitions(self): """Temporal splits based on the experiment's configuration Returns: (dict) temporal splits Example: ``` { 'feature_start_time': {datetime}, 'feature_end_time': {datetime}, 'label_start_time': {datetime}, 'label_end_time': {datetime}, 'train_matrix': { 'first_as_of_time': {datetime}, 'last_as_of_time': {datetime}, 'matrix_info_end_time': {datetime}, 'training_label_timespan': {str}, 'training_as_of_date_frequency': {str}, 'max_training_history': {str}, 'as_of_times': [list of {datetime}s] }, 'test_matrices': [list of matrix defs similar to train_matrix] } ``` (When updating/setting split definitions, matrices should have UUIDs.) 
""" split_definitions = self.chopper.chop_time() logging.info('Computed and stored split definitions: %s', split_definitions) logging.info('\n----TIME SPLIT SUMMARY----\n') logging.info('Number of time splits: {}'.format(len(split_definitions))) for split_index, split in enumerate(split_definitions): train_times = split['train_matrix']['as_of_times'] test_times = [as_of_time for test_matrix in split['test_matrices'] for as_of_time in test_matrix['as_of_times']] logging.info('''Split index {}: Training as_of_time_range: {} to {} ({} total) Testing as_of_time range: {} to {} ({} total)\n\n'''.format( split_index, min(train_times), max(train_times), len(train_times), min(test_times), max(test_times), len(test_times) )) return split_definitions @cachedproperty def all_as_of_times(self): """All 'as of times' in experiment config Used for label and feature generation. Returns: (list) of datetimes """ all_as_of_times = [] for split in self.split_definitions: all_as_of_times.extend(split['train_matrix']['as_of_times']) logging.debug('Adding as_of_times from train matrix: %s', split['train_matrix']['as_of_times']) for test_matrix in split['test_matrices']: logging.debug('Adding as_of_times from test matrix: %s', test_matrix['as_of_times']) all_as_of_times.extend(test_matrix['as_of_times']) logging.info( 'Computed %s total as_of_times for label and feature generation', len(all_as_of_times) ) distinct_as_of_times = list(set(all_as_of_times)) logging.info( 'Computed %s distinct as_of_times for label and feature generation', len(distinct_as_of_times) ) logging.info('You can view all as_of_times by inspecting `.all_as_of_times` on this Experiment') return distinct_as_of_times @cachedproperty def collate_aggregations(self): """Collation of ``Aggregation`` objects used by this experiment. Returns: (list) of ``collate.Aggregation`` objects """ logging.info('Creating collate aggregations') cohort_table = self.state_table_generator.sparse_table_name if 'feature_aggregations' not in self.config: logging.warning('No feature_aggregation config is available') return [] return self.feature_generator.aggregations( feature_aggregation_config=self.config['feature_aggregations'], feature_dates=self.all_as_of_times, state_table=cohort_table ) @cachedproperty def feature_aggregation_table_tasks(self): """All feature table query tasks specified by this ``Experiment``. Returns: (dict) keys are group table names, values are themselves dicts, each with keys for different stages of table creation (prepare, inserts, finalize) and with values being lists of SQL commands """ logging.info('Calculating feature tasks for %s as_of_times', len(self.all_as_of_times)) return self.feature_generator.generate_all_table_tasks( self.collate_aggregations, task_type='aggregation' ) @cachedproperty def feature_imputation_table_tasks(self): """All feature imputation query tasks specified by this ``Experiment``. Returns: (dict) keys are group table names, values are themselves dicts, each with keys for different stages of table creation (prepare, inserts, finalize) and with values being lists of SQL commands """ logging.info('Calculating feature tasks for %s as_of_times', len(self.all_as_of_times)) return self.feature_generator.generate_all_table_tasks( self.collate_aggregations, task_type='imputation' ) @cachedproperty def master_feature_dictionary(self): """All possible features found in the database. 
Not all features will necessarily end up in matrices Returns: (list) of dicts, keys being feature table names and values being lists of feature names """ result = self.feature_dictionary_creator.feature_dictionary( feature_table_names=self.feature_imputation_table_tasks.keys(), index_column_lookup=self.feature_generator.index_column_lookup( self.collate_aggregations ) ) logging.info('Computed master feature dictionary: %s', result) return result @property def feature_dicts(self): """Feature dictionaries, representing the feature tables and columns configured in this experiment after computing feature groups. Returns: (list) of dicts, keys being feature table names and values being lists of feature names """ return self.feature_group_mixer.generate( self.feature_group_creator.subsets(self.master_feature_dictionary) ) @cachedproperty def matrix_build_tasks(self): """Tasks for all matrices that need to be built as a part of this Experiment. Each task contains arguments understood by ``Architect.build_matrix``. Returns: (list) of dicts """ if not table_has_data(self.sparse_states_table_name, self.db_engine): logging.warning('cohort table is not populated, cannot build any matrices') return {} if not table_has_data(self.labels_table_name, self.db_engine): logging.warning('labels table is not populated, cannot build any matrices') return {} ( updated_split_definitions, matrix_build_tasks ) = self.planner.generate_plans( self.split_definitions, self.feature_dicts ) self.full_matrix_definitions = updated_split_definitions return matrix_build_tasks @cachedproperty def full_matrix_definitions(self): """Full matrix definitions Returns: (list) temporal and feature information for each matrix """ ( updated_split_definitions, matrix_build_tasks ) = self.planner.generate_plans( self.split_definitions, self.feature_dicts ) self.matrix_build_tasks = matrix_build_tasks return updated_split_definitions @property def all_label_timespans(self): """All train and test label timespans Returns: (list) label timespans, in string form as they appeared in the experiment config """ return list(set( self.config['temporal_config']['training_label_timespans'] + self.config['temporal_config']['test_label_timespans'] )) def generate_labels(self): """Generate labels based on experiment configuration Results are stored in the database, not returned """ self.label_generator.generate_all_labels( self.labels_table_name, self.all_as_of_times, self.all_label_timespans ) def generate_cohort(self): self.state_table_generator.generate_sparse_table( as_of_dates=self.all_as_of_times ) def log_split(self, split_num, split): logging.info( 'Starting train/test for %s out of %s: train range: %s to %s', split_num+1, len(self.full_matrix_definitions), split['train_matrix']['first_as_of_time'], split['train_matrix']['matrix_info_end_time'], ) @abstractmethod def process_train_tasks(self, train_tasks): pass @abstractmethod def process_query_tasks(self, query_tasks): pass @abstractmethod def process_matrix_build_tasks(self, matrix_build_tasks): pass def generate_preimputation_features(self): self.process_query_tasks(self.feature_aggregation_table_tasks) logging.info('Finished running preimputation feature queries. The final results are in tables: %s', ','.join(agg.get_table_name() for agg in self.collate_aggregations) ) def impute_missing_features(self): self.process_query_tasks(self.feature_imputation_table_tasks) logging.info('Finished running postimputation feature queries. 
The final results are in tables: %s', ','.join(agg.get_table_name(imputed=True) for agg in self.collate_aggregations) ) def build_matrices(self): self.process_matrix_build_tasks(self.matrix_build_tasks) def generate_matrices(self): logging.info('Creating cohort') self.generate_cohort() logging.info('Creating labels') self.generate_labels() logging.info('Creating feature aggregation tables') self.generate_preimputation_features() logging.info('Creating feature imputation tables') self.impute_missing_features() logging.info('Building all matrices') self.build_matrices() def train_and_test_models(self): if 'grid_config' not in self.config: logging.warning('No grid_config was passed in the experiment config. No models will be trained') return for split_num, split in enumerate(self.full_matrix_definitions): self.log_split(split_num, split) train_store = self.matrix_storage_engine.get_store(split['train_uuid']) if train_store.empty: logging.warning('''Train matrix for split %s was empty, no point in training this model. Skipping ''', split['train_uuid']) continue if len(train_store.labels().unique()) == 1: logging.warning('''Train Matrix for split %s had only one unique value, no point in training this model. Skipping ''', split['train_uuid']) continue logging.info('Training models') train_tasks = self.trainer.generate_train_tasks( grid_config=self.config['grid_config'], misc_db_parameters=dict( test=False, model_comment=self.config.get('model_comment', None), ), matrix_store=train_store ) model_ids = self.process_train_tasks(train_tasks) logging.info('Done training models for split %s', split_num) test_tasks = self.tester.generate_model_test_tasks( split=split, train_store=train_store, model_ids=model_ids, ) logging.info('Found %s non-empty test matrices for split %s', len(test_tasks), split_num) self.process_model_test_tasks(test_tasks) def validate(self, strict=True): ExperimentValidator(self.db_engine, strict=strict).run(self.config) def _run(self): try: logging.info('Generating matrices') self.generate_matrices() finally: if self.cleanup: self.clean_up_tables() self.train_and_test_models() def clean_up_tables(self): logging.info('Cleaning up state and labels tables') with timeout(self.cleanup_timeout): self.state_table_generator.clean_up() self.label_generator.clean_up(self.labels_table_name) def run(self): try: self._run() except Exception: logging.exception('Run interrupted by uncaught exception') raise __call__ = run
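# For orientation: the ExperimentBase docstring above says subclasses must provide the
# process_* hooks and points at singlethreaded.py for a reference implementation. Below is
# a minimal, illustrative sketch of such a subclass that simply runs every task serially.
# The per-task helper names it calls on the collaborating components (process_table_tasks,
# build_all_matrices, process_train_task, process_model_test_task) are assumptions made for
# illustration here, not a guaranteed rendering of the library's actual API.
class SingleThreadedExperimentSketch(ExperimentBase):
    def process_query_tasks(self, query_tasks):
        # Run each feature table's prepare/inserts/finalize statements in order.
        self.feature_generator.process_table_tasks(query_tasks)

    def process_matrix_build_tasks(self, matrix_build_tasks):
        # Build every planned matrix, one at a time.
        self.matrix_builder.build_all_matrices(matrix_build_tasks)

    def process_train_tasks(self, train_tasks):
        # Each task dict is produced by ModelTrainer.generate_train_tasks above.
        return [
            self.trainer.process_train_task(**train_task)
            for train_task in train_tasks
        ]

    def process_model_test_tasks(self, test_tasks):
        # Predict and evaluate each trained model on its test matrices.
        return [
            self.tester.process_model_test_task(**test_task)
            for test_task in test_tasks
        ]

# Hypothetical usage, assuming an experiment config dict and a database URL are available:
# experiment = SingleThreadedExperimentSketch(
#     config=experiment_config,
#     db_engine=create_engine(db_url),
#     project_path='econ-dev/inspections',
# )
# experiment.run()  # equivalent to experiment(), since __call__ = run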