def test_save_experiment_and_get_hash():
    """Saving the same config twice returns the same string hash."""
    # The content of the config is irrelevant here, so a minimal dict is enough.
    experiment_config = {"one": "two"}

    with testing.postgresql.Postgresql() as postgresql:
        engine = create_engine(postgresql.url())
        ensure_db(engine)

        first_hash = save_experiment_and_get_hash(experiment_config, engine)
        assert isinstance(first_hash, str)

        # A second save of the identical config must not produce a new hash.
        second_hash = save_experiment_and_get_hash(experiment_config, engine)
        assert second_hash == first_hash
def test_custom_groups(grid_config, db_engine_with_results_schema, project_storage):
    """Grouping on ``class_path`` alone should leave a single model group."""
    db_engine = db_engine_with_results_schema
    model_storage_engine = project_storage.model_storage_engine()

    experiment_hash = save_experiment_and_get_hash(
        config={'foo': 'bar'},
        db_engine=db_engine,
    )
    run_id = initialize_tracking_and_get_run_id(
        experiment_hash,
        experiment_class_path="",
        random_seed=5,
        experiment_kwargs={},
        db_engine=db_engine,
    )
    trainer = ModelTrainer(
        experiment_hash=experiment_hash,
        model_storage_engine=model_storage_engine,
        model_grouper=ModelGrouper(["class_path"]),
        db_engine=db_engine,
        run_id=run_id,
    )

    # Train the whole grid against the sample matrix store.
    model_ids = trainer.train_models(
        grid_config=grid_config,
        misc_db_parameters=dict(),
        matrix_store=get_matrix_store(project_storage),
    )

    # With the custom grouper, only one distinct model group is expected.
    result = db_engine.execute(
        "select distinct model_group_id from triage_metadata.models"
    )
    group_ids = [row[0] for row in result]
    assert len(group_ids) == 1
    assert group_ids[0] == model_ids[0]
def __init__(
    self,
    config,
    db_engine,
    model_storage_class=FSModelStorageEngine,
    project_path=None,
    replace=True,
    cleanup=False,
    cleanup_timeout=None,
):
    """Prepare the experiment: database, storage, experiment hash, components.

    Args:
        config: experiment configuration. Its version is checked first and
            it is later passed to ``save_experiment_and_get_hash``.
        db_engine: database engine. A raw SQLAlchemy ``Engine`` is replaced
            by one rebuilt from its URL (a warning is logged).
        model_storage_class: callable accepting ``project_path=`` that builds
            the model storage engine; skipped when falsy.
        project_path: base path. When truthy, a ``matrices`` directory is
            created beneath it.
        replace: stored on ``self.replace``.
        cleanup: stored on ``self.cleanup``; only logged in this method.
        cleanup_timeout: when not ``None``, overrides the value already
            available as ``self.cleanup_timeout``.
    """
    self._check_config_version(config)
    self.config = config

    if isinstance(db_engine, Engine):
        logging.warning(
            'Raw, unserializable SQLAlchemy engine passed. URL will be used, other options may be lost in multi-process environments'
        )
        self.db_engine = create_engine(db_engine.url)
    else:
        self.db_engine = db_engine

    if model_storage_class:
        self.model_storage_engine = model_storage_class(
            project_path=project_path)
    self.matrix_store_class = CSVMatrixStore  # can't be configurable until Architect obeys
    self.project_path = project_path
    self.replace = replace
    ensure_db(self.db_engine)

    self.features_schema_name = 'features'
    if project_path:
        self.matrices_directory = os.path.join(self.project_path, 'matrices')
        # exist_ok avoids the check-then-create race when several processes
        # start against the same project path at the same time.
        os.makedirs(self.matrices_directory, exist_ok=True)

    self.experiment_hash = save_experiment_and_get_hash(
        self.config, self.db_engine)
    self.labels_table_name = 'labels_{}'.format(self.experiment_hash)
    self.initialize_components()

    self.cleanup = cleanup
    if self.cleanup:
        logging.info(
            'cleanup is set to True, so intermediate tables (labels and states) will be removed after matrix creation'
        )
    else:
        logging.info(
            'cleanup is set to False, so intermediate tables (labels and states) will not be removed after matrix creation'
        )
    # Falls back to the attribute already reachable through self
    # (presumably a class-level default; it is not defined in this method).
    self.cleanup_timeout = (self.cleanup_timeout
                            if cleanup_timeout is None else cleanup_timeout)
def __init__(
    self,
    config,
    db_engine,
    project_path=None,
    matrix_storage_class=CSVMatrixStore,
    replace=True,
    cleanup=False,
    cleanup_timeout=None,
    materialize_subquery_fromobjs=True,
    features_ignore_cohort=False,
    profile=False,
    save_predictions=True,
):
    """Wire up storage, database and components for an experiment.

    The config version is checked, project/model/matrix storage objects are
    built from ``project_path``, the database is upgraded, the experiment is
    saved to obtain its hash, and ``initialize_components`` is called. The
    remaining arguments are stored as same-named attributes.
    """
    self._check_config_version(config)
    self.config = config

    # Storage layers all hang off a single ProjectStorage.
    self.project_storage = ProjectStorage(project_path)
    self.model_storage_engine = ModelStorageEngine(self.project_storage)
    self.matrix_storage_engine = MatrixStorageEngine(
        self.project_storage,
        matrix_storage_class,
    )
    self.project_path = project_path
    self.replace = replace
    self.save_predictions = save_predictions

    self.db_engine = db_engine
    upgrade_db(db_engine=self.db_engine)

    self.features_schema_name = "features"
    self.materialize_subquery_fromobjs = materialize_subquery_fromobjs
    self.features_ignore_cohort = features_ignore_cohort

    self.experiment_hash = save_experiment_and_get_hash(
        self.config,
        self.db_engine,
    )
    self.initialize_components()

    self.cleanup = cleanup
    if self.cleanup:
        cleanup_message = (
            "cleanup is set to True, so intermediate tables (labels and cohort) "
            "will be removed after matrix creation and subset tables will be "
            "removed after model training and testing"
        )
    else:
        cleanup_message = (
            "cleanup is set to False, so intermediate tables (labels, cohort, and subsets) "
            "will not be removed"
        )
    logging.info(cleanup_message)

    if cleanup_timeout is None:
        # Keep whatever value is already reachable as self.cleanup_timeout,
        # but pin it onto the instance as the original assignment does.
        self.cleanup_timeout = self.cleanup_timeout
    else:
        self.cleanup_timeout = cleanup_timeout

    self.profile = profile
    logging.info("Generate profiling stats? (profile option): %s", self.profile)
def __init__(
    self,
    config,
    db_engine,
    project_path=None,
    matrix_storage_class=CSVMatrixStore,
    replace=True,
    cleanup=False,
    cleanup_timeout=None,
):
    """Wire up database, storage and components for an experiment.

    A raw SQLAlchemy ``Engine`` is swapped for one rebuilt from its URL;
    any other engine object is used as given. Storage objects are derived
    from ``project_path``, the database is upgraded, and the experiment is
    saved to obtain the hash that also names the labels table.
    """
    self._check_config_version(config)
    self.config = config

    if not isinstance(db_engine, Engine):
        self.db_engine = db_engine
    else:
        logging.warning(
            "Raw, unserializable SQLAlchemy engine passed. "
            "URL will be used, other options may be lost in multi-process environments"
        )
        self.db_engine = create_engine(db_engine.url)

    self.project_storage = ProjectStorage(project_path)
    self.model_storage_engine = ModelStorageEngine(self.project_storage)
    self.matrix_storage_engine = MatrixStorageEngine(
        self.project_storage,
        matrix_storage_class,
    )
    self.project_path = project_path
    self.replace = replace

    upgrade_db(db_engine=self.db_engine)
    self.features_schema_name = "features"

    self.experiment_hash = save_experiment_and_get_hash(
        self.config,
        self.db_engine,
    )
    self.labels_table_name = "labels_{}".format(self.experiment_hash)
    self.initialize_components()

    self.cleanup = cleanup
    if self.cleanup:
        cleanup_message = (
            "cleanup is set to True, so intermediate tables (labels and states) "
            "will be removed after matrix creation"
        )
    else:
        cleanup_message = (
            "cleanup is set to False, so intermediate tables (labels and states) "
            "will not be removed after matrix creation"
        )
    logging.info(cleanup_message)

    if cleanup_timeout is None:
        # Keep the value already reachable as self.cleanup_timeout, pinned
        # onto the instance as the original assignment does.
        self.cleanup_timeout = self.cleanup_timeout
    else:
        self.cleanup_timeout = cleanup_timeout
def test_ModelTrainTester_generate_tasks(
    db_engine_with_results_schema,
    project_storage,
    sample_timechop_splits,
    sample_grid_config,
):
    """generate_task_batches yields one process_task-compatible task per split/classifier pair."""
    db_engine = db_engine_with_results_schema
    model_storage_engine = ModelStorageEngine(project_storage)
    matrix_storage_engine = MatrixStorageEngine(project_storage)
    sample_matrix_store = get_matrix_store(project_storage)

    experiment_hash = save_experiment_and_get_hash({}, db_engine)
    run_id = initialize_tracking_and_get_run_id(
        experiment_hash,
        experiment_class_path="",
        random_seed=5,
        experiment_kwargs={},
        db_engine=db_engine_with_results_schema,
    )

    # Only the trainer is real; every other collaborator is left as None.
    trainer = ModelTrainer(
        experiment_hash=experiment_hash,
        model_storage_engine=model_storage_engine,
        db_engine=db_engine,
        run_id=run_id,
    )
    train_tester = ModelTrainTester(
        matrix_storage_engine=matrix_storage_engine,
        model_trainer=trainer,
        model_evaluator=None,
        individual_importance_calculator=None,
        predictor=None,
        subsets=None,
        protected_groups_generator=None,
    )

    with patch.object(
        matrix_storage_engine, 'get_store', return_value=sample_matrix_store
    ):
        batches = train_tester.generate_task_batches(
            splits=sample_timechop_splits,
            grid_config=sample_grid_config,
        )
        assert len(batches) == 3

        # One task is expected for each (split, classifier configuration) pair.
        flattened_tasks = [task for batch in batches for task in batch.tasks]
        classifier_count = len(list(flatten_grid_config(sample_grid_config)))
        assert len(flattened_tasks) == len(sample_timechop_splits) * classifier_count

        # Each task must be usable as keyword arguments for process_task;
        # autospec makes the mock enforce the real call signature.
        with patch.object(train_tester, 'process_task', autospec=True):
            for task in flattened_tasks:
                train_tester.process_task(**task)
def test_missing_matrix_uuids():
    """Matrix uuids tied to an experiment count as missing until a matrix row exists."""
    with testing.postgresql.Postgresql() as postgresql:
        db_engine = create_engine(postgresql.url())
        ensure_db(db_engine)
        experiment_hash = save_experiment_and_get_hash({}, 1234, db_engine)
        matrix_uuids = ['abcd', 'bcde', 'cdef']

        # Associated with the experiment but never built: all are missing.
        associate_matrices_with_experiment(experiment_hash, matrix_uuids, db_engine)
        assert missing_matrix_uuids(experiment_hash, db_engine) == matrix_uuids

        # Once a matrix row is inserted, that uuid drops out of the missing list.
        built_uuid, *still_missing = matrix_uuids
        db_engine.execute(
            f"insert into {Matrix.__table__.fullname} (matrix_uuid) values (%s)",
            built_uuid,
        )
        assert missing_matrix_uuids(experiment_hash, db_engine) == still_missing
def test_missing_model_hashes():
    """Model hashes tied to an experiment count as missing until a model row exists."""
    with testing.postgresql.Postgresql() as postgresql:
        db_engine = create_engine(postgresql.url())
        ensure_db(db_engine)
        experiment_hash = save_experiment_and_get_hash({}, 1234, db_engine)
        model_hashes = ['abcd', 'bcde', 'cdef']

        # Associated with the experiment but never trained: all are missing.
        associate_models_with_experiment(experiment_hash, model_hashes, db_engine)
        assert missing_model_hashes(experiment_hash, db_engine) == model_hashes

        # Once a model row is inserted, that hash drops out of the missing list.
        trained_hash, *still_missing = model_hashes
        db_engine.execute(
            f"insert into {Model.__table__.fullname} (model_hash) values (%s)",
            trained_hash,
        )
        assert missing_model_hashes(experiment_hash, db_engine) == still_missing
def default_model_trainer(db_engine_with_results_schema, project_storage):
    """Yield a ModelTrainer bound to a freshly saved experiment and tracked run.

    The trainer is yielded rather than returned, so this is a generator
    (presumably registered as a pytest fixture; the decorator is not visible here).
    """
    db_engine = db_engine_with_results_schema
    model_storage_engine = project_storage.model_storage_engine()

    experiment_hash = save_experiment_and_get_hash(
        config={'foo': 'bar'},
        db_engine=db_engine,
    )
    run_id = initialize_tracking_and_get_run_id(
        experiment_hash,
        experiment_class_path="",
        random_seed=5,
        experiment_kwargs={},
        db_engine=db_engine,
    )

    yield ModelTrainer(
        experiment_hash=experiment_hash,
        model_storage_engine=model_storage_engine,
        db_engine=db_engine,
        model_grouper=ModelGrouper(),
        run_id=run_id,
    )
def __init__(
    self,
    config,
    db_engine,
    model_storage_class=None,
    project_path=None,
    replace=True,
    cleanup_timeout=None,
):
    """Prepare the experiment: database, storage, experiment hash, components.

    Args:
        config: experiment configuration. Its version is checked first and
            it is later passed to ``save_experiment_and_get_hash``.
        db_engine: database engine, stored as given.
        model_storage_class: callable accepting ``project_path=`` that builds
            the model storage engine; skipped when falsy (the default).
        project_path: base path. When truthy, a ``matrices`` directory is
            created beneath it.
        replace: stored on ``self.replace``.
        cleanup_timeout: when not ``None``, overrides the value already
            available as ``self.cleanup_timeout``.
    """
    self._check_config_version(config)
    self.config = config
    self.db_engine = db_engine

    if model_storage_class:
        self.model_storage_engine = model_storage_class(
            project_path=project_path)
    self.matrix_store_class = CSVMatrixStore  # can't be configurable until Architect obeys
    self.project_path = project_path
    self.replace = replace
    ensure_db(self.db_engine)

    self.labels_table_name = 'labels'
    self.features_schema_name = 'features'
    if project_path:
        self.matrices_directory = os.path.join(self.project_path, 'matrices')
        # exist_ok avoids the check-then-create race when several processes
        # start against the same project path at the same time.
        os.makedirs(self.matrices_directory, exist_ok=True)

    self.experiment_hash = save_experiment_and_get_hash(
        self.config, self.db_engine)
    self.initialize_factories()
    self.initialize_components()

    # Falls back to the attribute already reachable through self
    # (presumably a class-level default; it is not defined in this method).
    self.cleanup_timeout = (self.cleanup_timeout
                            if cleanup_timeout is None else cleanup_timeout)
def __init__(
    self,
    config,
    db_engine,
    project_path=None,
    matrix_storage_class=CSVMatrixStore,
    replace=True,
    cleanup=False,
    cleanup_timeout=None,
    materialize_subquery_fromobjs=True,
    features_ignore_cohort=False,
    profile=False,
    save_predictions=True,
    skip_validation=False,
    partial_run=False,
):
    """Prepare the experiment: seed, storage, config defaults, hash and run tracking.

    Args:
        config: experiment configuration dict. It is mutated in place:
            ``random_seed`` is filled in when absent and, for full runs,
            temporal/cohort/feature-group/grid defaults are written back.
        db_engine: database engine; its ``url`` is passed to
            ``results_schema.upgrade_if_clean``.
        project_path: passed to ``ProjectStorage``.
        matrix_storage_class: passed to ``MatrixStorageEngine``.
        replace, save_predictions, materialize_subquery_fromobjs,
        features_ignore_cohort, profile: stored as same-named attributes.
        cleanup: stored on ``self.cleanup``; forced to ``False`` for
            partial runs.
        cleanup_timeout: when not ``None``, overrides the value already
            available as ``self.cleanup_timeout``.
        skip_validation: stored on ``self.skip_validation``; forced to
            ``True`` for partial runs.
        partial_run: when true, config defaults are not filled in.
    """
    # For a partial run, skip validation and avoid cleaning up
    # we'll also skip filling default config values below
    if partial_run:
        cleanup = False
        skip_validation = True

    # NOTE: this snapshot of locals() must stay ahead of any new local
    # variable, otherwise that variable would leak into the recorded kwargs.
    experiment_kwargs = bind_kwargs(
        self.__class__,
        **{
            key: value
            for (key, value) in locals().items()
            if key not in {'db_engine', 'config', 'self'}
        })

    self._check_config_version(config)
    self.config = config

    # The upper bound must be an int: random.randint rejects float bounds
    # (deprecated since Python 3.10, TypeError from 3.12), and the default
    # expression is evaluated even when the config already has a seed.
    self.config['random_seed'] = self.config.get(
        'random_seed', random.randint(1, 10 ** 7))
    random.seed(self.config['random_seed'])

    self.project_storage = ProjectStorage(project_path)
    self.model_storage_engine = ModelStorageEngine(self.project_storage)
    self.matrix_storage_engine = MatrixStorageEngine(
        self.project_storage, matrix_storage_class)
    self.project_path = project_path
    self.replace = replace
    self.save_predictions = save_predictions
    self.skip_validation = skip_validation
    self.db_engine = db_engine
    results_schema.upgrade_if_clean(dburl=self.db_engine.url)

    self.features_schema_name = "features"
    self.materialize_subquery_fromobjs = materialize_subquery_fromobjs
    self.features_ignore_cohort = features_ignore_cohort

    # only fill default values for full runs
    if not partial_run:
        ## Defaults to sane values
        self.config['temporal_config'] = fill_timechop_config_missing(
            self.config, self.db_engine)
        ## Defaults to all the entities found in the features_aggregation's from_obj
        self.config['cohort_config'] = fill_cohort_config_missing(
            self.config)
        ## Defaults to all the feature_aggregation's prefixes
        self.config['feature_group_definition'] = fill_feature_group_definition(
            self.config)

        grid_config = fill_model_grid_presets(self.config)
        self.config.pop('model_grid_preset', None)
        if grid_config is not None:
            self.config['grid_config'] = grid_config

    ###################### RUBICON ######################
    # From here on the config is final: it is hashed and the run is tracked.
    self.experiment_hash = save_experiment_and_get_hash(
        self.config, self.db_engine)
    self.run_id = initialize_tracking_and_get_run_id(
        self.experiment_hash,
        experiment_class_path=classpath(self.__class__),
        experiment_kwargs=experiment_kwargs,
        db_engine=self.db_engine)
    self.initialize_components()

    self.cleanup = cleanup
    if self.cleanup:
        logging.info(
            "cleanup is set to True, so intermediate tables (labels and cohort) "
            "will be removed after matrix creation and subset tables will be "
            "removed after model training and testing")
    else:
        logging.info(
            "cleanup is set to False, so intermediate tables (labels, cohort, and subsets) "
            "will not be removed")
    # Falls back to the attribute already reachable through self
    # (presumably a class-level default; it is not defined in this method).
    self.cleanup_timeout = (self.cleanup_timeout
                            if cleanup_timeout is None else cleanup_timeout)

    self.profile = profile
    logging.info("Generate profiling stats? (profile option): %s", self.profile)
def test_integration():
    """Train, predict and evaluate end to end, then check the stored rows."""
    with testing.postgresql.Postgresql() as postgresql:
        db_engine = create_engine(postgresql.url())
        ensure_db(db_engine)
        init_engine(db_engine)

        with mock_s3():
            s3_conn = boto3.resource('s3')
            s3_conn.create_bucket(Bucket='econ-dev')
            project_path = 'econ-dev/inspections'

            # Build the train matrix and the test matrices.
            train_frame = pandas.DataFrame.from_dict({
                'entity_id': [1, 2],
                'feature_one': [3, 4],
                'feature_two': [5, 6],
                'label': [7, 8],
            })
            train_matrix = train_frame.set_index('entity_id')
            # Not referenced again below: the train store is built from
            # sample_metadata() instead.
            train_metadata = {
                'feature_start_time': datetime.date(2012, 12, 20),
                'end_time': datetime.date(2016, 12, 20),
                'label_name': 'label',
                'label_timespan': '1y',
                'feature_names': ['ft1', 'ft2'],
                'metta-uuid': '1234',
                'indices': ['entity_id'],
                'matrix_type': 'train',
            }

            # Creates a matrix entry in the matrices table with uuid from train_metadata
            MatrixFactory(matrix_uuid="1234")
            session.commit()

            train_store = InMemoryMatrixStore(train_matrix, sample_metadata())

            as_of_dates = [
                datetime.date(2016, 12, 21),
                datetime.date(2017, 1, 21),
            ]
            test_stores = []
            for as_of_date in as_of_dates:
                test_frame = pandas.DataFrame.from_dict({
                    'entity_id': [3],
                    'feature_one': [8],
                    'feature_two': [5],
                    'label': [5],
                })
                test_metadata = {
                    'label_name': 'label',
                    'label_timespan': '1y',
                    'end_time': as_of_date,
                    'metta-uuid': '1234',
                    'indices': ['entity_id'],
                    'matrix_type': 'test',
                    'as_of_date_frequency': '1month',
                }
                test_stores.append(InMemoryMatrixStore(test_frame, test_metadata))

            model_storage_engine = S3ModelStorageEngine(project_path)
            experiment_hash = save_experiment_and_get_hash({}, db_engine)

            # Instantiate the pipeline objects.
            trainer = ModelTrainer(
                project_path=project_path,
                experiment_hash=experiment_hash,
                model_storage_engine=model_storage_engine,
                db_engine=db_engine,
            )
            predictor = Predictor(project_path, model_storage_engine, db_engine)
            metric_groups = [{
                'metrics': ['precision@'],
                'thresholds': {'top_n': [5]},
            }]
            model_evaluator = ModelEvaluator(metric_groups, [{}], db_engine)

            # Run the pipeline.
            grid_config = {
                'sklearn.linear_model.LogisticRegression': {
                    'C': [0.00001, 0.0001],
                    'penalty': ['l1', 'l2'],
                    'random_state': [2193],
                }
            }
            model_ids = trainer.train_models(
                grid_config=grid_config,
                misc_db_parameters=dict(),
                matrix_store=train_store,
            )
            for model_id in model_ids:
                for as_of_date, test_store in zip(as_of_dates, test_stores):
                    predictions_proba = predictor.predict(
                        model_id,
                        test_store,
                        misc_db_parameters=dict(),
                        train_matrix_columns=['feature_one', 'feature_two'],
                    )
                    model_evaluator.evaluate(
                        predictions_proba,
                        test_store,
                        model_id,
                    )

            expected_times = (
                datetime.datetime(2016, 12, 21),
                datetime.datetime(2017, 1, 21),
            )
            expected_model_ids = (1, 2, 3, 4)

            # 1. Prediction rows exist and join back to the models table.
            prediction_rows = list(db_engine.execute(
                'select entity_id, model_id, as_of_date '
                'from test_results.test_predictions '
                'join model_metadata.models using (model_id) '
                'order by 3, 2'
            ))
            assert prediction_rows == [
                (3, expected_id, when)
                for when in expected_times
                for expected_id in expected_model_ids
            ]

            # 2. Evaluation rows exist for every model and date.
            evaluation_rows = list(db_engine.execute(
                ' select model_id, evaluation_start_time, metric, parameter '
                'from test_results.test_evaluations '
                'order by 2, 1'
            ))
            assert evaluation_rows == [
                (expected_id, when, 'precision@', '5_abs')
                for when in expected_times
                for expected_id in expected_model_ids
            ]
def test_integration():
    """Train, predict and evaluate end to end, then check the stored rows."""
    with rig_engines() as (db_engine, project_storage):
        train_store = get_matrix_store(
            project_storage,
            matrix_creator(),
            matrix_metadata_creator(matrix_type='train'),
        )

        as_of_dates = [datetime.date(2016, 12, 21), datetime.date(2017, 1, 21)]

        # One single-row test matrix per as-of date.
        test_stores = [
            get_matrix_store(
                project_storage,
                pandas.DataFrame.from_dict({
                    'entity_id': [3],
                    'feature_one': [8],
                    'feature_two': [5],
                    'label': [0],
                }).set_index('entity_id'),
                matrix_metadata_creator(end_time=as_of_date, indices=['entity_id']),
            )
            for as_of_date in as_of_dates
        ]

        model_storage_engine = ModelStorageEngine(project_storage)
        experiment_hash = save_experiment_and_get_hash({}, db_engine)

        # Instantiate the pipeline objects.
        trainer = ModelTrainer(
            experiment_hash=experiment_hash,
            model_storage_engine=model_storage_engine,
            db_engine=db_engine,
        )
        predictor = Predictor(model_storage_engine, db_engine)
        metric_groups = [{
            'metrics': ['precision@'],
            'thresholds': {'top_n': [5]},
        }]
        model_evaluator = ModelEvaluator(metric_groups, [{}], db_engine)

        # Run the pipeline.
        grid_config = {
            'sklearn.linear_model.LogisticRegression': {
                'C': [0.00001, 0.0001],
                'penalty': ['l1', 'l2'],
                'random_state': [2193],
            }
        }
        model_ids = trainer.train_models(
            grid_config=grid_config,
            misc_db_parameters=dict(),
            matrix_store=train_store,
        )
        for model_id in model_ids:
            for as_of_date, test_store in zip(as_of_dates, test_stores):
                predictions_proba = predictor.predict(
                    model_id,
                    test_store,
                    misc_db_parameters=dict(),
                    train_matrix_columns=['feature_one', 'feature_two'],
                )
                model_evaluator.evaluate(
                    predictions_proba,
                    test_store,
                    model_id,
                )

        expected_times = (
            datetime.datetime(2016, 12, 21),
            datetime.datetime(2017, 1, 21),
        )
        expected_model_ids = (1, 2, 3, 4)

        # 1. Prediction rows exist and join back to the models table.
        prediction_rows = list(db_engine.execute(
            'select entity_id, model_id, as_of_date '
            'from test_results.predictions '
            'join model_metadata.models using (model_id) '
            'order by 3, 2'
        ))
        assert prediction_rows == [
            (3, expected_id, when)
            for when in expected_times
            for expected_id in expected_model_ids
        ]

        # 2. Evaluation rows exist for every model and date.
        evaluation_rows = list(db_engine.execute(
            ' select model_id, evaluation_start_time, metric, parameter '
            'from test_results.evaluations '
            'order by 2, 1'
        ))
        assert evaluation_rows == [
            (expected_id, when, 'precision@', '5_abs')
            for when in expected_times
            for expected_id in expected_model_ids
        ]
def __init__(
    self,
    config,
    db_engine,
    project_path=None,
    matrix_storage_class=CSVMatrixStore,
    replace=True,
    cleanup=False,
    cleanup_timeout=None,
    materialize_subquery_fromobjs=True,
    features_ignore_cohort=False,
    additional_bigtrain_classnames=None,
    profile=False,
    save_predictions=True,
    skip_validation=False,
    partial_run=False,
):
    """Prepare the experiment: storage, config defaults, seed, hash and run tracking.

    Args:
        config: experiment configuration dict. It is mutated in place:
            cohort/label configs go through ``load_query_if_needed``,
            defaults are filled for full runs, and ``random_seed`` is
            popped out before the config is hashed.
        db_engine: database engine; its ``url`` is passed to
            ``results_schema.upgrade_if_clean``.
        project_path: passed to ``ProjectStorage``.
        matrix_storage_class: passed to ``MatrixStorageEngine``.
        replace, save_predictions, materialize_subquery_fromobjs,
        features_ignore_cohort, additional_bigtrain_classnames, profile:
            stored as same-named attributes (some trigger a log notice).
        cleanup: stored on ``self.cleanup``; forced to ``False`` for
            partial runs.
        cleanup_timeout: when not ``None``, overrides the value already
            available as ``self.cleanup_timeout``.
        skip_validation: stored on ``self.skip_validation``; forced to
            ``True`` for partial runs.
        partial_run: when true, config defaults are not filled in.
    """
    # For a partial run, skip validation and avoid cleaning up
    # we'll also skip filling default config values below
    if partial_run:
        cleanup = False
        skip_validation = True

    # NOTE: this snapshot of locals() must stay ahead of any new local
    # variable, otherwise that variable would leak into the recorded kwargs.
    experiment_kwargs = bind_kwargs(
        self.__class__,
        **{
            key: value
            for (key, value) in locals().items()
            if key not in {"db_engine", "config", "self"}
        },
    )

    self._check_config_version(config)
    self.config = config

    if self.config.get("cohort_config") is not None:
        self.config["cohort_config"] = load_query_if_needed(
            self.config["cohort_config"]
        )
    if self.config.get("label_config") is not None:
        self.config["label_config"] = load_query_if_needed(
            self.config["label_config"]
        )

    self.project_storage = ProjectStorage(project_path)
    self.model_storage_engine = ModelStorageEngine(self.project_storage)
    self.matrix_storage_engine = MatrixStorageEngine(
        self.project_storage, matrix_storage_class
    )
    self.project_path = project_path
    logger.verbose(
        f"Matrices and trained models will be saved in {self.project_path}"
    )

    self.replace = replace
    if self.replace:
        logger.notice(
            "Replace flag is set to true. Matrices, models, "
            "evaluations and predictions (if they exist) will be replaced"
        )

    self.save_predictions = save_predictions
    if not self.save_predictions:
        logger.notice(
            "Save predictions flag is set to false. "
            "Individual predictions won't be stored in the predictions "
            "table. This will decrease both the running time "
            "of an experiment and also decrease the space needed in the db"
        )

    self.skip_validation = skip_validation
    if self.skip_validation:
        # The space after "could fail" was missing, gluing two words together.
        logger.notice(
            "Warning: Skip validation flag is set to true. "
            "The experiment config file specified won't be validated. "
            "This will reduce (a little) the running time of the experiment, "
            "but has some potential risks, e.g. the experiment could fail "
            "after some time due to some misconfiguration. Proceed with care."
        )

    self.db_engine = db_engine
    results_schema.upgrade_if_clean(dburl=self.db_engine.url)

    self.features_schema_name = "features"

    self.materialize_subquery_fromobjs = materialize_subquery_fromobjs
    if not self.materialize_subquery_fromobjs:
        logger.notice(
            "Materialize from_objs is set to false. "
            "The from_objs will be calculated on the fly every time."
        )

    self.features_ignore_cohort = features_ignore_cohort
    if self.features_ignore_cohort:
        logger.notice(
            "Features will be calculated for all the entities "
            "(i.e. ignoring cohort) this setting will have the effect "
            "that more db space will be used, but potentially could save "
            "time if you are running several similar experiments with "
            "different cohorts."
        )

    self.additional_bigtrain_classnames = additional_bigtrain_classnames

    # only fill default values for full runs
    if not partial_run:
        ## Defaults to sane values
        self.config["temporal_config"] = fill_timechop_config_missing(
            self.config, self.db_engine
        )
        ## Defaults to all the entities found in the features_aggregation's from_obj
        self.config["cohort_config"] = fill_cohort_config_missing(self.config)
        ## Defaults to all the feature_aggregation's prefixes
        self.config["feature_group_definition"] = fill_feature_group_definition(
            self.config
        )

    grid_config = fill_model_grid_presets(self.config)
    self.config.pop("model_grid_preset", None)
    if grid_config is not None:
        self.config["grid_config"] = grid_config

    if not self.config.get("random_seed", None):
        logger.notice(
            "Random seed not specified. A random seed will be provided. "
            "This could have interesting side effects, "
            "e.g. new models per model group are trained, "
            "tested and evaluated everytime that you run this experiment configuration"
        )

    # The seed is popped so it does not take part in the experiment hash.
    # The upper bound must be an int: random.randint rejects float bounds
    # (deprecated since Python 3.10, TypeError from 3.12), and the default
    # expression is evaluated even when the config already has a seed.
    self.random_seed = self.config.pop("random_seed", random.randint(1, 10 ** 7))
    logger.verbose(
        f"Using random seed [{self.random_seed}] for running the experiment"
    )
    random.seed(self.random_seed)

    ###################### RUBICON ######################
    # From here on the config is final: it is hashed and the run is tracked.
    self.experiment_hash = save_experiment_and_get_hash(self.config, self.db_engine)
    logger.debug(f"Experiment hash [{self.experiment_hash}] assigned")
    self.run_id = initialize_tracking_and_get_run_id(
        self.experiment_hash,
        experiment_class_path=classpath(self.__class__),
        random_seed=self.random_seed,
        experiment_kwargs=experiment_kwargs,
        db_engine=self.db_engine,
    )
    logger.debug(f"Experiment run id [{self.run_id}] assigned")

    self.initialize_components()

    self.cleanup = cleanup
    if self.cleanup:
        logger.notice(
            "Cleanup is set to true, so intermediate tables (labels and cohort) "
            "will be removed after matrix creation and subset tables will be "
            "removed after model training and testing"
        )

    # Falls back to the attribute already reachable through self
    # (presumably a class-level default; it is not defined in this method).
    self.cleanup_timeout = (
        self.cleanup_timeout if cleanup_timeout is None else cleanup_timeout
    )

    self.profile = profile
    if self.profile:
        logger.spam("Profiling will be stored using cProfile")
def test_reuse_model_random_seeds(grid_config, default_model_trainer):
    """Model seeds are reused across experiments sharing a seed, not otherwise."""
    # Local import: only this test needs it.
    import copy

    trainer = default_model_trainer
    db_engine = trainer.db_engine
    project_storage = trainer.model_storage_engine.project_storage
    model_storage_engine = trainer.model_storage_engine

    # re-using the random seeds requires the association between experiments and models
    # to exist, which we're not getting in these tests since we aren't using the experiment
    # architecture, so back-fill these associations after each train_models() run
    def update_experiment_models(db_engine):
        sql = (
            " INSERT INTO triage_metadata.experiment_models(experiment_hash,model_hash) "
            "SELECT er.run_hash, m.model_hash "
            "FROM triage_metadata.models m "
            "LEFT JOIN triage_metadata.triage_runs er ON m.built_in_triage_run = er.id "
            "LEFT JOIN triage_metadata.experiment_models em "
            "ON m.model_hash = em.model_hash AND er.run_hash = em.experiment_hash "
            "WHERE em.experiment_hash IS NULL "
        )
        db_engine.execute(sql)
        db_engine.execute('COMMIT;')

    random.seed(5)
    model_ids = trainer.train_models(
        grid_config=grid_config,
        misc_db_parameters=dict(),
        matrix_store=get_matrix_store(project_storage),
    )
    update_experiment_models(db_engine)

    # simulate running a new experiment where the experiment hash has changed
    # (e.g. because the model grid is different), but experiment seed is the
    # same, so previously-trained models should not get new seeds
    experiment_hash = save_experiment_and_get_hash(
        config={'baz': 'qux'},
        db_engine=db_engine
    )
    run_id = initialize_tracking_and_get_run_id(
        experiment_hash,
        experiment_class_path="",
        random_seed=5,
        experiment_kwargs={},
        db_engine=db_engine
    )
    trainer = ModelTrainer(
        experiment_hash=experiment_hash,
        model_storage_engine=model_storage_engine,
        db_engine=db_engine,
        model_grouper=ModelGrouper(),
        run_id=run_id,
    )
    # Deep copy: the nested per-classifier dict is edited below, and a
    # shallow .copy() would write through to the shared grid_config fixture.
    new_grid = copy.deepcopy(grid_config)
    new_grid['sklearn.tree.DecisionTreeClassifier']['min_samples_split'] = [3, 10, 100]
    random.seed(5)
    new_model_ids = trainer.train_models(
        grid_config=new_grid,
        misc_db_parameters=dict(),
        matrix_store=get_matrix_store(project_storage),
    )
    update_experiment_models(db_engine)

    # should have received 6 models
    assert len(new_model_ids) == 6

    # all the original model ids should be in the new set
    assert len(set(new_model_ids) & set(model_ids)) == len(model_ids)

    # however, we should NOT re-use the random seeds (and so get new model_ids)
    # if the experiment-level seed is different
    experiment_hash = save_experiment_and_get_hash(
        config={'lorem': 'ipsum'},
        db_engine=db_engine
    )
    run_id = initialize_tracking_and_get_run_id(
        experiment_hash,
        experiment_class_path="",
        random_seed=42,
        experiment_kwargs={},
        db_engine=db_engine
    )
    trainer = ModelTrainer(
        experiment_hash=experiment_hash,
        model_storage_engine=model_storage_engine,
        db_engine=db_engine,
        model_grouper=ModelGrouper(),
        run_id=run_id,
    )
    random.seed(42)  # different from above
    newer_model_ids = trainer.train_models(
        grid_config=new_grid,
        misc_db_parameters=dict(),
        matrix_store=get_matrix_store(project_storage),
    )
    update_experiment_models(db_engine)

    # should get entirely new models now (different IDs)
    assert len(newer_model_ids) == 6
    assert len(set(new_model_ids) & set(newer_model_ids)) == 0
def test_model_trainer(grid_config, default_model_trainer):
    """Walk ModelTrainer through training, re-training, replacement and cache loss."""
    trainer = default_model_trainer
    db_engine = trainer.db_engine
    model_storage_engine = trainer.model_storage_engine
    project_storage = model_storage_engine.project_storage

    def set_test_seed():
        random.seed(5)

    def fetch_all(query):
        # Materialize every row returned by the query.
        return list(db_engine.execute(query))

    set_test_seed()
    model_ids = trainer.train_models(
        grid_config=grid_config,
        misc_db_parameters=dict(),
        matrix_store=get_matrix_store(project_storage),
    )

    # 1. The feature importances and models tables are populated.
    importance_rows = fetch_all("select * from train_results.feature_importances")
    assert len(importance_rows) == 4 * 2  # maybe exclude entity_id? yes

    hash_rows = fetch_all("select model_hash from triage_metadata.models")
    assert len(hash_rows) == 4
    hashes = [row[0] for row in hash_rows]

    # 2. The model groups are distinct.
    group_rows = fetch_all(
        "select distinct model_group_id from triage_metadata.models"
    )
    assert len(group_rows) == 4

    # 3. The random seeds are distinct.
    seed_rows = fetch_all("select distinct random_seed from triage_metadata.models")
    assert len(seed_rows) == 4

    # 4. A model size is recorded for every model and each one is below 1.
    size_rows = fetch_all("select model_size from triage_metadata.models")
    assert len(size_rows) == 4
    for size_row in size_rows:
        assert size_row[0] < 1

    # 5. All four models can be loaded back from storage.
    model_pickles = [model_storage_engine.load(model_hash) for model_hash in hashes]
    assert len(model_pickles) == 4
    assert len([loaded for loaded in model_pickles if loaded is not None]) == 4

    # 6. The loaded models can produce predictions.
    test_matrix = pd.DataFrame.from_dict(
        {"entity_id": [3, 4], "feature_one": [4, 4], "feature_two": [6, 5]}
    ).set_index("entity_id")
    for model_pickle in model_pickles:
        predictions = model_pickle.predict(test_matrix)
        assert len(predictions) == 2

    # 7. Re-running with the same starting seed returns the same models.
    set_test_seed()
    new_model_ids = trainer.train_models(
        grid_config=grid_config,
        misc_db_parameters=dict(),
        matrix_store=get_matrix_store(project_storage),
    )
    assert len(fetch_all("select model_hash from triage_metadata.models")) == 4
    assert model_ids == new_model_ids

    # 8. With replace set, non-unique attributes and feature importances are updated.
    max_batch_run_time = fetch_all(
        "select max(batch_run_time) from triage_metadata.models"
    )[0][0]
    experiment_hash = save_experiment_and_get_hash(
        config={'foo': 'bar'},
        db_engine=db_engine,
    )
    run_id = initialize_tracking_and_get_run_id(
        experiment_hash,
        experiment_class_path="",
        random_seed=5,
        experiment_kwargs={},
        db_engine=db_engine,
    )
    trainer = ModelTrainer(
        experiment_hash=experiment_hash,
        model_storage_engine=model_storage_engine,
        model_grouper=ModelGrouper(
            model_group_keys=["label_name", "label_timespan"]
        ),
        db_engine=db_engine,
        replace=True,
        run_id=run_id,
    )
    set_test_seed()
    new_model_ids = trainer.train_models(
        grid_config=grid_config,
        misc_db_parameters=dict(),
        matrix_store=get_matrix_store(project_storage),
    )
    assert model_ids == new_model_ids

    stored_model_ids = [
        row["model_id"]
        for row in db_engine.execute(
            "select model_id from triage_metadata.models order by 1 asc"
        )
    ]
    assert stored_model_ids == model_ids

    new_max_batch_run_time = fetch_all(
        "select max(batch_run_time) from triage_metadata.models"
    )[0][0]
    assert new_max_batch_run_time > max_batch_run_time

    importance_rows = fetch_all("select * from train_results.feature_importances")
    assert len(importance_rows) == 4 * 2  # maybe exclude entity_id? yes

    # 9. If the stored model is gone but the metadata remains, the metadata is reused.
    set_test_seed()
    for row in db_engine.execute("select model_hash from triage_metadata.models"):
        model_storage_engine.delete(row[0])
    new_model_ids = trainer.train_models(
        grid_config=grid_config,
        misc_db_parameters=dict(),
        matrix_store=get_matrix_store(project_storage),
    )
    assert model_ids == sorted(new_model_ids)

    # 10. The generator interface yields the same model ids.
    set_test_seed()
    new_model_ids = trainer.generate_trained_models(
        grid_config=grid_config,
        misc_db_parameters=dict(),
        matrix_store=get_matrix_store(project_storage),
    )
    assert model_ids == sorted(new_model_ids)
def test_integration():
    """Run train -> predict -> evaluate end to end and check the stored rows.

    One training matrix and two single-entity test matrices (one per
    as-of date) are built, a four-model LogisticRegression grid is trained,
    every model is scored and evaluated against every test matrix, and the
    resulting prediction and evaluation rows are compared with the expected
    (model, date) combinations.
    """
    with rig_engines() as (db_engine, project_storage):
        train_store = get_matrix_store(
            project_storage,
            matrix_creator(),
            matrix_metadata_creator(matrix_type="train"),
        )

        as_of_dates = [datetime.date(2016, 12, 21), datetime.date(2017, 1, 21)]
        # One test matrix per as-of date, each holding the single entity 3.
        test_stores = [
            get_matrix_store(
                project_storage,
                pandas.DataFrame.from_dict(
                    {
                        "entity_id": [3],
                        "feature_one": [8],
                        "feature_two": [5],
                        "label": [0],
                    }
                ).set_index("entity_id"),
                matrix_metadata_creator(end_time=as_of_date, indices=["entity_id"]),
            )
            for as_of_date in as_of_dates
        ]

        model_storage_engine = ModelStorageEngine(project_storage)
        experiment_hash = save_experiment_and_get_hash({}, db_engine)

        # instantiate pipeline objects
        trainer = ModelTrainer(
            experiment_hash=experiment_hash,
            model_storage_engine=model_storage_engine,
            db_engine=db_engine,
        )
        predictor = Predictor(model_storage_engine, db_engine)
        model_evaluator = ModelEvaluator(
            [{"metrics": ["precision@"], "thresholds": {"top_n": [5]}}],
            [{}],
            db_engine,
        )

        # run the pipeline: 2 values of C x 2 penalties = 4 models
        grid_config = {
            "sklearn.linear_model.LogisticRegression": {
                "C": [0.00001, 0.0001],
                "penalty": ["l1", "l2"],
                "random_state": [2193],
            }
        }
        model_ids = trainer.train_models(
            grid_config=grid_config,
            misc_db_parameters=dict(),
            matrix_store=train_store,
        )

        for model_id in model_ids:
            for test_store in test_stores:
                predictions_proba = predictor.predict(
                    model_id,
                    test_store,
                    misc_db_parameters=dict(),
                    train_matrix_columns=["feature_one", "feature_two"],
                )
                model_evaluator.evaluate(predictions_proba, test_store, model_id)

        # Expected rows are ordered by date first, then by model id.
        expected_times = [
            datetime.datetime(2016, 12, 21),
            datetime.datetime(2017, 1, 21),
        ]
        expected_model_ids = (1, 2, 3, 4)

        # assert
        # 1. that the predictions table entries are present and
        # can be linked to the original models
        # NOTE(review): another test in this file queries
        # triage_metadata.models; confirm model_metadata is the schema this
        # fixture actually creates.
        prediction_rows = list(
            db_engine.execute(
                "select entity_id, model_id, as_of_date "
                "from test_results.predictions "
                "join model_metadata.models using (model_id) "
                "order by 3, 2"
            )
        )
        assert prediction_rows == [
            (3, model_id, as_of_time)
            for as_of_time in expected_times
            for model_id in expected_model_ids
        ]

        # 2. that evaluations are there
        evaluation_rows = list(
            db_engine.execute(
                " select model_id, evaluation_start_time, metric, parameter "
                "from test_results.evaluations "
                "order by 2, 1"
            )
        )
        assert evaluation_rows == [
            (model_id, start_time, "precision@", "5_abs")
            for start_time in expected_times
            for model_id in expected_model_ids
        ]
def __init__(
    self,
    config,
    db_engine,
    project_path=None,
    matrix_storage_class=CSVMatrixStore,
    replace=True,
    cleanup=False,
    cleanup_timeout=None,
    materialize_subquery_fromobjs=True,
    features_ignore_cohort=False,
    profile=False,
    save_predictions=True,
    skip_validation=False,
):
    """Set up storage, database state and run tracking for an experiment.

    Args:
        config: experiment configuration; its version is checked first and
            ``config['random_seed']`` seeds the ``random`` module.
        db_engine: database engine; its ``url`` is used for the results
            schema upgrade and the engine itself is kept on the instance.
        project_path: location handed to ``ProjectStorage``.
        matrix_storage_class: matrix store class handed to
            ``MatrixStorageEngine``.
        replace: stored as ``self.replace``.
        cleanup: stored as ``self.cleanup``; per the log messages below it
            controls removal of intermediate (labels, cohort, subset) tables.
        cleanup_timeout: overrides the existing ``self.cleanup_timeout``
            value when not None.
        materialize_subquery_fromobjs: stored on the instance.
        features_ignore_cohort: stored on the instance.
        profile: stored as ``self.profile`` ("Generate profiling stats?").
        save_predictions: stored on the instance.
        skip_validation: stored on the instance.
    """
    # This must remain the first statement: locals() is snapshotted here, so
    # any local bound earlier would leak into the recorded experiment kwargs.
    experiment_kwargs = bind_kwargs(
        self.__class__,
        **{
            name: argument
            for name, argument in locals().items()
            if name not in ("db_engine", "config", "self")
        },
    )

    self._check_config_version(config)
    self.config = config
    random.seed(config['random_seed'])

    # Storage engines all hang off a single ProjectStorage.
    project_storage = ProjectStorage(project_path)
    self.project_storage = project_storage
    self.model_storage_engine = ModelStorageEngine(self.project_storage)
    self.matrix_storage_engine = MatrixStorageEngine(
        self.project_storage,
        matrix_storage_class,
    )
    self.project_path = project_path

    self.replace = replace
    self.save_predictions = save_predictions
    self.skip_validation = skip_validation

    self.db_engine = db_engine
    results_schema.upgrade_if_clean(dburl=self.db_engine.url)

    self.features_schema_name = "features"
    self.materialize_subquery_fromobjs = materialize_subquery_fromobjs
    self.features_ignore_cohort = features_ignore_cohort

    # Register the experiment, then open a tracked run for it.
    # NOTE(review): a test in this file passes random_seed= to
    # initialize_tracking_and_get_run_id; confirm it is optional here.
    self.experiment_hash = save_experiment_and_get_hash(
        self.config,
        self.db_engine,
    )
    self.run_id = initialize_tracking_and_get_run_id(
        self.experiment_hash,
        experiment_class_path=classpath(self.__class__),
        experiment_kwargs=experiment_kwargs,
        db_engine=self.db_engine,
    )
    self.initialize_components()

    self.cleanup = cleanup
    if self.cleanup:
        cleanup_message = (
            "cleanup is set to True, so intermediate tables (labels and cohort) "
            "will be removed after matrix creation and subset tables will be "
            "removed after model training and testing"
        )
    else:
        cleanup_message = (
            "cleanup is set to False, so intermediate tables (labels, cohort, and subsets) "
            "will not be removed"
        )
    logging.info(cleanup_message)

    # Fall back to the value already reachable as self.cleanup_timeout, but
    # always (re)assign so the instance attribute is set either way.
    if cleanup_timeout is None:
        cleanup_timeout = self.cleanup_timeout
    self.cleanup_timeout = cleanup_timeout

    self.profile = profile
    logging.info("Generate profiling stats? (profile option): %s", self.profile)