def initialize_tracking_and_get_run_id(
    experiment_hash,
    experiment_class_path,
    random_seed,
    experiment_kwargs,
    db_engine,
):
    """Create a row in the TriageRun table with some initial info and return the created run_id

    Args:
        experiment_hash (str) An experiment hash that exists in the experiments table
        experiment_class_path (str) The name of the experiment subclass used
        random_seed (int) Random seed used to run the experiment
        experiment_kwargs (dict) Any runtime Experiment keyword arguments that should be saved
        db_engine (sqlalchemy.engine)
    """
    # Any experiment kwargs that are types (e.g. MatrixStorageClass) can't
    # be serialized, so just use the class name if so
    cleaned_experiment_kwargs = {
        k: (classpath(v) if isinstance(v, type) else v)
        for k, v in experiment_kwargs.items()
    }
    run = TriageRun(
        start_time=datetime.datetime.now(),
        git_hash=infer_git_hash(),
        triage_version=infer_triage_version(),
        python_version=infer_python_version(),
        run_type="experiment",
        run_hash=experiment_hash,
        last_updated_time=datetime.datetime.now(),
        current_status=TriageRunStatus.started,
        installed_libraries=infer_installed_libraries(),
        platform=platform.platform(),
        os_user=getpass.getuser(),
        working_directory=os.getcwd(),
        ec2_instance_type=infer_ec2_instance_type(),
        log_location=infer_log_location(),
        experiment_class_path=experiment_class_path,
        random_seed=random_seed,
        experiment_kwargs=cleaned_experiment_kwargs,
    )
    run_id = None
    with scoped_session(db_engine) as session:
        session.add(run)
        session.commit()
        run_id = run.run_id
    if not run_id:
        raise ValueError("Failed to retrieve run_id from saved row")
    return run_id
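
# Usage sketch (illustrative, not part of the original module). Assumes a
# database that already carries the triage results schema and an experiment
# row previously saved via save_experiment_and_get_hash; the connection URL
# and the hash/classpath values below are placeholders.
def _example_initialize_tracking():  # hypothetical helper for illustration
    from sqlalchemy import create_engine

    engine = create_engine("postgresql://user:pass@localhost:5432/triage")  # placeholder URL
    run_id = initialize_tracking_and_get_run_id(
        experiment_hash="abc123",  # must already exist in the experiments table
        experiment_class_path="triage.experiments.singlethreaded.SingleThreadedExperiment",
        random_seed=42,
        experiment_kwargs={"replace": True, "save_predictions": True},
        db_engine=engine,
    )
    return run_id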
def __init__(
    self,
    config,
    db_engine,
    project_path=None,
    matrix_storage_class=CSVMatrixStore,
    replace=True,
    cleanup=False,
    cleanup_timeout=None,
    materialize_subquery_fromobjs=True,
    features_ignore_cohort=False,
    profile=False,
    save_predictions=True,
    skip_validation=False,
    partial_run=False,
):
    # For a partial run, skip validation and avoid cleaning up;
    # we'll also skip filling default config values below
    if partial_run:
        cleanup = False
        skip_validation = True
    experiment_kwargs = bind_kwargs(
        self.__class__,
        **{
            key: value
            for (key, value) in locals().items()
            if key not in {"db_engine", "config", "self"}
        },
    )

    self._check_config_version(config)
    self.config = config
    # randint requires integer bounds, so cast the upper bound explicitly
    self.config["random_seed"] = self.config.get(
        "random_seed", random.randint(1, int(1e7))
    )
    random.seed(self.config["random_seed"])

    self.project_storage = ProjectStorage(project_path)
    self.model_storage_engine = ModelStorageEngine(self.project_storage)
    self.matrix_storage_engine = MatrixStorageEngine(
        self.project_storage, matrix_storage_class
    )
    self.project_path = project_path
    self.replace = replace
    self.save_predictions = save_predictions
    self.skip_validation = skip_validation
    self.db_engine = db_engine
    results_schema.upgrade_if_clean(dburl=self.db_engine.url)

    self.features_schema_name = "features"
    self.materialize_subquery_fromobjs = materialize_subquery_fromobjs
    self.features_ignore_cohort = features_ignore_cohort

    # only fill default values for full runs
    if not partial_run:
        ## Defaults to sane values
        self.config["temporal_config"] = fill_timechop_config_missing(
            self.config, self.db_engine
        )
        ## Defaults to all the entities found in the feature_aggregation's from_obj
        self.config["cohort_config"] = fill_cohort_config_missing(self.config)
        ## Defaults to all the feature_aggregation's prefixes
        self.config["feature_group_definition"] = fill_feature_group_definition(
            self.config
        )

    grid_config = fill_model_grid_presets(self.config)
    self.config.pop("model_grid_preset", None)
    if grid_config is not None:
        self.config["grid_config"] = grid_config

    ###################### RUBICON ######################

    self.experiment_hash = save_experiment_and_get_hash(self.config, self.db_engine)
    self.run_id = initialize_tracking_and_get_run_id(
        self.experiment_hash,
        experiment_class_path=classpath(self.__class__),
        # required by initialize_tracking_and_get_run_id's signature;
        # pass the seed recorded in the config above
        random_seed=self.config["random_seed"],
        experiment_kwargs=experiment_kwargs,
        db_engine=self.db_engine,
    )
    self.initialize_components()

    self.cleanup = cleanup
    if self.cleanup:
        logging.info(
            "cleanup is set to True, so intermediate tables (labels and cohort) "
            "will be removed after matrix creation and subset tables will be "
            "removed after model training and testing"
        )
    else:
        logging.info(
            "cleanup is set to False, so intermediate tables (labels, cohort, and subsets) "
            "will not be removed"
        )
    # falls back to the class-level cleanup_timeout default when not overridden
    self.cleanup_timeout = (
        self.cleanup_timeout if cleanup_timeout is None else cleanup_timeout
    )
    self.profile = profile
    logging.info("Generate profiling stats? (profile option): %s", self.profile)
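
# Aside (illustrative, not from the original source): the bind_kwargs call
# above snapshots every constructor keyword by filtering locals(). The idiom
# works because the outermost iterable of a comprehension is evaluated in the
# enclosing function scope, before the comprehension introduces its own names.
# A minimal standalone sketch with made-up parameter names:
def _example_capture_kwargs(alpha=1, beta="x", gamma=None):
    # locals() here contains exactly the bound parameters, since no other
    # local variable has been assigned yet
    captured = {key: value for key, value in locals().items()}
    return captured  # -> {'alpha': 1, 'beta': 'x', 'gamma': None}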
def __init__(
    self,
    config,
    db_engine,
    project_path=None,
    matrix_storage_class=CSVMatrixStore,
    replace=True,
    cleanup=False,
    cleanup_timeout=None,
    materialize_subquery_fromobjs=True,
    features_ignore_cohort=False,
    additional_bigtrain_classnames=None,
    profile=False,
    save_predictions=True,
    skip_validation=False,
    partial_run=False,
):
    # For a partial run, skip validation and avoid cleaning up;
    # we'll also skip filling default config values below
    if partial_run:
        cleanup = False
        skip_validation = True
    experiment_kwargs = bind_kwargs(
        self.__class__,
        **{
            key: value
            for (key, value) in locals().items()
            if key not in {"db_engine", "config", "self"}
        },
    )

    self._check_config_version(config)
    self.config = config

    if self.config.get("cohort_config") is not None:
        self.config["cohort_config"] = load_query_if_needed(
            self.config["cohort_config"]
        )
    if self.config.get("label_config") is not None:
        self.config["label_config"] = load_query_if_needed(
            self.config["label_config"]
        )

    self.project_storage = ProjectStorage(project_path)
    self.model_storage_engine = ModelStorageEngine(self.project_storage)
    self.matrix_storage_engine = MatrixStorageEngine(
        self.project_storage, matrix_storage_class
    )
    self.project_path = project_path
    logger.verbose(
        f"Matrices and trained models will be saved in {self.project_path}"
    )
    self.replace = replace
    if self.replace:
        logger.notice(
            "Replace flag is set to true. Matrices, models, "
            "evaluations and predictions (if they exist) will be replaced"
        )

    self.save_predictions = save_predictions
    if not self.save_predictions:
        logger.notice(
            "Save predictions flag is set to false. "
            "Individual predictions won't be stored in the predictions "
            "table. This will decrease both the running time "
            "of an experiment and the space needed in the db"
        )

    self.skip_validation = skip_validation
    if self.skip_validation:
        logger.notice(
            "Warning: Skip validation flag is set to true. "
            "The specified experiment config file won't be validated. "
            "This will reduce (a little) the running time of the experiment, "
            "but has some potential risks, e.g. the experiment could fail "
            "after some time due to a misconfiguration. Proceed with care."
        )

    self.db_engine = db_engine
    results_schema.upgrade_if_clean(dburl=self.db_engine.url)

    self.features_schema_name = "features"

    self.materialize_subquery_fromobjs = materialize_subquery_fromobjs
    if not self.materialize_subquery_fromobjs:
        logger.notice(
            "Materialize from_objs is set to false. "
            "The from_objs will be calculated on the fly every time."
        )

    self.features_ignore_cohort = features_ignore_cohort
    if self.features_ignore_cohort:
        logger.notice(
            "Features will be calculated for all the entities "
            "(i.e. ignoring cohort). This setting will have the effect "
            "that more db space is used, but it could potentially save "
            "time if you are running several similar experiments with "
            "different cohorts."
        )

    self.additional_bigtrain_classnames = additional_bigtrain_classnames

    # only fill default values for full runs
    if not partial_run:
        ## Defaults to sane values
        self.config["temporal_config"] = fill_timechop_config_missing(
            self.config, self.db_engine
        )
        ## Defaults to all the entities found in the feature_aggregation's from_obj
        self.config["cohort_config"] = fill_cohort_config_missing(self.config)
        ## Defaults to all the feature_aggregation's prefixes
        self.config["feature_group_definition"] = fill_feature_group_definition(
            self.config
        )

    grid_config = fill_model_grid_presets(self.config)
    self.config.pop("model_grid_preset", None)
    if grid_config is not None:
        self.config["grid_config"] = grid_config

    if not self.config.get("random_seed", None):
        logger.notice(
            "Random seed not specified. A random seed will be provided. "
            "This could have interesting side effects, "
            "e.g. new models per model group are trained, "
            "tested and evaluated every time that you run this experiment configuration"
        )
    # randint requires integer bounds, so cast the upper bound explicitly
    self.random_seed = self.config.pop("random_seed", random.randint(1, int(1e7)))
    logger.verbose(
        f"Using random seed [{self.random_seed}] for running the experiment"
    )
    random.seed(self.random_seed)

    ###################### RUBICON ######################

    self.experiment_hash = save_experiment_and_get_hash(self.config, self.db_engine)
    logger.debug(f"Experiment hash [{self.experiment_hash}] assigned")
    self.run_id = initialize_tracking_and_get_run_id(
        self.experiment_hash,
        experiment_class_path=classpath(self.__class__),
        random_seed=self.random_seed,
        experiment_kwargs=experiment_kwargs,
        db_engine=self.db_engine,
    )
    logger.debug(f"Experiment run id [{self.run_id}] assigned")

    self.initialize_components()

    self.cleanup = cleanup
    if self.cleanup:
        logger.notice(
            "Cleanup is set to true, so intermediate tables (labels and cohort) "
            "will be removed after matrix creation and subset tables will be "
            "removed after model training and testing"
        )
    # falls back to the class-level cleanup_timeout default when not overridden
    self.cleanup_timeout = (
        self.cleanup_timeout if cleanup_timeout is None else cleanup_timeout
    )
    self.profile = profile
    if self.profile:
        logger.spam("Profiling will be stored using cProfile")
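
# Construction sketch (illustrative): how the keyword arguments above are
# typically supplied. SingleThreadedExperiment is one concrete Experiment
# subclass in triage; the config dict and paths shown are stand-ins, not a
# complete experiment configuration.
def _example_build_experiment(config, db_engine):  # hypothetical helper
    from triage.experiments import SingleThreadedExperiment

    return SingleThreadedExperiment(
        config=config,                      # experiment config; random_seed optional here
        db_engine=db_engine,
        project_path="/tmp/triage-output",  # placeholder; matrices and models land here
        replace=False,                      # reuse existing matrices/models if present
        save_predictions=True,
        skip_validation=False,
    )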
def __init__(
    self,
    config,
    db_engine,
    project_path=None,
    matrix_storage_class=CSVMatrixStore,
    replace=True,
    cleanup=False,
    cleanup_timeout=None,
    materialize_subquery_fromobjs=True,
    features_ignore_cohort=False,
    profile=False,
    save_predictions=True,
    skip_validation=False,
):
    experiment_kwargs = bind_kwargs(
        self.__class__,
        **{
            key: value
            for (key, value) in locals().items()
            if key not in {"db_engine", "config", "self"}
        },
    )

    self._check_config_version(config)
    self.config = config
    # this variant requires random_seed to be present in the config;
    # a missing key raises KeyError here
    random.seed(config["random_seed"])

    self.project_storage = ProjectStorage(project_path)
    self.model_storage_engine = ModelStorageEngine(self.project_storage)
    self.matrix_storage_engine = MatrixStorageEngine(
        self.project_storage, matrix_storage_class
    )
    self.project_path = project_path
    self.replace = replace
    self.save_predictions = save_predictions
    self.skip_validation = skip_validation
    self.db_engine = db_engine
    results_schema.upgrade_if_clean(dburl=self.db_engine.url)

    self.features_schema_name = "features"
    self.materialize_subquery_fromobjs = materialize_subquery_fromobjs
    self.features_ignore_cohort = features_ignore_cohort

    self.experiment_hash = save_experiment_and_get_hash(self.config, self.db_engine)
    self.run_id = initialize_tracking_and_get_run_id(
        self.experiment_hash,
        experiment_class_path=classpath(self.__class__),
        # required by initialize_tracking_and_get_run_id's signature
        random_seed=self.config["random_seed"],
        experiment_kwargs=experiment_kwargs,
        db_engine=self.db_engine,
    )
    self.initialize_components()

    self.cleanup = cleanup
    if self.cleanup:
        logging.info(
            "cleanup is set to True, so intermediate tables (labels and cohort) "
            "will be removed after matrix creation and subset tables will be "
            "removed after model training and testing"
        )
    else:
        logging.info(
            "cleanup is set to False, so intermediate tables (labels, cohort, and subsets) "
            "will not be removed"
        )
    # falls back to the class-level cleanup_timeout default when not overridden
    self.cleanup_timeout = (
        self.cleanup_timeout if cleanup_timeout is None else cleanup_timeout
    )
    self.profile = profile
    logging.info("Generate profiling stats? (profile option): %s", self.profile)
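
# This older variant reads config['random_seed'] directly, so a config that
# omits the key fails before any work is done. A small defensive sketch of the
# fallback pattern the later versions adopt (illustrative only; relies on the
# module-level `random` import already used above):
def _example_seed_from_config(config):
    """Return the config's random seed, generating one when absent."""
    seed = config.get("random_seed")
    if seed is None:
        seed = random.randint(1, int(1e7))  # int bounds keep randint happy
    random.seed(seed)
    return seed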
def retrain(self, prediction_date):
    """Retrain a model by going back one split from prediction_date, so the
    as_of_date for training would be (prediction_date - training_label_timespan)

    Args:
        prediction_date (str)
    """
    # Retrain config and hash
    retrain_config = {
        "model_group_id": self.model_group_id,
        "prediction_date": prediction_date,
        "test_label_timespan": self.test_label_timespan,
        "test_duration": self.test_duration,
    }
    self.retrain_hash = save_retrain_and_get_hash(retrain_config, self.db_engine)

    with get_for_update(self.db_engine, Retrain, self.retrain_hash) as retrain:
        retrain.prediction_date = prediction_date

    # Timechop
    prediction_date = dt_from_str(prediction_date)
    temporal_config = self.get_temporal_config_for_retrain(prediction_date)
    timechopper = Timechop(**temporal_config)
    chops = timechopper.chop_time()
    assert len(chops) == 1
    chops_train_matrix = chops[0]["train_matrix"]
    as_of_date = datetime.strftime(chops_train_matrix["last_as_of_time"], "%Y-%m-%d")
    retrain_definition = {
        "first_as_of_time": chops_train_matrix["first_as_of_time"],
        "last_as_of_time": chops_train_matrix["last_as_of_time"],
        "matrix_info_end_time": chops_train_matrix["matrix_info_end_time"],
        "as_of_times": [as_of_date],
        "training_label_timespan": chops_train_matrix["training_label_timespan"],
        "max_training_history": chops_train_matrix["max_training_history"],
        "training_as_of_date_frequency": chops_train_matrix["training_as_of_date_frequency"],
    }

    # Record a TriageRun for this retrain
    run = TriageRun(
        start_time=datetime.now(),
        git_hash=infer_git_hash(),
        triage_version=infer_triage_version(),
        python_version=infer_python_version(),
        run_type="retrain",
        run_hash=self.retrain_hash,
        last_updated_time=datetime.now(),
        current_status=TriageRunStatus.started,
        installed_libraries=infer_installed_libraries(),
        platform=platform.platform(),
        os_user=getpass.getuser(),
        working_directory=os.getcwd(),
        ec2_instance_type=infer_ec2_instance_type(),
        log_location=infer_log_location(),
        experiment_class_path=classpath(self.__class__),
        random_seed=retrieve_experiment_seed_from_run_id(
            self.db_engine, self.triage_run_id
        ),
    )
    run_id = None
    with scoped_session(self.db_engine) as session:
        session.add(run)
        session.commit()
        run_id = run.run_id
    if not run_id:
        raise ValueError("Failed to retrieve run_id from saved row")

    # set ModelTrainer's run_id and experiment_hash for the retrain run
    self.model_trainer.run_id = run_id
    self.model_trainer.experiment_hash = self.retrain_hash

    # 1. Generate all labels
    self.generate_all_labels(as_of_date)
    record_labels_table_name(run_id, self.db_engine, self.labels_table_name)

    # 2. Generate cohort
    cohort_table_name = (
        f"triage_production.cohort_{self.experiment_config['cohort_config']['name']}_retrain"
    )
    self.generate_entity_date_table(as_of_date, cohort_table_name)
    record_cohort_table_name(run_id, self.db_engine, cohort_table_name)

    # 3. Generate feature aggregations
    collate_aggregations = self.get_collate_aggregations(as_of_date, cohort_table_name)
    feature_aggregation_table_tasks = self.feature_generator.generate_all_table_tasks(
        collate_aggregations, task_type="aggregation"
    )
    self.feature_generator.process_table_tasks(feature_aggregation_table_tasks)

    # 4. Reconstruct feature dictionary from feature_names and generate imputation
    reconstructed_feature_dict, imputation_table_tasks = (
        self.get_feature_dict_and_imputation_task(
            collate_aggregations,
            self.model_group_info["model_id_last_split"],
        )
    )
    feature_group_creator = FeatureGroupCreator(
        self.experiment_config["feature_group_definition"]
    )
    feature_group_mixer = FeatureGroupMixer(["all"])
    feature_group_dict = feature_group_mixer.generate(
        feature_group_creator.subsets(reconstructed_feature_dict)
    )[0]
    self.feature_generator.process_table_tasks(imputation_table_tasks)

    # 5. Build new matrix
    db_config = {
        "features_schema_name": "triage_production",
        "labels_schema_name": "public",
        "cohort_table_name": cohort_table_name,
        "labels_table_name": self.labels_table_name,
    }
    record_matrix_building_started(run_id, self.db_engine)
    matrix_builder = MatrixBuilder(
        db_config=db_config,
        matrix_storage_engine=self.matrix_storage_engine,
        engine=self.db_engine,
        experiment_hash=None,
        replace=True,
    )
    new_matrix_metadata = Planner.make_metadata(
        matrix_definition=retrain_definition,
        feature_dictionary=feature_group_dict,
        label_name=self.label_name,
        label_type="binary",
        cohort_name=self.cohort_name,
        matrix_type="train",
        feature_start_time=dt_from_str(self.feature_start_time),
        user_metadata=self.user_metadata,
    )
    new_matrix_metadata["matrix_id"] = "_".join(
        [
            self.label_name,
            "binary",
            str(as_of_date),
            "retrain",
        ]
    )
    matrix_uuid = filename_friendly_hash(new_matrix_metadata)
    matrix_builder.build_matrix(
        as_of_times=[as_of_date],
        label_name=self.label_name,
        label_type="binary",
        feature_dictionary=feature_group_dict,
        matrix_metadata=new_matrix_metadata,
        matrix_uuid=matrix_uuid,
        matrix_type="train",
    )

    retrain_model_comment = "retrain_" + str(datetime.now())
    misc_db_parameters = {
        "train_end_time": dt_from_str(as_of_date),
        "test": False,
        "train_matrix_uuid": matrix_uuid,
        "training_label_timespan": self.training_label_timespan,
        "model_comment": retrain_model_comment,
    }

    # get the random seed from the last split
    last_split_train_matrix_uuid, last_split_matrix_metadata = (
        train_matrix_info_from_model_id(
            self.db_engine, model_id=self.model_group_info["model_id_last_split"]
        )
    )
    random_seed = self.model_trainer.get_or_generate_random_seed(
        model_group_id=self.model_group_id,
        matrix_metadata=last_split_matrix_metadata,
        train_matrix_uuid=last_split_train_matrix_uuid,
    )

    # create retrain model hash
    retrain_model_hash = self.model_trainer._model_hash(
        self.matrix_storage_engine.get_store(matrix_uuid).metadata,
        class_path=self.model_group_info["model_type"],
        parameters=self.model_group_info["hyperparameters"],
        random_seed=random_seed,
    )

    associate_models_with_retrain(
        self.retrain_hash, (retrain_model_hash,), self.db_engine
    )

    record_model_building_started(run_id, self.db_engine)
    retrain_model_id = self.model_trainer.process_train_task(
        matrix_store=self.matrix_storage_engine.get_store(matrix_uuid),
        class_path=self.model_group_info["model_type"],
        parameters=self.model_group_info["hyperparameters"],
        model_hash=retrain_model_hash,
        misc_db_parameters=misc_db_parameters,
        random_seed=random_seed,
        retrain=True,
        model_group_id=self.model_group_id,
    )

    self.retrain_model_hash = retrieve_model_hash_from_id(self.db_engine, retrain_model_id)
    self.retrain_matrix_uuid = matrix_uuid
    self.retrain_model_id = retrain_model_id
    return {
        "retrain_model_comment": retrain_model_comment,
        "retrain_model_id": retrain_model_id,
    }
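
# Usage sketch (illustrative): retrain() is driven by a single prediction-date
# string; everything else comes from attributes configured elsewhere on the
# object (model_group_id, db_engine, model_trainer, ...). The construction of
# that surrounding object is not shown because its signature isn't part of
# this excerpt.
def _example_retrain(retrainer):  # `retrainer` is an already-configured instance
    results = retrainer.retrain(prediction_date="2021-06-01")
    print(results["retrain_model_id"], results["retrain_model_comment"])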