def initialize_components(self):
    split_config = self.config["temporal_config"]
    self.chopper = Timechop(**split_config)

    cohort_config = self.config.get("cohort_config", {})
    if "query" in cohort_config:
        self.cohort_table_generator = CohortTableGenerator(
            cohort_table_name=self.cohort_table_name,
            db_engine=self.db_engine,
            query=cohort_config["query"],
            replace=self.replace,
        )
    else:
        logging.warning(
            "cohort_config missing or unrecognized. Without a cohort, "
            "you will not be able to make matrices or perform feature imputation."
        )
        self.cohort_table_generator = CohortTableGeneratorNoOp()

    if "label_config" in self.config:
        self.label_generator = LabelGenerator(
            label_name=self.config["label_config"].get("name", None),
            query=self.config["label_config"]["query"],
            replace=self.replace,
            db_engine=self.db_engine,
        )
    else:
        self.label_generator = LabelGeneratorNoOp()
        logging.warning(
            "label_config missing or unrecognized. Without labels, "
            "you will not be able to make matrices."
        )

    self.feature_dictionary_creator = FeatureDictionaryCreator(
        features_schema_name=self.features_schema_name, db_engine=self.db_engine
    )

    self.feature_generator = FeatureGenerator(
        features_schema_name=self.features_schema_name,
        replace=self.replace,
        db_engine=self.db_engine,
        feature_start_time=split_config["feature_start_time"],
        materialize_subquery_fromobjs=self.materialize_subquery_fromobjs,
    )

    self.feature_group_creator = FeatureGroupCreator(
        self.config.get("feature_group_definition", {"all": [True]})
    )

    self.feature_group_mixer = FeatureGroupMixer(
        self.config.get("feature_group_strategies", ["all"])
    )

    self.planner = Planner(
        feature_start_time=dt_from_str(split_config["feature_start_time"]),
        label_names=[
            self.config.get("label_config", {}).get("name", DEFAULT_LABEL_NAME)
        ],
        label_types=["binary"],
        cohort_names=[self.config.get("cohort_config", {}).get("name", None)],
        user_metadata=self.config.get("user_metadata", {}),
    )

    self.matrix_builder = MatrixBuilder(
        db_config={
            "features_schema_name": self.features_schema_name,
            "labels_schema_name": "public",
            "labels_table_name": self.labels_table_name,
            "cohort_table_name": self.cohort_table_name,
        },
        matrix_storage_engine=self.matrix_storage_engine,
        experiment_hash=self.experiment_hash,
        include_missing_labels_in_train_as=self.config.get("label_config", {}).get(
            "include_missing_labels_in_train_as", None
        ),
        engine=self.db_engine,
        replace=self.replace,
    )

    self.trainer = ModelTrainer(
        experiment_hash=self.experiment_hash,
        model_storage_engine=self.model_storage_engine,
        model_grouper=ModelGrouper(self.config.get("model_group_keys", [])),
        db_engine=self.db_engine,
        replace=self.replace,
    )

    self.tester = ModelTester(
        model_storage_engine=self.model_storage_engine,
        matrix_storage_engine=self.matrix_storage_engine,
        replace=self.replace,
        db_engine=self.db_engine,
        individual_importance_config=self.config.get("individual_importance", {}),
        evaluator_config=self.config.get("scoring", {}),
    )
def initialize_components(self):
    split_config = self.config["temporal_config"]
    self.chopper = Timechop(**split_config)

    cohort_config = self.config.get("cohort_config", {})
    if "query" in cohort_config:
        self.cohort_table_name = "cohort_{}_{}".format(
            cohort_config.get("name", "default"),
            filename_friendly_hash(cohort_config["query"]),
        )
        self.cohort_table_generator = EntityDateTableGenerator(
            entity_date_table_name=self.cohort_table_name,
            db_engine=self.db_engine,
            query=cohort_config["query"],
            replace=self.replace,
        )
    else:
        logging.warning(
            "cohort_config missing or unrecognized. Without a cohort, "
            "you will not be able to make matrices, perform feature imputation, "
            "or save time by only computing features for that cohort."
        )
        self.features_ignore_cohort = True
        self.cohort_table_name = "cohort_{}".format(self.experiment_hash)
        self.cohort_table_generator = EntityDateTableGeneratorNoOp()

    self.subsets = [None] + self.config.get("scoring", {}).get("subsets", [])

    if "label_config" in self.config:
        label_config = self.config["label_config"]
        self.labels_table_name = "labels_{}_{}".format(
            label_config.get("name", "default"),
            filename_friendly_hash(label_config["query"]),
        )
        self.label_generator = LabelGenerator(
            label_name=label_config.get("name", None),
            query=label_config["query"],
            replace=self.replace,
            db_engine=self.db_engine,
        )
    else:
        self.labels_table_name = "labels_{}".format(self.experiment_hash)
        self.label_generator = LabelGeneratorNoOp()
        logging.warning(
            "label_config missing or unrecognized. Without labels, "
            "you will not be able to make matrices."
        )

    self.feature_dictionary_creator = FeatureDictionaryCreator(
        features_schema_name=self.features_schema_name, db_engine=self.db_engine
    )

    self.feature_generator = FeatureGenerator(
        features_schema_name=self.features_schema_name,
        replace=self.replace,
        db_engine=self.db_engine,
        feature_start_time=split_config["feature_start_time"],
        materialize_subquery_fromobjs=self.materialize_subquery_fromobjs,
        features_ignore_cohort=self.features_ignore_cohort,
    )

    self.feature_group_creator = FeatureGroupCreator(
        self.config.get("feature_group_definition", {"all": [True]})
    )

    self.feature_group_mixer = FeatureGroupMixer(
        self.config.get("feature_group_strategies", ["all"])
    )

    self.planner = Planner(
        feature_start_time=dt_from_str(split_config["feature_start_time"]),
        label_names=[
            self.config.get("label_config", {}).get("name", DEFAULT_LABEL_NAME)
        ],
        label_types=["binary"],
        cohort_names=[self.config.get("cohort_config", {}).get("name", None)],
        user_metadata=self.config.get("user_metadata", {}),
    )

    self.matrix_builder = MatrixBuilder(
        db_config={
            "features_schema_name": self.features_schema_name,
            "labels_schema_name": "public",
            "labels_table_name": self.labels_table_name,
            "cohort_table_name": self.cohort_table_name,
        },
        matrix_storage_engine=self.matrix_storage_engine,
        experiment_hash=self.experiment_hash,
        include_missing_labels_in_train_as=self.config.get("label_config", {}).get(
            "include_missing_labels_in_train_as", None
        ),
        engine=self.db_engine,
        replace=self.replace,
    )

    self.subsetter = Subsetter(
        db_engine=self.db_engine,
        replace=self.replace,
        as_of_times=self.all_as_of_times,
    )

    self.trainer = ModelTrainer(
        experiment_hash=self.experiment_hash,
        model_storage_engine=self.model_storage_engine,
        model_grouper=ModelGrouper(self.config.get("model_group_keys", [])),
        db_engine=self.db_engine,
        replace=self.replace,
    )

    self.predictor = Predictor(
        db_engine=self.db_engine,
        model_storage_engine=self.model_storage_engine,
        save_predictions=self.save_predictions,
        replace=self.replace,
    )

    self.individual_importance_calculator = IndividualImportanceCalculator(
        db_engine=self.db_engine,
        n_ranks=self.config.get("individual_importance", {}).get("n_ranks", 5),
        methods=self.config.get("individual_importance", {}).get(
            "methods", ["uniform"]
        ),
        replace=self.replace,
    )

    self.evaluator = ModelEvaluator(
        db_engine=self.db_engine,
        sort_seed=self.config.get("scoring", {}).get("sort_seed", None),
        testing_metric_groups=self.config.get("scoring", {}).get(
            "testing_metric_groups", []
        ),
        training_metric_groups=self.config.get("scoring", {}).get(
            "training_metric_groups", []
        ),
    )

    self.model_train_tester = ModelTrainTester(
        matrix_storage_engine=self.matrix_storage_engine,
        model_evaluator=self.evaluator,
        model_trainer=self.trainer,
        individual_importance_calculator=self.individual_importance_calculator,
        predictor=self.predictor,
        subsets=self.subsets,
    )
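# A minimal sketch, with assumed placeholder names, of one scoring.subsets
# entry as consumed by the initialize_components above. Note that the method
# prepends None to the list, which stands for evaluation over the full cohort
# in addition to any configured subsets. The name, table, and interval below
# are illustrative, not taken from the source.
example_scoring_config = {
    "subsets": [
        {
            "name": "recent_events",  # placeholder subset name
            "query": """
                select distinct entity_id
                from events
                where event_date > '{as_of_date}'::date - interval '1 year'
            """,  # subset queries are parameterized by as_of_date
        }
    ]
}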
def initialize_components(self):
    split_config = self.config["temporal_config"]
    self.chopper = Timechop(**split_config)

    if "label_config" in self.config:
        label_config = self.config["label_config"]
        self.labels_table_name = "labels_{}_{}".format(
            label_config.get("name", "default"),
            filename_friendly_hash(label_config["query"]),
        )
        self.label_generator = LabelGenerator(
            label_name=label_config.get("name", None),
            query=label_config["query"],
            replace=self.replace,
            db_engine=self.db_engine,
        )
    else:
        self.labels_table_name = "labels_{}".format(self.experiment_hash)
        self.label_generator = LabelGeneratorNoOp()
        logger.warning(
            "label_config missing or unrecognized. Without labels, "
            "you will not be able to make matrices."
        )
    record_labels_table_name(self.run_id, self.db_engine, self.labels_table_name)

    cohort_config = self.config.get("cohort_config", {})
    self.cohort_table_generator = None
    if "query" in cohort_config:
        self.cohort_hash = filename_friendly_hash(
            self.config["cohort_config"]["query"]
        )
    elif "query" in self.config.get("label_config", {}):
        logger.info(
            "cohort_config missing or unrecognized, but labels are configured. "
            "Labels will be used as the cohort."
        )
        self.cohort_hash = filename_friendly_hash(
            self.config["label_config"]["query"]
        )
    else:
        self.features_ignore_cohort = True
        self.cohort_hash = None
        self.cohort_table_name = "cohort_{}".format(self.experiment_hash)
        self.cohort_table_generator = CohortTableGeneratorNoOp()

    if not self.cohort_table_generator:
        self.cohort_table_name = "cohort_{}_{}".format(
            cohort_config.get("name", "default"), self.cohort_hash
        )
        self.cohort_table_generator = EntityDateTableGenerator(
            entity_date_table_name=self.cohort_table_name,
            db_engine=self.db_engine,
            query=cohort_config.get("query", None),
            labels_table_name=self.labels_table_name,
            replace=self.replace,
        )
    record_cohort_table_name(self.run_id, self.db_engine, self.cohort_table_name)

    if "bias_audit_config" in self.config:
        bias_config = self.config["bias_audit_config"]
        self.bias_hash = filename_friendly_hash(bias_config)
        self.protected_groups_table_name = f"protected_groups_{self.bias_hash}"
        self.protected_groups_generator = ProtectedGroupsGenerator(
            db_engine=self.db_engine,
            from_obj=parse_from_obj(bias_config, "bias_from_obj"),
            attribute_columns=bias_config.get("attribute_columns", None),
            entity_id_column=bias_config.get("entity_id_column", None),
            knowledge_date_column=bias_config.get("knowledge_date_column", None),
            protected_groups_table_name=self.protected_groups_table_name,
            replace=self.replace,
        )
        record_bias_hash(self.run_id, self.db_engine, self.bias_hash)
    else:
        self.protected_groups_generator = ProtectedGroupsGeneratorNoOp()
        logger.notice(
            "bias_audit_config missing in the configuration file or unrecognized. "
            "Without protected groups, you will not be able to audit your models "
            "for bias and fairness."
        )

    self.feature_dictionary_creator = FeatureDictionaryCreator(
        features_schema_name=self.features_schema_name, db_engine=self.db_engine
    )

    self.feature_generator = FeatureGenerator(
        features_schema_name=self.features_schema_name,
        replace=self.replace,
        db_engine=self.db_engine,
        feature_start_time=split_config["feature_start_time"],
        materialize_subquery_fromobjs=self.materialize_subquery_fromobjs,
        features_ignore_cohort=self.features_ignore_cohort,
    )

    self.feature_group_creator = FeatureGroupCreator(
        self.config.get("feature_group_definition", {"all": [True]})
    )

    self.feature_group_mixer = FeatureGroupMixer(
        self.config.get("feature_group_strategies", ["all"])
    )

    self.planner = Planner(
        feature_start_time=dt_from_str(split_config["feature_start_time"]),
        label_names=[
            self.config.get("label_config", {}).get("name", DEFAULT_LABEL_NAME)
        ],
        label_types=["binary"],
        cohort_names=[self.config.get("cohort_config", {}).get("name", None)],
        user_metadata=self.config.get("user_metadata", {}),
    )

    self.matrix_builder = MatrixBuilder(
        db_config={
            "features_schema_name": self.features_schema_name,
            "labels_schema_name": "public",
            "labels_table_name": self.labels_table_name,
            "cohort_table_name": self.cohort_table_name,
        },
        matrix_storage_engine=self.matrix_storage_engine,
        experiment_hash=self.experiment_hash,
        include_missing_labels_in_train_as=self.config.get("label_config", {}).get(
            "include_missing_labels_in_train_as", None
        ),
        engine=self.db_engine,
        replace=self.replace,
        run_id=self.run_id,
    )

    self.subsets = self.config.get("scoring", {}).get("subsets", [])
    if self.subsets:
        self.subsetter = Subsetter(
            db_engine=self.db_engine,
            replace=self.replace,
            as_of_times=self.all_as_of_times,
        )
    else:
        self.subsetter = SubsetterNoOp()
        logger.notice(
            "scoring.subsets missing in the configuration file or unrecognized. "
            "No subsets will be generated"
        )

    self.trainer = ModelTrainer(
        experiment_hash=self.experiment_hash,
        model_storage_engine=self.model_storage_engine,
        model_grouper=ModelGrouper(self.config.get("model_group_keys", [])),
        db_engine=self.db_engine,
        replace=self.replace,
        run_id=self.run_id,
    )

    self.predictor = Predictor(
        db_engine=self.db_engine,
        model_storage_engine=self.model_storage_engine,
        save_predictions=self.save_predictions,
        replace=self.replace,
        rank_order=self.config.get("prediction", {}).get("rank_tiebreaker", "worst"),
    )

    if "individual_importance" in self.config:
        self.individual_importance_calculator = IndividualImportanceCalculator(
            db_engine=self.db_engine,
            n_ranks=self.config.get("individual_importance", {}).get("n_ranks", 5),
            methods=self.config.get("individual_importance", {}).get(
                "methods", ["uniform"]
            ),
            replace=self.replace,
        )
    else:
        self.individual_importance_calculator = IndividualImportanceCalculatorNoOp()
        logger.notice(
            "individual_importance missing in the configuration file or unrecognized, "
            "you will not be able to do analysis on individual feature importances."
        )

    self.evaluator = ModelEvaluator(
        db_engine=self.db_engine,
        testing_metric_groups=self.config.get("scoring", {}).get(
            "testing_metric_groups", []
        ),
        training_metric_groups=self.config.get("scoring", {}).get(
            "training_metric_groups", []
        ),
        bias_config=self.config.get("bias_audit_config", {}),
    )

    self.model_train_tester = ModelTrainTester(
        matrix_storage_engine=self.matrix_storage_engine,
        model_evaluator=self.evaluator,
        model_trainer=self.trainer,
        individual_importance_calculator=self.individual_importance_calculator,
        predictor=self.predictor,
        subsets=self.subsets,
        protected_groups_generator=self.protected_groups_generator,
        cohort_hash=self.cohort_hash,
        replace=self.replace,
        additional_bigtrain_classnames=self.additional_bigtrain_classnames,
    )
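# A minimal sketch (not from the source) of the experiment-config keys that the
# initialize_components above reads. Every name and value here is an
# illustrative assumption, not a complete or validated triage config.
illustrative_config = {
    "temporal_config": {
        "feature_start_time": "2010-01-01",
        # ...the remaining Timechop arguments go here...
    },
    "label_config": {
        "name": "booking",
        "query": "select entity_id, outcome ...",  # placeholder label query
        "include_missing_labels_in_train_as": False,
    },
    "cohort_config": {
        "name": "active_entities",
        "query": "select distinct entity_id ...",  # placeholder cohort query
    },
    "bias_audit_config": {
        # consumed by ProtectedGroupsGenerator via parse_from_obj above
        "attribute_columns": ["race", "sex"],
        "entity_id_column": "entity_id",
        "knowledge_date_column": "event_date",
    },
    "feature_group_definition": {"all": [True]},
    "feature_group_strategies": ["all"],
    "model_group_keys": [],
    "scoring": {
        "subsets": [],
        "testing_metric_groups": [],
        "training_metric_groups": [],
    },
    "prediction": {"rank_tiebreaker": "worst"},
    "individual_importance": {"n_ranks": 5, "methods": ["uniform"]},
    "user_metadata": {},
}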
def test_load_labels_data_include_missing_labels_as_false():
    """
    Test the load_labels_data function by checking whether the query
    produces the correct labels
    """
    # set up labeling config variables
    dates = [
        datetime.datetime(2016, 1, 1, 0, 0),
        datetime.datetime(2016, 2, 1, 0, 0),
        datetime.datetime(2016, 6, 1, 0, 0),
    ]

    # same as the other load_labels_data test, except we include an extra date,
    # 2016-06-01. this date does have entity 0 included via the states table,
    # but no labels

    # make a dataframe of labels to test against
    labels_df = pd.DataFrame(
        labels,
        columns=[
            "entity_id",
            "as_of_date",
            "label_timespan",
            "label_name",
            "label_type",
            "label",
        ],
    )

    labels_df["as_of_date"] = convert_string_column_to_date(labels_df["as_of_date"])
    labels_df.set_index(["entity_id", "as_of_date"])

    # create an engine and generate a table with fake feature data
    with testing.postgresql.Postgresql() as postgresql:
        engine = create_engine(postgresql.url())
        create_schemas(engine, features_tables, labels, states)

        with get_matrix_storage_engine() as matrix_storage_engine:
            builder = MatrixBuilder(
                db_config=db_config,
                matrix_storage_engine=matrix_storage_engine,
                experiment_hash=experiment_hash,
                engine=engine,
                include_missing_labels_in_train_as=False,
            )

            # make the entity-date table
            entity_date_table_name = builder.make_entity_date_table(
                as_of_times=dates,
                label_type="binary",
                label_name="booking",
                state="active",
                matrix_type="train",
                matrix_uuid="my_uuid",
                label_timespan="1 month",
            )

            result = builder.load_labels_data(
                label_name=label_name,
                label_type=label_type,
                label_timespan="1 month",
                matrix_uuid="my_uuid",
                entity_date_table_name=entity_date_table_name,
            )
            df = pd.DataFrame.from_dict(
                {
                    "entity_id": [0, 2, 3, 4, 4],
                    "as_of_date": [dates[2], dates[1], dates[1], dates[0], dates[1]],
                    "booking": [0, 0, 0, 1, 0],
                }
            ).set_index(["entity_id", "as_of_date"])
            # the first row would not be here if we had not configured the Builder
            # to include missing labels as false

            test = result == df
            assert test.all().all()
def test_replace_false_rerun(self):
    with testing.postgresql.Postgresql() as postgresql:
        # create an engine and generate a table with fake feature data
        engine = create_engine(postgresql.url())
        ensure_db(engine)
        create_schemas(
            engine=engine,
            features_tables=features_tables,
            labels=labels,
            states=states,
        )

        dates = [
            datetime.datetime(2016, 1, 1, 0, 0),
            datetime.datetime(2016, 2, 1, 0, 0),
            datetime.datetime(2016, 3, 1, 0, 0),
        ]

        with get_matrix_storage_engine() as matrix_storage_engine:
            builder = MatrixBuilder(
                db_config=db_config,
                matrix_storage_engine=matrix_storage_engine,
                experiment_hash=experiment_hash,
                engine=engine,
                replace=False,
            )

            feature_dictionary = {
                "features0": ["f1", "f2"],
                "features1": ["f3", "f4"],
            }
            matrix_metadata = {
                "matrix_id": "hi",
                "state": "active",
                "label_name": "booking",
                "end_time": datetime.datetime(2016, 3, 1, 0, 0),
                "feature_start_time": datetime.datetime(2016, 1, 1, 0, 0),
                "label_timespan": "1 month",
                "test_duration": "1 month",
                "indices": ["entity_id", "as_of_date"],
            }
            uuid = filename_friendly_hash(matrix_metadata)
            builder.build_matrix(
                as_of_times=dates,
                label_name="booking",
                label_type="binary",
                feature_dictionary=feature_dictionary,
                matrix_metadata=matrix_metadata,
                matrix_uuid=uuid,
                matrix_type="test",
            )

            assert len(matrix_storage_engine.get_store(uuid).design_matrix) == 5

            # rerun
            builder.make_entity_date_table = Mock()
            builder.build_matrix(
                as_of_times=dates,
                label_name="booking",
                label_type="binary",
                feature_dictionary=feature_dictionary,
                matrix_metadata=matrix_metadata,
                matrix_uuid=uuid,
                matrix_type="test",
            )
            assert not builder.make_entity_date_table.called
def test_load_features_data():
    dates = [datetime.datetime(2016, 1, 1, 0, 0), datetime.datetime(2016, 2, 1, 0, 0)]

    # make dataframe for entity ids and dates
    ids_dates = create_entity_date_df(
        labels=labels,
        states=states,
        as_of_dates=dates,
        label_name="booking",
        label_type="binary",
        label_timespan="1 month",
    )

    features = [["f1", "f2"], ["f3", "f4"]]

    # make dataframes of features to test against
    features_dfs = []
    for i, table in enumerate(features_tables):
        cols = ["entity_id", "as_of_date"] + features[i]
        temp_df = pd.DataFrame(table, columns=cols)
        temp_df["as_of_date"] = convert_string_column_to_date(temp_df["as_of_date"])
        features_dfs.append(
            ids_dates.merge(
                right=temp_df, how="left", on=["entity_id", "as_of_date"]
            ).set_index(["entity_id", "as_of_date"])
        )

    # create an engine and generate a table with fake feature data
    with testing.postgresql.Postgresql() as postgresql:
        engine = create_engine(postgresql.url())
        create_schemas(
            engine=engine, features_tables=features_tables, labels=labels, states=states
        )

        with get_matrix_storage_engine() as matrix_storage_engine:
            builder = MatrixBuilder(
                db_config=db_config,
                matrix_storage_engine=matrix_storage_engine,
                experiment_hash=experiment_hash,
                engine=engine,
            )

            # make the entity-date table
            entity_date_table_name = builder.make_entity_date_table(
                as_of_times=dates,
                label_type="binary",
                label_name="booking",
                state="active",
                matrix_type="train",
                matrix_uuid="my_uuid",
                label_timespan="1 month",
            )

            feature_dictionary = dict(
                ("features{}".format(i), feature_list)
                for i, feature_list in enumerate(features)
            )

            returned_features_dfs = builder.load_features_data(
                as_of_times=dates,
                feature_dictionary=feature_dictionary,
                entity_date_table_name=entity_date_table_name,
                matrix_uuid="my_uuid",
            )

            # get the queries and test them
            for result, df in zip(returned_features_dfs, features_dfs):
                test = result == df
                assert test.all().all()
def test_load_labels_data():
    """
    Test the load_labels_data function by checking whether the query
    produces the correct labels
    """
    # set up labeling config variables
    dates = [datetime.datetime(2016, 1, 1, 0, 0), datetime.datetime(2016, 2, 1, 0, 0)]

    # make a dataframe of labels to test against
    labels_df = pd.DataFrame(
        labels,
        columns=[
            "entity_id",
            "as_of_date",
            "label_timespan",
            "label_name",
            "label_type",
            "label",
        ],
    )

    labels_df["as_of_date"] = convert_string_column_to_date(labels_df["as_of_date"])
    labels_df.set_index(["entity_id", "as_of_date"])

    # create an engine and generate a table with fake feature data
    with testing.postgresql.Postgresql() as postgresql:
        engine = create_engine(postgresql.url())
        create_schemas(engine, features_tables, labels, states)

        with get_matrix_storage_engine() as matrix_storage_engine:
            builder = MatrixBuilder(
                db_config=db_config,
                matrix_storage_engine=matrix_storage_engine,
                experiment_hash=experiment_hash,
                engine=engine,
            )

            # make the entity-date table
            entity_date_table_name = builder.make_entity_date_table(
                as_of_times=dates,
                label_type="binary",
                label_name="booking",
                state="active",
                matrix_type="train",
                matrix_uuid="my_uuid",
                label_timespan="1 month",
            )

            result = builder.load_labels_data(
                label_name=label_name,
                label_type=label_type,
                label_timespan="1 month",
                matrix_uuid="my_uuid",
                entity_date_table_name=entity_date_table_name,
            )
            df = pd.DataFrame.from_dict(
                {
                    "entity_id": [2, 3, 4, 4],
                    "as_of_date": [dates[1], dates[1], dates[0], dates[1]],
                    "booking": [0, 0, 1, 0],
                }
            ).set_index(["entity_id", "as_of_date"])

            test = result == df
            assert test.all().all()
def basic_integration_test(
    state_filters,
    feature_group_create_rules,
    feature_group_mix_rules,
    expected_matrix_multiplier,
    expected_group_lists,
):
    with testing.postgresql.Postgresql() as postgresql:
        db_engine = create_engine(postgresql.url())
        Base.metadata.create_all(db_engine)
        populate_source_data(db_engine)

        with TemporaryDirectory() as temp_dir:
            chopper = Timechop(
                feature_start_time=datetime(2010, 1, 1),
                feature_end_time=datetime(2014, 1, 1),
                label_start_time=datetime(2011, 1, 1),
                label_end_time=datetime(2014, 1, 1),
                model_update_frequency='1year',
                training_label_timespans=['6months'],
                test_label_timespans=['6months'],
                training_as_of_date_frequencies='1day',
                test_as_of_date_frequencies='3months',
                max_training_histories=['1months'],
                test_durations=['1months'],
            )

            state_table_generator = StateTableGeneratorFromDense(
                db_engine=db_engine,
                experiment_hash='abcd',
                dense_state_table='states',
            )

            label_generator = LabelGenerator(
                db_engine=db_engine,
                query=sample_config()['label_config']['query'],
            )

            feature_generator = FeatureGenerator(
                db_engine=db_engine,
                features_schema_name='features',
                replace=True,
            )

            feature_dictionary_creator = FeatureDictionaryCreator(
                db_engine=db_engine, features_schema_name='features'
            )

            feature_group_creator = FeatureGroupCreator(feature_group_create_rules)

            feature_group_mixer = FeatureGroupMixer(feature_group_mix_rules)

            project_storage = ProjectStorage(temp_dir)
            planner = Planner(
                feature_start_time=datetime(2010, 1, 1),
                label_names=['outcome'],
                label_types=['binary'],
                states=state_filters,
                user_metadata={},
            )

            builder = MatrixBuilder(
                engine=db_engine,
                db_config={
                    'features_schema_name': 'features',
                    'labels_schema_name': 'public',
                    'labels_table_name': 'labels',
                    'sparse_state_table_name': 'tmp_sparse_states_abcd',
                },
                matrix_storage_engine=project_storage.matrix_storage_engine(),
                replace=True,
            )

            # chop time
            split_definitions = chopper.chop_time()
            num_split_matrices = sum(
                1 + len(split['test_matrices']) for split in split_definitions
            )

            # generate as_of_times for feature/label/state generation
            all_as_of_times = []
            for split in split_definitions:
                all_as_of_times.extend(split['train_matrix']['as_of_times'])
                for test_matrix in split['test_matrices']:
                    all_as_of_times.extend(test_matrix['as_of_times'])
            all_as_of_times = list(set(all_as_of_times))

            # generate sparse state table
            state_table_generator.generate_sparse_table(as_of_dates=all_as_of_times)

            # create labels table
            label_generator.generate_all_labels(
                labels_table='labels',
                as_of_dates=all_as_of_times,
                label_timespans=['6months'],
            )

            # create feature table tasks
            # we would use FeatureGenerator#create_all_tables but want to use
            # the tasks dict directly to create a feature dict
            aggregations = feature_generator.aggregations(
                feature_aggregation_config=[
                    {
                        'prefix': 'cat',
                        'from_obj': 'cat_complaints',
                        'knowledge_date_column': 'as_of_date',
                        'aggregates': [
                            {
                                'quantity': 'cat_sightings',
                                'metrics': ['count', 'avg'],
                                'imputation': {'all': {'type': 'mean'}},
                            }
                        ],
                        'intervals': ['1y'],
                        'groups': ['entity_id'],
                    },
                    {
                        'prefix': 'dog',
                        'from_obj': 'dog_complaints',
                        'knowledge_date_column': 'as_of_date',
                        'aggregates_imputation': {
                            'count': {'type': 'constant', 'value': 7},
                            'sum': {'type': 'mean'},
                            'avg': {'type': 'zero'},
                        },
                        'aggregates': [
                            {
                                'quantity': 'dog_sightings',
                                'metrics': ['count', 'avg'],
                            }
                        ],
                        'intervals': ['1y'],
                        'groups': ['entity_id'],
                    },
                ],
                feature_dates=all_as_of_times,
                state_table=state_table_generator.sparse_table_name,
            )
            feature_table_agg_tasks = feature_generator.generate_all_table_tasks(
                aggregations, task_type='aggregation'
            )

            # create feature aggregation tables
            feature_generator.process_table_tasks(feature_table_agg_tasks)

            feature_table_imp_tasks = feature_generator.generate_all_table_tasks(
                aggregations, task_type='imputation'
            )

            # create feature imputation tables
            feature_generator.process_table_tasks(feature_table_imp_tasks)

            # build feature dictionaries from feature tables and
            # subsetting config
            master_feature_dict = feature_dictionary_creator.feature_dictionary(
                feature_table_names=feature_table_imp_tasks.keys(),
                index_column_lookup=feature_generator.index_column_lookup(aggregations),
            )

            feature_dicts = feature_group_mixer.generate(
                feature_group_creator.subsets(master_feature_dict)
            )

            # figure out what matrices need to be built
            _, matrix_build_tasks = planner.generate_plans(
                split_definitions, feature_dicts
            )

            # go and build the matrices
            builder.build_all_matrices(matrix_build_tasks)

            # super basic assertion: did matrices we expect get created?
            matrices_records = list(
                db_engine.execute(
                    '''select matrix_uuid, num_observations, matrix_type
                    from model_metadata.matrices
                    '''
                )
            )
            matrix_directory = os.path.join(temp_dir, 'matrices')
            matrices = [
                path for path in os.listdir(matrix_directory) if '.csv' in path
            ]
            metadatas = [
                path for path in os.listdir(matrix_directory) if '.yaml' in path
            ]
            assert len(matrices) == num_split_matrices * expected_matrix_multiplier
            assert len(metadatas) == num_split_matrices * expected_matrix_multiplier
            assert len(matrices) == len(matrices_records)

            feature_group_name_lists = []
            for metadata_path in metadatas:
                with open(os.path.join(matrix_directory, metadata_path)) as f:
                    metadata = yaml.safe_load(f)
                    feature_group_name_lists.append(metadata['feature_groups'])

            for matrix_uuid, num_observations, matrix_type in matrices_records:
                assert matrix_uuid in matrix_build_tasks  # the hashes of the matrices
                assert type(num_observations) is int
                assert matrix_type == matrix_build_tasks[matrix_uuid]['matrix_type']

            def deep_unique_tuple(l):
                return set([tuple(i) for i in l])

            assert deep_unique_tuple(feature_group_name_lists) == deep_unique_tuple(
                expected_group_lists
            )
def test_make_entity_date_table_include_missing_labels():
    """
    Test that the make_entity_date_table function contains the correct
    values.
    """
    dates = [
        datetime.datetime(2016, 1, 1, 0, 0),
        datetime.datetime(2016, 2, 1, 0, 0),
        datetime.datetime(2016, 3, 1, 0, 0),
        datetime.datetime(2016, 6, 1, 0, 0),
    ]

    # same as the other make_entity_date_label test except there is an extra
    # date, 2016-06-01. entity 0 is included in this date via the states table,
    # but has no label

    # make a dataframe of entity ids and dates to test against
    ids_dates = create_entity_date_df(
        labels=labels,
        states=states,
        as_of_dates=dates,
        label_name="booking",
        label_type="binary",
        label_timespan="1 month",
    )
    # this line adds the new entity-date combo as an expected one
    ids_dates = ids_dates.append(
        {"entity_id": 0, "as_of_date": datetime.date(2016, 6, 1)}, ignore_index=True
    )

    with testing.postgresql.Postgresql() as postgresql:
        # create an engine and generate a table with fake feature data
        engine = create_engine(postgresql.url())
        create_schemas(
            engine=engine, features_tables=features_tables, labels=labels, states=states
        )

        with get_matrix_storage_engine() as matrix_storage_engine:
            builder = MatrixBuilder(
                db_config=db_config,
                matrix_storage_engine=matrix_storage_engine,
                experiment_hash=experiment_hash,
                include_missing_labels_in_train_as=False,
                engine=engine,
            )
            engine.execute("CREATE TABLE features.tmp_entity_date (a int, b date);")
            # call the function to test the creation of the table
            entity_date_table_name = builder.make_entity_date_table(
                as_of_times=dates,
                label_type="binary",
                label_name="booking",
                state="active",
                matrix_uuid="my_uuid",
                matrix_type="train",
                label_timespan="1 month",
            )

            # read in the table
            result = pd.read_sql(
                "select * from features.{} order by entity_id, as_of_date".format(
                    entity_date_table_name
                ),
                engine,
            )

            # compare the table to the test dataframe
            assert sorted(result.values.tolist()) == sorted(ids_dates.values.tolist())
def predict_forward_with_existed_model(db_engine, project_path, model_id, as_of_date):
    """Predict forward given model_id and as_of_date and store the prediction in database

    Args:
        db_engine (sqlalchemy.db.engine)
        project_path (string) path used to construct catwalk.storage.ProjectStorage
        model_id (int) The id of a given model in the database
        as_of_date (string) a date string like "YYYY-MM-DD"
    """
    logger.spam("In PREDICT LIST................")
    upgrade_db(db_engine=db_engine)
    project_storage = ProjectStorage(project_path)
    matrix_storage_engine = project_storage.matrix_storage_engine()

    # 1. Get feature and cohort config from database
    (train_matrix_uuid, matrix_metadata) = train_matrix_info_from_model_id(
        db_engine, model_id
    )
    experiment_config = experiment_config_from_model_id(db_engine, model_id)

    # 2. Generate cohort
    cohort_table_name = (
        f"triage_production.cohort_{experiment_config['cohort_config']['name']}"
    )
    cohort_table_generator = EntityDateTableGenerator(
        db_engine=db_engine,
        query=experiment_config['cohort_config']['query'],
        entity_date_table_name=cohort_table_name,
    )
    cohort_table_generator.generate_entity_date_table(
        as_of_dates=[dt_from_str(as_of_date)]
    )

    # 3. Generate feature aggregations
    feature_generator = FeatureGenerator(
        db_engine=db_engine,
        features_schema_name="triage_production",
        feature_start_time=experiment_config['temporal_config']['feature_start_time'],
    )
    collate_aggregations = feature_generator.aggregations(
        feature_aggregation_config=experiment_config['feature_aggregations'],
        feature_dates=[as_of_date],
        state_table=cohort_table_name,
    )
    feature_generator.process_table_tasks(
        feature_generator.generate_all_table_tasks(
            collate_aggregations, task_type='aggregation'
        )
    )

    # 4. Reconstruct feature dictionary from feature_names and generate imputation
    reconstructed_feature_dict = FeatureGroup()
    imputation_table_tasks = OrderedDict()
    for aggregation in collate_aggregations:
        feature_group, feature_names = get_feature_names(aggregation, matrix_metadata)
        reconstructed_feature_dict[feature_group] = feature_names

        # Make sure that features imputed in training are also imputed in production
        features_imputed_in_train = get_feature_needs_imputation_in_train(
            aggregation, feature_names
        )
        features_imputed_in_production = get_feature_needs_imputation_in_production(
            aggregation, db_engine
        )

        total_impute_cols = set(features_imputed_in_production) | set(
            features_imputed_in_train
        )
        total_nonimpute_cols = (
            set(f for f in set(feature_names) if '_imp' not in f) - total_impute_cols
        )

        task_generator = feature_generator._generate_imp_table_tasks_for

        imputation_table_tasks.update(
            task_generator(
                aggregation,
                impute_cols=list(total_impute_cols),
                nonimpute_cols=list(total_nonimpute_cols),
            )
        )
    feature_generator.process_table_tasks(imputation_table_tasks)

    # 5. Build matrix
    db_config = {
        "features_schema_name": "triage_production",
        "labels_schema_name": "public",
        "cohort_table_name": cohort_table_name,
    }

    matrix_builder = MatrixBuilder(
        db_config=db_config,
        matrix_storage_engine=matrix_storage_engine,
        engine=db_engine,
        experiment_hash=None,
        replace=True,
    )

    feature_start_time = experiment_config['temporal_config']['feature_start_time']
    label_name = experiment_config['label_config']['name']
    label_type = 'binary'
    cohort_name = experiment_config['cohort_config']['name']
    user_metadata = experiment_config['user_metadata']

    # Use timechop to get the time definition for production
    temporal_config = experiment_config["temporal_config"]
    temporal_config.update(temporal_params_from_matrix_metadata(db_engine, model_id))
    timechopper = Timechop(**temporal_config)
    prod_definitions = timechopper.define_test_matrices(
        train_test_split_time=dt_from_str(as_of_date),
        test_duration=temporal_config['test_durations'][0],
        test_label_timespan=temporal_config['test_label_timespans'][0],
    )

    matrix_metadata = Planner.make_metadata(
        prod_definitions[-1],
        reconstructed_feature_dict,
        label_name,
        label_type,
        cohort_name,
        'production',
        feature_start_time,
        user_metadata,
    )

    matrix_metadata['matrix_id'] = (
        str(as_of_date) + f'_model_id_{model_id}' + '_risklist'
    )

    matrix_uuid = filename_friendly_hash(matrix_metadata)

    matrix_builder.build_matrix(
        as_of_times=[as_of_date],
        label_name=label_name,
        label_type=label_type,
        feature_dictionary=reconstructed_feature_dict,
        matrix_metadata=matrix_metadata,
        matrix_uuid=matrix_uuid,
        matrix_type="production",
    )

    # 6. Predict the risk score for production
    predictor = Predictor(
        model_storage_engine=project_storage.model_storage_engine(),
        db_engine=db_engine,
        rank_order='best',
    )

    predictor.predict(
        model_id=model_id,
        matrix_store=matrix_storage_engine.get_store(matrix_uuid),
        misc_db_parameters={},
        train_matrix_columns=matrix_storage_engine.get_store(
            train_matrix_uuid
        ).columns(),
    )
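# A minimal usage sketch for predict_forward_with_existed_model, assuming an
# existing triage database and project directory. The connection string, path,
# model_id, and date below are illustrative placeholders, not values from the
# source.
from sqlalchemy import create_engine

db_engine = create_engine("postgresql://user:pass@localhost:5432/triage")
predict_forward_with_existed_model(
    db_engine=db_engine,
    project_path="/path/to/triage/project",  # where matrices and models are stored
    model_id=42,  # a model previously trained and stored by triage
    as_of_date="2021-06-01",  # predictions are generated as of this date
)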
def predict(self, prediction_date):
    """Predict forward by creating a matrix using as_of_date = prediction_date
    and applying the retrained model on it

    Args:
        prediction_date(str)
    """
    cohort_table_name = (
        f"triage_production.cohort_{self.experiment_config['cohort_config']['name']}_predict"
    )

    # 1. Generate cohort
    self.generate_entity_date_table(prediction_date, cohort_table_name)

    # 2. Generate feature aggregations
    collate_aggregations = self.get_collate_aggregations(
        prediction_date, cohort_table_name
    )
    self.feature_generator.process_table_tasks(
        self.feature_generator.generate_all_table_tasks(
            collate_aggregations, task_type='aggregation'
        )
    )

    # 3. Reconstruct feature dictionary from feature_names and generate imputation
    reconstructed_feature_dict, imputation_table_tasks = (
        self.get_feature_dict_and_imputation_task(
            collate_aggregations, self.retrain_model_id
        )
    )
    self.feature_generator.process_table_tasks(imputation_table_tasks)

    # 4. Build matrix
    db_config = {
        "features_schema_name": "triage_production",
        "labels_schema_name": "public",
        "cohort_table_name": cohort_table_name,
    }
    matrix_builder = MatrixBuilder(
        db_config=db_config,
        matrix_storage_engine=self.matrix_storage_engine,
        engine=self.db_engine,
        experiment_hash=None,
        replace=True,
    )

    # Use timechop to get the time definition for production
    temporal_config = self.get_temporal_config_for_retrain(
        dt_from_str(prediction_date)
    )
    timechopper = Timechop(**temporal_config)

    retrain_config = get_retrain_config_from_model_id(
        self.db_engine, self.retrain_model_id
    )

    prod_definitions = timechopper.define_test_matrices(
        train_test_split_time=dt_from_str(prediction_date),
        test_duration=retrain_config['test_duration'],
        test_label_timespan=retrain_config['test_label_timespan'],
    )
    last_split_definition = prod_definitions[-1]
    matrix_metadata = Planner.make_metadata(
        matrix_definition=last_split_definition,
        feature_dictionary=reconstructed_feature_dict,
        label_name=self.label_name,
        label_type='binary',
        cohort_name=self.cohort_name,
        matrix_type='production',
        feature_start_time=self.feature_start_time,
        user_metadata=self.user_metadata,
    )
    matrix_metadata['matrix_id'] = (
        str(prediction_date) + f'_model_id_{self.retrain_model_id}' + '_risklist'
    )
    matrix_uuid = filename_friendly_hash(matrix_metadata)
    matrix_builder.build_matrix(
        as_of_times=[prediction_date],
        label_name=self.label_name,
        label_type='binary',
        feature_dictionary=reconstructed_feature_dict,
        matrix_metadata=matrix_metadata,
        matrix_uuid=matrix_uuid,
        matrix_type="production",
    )

    # 5. Predict the risk score for production
    predictor = Predictor(
        model_storage_engine=self.project_storage.model_storage_engine(),
        db_engine=self.db_engine,
        rank_order='best',
    )
    predictor.predict(
        model_id=self.retrain_model_id,
        matrix_store=self.matrix_storage_engine.get_store(matrix_uuid),
        misc_db_parameters={},
        train_matrix_columns=self.matrix_storage_engine.get_store(
            self.retrain_matrix_uuid
        ).columns(),
    )
    self.predict_matrix_uuid = matrix_uuid
def retrain(self, prediction_date):
    """Retrain a model by going back one split from prediction_date, so the
    as_of_date for training would be (prediction_date - training_label_timespan)

    Args:
        prediction_date(str)
    """
    # Retrain config and hash
    retrain_config = {
        "model_group_id": self.model_group_id,
        "prediction_date": prediction_date,
        "test_label_timespan": self.test_label_timespan,
        "test_duration": self.test_duration,
    }
    self.retrain_hash = save_retrain_and_get_hash(retrain_config, self.db_engine)

    with get_for_update(self.db_engine, Retrain, self.retrain_hash) as retrain:
        retrain.prediction_date = prediction_date

    # Timechop
    prediction_date = dt_from_str(prediction_date)
    temporal_config = self.get_temporal_config_for_retrain(prediction_date)
    timechopper = Timechop(**temporal_config)
    chops = timechopper.chop_time()
    assert len(chops) == 1
    chops_train_matrix = chops[0]['train_matrix']
    as_of_date = datetime.strftime(chops_train_matrix['last_as_of_time'], "%Y-%m-%d")
    retrain_definition = {
        'first_as_of_time': chops_train_matrix['first_as_of_time'],
        'last_as_of_time': chops_train_matrix['last_as_of_time'],
        'matrix_info_end_time': chops_train_matrix['matrix_info_end_time'],
        'as_of_times': [as_of_date],
        'training_label_timespan': chops_train_matrix['training_label_timespan'],
        'max_training_history': chops_train_matrix['max_training_history'],
        'training_as_of_date_frequency': chops_train_matrix['training_as_of_date_frequency'],
    }

    # Set ExperimentRun
    run = TriageRun(
        start_time=datetime.now(),
        git_hash=infer_git_hash(),
        triage_version=infer_triage_version(),
        python_version=infer_python_version(),
        run_type="retrain",
        run_hash=self.retrain_hash,
        last_updated_time=datetime.now(),
        current_status=TriageRunStatus.started,
        installed_libraries=infer_installed_libraries(),
        platform=platform.platform(),
        os_user=getpass.getuser(),
        working_directory=os.getcwd(),
        ec2_instance_type=infer_ec2_instance_type(),
        log_location=infer_log_location(),
        experiment_class_path=classpath(self.__class__),
        random_seed=retrieve_experiment_seed_from_run_id(
            self.db_engine, self.triage_run_id
        ),
    )
    run_id = None
    with scoped_session(self.db_engine) as session:
        session.add(run)
        session.commit()
        run_id = run.run_id
    if not run_id:
        raise ValueError("Failed to retrieve run_id from saved row")

    # set ModelTrainer's run_id and experiment_hash for Retrain run
    self.model_trainer.run_id = run_id
    self.model_trainer.experiment_hash = self.retrain_hash

    # 1. Generate all labels
    self.generate_all_labels(as_of_date)
    record_labels_table_name(run_id, self.db_engine, self.labels_table_name)

    # 2. Generate cohort
    cohort_table_name = (
        f"triage_production.cohort_{self.experiment_config['cohort_config']['name']}_retrain"
    )
    self.generate_entity_date_table(as_of_date, cohort_table_name)
    record_cohort_table_name(run_id, self.db_engine, cohort_table_name)

    # 3. Generate feature aggregations
    collate_aggregations = self.get_collate_aggregations(as_of_date, cohort_table_name)
    feature_aggregation_table_tasks = self.feature_generator.generate_all_table_tasks(
        collate_aggregations, task_type='aggregation'
    )
    self.feature_generator.process_table_tasks(feature_aggregation_table_tasks)

    # 4. Reconstruct feature dictionary from feature_names and generate imputation
    reconstructed_feature_dict, imputation_table_tasks = (
        self.get_feature_dict_and_imputation_task(
            collate_aggregations,
            self.model_group_info['model_id_last_split'],
        )
    )
    feature_group_creator = FeatureGroupCreator(
        self.experiment_config['feature_group_definition']
    )
    feature_group_mixer = FeatureGroupMixer(["all"])
    feature_group_dict = feature_group_mixer.generate(
        feature_group_creator.subsets(reconstructed_feature_dict)
    )[0]
    self.feature_generator.process_table_tasks(imputation_table_tasks)

    # 5. Build new matrix
    db_config = {
        "features_schema_name": "triage_production",
        "labels_schema_name": "public",
        "cohort_table_name": cohort_table_name,
        "labels_table_name": self.labels_table_name,
    }
    record_matrix_building_started(run_id, self.db_engine)
    matrix_builder = MatrixBuilder(
        db_config=db_config,
        matrix_storage_engine=self.matrix_storage_engine,
        engine=self.db_engine,
        experiment_hash=None,
        replace=True,
    )
    new_matrix_metadata = Planner.make_metadata(
        matrix_definition=retrain_definition,
        feature_dictionary=feature_group_dict,
        label_name=self.label_name,
        label_type='binary',
        cohort_name=self.cohort_name,
        matrix_type='train',
        feature_start_time=dt_from_str(self.feature_start_time),
        user_metadata=self.user_metadata,
    )

    new_matrix_metadata['matrix_id'] = "_".join(
        [
            self.label_name,
            'binary',
            str(as_of_date),
            'retrain',
        ]
    )

    matrix_uuid = filename_friendly_hash(new_matrix_metadata)
    matrix_builder.build_matrix(
        as_of_times=[as_of_date],
        label_name=self.label_name,
        label_type='binary',
        feature_dictionary=feature_group_dict,
        matrix_metadata=new_matrix_metadata,
        matrix_uuid=matrix_uuid,
        matrix_type="train",
    )

    retrain_model_comment = 'retrain_' + str(datetime.now())

    misc_db_parameters = {
        'train_end_time': dt_from_str(as_of_date),
        'test': False,
        'train_matrix_uuid': matrix_uuid,
        'training_label_timespan': self.training_label_timespan,
        'model_comment': retrain_model_comment,
    }

    # get the random seed from the last split
    last_split_train_matrix_uuid, last_split_matrix_metadata = (
        train_matrix_info_from_model_id(
            self.db_engine, model_id=self.model_group_info['model_id_last_split']
        )
    )

    random_seed = self.model_trainer.get_or_generate_random_seed(
        model_group_id=self.model_group_id,
        matrix_metadata=last_split_matrix_metadata,
        train_matrix_uuid=last_split_train_matrix_uuid,
    )

    # create retrain model hash
    retrain_model_hash = self.model_trainer._model_hash(
        self.matrix_storage_engine.get_store(matrix_uuid).metadata,
        class_path=self.model_group_info['model_type'],
        parameters=self.model_group_info['hyperparameters'],
        random_seed=random_seed,
    )

    associate_models_with_retrain(
        self.retrain_hash, (retrain_model_hash,), self.db_engine
    )

    record_model_building_started(run_id, self.db_engine)
    retrain_model_id = self.model_trainer.process_train_task(
        matrix_store=self.matrix_storage_engine.get_store(matrix_uuid),
        class_path=self.model_group_info['model_type'],
        parameters=self.model_group_info['hyperparameters'],
        model_hash=retrain_model_hash,
        misc_db_parameters=misc_db_parameters,
        random_seed=random_seed,
        retrain=True,
        model_group_id=self.model_group_id,
    )

    self.retrain_model_hash = retrieve_model_hash_from_id(
        self.db_engine, retrain_model_id
    )
    self.retrain_matrix_uuid = matrix_uuid
    self.retrain_model_id = retrain_model_id
    return {
        'retrain_model_comment': retrain_model_comment,
        'retrain_model_id': retrain_model_id,
    }
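# A sketch of how the retrain and predict methods above might be driven
# together, assuming they live on triage's Retrainer class; the constructor
# arguments and the date are illustrative placeholders, not a verified
# invocation.
retrainer = Retrainer(
    db_engine=db_engine,
    project_path="/path/to/triage/project",
    model_group_id=1,  # the model group whose last model is being retrained
)
retrain_info = retrainer.retrain("2021-01-01")  # train one split back from this date
retrainer.predict("2021-01-01")  # then predict forward with the retrained model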
def basic_integration_test(
    cohort_names,
    feature_group_create_rules,
    feature_group_mix_rules,
    expected_matrix_multiplier,
    expected_group_lists,
):
    with testing.postgresql.Postgresql() as postgresql:
        db_engine = create_engine(postgresql.url())
        Base.metadata.create_all(db_engine)
        populate_source_data(db_engine)

        with TemporaryDirectory() as temp_dir:
            chopper = Timechop(
                feature_start_time=datetime(2010, 1, 1),
                feature_end_time=datetime(2014, 1, 1),
                label_start_time=datetime(2011, 1, 1),
                label_end_time=datetime(2014, 1, 1),
                model_update_frequency="1year",
                training_label_timespans=["6months"],
                test_label_timespans=["6months"],
                training_as_of_date_frequencies="1day",
                test_as_of_date_frequencies="3months",
                max_training_histories=["1months"],
                test_durations=["1months"],
            )

            cohort_table_generator = CohortTableGenerator(
                db_engine=db_engine,
                cohort_table_name="cohort_abcd",
                query="select distinct(entity_id) from events",
            )

            label_generator = LabelGenerator(
                db_engine=db_engine, query=sample_config()["label_config"]["query"]
            )

            feature_generator = FeatureGenerator(
                db_engine=db_engine, features_schema_name="features", replace=True
            )

            feature_dictionary_creator = FeatureDictionaryCreator(
                db_engine=db_engine, features_schema_name="features"
            )

            feature_group_creator = FeatureGroupCreator(feature_group_create_rules)

            feature_group_mixer = FeatureGroupMixer(feature_group_mix_rules)

            project_storage = ProjectStorage(temp_dir)
            planner = Planner(
                feature_start_time=datetime(2010, 1, 1),
                label_names=["outcome"],
                label_types=["binary"],
                cohort_names=cohort_names,
                user_metadata={},
            )

            builder = MatrixBuilder(
                engine=db_engine,
                db_config={
                    "features_schema_name": "features",
                    "labels_schema_name": "public",
                    "labels_table_name": "labels",
                    "cohort_table_name": "cohort_abcd",
                },
                experiment_hash=None,
                matrix_storage_engine=project_storage.matrix_storage_engine(),
                replace=True,
            )

            # chop time
            split_definitions = chopper.chop_time()
            num_split_matrices = sum(
                1 + len(split["test_matrices"]) for split in split_definitions
            )

            # generate as_of_times for feature/label/state generation
            all_as_of_times = []
            for split in split_definitions:
                all_as_of_times.extend(split["train_matrix"]["as_of_times"])
                for test_matrix in split["test_matrices"]:
                    all_as_of_times.extend(test_matrix["as_of_times"])
            all_as_of_times = list(set(all_as_of_times))

            # generate cohort state table
            cohort_table_generator.generate_cohort_table(as_of_dates=all_as_of_times)

            # create labels table
            label_generator.generate_all_labels(
                labels_table="labels",
                as_of_dates=all_as_of_times,
                label_timespans=["6months"],
            )

            # create feature table tasks
            # we would use FeatureGenerator#create_all_tables but want to use
            # the tasks dict directly to create a feature dict
            aggregations = feature_generator.aggregations(
                feature_aggregation_config=[
                    {
                        "prefix": "cat",
                        "from_obj": "cat_complaints",
                        "knowledge_date_column": "as_of_date",
                        "aggregates": [
                            {
                                "quantity": "cat_sightings",
                                "metrics": ["count", "avg"],
                                "imputation": {"all": {"type": "mean"}},
                            }
                        ],
                        "intervals": ["1y"],
                        "groups": ["entity_id"],
                    },
                    {
                        "prefix": "dog",
                        "from_obj": "dog_complaints",
                        "knowledge_date_column": "as_of_date",
                        "aggregates_imputation": {
                            "count": {"type": "constant", "value": 7},
                            "sum": {"type": "mean"},
                            "avg": {"type": "zero"},
                        },
                        "aggregates": [
                            {"quantity": "dog_sightings", "metrics": ["count", "avg"]}
                        ],
                        "intervals": ["1y"],
                        "groups": ["entity_id"],
                    },
                ],
                feature_dates=all_as_of_times,
                state_table=cohort_table_generator.cohort_table_name,
            )
            feature_table_agg_tasks = feature_generator.generate_all_table_tasks(
                aggregations, task_type="aggregation"
            )

            # create feature aggregation tables
            feature_generator.process_table_tasks(feature_table_agg_tasks)

            feature_table_imp_tasks = feature_generator.generate_all_table_tasks(
                aggregations, task_type="imputation"
            )

            # create feature imputation tables
            feature_generator.process_table_tasks(feature_table_imp_tasks)

            # build feature dictionaries from feature tables and
            # subsetting config
            master_feature_dict = feature_dictionary_creator.feature_dictionary(
                feature_table_names=feature_table_imp_tasks.keys(),
                index_column_lookup=feature_generator.index_column_lookup(aggregations),
            )

            feature_dicts = feature_group_mixer.generate(
                feature_group_creator.subsets(master_feature_dict)
            )

            # figure out what matrices need to be built
            _, matrix_build_tasks = planner.generate_plans(
                split_definitions, feature_dicts
            )

            # go and build the matrices
            builder.build_all_matrices(matrix_build_tasks)

            # super basic assertion: did matrices we expect get created?
            matrices_records = list(
                db_engine.execute(
                    """select matrix_uuid, num_observations, matrix_type
                    from model_metadata.matrices
                    """
                )
            )
            matrix_directory = os.path.join(temp_dir, "matrices")
            matrices = [
                path for path in os.listdir(matrix_directory) if ".csv" in path
            ]
            metadatas = [
                path for path in os.listdir(matrix_directory) if ".yaml" in path
            ]
            assert len(matrices) == num_split_matrices * expected_matrix_multiplier
            assert len(metadatas) == num_split_matrices * expected_matrix_multiplier
            assert len(matrices) == len(matrices_records)

            feature_group_name_lists = []
            for metadata_path in metadatas:
                with open(os.path.join(matrix_directory, metadata_path)) as f:
                    metadata = yaml.safe_load(f)
                    feature_group_name_lists.append(metadata["feature_groups"])

            for matrix_uuid, num_observations, matrix_type in matrices_records:
                assert matrix_uuid in matrix_build_tasks  # the hashes of the matrices
                assert type(num_observations) is int
                assert matrix_type == matrix_build_tasks[matrix_uuid]["matrix_type"]

            def deep_unique_tuple(l):
                return set([tuple(i) for i in l])

            assert deep_unique_tuple(feature_group_name_lists) == deep_unique_tuple(
                expected_group_lists
            )
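# A hedged sketch of invoking the integration helper above; the argument
# values (cohort name, group rules, expected counts, and the rendered group
# name) are assumptions for illustration, not the project's actual test
# parameters.
def test_integration_simple():
    basic_integration_test(
        cohort_names=["default"],
        feature_group_create_rules={"all": [True]},  # one group with every feature
        feature_group_mix_rules=["all"],  # no mixing: one matrix per split definition
        expected_matrix_multiplier=1,
        expected_group_lists=[["all: True"]],  # assumed group-name rendering
    )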
def test_load_features_data():
    dates = [
        datetime.datetime(2016, 1, 1, 0, 0),
        datetime.datetime(2016, 2, 1, 0, 0),
    ]

    # make dataframe for entity ids and dates
    ids_dates = create_entity_date_df(
        labels=labels,
        states=states,
        as_of_dates=dates,
        state_one=True,
        state_two=True,
        label_name='booking',
        label_type='binary',
        label_timespan='1 month',
    )

    features = [['f1', 'f2'], ['f3', 'f4']]

    # make dataframes of features to test against
    features_dfs = []
    for i, table in enumerate(features_tables):
        cols = ['entity_id', 'as_of_date'] + features[i]
        temp_df = pd.DataFrame(table, columns=cols)
        temp_df['as_of_date'] = convert_string_column_to_date(temp_df['as_of_date'])
        features_dfs.append(
            ids_dates.merge(
                right=temp_df, how='left', on=['entity_id', 'as_of_date']
            ).set_index(['entity_id', 'as_of_date'])
        )

    # create an engine and generate a table with fake feature data
    with testing.postgresql.Postgresql() as postgresql:
        engine = create_engine(postgresql.url())
        create_schemas(
            engine=engine,
            features_tables=features_tables,
            labels=labels,
            states=states,
        )

        with get_matrix_storage_engine() as matrix_storage_engine:
            builder = MatrixBuilder(
                db_config=db_config,
                matrix_storage_engine=matrix_storage_engine,
                engine=engine,
            )

            # make the entity-date table
            entity_date_table_name = builder.make_entity_date_table(
                as_of_times=dates,
                label_type='binary',
                label_name='booking',
                state='state_one AND state_two',
                matrix_type='train',
                matrix_uuid='my_uuid',
                label_timespan='1 month',
            )

            feature_dictionary = dict(
                ('features{}'.format(i), feature_list)
                for i, feature_list in enumerate(features)
            )

            returned_features_dfs = builder.load_features_data(
                as_of_times=dates,
                feature_dictionary=feature_dictionary,
                entity_date_table_name=entity_date_table_name,
                matrix_uuid='my_uuid',
            )

            # get the queries and test them
            for result, df in zip(returned_features_dfs, features_dfs):
                test = result == df
                assert test.all().all()