def matrix_build_tasks(self):
    """Tasks for all matrices that need to be built as a part of this Experiment.

    Each task contains arguments understood by ``Architect.build_matrix``.

    Returns: (dict) of build tasks, or an empty dict if prerequisite tables are unpopulated
    """
    if not table_has_data(self.sparse_states_table_name, self.db_engine):
        logging.warning('cohort table is not populated, cannot build any matrices')
        return {}
    if not table_has_data(self.labels_table_name, self.db_engine):
        logging.warning('labels table is not populated, cannot build any matrices')
        return {}
    (
        updated_split_definitions,
        matrix_build_tasks,
    ) = self.planner.generate_plans(
        self.split_definitions,
        self.feature_dicts,
    )
    self.full_matrix_definitions = updated_split_definitions
    return matrix_build_tasks
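A minimal sketch of how these tasks might be consumed, assuming each value in the returned dict is a kwargs dict for ``Architect.build_matrix`` (the ``architect`` attribute and the exact task shape here are illustrative assumptions, not confirmed API):

def build_all_matrices(self):
    # run every pending build task produced by matrix_build_tasks
    tasks = self.matrix_build_tasks()
    if not tasks:
        logging.warning('no matrix build tasks found, nothing to build')
        return
    for matrix_uuid, task_kwargs in tasks.items():
        logging.info('building matrix %s', matrix_uuid)
        self.architect.build_matrix(**task_kwargs)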
def test_predictor_save_predictions(matrix_type, predict_setup_args):
    """Test the save_predictions flag being set to False.

    We still want to return predict_proba, but not save data to the DB.
    """
    (project_storage, db_engine, model_id) = predict_setup_args

    # if save_predictions is sent as False, don't save
    predictor = Predictor(
        project_storage.model_storage_engine(),
        db_engine,
        rank_order='worst',
        save_predictions=False,
    )

    matrix_store = get_matrix_store(project_storage)
    train_matrix_columns = matrix_store.columns()
    predict_proba = predictor.predict(
        model_id,
        matrix_store,
        misc_db_parameters=dict(),
        train_matrix_columns=train_matrix_columns,
    )

    # assert
    # 1. that the returned predictions are of the desired length
    assert len(predict_proba) == 2
    # 2. that no entries were saved to the predictions table
    assert not table_has_data(f"{matrix_type}_predictions", db_engine)
def test_predictor_save_predictions(matrix_type, predict_setup_args):
    (project_storage, db_engine, model_id) = predict_setup_args

    # if save_predictions is sent as False, don't save
    predictor = Predictor(
        project_storage.model_storage_engine(),
        db_engine,
        save_predictions=False,
    )

    matrix = matrix_creator(index="entity_id")
    metadata = matrix_metadata_creator(
        end_time=AS_OF_DATE,
        matrix_type=matrix_type,
        indices=["entity_id"],
    )
    matrix_store = get_matrix_store(project_storage, matrix, metadata)
    train_matrix_columns = matrix.columns[0:-1].tolist()
    predict_proba = predictor.predict(
        model_id,
        matrix_store,
        misc_db_parameters=dict(),
        train_matrix_columns=train_matrix_columns,
    )

    # assert
    # 1. that the returned predictions are of the desired length
    assert len(predict_proba) == 2
    # 2. that no entries were saved to the predictions table
    assert not table_has_data(f"{matrix_type}_predictions", db_engine)
def test_populate_predictions_table(finished_experiment_without_predictions):
    """Assert that add_predictions populates the predictions table"""
    db_engine = finished_experiment_without_predictions.db_engine
    model_groups = [1]
    project_path = finished_experiment_without_predictions.project_storage.project_path
    add_predictions(
        db_engine=db_engine,
        model_groups=model_groups,
        project_path=project_path,
    )
    assert table_has_data('test_results.predictions', db_engine)
def _all_valid_entity_dates_query(self, state, as_of_time_strings):
    query = f"""
        SELECT entity_id, as_of_date
        FROM {self.db_config["cohort_table_name"]}
        WHERE {state}
        AND as_of_date IN (SELECT (UNNEST (ARRAY{as_of_time_strings}::timestamp[])))
        ORDER BY entity_id, as_of_date
    """
    if not table_has_data(self.db_config["cohort_table_name"], self.db_engine):
        raise ValueError("Required cohort table does not exist")
    return query
def table_should_have_data(table_name, db_engine):
    """Ensures that the table has at least one row

    Args:
        table_name (string) A table name (with schema)
        db_engine (sqlalchemy.engine)

    Raises: ValueError if the table does not have at least one row
    """
    table_should_exist(table_name, db_engine)
    if not table_has_data(table_name, db_engine):
        raise ValueError("{} table does not have any data".format(table_name))
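For context, a minimal sketch of what a table_has_data helper can look like with SQLAlchemy (illustrative only; the library's actual implementation may differ):

from sqlalchemy import text
from sqlalchemy.exc import ProgrammingError

def table_has_data_sketch(table_name, db_engine):
    """Return True if the table exists and contains at least one row."""
    with db_engine.connect() as conn:
        try:
            # fetch at most one row; enough to prove the table is non-empty
            result = conn.execute(text(f"SELECT 1 FROM {table_name} LIMIT 1"))
        except ProgrammingError:
            # querying a missing table raises a ProgrammingError
            return False
        return result.first() is not None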
def _all_valid_entity_dates_query(self, state, as_of_time_strings):
    query = """
        SELECT entity_id, as_of_date
        FROM {states_table}
        WHERE {state_string}
        AND as_of_date IN (SELECT (UNNEST (ARRAY{times}::timestamp[])))
        ORDER BY entity_id, as_of_date
    """.format(
        states_table=self.db_config['sparse_state_table_name'],
        state_string=state,
        times=as_of_time_strings,
    )
    if not table_has_data(self.db_config['sparse_state_table_name'], self.db_engine):
        raise ValueError('Required sparse state table does not exist')
    return query
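For illustration, with a hypothetical states table staging.sparse_states, state string "active = TRUE", and two as-of time strings, the query built above renders to SQL along these lines:

SELECT entity_id, as_of_date
FROM staging.sparse_states
WHERE active = TRUE
AND as_of_date IN (SELECT (UNNEST (ARRAY['2016-01-01', '2016-02-01']::timestamp[])))
ORDER BY entity_id, as_of_date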
def generate_entity_date_table(self, as_of_dates):
    """Convert the object's input table into a states table for the given as_of_dates

    Args:
        as_of_dates (list of datetime.dates) Dates to include in the state table
    """
    logger.spam(f"Generating entity_date table {self.entity_date_table_name}")
    if self.query:
        logger.spam(f"Query is present, so running query on as_of_dates: {as_of_dates}")
        self._create_and_populate_entity_date_table_from_query(as_of_dates)
    elif self.labels_table_name:
        self._create_and_populate_entity_date_table_from_labels()
    else:
        raise ValueError(
            "Neither query nor labels table name is available, cannot compute cohort"
        )
    logger.spam(f"Table {self.entity_date_table_name} created and populated")

    if not table_has_data(self.entity_date_table_name, self.db_engine):
        raise ValueError(self._empty_table_message(as_of_dates))

    if table_has_duplicates(
        self.entity_date_table_name,
        ['entity_id', 'as_of_date'],
        self.db_engine,
    ):
        raise ValueError(f"Duplicates found in {self.entity_date_table_name}!")

    logger.debug(f"Entity-date table generated at {self.entity_date_table_name}")
    logger.spam(f"Generating stats on {self.entity_date_table_name}")
    logger.spam(
        f"Row count of {self.entity_date_table_name}: "
        f"{table_row_count(self.entity_date_table_name, self.db_engine)}"
    )
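A minimal sketch of the kind of check table_has_duplicates performs, assuming a GROUP BY over the given columns (illustrative, not the library's code):

from sqlalchemy import text

def table_has_duplicates_sketch(table_name, columns, db_engine):
    """Return True if any combination of the given columns appears more than once."""
    column_list = ", ".join(columns)
    query = (
        f"SELECT {column_list} FROM {table_name} "
        f"GROUP BY {column_list} HAVING COUNT(*) > 1 LIMIT 1"
    )
    with db_engine.connect() as conn:
        # one offending group is enough to prove duplicates exist
        return conn.execute(text(query)).first() is not None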
def build_matrix(self, as_of_times, label_name, label_type, feature_dictionary,
                 matrix_directory, matrix_metadata, matrix_uuid, matrix_type):
    """Write a design matrix to disk with the specified parameters.

    :param as_of_times: datetimes to be included in the matrix
    :param label_name: name of the label to be used
    :param label_type: the type of label to be used
    :param feature_dictionary: a dictionary of feature tables and features
        to be included in the matrix
    :param matrix_directory: the directory in which to store the matrix
    :param matrix_metadata: a dictionary of metadata about the matrix
    :param matrix_uuid: a unique id for the matrix
    :param matrix_type: the type (train/test) of matrix
    :type as_of_times: list
    :type label_name: str
    :type label_type: str
    :type feature_dictionary: dict
    :type matrix_directory: str
    :type matrix_metadata: dict
    :type matrix_uuid: str
    :type matrix_type: str

    :return: none
    :rtype: none
    """
    logging.info('popped matrix %s build off the queue', matrix_uuid)
    if not table_has_data(self.db_config['sparse_state_table_name'], self.db_engine):
        logging.warning('cohort table is not populated, cannot build matrix')
        return
    if not table_has_data(
        "{}.{}".format(self.db_config['labels_schema_name'],
                       self.db_config['labels_table_name']),
        self.db_engine,
    ):
        logging.warning('labels table is not populated, cannot build matrix')
        return

    matrix_filename = os.path.join(matrix_directory, '{}.csv'.format(matrix_uuid))

    # Determine whether the output location is local ('' or 'file') or on S3 ('s3')
    path_parsed = urlparse(matrix_filename)
    scheme = path_parsed.scheme

    if scheme in ('', 'file'):
        if not self.replace and os.path.exists(matrix_filename):
            logging.info('Skipping %s because matrix already exists', matrix_filename)
            return
    elif scheme == 's3':
        if not self.replace and s3fs.S3FileSystem().exists(matrix_filename):
            logging.info('Skipping %s because matrix already exists', matrix_filename)
            return
    else:
        raise ValueError(
            f"URL scheme not supported: {scheme} (from {matrix_filename})"
        )

    logging.info('Creating matrix %s > %s', matrix_metadata['matrix_id'], matrix_filename)

    # make the entity time table and query the labels and features tables
    logging.info('Making entity date table for matrix %s', matrix_uuid)
    try:
        entity_date_table_name = self.make_entity_date_table(
            as_of_times, label_name, label_type, matrix_metadata['state'],
            matrix_type, matrix_uuid, matrix_metadata['label_timespan'])
    except ValueError as e:
        logging.warning(
            'Not able to build entity-date table due to: %s - will not build matrix',
            e, exc_info=True)
        return

    logging.info(
        'Extracting feature group data from database into file '
        'for matrix %s', matrix_uuid)
    features_csv_names = self.write_features_data(
        as_of_times, feature_dictionary, entity_date_table_name, matrix_uuid)
    logging.info(f"Feature data extracted for matrix {matrix_uuid}")

    try:
        logging.info(
            'Extracting label data from database into file for '
            'matrix %s', matrix_uuid)
        labels_csv_name = self.write_labels_data(
            label_name, label_type, entity_date_table_name, matrix_uuid,
            matrix_metadata['label_timespan'])
        features_csv_names.insert(0, labels_csv_name)
        logging.info(f"Label data extracted for matrix {matrix_uuid}")

        # stitch together the csvs
        logging.info('Merging feature files for matrix %s', matrix_uuid)
        output = self.merge_feature_csvs(features_csv_names, matrix_directory, matrix_uuid)
        logging.info(f"Features data merged for matrix {matrix_uuid}")
    finally:
        # clean up files and database before finishing
        for csv_name in features_csv_names:
            self.remove_file(csv_name)
    try:
        # store the matrix
        logging.info('Archiving matrix %s with metta', matrix_uuid)
        metta.archive_matrix(
            matrix_config=matrix_metadata,
            df_matrix=output,
            overwrite=True,
            directory=self.matrix_directory,
            format='csv')
        logging.info(f"Matrix {matrix_uuid} archived (using metta)")

        # If completely archived, save its information to matrices table
        # At this point, existence of matrix already tested, so no need to delete from db
        if matrix_type == 'train':
            lookback = matrix_metadata["max_training_history"]
        else:
            lookback = matrix_metadata["test_duration"]

        matrix = Matrix(
            matrix_id=matrix_metadata["matrix_id"],
            matrix_uuid=matrix_uuid,
            matrix_type=matrix_type,
            labeling_window=matrix_metadata["label_timespan"],
            num_observations=len(output),
            lookback_duration=lookback,
            feature_start_time=matrix_metadata["feature_start_time"],
            matrix_metadata=json.dumps(matrix_metadata, sort_keys=True, default=str))
        session = self.sessionmaker()
        session.add(matrix)
        session.commit()
        session.close()
    finally:
        if isinstance(output, str):
            os.remove(output)
def build_matrix(
    self,
    as_of_times,
    label_name,
    label_type,
    feature_dictionary,
    matrix_metadata,
    matrix_uuid,
    matrix_type,
):
    """Write a design matrix to disk with the specified parameters.

    :param as_of_times: datetimes to be included in the matrix
    :param label_name: name of the label to be used
    :param label_type: the type of label to be used
    :param feature_dictionary: a dictionary of feature tables and features
        to be included in the matrix
    :param matrix_metadata: a dictionary of metadata about the matrix
    :param matrix_uuid: a unique id for the matrix
    :param matrix_type: the type (train/test) of matrix
    :type as_of_times: list
    :type label_name: str
    :type label_type: str
    :type feature_dictionary: dict
    :type matrix_metadata: dict
    :type matrix_uuid: str
    :type matrix_type: str

    :return: none
    :rtype: none
    """
    logging.info("popped matrix %s build off the queue", matrix_uuid)
    if not table_has_data(self.db_config["sparse_state_table_name"], self.db_engine):
        logging.warning("cohort table is not populated, cannot build matrix")
        return
    if not table_has_data(
        "{}.{}".format(
            self.db_config["labels_schema_name"],
            self.db_config["labels_table_name"],
        ),
        self.db_engine,
    ):
        logging.warning("labels table is not populated, cannot build matrix")
        return

    matrix_store = self.matrix_storage_engine.get_store(matrix_uuid)
    if not self.replace and matrix_store.exists:
        logging.info("Skipping %s because matrix already exists", matrix_uuid)
        return

    logging.info(
        "Creating matrix %s > %s",
        matrix_metadata["matrix_id"],
        matrix_store.matrix_base_store.path,
    )

    # make the entity time table and query the labels and features tables
    logging.info("Making entity date table for matrix %s", matrix_uuid)
    try:
        entity_date_table_name = self.make_entity_date_table(
            as_of_times,
            label_name,
            label_type,
            matrix_metadata["state"],
            matrix_type,
            matrix_uuid,
            matrix_metadata["label_timespan"],
        )
    except ValueError as e:
        logging.warning(
            "Not able to build entity-date table due to: %s - will not build matrix",
            e, exc_info=True,
        )
        return

    logging.info(
        "Extracting feature group data from database into file for matrix %s",
        matrix_uuid,
    )
    dataframes = self.load_features_data(
        as_of_times, feature_dictionary, entity_date_table_name, matrix_uuid)
    logging.info(f"Feature data extracted for matrix {matrix_uuid}")

    logging.info(
        "Extracting label data from database into file for matrix %s",
        matrix_uuid,
    )
    labels_df = self.load_labels_data(
        label_name,
        label_type,
        entity_date_table_name,
        matrix_uuid,
        matrix_metadata["label_timespan"],
    )
    dataframes.insert(0, labels_df)
    logging.info(f"Label data extracted for matrix {matrix_uuid}")

    # stitch together the csvs
    logging.info("Merging feature files for matrix %s", matrix_uuid)
    output = self.merge_feature_csvs(dataframes, matrix_uuid)
    logging.info(f"Features data merged for matrix {matrix_uuid}")

    # store the matrix
    matrix_store.matrix = output
    matrix_store.metadata = matrix_metadata
    matrix_store.save()
    logging.info(f"Matrix {matrix_uuid} saved")

    # If completely archived, save its information to matrices table
    # At this point, existence of matrix already tested, so no need to delete from db
    if matrix_type == "train":
        lookback = matrix_metadata["max_training_history"]
    else:
        lookback = matrix_metadata["test_duration"]

    matrix = Matrix(
        matrix_id=matrix_metadata["matrix_id"],
        matrix_uuid=matrix_uuid,
        matrix_type=matrix_type,
        labeling_window=matrix_metadata["label_timespan"],
        num_observations=len(output),
        lookback_duration=lookback,
        feature_start_time=matrix_metadata["feature_start_time"],
        matrix_metadata=json.dumps(matrix_metadata, sort_keys=True, default=str),
        built_by_experiment=self.experiment_hash,
    )
    session = self.sessionmaker()
    session.merge(matrix)
    session.commit()
    session.close()
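Note that this version calls session.merge rather than session.add: merge looks up the incoming object's primary key and updates the existing row if one is found, so rebuilding a matrix with replace=True upserts into the matrices table instead of raising a duplicate-key error.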
def build_matrix(
    self,
    as_of_times,
    label_name,
    label_type,
    feature_dictionary,
    matrix_metadata,
    matrix_uuid,
    matrix_type,
):
    """Write a design matrix to disk with the specified parameters.

    :param as_of_times: datetimes to be included in the matrix
    :param label_name: name of the label to be used
    :param label_type: the type of label to be used
    :param feature_dictionary: a dictionary of feature tables and features
        to be included in the matrix
    :param matrix_metadata: a dictionary of metadata about the matrix
    :param matrix_uuid: a unique id for the matrix
    :param matrix_type: the type (train/test) of matrix
    :type as_of_times: list
    :type label_name: str
    :type label_type: str
    :type feature_dictionary: dict
    :type matrix_metadata: dict
    :type matrix_uuid: str
    :type matrix_type: str

    :return: none
    :rtype: none
    """
    logger.spam(f"popped matrix {matrix_uuid} build off the queue")
    if not table_has_data(self.db_config["cohort_table_name"], self.db_engine):
        logger.warning("cohort table is not populated, cannot build matrix")
        if self.run_id:
            errored_matrix(self.run_id, self.db_engine)
        return
    if not table_has_data(
        f"{self.db_config['labels_schema_name']}.{self.db_config['labels_table_name']}",
        self.db_engine,
    ):
        logger.warning("labels table is not populated, cannot build matrix")
        if self.run_id:
            errored_matrix(self.run_id, self.db_engine)
        return

    matrix_store = self.matrix_storage_engine.get_store(matrix_uuid)
    if not self.replace and matrix_store.exists:
        logger.notice(f"Skipping {matrix_uuid} because matrix already exists")
        if self.run_id:
            skipped_matrix(self.run_id, self.db_engine)
        return

    logger.debug(
        f'Storing matrix {matrix_metadata["matrix_id"]} in {matrix_store.matrix_base_store.path}'
    )

    # make the entity time table and query the labels and features tables
    logger.debug(f"Making entity date table for matrix {matrix_uuid}")
    try:
        entity_date_table_name = self.make_entity_date_table(
            as_of_times,
            label_name,
            label_type,
            matrix_metadata["state"],
            matrix_type,
            matrix_uuid,
            matrix_metadata["label_timespan"],
        )
    except ValueError:
        logger.exception(
            "Not able to build entity-date table, will not build matrix"
        )
        if self.run_id:
            errored_matrix(self.run_id, self.db_engine)
        return

    logger.spam(
        f"Extracting feature group data from database into file for matrix {matrix_uuid}"
    )
    dataframes = self.load_features_data(
        as_of_times, feature_dictionary, entity_date_table_name, matrix_uuid)
    logger.debug(f"Feature data extracted for matrix {matrix_uuid}")

    logger.spam(
        f"Extracting label data from database into file for matrix {matrix_uuid}"
    )
    labels_df = self.load_labels_data(
        label_name,
        label_type,
        entity_date_table_name,
        matrix_uuid,
        matrix_metadata["label_timespan"],
    )
    dataframes.insert(0, labels_df)
    logger.debug(f"Label data extracted for matrix {matrix_uuid}")

    # stitch together the csvs
    logger.spam(f"Merging feature files for matrix {matrix_uuid}")
    output = self.merge_feature_csvs(dataframes, matrix_uuid)
    logger.debug(f"Features data merged for matrix {matrix_uuid}")

    matrix_store.metadata = matrix_metadata
    # store the matrix
    labels = output.pop(matrix_store.label_column_name)
    matrix_store.matrix_label_tuple = output, labels
    matrix_store.save()
    logger.info(
        f"Matrix {matrix_uuid} saved in {matrix_store.matrix_base_store.path}"
    )

    # If completely archived, save its information to matrices table
    # At this point, existence of matrix already tested, so no need to delete from db
    if matrix_type == "train":
        lookback = matrix_metadata["max_training_history"]
    else:
        lookback = matrix_metadata["test_duration"]

    matrix = Matrix(
        matrix_id=matrix_metadata["matrix_id"],
        matrix_uuid=matrix_uuid,
        matrix_type=matrix_type,
        labeling_window=matrix_metadata["label_timespan"],
        num_observations=len(output),
        lookback_duration=lookback,
        feature_start_time=matrix_metadata["feature_start_time"],
        feature_dictionary=feature_dictionary,
        matrix_metadata=matrix_metadata,
        built_by_experiment=self.experiment_hash,
    )
    session = self.sessionmaker()
    session.merge(matrix)
    session.commit()
    session.close()

    if self.run_id:
        built_matrix(self.run_id, self.db_engine)
def test_table_has_data(self):
    self.engine.execute("create table incidents (col1 varchar)")
    self.engine.execute("create table compliments (col1 varchar)")
    self.engine.execute("insert into compliments values ('good job')")
    assert dbreflect.table_has_data("compliments", self.engine)
    assert not dbreflect.table_has_data("incidents", self.engine)
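The self.engine used above is assumed to come from a throwaway test database; a minimal sketch of such a fixture using testing.postgresql (the class name here is hypothetical):

from unittest import TestCase

import testing.postgresql
from sqlalchemy import create_engine

class TestDbReflection(TestCase):
    def setUp(self):
        # spin up a temporary PostgreSQL instance and connect to it
        self.postgresql = testing.postgresql.Postgresql()
        self.engine = create_engine(self.postgresql.url())

    def tearDown(self):
        self.engine.dispose()
        self.postgresql.stop()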
def test_run_crosstabs(finished_experiment, crosstabs_config):
    run_crosstabs(finished_experiment.db_engine, crosstabs_config)
    expected_table_name = (
        crosstabs_config.output["schema"] + "." + crosstabs_config.output["table"]
    )
    assert table_has_data(expected_table_name, finished_experiment.db_engine)