def fake_trained_model(db_engine, train_matrix_uuid="efgh", train_end_time=datetime.datetime(2016, 1, 1)):
    """Creates and stores a trivial trained model and training matrix

    Args:
        db_engine (sqlalchemy.engine)
        train_matrix_uuid (str) uuid to assign to the training matrix
        train_end_time (datetime) train end time to record on the model

    Returns:
        (tuple) the trained mock model and its model id for database retrieval
    """
    session = sessionmaker(db_engine)()
    session.merge(Matrix(matrix_uuid=train_matrix_uuid))

    # Create the fake trained model and store in db
    trained_model = MockTrainedModel()
    db_model = Model(
        model_hash="abcd",
        train_matrix_uuid=train_matrix_uuid,
        train_end_time=train_end_time,
    )
    session.add(db_model)
    session.commit()
    model_id = db_model.model_id
    session.close()
    return trained_model, model_id
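# Hypothetical usage sketch (not from the source): given a SQLAlchemy engine
# already bound to a database carrying the results schema, the helper returns
# both the in-memory mock and the persisted model id.
trained_model, model_id = fake_trained_model(db_engine, train_matrix_uuid="efgh")
assert isinstance(trained_model, MockTrainedModel)
assert isinstance(model_id, int)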
def __init__(self, matrix_type, matrix_uuid, label_count, db_engine,
             init_labels=None, metadata_overrides=None, matrix=None):
    base_metadata = {
        'feature_start_time': datetime.date(2014, 1, 1),
        'end_time': datetime.date(2015, 1, 1),
        'as_of_date_frequency': '1y',
        'matrix_id': 'some_matrix',
        'label_name': 'label',
        'label_timespan': '3month',
        'indices': ['entity_id'],
        'matrix_type': matrix_type
    }
    metadata_overrides = metadata_overrides or {}
    base_metadata.update(metadata_overrides)
    if matrix is None:
        matrix = pandas.DataFrame.from_dict({
            'entity_id': [1, 2],
            'feature_one': [3, 4],
            'feature_two': [5, 6],
            'label': [7, 8]
        }).set_index('entity_id')
    if init_labels is None:
        init_labels = []
    self.matrix = matrix
    self.metadata = base_metadata
    self.label_count = label_count
    self.init_labels = init_labels
    self.matrix_uuid = matrix_uuid

    session = sessionmaker(db_engine)()
    session.add(Matrix(matrix_uuid=matrix_uuid))
def __init__(
    self,
    matrix_type,
    matrix_uuid,
    label_count,
    db_engine,
    init_labels=None,
    metadata_overrides=None,
    matrix=None,
    init_as_of_dates=None,
):
    base_metadata = {
        "feature_start_time": datetime.date(2014, 1, 1),
        "end_time": datetime.date(2015, 1, 1),
        "as_of_date_frequency": "1y",
        "matrix_id": "some_matrix",
        "label_name": "label",
        "label_timespan": "3month",
        "indices": MatrixStore.indices,
        "matrix_type": matrix_type,
        "as_of_times": [datetime.date(2014, 10, 1), datetime.date(2014, 7, 1)],
    }
    metadata_overrides = metadata_overrides or {}
    base_metadata.update(metadata_overrides)
    if matrix is None:
        matrix = pd.DataFrame.from_dict({
            "entity_id": [1, 2],
            "as_of_date": [pd.Timestamp(2014, 10, 1), pd.Timestamp(2014, 7, 1)],
            "feature_one": [3, 4],
            "feature_two": [5, 6],
            "label": [7, 8],
        }).set_index(MatrixStore.indices)
    if init_labels is None:
        init_labels = []
    labels = matrix.pop("label")
    self.matrix_label_tuple = matrix, labels
    self.metadata = base_metadata
    self.label_count = label_count
    self.init_labels = pd.Series(init_labels, dtype="float64")
    self.matrix_uuid = matrix_uuid
    self.init_as_of_dates = init_as_of_dates or []

    session = sessionmaker(db_engine)()
    session.add(Matrix(matrix_uuid=matrix_uuid))
    session.commit()
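# Hypothetical usage sketch: assuming the constructor above belongs to a test
# double named MockMatrixStore (the enclosing class name is not shown in this
# snippet), it can be instantiated against a test db_engine and the
# features/labels split read back from matrix_label_tuple.
mock_store = MockMatrixStore(
    matrix_type="test",
    matrix_uuid="1234",
    label_count=2,
    db_engine=db_engine,
)
features, labels = mock_store.matrix_label_tuple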
def prepare():
    with rig_engines() as (db_engine, project_storage):
        train_matrix_uuid = '1234'
        session = sessionmaker(db_engine)()
        session.add(Matrix(matrix_uuid=train_matrix_uuid))

        # Create the fake trained model and store in db
        trained_model = MockTrainedModel()
        model_hash = 'abcd'
        project_storage.model_storage_engine().write(trained_model, model_hash)
        db_model = Model(model_hash=model_hash, train_matrix_uuid=train_matrix_uuid)
        session.add(db_model)
        session.commit()
        yield project_storage, db_engine, db_model.model_id
def fake_trained_model(db_engine, train_matrix_uuid='efgh'):
    """Creates and stores a trivial trained model and training matrix

    Args:
        db_engine (sqlalchemy.engine)
        train_matrix_uuid (str) uuid to assign to the training matrix

    Returns:
        (tuple) the trained mock model and its model id for database retrieval
    """
    session = sessionmaker(db_engine)()
    session.merge(Matrix(matrix_uuid=train_matrix_uuid))

    # Create the fake trained model and store in db
    trained_model = MockTrainedModel()
    db_model = Model(model_hash='abcd', train_matrix_uuid=train_matrix_uuid)
    session.add(db_model)
    session.commit()
    return trained_model, db_model.model_id
def prepare():
    with rig_engines() as (db_engine, project_storage):
        train_matrix_uuid = "1234"
        try:
            session = sessionmaker(db_engine)()
            session.add(Matrix(matrix_uuid=train_matrix_uuid))

            # Create the fake trained model and store in db
            trained_model = MockTrainedModel()
            model_hash = "abcd"
            project_storage.model_storage_engine().write(trained_model, model_hash)
            db_model = Model(
                model_hash=model_hash,
                train_matrix_uuid=train_matrix_uuid,
                random_seed=MODEL_RANDOM_SEED,
            )
            session.add(db_model)
            session.commit()
            yield project_storage, db_engine, db_model.model_id
        finally:
            session.close()
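# Hypothetical usage sketch: assuming prepare() is wrapped with
# contextlib.contextmanager (the decorator is not visible in this snippet),
# a test can unpack the rigged storage, engine, and model id like so.
with prepare() as (project_storage, db_engine, model_id):
    # run assertions against the stored MockTrainedModel / Model row here
    assert model_id is not None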
def __init__(
    self,
    matrix_type,
    matrix_uuid,
    label_count,
    db_engine,
    init_labels=None,
    metadata_overrides=None,
    matrix=None,
):
    base_metadata = {
        "feature_start_time": datetime.date(2014, 1, 1),
        "end_time": datetime.date(2015, 1, 1),
        "as_of_date_frequency": "1y",
        "matrix_id": "some_matrix",
        "label_name": "label",
        "label_timespan": "3month",
        "indices": ["entity_id"],
        "matrix_type": matrix_type,
    }
    metadata_overrides = metadata_overrides or {}
    base_metadata.update(metadata_overrides)
    if matrix is None:
        matrix = pandas.DataFrame.from_dict({
            "entity_id": [1, 2],
            "feature_one": [3, 4],
            "feature_two": [5, 6],
            "label": [7, 8],
        }).set_index("entity_id")
    if init_labels is None:
        init_labels = []
    self.matrix = matrix
    self.metadata = base_metadata
    self.label_count = label_count
    self.init_labels = init_labels
    self.matrix_uuid = matrix_uuid

    session = sessionmaker(db_engine)()
    session.add(Matrix(matrix_uuid=matrix_uuid))
def fake_trained_model(project_path, model_storage_engine, db_engine, train_matrix_uuid='efgh'):
    """Creates and stores a trivial trained model and training matrix

    Args:
        project_path (string) a desired fs/s3 project path
        model_storage_engine (triage.storage.ModelStorageEngine)
        db_engine (sqlalchemy.engine)
        train_matrix_uuid (str) uuid to assign to the training matrix

    Returns:
        (tuple) the trained mock model and its model id for database retrieval
    """
    session = sessionmaker(db_engine)()
    session.add(Matrix(matrix_uuid=train_matrix_uuid))

    # Create the fake trained model and store in db
    trained_model = MockTrainedModel()
    model_storage_engine.get_store('abcd').write(trained_model)
    db_model = Model(model_hash='abcd', train_matrix_uuid=train_matrix_uuid)
    session.add(db_model)
    session.commit()
    return trained_model, db_model.model_id
def build_matrix(self, as_of_times, label_name, label_type, feature_dictionary,
                 matrix_directory, matrix_metadata, matrix_uuid, matrix_type):
    """ Write a design matrix to disk with the specified parameters.

    :param as_of_times: datetimes to be included in the matrix
    :param label_name: name of the label to be used
    :param label_type: the type of label to be used
    :param feature_dictionary: a dictionary of feature tables and features
                               to be included in the matrix
    :param matrix_directory: the directory in which to store the matrix
    :param matrix_metadata: a dictionary of metadata about the matrix
    :param matrix_uuid: a unique id for the matrix
    :param matrix_type: the type (train/test) of matrix
    :type as_of_times: list
    :type label_name: str
    :type label_type: str
    :type feature_dictionary: dict
    :type matrix_directory: str
    :type matrix_metadata: dict
    :type matrix_uuid: str
    :type matrix_type: str

    :return: none
    :rtype: none
    """
    logging.info('popped matrix %s build off the queue', matrix_uuid)
    if not table_has_data(self.db_config['sparse_state_table_name'], self.db_engine):
        logging.warning('cohort table is not populated, cannot build matrix')
        return

    if not table_has_data(
            "{}.{}".format(self.db_config['labels_schema_name'],
                           self.db_config['labels_table_name']),
            self.db_engine):
        logging.warning('labels table is not populated, cannot build matrix')
        return

    matrix_filename = os.path.join(matrix_directory, '{}.csv'.format(matrix_uuid))

    # The output directory is local or in s3
    path_parsed = urlparse(matrix_filename)
    scheme = path_parsed.scheme  # '' or 'file' means a regular file; 's3' means S3
    if scheme in ('', 'file'):
        if not self.replace and os.path.exists(matrix_filename):
            logging.info('Skipping %s because matrix already exists', matrix_filename)
            return
    elif scheme == 's3':
        if not self.replace and s3fs.S3FileSystem().exists(matrix_filename):
            logging.info('Skipping %s because matrix already exists', matrix_filename)
            return
    else:
        raise ValueError(
            f"URL scheme not supported: {scheme} (from {matrix_filename})"
        )

    logging.info('Creating matrix %s > %s', matrix_metadata['matrix_id'], matrix_filename)

    # make the entity time table and query the labels and features tables
    logging.info('Making entity date table for matrix %s', matrix_uuid)
    try:
        entity_date_table_name = self.make_entity_date_table(
            as_of_times, label_name, label_type, matrix_metadata['state'],
            matrix_type, matrix_uuid, matrix_metadata['label_timespan'])
    except ValueError as e:
        logging.warning(
            'Not able to build entity-date table due to: %s - will not build matrix',
            e,
            exc_info=True)
        return

    logging.info('Extracting feature group data from database into file '
                 'for matrix %s', matrix_uuid)
    features_csv_names = self.write_features_data(
        as_of_times, feature_dictionary, entity_date_table_name, matrix_uuid)
    logging.info(f"Feature data extracted for matrix {matrix_uuid}")

    try:
        logging.info('Extracting label data from database into file for '
                     'matrix %s', matrix_uuid)
        labels_csv_name = self.write_labels_data(
            label_name, label_type, entity_date_table_name, matrix_uuid,
            matrix_metadata['label_timespan'])
        features_csv_names.insert(0, labels_csv_name)
        logging.info(f"Label data extracted for matrix {matrix_uuid}")

        # stitch together the csvs
        logging.info('Merging feature files for matrix %s', matrix_uuid)
        output = self.merge_feature_csvs(features_csv_names, matrix_directory, matrix_uuid)
        logging.info(f"Features data merged for matrix {matrix_uuid}")
    finally:
        # clean up files and database before finishing
        for csv_name in features_csv_names:
            self.remove_file(csv_name)

    try:
        # store the matrix
        logging.info('Archiving matrix %s with metta', matrix_uuid)
        metta.archive_matrix(
            matrix_config=matrix_metadata,
            df_matrix=output,
            overwrite=True,
            directory=self.matrix_directory,
            format='csv')
        logging.info(f"Matrix {matrix_uuid} archived (using metta)")

        # If completely archived, save its information to matrices table
        # At this point, existence of matrix already tested, so no need to delete from db
        if matrix_type == 'train':
            lookback = matrix_metadata["max_training_history"]
        else:
            lookback = matrix_metadata["test_duration"]
        matrix = Matrix(
            matrix_id=matrix_metadata["matrix_id"],
            matrix_uuid=matrix_uuid,
            matrix_type=matrix_type,
            labeling_window=matrix_metadata["label_timespan"],
            num_observations=len(output),
            lookback_duration=lookback,
            feature_start_time=matrix_metadata["feature_start_time"],
            matrix_metadata=json.dumps(matrix_metadata, sort_keys=True, default=str))
        session = self.sessionmaker()
        session.add(matrix)
        session.commit()
        session.close()
    finally:
        if isinstance(output, str):
            os.remove(output)
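# Illustrative aside (standard library only): urlparse is what the scheme check
# above relies on to distinguish local paths from S3 URLs; a plain filesystem
# path yields an empty scheme, while an s3:// URL yields "s3".
from urllib.parse import urlparse

print(urlparse("/tmp/matrices/abcd.csv").scheme)         # ''
print(urlparse("s3://bucket/matrices/abcd.csv").scheme)  # 's3'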
def build_matrix(
    self,
    as_of_times,
    label_name,
    label_type,
    feature_dictionary,
    matrix_metadata,
    matrix_uuid,
    matrix_type,
):
    """ Write a design matrix to disk with the specified parameters.

    :param as_of_times: datetimes to be included in the matrix
    :param label_name: name of the label to be used
    :param label_type: the type of label to be used
    :param feature_dictionary: a dictionary of feature tables and features
                               to be included in the matrix
    :param matrix_metadata: a dictionary of metadata about the matrix
    :param matrix_uuid: a unique id for the matrix
    :param matrix_type: the type (train/test) of matrix
    :type as_of_times: list
    :type label_name: str
    :type label_type: str
    :type feature_dictionary: dict
    :type matrix_metadata: dict
    :type matrix_uuid: str
    :type matrix_type: str

    :return: none
    :rtype: none
    """
    logging.info("popped matrix %s build off the queue", matrix_uuid)
    if not table_has_data(self.db_config["sparse_state_table_name"], self.db_engine):
        logging.warning("cohort table is not populated, cannot build matrix")
        return

    if not table_has_data(
        "{}.{}".format(
            self.db_config["labels_schema_name"],
            self.db_config["labels_table_name"],
        ),
        self.db_engine,
    ):
        logging.warning("labels table is not populated, cannot build matrix")
        return

    matrix_store = self.matrix_storage_engine.get_store(matrix_uuid)
    if not self.replace and matrix_store.exists:
        logging.info("Skipping %s because matrix already exists", matrix_uuid)
        return

    logging.info(
        "Creating matrix %s > %s",
        matrix_metadata["matrix_id"],
        matrix_store.matrix_base_store.path,
    )

    # make the entity time table and query the labels and features tables
    logging.info("Making entity date table for matrix %s", matrix_uuid)
    try:
        entity_date_table_name = self.make_entity_date_table(
            as_of_times,
            label_name,
            label_type,
            matrix_metadata["state"],
            matrix_type,
            matrix_uuid,
            matrix_metadata["label_timespan"],
        )
    except ValueError as e:
        logging.warning(
            "Not able to build entity-date table due to: %s - will not build matrix",
            e,
            exc_info=True,
        )
        return

    logging.info(
        "Extracting feature group data from database into file for matrix %s",
        matrix_uuid,
    )
    dataframes = self.load_features_data(
        as_of_times, feature_dictionary, entity_date_table_name, matrix_uuid)
    logging.info(f"Feature data extracted for matrix {matrix_uuid}")

    logging.info(
        "Extracting label data from database into file for matrix %s",
        matrix_uuid,
    )
    labels_df = self.load_labels_data(
        label_name,
        label_type,
        entity_date_table_name,
        matrix_uuid,
        matrix_metadata["label_timespan"],
    )
    dataframes.insert(0, labels_df)
    logging.info(f"Label data extracted for matrix {matrix_uuid}")

    # stitch together the csvs
    logging.info("Merging feature files for matrix %s", matrix_uuid)
    output = self.merge_feature_csvs(dataframes, matrix_uuid)
    logging.info(f"Features data merged for matrix {matrix_uuid}")

    # store the matrix
    matrix_store.matrix = output
    matrix_store.metadata = matrix_metadata
    matrix_store.save()
    logging.info(f"Matrix {matrix_uuid} saved")

    # If completely archived, save its information to matrices table
    # At this point, existence of matrix already tested, so no need to delete from db
    if matrix_type == "train":
        lookback = matrix_metadata["max_training_history"]
    else:
        lookback = matrix_metadata["test_duration"]
    matrix = Matrix(
        matrix_id=matrix_metadata["matrix_id"],
        matrix_uuid=matrix_uuid,
        matrix_type=matrix_type,
        labeling_window=matrix_metadata["label_timespan"],
        num_observations=len(output),
        lookback_duration=lookback,
        feature_start_time=matrix_metadata["feature_start_time"],
        matrix_metadata=json.dumps(matrix_metadata, sort_keys=True, default=str),
        built_by_experiment=self.experiment_hash,
    )
    session = self.sessionmaker()
    session.merge(matrix)
    session.commit()
    session.close()
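# Illustrative aside (standard library only): default=str is what lets json.dumps
# serialize the datetime.date values carried in matrix_metadata; without it,
# json.dumps raises a TypeError on date objects.
import datetime
import json

meta = {"feature_start_time": datetime.date(2014, 1, 1), "label_timespan": "3month"}
print(json.dumps(meta, sort_keys=True, default=str))
# {"feature_start_time": "2014-01-01", "label_timespan": "3month"}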
def build_matrix(
    self,
    as_of_times,
    label_name,
    label_type,
    feature_dictionary,
    matrix_metadata,
    matrix_uuid,
    matrix_type,
):
    """ Write a design matrix to disk with the specified parameters.

    :param as_of_times: datetimes to be included in the matrix
    :param label_name: name of the label to be used
    :param label_type: the type of label to be used
    :param feature_dictionary: a dictionary of feature tables and features
                               to be included in the matrix
    :param matrix_metadata: a dictionary of metadata about the matrix
    :param matrix_uuid: a unique id for the matrix
    :param matrix_type: the type (train/test) of matrix
    :type as_of_times: list
    :type label_name: str
    :type label_type: str
    :type feature_dictionary: dict
    :type matrix_metadata: dict
    :type matrix_uuid: str
    :type matrix_type: str

    :return: none
    :rtype: none
    """
    logger.spam(f"popped matrix {matrix_uuid} build off the queue")
    if not table_has_data(self.db_config["cohort_table_name"], self.db_engine):
        logger.warning("cohort table is not populated, cannot build matrix")
        if self.run_id:
            errored_matrix(self.run_id, self.db_engine)
        return

    if not table_has_data(
        f"{self.db_config['labels_schema_name']}.{self.db_config['labels_table_name']}",
        self.db_engine,
    ):
        logger.warning("labels table is not populated, cannot build matrix")
        if self.run_id:
            errored_matrix(self.run_id, self.db_engine)
        return

    matrix_store = self.matrix_storage_engine.get_store(matrix_uuid)
    if not self.replace and matrix_store.exists:
        logger.notice(f"Skipping {matrix_uuid} because matrix already exists")
        if self.run_id:
            skipped_matrix(self.run_id, self.db_engine)
        return

    logger.debug(
        f'Storing matrix {matrix_metadata["matrix_id"]} in {matrix_store.matrix_base_store.path}'
    )

    # make the entity time table and query the labels and features tables
    logger.debug(f"Making entity date table for matrix {matrix_uuid}")
    try:
        entity_date_table_name = self.make_entity_date_table(
            as_of_times,
            label_name,
            label_type,
            matrix_metadata["state"],
            matrix_type,
            matrix_uuid,
            matrix_metadata["label_timespan"],
        )
    except ValueError as e:
        logger.exception(
            "Not able to build entity-date table, will not build matrix",
        )
        if self.run_id:
            errored_matrix(self.run_id, self.db_engine)
        return

    logger.spam(
        f"Extracting feature group data from database into file for matrix {matrix_uuid}"
    )
    dataframes = self.load_features_data(
        as_of_times, feature_dictionary, entity_date_table_name, matrix_uuid)
    logger.debug(f"Feature data extracted for matrix {matrix_uuid}")

    logger.spam(
        f"Extracting label data from database into file for matrix {matrix_uuid}",
    )
    labels_df = self.load_labels_data(
        label_name,
        label_type,
        entity_date_table_name,
        matrix_uuid,
        matrix_metadata["label_timespan"],
    )
    dataframes.insert(0, labels_df)
    logger.debug(f"Label data extracted for matrix {matrix_uuid}")

    # stitch together the csvs
    logger.spam(f"Merging feature files for matrix {matrix_uuid}")
    output = self.merge_feature_csvs(dataframes, matrix_uuid)
    logger.debug(f"Features data merged for matrix {matrix_uuid}")

    matrix_store.metadata = matrix_metadata
    # store the matrix
    labels = output.pop(matrix_store.label_column_name)
    matrix_store.matrix_label_tuple = output, labels
    matrix_store.save()
    logger.info(
        f"Matrix {matrix_uuid} saved in {matrix_store.matrix_base_store.path}"
    )

    # If completely archived, save its information to matrices table
    # At this point, existence of matrix already tested, so no need to delete from db
    if matrix_type == "train":
        lookback = matrix_metadata["max_training_history"]
    else:
        lookback = matrix_metadata["test_duration"]
    matrix = Matrix(
        matrix_id=matrix_metadata["matrix_id"],
        matrix_uuid=matrix_uuid,
        matrix_type=matrix_type,
        labeling_window=matrix_metadata["label_timespan"],
        num_observations=len(output),
        lookback_duration=lookback,
        feature_start_time=matrix_metadata["feature_start_time"],
        feature_dictionary=feature_dictionary,
        matrix_metadata=matrix_metadata,
        built_by_experiment=self.experiment_hash,
    )
    session = self.sessionmaker()
    session.merge(matrix)
    session.commit()
    session.close()

    if self.run_id:
        built_matrix(self.run_id, self.db_engine)
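# Illustrative aside: the label-split step above relies on DataFrame.pop, which
# removes the label column in place and returns it as a Series, leaving only the
# feature columns behind.
import pandas as pd

df = pd.DataFrame({"feature_one": [3, 4], "feature_two": [5, 6], "label": [7, 8]})
labels = df.pop("label")
print(list(df.columns))  # ['feature_one', 'feature_two']
print(labels.tolist())   # [7, 8]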