Beispiel #1
0
def fake_trained_model(db_engine,
                       train_matrix_uuid="efgh",
                       train_end_time=datetime.datetime(2016, 1, 1)):
    """Create a trivial trained model and register it in the database.

    A ``Matrix`` row for ``train_matrix_uuid`` is merged into the session and
    a ``Model`` row (hash ``"abcd"``) referencing it is added; both are
    committed. The mock model object itself is only returned, it is not
    written to any model storage here.

    Args:
        db_engine (sqlalchemy.engine): engine the session is bound to
        train_matrix_uuid (str): uuid recorded as the model's training matrix
        train_end_time (datetime.datetime): stored on the Model row

    Returns:
        tuple: (MockTrainedModel instance, model id of the committed Model)
    """
    session = sessionmaker(db_engine)()
    try:
        session.merge(Matrix(matrix_uuid=train_matrix_uuid))

        # Create the fake trained model and store in db
        trained_model = MockTrainedModel()
        db_model = Model(
            model_hash="abcd",
            train_matrix_uuid=train_matrix_uuid,
            train_end_time=train_end_time,
        )
        session.add(db_model)
        session.commit()
        # Read the id before closing: after close() the instance is detached
        # and its expired attributes can no longer be loaded.
        model_id = db_model.model_id
    finally:
        # Always release the session, even if merge/commit raised.
        session.close()
    return trained_model, model_id
Beispiel #2
0
    def __init__(
        self,
        matrix_type,
        matrix_uuid,
        label_count,
        db_engine,
        init_labels=None,
        metadata_overrides=None,
        matrix=None,
    ):
        """Set up an in-memory matrix fixture and record its uuid in the db.

        Args:
            matrix_type: stored under the ``matrix_type`` metadata key
            matrix_uuid: uuid kept on the instance and used for the Matrix row
            label_count: stored as-is on the instance
            db_engine (sqlalchemy.engine): engine used to persist the Matrix row
            init_labels: stored as-is on the instance; defaults to ``[]``
            metadata_overrides (dict): entries that replace/extend the
                default metadata
            matrix (pandas.DataFrame): matrix to use; when omitted, a
                two-row frame indexed by ``entity_id`` is built
        """
        base_metadata = {
            'feature_start_time': datetime.date(2014, 1, 1),
            'end_time': datetime.date(2015, 1, 1),
            'as_of_date_frequency': '1y',
            'matrix_id': 'some_matrix',
            'label_name': 'label',
            'label_timespan': '3month',
            'indices': ['entity_id'],
            'matrix_type': matrix_type
        }
        metadata_overrides = metadata_overrides or {}
        base_metadata.update(metadata_overrides)
        if matrix is None:
            matrix = pandas.DataFrame.from_dict({
                'entity_id': [1, 2],
                'feature_one': [3, 4],
                'feature_two': [5, 6],
                'label': [7, 8]
            }).set_index('entity_id')
        if init_labels is None:
            init_labels = []
        self.matrix = matrix
        self.metadata = base_metadata
        self.label_count = label_count
        self.init_labels = init_labels
        self.matrix_uuid = matrix_uuid

        session = sessionmaker(db_engine)()
        try:
            session.add(Matrix(matrix_uuid=matrix_uuid))
            # Without a commit the pending row is discarded with the session
            # and never reaches the database.
            session.commit()
        finally:
            session.close()
Beispiel #3
0
    def __init__(
        self,
        matrix_type,
        matrix_uuid,
        label_count,
        db_engine,
        init_labels=None,
        metadata_overrides=None,
        matrix=None,
        init_as_of_dates=None,
    ):
        """Build an in-memory matrix fixture and commit a Matrix row for it.

        Default metadata is assembled first and then updated with
        ``metadata_overrides``. When no ``matrix`` is passed, a two-row frame
        indexed by ``MatrixStore.indices`` is created. The ``label`` column is
        popped off the matrix (in place) and kept alongside it in
        ``matrix_label_tuple``.
        """
        metadata = dict(
            feature_start_time=datetime.date(2014, 1, 1),
            end_time=datetime.date(2015, 1, 1),
            as_of_date_frequency="1y",
            matrix_id="some_matrix",
            label_name="label",
            label_timespan="3month",
            indices=MatrixStore.indices,
            matrix_type=matrix_type,
            as_of_times=[
                datetime.date(2014, 10, 1),
                datetime.date(2014, 7, 1),
            ],
        )
        metadata.update(metadata_overrides or {})

        if matrix is None:
            default_columns = {
                "entity_id": [1, 2],
                "as_of_date": [
                    pd.Timestamp(2014, 10, 1),
                    pd.Timestamp(2014, 7, 1),
                ],
                "feature_one": [3, 4],
                "feature_two": [5, 6],
                "label": [7, 8],
            }
            frame = pd.DataFrame.from_dict(default_columns)
            matrix = frame.set_index(MatrixStore.indices)

        # pop() removes the label column from `matrix` itself
        label_series = matrix.pop("label")
        self.matrix_label_tuple = (matrix, label_series)
        self.metadata = metadata
        self.label_count = label_count
        self.init_labels = pd.Series(
            [] if init_labels is None else init_labels,
            dtype="float64",
        )
        self.matrix_uuid = matrix_uuid
        self.init_as_of_dates = init_as_of_dates or []

        # Register the matrix uuid in the database
        make_session = sessionmaker(db_engine)
        db_session = make_session()
        db_session.add(Matrix(matrix_uuid=matrix_uuid))
        db_session.commit()
Beispiel #4
0
def prepare():
    """Yield a rigged storage/db pair holding one stored mock model.

    Inside ``rig_engines()``, a Matrix row ('1234') and a Model row (hash
    'abcd') are committed and the mock model is written to the project's
    model storage under that hash.

    Yields:
        tuple: (project_storage, db_engine, model id of the committed Model)
    """
    with rig_engines() as (db_engine, project_storage):
        train_matrix_uuid = '1234'
        session = sessionmaker(db_engine)()
        try:
            session.add(Matrix(matrix_uuid=train_matrix_uuid))

            # Create the fake trained model and store in db
            trained_model = MockTrainedModel()
            model_hash = 'abcd'
            project_storage.model_storage_engine().write(
                trained_model, model_hash)
            db_model = Model(model_hash=model_hash,
                             train_matrix_uuid=train_matrix_uuid)
            session.add(db_model)
            session.commit()
            yield project_storage, db_engine, db_model.model_id
        finally:
            # Release the session once the consumer is done (or on error),
            # before rig_engines() tears the engine down.
            session.close()
Beispiel #5
0
def fake_trained_model(db_engine, train_matrix_uuid='efgh'):
    """Create a trivial trained model and register it in the database.

    A ``Matrix`` row for ``train_matrix_uuid`` is merged into the session and
    a ``Model`` row (hash ``'abcd'``) referencing it is added; both are
    committed. The mock model object itself is only returned, it is not
    written to any model storage here.

    Args:
        db_engine (sqlalchemy.engine): engine the session is bound to
        train_matrix_uuid (str): uuid recorded as the model's training matrix

    Returns:
        tuple: (MockTrainedModel instance, model id of the committed Model)
    """
    session = sessionmaker(db_engine)()
    try:
        session.merge(Matrix(matrix_uuid=train_matrix_uuid))

        # Create the fake trained model and store in db
        trained_model = MockTrainedModel()
        db_model = Model(model_hash='abcd',
                         train_matrix_uuid=train_matrix_uuid)
        session.add(db_model)
        session.commit()
        # Read the id while the session is still open; it is reloaded from
        # the database after commit.
        model_id = db_model.model_id
    finally:
        # Always release the session, even if merge/commit raised.
        session.close()
    return trained_model, model_id
Beispiel #6
0
def prepare():
    """Yield a rigged storage/db pair holding one stored mock model.

    Inside ``rig_engines()``, a Matrix row ("1234") and a Model row (hash
    "abcd", seeded with ``MODEL_RANDOM_SEED``) are committed and the mock
    model is written to the project's model storage under that hash. The
    session is closed when the generator is finalized.

    Yields:
        tuple: (project_storage, db_engine, model id of the committed Model)
    """
    with rig_engines() as (db_engine, project_storage):
        train_matrix_uuid = "1234"
        # Create the session outside the try block: if creation itself fails
        # there is nothing to close, and the finally clause must not raise an
        # UnboundLocalError that would hide the real error.
        session = sessionmaker(db_engine)()
        try:
            session.add(Matrix(matrix_uuid=train_matrix_uuid))

            # Create the fake trained model and store in db
            trained_model = MockTrainedModel()
            model_hash = "abcd"
            project_storage.model_storage_engine().write(
                trained_model, model_hash)
            db_model = Model(model_hash=model_hash,
                             train_matrix_uuid=train_matrix_uuid,
                             random_seed=MODEL_RANDOM_SEED)
            session.add(db_model)
            session.commit()
            yield project_storage, db_engine, db_model.model_id
        finally:
            session.close()
Beispiel #7
0
    def __init__(
        self,
        matrix_type,
        matrix_uuid,
        label_count,
        db_engine,
        init_labels=None,
        metadata_overrides=None,
        matrix=None,
    ):
        """Set up an in-memory matrix fixture and record its uuid in the db.

        Args:
            matrix_type: stored under the ``matrix_type`` metadata key
            matrix_uuid: uuid kept on the instance and used for the Matrix row
            label_count: stored as-is on the instance
            db_engine (sqlalchemy.engine): engine used to persist the Matrix row
            init_labels: stored as-is on the instance; defaults to ``[]``
            metadata_overrides (dict): entries that replace/extend the
                default metadata
            matrix (pandas.DataFrame): matrix to use; when omitted, a
                two-row frame indexed by ``entity_id`` is built
        """
        base_metadata = {
            "feature_start_time": datetime.date(2014, 1, 1),
            "end_time": datetime.date(2015, 1, 1),
            "as_of_date_frequency": "1y",
            "matrix_id": "some_matrix",
            "label_name": "label",
            "label_timespan": "3month",
            "indices": ["entity_id"],
            "matrix_type": matrix_type,
        }
        metadata_overrides = metadata_overrides or {}
        base_metadata.update(metadata_overrides)
        if matrix is None:
            matrix = pandas.DataFrame.from_dict({
                "entity_id": [1, 2],
                "feature_one": [3, 4],
                "feature_two": [5, 6],
                "label": [7, 8],
            }).set_index("entity_id")
        if init_labels is None:
            init_labels = []
        self.matrix = matrix
        self.metadata = base_metadata
        self.label_count = label_count
        self.init_labels = init_labels
        self.matrix_uuid = matrix_uuid

        session = sessionmaker(db_engine)()
        try:
            session.add(Matrix(matrix_uuid=matrix_uuid))
            # Without a commit the pending row is discarded with the session
            # and never reaches the database.
            session.commit()
        finally:
            session.close()
Beispiel #8
0
def fake_trained_model(project_path,
                       model_storage_engine,
                       db_engine,
                       train_matrix_uuid='efgh'):
    """Create a trivial trained model, store it, and register it in the db.

    The mock model is written to the store returned by
    ``model_storage_engine.get_store('abcd')``. A ``Matrix`` row for
    ``train_matrix_uuid`` and a ``Model`` row (hash ``'abcd'``) referencing
    it are added and committed.

    Args:
        project_path (string): a desired fs/s3 project path; not referenced
            in the body, accepted for interface compatibility
        model_storage_engine (triage.storage.ModelStorageEngine)
        db_engine (sqlalchemy.engine)
        train_matrix_uuid (string): uuid recorded as the training matrix

    Returns:
        tuple: (MockTrainedModel instance, model id of the committed Model)
    """
    session = sessionmaker(db_engine)()
    try:
        session.add(Matrix(matrix_uuid=train_matrix_uuid))

        # Create the fake trained model and store in db
        trained_model = MockTrainedModel()
        model_storage_engine.get_store('abcd').write(trained_model)
        db_model = Model(model_hash='abcd',
                         train_matrix_uuid=train_matrix_uuid)
        session.add(db_model)
        session.commit()
        # Read the id while the session is still open; it is reloaded from
        # the database after commit.
        model_id = db_model.model_id
    finally:
        # Always release the session, even if the write or commit raised.
        session.close()
    return trained_model, model_id
Beispiel #9
0
    def build_matrix(self, as_of_times, label_name, label_type,
                     feature_dictionary, matrix_directory, matrix_metadata,
                     matrix_uuid, matrix_type):
        """ Write a design matrix to disk with the specified parameters.

        Returns early, without building anything, when the cohort or labels
        table has no data, when the matrix file already exists and
        ``self.replace`` is false, or when ``make_entity_date_table`` raises
        ``ValueError``. Otherwise features and labels are extracted to
        intermediate csvs, merged, archived with metta, and a ``Matrix`` row
        describing the result is committed.

        :param as_of_times: datetimes to be included in the matrix
        :param label_name: name of the label to be used
        :param label_type: the type of label to be used
        :param feature_dictionary: a dictionary of feature tables and features
                                   to be included in the matrix
        :param matrix_directory: the directory in which to store the matrix
        :param matrix_metadata: a dictionary of metadata about the matrix
        :param matrix_uuid: a unique id for the matrix
        :param matrix_type: the type (train/test) of matrix
        :type as_of_times: list
        :type label_name: str
        :type label_type: str
        :type feature_dictionary: dict
        :type matrix_directory: str
        :type matrix_metadata: dict
        :type matrix_uuid: str
        :type matrix_type: str

        :return: none
        :rtype: none
        :raises ValueError: if the matrix path has a URL scheme other than
                            '', 'file' or 's3'
        """
        logging.info('popped matrix %s build off the queue', matrix_uuid)
        if not table_has_data(self.db_config['sparse_state_table_name'],
                              self.db_engine):
            logging.warning(
                'cohort table is not populated, cannot build matrix')
            return
        if not table_has_data(
                "{}.{}".format(self.db_config['labels_schema_name'],
                               self.db_config['labels_table_name']),
                self.db_engine):
            logging.warning(
                'labels table is not populated, cannot build matrix')
            return

        matrix_filename = os.path.join(matrix_directory,
                                       '{}.csv'.format(matrix_uuid))

        # The output directory is local or in s3
        path_parsed = urlparse(matrix_filename)
        # '' or 'file' means a regular file; 's3' means an S3 object
        scheme = path_parsed.scheme

        if scheme in ('', 'file'):
            if not self.replace and os.path.exists(matrix_filename):
                logging.info('Skipping %s because matrix already exists',
                             matrix_filename)
                return
        elif scheme == 's3':
            if not self.replace and s3fs.S3FileSystem().exists(
                    matrix_filename):
                logging.info('Skipping %s because matrix already exists',
                             matrix_filename)
                return
        else:
            raise ValueError(f"""URL scheme not supported:
              {scheme} (from {matrix_filename})
            """)

        logging.info('Creating matrix %s > %s', matrix_metadata['matrix_id'],
                     matrix_filename)
        # make the entity time table and query the labels and features tables
        logging.info('Making entity date table for matrix %s', matrix_uuid)
        try:
            entity_date_table_name = self.make_entity_date_table(
                as_of_times, label_name, label_type, matrix_metadata['state'],
                matrix_type, matrix_uuid, matrix_metadata['label_timespan'])
        except ValueError as e:
            # Pass the exception as the %s argument; without it the
            # placeholder is logged literally.
            logging.warning(
                'Not able to build entity-date table due to: %s - will not build matrix',
                e,
                exc_info=True)
            return
        logging.info(
            'Extracting feature group data from database into file '
            'for matrix %s', matrix_uuid)
        features_csv_names = self.write_features_data(as_of_times,
                                                      feature_dictionary,
                                                      entity_date_table_name,
                                                      matrix_uuid)
        logging.info(f"Feature data extracted for matrix {matrix_uuid}")
        try:
            logging.info(
                'Extracting label data from database into file for '
                'matrix %s', matrix_uuid)
            labels_csv_name = self.write_labels_data(
                label_name, label_type, entity_date_table_name, matrix_uuid,
                matrix_metadata['label_timespan'])
            # labels go first so they are cleaned up with the feature csvs
            features_csv_names.insert(0, labels_csv_name)

            logging.info(f"Label data extracted for matrix {matrix_uuid}")
            # stitch together the csvs
            logging.info('Merging feature files for matrix %s', matrix_uuid)
            output = self.merge_feature_csvs(features_csv_names,
                                             matrix_directory, matrix_uuid)
            logging.info(f"Features data merged for matrix {matrix_uuid}")
        finally:
            # clean up the intermediate csv files, whether or not the
            # label extraction / merge succeeded
            for csv_name in features_csv_names:
                self.remove_file(csv_name)
        try:
            # store the matrix
            logging.info('Archiving matrix %s with metta', matrix_uuid)
            # NOTE(review): this archives to self.matrix_directory while the
            # existence check above uses the matrix_directory argument --
            # confirm the two are always the same location.
            metta.archive_matrix(matrix_config=matrix_metadata,
                                 df_matrix=output,
                                 overwrite=True,
                                 directory=self.matrix_directory,
                                 format='csv')
            logging.info("Matrix %s archived (using metta)", matrix_uuid)
            # If completely archived, save its information to matrices table
            # At this point, existence of matrix already tested, so no need to delete from db
            if matrix_type == 'train':
                lookback = matrix_metadata["max_training_history"]
            else:
                lookback = matrix_metadata["test_duration"]

            matrix = Matrix(
                matrix_id=matrix_metadata["matrix_id"],
                matrix_uuid=matrix_uuid,
                matrix_type=matrix_type,
                labeling_window=matrix_metadata["label_timespan"],
                num_observations=len(output),
                lookback_duration=lookback,
                feature_start_time=matrix_metadata["feature_start_time"],
                matrix_metadata=json.dumps(matrix_metadata,
                                           sort_keys=True,
                                           default=str))
            session = self.sessionmaker()
            try:
                session.add(matrix)
                session.commit()
            finally:
                # release the session even if the insert/commit fails
                session.close()

        finally:
            # when the merge step returned a str (presumably a file path),
            # remove that file once archiving is done or has failed
            if isinstance(output, str):
                os.remove(output)
Beispiel #10
0
    def build_matrix(
        self,
        as_of_times,
        label_name,
        label_type,
        feature_dictionary,
        matrix_metadata,
        matrix_uuid,
        matrix_type,
    ):
        """ Write a design matrix to storage with the specified parameters.

        Returns early, without building anything, when the cohort or labels
        table has no data, when the matrix store already exists and
        ``self.replace`` is false, or when ``make_entity_date_table`` raises
        ``ValueError``. Otherwise label and feature data are loaded, merged,
        saved through the matrix store, and a ``Matrix`` row describing the
        result is merged into the database and committed.

        :param as_of_times: datetimes to be included in the matrix
        :param label_name: name of the label to be used
        :param label_type: the type of label to be used
        :param feature_dictionary: a dictionary of feature tables and features
                                   to be included in the matrix
        :param matrix_metadata: a dictionary of metadata about the matrix
        :param matrix_uuid: a unique id for the matrix
        :param matrix_type: the type (train/test) of matrix
        :type as_of_times: list
        :type label_name: str
        :type label_type: str
        :type feature_dictionary: dict
        :type matrix_metadata: dict
        :type matrix_uuid: str
        :type matrix_type: str

        :return: none
        :rtype: none
        """
        logging.info("popped matrix %s build off the queue", matrix_uuid)
        if not table_has_data(self.db_config["sparse_state_table_name"],
                              self.db_engine):
            logging.warning(
                "cohort table is not populated, cannot build matrix")
            return
        if not table_has_data(
                "{}.{}".format(
                    self.db_config["labels_schema_name"],
                    self.db_config["labels_table_name"],
                ),
                self.db_engine,
        ):
            logging.warning(
                "labels table is not populated, cannot build matrix")
            return

        matrix_store = self.matrix_storage_engine.get_store(matrix_uuid)
        if not self.replace and matrix_store.exists:
            logging.info("Skipping %s because matrix already exists",
                         matrix_uuid)
            return

        logging.info(
            "Creating matrix %s > %s",
            matrix_metadata["matrix_id"],
            matrix_store.matrix_base_store.path,
        )
        # make the entity time table and query the labels and features tables
        logging.info("Making entity date table for matrix %s", matrix_uuid)
        try:
            entity_date_table_name = self.make_entity_date_table(
                as_of_times,
                label_name,
                label_type,
                matrix_metadata["state"],
                matrix_type,
                matrix_uuid,
                matrix_metadata["label_timespan"],
            )
        except ValueError as e:
            # Pass the exception as the %s argument; without it the
            # placeholder is logged literally.
            logging.warning(
                "Not able to build entity-date table due to: %s - will not build matrix",
                e,
                exc_info=True,
            )
            return
        logging.info(
            "Extracting feature group data from database into file "
            "for matrix %s",
            matrix_uuid,
        )
        dataframes = self.load_features_data(as_of_times, feature_dictionary,
                                             entity_date_table_name,
                                             matrix_uuid)
        logging.info(f"Feature data extracted for matrix {matrix_uuid}")
        logging.info(
            "Extracting label data from database into file for "
            "matrix %s",
            matrix_uuid,
        )
        labels_df = self.load_labels_data(
            label_name,
            label_type,
            entity_date_table_name,
            matrix_uuid,
            matrix_metadata["label_timespan"],
        )
        # labels go first in the list handed to the merge step
        dataframes.insert(0, labels_df)

        logging.info(f"Label data extracted for matrix {matrix_uuid}")
        # stitch together the label and feature data
        logging.info("Merging feature files for matrix %s", matrix_uuid)
        output = self.merge_feature_csvs(dataframes, matrix_uuid)
        logging.info(f"Features data merged for matrix {matrix_uuid}")

        # store the matrix
        matrix_store.matrix = output
        matrix_store.metadata = matrix_metadata
        matrix_store.save()
        logging.info("Matrix %s saved", matrix_uuid)
        # If completely archived, save its information to matrices table
        # At this point, existence of matrix already tested, so no need to delete from db
        if matrix_type == "train":
            lookback = matrix_metadata["max_training_history"]
        else:
            lookback = matrix_metadata["test_duration"]

        matrix = Matrix(
            matrix_id=matrix_metadata["matrix_id"],
            matrix_uuid=matrix_uuid,
            matrix_type=matrix_type,
            labeling_window=matrix_metadata["label_timespan"],
            num_observations=len(output),
            lookback_duration=lookback,
            feature_start_time=matrix_metadata["feature_start_time"],
            matrix_metadata=json.dumps(matrix_metadata,
                                       sort_keys=True,
                                       default=str),
            built_by_experiment=self.experiment_hash)
        session = self.sessionmaker()
        try:
            session.merge(matrix)
            session.commit()
        finally:
            # release the session even if the merge/commit fails
            session.close()
Beispiel #11
0
    def build_matrix(
        self,
        as_of_times,
        label_name,
        label_type,
        feature_dictionary,
        matrix_metadata,
        matrix_uuid,
        matrix_type,
    ):
        """ Write a design matrix to storage with the specified parameters.

        Returns early, without building anything, when the cohort or labels
        table has no data, when the matrix store already exists and
        ``self.replace`` is false, or when ``make_entity_date_table`` raises
        ``ValueError``. Otherwise label and feature data are loaded, merged,
        split into features and labels, saved through the matrix store, and a
        ``Matrix`` row describing the result is merged into the database and
        committed.

        When ``self.run_id`` is set, the outcome is also reported through
        ``errored_matrix``, ``skipped_matrix`` or ``built_matrix``.

        :param as_of_times: datetimes to be included in the matrix
        :param label_name: name of the label to be used
        :param label_type: the type of label to be used
        :param feature_dictionary: a dictionary of feature tables and features
                                   to be included in the matrix
        :param matrix_metadata: a dictionary of metadata about the matrix
        :param matrix_uuid: a unique id for the matrix
        :param matrix_type: the type (train/test) of matrix
        :type as_of_times: list
        :type label_name: str
        :type label_type: str
        :type feature_dictionary: dict
        :type matrix_metadata: dict
        :type matrix_uuid: str
        :type matrix_type: str

        :return: none
        :rtype: none
        """
        logger.spam(f"popped matrix {matrix_uuid} build off the queue")
        if not table_has_data(self.db_config["cohort_table_name"],
                              self.db_engine):
            logger.warning(
                "cohort table is not populated, cannot build matrix")
            if self.run_id:
                errored_matrix(self.run_id, self.db_engine)
            return
        if not table_has_data(
                f"{self.db_config['labels_schema_name']}.{self.db_config['labels_table_name']}",
                self.db_engine,
        ):
            logger.warning(
                "labels table is not populated, cannot build matrix")
            if self.run_id:
                errored_matrix(self.run_id, self.db_engine)
            return

        matrix_store = self.matrix_storage_engine.get_store(matrix_uuid)
        if not self.replace and matrix_store.exists:
            logger.notice(
                f"Skipping {matrix_uuid} because matrix already exists")
            if self.run_id:
                skipped_matrix(self.run_id, self.db_engine)
            return

        logger.debug(
            f'Storing matrix {matrix_metadata["matrix_id"]} in {matrix_store.matrix_base_store.path}'
        )
        # make the entity time table and query the labels and features tables
        logger.debug(f"Making entity date table for matrix {matrix_uuid}")
        try:
            entity_date_table_name = self.make_entity_date_table(
                as_of_times,
                label_name,
                label_type,
                matrix_metadata["state"],
                matrix_type,
                matrix_uuid,
                matrix_metadata["label_timespan"],
            )
        except ValueError:
            # logger.exception records the active exception and traceback
            logger.exception(
                "Not able to build entity-date table,  will not build matrix",
            )
            if self.run_id:
                errored_matrix(self.run_id, self.db_engine)
            return
        logger.spam(
            f"Extracting feature group data from database into file  for matrix {matrix_uuid}"
        )
        dataframes = self.load_features_data(as_of_times, feature_dictionary,
                                             entity_date_table_name,
                                             matrix_uuid)
        logger.debug(f"Feature data extracted for matrix {matrix_uuid}")
        # f-string prefix is required here, otherwise the uuid placeholder
        # is logged literally
        logger.spam(
            f"Extracting label data from database into file for matrix {matrix_uuid}",
        )
        labels_df = self.load_labels_data(
            label_name,
            label_type,
            entity_date_table_name,
            matrix_uuid,
            matrix_metadata["label_timespan"],
        )
        # labels go first in the list handed to the merge step
        dataframes.insert(0, labels_df)

        logger.debug(f"Label data extracted for matrix {matrix_uuid}")
        # stitch together the label and feature data
        logger.spam(f"Merging feature files for matrix {matrix_uuid}")
        output = self.merge_feature_csvs(dataframes, matrix_uuid)
        logger.debug(f"Features data merged for matrix {matrix_uuid}")

        matrix_store.metadata = matrix_metadata
        # store the matrix; pop() removes the label column from `output`,
        # so `output` holds only the remaining columns from here on
        labels = output.pop(matrix_store.label_column_name)
        matrix_store.matrix_label_tuple = output, labels
        matrix_store.save()
        logger.info(
            f"Matrix {matrix_uuid} saved in {matrix_store.matrix_base_store.path}"
        )
        # If completely archived, save its information to matrices table
        # At this point, existence of matrix already tested, so no need to delete from db
        if matrix_type == "train":
            lookback = matrix_metadata["max_training_history"]
        else:
            lookback = matrix_metadata["test_duration"]

        matrix = Matrix(
            matrix_id=matrix_metadata["matrix_id"],
            matrix_uuid=matrix_uuid,
            matrix_type=matrix_type,
            labeling_window=matrix_metadata["label_timespan"],
            num_observations=len(output),
            lookback_duration=lookback,
            feature_start_time=matrix_metadata["feature_start_time"],
            feature_dictionary=feature_dictionary,
            matrix_metadata=matrix_metadata,
            built_by_experiment=self.experiment_hash)
        session = self.sessionmaker()
        try:
            session.merge(matrix)
            session.commit()
        finally:
            # release the session even if the merge/commit fails
            session.close()
        if self.run_id:
            built_matrix(self.run_id, self.db_engine)