Exemple #1
0
    def matrix_build_tasks(self):
        """Plan the matrix builds required by this Experiment.

        The cohort (sparse states) table and the labels table are checked
        first; if either has no data, a warning is logged and an empty dict
        is returned. Otherwise the planner generates plans from
        ``self.split_definitions`` and ``self.feature_dicts``, and the
        updated split definitions it returns are stored on
        ``self.full_matrix_definitions``.

        Each task contains arguments understood by
        ``Architect.build_matrix``.

        Returns: the build tasks produced by
            ``self.planner.generate_plans``, or ``{}`` when a required
            table is empty

        """
        # (table to check, warning to log when that table is empty)
        required_tables = (
            (self.sparse_states_table_name,
             'cohort table is not populated, cannot build any matrices'),
            (self.labels_table_name,
             'labels table is not populated, cannot build any matrices'),
        )
        for table_name, warning in required_tables:
            if not table_has_data(table_name, self.db_engine):
                logging.warning(warning)
                return {}

        plans = self.planner.generate_plans(self.split_definitions,
                                            self.feature_dicts)
        split_definitions, build_tasks = plans
        self.full_matrix_definitions = split_definitions
        return build_tasks
Exemple #2
0
def test_predictor_save_predictions(matrix_type, predict_setup_args):
    """Check that ``save_predictions=False`` suppresses database writes.

    ``predict`` should still return predict_proba, but the predictions
    table for ``matrix_type`` should stay empty.
    """
    project_storage, db_engine, model_id = predict_setup_args

    # build a predictor that is told not to persist its predictions
    model_storage_engine = project_storage.model_storage_engine()
    predictor = Predictor(
        model_storage_engine,
        db_engine,
        rank_order='worst',
        save_predictions=False,
    )

    matrix_store = get_matrix_store(project_storage)
    columns = matrix_store.columns()

    predict_proba = predictor.predict(
        model_id,
        matrix_store,
        misc_db_parameters={},
        train_matrix_columns=columns,
    )

    # the returned predictions are of the desired length ...
    assert len(predict_proba) == 2

    # ... and nothing was written to the predictions table
    assert not table_has_data(f"{matrix_type}_predictions", db_engine)
def test_predictor_save_predictions(matrix_type, predict_setup_args):
    """Check that ``save_predictions=False`` suppresses database writes.

    ``predict`` should still return predict_proba, but the predictions
    table for ``matrix_type`` should stay empty.
    """
    project_storage, db_engine, model_id = predict_setup_args

    # build a predictor that is told not to persist its predictions
    model_storage_engine = project_storage.model_storage_engine()
    predictor = Predictor(
        model_storage_engine,
        db_engine,
        save_predictions=False,
    )

    # matrix indexed by entity_id, with metadata for this matrix type
    matrix = matrix_creator(index="entity_id")
    metadata = matrix_metadata_creator(
        end_time=AS_OF_DATE,
        matrix_type=matrix_type,
        indices=["entity_id"],
    )
    matrix_store = get_matrix_store(project_storage, matrix, metadata)

    # every column except the last one is passed as a training column
    columns = matrix.columns[:-1].tolist()

    predict_proba = predictor.predict(
        model_id,
        matrix_store,
        misc_db_parameters={},
        train_matrix_columns=columns,
    )

    # the returned predictions are of the desired length ...
    assert len(predict_proba) == 2

    # ... and nothing was written to the predictions table
    assert not table_has_data(f"{matrix_type}_predictions", db_engine)
Exemple #4
0
def test_populate_predictions_table(finished_experiment_without_predictions):
    """add_predictions should leave data in test_results.predictions"""
    experiment = finished_experiment_without_predictions
    db_engine = experiment.db_engine

    add_predictions(
        db_engine=db_engine,
        model_groups=[1],
        project_path=experiment.project_storage.project_path,
    )

    assert table_has_data('test_results.predictions', db_engine)
Exemple #5
0
 def _all_valid_entity_dates_query(self, state, as_of_time_strings):
     """Build the SQL selecting cohort rows for a state and set of dates.

     Args:
         state: SQL condition interpolated into the WHERE clause
         as_of_time_strings: value interpolated after ``ARRAY`` and cast to
             ``timestamp[]``; only rows whose as_of_date is in it are
             selected

     Returns: (string) the query text, ordered by entity_id and as_of_date

     Raises: ValueError if the cohort table has no data
     """
     cohort_table_has_rows = table_has_data(
         self.db_config["cohort_table_name"], self.db_engine
     )
     if not cohort_table_has_rows:
         raise ValueError("Required cohort table does not exist")
     return f"""
         SELECT entity_id, as_of_date
         FROM {self.db_config["cohort_table_name"]}
         WHERE {state}
         AND as_of_date IN (SELECT (UNNEST (ARRAY{as_of_time_strings}::timestamp[])))
         ORDER BY entity_id, as_of_date
     """
Exemple #6
0
def table_should_have_data(table_name, db_engine):
    """Require that a table exists and holds at least one row

    Existence is checked first via ``table_should_exist``; only then is
    the table checked for data.

    Args:
        table_name (string) A table name (with schema)
        db_engine (sqlalchemy.engine)

    Raises: ValueError if the table does not have at least one row
    """
    table_should_exist(table_name, db_engine)
    populated = table_has_data(table_name, db_engine)
    if populated:
        return
    message = "{} table does not have any data".format(table_name)
    raise ValueError(message)
Exemple #7
0
 def _all_valid_entity_dates_query(self, state, as_of_time_strings):
     """Build the SQL selecting sparse-state rows for a state and dates.

     Args:
         state: SQL condition interpolated into the WHERE clause
         as_of_time_strings: value interpolated after ``ARRAY`` and cast to
             ``timestamp[]``; only rows whose as_of_date is in it are
             selected

     Returns: (string) the query text, ordered by entity_id and as_of_date

     Raises: ValueError if the sparse state table has no data
     """
     states_table_name = self.db_config['sparse_state_table_name']
     if not table_has_data(states_table_name, self.db_engine):
         raise ValueError('Required sparse state table does not exist')

     template = """
         SELECT entity_id, as_of_date
         FROM {states_table}
         WHERE {state_string}
         AND as_of_date IN (SELECT (UNNEST (ARRAY{times}::timestamp[])))
         ORDER BY entity_id, as_of_date
     """
     return template.format(
         states_table=states_table_name,
         state_string=state,
         times=as_of_time_strings,
     )
Exemple #8
0
    def generate_entity_date_table(self, as_of_dates):
        """Build and validate the entity-date table for the given dates

        The table is populated from ``self.query`` when one is set, and
        otherwise from the labels table when ``self.labels_table_name`` is
        set. After population the table is checked for data and for
        duplicate (entity_id, as_of_date) pairs.

        Args:
            as_of_dates (list of datetime.dates) Dates to include in the
                state table

        Raises: ValueError if neither a query nor a labels table name is
            available, if the populated table has no data, or if it holds
            duplicate (entity_id, as_of_date) pairs
        """
        logger.spam(
            f"Generating entity_date table {self.entity_date_table_name}")

        # guard: without a query or a labels table there is nothing to build from
        if not self.query and not self.labels_table_name:
            raise ValueError(
                "Neither query not labels table name is available, cannot compute cohort"
            )

        if self.query:
            logger.spam(
                f"Query is present, so running query on as_of_dates: {as_of_dates}"
            )
            self._create_and_populate_entity_date_table_from_query(as_of_dates)
        else:
            self._create_and_populate_entity_date_table_from_labels()
        logger.spam(
            f"Table {self.entity_date_table_name} created and populated")

        # validation: the table must be non-empty ...
        is_populated = table_has_data(self.entity_date_table_name,
                                      self.db_engine)
        if not is_populated:
            raise ValueError(self._empty_table_message(as_of_dates))

        # ... and must have one row per (entity_id, as_of_date)
        has_duplicates = table_has_duplicates(
            self.entity_date_table_name,
            ['entity_id', 'as_of_date'],
            self.db_engine,
        )
        if has_duplicates:
            raise ValueError(
                f"Duplicates found in {self.entity_date_table_name}!")

        logger.debug(
            f"Entity-date table generated at {self.entity_date_table_name}")
        logger.spam(f"Generating stats on {self.entity_date_table_name}")
        logger.spam(
            f"Row count of {self.entity_date_table_name}: {table_row_count(self.entity_date_table_name, self.db_engine)}"
        )
Exemple #9
0
    def build_matrix(self, as_of_times, label_name, label_type,
                     feature_dictionary, matrix_directory, matrix_metadata,
                     matrix_uuid, matrix_type):
        """ Write a design matrix to disk with the specified parameters.

        The build is abandoned early (after logging) when the cohort or
        labels table has no data, when the matrix file already exists and
        ``self.replace`` is not set, or when the entity-date table cannot
        be built. Otherwise feature and label data are written to
        intermediate csv files, merged, archived with metta, and a
        ``Matrix`` row describing the result is committed to the database.

        :param as_of_times: datetimes to be included in the matrix
        :param label_name: name of the label to be used
        :param label_type: the type of label to be used
        :param feature_dictionary: a dictionary of feature tables and features
                                   to be included in the matrix
        :param matrix_directory: the directory in which to store the matrix
        :param matrix_metadata: a dictionary of metadata about the matrix
        :param matrix_uuid: a unique id for the matrix
        :param matrix_type: the type (train/test) of matrix
        :type as_of_times: list
        :type label_name: str
        :type label_type: str
        :type feature_dictionary: dict
        :type matrix_directory: str
        :type matrix_metadata: dict
        :type matrix_uuid: str
        :type matrix_type: str

        :return: none
        :rtype: none
        :raises ValueError: if the matrix path uses a URL scheme other than
                            '', 'file' or 's3'
        """
        logging.info('popped matrix %s build off the queue', matrix_uuid)
        if not table_has_data(self.db_config['sparse_state_table_name'],
                              self.db_engine):
            logging.warning(
                'cohort table is not populated, cannot build matrix')
            return
        if not table_has_data(
                "{}.{}".format(self.db_config['labels_schema_name'],
                               self.db_config['labels_table_name']),
                self.db_engine):
            logging.warning(
                'labels table is not populated, cannot build matrix')
            return

        matrix_filename = os.path.join(matrix_directory,
                                       '{}.csv'.format(matrix_uuid))

        # The output directory is local or in s3
        path_parsed = urlparse(matrix_filename)
        # '' or 'file' means a regular local file; 's3' means an S3 object
        scheme = path_parsed.scheme

        if scheme in ('', 'file'):
            if not self.replace and os.path.exists(matrix_filename):
                logging.info('Skipping %s because matrix already exists',
                             matrix_filename)
                return
        elif scheme == 's3':
            if not self.replace and s3fs.S3FileSystem().exists(
                    matrix_filename):
                logging.info('Skipping %s because matrix already exists',
                             matrix_filename)
                return
        else:
            raise ValueError(f"""URL scheme not supported:
              {scheme} (from {matrix_filename})
            """)

        logging.info('Creating matrix %s > %s', matrix_metadata['matrix_id'],
                     matrix_filename)
        # make the entity time table and query the labels and features tables
        logging.info('Making entity date table for matrix %s', matrix_uuid)
        try:
            entity_date_table_name = self.make_entity_date_table(
                as_of_times, label_name, label_type, matrix_metadata['state'],
                matrix_type, matrix_uuid, matrix_metadata['label_timespan'])
        except ValueError as e:
            # the exception is passed as the %s argument so the message
            # names the cause; exc_info adds the traceback
            logging.warning(
                'Not able to build entity-date table due to: %s - will not build matrix',
                e,
                exc_info=True)
            return
        logging.info(
            'Extracting feature group data from database into file '
            'for matrix %s', matrix_uuid)
        features_csv_names = self.write_features_data(as_of_times,
                                                      feature_dictionary,
                                                      entity_date_table_name,
                                                      matrix_uuid)
        logging.info(f"Feature data extracted for matrix {matrix_uuid}")
        try:
            logging.info(
                'Extracting label data from database into file for '
                'matrix %s', matrix_uuid)
            labels_csv_name = self.write_labels_data(
                label_name, label_type, entity_date_table_name, matrix_uuid,
                matrix_metadata['label_timespan'])
            # labels go first so they are cleaned up with the feature csvs
            features_csv_names.insert(0, labels_csv_name)

            logging.info(f"Label data extracted for matrix {matrix_uuid}")
            # stitch together the csvs
            logging.info('Merging feature files for matrix %s', matrix_uuid)
            output = self.merge_feature_csvs(features_csv_names,
                                             matrix_directory, matrix_uuid)
            logging.info(f"Features data merged for matrix {matrix_uuid}")
        finally:
            # clean up the intermediate csv files whether or not merging worked
            for csv_name in features_csv_names:
                self.remove_file(csv_name)
        try:
            # store the matrix
            logging.info('Archiving matrix %s with metta', matrix_uuid)
            metta.archive_matrix(matrix_config=matrix_metadata,
                                 df_matrix=output,
                                 overwrite=True,
                                 directory=self.matrix_directory,
                                 format='csv')
            logging.info(f"Matrix {matrix_uuid} archived (using metta)")
            # If completely archived, save its information to matrices table
            # At this point, existence of matrix already tested, so no need to delete from db
            if matrix_type == 'train':
                lookback = matrix_metadata["max_training_history"]
            else:
                lookback = matrix_metadata["test_duration"]

            matrix = Matrix(
                matrix_id=matrix_metadata["matrix_id"],
                matrix_uuid=matrix_uuid,
                matrix_type=matrix_type,
                labeling_window=matrix_metadata["label_timespan"],
                num_observations=len(output),
                lookback_duration=lookback,
                feature_start_time=matrix_metadata["feature_start_time"],
                matrix_metadata=json.dumps(matrix_metadata,
                                           sort_keys=True,
                                           default=str))
            session = self.sessionmaker()
            try:
                session.add(matrix)
                session.commit()
            finally:
                # release the session even if the insert or commit fails
                session.close()

        finally:
            # when the merge step returned a path, remove that file
            if isinstance(output, str):
                os.remove(output)
Exemple #10
0
    def build_matrix(
        self,
        as_of_times,
        label_name,
        label_type,
        feature_dictionary,
        matrix_metadata,
        matrix_uuid,
        matrix_type,
    ):
        """ Write a design matrix to disk with the specified parameters.

        The build is abandoned early (after logging) when the cohort or
        labels table has no data, when the matrix store already exists and
        ``self.replace`` is not set, or when the entity-date table cannot
        be built. Otherwise feature and label data are loaded, merged,
        saved through the matrix store, and a ``Matrix`` row describing the
        result is merged into the database.

        :param as_of_times: datetimes to be included in the matrix
        :param label_name: name of the label to be used
        :param label_type: the type of label to be used
        :param feature_dictionary: a dictionary of feature tables and features
                                   to be included in the matrix
        :param matrix_metadata: a dictionary of metadata about the matrix
        :param matrix_uuid: a unique id for the matrix
        :param matrix_type: the type (train/test) of matrix
        :type as_of_times: list
        :type label_name: str
        :type label_type: str
        :type feature_dictionary: dict
        :type matrix_metadata: dict
        :type matrix_uuid: str
        :type matrix_type: str

        :return: none
        :rtype: none
        """
        logging.info("popped matrix %s build off the queue", matrix_uuid)
        if not table_has_data(self.db_config["sparse_state_table_name"],
                              self.db_engine):
            logging.warning(
                "cohort table is not populated, cannot build matrix")
            return
        if not table_has_data(
                "{}.{}".format(
                    self.db_config["labels_schema_name"],
                    self.db_config["labels_table_name"],
                ),
                self.db_engine,
        ):
            logging.warning(
                "labels table is not populated, cannot build matrix")
            return

        matrix_store = self.matrix_storage_engine.get_store(matrix_uuid)
        if not self.replace and matrix_store.exists:
            logging.info("Skipping %s because matrix already exists",
                         matrix_uuid)
            return

        logging.info(
            "Creating matrix %s > %s",
            matrix_metadata["matrix_id"],
            matrix_store.matrix_base_store.path,
        )
        # make the entity time table and query the labels and features tables
        logging.info("Making entity date table for matrix %s", matrix_uuid)
        try:
            entity_date_table_name = self.make_entity_date_table(
                as_of_times,
                label_name,
                label_type,
                matrix_metadata["state"],
                matrix_type,
                matrix_uuid,
                matrix_metadata["label_timespan"],
            )
        except ValueError as e:
            # the exception is passed as the %s argument so the message
            # names the cause; exc_info adds the traceback
            logging.warning(
                "Not able to build entity-date table due to: %s - will not build matrix",
                e,
                exc_info=True,
            )
            return
        logging.info(
            "Extracting feature group data from database into file "
            "for matrix %s",
            matrix_uuid,
        )
        dataframes = self.load_features_data(as_of_times, feature_dictionary,
                                             entity_date_table_name,
                                             matrix_uuid)
        logging.info(f"Feature data extracted for matrix {matrix_uuid}")
        logging.info(
            "Extracting label data from database into file for "
            "matrix %s",
            matrix_uuid,
        )
        labels_df = self.load_labels_data(
            label_name,
            label_type,
            entity_date_table_name,
            matrix_uuid,
            matrix_metadata["label_timespan"],
        )
        # labels are placed first in the list handed to the merge step
        dataframes.insert(0, labels_df)

        logging.info(f"Label data extracted for matrix {matrix_uuid}")
        # stitch together the loaded feature and label data
        logging.info("Merging feature files for matrix %s", matrix_uuid)
        output = self.merge_feature_csvs(dataframes, matrix_uuid)
        logging.info(f"Features data merged for matrix {matrix_uuid}")

        # store the matrix
        matrix_store.matrix = output
        matrix_store.metadata = matrix_metadata
        matrix_store.save()
        logging.info(f"Matrix {matrix_uuid} saved")
        # If completely archived, save its information to matrices table
        # At this point, existence of matrix already tested, so no need to delete from db
        if matrix_type == "train":
            lookback = matrix_metadata["max_training_history"]
        else:
            lookback = matrix_metadata["test_duration"]

        matrix = Matrix(
            matrix_id=matrix_metadata["matrix_id"],
            matrix_uuid=matrix_uuid,
            matrix_type=matrix_type,
            labeling_window=matrix_metadata["label_timespan"],
            num_observations=len(output),
            lookback_duration=lookback,
            feature_start_time=matrix_metadata["feature_start_time"],
            matrix_metadata=json.dumps(matrix_metadata,
                                       sort_keys=True,
                                       default=str),
            built_by_experiment=self.experiment_hash)
        session = self.sessionmaker()
        try:
            session.merge(matrix)
            session.commit()
        finally:
            # release the session even if the merge or commit fails
            session.close()
Exemple #11
0
    def build_matrix(
        self,
        as_of_times,
        label_name,
        label_type,
        feature_dictionary,
        matrix_metadata,
        matrix_uuid,
        matrix_type,
    ):
        """ Write a design matrix to disk with the specified parameters.

        The build is abandoned early (after logging) when the cohort or
        labels table has no data, when the matrix store already exists and
        ``self.replace`` is not set, or when the entity-date table cannot
        be built. Otherwise feature and label data are loaded, merged,
        saved through the matrix store, and a ``Matrix`` row describing the
        result is merged into the database. When ``self.run_id`` is set,
        each outcome (errored, skipped, built) is also reported through the
        matching ``errored_matrix`` / ``skipped_matrix`` / ``built_matrix``
        call.

        :param as_of_times: datetimes to be included in the matrix
        :param label_name: name of the label to be used
        :param label_type: the type of label to be used
        :param feature_dictionary: a dictionary of feature tables and features
                                   to be included in the matrix
        :param matrix_metadata: a dictionary of metadata about the matrix
        :param matrix_uuid: a unique id for the matrix
        :param matrix_type: the type (train/test) of matrix
        :type as_of_times: list
        :type label_name: str
        :type label_type: str
        :type feature_dictionary: dict
        :type matrix_metadata: dict
        :type matrix_uuid: str
        :type matrix_type: str

        :return: none
        :rtype: none
        """
        logger.spam(f"popped matrix {matrix_uuid} build off the queue")
        if not table_has_data(self.db_config["cohort_table_name"],
                              self.db_engine):
            logger.warning(
                "cohort table is not populated, cannot build matrix")
            if self.run_id:
                errored_matrix(self.run_id, self.db_engine)
            return
        if not table_has_data(
                f"{self.db_config['labels_schema_name']}.{self.db_config['labels_table_name']}",
                self.db_engine,
        ):
            logger.warning(
                "labels table is not populated, cannot build matrix")
            if self.run_id:
                errored_matrix(self.run_id, self.db_engine)
            return

        matrix_store = self.matrix_storage_engine.get_store(matrix_uuid)
        if not self.replace and matrix_store.exists:
            logger.notice(
                f"Skipping {matrix_uuid} because matrix already exists")
            if self.run_id:
                skipped_matrix(self.run_id, self.db_engine)
            return

        logger.debug(
            f'Storing matrix {matrix_metadata["matrix_id"]} in {matrix_store.matrix_base_store.path}'
        )
        # make the entity time table and query the labels and features tables
        logger.debug(f"Making entity date table for matrix {matrix_uuid}")
        try:
            entity_date_table_name = self.make_entity_date_table(
                as_of_times,
                label_name,
                label_type,
                matrix_metadata["state"],
                matrix_type,
                matrix_uuid,
                matrix_metadata["label_timespan"],
            )
        except ValueError:
            # logger.exception records the active exception and traceback
            logger.exception(
                "Not able to build entity-date table,  will not build matrix",
            )
            if self.run_id:
                errored_matrix(self.run_id, self.db_engine)
            return
        logger.spam(
            f"Extracting feature group data from database into file  for matrix {matrix_uuid}"
        )
        dataframes = self.load_features_data(as_of_times, feature_dictionary,
                                             entity_date_table_name,
                                             matrix_uuid)
        logger.debug(f"Feature data extracted for matrix {matrix_uuid}")
        logger.spam(
            f"Extracting label data from database into file for matrix {matrix_uuid}",
        )
        labels_df = self.load_labels_data(
            label_name,
            label_type,
            entity_date_table_name,
            matrix_uuid,
            matrix_metadata["label_timespan"],
        )
        # labels are placed first in the list handed to the merge step
        dataframes.insert(0, labels_df)

        logger.debug(f"Label data extracted for matrix {matrix_uuid}")
        # stitch together the loaded feature and label data
        logger.spam(f"Merging feature files for matrix {matrix_uuid}")
        output = self.merge_feature_csvs(dataframes, matrix_uuid)
        logger.debug(f"Features data merged for matrix {matrix_uuid}")

        matrix_store.metadata = matrix_metadata
        # store the matrix: the label column is popped out of the merged
        # output and handed to the store alongside the remaining columns
        labels = output.pop(matrix_store.label_column_name)
        matrix_store.matrix_label_tuple = output, labels
        matrix_store.save()
        logger.info(
            f"Matrix {matrix_uuid} saved in {matrix_store.matrix_base_store.path}"
        )
        # If completely archived, save its information to matrices table
        # At this point, existence of matrix already tested, so no need to delete from db
        if matrix_type == "train":
            lookback = matrix_metadata["max_training_history"]
        else:
            lookback = matrix_metadata["test_duration"]

        matrix = Matrix(
            matrix_id=matrix_metadata["matrix_id"],
            matrix_uuid=matrix_uuid,
            matrix_type=matrix_type,
            labeling_window=matrix_metadata["label_timespan"],
            num_observations=len(output),
            lookback_duration=lookback,
            feature_start_time=matrix_metadata["feature_start_time"],
            feature_dictionary=feature_dictionary,
            matrix_metadata=matrix_metadata,
            built_by_experiment=self.experiment_hash)
        session = self.sessionmaker()
        try:
            session.merge(matrix)
            session.commit()
        finally:
            # release the session even if the merge or commit fails
            session.close()
        if self.run_id:
            built_matrix(self.run_id, self.db_engine)
 def test_table_has_data(self):
     """table_has_data is true only for the table that received a row"""
     # two tables are created; only "compliments" gets a row
     setup_statements = (
         "create table incidents (col1 varchar)",
         "create table compliments (col1 varchar)",
         "insert into compliments values ('good job')",
     )
     for statement in setup_statements:
         self.engine.execute(statement)

     assert dbreflect.table_has_data("compliments", self.engine)
     assert not dbreflect.table_has_data("incidents", self.engine)
Exemple #13
0
def test_run_crosstabs(finished_experiment, crosstabs_config):
    """run_crosstabs should leave data in the configured output table"""
    run_crosstabs(finished_experiment.db_engine, crosstabs_config)
    expected_table_name = (
        crosstabs_config.output["schema"]
        + "."
        + crosstabs_config.output["table"]
    )
    # the check must be asserted, otherwise an empty output table
    # would go unnoticed and the test could never fail on it
    assert table_has_data(expected_table_name, finished_experiment.db_engine)