Example #1
0
def feature_list(feature_dictionary):
    """Convert a feature dictionary to a sorted list

    Every value of the dictionary is flattened into one collection of
    feature names, which is then sorted. Duplicate names are kept.

    Args:
        feature_dictionary (dict): maps each key to an iterable of feature
            names. An empty (or None) dictionary yields an empty result.

    Returns: FeatureNameList of all feature names, sorted
    """
    if not feature_dictionary:
        return FeatureNameList()
    # Flatten in a single pass. Folding the values together with
    # operator.concat builds a fresh intermediate list for every key, which is
    # quadratic in the total number of feature names.
    return FeatureNameList(sorted(
        feature_name
        for feature_names in feature_dictionary.values()
        for feature_name in feature_names
    ))
Example #2
0
def matrix_metadata_creator(**override_kwargs):
    """Build a sample matrix metadata dictionary, applying any overrides

    Args:
    **override_kwargs: Keys and values that replace, or are added to, the
        default metadata entries

    Returns: (dict) the default metadata with the overrides applied
    """
    last_date = datetime.date(2016, 12, 20)
    metadata = {
        "feature_start_time": datetime.date(2012, 12, 20),
        "end_time": last_date,
        "label_name": "label",
        "as_of_date_frequency": "1w",
        "max_training_history": "5y",
        "matrix_id": "tester-1",
        "state": "active",
        "cohort_name": "default",
        "label_timespan": "1y",
        "metta-uuid": "1234",
        "matrix_type": "test",
        "feature_names": FeatureNameList(["ft1", "ft2"]),
        "feature_groups": ["all: True"],
        "indices": MatrixStore.indices,
        "as_of_times": [last_date],
    }
    # Caller-supplied values win over the defaults; unknown keys are appended.
    metadata.update(override_kwargs)
    return metadata
Example #3
0
    def subsets(self, feature_dictionary):
        """Generate subsets of a feature dict

        For every grouping method in ``self.definition`` (visited in sorted
        order) and every item configured for that method, one FeatureGroup
        named "<method>: <item>" is built. Each table's features are passed to
        the subsetter registered for the method in ``self.subsetters``; a table
        is stored in the group only when at least one feature matched.

        Args:
            feature_dictionary (dict) tables and the features contained in each

            The feature dictionary is meant to be keyed on source table. Example:

            {
                'feature_table_one': ['feature_one', 'feature_two'],
                'feature_table_two': ['feature_three', 'feature_four'],
            }

        Returns: (list) subsets of the feature dictionary, in the same
            table-based structure

        Raises:
            ValueError: if none of the generated subsets contains any matches
        """
        logging.info(
            "Creating feature groups. config: %s, Master feature dictionary: %s",
            self.definition,
            feature_dictionary,
        )
        subsets = []
        for name, config in sorted(self.definition.items()):
            logging.info("Parsing config grouping method %s, items %s", name,
                         config)
            for config_item in config:
                subset = FeatureGroup(name="{}: {}".format(name, config_item))
                logging.info("Finding columns that might belong in %s", subset)
                for table, features in feature_dictionary.items():
                    logging.info(
                        "Searching features in table %s that match group %s",
                        table, subset)
                    matching_features = self.subsetters[name](config_item,
                                                              table, features)
                    logging.info(
                        "Found %s matching features in table %s that match group %s",
                        len(matching_features),
                        table,
                        subset,
                    )
                    if len(matching_features) > 0:
                        subset[table] = FeatureNameList(matching_features)

                # A group is recorded even when no table matched it; the check
                # below only fails when every group came up empty.
                subsets.append(subset)
        if not any(subset for subset in subsets if any(subset)):
            # The two fragments are joined by implicit string concatenation so
            # the exception carries one message rather than a tuple of two
            # arguments.
            raise ValueError(
                f"Problem! The feature group definition {self.definition} did not find any matches "
                f"in feature dictionary {feature_dictionary}")
        logging.info("Found %s total feature subsets", len(subsets))
        return subsets
Example #4
0
def sample_metadata():
    """Return a fixed sample matrix metadata dictionary

    Returns: (dict) a freshly built metadata dictionary on every call
    """
    start_date = datetime.date(2012, 12, 20)
    end_date = datetime.date(2016, 12, 20)
    feature_names = FeatureNameList(["ft1", "ft2"])
    metadata = {
        "feature_start_time": start_date,
        "end_time": end_date,
        "label_name": "label",
        "as_of_date_frequency": "1w",
        "max_training_history": "5y",
        "state": "default",
        "cohort_name": "default",
        "label_timespan": "1y",
        "metta-uuid": "1234",
        "feature_names": feature_names,
        "feature_groups": ["all: True"],
        "indices": ["entity_id"],
    }
    return metadata
Example #5
0
    def subsets(self, feature_dictionary):
        """Generate subsets of a feature dict

        For every grouping method in ``self.definition`` (visited in sorted
        order) and every item configured for that method, one FeatureGroup
        named "<method>: <item>" is built. Each table's features are passed to
        the subsetter registered for the method in ``self.subsetters``; a table
        is stored in the group only when at least one feature matched.

        Args:
            feature_dictionary (dict) tables and the features contained in each

            The feature dictionary is meant to be keyed on source table. Example:

            {
                'feature_table_one': ['feature_one', 'feature_two'],
                'feature_table_two': ['feature_three', 'feature_four'],
            }

        Returns: (list) subsets of the feature dictionary, in the same
            table-based structure

        Raises:
            ValueError: if none of the generated subsets contains any matches
        """
        logger.spam(
            f"Creating feature groups, using: {self.definition}, Master feature dictionary: {feature_dictionary}",
        )
        subsets = []
        for name, config in sorted(self.definition.items()):
            logger.spam(
                f"Parsing config grouping method {name}, items {config}")
            for config_item in config:
                subset = FeatureGroup(name=f"{name}: {config_item}")
                logger.spam(
                    f"Finding columns that might belong in [{name}: {config_item}]"
                )
                for table, features in feature_dictionary.items():
                    logger.spam(
                        f"Searching features in table {table} that match group {subset}"
                    )
                    matching_features = self.subsetters[name](config_item,
                                                              table, features)
                    logger.debug(
                        f"Found {len(matching_features)} matching features in table {table} that match group [{name}: {config_item}]",
                    )
                    if len(matching_features) > 0:
                        subset[table] = FeatureNameList(matching_features)

                # A group is recorded even when no table matched it; the check
                # below only fails when every group came up empty.
                subsets.append(subset)

        if not any(subset for subset in subsets if any(subset)):
            # The two fragments are joined by implicit string concatenation so
            # the exception carries one message rather than a tuple of two
            # arguments.
            raise ValueError(
                f"Problem! The feature group definition {self.definition} did not find any matches "
                f"in feature dictionary {feature_dictionary}")
        logger.verbose(f"Found {len(subsets)} total feature subsets")
        return subsets
Example #6
0
    def feature_dictionary(self, feature_table_names, index_column_lookup):
        """ Map each included feature table to the feature names it holds.

        The tables to visit come from ``self._tables_to_include``. For each
        one, a query is built from the table name and its entry in
        ``index_column_lookup``, run on ``self.db_engine``, and the first
        column of every returned row is collected.

        :return: feature_dictionary, keyed on table name with a
            FeatureNameList of names as each value
        :rtype: dict
        """
        names_by_table = {}

        for table_name in self._tables_to_include(feature_table_names):
            names_query = self._build_feature_names_query(
                table_name, index_column_lookup[table_name])
            result_rows = self.db_engine.execute(names_query)
            # Only the first column of each row is used as the feature name.
            names_by_table[table_name] = FeatureNameList(
                [result_row[0] for result_row in result_rows])

        logger.spam(f"Feature dictionary built: {names_by_table}")
        return names_by_table