def feature_list(feature_dictionary):
    """Convert a feature dictionary to a sorted list

    Args:
        feature_dictionary (dict) iterables of feature names; only the
            values are used, the keys are ignored

    Returns: (FeatureNameList) every feature name found in any value of the
        dictionary, in sorted order; an empty FeatureNameList when the
        dictionary is empty (or otherwise falsy, e.g. None)
    """
    if not feature_dictionary:
        return FeatureNameList()

    # Flatten all values in one pass. Repeatedly concatenating lists (as a
    # reduce over operator.concat does) copies the accumulated result at every
    # step, which is quadratic in the total number of features.
    return FeatureNameList(
        sorted(
            feature_name
            for feature_names in feature_dictionary.values()
            for feature_name in feature_names
        )
    )
def matrix_metadata_creator(**override_kwargs):
    """Build a sample valid matrix metadata dict, applying optional overrides

    Args:
        **override_kwargs: entries that replace the default with the same key,
            or are added after the defaults when the key is new

    Returns: (dict) the default sample metadata with the overrides applied
    """
    first_feature_date = datetime.date(2012, 12, 20)
    last_date = datetime.date(2016, 12, 20)

    defaults = {
        "feature_start_time": first_feature_date,
        "end_time": last_date,
        "label_name": "label",
        "as_of_date_frequency": "1w",
        "max_training_history": "5y",
        "matrix_id": "tester-1",
        "state": "active",
        "cohort_name": "default",
        "label_timespan": "1y",
        "metta-uuid": "1234",
        "matrix_type": "test",
        "feature_names": FeatureNameList(["ft1", "ft2"]),
        "feature_groups": ["all: True"],
        "indices": MatrixStore.indices,
        "as_of_times": [last_date],
    }

    # Later entries win, so any override replaces the matching default.
    return {**defaults, **override_kwargs}
def subsets(self, feature_dictionary):
    """Generate subsets of a feature dict

    Args:
        feature_dictionary (dict) tables and the features contained in each

    The feature dictionary is meant to be keyed on source table. Example:
    {
        'feature_table_one': ['feature_one', 'feature_two'],
        'feature_table_two': ['feature_three', 'feature_four'],
    }

    Returns: (list) subsets of the feature dictionary, in the same
        table-based structure

    Raises:
        ValueError: if none of the generated subsets contains any entries
            (including when the definition produces no subsets at all)
    """
    logging.info(
        "Creating feature groups. config: %s, Master feature dictionary: %s",
        self.definition,
        feature_dictionary,
    )
    subsets = []
    # Iterate grouping methods in sorted name order so the output order is stable.
    for name, config in sorted(self.definition.items()):
        logging.info("Parsing config grouping method %s, items %s", name, config)
        for config_item in config:
            subset = FeatureGroup(name="{}: {}".format(name, config_item))
            logging.info("Finding columns that might belong in %s", subset)
            for table, features in feature_dictionary.items():
                logging.info(
                    "Searching features in table %s that match group %s",
                    table,
                    subset,
                )
                # The subsetter registered under this grouping method's name
                # decides which of the table's features belong to the group.
                matching_features = self.subsetters[name](config_item, table, features)
                logging.info(
                    "Found %s matching features in table %s that match group %s",
                    len(matching_features),
                    table,
                    subset,
                )
                # Tables without any match are left out of the subset entirely.
                if len(matching_features) > 0:
                    subset[table] = FeatureNameList(matching_features)
            subsets.append(subset)

    if not any(subset for subset in subsets if any(subset)):
        # Build ONE message string. Passing the two halves as separate
        # arguments made the exception render as a tuple of strings.
        raise ValueError(
            f"Problem! The feature group definition {self.definition} "
            f"did not find any matches in feature dictionary {feature_dictionary}"
        )

    logging.info("Found %s total feature subsets", len(subsets))
    return subsets
def sample_metadata():
    """Build a sample metadata dict with fixed values

    Returns: (dict) a newly created dictionary on every call
    """
    feature_start = datetime.date(2012, 12, 20)
    end = datetime.date(2016, 12, 20)
    feature_names = FeatureNameList(["ft1", "ft2"])

    metadata = {
        "feature_start_time": feature_start,
        "end_time": end,
        "label_name": "label",
        "as_of_date_frequency": "1w",
        "max_training_history": "5y",
        "state": "default",
        "cohort_name": "default",
        "label_timespan": "1y",
        "metta-uuid": "1234",
        "feature_names": feature_names,
        "feature_groups": ["all: True"],
        "indices": ["entity_id"],
    }
    return metadata
def subsets(self, feature_dictionary):
    """Generate subsets of a feature dict

    Args:
        feature_dictionary (dict) tables and the features contained in each

    The feature dictionary is meant to be keyed on source table. Example:
    {
        'feature_table_one': ['feature_one', 'feature_two'],
        'feature_table_two': ['feature_three', 'feature_four'],
    }

    Returns: (list) subsets of the feature dictionary, in the same
        table-based structure

    Raises:
        ValueError: if none of the generated subsets contains any entries
            (including when the definition produces no subsets at all)
    """
    logger.spam(
        f"Creating feature groups, using: {self.definition}, Master feature dictionary: {feature_dictionary}",
    )
    subsets = []
    # Iterate grouping methods in sorted name order so the output order is stable.
    for name, config in sorted(self.definition.items()):
        logger.spam(f"Parsing config grouping method {name}, items {config}")
        for config_item in config:
            subset = FeatureGroup(name=f"{name}: {config_item}")
            logger.spam(
                f"Finding columns that might belong in [{name}: {config_item}]"
            )
            for table, features in feature_dictionary.items():
                logger.spam(
                    f"Searching features in table {table} that match group {subset}"
                )
                # The subsetter registered under this grouping method's name
                # decides which of the table's features belong to the group.
                matching_features = self.subsetters[name](config_item, table, features)
                logger.debug(
                    f"Found {len(matching_features)} matching features in table {table} that match group [{name}: {config_item}]",
                )
                # Tables without any match are left out of the subset entirely.
                if len(matching_features) > 0:
                    subset[table] = FeatureNameList(matching_features)
            subsets.append(subset)

    if not any(subset for subset in subsets if any(subset)):
        # Build ONE message string. Passing the two halves as separate
        # arguments made the exception render as a tuple of strings.
        raise ValueError(
            f"Problem! The feature group definition {self.definition} "
            f"did not find any matches in feature dictionary {feature_dictionary}"
        )

    logger.verbose(f"Found {len(subsets)} total feature subsets")
    return subsets
def feature_dictionary(self, feature_table_names, index_column_lookup):
    """
    Map each included feature table to the feature names found in it.

    One feature-names query is executed per table, and the first column of
    every returned row is taken as a feature name.

    :param feature_table_names: passed to ``self._tables_to_include`` to
        obtain the tables that are iterated
    :param index_column_lookup: indexed by table name; the value is handed
        to ``self._build_feature_names_query`` together with the table name
    :return: feature_dictionary
    :rtype: dict
    """
    names_by_table = {}

    for table_name in self._tables_to_include(feature_table_names):
        names_query = self._build_feature_names_query(
            table_name, index_column_lookup[table_name]
        )
        result_rows = self.db_engine.execute(names_query)
        names_by_table[table_name] = FeatureNameList(
            [row[0] for row in result_rows]
        )

    logger.spam(f"Feature dictionary built: {names_by_table}")
    return names_by_table