def _features_by_type(self, all_features, dataframe, max_depth, column_schemas=None):
    """Select features of the given dataframe that match the requested column
    schemas and do not exceed ``max_depth``.

    Args:
        all_features (dict): Mapping of dataframe name -> {feature name -> feature}.
        dataframe: Woodwork-initialized dataframe whose features are selected.
        max_depth (int or None): Maximum allowed feature depth; ``None`` means
            unlimited. A negative value selects nothing.
        column_schemas (list[ColumnSchema] or "all", optional): Schemas a
            feature must match; the string "all" accepts every feature.

    Returns:
        list: Matching features.
    """
    matched = []
    if max_depth is not None and max_depth < 0:
        return matched

    df_name = dataframe.ww.name
    if df_name not in all_features:
        return matched

    # Work on a copy so the shared registry is not mutated while we expand
    # multi-output features into one entry per output slice.
    candidates = all_features[df_name].copy()
    for name, feature in all_features[df_name].items():
        n_outputs = feature.number_output_features
        if n_outputs <= 1:
            continue
        del candidates[name]
        for idx in range(n_outputs):
            sliced = feature[idx]
            candidates[sliced.unique_name()] = sliced

    for candidate in candidates.values():
        if column_schemas != "all" and not any(
            is_valid_input(candidate.column_schema, schema)
            for schema in column_schemas
        ):
            continue
        # Depth is measured stopping at seed features, mirroring DFS limits.
        if max_depth is None or candidate.get_depth(stop_at=self.seed_features) <= max_depth:
            matched.append(candidate)
    return matched
def _check_input_types(self):
    """Return True if the base features satisfy one of the primitive's
    declared input type sets (or if there is nothing to check).

    A primitive may declare either a single list of ColumnSchemas or a list
    of such lists (alternative signatures); any one matching signature is
    sufficient.
    """
    if not self.base_features:
        return True

    input_types = self.primitive.input_types
    if input_types is None:
        # Primitive accepts anything.
        return True

    # Normalize a single type set into a list of alternative type sets.
    if type(input_types[0]) is not list:
        input_types = [input_types]

    for type_set in input_types:
        pairs = list(zip(type_set, self.base_features))
        if all(
            is_valid_input(feature.column_schema, schema)
            for schema, feature in pairs
        ):
            return True
    return False
def test_is_valid_input():
    """is_valid_input: a candidate schema matches a template when it carries
    the template's logical type and semantic tags; an unconstrained template
    matches anything, while a constrained template rejects a bare candidate."""
    accepted = [
        (ColumnSchema(), ColumnSchema()),
        (ColumnSchema(logical_type=Integer, semantic_tags={'index'}),
         ColumnSchema(logical_type=Integer, semantic_tags={'index'})),
        (ColumnSchema(logical_type=Integer, semantic_tags={'index', 'numeric'}),
         ColumnSchema(semantic_tags={'index'})),
        (ColumnSchema(semantic_tags={'index'}),
         ColumnSchema(semantic_tags={'index'})),
        (ColumnSchema(logical_type=Integer, semantic_tags={'index'}),
         ColumnSchema()),
        (ColumnSchema(logical_type=Integer),
         ColumnSchema(logical_type=Integer)),
        (ColumnSchema(logical_type=Integer, semantic_tags={'numeric'}),
         ColumnSchema(logical_type=Integer)),
    ]
    for candidate, template in accepted:
        assert is_valid_input(candidate=candidate, template=template)

    rejected = [
        (ColumnSchema(logical_type=Integer, semantic_tags={'index'}),
         ColumnSchema(logical_type=Double, semantic_tags={'index'})),
        (ColumnSchema(logical_type=Integer, semantic_tags={}),
         ColumnSchema(logical_type=Integer, semantic_tags={'index'})),
        (ColumnSchema(),
         ColumnSchema(logical_type=Integer, semantic_tags={'index'})),
        (ColumnSchema(), ColumnSchema(logical_type=Integer)),
        (ColumnSchema(), ColumnSchema(semantic_tags={'index'})),
    ]
    for candidate, template in rejected:
        assert not is_valid_input(candidate=candidate, template=template)
def _match_contains_numeric_foreign_key(match):
    """Return True if any feature in ``match`` is a numeric foreign key.

    A feature qualifies when its column schema carries both the
    "foreign_key" and "numeric" semantic tags.
    """
    target_schema = ColumnSchema(semantic_tags={"foreign_key", "numeric"})
    for feature in match:
        if is_valid_input(feature.column_schema, target_schema):
            return True
    return False
def build_features(self, return_types=None, verbose=False):
    """Automatically builds feature definitions for target dataframe using
    Deep Feature Synthesis algorithm

    Args:
        return_types (list[woodwork.ColumnSchema] or str, optional):
            List of ColumnSchemas defining the types of
            columns to return. If None, defaults to returning all
            numeric, categorical and boolean types. If given as
            the string 'all', use all available return types.

        verbose (bool, optional): If True, print progress.

    Returns:
        list[BaseFeature]: Returns a list of
            features for target dataframe, sorted by feature depth
            (shallow first).
    """
    all_features = {}
    self.where_clauses = defaultdict(set)

    if return_types is None:
        # Default: numeric, categorical, and (nullable) boolean columns.
        return_types = [
            ColumnSchema(semantic_tags=["numeric"]),
            ColumnSchema(semantic_tags=["category"]),
            ColumnSchema(logical_type=Boolean),
            ColumnSchema(logical_type=BooleanNullable),
        ]
    elif return_types != "all":
        # NOTE: assert is stripped under `python -O`; kept as-is so callers
        # relying on AssertionError keep seeing the same exception type.
        assert isinstance(return_types, list), "return_types must be a list, or 'all'"

    # Recursively enumerate candidate features starting from the target dataframe.
    self._run_dfs(
        self.es[self.target_dataframe_name],
        RelationshipPath([]),
        all_features,
        max_depth=self.max_depth,
    )

    new_features = list(all_features[self.target_dataframe_name].values())

    def filt(f):
        # remove identity features of the ID field of the target dataframe
        if (
            isinstance(f, IdentityFeature)
            and f.dataframe_name == self.target_dataframe_name
            and f.column_name == self.es[self.target_dataframe_name].ww.index
        ):
            return False
        return True

    # filter out features with undesired return types
    if return_types != "all":
        new_features = [
            f
            for f in new_features
            if any(is_valid_input(f.column_schema, schema) for schema in return_types)
        ]

    new_features = list(filter(filt, new_features))
    new_features.sort(key=lambda f: f.get_depth())
    new_features = self._filter_features(new_features)

    if self.max_features > 0:
        new_features = new_features[: self.max_features]

    if verbose:
        print("Built {} features".format(len(new_features)))
    # Removed dead `verbose = None` assignment that immediately preceded the
    # return in the original — it had no effect on behavior.
    return new_features
def match_by_schema(features, column_schema):
    """Return the subset of ``features`` whose column schema satisfies
    ``column_schema``.

    Args:
        features (iterable): Features to filter; each must expose a
            ``column_schema`` attribute.
        column_schema (ColumnSchema): Template schema to match against.

    Returns:
        list: Features that are valid inputs for ``column_schema``.
    """
    # Idiomatic comprehension replaces the manual append loop (`matches += [f]`).
    return [f for f in features if is_valid_input(f.column_schema, column_schema)]