    def _features_by_type(self,
                          all_features,
                          dataframe,
                          max_depth,
                          column_schemas=None):

        selected_features = []

        if max_depth is not None and max_depth < 0:
            return selected_features

        if dataframe.ww.name not in all_features:
            return selected_features

        # Expand multi-output features into one sliced feature per output
        dataframe_features = all_features[dataframe.ww.name].copy()
        for fname, feature in all_features[dataframe.ww.name].items():
            outputs = feature.number_output_features
            if outputs > 1:
                del dataframe_features[fname]
                for i in range(outputs):
                    new_feat = feature[i]
                    dataframe_features[new_feat.unique_name()] = new_feat

        for f in dataframe_features.values():
            if column_schemas == "all" or any(
                    is_valid_input(f.column_schema, schema)
                    for schema in column_schemas):
                if (max_depth is None or
                        f.get_depth(stop_at=self.seed_features) <= max_depth):
                    selected_features.append(f)

        return selected_features
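
For reference, the filter above keeps a feature only if its column_schema satisfies one of the requested ColumnSchema templates and its depth is within max_depth. Below is a minimal, self-contained sketch of that schema check; schema_matches and the example column names are local stand-ins (not featuretools APIs), written to follow the is_valid_input test cases shown further down this page.

from woodwork.column_schema import ColumnSchema
from woodwork.logical_types import Integer, NaturalLanguage

def schema_matches(candidate, template):
    # Stand-in for featuretools' is_valid_input: the template's logical type
    # (when set) must match the candidate's, and the template's semantic tags
    # must be a subset of the candidate's tags.
    if template.logical_type is not None:
        if candidate.logical_type is None:
            return False
        if type(candidate.logical_type) != type(template.logical_type):
            return False
    return template.semantic_tags.issubset(candidate.semantic_tags)

columns = {
    "age": ColumnSchema(logical_type=Integer, semantic_tags={"numeric"}),
    "notes": ColumnSchema(logical_type=NaturalLanguage),
}
numeric_only = [ColumnSchema(semantic_tags={"numeric"})]
selected = [name for name, schema in columns.items()
            if any(schema_matches(schema, template) for template in numeric_only)]
print(selected)  # ['age']
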
    def _check_input_types(self):
        if len(self.base_features) == 0:
            return True

        input_types = self.primitive.input_types
        if input_types is not None:
            # A flat list describes a single signature; wrap it so one
            # signature and several alternative signatures are handled alike.
            if not isinstance(input_types[0], list):
                input_types = [input_types]

            for signature in input_types:
                zipped = list(zip(signature, self.base_features))
                if all(is_valid_input(f.column_schema, schema)
                       for schema, f in zipped):
                    return True
        else:
            return True
        return False
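
The method above normalizes primitive.input_types so that a flat list (one signature) and a list of lists (several alternative signatures) are handled the same way, then accepts the base features if any signature matches positionally. A small sketch of that normalization; matches_signature is a simplified tag-subset check standing in for is_valid_input, and the schemas are made up for illustration.

from woodwork.column_schema import ColumnSchema
from woodwork.logical_types import Integer

def matches_signature(signature, schemas):
    # Simplified positional check (tag subsets only), standing in for
    # calling is_valid_input on each (template, feature schema) pair.
    return all(t.semantic_tags.issubset(s.semantic_tags)
               for t, s in zip(signature, schemas))

# A primitive declaring two numeric inputs as a flat list: one signature.
input_types = [ColumnSchema(semantic_tags={"numeric"}),
               ColumnSchema(semantic_tags={"numeric"})]
if not isinstance(input_types[0], list):
    input_types = [input_types]      # normalize to a list of signatures

base_schemas = [ColumnSchema(logical_type=Integer, semantic_tags={"numeric"}),
                ColumnSchema(logical_type=Integer, semantic_tags={"numeric"})]
print(any(matches_signature(sig, base_schemas) for sig in input_types))  # True
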
def test_is_valid_input():
    assert is_valid_input(candidate=ColumnSchema(), template=ColumnSchema())

    assert is_valid_input(candidate=ColumnSchema(logical_type=Integer, semantic_tags={'index'}),
                          template=ColumnSchema(logical_type=Integer, semantic_tags={'index'}))

    assert is_valid_input(candidate=ColumnSchema(logical_type=Integer, semantic_tags={'index', 'numeric'}),
                          template=ColumnSchema(semantic_tags={'index'}))

    assert is_valid_input(candidate=ColumnSchema(semantic_tags={'index'}),
                          template=ColumnSchema(semantic_tags={'index'}))

    assert is_valid_input(candidate=ColumnSchema(logical_type=Integer, semantic_tags={'index'}),
                          template=ColumnSchema())

    assert is_valid_input(candidate=ColumnSchema(logical_type=Integer),
                          template=ColumnSchema(logical_type=Integer))

    assert is_valid_input(candidate=ColumnSchema(logical_type=Integer, semantic_tags={'numeric'}),
                          template=ColumnSchema(logical_type=Integer))

    assert not is_valid_input(candidate=ColumnSchema(logical_type=Integer, semantic_tags={'index'}),
                              template=ColumnSchema(logical_type=Double, semantic_tags={'index'}))

    assert not is_valid_input(candidate=ColumnSchema(logical_type=Integer, semantic_tags={}),
                              template=ColumnSchema(logical_type=Integer, semantic_tags={'index'}))

    assert not is_valid_input(candidate=ColumnSchema(),
                              template=ColumnSchema(logical_type=Integer, semantic_tags={'index'}))

    assert not is_valid_input(candidate=ColumnSchema(),
                              template=ColumnSchema(logical_type=Integer))

    assert not is_valid_input(candidate=ColumnSchema(),
                              template=ColumnSchema(semantic_tags={'index'}))
def _match_contains_numeric_foreign_key(match):
    match_schema = ColumnSchema(semantic_tags={"foreign_key", "numeric"})
    return any(is_valid_input(f.column_schema, match_schema) for f in match)
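
As a quick illustration of the helper above, the {'foreign_key', 'numeric'} template only matches columns carrying both tags. The snippet below uses the tag-subset rule as a local stand-in for is_valid_input; the candidate schemas are made up for illustration.

from woodwork.column_schema import ColumnSchema
from woodwork.logical_types import Integer

match_schema = ColumnSchema(semantic_tags={"foreign_key", "numeric"})
candidates = [
    ColumnSchema(logical_type=Integer, semantic_tags={"numeric"}),
    ColumnSchema(logical_type=Integer, semantic_tags={"foreign_key", "numeric"}),
]
# Tag-subset rule standing in for is_valid_input: both tags must be present.
print(any(match_schema.semantic_tags.issubset(c.semantic_tags)
          for c in candidates))  # True -- only the second candidate qualifies
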
    def build_features(self, return_types=None, verbose=False):
        """Automatically builds feature definitions for target
            dataframe using Deep Feature Synthesis algorithm

        Args:
            return_types (list[woodwork.ColumnSchema] or str, optional):
                List of ColumnSchemas defining the types of
                columns to return. If None, defaults to returning all
                numeric, categorical and boolean types. If given as
                the string 'all', use all available return types.

            verbose (bool, optional): If True, print progress.

        Returns:
            list[BaseFeature]: Returns a list of
                features for target dataframe, sorted by feature depth
                (shallow first).
        """
        all_features = {}

        self.where_clauses = defaultdict(set)

        if return_types is None:
            return_types = [
                ColumnSchema(semantic_tags=["numeric"]),
                ColumnSchema(semantic_tags=["category"]),
                ColumnSchema(logical_type=Boolean),
                ColumnSchema(logical_type=BooleanNullable),
            ]
        elif return_types == "all":
            pass
        else:
            msg = "return_types must be a list, or 'all'"
            assert isinstance(return_types, list), msg

        self._run_dfs(
            self.es[self.target_dataframe_name],
            RelationshipPath([]),
            all_features,
            max_depth=self.max_depth,
        )

        new_features = list(all_features[self.target_dataframe_name].values())

        def filt(f):
            # remove identity features of the ID field of the target dataframe
            if (isinstance(f, IdentityFeature)
                    and f.dataframe_name == self.target_dataframe_name
                    and f.column_name
                    == self.es[self.target_dataframe_name].ww.index):
                return False

            return True

        # filter out features with undesired return types
        if return_types != "all":
            new_features = [
                f for f in new_features if any(
                    is_valid_input(f.column_schema, schema)
                    for schema in return_types)
            ]
        new_features = list(filter(filt, new_features))

        new_features.sort(key=lambda f: f.get_depth())

        new_features = self._filter_features(new_features)

        if self.max_features > 0:
            new_features = new_features[:self.max_features]

        if verbose:
            print("Built {} features".format(len(new_features)))
        return new_features
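
For context, build_features is normally reached through featuretools' dfs entry point rather than called directly. The sketch below is hedged: it assumes the bundled mock-customer demo EntitySet is available and that the installed featuretools release exposes features_only and return_types on dfs; adjust names to your install.

import featuretools as ft
from woodwork.column_schema import ColumnSchema

es = ft.demo.load_mock_customer(return_entityset=True)

# features_only=True returns feature definitions without computing a matrix.
features = ft.dfs(
    entityset=es,
    target_dataframe_name="customers",
    max_depth=2,
    features_only=True,
    return_types=[ColumnSchema(semantic_tags={"numeric"})],  # numeric outputs only
)
print(len(features))
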
def match_by_schema(features, column_schema):
    matches = []
    for f in features:
        if is_valid_input(f.column_schema, column_schema):
            matches.append(f)
    return matches
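
A small usage sketch for a matcher of this shape; the SimpleNamespace feature stand-ins, match_by_tags, and the local tag-subset check are illustrative only, not part of featuretools.

from types import SimpleNamespace
from woodwork.column_schema import ColumnSchema
from woodwork.logical_types import Categorical, Integer

def match_by_tags(features, column_schema):
    # Same shape as match_by_schema above, with a local tag-subset check
    # standing in for is_valid_input.
    return [f for f in features
            if column_schema.semantic_tags.issubset(f.column_schema.semantic_tags)]

features = [
    SimpleNamespace(name="age",
                    column_schema=ColumnSchema(logical_type=Integer,
                                               semantic_tags={"numeric"})),
    SimpleNamespace(name="country",
                    column_schema=ColumnSchema(logical_type=Categorical,
                                               semantic_tags={"category"})),
]
matches = match_by_tags(features, ColumnSchema(semantic_tags={"numeric"}))
print([f.name for f in matches])  # ['age']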