Example No. 1
def test_precalculated_features(pd_es):
    error_msg = (
        "This primitive should never be used because the features are precalculated"
    )

    class ErrorPrim(AggregationPrimitive):
        """A primitive whose function raises an error."""

        name = "error_prim"
        input_types = [ColumnSchema(semantic_tags={"numeric"})]
        return_type = ColumnSchema(semantic_tags={"numeric"})

        def get_function(self, agg_type="pandas"):
            def error(s):
                raise RuntimeError(error_msg)

            return error

    value = ft.Feature(pd_es["log"].ww["value"])
    agg = ft.Feature(value,
                     parent_dataframe_name="sessions",
                     primitive=ErrorPrim)
    agg2 = ft.Feature(agg,
                      parent_dataframe_name="customers",
                      primitive=ErrorPrim)
    direct = ft.Feature(agg2, dataframe_name="sessions")

    # Set up a FeatureSet which knows which features are precalculated.
    precalculated_feature_trie = Trie(default=set,
                                      path_constructor=RelationshipPath)
    precalculated_feature_trie.get_node(direct.relationship_path).value.add(
        agg2.unique_name())
    feature_set = FeatureSet(
        [direct], approximate_feature_trie=precalculated_feature_trie)

    # Fake precalculated data.
    values = [0, 1, 2]
    parent_fm = pd.DataFrame({agg2.get_name(): values})
    precalculated_fm_trie = Trie(path_constructor=RelationshipPath)
    precalculated_fm_trie.get_node(direct.relationship_path).value = parent_fm

    calculator = FeatureSetCalculator(
        pd_es,
        feature_set=feature_set,
        precalculated_features=precalculated_fm_trie)

    instance_ids = [0, 2, 3, 5]
    fm = calculator.run(np.array(instance_ids))

    assert list(
        fm[direct.get_name()]) == [values[0], values[0], values[1], values[2]]

    # Calculating without precalculated features should error.
    with pytest.raises(RuntimeError, match=error_msg):
        FeatureSetCalculator(
            pd_es, feature_set=FeatureSet([direct])).run(instance_ids)
Example No. 2
def test_precalculated_features(es):
    error_msg = 'This primitive should never be used because the features are precalculated'

    class ErrorPrim(AggregationPrimitive):
        """A primitive whose function raises an error."""
        name = "error_prim"
        input_types = [Numeric]
        return_type = Numeric

        def get_function(self):
            def error(s):
                raise RuntimeError(error_msg)

            return error

    value = ft.Feature(es['log']['value'])
    agg = ft.Feature(value, parent_entity=es['sessions'], primitive=ErrorPrim)
    agg2 = ft.Feature(agg, parent_entity=es['customers'], primitive=ErrorPrim)
    direct = ft.Feature(agg2, entity=es['sessions'])

    # Set up a FeatureSet which knows which features are precalculated.
    precalculated_feature_trie = Trie(default=set,
                                      path_constructor=RelationshipPath)
    precalculated_feature_trie.get_node(direct.relationship_path).value.add(
        agg2.unique_name())
    feature_set = FeatureSet(
        [direct], approximate_feature_trie=precalculated_feature_trie)

    # Fake precalculated data.
    values = [0, 1, 2]
    parent_fm = pd.DataFrame({agg2.get_name(): values})
    precalculated_fm_trie = Trie(path_constructor=RelationshipPath)
    precalculated_fm_trie.get_node(direct.relationship_path).value = parent_fm

    calculator = FeatureSetCalculator(
        es,
        feature_set=feature_set,
        precalculated_features=precalculated_fm_trie)

    instance_ids = [0, 2, 3, 5]
    fm = calculator.run(np.array(instance_ids))

    assert list(
        fm[direct.get_name()]) == [values[0], values[0], values[1], values[2]]

    # Calculating without precalculated features should error.
    with pytest.raises(RuntimeError, match=error_msg):
        FeatureSetCalculator(
            es, feature_set=FeatureSet([direct])).run(instance_ids)
Example No. 3
    def __init__(self,
                 entityset,
                 feature_set,
                 time_last=None,
                 training_window=None,
                 precalculated_features=None):
        """
        Args:
            feature_set (FeatureSet): The features to calculate values for.

            time_last (pd.Timestamp, optional): Last allowed time. Data from exactly this
                time not allowed.

            training_window (Timedelta, optional): Window defining how much time before the cutoff time data
                can be used when calculating features. If None, all data before cutoff time is used.

            precalculated_features (Trie[RelationshipPath -> pd.DataFrame]):
                Maps RelationshipPaths to dataframes of precalculated_features

        """
        self.entityset = entityset
        self.feature_set = feature_set
        self.training_window = training_window

        if time_last is None:
            time_last = datetime.now()

        self.time_last = time_last

        if precalculated_features is None:
            precalculated_features = Trie(path_constructor=RelationshipPath)

        self.precalculated_features = precalculated_features
Example No. 4
def gather_approximate_features(feature_set):
    # A trie where the edges are RelationshipPaths and the nodes contain lists
    # of features.
    approximate_feature_trie = Trie(default=list,
                                    path_constructor=RelationshipPath)

    # A set of feature names.
    approximate_feature_set = set()

    for feature in feature_set.target_features:
        if feature_set.uses_full_entity(feature, check_dependents=True):
            continue

        if isinstance(feature, DirectFeature):
            path = feature.relationship_path
            base_feature = feature.base_features[0]

            while isinstance(base_feature, DirectFeature):
                path = path + base_feature.relationship_path
                base_feature = base_feature.base_features[0]

            if isinstance(base_feature, AggregationFeature):
                feature_list = approximate_feature_trie.get_node(path).value
                feature_list.append(base_feature)
                approximate_feature_set.add(base_feature.unique_name())

    return approximate_feature_trie, approximate_feature_set
Example No. 5
def test_feature_trie_ignores_approximate_features(es):
    value = ft.IdentityFeature(es['log']['value'])
    agg = ft.AggregationFeature(value,
                                es['sessions'],
                                primitive=ft.primitives.Mean)
    agg_of_agg = ft.AggregationFeature(agg,
                                       es['customers'],
                                       primitive=ft.primitives.Sum)
    direct = ft.DirectFeature(agg_of_agg, es['sessions'])
    features = [direct, agg]

    approximate_feature_trie = Trie(default=list,
                                    path_constructor=RelationshipPath)
    approximate_feature_trie.get_node(
        direct.relationship_path).value = [agg_of_agg]
    feature_set = FeatureSet(features,
                             approximate_feature_trie=approximate_feature_trie)
    trie = feature_set.feature_trie

    # Since agg_of_agg is ignored, it and its dependencies should not be in
    # the trie.
    sub_trie = trie.get_node(direct.relationship_path)
    for _path, (_, _, features) in sub_trie:
        assert not features

    assert trie.value == (False, set(),
                          {direct.unique_name(),
                           agg.unique_name()})
    assert trie.get_node(agg.relationship_path).value == \
        (False, set(), {value.unique_name()})
Example No. 6
def gather_approximate_features(feature_set):
    """
    Find features which can be approximated. Returned as a trie where the values
    are sets of feature names.

    Args:
        feature_set (FeatureSet): Features to search the dependencies of for
            features to approximate.

    Returns:
        Trie[RelationshipPath, set[str]]
    """
    approximate_feature_trie = Trie(default=set,
                                    path_constructor=RelationshipPath)

    for feature in feature_set.target_features:
        if feature_set.uses_full_entity(feature, check_dependents=True):
            continue

        if isinstance(feature, DirectFeature):
            path = feature.relationship_path
            base_feature = feature.base_features[0]

            while isinstance(base_feature, DirectFeature):
                path = path + base_feature.relationship_path
                base_feature = base_feature.base_features[0]

            if isinstance(base_feature, AggregationFeature):
                node_feature_set = approximate_feature_trie.get_node(
                    path).value
                node_feature_set.add(base_feature.unique_name())

    return approximate_feature_trie
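
The while loop above is the subtle step: it flattens a chain of direct features into a single relationship path before recording the aggregation at the end of the chain. A short trace with hypothetical feature names:

# Hypothetical trace of the path accumulation in gather_approximate_features:
#     agg = an AggregationFeature
#     d1 = DirectFeature(agg, ...)
#     d2 = DirectFeature(d1, ...)   # a target feature
# For d2, the loop computes path = d2.relationship_path + d1.relationship_path
# and stops at base_feature = agg, so agg.unique_name() is added to the set
# stored at the trie node for the full two-step path.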
Example No. 7
def test_get_node():
    t = Trie(default=lambda: 'default')

    t.get_node([1, 2, 3]).value = '123'
    t.get_node([1, 2, 4]).value = '124'
    sub = t.get_node([1, 2])
    assert sub.get_node([3]).value == '123'
    assert sub.get_node([4]).value == '124'

    sub.get_node([4, 5]).value = '1245'
    assert t.get_node([1, 2, 4, 5]).value == '1245'
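
The trie tests in these examples pin down a small interface: get_node creates intermediate nodes on demand, default is a factory producing each node's initial value, path_constructor builds the paths yielded during iteration, and iteration walks depth-first from the root. A minimal sketch that satisfies these tests (an illustration only, not the actual featuretools implementation):

class Trie(object):
    """Minimal prefix-tree sketch matching the surrounding tests."""

    def __init__(self, default=lambda: None, path_constructor=list):
        self._default = default
        self._path_constructor = path_constructor
        self.value = default()  # every node starts with a fresh default value
        self._children = {}

    def get_node(self, path):
        """Return the node at path, creating missing nodes along the way."""
        node = self
        for key in path:
            if key not in node._children:
                node._children[key] = Trie(
                    default=node._default,
                    path_constructor=node._path_constructor)
            node = node._children[key]
        return node

    def __iter__(self):
        """Yield (path, value) pairs in depth-first order, root first."""
        yield self._path_constructor([]), self.value
        for key, child in self._children.items():
            for sub_path, value in child:
                yield self._path_constructor([key]) + sub_path, value

The child ordering during iteration relies on dict insertion order, which is guaranteed in Python 3.7+ and matches the expectations in the test_iteration examples below.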
Example No. 8
def test_get_node():
    t = Trie(default=lambda: "default")

    t.get_node([1, 2, 3]).value = "123"
    t.get_node([1, 2, 4]).value = "124"
    sub = t.get_node([1, 2])
    assert sub.get_node([3]).value == "123"
    assert sub.get_node([4]).value == "124"

    sub.get_node([4, 5]).value = "1245"
    assert t.get_node([1, 2, 4, 5]).value == "1245"
Example No. 9
    def _build_feature_trie(self):
        """
        Build the feature trie by adding the target features and their dependencies recursively.
        """
        feature_trie = Trie(default=lambda: (False, set(), set()),
                            path_constructor=RelationshipPath)

        for f in self.target_features:
            self._add_feature_to_trie(feature_trie, f,
                                      self.approximate_feature_trie)

        return feature_trie
Example No. 10
def test_setting_and_getting():
    t = Trie(default=lambda: "default")
    assert t.get_node([1, 2, 3]).value == "default"

    t.get_node([1, 2, 3]).value = "123"
    t.get_node([1, 2, 4]).value = "124"
    assert t.get_node([1, 2, 3]).value == "123"
    assert t.get_node([1, 2, 4]).value == "124"

    assert t.get_node([1]).value == "default"
    t.get_node([1]).value = "1"
    assert t.get_node([1]).value == "1"

    t.get_node([1, 2, 3]).value = "updated"
    assert t.get_node([1, 2, 3]).value == "updated"
Example No. 11
    def _build_feature_trie(self):
        """
        Construct a trie mapping RelationshipPaths to a tuple of
        (bool, set[str], set[str]). The bool represents whether the full
        entity df is needed at that node, the first set contains the names of
        features which are needed on the full entity, and the second set
        contains the names of the rest of the features.
        """
        feature_trie = Trie(default=lambda: (False, set(), set()),
                            path_constructor=RelationshipPath)

        for f in self.target_features:
            self._add_feature_to_trie(feature_trie, f)

        return feature_trie
Example No. 12
def test_iteration():
    t = Trie(default=lambda: 'default', path_constructor=tuple)

    t.get_node((1, 2, 3)).value = '123'
    t.get_node((1, 2, 4)).value = '124'
    expected = [
        ((), 'default'),
        ((1, ), 'default'),
        ((1, 2), 'default'),
        ((1, 2, 3), '123'),
        ((1, 2, 4), '124'),
    ]

    for i, value in enumerate(t):
        assert value == expected[i]
Example No. 13
def test_setting_and_getting():
    t = Trie(default=lambda: 'default')
    assert t.get_node([1, 2, 3]).value == 'default'

    t.get_node([1, 2, 3]).value = '123'
    t.get_node([1, 2, 4]).value = '124'
    assert t.get_node([1, 2, 3]).value == '123'
    assert t.get_node([1, 2, 4]).value == '124'

    assert t.get_node([1]).value == 'default'
    t.get_node([1]).value = '1'
    assert t.get_node([1]).value == '1'

    t.get_node([1, 2, 3]).value = 'updated'
    assert t.get_node([1, 2, 3]).value == 'updated'
Example No. 14
def test_iteration():
    t = Trie(default=lambda: "default", path_constructor=tuple)

    t.get_node((1, 2, 3)).value = "123"
    t.get_node((1, 2, 4)).value = "124"
    expected = [
        ((), "default"),
        ((1, ), "default"),
        ((1, 2), "default"),
        ((1, 2, 3), "123"),
        ((1, 2, 4), "124"),
    ]

    for i, value in enumerate(t):
        assert value == expected[i]
Example No. 15
    def __init__(self, features, approximate_feature_trie=None):
        """
        Args:
            features (list[Feature]): Features of the target entity.
            approximate_feature_trie (Trie[RelationshipPath, set[str]], optional): Dependency
                features to ignore because they have already been approximated. For example, if
                one of the target features is a direct feature of a feature A and A is included in
                approximate_feature_trie, then neither A nor its dependencies will appear in
                FeatureSet.feature_trie.
        """
        self.target_eid = features[0].entity.id
        self.target_features = features
        self.target_feature_names = {f.unique_name() for f in features}

        if not approximate_feature_trie:
            approximate_feature_trie = Trie(default=list,
                                            path_constructor=RelationshipPath)
        self.approximate_feature_trie = approximate_feature_trie

        # Maps the unique name of each feature to the actual feature. This is necessary
        # because features do not support equality and so cannot be used as
        # dictionary keys. The equality operator on features produces a new
        # feature (which will always be truthy).
        self.features_by_name = {f.unique_name(): f for f in features}

        feature_dependents = defaultdict(set)
        for f in features:
            deps = f.get_dependencies(deep=True)
            for dep in deps:
                feature_dependents[dep.unique_name()].add(f.unique_name())
                self.features_by_name[dep.unique_name()] = dep
                subdeps = dep.get_dependencies(deep=True)
                for sd in subdeps:
                    feature_dependents[sd.unique_name()].add(dep.unique_name())

        # Maps feature names (keys) to the features that rely on them (values).
        self.feature_dependents = {
            fname: [
                self.features_by_name[dname]
                for dname in feature_dependents[fname]
            ]
            for fname, f in self.features_by_name.items()
        }

        self._feature_trie = None
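
In outline, the approximate_feature_trie mechanism described in the docstring works as in test_feature_trie_ignores_approximate_features (Example No. 5), sketched here with hypothetical names:

# Outline (hypothetical names):
#     A = an AggregationFeature on a parent entity
#     direct = ft.DirectFeature(A, es['sessions'])
#     approx_trie = Trie(default=list, path_constructor=RelationshipPath)
#     approx_trie.get_node(direct.relationship_path).value = [A]
#     fs = FeatureSet([direct], approximate_feature_trie=approx_trie)
# fs.feature_trie then omits A and all of A's dependencies.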
Example No. 16
    def run(self, instance_ids):
        """
        Calculate values of features for the given instances of the target
        entity.

        Summary of algorithm:
        1. Construct a trie where the edges are relationships and each node
            contains a set of features for a single entity. See
            FeatureSet._build_feature_trie.
        2. Initialize a trie for storing dataframes.
        3. Traverse the trie using depth first search. At each node calculate
            the features and store the resulting dataframe in the dataframe
            trie (so that its values can be used by features which depend on
            these features). See _calculate_features_for_entity.
        4. Get the dataframe at the root of the trie (for the target entity) and
            return the columns corresponding to the requested features.

        Args:
            instance_ids (list): List of instance ids for which to build features.

        Returns:
            pd.DataFrame : Pandas DataFrame of calculated feature values.
                Indexed by instance_ids. Columns in same order as features
                passed in.
        """
        assert len(instance_ids) > 0, "0 instance ids provided"

        feature_trie = self.feature_set.feature_trie

        df_trie = Trie(path_constructor=RelationshipPath)
        full_entity_df_trie = Trie(path_constructor=RelationshipPath)

        target_entity = self.entityset[self.feature_set.target_eid]
        self._calculate_features_for_entity(
            entity_id=self.feature_set.target_eid,
            feature_trie=feature_trie,
            df_trie=df_trie,
            full_entity_df_trie=full_entity_df_trie,
            precalculated_trie=self.precalculated_features,
            filter_variable=target_entity.index,
            filter_values=instance_ids)

        # The dataframe for the target entity should be stored at the root of
        # df_trie.
        df = df_trie.value

        if df.empty:
            return self.generate_default_df(instance_ids=instance_ids)

        # fill in empty rows with default values
        missing_ids = [
            i for i in instance_ids if i not in df[target_entity.index]
        ]
        if missing_ids:
            default_df = self.generate_default_df(instance_ids=missing_ids,
                                                  extra_columns=df.columns)
            df = df.append(default_df, sort=True)

        df.index.name = self.entityset[self.feature_set.target_eid].index
        column_list = []
        for feat in self.feature_set.target_features:
            column_list.extend(feat.get_feature_names())
        return df[column_list]
Example No. 17
def approximate_features(feature_set,
                         cutoff_time,
                         window,
                         entityset,
                         training_window=None):
    '''Given a set of features and cutoff_times to be passed to
    calculate_feature_matrix, calculates approximate values of some features
    to speed up calculations. Cutoff times are sorted into window-sized
    buckets and the approximate feature values are only calculated at one
    cutoff time for each bucket.

    .. note:: this only approximates DirectFeatures of AggregationFeatures, on
        the target entity. In future versions, it may also be possible to
        approximate these features on other top-level entities.

    Args:
        feature_set (:class:`.FeatureSet`): The features to be calculated.

        cutoff_time (pd.DataFrame): Specifies what time to calculate
            the features for each instance at. The resulting feature matrix
            will use data up to and including the cutoff_time. A DataFrame
            with 'instance_id' and 'time' columns.

        window (Timedelta or str): Frequency by which to group instances with
            similar cutoff times for features with costly calculations. For
            example, if the window is 24 hours, all instances with cutoff
            times on the same day will use the same calculation for expensive
            features.

        entityset (:class:`.EntitySet`): An already initialized entityset.

        training_window (Timedelta, optional):
            Window defining how much older than the cutoff time data
            can be to be included when calculating the feature. If None, all
            older data is used.
    '''
    approx_fms_trie = Trie(path_constructor=RelationshipPath)

    target_time_colname = 'target_time'
    cutoff_time[target_time_colname] = cutoff_time['time']
    approx_cutoffs = bin_cutoff_times(cutoff_time.copy(), window)
    cutoff_df_time_var = 'time'
    cutoff_df_instance_var = 'instance_id'
    # should this order be by dependencies so that calculate_feature_matrix
    # doesn't skip approximating something?
    for relationship_path, approx_feature_names in feature_set.approximate_feature_trie:
        if not approx_feature_names:
            continue

        cutoffs_with_approx_e_ids, new_approx_entity_index_var = \
            _add_approx_entity_index_var(entityset, feature_set.target_eid,
                                         approx_cutoffs.copy(), relationship_path)

        # Select only columns we care about
        columns_we_want = [
            new_approx_entity_index_var, cutoff_df_time_var,
            target_time_colname
        ]

        cutoffs_with_approx_e_ids = cutoffs_with_approx_e_ids[columns_we_want]
        cutoffs_with_approx_e_ids = cutoffs_with_approx_e_ids.drop_duplicates()
        cutoffs_with_approx_e_ids.dropna(subset=[new_approx_entity_index_var],
                                         inplace=True)

        approx_features = [
            feature_set.features_by_name[name] for name in approx_feature_names
        ]
        if cutoffs_with_approx_e_ids.empty:
            approx_fm = gen_empty_approx_features_df(approx_features)
        else:
            cutoffs_with_approx_e_ids.sort_values(
                [cutoff_df_time_var, new_approx_entity_index_var],
                inplace=True)
            # CFM assumes specific column names for cutoff_time argument
            rename = {new_approx_entity_index_var: cutoff_df_instance_var}
            cutoff_time_to_pass = cutoffs_with_approx_e_ids.rename(
                columns=rename)
            cutoff_time_to_pass = cutoff_time_to_pass[[
                cutoff_df_instance_var, cutoff_df_time_var
            ]]

            cutoff_time_to_pass.drop_duplicates(inplace=True)
            approx_fm = calculate_feature_matrix(
                approx_features,
                entityset,
                cutoff_time=cutoff_time_to_pass,
                training_window=training_window,
                approximate=None,
                cutoff_time_in_index=False,
                chunk_size=cutoff_time_to_pass.shape[0])

        approx_fms_trie.get_node(relationship_path).value = approx_fm

    return approx_fms_trie
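
bin_cutoff_times is not shown in these examples; below is a minimal sketch of the assumed bucketing semantics (flooring each cutoff time to its window, so every instance in a bucket shares one approximate calculation):

import pandas as pd


def bin_cutoff_times_sketch(cutoff_time, window):
    """Assumed semantics of bin_cutoff_times (hypothetical helper): floor the
    'time' column to window-sized buckets so that instances in the same
    bucket share the same approximate feature values."""
    binned = cutoff_time.copy()
    # e.g. window='24h' puts all cutoff times from the same day in one bucket
    binned['time'] = binned['time'].dt.floor(window)
    return binned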
Example No. 18
    def run(self, instance_ids, progress_callback=None):
        """
        Calculate values of features for the given instances of the target
        entity.

        Summary of algorithm:
        1. Construct a trie where the edges are relationships and each node
            contains a set of features for a single entity. See
            FeatureSet._build_feature_trie.
        2. Initialize a trie for storing dataframes.
        3. Traverse the trie using depth first search. At each node calculate
            the features and store the resulting dataframe in the dataframe
            trie (so that its values can be used by features which depend on
            these features). See _calculate_features_for_entity.
        4. Get the dataframe at the root of the trie (for the target entity) and
            return the columns corresponding to the requested features.

        Args:
            instance_ids (np.ndarray or pd.Categorical): Instance ids for which
                to build features.

            progress_callback (callable): function to be called with incremental progress updates

        Returns:
            pd.DataFrame : Pandas DataFrame of calculated feature values.
                Indexed by instance_ids. Columns in same order as features
                passed in.
        """
        assert len(instance_ids) > 0, "0 instance ids provided"

        if progress_callback is None:
            # do nothing for the progress callback if not provided
            def progress_callback(*args):
                pass

        feature_trie = self.feature_set.feature_trie

        df_trie = Trie(path_constructor=RelationshipPath)
        full_entity_df_trie = Trie(path_constructor=RelationshipPath)

        target_entity = self.entityset[self.feature_set.target_eid]
        self._calculate_features_for_entity(
            entity_id=self.feature_set.target_eid,
            feature_trie=feature_trie,
            df_trie=df_trie,
            full_entity_df_trie=full_entity_df_trie,
            precalculated_trie=self.precalculated_features,
            filter_variable=target_entity.index,
            filter_values=instance_ids,
            progress_callback=progress_callback)

        # The dataframe for the target entity should be stored at the root of
        # df_trie.
        df = df_trie.value

        if df.empty:
            return self.generate_default_df(instance_ids=instance_ids)

        # fill in empty rows with default values
        missing_ids = [
            i for i in instance_ids if i not in df[target_entity.index]
        ]
        if missing_ids:
            default_df = self.generate_default_df(instance_ids=missing_ids,
                                                  extra_columns=df.columns)
            df = df.append(default_df, sort=True)

        df.index.name = self.entityset[self.feature_set.target_eid].index
        column_list = []

        # Order by instance_ids
        unique_instance_ids = pd.unique(instance_ids)
        # pd.unique changes the dtype for Categorical, so reset it.
        unique_instance_ids = unique_instance_ids.astype(instance_ids.dtype)
        df = df.reindex(unique_instance_ids)

        for feat in self.feature_set.target_features:
            column_list.extend(feat.get_feature_names())
        return df[column_list]
Example No. 19
    def run(self,
            instance_ids,
            progress_callback=None,
            include_cutoff_time=True):
        """
        Calculate values of features for the given instances of the target
        dataframe.

        Summary of algorithm:
        1. Construct a trie where the edges are relationships and each node
            contains a set of features for a single dataframe. See
            FeatureSet._build_feature_trie.
        2. Initialize a trie for storing dataframes.
        3. Traverse the trie using depth first search. At each node calculate
            the features and store the resulting dataframe in the dataframe
            trie (so that its values can be used by features which depend on
            these features). See _calculate_features_for_dataframe.
        4. Get the dataframe at the root of the trie (for the target dataframe) and
            return the columns corresponding to the requested features.

        Args:
            instance_ids (np.ndarray or pd.Categorical): Instance ids for which
                to build features.

            progress_callback (callable): function to be called with incremental progress updates

            include_cutoff_time (bool): If True, data at the cutoff time is
                included when calculating features.

        Returns:
            pd.DataFrame : Pandas DataFrame of calculated feature values.
                Indexed by instance_ids. Columns in same order as features
                passed in.
        """
        assert len(instance_ids) > 0, "0 instance ids provided"

        if progress_callback is None:
            # do nothing for the progress callback if not provided
            def progress_callback(*args):
                pass

        feature_trie = self.feature_set.feature_trie

        df_trie = Trie(path_constructor=RelationshipPath)
        full_dataframe_trie = Trie(path_constructor=RelationshipPath)

        target_dataframe = self.entityset[self.feature_set.target_df_name]

        self._calculate_features_for_dataframe(
            dataframe_name=self.feature_set.target_df_name,
            feature_trie=feature_trie,
            df_trie=df_trie,
            full_dataframe_trie=full_dataframe_trie,
            precalculated_trie=self.precalculated_features,
            filter_column=target_dataframe.ww.index,
            filter_values=instance_ids,
            progress_callback=progress_callback,
            include_cutoff_time=include_cutoff_time,
        )

        # The dataframe for the target dataframe should be stored at the root of
        # df_trie.
        df = df_trie.value

        # Fill in empty rows with default values. This only works for pandas dataframes
        # and is not currently supported for Dask dataframes.
        if isinstance(df, pd.DataFrame):
            index_dtype = df.index.dtype.name
            if df.empty:
                return self.generate_default_df(instance_ids=instance_ids)

            missing_ids = [
                i for i in instance_ids
                if i not in df[target_dataframe.ww.index]
            ]
            if missing_ids:
                default_df = self.generate_default_df(instance_ids=missing_ids,
                                                      extra_columns=df.columns)

                df = default_df.append(df, sort=True)

            df.index.name = self.entityset[
                self.feature_set.target_df_name].ww.index

            # Order by instance_ids
            unique_instance_ids = pd.unique(instance_ids)
            unique_instance_ids = unique_instance_ids.astype(
                instance_ids.dtype)
            df = df.reindex(unique_instance_ids)

            # Keep categorical index if original index was categorical
            if index_dtype == "category":
                df.index = df.index.astype("category")

        column_list = []

        for feat in self.feature_set.target_features:
            column_list.extend(feat.get_feature_names())

        if is_instance(df, (dd, ps), "DataFrame"):
            column_list.extend([target_dataframe.ww.index])

        return df[column_list]