def test_no_finite_values_yields_0(self):
        # The column holds only NaN, +inf and -inf, i.e. no finite value at all.
        # np.nan / np.inf are spelled out because the np.NaN / np.PINF / np.NINF
        # aliases were removed in NumPy 2.0 and raise AttributeError there.
        df = pd.DataFrame([np.nan, np.inf, -np.inf], columns=["value"])

        col_to_max, col_to_min, col_to_median = dataframe_functions.get_range_values_per_column(df)

        # With nothing finite to take a range from, all three statistics are expected to be 0.
        self.assertEqual(col_to_max, {"value": 0})
        self.assertEqual(col_to_min, {"value": 0})
        self.assertEqual(col_to_median, {"value": 0})
    def test_ignores_non_finite_values(self):
        # Four finite values followed by NaN, +inf and -inf.
        # np.nan / np.inf are spelled out because the np.NaN / np.PINF / np.NINF
        # aliases were removed in NumPy 2.0 and raise AttributeError there.
        df = pd.DataFrame([0, 1, 2, 3, np.nan, np.inf, -np.inf], columns=["value"])

        col_to_max, col_to_min, col_to_median = dataframe_functions.get_range_values_per_column(df)

        # The expected statistics are those of the finite part [0, 1, 2, 3] only.
        self.assertEqual(col_to_max, {"value": 3})
        self.assertEqual(col_to_min, {"value": 0})
        self.assertEqual(col_to_median, {"value": 1.5})
    def test_range_values_correct_with_uneven_length(self):
        # Three values, so the median is the middle element itself.
        frame = pd.DataFrame({"value": [0, 1, 2]})

        maxima, minima, medians = dataframe_functions.get_range_values_per_column(frame)

        self.assertEqual(maxima, {"value": 2})
        self.assertEqual(minima, {"value": 0})
        self.assertEqual(medians, {"value": 1})
    def test_range_values_correct_with_uneven_length(self):
        # Odd number of rows: the median coincides with the middle value.
        frame = pd.DataFrame({"value": [0, 1, 2]})

        ranges = dataframe_functions.get_range_values_per_column(frame)
        maxima, minima, medians = ranges

        self.assertEqual(maxima, {"value": 2})
        self.assertEqual(minima, {"value": 0})
        self.assertEqual(medians, {"value": 1})
    def test_no_finite_values_yields_0(self):
        # The column holds only NaN, +inf and -inf, i.e. no finite value at all.
        # np.nan / np.inf are spelled out because the np.NaN / np.PINF / np.NINF
        # aliases were removed in NumPy 2.0 and raise AttributeError there.
        df = pd.DataFrame([np.nan, np.inf, -np.inf], columns=["value"])

        col_to_max, col_to_min, col_to_median = dataframe_functions.get_range_values_per_column(df)

        # With nothing finite to take a range from, all three statistics are expected to be 0.
        self.assertEqual(col_to_max, {"value": 0})
        self.assertEqual(col_to_min, {"value": 0})
        self.assertEqual(col_to_median, {"value": 0})
    def test_ignores_non_finite_values(self):
        # Four finite values followed by NaN, +inf and -inf.
        # np.nan / np.inf are spelled out because the np.NaN / np.PINF / np.NINF
        # aliases were removed in NumPy 2.0 and raise AttributeError there.
        df = pd.DataFrame([0, 1, 2, 3, np.nan, np.inf, -np.inf], columns=["value"])

        col_to_max, col_to_min, col_to_median = dataframe_functions.get_range_values_per_column(df)

        # The expected statistics are those of the finite part [0, 1, 2, 3] only.
        self.assertEqual(col_to_max, {"value": 3})
        self.assertEqual(col_to_min, {"value": 0})
        self.assertEqual(col_to_median, {"value": 1.5})
Exemple #7
0
    def test_no_finite_values_yields_0(self):
        # The column holds only NaN, +inf and -inf, i.e. no finite value at all.
        # np.nan / np.inf are spelled out because the np.NaN / np.PINF / np.NINF
        # aliases were removed in NumPy 2.0 and raise AttributeError there.
        df = pd.DataFrame([np.nan, np.inf, -np.inf], columns=["value"])

        # Record warnings so that the "no finite values" notice can be checked as well.
        with warnings.catch_warnings(record=True) as w:
            col_to_max, col_to_min, col_to_median = dataframe_functions.get_range_values_per_column(df)

            # Exactly one warning is expected, naming the affected column.
            self.assertEqual(len(w), 1)
            self.assertEqual(str(w[0].message),
                             "The columns ['value'] did not have any finite values. Filling with zeros.")

        # With nothing finite to take a range from, all three statistics are expected to be 0.
        self.assertEqual(col_to_max, {"value": 0})
        self.assertEqual(col_to_min, {"value": 0})
        self.assertEqual(col_to_median, {"value": 0})
Exemple #8
0
    def fit(self, X, y):
        """
        Extract features from the stored time series container and fit the feature selector on them.

        The container set through :func:`~set_timeseries_container` is handed to the feature extractor, whose
        ``transform`` yields the augmented data sample that the feature selector is fitted on together with y.

        If evaluate_only_added_features is True, the extractor is given an empty DataFrame that only carries the
        index of X, so the time series features are not merged with the features already present in X. If it is
        False, X itself is passed to the extractor.

        If the extractor's ``settings.IMPUTE`` is the plain ``impute`` function, it is replaced by
        ``impute_dataframe_range`` bound to the per-column max, min and median of the augmented sample.

        :param X: The data frame without the time series features. The index rows should be present in the timeseries
           and in the target vector.
        :type X: pandas.DataFrame or numpy.array

        :param y: The target vector used by the feature selector.
        :type y: pandas.Series or numpy.array

        :return: the fitted estimator (``self``).
        :rtype: RelevantFeatureAugmenter

        :raises RuntimeError: if no time series container has been set.
        """
        if self.timeseries_container is None:
            raise RuntimeError(
                "You have to provide a time series using the set_timeseries_container function before."
            )

        extractor = self.feature_extractor
        extractor.set_timeseries_container(self.timeseries_container)

        # An index-only frame keeps the time series features from being merged with the old features
        extractor_input = pd.DataFrame(index=X.index) if self.evaluate_only_added_features else X
        X_augmented = extractor.transform(extractor_input)

        settings = extractor.settings
        if settings.IMPUTE is impute:
            # Pin the imputation to the value ranges seen in this augmented sample
            maxima, minima, medians = get_range_values_per_column(X_augmented)
            settings.IMPUTE = partial(
                impute_dataframe_range,
                col_to_max=maxima,
                col_to_min=minima,
                col_to_median=medians,
            )

        self.feature_selector.fit(X_augmented, y)
        return self
Exemple #9
0
    def fit(self, X, y=None):
        """
        Compute the replacement values for -inf, +inf and NaN for all columns in the DataFrame.

        The min, max and median per column come from the
        :func:`~tsfresh.utilities.dataframe_functions.get_range_values_per_column` function. Entries of the preset
        dictionaries ``col_to_NINF_repl_preset``, ``col_to_PINF_repl_preset`` and ``col_to_NAN_repl_preset``
        (when they are not None) overwrite the computed min, max and median respectively.

        :param X: DataFrame to calculate min, max and median values on; any other input is wrapped in a DataFrame
        :type X: pandas.DataFrame
        :param y: Unneeded.
        :type y: Any

        :return: the estimator with the computed min, max and median values
        :rtype: Imputer

        :raises ValueError: if a preset dictionary has a key that is not a column name of X
        """
        frame = X if isinstance(X, pd.DataFrame) else pd.DataFrame(X)

        maxima, minima, medians = get_range_values_per_column(frame)

        # -inf replacements: column minima, overridden by the preset where given
        ninf_preset = self.col_to_NINF_repl_preset
        if ninf_preset is not None:
            if not set(ninf_preset.keys()) <= set(frame.columns):
                raise ValueError(
                    "Preset dictionary 'col_to_NINF_repl_preset' contain more keys "
                    "than the column names in X")
            minima.update(ninf_preset)
        self._col_to_NINF_repl = minima

        # +inf replacements: column maxima, overridden by the preset where given
        pinf_preset = self.col_to_PINF_repl_preset
        if pinf_preset is not None:
            if not set(pinf_preset.keys()) <= set(frame.columns):
                raise ValueError(
                    "Preset dictionary 'col_to_PINF_repl_preset' contain more keys "
                    "than the column names in X")
            maxima.update(pinf_preset)
        self._col_to_PINF_repl = maxima

        # NaN replacements: column medians, overridden by the preset where given
        nan_preset = self.col_to_NAN_repl_preset
        if nan_preset is not None:
            if not set(nan_preset.keys()) <= set(frame.columns):
                raise ValueError(
                    "Preset dictionary 'col_to_NAN_repl_preset' contain more keys "
                    "than the column names in X")
            medians.update(nan_preset)
        self._col_to_NAN_repl = medians

        return self
    def fit(self, X, y):
        """
        Extract features from the stored time series container, impute them and fit the feature selector.

        The container set through :func:`~set_timeseries_container` is handed to the feature extractor, whose
        ``transform`` yields the augmented data sample. The per-column max, min and median of that sample are stored
        on the estimator (``col_to_max``, ``col_to_min``, ``col_to_median``) and used to impute it with
        ``impute_dataframe_range`` before the feature selector is fitted on it together with y.

        If filter_only_tsfresh_features is True, the extractor is given an empty DataFrame that only carries the
        index of X, so the time series features are not merged with the features already present in X. If it is
        False, X itself is passed to the extractor.

        :param X: The data frame without the time series features. The index rows should be present in the timeseries
           and in the target vector.
        :type X: pandas.DataFrame or numpy.array

        :param y: The target vector used by the feature selector.
        :type y: pandas.Series or numpy.array

        :return: the fitted estimator (``self``).
        :rtype: RelevantFeatureAugmenter

        :raises RuntimeError: if no time series container has been set.
        """
        if self.timeseries_container is None:
            raise RuntimeError("You have to provide a time series using the set_timeseries_container function before.")

        extractor = self.feature_extractor
        extractor.set_timeseries_container(self.timeseries_container)

        # An index-only frame keeps the time series features from being merged with the old features
        extractor_input = pd.DataFrame(index=X.index) if self.filter_only_tsfresh_features else X
        X_augmented = extractor.transform(extractor_input)

        # Keep the observed ranges on the estimator and impute the sample with them
        maxima, minima, medians = get_range_values_per_column(X_augmented)
        self.col_to_max = maxima
        self.col_to_min = minima
        self.col_to_median = medians
        X_augmented = impute_dataframe_range(
            X_augmented,
            col_to_max=maxima,
            col_to_median=medians,
            col_to_min=minima,
        )

        self.feature_selector.fit(X_augmented, y)
        return self
        def fit(self, X, y=None):
            """
            Compute the replacement values for -inf, +inf and NaN for all columns in the DataFrame.

            The min, max and median per column come from the
            :func:`~tsfresh.utilities.dataframe_functions.get_range_values_per_column` function. Entries of the
            preset dictionaries ``col_to_NINF_repl_preset``, ``col_to_PINF_repl_preset`` and
            ``col_to_NAN_repl_preset`` (when they are not None) overwrite the computed min, max and median
            respectively.

            :param X: DataFrame to calculate min, max and median values on; any other input is wrapped in a DataFrame
            :type X: pandas.DataFrame
            :param y: Unneeded.
            :type y: Any

            :return: the estimator with the computed min, max and median values
            :rtype: Imputer

            :raises ValueError: if a preset dictionary has a key that is not a column name of X
            """
            frame = X if isinstance(X, pd.DataFrame) else pd.DataFrame(X)

            maxima, minima, medians = get_range_values_per_column(frame)

            # -inf replacements: column minima, overridden by the preset where given
            ninf_preset = self.col_to_NINF_repl_preset
            if ninf_preset is not None:
                if not set(ninf_preset.keys()) <= set(frame.columns):
                    raise ValueError("Preset dictionary 'col_to_NINF_repl_preset' contain more keys "
                                     "than the column names in X")
                minima.update(ninf_preset)
            self._col_to_NINF_repl = minima

            # +inf replacements: column maxima, overridden by the preset where given
            pinf_preset = self.col_to_PINF_repl_preset
            if pinf_preset is not None:
                if not set(pinf_preset.keys()) <= set(frame.columns):
                    raise ValueError("Preset dictionary 'col_to_PINF_repl_preset' contain more keys "
                                     "than the column names in X")
                maxima.update(pinf_preset)
            self._col_to_PINF_repl = maxima

            # NaN replacements: column medians, overridden by the preset where given
            nan_preset = self.col_to_NAN_repl_preset
            if nan_preset is not None:
                if not set(nan_preset.keys()) <= set(frame.columns):
                    raise ValueError("Preset dictionary 'col_to_NAN_repl_preset' contain more keys "
                                     "than the column names in X")
                medians.update(nan_preset)
            self._col_to_NAN_repl = medians

            return self
    def fit(self, X, y):
        """
        Build the feature extractor and selector, extract and impute the features, and fit the selector.

        A ``FeatureAugmenter`` and a ``FeatureSelector`` are created from the estimator's own parameters and stored
        as ``feature_extractor`` and ``feature_selector``. The extractor's ``transform`` yields the augmented data
        sample; its per-column max, min and median are stored on the estimator (``col_to_max``, ``col_to_min``,
        ``col_to_median``) and used to impute it with ``impute_dataframe_range`` before the selector is fitted on
        it together with y.

        If filter_only_tsfresh_features is True, the extractor is given an empty DataFrame that only carries the
        index of X, so the time series features are not merged with the features already present in X. If it is
        False, X itself is passed to the extractor.

        :param X: The data frame without the time series features. The index rows should be present in the timeseries
           and in the target vector.
        :type X: pandas.DataFrame or numpy.array

        :param y: The target vector used by the feature selector.
        :type y: pandas.Series or numpy.array

        :return: the fitted estimator (``self``).
        :rtype: RelevantFeatureAugmenter

        :raises RuntimeError: if no time series container has been set.
        """
        if self.timeseries_container is None:
            raise RuntimeError(
                "You have to provide a time series using the set_timeseries_container function before."
            )

        # Every keyword below is forwarded from the attribute of the same name on this estimator
        extractor_params = (
            "default_fc_parameters",
            "kind_to_fc_parameters",
            "column_id",
            "column_sort",
            "column_kind",
            "column_value",
            "timeseries_container",
            "chunksize",
            "n_jobs",
            "show_warnings",
            "disable_progressbar",
            "profile",
            "profiling_filename",
            "profiling_sorting",
        )
        self.feature_extractor = FeatureAugmenter(
            **{param: getattr(self, param) for param in extractor_params}
        )

        selector_params = (
            "test_for_binary_target_binary_feature",
            "test_for_binary_target_real_feature",
            "test_for_real_target_binary_feature",
            "test_for_real_target_real_feature",
            "fdr_level",
            "hypotheses_independent",
            "n_jobs",
            "chunksize",
            "ml_task",
        )
        self.feature_selector = FeatureSelector(
            **{param: getattr(self, param) for param in selector_params}
        )

        # An index-only frame keeps the time series features from being merged with the old features
        extractor_input = pd.DataFrame(index=X.index) if self.filter_only_tsfresh_features else X
        X_augmented = self.feature_extractor.transform(extractor_input)

        # Keep the observed ranges on the estimator and impute the sample with them
        maxima, minima, medians = get_range_values_per_column(X_augmented)
        self.col_to_max = maxima
        self.col_to_min = minima
        self.col_to_median = medians
        X_augmented = impute_dataframe_range(
            X_augmented,
            col_to_max=maxima,
            col_to_median=medians,
            col_to_min=minima,
        )

        self.feature_selector.fit(X_augmented, y)
        return self
Exemple #13
0
    def _fit_and_augment(self, X, y):
        """
        Helper for the :func:`~fit` and :func:`~fit_transform` functions: build the feature extractor and selector,
        extract and impute the features, fit the selector and hand back the augmented data sample.

        A ``FeatureAugmenter`` and a ``FeatureSelector`` are created from the estimator's own parameters and stored
        as ``feature_extractor`` and ``feature_selector``. The per-column max, min and median of the augmented
        sample are stored on the estimator (``col_to_max``, ``col_to_min``, ``col_to_median``) and used to impute it
        with ``impute_dataframe_range`` before the selector is fitted on it together with y.

        :param X: The data frame without the time series features. The index rows should be present in the timeseries
           and in the target vector.
        :type X: pandas.DataFrame or numpy.array

        :param y: The target vector used by the feature selector.
        :type y: pandas.Series or numpy.array

        :return: a data sample with the extracted (and imputed) time series features. If
            filter_only_tsfresh_features is False, X itself is passed to the feature extractor instead of an
            index-only frame.
        :rtype: pandas.DataFrame

        :raises RuntimeError: if no time series container has been set.
        """
        if self.timeseries_container is None:
            raise RuntimeError(
                "You have to provide a time series using the set_timeseries_container function before."
            )

        self.feature_extractor = FeatureAugmenter(
            default_fc_parameters=self.default_fc_parameters,
            kind_to_fc_parameters=self.kind_to_fc_parameters,
            column_id=self.column_id,
            column_sort=self.column_sort,
            column_kind=self.column_kind,
            column_value=self.column_value,
            timeseries_container=self.timeseries_container,
            chunksize=self.chunksize,
            n_jobs=self.n_jobs,
            show_warnings=self.show_warnings,
            disable_progressbar=self.disable_progressbar,
            profile=self.profile,
            profiling_filename=self.profiling_filename,
            profiling_sorting=self.profiling_sorting,
        )

        self.feature_selector = FeatureSelector(
            test_for_binary_target_binary_feature=self.test_for_binary_target_binary_feature,
            test_for_binary_target_real_feature=self.test_for_binary_target_real_feature,
            test_for_real_target_binary_feature=self.test_for_real_target_binary_feature,
            test_for_real_target_real_feature=self.test_for_real_target_real_feature,
            fdr_level=self.fdr_level,
            hypotheses_independent=self.hypotheses_independent,
            n_jobs=self.n_jobs,
            chunksize=self.chunksize,
            ml_task=self.ml_task,
            multiclass=self.multiclass,
            n_significant=self.n_significant,
            multiclass_p_values=self.multiclass_p_values,
        )

        # An index-only frame keeps the time series features from being merged with the old features
        extractor_input = pd.DataFrame(index=X.index) if self.filter_only_tsfresh_features else X
        X_augmented = self.feature_extractor.transform(extractor_input)

        # Keep the observed ranges on the estimator and impute the sample with them
        maxima, minima, medians = get_range_values_per_column(X_augmented)
        self.col_to_max = maxima
        self.col_to_min = minima
        self.col_to_median = medians
        X_augmented = impute_dataframe_range(
            X_augmented,
            col_to_max=maxima,
            col_to_median=medians,
            col_to_min=minima,
        )

        self.feature_selector.fit(X_augmented, y)
        return X_augmented