Ejemplo n.º 1
0
    def test_restrict_dataframe(self):
        """Check that a single DataFrame is filtered down to the requested ids."""
        frame = pd.DataFrame({'id': [1, 2, 3] * 2})

        # Every id occurs twice in the frame, so selecting id 2 must keep two rows.
        only_twos = dataframe_functions.restrict_input_to_index(frame, 'id', [2])
        self.assertEqual(list(only_twos.id), [2, 2])

        # Selecting all ids that are present must yield a frame equal to the input.
        all_ids = dataframe_functions.restrict_input_to_index(frame, 'id', [1, 2, 3])
        self.assertTrue(all_ids.equals(frame))
Ejemplo n.º 2
0
    def test_restrict_dataframe(self):
        """Restricting a plain DataFrame keeps only rows whose id was requested."""
        source = pd.DataFrame({'id': [1, 2, 3] * 2})

        # Partial selection: only the two rows carrying id 2 are expected.
        subset = dataframe_functions.restrict_input_to_index(source, 'id', [2])
        self.assertEqual(list(subset.id), [2, 2])

        # Full selection: nothing may be dropped or changed.
        full = dataframe_functions.restrict_input_to_index(source, 'id', [1, 2, 3])
        self.assertTrue(full.equals(source))
Ejemplo n.º 3
0
    def test_restrict_dict(self):
        """Check that every DataFrame inside a kind -> DataFrame dict is filtered."""
        frames_by_kind = {
            'a': pd.DataFrame({'id': [1, 2, 3]}),
            'b': pd.DataFrame({'id': [3, 4, 5]}),
        }

        # id 3 is the only id present in both kinds.
        only_three = dataframe_functions.restrict_input_to_index(frames_by_kind, 'id', [3])
        for kind in ('a', 'b'):
            self.assertEqual(list(only_three[kind].id), [3])

        # Requesting every id must leave each per-kind frame untouched.
        unrestricted = dataframe_functions.restrict_input_to_index(frames_by_kind, 'id', [1, 2, 3, 4, 5])
        for kind in ('a', 'b'):
            self.assertTrue(unrestricted[kind].equals(frames_by_kind[kind]))
Ejemplo n.º 4
0
    def test_restrict_dict(self):
        """Restricting a dict of DataFrames filters each contained frame by id."""
        frame_a = pd.DataFrame({'id': [1, 2, 3]})
        frame_b = pd.DataFrame({'id': [3, 4, 5]})
        container = {'a': frame_a, 'b': frame_b}

        # Partial selection: both frames contain id 3 exactly once.
        narrowed = dataframe_functions.restrict_input_to_index(container, 'id', [3])
        self.assertEqual(list(narrowed['a'].id), [3])
        self.assertEqual(list(narrowed['b'].id), [3])

        # Full selection: each frame must compare equal to its original.
        complete = dataframe_functions.restrict_input_to_index(container, 'id', [1, 2, 3, 4, 5])
        self.assertTrue(complete['a'].equals(container['a']))
        self.assertTrue(complete['b'].equals(container['b']))
Ejemplo n.º 5
0
def extract_relevant_features(timeseries_container,
                              y,
                              X=None,
                              feature_extraction_settings=None,
                              feature_selection_settings=None,
                              column_id=None,
                              column_sort=None,
                              column_kind=None,
                              column_value=None):
    """
    Extract time series features from `timeseries_container`, keep those selected with respect to the
    target vector `y` and return them, optionally joined onto an existing feature matrix `X`.

    This is a convenience wrapper chaining :func:`~tsfresh.feature_extraction.extraction.extract_features` and
    :func:`~tsfresh.feature_selection.selection.select_features`; see their documentation for details.

    Examples
    ========

    >>> from tsfresh.examples import load_robot_execution_failures
    >>> from tsfresh import extract_relevant_features
    >>> df, y = load_robot_execution_failures()
    >>> X = extract_relevant_features(df, y, column_id='id', column_sort='time')

    :param timeseries_container: See parameter `timeseries_container` in :func:`~tsfresh.feature_extraction.extraction.extract_features`
    :param y: See parameter `y` in :func:`~tsfresh.feature_selection.selection.select_features`
    :param X: Optional feature matrix. If given, the time series are restricted to the ids in `X.index` and the
              selected features are left-merged onto `X` by index.
    :param feature_extraction_settings: See parameter `feature_extraction_settings` in :func:`~tsfresh.feature_extraction.extraction.extract_features`.
              If None, a fresh `FeatureExtractionSettings` with `IMPUTE` set to `impute` is used.
    :param feature_selection_settings: See parameter `feature_selection_settings` in :func:`~tsfresh.feature_selection.selection.select_features`
    :param column_id: See parameter `column_id` in :func:`~tsfresh.feature_extraction.extraction.extract_features`
    :param column_sort: See parameter `column_sort` in :func:`~tsfresh.feature_extraction.extraction.extract_features`
    :param column_kind: See parameter `column_kind` in :func:`~tsfresh.feature_extraction.extraction.extract_features`
    :param column_value: See parameter `column_value` in :func:`~tsfresh.feature_extraction.extraction.extract_features`

    :return: Feature matrix X, possibly extended with relevant time series features.
    """
    has_base_matrix = X is not None

    # Only the ids that appear in X can end up in the merged result, so skip the rest.
    if has_base_matrix:
        timeseries_container = restrict_input_to_index(timeseries_container, column_id, X.index)

    # Without user supplied settings, fall back to defaults that impute the extracted features.
    if feature_extraction_settings is None:
        feature_extraction_settings = FeatureExtractionSettings()
        feature_extraction_settings.IMPUTE = impute

    all_features = extract_features(timeseries_container,
                                    feature_extraction_settings=feature_extraction_settings,
                                    column_id=column_id,
                                    column_sort=column_sort,
                                    column_kind=column_kind,
                                    column_value=column_value)
    relevant_features = select_features(all_features, y,
                                        feature_selection_settings=feature_selection_settings)

    if not has_base_matrix:
        return relevant_features

    # Left join: every row of X is kept, the selected features are attached by index.
    return pd.merge(X, relevant_features, left_index=True, right_index=True, how="left")
Ejemplo n.º 6
0
    def transform(self, X):
        """
        Calculate features from the stored timeseries_container and attach them to the matching rows of the
        input pandas.DataFrame X.

        To save computing time, the container should only hold the time series that are actually needed.
        The container is set with the method :func:`set_timeseries_container`.

        :param X: the DataFrame to which the calculated timeseries features will be added. This is *not* the
               dataframe with the timeseries itself.
        :type X: pandas.DataFrame

        :return: The input DataFrame, but with added features.
        :rtype: pandas.DataFrame

        :raise RuntimeError: if no timeseries container has been set.
        """
        if self.timeseries_container is None:
            raise RuntimeError(
                "You have to provide a time series using the set_timeseries_container function before."
            )

        # Features are only needed for the ids that appear in X.index.
        relevant_series = restrict_input_to_index(self.timeseries_container, self.column_id, X.index)

        extraction_options = dict(
            default_fc_parameters=self.default_fc_parameters,
            kind_to_fc_parameters=self.kind_to_fc_parameters,
            column_id=self.column_id,
            column_sort=self.column_sort,
            column_kind=self.column_kind,
            column_value=self.column_value,
            chunksize=self.chunksize,
            n_jobs=self.n_jobs,
            show_warnings=self.show_warnings,
            disable_progressbar=self.disable_progressbar,
            impute_function=self.impute_function,
            profile=self.profile,
            profiling_filename=self.profiling_filename,
            profiling_sorting=self.profiling_sorting,
        )
        features = extract_features(relevant_series, **extraction_options)

        # Left join on the index keeps every row of X.
        return pd.merge(X, features, left_index=True, right_index=True, how="left")
Ejemplo n.º 7
0
    def transform(self, X):
        """
        Calculate features from the stored timeseries_container and attach them to the matching rows of the
        input pandas.DataFrame X.

        To save computing time, the container should only hold the time series that are actually needed.
        The container is set with the method :func:`set_timeseries_container`.

        :param X: the DataFrame to which the calculated timeseries features will be added. This is *not* the
               dataframe with the timeseries itself.
        :type X: pandas.DataFrame

        :return: The input DataFrame, but with added features.
        :rtype: pandas.DataFrame

        :raise RuntimeError: if no timeseries container has been set.
        """
        if self.timeseries_container is None:
            raise RuntimeError(
                "You have to provide a time series using the set_timeseries_container function before."
            )

        # Features are only needed for the ids that appear in X.index.
        relevant_series = restrict_input_to_index(self.timeseries_container, self.column_id, X.index)

        features = extract_features(relevant_series,
                                    feature_extraction_settings=self.settings,
                                    column_id=self.column_id,
                                    column_sort=self.column_sort,
                                    column_kind=self.column_kind,
                                    column_value=self.column_value)

        # Left join on the index keeps every row of X.
        return pd.merge(X, features, left_index=True, right_index=True, how="left")
Ejemplo n.º 8
0
    def transform(self, X):
        """
        Extend the input pandas.DataFrame X with features computed from the stored timeseries_container.

        Only include the time series you really need in the container, as this saves computing time.
        The container is provided through the method :func:`set_timeseries_container`.

        :param X: the DataFrame to which the calculated timeseries features will be added. This is *not* the
               dataframe with the timeseries itself.
        :type X: pandas.DataFrame

        :return: The input DataFrame, but with added features.
        :rtype: pandas.DataFrame

        :raise RuntimeError: if no timeseries container has been set.
        """
        container = self.timeseries_container
        if container is None:
            raise RuntimeError("You have to provide a time series using the set_timeseries_container function before.")

        # Drop all time series whose id does not occur in X.index before extracting.
        container_for_X = restrict_input_to_index(container, self.column_id, X.index)

        new_features = extract_features(
            container_for_X,
            default_fc_parameters=self.default_fc_parameters,
            kind_to_fc_parameters=self.kind_to_fc_parameters,
            column_id=self.column_id,
            column_sort=self.column_sort,
            column_kind=self.column_kind,
            column_value=self.column_value,
            chunksize=self.chunksize,
            n_jobs=self.n_jobs,
            show_warnings=self.show_warnings,
            disable_progressbar=self.disable_progressbar,
            impute_function=self.impute_function,
            profile=self.profile,
            profiling_filename=self.profiling_filename,
            profiling_sorting=self.profiling_sorting,
        )

        # Left join by index: all rows of X survive.
        merged = pd.merge(X, new_features, left_index=True, right_index=True, how="left")
        return merged
Ejemplo n.º 9
0
def extract_relevant_features(
        timeseries_container,
        y,
        X=None,
        default_fc_parameters=None,
        kind_to_fc_parameters=None,
        column_id=None,
        column_sort=None,
        column_kind=None,
        column_value=None,
        show_warnings=defaults.SHOW_WARNINGS,
        disable_progressbar=defaults.DISABLE_PROGRESSBAR,
        profile=defaults.PROFILING,
        profiling_filename=defaults.PROFILING_FILENAME,
        profiling_sorting=defaults.PROFILING_SORTING,
        test_for_binary_target_binary_feature=defaults.TEST_FOR_BINARY_TARGET_BINARY_FEATURE,
        test_for_binary_target_real_feature=defaults.TEST_FOR_BINARY_TARGET_REAL_FEATURE,
        test_for_real_target_binary_feature=defaults.TEST_FOR_REAL_TARGET_BINARY_FEATURE,
        test_for_real_target_real_feature=defaults.TEST_FOR_REAL_TARGET_REAL_FEATURE,
        fdr_level=defaults.FDR_LEVEL,
        hypotheses_independent=defaults.HYPOTHESES_INDEPENDENT,
        n_jobs=defaults.N_PROCESSES,
        chunksize=defaults.CHUNKSIZE,
        ml_task='auto'):
    """
    High level convenience function to extract time series features from `timeseries_container`. Then return feature
    matrix `X` possibly augmented with relevant features with respect to target vector `y`.

    For more details see the documentation of :func:`~tsfresh.feature_extraction.extraction.extract_features` and
    :func:`~tsfresh.feature_selection.selection.select_features`.

    Examples
    ========

    >>> from tsfresh.examples import load_robot_execution_failures
    >>> from tsfresh import extract_relevant_features
    >>> df, y = load_robot_execution_failures()
    >>> X = extract_relevant_features(df, y, column_id='id', column_sort='time')

    :param timeseries_container: The pandas.DataFrame with the time series to compute the features for, or a
            dictionary of pandas.DataFrames.
            See :func:`~tsfresh.feature_extraction.extraction.extract_features`.

    :param X: A DataFrame containing additional features. If given, the time series are restricted to the ids
            in `X.index` and the selected features are left-merged onto `X` by index.
    :type X: pandas.DataFrame

    :param y: The target vector
    :type y: pandas.Series

    :param default_fc_parameters: mapping from feature calculator names to parameters. Only those names
           which are keys in this dict will be calculated. See the class `ComprehensiveFCParameters` for
           more information.
    :type default_fc_parameters: dict

    :param kind_to_fc_parameters: mapping from kind names to objects of the same type as the ones for
            default_fc_parameters. If you put a kind as a key here, the fc_parameters
            object (which is the value), will be used instead of the default_fc_parameters.
    :type kind_to_fc_parameters: dict

    :param column_id: The name of the id column to group by.
    :type column_id: str

    :param column_sort: The name of the sort column.
    :type column_sort: str

    :param column_kind: The name of the column keeping record on the kind of the value.
    :type column_kind: str

    :param column_value: The name for the column keeping the value itself.
    :type column_value: str

    :param chunksize: The size of one chunk for the parallelisation. It is passed on to both the feature
            extraction and the feature selection.
    :type chunksize: None or int

    :param n_jobs: The number of processes to use for parallelization. If zero, no parallelization is used.
    :type n_jobs: int

    :param show_warnings: Show warnings during the feature extraction (needed for debugging of calculators).
    :type show_warnings: bool

    :param disable_progressbar: Do not show a progressbar while doing the calculation.
    :type disable_progressbar: bool

    :param profile: Turn on profiling during feature extraction
    :type profile: bool

    :param profiling_sorting: How to sort the profiling results (see the documentation of the profiling package for
           more information)
    :type profiling_sorting: basestring

    :param profiling_filename: Where to save the profiling results.
    :type profiling_filename: basestring

    :param test_for_binary_target_binary_feature: Which test to be used for binary target, binary feature (currently unused)
    :type test_for_binary_target_binary_feature: str

    :param test_for_binary_target_real_feature: Which test to be used for binary target, real feature
    :type test_for_binary_target_real_feature: str

    :param test_for_real_target_binary_feature: Which test to be used for real target, binary feature (currently unused)
    :type test_for_real_target_binary_feature: str

    :param test_for_real_target_real_feature: Which test to be used for real target, real feature (currently unused)
    :type test_for_real_target_real_feature: str

    :param fdr_level: The FDR level that should be respected, this is the theoretical expected percentage of irrelevant
                      features among all created features.
    :type fdr_level: float

    :param hypotheses_independent: Can the significance of the features be assumed to be independent?
                                   Normally, this should be set to False as the features are never
                                   independent (e.g. mean and median)
    :type hypotheses_independent: bool

    :param ml_task: The intended machine learning task. Either `'classification'`, `'regression'` or `'auto'`.
                    Defaults to `'auto'`, meaning the intended task is inferred from `y`.
                    If `y` has a boolean, integer or object dtype, the task is assumed to be classification,
                    else regression.
    :type ml_task: str

    :return: Feature matrix X, possibly extended with relevant time series features.
    """
    # Only ids that appear in X can be merged back, so do not extract features for the others.
    if X is not None:
        timeseries_container = restrict_input_to_index(timeseries_container,
                                                       column_id, X.index)

    # chunksize is forwarded here as well as to select_features below, matching how n_jobs
    # is handled; previously the extraction step silently ignored the caller's value.
    X_ext = extract_features(timeseries_container,
                             default_fc_parameters=default_fc_parameters,
                             kind_to_fc_parameters=kind_to_fc_parameters,
                             show_warnings=show_warnings,
                             disable_progressbar=disable_progressbar,
                             profile=profile,
                             profiling_filename=profiling_filename,
                             profiling_sorting=profiling_sorting,
                             n_jobs=n_jobs,
                             chunksize=chunksize,
                             column_id=column_id,
                             column_sort=column_sort,
                             column_kind=column_kind,
                             column_value=column_value,
                             impute_function=impute)

    X_sel = select_features(
        X_ext,
        y,
        test_for_binary_target_binary_feature=test_for_binary_target_binary_feature,
        test_for_binary_target_real_feature=test_for_binary_target_real_feature,
        test_for_real_target_binary_feature=test_for_real_target_binary_feature,
        test_for_real_target_real_feature=test_for_real_target_real_feature,
        fdr_level=fdr_level,
        hypotheses_independent=hypotheses_independent,
        n_jobs=n_jobs,
        chunksize=chunksize,
        ml_task=ml_task)

    if X is None:
        return X_sel

    # Left join by index keeps every row of the caller's X.
    return pd.merge(X, X_sel, left_index=True, right_index=True, how="left")
Ejemplo n.º 10
0
def extract_relevant_features(timeseries_container, y, X=None,
                              default_fc_parameters=None,
                              kind_to_fc_parameters=None,
                              column_id=None, column_sort=None, column_kind=None, column_value=None,
                              show_warnings=defaults.SHOW_WARNINGS,
                              disable_progressbar=defaults.DISABLE_PROGRESSBAR,
                              profile=defaults.PROFILING,
                              profiling_filename=defaults.PROFILING_FILENAME,
                              profiling_sorting=defaults.PROFILING_SORTING,
                              test_for_binary_target_binary_feature=defaults.TEST_FOR_BINARY_TARGET_BINARY_FEATURE,
                              test_for_binary_target_real_feature=defaults.TEST_FOR_BINARY_TARGET_REAL_FEATURE,
                              test_for_real_target_binary_feature=defaults.TEST_FOR_REAL_TARGET_BINARY_FEATURE,
                              test_for_real_target_real_feature=defaults.TEST_FOR_REAL_TARGET_REAL_FEATURE,
                              fdr_level=defaults.FDR_LEVEL,
                              hypotheses_independent=defaults.HYPOTHESES_INDEPENDENT,
                              n_jobs=defaults.N_PROCESSES,
                              chunksize=defaults.CHUNKSIZE,
                              ml_task='auto'):
    """
    High level convenience function to extract time series features from `timeseries_container`. Then return feature
    matrix `X` possibly augmented with relevant features with respect to target vector `y`.

    For more details see the documentation of :func:`~tsfresh.feature_extraction.extraction.extract_features` and
    :func:`~tsfresh.feature_selection.selection.select_features`.

    Examples
    ========

    >>> from tsfresh.examples import load_robot_execution_failures
    >>> from tsfresh import extract_relevant_features
    >>> df, y = load_robot_execution_failures()
    >>> X = extract_relevant_features(df, y, column_id='id', column_sort='time')

    :param timeseries_container: The pandas.DataFrame with the time series to compute the features for, or a
            dictionary of pandas.DataFrames.
            See :func:`~tsfresh.feature_extraction.extraction.extract_features`.

    :param X: A DataFrame containing additional features. If given, the time series are restricted to the ids
            in `X.index` and the selected features are left-merged onto `X` by index.
    :type X: pandas.DataFrame

    :param y: The target vector
    :type y: pandas.Series

    :param default_fc_parameters: mapping from feature calculator names to parameters. Only those names
           which are keys in this dict will be calculated. See the class `ComprehensiveFCParameters` for
           more information.
    :type default_fc_parameters: dict

    :param kind_to_fc_parameters: mapping from kind names to objects of the same type as the ones for
            default_fc_parameters. If you put a kind as a key here, the fc_parameters
            object (which is the value), will be used instead of the default_fc_parameters.
    :type kind_to_fc_parameters: dict

    :param column_id: The name of the id column to group by.
    :type column_id: str

    :param column_sort: The name of the sort column.
    :type column_sort: str

    :param column_kind: The name of the column keeping record on the kind of the value.
    :type column_kind: str

    :param column_value: The name for the column keeping the value itself.
    :type column_value: str

    :param chunksize: The size of one chunk for the parallelisation. It is passed on to both the feature
            extraction and the feature selection.
    :type chunksize: None or int

    :param n_jobs: The number of processes to use for parallelization. If zero, no parallelization is used.
    :type n_jobs: int

    :param show_warnings: Show warnings during the feature extraction (needed for debugging of calculators).
    :type show_warnings: bool

    :param disable_progressbar: Do not show a progressbar while doing the calculation.
    :type disable_progressbar: bool

    :param profile: Turn on profiling during feature extraction
    :type profile: bool

    :param profiling_sorting: How to sort the profiling results (see the documentation of the profiling package for
           more information)
    :type profiling_sorting: basestring

    :param profiling_filename: Where to save the profiling results.
    :type profiling_filename: basestring

    :param test_for_binary_target_binary_feature: Which test to be used for binary target, binary feature (currently unused)
    :type test_for_binary_target_binary_feature: str

    :param test_for_binary_target_real_feature: Which test to be used for binary target, real feature
    :type test_for_binary_target_real_feature: str

    :param test_for_real_target_binary_feature: Which test to be used for real target, binary feature (currently unused)
    :type test_for_real_target_binary_feature: str

    :param test_for_real_target_real_feature: Which test to be used for real target, real feature (currently unused)
    :type test_for_real_target_real_feature: str

    :param fdr_level: The FDR level that should be respected, this is the theoretical expected percentage of irrelevant
                      features among all created features.
    :type fdr_level: float

    :param hypotheses_independent: Can the significance of the features be assumed to be independent?
                                   Normally, this should be set to False as the features are never
                                   independent (e.g. mean and median)
    :type hypotheses_independent: bool

    :param ml_task: The intended machine learning task. Either `'classification'`, `'regression'` or `'auto'`.
                    Defaults to `'auto'`, meaning the intended task is inferred from `y`.
                    If `y` has a boolean, integer or object dtype, the task is assumed to be classification,
                    else regression.
    :type ml_task: str

    :return: Feature matrix X, possibly extended with relevant time series features.
    """
    # Only ids that appear in X can be merged back, so do not extract features for the others.
    if X is not None:
        timeseries_container = restrict_input_to_index(timeseries_container, column_id, X.index)

    # chunksize is forwarded to the extraction too (it used to reach only select_features),
    # so the caller's value governs both parallelised steps just like n_jobs does.
    X_ext = extract_features(timeseries_container,
                             default_fc_parameters=default_fc_parameters,
                             kind_to_fc_parameters=kind_to_fc_parameters,
                             show_warnings=show_warnings,
                             disable_progressbar=disable_progressbar,
                             profile=profile,
                             profiling_filename=profiling_filename,
                             profiling_sorting=profiling_sorting,
                             n_jobs=n_jobs,
                             chunksize=chunksize,
                             column_id=column_id, column_sort=column_sort,
                             column_kind=column_kind, column_value=column_value,
                             impute_function=impute)

    X_sel = select_features(X_ext, y,
                            test_for_binary_target_binary_feature=test_for_binary_target_binary_feature,
                            test_for_binary_target_real_feature=test_for_binary_target_real_feature,
                            test_for_real_target_binary_feature=test_for_real_target_binary_feature,
                            test_for_real_target_real_feature=test_for_real_target_real_feature,
                            fdr_level=fdr_level, hypotheses_independent=hypotheses_independent,
                            n_jobs=n_jobs,
                            chunksize=chunksize,
                            ml_task=ml_task)

    if X is None:
        X = X_sel
    else:
        # Left join by index keeps every row of the caller's X.
        X = pd.merge(X, X_sel, left_index=True, right_index=True, how="left")

    return X