Exemple #1
0
def extract_relevant_features(timeseries_container,
                              y,
                              X=None,
                              feature_extraction_settings=None,
                              feature_selection_settings=None,
                              column_id=None,
                              column_sort=None,
                              column_kind=None,
                              column_value=None):
    """
    Extract time series features from `timeseries_container` and keep the ones selected with respect to `y`.

    The work is delegated to :func:`~tsfresh.feature_extraction.extraction.extract_features` followed by
    :func:`~tsfresh.feature_selection.selection.select_features`; see their documentation for the details of
    every forwarded parameter.

    When `X` is given, the container is first passed through `restrict_input_to_index` together with `X.index`,
    and the selected features are left-merged onto `X` by index. When `X` is omitted, the selected features are
    returned on their own.

    When no `feature_extraction_settings` are given, a fresh `FeatureExtractionSettings` object is created and its
    `IMPUTE` attribute is set to `impute`. Settings supplied by the caller are used untouched.

    Examples
    ========

    >>> from tsfresh.examples import load_robot_execution_failures
    >>> from tsfresh import extract_relevant_features
    >>> df, y = load_robot_execution_failures()
    >>> X = extract_relevant_features(df, y, column_id='id', column_sort='time')

    :param timeseries_container: Forwarded to `extract_features` (see its `timeseries_container` parameter).
    :param y: Forwarded to `select_features` (see its `y` parameter).
    :param X: Optional existing feature matrix; its index restricts the input and it is the left side of the merge.
    :param feature_extraction_settings: Forwarded to `extract_features`; defaults as described above.
    :param feature_selection_settings: Forwarded to `select_features`.
    :param column_id: Forwarded to `extract_features` and to `restrict_input_to_index`.
    :param column_sort: Forwarded to `extract_features`.
    :param column_kind: Forwarded to `extract_features`.
    :param column_value: Forwarded to `extract_features`.

    :return: The selected features, left-merged onto `X` when `X` was provided.
    """
    augment_existing = X is not None

    if augment_existing:
        # Only look at the time series that belong to rows of the given feature matrix.
        timeseries_container = restrict_input_to_index(
            timeseries_container, column_id, X.index)

    if feature_extraction_settings is None:
        feature_extraction_settings = FeatureExtractionSettings()
        feature_extraction_settings.IMPUTE = impute

    extracted = extract_features(
        timeseries_container,
        column_id=column_id,
        column_sort=column_sort,
        column_kind=column_kind,
        column_value=column_value,
        feature_extraction_settings=feature_extraction_settings)

    relevant = select_features(
        extracted, y, feature_selection_settings=feature_selection_settings)

    if not augment_existing:
        return relevant

    # A left merge keeps every row of X, whether or not features were selected for it.
    return pd.merge(X, relevant, left_index=True, right_index=True, how="left")
def select_features(feature_set):
    """Reduce the train/valid/test frames of `feature_set` to the features chosen by TSFresh.

    The selection is computed on the train frame only, after dropping every
    train column that contains at least one NaN. The resulting column labels
    are then used to index the train, valid and test frames.

    NOTE(review): the target `Y_train` is not defined in this function; it is
    presumably a module-level name -- confirm it is in scope for all callers.

    :param feature_set: Mapping with the keys "name", "train", "valid" and "test".
    :return: New dict with the same four keys; the three frames are restricted
             to the selected columns and "name" is passed through.
    """
    train_df = feature_set["train"]
    valid_df = feature_set["valid"]
    test_df = feature_set["test"]

    # Columns with any missing value are removed from the train frame only.
    nan_columns = train_df.columns[train_df.isna().any()].tolist()
    train_df = train_df.drop(nan_columns, axis=1)

    # Hand a copy of the cleaned train frame to the selector.
    features = train_df.copy()
    selection = feature_selection.select_features(features, Y_train)
    kept_columns = selection.columns

    # Apply the same column choice to all three splits.
    train_selected = train_df[kept_columns]
    valid_selected = valid_df[kept_columns]
    test_selected = test_df[kept_columns]

    print(f"Shape after Selection: {train_selected.shape}")

    result = {"name": feature_set["name"]}
    result["train"] = train_selected
    result["valid"] = valid_selected
    result["test"] = test_selected
    return result
def feature_selector(X, y, ml_task='auto', n_jobs=0):
    """
    Forward a feature matrix `X` and target vector `y` to `select_features` and return its result.

    This function adds no logic of its own: `ml_task` and `n_jobs` are passed through as keyword arguments.

    NOTE(review): the exact shape of the result (reduced feature matrix vs. relevance table with columns such as
    "p_value" / "relevant") is determined by `select_features`, which is not visible here -- confirm before
    relying on specific columns.

    :param X: Feature matrix, handed to `select_features` unchanged.
    :param y: Target vector, handed to `select_features` unchanged.
    :param ml_task: The intended machine learning task, forwarded unchanged. Expected to be one of
                    `'classification'`, `'regression'` or `'auto'` (the default).
    :param n_jobs: Forwarded unchanged; intended as the number of processes for the p-value calculation.
    :return: Whatever `select_features(X, y, ml_task=ml_task, n_jobs=n_jobs)` returns.
    """
    forwarded_options = {"ml_task": ml_task, "n_jobs": n_jobs}
    selection_result = select_features(X, y, **forwarded_options)
    return selection_result
def create_new_features(df_x, s_y, x_train_cols=None):
    """
    Create new Features from Input-Dataframe by using TSFRESH

    The input frame is rolled with `roll_time_series`, and for every name in
    the module-level `FEATURES` a set of features is extracted from the
    rolled frame and appended column-wise to the (Date-indexed) input.

    If `x_train_cols` is empty or omitted, TSFresh feature selection is run
    against `s_y`; otherwise the result is simply restricted to the given
    columns (i.e. the selection already made on the train set is reused).

    The caller's `df_x` is not modified: the helper "id" column is added to
    a copy.

    :param df_x: Dataframe containing Time-Series; must have a "Date" column
    :param s_y: Series of Target-Var (only used when a selection is run)
    :param x_train_cols: Optional collection of column labels already
                         selected on the train set; ``None`` or an empty
                         collection triggers a fresh selection
    :return: Dataframe containing created Features
    """

    # add id column (same id for every row, because only one time series is
    # considered in this dataset); `assign` returns a new frame so the
    # caller's dataframe is left untouched
    df_x = df_x.assign(id=1)

    # create roll time series for generating time series features
    df_x_rolled = roll_time_series(
        df_x,
        column_id="id",
        column_sort="Date",
        column_kind=None,
        rolling_direction=1,
        max_timeshift=TSFRESH_TIME_WINDOWS - 1,
    )

    x = df_x.set_index("Date")

    # for each variable in input df new features are generated
    for current_feature in FEATURES:
        # noinspection PyTypeChecker
        generated_features = extract_features(
            df_x_rolled,
            column_id="id",
            n_jobs=3,
            column_kind=None,
            column_value=current_feature,
            impute_function=impute,
            default_fc_parameters=settings,
        )

        x = pd.concat([x, generated_features], axis=1)
        print(f"\nNew shape of Feature-Matrix: {x.shape}")

    print(f"\nAmount of Features before selection: {len(x.columns)}")

    # check if features of train set are already selected; `len` (rather than
    # truthiness) keeps this working for pandas Index / numpy array inputs
    if x_train_cols is None or len(x_train_cols) == 0:
        # select relevant features for train set
        selected_features = feature_selection.select_features(x, s_y)
        print(f"\nAmount of Features after selection: "
              f"{len(selected_features.columns)}")
    else:
        # no selection is needed, features are already selected for train set
        selected_features = x[x_train_cols]

    return selected_features
Exemple #5
0
def extract_relevant_features(
        timeseries_container,
        y,
        X=None,
        default_fc_parameters=None,
        kind_to_fc_parameters=None,
        column_id=None,
        column_sort=None,
        column_kind=None,
        column_value=None,
        show_warnings=defaults.SHOW_WARNINGS,
        disable_progressbar=defaults.DISABLE_PROGRESSBAR,
        profile=defaults.PROFILING,
        profiling_filename=defaults.PROFILING_FILENAME,
        profiling_sorting=defaults.PROFILING_SORTING,
        test_for_binary_target_binary_feature=defaults.TEST_FOR_BINARY_TARGET_BINARY_FEATURE,
        test_for_binary_target_real_feature=defaults.TEST_FOR_BINARY_TARGET_REAL_FEATURE,
        test_for_real_target_binary_feature=defaults.TEST_FOR_REAL_TARGET_BINARY_FEATURE,
        test_for_real_target_real_feature=defaults.TEST_FOR_REAL_TARGET_REAL_FEATURE,
        fdr_level=defaults.FDR_LEVEL,
        hypotheses_independent=defaults.HYPOTHESES_INDEPENDENT,
        n_jobs=defaults.N_PROCESSES,
        chunksize=defaults.CHUNKSIZE,
        ml_task='auto'):
    """
    Extract time series features from `timeseries_container`, select the relevant ones for `y` and return them,
    merged onto `X` when an existing feature matrix is supplied.

    This is a convenience wrapper that chains
    :func:`~tsfresh.feature_extraction.extraction.extract_features` (always called with `impute_function=impute`)
    and :func:`~tsfresh.feature_selection.selection.select_features`. Consult their documentation for details.

    If `X` is given, the container is first passed through `restrict_input_to_index` with `X.index`, and the
    selected features are left-merged onto `X` by index.

    Examples
    ========

    >>> from tsfresh.examples import load_robot_execution_failures
    >>> from tsfresh import extract_relevant_features
    >>> df, y = load_robot_execution_failures()
    >>> X = extract_relevant_features(df, y, column_id='id', column_sort='time')

    :param timeseries_container: The pandas.DataFrame with the time series to compute the features for, or a
            dictionary of pandas.DataFrames.
            See :func:`~tsfresh.feature_extraction.extraction.extract_features`.

    :param y: The target vector, forwarded to the feature selection.
    :type y: pandas.Series

    :param X: A DataFrame containing additional features. Optional.
    :type X: pandas.DataFrame

    :param default_fc_parameters: Mapping from feature calculator names to parameters. Only names that are keys
           of this dict will be calculated. See the class :class:`ComprehensiveFCParameters` for more information.
    :type default_fc_parameters: dict

    :param kind_to_fc_parameters: Mapping from kind names to objects of the same type as
            `default_fc_parameters`. For a kind listed here, the associated object is used instead of
            `default_fc_parameters`.
    :type kind_to_fc_parameters: dict

    :param column_id: The name of the id column to group by.
    :type column_id: str

    :param column_sort: The name of the sort column.
    :type column_sort: str

    :param column_kind: The name of the column keeping record on the kind of the value.
    :type column_kind: str

    :param column_value: The name for the column keeping the value itself.
    :type column_value: str

    :param show_warnings: Show warnings during the feature extraction (needed for debugging of calculators).
    :type show_warnings: bool

    :param disable_progressbar: Do not show a progressbar while doing the calculation.
    :type disable_progressbar: bool

    :param profile: Turn on profiling during feature extraction.
    :type profile: bool

    :param profiling_filename: Where to save the profiling results.
    :type profiling_filename: str

    :param profiling_sorting: How to sort the profiling results (see the documentation of the profiling package
           for more information).
    :type profiling_sorting: str

    :param test_for_binary_target_binary_feature: Which test to be used for binary target, binary feature
           (currently unused).
    :type test_for_binary_target_binary_feature: str

    :param test_for_binary_target_real_feature: Which test to be used for binary target, real feature.
    :type test_for_binary_target_real_feature: str

    :param test_for_real_target_binary_feature: Which test to be used for real target, binary feature
           (currently unused).
    :type test_for_real_target_binary_feature: str

    :param test_for_real_target_real_feature: Which test to be used for real target, real feature
           (currently unused).
    :type test_for_real_target_real_feature: str

    :param fdr_level: The FDR level that should be respected; the theoretical expected percentage of irrelevant
                      features among all created features.
    :type fdr_level: float

    :param hypotheses_independent: Can the significance of the features be assumed to be independent?
                                   Normally, this should be set to False as the features are never
                                   independent (e.g. mean and median).
    :type hypotheses_independent: bool

    :param n_jobs: The number of processes to use for parallelization. If zero, no parallelization is used.
                   Forwarded to both the extraction and the selection step.
    :type n_jobs: int

    :param chunksize: The size of one chunk for the parallelisation. Forwarded to the selection step only.
    :type chunksize: None or int

    :param ml_task: The intended machine learning task. Either `'classification'`, `'regression'` or `'auto'`.
                    Defaults to `'auto'`, meaning the intended task is inferred from `y`.
                    If `y` has a boolean, integer or object dtype, the task is assumed to be classification,
                    else regression.
    :type ml_task: str

    :return: Feature matrix X, possibly extended with relevant time series features.
    """
    if X is not None:
        # Limit the raw input to the ids present in the supplied feature matrix.
        timeseries_container = restrict_input_to_index(
            timeseries_container, column_id, X.index)

    # Step 1: compute all features; missing values are handled by `impute`.
    extraction_options = dict(
        default_fc_parameters=default_fc_parameters,
        kind_to_fc_parameters=kind_to_fc_parameters,
        show_warnings=show_warnings,
        disable_progressbar=disable_progressbar,
        profile=profile,
        profiling_filename=profiling_filename,
        profiling_sorting=profiling_sorting,
        n_jobs=n_jobs,
        column_id=column_id,
        column_sort=column_sort,
        column_kind=column_kind,
        column_value=column_value,
        impute_function=impute,
    )
    extracted = extract_features(timeseries_container, **extraction_options)

    # Step 2: keep only the features that pass the significance tests.
    selection_options = dict(
        test_for_binary_target_binary_feature=test_for_binary_target_binary_feature,
        test_for_binary_target_real_feature=test_for_binary_target_real_feature,
        test_for_real_target_binary_feature=test_for_real_target_binary_feature,
        test_for_real_target_real_feature=test_for_real_target_real_feature,
        fdr_level=fdr_level,
        hypotheses_independent=hypotheses_independent,
        n_jobs=n_jobs,
        chunksize=chunksize,
        ml_task=ml_task,
    )
    relevant = select_features(extracted, y, **selection_options)

    if X is None:
        return relevant

    # Step 3: attach the selected features to the caller's matrix, keeping all of its rows.
    return pd.merge(X, relevant, left_index=True, right_index=True, how="left")
def extract_relevant_features(
    timeseries_container, y, X=None,
    default_fc_parameters=None,
    kind_to_fc_parameters=None,
    column_id=None, column_sort=None, column_kind=None, column_value=None,
    show_warnings=defaults.SHOW_WARNINGS,
    disable_progressbar=defaults.DISABLE_PROGRESSBAR,
    profile=defaults.PROFILING,
    profiling_filename=defaults.PROFILING_FILENAME,
    profiling_sorting=defaults.PROFILING_SORTING,
    test_for_binary_target_binary_feature=defaults.TEST_FOR_BINARY_TARGET_BINARY_FEATURE,
    test_for_binary_target_real_feature=defaults.TEST_FOR_BINARY_TARGET_REAL_FEATURE,
    test_for_real_target_binary_feature=defaults.TEST_FOR_REAL_TARGET_BINARY_FEATURE,
    test_for_real_target_real_feature=defaults.TEST_FOR_REAL_TARGET_REAL_FEATURE,
    fdr_level=defaults.FDR_LEVEL,
    hypotheses_independent=defaults.HYPOTHESES_INDEPENDENT,
    n_jobs=defaults.N_PROCESSES,
    chunksize=defaults.CHUNKSIZE,
    ml_task='auto',
):
    """
    One-call pipeline: compute time series features for `timeseries_container`, filter them by relevance for the
    target `y`, and optionally join the survivors onto an existing feature matrix `X`.

    Internally this runs :func:`~tsfresh.feature_extraction.extraction.extract_features` (with
    `impute_function=impute`) and then :func:`~tsfresh.feature_selection.selection.select_features`; refer to
    those functions for the full semantics of the forwarded arguments.

    With `X` supplied, `restrict_input_to_index` is applied to the container using `X.index` before extraction,
    and the result is `X` left-merged (on the index) with the selected features. Without `X`, the selected
    features are returned directly.

    Examples
    ========

    >>> from tsfresh.examples import load_robot_execution_failures
    >>> from tsfresh import extract_relevant_features
    >>> df, y = load_robot_execution_failures()
    >>> X = extract_relevant_features(df, y, column_id='id', column_sort='time')

    :param timeseries_container: The pandas.DataFrame with the time series to compute the features for, or a
            dictionary of pandas.DataFrames.
            See :func:`~tsfresh.feature_extraction.extraction.extract_features`.

    :param y: The target vector.
    :type y: pandas.Series

    :param X: A DataFrame containing additional features.
    :type X: pandas.DataFrame

    :param default_fc_parameters: Mapping from feature calculator names to parameters; only calculators whose
           names are keys of this dict are run. See the class :class:`ComprehensiveFCParameters`.
    :type default_fc_parameters: dict

    :param kind_to_fc_parameters: Mapping from kind names to objects of the same type as
            `default_fc_parameters`; a kind present here uses its own object instead of the default one.
    :type kind_to_fc_parameters: dict

    :param column_id: The name of the id column to group by.
    :type column_id: str

    :param column_sort: The name of the sort column.
    :type column_sort: str

    :param column_kind: The name of the column keeping record on the kind of the value.
    :type column_kind: str

    :param column_value: The name for the column keeping the value itself.
    :type column_value: str

    :param show_warnings: Show warnings during the feature extraction (needed for debugging of calculators).
    :type show_warnings: bool

    :param disable_progressbar: Do not show a progressbar while doing the calculation.
    :type disable_progressbar: bool

    :param profile: Turn on profiling during feature extraction.
    :type profile: bool

    :param profiling_filename: Where to save the profiling results.
    :type profiling_filename: str

    :param profiling_sorting: How to sort the profiling results (see the documentation of the profiling package
           for more information).
    :type profiling_sorting: str

    :param test_for_binary_target_binary_feature: Which test to be used for binary target, binary feature
           (currently unused).
    :type test_for_binary_target_binary_feature: str

    :param test_for_binary_target_real_feature: Which test to be used for binary target, real feature.
    :type test_for_binary_target_real_feature: str

    :param test_for_real_target_binary_feature: Which test to be used for real target, binary feature
           (currently unused).
    :type test_for_real_target_binary_feature: str

    :param test_for_real_target_real_feature: Which test to be used for real target, real feature
           (currently unused).
    :type test_for_real_target_real_feature: str

    :param fdr_level: The FDR level that should be respected, i.e. the theoretical expected percentage of
                      irrelevant features among all created features.
    :type fdr_level: float

    :param hypotheses_independent: Can the significance of the features be assumed to be independent?
                                   Normally, this should be set to False as the features are never
                                   independent (e.g. mean and median).
    :type hypotheses_independent: bool

    :param n_jobs: The number of processes to use for parallelization. If zero, no parallelization is used.
                   Used for both feature extraction and feature selection.
    :type n_jobs: int

    :param chunksize: The size of one chunk for the parallelisation; only handed to the selection step.
    :type chunksize: None or int

    :param ml_task: The intended machine learning task. Either `'classification'`, `'regression'` or `'auto'`.
                    Defaults to `'auto'`, meaning the intended task is inferred from `y`.
                    If `y` has a boolean, integer or object dtype, the task is assumed to be classification,
                    else regression.
    :type ml_task: str

    :return: Feature matrix X, possibly extended with relevant time series features.
    """
    merge_onto_existing = X is not None

    container = timeseries_container
    if merge_onto_existing:
        # Drop time series whose id does not occur in the index of X.
        container = restrict_input_to_index(container, column_id, X.index)

    all_features = extract_features(
        container,
        column_id=column_id,
        column_sort=column_sort,
        column_kind=column_kind,
        column_value=column_value,
        default_fc_parameters=default_fc_parameters,
        kind_to_fc_parameters=kind_to_fc_parameters,
        n_jobs=n_jobs,
        show_warnings=show_warnings,
        disable_progressbar=disable_progressbar,
        profile=profile,
        profiling_filename=profiling_filename,
        profiling_sorting=profiling_sorting,
        impute_function=impute,
    )

    selected_features = select_features(
        all_features,
        y,
        test_for_binary_target_binary_feature=test_for_binary_target_binary_feature,
        test_for_binary_target_real_feature=test_for_binary_target_real_feature,
        test_for_real_target_binary_feature=test_for_real_target_binary_feature,
        test_for_real_target_real_feature=test_for_real_target_real_feature,
        fdr_level=fdr_level,
        hypotheses_independent=hypotheses_independent,
        ml_task=ml_task,
        n_jobs=n_jobs,
        chunksize=chunksize,
    )

    # Left merge: every row of X survives, even without any selected feature values.
    return (
        pd.merge(X, selected_features, left_index=True, right_index=True, how="left")
        if merge_onto_existing
        else selected_features
    )