def extract_relevant_features(timeseries_container, y, X=None, feature_extraction_settings=None,
                              feature_selection_settings=None, column_id=None, column_sort=None,
                              column_kind=None, column_value=None):
    """
    Extract time series features and keep those that are relevant for the target `y`.

    This is a convenience wrapper that chains
    :func:`~tsfresh.feature_extraction.extraction.extract_features` and
    :func:`~tsfresh.feature_selection.selection.select_features`. When a feature matrix `X`
    is supplied, the selected features are left-joined onto it by index; otherwise the
    selected features are returned on their own.

    Examples
    ========

    >>> from tsfresh.examples import load_robot_execution_failures
    >>> from tsfresh import extract_relevant_features
    >>> df, y = load_robot_execution_failures()
    >>> X = extract_relevant_features(df, y, column_id='id', column_sort='time')

    :param timeseries_container: Forwarded as `timeseries_container` to
        :func:`~tsfresh.feature_extraction.extraction.extract_features`.
    :param y: Forwarded as `y` to
        :func:`~tsfresh.feature_selection.selection.select_features`.
    :param X: Optional existing feature matrix. When given, the time series input is first
        passed through `restrict_input_to_index` together with `column_id` and `X.index`.
    :param feature_extraction_settings: Forwarded to
        :func:`~tsfresh.feature_extraction.extraction.extract_features`. When ``None``, a fresh
        `FeatureExtractionSettings` is created and its `IMPUTE` attribute is set to `impute`.
    :param feature_selection_settings: Forwarded to
        :func:`~tsfresh.feature_selection.selection.select_features`.
    :param column_id: Forwarded to
        :func:`~tsfresh.feature_extraction.extraction.extract_features`.
    :param column_sort: Forwarded to
        :func:`~tsfresh.feature_extraction.extraction.extract_features`.
    :param column_kind: Forwarded to
        :func:`~tsfresh.feature_extraction.extraction.extract_features`.
    :param column_value: Forwarded to
        :func:`~tsfresh.feature_extraction.extraction.extract_features`.

    :return: Feature matrix X, possibly extended with relevant time series features.
    """
    container = timeseries_container
    if X is not None:
        container = restrict_input_to_index(container, column_id, X.index)

    settings = feature_extraction_settings
    if settings is None:
        # Default settings only: a caller-supplied settings object is used untouched.
        settings = FeatureExtractionSettings()
        settings.IMPUTE = impute

    extracted = extract_features(
        container,
        feature_extraction_settings=settings,
        column_id=column_id,
        column_sort=column_sort,
        column_kind=column_kind,
        column_value=column_value,
    )
    relevant = select_features(extracted, y, feature_selection_settings=feature_selection_settings)

    if X is None:
        return relevant
    # Left join keeps every row of the caller's feature matrix.
    return pd.merge(X, relevant, left_index=True, right_index=True, how="left")
def select_features(feature_set):
    """Reduce the train, valid and test frames to the columns chosen by TSFresh selection.

    Columns of the train frame that contain any NaN are dropped before the selection is
    run on the train frame only. The resulting column set is then applied to all three
    frames.

    :param feature_set: Mapping with the keys ``"name"``, ``"train"``, ``"valid"`` and
        ``"test"``; the last three are indexed like pandas DataFrames.
    :return: A dict with the same four keys, where the three frames are restricted to the
        selected columns and ``"name"`` is passed through unchanged.
    """
    train_frame = feature_set["train"]
    valid_frame = feature_set["valid"]
    test_frame = feature_set["test"]

    # Discard every training column that holds at least one NaN.
    nan_columns = train_frame.columns[train_frame.isna().any()].tolist()
    train_frame = train_frame.drop(columns=nan_columns)

    # Selection runs on a copy of the cleaned training frame.
    # NOTE(review): Y_train is not defined inside this function; presumably it is a
    # module-level target vector -- confirm against the rest of the file.
    relevant = feature_selection.select_features(train_frame.copy(), Y_train)
    kept_columns = relevant.columns

    # Apply the same column subset to every split.
    train_selected = train_frame[kept_columns]
    valid_selected = valid_frame[kept_columns]
    test_selected = test_frame[kept_columns]

    print(f"Shape after Selection: {train_selected.shape}")

    return {
        "name": feature_set["name"],
        "train": train_selected,
        "valid": valid_selected,
        "test": test_selected,
    }
def feature_selector(X, y, ml_task='auto', n_jobs=0):
    """
    Assess the features in matrix `X` against target vector `y` via `select_features`.

    This function only forwards its arguments to `select_features` and returns that
    call's result unchanged. According to the original documentation of this wrapper, the
    underlying procedure runs a univariate significance test per feature and evaluates the
    resulting p-values with the Benjamini Hochberg procedure to decide which features to
    keep.

    :param X: Feature matrix; may contain binary and real-valued features at the same time.
    :param y: Target vector used to test which features are relevant. Can be binary or
        real-valued.
    :param ml_task: The intended machine learning task. Either `'classification'`,
        `'regression'` or `'auto'`. Defaults to `'auto'`, meaning the intended task is
        inferred from `y`: a boolean, integer or object dtype is assumed to be
        classification, anything else regression.
    :param n_jobs: Number of processes to use during the p-value calculation.

    :return: Whatever `select_features` returns for these arguments.
        NOTE(review): the original documentation describes a relevance table with the
        columns "Feature", "type", "p_value" and "relevant" -- confirm that this matches
        the `select_features` actually in scope.
    """
    selection_options = {"ml_task": ml_task, "n_jobs": n_jobs}
    return select_features(X, y, **selection_options)
def create_new_features(df_x, s_y, x_train_cols=None):
    """
    Create new features from the input DataFrame by using TSFRESH.

    The input is rolled into windows with `roll_time_series`, features are extracted for
    every column listed in `FEATURES`, and the results are concatenated column-wise onto
    the input (indexed by ``"Date"``). For the train set the relevant features are then
    selected against `s_y`; for other sets the columns previously selected on the train
    set are reused.

    The caller's `df_x` is not modified: the helper ``"id"`` column is added to a copy.

    :param df_x: DataFrame containing the time series; must contain a ``"Date"`` column.
    :param s_y: Series of the target variable, used only when a selection is performed.
    :param x_train_cols: Column names already selected on the train set. When ``None`` or
        empty, a fresh feature selection is run; otherwise exactly these columns are
        returned.
    :return: DataFrame containing the selected features.
    """
    # Add the id column on a copy (same id for every row, because only one time
    # series is considered in this dataset) instead of mutating the caller's frame.
    df_x = df_x.assign(id=1)

    # Create the rolled time series used for generating time series features.
    df_x_rolled = roll_time_series(
        df_x,
        column_id="id",
        column_sort="Date",
        column_kind=None,
        rolling_direction=1,
        max_timeshift=TSFRESH_TIME_WINDOWS - 1,
    )

    x = df_x.set_index("Date")

    # For each variable in the input frame new features are generated.
    for current_feature in FEATURES:
        # noinspection PyTypeChecker
        generated_features = extract_features(
            df_x_rolled,
            column_id="id",
            n_jobs=3,
            column_kind=None,
            column_value=current_feature,
            impute_function=impute,
            default_fc_parameters=settings,
        )
        x = pd.concat([x, generated_features], axis=1)
        print(f"\nNew shape of Feature-Matrix: {x.shape}")

    print(f"\nAmount of Features before selection: {len(x.columns)}")

    # len() rather than truthiness: x_train_cols may be a pandas Index, whose
    # truth value is ambiguous.
    if x_train_cols is None or len(x_train_cols) == 0:
        # Train set: select the relevant features.
        selected_features = feature_selection.select_features(x, s_y)
        print(f"\nAmount of Features after selection: "
              f"{len(selected_features.columns)}")
    else:
        # No selection is needed, the features were already chosen on the train set.
        selected_features = x[x_train_cols]

    return selected_features
def extract_relevant_features(
        timeseries_container, y, X=None,
        default_fc_parameters=None,
        kind_to_fc_parameters=None,
        column_id=None, column_sort=None, column_kind=None, column_value=None,
        show_warnings=defaults.SHOW_WARNINGS,
        disable_progressbar=defaults.DISABLE_PROGRESSBAR,
        profile=defaults.PROFILING,
        profiling_filename=defaults.PROFILING_FILENAME,
        profiling_sorting=defaults.PROFILING_SORTING,
        test_for_binary_target_binary_feature=defaults.TEST_FOR_BINARY_TARGET_BINARY_FEATURE,
        test_for_binary_target_real_feature=defaults.TEST_FOR_BINARY_TARGET_REAL_FEATURE,
        test_for_real_target_binary_feature=defaults.TEST_FOR_REAL_TARGET_BINARY_FEATURE,
        test_for_real_target_real_feature=defaults.TEST_FOR_REAL_TARGET_REAL_FEATURE,
        fdr_level=defaults.FDR_LEVEL,
        hypotheses_independent=defaults.HYPOTHESES_INDEPENDENT,
        n_jobs=defaults.N_PROCESSES,
        chunksize=defaults.CHUNKSIZE,
        ml_task='auto'):
    """
    Extract time series features and return those relevant with respect to `y`.

    High level convenience function: features are computed from `timeseries_container`
    with :func:`~tsfresh.feature_extraction.extraction.extract_features` (imputing with
    `impute`) and filtered with
    :func:`~tsfresh.feature_selection.selection.select_features`. If a feature matrix `X`
    is given, the relevant features are left-joined onto it by index.

    Examples
    ========

    >>> from tsfresh.examples import load_robot_execution_failures
    >>> from tsfresh import extract_relevant_features
    >>> df, y = load_robot_execution_failures()
    >>> X = extract_relevant_features(df, y, column_id='id', column_sort='time')

    :param timeseries_container: The pandas.DataFrame with the time series to compute the
        features for, or a dictionary of pandas.DataFrames.
        See :func:`~tsfresh.feature_extraction.extraction.extract_features`.

    :param X: A DataFrame containing additional features. When given, the time series
        input is first passed through `restrict_input_to_index` with `column_id` and
        `X.index`.
    :type X: pandas.DataFrame

    :param y: The target vector
    :type y: pandas.Series

    :param default_fc_parameters: Mapping from feature calculator names to parameters.
        Only those names which are keys in this dict will be calculated. See the class
        `ComprehensiveFCParameters` for more information.
    :type default_fc_parameters: dict

    :param kind_to_fc_parameters: Mapping from kind names to objects of the same type as
        the ones for default_fc_parameters. If a kind is a key here, its value is used
        instead of the default_fc_parameters.
    :type kind_to_fc_parameters: dict

    :param column_id: The name of the id column to group by.
    :type column_id: str

    :param column_sort: The name of the sort column.
    :type column_sort: str

    :param column_kind: The name of the column keeping record on the kind of the value.
    :type column_kind: str

    :param column_value: The name for the column keeping the value itself.
    :type column_value: str

    :param chunksize: The size of one chunk for the parallelisation. In this function it
        is forwarded to the feature selection step only.
    :type chunksize: None or int

    :param n_jobs: The number of processes to use for parallelization. If zero, no
        parallelization is used. Forwarded to both extraction and selection.
    :type n_jobs: int

    :param show_warnings: Show warnings during the feature extraction (needed for
        debugging of calculators).
    :type show_warnings: bool

    :param disable_progressbar: Do not show a progressbar while doing the calculation.
    :type disable_progressbar: bool

    :param profile: Turn on profiling during feature extraction
    :type profile: bool

    :param profiling_sorting: How to sort the profiling results (see the documentation of
        the profiling package for more information)
    :type profiling_sorting: str

    :param profiling_filename: Where to save the profiling results.
    :type profiling_filename: str

    :param test_for_binary_target_binary_feature: Which test to be used for binary target,
        binary feature (currently unused)
    :type test_for_binary_target_binary_feature: str

    :param test_for_binary_target_real_feature: Which test to be used for binary target,
        real feature
    :type test_for_binary_target_real_feature: str

    :param test_for_real_target_binary_feature: Which test to be used for real target,
        binary feature (currently unused)
    :type test_for_real_target_binary_feature: str

    :param test_for_real_target_real_feature: Which test to be used for real target, real
        feature (currently unused)
    :type test_for_real_target_real_feature: str

    :param fdr_level: The FDR level that should be respected, this is the theoretical
        expected percentage of irrelevant features among all created features.
    :type fdr_level: float

    :param hypotheses_independent: Can the significance of the features be assumed to be
        independent? Normally, this should be set to False as the features are never
        independent (e.g. mean and median)
    :type hypotheses_independent: bool

    :param ml_task: The intended machine learning task. Either `'classification'`,
        `'regression'` or `'auto'`. Defaults to `'auto'`, meaning the intended task is
        inferred from `y`. If `y` has a boolean, integer or object dtype, the task is
        assumed to be classification, else regression.
    :type ml_task: str

    :return: Feature matrix X, possibly extended with relevant time series features.
    """
    if X is not None:
        timeseries_container = restrict_input_to_index(timeseries_container, column_id, X.index)

    # Options for the extraction step, gathered in one place.
    extraction_options = dict(
        default_fc_parameters=default_fc_parameters,
        kind_to_fc_parameters=kind_to_fc_parameters,
        show_warnings=show_warnings,
        disable_progressbar=disable_progressbar,
        profile=profile,
        profiling_filename=profiling_filename,
        profiling_sorting=profiling_sorting,
        n_jobs=n_jobs,
        column_id=column_id,
        column_sort=column_sort,
        column_kind=column_kind,
        column_value=column_value,
        impute_function=impute,
    )
    extracted = extract_features(timeseries_container, **extraction_options)

    # Options for the selection step.
    selection_options = dict(
        test_for_binary_target_binary_feature=test_for_binary_target_binary_feature,
        test_for_binary_target_real_feature=test_for_binary_target_real_feature,
        test_for_real_target_binary_feature=test_for_real_target_binary_feature,
        test_for_real_target_real_feature=test_for_real_target_real_feature,
        fdr_level=fdr_level,
        hypotheses_independent=hypotheses_independent,
        n_jobs=n_jobs,
        chunksize=chunksize,
        ml_task=ml_task,
    )
    relevant = select_features(extracted, y, **selection_options)

    if X is None:
        return relevant
    # Left join keeps every row of the caller-supplied feature matrix.
    return pd.merge(X, relevant, left_index=True, right_index=True, how="left")
def extract_relevant_features(timeseries_container, y, X=None,
                              default_fc_parameters=None,
                              kind_to_fc_parameters=None,
                              column_id=None, column_sort=None, column_kind=None, column_value=None,
                              show_warnings=defaults.SHOW_WARNINGS,
                              disable_progressbar=defaults.DISABLE_PROGRESSBAR,
                              profile=defaults.PROFILING,
                              profiling_filename=defaults.PROFILING_FILENAME,
                              profiling_sorting=defaults.PROFILING_SORTING,
                              test_for_binary_target_binary_feature=defaults.TEST_FOR_BINARY_TARGET_BINARY_FEATURE,
                              test_for_binary_target_real_feature=defaults.TEST_FOR_BINARY_TARGET_REAL_FEATURE,
                              test_for_real_target_binary_feature=defaults.TEST_FOR_REAL_TARGET_BINARY_FEATURE,
                              test_for_real_target_real_feature=defaults.TEST_FOR_REAL_TARGET_REAL_FEATURE,
                              fdr_level=defaults.FDR_LEVEL,
                              hypotheses_independent=defaults.HYPOTHESES_INDEPENDENT,
                              n_jobs=defaults.N_PROCESSES,
                              chunksize=defaults.CHUNKSIZE,
                              ml_task='auto'):
    """
    Compute time series features and keep the ones relevant for target vector `y`.

    The function first calls
    :func:`~tsfresh.feature_extraction.extraction.extract_features` on
    `timeseries_container` (with `impute` as the impute function) and then
    :func:`~tsfresh.feature_selection.selection.select_features` on the result. The
    selected features are returned directly, or merged onto `X` with a left join on the
    index when `X` is provided.

    Examples
    ========

    >>> from tsfresh.examples import load_robot_execution_failures
    >>> from tsfresh import extract_relevant_features
    >>> df, y = load_robot_execution_failures()
    >>> X = extract_relevant_features(df, y, column_id='id', column_sort='time')

    :param timeseries_container: The pandas.DataFrame with the time series to compute the
        features for, or a dictionary of pandas.DataFrames.
        See :func:`~tsfresh.feature_extraction.extraction.extract_features`.

    :param X: A DataFrame containing additional features. If supplied,
        `restrict_input_to_index` is applied to the time series input using `column_id`
        and `X.index` before extraction.
    :type X: pandas.DataFrame

    :param y: The target vector
    :type y: pandas.Series

    :param default_fc_parameters: Mapping from feature calculator names to parameters.
        Only those names which are keys in this dict will be calculated. See the class
        `ComprehensiveFCParameters` for more information.
    :type default_fc_parameters: dict

    :param kind_to_fc_parameters: Mapping from kind names to objects of the same type as
        the ones for default_fc_parameters. For a kind that appears as a key, the
        associated value replaces the default_fc_parameters.
    :type kind_to_fc_parameters: dict

    :param column_id: The name of the id column to group by.
    :type column_id: str

    :param column_sort: The name of the sort column.
    :type column_sort: str

    :param column_kind: The name of the column keeping record on the kind of the value.
    :type column_kind: str

    :param column_value: The name for the column keeping the value itself.
    :type column_value: str

    :param chunksize: The size of one chunk for the parallelisation. Passed to the
        selection step only.
    :type chunksize: None or int

    :param n_jobs: The number of processes to use for parallelization. If zero, no
        parallelization is used. Passed to both the extraction and the selection step.
    :type n_jobs: int

    :param show_warnings: Show warnings during the feature extraction (needed for
        debugging of calculators).
    :type show_warnings: bool

    :param disable_progressbar: Do not show a progressbar while doing the calculation.
    :type disable_progressbar: bool

    :param profile: Turn on profiling during feature extraction
    :type profile: bool

    :param profiling_sorting: How to sort the profiling results (see the documentation of
        the profiling package for more information)
    :type profiling_sorting: str

    :param profiling_filename: Where to save the profiling results.
    :type profiling_filename: str

    :param test_for_binary_target_binary_feature: Which test to be used for binary target,
        binary feature (currently unused)
    :type test_for_binary_target_binary_feature: str

    :param test_for_binary_target_real_feature: Which test to be used for binary target,
        real feature
    :type test_for_binary_target_real_feature: str

    :param test_for_real_target_binary_feature: Which test to be used for real target,
        binary feature (currently unused)
    :type test_for_real_target_binary_feature: str

    :param test_for_real_target_real_feature: Which test to be used for real target, real
        feature (currently unused)
    :type test_for_real_target_real_feature: str

    :param fdr_level: The FDR level that should be respected, this is the theoretical
        expected percentage of irrelevant features among all created features.
    :type fdr_level: float

    :param hypotheses_independent: Can the significance of the features be assumed to be
        independent? Normally, this should be set to False as the features are never
        independent (e.g. mean and median)
    :type hypotheses_independent: bool

    :param ml_task: The intended machine learning task. Either `'classification'`,
        `'regression'` or `'auto'`. Defaults to `'auto'`, meaning the intended task is
        inferred from `y`. If `y` has a boolean, integer or object dtype, the task is
        assumed to be classification, else regression.
    :type ml_task: str

    :return: Feature matrix X, possibly extended with relevant time series features.
    """
    has_base_matrix = X is not None

    container = timeseries_container
    if has_base_matrix:
        container = restrict_input_to_index(container, column_id, X.index)

    # Step 1: feature extraction with imputation.
    all_features = extract_features(
        container,
        column_id=column_id,
        column_sort=column_sort,
        column_kind=column_kind,
        column_value=column_value,
        default_fc_parameters=default_fc_parameters,
        kind_to_fc_parameters=kind_to_fc_parameters,
        n_jobs=n_jobs,
        show_warnings=show_warnings,
        disable_progressbar=disable_progressbar,
        profile=profile,
        profiling_filename=profiling_filename,
        profiling_sorting=profiling_sorting,
        impute_function=impute,
    )

    # Step 2: keep only the features that are relevant for y.
    relevant_features = select_features(
        all_features,
        y,
        ml_task=ml_task,
        fdr_level=fdr_level,
        hypotheses_independent=hypotheses_independent,
        n_jobs=n_jobs,
        chunksize=chunksize,
        test_for_binary_target_binary_feature=test_for_binary_target_binary_feature,
        test_for_binary_target_real_feature=test_for_binary_target_real_feature,
        test_for_real_target_binary_feature=test_for_real_target_binary_feature,
        test_for_real_target_real_feature=test_for_real_target_real_feature,
    )

    # Step 3: attach to the caller's matrix when there is one.
    if has_base_matrix:
        return pd.merge(X, relevant_features, left_index=True, right_index=True, how="left")
    return relevant_features