Example #1
def _extract_features_per_kind(kind_to_df_map, settings, column_id, column_value, serial=False):
    """
    Parallelize the feature extraction per kind.

    :param kind_to_df_map: The time series to compute the features for in our internal format
    :type kind_to_df_map: dict of pandas.DataFrame

    :param column_id: The name of the id column to group by.
    :type column_id: str
    :param column_value: The name for the column keeping the value itself.
    :type column_value: str

    :param settings: settings object that controls which features are calculated
    :type settings: tsfresh.feature_extraction.settings.FeatureExtractionSettings

    :param serial: Do not parallelize the extraction. This can be handy if (1) you want to debug something,
       (2) you want to profile something or (3) your environment does not support multiprocessing.
    :type serial: bool

    :return: The (maybe imputed) DataFrame containing extracted features.
    :rtype: pandas.DataFrame
    """
    partial_extract_features_for_one_time_series = partial(_extract_features_for_one_time_series,
                                                           column_id=column_id,
                                                           column_value=column_value,
                                                           settings=settings)
    # Only spin up a worker pool if we actually parallelize; in serial mode we fall back to the built-in map,
    # which also works in environments without multiprocessing support.
    if serial:
        pool = None
        map_function = map
    else:
        pool = Pool(settings.n_processes)
        chunksize = helper_functions.calculate_best_chunksize(kind_to_df_map, settings)
        map_function = partial(pool.imap_unordered, chunksize=chunksize)

    total_number_of_expected_results = len(kind_to_df_map)

    extracted_features = tqdm(map_function(partial_extract_features_for_one_time_series, kind_to_df_map.items()),
                              total=total_number_of_expected_results,
                              desc="Feature Extraction", disable=settings.disable_progressbar)

    if pool is not None:
        pool.close()

    # Concatenate all partial results
    result = pd.concat(extracted_features, axis=1, join='outer').astype(np.float64)

    # Impute the result if requested
    if settings.IMPUTE is not None:
        settings.IMPUTE(result)

    if pool is not None:
        pool.join()
    return result
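For readers who want to see the shape of this per-kind parallelization in isolation, here is a minimal, self-contained sketch of the same pattern. It is not tsfresh code: the toy feature function mean_per_id, the column names and the example data are hypothetical stand-ins; only the combination of functools.partial, Pool.imap_unordered, tqdm and pd.concat mirrors the function above.

from functools import partial
from multiprocessing import Pool

import numpy as np
import pandas as pd
from tqdm import tqdm


def mean_per_id(kind_and_df, column_id, column_value):
    # Toy "feature": the mean value per id, returned as a one-column DataFrame named after the kind.
    kind, df = kind_and_df
    return df.groupby(column_id)[column_value].mean().to_frame(name=kind + "__mean")


if __name__ == "__main__":
    kind_to_df_map = {
        "temperature": pd.DataFrame({"id": [1, 1, 2, 2], "value": [1.0, 2.0, 3.0, 4.0]}),
        "pressure": pd.DataFrame({"id": [1, 1, 2, 2], "value": [10.0, 20.0, 30.0, 40.0]}),
    }

    # Bind the fixed arguments, fan the (kind, df) pairs out over the pool and track progress lazily.
    f = partial(mean_per_id, column_id="id", column_value="value")
    pool = Pool(2)
    partial_results = tqdm(pool.imap_unordered(f, kind_to_df_map.items(), chunksize=1),
                           total=len(kind_to_df_map), desc="Feature Extraction")

    # Concatenating consumes the iterator; the per-kind columns are aligned on the id index.
    result = pd.concat(partial_results, axis=1, join="outer").astype(np.float64)
    pool.close()
    pool.join()
    print(result)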
Example #2
def _extract_features_parallel_per_sample(kind_to_df_map, settings, column_id, column_value):
    """
    Parallelize the feature extraction per kind and per sample.

    As the splitting of the dataframes per kind along column_id is quite costly, we settled for an async map in this
    function. The result objects are temporarily stored in a FIFO queue from which they can be retrieved in order
    of submission.

    :param kind_to_df_map: The time series to compute the features for in our internal format
    :type kind_to_df_map: dict of pandas.DataFrame

    :param column_id: The name of the id column to group by.
    :type column_id: str
    :param column_value: The name for the column keeping the value itself.
    :type column_value: str

    :param settings: settings object that controls which features are calculated
    :type settings: tsfresh.feature_extraction.settings.FeatureExtractionSettings

    :return: The (maybe imputed) DataFrame containing extracted features.
    :rtype: pandas.DataFrame
    """
    partial_extract_features_for_one_time_series = partial(_extract_features_for_one_time_series,
                                                           column_id=column_id,
                                                           column_value=column_value,
                                                           settings=settings)
    pool = Pool(settings.n_processes)
    total_number_of_expected_results = 0

    # Submit map jobs per kind per sample
    results_fifo = Queue()

    for kind, df_kind in kind_to_df_map.items():
        df_grouped_by_id = df_kind.groupby(column_id)

        total_number_of_expected_results += len(df_grouped_by_id)

        chunksize = helper_functions.calculate_best_chunksize(df_grouped_by_id, settings)

        results_fifo.put(
            pool.imap_unordered(
                partial_extract_features_for_one_time_series,
                [(kind, df_group) for _, df_group in df_grouped_by_id],
                chunksize=chunksize
            )
        )

    pool.close()

    # Wait for the jobs to complete and concatenate the partial results
    dfs_per_kind = []

    # Do this all with a progress bar
    with tqdm(total=total_number_of_expected_results, desc="Feature Extraction", disable=settings.disable_progressbar) as progress_bar:
        # We need to know when a new result arrives, so we wrap the map results into another
        # iterable that updates the progress bar each time a new result is yielded
        def iterable_with_tqdm_update(queue, progress_bar):
            for element in queue:
                progress_bar.update(1)
                yield element

        result = pd.DataFrame()
        while not results_fifo.empty():
            map_result = results_fifo.get()
            dfs_kind = iterable_with_tqdm_update(map_result, progress_bar)
            df_tmp = pd.concat(dfs_kind, axis=0).astype(np.float64)
            result = pd.concat([result, df_tmp], axis=1).astype(np.float64)

    # Impute the result if requested
    if settings.IMPUTE is not None:
        settings.IMPUTE(result)

    pool.join()
    return result
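The per-sample variant is again easiest to see in a small, self-contained sketch. This is not tsfresh code; the toy per-sample feature, the column names and the data are made up. What it mirrors is the structure of the function above: one imap_unordered call per kind, the lazy result iterators queued in submission order, and a single tqdm bar advanced as individual per-sample results arrive.

from functools import partial
from multiprocessing import Pool
from queue import Queue

import numpy as np
import pandas as pd
from tqdm import tqdm


def mean_of_one_sample(id_kind_group, column_value):
    # Toy per-sample "feature": the mean of one id's values, indexed by the id so that the
    # per-kind results can later be aligned column-wise.
    sample_id, kind, group = id_kind_group
    return pd.DataFrame({kind + "__mean": [group[column_value].mean()]}, index=[sample_id])


if __name__ == "__main__":
    kind_to_df_map = {
        "temperature": pd.DataFrame({"id": [1, 1, 2, 2], "value": [1.0, 2.0, 3.0, 4.0]}),
        "pressure": pd.DataFrame({"id": [1, 1, 2, 2], "value": [10.0, 20.0, 30.0, 40.0]}),
    }
    f = partial(mean_of_one_sample, column_value="value")

    # Submit one asynchronous map per kind and remember the result iterators in a FIFO queue.
    pool = Pool(2)
    results_fifo = Queue()
    total_number_of_expected_results = 0
    for kind, df_kind in kind_to_df_map.items():
        df_grouped_by_id = df_kind.groupby("id")
        total_number_of_expected_results += len(df_grouped_by_id)
        results_fifo.put(pool.imap_unordered(f, [(sample_id, kind, group) for sample_id, group in df_grouped_by_id],
                                             chunksize=1))
    pool.close()

    # Drain the queue in submission order, advancing one shared progress bar per finished sample.
    result = pd.DataFrame()
    with tqdm(total=total_number_of_expected_results, desc="Feature Extraction") as progress_bar:
        while not results_fifo.empty():
            dfs_kind = []
            for df in results_fifo.get():
                progress_bar.update(1)
                dfs_kind.append(df)
            result = pd.concat([result, pd.concat(dfs_kind, axis=0)], axis=1)

    pool.join()
    print(result.astype(np.float64))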
Example #3
def check_fs_sig_bh(X, y, settings=None):
    """
    The wrapper function that calls the significance test functions in this package.
    In total, for each feature from the input pandas.DataFrame a univariate feature significance test is conducted.
    Those tests generate p values that are then evaluated by the Benjamini Hochberg procedure to decide which features
    to keep and which to delete.

    We are testing
    
        :math:`H_0` = the Feature is not relevant and cannot be added

    against

        :math:`H_1` = the Feature is relevant and should be kept
   
    or in other words
 
        :math:`H_0` = Target and Feature are independent / the Feature has no influence on the target

        :math:`H_1` = Target and Feature are associated / dependent

    When the target is binary this becomes
    
        :math:`H_0 = \\left( F_{\\text{target}=1} = F_{\\text{target}=0} \\right)`

        :math:`H_1 = \\left( F_{\\text{target}=1} \\neq F_{\\text{target}=0} \\right)`
    
    Where :math:`F` is the distribution of the feature.

    In the same way we can state the hypothesis when the feature is binary
    
        :math:`H_0 =  \\left( T_{\\text{feature}=1} = T_{\\text{feature}=0} \\right)`

        :math:`H_1 = \\left( T_{\\text{feature}=1} \\neq T_{\\text{feature}=0} \\right)`

    Here :math:`T` is the distribution of the target.

    TODO: And for real valued?

    :param X: The DataFrame containing all the features and the target
    :type X: pandas.DataFrame

    :param y: The target vector
    :type y: pandas.Series

    :param settings: The feature selection settings to use to perform the tests.
    :type settings: FeatureSignificanceTestsSettings

    :return: A pandas.DataFrame, indexed by the columns of the input DataFrame X, with information on the significance
            of each feature. The DataFrame has the columns
            "Feature",
            "type" (binary, real or const),
            "p_value" (the significance of this feature as a p-value, lower means more significant) and
            "rejected" (whether the Benjamini Hochberg procedure rejected this feature).
    :rtype: pandas.DataFrame

    """
    if settings is None:
        settings = FeatureSignificanceTestsSettings()

    target_is_binary = len(set(y)) == 2

    # todo: solve the multiclass classification case. For a multiclass target the algorithm currently treats the
    # problem as a regression problem. Instead one could perform a binary one-versus-all classification per class.

    # Only allow entries for which the target is known!
    y = y.astype(float)
    known_target_mask = y.notnull()
    X = X.copy().loc[known_target_mask, :]
    y = y[known_target_mask]

    # Create the DataFrame df_features containing the information about the different hypotheses
    # Every row contains information over one feature column from X
    df_features = pd.DataFrame()

    df_features['Feature'] = list(set(X.columns))
    df_features = df_features.set_index('Feature', drop=False)

    # Add relevant columns to df_features
    df_features["rejected"] = np.nan
    df_features["type"] = np.nan
    df_features["p_value"] = np.nan

    # Calculate the feature significance in parallel
    pool = Pool(settings.n_processes)

    # Helper function which wraps the _calculate_p_value with many arguments already set
    f = partial(_calculate_p_value, y=y, settings=settings, target_is_binary=target_is_binary)

    chunksize = helper_functions.calculate_best_chunksize(df_features, settings)
    total_number_of_features = len(df_features)
    results = tqdm(pool.imap_unordered(f, [X[feature] for feature in df_features['Feature']], chunksize=chunksize),
                   total=total_number_of_features, desc="Feature Selection")

    p_values_of_features = pd.DataFrame(list(results))
    df_features.update(p_values_of_features)

    pool.close()
    pool.join()

    # Perform the real feature rejection
    if "const" in set(df_features.type):
        df_features_bh = benjamini_hochberg_test(df_features.loc[~(df_features.type == "const")], settings)
        df_features = pd.concat([df_features_bh, df_features.loc[df_features.type == "const"]])
    else:
        df_features = benjamini_hochberg_test(df_features, settings)
        
    # It is very important that we have a boolean "rejected" column, so we do a cast here to be sure
    df_features["rejected"] = df_features["rejected"].astype("bool")

    if settings.write_selection_report:
        # Write results of the BH test to file
        if not os.path.exists(settings.result_dir):
            os.mkdir(settings.result_dir)

        with open(os.path.join(settings.result_dir, "fs_bh_results.txt"), 'w') as file_out:
            file_out.write(("Performed BH test to control the false discovery rate (FDR);\n"
                            "FDR-Level={0};Hypothesis independent={1}\n"
                            ).format(settings.fdr_level, settings.hypotheses_independent))
            df_features.to_csv(index=False, path_or_buf=file_out, sep=';', float_format='%.4f')
    return df_features
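The evaluation of the p-values is delegated to benjamini_hochberg_test above; for orientation, here is a minimal, self-contained sketch of the Benjamini Hochberg procedure the docstring describes (not the tsfresh implementation): sort the p-values, compare the k-th smallest against k / m * q, where m is the number of hypotheses and q the FDR level, and reject every hypothesis up to the largest k that passes. The function name and example p-values are made up for illustration.

import numpy as np
import pandas as pd


def benjamini_hochberg_reject(p_values, fdr_level=0.05):
    # Return a boolean Series marking which hypotheses are rejected at the given FDR level.
    p = pd.Series(p_values, dtype=float)
    m = len(p)
    ordered = p.sort_values()
    thresholds = fdr_level * np.arange(1, m + 1) / m
    below = ordered.values <= thresholds

    rejected = pd.Series(False, index=p.index)
    if below.any():
        # Reject everything up to the largest k for which p_(k) <= k / m * q.
        last_k = int(np.max(np.nonzero(below)[0]))
        rejected.loc[ordered.index[:last_k + 1]] = True
    return rejected


# With q = 0.05 and five features, only the two clearly significant p-values survive.
print(benjamini_hochberg_reject({"f1": 0.001, "f2": 0.008, "f3": 0.04, "f4": 0.3, "f5": 0.9}))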