Code example #1
    def test_fs_tb_fr_series(self):
        with pytest.raises(TypeError):
            target_binary_feature_real_test(x=[0, 1, 2],
                                            y=pd.Series([0, 1, 2]))

        with pytest.raises(TypeError):
            target_binary_feature_real_test(x=pd.Series([0, 1, 2]),
                                            y=[0, 1, 2])
Code example #2
    def test_fs_tb_fr_config(self):
        # Unneeded data (the function call is expected to fail anyway)
        x = pd.Series(np.random.normal(0, 1, 250), name="TEST")
        y = pd.Series(np.random.binomial(1, 0.5, 250))

        with pytest.raises(ValueError):
            target_binary_feature_real_test(x=x,
                                            y=y,
                                            test="other_unknown_function")
Code example #3
    def test_fs_tb_fr_binary(self):
        with pytest.raises(ValueError):
            target_binary_feature_real_test(
                x=pd.Series([0, 1, 2]),
                y=pd.Series([0, 1, 2]),
                test=TEST_FOR_BINARY_TARGET_REAL_FEATURE)

        # Should not fail
        target_binary_feature_real_test(
            x=pd.Series([0, 1, 2]),
            y=pd.Series([0, 2, 0]),
            test=TEST_FOR_BINARY_TARGET_REAL_FEATURE)
Code example #4
def _calculate_p_value(feature_column, y, settings, target_is_binary):
    """
    Internal helper function to calculate the p-value of a given feature using one of the dedicated
    functions target_*_feature_*_test.

    :param feature_column: the feature column.
    :type feature_column: pandas.Series

    :param y: the target vector
    :type y: pandas.Series

    :param settings: The settings object to control how the significance is calculated.
    :type settings: FeatureSignificanceTestsSettings

    :param target_is_binary: Whether the target is binary or not
    :type target_is_binary: bool

    :return: the p-value of the feature significance test and the type of the tested feature as a Series.
             Lower p-values indicate a higher feature significance.
    :rtype: pd.Series
    """
    # Do not process constant features
    if len(pd.unique(feature_column.values)) == 1:
        _logger.warning(
            "[test_feature_significance] Feature {} is constant".format(
                feature_column.name))
        return pd.Series({
            "type": "const",
            "rejected": False
        },
                         name=feature_column.name)

    else:
        if target_is_binary:
            # Decide if the current feature is binary or not
            if len(set(feature_column.values)) == 2:
                type = "binary"
                p_value = target_binary_feature_binary_test(
                    feature_column, y, settings)
            else:
                type = "real"
                p_value = target_binary_feature_real_test(
                    feature_column, y, settings)
        else:
            # Decide if the current feature is binary or not
            if len(set(feature_column.values)) == 2:
                type = "binary"
                p_value = target_real_feature_binary_test(
                    feature_column, y, settings)
            else:
                type = "real"
                p_value = target_real_feature_real_test(
                    feature_column, y, settings)

        return pd.Series({
            "p_value": p_value,
            "type": type
        },
                         name=feature_column.name)
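
For orientation, the following is a minimal sketch of how a driver loop might apply _calculate_p_value to every column of a feature matrix and collect the returned Series into one DataFrame. The feature matrix X, target y, and the settings object below are illustrative stand-ins and not part of the code above; _calculate_p_value and FeatureSignificanceTestsSettings are assumed to already be in scope.

import numpy as np
import pandas as pd

# Hypothetical inputs: 100 samples, one real-valued and one binary feature, binary target
X = pd.DataFrame({"f_real": np.random.normal(0, 1, 100),
                  "f_binary": np.random.binomial(1, 0.5, 100)})
y = pd.Series(np.random.binomial(1, 0.5, 100))
settings = FeatureSignificanceTestsSettings()  # assumed to be in scope, as in the other examples

target_is_binary = len(set(y)) == 2

# One named Series ("p_value", "type") per feature column, stacked into a DataFrame
rows = [_calculate_p_value(X[col], y, settings, target_is_binary) for col in X.columns]
df_features = pd.DataFrame(rows)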
Code example #5
def _calculate_p_value(feature_column, y, target_is_binary,
                       test_for_binary_target_real_feature):
    """
    Internal helper function to calculate the p-value of a given feature using one of the dedicated
    functions target_*_feature_*_test.

    :param feature_column: the feature column.
    :type feature_column: pandas.Series

    :param y: the target vector
    :type y: pandas.Series

    :param target_is_binary: Whether the target is binary or not
    :type target_is_binary: bool

    :param test_for_binary_target_real_feature: The significance test to be used for binary target and real valued
                                                features. Either ``'mann'`` for the Mann-Whitney-U test or ``'smir'``
                                                for the Kolmogorov-Smirnov test.
    :type test_for_binary_target_real_feature: str

    :return: the p-value of the feature significance test and the type of the tested feature as a Series.
             Lower p-values indicate a higher feature significance.
    :rtype: pd.Series
    """
    # Do not process constant features
    if len(pd.unique(feature_column.values)) == 1:
        _logger.warning(
            "[test_feature_significance] Feature {} is constant".format(
                feature_column.name))
        return pd.Series({
            "type": "const",
            "rejected": False
        },
                         name=feature_column.name)

    else:
        if target_is_binary:
            # Decide if the current feature is binary or not
            if len(set(feature_column.values)) == 2:
                type = "binary"
                p_value = target_binary_feature_binary_test(feature_column, y)
            else:
                type = "real"
                p_value = target_binary_feature_real_test(
                    feature_column, y, test_for_binary_target_real_feature)
        else:
            # Decide if the current feature is binary or not
            if len(set(feature_column.values)) == 2:
                type = "binary"
                p_value = target_real_feature_binary_test(feature_column, y)
            else:
                type = "real"
                p_value = target_real_feature_real_test(feature_column, y)

        return pd.Series({
            "p_value": p_value,
            "type": type
        },
                         name=feature_column.name)
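
The test_for_binary_target_real_feature argument documented above accepts 'mann' (Mann-Whitney U) or 'smir' (Kolmogorov-Smirnov). As a rough, non-authoritative sketch of what such a test does, expressed with SciPy and not taken from the tsfresh implementation:

from scipy import stats

def binary_target_real_feature_test_sketch(x, y, test="mann"):
    # Illustration only: split the real-valued feature x by the two classes of
    # the binary target y and compare the two resulting samples.
    x0, x1 = x[y == 0], x[y == 1]
    if test == "mann":
        # Mann-Whitney U test
        _, p_value = stats.mannwhitneyu(x1, x0, alternative="two-sided")
    elif test == "smir":
        # Two-sample Kolmogorov-Smirnov test
        _, p_value = stats.ks_2samp(x1, x0)
    else:
        raise ValueError("Unknown test: {}".format(test))
    return p_value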
Code example #6
 def test_feature_selection_target_binary_features_realvalued(self, minimal_p_value_for_unsignificant_features,
                                                                      real_feature,
                                                                      binary_target_not_related):
     """
     Test if the p_value returned by target_binary_feature_real_test is
     large enough for highly unsignificant features.
     """
     p_value = target_binary_feature_real_test(real_feature, binary_target_not_related,
                                               TEST_FOR_BINARY_TARGET_REAL_FEATURE)
     assert minimal_p_value_for_unsignificant_features < p_value
Code example #7
    def test_feature_selection_target_binary_features_realvalued_smir(self, maximal_p_value_for_significant_features,
                                                                           real_feature):
        """
        Test if the p_value returned by target_binary_feature_real_test is
        low enough for highly significant features.
        """
        y = pd.Series(np.zeros(250))
        y[real_feature >= 0.3] = 1
        y[real_feature < 0.3] = 0
        y -= pd.Series(np.random.binomial(1, 0.2, 250))
        y[y == -1] = 0
        y[y == 2] = 1

        p_value = target_binary_feature_real_test(real_feature, y, test="smir")
        assert maximal_p_value_for_significant_features > p_value
Code example #8
    def test_feature_selection_target_binary_features_realvalued_mann(self, maximal_p_value_for_significant_features,
                                                                           real_feature):
        """
        Test if the p_value returned by target_binary_feature_real_test is
        low enough for highly significant features.
        """
        y = pd.Series(np.zeros(250))
        y[real_feature >= 0.3] = 1
        y[real_feature < 0.3] = 0
        y -= pd.Series(np.random.binomial(1, 0.1, 250))
        y[y == -1] = 0
        y[y == 2] = 1

        p_value = target_binary_feature_real_test(real_feature, y, TEST_FOR_BINARY_TARGET_REAL_FEATURE)
        assert maximal_p_value_for_significant_features > p_value
Code example #9
File: test_checks.py Project: michetonu/tsfresh
 def test_check_feature_is_series(self, binary_series, real_series):
     with pytest.raises(TypeError):
         target_binary_feature_real_test(x=real_series, y=binary_series.values)
Code example #10
File: test_checks.py Project: michetonu/tsfresh
 def test_checks_target_nan(self, binary_series_with_nan, real_series):
     with pytest.raises(ValueError):
         target_binary_feature_real_test(x=real_series, y=binary_series_with_nan,
                                         test=TEST_FOR_BINARY_TARGET_REAL_FEATURE)
Code example #11
File: test_checks.py Project: michetonu/tsfresh
 def test_checks_test_function(self, binary_series, real_series):
     with pytest.raises(ValueError):
         target_binary_feature_real_test(x=real_series, y=binary_series, test="other_unknown_function")
Code example #12
File: feature_selector.py Project: zergey/tsfresh
def check_fs_sig_bh(X, y, settings=None):
    """
    The wrapper function that calls the significance test functions in this package.
    In total, a univariate feature significance test is conducted for each feature of the input pandas.DataFrame.
    Those tests generate p-values that are then evaluated by the Benjamini-Hochberg procedure to decide which
    features to keep and which to delete.

    We are testing
    
        :math:`H_0` = the Feature is not relevant and cannot be added

    against

        :math:`H_1` = the Feature is relevant and should be kept
   
    or in other words
 
        :math:`H_0` = Target and Feature are independent / the Feature has no influence on the target

        :math:`H_1` = Target and Feature are associated / dependent

    When the target is binary this becomes
    
        :math:`H_0 = \\left( F_{\\text{target}=1} = F_{\\text{target}=0} \\right)`

        :math:`H_1 = \\left( F_{\\text{target}=1} \\neq F_{\\text{target}=0} \\right)`
    
    Here :math:`F` is the conditional distribution of the feature, given the value of the target.

    In the same way we can state the hypothesis when the feature is binary
    
        :math:`H_0 =  \\left( T_{\\text{feature}=1} = T_{\\text{feature}=0} \\right)`

        :math:`H_1 = \\left( T_{\\text{feature}=1} \\neq T_{\\text{feature}=0} \\right)`

    Here :math:`T` is the conditional distribution of the target, given the value of the feature.

    TODO: And for real valued?

    :param X: The DataFrame containing all the features and the target
    :type X: pandas.DataFrame

    :param y: The target vector
    :type y: pandas.Series

    :param settings: The feature selection settings to use for performing the tests.
    :type settings: FeatureSignificanceTestsSettings

    :return: A pandas.DataFrame indexed by the columns of the input DataFrame X, with information on the
            significance of each feature. The DataFrame has the columns
            "Feature",
            "type" (binary, real or const),
            "p_value" (the significance of this feature as a p-value, lower means more significant)
            "rejected" (if the Benjamini Hochberg procedure rejected this feature)
    :rtype: pandas.DataFrame

    """
    if settings is None:
        settings = FeatureSignificanceTestsSettings()

    target_is_binary = len(set(y)) == 2

    # todo: solve the multiclass case. For a multiclass target the algorithm currently treats the target as a
    # regression target. Instead one could perform a binary one-versus-all classification.

    # Only allow entries for which the target is known!
    y = y.astype(float)
    X = X.copy().loc[y.notnull(), :]

    # Create the DataFrame df_features containing the information about the different hypotheses
    # Every row contains information over one feature column from X
    df_features = pd.DataFrame()

    # Build one row in df_features for every feature column of X
    df_features['Feature'] = list(set(X.columns))
    df_features = df_features.set_index('Feature', drop=False)

    # Don't process constant features
    for feature in df_features['Feature']:
        if len(pd.unique(X[feature])) == 1:
            df_features = df_features.drop(feature)
            _logger.warning(
                "[test_feature_significance] Feature {} is constant".format(
                    feature))

    # Add relevant columns to df_features
    df_features["type"] = np.nan
    df_features["p_value"] = np.nan
    df_features["rejected"] = np.nan

    # Process the features
    for feature in df_features['Feature']:
        if target_is_binary:
            # Decide if the current feature is binary or not
            if len(set(X[feature].values)) == 2:
                df_features.loc[df_features.Feature == feature,
                                "type"] = "binary"
                p_value = target_binary_feature_binary_test(
                    X[feature], y, settings)
            else:
                df_features.loc[df_features.Feature == feature,
                                "type"] = "real"
                p_value = target_binary_feature_real_test(
                    X[feature], y, settings)
        else:
            # Decide if the current feature is binary or not
            if len(set(X[feature].values)) == 2:
                df_features.loc[df_features.Feature == feature,
                                "type"] = "binary"
                p_value = target_real_feature_binary_test(
                    X[feature], y, settings)
            else:
                df_features.loc[df_features.Feature == feature,
                                "type"] = "real"
                p_value = target_real_feature_real_test(
                    X[feature], y, settings)

        # Add p_values to df_features
        df_features.loc[df_features['Feature'] == feature, "p_value"] = p_value

    # Check for constant features
    for feature in list(set(X.columns)):
        if len(pd.unique(X[feature])) == 1:
            df_features.loc[feature, "type"] = "const"
            df_features.loc[feature, "rejected"] = True

    # Perform the real feature rejection
    df_features = benjamini_hochberg_test(df_features, settings)

    if settings.write_selection_report:
        # Write results of BH - Test to file
        if not os.path.exists(settings.result_dir):
            os.mkdir(settings.result_dir)

        with open(os.path.join(settings.result_dir, "fs_bh_results.txt"),
                  'w') as file_out:
            file_out.write((
                "Performed BH Test to control the false discovery rate(FDR); \n"
                "FDR-Level={0};Hypothesis independent={1}\n").format(
                    settings.fdr_level, settings.hypotheses_independent))
            df_features.to_csv(index=False,
                               path_or_buf=file_out,
                               sep=';',
                               float_format='%.4f')

    return df_features
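
The docstring above refers to the Benjamini-Hochberg procedure applied by benjamini_hochberg_test. Below is a minimal sketch of that procedure on a plain vector of p-values; it ignores the hypotheses_independent correction and is not the tsfresh implementation.

import numpy as np

def benjamini_hochberg_sketch(p_values, fdr_level=0.05):
    # Compare the i-th smallest of the m p-values against its
    # Benjamini-Hochberg threshold (i / m) * fdr_level and reject every
    # hypothesis up to the largest i whose p-value lies below its threshold.
    p = np.asarray(p_values, dtype=float)
    m = len(p)
    order = np.argsort(p)
    thresholds = np.arange(1, m + 1) / m * fdr_level
    below = p[order] <= thresholds
    rejected = np.zeros(m, dtype=bool)
    if below.any():
        k = np.where(below)[0].max()
        rejected[order[:k + 1]] = True
    return rejected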
Code example #13
File: test_checks.py Project: SriRamaKusu/tsfresh
 def test_check_feature_is_series(self, binary_series, real_series):
     with pytest.raises(TypeError):
         target_binary_feature_real_test(x=real_series, y=binary_series.values)
Code example #14
File: test_checks.py Project: SriRamaKusu/tsfresh
 def test_checks_target_nan(self, binary_series_with_nan, real_series):
     with pytest.raises(ValueError):
         target_binary_feature_real_test(x=real_series, y=binary_series_with_nan,
                                         test=TEST_FOR_BINARY_TARGET_REAL_FEATURE)
Code example #15
File: test_checks.py Project: SriRamaKusu/tsfresh
 def test_checks_test_function(self, binary_series, real_series):
     with pytest.raises(ValueError):
         target_binary_feature_real_test(x=real_series, y=binary_series, test="other_unknown_function")
Code example #16
    logging.info("Loaded Features from: " + file)

    # look for rows in X where len == 1
    # this is needed due to a quirk in the data processing functions -
    # some of the time windows end up only including 1 step (1ms)
    if loop == 0:
        idx_to_remove = np.setdiff1d(y.index, X.index)
        y = y.drop(idx_to_remove)
    loop = 1
    # test each feature with Mann-Whitney U Test
    logging.info("Testing Hypothesis...")
    for feature in X:
        p = []
        try:
            p.append(
                target_binary_feature_real_test(X[feature], y_bin[0], 'mann'))
            p.append(
                target_binary_feature_real_test(X[feature], y_bin[1], 'mann'))
            p.append(
                target_binary_feature_real_test(X[feature], y_bin[2], 'mann'))
        except ValueError:
            p.append(1000)
            p.append(1000)
            p.append(1000)

        p.append(feature)
        p_vector.append(p)

# Save target variable
y = y.reset_index(0, drop=True)
y.to_hdf('data/ach_at_combined_y.h5', key='y', complevel=9)
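
As a hypothetical follow-up to the loop above, the collected p_vector could be turned into a DataFrame and the features ranked by their smallest p-value across the three binary targets; the column names below are illustrative and not part of the original script.

import pandas as pd

# Each row of p_vector is [p_y0, p_y1, p_y2, feature_name]; 1000 marks a failed test
df_p = pd.DataFrame(p_vector, columns=["p_y0", "p_y1", "p_y2", "feature"])
df_p["p_min"] = df_p[["p_y0", "p_y1", "p_y2"]].min(axis=1)
df_p = df_p.sort_values("p_min")  # most significant features first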