Example 1
    def select_relevant_features(self, X, y):
        '''Select features that are statistically relevant for every class and return them with their p-values and occurrence counts.'''
        # calculate relevance tables for each binary class pair
        relevance_tables = list()
        for label in np.unique(y):
            y_binary = (y == label)
            relevance_tables.append(
                (label,
                 calculate_relevance_table(X,
                                           y_binary,
                                           fdr_level=self.fdr_level,
                                           n_jobs=self.n_jobs)))

        # concatenate relevance tables
        relevance_table_concat = pd.concat(
            [table for (_, table) in relevance_tables])

        # perform benjamini hochberg test
        relevance_table_benjamini = benjamini_hochberg_test(
            relevance_table_concat,
            hypotheses_independent=False,
            fdr_level=self.fdr_level)

        # remove irrelevant features from the table
        relevance_table_benjamini = relevance_table_benjamini[
            relevance_table_benjamini.relevant]

        # keep only features found relevant for every one-vs-rest class problem
        feature_occurrences = relevance_table_benjamini.feature.value_counts()
        relevant_features = feature_occurrences[
            feature_occurrences == len(np.unique(y))].index.values
        occurrence_counts = feature_occurrences.value_counts()
        for i in range(1, 4):
            try:
                logging.info(
                    'Number of features occurring {} time(s) among the relevant '
                    'features selected after the Benjamini-Hochberg test: {}'.format(
                        i, occurrence_counts[i]))
            except (KeyError, IndexError):
                # no feature occurs exactly this number of times
                pass
        # build final relevance table
        relevance_table_final = pd.DataFrame({
            'feature':
            relevant_features,
            'p_value': [
                relevance_table_benjamini.loc[f].p_value.max()
                for f in relevant_features
            ],
            'occurrence': [feature_occurrences[f] for f in relevant_features]
        }).sort_values(by=['p_value', 'occurrence']).reset_index(drop=True)
        logging.info(
            "Number of relevant features for all classes: {}/{}".format(
                relevance_table_final.shape[0], X.shape[1]))

        return relevance_table_final
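
A minimal sketch of the per-class relevance step that the method above loops over, calling tsfresh's calculate_relevance_table directly. The toy feature matrix, target, and the fdr_level/n_jobs values are assumptions made up for illustration; only the function itself comes from tsfresh.

import numpy as np
import pandas as pd
from tsfresh.feature_selection.relevance import calculate_relevance_table

# Toy data: 90 samples, 5 candidate features, a three-class target (all made up).
rng = np.random.default_rng(0)
X = pd.DataFrame(rng.normal(size=(90, 5)),
                 columns=['feature_{}'.format(i) for i in range(5)])
y = pd.Series(np.repeat([0, 1, 2], 30))
X['feature_0'] += 3 * y.to_numpy()  # make one feature genuinely informative

# One-vs-rest relevance table for a single class, mirroring one loop iteration above.
y_binary = (y == 0)
table = calculate_relevance_table(X, y_binary, fdr_level=0.05, n_jobs=0)
print(table[['feature', 'p_value', 'relevant']])

The returned table is indexed by feature name and carries a per-feature p_value plus a boolean relevant flag; the method above concatenates one such table per class, re-runs the Benjamini-Hochberg test on the combined p-values, and keeps only the features flagged as relevant for every class.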
Example 2
def _calculate_relevance_table_for_implicit_target(table_real, table_binary, X, test_real_feature, test_binary_feature,
                                                   hypotheses_independent, fdr_level, map_function):
    '''Compute p-values for the real-valued and binary feature tables with the
    supplied significance tests, combine them and run the Benjamini-Hochberg
    procedure to flag the relevant features.'''
    # p-values for the real-valued features
    table_real['p_value'] = pd.Series(
        map_function(test_real_feature, [X[feature] for feature in table_real.index]),
        index=table_real.index
    )
    # p-values for the binary features
    table_binary['p_value'] = pd.Series(
        map_function(test_binary_feature, [X[feature] for feature in table_binary.index]),
        index=table_binary.index
    )
    # combine both tables and control the false discovery rate at fdr_level
    relevance_table = pd.concat([table_real, table_binary])
    return benjamini_hochberg_test(relevance_table, hypotheses_independent, fdr_level)
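
Both examples finish by handing a table of per-feature p-values to benjamini_hochberg_test. The sketch below runs that last step in isolation on a hand-built table; the feature names and p-values are invented, and because the function has moved between modules across tsfresh releases, the import is wrapped in a try/except rather than asserting a single location.

import pandas as pd

try:  # newer tsfresh releases
    from tsfresh.feature_selection.relevance import benjamini_hochberg_test
except ImportError:  # older releases kept it in a dedicated module
    from tsfresh.feature_selection.benjamini_hochberg_test import benjamini_hochberg_test

# Hand-built relevance table: one row per candidate feature with its p-value
# (values are made up for illustration).
relevance_table = pd.DataFrame({
    'feature': ['feature_0', 'feature_1', 'feature_2'],
    'p_value': [0.001, 0.20, 0.04]
}).set_index('feature', drop=False)

# The procedure sorts the p-values and flags each feature as relevant or not,
# controlling the expected false discovery rate at fdr_level.
result = benjamini_hochberg_test(relevance_table,
                                 hypotheses_independent=False,
                                 fdr_level=0.05)
print(result[['feature', 'p_value', 'relevant']])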