def select_relevant_features(self, X, y):
    """Select the features that are flagged relevant for every class in ``y``.

    For each distinct label in ``y`` a binary target ``y == label`` is built
    and a relevance table is computed with ``calculate_relevance_table``. The
    per-class tables are concatenated, passed through
    ``benjamini_hochberg_test`` and only the rows flagged ``relevant`` are
    kept. A feature is selected when it remains relevant for all classes.

    :param X: feature matrix handed to ``calculate_relevance_table``; only
        ``X.shape[1]`` is read directly here (for the summary log line).
    :param y: target labels; anything accepted by ``np.unique`` that supports
        element-wise ``==`` (e.g. a pandas Series or a NumPy array).
    :return: DataFrame with columns ``feature``, ``p_value`` (largest p-value
        of the feature across the per-class tables) and ``occurrence``,
        sorted by ``p_value`` then ``occurrence`` with a fresh integer index.
    """
    # Compute the class labels once so the per-class loop and the
    # "relevant for all classes" threshold below cannot disagree.
    labels = np.unique(y)
    n_classes = len(labels)

    # One relevance table per one-vs-rest binary target.
    relevance_tables = [
        calculate_relevance_table(
            X, y == label, fdr_level=self.fdr_level, n_jobs=self.n_jobs)
        for label in labels
    ]

    # Concatenate the per-class tables and run the Benjamini-Hochberg test on
    # the combined table.
    relevance_table_concat = pd.concat(relevance_tables)
    relevance_table_benjamini = benjamini_hochberg_test(
        relevance_table_concat,
        hypotheses_independent=False,
        fdr_level=self.fdr_level)

    # Drop the rows that were not flagged relevant. The explicit comparison is
    # kept so that any non-boolean entry in the column is treated as "not
    # relevant" instead of being used as a mask value.
    relevance_table_benjamini = relevance_table_benjamini[
        relevance_table_benjamini.relevant == True]  # noqa: E712

    # Keep only the features that stayed relevant for every class.
    feature_occurrences = relevance_table_benjamini.feature.value_counts()
    relevant_features = feature_occurrences[
        feature_occurrences == n_classes].index.values

    # Report how many features were relevant for 1, 2, ... classes. Only the
    # occurrence counts that actually appear are logged.
    occurrence_counts = feature_occurrences.value_counts()
    for n_times, n_features in occurrence_counts.sort_index().items():
        logging.info(
            'Number of features occurred {} time(s) in the relevant features selected after benjamini hochberg test: {}'
            .format(n_times, n_features))

    # Build the final relevance table.
    # NOTE(review): the ``.loc[f]`` lookup assumes the relevance table is
    # indexed by feature name -- confirm against calculate_relevance_table.
    relevance_table_final = pd.DataFrame({
        'feature': relevant_features,
        'p_value': [
            relevance_table_benjamini.loc[f].p_value.max()
            for f in relevant_features
        ],
        'occurrence': [feature_occurrences[f] for f in relevant_features]
    }).sort_values(by=['p_value', 'occurrence']).reset_index(drop=True)

    logging.info(
        "Number of relevant features for all classes: {}/{}".format(
            relevance_table_final.shape[0], X.shape[1]))
    return relevance_table_final
def _calculate_relevance_table_for_implicit_target(table_real, table_binary, X,
                                                   test_real_feature, test_binary_feature,
                                                   hypotheses_independent, fdr_level,
                                                   map_function):
    """Fill in p-values for both feature tables and run the Benjamini-Hochberg test.

    For each table, the columns of ``X`` named by the table's index are
    collected into a list and handed, together with the matching test
    function, to ``map_function``. The result is stored in the table's
    ``p_value`` column, aligned on the table's index. Both tables are
    modified in place.

    :param table_real: table whose index names the columns of ``X`` tested
        with ``test_real_feature``.
    :param table_binary: table whose index names the columns of ``X`` tested
        with ``test_binary_feature``.
    :param X: container indexable by feature name (``X[feature]``).
    :param test_real_feature: test passed to ``map_function`` for ``table_real``.
    :param test_binary_feature: test passed to ``map_function`` for ``table_binary``.
    :param hypotheses_independent: forwarded to ``benjamini_hochberg_test``.
    :param fdr_level: forwarded to ``benjamini_hochberg_test``.
    :param map_function: callable invoked as ``map_function(test, columns)``;
        presumably applies ``test`` to every column -- confirm with the caller.
    :return: whatever ``benjamini_hochberg_test`` returns for the concatenation
        of ``table_real`` and ``table_binary``.
    """
    # Real-valued features first, then binary ones -- same order as the final
    # concatenation.
    table_and_test = (
        (table_real, test_real_feature),
        (table_binary, test_binary_feature),
    )
    for table, feature_test in table_and_test:
        columns = [X[name] for name in table.index]
        p_values = map_function(feature_test, columns)
        table['p_value'] = pd.Series(p_values, index=table.index)

    combined = pd.concat([table_real, table_binary])
    return benjamini_hochberg_test(combined, hypotheses_independent, fdr_level)