def apply_feature_selection(X_train, y_train, X_test, features):
    # Tree-based selection: rank features by random forest importance
    if CONFIG['preprocessing']['use_feature_selection'] == 'random_forest':
        clf = RandomForestClassifier()
        clf = clf.fit(X_train.toarray(), y_train)
        features_scores = [(feature, score) for (score, feature) in sorted(
            zip(clf.feature_importances_, features), reverse=True)]
        selected_features = features_scores[
            :CONFIG['preprocessing']['top_features_to_select']]
        # searchsorted assumes `features` is sorted (true for a vectorizer vocabulary)
        selected_indices = np.searchsorted(
            features, [f[0] for f in selected_features])
        X_train = X_train[:, selected_indices]
        X_test = X_test[:, selected_indices]
        return X_train, y_train, X_test, selected_features

    # Univariate selection: chi-squared or ANOVA F-test scoring
    if CONFIG['preprocessing']['use_feature_selection'] == 'chi2':
        algorithm = chi2
    elif CONFIG['preprocessing']['use_feature_selection'] == 'ANOVA':
        algorithm = f_classif
    else:
        raise ValueError("No implementation for "
                         + str(CONFIG['preprocessing']['use_feature_selection']))
    feature_selector = SelectKBest(
        algorithm, k=CONFIG['preprocessing']['top_features_to_select'])
    # fit_transform fits the selector itself, so the separate fit() call was redundant
    X_train = feature_selector.fit_transform(X_train, y_train)
    X_test = feature_selector.transform(X_test)
    features = [(feature, score) for (score, feature) in sorted(
        zip(feature_selector.scores_, features), reverse=True)]
    selected_features = features[
        :CONFIG['preprocessing']['top_features_to_select']]
    return X_train, y_train, X_test, selected_features
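# Hedged usage sketch, not from the original source: apply_feature_selection
# depends on a module-level CONFIG dict and on numpy/scikit-learn imports, so
# the toy corpus, CONFIG values, and variable names below are all assumptions.
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectKBest, chi2, f_classif
from sklearn.feature_extraction.text import TfidfVectorizer

CONFIG = {'preprocessing': {'use_feature_selection': 'chi2',
                            'top_features_to_select': 2}}
vec = TfidfVectorizer()
X_tr = vec.fit_transform(["spam spam ham", "ham eggs", "spam offer", "eggs ham"])
X_te = vec.transform(["spam eggs"])
# TF-IDF values are non-negative, as the chi2 score function requires, and the
# vectorizer's vocabulary is already sorted, as np.searchsorted assumes
X_tr, y_tr, X_te, top = apply_feature_selection(
    X_tr, [1, 0, 1, 0], X_te, np.array(vec.get_feature_names_out()))
print(top)  # [(word, score), ...] for the two best-scoring features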
def fit(self, k=100, percent=None):
    # Decide how many features to keep before constructing the selector
    if k is not None:
        select = k
    elif percent is not None:
        select = int(len(self.words) * percent)
    else:
        raise ValueError('One of `k` or `percent` parameter must be not None.')
    selector = SelectKBest(k=select)
    selector.fit(self.doc_vecs.todense(), np.asarray(self.labels))
    scores = selector.scores_
    # argsort ranks ascending, so reverse it to take the highest-scoring features
    indices = np.argsort(scores)[::-1][:select]
    self._filtered_words = [self.words[i] for i in indices]
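# Hedged sketch of the class context the snippet above omits: the method reads
# self.doc_vecs (a sparse document-term matrix), self.labels, and self.words
# (the vocabulary, aligned with the matrix columns). The class name and
# constructor are assumptions for illustration only.
class WordFilter:
    def __init__(self, doc_vecs, labels, words):
        self.doc_vecs = doc_vecs  # scipy sparse matrix, shape (n_docs, n_words)
        self.labels = labels      # one class label per document
        self.words = words        # words[i] names column i of doc_vecs
        self._filtered_words = []

    fit = fit  # bind the function defined above as a method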
from sklearn.feature_selection import SelectKBest, chi2

def feature_reduce(X, Y, num_features_to_keep):
    # Use the chi-squared method to reduce features and reshape data
    test = SelectKBest(score_func=chi2, k=num_features_to_keep)
    fit = test.fit(X, Y)
    # Return the data with reduced features
    return fit.transform(X)
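# Hedged usage example, not from the original source: chi2 requires
# non-negative feature values, which the iris measurements satisfy.
from sklearn.datasets import load_iris
iris = load_iris()
X_reduced = feature_reduce(iris.data, iris.target, num_features_to_keep=2)
print(X_reduced.shape)  # (150, 2)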
import re
import numpy as np
from sklearn.feature_selection import SelectKBest

def feature_selection(feat_select, X, y):
    """Implements various kinds of feature selection."""
    # K-best: e.g. '20-best' keeps the 20 highest-scoring features
    if re.match('.*-best', feat_select) is not None:
        n = int(feat_select.split('-')[0])
        selector = SelectKBest(k=n)
        import warnings
        with warnings.catch_warnings():
            warnings.simplefilter('ignore', category=UserWarning)
            features_selected = np.where(selector.fit(X, y).get_support())[0]
    # Random baseline: e.g. '20-randombest' keeps 20 features chosen at random
    elif re.match('.*-randombest', feat_select) is not None:
        n = int(feat_select.split('-')[0])
        from random import shuffle
        features = list(range(X.shape[1]))  # shuffle needs a mutable sequence
        shuffle(features)
        features_selected = features[:n]
    else:
        raise ValueError('Unknown feature selection method: ' + feat_select)
    return features_selected
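# Hedged usage example, not from the original source, showing the string
# convention the function parses: '<n>-best' and '<n>-randombest'.
from sklearn.datasets import load_iris
iris = load_iris()
print(feature_selection('2-best', iris.data, iris.target))        # e.g. [2 3]
print(feature_selection('2-randombest', iris.data, iris.target))  # 2 random column indices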
from sklearn.feature_selection import SelectKBest, f_classif

def feature_reduce_f_class_if(X, Y, num_features_to_keep):
    # Use the ANOVA F-test to score features and keep the top k
    test = SelectKBest(score_func=f_classif, k=num_features_to_keep)
    fit = test.fit(X, Y)
    # Return the data with reduced features
    return fit.transform(X)
import numpy as np
from numpy.testing import assert_array_equal
from sklearn.datasets import make_classification
from sklearn.feature_selection import (SelectKBest, GenericUnivariateSelect,
                                       f_classif)

def test_select_kbest_classif():
    """
    Test whether the relative univariate feature selection
    gets the correct items in a simple classification problem
    with the k best heuristic
    """
    X, Y = make_classification(n_samples=200, n_features=20, n_informative=3,
                               n_redundant=2, n_repeated=0, n_classes=8,
                               n_clusters_per_class=1, flip_y=0.0,
                               class_sep=10, shuffle=False, random_state=0)
    univariate_filter = SelectKBest(f_classif, k=5)
    X_r = univariate_filter.fit(X, Y).transform(X)
    X_r2 = GenericUnivariateSelect(f_classif, mode='k_best',
                                   param=5).fit(X, Y).transform(X)
    assert_array_equal(X_r, X_r2)
    # With shuffle=False the informative/redundant columns come first,
    # so exactly the first five features should be selected
    support = univariate_filter.get_support()
    gtruth = np.zeros(20)
    gtruth[:5] = 1
    assert_array_equal(support, gtruth)
def get_best_features(self, data, labels, k=3):
    '''
    Using the scikit-learn library, narrow down feature set.
    '''
    num_feat = len(data.columns)
    # Halve the feature set on each pass until only k features remain
    while num_feat > k:
        num_feat = max(k, num_feat // 2)
        selector = SelectKBest(f_classif, k=num_feat)
        selector.fit(data, labels)
        chosen = selector.get_support()
        # pvalues_ is the public attribute holding the univariate p-values
        if sum(selector.pvalues_[chosen]) > 0:
            data = data[data.columns[chosen]]
        else:
            # Many of our p-vals are zero. Accept all.
            data = data[data.columns[selector.pvalues_ == 0]]
            num_feat = k
    return data.columns
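# Hedged usage example, not from the original source: the method expects a
# pandas DataFrame of features; since `self` is never used, a stand-in object
# is enough to exercise it outside its original class.
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.feature_selection import SelectKBest, f_classif
iris = load_iris()
df = pd.DataFrame(iris.data, columns=iris.feature_names)
print(list(get_best_features(object(), df, iris.target, k=2)))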
import re
import numpy as np
from sklearn.feature_selection import SelectKBest

def feature_selection(feat_select, X, y):
    """Implements various kinds of feature selection."""
    # K-best: e.g. '20-best' keeps the 20 highest-scoring features
    if re.match('.*-best', feat_select) is not None:
        n = feat_select.split('-')[0]
        selector = SelectKBest(k=int(n))
        features_selected = np.where(selector.fit(X, y).get_support())[0]
    else:
        raise ValueError('Unknown feature selection method: ' + feat_select)
    return features_selected
import numpy as np
from numpy.testing import assert_array_equal
from sklearn.datasets import make_regression
from sklearn.feature_selection import (SelectKBest, GenericUnivariateSelect,
                                       f_regression)

def test_select_kbest_regression():
    """
    Test whether the relative univariate feature selection
    gets the correct items in a simple regression problem
    with the k best heuristic
    """
    X, Y = make_regression(n_samples=200, n_features=20, n_informative=5,
                           shuffle=False, random_state=0)
    univariate_filter = SelectKBest(f_regression, k=5)
    X_r = univariate_filter.fit(X, Y).transform(X)
    X_r2 = GenericUnivariateSelect(f_regression, mode='k_best',
                                   param=5).fit(X, Y).transform(X)
    assert_array_equal(X_r, X_r2)
    # With shuffle=False the informative columns come first,
    # so exactly the first five features should be selected
    support = univariate_filter.get_support()
    gtruth = np.zeros(20)
    gtruth[:5] = 1
    assert_array_equal(support, gtruth)
    # tail of the loop that fills the per-category test arrays
    y_test[idx_start:idx_end] = cat
    idx_start += N_test

print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)
print("start classification")

# vectorization
vectorizer = TfidfVectorizer(strip_accents="unicode", ngram_range=(1, 1))
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

# feature reduction; k="all" keeps every feature, so lower k to actually reduce.
# fit_transform already fits the selector, so no separate fit() call is needed.
ch2 = SelectKBest(chi2, k="all")
X_train = ch2.fit_transform(X_train, y_train)
X_test = ch2.transform(X_test)

# training
clf = LinearSVC()
clf.fit(X_train, y_train)

if validation_mode == "train":
    X_test = X_train
    y_test = y_train

# predict categories and report accuracy
predicted = clf.predict(X_test)
print(numpy.mean(predicted == y_test))
from sklearn import datasets
# SelectKBest and chi2 live in the public sklearn.feature_selection namespace;
# the univariate_selection submodule is a private implementation detail
from sklearn.feature_selection import SelectKBest, chi2

iris = datasets.load_iris()

# Score all four iris features with chi2 and keep the two best
k_best0 = SelectKBest(score_func=chi2, k=2)
fit = k_best0.fit(iris.data, iris.target)
print(fit.scores_)
features = fit.transform(iris.data)
print(features)

# fit_transform fits and reduces in one call; k=4 keeps every feature here
k_best1 = SelectKBest(score_func=chi2, k=4)
newX = k_best1.fit_transform(iris.data, iris.target)
print(newX)