def test_init(self):
    selector = SelectKBest(score_func=f_regression, k=1)
    selector.fit(numpy.array([[0, 0], [1.0, 2.0]]), numpy.array([0.5, 1.0]))
    self.assertEqual([0, 1], selector._get_support_mask().tolist())
    selector_proxy = SelectorProxy(selector)
    self.assertEqual([0, 1], selector_proxy.support_mask_.tolist())
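# The test above relies on the private `_get_support_mask()` helper and a
# repo-specific `SelectorProxy`. A minimal self-contained sketch of the same
# check through scikit-learn's public `get_support()` API (the toy data below
# is an assumption, not the original fixture):
import numpy
from sklearn.feature_selection import SelectKBest, f_regression

X_toy = numpy.array([[1.0, 0.0], [2.0, 1.0], [3.1, 0.5], [3.9, 2.0]])
y_toy = numpy.array([1.0, 2.0, 3.0, 4.0])
sel = SelectKBest(score_func=f_regression, k=1)
sel.fit(X_toy, y_toy)
# get_support() returns the boolean mask; indices=True returns column indices.
assert sel.get_support().tolist() == [True, False]
assert sel.get_support(indices=True).tolist() == [0]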
def feature_selection_with_scikit():
    """
    1. VarianceThreshold is a simple baseline approach to feature selection:
       it removes all features whose variance does not meet some threshold.
       By default it removes all zero-variance features, i.e. features that
       have the same value in all samples.
    2. Univariate feature selection works by selecting the best features
       based on univariate statistical tests. It can be seen as a
       preprocessing step for an estimator.
    """
    # Drop near-constant features; a boolean feature that is 1 in a fraction
    # p of the samples has variance p * (1 - p).
    p = 0.8
    selector = VarianceThreshold(threshold=(p * (1 - p)))
    c = selector.fit_transform(X)
    print("Number of attributes before: ", X.shape[1])
    print("Number of attributes after: ", c.shape[1])

    # Select the k best attributes; instead of chi2, f_classif can also be used.
    skb = SelectKBest(chi2, k=10)
    X_new = skb.fit_transform(X, y)
    attr = np.where(skb._get_support_mask(), attributeNames, '-1')
    print("Best attributes chosen with SelectKBest:")
    i = 1
    for att in attr:
        if att != '-1':
            print(i, ": ", att)
            i += 1

    # Using ExtraTreesClassifier to rank features by importance.
    print("Using feature importance...")
    etc = ExtraTreesClassifier()
    etc.fit(X, y)
    print(etc.feature_importances_)
    print(etc.max_features, etc.max_depth)  # constructor hyperparameters

    print("Recursive feature selection:")
    from sklearn.svm import SVC
    from sklearn.model_selection import StratifiedKFold
    from sklearn.feature_selection import RFECV

    # Create the RFE object and compute a cross-validated score. The
    # "accuracy" scoring is proportional to the number of correct
    # classifications and requires a classifier, so a linear SVC is used.
    estim = SVC(kernel="linear")
    rfecv = RFECV(estimator=estim, step=1, cv=StratifiedKFold(n_splits=2),
                  scoring='accuracy')
    rfecv.fit(X, y)
    print("Optimal number of features : %d" % rfecv.n_features_)

    # Plot number of features vs. cross-validation scores.
    scores = rfecv.cv_results_["mean_test_score"]
    plt.figure()
    plt.xlabel("Number of features selected")
    plt.ylabel("Cross-validation score (mean accuracy)")
    plt.plot(range(1, len(scores) + 1), scores)
    plt.show()
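# A minimal sketch of the module-level names the function above assumes
# (X, y, attributeNames, np, plt, and the sklearn classes). The digits
# dataset and the pixel-based attribute names are assumptions made here so
# the snippet can run end to end:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import load_digits
from sklearn.feature_selection import VarianceThreshold, SelectKBest, chi2
from sklearn.ensemble import ExtraTreesClassifier

digits = load_digits()
X, y = digits.data, digits.target  # 64 nonnegative pixel features, fine for chi2
attributeNames = np.array(["pixel_%d" % j for j in range(X.shape[1])])

feature_selection_with_scikit()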
def select_master_unigram(t_categories_file, vocabulary_file, X_train_file, master_unigram_file):
    # Pickled inputs must be opened in binary mode.
    t_categories = pickle.load(open(t_categories_file, "rb"))
    vocabulary = pickle.load(open(vocabulary_file, "rb"))
    X_train = pickle.load(open(X_train_file, "rb"))
    print("Selecting master unigrams by a chi-squared test")
    ch2 = SelectKBest(chi2, k=1500)
    ch2.fit(X_train, t_categories)
    mask = ch2._get_support_mask()
    master_unigram_index = [i for i, e in enumerate(mask) if e]
    # Invert the vocabulary (term -> index) so indices map back to terms.
    inv_vocabulary = {v: k for k, v in vocabulary.items()}
    master_unigram = [inv_vocabulary[x] for x in master_unigram_index]
    pickle.dump(master_unigram, open(master_unigram_file, "wb"))
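# The mask-to-index loop above can be collapsed with the public scikit-learn
# API; a minimal equivalent sketch (ch2 fitted as above):
def selected_indices(ch2):
    # get_support(indices=True) returns the integer indices of the selected
    # features directly, so no manual scan of the boolean mask is needed.
    return ch2.get_support(indices=True).tolist()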
def select_features(self, X, y):
    # k is keyword-only in current scikit-learn releases.
    estimator = SelectKBest(chi2, k=700)
    estimator.fit(X, y.A1)  # .A1 flattens a numpy matrix to a 1-D array
    support_mask = estimator._get_support_mask()
    # Collect the column indices of the selected features.
    features = [i for i, selected in enumerate(support_mask) if selected]
    self.kbest_features = features
    self.num_features = len(features)
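# Downstream use: the stored indices pick out exactly the columns that the
# fitted selector's transform() would return. A sketch, with `model` a
# hypothetical instance on which select_features() was called:
#
#     model.select_features(X, y)
#     X_reduced = X[:, model.kbest_features]  # same columns as SelectKBest.transform(X)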
def select_best(k, data, vocabulary=None):
    """
    Select the top ``k`` most informative features (using a chi-square test)
    and drop everything else from the index and vocabulary.

    :param k: integer; the most informative features to maintain
    :param data: ``Data`` structure
    :param vocabulary: vocabulary dictionary (will be updated in-place)
    :return: a new ``Data`` structure
    """
    L.debug("selecting K=%s best features", k)
    selector = SelectKBest(chi2, k=k)
    selector.fit(data.index, data.labels)
    mask = selector._get_support_mask()
    data = data._replace(index=data.index[:, mask])
    prune(vocabulary, mask)
    return data
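# `prune` is referenced above but not shown. A minimal sketch of what it is
# assumed to do: keep only the vocabulary entries whose feature survives the
# mask and renumber them to match the pruned index columns (the repo's actual
# implementation may differ):
def prune(vocabulary, mask):
    if vocabulary is None:
        return
    # Map each surviving old column index to its new, compacted index.
    remap = {old: new for new, old in
             enumerate(i for i, keep in enumerate(mask) if keep)}
    for term in list(vocabulary):
        old = vocabulary[term]
        if old in remap:
            vocabulary[term] = remap[old]  # renumber surviving term
        else:
            del vocabulary[term]           # term's feature was dropped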