Example #1
def test_init(self):
    selector = SelectKBest(score_func=f_regression, k=1)
    selector.fit(numpy.array([[0, 0], [1.0, 2.0]]),
                 numpy.array([0.5, 1.0]))
    self.assertEqual([0, 1], selector._get_support_mask().tolist())
    selector_proxy = SelectorProxy(selector)
    self.assertEqual([0, 1], selector_proxy.support_mask_.tolist())
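The test above exercises the private `_get_support_mask()` helper (and a project-specific `SelectorProxy` wrapper). On released scikit-learn versions the public counterpart is `get_support()`, which returns the same boolean mask; a minimal sketch on synthetic data (not taken from the test above):

import numpy as np
from sklearn.feature_selection import SelectKBest, f_regression

rng = np.random.RandomState(0)
X = rng.rand(20, 3)
y = X[:, 1] + 0.05 * rng.rand(20)   # the target mostly tracks column 1

selector = SelectKBest(score_func=f_regression, k=1).fit(X, y)
# Public equivalent of _get_support_mask(): a boolean mask over the columns.
print(selector.get_support())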
Example #2
def feature_selection_with_scikit():
    """
    1. VarianceThreshold is a simple baseline approach to feature selection: it removes
       all features whose variance does not meet some threshold. By default it removes
       all zero-variance features, i.e. features that have the same value in all samples.
    2. Univariate feature selection keeps the best features according to univariate
       statistical tests. It can be seen as a preprocessing step before an estimator.
       (A self-contained sketch of both approaches follows this example.)
    """
    p = 0.8
    selector = VarianceThreshold(threshold=(p * (1 - p)))
    c = selector.fit_transform(X)
    print "Number of the attribute before: ", X.shape[1]
    print "number of the attribute after:", c.shape[1]

    # select the k best attributes (instead of chi2, f_classif can also be used)
    skb = SelectKBest(chi2, k=10)
    X_new = skb.fit_transform(X, y)
    attr = np.where(skb._get_support_mask(), attributeNames, '-1')

    print "Best attribute choosen with SelectKBest: "
    i = 1
    for att in attr:
        if att != '-1':
            print i, ": ", att
            i += 1

    # using ExtraTreesClassifier feature importances
    print("Using feature importance...")
    etc = ExtraTreesClassifier()
    etc.fit(X, y)
    print(etc.feature_importances_)
    print(etc.max_features)
    print(etc.max_depth)

    print "Recursive feature selection : "
    from sklearn.svm import SVC
    import sklearn.linear_model as lm
    from sklearn.cross_validation import StratifiedKFold
    from sklearn.feature_selection import RFECV
    # Create the RFE object and compute a cross-validated score.
    estim = lm.LinearRegression()
    # The "accuracy" scoring is proportional to the number of correct
    # classifications
    rfecv = RFECV(estimator=estim,
                  step=1,
                  cv=StratifiedKFold(y, 2),
                  scoring='accuracy')
    rfecv.fit(X, y)

    print("Optimal number of features : %d" % rfecv.n_features_)

    # Plot number of features vs. cross-validation scores
    plt.figure()
    plt.xlabel("Number of features selected")
    plt.ylabel("Cross-validation score (mean accuracy)")
    scores = rfecv.cv_results_["mean_test_score"]
    plt.plot(range(1, len(scores) + 1), scores)
    plt.show()
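The function above assumes `X`, `y`, and `attributeNames` are already defined elsewhere. Here is the self-contained sketch promised in the docstring, covering the same two ideas (variance thresholding, then univariate selection) on synthetic data with made-up feature names; everything below is illustrative and not taken from the original project:

import numpy as np
from sklearn.feature_selection import VarianceThreshold, SelectKBest, f_classif

rng = np.random.RandomState(0)
X = rng.randint(0, 2, size=(100, 6)).astype(float)   # six binary features
X[:, 0] = 1.0                                         # a zero-variance column
y = (X[:, 1] + X[:, 2] > 1).astype(int)               # target depends on columns 1 and 2
feature_names = np.array([f"f{i}" for i in range(X.shape[1])])   # made-up names

# 1. Remove low-variance features (threshold for a Bernoulli feature with p = 0.8).
p = 0.8
vt = VarianceThreshold(threshold=p * (1 - p))
X_vt = vt.fit_transform(X)
print("Attributes before/after VarianceThreshold:", X.shape[1], X_vt.shape[1])

# 2. Keep the k best of the remaining features according to a univariate test
#    (f_classif here; chi2 also works for non-negative features).
names_vt = feature_names[vt.get_support()]
skb = SelectKBest(f_classif, k=2).fit(X_vt, y)
print("Best attributes chosen with SelectKBest:", names_vt[skb.get_support()])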
Example #4
def select_master_unigram(t_categories_file, vocabulary_file, X_train_file,
                          master_unigram_file):
    t_categories = pickle.load(open(t_categories_file, "rb"))
    vocabulary = pickle.load(open(vocabulary_file, "rb"))
    X_train = pickle.load(open(X_train_file, "rb"))
    print("Selecting master unigrams by a chi-squared test")
    ch2 = SelectKBest(chi2, k=1500)
    ch2.fit(X_train, t_categories)
    mask = ch2._get_support_mask()
    master_unigram_index = [i for i, e in enumerate(mask) if e]
    inv_vocabulary = {v: k for k, v in vocabulary.items()}
    master_unigram = [inv_vocabulary[x] for x in master_unigram_index]
    pickle.dump(master_unigram, open(master_unigram_file, "wb"))
Example #5
def select_features(self, X, y):
    estimator = SelectKBest(chi2, k=700)
    # y.A1 flattens a numpy matrix target into a 1-D array
    estimator.fit(X, y.A1)
    support_mask = estimator._get_support_mask()
    features = []
    i = 0
    for feature in support_mask:
        if feature:
            features.append(i)
        i += 1
    self.kbest_features = features
    self.num_features = len(features)
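Both this example and Example #4 rebuild the selected column indices by looping over the support mask; the public `get_support(indices=True)` returns those indices directly. A small self-contained sketch (the toy data is made up, not from the original project):

import numpy as np
from sklearn.feature_selection import SelectKBest, chi2

# Toy non-negative feature matrix and binary labels.
X = np.array([[1, 0, 3], [0, 2, 1], [2, 1, 0], [1, 3, 2]])
y = np.array([0, 1, 0, 1])

estimator = SelectKBest(chi2, k=2).fit(X, y)
# Indices of the selected columns, with no manual loop over the mask.
kbest_features = estimator.get_support(indices=True).tolist()
num_features = len(kbest_features)
print(kbest_features, num_features)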
Example #7
def select_best(k, data, vocabulary=None):
    """
    Select the top ``k`` most informative features (using a chi-square test)
    and drop everything else from the index and vocabulary.

    :param k: integer; the number of most informative features to keep
    :param data: ``Data`` structure
    :param vocabulary: vocabulary dictionary (will be updated in-place)
    :return: a new ``Data`` structure
    """
    L.debug("selecting K=%s best features", k)
    selector = SelectKBest(chi2, k=k)
    selector.fit(data.index, data.labels)
    mask = selector._get_support_mask()
    data = data._replace(index=data.index[:, mask])
    prune(vocabulary, mask)
    return data
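`Data`, `prune`, and the logger `L` come from elsewhere in that project; the `_replace` call suggests `Data` is a namedtuple with at least `index` and `labels` fields. A hypothetical usage sketch under that assumption (the toy counts, labels, and vocabulary below are made up for illustration):

import collections
import numpy as np
from sklearn.feature_selection import SelectKBest, chi2

# Assumed shape of the project's Data structure (a namedtuple, judging by _replace).
Data = collections.namedtuple("Data", ["index", "labels"])

# Toy document-term counts: six documents, four vocabulary terms.
index = np.array([[2, 0, 1, 0],
                  [3, 1, 0, 0],
                  [0, 0, 2, 3],
                  [0, 1, 1, 2],
                  [4, 0, 0, 1],
                  [0, 2, 0, 4]])
labels = np.array([0, 0, 1, 1, 0, 1])
vocabulary = {"alpha": 0, "bravo": 1, "charlie": 2, "delta": 3}

data = Data(index=index, labels=labels)
selector = SelectKBest(chi2, k=2).fit(data.index, data.labels)
mask = selector.get_support()

# Keep only the selected columns, mirroring what select_best() does above.
data = data._replace(index=data.index[:, mask])
kept_terms = [term for term, col in sorted(vocabulary.items(), key=lambda kv: kv[1]) if mask[col]]
print(data.index.shape, kept_terms)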