Example #1
from skfeature.function.statistical_based import gini_index


def giniProc(X, y):
    # obtain the gini_index score of each feature
    score = gini_index.gini_index(X, y)

    # rank features in descending order according to score
    idx = gini_index.feature_ranking(score)
    return idx
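
A minimal usage sketch for the helper above; the synthetic dataset and the top-5 slice are illustrative assumptions, not part of the original example:

from sklearn.datasets import make_classification

X, y = make_classification(n_samples=100, n_features=20, random_state=0)
idx = giniProc(X, y)     # indices of all features, most important first
X_top5 = X[:, idx[:5]]   # keep only the five highest-ranked features
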
Example #2
    # requires: math, pandas as pd, and skfeature's gini_index module
    def apply_impl(self, data):
        # TODO: verify whether this can be implemented with numpy
        X, y = data.Xy
        y = pd.Categorical(y).codes  # encode labels as integer codes

        self._score = gini_index.gini_index(X, y)
        self._rank = gini_index.feature_ranking(self._score)
        self._nro_features = math.ceil(self.ratio * X.shape[1])  # number of features to keep

        return self.use_impl(data)
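
The pd.Categorical(y).codes line above only converts arbitrary labels into integer codes that the scoring function can consume; a quick illustration with made-up string labels:

import pandas as pd

y = ['spam', 'ham', 'spam', 'eggs']
print(pd.Categorical(y).codes)  # [2 1 2 0]; codes follow sorted category order
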
Example #3
    def test_gine_index(self):
        X, y = self.DATA

        f = FilterGiniIndex(ratio=0.5)
        f.fit(X, y)
        X_, y_ = f.transform(X, y)

        score = gini_index.gini_index(X, y)
        rank = gini_index.feature_ranking(score)
        selected = rank[0:5]

        assert f.fit(X, y) is f
        assert np.array_equal(f.rank(), rank)
        assert np.allclose(f.score(), score)
        assert np.allclose(X_, X[:, selected])
        assert np.array_equal(y_, y)
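
For reference, the rank[0:5] slice in the test matches the filter's own feature count from Example #2, ceil(ratio * n_features); a quick check, assuming the fixture has 10 features:

import math

ratio, n_features = 0.5, 10
print(math.ceil(ratio * n_features))  # 5, i.e. the rank[0:5] slice above
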
Example #4
import scipy.io
from sklearn import svm
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold
from skfeature.function.statistical_based import gini_index


def main():
    # load data
    mat = scipy.io.loadmat('../data/colon.mat')
    X = mat['X']  # data
    X = X.astype(float)
    y = mat['Y']  # label
    y = y[:, 0]
    n_samples, n_features = X.shape  # number of samples and number of features

    # split data into 10 folds
    ss = KFold(n_splits=10, shuffle=True)

    # perform evaluation on classification task
    num_fea = 100  # number of selected features
    clf = svm.LinearSVC()  # linear SVM

    correct = 0
    for train, test in ss.split(X):
        # obtain the gini_index score of each feature
        score = gini_index.gini_index(X[train], y[train])

        # rank features in descending order according to score
        idx = gini_index.feature_ranking(score)

        # obtain the dataset on the selected features
        selected_features = X[:, idx[0:num_fea]]

        # train a classification model with the selected features on the training dataset
        clf.fit(selected_features[train], y[train])

        # predict the class labels of test data
        y_predict = clf.predict(selected_features[test])

        # obtain the classification accuracy on the test data
        acc = accuracy_score(y[test], y_predict)
        correct = correct + acc

    # output the average classification accuracy over all 10 folds
    print('Accuracy:', correct / 10)
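
If colon.mat is not at hand, the same per-fold selection loop runs on any numeric dataset; a self-contained sketch using scikit-learn's built-in breast-cancer data (the dataset choice and num_fea=10 are assumptions):

import numpy as np
from sklearn import svm
from sklearn.datasets import load_breast_cancer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold
from skfeature.function.statistical_based import gini_index

X, y = load_breast_cancer(return_X_y=True)
X = X.astype(float)
num_fea = 10  # number of selected features
clf = svm.LinearSVC()
accs = []
for train, test in KFold(n_splits=10, shuffle=True, random_state=0).split(X):
    # score and rank features on the training fold only
    score = gini_index.gini_index(X[train], y[train])
    idx = gini_index.feature_ranking(score)
    X_sel = X[:, idx[:num_fea]]
    clf.fit(X_sel[train], y[train])
    accs.append(accuracy_score(y[test], clf.predict(X_sel[test])))
print('Accuracy:', np.mean(accs))
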
Example #6
from skfeature.function.statistical_based import gini_index


def gini_index_FS(X_train, y_train):
    # obtain the gini_index score of each feature
    score = gini_index.gini_index(X_train, y_train)
    # rank features in descending order according to score
    idx = gini_index.feature_ranking(score)
    return (idx, score)
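
A minimal usage sketch for the helper above; the 20-feature cut-off and the pre-split X_train/X_test arrays are illustrative assumptions:

idx, score = gini_index_FS(X_train, y_train)
X_train_sel = X_train[:, idx[:20]]  # keep the 20 top-ranked features
X_test_sel = X_test[:, idx[:20]]    # apply the same selection to the test split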