Example #1
0
def giniProc(X,y):
    """Return feature indices ranked by their gini_index score.

    The per-feature scores come from ``gini_index.gini_index(X, y)`` and
    are turned into a ranking by ``gini_index.feature_ranking``.
    """
    return gini_index.feature_ranking(gini_index.gini_index(X, y))
Example #2
0
    def apply_impl(self, data):
        """Score and rank the features of ``data``, then delegate to ``use_impl``.

        Stores the gini_index scores in ``self._score``, the resulting
        ranking in ``self._rank`` and the number of features to keep
        (``ceil(self.ratio * n_columns)``) in ``self._nro_features``.

        Returns whatever ``self.use_impl(data)`` returns.
        """
        # TODO: check whether this can be implemented with numpy alone
        features, labels = data.Xy
        # Encode the labels as integer category codes before scoring.
        label_codes = pd.Categorical(labels).codes

        self._score = gini_index.gini_index(features, label_codes)
        self._rank = gini_index.feature_ranking(self._score)

        n_columns = features.shape[1]
        self._nro_features = math.ceil(self.ratio * n_columns)

        return self.use_impl(data)
Example #3
0
def gini():
    """Run the gini_index selector on the module-level data and report on it.

    Reads the globals ``data``, ``labels``, ``treshold`` and ``header``.
    Prints the number of retained entries and the wall-clock duration of
    the gini_index call, and hands the retained entries to
    ``transform_and_save`` only when there are fewer of them than
    entries in ``header``.
    """
    started = datetime.datetime.now()
    ranking = gini_index.gini_index(data, labels, mode="index")
    elapsed = datetime.datetime.now() - started

    print("Gini")
    kept = ranking[:treshold]
    print(len(kept))
    # "cas" is presumably "time" in the author's language -- kept verbatim.
    print("cas: " + str(elapsed))
    print('\n')

    if len(kept) >= len(header):
        return
    transform_and_save(kept, "Gini")
    def test_gine_index(self):
        """FilterGiniIndex must agree with a direct gini_index computation."""
        X, y = self.DATA

        selector = FilterGiniIndex(ratio=0.5)
        selector.fit(X, y)
        X_out, y_out = selector.transform(X, y)

        # Reference values computed straight from the gini_index module.
        expected_score = gini_index.gini_index(X, y)
        expected_rank = gini_index.feature_ranking(expected_score)
        top_features = expected_rank[:5]

        # fit() is expected to return the filter itself.
        refit_result = selector.fit(X, y)
        assert refit_result is selector
        assert np.array_equal(selector.rank(), expected_rank)
        assert np.allclose(selector.score(), expected_score)
        assert np.allclose(X_out, X[:, top_features])
        assert np.array_equal(y_out, y)
Example #5
0
    def test_arizona(self):
        """Time four feature-scoring calls on the ``coil`` data and print each duration.

        Two implementations are timed for the Gini index and two for the
        F-score / F-ratio; the "ARIZONA" and "ITMO" labels are printed
        exactly as in the report strings below.
        """
        data, target = self.coil['X'], self.coil['Y']

        # Each entry pairs the report line with a thunk, so that name lookup
        # and the call itself both fall inside the timed window.
        timed_runs = (
            ("ARIZONA time --- %s seconds ---",
             lambda: gini_index.gini_index(data, target)),
            ("ITMO time --- %s seconds ---",
             lambda: GLOB_MEASURE["GiniIndex"](data, target)),
            ("ARIZONA time --- %s seconds ---",
             lambda: f_score.f_score(data, target)),
            ("ITMO time --- %s seconds ---",
             lambda: GLOB_MEASURE["FRatio"](data.shape[-1])(data, target)),
        )

        for report, run in timed_runs:
            began = time.time()
            run()
            print(report % (time.time() - began))
def weight():
    """Compute ReliefF, Fisher and negated Gini feature weights for ``comtest``.

    Reads the module-level DataFrame ``comtest``: columns ``1 .. -2`` are
    used as the feature matrix and the last column as the label vector.
    The features are passed through ``preprocess`` and every label equal
    to 0 is rewritten to -1 before scoring.

    Returns:
        A tuple ``(Relief, Fisher, gini)`` holding the outputs of
        ``reliefF.reliefF``, ``fisher_score.fisher_score`` and the negated
        output of ``gini_index.gini_index``.
    """
    # Extract the case data and its labels.  The builtin float/int are used
    # because the np.float / np.int aliases were removed in NumPy 1.24.
    datamat = np.array(
        comtest.iloc[0:len(comtest), 1:comtest.shape[1] - 1], dtype=float)
    labelmat = np.array(comtest.iloc[0:len(comtest), -1], dtype=int)
    datamat = preprocess(datamat)

    # AdaBoost can only tell apart labels -1 and 1, so relabel class 0.
    labelmat[labelmat == 0] = -1

    # Feature weights under ReliefF.
    Relief = reliefF.reliefF(datamat, labelmat)
    print('Relief, 第%s次验证 '%(1))
    # Feature weights under the Fisher score.
    Fisher = fisher_score.fisher_score(datamat, labelmat)
    print('Fisher, 第%s次验证 '%(1))
    # Feature weights under the Gini index, negated.
    # NOTE(review): presumably negated so that larger means better, like the
    # other two scores -- confirm against the caller.
    gini = -gini_index.gini_index(datamat, labelmat)
    print('gini, 第%s次验证 '%(1))
    # NOTE(review): the "%s" placeholder is never filled in; kept as-is
    # because the intended value is not visible here.
    print("done_ %s")
    return Relief, Fisher, gini
Example #7
0
def main():
    """Evaluate gini_index feature selection with a linear SVM on colon.mat.

    Runs a shuffled 10-fold cross-validation.  In every fold the features
    are scored and ranked on the training rows only, the top ``num_fea``
    are kept, a ``LinearSVC`` is trained on them and its accuracy on the
    held-out rows is accumulated.  The mean accuracy is printed.
    """
    # Load the data matrix (as float) and the first column of the labels.
    dataset = scipy.io.loadmat('../data/colon.mat')
    X = dataset['X'].astype(float)
    y = dataset['Y'][:, 0]
    sample_count, _ = X.shape

    # 10 shuffled folds over the sample indices.
    folds = cross_validation.KFold(sample_count, n_folds=10, shuffle=True)

    num_fea = 100            # number of features to keep
    clf = svm.LinearSVC()    # linear SVM, refit in every fold

    accuracy_total = 0
    for train_rows, test_rows in folds:
        # Score and rank the features using the training rows only.
        fold_scores = gini_index.gini_index(X[train_rows], y[train_rows])
        ranking = gini_index.feature_ranking(fold_scores)

        # Restrict the whole dataset to the top-ranked features.
        X_selected = X[:, ranking[0:num_fea]]

        # Fit on the training rows, predict the held-out rows.
        clf.fit(X_selected[train_rows], y[train_rows])
        predicted = clf.predict(X_selected[test_rows])

        accuracy_total = accuracy_total + accuracy_score(y[test_rows], predicted)

    # Average classification accuracy over all 10 folds.
    print('Accuracy:', old_div(float(accuracy_total), 10))
def main():
    """Evaluate gini_index feature selection with a linear SVM on colon.mat.

    Runs a shuffled k-fold cross-validation (k = ``n_folds``).  In every
    fold the features are scored and ranked on the training rows only,
    the top ``num_fea`` are kept, a ``LinearSVC`` is trained on them and
    its accuracy on the held-out rows is accumulated.  The mean accuracy
    over the folds is printed.
    """
    # load data
    mat = scipy.io.loadmat('../data/colon.mat')
    X = mat['X'].astype(float)    # data
    y = mat['Y'][:, 0]            # label
    n_samples = X.shape[0]        # number of samples

    # split data into folds; the same constant is reused for the average below
    n_folds = 10
    ss = cross_validation.KFold(n_samples, n_folds=n_folds, shuffle=True)

    # perform evaluation on classification task
    num_fea = 100            # number of selected features
    clf = svm.LinearSVC()    # linear SVM

    correct = 0
    for train, test in ss:
        # obtain the gini_index score of each feature (training rows only)
        score = gini_index.gini_index(X[train], y[train])

        # rank features in descending order according to score
        idx = gini_index.feature_ranking(score)

        # obtain the dataset on the selected features
        selected_features = X[:, idx[0:num_fea]]

        # train a classification model with the selected features on the training dataset
        clf.fit(selected_features[train], y[train])

        # predict the class labels of test data
        y_predict = clf.predict(selected_features[test])

        # obtain the classification accuracy on the test data
        correct += accuracy_score(y[test], y_predict)

    # output the average classification accuracy over all folds.
    # A single pre-formatted argument prints the same text whether print is
    # the Python 2 statement or the Python 3 function.
    print('Accuracy: %s' % (float(correct) / n_folds))
Example #9
0
std_neg=np.std(negtive_feature,ddof=1,axis=0)#负类中各特征值的标准差
F_up=np.square(mean_pos-mean_feature)+np.square(mean_neg-mean_feature)
F_down=np.square(std_pos)+np.square(std_neg)
F_score=F_up/F_down
"""
#------------calculate the FS score with scikit-feature package--------------#
from skfeature.function.similarity_based import fisher_score
from skfeature.function.information_theoretical_based import MRMR
from skfeature.function.similarity_based import reliefF
from skfeature.function.statistical_based import gini_index

# Score every feature with three filters (ReliefF, Fisher score, Gini index),
# combine them into one table and write the features sorted by combined score.
Relief = reliefF.reliefF(datamat, labelmat)
Fisher= fisher_score.fisher_score(datamat, labelmat)
# mRMR,J,M,=MRMR.mrmr(datamat,labelmat,n_selected_features=80)
# mRMR=-mRMR
gini= gini_index.gini_index(datamat,labelmat)
# NOTE(review): negated presumably so that larger means better, like the
# other two scores -- confirm.
gini=-gini
FSscore=np.column_stack((Relief,Fisher,gini))# merge the three scores column-wise

FSscore=ann.preprocess(FSscore)
# Combined score: row-wise sum of the three preprocessed scores.
FinalScore=np.sum(FSscore,axis=1)
FS=np.column_stack((FSscore,FinalScore))
FS_nor=ann.preprocess(FS)# normalize the final (combined-score) column as well
FS=pd.DataFrame(FS_nor,columns=["Relief", "Fisher","gini","FinalScore"],index=featurenames)
# FS.to_csv("F:\Githubcode\AdaBoost\myown\FSscore.csv")


# Sort features by combined score, best first, and save the table.
sorteigen=FS.sort_values(by='FinalScore',ascending=False,axis=0)
sorteigen.to_csv('FSsort.csv')
#------------crossalidation with ann--------------#
meanfit=[]# stores the mean BER for each number of features as features are added one by one
Example #10
0
def gini_index_FS(X_train, y_train):
    """Score the training features with gini_index and rank them.

    Returns:
        ``(idx, score)`` where ``score`` is the output of
        ``gini_index.gini_index`` and ``idx`` is the ranking that
        ``gini_index.feature_ranking`` derives from it.
    """
    feature_scores = gini_index.gini_index(X_train, y_train)
    ranked_indices = gini_index.feature_ranking(feature_scores)
    return ranked_indices, feature_scores
Example #11
0
def main(datasetName):
    """Compare feature-selection methods on one dataset and log the accuracies.

    Reads ``Data/<datasetName>.csv``, uses the last column as the target and
    makes a stratified 80/20 train/test split.  It then computes a Pearson
    correlation score (PCC), a mutual-information score (MI) and a weighted
    combination of the two for each feature.  For feature counts
    20, 40, ... it evaluates six selectors (t-test, Gini index, ReliefF,
    combined, MI, PCC), prints a running table and appends one row per
    feature count to ``Results/Results_<datasetName>``.

    Args:
        datasetName: Dataset name relative to ``Data/``, without ``.csv``.
    """
    print('Dataset-', datasetName)
    data = pandas.read_csv('Data/' + datasetName + '.csv')
    (rows, cols) = np.shape(data)
    print('numRows: ', rows)
    print("numCols: ", cols)
    target = data.values[:, cols - 1]
    datanew = data.values[:, 0:cols - 1]
    train, test, trainLabel, testLabel = train_test_split(datanew,
                                                          target,
                                                          stratify=target,
                                                          test_size=0.2)
    (rows1, cols1) = np.shape(train)
    (rows2, cols2) = np.shape(test)
    cols = cols1

    print('numTrainRows: ', rows1)
    print("numTrainCols: ", cols1)
    print('numTestRows: ', rows2)
    print("numTestCols: ", cols2)
    # NOTE(review): `train` already excludes the label column, so this leaves
    # the last feature out of the PCC/MI/comb rankings.  Kept as in the
    # original to avoid changing results -- confirm whether it is intended.
    numFeatures = cols - 1

    # PccScore: for each feature, the sum of its Pearson correlations with
    # every other feature.
    PccScore = np.zeros(numFeatures)
    for loop1 in range(numFeatures):
        curFeature = train[:, loop1]
        for loop2 in range(numFeatures):
            if loop1 != loop2:
                corrFeature = train[:, loop2]
                PccScore[loop1] += pearsonr(curFeature, corrFeature)[0]
    PccScore = normalize(PccScore)
    print('PCC calculation done..')

    # MiScore: mutual information between each feature and the class label.
    MiScore = np.zeros(numFeatures)
    for loop in range(numFeatures):
        MiScore[loop] = metrics.mutual_info_score(train[:, loop], trainLabel)
    MiScore = normalize(MiScore)
    print('MI calculation done..')

    # combScore: reward MI with the label, penalise correlation with the
    # other features.
    impClass = 0.8
    impFeature = 1 - impClass
    combScore = np.zeros(numFeatures)
    for loop in range(numFeatures):
        combScore[loop] = impClass * MiScore[loop] + impFeature * (
            -PccScore[loop])
    print('Combo calculation done..')

    sortedFeaturesMi = np.argsort(-MiScore)
    sortedFeaturesPcc = np.argsort(PccScore)
    sortedFeaturesComb = np.argsort(-combScore)

    # The gini score does not depend on the feature count, so rank once.
    giniRanking = np.argsort(gini_index.gini_index(train, trainLabel))[::-1]

    columnNames = ('NumFeat', 'comb', 'Mi', 'Pcc', 'ReliefF', 'Ttest', 'GI',
                   'MAX')
    table2 = PrettyTable(list(columnNames))
    resultPath = 'Results/Results_' + datasetName
    # Start a fresh result file and close it straight away, so the header is
    # on disk before any row is appended.
    with open(resultPath, 'w+') as f:
        f.write('%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n' % columnNames)

    for loop in range(20, cols - 1, 20):
        # t-test selection.  A fresh classifier is created here: the original
        # used `clf` before it was first assigned.
        tt_features = ttest_f(train, trainLabel, loop)
        clf = rf(n_estimators=100)
        clf.fit(train[:, tt_features], trainLabel)
        accTtest = clf.score(test[:, tt_features], testLabel)

        # Gini index selection.
        giniFeatures = giniRanking[0:loop]
        clf = rf(n_estimators=100)
        clf.fit(train[:, giniFeatures], trainLabel)
        accGini = clf.score(test[:, giniFeatures], testLabel)

        # ReliefF selection.
        fsRlf = rlf(n_features_to_keep=loop)
        rlfTrain = fsRlf.fit_transform(train, trainLabel)
        rlfTest = fsRlf.transform(test)
        clf = rf(n_estimators=100)
        clf.fit(rlfTrain, trainLabel)
        accRlf = clf.score(rlfTest, testLabel)

        # Rankings precomputed above, evaluated through genAcc.
        accComb = genAcc(train, trainLabel, test, testLabel,
                         sortedFeaturesComb[0:loop])
        accMi = genAcc(train, trainLabel, test, testLabel,
                       sortedFeaturesMi[0:loop])
        accPcc = genAcc(train, trainLabel, test, testLabel,
                        sortedFeaturesPcc[0:loop])

        # Pick the best method; strict '<' means the earliest method wins a
        # tie, and 'None' is reported when no accuracy exceeds 0.
        maxVal = 0
        maxMethod = 'None'
        for method, acc in (('Ttest', accTtest), ('GI', accGini),
                            ('ReliefF', accRlf), ('comb', accComb),
                            ('Mi', accMi), ('Pcc', accPcc)):
            if maxVal < acc:
                maxVal = acc
                maxMethod = method

        table2.add_row([
            loop, accComb, accMi, accPcc, accRlf, accTtest, accGini, maxMethod
        ])
        print(table2)
        # Append and close per iteration so partial results reach disk.
        with open(resultPath, 'a+') as f:
            f.write('%d\t%4.2f\t%4.2f\t%4.2f\t%4.2f\t%4.2f\t%4.2f\t%s\n' %
                    (loop, accComb * 100, accMi * 100, accPcc * 100,
                     accRlf * 100, accTtest * 100, accGini * 100, maxMethod))