Beispiel #1
0
def main():
    # load training data
    mat0 = scipy.io.loadmat('fea.mat')
    X = mat0['fea_tr']  # data
    X = X.astype(float)

    mat1 = scipy.io.loadmat('gnd.mat')
    y = mat1['gnd_tr']  # label

    n_samples, n_features = X.shape  # number of samples and number of features

    #load test data
    mat2 = scipy.io.loadmat('fea_t.mat')
    X_t = mat2['fea_tst']  # data
    X_t = X_t.astype(float)

    mat3 = scipy.io.loadmat('gnd_t.mat')
    y_t = mat3['gnd_tst']  # label

    n_samples_t, n_features_t = X_t.shape  # number of samples and number of features

    # perform evaluation on classification task
    num_fea = 400  # number of selected features

    gnb = GaussianNB()

    idx, _, _ = MRMR.mrmr(X, y, n_selected_features=num_fea)

    # obtain the index of each feature on the training set
    idx, _, _ = MRMR.mrmr(X, y, n_selected_features=num_fea)

    # obtain the dataset on the selected features
    features = X[:, idx[0:num_fea]]

    # train a classification model with the selected features on the training dataset
    gnb.fit(features, y)

    # obtain the dataset on the selected features of the test set for prediction purposes
    features_t = X_t[:, idx[0:num_fea]]

    # predict the class labels of test data
    y_predict = gnb.predict(features_t)

    # obtain the classification accuracy on the test data
    acc = accuracy_score(y_t, y_predict)

    # output the average classification accuracy over all 10 folds
    print 'Accuracy:', float(acc) / 10
Beispiel #2
0
    def get_mrmr_score(self, max_dim):
        """Iteratively select features via "minimal redundancy maximum
        relevance" (MRMR) and score each feature count by cross-validation.

        Args:
            max_dim (int): upper bound on the number of features to select.

        Returns:
            list: mean cross-validated AUC-ROC for every feature count
            from 1 up to ``max_dim``.
        """
        train_x = scale(self.features)  # feature normalization
        train_y = self.targets  # targets vector

        splitter = KFold(n_splits=5, shuffle=True, random_state=241)  # CV tool
        scorer = make_scorer(roc_auc_score)  # scoring tool
        # multilayer perceptron used as the estimator
        model = MLPRegressor(hidden_layer_sizes=(20, 10))

        scores = []
        for dim in range(1, max_dim + 1):
            selected, _, _ = MRMR.mrmr(train_x,
                                       train_y,
                                       n_selected_features=dim)
            fold_scores = cross_val_score(model,
                                          train_x[:, selected],
                                          train_y,
                                          scoring=scorer,
                                          cv=splitter)  # train + evaluate
            # keep the mean AUC-ROC over the CV folds
            scores.append(np.mean(fold_scores))

        return scores
Beispiel #3
0
    def mrmr_feature_select(self, n_selected_features=50):
        """
        Select features with mRMR and reduce self.X / self.Y in place.

        Brown, Gavin et al. "Conditional Likelihood Maximisation: A Unifying Framework for Information Theoretic Feature Selection." JMLR 2012
        select features index[0] is the most important feature
        j_cmi:  basic scoring criteria for linear combination of shannon information term
        j_cmi=I(f;y)-beta*sum_j(I(fj;f))+gamma*sum(I(fj;f|y))  conditional mutual information mrmr gama=0

        Mutual Information measures the mutual dependence between two sets of
        events; it is the expected value of the pointwise mutual information (PMI).

        MIfy: mutual information between selected features and response y

        Side effects: writes "<filename>.mrmr_features.report.csv" and
        "<filename>.mrmr_sel_features.csv", replaces self.X/self.Y with the
        reduced data (all-zero sample rows dropped), and appends the new
        shape to self.stats.
        """
        #         plot_tsne(self.X,Y=self.Y,targets=self.target_names, filename=self.filename +'.before_mrmr_feature_selection')
        n_samples, n_features = self.X.shape
        # mrmr expects a plain ndarray; self.X is a DataFrame (see .columns/.iloc below)
        x = np.array(self.X)
        if n_selected_features and n_features > n_selected_features:
            # filter half more features or select 50 features int(n_features*percent)  #
            #             self.logger.info("selecting {} features using mrmr".format(num_fea))
            idx, j_cmi, MIfy = MRMR.mrmr(
                x, self.Y, n_selected_features=n_selected_features)
        else:
            # select automatically; many features may still remain
            idx, j_cmi, MIfy = MRMR.mrmr(
                x, self.Y
            )
        num_fea = len(idx)
        # obtain the dataset on the selected features
        self.features = self.X.columns[idx].values
        mrmr_report = pd.DataFrame(
            {
                "features": self.features,
                "j_cmi": j_cmi,
                "MIfy": MIfy
            },
            columns=['features', 'j_cmi', 'MIfy'])
        mrmr_report = mrmr_report.sort_values('MIfy', ascending=False)
        mrmr_report.to_csv(self.filename + ".mrmr_features.report.csv",
                           index=False)

        self.X = self.X.iloc[:, idx]  #select mrmr features
        sel_bools = self.X.sum(axis=1) != 0  # filter  all 0 rows samples.
        self.X = self.X[sel_bools]
        self.Y = self.Y[sel_bools]
        self.X.to_csv(self.filename + ".mrmr_sel_features.csv")
        self.logger.info("Selected {} features using mrmr".format(num_fea))
        self.stats.append(("mrmr_dim", self.X.shape))
def feature_extract(discretize_data, target_data, num_fea):
    """Run mRMR feature selection on discretized data.

    Args:
        discretize_data: discretized feature matrix, shape (n_samples, n_features).
        target_data: target/label vector.
        num_fea: number of features to select.

    Returns:
        The result of ``MRMR.mrmr`` (feature index ranking plus scores).
    """
    # Only MRMR is actually used; the previous scipy/sklearn imports
    # (scipy.io, accuracy_score, cross_validate, train_test_split, svm)
    # were never referenced and have been removed.
    from skfeature.function.information_theoretical_based import MRMR

    return MRMR.mrmr(discretize_data, target_data, n_selected_features=num_fea)
Beispiel #5
0
def mrmr():
    """Time mRMR selection on the module-level dataset, print a summary, and
    save the transformed data when the selection actually reduced features."""
    started = datetime.datetime.now()
    selected = MRMR.mrmr(data, labels, mode="index", n_selected_features=treshold)
    finished = datetime.datetime.now()
    print("mRMR")
    print(len(selected))
    print("cas: " + str(finished - started))
    print('\n')
    if len(selected) < len(header):
        transform_and_save(selected, "MRMR")
def feature_max_relevance_min_redundancy(x_data, y_data):
    # Rank features with mRMR and return the 20 highest-scoring ones as a
    # DataFrame with columns ('Specs', 'Score').
    # NOTE(review): the code below iterates MRMR.mrmr's return value as
    # (index, score) pairs; skfeature's mrmr typically returns a tuple of
    # (idx, J_CMI, MIfy) arrays -- confirm the installed version matches
    # this expectation.
    features_scores = MRMR.mrmr(x_data.values, y_data.values, n_selected_features=20)
    features_index = [int(index[0]) for index in features_scores]
    feat_list = x_data.columns.values[features_index]
    # pair each selected feature name with its score
    feat_list_with_imp = [(feat_list[i], features_scores[i][1]) for i in range(len(features_scores))]
    # dfscores = pd.DataFrame(features_scores)
    # dfcolumns = pd.DataFrame(x_data.columns)
    # featureScores = pd.concat([dfcolumns, dfscores], axis=1)
    featureScores = pd.DataFrame(feat_list_with_imp)
    featureScores.columns = ['Specs', 'Score']  # naming the dataframe columns
    top_20_features = featureScores.nlargest(20, 'Score')
    return top_20_features
def mRMR(X, y, n_selected_features):
    '''Select the top-ranked mRMR features from a DataFrame.

    X, n * d, n cases and d features;
    y, [0, 1]
    n_selected_features, top n features to select in the importance rank of features

    Returns the reduced DataFrame and the list of selected column names.
    '''
    column_names = list(X)
    matrix = np.asarray(X)

    index_feature, score, muinfo = MRMR.mrmr(matrix, y, n_selected_features=n_selected_features)

    # translate the positional indices back into column names
    selected_features = [column_names[pos] for pos in index_feature]
    X_new = X.iloc[:, index_feature]

    return X_new, selected_features
Beispiel #8
0
def select(Xtr, ytr, c_labels, n_clus, k):
    """Run mRMR separately inside each cluster and pool the results.

    For every cluster label, selects k features on the cluster's samples,
    then returns the per-cluster scores restricted to the union of all
    selected features, plus the boolean feature mask itself.
    """
    n_feat = Xtr.shape[1]
    raw_scores = np.zeros((n_feat, n_clus))
    picked = np.zeros((n_feat, n_clus))

    for clus in range(n_clus):
        members = (c_labels == clus)
        sel_idx, _, sel_scores = MRMR.mrmr(Xtr[members], ytr[members],
                                           n_selected_features=k)
        raw_scores[sel_idx, clus] = sel_scores
        picked[sel_idx, clus] = 1

    # features chosen by at least one cluster
    features = picked.sum(axis=-1) > 0
    Score = raw_scores[features]
    return Score, features
Beispiel #9
0
    def i_execute(data, cols):
        """Run mRMR on a 'GroundTruth'-labelled frame and print the raw
        outputs plus a score table sorted best-first."""
        labels = data.GroundTruth.values
        frame = data.drop(['GroundTruth'], axis=1)
        matrix = frame.values

        f_idx, jcmi, mify = MRMR.mrmr(matrix, labels, n_selected_features=len(cols))
        print(f_idx)
        print(jcmi)
        print(mify)

        headers = ["Name", "Score"]
        # pair each selected column name with its MI score, highest first
        values = sorted(zip(frame.columns[f_idx], mify),
                        key=lambda pair: -pair[1])

        print(tabulate(values, headers, tablefmt="plain"))
Beispiel #10
0
 def runMRMR(self):
     """Run mRMR feature selection on every dataset held in self.data and
     record the results via self.addToSelectedFeatures."""
     datasetKeys = self.data.keys()
     for datasetKey in datasetKeys:
         self.log.emit(
             'mRMR feature selection on {} dataset...'.format(datasetKey),
             indents=1)
         f = self.data[datasetKey]['f']  # feature names
         X = self.data[datasetKey]['X']  # feature matrix
         y = self.data[datasetKey]['y']  # labels
         # NOTE(review): skfeature's MRMR.mrmr usually returns a tuple
         # (idx, J_CMI, MIfy); the list comprehension below assumes fIdxs
         # yields plain feature indices -- confirm the installed version.
         fIdxs = MRMR.mrmr(X, y, n_selected_features=10)
         fRank = [f[i] for i in fIdxs]
         self.addToSelectedFeatures('mRMR',
                                    datasetKey,
                                    fOrig=f,
                                    fIdxs=fIdxs,
                                    fRank=fRank)
Beispiel #11
0
def run_fold(trial,P,X,y,method,dataset,parttype):
    print 'Obtaining features for %s %s %s fold: %2d' % (parttype,method,dataset,trial)
    n_samples, n_features = X.shape
    train = P[:,trial] == 1
    trnX = X[train]
    trnY = y[train]

    start_time = time.time()
    if method == 'fisher': 
        score = fisher_score.fisher_score(trnX,trnY)
        features = fisher_score.feature_ranking(score)
    elif method == 'chi2':
        score = chi_square.chi_square(trnX,trnY)
        features = chi_square.feature_ranking(score)
    elif method == 'relieff':
        score = reliefF.reliefF(trnX,trnY)
        features = reliefF.feature_ranking(score)
    elif method == 'jmi':
        features = JMI.jmi(trnX,trnY,  n_selected_features=n_features)
    elif method == 'mrmr':
        features = MRMR.mrmr(trnX,trnY,n_selected_features=n_features)
    elif method == 'infogain':
        features = MIM.mim(trnX,trnY,n_selected_features=n_features)
    elif method == 'svmrfe':
        features = svmrfe(trnX,trnY)
    elif method == 'hdmr':
        sobol_set_all = scipy.io.loadmat('sobol_set.mat')
        sobol_set     = sobol_set_all['sobol_set']
        sobol_set     = sobol_set.astype(float)
        params = {'sobol_set':sobol_set,'k':1,'p':3,'M':1000,'b':'L'}
        models  = hdmrlearn(trnX,trnY,params)
        features,w = hdmrselect(X,models)
    elif method == 'hdmrhaar':
        sobol_set_all = scipy.io.loadmat('sobol_set.mat')
        sobol_set     = sobol_set_all['sobol_set']
        sobol_set     = sobol_set.astype(float)
        params = {'sobol_set':sobol_set,'k':1,'p':255,'M':1000,'b':'H'}
        models  = hdmrlearn(trnX,trnY,params)
        features,w = hdmrselect(X,models)
    else:
        print(method + 'does no exist')

    cputime = time.time() - start_time
    print features
    print 'cputime %f' % cputime
    return {'features': features, 'cputime': cputime}
def run_feature_selection(X, Y, n_selected_features):
    """Run the four information-theoretic selectors (JMI, MIM, MRMR, MIFS)
    and return their selected-feature index arrays, in that fixed order.

    When PARALLEL is set the selectors run in a four-worker process pool;
    otherwise they run sequentially.
    """
    selectors = (JMI.jmi, MIM.mim, MRMR.mrmr, MIFS.mifs)

    if PARALLEL:
        with ProcessPoolExecutor(max_workers=4) as executor:
            futures = [
                executor.submit(fs,
                                X,
                                Y,
                                n_selected_features=n_selected_features)
                for fs in selectors
            ]
        # collect in submission order so the output order is deterministic
        return [future.result()[FEAT_IDX] for future in futures]

    return [
        fs(X, Y, n_selected_features=n_selected_features)[FEAT_IDX]
        for fs in selectors
    ]
Beispiel #13
0
    def execute(data, cols):
        """Select features with mRMR under 3-fold CV, keep the fold with the
        best accuracy, and return its features as a plain-text score table.

        ``data`` is a DataFrame with a 'GroundTruth' label column; ``cols``
        determines how many features are requested from mRMR.
        """
        y = data.GroundTruth.values
        x_orig = data.drop(['GroundTruth'], axis=1)
        x = x_orig.values

        clf = svm.LinearSVC()
        num_fea = len(cols)
        fold = model_selection.KFold(n_splits=3, shuffle=True)

        max_acc = 0
        max_idx = None
        max_scores = None
        for train, test in fold.split(x, y):
            with warnings.catch_warnings():
                # LinearSVC may not converge on small folds; silence the warning
                warnings.simplefilter("ignore", ConvergenceWarning)
                idx, jcmi, mify = SKF_MRMR.mrmr(x[train],
                                                y[train],
                                                n_selected_features=num_fea)

                # NOTE(review): columns are taken over the full matrix, so the
                # test rows reuse the training fold's feature choice
                features = x[:, idx[0:num_fea]]

                clf.fit(features[train], y[train])
                y_predict = clf.predict(features[test])

                acc = accuracy_score(y[test], y_predict)

            # remember the indices/scores from the most accurate fold
            if acc > max_acc:
                max_acc = acc
                max_idx = idx
                max_scores = jcmi

        headers = ["Name", "Score"]
        values = []
        for i in max_idx[0:num_fea]:
            # presumably +10 shifts scores positive for display -- verify
            values.append([x_orig.columns[i], 10 + max_scores[i]])

        sorted_by_score = sorted(values, key=MRMR.get_score, reverse=True)

        return tabulate(sorted_by_score, headers, tablefmt="plain")
Beispiel #14
0
def main():
    """Evaluate mRMR feature selection on the colon dataset with 10-fold
    stratified CV and a linear SVM, printing the average accuracy."""
    # load data
    mat = scipy.io.loadmat('../data/colon.mat')
    X = mat['X']  # data
    X = X.astype(float)
    y = mat['Y']  # label
    y = y[:, 0]  # flatten the (n, 1) label matrix to a vector
    n_samples, n_features = X.shape  # number of samples and number of features
    print X.shape
    # split data into 10 folds
    ss = model_selection.StratifiedKFold(n_splits=10,
                                         random_state=None,
                                         shuffle=True)

    # perform evaluation on classification task
    num_fea = 10  # number of selected features
    clf = svm.LinearSVC()  # linear SVM

    correct = 0
    for train, test in ss.split(X, y):
        # obtain the index of each feature on the training set
        idx, _, _ = MRMR.mrmr(X[train], y[train], n_selected_features=num_fea)

        # obtain the dataset on the selected features
        features = X[:, idx[0:num_fea]]

        # train a classification model with the selected features on the training dataset
        clf.fit(features[train], y[train])

        # predict the class labels of test data
        y_predict = clf.predict(features[test])

        # obtain the classification accuracy on the test data
        acc = accuracy_score(y[test], y_predict)
        correct = correct + acc

    # output the average classification accuracy over all 10 folds
    print 'Accuracy:', float(correct) / 10
Beispiel #15
0
def main():
    """Evaluate mRMR feature selection on the colon dataset with 10-fold CV
    (legacy cross_validation.KFold API) and a linear SVM; prints the
    average accuracy over the folds."""
    # load data
    mat = scipy.io.loadmat('../data/colon.mat')
    X = mat['X']    # data
    X = X.astype(float)
    y = mat['Y']    # label
    y = y[:, 0]    # flatten the (n, 1) label matrix to a vector
    n_samples, n_features = X.shape    # number of samples and number of features

    # split data into 10 folds
    ss = cross_validation.KFold(n_samples, n_folds=10, shuffle=True)

    # perform evaluation on classification task
    num_fea = 10    # number of selected features
    clf = svm.LinearSVC()    # linear SVM

    correct = 0
    for train, test in ss:
        # obtain the index of each feature on the training set
        idx,_,_ = MRMR.mrmr(X[train], y[train], n_selected_features=num_fea)

        # obtain the dataset on the selected features
        features = X[:, idx[0:num_fea]]

        # train a classification model with the selected features on the training dataset
        clf.fit(features[train], y[train])

        # predict the class labels of test data
        y_predict = clf.predict(features[test])

        # obtain the classification accuracy on the test data
        acc = accuracy_score(y[test], y_predict)
        correct = correct + acc

    # output the average classification accuracy over all 10 folds
    print 'Accuracy:', float(correct)/10
Beispiel #16
0
def MRMR_featureSelection(x, y):
    # Rank all features via mRMR and return the ranking.
    # NOTE(review): skfeature's MRMR.mrmr normally returns a tuple of
    # (idx, J_CMI, MIfy); passing that tuple straight into feature_ranking
    # looks suspicious -- confirm against the installed skfeature version.
    idx = MRMR.mrmr(x, y)
    rank = feature_ranking(idx)
    return rank
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import pandas as pd
import numpy as np
import sys
from skfeature.function.information_theoretical_based import MRMR


# Read the expression table: column 0 is the label, the rest are features.
expro = pd.read_csv(sys.argv[1], index_col=False)

# .ix was deprecated and removed from pandas; use positional .iloc instead.
X, y = np.array(expro.iloc[:, 1:]), np.array(expro.iloc[:, 0])

# Unpack the index array explicitly -- MRMR.mrmr returns (idx, J_CMI, MIfy);
# indexing columns with the whole tuple (as before) was incorrect.
idx, _, _ = MRMR.mrmr(X, y, n_selected_features=500)
markers = expro.iloc[:, idx]
markers.to_csv(sys.argv[2], index=False)

def experiment(data, box, cv, output):
    """
    Write the results of an experiment.
        This function will run an experiment for a specific dataset for a bounding box.
        There will be CV runs of randomized experiments run and the accumulated KNN
        accuracies will be written to a .npz file.

        Parameters
        ----------
        data : string
            Dataset name.

        box : string
            Bounding box on the file name.
        cv : int
            Number of randomized runs (note: random splits, not true
            cross-validation).

        output : string
            Path of the .npz file the accuracy matrix is written to.

        Returns
        -------
        None

        Raises
        ------
        ValueError
            If the requested number of poison samples exceeds the available
            adversarial data.
    """
    # load normal and adversarial data
    path_adversarial_data = 'data/attacks/' + data + '_[xiao][' + box + '].csv'
    df_normal = pd.read_csv('data/clean/' + data + '.csv', header=None).values
    df_adversarial = pd.read_csv(path_adversarial_data, header=None).values

    # separate out the normal and adversarial data (last column is the label)
    Xn, yn = df_normal[:,:-1], df_normal[:,-1]
    Xa, ya = df_adversarial[:,:-1], df_adversarial[:,-1]

    # change the labels from +/-1 to [0,1]
    ya[ya==-1], yn[yn==-1] = 0, 0

    # calculate the ratios of data that would be used for training and hold out
    p0, p1 = 1./cv, (1. - 1./cv)
    N = len(Xn)
    # total number of training and testing samples, and the number of
    # features to select (truncation may drop one sample overall)
    Ntr, Nte = int(p1*N), int(p0*N)
    n_selected_features = int(Xn.shape[1]*SEL_PERCENT)+1

    # accumulator: one row per poisoning ratio, one column per setup
    acc_KNN = np.zeros((NPR,6))
    ####################################
    # CLASSIFICATION
    ##################################

    # run `cv` randomized experiments. note this is not performing cross-validation,
    # rather we use randomized splits of the data.
    for _ in range(cv):
        # shuffle the data, then split into a training and a testing set
        i = np.random.permutation(N)
        Xtrk, ytrk, Xtek, ytek = Xn[i][:Ntr], yn[i][:Ntr], Xn[i][-Nte:], yn[i][-Nte:]

        ####### Classification on Normal Data with no FS #######################
        yn_allfeature_KNN = KNN_classification(Xtrk, ytrk, Xtek, ytek)

        ####### Classification on JMI-based features on Normal data #############
        sf_base_jmi = JMI.jmi(Xtrk, ytrk, n_selected_features=n_selected_features)[FEAT_IDX]
        Xtr_jmi = Xtrk[:, sf_base_jmi]
        Xte_jmi = Xtek[:, sf_base_jmi]
        yn_JMI_KNN = KNN_classification(Xtr_jmi, ytrk, Xte_jmi, ytek)

        for n in range(NPR):

            # calculate the number of poisoned samples needed so that the
            # poisoning ratio holds relative to the training size (e.g. with
            # N=100 samples and a 20% ratio, the 20% is of the training set)
            Np = int(len(ytrk)*POI_RNG[n]+1)
            if Np >= len(ya):
                # the exception was previously constructed but never raised,
                # silently continuing with insufficient adversarial data
                raise ValueError('Number of poison data requested is larger than the available data.')

            # number of normal (not poisoned) samples in the training data;
            # build a randomized training set of Nn normal + Np adversarial rows
            Nn = len(ytrk) - Np
            idx_normal, idx_adversarial = np.random.permutation(len(ytrk))[:Nn], \
                                            np.random.permutation(len(ya))[:Np]
            Xtrk_poisoned, ytrk_poisoned = np.concatenate((Xtrk[idx_normal], Xa[idx_adversarial])), \
                                            np.concatenate((ytrk[idx_normal], ya[idx_adversarial]))

            # poisoned data, no feature selection
            ya_allfeature_KNN = KNN_classification(Xtrk_poisoned, ytrk_poisoned, Xtek, ytek)

            # run feature selection with the training data that has adversarial samples
            sf_adv_jmi = JMI.jmi(Xtrk_poisoned, ytrk_poisoned, n_selected_features=n_selected_features)[FEAT_IDX]
            sf_adv_mim = MIM.mim(Xtrk_poisoned, ytrk_poisoned, n_selected_features=n_selected_features)[FEAT_IDX]
            sf_adv_mrmr = MRMR.mrmr(Xtrk_poisoned, ytrk_poisoned, n_selected_features=n_selected_features)[FEAT_IDX]
            sf_adv_misf = MIFS.mifs(Xtrk_poisoned, ytrk_poisoned, n_selected_features=n_selected_features)[FEAT_IDX]

            # KNN Classification on JMI selected features
            Xtrk_poisoned_JMI = Xtrk_poisoned[:, sf_adv_jmi]
            Xtest_JMI = Xtek[:, sf_adv_jmi]
            ya_JMI_KNN = KNN_classification(Xtrk_poisoned_JMI, ytrk_poisoned, Xtest_JMI, ytek)
            # KNN Classification on MIM selected features
            Xtrk_poisoned_MIM = Xtrk_poisoned[:, sf_adv_mim]
            Xtest_MIM = Xtek[:, sf_adv_mim]
            ya_MIM_KNN = KNN_classification(Xtrk_poisoned_MIM, ytrk_poisoned, Xtest_MIM, ytek)
            # KNN Classification on MRMR selected features
            Xtrk_poisoned_MRMR = Xtrk_poisoned[:, sf_adv_mrmr]
            Xtest_MRMR = Xtek[:, sf_adv_mrmr]
            ya_MRMR_KNN = KNN_classification(Xtrk_poisoned_MRMR, ytrk_poisoned, Xtest_MRMR, ytek)
            # KNN Classification on MISF selected features
            Xtrk_poisoned_MISF = Xtrk_poisoned[:, sf_adv_misf]
            Xtest_MISF = Xtek[:, sf_adv_misf]
            ya_MISF_KNN = KNN_classification(Xtrk_poisoned_MISF, ytrk_poisoned, Xtest_MISF, ytek)
            # (a large block of dead, commented-out duplicate code was removed here)

            # Accumulate accuracy in the NPR x 6 matrix
            acc_KNN[n, 0] += accuracy_score(ytek, yn_allfeature_KNN)    # normal data, no FS
            acc_KNN[n, 1] += accuracy_score(ytek, ya_allfeature_KNN)    # adversarial data, no FS
            acc_KNN[n, 2] += accuracy_score(ytek, ya_JMI_KNN)    # adversarial data, JMI FS
            acc_KNN[n, 3] += accuracy_score(ytek, ya_MIM_KNN)    # adversarial data, MIM FS
            acc_KNN[n, 4] += accuracy_score(ytek, ya_MRMR_KNN)    # adversarial data, MRMR FS
            acc_KNN[n, 5] += accuracy_score(ytek, ya_MISF_KNN)    # adversarial data, MISF FS

    # scale the accuracy statistics by 1.0/cv then write the output file
    acc_KNN = acc_KNN/cv
    print("\n Accuracy matrix of KNN")
    print("[COL]: Norm_noFS, Adv_noFS, Adv_JMI, Adv_MIM, Adv_MRMR, Adv_MISF")
    print("[ROW]: Poisoning ratios: 0.01, 0.025, 0.05, 0.075, 0.1, 0.125, 0.15, 0.175, 0.2")
    print("\n", acc_KNN)

    np.savez(output, acc_KNN=acc_KNN)
    return None
Beispiel #19
0
def mrmr(X, y, feat_names, num_features):
    """Return the names of the top mRMR-ranked features.

    X is the feature matrix, y the labels (flattened before selection),
    feat_names maps column positions to names, num_features caps the count.
    """
    selected, _, _ = MRMR.mrmr(X, np.ravel(y), n_selected_features=num_features)
    return [feat_names[pos] for pos in selected]
Beispiel #20
0
def main():
    """Benchmark cluster-weighted feature selection against baselines.

    CLI arguments: argv[1] = number of k-means clusters, argv[2] = number of
    features to select (k), argv[3] = path to a .mat dataset with 'X'/'Y',
    argv[4] = number of CV folds.  For each fold a linear SVM is trained on
    (a) cluster-affinity-weighted selected features, (b) all features, and
    (c) plain mRMR-selected features; averaged train/test accuracies are
    printed at the end.
    """
    max_iter = int(sys.argv[4])
    # Accuracy accumulators: acc* = train, acc*_ts = test, for the
    # weighted-selection / all-features / plain-mRMR variants respectively.
    acc = 0
    acc_ts = 0
    acc1_ts = 0
    acc2_ts = 0
    acc1 = 0
    acc2 = 0
    avg_feature = 0
    n_clus = int(sys.argv[1])
    k = int(sys.argv[2])
    dataset = str(sys.argv[3])
    j = 0
    data = scipy.io.loadmat(dataset)
    X = data['X']
    X = X.astype(float)
    y = data['Y']
    y = y[:, 0]

    # KFold(n, n_folds=...) was removed in scikit-learn 0.20; use the
    # splitter-object API (same fold semantics) as the rest of this file does.
    ss = KFold(n_splits=max_iter, shuffle=True)
    clf = svm.SVC(kernel='linear', C=1)
    for train, test in ss.split(X):
        print("%d - th iteration" % (j + 1))
        Xtr, Xts, ytr, yts = X[train], X[test], y[train], y[test]

        # Soft cluster membership: inverse distance to each k-means centroid.
        kmeans = KMeans(init='k-means++', n_clusters=n_clus,
                        n_init=10).fit(Xtr)
        distr = kmeans.transform(Xtr)
        dists = kmeans.transform(Xts)
        distr = distr + 0.0001  # avoid division by zero at a centroid
        dists = dists + 0.0001
        one = np.ones(distr.shape)
        one1 = np.ones(dists.shape)
        distr = np.divide(one, distr)
        dists = np.divide(one1, dists)
        c_labels = kmeans.predict(Xtr)
        # Normalize inverse distances jointly so train/test share one scale.
        inv_dis_max = max(distr.max(), dists.max())
        distr = distr / inv_dis_max
        dists = dists / inv_dis_max
        print(inv_dis_max)
        print(min(distr.min(), dists.min()))

        # Per-cluster feature scores; each selected feature is weighted by a
        # sigmoid of the sample's affinity-weighted score.
        Scores, features = select(Xtr, ytr, c_labels, n_clus, k)
        Xtrnew = Xtr[:, features]
        Xtsnew = Xts[:, features]
        print(Scores)
        fwtr = expit(10 * distr.dot(Scores.transpose()))
        fwts = expit(10 * dists.dot(Scores.transpose()))

        print(fwtr.max(), fwtr.min())
        print(fwts.max(), fwts.min())
        Xtrnew = np.multiply(Xtrnew, fwtr)
        Xtsnew = np.multiply(Xtsnew, fwts)

        clf.fit(Xtrnew, ytr)
        ynew = clf.predict(Xtrnew)
        ysnew = clf.predict(Xtsnew)

        # Baseline 1: all features.
        clf.fit(Xtr, ytr)
        y1 = clf.predict(Xtr)
        y1s = clf.predict(Xts)

        # Baseline 2: plain mRMR with the same feature budget.
        num_fea = k
        # Accumulate (was `avg_feature = k`, which made the final "average"
        # print k / max_iter instead of k).
        avg_feature = avg_feature + k
        id1, _, _ = MRMR.mrmr(Xtr, ytr, n_selected_features=num_fea)
        Xtr2 = Xtr[:, id1[0:num_fea]]
        Xts2 = Xts[:, id1[0:num_fea]]
        clf.fit(Xtr2, ytr)
        y2 = clf.predict(Xtr2)
        y2s = clf.predict(Xts2)

        acc = acc + metrics.accuracy_score(ytr, ynew)
        acc1 = acc1 + metrics.accuracy_score(ytr, y1)
        acc2 = acc2 + metrics.accuracy_score(ytr, y2)
        acc_ts = acc_ts + metrics.accuracy_score(ysnew, yts)
        acc1_ts = acc1_ts + metrics.accuracy_score(y1s, yts)
        acc2_ts = acc2_ts + metrics.accuracy_score(y2s, yts)
        j = j + 1

    print("Original Dataset Size %f %f" % (X.shape))
    print("Average number of features after selection : %f" %
          (avg_feature * 1.0 / max_iter))
    print("Accuracy after Selection :Train Set: %f" % (acc * 100.0 / max_iter))
    print("Test Set : %f" % (acc_ts * 100.0 / max_iter))
    print("Accuracy with all features :Train Set : %f " %
          (acc1 * 100.0 / max_iter))
    print("Test Set: %f" % (acc1_ts * 100.0 / max_iter))
    print("Accuracy by mRMR for same number of features: Train Set: %f" %
          (acc2 * 100.0 / max_iter))
    print("Test Set :%f" % (acc2_ts * 100.0 / max_iter))
Beispiel #21
0
    def fit(self, X, y):
        """Select/rank the features of ``X`` with respect to labels ``y``.

        The algorithm is dispatched on ``self.tp`` (method family: 'ITB',
        'filter', 'wrapper', 'SLB') and ``self.name`` (concrete algorithm);
        hyper-parameters are read from ``self.params``.  Returns the selected
        (or ranked) feature indices, or ``[]`` when no branch matches.
        """
        idx = []

        if self.tp == 'ITB':

            if self.name == 'MRMR':
                # FIX: MRMR.mrmr returns (indices, J_CMI, MIfy) — the same
                # 3-tuple unpacked everywhere else in this file.  Previously
                # the whole tuple was assigned to idx, so callers received a
                # tuple instead of the index array.
                idx, _, _ = MRMR.mrmr(
                    X, y, n_selected_features=self.params['num_feats'])

        elif self.tp == 'filter':

            if self.name == 'Relief':
                score = reliefF.reliefF(X, y, k=self.params['k'])
                idx = reliefF.feature_ranking(score)

            if self.name == 'Fisher':
                # obtain the score of each feature on the training set
                score = fisher_score.fisher_score(X, y)

                # rank features in descending order according to score
                idx = fisher_score.feature_ranking(score)

            if self.name == 'MI':
                # argsort ascending, then reverse: highest MI first.
                idx = np.argsort(
                    mutual_info_classif(
                        X, y, n_neighbors=self.params['n_neighbors']))[::-1]

        elif self.tp == 'wrapper':

            # Fit the wrapped estimator and keep the features it considers
            # important (per SelectFromModel's default threshold).
            model_fit = self.model.fit(X, y)
            model = SelectFromModel(model_fit, prefit=True)
            idx = model.get_support(indices=True)
        elif self.tp == 'SLB':

            # one-hot-encode on target (sparse-learning methods expect a
            # label matrix rather than a label vector)
            y = construct_label_matrix(y)

            if self.name == 'SMBA':
                scba = fs.SCBA(data=X,
                               alpha=self.params['alpha'],
                               norm_type=self.params['norm_type'],
                               verbose=self.params['verbose'],
                               thr=self.params['thr'],
                               max_iter=self.params['max_iter'],
                               affine=self.params['affine'],
                               normalize=self.params['normalize'],
                               step=self.params['step'],
                               PCA=self.params['PCA'],
                               GPU=self.params['GPU'],
                               device=self.params['device'])

                # admm() yields three alternative index sets; pick the one
                # requested via params['type_indices'] (default: sInd).
                nrmInd, sInd, repInd, _ = scba.admm()
                if self.params['type_indices'] == 'nrmInd':
                    idx = nrmInd
                elif self.params['type_indices'] == 'repInd':
                    idx = repInd
                else:
                    idx = sInd

            if self.name == 'RFS':
                W = RFS.rfs(X, y, gamma=self.params['gamma'])
                idx = feature_ranking(W)

            if self.name == 'll_l21':
                # obtain the feature weight matrix
                W, _, _ = ll_l21.proximal_gradient_descent(X,
                                                           y,
                                                           z=self.params['z'],
                                                           verbose=False)
                # rank features according to the weight matrix
                idx = feature_ranking(W)
            if self.name == 'ls_l21':
                # obtain the feature weight matrix
                W, _, _ = ls_l21.proximal_gradient_descent(X,
                                                           y,
                                                           z=self.params['z'],
                                                           verbose=False)

                # rank features according to the weight matrix
                idx = feature_ranking(W)

            if self.name == 'LASSO':

                LASSO = Lasso(alpha=self.params['alpha'], positive=True)

                y_pred_lasso = LASSO.fit(X, y)

                # Multi-target fits store one coefficient row per target;
                # rank by the first row in that case.
                if y_pred_lasso.coef_.ndim == 1:
                    coeff = y_pred_lasso.coef_
                else:
                    coeff = np.asarray(y_pred_lasso.coef_[0, :])

                # Descending by coefficient magnitude (coefficients are
                # non-negative here because positive=True).
                idx = np.argsort(-coeff)

            if self.name == 'EN':  # elastic net L1

                enet = ElasticNet(alpha=self.params['alpha'],
                                  l1_ratio=1,
                                  positive=True)
                y_pred_enet = enet.fit(X, y)

                if y_pred_enet.coef_.ndim == 1:
                    coeff = y_pred_enet.coef_
                else:
                    coeff = np.asarray(y_pred_enet.coef_[0, :])

                idx = np.argsort(-coeff)

        return idx
Beispiel #22
0
        tn, fp, fn, tp = confusion_matrix(Y_test, Y_pred).ravel()
        print("Specificity" + repr(tn / (tn + fp)))

        sheet_test.write(r, c, roc_auc_score(Y_test, Y_pred))

        r = r + 1
    c = c + 1
    r = 0

# Run each multivariate information-theoretic selector on the training split,
# collecting (name, selector output) pairs; progress is echoed after each
# selector finishes (the DISR result is appended without an echo, as before).
MV_sel = []
_mv_selectors = [
    ('MIM', MIM.mim),
    ('MIFS', MIFS.mifs),
    ('MRMR', MRMR.mrmr),
    ('CIFE', CIFE.cife),
    ('JMI', JMI.jmi),
    ('CMIM', CMIM.cmim),
    ('ICAP', ICAP.icap),
]
for _name, _select in _mv_selectors:
    MV_sel.append((_name, _select(X_train, Y_train,
                                  n_selected_features=num_fea)))
    print(_name)
MV_sel.append(('DISR', DISR.disr(X_train, Y_train,
                                 n_selected_features=num_fea)))
Beispiel #23
0
    pval.sort(key=takeSecond)
    idx = []
    for i in range(n_selected_features):
        idx.append(pval[i][0])
    return idx


# MULTIVARIATE FEATURE SELECTION X CLASSIFICATION (10 fold CV)

# print('BEFORE')
# Apply each multivariate selector to (X, Y), storing (name, output) pairs
# and echoing progress after each one (DISR is appended without an echo,
# as in the original sequence).
MV_sel = []
_mv_selectors = [
    ('WLCX', WLCX),
    ('MIFS', MIFS.mifs),
    ('MRMR', MRMR.mrmr),
    ('CIFE', CIFE.cife),
    ('JMI', JMI.jmi),
    ('CMIM', CMIM.cmim),
    ('ICAP', ICAP.icap),
]
for _name, _select in _mv_selectors:
    MV_sel.append((_name, _select(X, Y, n_selected_features=num_fea)))
    print(_name)
MV_sel.append(('DISR', DISR.disr(X, Y, n_selected_features=num_fea)))
for name, model in models:
    for kind, idx in MV_sel:
        # X_sel = X[:, idx[0:num_fea]]
        # X_test_ = X_test[:,idx[0:num_fea]]
        X_train_ = X_train[:, idx[0:num_fea]]
Beispiel #24
0
def supervised_mrmr(X, y=None, **kwargs):
    """Rank the features of ``X`` with supervised mRMR and return the indices.

    Keyword arguments (e.g. ``n_selected_features``) are forwarded to
    ``MRMR.mrmr``.  Previously ``**kwargs`` was accepted but silently
    dropped, so the feature budget could never be passed through.
    """
    idx, _, _ = MRMR.mrmr(X, y, **kwargs)
    return idx
Beispiel #25
0
print("*** Discretize Data ***")
print(discretize_data)
print(discretize_data.shape)
# ***** Discretization End *****

# ***** Feature Extraction Stage (MRMR) Start *****
import scipy.io
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_validate
from sklearn.model_selection import train_test_split
from sklearn import svm
from skfeature.function.information_theoretical_based import MRMR

num_fea = 10  # number of features mRMR should select
# NOTE(review): depending on the skfeature version, MRMR.mrmr returns either
# the index array or an (F, J_CMI, MIfy) tuple — confirm before indexing it.
feature_extraction = MRMR.mrmr(discretize_data,
                               target_data,
                               n_selected_features=num_fea)
print('\n')
print("*** Selected Feature ***")
print(feature_extraction)
# ***** Feature Extraction Stage (MRMR) End *****

# ***** Concat Stage Start *****
# NOTE(review): these column indices are hard-coded from a previous run
# rather than taken from feature_extraction above — regenerate them if the
# data changes.
selected_data = discretize_data[:, [0, 16, 9, 8, 17, 35, 2, 22, 20, 19]]

import numpy as np
# Append the target column to the selected features (n_samples x 11).
# (A throwaway np.arange(12375).reshape(1125, 11) pre-allocation was removed
# here: its value was immediately overwritten by the concatenation below.)
concat_data = np.concatenate((selected_data, target_data[:, None]), axis=1)
print('\n')
print('*** Concat Data ***')
print(concat_data)
Beispiel #26
0
from sklearn.model_selection import cross_validate, KFold
from sklearn import svm
from skfeature.function.information_theoretical_based import MRMR

# Load the colon-cancer benchmark: 62 samples x 2000 gene-expression features.
mat = scipy.io.loadmat('data/colon.mat')
X = mat['X'].astype(float)  # (62, 2000) feature matrix
y = mat['Y'][:, 0]          # (62,) label vector
#n_samples, n_features = X.shape    # number of samples and number of features

# perform evaluation on classification task
num_fea = 10  # number of selected features

# MRMR.mrmr returns three arrays:
#   F     — selected feature indices, F[0] is the most important feature
#   J_CMI — objective-function value of each selected feature
#   MIfy  — mutual information between selected features and the response
# Only the indices are needed here.
idx, _, _ = MRMR.mrmr(X, y, n_selected_features=num_fea)

# Restrict the data matrix to the selected columns.
selected_features = X[:, idx[:num_fea]]
print(x.shape, y.shape)
# load_clf is a project helper; builds the classifier named by FINAL_CLASSIFIER.
clf = load_clf(FINAL_CLASSIFIER)

# One accuracy slot per feature budget k = 10, 20, ..., 100.
perfs = np.zeros(10)

# NOTE(review): with shuffle left at False, recent scikit-learn versions
# reject random_state here — confirm the intended sklearn version.
skf = StratifiedKFold(n_splits=n_splits, random_state=42)
fold_index = 0
for train_index, test_index in skf.split(x, y):
    print("fold:", fold_index + 1)
    # Only the first fold is evaluated: on later iterations this `continue`
    # also skips the increment at the bottom, so fold_index stays 1 and every
    # remaining fold prints "fold: 2" and is skipped.
    if fold_index != 0:
        continue
    x_train, x_test = x[train_index], x[test_index]
    y_train, y_test = y[train_index], y[test_index]

    for i, k in enumerate(np.arange(10, 101, 10)):
        # Select the top-k features with mRMR on the training split only,
        # then evaluate the classifier on the matching test columns.
        idx, _, _ = MRMR.mrmr(x_train, y_train, n_selected_features=k)
        x_train_selected = x_train[:, idx[0:k]]
        x_test_selected = x_test[:, idx[0:k]]

        clf.fit(x_train_selected, y_train)
        y_pred = clf.predict(x_test_selected)
        accu = accuracy_score(y_test, y_pred)

        print("selected features:", k, "accu:", accu)
        perfs[i] += accu

    fold_index += 1

print("n_splits:", n_splits)
print("mrmr", DATASET, FINAL_CLASSIFIER)
# perfs /= n_splits
Beispiel #28
0
def main():
    """Cluster-weighted feature selection benchmark with model selection.

    CLI arguments: argv[1] = maximum cluster count to try, argv[2] = path to
    a .mat dataset with 'X'/'Y', argv[3] = number of CV folds, argv[4] =
    sigmoid sharpness for the feature weights, argv[5] = features to select
    per cluster.  Compares a per-cluster weighted-feature SVM (with grid
    search) against all features and against plain mRMR, prints the averaged
    accuracies and saves them to ``<dataset>.mat``.
    """
    max_iter = int(sys.argv[3])
    clus_max = int(sys.argv[1])
    dataset = str(sys.argv[2])
    var = float(sys.argv[4])
    k = int(sys.argv[5])
    j = 0
    data = scipy.io.loadmat(dataset)
    X = data['X']
    X = X.astype(float)
    y = data['Y']
    y = y[:, 0]
    n, d = X.shape
    classes = np.unique(y)
    n_class = len(classes)
    # Pick the cluster count (2 .. clus_max+1) with the best silhouette score.
    s_scores = np.zeros((clus_max, 1))
    for i in range(clus_max):
        kmeans = KMeans(init='k-means++', n_clusters=i + 2, n_init=10)
        labels = kmeans.fit_predict(X)
        s_scores[i] = metrics.silhouette_score(X, labels)

    n_clus = np.argmax(s_scores) + 2
    ss = StratifiedKFold(n_splits=max_iter, shuffle=True)
    clf = svm.SVC()
    parameters = {'kernel': ('linear', 'rbf'), 'C': [0.01, 0.1, 1, 10, 100]}
    grid_search = GridSearchCV(clf, param_grid=parameters)

    # Accuracy accumulators: acc* = train, acc*_ts = test, for the
    # weighted-selection / all-features / plain-mRMR variants respectively.
    acc = 0
    acc1 = 0
    acc2 = 0
    acc_ts = 0
    acc1_ts = 0
    acc2_ts = 0
    avg_feature = 0
    redundant_features = 0
    # Inverse-frequency class weights, normalized so the rarest class gets
    # the largest weight and the most common class gets weight 1.
    un, inv, cnts = np.unique(y, return_inverse=True, return_counts=True)
    un = 1. / cnts
    un = un / min(un)
    cl_wts = un[inv]

    j = 0
    for train, test in ss.split(X, y):
        print("%f - th iteration" % (j + 1))
        Xtr, Xts, ytr, yts, cl_tr, cl_ts = X[train], X[test], y[train], y[
            test], cl_wts[train], cl_wts[test]
        ntr = Xtr.shape[0]
        nts = Xts.shape[0]

        # Soft cluster membership: inverse distance to each k-means centroid.
        kmeans = KMeans(init='k-means++', n_clusters=n_clus,
                        n_init=10).fit(Xtr)
        distr = kmeans.transform(Xtr)
        dists = kmeans.transform(Xts)
        distr = distr + 0.0001  # avoid division by zero at a centroid
        dists = dists + 0.0001
        one = np.ones(distr.shape)
        one1 = np.ones(dists.shape)
        distr = np.divide(one, distr)
        dists = np.divide(one1, dists)
        c_labels = kmeans.predict(Xtr)
        c_labelst = kmeans.predict(Xts)
        # Normalize inverse distances jointly so train/test share one scale.
        inv_dis_max = max(distr.max(), dists.max())
        distr = distr / inv_dis_max
        dists = dists / inv_dis_max

        Scores, features = select(Xtr, ytr, c_labels, n_clus, k)
        Xtrnew = Xtr[:, features]
        Xtsnew = Xts[:, features]
        # Count pairs of selected features with mutual information > 1
        # (a redundancy diagnostic; midd is a project helper).
        for l in range(Xtrnew.shape[1]):
            for m in range(l):
                if (midd(Xtrnew[:, l], Xtrnew[:, m]) > 1):
                    redundant_features = redundant_features + 1

        fwtr = expit(var * distr.dot(Scores.transpose()))
        fwts = expit(var * dists.dot(Scores.transpose()))

        Xtrnew = np.multiply(Xtrnew, fwtr)
        Xtsnew = np.multiply(Xtsnew, fwts)

        # Train one grid-searched SVM per cluster on its own samples.
        ynew = np.zeros((ytr.shape))
        ysnew = np.zeros((yts.shape))
        for i in range(n_clus):
            arr1 = (c_labels == i)
            arr2 = (c_labelst == i)
            Xtrnewj = Xtrnew[arr1]
            Xtsnewj = Xtsnew[arr2]
            ytrj = ytr[arr1]
            grid_search.fit(Xtrnewj, ytrj)
            ynew[arr1] = grid_search.predict(Xtrnewj)
            ysnew[arr2] = grid_search.predict(Xtsnewj)

        # Baseline 1: all features.
        grid_search.fit(Xtr, ytr)
        y1 = grid_search.predict(Xtr)
        y1s = grid_search.predict(Xts)

        # Baseline 2: plain mRMR with the same number of features.
        num_fea = Xtrnew.shape[1]
        # FIX: avg_feature is a scalar accumulator (initialized to 0 above);
        # the original indexed it as avg_feature[k], which raises TypeError.
        avg_feature = avg_feature + num_fea
        id1, _, _ = MRMR.mrmr(Xtr, ytr, n_selected_features=num_fea)
        Xtr2 = Xtr[:, id1[0:num_fea]]
        Xts2 = Xts[:, id1[0:num_fea]]
        grid_search.fit(Xtr2, ytr)
        y2 = grid_search.predict(Xtr2)
        y2s = grid_search.predict(Xts2)

        acc = acc + metrics.accuracy_score(ytr, ynew)
        acc1 = acc1 + metrics.accuracy_score(ytr, y1)
        acc2 = acc2 + metrics.accuracy_score(ytr, y2)
        acc_ts = acc_ts + metrics.accuracy_score(ysnew, yts)
        acc1_ts = acc1_ts + metrics.accuracy_score(y1s, yts)
        acc2_ts = acc2_ts + metrics.accuracy_score(y2s, yts)
        j = j + 1

    avg_feature = avg_feature * 1.0 / max_iter
    redundant_features = redundant_features * 1.0 / max_iter
    acc = acc * 100.0 / max_iter
    acc1 = acc1 * 100.0 / max_iter
    acc2 = acc2 * 100.0 / max_iter
    acc_ts = acc_ts * 100.0 / max_iter
    acc1_ts = acc1_ts * 100.0 / max_iter
    acc2_ts = acc2_ts * 100.0 / max_iter

    print("Original Dataset Size %f %f" % (X.shape))
    print("Average number of features after selection : ")
    print(avg_feature)
    print("Number of clusters :")
    print(n_clus)
    print("Redundant_feature_pairs :")
    # FIX: this previously printed avg_feature again (copy-paste error).
    print(redundant_features)
    print("Accuracy after Selection :Train Set:")
    print(acc)
    print("Test Set :")
    print(acc_ts)
    print("Accuracy with all features :Train Set ")
    print(acc1)
    print("Test Set: ")
    print(acc1_ts)
    print("Accuracy by mRMR for same number of features: Train Set")
    print(acc2)
    print("test Seet :")
    print(acc2_ts)

    scipy.io.savemat(
        dataset + '.mat', {
            'acc': acc,
            'acc1': acc1,
            'acc2': acc2,
            'acc_ts': acc_ts,
            'acc1_ts': acc1_ts,
            'acc2_ts': acc2_ts,
            'avg_feature': avg_feature,
            'redundant_features': redundant_features
        })
    # NOTE(review): pymrmr.mRMR expects a discretized pandas DataFrame whose
    # first column is the class label — confirm X satisfies that contract.
    result = pymrmr.mRMR(X, 'MIQ', 10)
    print(result)


def import_Data():
    """Load the bigram disease dataset from the working directory.

    Reads 'Disease_Data_BiGram.csv' and returns a 3-tuple
    (features, class labels, subject labels): all columns except the trailing
    two are treated as features; 'Class' and 'Subject' are the label columns.
    """
    frame = pd.read_csv('Disease_Data_BiGram.csv')
    # print(frame.shape)

    X = frame.iloc[:, :frame.shape[1] - 2]
    Y = frame['Class']
    Y_ = frame['Subject']
    return X, Y, Y_


# Select the 15 top-ranked bigram features with four information-theoretic
# criteria (keyed by selector name) and persist the comparison table.
FS = {}
X, Y, Y_ = import_Data()

_feature_matrix = np.array(X)
for _label, _select in (('MRMR', MRMR.mrmr), ('JMI', JMI.jmi),
                        ('MIFS', MIFS.mifs), ('MIM', MIM.mim)):
    FS[_label] = X.columns[_select(_feature_matrix, Y_,
                                   n_selected_features=15)[:15]]

FS = pd.DataFrame(FS)
print(FS)
FS.to_csv('Selected_Features_MultiVar_BiG.csv')

#print(pd.DataFrame(FS))
#model = apply_Model(X,Y_)
# Univariate ranking via the previously-configured SelectKBest scorer:
# pair each feature name with its score, drop NaN scores, sort best-first.
bestFeat.fit(train_X, train_Y)
feat_scr = zip(feats, bestFeat.scores_)
feat_scr = [pair for pair in feat_scr if not np.isnan(pair[1])]
sorted_fetas = sorted(feat_scr, key=lambda pair: pair[1], reverse=True)

# estimator = SVR(kernel="linear")
# selector = RFE(estimator, 5, step=1)
# selector.fit(train_X, train_Y)  # slow

from sklearn.ensemble import GradientBoostingClassifier

# Embedded ranking from gradient-boosting feature importances.
g_cls = GradientBoostingClassifier(n_estimators=10)
g_cls.fit(train_X, train_Y)
g_feats = g_cls.feature_importances_
g_feat_scr = zip(feats, g_feats)
g_feat_scr = [pair for pair in g_feat_scr if not np.isnan(pair[1])]
g_sorted_fetas = sorted(g_feat_scr, key=lambda pair: pair[1], reverse=True)

from skfeature.function.information_theoretical_based import FCBF, LCSI, MRMR, JMI

# Information-theoretic rankings; map returned feature indices to names.
score = FCBF.fcbf(train_X, train_Y)
fcbf_sorted = [feats[i] for i in score]

score = MRMR.mrmr(train_X, train_Y, n_selected_features=50)
MRMR_sorted = [feats[i] for i in score]

score = JMI.jmi(train_X, train_Y, n_selected_features=50)
JMI_sorted = [feats[i] for i in score]

Beispiel #31
0
        # print("TRAIN:", train_index, "TEST:", test_index)

        # 离散数据
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        print('训练集数量:', X_train.shape, '测试集数量:', X_test.shape)
        # print(y_train)
        # print(y_test)

        # 原始数据
        X_train_raw, X_test_raw = X_raw[train_index], X_raw[test_index]
        y_train_raw, y_test_raw = y_raw[train_index], y_raw[test_index]
        print('训练集数量:', X_train_raw.shape, '测试集数量:', X_test_raw.shape)

        # obtain the index of each feature on the training set which is selected
        idx, _, _ = MRMR.mrmr(X_train, y_train, n_selected_features=num_fea)
        print("the index of selected feature is: " + str(idx))

        # save the index of selected feature
        feature_file = nDim_DIR + "特征.txt"
        f0 = open(feature_file, 'a+')  # 基因
        f0.write(str(idx))
        f0.write('\n')
        f0.close()

        # obtain the dataset on the selected features
        # update the discrete train data and the discrete test data
        X_train_select_disc = X_train[:, idx[0:num_fea]]
        X_test_select_disc = X_test[:, idx[0:num_fea]]

        # obtain the dataset on the selected features
from sklearn.feature_selection import RFE
from sklearn.svm import SVR

# Univariate ranking via SelectKBest: score each feature, drop NaN scores,
# then sort the (name, score) pairs best-first.
bestFeat = SelectKBest()
bestFeat.fit(train_X, train_Y)
feat_scr = zip(feats, bestFeat.scores_)
feat_scr = [pair for pair in feat_scr if not np.isnan(pair[1])]
sorted_fetas = sorted(feat_scr, key=lambda pair: pair[1], reverse=True)

# estimator = SVR(kernel="linear")
# selector = RFE(estimator, 5, step=1)
# selector.fit(train_X, train_Y)  # slow

from sklearn.ensemble import GradientBoostingClassifier

# Embedded ranking from gradient-boosting feature importances.
g_cls = GradientBoostingClassifier(n_estimators=10)
g_cls.fit(train_X, train_Y)
g_feats = g_cls.feature_importances_
g_feat_scr = zip(feats, g_feats)
g_feat_scr = [pair for pair in g_feat_scr if not np.isnan(pair[1])]
g_sorted_fetas = sorted(g_feat_scr, key=lambda pair: pair[1], reverse=True)

from skfeature.function.information_theoretical_based import FCBF, LCSI, MRMR, JMI

# Information-theoretic rankings; map returned feature indices to names.
score = FCBF.fcbf(train_X, train_Y)
fcbf_sorted = [feats[i] for i in score]

score = MRMR.mrmr(train_X, train_Y, n_selected_features=50)
MRMR_sorted = [feats[i] for i in score]

score = JMI.jmi(train_X, train_Y, n_selected_features=50)
JMI_sorted = [feats[i] for i in score]