def main():
    # load training data
    mat0 = scipy.io.loadmat('fea.mat')
    X = mat0['fea_tr']  # data
    X = X.astype(float)
    mat1 = scipy.io.loadmat('gnd.mat')
    y = mat1['gnd_tr']  # label
    n_samples, n_features = X.shape  # number of samples and number of features

    # load test data
    mat2 = scipy.io.loadmat('fea_t.mat')
    X_t = mat2['fea_tst']  # data
    X_t = X_t.astype(float)
    mat3 = scipy.io.loadmat('gnd_t.mat')
    y_t = mat3['gnd_tst']  # label
    n_samples_t, n_features_t = X_t.shape  # number of samples and number of features

    # perform evaluation on a classification task
    num_fea = 400  # number of selected features
    gnb = GaussianNB()

    # obtain the index of each selected feature on the training set
    idx, _, _ = MRMR.mrmr(X, y, n_selected_features=num_fea)

    # restrict the training set to the selected features
    features = X[:, idx[0:num_fea]]

    # train a classification model with the selected features on the training dataset
    gnb.fit(features, y)

    # restrict the test set to the same selected features for prediction
    features_t = X_t[:, idx[0:num_fea]]

    # predict the class labels of the test data
    y_predict = gnb.predict(features_t)

    # output the classification accuracy on the test data
    # (this is a single train/test split, not a 10-fold average,
    # so the accuracy is reported directly)
    acc = accuracy_score(y_t, y_predict)
    print('Accuracy:', acc)
def get_mrmr_score(self, max_dim):
    """
    Iteratively performs feature selection based on the
    "minimal redundancy maximum relevance" (MRMR) criterion.

    Args:
        max_dim (int): maximum number of features to select.
    """
    x_train = scale(self.features)  # feature normalization
    y_train = self.targets  # target vector
    kf = KFold(n_splits=5, shuffle=True, random_state=241)  # CV splitter
    ar_scorer = make_scorer(roc_auc_score)  # scorer
    clf = MLPRegressor(hidden_layer_sizes=(20, 10))  # multilayer perceptron as the model
    auc_roc_scores = []
    for n_features in range(1, max_dim + 1):
        mrmr_idx, _, _ = MRMR.mrmr(x_train, y_train, n_selected_features=n_features)
        features = x_train[:, mrmr_idx]
        vect_auc_roc_score = cross_val_score(clf, features, y_train,
                                             scoring=ar_scorer, cv=kf)  # train
        auc_roc_scores.append(np.mean(vect_auc_roc_score))  # save mean AUC-ROC over CV folds
    return auc_roc_scores
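# Note on cost (a sketch, not part of the original code): MRMR's greedy
# forward selection is nested, i.e. the k-feature subset is a prefix of the
# max_dim-feature ranking, so one MRMR call suffices instead of max_dim calls.
# Assumes the same attributes and imports as get_mrmr_score above.
def get_mrmr_score_fast(self, max_dim):
    x_train = scale(self.features)
    y_train = self.targets
    kf = KFold(n_splits=5, shuffle=True, random_state=241)
    ar_scorer = make_scorer(roc_auc_score)
    clf = MLPRegressor(hidden_layer_sizes=(20, 10))
    # rank once, then evaluate prefixes of the ranking
    mrmr_idx, _, _ = MRMR.mrmr(x_train, y_train, n_selected_features=max_dim)
    auc_roc_scores = []
    for n_features in range(1, max_dim + 1):
        features = x_train[:, mrmr_idx[:n_features]]
        scores = cross_val_score(clf, features, y_train, scoring=ar_scorer, cv=kf)
        auc_roc_scores.append(np.mean(scores))
    return auc_roc_scores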
def mrmr_feature_select(self, n_selected_features=50):
    """
    Brown, Gavin et al. "Conditional Likelihood Maximisation: A Unifying
    Framework for Information Theoretic Feature Selection." JMLR 2012.

    Selects features; index[0] is the most important feature.

    j_cmi: basic scoring criterion, a linear combination of Shannon
    information terms (conditional mutual information):
        j_cmi = I(f;y) - beta * sum_j I(fj;f) + gamma * sum_j I(fj;f|y)
    MRMR uses gamma = 0.

    Mutual information measures the mutual dependence between two sets of
    events; it is the expected value of the pointwise mutual information (PMI).

    MIfy: mutual information between the selected features and the response y.
    """
    # plot_tsne(self.X, Y=self.Y, targets=self.target_names,
    #           filename=self.filename + '.before_mrmr_feature_selection')
    n_samples, n_features = self.X.shape
    x = np.array(self.X)
    if n_selected_features and n_features > n_selected_features:
        # filter down to the requested number of features
        # self.logger.info("selecting {} features using mrmr".format(num_fea))
        idx, j_cmi, MIfy = MRMR.mrmr(x, self.Y, n_selected_features=n_selected_features)
    else:
        # automatic selection may still retain many features
        idx, j_cmi, MIfy = MRMR.mrmr(x, self.Y)
    num_fea = len(idx)
    # obtain the dataset on the selected features
    self.features = self.X.columns[idx].values
    mrmr_report = pd.DataFrame(
        {"features": self.features, "j_cmi": j_cmi, "MIfy": MIfy},
        columns=['features', 'j_cmi', 'MIfy'])
    mrmr_report = mrmr_report.sort_values('MIfy', ascending=False)
    mrmr_report.to_csv(self.filename + ".mrmr_features.report.csv", index=False)
    self.X = self.X.iloc[:, idx]  # keep only the MRMR-selected features
    sel_bools = self.X.sum(axis=1) != 0  # drop samples whose selected features are all zero
    self.X = self.X[sel_bools]
    self.Y = self.Y[sel_bools]
    self.X.to_csv(self.filename + ".mrmr_sel_features.csv")
    self.logger.info("Selected {} features using mrmr".format(num_fea))
    self.stats.append(("mrmr_dim", self.X.shape))
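# Illustrative sketch of the j_cmi criterion from the docstring above, not
# skfeature's implementation: greedy MRMR with beta = 1/|S| (the common mRMR
# choice) and gamma = 0. Assumes X holds discrete-valued features so
# sklearn's mutual_info_score applies directly, and k <= n_features.
import numpy as np
from sklearn.metrics import mutual_info_score

def greedy_mrmr(X, y, k):
    n_features = X.shape[1]
    # relevance term I(f;y) for every feature
    relevance = np.array([mutual_info_score(X[:, j], y) for j in range(n_features)])
    selected = [int(np.argmax(relevance))]  # start from the most relevant feature
    while len(selected) < k:
        best_j, best_score = None, -np.inf
        for j in range(n_features):
            if j in selected:
                continue
            # redundancy term: mean I(fj;f) over the already-selected set
            redundancy = np.mean([mutual_info_score(X[:, j], X[:, s]) for s in selected])
            score = relevance[j] - redundancy
            if score > best_score:
                best_j, best_score = j, score
        selected.append(best_j)
    return selected  # indices in selection order, most important first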
def feature_extract(discretize_data, target_data, num_fea):
    from skfeature.function.information_theoretical_based import MRMR
    feature_extraction = MRMR.mrmr(discretize_data, target_data,
                                   n_selected_features=num_fea)
    return feature_extraction
def mrmr():
    before = datetime.datetime.now()
    result = MRMR.mrmr(data, labels, mode="index", n_selected_features=treshold)
    after = datetime.datetime.now()
    print("mRMR")
    print(len(result))
    print("time: " + str(after - before))
    print('\n')
    if len(result) < len(header):
        transform_and_save(result, "MRMR")
def feature_max_relevance_min_redundancy(x_data, y_data):
    # MRMR.mrmr returns (indices, J_CMI, MIfy); unpack the tuple rather than
    # treating it as a list of (index, score) pairs
    idx, j_cmi, _ = MRMR.mrmr(x_data.values, y_data.values, n_selected_features=20)
    feat_list = x_data.columns.values[idx]
    feat_list_with_imp = [(feat_list[i], j_cmi[i]) for i in range(len(idx))]
    # dfscores = pd.DataFrame(features_scores)
    # dfcolumns = pd.DataFrame(x_data.columns)
    # featureScores = pd.concat([dfcolumns, dfscores], axis=1)
    featureScores = pd.DataFrame(feat_list_with_imp)
    featureScores.columns = ['Specs', 'Score']  # name the dataframe columns
    top_20_features = featureScores.nlargest(20, 'Score')
    return top_20_features
def mRMR(X, y, n_selected_features):
    '''
    X: n * d, n cases and d features
    y: labels in [0, 1]
    n_selected_features: number of top features to select from the importance ranking
    '''
    feaName = list(X)
    X_ = np.asarray(X)
    index_feature, score, muinfo = MRMR.mrmr(X_, y, n_selected_features=n_selected_features)
    selected_features = list(map(lambda x: feaName[x], index_feature))
    X_new = X.iloc[:, index_feature]
    return X_new, selected_features
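# Hypothetical usage of the wrapper above (the file name and 'label' column
# are made-up examples, not from the original code):
# df = pd.read_csv('some_table.csv')
# X_new, names = mRMR(df.drop(columns=['label']), df['label'].values,
#                     n_selected_features=20)
# print(names)  # selected column names, most important first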
def select(Xtr, ytr, c_labels, n_clus, k):
    # run MRMR separately within each cluster and mark the features it selects
    Score_temp = np.zeros((Xtr.shape[1], n_clus))
    idx = np.zeros((Xtr.shape[1], n_clus))
    for i in range(n_clus):
        arr1 = (c_labels == i)
        X1 = Xtr[arr1]
        y1 = ytr[arr1]
        idx1, _, Score1 = MRMR.mrmr(X1, y1, n_selected_features=k)
        Score_temp[idx1, i] = Score1
        idx[idx1, i] = 1
    # keep every feature selected in at least one cluster
    features = idx.sum(axis=-1) > 0
    num_sel = features.sum()
    Score = Score_temp[features]
    return Score, features
def i_execute(data, cols):
    y = data.GroundTruth.values
    x_raw = data.drop(['GroundTruth'], axis=1)
    x = x_raw.values
    f_idx, jcmi, mify = MRMR.mrmr(x, y, n_selected_features=len(cols))
    print(f_idx)
    print(jcmi)
    print(mify)
    headers = ["Name", "Score"]
    values = sorted(zip(x_raw.columns[f_idx], mify), key=lambda xi: xi[1] * -1)
    print(tabulate(values, headers, tablefmt="plain"))
def runMRMR(self):
    datasetKeys = self.data.keys()
    for datasetKey in datasetKeys:
        self.log.emit(
            'mRMR feature selection on {} dataset...'.format(datasetKey),
            indents=1)
        f = self.data[datasetKey]['f']
        X = self.data[datasetKey]['X']
        y = self.data[datasetKey]['y']
        # MRMR.mrmr returns (indices, J_CMI, MIfy); keep only the indices
        fIdxs, _, _ = MRMR.mrmr(X, y, n_selected_features=10)
        fRank = [f[i] for i in fIdxs]
        self.addToSelectedFeatures('mRMR', datasetKey, fOrig=f, fIdxs=fIdxs, fRank=fRank)
def run_fold(trial, P, X, y, method, dataset, parttype):
    print('Obtaining features for %s %s %s fold: %2d' % (parttype, method, dataset, trial))
    n_samples, n_features = X.shape
    train = P[:, trial] == 1
    trnX = X[train]
    trnY = y[train]
    start_time = time.time()
    if method == 'fisher':
        score = fisher_score.fisher_score(trnX, trnY)
        features = fisher_score.feature_ranking(score)
    elif method == 'chi2':
        score = chi_square.chi_square(trnX, trnY)
        features = chi_square.feature_ranking(score)
    elif method == 'relieff':
        score = reliefF.reliefF(trnX, trnY)
        features = reliefF.feature_ranking(score)
    elif method == 'jmi':
        features = JMI.jmi(trnX, trnY, n_selected_features=n_features)
    elif method == 'mrmr':
        features = MRMR.mrmr(trnX, trnY, n_selected_features=n_features)
    elif method == 'infogain':
        features = MIM.mim(trnX, trnY, n_selected_features=n_features)
    elif method == 'svmrfe':
        features = svmrfe(trnX, trnY)
    elif method == 'hdmr':
        sobol_set_all = scipy.io.loadmat('sobol_set.mat')
        sobol_set = sobol_set_all['sobol_set']
        sobol_set = sobol_set.astype(float)
        params = {'sobol_set': sobol_set, 'k': 1, 'p': 3, 'M': 1000, 'b': 'L'}
        models = hdmrlearn(trnX, trnY, params)
        features, w = hdmrselect(X, models)
    elif method == 'hdmrhaar':
        sobol_set_all = scipy.io.loadmat('sobol_set.mat')
        sobol_set = sobol_set_all['sobol_set']
        sobol_set = sobol_set.astype(float)
        params = {'sobol_set': sobol_set, 'k': 1, 'p': 255, 'M': 1000, 'b': 'H'}
        models = hdmrlearn(trnX, trnY, params)
        features, w = hdmrselect(X, models)
    else:
        print(method + ' does not exist')
    cputime = time.time() - start_time
    print(features)
    print('cputime %f' % cputime)
    return {'features': features, 'cputime': cputime}
def run_feature_selection(X, Y, n_selected_features):
    lst = []
    if PARALLEL:
        # with multiprocessing.Pool(processes=4) as pool:
        #     lst.append(pool.apply(JMI.jmi, args=(X, Y), kwds={'n_selected_features': n_selected_features}))
        #     lst.append(pool.apply(MIM.mim, args=(X, Y), kwds={'n_selected_features': n_selected_features}))
        #     lst.append(pool.apply(MRMR.mrmr, args=(X, Y), kwds={'n_selected_features': n_selected_features}))
        #     lst.append(pool.apply(MIFS.mifs, args=(X, Y), kwds={'n_selected_features': n_selected_features}))
        #     lst = [l[FEAT_IDX] for l in lst]
        with ProcessPoolExecutor(max_workers=4) as executor:
            lst.append(executor.submit(JMI.jmi, X, Y, n_selected_features=n_selected_features))
            lst.append(executor.submit(MIM.mim, X, Y, n_selected_features=n_selected_features))
            lst.append(executor.submit(MRMR.mrmr, X, Y, n_selected_features=n_selected_features))
            lst.append(executor.submit(MIFS.mifs, X, Y, n_selected_features=n_selected_features))
            lst = [l.result()[FEAT_IDX] for l in lst]
    else:
        lst.append(JMI.jmi(X, Y, n_selected_features=n_selected_features)[FEAT_IDX])
        lst.append(MIM.mim(X, Y, n_selected_features=n_selected_features)[FEAT_IDX])
        lst.append(MRMR.mrmr(X, Y, n_selected_features=n_selected_features)[FEAT_IDX])
        lst.append(MIFS.mifs(X, Y, n_selected_features=n_selected_features)[FEAT_IDX])
    return lst
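# Example use (a sketch; PARALLEL and FEAT_IDX are module-level settings in
# this file, and X, Y are assumed to be a discrete feature matrix and labels).
# The four selectors can then be compared side by side:
# selections = run_feature_selection(X, Y, n_selected_features=20)
# for name, sel in zip(['JMI', 'MIM', 'MRMR', 'MIFS'], selections):
#     print(name, sorted(sel))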
def execute(data, cols):
    y = data.GroundTruth.values
    x_orig = data.drop(['GroundTruth'], axis=1)
    x = x_orig.values
    clf = svm.LinearSVC()
    num_fea = len(cols)
    fold = model_selection.KFold(n_splits=3, shuffle=True)
    max_acc = 0
    max_idx = None
    max_scores = None
    for train, test in fold.split(x, y):
        with warnings.catch_warnings():
            warnings.simplefilter("ignore", ConvergenceWarning)
            idx, jcmi, mify = SKF_MRMR.mrmr(x[train], y[train], n_selected_features=num_fea)
            features = x[:, idx[0:num_fea]]
            clf.fit(features[train], y[train])
            y_predict = clf.predict(features[test])
            acc = accuracy_score(y[test], y_predict)
            if acc > max_acc:
                max_acc = acc
                max_idx = idx
                max_scores = jcmi
    headers = ["Name", "Score"]
    values = []
    for i in max_idx[0:num_fea]:
        values.append([x_orig.columns[i], 10 + max_scores[i]])
    sorted_by_score = sorted(values, key=MRMR.get_score, reverse=True)
    return tabulate(sorted_by_score, headers, tablefmt="plain")
def main():
    # load data
    mat = scipy.io.loadmat('../data/colon.mat')
    X = mat['X']  # data
    X = X.astype(float)
    y = mat['Y']  # label
    y = y[:, 0]
    n_samples, n_features = X.shape  # number of samples and number of features
    print(X.shape)

    # split data into 10 folds
    ss = model_selection.StratifiedKFold(n_splits=10, random_state=None, shuffle=True)

    # perform evaluation on a classification task
    num_fea = 10  # number of selected features
    clf = svm.LinearSVC()  # linear SVM
    correct = 0
    for train, test in ss.split(X, y):
        # obtain the index of each feature on the training set
        idx, _, _ = MRMR.mrmr(X[train], y[train], n_selected_features=num_fea)
        # obtain the dataset on the selected features
        features = X[:, idx[0:num_fea]]
        # train a classification model with the selected features on the training dataset
        clf.fit(features[train], y[train])
        # predict the class labels of test data
        y_predict = clf.predict(features[test])
        # obtain the classification accuracy on the test data
        acc = accuracy_score(y[test], y_predict)
        correct = correct + acc
    # output the average classification accuracy over all 10 folds
    print('Accuracy:', float(correct) / 10)
def main():
    # load data
    mat = scipy.io.loadmat('../data/colon.mat')
    X = mat['X']  # data
    X = X.astype(float)
    y = mat['Y']  # label
    y = y[:, 0]
    n_samples, n_features = X.shape  # number of samples and number of features

    # split data into 10 folds (sklearn.cross_validation is long deprecated;
    # use sklearn.model_selection instead)
    ss = model_selection.KFold(n_splits=10, shuffle=True)

    # perform evaluation on a classification task
    num_fea = 10  # number of selected features
    clf = svm.LinearSVC()  # linear SVM
    correct = 0
    for train, test in ss.split(X):
        # obtain the index of each feature on the training set
        idx, _, _ = MRMR.mrmr(X[train], y[train], n_selected_features=num_fea)
        # obtain the dataset on the selected features
        features = X[:, idx[0:num_fea]]
        # train a classification model with the selected features on the training dataset
        clf.fit(features[train], y[train])
        # predict the class labels of test data
        y_predict = clf.predict(features[test])
        # obtain the classification accuracy on the test data
        acc = accuracy_score(y[test], y_predict)
        correct = correct + acc
    # output the average classification accuracy over all 10 folds
    print('Accuracy:', float(correct) / 10)
def MRMR_featureSelection(x, y):
    # MRMR.mrmr already returns feature indices in ranked order (most
    # important first), so no extra feature_ranking() step is needed
    idx, _, _ = MRMR.mrmr(x, y)
    return idx
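# For contrast (cf. the fit() dispatcher later in this section): score-based
# selectors produce per-feature scores that need an explicit ranking step,
# while MRMR returns ranked indices directly. A sketch, assuming the standard
# skfeature import path for fisher_score:
# from skfeature.function.similarity_based import fisher_score
# score = fisher_score.fisher_score(x, y)            # per-feature scores
# rank_fisher = fisher_score.feature_ranking(score)  # ranking step required
# rank_mrmr, _, _ = MRMR.mrmr(x, y)                  # indices already ranked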
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import pandas as pd
import numpy as np
import sys
from skfeature.function.information_theoretical_based import MRMR

expro = pd.read_csv(sys.argv[1], index_col=False)
# column 0 holds the label; the remaining columns are features
# (DataFrame.ix is deprecated; use .iloc)
X, y = np.array(expro.iloc[:, 1:]), np.array(expro.iloc[:, 0])
idx, _, _ = MRMR.mrmr(X, y, n_selected_features=500)
# shift the feature indices by one to map them back to expro's columns,
# since column 0 (the label) was dropped when building X
markers = expro.iloc[:, idx + 1]
markers.to_csv(sys.argv[2], index=False)
def experiment(data, box, cv, output):
    """
    Write the results of an experiment.

    This function will run an experiment for a specific dataset for a
    bounding box. There will be `cv` randomized runs and the outputs will be
    written to a file.

    Parameters
    ----------
    data : string
        Dataset name.
    box : string
        Bounding box on the file name.
    cv : int
        Number of randomized runs.
    output : string
        Path of the .npz file the accuracy matrix is written to.

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If the percent poison exceeds the number of samples in the requested data.
    """
    # data, box, cv, output = 'conn-bench-sonar-mines-rocks', '1', 5, 'results/test.npz'

    # load normal and adversarial data
    path_adversarial_data = 'data/attacks/' + data + '_[xiao][' + box + '].csv'
    df_normal = pd.read_csv('data/clean/' + data + '.csv', header=None).values
    df_adversarial = pd.read_csv(path_adversarial_data, header=None).values

    # separate out the normal and adversarial data
    Xn, yn = df_normal[:, :-1], df_normal[:, -1]
    Xa, ya = df_adversarial[:, :-1], df_adversarial[:, -1]

    # change the labels from +/-1 to [0, 1]
    ya[ya == -1], yn[yn == -1] = 0, 0

    # calculate the ratios of data that would be used for training and hold out
    p0, p1 = 1. / cv, (1. - 1. / cv)
    N = len(Xn)

    # calculate the total number of training and testing samples and set the
    # number of features that are going to be selected
    Ntr, Nte = int(p1 * N), int(p0 * N)
    # [OBS]: losing one feature in the process
    n_selected_features = int(Xn.shape[1] * SEL_PERCENT) + 1

    # zero the results out
    acc_KNN = np.zeros((NPR, 6))

    ####################################
    # CLASSIFICATION
    ####################################
    # run `cv` randomized experiments. note this is not performing
    # cross-validation; rather we use randomized splits of the data.
    for _ in range(cv):
        # shuffle up the data for the experiment then split the data into a
        # training and testing dataset
        i = np.random.permutation(N)
        Xtrk, ytrk, Xtek, ytek = Xn[i][:Ntr], yn[i][:Ntr], Xn[i][-Nte:], yn[i][-Nte:]

        # classification on normal data with no FS
        yn_allfeature_KNN = KNN_classification(Xtrk, ytrk, Xtek, ytek)

        # classification on JMI-based features on normal data
        sf_base_jmi = JMI.jmi(Xtrk, ytrk, n_selected_features=n_selected_features)[FEAT_IDX]
        Xtr_jmi = Xtrk[:, sf_base_jmi]
        Xte_jmi = Xtek[:, sf_base_jmi]
        yn_JMI_KNN = KNN_classification(Xtr_jmi, ytrk, Xte_jmi, ytek)

        for n in range(NPR):
            # calculate the number of poisoned samples needed so that the
            # poisoning ratio is correct in the training data. e.g., if you
            # have N=100 samples and you want to poison by 20%, then the 20%
            # needs to be of the training size, hence it is not 20.
            Np = int(len(ytrk) * POI_RNG[n] + 1)
            if Np >= len(ya):
                # shouldn't happen, but catch the case where we request more
                # poison samples than are available. NEED TO BE CAREFUL WHEN
                # WE ARE CREATING THE ADVERSARIAL DATA
                raise ValueError('Number of poison data requested is larger than the available data.')

            # find the number of normal (i.e., not poisoned) samples in the
            # training data, then create the randomized data set that has Nn
            # normal samples and Np adversarial samples in the training data
            Nn = len(ytrk) - Np
            idx_normal, idx_adversarial = np.random.permutation(len(ytrk))[:Nn], \
                np.random.permutation(len(ya))[:Np]
            Xtrk_poisoned, ytrk_poisoned = np.concatenate((Xtrk[idx_normal], Xa[idx_adversarial])), \
                np.concatenate((ytrk[idx_normal], ya[idx_adversarial]))

            ya_allfeature_KNN = KNN_classification(Xtrk_poisoned, ytrk_poisoned, Xtek, ytek)

            # run feature selection with the training data that has adversarial samples
            sf_adv_jmi = JMI.jmi(Xtrk_poisoned, ytrk_poisoned, n_selected_features=n_selected_features)[FEAT_IDX]
            sf_adv_mim = MIM.mim(Xtrk_poisoned, ytrk_poisoned, n_selected_features=n_selected_features)[FEAT_IDX]
            sf_adv_mrmr = MRMR.mrmr(Xtrk_poisoned, ytrk_poisoned, n_selected_features=n_selected_features)[FEAT_IDX]
            sf_adv_misf = MIFS.mifs(Xtrk_poisoned, ytrk_poisoned, n_selected_features=n_selected_features)[FEAT_IDX]

            # KNN classification on JMI selected features
            Xtrk_poisoned_JMI = Xtrk_poisoned[:, sf_adv_jmi]
            Xtest_JMI = Xtek[:, sf_adv_jmi]
            ya_JMI_KNN = KNN_classification(Xtrk_poisoned_JMI, ytrk_poisoned, Xtest_JMI, ytek)

            # KNN classification on MIM selected features
            Xtrk_poisoned_MIM = Xtrk_poisoned[:, sf_adv_mim]
            Xtest_MIM = Xtek[:, sf_adv_mim]
            ya_MIM_KNN = KNN_classification(Xtrk_poisoned_MIM, ytrk_poisoned, Xtest_MIM, ytek)

            # KNN classification on MRMR selected features
            Xtrk_poisoned_MRMR = Xtrk_poisoned[:, sf_adv_mrmr]
            Xtest_MRMR = Xtek[:, sf_adv_mrmr]
            ya_MRMR_KNN = KNN_classification(Xtrk_poisoned_MRMR, ytrk_poisoned, Xtest_MRMR, ytek)

            # KNN classification on MIFS selected features
            Xtrk_poisoned_MISF = Xtrk_poisoned[:, sf_adv_misf]
            Xtest_MISF = Xtek[:, sf_adv_misf]
            ya_MISF_KNN = KNN_classification(Xtrk_poisoned_MISF, ytrk_poisoned, Xtest_MISF, ytek)

            # accumulate accuracy in a matrix of size NPR x 6
            acc_KNN[n, 0] += accuracy_score(ytek, yn_allfeature_KNN)  # normal data, no FS
            acc_KNN[n, 1] += accuracy_score(ytek, ya_allfeature_KNN)  # adversarial data, no FS
            acc_KNN[n, 2] += accuracy_score(ytek, ya_JMI_KNN)   # adversarial data, JMI FS
            acc_KNN[n, 3] += accuracy_score(ytek, ya_MIM_KNN)   # adversarial data, MIM FS
            acc_KNN[n, 4] += accuracy_score(ytek, ya_MRMR_KNN)  # adversarial data, MRMR FS
            acc_KNN[n, 5] += accuracy_score(ytek, ya_MISF_KNN)  # adversarial data, MIFS FS

    # scale the accuracy statistics by 1.0/cv then write the output file
    acc_KNN = acc_KNN / cv
    print("\n Accuracy matrix of KNN")
    print("[COL]: Norm_noFS, Adv_noFS, Adv_JMI, Adv_MIM, Adv_MRMR, Adv_MISF")
    print("[ROW]: Poisoning ratios: 0.01, 0.025, 0.05, 0.075, 0.1, 0.125, 0.15, 0.175, 0.2")
    print("\n", acc_KNN)
    np.savez(output, acc_KNN=acc_KNN)
    return None
def mrmr(X, y, feat_names, num_features):
    indexes, _, _ = MRMR.mrmr(X, np.ravel(y), n_selected_features=num_features)
    results = [feat_names[idx] for idx in indexes]
    return results
def main():
    max_iter = int(sys.argv[4])
    acc = 0
    acc_ts = 0
    acc1_ts = 0
    acc2_ts = 0
    acc1 = 0
    acc2 = 0
    avg_feature = 0
    n_clus = int(sys.argv[1])
    k = int(sys.argv[2])
    dataset = str(sys.argv[3])
    j = 0
    data = scipy.io.loadmat(dataset)
    X = data['X']
    X = X.astype(float)
    y = data['Y']
    y = y[:, 0]
    n, d = X.shape
    classes = np.unique(y)
    n_class = len(classes)
    # KFold(n, n_folds=...) from sklearn.cross_validation is deprecated;
    # use sklearn.model_selection.KFold instead
    ss = KFold(n_splits=max_iter, shuffle=True)
    clf = svm.SVC(kernel='linear', C=1)
    for train, test in ss.split(X):
        print("%d-th iteration" % (j + 1))
        Xtr, Xts, ytr, yts = X[train], X[test], y[train], y[test]
        ntr = Xtr.shape[0]
        nts = Xts.shape[0]
        kmeans = KMeans(init='k-means++', n_clusters=n_clus, n_init=10).fit(Xtr)
        distr = kmeans.transform(Xtr)
        dists = kmeans.transform(Xts)
        distr = distr + 0.0001
        dists = dists + 0.0001
        one = np.ones(distr.shape)
        one1 = np.ones(dists.shape)
        distr = np.divide(one, distr)
        dists = np.divide(one1, dists)
        c_labels = kmeans.predict(Xtr)
        c_labelst = kmeans.predict(Xts)
        inv_dis_max = max(distr.max(), dists.max())
        distr = distr / inv_dis_max
        dists = dists / inv_dis_max
        print(inv_dis_max)
        print(min(distr.min(), dists.min()))
        Scores, features = select(Xtr, ytr, c_labels, n_clus, k)
        Xtrnew = Xtr[:, features]
        Xtsnew = Xts[:, features]
        print(Scores)
        fwtr = expit(10 * distr.dot(Scores.transpose()))
        fwts = expit(10 * dists.dot(Scores.transpose()))
        print(fwtr.max(), fwtr.min())
        print(fwts.max(), fwts.min())
        Xtrnew = np.multiply(Xtrnew, fwtr)
        Xtsnew = np.multiply(Xtsnew, fwts)
        ynew = np.zeros((ytr.shape))
        ysnew = np.zeros((yts.shape))
        clf.fit(Xtrnew, ytr)
        ynew = clf.predict(Xtrnew)
        ysnew = clf.predict(Xtsnew)
        clf.fit(Xtr, ytr)
        y1 = clf.predict(Xtr)
        y1s = clf.predict(Xts)
        num_fea = k
        # accumulate so the average over folds below is meaningful
        avg_feature = avg_feature + k
        id1, _, _ = MRMR.mrmr(Xtr, ytr, n_selected_features=num_fea)
        Xtr2 = Xtr[:, id1[0:num_fea]]
        Xts2 = Xts[:, id1[0:num_fea]]
        clf.fit(Xtr2, ytr)
        y2 = clf.predict(Xtr2)
        y2s = clf.predict(Xts2)
        acc = acc + metrics.accuracy_score(ytr, ynew)
        acc1 = acc1 + metrics.accuracy_score(ytr, y1)
        acc2 = acc2 + metrics.accuracy_score(ytr, y2)
        acc_ts = acc_ts + metrics.accuracy_score(ysnew, yts)
        acc1_ts = acc1_ts + metrics.accuracy_score(y1s, yts)
        acc2_ts = acc2_ts + metrics.accuracy_score(y2s, yts)
        j = j + 1
    print("Original Dataset Size %d %d" % X.shape)
    print("Average number of features after selection : %f" % (avg_feature * 1.0 / max_iter))
    print("Accuracy after Selection : Train Set: %f" % (acc * 100.0 / max_iter))
    print("Test Set : %f" % (acc_ts * 100.0 / max_iter))
    print("Accuracy with all features : Train Set : %f" % (acc1 * 100.0 / max_iter))
    print("Test Set: %f" % (acc1_ts * 100.0 / max_iter))
    print("Accuracy by mRMR for same number of features: Train Set: %f" % (acc2 * 100.0 / max_iter))
    print("Test Set: %f" % (acc2_ts * 100.0 / max_iter))
def fit(self, X, y):
    idx = []
    if self.tp == 'ITB':
        if self.name == 'MRMR':
            # MRMR.mrmr returns (indices, J_CMI, MIfy); keep only the indices
            idx, _, _ = MRMR.mrmr(X, y, n_selected_features=self.params['num_feats'])
    elif self.tp == 'filter':
        if self.name == 'Relief':
            score = reliefF.reliefF(X, y, k=self.params['k'])
            idx = reliefF.feature_ranking(score)
        if self.name == 'Fisher':
            # obtain the score of each feature on the training set
            score = fisher_score.fisher_score(X, y)
            # rank features in descending order according to score
            idx = fisher_score.feature_ranking(score)
        if self.name == 'MI':
            idx = np.argsort(mutual_info_classif(
                X, y, n_neighbors=self.params['n_neighbors']))[::-1]
    elif self.tp == 'wrapper':
        model_fit = self.model.fit(X, y)
        model = SelectFromModel(model_fit, prefit=True)
        idx = model.get_support(indices=True)
    elif self.tp == 'SLB':
        # one-hot encode the target
        y = construct_label_matrix(y)
        if self.name == 'SMBA':
            scba = fs.SCBA(data=X, alpha=self.params['alpha'],
                           norm_type=self.params['norm_type'],
                           verbose=self.params['verbose'],
                           thr=self.params['thr'],
                           max_iter=self.params['max_iter'],
                           affine=self.params['affine'],
                           normalize=self.params['normalize'],
                           step=self.params['step'],
                           PCA=self.params['PCA'],
                           GPU=self.params['GPU'],
                           device=self.params['device'])
            nrmInd, sInd, repInd, _ = scba.admm()
            if self.params['type_indices'] == 'nrmInd':
                idx = nrmInd
            elif self.params['type_indices'] == 'repInd':
                idx = repInd
            else:
                idx = sInd
        if self.name == 'RFS':
            W = RFS.rfs(X, y, gamma=self.params['gamma'])
            idx = feature_ranking(W)
        if self.name == 'll_l21':
            # obtain the feature weight matrix
            W, _, _ = ll_l21.proximal_gradient_descent(X, y, z=self.params['z'], verbose=False)
            # rank features according to the feature weight matrix
            idx = feature_ranking(W)
        if self.name == 'ls_l21':
            # obtain the feature weight matrix
            W, _, _ = ls_l21.proximal_gradient_descent(X, y, z=self.params['z'], verbose=False)
            # rank features according to the feature weight matrix
            idx = feature_ranking(W)
        if self.name == 'LASSO':
            LASSO = Lasso(alpha=self.params['alpha'], positive=True)
            y_pred_lasso = LASSO.fit(X, y)
            if y_pred_lasso.coef_.ndim == 1:
                coeff = y_pred_lasso.coef_
            else:
                coeff = np.asarray(y_pred_lasso.coef_[0, :])
            idx = np.argsort(-coeff)
        if self.name == 'EN':
            # elastic net with an L1 ratio of 1
            enet = ElasticNet(alpha=self.params['alpha'], l1_ratio=1, positive=True)
            y_pred_enet = enet.fit(X, y)
            if y_pred_enet.coef_.ndim == 1:
                coeff = y_pred_enet.coef_
            else:
                coeff = np.asarray(y_pred_enet.coef_[0, :])
            idx = np.argsort(-coeff)
    return idx
tn, fp, fn, tp = confusion_matrix(Y_test, Y_pred).ravel()
print("Specificity: " + repr(tn / (tn + fp)))
sheet_test.write(r, c, roc_auc_score(Y_test, Y_pred))
r = r + 1
c = c + 1
r = 0
# each skfeature selector returns (indices, ...); keep element [0] so that
# the stored value is an index array rather than the whole tuple
MV_sel = []
MV_sel.append(('MIM', MIM.mim(X_train, Y_train, n_selected_features=num_fea)[0]))
print('MIM')
MV_sel.append(('MIFS', MIFS.mifs(X_train, Y_train, n_selected_features=num_fea)[0]))
print('MIFS')
MV_sel.append(('MRMR', MRMR.mrmr(X_train, Y_train, n_selected_features=num_fea)[0]))
print('MRMR')
MV_sel.append(('CIFE', CIFE.cife(X_train, Y_train, n_selected_features=num_fea)[0]))
print('CIFE')
MV_sel.append(('JMI', JMI.jmi(X_train, Y_train, n_selected_features=num_fea)[0]))
print('JMI')
MV_sel.append(('CMIM', CMIM.cmim(X_train, Y_train, n_selected_features=num_fea)[0]))
print('CMIM')
MV_sel.append(('ICAP', ICAP.icap(X_train, Y_train, n_selected_features=num_fea)[0]))
print('ICAP')
MV_sel.append(('DISR', DISR.disr(X_train, Y_train, n_selected_features=num_fea)[0]))
pval.sort(key=takeSecond)
idx = []
for i in range(n_selected_features):
    idx.append(pval[i][0])
return idx

# MULTIVARIATE FEATURE SELECTION X CLASSIFICATION (10-fold CV) #
print('BEFORE')
# each skfeature selector returns (indices, ...); keep element [0] so the
# stored value is an index array (WLCX already returns indices directly)
MV_sel = []
MV_sel.append(('WLCX', WLCX(X, Y, n_selected_features=num_fea)))
print('WLCX')
MV_sel.append(('MIFS', MIFS.mifs(X, Y, n_selected_features=num_fea)[0]))
print('MIFS')
MV_sel.append(('MRMR', MRMR.mrmr(X, Y, n_selected_features=num_fea)[0]))
print('MRMR')
MV_sel.append(('CIFE', CIFE.cife(X, Y, n_selected_features=num_fea)[0]))
print('CIFE')
MV_sel.append(('JMI', JMI.jmi(X, Y, n_selected_features=num_fea)[0]))
print('JMI')
MV_sel.append(('CMIM', CMIM.cmim(X, Y, n_selected_features=num_fea)[0]))
print('CMIM')
MV_sel.append(('ICAP', ICAP.icap(X, Y, n_selected_features=num_fea)[0]))
print('ICAP')
MV_sel.append(('DISR', DISR.disr(X, Y, n_selected_features=num_fea)[0]))

for name, model in models:
    for kind, idx in MV_sel:
        # X_sel = X[:, idx[0:num_fea]]
        # X_test_ = X_test[:, idx[0:num_fea]]
        X_train_ = X_train[:, idx[0:num_fea]]
def supervised_mrmr(X, y=None, **kwargs):
    idx, _, _ = MRMR.mrmr(X, y)
    return idx
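# Note: with n_selected_features omitted, skfeature keeps adding features
# until its selection criterion stops improving, which (as the comment in
# mrmr_feature_select above warns) can still leave many features selected.
# Hypothetical usage, assuming a discrete feature matrix and labels:
# idx = supervised_mrmr(X_discrete, y)
# X_reduced = X_discrete[:, idx]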
print("*** Discretize Data ***") print(discretize_data) print(discretize_data.shape) # ***** Discretization End ***** # ***** Feature Extraction Stage (MRMR) Start ***** import scipy.io from sklearn.metrics import accuracy_score from sklearn.model_selection import cross_validate from sklearn.model_selection import train_test_split from sklearn import svm from skfeature.function.information_theoretical_based import MRMR num_fea = 10 feature_extraction = MRMR.mrmr(discretize_data, target_data, n_selected_features=num_fea) print('\n') print("*** Selected Feature ***") print(feature_extraction) # ***** Feature Extraction Stage (MRMR) End ***** # ***** Concat Stage Start ***** selected_data = discretize_data[:, [0, 16, 9, 8, 17, 35, 2, 22, 20, 19]] import numpy as np concat_data = np.arange(12375).reshape(1125, 11) concat_data = np.concatenate((selected_data, target_data[:, None]), axis=1) print('\n') print('*** Concat Data ***') print(concat_data)
import scipy.io
from sklearn.model_selection import cross_validate, KFold
from sklearn import svm
from skfeature.function.information_theoretical_based import MRMR

# load data
mat = scipy.io.loadmat('data/colon.mat')
X = mat['X']  # data, (62, 2000)
X = X.astype(float)
y = mat['Y']  # label
y = y[:, 0]  # (62,)
# n_samples, n_features = X.shape  # number of samples and number of features

# perform evaluation on a classification task
num_fea = 10  # number of selected features

# obtain the index of each feature on the training set
'''
Output
------
F: {numpy array}, shape (n_features,)
    index of selected features, F[0] is the most important feature
J_CMI: {numpy array}, shape (n_features,)
    corresponding objective function value of selected features
MIfy: {numpy array}, shape (n_features,)
    corresponding mutual information between selected features and response
'''
idx, _, _ = MRMR.mrmr(X, y, n_selected_features=num_fea)

# obtain selected features
selected_features = X[:, idx[0:num_fea]]
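# Optional convenience (a sketch, not part of skfeature): wrap the call above
# in a scikit-learn style transformer so MRMR can sit inside a Pipeline.
from sklearn.base import BaseEstimator, TransformerMixin

class MRMRSelector(BaseEstimator, TransformerMixin):
    """Select the top n features ranked by skfeature's MRMR."""

    def __init__(self, n_selected_features=10):
        self.n_selected_features = n_selected_features

    def fit(self, X, y):
        # keep only the index array from the (F, J_CMI, MIfy) tuple
        idx, _, _ = MRMR.mrmr(X, y, n_selected_features=self.n_selected_features)
        self.idx_ = idx[:self.n_selected_features]
        return self

    def transform(self, X):
        return X[:, self.idx_]

# e.g. selected = MRMRSelector(num_fea).fit(X, y).transform(X)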
print(x.shape, y.shape)
clf = load_clf(FINAL_CLASSIFIER)
perfs = np.zeros(10)
# random_state only takes effect when shuffle=True
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
fold_index = 0
for train_index, test_index in skf.split(x, y):
    print("fold:", fold_index + 1)
    if fold_index != 0:
        continue  # only evaluate the first fold
    x_train, x_test = x[train_index], x[test_index]
    y_train, y_test = y[train_index], y[test_index]
    for i, k in enumerate(np.arange(10, 101, 10)):
        idx, _, _ = MRMR.mrmr(x_train, y_train, n_selected_features=k)
        x_train_selected = x_train[:, idx[0:k]]
        x_test_selected = x_test[:, idx[0:k]]
        clf.fit(x_train_selected, y_train)
        y_pred = clf.predict(x_test_selected)
        accu = accuracy_score(y_test, y_pred)
        print("selected features:", k, "accu:", accu)
        perfs[i] += accu
    fold_index += 1
print("n_splits:", n_splits)
print("mrmr", DATASET, FINAL_CLASSIFIER)
# perfs /= n_splits
def main():
    max_iter = int(sys.argv[3])
    clus_max = int(sys.argv[1])
    dataset = str(sys.argv[2])
    var = float(sys.argv[4])
    k = int(sys.argv[5])
    j = 0
    data = scipy.io.loadmat(dataset)
    X = data['X']
    X = X.astype(float)
    y = data['Y']
    y = y[:, 0]
    n, d = X.shape
    classes = np.unique(y)
    n_class = len(classes)
    # choose the number of clusters by silhouette score
    s_scores = np.zeros((clus_max, 1))
    for i in range(clus_max):
        kmeans = KMeans(init='k-means++', n_clusters=i + 2, n_init=10)
        labels = kmeans.fit_predict(X)
        s_scores[i] = metrics.silhouette_score(X, labels)
    n_clus = np.argmax(s_scores) + 2
    ss = StratifiedKFold(n_splits=max_iter, shuffle=True)
    clf = svm.SVC()
    parameters = {'kernel': ('linear', 'rbf'), 'C': [0.01, 0.1, 1, 10, 100]}
    grid_search = GridSearchCV(clf, param_grid=parameters)
    acc = 0
    acc1 = 0
    acc2 = 0
    acc_ts = 0
    acc1_ts = 0
    acc2_ts = 0
    avg_feature = 0
    redundant_features = 0
    # inverse-frequency class weights
    un, inv, cnts = np.unique(y, return_inverse=True, return_counts=True)
    un = 1. / cnts
    un = un / min(un)
    cl_wts = un[inv]
    j = 0
    for train, test in ss.split(X, y):
        print("%d-th iteration" % (j + 1))
        Xtr, Xts, ytr, yts, cl_tr, cl_ts = X[train], X[test], y[train], y[test], cl_wts[train], cl_wts[test]
        ntr = Xtr.shape[0]
        nts = Xts.shape[0]
        kmeans = KMeans(init='k-means++', n_clusters=n_clus, n_init=10).fit(Xtr)
        distr = kmeans.transform(Xtr)
        dists = kmeans.transform(Xts)
        distr = distr + 0.0001
        dists = dists + 0.0001
        one = np.ones(distr.shape)
        one1 = np.ones(dists.shape)
        distr = np.divide(one, distr)
        dists = np.divide(one1, dists)
        c_labels = kmeans.predict(Xtr)
        c_labelst = kmeans.predict(Xts)
        inv_dis_max = max(distr.max(), dists.max())
        distr = distr / inv_dis_max
        dists = dists / inv_dis_max
        Scores, features = select(Xtr, ytr, c_labels, n_clus, k)
        Xtrnew = Xtr[:, features]
        Xtsnew = Xts[:, features]
        # count pairs of selected features with high mutual information
        for l in range(Xtrnew.shape[1]):
            for m in range(l):
                if midd(Xtrnew[:, l], Xtrnew[:, m]) > 1:
                    redundant_features = redundant_features + 1
        fwtr = expit(var * distr.dot(Scores.transpose()))
        fwts = expit(var * dists.dot(Scores.transpose()))
        Xtrnew = np.multiply(Xtrnew, fwtr)
        Xtsnew = np.multiply(Xtsnew, fwts)
        ynew = np.zeros((ytr.shape))
        ysnew = np.zeros((yts.shape))
        for i in range(n_clus):
            arr1 = (c_labels == i)
            arr2 = (c_labelst == i)
            Xtrnewj = Xtrnew[arr1]
            Xtsnewj = Xtsnew[arr2]
            ytrj = ytr[arr1]
            grid_search.fit(Xtrnewj, ytrj)
            ynew[arr1] = grid_search.predict(Xtrnewj)
            ysnew[arr2] = grid_search.predict(Xtsnewj)
        grid_search.fit(Xtr, ytr)
        y1 = grid_search.predict(Xtr)
        y1s = grid_search.predict(Xts)
        num_fea = Xtrnew.shape[1]
        # avg_feature is a scalar accumulator (the original indexed it with [k])
        avg_feature = avg_feature + num_fea
        id1, _, _ = MRMR.mrmr(Xtr, ytr, n_selected_features=num_fea)
        Xtr2 = Xtr[:, id1[0:num_fea]]
        Xts2 = Xts[:, id1[0:num_fea]]
        grid_search.fit(Xtr2, ytr)
        y2 = grid_search.predict(Xtr2)
        y2s = grid_search.predict(Xts2)
        acc = acc + metrics.accuracy_score(ytr, ynew)
        acc1 = acc1 + metrics.accuracy_score(ytr, y1)
        acc2 = acc2 + metrics.accuracy_score(ytr, y2)
        acc_ts = acc_ts + metrics.accuracy_score(ysnew, yts)
        acc1_ts = acc1_ts + metrics.accuracy_score(y1s, yts)
        acc2_ts = acc2_ts + metrics.accuracy_score(y2s, yts)
        j = j + 1
    avg_feature = avg_feature * 1.0 / max_iter
    redundant_features = redundant_features * 1.0 / max_iter
    acc = acc * 100.0 / max_iter
    acc1 = acc1 * 100.0 / max_iter
    acc2 = acc2 * 100.0 / max_iter
    acc_ts = acc_ts * 100.0 / max_iter
    acc1_ts = acc1_ts * 100.0 / max_iter
    acc2_ts = acc2_ts * 100.0 / max_iter
    print("Original Dataset Size %d %d" % X.shape)
    print("Average number of features after selection :")
    print(avg_feature)
    print("Number of clusters :")
    print(n_clus)
    print("Redundant feature pairs :")
    print(redundant_features)
    print("Accuracy after Selection : Train Set:")
    print(acc)
    print("Test Set :")
    print(acc_ts)
    print("Accuracy with all features : Train Set:")
    print(acc1)
    print("Test Set:")
    print(acc1_ts)
    print("Accuracy by mRMR for same number of features: Train Set:")
    print(acc2)
    print("Test Set:")
    print(acc2_ts)
    scipy.io.savemat(
        dataset + '.mat',
        {'acc': acc, 'acc1': acc1, 'acc2': acc2,
         'acc_ts': acc_ts, 'acc1_ts': acc1_ts, 'acc2_ts': acc2_ts,
         'avg_feature': avg_feature, 'redundant_features': redundant_features})
result = pymrmr.mRMR(X, 'MIQ', 10)
print(result)

def import_Data():
    Data = pd.read_csv('Disease_Data_BiGram.csv')
    # print(Data.shape)
    X = Data.iloc[:, 0:Data.shape[1] - 2]
    Y = Data['Class']
    Y_ = Data['Subject']
    return X, Y, Y_

FS = {}
X, Y, Y_ = import_Data()
# each skfeature selector returns (indices, ...); take element [0] before
# slicing, since slicing the whole tuple would not yield feature indices
FS['MRMR'] = X.columns[MRMR.mrmr(np.array(X), Y_, n_selected_features=15)[0][:15]]
FS['JMI'] = X.columns[JMI.jmi(np.array(X), Y_, n_selected_features=15)[0][:15]]
FS['MIFS'] = X.columns[MIFS.mifs(np.array(X), Y_, n_selected_features=15)[0][:15]]
FS['MIM'] = X.columns[MIM.mim(np.array(X), Y_, n_selected_features=15)[0][:15]]
FS = pd.DataFrame(FS)
print(FS)
FS.to_csv('Selected_Features_MultiVar_BiG.csv')
# print(pd.DataFrame(FS))
# model = apply_Model(X,Y_)
# print("TRAIN:", train_index, "TEST:", test_index) # 离散数据 X_train, X_test = X[train_index], X[test_index] y_train, y_test = y[train_index], y[test_index] print('训练集数量:', X_train.shape, '测试集数量:', X_test.shape) # print(y_train) # print(y_test) # 原始数据 X_train_raw, X_test_raw = X_raw[train_index], X_raw[test_index] y_train_raw, y_test_raw = y_raw[train_index], y_raw[test_index] print('训练集数量:', X_train_raw.shape, '测试集数量:', X_test_raw.shape) # obtain the index of each feature on the training set which is selected idx, _, _ = MRMR.mrmr(X_train, y_train, n_selected_features=num_fea) print("the index of selected feature is: " + str(idx)) # save the index of selected feature feature_file = nDim_DIR + "特征.txt" f0 = open(feature_file, 'a+') # 基因 f0.write(str(idx)) f0.write('\n') f0.close() # obtain the dataset on the selected features # update the discrete train data and the discrete test data X_train_select_disc = X_train[:, idx[0:num_fea]] X_test_select_disc = X_test[:, idx[0:num_fea]] # obtain the dataset on the selected features
from sklearn.feature_selection import RFE, SelectKBest
from sklearn.svm import SVR

bestFeat = SelectKBest()
bestFeat.fit(train_X, train_Y)
feat_scr = zip(feats, bestFeat.scores_)
feat_scr = [f for f in feat_scr if not np.isnan(f[1])]
sorted_fetas = sorted(feat_scr, key=lambda k: k[1], reverse=True)

# estimator = SVR(kernel="linear")
# selector = RFE(estimator, 5, step=1)
# selector.fit(train_X, train_Y)  # slow

from sklearn.ensemble import GradientBoostingClassifier

g_cls = GradientBoostingClassifier(n_estimators=10)
g_cls.fit(train_X, train_Y)
g_feats = g_cls.feature_importances_
g_feat_scr = zip(feats, g_feats)
g_feat_scr = [f for f in g_feat_scr if not np.isnan(f[1])]
g_sorted_fetas = sorted(g_feat_scr, key=lambda k: k[1], reverse=True)

from skfeature.function.information_theoretical_based import FCBF, LCSI, MRMR, JMI

score = FCBF.fcbf(train_X, train_Y)
fcbf_sorted = [feats[i] for i in score]
# MRMR.mrmr and JMI.jmi return (indices, J_CMI, MIfy); keep only the indices
idx, _, _ = MRMR.mrmr(train_X, train_Y, n_selected_features=50)
MRMR_sorted = [feats[i] for i in idx]
idx, _, _ = JMI.jmi(train_X, train_Y, n_selected_features=50)
JMI_sorted = [feats[i] for i in idx]