def rank_features_using_chisquare(cls, data_frame, target_key, cols_to_ignore=None):
    """Rank the non-negative feature columns of a frame by chi-square score.

    The target column and every column named in ``cols_to_ignore`` are
    excluded from the candidate features.  Of the remaining columns, any
    column containing at least one negative value is dropped before
    scoring.

    Parameters
    ----------
    cls
        Not used by the body.
    data_frame
        Frame supporting ``keys()`` and ``.loc`` column selection
        (presumably a pandas DataFrame -- confirm against callers).
    target_key
        Name of the column holding the target values.
    cols_to_ignore : iterable, optional
        Column names to leave out of the ranking.

    Returns
    -------
    tuple
        ``(score, ranked_features, non_negative_value_columns)`` where
        ``score`` is the result of ``chi_square.chi_square``,
        ``ranked_features`` lists the kept column names in the order given
        by ``chi_square.feature_ranking`` and
        ``non_negative_value_columns`` lists the kept column names in
        their original order.

    Raises
    ------
    ValueError
        If ``target_key`` or an entry of ``cols_to_ignore`` is not among
        the remaining column names (this includes an entry listed twice).
    """
    keys = list(data_frame.keys())

    # list.remove raises ValueError for an unknown name, so typos in the
    # arguments surface immediately instead of being silently ignored.
    keys.remove(target_key)
    if cols_to_ignore is not None:
        for col in cols_to_ignore:
            keys.remove(col)

    Y = data_frame.loc[:, target_key].values

    # Flag every candidate column that holds at least one negative value.
    candidates = data_frame.loc[:, keys]
    neg_test_result = np.any(candidates < 0, axis=0)
    non_negative_value_columns = [
        key for key, has_negative in zip(keys, neg_test_result) if not has_negative
    ]

    # Get data for only the non-negative valued columns
    X = data_frame.loc[:, non_negative_value_columns]

    score = chi_square.chi_square(X, Y)
    rank = chi_square.feature_ranking(score)
    ranked_features = [non_negative_value_columns[i] for i in rank]
    return score, ranked_features, non_negative_value_columns
def apply_impl(self, data):
    """Score and rank the features of ``data``, then delegate to ``use_impl``.

    Stores the chi-square scores in ``self._score``, the ranking in
    ``self._rank`` and the number of features to keep
    (``ceil(ratio * n_columns)``) in ``self._nro_features``.

    Returns whatever ``self.use_impl(data)`` returns.
    """
    features, labels = data.Xy

    # TODO: verify if is possible implement this with numpy
    label_codes = pd.Categorical(labels).codes

    # Input X must be non-negative; this is violated when some scaler
    # generates negative values.
    scores = chi_square.chi_square(features, label_codes)
    self._score = scores
    self._rank = chi_square.feature_ranking(scores)

    column_count = features.shape[1]
    self._nro_features = math.ceil(self.ratio * column_count)

    return self.use_impl(data)
def run_fold(trial, P, X, y, method, dataset, parttype):
    """Compute a feature ranking/selection on one training fold and time it.

    Parameters
    ----------
    trial : int
        Column of ``P`` that describes the fold.
    P : array
        Partition matrix; rows whose entry in column ``trial`` equals 1
        form the training subset.
    X, y : arrays
        Data and labels; both are indexed with the boolean training mask.
        ``X`` must be two-dimensional.
    method : str
        One of ``'fisher'``, ``'chi2'``, ``'relieff'``, ``'jmi'``,
        ``'mrmr'``, ``'infogain'``, ``'svmrfe'``, ``'hdmr'``,
        ``'hdmrhaar'``.
    dataset, parttype : str
        Only used in the progress message.

    Returns
    -------
    dict
        ``{'features': <result of the chosen method>,
        'cputime': <elapsed seconds measured with time.time()>}``.

    Raises
    ------
    ValueError
        If ``method`` is not one of the supported names.  (Previously a
        message was printed and the function then failed on the unbound
        ``features`` variable.)
    """
    print('Obtaining features for %s %s %s fold: %2d' % (parttype, method, dataset, trial))

    _, n_features = X.shape
    train = P[:, trial] == 1
    trnX = X[train]
    trnY = y[train]

    start_time = time.time()
    if method == 'fisher':
        score = fisher_score.fisher_score(trnX, trnY)
        features = fisher_score.feature_ranking(score)
    elif method == 'chi2':
        score = chi_square.chi_square(trnX, trnY)
        features = chi_square.feature_ranking(score)
    elif method == 'relieff':
        score = reliefF.reliefF(trnX, trnY)
        features = reliefF.feature_ranking(score)
    elif method == 'jmi':
        features = JMI.jmi(trnX, trnY, n_selected_features=n_features)
    elif method == 'mrmr':
        features = MRMR.mrmr(trnX, trnY, n_selected_features=n_features)
    elif method == 'infogain':
        features = MIM.mim(trnX, trnY, n_selected_features=n_features)
    elif method == 'svmrfe':
        features = svmrfe(trnX, trnY)
    elif method in ('hdmr', 'hdmrhaar'):
        sobol_set = scipy.io.loadmat('sobol_set.mat')['sobol_set'].astype(float)
        # The two HDMR variants differ only in the 'p' and 'b' settings.
        if method == 'hdmr':
            params = {'sobol_set': sobol_set, 'k': 1, 'p': 3, 'M': 1000, 'b': 'L'}
        else:
            params = {'sobol_set': sobol_set, 'k': 1, 'p': 255, 'M': 1000, 'b': 'H'}
        models = hdmrlearn(trnX, trnY, params)
        # NOTE(review): selection is run on the full X rather than trnX,
        # as in the original code -- confirm this is intended.
        features, w = hdmrselect(X, models)
    else:
        raise ValueError('unknown feature selection method: %r' % (method,))
    cputime = time.time() - start_time

    print(features)
    print('cputime %f' % cputime)
    return {'features': features, 'cputime': cputime}
def test_chi_squared(self):
    """Compare FilterChiSquare with the reference chi_square helpers."""
    X, y = self.DATA

    selector = FilterChiSquare(ratio=0.5)
    selector.fit(X, y)
    X_out, y_out = selector.transform(X, y)

    # Reference values computed directly with the scoring module.
    expected_score = chi_square.chi_square(X, y)
    expected_rank = chi_square.feature_ranking(expected_score)
    # NOTE(review): five presumably equals ratio * number of columns in
    # self.DATA -- confirm against the fixture.
    expected_selected = expected_rank[:5]

    # fit() is expected to return the filter itself.
    assert selector.fit(X, y) is selector
    assert np.array_equal(selector.rank(), expected_rank)
    assert np.allclose(selector.score(), expected_score)
    assert np.array_equal(selector.selected(), expected_selected)
    # Transformed data must be exactly the selected columns; labels untouched.
    assert np.allclose(X_out, X[:, expected_selected])
    assert np.array_equal(y_out, y)
def main():
    """Report the mean k-fold accuracy of a linear SVM on chi-square features.

    Loads ``../data/BASEHOCK.mat``, prints the data shape, and for every
    fold selects the top ``num_fea`` features by chi-square score, trains
    ``svm.LinearSVC`` on the training rows and scores it on the test rows.
    The mean accuracy over all folds is printed at the end.

    The chi-square scores are computed from the training rows of each fold
    only; scoring on the full data set would let the test labels influence
    which features are selected and inflate the reported accuracy.
    """
    # load data
    mat = scipy.io.loadmat('../data/BASEHOCK.mat')
    X = mat['X'].astype(float)    # data
    y = mat['Y'][:, 0]    # label
    n_samples = X.shape[0]    # number of samples
    print(X.shape)

    # split data into folds
    n_folds = 10
    ss = cross_validation.KFold(n_samples, n_folds=n_folds, shuffle=True)

    # perform evaluation on classification task
    num_fea = 100    # number of selected features
    clf = svm.LinearSVC()    # linear SVM

    correct = 0
    for train, test in ss:
        # obtain the chi-square score of each feature from the training fold only
        score = chi_square.chi_square(X[train], y[train])

        # rank features in descending order according to score
        idx = chi_square.feature_ranking(score)

        # obtain the dataset on the selected features
        selected_features = X[:, idx[0:num_fea]]

        # train a classification model with the selected features on the training dataset
        clf.fit(selected_features[train], y[train])

        # predict the class labels of test data
        y_predict = clf.predict(selected_features[test])

        # obtain the classification accuracy on the test data
        acc = accuracy_score(y[test], y_predict)
        correct = correct + acc

    # output the average classification accuracy over all folds
    # (single formatted string so the output is the same on Python 2 and 3)
    print('Accuracy: %s' % (float(correct) / n_folds))
def main():
    """Report the mean k-fold accuracy of a linear SVM on chi-square features.

    Loads ``../data/BASEHOCK.mat`` and, for every fold, selects the top
    ``num_fea`` features by chi-square score, trains ``svm.LinearSVC`` on
    the training rows and scores it on the test rows.  The mean accuracy
    over all folds is printed at the end.

    The chi-square scores are computed from the training rows of each fold
    only; scoring on the full data set would let the test labels influence
    which features are selected and inflate the reported accuracy.
    """
    # load data
    mat = scipy.io.loadmat('../data/BASEHOCK.mat')
    X = mat['X'].astype(float)    # data
    y = mat['Y'][:, 0]    # label
    n_samples = X.shape[0]    # number of samples

    # split data into folds
    n_folds = 10
    ss = cross_validation.KFold(n_samples, n_folds=n_folds, shuffle=True)

    # perform evaluation on classification task
    num_fea = 100    # number of selected features
    clf = svm.LinearSVC()    # linear SVM

    correct = 0
    for train, test in ss:
        # obtain the chi-square score of each feature from the training fold only
        score = chi_square.chi_square(X[train], y[train])

        # rank features in descending order according to score
        idx = chi_square.feature_ranking(score)

        # obtain the dataset on the selected features
        selected_features = X[:, idx[0:num_fea]]

        # train a classification model with the selected features on the training dataset
        clf.fit(selected_features[train], y[train])

        # predict the class labels of test data
        y_predict = clf.predict(selected_features[test])

        # obtain the classification accuracy on the test data
        acc = accuracy_score(y[test], y_predict)
        correct = correct + acc

    # output the average classification accuracy over all folds
    # (single formatted string: a two-argument print would show a tuple on Python 2)
    print('Accuracy: %s' % (float(correct) / n_folds))
X = dataset.iloc[:, 2: 32] # [all rows, col from index 2 to the last one excluding 'Unnamed: 32'] y = dataset.iloc[:, 1] # [all rows, col one only which contains the classes of cancer] labelencoder_Y = LabelEncoder() y = labelencoder_Y.fit_transform(y) X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0) X_train = X_train.values X_test = X_test.values # compute RFS scores score = RFS(X_train, y_train) idx = feature_ranking(score) np.save('features/rfs.npy', idx) print('Features saved') #idx = np.load('features/rfs.npy') # create copies of the data X_train_copy = X_train y_train_copy = y_train X_test_copy = X_test y_test_copy = y_test # train and compute accuracy of final model trained on selected features final_list = [] for num_fea in range(30, 0, -1): # load the copies of the original data X_train = X_train_copy
selected_fea_train = X_train[:, idx[0:num_features]] selected_fea_test = X_test[:, idx[0:num_features]] clf.fit(selected_fea_train, y_train) acc.append(accuracy_score(y_test, clf.predict(selected_fea_test))) # reliefF score = reliefF.reliefF(X_train, y_train) idx = reliefF.feature_ranking(score) selected_fea_train = X_train[:, idx[0:num_features]] selected_fea_test = X_test[:, idx[0:num_features]] clf.fit(selected_fea_train, y_train) acc.append(accuracy_score(y_test, clf.predict(selected_fea_test))) # chi_square score = chi_square.chi_square(np.abs(X_train), y_train) idx = chi_square.feature_ranking(score) selected_fea_train = X_train[:, idx[0:num_features]] selected_fea_test = X_test[:, idx[0:num_features]] clf.fit(selected_fea_train, y_train) acc.append(accuracy_score(y_test, clf.predict(selected_fea_test))) # pca pca = PCA(n_components=num_features) pca.fit(X_train) selected_fea_train = pca.transform(X_train) selected_fea_test = pca.transform(X_test) clf.fit(selected_fea_train, y_train) acc.append(accuracy_score(y_test, clf.predict(selected_fea_test))) # rfe estimator = LinearSVC(random_state=random_state)
def chi_square_FS(X, y):
    """Score features with chi-square and rank them.

    Returns a tuple ``(idx, score)``: the ranking produced by
    ``chi_square.feature_ranking`` followed by the scores returned by
    ``chi_square.chi_square(X, y)``.
    """
    feature_scores = chi_square.chi_square(X, y)
    ranking = chi_square.feature_ranking(feature_scores)
    return ranking, feature_scores