def rank_features_using_chisquare(cls, data_frame, target_key, cols_to_ignore=None):
    keys = list(data_frame.keys())
    target_col_idx = keys.index(target_key)
    # Remove the target column from keys
    del keys[target_col_idx]
    # Remove all columns that were asked to be ignored
    if cols_to_ignore is not None:
        for col in cols_to_ignore:
            idx = keys.index(col)
            del keys[idx]
    Y = data_frame.loc[:, target_key].values
    X = data_frame.loc[:, keys]
    # chi_square requires non-negative input, so drop any column with negatives
    neg_test_result = np.any(X < 0, axis=0)
    non_negative_value_columns = [
        keys[i] for i, res in enumerate(neg_test_result) if not res
    ]
    # Get data for only the non-negative valued columns
    X = data_frame.loc[:, non_negative_value_columns]
    score = chi_square.chi_square(X, Y)
    rank = chi_square.feature_ranking(score)
    ranked_features = [non_negative_value_columns[i] for i in rank]
    return score, ranked_features, non_negative_value_columns
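# Usage sketch (not from the source repo): the function above reads as a classmethod
# on some host class; "FeatureRanker" below is a hypothetical stand-in for it. It
# assumes the same imports the function uses: numpy as np, pandas as pd, and
# skfeature.function.statistical_based.chi_square. The column names are made up.
import numpy as np
import pandas as pd
from skfeature.function.statistical_based import chi_square

df = pd.DataFrame({
    'age':    [23, 45, 31, 52],
    'income': [10, 40, 25, 60],
    'noise':  [-1, 2, -3, 4],   # has negative values, so this column gets dropped
    'label':  [0, 1, 0, 1],
})
score, ranked, usable = FeatureRanker.rank_features_using_chisquare(df, 'label')
print(ranked)   # e.g. ['income', 'age'] -- 'noise' excluded as negative-valued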
def apply_impl(self, data):
    X, y = data.Xy
    # TODO: verify whether this can be implemented directly with numpy
    y = pd.Categorical(y).codes
    # Input X must be non-negative; negative values can appear when an
    # upstream scaler produces them.
    self._score = chi_square.chi_square(X, y)
    self._rank = chi_square.feature_ranking(self._score)
    self._nro_features = math.ceil(self.ratio * X.shape[1])
    return self.use_impl(data)
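# A guess at the complementary selection step (the framework's real use_impl lives
# elsewhere): keep the top ceil(ratio * n_features) columns by rank. Written as a
# standalone helper so the intent of _rank/_nro_features above is explicit.
def _select_top_features(X, rank, nro_features):
    # rank is in descending order of chi-square score
    return X[:, rank[:nro_features]]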
def run_fold(trial, P, X, y, method, dataset, parttype):
    print('Obtaining features for %s %s %s fold: %2d' % (parttype, method, dataset, trial))
    n_samples, n_features = X.shape
    train = P[:, trial] == 1
    trnX = X[train]
    trnY = y[train]
    start_time = time.time()
    if method == 'fisher':
        score = fisher_score.fisher_score(trnX, trnY)
        features = fisher_score.feature_ranking(score)
    elif method == 'chi2':
        score = chi_square.chi_square(trnX, trnY)
        features = chi_square.feature_ranking(score)
    elif method == 'relieff':
        score = reliefF.reliefF(trnX, trnY)
        features = reliefF.feature_ranking(score)
    elif method == 'jmi':
        features = JMI.jmi(trnX, trnY, n_selected_features=n_features)
    elif method == 'mrmr':
        features = MRMR.mrmr(trnX, trnY, n_selected_features=n_features)
    elif method == 'infogain':
        features = MIM.mim(trnX, trnY, n_selected_features=n_features)
    elif method == 'svmrfe':
        features = svmrfe(trnX, trnY)
    elif method == 'hdmr':
        sobol_set_all = scipy.io.loadmat('sobol_set.mat')
        sobol_set = sobol_set_all['sobol_set'].astype(float)
        params = {'sobol_set': sobol_set, 'k': 1, 'p': 3, 'M': 1000, 'b': 'L'}
        models = hdmrlearn(trnX, trnY, params)
        features, w = hdmrselect(X, models)
    elif method == 'hdmrhaar':
        sobol_set_all = scipy.io.loadmat('sobol_set.mat')
        sobol_set = sobol_set_all['sobol_set'].astype(float)
        params = {'sobol_set': sobol_set, 'k': 1, 'p': 255, 'M': 1000, 'b': 'H'}
        models = hdmrlearn(trnX, trnY, params)
        features, w = hdmrselect(X, models)
    else:
        # failing loudly here avoids a NameError on `features` below
        raise ValueError(method + ' does not exist')
    cputime = time.time() - start_time
    print(features)
    print('cputime %f' % cputime)
    return {'features': features, 'cputime': cputime}
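# A hedged driver sketch for run_fold (not from the source): P is assumed to be an
# (n_samples, n_trials) partition matrix where 1 marks the training rows of a trial.
import numpy as np

rng = np.random.RandomState(0)
X = rng.rand(200, 30)                        # non-negative, so 'chi2' is valid
y = rng.randint(0, 2, 200)
P = (rng.rand(200, 10) < 0.8).astype(int)    # 10 trials, ~80% train each
out = run_fold(0, P, X, y, 'chi2', 'toy', 'random')
print(out['features'][:5], out['cputime'])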
def test_chi_squared(self):
    X, y = self.DATA
    f = FilterChiSquare(ratio=0.5)
    f.fit(X, y)
    X_, y_ = f.transform(X, y)
    score = chi_square.chi_square(X, y)
    rank = chi_square.feature_ranking(score)
    selected = rank[0:5]
    assert f.fit(X, y) is f
    assert np.array_equal(f.rank(), rank)
    assert np.allclose(f.score(), score)
    assert np.array_equal(f.selected(), selected)
    assert np.allclose(X_, X[:, selected])
    assert np.array_equal(y_, y)
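# For reference, a minimal stand-in with the behaviour the test above expects,
# assuming FilterChiSquare keeps the top ceil(ratio * n_features) features by
# chi-square score. This is a guess at the contract, not the project's class.
import math
import numpy as np
from skfeature.function.statistical_based import chi_square

class MinimalChiSquareFilter:
    def __init__(self, ratio):
        self.ratio = ratio

    def fit(self, X, y):
        self._score = chi_square.chi_square(X, y)
        self._rank = chi_square.feature_ranking(self._score)
        self._selected = self._rank[:math.ceil(self.ratio * X.shape[1])]
        return self                     # fit() returns self, as the test asserts

    def transform(self, X, y):
        return X[:, self._selected], y

    def rank(self):
        return self._rank

    def score(self):
        return self._score

    def selected(self):
        return self._selected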
def main():
    # load data
    mat = scipy.io.loadmat('../data/BASEHOCK.mat')
    X = mat['X']    # data
    X = X.astype(float)
    y = mat['Y']    # label
    y = y[:, 0]
    n_samples, n_features = X.shape    # number of samples and number of features
    print(X.shape)

    # split data into 10 folds
    ss = cross_validation.KFold(n_samples, n_folds=10, shuffle=True)

    # perform evaluation on classification task
    num_fea = 100    # number of selected features
    clf = svm.LinearSVC()    # linear SVM

    correct = 0
    for train, test in ss:
        # obtain the chi-square score of each feature
        score = chi_square.chi_square(X, y)

        # rank features in descending order according to score
        idx = chi_square.feature_ranking(score)

        # obtain the dataset on the selected features
        selected_features = X[:, idx[0:num_fea]]

        # train a classification model with the selected features on the training dataset
        clf.fit(selected_features[train], y[train])

        # predict the class labels of test data
        y_predict = clf.predict(selected_features[test])

        # obtain the classification accuracy on the test data
        acc = accuracy_score(y[test], y_predict)
        correct = correct + acc

    # output the average classification accuracy over all 10 folds
    print('Accuracy:', float(correct) / 10)
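# Note: the loop above ranks features on the full X before splitting, so the test
# fold leaks into the selection. A common variant (a sketch under the same imports
# as the example) ranks on the training rows only:
def main_without_selection_leakage():
    mat = scipy.io.loadmat('../data/BASEHOCK.mat')
    X = mat['X'].astype(float)
    y = mat['Y'][:, 0]
    n_samples = X.shape[0]
    clf = svm.LinearSVC()
    num_fea = 100
    correct = 0
    for train, test in cross_validation.KFold(n_samples, n_folds=10, shuffle=True):
        score = chi_square.chi_square(X[train], y[train])   # training rows only
        idx = chi_square.feature_ranking(score)
        clf.fit(X[train][:, idx[0:num_fea]], y[train])
        correct += accuracy_score(y[test], clf.predict(X[test][:, idx[0:num_fea]]))
    print('Accuracy:', correct / 10)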
    idx = fisher_score.feature_ranking(score)
    selected_fea_train = X_train[:, idx[0:num_features]]
    selected_fea_test = X_test[:, idx[0:num_features]]
    clf.fit(selected_fea_train, y_train)
    acc.append(accuracy_score(y_test, clf.predict(selected_fea_test)))

    # reliefF
    score = reliefF.reliefF(X_train, y_train)
    idx = reliefF.feature_ranking(score)
    selected_fea_train = X_train[:, idx[0:num_features]]
    selected_fea_test = X_test[:, idx[0:num_features]]
    clf.fit(selected_fea_train, y_train)
    acc.append(accuracy_score(y_test, clf.predict(selected_fea_test)))

    # chi_square (np.abs keeps the input non-negative, as chi_square requires)
    score = chi_square.chi_square(np.abs(X_train), y_train)
    idx = chi_square.feature_ranking(score)
    selected_fea_train = X_train[:, idx[0:num_features]]
    selected_fea_test = X_test[:, idx[0:num_features]]
    clf.fit(selected_fea_train, y_train)
    acc.append(accuracy_score(y_test, clf.predict(selected_fea_test)))

    # pca
    pca = PCA(n_components=num_features)
    pca.fit(X_train)
    selected_fea_train = pca.transform(X_train)
    selected_fea_test = pca.transform(X_test)
    clf.fit(selected_fea_train, y_train)
    acc.append(accuracy_score(y_test, clf.predict(selected_fea_test)))

    # rfe
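# The np.abs in the chi_square step above is one workaround for the non-negativity
# requirement. An alternative sketch (assumes sklearn and the same X_train/y_train
# as the fragment above) rescales features into [0, 1] instead, which preserves the
# ordering within each feature rather than folding the sign away:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
X_train_nn = scaler.fit_transform(X_train)
score = chi_square.chi_square(X_train_nn, y_train)
idx = chi_square.feature_ranking(score)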
def chi_square_FS(X, y):
    score = chi_square.chi_square(X, y)
    idx = chi_square.feature_ranking(score)
    return (idx, score)
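# Usage sketch for the wrapper above: rank the 64 features of a non-negative
# dataset (sklearn's digits, chosen here just for illustration) and keep the top 10.
from sklearn.datasets import load_digits

X, y = load_digits(return_X_y=True)      # pixel intensities are non-negative
idx, score = chi_square_FS(X, y)
X_top10 = X[:, idx[:10]]
print(X_top10.shape)                     # (1797, 10)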
def predict(fs_algorithm=None, dataframe=None, dataset=None, C=1.0, epsilon=0.1):
    sc = MinMaxScaler(feature_range=(0, 10))
    best_sort_feature = []

    if dataframe is None and dataset is None:
        return None

    if dataset:
        dataset_data = [model_to_dict(data) for data in dataset]
        df = pd.DataFrame(dataset_data)
        city_id = np.asarray(df['city'])
        raw_X = np.asarray(
            df.loc[:, 'sum_price_car':'std_buyer_land_rent'])  # features
        raw_y = np.asarray(df['BPS_poverty_rate'])  # label

    if dataframe:
        df = pd.read_excel(dataframe)
        city_id = np.asarray(df['city_id'])
        raw_X = np.asarray(
            df.loc[:, 'sum_price_car':'std_buyer_land_rent'])  # features
        raw_y = np.asarray(df['BPS_poverty_rate'])  # label

    # 2. pre-processing
    clean_X = np.nan_to_num(raw_X)
    clean_y = np.nan_to_num(raw_y)

    # 3. normalization (fit on the cleaned matrix so NaNs cannot poison min/max)
    sc.fit(clean_X)
    X = np.array(sc.transform(clean_X))
    y = np.array(clean_y)

    if fs_algorithm == "f_score":
        ranked_index = f_score.f_score(X, y, mode="index")
    elif fs_algorithm == "chi_square":
        # chi_square needs non-negative input; the 0-10 MinMax scaling guarantees it
        X_feature = X.astype(int)
        y_label = y.astype(int)
        ranked_index = chi_square.chi_square(X_feature, y_label, mode="index")
    elif fs_algorithm == "cfs":
        ranked_index = CFS.cfs(X, y)

    # reorder every row by the ranked feature indices
    for row in X:
        row_array = [row[feature_idx] for feature_idx in ranked_index]
        best_sort_feature.append(row_array)

    # 5. get best feature predict score
    best_pred, best_score, result, ten_column_predictions \
        = trainf(best_sort_feature, y, C, epsilon)

    now_unix_timestamp = str(datetime.utcnow().timestamp())
    time = now_unix_timestamp.split(".")[0]

    # set filename
    filename = "dumped_model/svr_" + fs_algorithm + "_" + time + "_.sav"

    # get full file path
    SITE_ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
    full_model_file_path = SITE_ROOT + "/" + filename

    # get regressor and persist it
    regressor = best_score[3]
    with open(filename, 'wb') as model_file:
        pickle.dump(regressor, model_file)

    # RETURN VALUES: results of the SVR prediction

    # modified best prediction return value
    best_pred = [best_pred, dict(zip(city_id, best_pred))]

    # append the ranked-index list for the best feature subset
    best_score.append(ranked_index[:best_score[2]])

    y_true = y

    """
    1. best prediction => predicted poverty rate (dictionary, key => city_id)
    2. best_score => details of the best score (array):
       .best_score => r2
       .lowest_score => rmse
       .number of features in the best model
       .best model
       .best feature ordering
    3. result => detailed r2 results from 10 up to 96 features (array) -> [features, r2, rmse]
    4. prediction trials per block of 10 features (array)
    5. actual poverty rate
    6. filename
    """
    return best_pred, best_score, result, ten_column_predictions, y_true, full_model_file_path
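# A hedged call sketch (the Excel path and its columns are assumptions: the file
# must contain 'city_id', the 'sum_price_car'..'std_buyer_land_rent' feature block,
# and 'BPS_poverty_rate'):
best_pred, best_score, result, per_ten, y_true, model_path = predict(
    fs_algorithm="chi_square", dataframe="poverty_features.xlsx")
print(best_pred[1])      # {city_id: predicted poverty rate, ...}
print(model_path)        # absolute path of the pickled SVR model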
# Randomly sample 10,000 examples for the computation
X = np.array(train_data)
y = np.array(train_label)
X_relief, y_relief = shuffle(X, y, n_samples=10000, random_state=0)

'''
Filter methods:
Distance:    ReliefF
Dependence:  Chi-squared
Information: MIFS (Mutual Information Feature Selection)
'''
# ReliefF and Chi both give one score per feature; MIFS is slightly different, but
# its second row can also be treated as a score. Normalize all three scores to
# [0, 1] and take their mean.
RelieF_score = reliefF.reliefF(X_relief, y_relief[:, 0], k=n_features)  # ReliefF
Chi = chi_square.chi_square(X, y[:, 0])
# Return value: the first row is the sorted feature indices, the second row is the
# objective values, and the third row is the mutual information between each
# feature and the response variable
Mifs = MIFS.mifs(X_relief, y_relief[:, 0], n_selected_features=n_features)

'''
Fuse the selections with the mean method
'''
scores = pd.DataFrame({'Feature': list(Mifs[0]), 'MIFS': list(Mifs[1])})
scores = scores.sort_values(by=['Feature'])
scores['Relief'] = RelieF_score
scores['Chi'] = Chi

# normalization
min_max_scaler = preprocessing.MinMaxScaler()
scores['MIFS_scaler'] = min_max_scaler.fit_transform(scores.loc[:, ['MIFS']])
scores['Relief_scaler'] = min_max_scaler.fit_transform(scores.loc[:, ['Relief']])
scores['Chi_scaler'] = min_max_scaler.fit_transform(scores.loc[:, ['Chi']])
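# The mean-method fusion the comments describe would presumably end like this
# (a sketch, not from the source; k_best is a hypothetical cutoff): average the
# three normalized score columns and rank features by the mean.
k_best = 20   # number of fused features to keep
scores['Mean'] = scores[['MIFS_scaler', 'Relief_scaler', 'Chi_scaler']].mean(axis=1)
selected = scores.sort_values(by='Mean', ascending=False)['Feature'].head(k_best)
print(selected.tolist())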