def find_best_n_features_mic(n=8, out_path=''):
    """Select the ``n`` features of global ``x`` with the largest MIC w.r.t.
    global ``y`` and write them (plus the label column) to
    ``out_path + 'mic_best_<n>.csv'``.

    Parameters
    ----------
    n : int
        Number of features to keep.
    out_path : str
        Directory/prefix the CSV name is appended to.
    """
    # Compute the MIC of every feature column against the label.
    mine = MINE(alpha=0.6, c=15, est="mic_approx")
    mic_all = []
    for i in range(x.shape[1]):
        mine.compute_score(x[:, i], y)
        mic_all.append(mine.mic())

    # Pick the n largest scores: repeatedly take the argmax and mask the
    # winner with NaN so the next iteration finds the runner-up.  Working on
    # a float ndarray copy keeps the original list intact.
    # (No deepcopy needed — the scores are immutable floats.)
    scores = np.asarray(mic_all, dtype=float)
    best_n = []
    best_n_mic = []
    for _ in range(n):
        best_position = int(np.nanargmax(scores))
        best_n.append(best_position)
        best_n_mic.append(float(scores[best_position]))
        scores[best_position] = np.nan

    print('Found', n, 'features with largest MIC, whose positions are:')
    print(best_n)
    print()
    print('The MIC of these features are:')
    print(best_n_mic)
    print()

    best_features = x[:, best_n]
    print('Shape of features selected:', best_features.shape)
    best_features_with_label = pd.DataFrame(
        np.concatenate([best_features, y.reshape(len(y), 1)], axis=1))
    out_path = out_path + 'mic_best_' + str(n) + '.csv'
    best_features_with_label.to_csv(out_path, header=None, index=None)
def find_best_n_features_relief(n=8, out_path=''):
    """Select the ``n`` features of global ``x`` with the largest Relief
    statistic w.r.t. global ``y`` and write them (plus the label column) to
    ``out_path + 'relief_best_<n>.csv'``.

    Parameters
    ----------
    n : int
        Number of features to keep.
    out_path : str
        Directory/prefix the CSV name is appended to.
    """
    relief_all = calculate_relief(x, y)

    # Pick the n largest statistics: repeatedly take the argmax and mask the
    # winner with NaN so the next iteration finds the runner-up.  Copy into a
    # fresh float ndarray so we never clobber the array returned by
    # calculate_relief (the original mutated it in place).
    scores = np.array(relief_all, dtype=float)
    best_n = []
    best_n_ref = []
    for _ in range(n):
        best_position = int(np.nanargmax(scores))
        best_n.append(best_position)
        best_n_ref.append(float(scores[best_position]))
        scores[best_position] = np.nan

    print('Found', n, 'features with largest Relief Statistics, whose positions are:')
    print(best_n)
    print()
    print('The Relief Statistics of these features are:')
    print(best_n_ref)
    print()

    best_features = x[:, best_n]
    print('Shape of features selected:', best_features.shape)
    best_features_with_label = pd.DataFrame(
        np.concatenate([best_features, y.reshape(len(y), 1)], axis=1))
    out_path = out_path + 'relief_best_' + str(n) + '.csv'
    best_features_with_label.to_csv(out_path, header=None, index=None)
def find_best_n_features_ridge(n=8, out_path=''):
    """Select the ``n`` features of global ``x`` whose L2-penalized logistic
    regression coefficients have the largest absolute value, and write them
    (plus the label column) to ``out_path + 'ridge_best_<n>.csv'``.

    The penalty strength C is chosen by 5-fold grid-search on accuracy.

    Parameters
    ----------
    n : int
        Number of features to keep.
    out_path : str
        Directory/prefix the CSV name is appended to.
    """
    ridge = LogisticRegression(penalty='l2', class_weight='balanced')
    params_tuned = {'C': [1, 2, 3, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50,
                          100, 180, 190, 200, 220, 240, 250, 260, 300, 500,
                          1000, 2000]}
    clf = GridSearchCV(ridge, params_tuned, scoring='accuracy', cv=5)
    clf.fit(x, y)
    print('During Ridge, the penalty parameter alpha is set as', clf.best_params_['C'])

    # GridSearchCV refits best_estimator_ on the full data by default
    # (refit=True), so the extra fit the original code did was redundant.
    # Rank on a detached |coef| copy instead of overwriting the fitted
    # model's coef_ attribute with abs/NaN values, which corrupted it.
    coef = np.abs(clf.best_estimator_.coef_).astype(float)

    best_n = []
    for _ in range(n):
        # NOTE(review): for binary classification coef has shape
        # (1, n_features), so the flat nanargmax index equals the feature
        # column — multiclass would need an axis reduction first; confirm
        # y is binary.
        best_position = int(np.nanargmax(coef))
        best_n.append(best_position)
        coef[:, best_position] = np.nan  # mask the winner for the next round
    print('Selected Features:', best_n)

    best_features = x[:, best_n]
    print('Shape of features selected:', best_features.shape)
    best_features_with_label = pd.DataFrame(
        np.concatenate([best_features, y.reshape(len(y), 1)], axis=1))
    out_path = out_path + 'ridge_best_' + str(n) + '.csv'
    best_features_with_label.to_csv(out_path, header=None, index=None)
def find_best_n_features_lasso(n=8, out_path='', max_iter=100):
    """Select ~``n`` features of global ``x`` via L1-penalized logistic
    regression, tuning C by bisection until roughly ``n`` coefficients stay
    positive, and write them (plus the label column) to
    ``out_path + 'lasso_best_<n>.csv'``.

    Parameters
    ----------
    n : int
        Target number of features to keep.
    out_path : str
        Directory/prefix the CSV name is appended to.
    max_iter : int
        Maximum bisection steps per C search window.
    """

    def _fit_lasso(c):
        # solver='liblinear' is required for penalty='l1' on modern sklearn
        # (the lbfgs default raises); it was also the historical default, so
        # results on older sklearn are unchanged.
        model = LogisticRegression(penalty='l1', class_weight='balanced',
                                   C=c, solver='liblinear')
        model.fit(x, y)
        return model

    def find_proper_c(n, low, up, max_iter=100):
        # Bisect C inside [low, up] until between n and 1.05*n coefficients
        # are positive, or max_iter steps are exhausted.
        # NOTE(review): only strictly positive coefficients count as
        # "selected" — negatively-weighted features are discarded; confirm
        # that is intended rather than np.abs(coef_) > 0.
        c_now = (low + up) / 2
        lasso = _fit_lasso(c_now)
        count = 0
        fea_num = np.sum(lasso.coef_ > 0.0)
        while count < max_iter and (fea_num > n * 1.05 or fea_num < n):
            if fea_num > n:
                up = (low + up) / 2
            elif fea_num < n:
                low = (low + up) / 2
            c_now = (low + up) / 2
            lasso = _fit_lasso(c_now)
            fea_num = np.sum(lasso.coef_ > 0.0)
            count += 1
        return c_now, count

    low = 0
    up = 100000
    c_now, count = find_proper_c(n, low, up, max_iter)
    retry = 1
    while count == max_iter and retry <= 10:
        print('Didn\'t find proper C within', low, 'and', up, ', retrying', retry)
        low = up
        up = up * 10
        # BUG FIX: the original retry called find_proper_c(low, up, max_iter),
        # silently passing `low` as the target feature count `n`.
        c_now, count = find_proper_c(n, low, up, max_iter)
        retry += 1
    if count == max_iter:
        # BUG FIX: the original tested `retry == 11`, which also aborted when
        # the 10th retry had in fact found a proper C.
        print('Could not find proper C, please try a lower dimension.')
        return

    lasso = _fit_lasso(c_now)
    best_features = x[:, (lasso.coef_ > 0.0)[0]]
    if best_features.shape[1] > n:
        # Bisection settled within the 5% tolerance above n: trim down to
        # exactly n by repeated argmax with NaN masking.
        # NOTE(review): flat nanargmax over coef_ assumes the binary-class
        # (1, n_features) shape — confirm y is binary.
        coef = lasso.coef_.astype(float)
        best_n = []
        for _ in range(n):
            best_position = int(np.nanargmax(coef))
            best_n.append(best_position)
            coef[:, best_position] = np.nan
        best_features = x[:, best_n]

    print('Shape of features selected:', best_features.shape)
    best_features_with_label = pd.DataFrame(
        np.concatenate([best_features, y.reshape(len(y), 1)], axis=1))
    out_path = out_path + 'lasso_best_' + str(n) + '.csv'
    best_features_with_label.to_csv(out_path, header=None, index=None)