def get_best_feature_subset(X, Y):
    """Search RFE subset sizes and return the best-scoring feature names.

    The data is split 90/10 into train and test parts. For every subset size
    from 5 to 32, an RFE wrapper around logistic regression is fitted and
    scored with macro-averaged F1 on the held-out part. The size and score of
    every candidate are printed, followed by the winning feature names and a
    classification report.

    Args:
        X: pandas DataFrame of features (``.values`` and ``.columns`` are used).
        Y: pandas Series of labels (``.values`` is used).

    Returns:
        List of the column names kept by the best-scoring RFE model.
    """
    from sklearn.model_selection import train_test_split
    from sklearn.metrics import classification_report
    from sklearn.metrics import f1_score
    from sklearn.feature_selection import RFE
    from sklearn.linear_model import LogisticRegression

    trainX, testX, trainY, testY = train_test_split(X, Y, test_size=0.1)
    best_f1 = 0
    best_model = None
    estimator = LogisticRegression()
    for i in range(5, 33):
        # n_features_to_select must be passed by keyword: recent scikit-learn
        # releases reject it as a positional argument.
        rfe = RFE(estimator, n_features_to_select=i)
        rfe.fit(trainX.values, trainY.values)
        # Predict on the raw array as well, matching what the model was
        # fitted on.
        predictions = rfe.predict(testX.values)
        f1 = f1_score(testY, predictions, average='macro')
        print(i)
        print(f1)
        # Always keep the first model so best_model is never None, even when
        # every candidate scores 0.
        if best_model is None or f1 > best_f1:
            best_f1 = f1
            best_model = rfe

    print("The subset of features for the best performing model are:")
    result = [
        column
        for column, chosen in zip(trainX.columns.values, best_model.support_)
        if chosen
    ]
    print(result)
    # classification_report expects (y_true, y_pred); the reversed order would
    # swap the precision/recall columns and the support counts.
    print(classification_report(testY, best_model.predict(testX.values)))
    return result
def do_learning(X_training, Y_training, X_test, Y_test, reference_dic, model_class):
    '''
    credit: Juan Arroyo-Miranda & Dani Alcala

    Run recursive feature elimination around ``model_class`` (an estimator
    object handed straight to RFE), fit it on the training data and predict
    on the test data.

    Returns a 4-tuple: (expected labels, i.e. Y_test; predicted labels;
    names of the kept features, looked up in ``reference_dic`` by column
    index; accuracy of the predictions against Y_test).
    '''
    selector = RFE(model_class).fit(X_training, Y_training)

    # Translate the kept column indices into readable names.
    kept_indices = selector.get_support(indices=True)
    best_features_names = [reference_dic[index] for index in kept_indices]

    predicted = selector.predict(X_test)
    return (Y_test, predicted, best_features_names,
            accuracy_score(Y_test, predicted))
def test_rfe():
    """Check RFE on iris padded with 6 noise columns, dense and sparse.

    With 4 features requested, the selector is expected to keep exactly the
    four original iris columns, and to behave like a linear SVC trained on
    those columns alone.
    """
    rng = check_random_state(0)
    iris = load_iris()
    noise = rng.normal(size=(len(iris.data), 6))
    X = np.c_[iris.data, noise]
    y = iris.target
    X_sparse = sparse.csr_matrix(X)

    # Dense input.
    dense_svc = SVC(kernel="linear")
    rfe = RFE(estimator=dense_svc, n_features_to_select=4, step=0.1)
    rfe.fit(X, y)
    X_r = rfe.transform(X)
    dense_svc.fit(X_r, y)
    assert len(rfe.ranking_) == X.shape[1]

    # Sparse input.
    sparse_svc = SVC(kernel="linear")
    rfe_sparse = RFE(estimator=sparse_svc, n_features_to_select=4, step=0.1)
    rfe_sparse.fit(X_sparse, y)
    X_r_sparse = rfe_sparse.transform(X_sparse)

    # The reduced matrix must be the original iris data.
    assert X_r.shape == iris.data.shape
    assert_array_almost_equal(X_r[:10], iris.data[:10])

    # The selector must agree with the SVC refitted on the reduced data.
    assert_array_almost_equal(rfe.predict(X), dense_svc.predict(iris.data))
    assert rfe.score(X, y) == dense_svc.score(iris.data, iris.target)

    # Dense and sparse paths must select the same columns.
    assert_array_almost_equal(X_r, X_r_sparse.toarray())
def recursive_feature_elimination(config_learning, config_data, number_features):
    """Rank features with RFE, write the ranking to disk and predict on test.

    One "<feature name>\\t<rank>" line per feature is written to
    ``feature_ranks.txt`` inside the directory configured under
    ("Learner", "models") in ``config_data``; the same lines are echoed to
    stdout.

    Args:
        config_learning: mapping with 'x_train', 'y_train', 'x_test' paths and
            an optional 'scale' flag (defaults to True).
        config_data: config object queried with ``get("Learner", "models")``
            and passed to the feature-name helper.
        number_features: number of features RFE should keep.

    Returns:
        Predictions of the fitted RFE model on the test features.
    """
    ranks_path = (os.path.expanduser(config_data.get("Learner", "models"))
                  + "/" + "feature_ranks.txt")

    # The context manager guarantees the file is closed even when loading,
    # fitting or writing raises (the original leaked the handle in that case).
    with open(ranks_path, "w") as output:
        feature_names = FeatureExtractor.get_combinations_from_config_file_unsorted(config_data)
        x_train = read_features_file(config_learning.get('x_train'), '\t')
        y_train = read_reference_file(config_learning.get('y_train'), '\t')
        x_test = read_features_file(config_learning.get('x_test'), '\t')

        estimator, _scorers = learn_model.set_learning_method(config_learning, x_train, y_train)

        if config_learning.get("scale", True):
            x_train, x_test = scale_datasets(x_train, x_test)

        # Keyword argument: recent scikit-learn releases reject a positional
        # n_features_to_select.
        rfe = RFE(estimator, n_features_to_select=number_features, step=1)
        rfe.fit(x_train, y_train)

        # Index into ranking_ explicitly so a name/feature count mismatch
        # surfaces as an IndexError instead of being silently truncated.
        for i, name in enumerate(feature_names):
            line = name + "\t" + str(rfe.ranking_[i])
            output.write(line + "\n")
            print(line)

        predictions = rfe.predict(x_test)

    return predictions
def feature_selection_LR():
    """Select 30 features with RFE over a random forest and print a report.

    Works on module-level data (X_train, X_train_scaled, X_test_scaled,
    y_train, y_test, predictors). Prints the selected feature names, test
    accuracy/error, AUC, recall, precision, F-measure and a 2x2 confusion
    matrix. Returns nothing.
    """
    from sklearn.feature_selection import RFE

    forest = RandomForestClassifier()
    rfe_selector = RFE(estimator=forest, n_features_to_select=30, step=5, verbose=5)
    rfe_selector.fit(X_train_scaled, y_train)

    y_pred = rfe_selector.predict(X_test_scaled)
    # Second column of predict_proba is used as the score for the AUC.
    y_predprob = rfe_selector.predict_proba(X_test_scaled)[:, 1]

    rfe_support = rfe_selector.get_support()
    rfe_feature = X_train[predictors].loc[:, rfe_support].columns.tolist()
    print(str(len(rfe_feature)), 'selected features')
    print('RFE features')
    print(rfe_feature)

    # Model report.
    test_accuracy = metrics.accuracy_score(y_test, y_pred)
    print("\nModel Report")
    print("Test Accuracy : %.4g" % test_accuracy)
    print('Test error: {:.3f}'.format(1 - test_accuracy))
    print("AUC Score : %f" % metrics.roc_auc_score(y_test, y_predprob))
    print("Recall : %f" % metrics.recall_score(y_test, y_pred))
    print("Precision : %f" % metrics.precision_score(y_test, y_pred))
    print("F-measure : %f" % metrics.f1_score(y_test, y_pred))

    c_matrix = metrics.confusion_matrix(y_test, y_pred)
    print('========Confusion Matrix==========')
    print(" Rejected Accepted")
    print('Rejected {} {}'.format(c_matrix[0][0], c_matrix[0][1]))
    print('Accepted {} {}'.format(c_matrix[1][0], c_matrix[1][1]))
def recursive_feature_elimination():
    """
    Use recursive feature elimination on a linear regression model to pick
    5 of 20 equally spaced nodes on [0, 1].

    Each training sample is one freshly constructed VelOscillator evaluated
    at every node; the target is that oscillator's integral over [0, 1].
    The selected nodes and the root-mean-square training error are printed.
    """
    # --- build the data set -------------------------------------------
    num_nodes = 20
    num_nodes_choose = 5
    set_size = 100000
    nodes = np.linspace(0, 1, num_nodes)

    x = np.empty((num_nodes, set_size))  # one row per node
    y = np.empty(set_size)
    for sample in range(set_size):
        oscillator = VelOscillator()
        x[:, sample] = [oscillator(node) for node in nodes]
        y[sample] = oscillator.integral(0, 1)

    # scikit-learn wants samples in rows, so work on the transposed view.
    samples = np.transpose(x)

    # --- recursive feature elimination on a linear regression ---------
    rfe = RFE(estimator=LinearRegression(), n_features_to_select=num_nodes_choose)
    rfe.fit(samples, y)
    print('selected the nodes: {}'.format(nodes[rfe.support_]))

    # --- root-mean-square error on the training data -------------------
    residuals = rfe.predict(samples) - y
    error = np.mean(residuals**2)**0.5
    print('The chosen nodes and weights yield an error of {}'.format(error))
def wbc_wrapper():
    """Wrapper-style feature selection on the WBC data set.

    Loads and preprocesses the data through the project helpers, keeps the
    top 5 features with RFE around an SGD classifier, and prints the
    training-set accuracy and F1 score of the resulting model.
    """
    wbc_data = load_wbc_data()
    wbc_values, wbc_labels = data_preprocessing(wbc_data)

    estimator = SGDClassifier(max_iter=1000)
    # Keyword argument: recent scikit-learn releases reject a positional
    # n_features_to_select.
    selector = RFE(estimator, n_features_to_select=5)
    selector.fit(wbc_values, wbc_labels)

    score = selector.score(wbc_values, wbc_labels)
    # f1_score expects (y_true, y_pred).
    f1score = f1_score(wbc_labels, selector.predict(wbc_values))
    print('WBC-wrapper -accuracy of TOP 5 features = %.4f, F1 score = %.4f' % (score,f1score))
def sonar_wrapper():
    """Wrapper-style feature selection on the Sonar data set.

    Loads and preprocesses the data through the project helpers, keeps the
    top 5 features with RFE around an SGD classifier, and prints the
    training-set accuracy and F1 score of the resulting model.
    """
    sonar_data = load_sonar_data()
    sonar_values, sonar_labels = data_preprocessing(sonar_data)

    estimator = SGDClassifier(max_iter=1000)
    # Keyword argument: recent scikit-learn releases reject a positional
    # n_features_to_select.
    selector = RFE(estimator, n_features_to_select=5)
    selector.fit(sonar_values, sonar_labels)

    score = selector.score(sonar_values, sonar_labels)
    # f1_score expects (y_true, y_pred).
    f1score = f1_score(sonar_labels, selector.predict(sonar_values))
    print('Sonar-wrapper -accuracy of TOP 5 features = %.4f, F1 score = %.4f' % (score,f1score))
def train_recursive_feature_elimination(x_train, y_train, x_test, y_test):
    """Fit a 4-feature RFE logistic regression and report test metrics.

    The predictions on ``x_test`` are handed, together with ``y_test``, to
    the project helper ``print_metrices_out``. Returns nothing.
    """
    print("-------------RFE Model-------------")
    model = LogisticRegression(solver='lbfgs')
    # Keyword argument: recent scikit-learn releases reject a positional
    # n_features_to_select.
    rfe = RFE(model, n_features_to_select=4)

    # RFE Fit
    rfe.fit(x_train, y_train)

    # RFE Predict
    y_predicted = rfe.predict(x_test)
    print_metrices_out(y_predicted, y_test)
def recursiveFeatureElimination(label, features):
    """Fit a 4-feature RFE linear regression and print ranking and R^2.

    Prints the (rank, item) pairs sorted by rank, where the items come from
    iterating ``features`` (column names if it is a DataFrame — TODO
    confirm), then the R^2 of the model on the data it was fitted on.

    Returns:
        The fitted RFE object.
    """
    model = linear_model.LinearRegression()
    rfe = RFE(model, n_features_to_select=4)
    rfe = rfe.fit(features, label)

    # print(...) with one argument behaves the same on Python 2 and 3; the
    # original print statement only parsed on Python 2. The generator also
    # avoids the lambda whose parameter shadowed ``features``.
    rounded_ranks = (round(rank, 4) for rank in rfe.ranking_)
    print(sorted(zip(rounded_ranks, features)))

    prediction = rfe.predict(features)
    r2Score = r2_score(label, prediction)
    print(r2Score)
    return rfe
def elimination_feature():
    """Fit a 3-feature RFE linear regression for the 'Milk' target.

    Data loading and splitting are delegated to the project helpers
    ``_load_data`` and ``_train_test``.

    Returns:
        Tuple of (score of the model on the test split, mean squared error
        on the test split, test predictions).
    """
    frame = _load_data()
    X_train, X_test, y_train, y_test = _train_test(frame, 'Milk')

    selector = RFE(LinearRegression(), n_features_to_select=3)
    selector.fit(X_train, y_train)

    y_predict = selector.predict(X_test)
    score = selector.score(X_test, y_test)
    return score, mean_squared_error(y_test, y_predict), y_predict
def linrfe():
    """
    Ridge + RFE + pairwise-interaction + LightGBM experiment on 'train.txt'.

    Author's notes (translated from Chinese): to finish the computation
    quickly, step=xx needs to be set fairly large.
    ridge: 0.28+; ridge + RFE: 0.28+; yet the online score was 0.045, so
    this offline evaluation appears to be completely unreliable.
    """
    X, y = load_svmlight_file('train.txt')
    X = X.toarray()
    scaler = StandardScaler().fit(X)
    X = scaler.transform(X)

    reg = linear_model.Ridge(alpha=0.5)
    reg.fit(X, y)
    # %-formatting inside print(...) reproduces the output of the original
    # Python 2 print statements and also runs on Python 3.
    print('r^2= %s' % reg.score(X, y))
    print('train mse =  %s' % mean_squared_error(y, reg.predict(X)))

    rfe = RFE(estimator=reg, n_features_to_select=500, step=1000, verbose=2)
    rfe.fit(X, y)
    print('rfe r^2 =  %s' % rfe.score(X, y))
    print('rfe mse = %s' % mean_squared_error(y, rfe.predict(X)))

    X_rfe = rfe.transform(X)
    # Expanding the full feature matrix directly would raise MemoryError, so
    # only the RFE-reduced matrix is expanded.
    poly = PolynomialFeatures(degree=2, interaction_only=True)
    X_poly = poly.fit_transform(X_rfe)

    param_grid = {'alpha': [0.5, 1, 10, 100, 1000, 1e4, 3e4]}
    grid = GridSearchCV(reg, param_grid, verbose=2,
                        scoring='neg_mean_squared_error', cv=5)
    grid.fit(X_poly, y)
    logging.info('after rfe poly, best_result = {0}'.format(grid.best_score_))
    logging.info('after rfe poly, best_param= {0}'.format(grid.best_params_))

    params = {
        'objective': 'mse',
        'num_leaves': 8,
        'learning_rate': 0.05,
        'min_child_samples': 60,  # fairly critical for this problem
        # 'subsample': 0.9,
        'n_estimators': 100,
        'silent': False,
    }
    gbm = lgb.LGBMRegressor(**params)
    gbm.fit(X_poly, y, eval_metric='mse', eval_set=[(X_poly, y)])
    # predict() takes only the feature matrix; the original passed the
    # targets as a second positional argument, which predict() would take
    # for an option rather than data.
    logging.info('train lgb of poly = {0}'.format(
        mean_squared_error(y, gbm.predict(X_poly))))
def runRFE(x, y, x_test, y_test, display=False):
    """Train a 5-feature RFE multinomial naive Bayes model.

    Args:
        x, y: training features and labels (``y.unique()`` is used to get
            the label set).
        x_test, y_test: evaluation features and labels.
        display: when true, the confusion matrix is also shown through the
            project helper ``cd.display``.

    Returns:
        The confusion matrix of the test predictions, with rows/columns in
        the order of ``y.unique()``.
    """
    print("Bayes with feature selection")
    bayes = MultinomialNB()
    # Keyword arguments: recent scikit-learn releases reject a positional
    # n_features_to_select (RFE) and a positional labels (confusion_matrix).
    selector = RFE(bayes, n_features_to_select=5, step=1)
    selector.fit(x, y)
    y_pred = selector.predict(x_test)

    labels = y.unique()
    confusion = confusion_matrix(y_test, y_pred, labels=labels)
    if display:
        cd.display(confusion, labels, 31, "Bayes with feature selection")
    return confusion
def subtest(model, XL, YL, XT, YT, feature_names):
    """Drop the single weakest feature with RFE and compare performance.

    Prints the performance of ``model`` on all features ("BEFORE") and of
    the RFE model with one feature removed ("AFTER"), plus the name of the
    removed feature.

    Args:
        model: estimator to fit; it is fitted in place for the BEFORE run.
        XL, YL: training features and labels.
        XT, YT: test features and labels.
        feature_names: array of names, indexable with a boolean mask.

    Returns:
        Tuple of (reduced training features, reduced test features, names of
        the kept features).
    """
    nfeatures = XL.shape[1]
    # Keyword argument: recent scikit-learn releases reject a positional
    # n_features_to_select.
    rfe = RFE(model, n_features_to_select=nfeatures - 1)

    # print(...) with a single argument behaves the same on Python 2 and 3.
    print("BEFORE")
    model.fit(XL, YL)
    print_performance(YT, model.predict(XT))

    print("AFTER")
    rfe.fit(XL, YL)
    print_performance(YT, rfe.predict(XT))

    # First (and only) position where the support mask is False.
    removed_index = np.flatnonzero(~rfe.support_)[0]
    print("REMOVED FEATURE %s" % (feature_names[removed_index]))
    print("")
    return rfe.transform(XL), rfe.transform(XT), feature_names[rfe.support_]
def get_patient_predictions_rfe(self,expression_file,ic50_file,patient_directory,target_features,drug):
    """Predict patient responses to ``drug`` with an RFE-wrapped model.

    Cell-line data is used for training and patient data for prediction;
    both come from the project's ``dfm`` helpers. RFE eliminates roughly 1%
    of the original feature count per iteration until ``target_features``
    remain.

    Returns:
        Tuple of (patient identifiers, predictions for the patients, names
        of the features RFE kept).
    """
    e_data,e_target,p_identifiers,p_data = dfm.get_cell_line_and_patient_expression_data_target_for_drug(expression_file,ic50_file,patient_directory,1.0,drug)

    # Number of features = length of the first sample row.
    n_columns = len(e_data.tolist()[0])
    step_length = int(n_columns / 100) + 1

    # Keyword argument: recent scikit-learn releases reject a positional
    # n_features_to_select.
    model = RFE(self.model,n_features_to_select=target_features,step=step_length)
    model.fit(e_data,e_target)
    predictions = model.predict(p_data)

    all_features = dfm.get_cell_line_and_patient_expression_gene_intersection(
        dfm.get_cell_line_expression_frame(expression_file),
        dfm.get_patients_expression_frame(patient_directory))[0]
    # enumerate() replaces the Python-2-only xrange index loop.
    top_features = [feature for i, feature in enumerate(all_features) if model.support_[i]]
    return p_identifiers, predictions, top_features
def lsvm_classifier(authors: array, features: array, feature_max = 1000):
    """Score a linear SVM restricted to the RFE-selected features.

    The data is split 90/10; RFE removes 50 features per iteration until
    ``feature_max`` remain, and the resulting model is scored on the
    held-out part.

    Args:
        authors: labels, one per sample.
        features: feature matrix, one row per sample.
        feature_max: number of features RFE should keep.

    Returns:
        Accuracy of the RFE-selected model on the held-out 10%.
    """
    train_labels, test_labels, train_data, test_data = train_test_split(authors, features, test_size=0.10)

    # Keyword arguments: recent scikit-learn releases reject positional
    # n_features_to_select / step.
    selector = RFE(LinearSVC(), n_features_to_select=feature_max, step=50, verbose=0)
    selector.fit(train_data, train_labels)

    # Score the RFE model itself. The original refitted the bare LinearSVC on
    # all features and overwrote these predictions, so the selection (and
    # feature_max) had no effect on the returned accuracy.
    predictions = selector.predict(test_data)
    return accuracy_score(test_labels, predictions)
def test(self):
    """Time an RFE logistic-regression fit and compute its test AUC.

    Uses the module-level X, Y, X_test and Y_test. The number of features
    to keep is ``self.feature_num``.

    Returns:
        Tuple of (seconds spent in ``fit``, area under the ROC curve built
        from the test predictions).
    """
    estimator = LogisticRegression(random_state=0, solver='lbfgs')
    # Keyword argument: recent scikit-learn releases reject a positional
    # n_features_to_select.
    selector = RFE(estimator, n_features_to_select=self.feature_num, step=1)

    start = timer()
    selector = selector.fit(X, Y.ravel())
    end = timer()
    running_time = end - start

    prediction = selector.predict(X_test)
    # NOTE(review): roc_curve is fed hard class labels here, so the curve has
    # a single operating point; feeding selector.predict_proba(X_test)[:, 1]
    # would give a conventional AUC. Left unchanged to keep reported numbers
    # comparable — confirm which one is wanted.
    fpr, tpr, _thresholds = metrics.roc_curve(Y_test, prediction, pos_label=1)
    roc_auc = metrics.auc(fpr, tpr)
    print("Train for feature_num=" + str(self.feature_num) + ' done')
    return running_time, roc_auc
def runRFE(x, y, x_test, y_test, display=False):
    """Train a 5-feature RFE model around the current best decision tree.

    The base estimator comes from the project helper ``get_best_so_far``.

    Args:
        x, y: training features and labels (``y.unique()`` is used to get
            the label set).
        x_test, y_test: evaluation features and labels.
        display: when true, the confusion matrix is also shown through the
            project helper ``cd.display``.

    Returns:
        The confusion matrix of the test predictions, with rows/columns in
        the order of ``y.unique()``.
    """
    print("Decision tree with feature selection")
    dtc = get_best_so_far()
    # Keyword arguments: recent scikit-learn releases reject a positional
    # n_features_to_select (RFE) and a positional labels (confusion_matrix).
    selector = RFE(dtc, n_features_to_select=5, step=1)
    selector.fit(x, y)
    y_pred = selector.predict(x_test)

    labels = y.unique()
    confusion = confusion_matrix(y_test, y_pred, labels=labels)
    if display:
        cd.display(confusion, labels, 90, "Decision tree with feature selection")
    return confusion
def model_logistic(X_train, y_train, X_test):
    '''
    Fit a logistic regression wrapped in recursive feature elimination
    (default number of features) on the training data.

    Returns a pair: the predicted values for X_test and the column indices
    of the features RFE kept.
    '''
    selector = RFE(LogisticRegression()).fit(X_train, y_train)
    predicted = selector.predict(X_test)
    return predicted, selector.get_support(indices=True)
def logisticRegression():
    """Fit a 12-feature RFE logistic regression and print a summary.

    The data set is loaded from "normalizedRegression_removed.csv" through
    the project helper ``generateDataSet``. Prints the support mask, the
    feature ranking and a classification report computed on the same data
    the model was fitted on.
    """
    model = LogisticRegression()
    X, y = generateDataSet("normalizedRegression_removed.csv")

    # create the RFE model and select 12 attributes (keyword argument:
    # recent scikit-learn releases reject a positional n_features_to_select)
    rfe = RFE(model, n_features_to_select=12)
    rfe = rfe.fit(X, y)

    # summarize the selection of the attributes
    print(rfe.support_)
    print(rfe.ranking_)

    expected = y
    predicted = rfe.predict(X)
    # summarize the fit of the model
    print(metrics.classification_report(expected, predicted))
def model_logistic(training_data, test_data, features, label):
    '''
    Fit a logistic regression wrapped in recursive feature elimination
    (default number of features) on training_data[features] against
    training_data[label].

    Returns a pair: the predicted values for test_data[features] and the
    column indices (positions within features) that RFE kept.
    '''
    train_X = training_data[features]
    train_y = training_data[label]

    selector = RFE(LogisticRegression()).fit(train_X, train_y)
    predicted = selector.predict(test_data[features])
    return predicted, selector.get_support(indices=True)
def experiment(no_features):
    """Measure training accuracy of RFE for several feature counts.

    Uses the module-level ``logistic_regression`` estimator and the
    module-level ``x_train`` / ``y_train`` data. For every count, the kept
    feature indices and the training accuracy are printed.

    Args:
        no_features: iterable of feature counts to try.

    Returns:
        List of training accuracies, one per entry of ``no_features``.
    """
    results = []
    for n_selected in no_features:
        # Keyword argument: recent scikit-learn releases reject a positional
        # n_features_to_select.
        selector = RFE(logistic_regression, n_features_to_select=n_selected, step=1)
        selector = selector.fit(x_train, y_train)
        print(get_true_indices(selector.support_))

        y_res = selector.predict(x_train)
        accuracy = accuracy_score(y_train, y_res.ravel())
        print(accuracy)
        results.append(accuracy)
    return results
def model_logistic(training_data, test_data, features, label):
    '''
    Fit a logistic regression wrapped in recursive feature elimination
    (default number of features) on training_data[features] against
    training_data[label], and print how long the whole step took.

    Returns a pair: the predicted values for test_data[features] and the
    column indices (positions within features) that RFE kept.
    '''
    start = time()
    model = LogisticRegression()
    rfe = RFE(model)
    rfe = rfe.fit(training_data[features], training_data[label])
    predicted = rfe.predict(test_data[features])
    best_features = rfe.get_support(indices=True)
    # The measured span covers fitting, prediction and support extraction.
    elapsed_time = time() - start
    # print(...) with a single pre-formatted argument prints the same text on
    # Python 2 and 3; the original print statement only parsed on Python 2.
    print('logistic regression took %s seconds to fit' % elapsed_time)
    return predicted, best_features
def logistic_model(X_train, X_test, y_train):
    '''
    Select features with RFE (default number of features) around a logistic
    regression, fit on the training data and predict on the test data.

    Inputs: X_train, X_test, y_train (df)
    Output: (predicted_y, best_features) - the predictions for X_test and
        the column indices of the features RFE kept
    '''
    selector = RFE(LogisticRegression()).fit(X_train, y_train)
    predicted_y = selector.predict(X_test)
    return predicted_y, selector.get_support(indices=True)
def q4():
    """Compare a full linear regression with a 5-feature RFE version.

    Works on the module-level DataFrame ``df`` with target column "Overall".
    Prints R^2, MSE, RMSE and the five largest coefficients for the full
    model, then the same for the RFE-selected model.

    Returns:
        List with the names of the 5 features RFE selected.
    """
    X = df.copy().drop(columns=["Overall"])
    y = df["Overall"]
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, shuffle=True)

    # --- full model -----------------------------------------------------
    reg = LinearRegression()
    reg.fit(X_train, y_train)
    print("Model r2 score Linear regression:", reg.score(X_test, y_test))
    y_pred = reg.predict(X_test)
    # RMSE is taken as the square root of the MSE instead of relying on the
    # ``squared=False`` keyword, which newer scikit-learn releases removed.
    # NOTE(review): assumes ``mse`` is an alias of mean_squared_error.
    test_mse = mse(y_test, y_pred)
    print('MSE', test_mse)
    print('RMSE', test_mse ** 0.5)
    print(
        pd.DataFrame.from_dict(dict(zip(X_train.columns, reg.coef_)), orient='index', columns=['coef']).sort_values(
            by='coef', ascending=False).head(5))

    # --- RFE model --------------------------------------------------------
    selector = RFE(estimator=reg, n_features_to_select=5, step=1, verbose=0)
    selector = selector.fit(X_train, y_train)
    selected_features5 = list(X_train.columns[selector.get_support()])
    print('\nMost important features RFE', selected_features5)
    print("\nModel r2 score RFE Linear regression selected features:", selector.score(X_test, y_test))
    y_pred = selector.predict(X_test)
    test_mse = mse(y_test, y_pred)
    print('MSE', test_mse)
    print('RMSE', test_mse ** 0.5)

    # Refit the plain regression on the reduced matrix to read its
    # coefficients next to the selected feature names.
    X_train5 = selector.transform(X_train)
    reg.fit(X_train5, y_train)
    print(
        pd.DataFrame.from_dict(dict(zip(selected_features5, reg.coef_)), orient='index', columns=['coef']).sort_values(
            by='coef', ascending=False).head(5))
    return selected_features5
def get_predictions_full_CCLE_dataset_rfe(self,expression_file,ic50_file,target_features,drug):
    """Predict responses to ``drug`` for every cell line in the expression file.

    An RFE wrapper around ``self.model`` is trained on the data returned by
    the project's ``dfm`` helpers, eliminating roughly 1% of the original
    feature count per iteration until ``target_features`` remain. The
    fitted model then predicts on the normalized expression frame.

    Returns:
        Tuple of (cell-line names, i.e. the frame's columns; predictions;
        names of the features RFE kept, taken from the frame's index).
    """
    scikit_data,scikit_target = dfm.get_expression_scikit_data_target_for_drug(expression_file,ic50_file,drug,normalized=True,trimmed=True,threshold=None)

    # Number of features = length of the first sample row.
    n_columns = len(scikit_data.tolist()[0])
    step_length = int(n_columns / 100) + 1

    # Keyword argument: recent scikit-learn releases reject a positional
    # n_features_to_select.
    model = RFE(self.model,n_features_to_select=target_features,step=step_length)
    model.fit(scikit_data,scikit_target)

    expression_frame = dfm.normalize_expression_frame(dfm.get_cell_line_expression_frame(expression_file))
    cell_lines = expression_frame.columns
    testing_data = dfm.get_scikit_data(expression_frame)
    predictions = model.predict(testing_data)

    # enumerate() replaces the Python-2-only xrange index loop.
    top_features = [gene for i, gene in enumerate(expression_frame.index) if model.support_[i]]
    return cell_lines,predictions,top_features
def train_recursive_feature_elimination(x_train, y_train, x_test, y_test, feature_num=10):
    """Fit an RFE logistic regression and predict labels and probabilities.

    Args:
        x_train, y_train: training features and labels.
        x_test: features to predict on.
        y_test: accepted for signature compatibility; not used here.
        feature_num: number of features RFE should keep.

    Returns:
        Tuple of (predicted labels, predicted class probabilities) for
        ``x_test``. The support mask is printed as a side effect.
    """
    print("-------------RFE Model-------------")
    # Equal weights for both classes.
    class_weight = {1: 1, 0: 1}
    model = LogisticRegression(solver='sag', class_weight=class_weight)
    # Keyword argument: recent scikit-learn releases reject a positional
    # n_features_to_select.
    rfe = RFE(model, n_features_to_select=feature_num)

    # RFE Fit
    rfe.fit(x_train, y_train)

    # RFE Predict
    y_predicted = rfe.predict(x_test)
    y_prob = rfe.predict_proba(x_test)
    print(rfe.support_)
    return y_predicted, y_prob
def rfe_classifier(method, train_data, train_class, test_data, CV_=3, fraction_feat_to_keep=0.1, LM_params=get_ML_parameters()):
    """Classify ``test_data``, optionally after RFE feature selection.

    When ``CV_ > 1`` the classifier is wrapped in RFE, keeping
    ``fraction_feat_to_keep`` of the columns (at least one) and eliminating
    ``(1 - fraction_feat_to_keep) / CV_`` of the features per iteration.
    Otherwise the classifier is fitted on all columns.

    The model parameters are logged once per process, guarded by the
    module-level flag ``have_written_params_to_file``.

    Args:
        method: classifier identifier understood by ``set_up_classifier``.
        train_data: training features; must expose ``.columns``.
        train_class: training labels.
        test_data: features to predict on.
        CV_: selects the RFE path when greater than 1.
        fraction_feat_to_keep: fraction of columns RFE should keep.
        LM_params: model parameters; note the default is evaluated once,
            when the function is defined.

    Returns:
        Tuple of (predictions, list of the feature names used, number of
        features used).
    """
    global have_written_params_to_file
    if have_written_params_to_file is False:
        logging.info("Run settings for models:")
        logging.info(str(LM_params))
        have_written_params_to_file = True

    clf = set_up_classifier(method, CV_, LM_params)
    features = train_data.columns

    # fit and predict based on whether cross validation is used
    if (CV_ > 1):
        step_elim = (1 - fraction_feat_to_keep) / CV_
        num_to_keep = max(int(fraction_feat_to_keep * len(list(train_data))), 1)
        rfecv = RFE(estimator=clf, step=step_elim, n_features_to_select=num_to_keep)
        rfecv.fit(train_data, train_class)
        preds = rfecv.predict(test_data)

        mask = list(rfecv.support_)
        features_selected = [name for name, keep in zip(features, mask) if keep]
        n_selected = sum(mask)
    else:
        clf.fit(train_data, train_class)
        preds = clf.predict(test_data)
        # No selection happened, so every column counts as selected. The
        # original left these names undefined and raised NameError on return.
        features_selected = list(features)
        n_selected = len(features_selected)

    return preds, features_selected, n_selected
def svm(train_set, label_set, test_set, ground_truth):
    """Evaluate a 2-feature RFE linear SVM.

    Both feature sets are passed through ``Normalizer`` before training and
    prediction. The score of the model on the test set is printed.

    Args:
        train_set, label_set: training features and labels.
        test_set, ground_truth: evaluation features and true labels.

    Returns:
        Tuple of (precision, recall, F1) of the test predictions.
    """
    train_set = Normalizer().fit_transform(train_set)
    test_set = Normalizer().fit_transform(test_set)

    svm_clf = SVC(C=0.2, kernel='linear')
    rfe = RFE(estimator=svm_clf, n_features_to_select=2, step=1)
    # Scores noted by the author for other feature counts:
    # n=5,0.6497 n=8,0.66358 n=12,0.66728 n=15,0.66635
    rfe.fit(train_set, label_set)

    y = rfe.predict(test_set)
    print(rfe.score(test_set, ground_truth))

    # scikit-learn metrics take (y_true, y_pred). With the arguments reversed
    # the values returned as precision and recall were swapped.
    p = precision_score(ground_truth, y)
    r = recall_score(ground_truth, y)
    f = f1_score(ground_truth, y)
    return p, r, f
def automatic_recursive_feature_elimination(df):
    """Select 3 features with RFE around a logistic regression.

    The "test" column of ``df`` is the target ("positive" maps to 1,
    anything else to 0); every other column is a feature. The data is split
    70/30. Prints the rank of every feature, the names of the kept features
    and the accuracy on the held-out 30%. Returns nothing.
    """
    def _as_binary(value):
        return 1 if value == "positive" else 0

    X = df.drop("test", axis=1)
    y = df["test"].apply(_as_binary)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

    # RFE around a logistic regression, keeping 3 features.
    selector = RFE(estimator=LogisticRegression(), n_features_to_select=3, verbose=1)
    selector.fit(X_train, y_train)

    # Rank per feature (high = dropped early on).
    ranking_by_feature = {column: rank for column, rank in zip(X.columns, selector.ranking_)}
    print(ranking_by_feature)

    # Features that survived the elimination.
    print(X.columns[selector.support_])

    # Accuracy on the held-out part.
    test_predictions = selector.predict(X_test)
    acc = accuracy_score(y_test, test_predictions)
    print("{0:.1%} accuracy on test set.".format(acc))
def sc3_multitask(X, Y, Z, feature_list, selection_method, estimator_method, selection_args, estimator_args):
    """Fit a multi-task estimator with RFE feature selection and predict on Z.

    Args:
        X: training features.
        Y: training targets; fitted as ``Y.T``.
        Z: features to predict on.
        feature_list: array of feature names, indexable with a boolean mask.
        selection_method: 'rfe' (case-insensitive) is the only supported
            method; 'kbest' prints a notice.
        estimator_method: key into the module-level ``ESTIMATORS`` table.
        selection_args: options for the selector; must contain 'n_features'.
        estimator_args: keyword arguments for the estimator constructor.

    Returns:
        Tuple of (transposed predictions for Z, names of the kept features).
        For any method other than 'rfe' both entries are empty lists.
    """
    W = []
    features = []

    # Work on copies so the caller's dicts are not modified; the original
    # deleted 'n_features' from selection_args, breaking a second call with
    # the same dict.
    selection_args = dict(selection_args)
    estimator_args = dict(estimator_args)
    method = selection_method.lower()

    # RFE needs coefficients, so an SVM must use a linear kernel. The
    # original compared against 'RFE' here but 'rfe' below, so the kernel
    # was never forced on the path that actually ran RFE.
    if estimator_method == 'svm' and method == 'rfe':
        estimator_args['kernel'] = 'linear'

    n_features = min(len(feature_list), selection_args.pop('n_features'))
    estimator = ESTIMATORS[estimator_method](**estimator_args)

    if method == 'kbest':
        print('Cannot use KBest with multi task methods')
    if method != 'rfe':
        # Nothing was fitted: return the empty defaults as they are (the
        # original crashed on ``W.T`` because W was still a list).
        return W, features

    selector = RFE(estimator=estimator, n_features_to_select=n_features, **selection_args)
    selector = selector.fit(X, Y.T)
    features = feature_list[selector.support_]
    W = selector.predict(Z)
    return W.T, features
def RFElimination1(X_train1, train1_y, X_test1, test1_y, return_score_p1, name):
    """Run an 18-feature RFE around the module-level model ``lm``.

    Side effects:
        * writes the feature ranking to "Features<name>.csv";
        * stores the training R^2 in ``return_score_p1[name]``;
        * calls the project helper ``computations`` on the training and the
          testing data.

    Args:
        X_train1, train1_y: training features (with ``.columns``) and target.
        X_test1, test1_y: testing features and target.
        return_score_p1: mutable mapping that receives the training R^2.
        name: label used in the output file name and as the mapping key.
    """
    print("in " + str(name))
    # Keyword argument: recent scikit-learn releases reject a positional
    # n_features_to_select.
    selector = RFE(lm, n_features_to_select=18, step=1)
    selector = selector.fit(X_train1, train1_y)

    rankingdf = pd.DataFrame(list(zip(X_train1.columns, selector.ranking_)), columns=["features", "ranking"])
    rankingdf.to_csv("Features" + str(name) + ".csv")

    # The OLS fit from the original was dropped: its result was never used.
    pred = selector.predict(X_train1)
    return_score_p1[name] = r2_score(train1_y, pred)

    print("Training Dataset")
    computations(selector, X_train1, train1_y)
    print("Testing Dataset")
    computations(selector, X_test1, test1_y)
def logisticReg():
    """Explore RFE subset sizes 1..12 around a random forest.

    Loads 'train.csv' through the project helper ``getTrainingData``,
    resamples and standardizes the features, then fits one RFE model per
    subset size. Prints the training accuracy of every model and the support
    mask of the 5-feature model. Returns nothing.
    """
    train = getTrainingData('train.csv', visualize=False, discrete=False, encoding=True)
    X_train = train.drop(['Exited'], axis=1)
    print(X_train.columns.values)
    y_train = train.Exited

    # oversample = SMOTE()
    # NOTE(review): ``oversample`` is not created in this function (the line
    # above is commented out), so it has to exist at module level — confirm.
    X_train, y_train = oversample.fit_resample(X_train, y_train)

    scale = StandardScaler().fit(X_train)
    X_train = scale.transform(X_train)

    # ---- RFE -----
    rfes = []
    scores = []
    for n in range(1, 13):
        tree = RandomForestClassifier(n_estimators=40)
        # Keyword arguments: recent scikit-learn releases reject positional
        # n_features_to_select / step.
        rfe = RFE(tree, n_features_to_select=n, step=1)
        rfe.fit(X_train, y_train)
        rfes.append(rfe)
        yHat = rfe.predict(X_train)
        scores.append(accuracy_score(y_train, yHat))
    print(scores)

    # rfes[4] is the model that keeps 5 features.
    print(rfes[4].support_)
    # ['CreditScore' 'Age' 'Tenure' 'Balance' 'NumOfProducts' 'HasCrCard'
    #  'EstimatedSalary' 'France' 'Germany' 'Female' 'Active']
    # [False True False True False False False False True True True]
    # this is the setting that's reasonable for both accuracy_score and f1_score
    # ['Age' 'NumOfProducts' 'HasCrCard' 'Germany' 'Active']
    return
# logisticReg()
def RFE_linear_regression(X, y, n_features = 6):
    """Evaluate RFE around Lasso, Ridge and ElasticNet with k-fold splits.

    For each regularized estimator, an RFE model is fitted on every fold
    yielded by the project helper ``get_kfold_train_test``. The fold model
    with the lowest out-of-sample MAD and the one with the lowest
    out-of-sample MSE are kept, the errors are printed, and an E_out
    estimate (share of best-MAD-fold test samples predicted with an
    absolute error above 0.8) is printed.

    Args:
        X: feature matrix.
        y: target values.
        n_features: number of features RFE should keep.

    Returns:
        Dict mapping the estimator name to a 4-tuple: (predictions of the
        best-MAD model on all of X, MAD of those predictions against y,
        predictions of the best-MSE model on all of X, MSE of those
        predictions against y).
    """
    # (display name, estimator) pairs, one per regularizer.
    estimators = [
        ('RFE L1 Regularizer', Lasso(alpha = 0.2)),
        ('RFE L2 Regularizer', Ridge(alpha = 0.8)),
        ('RFE Elastic Net Regularizer', ElasticNet(alpha = 0.9)),
    ]

    ets = {}
    for name, estimator in estimators:
        # Best fold so far: et1/er1 by MAD, et2/er2 by MSE.
        et1 = et2 = None
        er1 = er2 = None
        # Test part of the best-MAD fold, used for E_out.
        finaly_test = None
        finalX_test = None

        for X_train, y_train, X_test, y_test in get_kfold_train_test(X, y):
            # Keyword argument: recent scikit-learn releases reject a
            # positional n_features_to_select.
            selector = RFE(estimator, n_features_to_select=n_features, step=1)
            selector.fit(X_train, y_train)
            preds = selector.predict(X_test)

            # Out-of-sample errors of this fold.
            error1 = mean_absolute_dev(preds, y_test)
            error2 = mean_squared_err(preds, y_test)

            if er1 is None or error1 < er1:
                et1 = copy.deepcopy(selector)
                er1 = error1
                finaly_test = copy.deepcopy(y_test)
                finalX_test = copy.deepcopy(X_test)
            if er2 is None or error2 < er2:
                et2 = copy.deepcopy(selector)
                er2 = error2

        print(name, ':\n', 'MAD (out of sample):', '%.4f' % er1, '; MSE (out of sample):', '%.4f' % er2)

        # Predict every sample with the best model under each criterion.
        y_preds1 = et1.predict(X)
        y_preds2 = et2.predict(X)

        # E_out: share of the best fold's test samples missed by more than tol.
        finaly_preds = et1.predict(finalX_test)
        tol = 0.8
        count = sum(
            1 for i in range(len(finaly_test))
            if abs(finaly_preds[i] - finaly_test[i]) <= tol
        )
        print('With tolerance ', '%.4f' % tol, ', E_out is ', '%.4f' %(1 - count/len(finaly_test)))

        # Predictions on all samples plus their MAD / MSE, keyed by
        # estimator name.
        ets[name] = (y_preds1, mean_absolute_dev(y_preds1, y), y_preds2, mean_squared_err(y_preds2, y))
    return ets
print("Test Accuracy: ", test_score)
MetricsForMulticlass(y_test, pred)
accuracy = metrics.accuracy_score(y_test, pred)
imp = improvement(group, accuracy)
print("Accuracy: ", round(accuracy, 2)) #0.62
print("Improvement: ", round(imp, 2)) #14.3

#----------------------------------------------------------------------------
# RECURSIVE FEATURES ELIMINATION
from sklearn.feature_selection import RFE

estimator = rfc(max_depth=4, n_estimators=500, random_state=42, n_jobs=-1)
# Keyword argument: recent scikit-learn releases reject a positional
# n_features_to_select.
selector = RFE(estimator, n_features_to_select=5, step=1)
selector = selector.fit(X_train, y_train)

# Persist the fitted selector for later reuse.
with open("rfe_new_features38", "wb") as f:
    pickle.dump(selector, f, pickle.HIGHEST_PROTOCOL)

# fit() returns the selector itself, so the name ``mod`` is kept bound
# without paying for a second, identical fit (the forest is seeded).
mod = selector

prediction = selector.predict(X_test)
# Score the RFE predictions; the original scored the stale ``pred`` of the
# previous model and therefore printed the previous accuracy again.
accuracy = metrics.accuracy_score(y_test, prediction)
imp = improvement(group, accuracy)
print("Accuracy: ", round(accuracy, 2))
print("Improvement: ", round(imp, 2))
print(confmat) return tree i = 1 while i <= len(wine.columns)-1: print(i) make_and_test_tree(wine_train, wine_test, i) i += 1 tree = DecisionTreeClassifier(criterion = 'entropy', max_depth=len(wine.columns)-1, random_state=0) selector = RFE(estimator=tree, n_features_to_select=3, step=1) # limit this to two best features selector = selector.fit(wine_val_train.iloc[:, 0:len(wine_val_train.columns)-2], wine_val_train.iloc[:,[len(wine_validation.columns)-1]]) selector.support_ selector.ranking_ RFE_tree = selector.predict(wine_validation.iloc[:,0:len(wine_validation.columns)-2], wine_validation.iloc[:,[len(wine_validation.columns)-1]]) wine4 = wine[['residual.sugar', 'free.sulfur.dioxide', 'density', 'qualBins2']] # create training/testing (to use at the end) wine_train, wine_test = train_test_split(wine4, test_size=0.3, random_state=0) # create validation set (out of the training set) wine_val_train, wine_validation = train_test_split(wine_train, test_size=0.3, random_state=0) i = 1 while i <= len(wine4.columns)-1: print(i)
print "Predicted pos %d neg %d" %(p,n) p=int(0) n=int(0) for x in ycv: if x==1: p+=1 else: n+=1 print "Actual pos %d neg %d" %(p,n) #if "": f=np.genfromtxt(open("CAX_COPD_TEST_data.csv","rb"),delimiter=",",skiprows=1) mat=np.matrix(f) Xtest=mat[:,2:] p=int(0) n=int(0) pred=rfe.predict(Xtest) for x in pred: print x if x==1: p+=1 else: n+=1 print p,n if "": with open('CAX_COPD_SubmissionFormat.csv','r') as csvinput: with open('Coutput.csv', 'w') as csvoutput: writer = csv.writer(csvoutput, lineterminator='\n') reader = csv.reader(csvinput) all = [] row = next(reader)
results = [] selector = RFE(estimator,3,step=1) #recursive forward elimination selcetor = selector.fit(X,y) selector.support_ # Out[61]: array([ True, True, True, False, False, False]) # It seems the first predictors are okay but not the the last three. # one can see if droping the last three will improve the model with all 6. selector.ranking_ # Out[65]: array([2, 1, 3, 6, 5, 4]) for i in range(1,len(X.iloc[0])+1): selector = RFE(estimator, n_features_to_select=i, step=1) selector.fit(X,y) r2 = selector.score(X,y) selected_features = features[selector.support_] msr = mean_squared_error(y, selector.predict(X)) results.append([i, r2, msr, ','.join(selected_features)]) results ''' results Out[68]: [[1, 0.47017552557905884, 2.5448877365932985, 'Email'], [2, 0.8987844810699489, 0.48616503259788457, 'Internet,Email'], [3, 0.9008606156394956, 0.47619280658599406, 'Internet,Email,Blog'], [4, 0.9051564044419049, 0.45555899148299284, 'Internet,Email,Blog,SmartPhone'], [5,
for entry in inp: for i in range(n): key,val=entry[i].split(':') feature[key]=val feature={int(key):float(val) for key,val in feature.items()} fvec.append(feature.copy()) for i in range(mtest): #print fvec[i] row=[] for key in sorted(fvec[i]): row.append(fvec[i][key]) row=np.matrix(row) if i==0: test=row else: test=np.r_[test,row] #train=np.r_(train,row) print test orig=['-1','+1'] label2orig={i:orig[i] for i in range(2)} pred=rfe.predict(test) print pred cl=[0,1,1,0,1] for i,val in enumerate(cl): print label2orig[val] for i,val in enumerate(pred): print iden[i], print label2orig[val]
# Transform the training data
data_features = transform_data(data_features)

# The first n_train rows are used for fitting, the remaining rows for
# evaluation.
n_train = 600

# Fit the regression model
alphas = np.logspace(-5,3,30)
# NOTE(review): ``normalize=True`` was removed from RidgeCV in newer
# scikit-learn releases; left as is because replacing it changes the fit.
ridge_regressor = linear_model.RidgeCV(alphas=alphas, normalize=True, fit_intercept=True)
rfe = RFE(estimator=ridge_regressor, n_features_to_select=14, step=1)
rfe.fit(data_features[0:n_train], data_values[0:n_train])

# print(...) with one concatenated argument prints the same text on
# Python 2 and 3; the original print statements only parsed on Python 2.
print("alpha: " + str(rfe.estimator_.alpha_))
print("intercept: " + str(rfe.estimator_.intercept_))
print("R-square: " + str(rfe.score(data_features[n_train:], data_values[n_train:])))

# Compute MSE on the held-out rows
train_pred = rfe.predict(data_features)
mse = ((train_pred[n_train:] - data_values[n_train:])**2).mean(axis=0)
print("MSE: " + str(mse))

# Visualize predicted against actual values on the held-out rows
plt.plot(train_pred[n_train:], label='Predicted')
plt.plot(data_values[n_train:], label='Actual')
plt.legend(loc='upper right')
plt.show()

# Load the test data
validate_test = np.loadtxt("validate_and_test.csv", delimiter=',', usecols = range(1,15))
validate_test_ind = np.loadtxt("validate_and_test.csv", delimiter=',', usecols = range(0,1))

# Transform the test data
# Rows before n_split are used for training, the rest for testing.
n_split = 1800
X_train, X_test = X[:n_split], X[n_split:]
Y_train, Y_test = y[:n_split], y[n_split:]

numFeatures = 40
model = ExtraTreesClassifier()
# Keyword argument: recent scikit-learn releases reject a positional
# n_features_to_select.
rfe = RFE(model, n_features_to_select=numFeatures)
rfe = rfe.fit(X_train, Y_train)
temp = rfe.score(X_test, Y_test)
predictionOfPrelim = rfe.predict(prelimData)
featureRanking = rfe.ranking_

#Best ExtraTrees Accuracy is: [400, 0.98902777777777773, 40]
print("ExtraTrees Accuracy is: ", temp)

prelimClasses = np.loadtxt("prelim-class.txt")
assert len(prelimClasses) == len(predictionOfPrelim)

# 1 where the prediction matches the known class, 0 otherwise.
h = [
    1 if actual == predicted else 0
    for actual, predicted in zip(prelimClasses, predictionOfPrelim)
]

thefile = open('Result_ExtraTrees_prelim.txt', 'w')
#print trainind,cvind #itrain=mat[:1100,0] #for x in X: # print x model = LogisticRegression() #from sklearn.svm import SVC #model=SVC(kernel="linear",C=1) rfe = RFE(model, k) rfe = rfe.fit(Xtrain, ytrain) #print(rfe.support_) #print(rfe.ranking_) This is one of the results expected Xcv=cv[:,2:] ycv=cv[:,1] p=int(0) n=int(0) pred=rfe.predict(Xcv) #print "pred set before matrix %s" % str(np.shape(pred)) #print "ycv set before matrix %s" % str(np.shape(ycv)) J+=float(f1_score(ycv,pred)) count+=1 f1score=float(J/count) print "No of features %d f1_score %f" % (k,f1score) if f1score>F1max: F1max=f1score n_features=k print n_features,F1max Xtrain=f[:,2:] ytrain=f[:,1] model = LogisticRegression() rfe = RFE(model, n_features) rfe = rfe.fit(Xtrain, ytrain)
# print the classification (a ratio) for # the samples in our original folder if (fit_data): for i in range(0, len(testData)): pred = rf.predict(testData[i]) live = sum(pred) ratio = live/len(testData[i]) print("%%live: ",ratio, "| name: ", dataName[i]) ################## # RANDOM FOREST WITH RFE (NO CV) if (fit_data_rfe): for i in range(0, len(testData)): pred = rfe.predict(testData[i]) live = sum(pred) ratio = live/len(testData[i]) print("%%live: ",ratio, "| name: ", dataName[i]) importances = rfe.ranking_ indices = np.argsort(importances) print("Feature ranking:") for f in range(n_features): print("%d. feature %d (%f)" % (f + 1, indices[f], importances[indices[f]])) ################## # RANDOM FOREST WITH RFE WITH CV if (fit_data_rfe_cv):
def main(args=None):
    """Command-line entry point: cross-validate an epitope-site classifier.

    Parses the command line, obtains the alignment for the requested
    antibodies via ``generate_alignment``, extracts features, and for every
    feature count in ``ARGS.FEATURE_GRID`` runs stratified cross-validation
    with either RFE or MRMR feature selection in front of a grid-searched
    linear SVM. The ``Results`` object that compares greatest is dumped to
    ``ARGS.OUTPUT`` and returned.

    Args:
        args: argument list to parse; defaults to ``sys.argv[1:]``.

    Returns:
        The best ``Results`` object, or an empty dict when ``ARGS.TEST`` is
        set (in which case only ``test_discrete`` is run).
    """
    init_log()

    if args is None:
        args = sys.argv[1:]

    np.seterr(all='raise')

    parser, ns, args = init_args(description="Predict epitope sites.", args=args)

    # Register every option group on the parser, in this fixed order.
    option_groups = (
        hmmer_args,
        featsel_args,
        feature_args,
        mrmr_args,
        rfe_args,
        optstat_args,
        filter_args,
        svm_args,
        cv_args,
    )
    for add_options in option_groups:
        parser = add_options(parser)

    parser.add_argument('ANTIBODY', type=AntibodyTypeFactory(ns.DATA), nargs='+')

    ARGS = parse_args(parser, args, namespace=ns)

    # Self-test mode short-circuits the whole pipeline.
    if ARGS.TEST:
        test_discrete(ARGS)
        finalize_args(ARGS)
        return {}

    # maxrel doesn't support similar
    if ARGS.MRMR_METHOD == 'MAXREL':
        ARGS.SIMILAR = 0.0

    antibodies = tuple(ARGS.ANTIBODY)

    set_util_params(ARGS.REFSEQ.id)

    # Fetch the sequence records for the requested antibodies from the data
    # source; `antibodies` is replaced by the value it reports back.
    seqrecords, clonal, antibodies = ARGS.DATA.seqrecords(antibodies, ARGS.CLONAL)

    # Leave-one-out CV means one fold per sequence record.
    if ARGS.LOOCV:
        ARGS.CV_FOLDS = len(seqrecords)

    # Alignment file name: <antibodies><encoding>[_clonal]_<data root>_<version>.sto
    encoding_tag = '_dna' if ARGS.ENCODER == DNAEncoder else '_amino'
    clonal_tag = '_clonal' if clonal else ''
    ab_basename = '+'.join(antibodies) + encoding_tag + clonal_tag
    sto_filename = '_'.join((ab_basename, ARGS.DATA.basename_root, __version__)) + '.sto'

    # Only the first element of the result is kept; the rest is discarded.
    alignment = generate_alignment(seqrecords, sto_filename, is_refseq, ARGS)[0]

    re_pngs = re_compile(r'N[^P][TS][^P]', re_I)

    label_alignment = Labeler(
        partial(expression, ARGS.LABEL),
        partial(skipper, is_refseq, ARGS.SUBTYPES)
    )
    alignment, y, threshold = label_alignment(alignment)

    site_filter = naive_filter(
        max_conservation=ARGS.MAX_CONSERVATION,
        min_conservation=ARGS.MIN_CONSERVATION,
        max_gap_ratio=ARGS.MAX_GAP_RATIO
    )

    # Per-site identity features are always used; the others are opt-in.
    extractors = [('site_ident', MSAVectorizer(ARGS.ENCODER, site_filter))]
    if ARGS.RADIUS:
        extractors.append(
            ('pair_ident', MSAVectorizerPairwise(ARGS.ENCODER, site_filter, ARGS.RADIUS))
        )
    if ARGS.PNGS:
        extractors.append(('pngs', MSAVectorizerRegex(re_pngs, 4, name='PNGS')))
    if ARGS.PNGS_PAIRS:
        extractors.append(
            ('pngs_pair', MSAVectorizerRegexPairwise(re_pngs, 4, name='PNGS'))
        )

    extractor = FeatureUnion(extractors, n_jobs=1)  # n_jobs must be 1 for now
    X = extractor.fit_transform(alignment)

    assert y.shape[0] == X.shape[0], \
        "number of classes doesn't match the data: %d vs %d" % (y.shape[0], X.shape[0])

    scorer = Scorer(ARGS.OPTSTAT)

    # The grid search over C lives inside the SVM so that feature selection
    # is not repeated for every point of the grid.
    svm = GridSearchCV(
        estimator=SVC(kernel='linear', class_weight='auto'),
        param_grid=dict(C=list(C_range(*ARGS.LOG2C))),
        scoring=scorer,
        n_jobs=int(getenv('NCPU', -1)),
        pre_dispatch='3 * n_jobs',
        cv=ARGS.CV_FOLDS - 1
    )

    results = None
    for n_features in ARGS.FEATURE_GRID:
        results_ = Results(extractor.get_feature_names(), scorer, ARGS.SIMILAR)

        for train_idxs, test_idxs in StratifiedKFold(y, ARGS.CV_FOLDS):
            y_true = y[test_idxs]

            # Degenerate fold: record the true labels as the prediction.
            if train_idxs.sum() < 1 or test_idxs.sum() < 1:
                results_.add(y_true, y_true, {})
                continue

            X_train, y_train = X[train_idxs], y[train_idxs]
            X_test = X[test_idxs]

            if ARGS.RFE:
                clf = RFE(
                    estimator=svm,
                    n_features_to_select=n_features,
                    step=ARGS.RFE_STEP
                )
            else:
                mrmr = MRMR(
                    k=n_features,
                    method=ARGS.MRMR_METHOD,
                    normalize=ARGS.MRMR_NORMALIZE,
                    similar=ARGS.SIMILAR
                )
                clf = Pipeline([('mrmr', mrmr), ('svm', svm)])

            clf.fit(X_train, y_train)

            # Locate the fitted selector and the grid search it wraps.
            if ARGS.RFE:
                selector_, fitted_search = clf, clf.estimator_
            else:
                selector_ = clf.named_steps['mrmr']
                fitted_search = clf.named_steps['svm']
            svm_ = fitted_search.best_estimator_

            y_pred = clf.predict(X_test)

            coefs, ranks = coefs_ranks(selector_.ranking_, selector_.support_, svm_.coef_)

            results_.add(y_true, y_pred, coefs, ranks)

        # Keep whichever feature count produced the greater Results.
        if results is None or results_ > results:
            results = results_

    # (original note, truncated in the source) the alignment reflects the
    # number of sequences either naturally ...
    results.metadata(antibodies, ARGS.LABEL)

    print(results.dumps(), file=ARGS.OUTPUT)

    finalize_args(ARGS)

    return results
Y_target_BR = BR.predict(X_test) #Random Forest Regressor RFR = RandomForestRegressor(n_jobs=-1, n_estimators=50, verbose=0) RFR = RFR.fit(X_train, y_train) ranks["RFR"] = ranking(RFR.feature_importances_, colnames) #print(ranks["RFR"]) Y_target_RFR = RFR.predict(X_test) #Recursive Feature Elimination on Random Forest Regressor RFE_RFR = RFE(RFR, n_features_to_select=10, step=1) RFE_RFR.fit(X_train, y_train) Y_target_RFE_RFR = RFE_RFR.predict(X_test) #Extra Trees Classifier ETC = ExtraTreesClassifier(n_estimators=10, max_depth=None, min_samples_split=2, random_state=0) ETC = ETC.fit(X_train, y_train) ranks["ETC"] = ranking(np.abs(ETC.feature_importances_), colnames) Y_target_ETC = ETC.predict(X_test) #Recursive Feature Elimination on Decision Tree Regressor RFE = RFE(DTR, n_features_to_select=10, step=1) RFE.fit(X_train, y_train)
def _report_progress(done, total):
    """Write a progress dot to stdout roughly every tenth of ``total`` items."""
    import sys

    # Clamp the step to 1 so that fewer than ten items never yields a zero
    # modulus (the unclamped ``total / 10`` is 0 under integer division).
    step = max(1, total // 10)
    if done % step == 0:
        sys.stdout.write('. ')


def select_train_predict(X, Y, Z, feature_list, selection_method, estimator_method, n_features, selection_args, estimator_args):
    """Fit one model per target in ``Y`` on ``X`` and predict on ``Z``.

    Args:
        X: training feature matrix.
        Y: iterable of target vectors; one model is fitted per element.
        Z: feature matrix to predict on.
        feature_list: feature names; indexed with the selector's support
            mask, so presumably a NumPy array - verify against callers.
        selection_method: ``None``, ``'cluster'``, ``'rfe'``, ``'myrfe'`` or
            ``'kbest'``. Any other value fits nothing and returns empty lists.
        estimator_method: key into the module-level ``ESTIMATORS`` mapping.
        n_features: number of features (or clusters) to keep; capped at
            ``len(feature_list)`` unless the method is ``'2step_kbest'``.
        selection_args: extra keyword arguments for the selector.
        estimator_args: keyword arguments for the estimator. Not modified.

    Returns:
        ``(W, features)``: ``W`` holds one prediction per target and
        ``features`` the selected feature names per target (left empty for
        the ``None`` and ``'cluster'`` methods).
    """
    import sys

    W = []
    features = []

    if selection_method != '2step_kbest':
        n_features = min(n_features, len(feature_list))

    if estimator_method == 'svm' and selection_method == 'rfe':
        # The RFE branch is always run with a linear SVM kernel. Work on a
        # copy so the caller's dict is left untouched.
        estimator_args = dict(estimator_args, kernel='linear')
    estimator = ESTIMATORS[estimator_method](**estimator_args)

    if selection_method == 'cluster':
        # Cluster the features and keep the first feature found in each
        # cluster, then continue as if no selection had been requested.
        agglom = FeatureAgglomeration(n_clusters=n_features, affinity='cosine', linkage='average')
        clusters = agglom.fit_predict(X).tolist()
        sample = [clusters.index(i) for i in range(n_features)]
        X = X[:, sample]
        Z = Z[:, sample]
        selection_method = None

    if selection_method is None:
        for i, y in enumerate(Y):
            estimator.fit(X, y)
            W.append(estimator.predict(Z))
            _report_progress(i + 1, len(Y))

    if selection_method == 'rfe':
        selector = RFE(estimator=estimator, n_features_to_select=n_features, **selection_args)
        for i, y in enumerate(Y):
            selector = selector.fit(X, y)
            features.append(feature_list[selector.support_])
            W.append(selector.predict(Z))
            _report_progress(i + 1, len(Y))

    if selection_method == 'myrfe':
        selector = MyRFE(estimator=estimator, n_features=n_features, **selection_args)
        for i, y in enumerate(Y):
            selector.fit(X, y)
            features.append(feature_list[selector.support])
            W.append(selector.predict(Z))
            _report_progress(i + 1, len(Y))

    if selection_method == 'kbest':
        selector = SelectKBest(f_regression, k=n_features, **selection_args)
        for i, y in enumerate(Y):
            X2 = selector.fit_transform(X, y)
            Z2 = selector.transform(Z)
            features.append(feature_list[selector.get_support()])
            estimator.fit(X2, y)
            W.append(estimator.predict(Z2))
            _report_progress(i + 1, len(Y))

    # Terminate the line of progress dots.
    sys.stdout.write('\n')
    return W, features