def select_features(features, X_train_all, y_train_all):
    """Choose an RFE feature count and AdaBoost learning rate, then refit.

    For each of the last 50 samples (oldest first) a model is fitted on all
    rows that precede the sample and scored on that single sample. Scores are
    accumulated per feature count and per learning rate; the best of each is
    used to fit the final selector on all data.

    Args:
        features: Candidate ``n_features_to_select`` values.
        X_train_all: 2-D array, one row per sample.
        y_train_all: Targets aligned with ``X_train_all``.

    Returns:
        Tuple ``(fitted_selector, best_learning_rate)``.
    """
    learningrates = [0.05, 0.1, 0.5, 1]
    # Float accumulators: a frame created without data is object-typed, which
    # makes idxmax unreliable.
    valid_score = pd.DataFrame(0.0, index=['accuracy'], columns=features)
    learning_rates = pd.DataFrame(0.0, index=['accuracy'], columns=learningrates)

    for j in reversed(range(1, 51)):
        print(j)
        X_train = X_train_all[:-j, :]
        y_train = y_train_all[:-j]
        X_valid = X_train_all[-j, :].reshape(1, -1)
        y_valid = y_train_all[-j].flatten()
        for f in features:
            for lr in learningrates:
                estimator = AdaBoostClassifier(learning_rate=lr,
                                               random_state=random_state)
                selector = RFE(estimator, n_features_to_select=f, step=1)
                selector = selector.fit(X_train, y_train)
                # Score once and reuse; the constant 90 only rescales the
                # totals and does not change which column wins.
                accuracy = selector.score(X_valid, y_valid)
                valid_score.loc['accuracy', f] += accuracy / (90 * len(learningrates))
                learning_rates.loc['accuracy', lr] += accuracy / (90 * len(features))
                print("number {}, feature number {}, learning rate{}, accuracy {}".format(
                    j, f, lr, accuracy))

    best_lr = float(learning_rates.loc['accuracy'].idxmax())
    best_n_features = int(valid_score.loc['accuracy'].idxmax())
    print(best_lr)
    print(best_n_features)

    # Use the selected learning rate, not the leftover loop variable ``lr``.
    estimator_select = AdaBoostClassifier(learning_rate=best_lr,
                                          random_state=random_state)
    selector_select = RFE(estimator_select,
                          n_features_to_select=best_n_features, step=1)
    selector_select_fit = selector_select.fit(X_train_all, y_train_all)
    return selector_select_fit, best_lr
def feature_selection(X, Y, estimator):
    """Return X reduced to the RFE subset (1 to 6 features) that scores best.

    Each candidate selector is fitted and scored on the same ``X``/``Y``; the
    first selector reaching the highest score wins. Its feature ranking is
    printed.

    Args:
        X: Feature matrix.
        Y: Targets.
        estimator: Estimator handed to ``RFE``.

    Returns:
        ``X`` transformed by the winning selector.
    """
    chosen = None
    best_score = float('-inf')
    for n_features in range(1, 7):
        selector = RFE(estimator, n_features_to_select=n_features).fit(X, Y)
        current = selector.score(X, Y)
        # Starting from -inf guarantees a fitted selector is always chosen,
        # even when every score is <= 0.
        if current > best_score:
            chosen = selector
            best_score = current
    print(chosen.ranking_)
    return chosen.transform(X)
def get_best_features_Nums(X_train, y_train, originNum):
    """Return the feature count (1..originNum) with the best training score.

    A ``LinearSVC`` wrapped in ``RFE`` is fitted for each count and scored on
    the training data itself; ties keep the smaller count.

    Args:
        X_train: Training feature matrix.
        y_train: Training labels.
        originNum: Largest feature count to try.

    Returns:
        The best feature count, or 0 if no score exceeded 0.
    """
    feature_num = 0
    best_acc = 0
    print(originNum)
    for n_selected in range(1, originNum + 1):
        model = RFE(LinearSVC(), n_features_to_select=n_selected)
        model.fit(X_train, y_train)
        # Score once per candidate instead of twice.
        acc = model.score(X_train, y_train)
        if acc > best_acc:
            best_acc = acc
            feature_num = n_selected
    print(best_acc)
    return feature_num
def __init__(self, datafile, n_range, estimators=100, nsteps=5, ftest=True,
             mutual=True, rfe=True, seed=1, loss_='deviance', testsize=0.15):
    """Compare three feature-selection strategies with gradient boosting.

    Reads ``datafile`` as CSV (last column is the target), splits it into
    train/test sets and, for every ``n`` in ``n_range``, evaluates the enabled
    strategies. Test accuracies are appended to ``self.acc``; the names of the
    features kept for the last ``n`` are stored on the instance.

    Args:
        datafile: Path of the CSV file.
        n_range: Iterable of feature counts to evaluate.
        estimators: ``n_estimators`` of each GradientBoostingClassifier.
        nsteps: ``step`` passed to RFE.
        ftest: Evaluate ``SelectKBest(f_classif)``.
        mutual: Evaluate ``SelectKBest(mutual_info_classif)``.
        rfe: Evaluate recursive feature elimination.
        seed: Seed given to ``np.random.seed`` before each ``n``.
        loss_: ``loss`` of each GradientBoostingClassifier.
        testsize: Fraction of rows held out for testing.
    """
    self.data = pd.read_csv(datafile)
    self.x, y = self.data.iloc[:, :-1], self.data.iloc[:, -1]
    self.feature_names = list(self.x.columns)
    # Honour the ``testsize`` argument (it was previously ignored).
    self.x_train, self.x_test, self.y_train, self.y_test = train_test_split(
        self.x, y, test_size=testsize, random_state=0)
    self.acc = {'f_classif': [], 'mutual': [], 'rfe': []}

    # Masks stay None when a strategy is disabled or n_range is empty.
    mask_ftest = None
    mask_mutual = None
    mask_rfe = None

    for n in n_range:
        np.random.seed(seed)

        # GradientBoostingClassifier on the f_classif top-n features.
        if ftest:
            ftest_selector = SelectKBest(score_func=f_classif, k=n)
            boost_clf = Pipeline([
                ("Feature_select", ftest_selector),
                ("Classify", GradientBoostingClassifier(
                    loss=loss_, n_estimators=estimators)),
            ])
            self.y_pred_f = boost_clf.fit(
                self.x_train, self.y_train).predict(self.x_test)
            self.acc['f_classif'].append(
                accuracy_score(self.y_test, self.y_pred_f))
            mask_ftest = ftest_selector.get_support()

        # GradientBoostingClassifier on the mutual_info_classif top-n features.
        # Local names differ from the boolean flags so the flags keep their
        # meaning on later iterations.
        if mutual:
            mutual_selector = SelectKBest(score_func=mutual_info_classif, k=n)
            boost_m_clf = Pipeline([
                ("Feature_select", mutual_selector),
                ("Classify", GradientBoostingClassifier(
                    loss=loss_, n_estimators=estimators)),
            ])
            self.y_pred_m = boost_m_clf.fit(
                self.x_train, self.y_train).predict(self.x_test)
            self.acc['mutual'].append(
                accuracy_score(self.y_test, self.y_pred_m))
            mask_mutual = mutual_selector.get_support()

        # Recursive feature elimination with GradientBoostingClassifier.
        if rfe:
            rfe_selector = RFE(
                estimator=GradientBoostingClassifier(
                    loss=loss_, n_estimators=estimators),
                step=nsteps, n_features_to_select=n)
            self.selector = rfe_selector.fit(self.x_train, self.y_train)
            self.y_pred_r = self.selector.predict(self.x_test)
            self.acc['rfe'].append(
                self.selector.score(self.x_test, self.y_test))
            self.r2 = r2_score(self.y_test, self.y_pred_r)
            mask_rfe = self.selector.support_

    # Names of the features kept by each strategy for the last n evaluated.
    if mask_ftest is not None:
        self.Ftest_features = [
            name for keep, name in zip(mask_ftest, self.feature_names) if keep]
    if mask_mutual is not None:
        self.mutual_features = [
            name for keep, name in zip(mask_mutual, self.feature_names) if keep]
    if mask_rfe is not None:
        self.rfe_features = [
            name for keep, name in zip(mask_rfe, self.feature_names) if keep]
        self.features_rank = self.selector.ranking_
def test_rfe():
    """RFE recovers the four iris features from data padded with noise.

    Six columns of Gaussian noise are appended to iris; RFE with a linear SVC
    must keep exactly the original columns, for dense and sparse input alike.
    """
    rng = check_random_state(0)
    iris = load_iris()
    noise = rng.normal(size=(iris.data.shape[0], 6))
    X = np.hstack([iris.data, noise])
    y = iris.target
    X_sparse = sparse.csr_matrix(X)

    # Dense input.
    svc_dense = SVC(kernel="linear")
    rfe_dense = RFE(estimator=svc_dense, n_features_to_select=4, step=0.1)
    rfe_dense.fit(X, y)
    X_reduced = rfe_dense.transform(X)
    svc_dense.fit(X_reduced, y)
    assert len(rfe_dense.ranking_) == X.shape[1]

    # Sparse input.
    svc_sparse = SVC(kernel="linear")
    rfe_for_sparse = RFE(estimator=svc_sparse, n_features_to_select=4, step=0.1)
    rfe_for_sparse.fit(X_sparse, y)
    X_reduced_sparse = rfe_for_sparse.transform(X_sparse)

    assert X_reduced.shape == iris.data.shape
    assert_array_almost_equal(X_reduced[:10], iris.data[:10])
    assert_array_almost_equal(rfe_dense.predict(X), svc_dense.predict(iris.data))
    assert rfe_dense.score(X, y) == svc_dense.score(iris.data, iris.target)
    assert_array_almost_equal(X_reduced, X_reduced_sparse.toarray())
def main(params, inputs, outputs):
    """Select features with RFE over a random forest and pickle the result.

    Args:
        params: Object with ``step`` and ``n_features`` attributes for RFE.
        inputs: Object whose ``x`` and ``y`` attributes are pickle paths of the
            feature frame and the target.
        outputs: Object whose ``x_new`` and ``y_new`` attributes are the pickle
            paths to write.
    """
    # Read the input features and the target.
    # NOTE(review): read_pickle executes arbitrary code from the file; only
    # use it on trusted inputs.
    x = pd.read_pickle(inputs.x)
    y = pd.read_pickle(inputs.y)

    # Estimator used by RFE.
    estimator = RandomForestClassifier(n_estimators=20, criterion='gini',
                                       class_weight='balanced', n_jobs=-1)

    # Fit RFE with the configured step and feature count.
    rfe = RFE(estimator, step=params.step,
              n_features_to_select=params.n_features)
    rfe.fit(x, y)

    # Keep the supported columns by position so duplicate column labels
    # cannot pull in extra columns.
    x_new = x.loc[:, rfe.support_]
    y_new = y.copy()

    # Write the outputs.
    x_new.to_pickle(outputs.x_new)
    y_new.to_pickle(outputs.y_new)
def Ftr_elm(X, y):
    """Feature elimination using RFE over LogisticRegression.

    For every feature count ``i`` from 1 to the number of columns, an RFE
    selector is fitted and its training score is adjusted with
    ``1 - (n - 1) * (1 - score) / (n - i - 1)``. The subset with the highest
    adjusted score is kept.

    Args:
        X: 2-D array, rows are samples.
        y: Targets.

    Returns:
        Tuple ``(adj_R2, support, ranking, X_sub)``: the adjusted score per
        feature count, the support mask and ranking of the best selector, and
        ``X`` restricted to the best subset.
    """
    adj_R2 = []
    selectors = []
    max_adj_R2_so_far = 0
    best_selector = None
    n = len(X)
    k = len(X[0])
    selected_ranking = []
    for i in range(1, k + 1):
        selector = RFE(LogisticRegression(), n_features_to_select=i, verbose=1)
        selector = selector.fit(X, y)
        current_R2 = selector.score(X, y)
        current_adj_R2 = 1 - (n - 1) * (1 - current_R2) / (n - i - 1)
        adj_R2.append(current_adj_R2)
        selectors.append(selector)
        if max_adj_R2_so_far < current_adj_R2:
            max_adj_R2_so_far = current_adj_R2
            best_selector = selector
            selected_ranking.append(selector.ranking_)
        print('End of iteration no. {}'.format(i))
        print('selector support is :', selector.support_)
        print('selected ranking is ;', selected_ranking)
    if best_selector is None:
        # No subset scored above 0: fall back to the best one seen so the
        # function still returns a usable subset.
        best_selector = selectors[adj_R2.index(max(adj_R2))]
    X_sub = X[:, best_selector.support_]
    # Report the support and ranking that match X_sub, not the last iteration.
    return (adj_R2, best_selector.support_, best_selector.ranking_, X_sub)
def get_score_and_features(self, n_features):
    """Select ``n_features`` columns with RFE over XGBoost and score them.

    ``self.df`` is split into features (every column except ``'Label'``) and
    the ``'Label'`` target, then into train/test sets (33% test, fixed seed).
    The selector is fitted on the training part and scored on the test part.

    Args:
        n_features: Number of features RFE should keep.

    Returns:
        Tuple ``(selected_feature_names, test_score)``; both are also stored
        on ``self.selected_features`` and ``self.list_score``.
    """
    features = list(self.df.columns.values)
    features.remove('Label')
    df_X = self.df[features]
    df_Y = self.df['Label']
    x_train, x_test, y_train, y_test = train_test_split(
        df_X, df_Y, test_size=0.33, random_state=0)

    # n_features_to_select must be passed by keyword on current scikit-learn.
    selector = RFE(XGBClassifier(random_state=0),
                   n_features_to_select=n_features, step=1)
    selector = selector.fit(x_train, y_train)

    support = np.array(selector.support_)
    self.selected_features = list(np.array(df_X.columns)[support])
    self.list_score = selector.score(x_test, y_test)
    return self.selected_features, self.list_score
def demo():
    """Run RFE with a linear SVC on the digits images and plot the ranking.

    Prints the selector's score on the training data and shows the per-pixel
    ranking as an image.
    """
    digits = load_digits()
    X = digits.images.reshape((len(digits.images), -1))
    y = digits.target

    # Create the RFE object and rank the pixels.
    # NOTE(review): with n_features_to_select=None RFE keeps half of the
    # features, so those all share rank 1 -- confirm this is intended.
    svc = SVC(kernel="linear", C=1)
    rfe = RFE(estimator=svc, n_features_to_select=None, step=1)
    rfe.fit(X, y)
    ranking = rfe.ranking_.reshape(digits.images[0].shape)
    print(rfe.score(X, y))

    # Plot pixel ranking.
    plt.matshow(ranking, cmap=plt.cm.Blues)
    plt.colorbar()
    plt.title("Ranking of pixels with RFE")
    plt.show()
def sonar_wrapper():
    """Wrapper-style selection on the sonar data: RFE keeps 5 features.

    Fits RFE over an ``SGDClassifier`` and prints accuracy and F1 measured on
    the same data the selector was fitted on.
    """
    sonar_data = load_sonar_data()
    sonar_values, sonar_labels = data_preprocessing(sonar_data)
    estimator = SGDClassifier(max_iter=1000)
    selector = RFE(estimator, n_features_to_select=5)
    selector.fit(sonar_values, sonar_labels)
    score = selector.score(sonar_values, sonar_labels)
    # Metric signature is (y_true, y_pred).
    f1score = f1_score(sonar_labels, selector.predict(sonar_values))
    print('Sonar-wrapper -accuracy of TOP 5 features = %.4f, F1 score = %.4f' % (score, f1score))
def wbc_wrapper():
    """Wrapper-style selection on the WBC data: RFE keeps 5 features.

    Fits RFE over an ``SGDClassifier`` and prints accuracy and F1 measured on
    the same data the selector was fitted on.
    """
    wbc_data = load_wbc_data()
    wbc_values, wbc_labels = data_preprocessing(wbc_data)
    estimator = SGDClassifier(max_iter=1000)
    selector = RFE(estimator, n_features_to_select=5)
    selector.fit(wbc_values, wbc_labels)
    score = selector.score(wbc_values, wbc_labels)
    # Metric signature is (y_true, y_pred).
    f1score = f1_score(wbc_labels, selector.predict(wbc_values))
    print('WBC-wrapper -accuracy of TOP 5 features = %.4f, F1 score = %.4f' % (score, f1score))
def elimination_feature():
    """Fit a 3-feature RFE linear regression for the 'Milk' target.

    Returns:
        Tuple ``(score, mse, predictions)`` measured on the test split.
    """
    frame = _load_data()
    X_train, X_test, y_train, y_test = _train_test(frame, 'Milk')

    selector = RFE(LinearRegression(), n_features_to_select=3)
    selector.fit(X_train, y_train)

    predictions = selector.predict(X_test)
    test_score = selector.score(X_test, y_test)
    test_mse = mean_squared_error(y_test, predictions)
    return test_score, test_mse, predictions
def rfe_selection(X, y, model):
    """Select 12 features by recursive feature elimination and plot them.

    The selected columns are joined with the target, their names are printed
    and the result is passed to ``show_correlation``.

    Args:
        X (DataFrame): All predictor variables.
        y (Series): The target variable.
        model: Any scikit-learn model used as the reference for selection.

    Returns:
        None.
    """
    rfe = RFE(model, n_features_to_select=12, step=1, verbose=1)
    rfe = rfe.fit(X, y)
    # Select by mask: multiplying by the mask and dropping all-zero columns
    # loses selected columns that are all zero and keeps NaN columns.
    selected = X.loc[:, rfe.support_]
    selected = pd.concat([selected, y], axis=1, sort=False)
    print(selected.columns)
    show_correlation(selected)
def findRFE():
    """Return the names of the features in the best-scoring RFE subset.

    Fits RFE over logistic regression on the module-level ``xtr``/``ytr`` for
    every feature count from 1 to 6, scores each on ``xte``/``yte`` and keeps
    the first subset with the highest score.

    Returns:
        List of feature names selected by the best subset.
    """
    feature_names = [
        'Person A',
        'Person B',
        'Years of Knowing',
        'Interaction Duration',
        'Interaction Type',
        'Moon Phase During Interaction',
    ]
    supports = []
    accuracies = []
    for n_selected in range(1, len(feature_names) + 1):
        model = sklearn.linear_model.LogisticRegression()
        rfe = RFE(model, n_features_to_select=n_selected)
        rfe = rfe.fit(xtr, ytr)
        print("\n", "rfe", n_selected)
        print(rfe.support_)
        supports.append(rfe.support_)
        accuracy = rfe.score(xte, yte)  # score once, print and store
        print(accuracy)
        accuracies.append(accuracy)

    best_support = supports[int(np.argmax(np.asarray(accuracies)))]
    return [name for name, keep in zip(feature_names, best_support) if keep]
def feature_selection(features_values_temp, rows_temp, columns_temp, prediction_values_temp, kernel, threshold):
    """Binary-search the RFE feature count whose score crosses ``threshold``.

    Args:
        features_values_temp: Flat iterable of feature values.
        rows_temp: Number of rows of the feature matrix.
        columns_temp: Number of columns of the feature matrix.
        prediction_values_temp: Iterable of target values.
        kernel: SVR kernel: linear, poly, rbf, sigmoid or precomputed.
        threshold: Score threshold; converted with ``float``.
    """
    # The original author could not use the passed-in parameters directly, so
    # they are re-materialised as plain Python values. The counting loops are
    # kept as written because the reason for the workaround is not visible.
    rows = 0
    while rows_temp > 0:
        rows = rows + 1
        rows_temp = rows_temp - 1
    columns = 0
    while columns_temp > 0:
        columns = columns + 1
        columns_temp = columns_temp - 1
    features_values = list(features_values_temp)
    prediction_values = list(prediction_values_temp)

    rotated = convert_list_to_matrix(features_values, rows, columns)
    scores = np.array(prediction_values)
    threshold = float(threshold)
    estimator = SVR(kernel=kernel)

    # Binary search over the number of features to keep.
    lower_bound = 0
    upper_bound = columns
    selector = None
    while upper_bound - lower_bound > 1:
        # Floor division keeps the feature count an int on Python 3.
        current_selector = (lower_bound + upper_bound) // 2
        selector = RFE(estimator, n_features_to_select=current_selector, step=1)
        selector = selector.fit(rotated, scores)
        if selector.score(rotated, scores) > threshold:
            upper_bound = current_selector
        else:
            lower_bound = current_selector

    print("second threshold: ")
    # With fewer than two columns the search never runs and there is no
    # selector to score.
    if selector is not None:
        print(selector.score(rotated, scores))
def linrfe():
    """Ridge, RFE, interaction features and LightGBM on ``train.txt``.

    To finish quickly, ``step`` needs to be set to a large value.

    Author's notes: ridge scored 0.28+ and ridge + RFE scored 0.28+, yet the
    online score was 0.045, so this offline check appears unreliable.
    """
    X, y = load_svmlight_file('train.txt')
    X = X.toarray()
    scaler = StandardScaler().fit(X)
    X = scaler.transform(X)

    reg = linear_model.Ridge(alpha=0.5)
    reg.fit(X, y)
    print('r^2= {}'.format(reg.score(X, y)))
    print('train mse =  {}'.format(mean_squared_error(y, reg.predict(X))))

    rfe = RFE(estimator=reg, n_features_to_select=500, step=1000, verbose=2)
    rfe.fit(X, y)
    print('rfe r^2 =  {}'.format(rfe.score(X, y)))
    print('rfe mse = {}'.format(mean_squared_error(y, rfe.predict(X))))

    # Expanding the full matrix directly raises MemoryError, so only the
    # RFE-reduced matrix gets interaction terms.
    X_rfe = rfe.transform(X)
    poly = PolynomialFeatures(degree=2, interaction_only=True)
    X_poly = poly.fit_transform(X_rfe)

    param_grid = {'alpha': [0.5, 1, 10, 100, 1000, 1e4, 3e4]}
    grid_search = GridSearchCV(reg, param_grid, verbose=2,
                               scoring='neg_mean_squared_error', cv=5)
    grid_search.fit(X_poly, y)
    logging.info('after rfe poly, best_result = {0}'.format(grid_search.best_score_))
    logging.info('after rfe poly, best_param= {0}'.format(grid_search.best_params_))

    params = {
        'objective': 'mse',
        'num_leaves': 8,
        'learning_rate': 0.05,
        'min_child_samples': 60,  # this one matters for this problem
        # 'subsample': 0.9,
        'n_estimators': 100,
        'silent': False,
    }
    gbm = lgb.LGBMRegressor(**params)
    gbm.fit(X_poly, y, eval_metric='mse', eval_set=[(X_poly, y)])
    # predict takes only the features; a second positional argument would be
    # read as ``raw_score``.
    logging.info('train lgb of poly = {0}'.format(
        mean_squared_error(y, gbm.predict(X_poly))))
def choice_feature_nums(data_x, data_y, col_name):
    """Plot and return the RFE score for each feature count.

    For every count from 3 to ``len(col_name)`` an RFE selector over linear
    regression is fitted and scored on ``data_x``/``data_y``.

    Args:
        data_x: Feature matrix.
        data_y: Targets.
        col_name: Column names; only its length is used.

    Returns:
        Dict mapping feature count to score.
    """
    n = len(col_name)
    dic = {}
    for i in range(3, n + 1):
        # RFE needs an estimator instance, not the class.
        rfe = RFE(estimator=LinearRegression(), n_features_to_select=i)
        rfe.fit(data_x, data_y)
        # score requires the data to evaluate on.
        dic[i] = rfe.score(data_x, data_y)
    plt.xlabel('feature_num')
    plt.ylabel('score')
    plt.plot(list(dic.keys()), list(dic.values()))
    plt.show()
    return dic
def rfe_feature_select(count, features, X, y, model_type="RF", model=None, v=0, scores=None):
    """Recursively narrow a feature set with RFE and record every trial.

    At each level RFE is run for 100%, 80%, 60%, 40% and 20% of ``count``
    features. Each trial's training score is weighted slightly in favour of
    smaller sets; the best weighted trial decides the next level. Recursion
    stops once the two largest candidate counts differ by at most one.

    Args:
        count: Number of features currently under consideration.
        features: Names of those features, aligned with the columns of ``X``.
        X: DataFrame holding exactly those feature columns.
        y: Targets.
        model_type: Label stored with each trial.
        model: Estimator for RFE; defaults to
            ``RandomForestClassifier(random_state=42)``.
        v: Verbosity; values above 0 print progress.
        scores: DataFrame of trials accumulated so far (used by recursion).

    Returns:
        DataFrame with one row per trial across all recursion levels.
    """
    # Build defaults per call so no estimator or frame is shared across calls.
    if model is None:
        model = RandomForestClassifier(random_state=42)
    if scores is None:
        scores = pd.DataFrame(columns=['model_type', 'model', 'params',
                                       'n_features', 'score', 'features',
                                       'ranking', 'weighted_score'])
    start = time.time()
    helicopter = [count, round(count*0.8), round(count*0.6), round(count*0.4), round(count*0.2)]
    weightings = [0.98, 0.985, 0.99, 0.995, 1]
    if v > 0:
        print(model)

    params = model.get_params()
    rows = []
    for n_keep, weighting in zip(helicopter, weightings):
        if n_keep < 1:
            # RFE cannot select zero features.
            continue
        eliminator = RFE(estimator=model, n_features_to_select=int(n_keep),
                         step=1, verbose=v)
        eliminator.fit(X, y)
        feats = pd.Series(eliminator.support_, index=features)
        weight = 100 * eliminator.score(X, y)
        rows.append({'model_type': model_type,
                     'model': model,
                     'params': params,
                     'n_features': eliminator.n_features_,
                     # Store the rounded score, not the results frame itself.
                     'score': round(weight, 2),
                     'features': list(feats[feats].index),
                     'ranking': list(eliminator.ranking_),
                     'weighted_score': round(weighting * weight, 2)})
    if not rows:
        return scores
    score = pd.DataFrame(rows)

    if v > 0:
        elapsed = time.time() - start
        print("%d features took %d seconds, or %d minutes" % (count, round(elapsed), round(elapsed / 60)))
        print("Tested feature combinations: %s" % helicopter)
        print(score)

    df = score.sort_values(by='weighted_score', ascending=False)
    best = df.iloc[0]
    best_count = int(best['n_features'])
    best_features = best['features']
    print("Selected feature combination: %d" % best_count)

    # Record this level's trials; previously nothing was ever added to the
    # returned frame.
    scores = score if scores.empty else pd.concat([scores, score], ignore_index=True)

    if helicopter[0] - helicopter[1] <= 1:
        # We've reached the end.
        return scores
    if best_count == count:
        # The highest feature count is the best, so try one feature fewer.
        next_count = len(best_features) - 1
        next_features = best_features[0:next_count]
    else:
        next_count = best_count
        next_features = best_features
    # Recurse on the data that was passed in rather than on module globals.
    return rfe_feature_select(next_count, next_features, X[next_features], y,
                              model_type, model, v, scores)
def reg_with_rfe(self):
    """Fit linear regression through RFE and report scores and coefficients.

    Uses ``self.x_train``/``self.y_train`` for fitting and prints the train
    and test scores, the number of features kept and each kept feature's
    non-zero coefficient.
    """
    print("regression after RFE", "\n")
    reg = LinearRegression()
    selector = RFE(reg)
    selector.fit(self.x_train, self.y_train)
    train_score = selector.score(self.x_train, self.y_train)
    test_score = selector.score(self.x_test, self.y_test)
    coeff_used = selector.n_features_
    print("training score:", train_score)
    print("test score: ", test_score)
    print("number of features used: ", coeff_used)

    # Print the coefficient of every variable that was used. The final
    # estimator is fitted on the selected columns only, so coefficients must
    # be paired with those columns rather than with all of them.
    print("Selected variables:")
    selected_columns = self.x_train.columns[selector.support_]
    for var, coef in zip(selected_columns, selector.estimator_.coef_):
        if coef == 0:
            continue
        print(str(var) + ":", coef)
    print("-------------------------------------------------------")
def rfe_scores(clf, X_train, y_train, X_test, y_test, n_nodes=None):
    """Fit one RFE selector per requested feature count and report each.

    Args:
        clf: Estimator wrapped by RFE.
        X_train, y_train: Data used for fitting.
        X_test, y_test: Data used for the printed score.
        n_nodes: Feature counts to try; defaults to [1, 4, 6, 10, 15, 22].

    Returns:
        List of fitted RFE selectors, in the order of ``n_nodes``.
    """
    feature_counts = [1, 4, 6, 10, 15, 22] if n_nodes is None else n_nodes
    fitted = []
    for count in feature_counts:
        selector = RFE(clf, n_features_to_select=count)
        selector.fit(X_train, y_train)
        chosen_indices = selector.get_support(True)
        print("RFE {} selected nodes {}".format(count, chosen_indices))
        test_score = selector.score(X_test, y_test)
        print("RFE {} score is {}".format(count, test_score))
        fitted.append(selector)
    return fitted
def get_best_attr_with_score(model, x_test, y_test, attr_col, k):
    """Select ``k`` attributes with RFE and score the result on test data.

    The selector is fitted on ``x_train``/``y_train``, which are not
    parameters and must exist at module level.

    Args:
        model: Estimator wrapped by RFE.
        x_test: Test features.
        y_test: Test targets.
        attr_col: Attribute names aligned with the feature columns.
        k: Number of attributes to keep.

    Returns:
        Tuple ``(score, selected_attribute_names, support_mask)``.
    """
    selector = RFE(model, n_features_to_select=k, step=1)
    selector = selector.fit(x_train, y_train)
    our_selected_attr = selector.get_support()
    score = selector.score(x_test, y_test)
    list_attr_se = [name for keep, name in zip(our_selected_attr, attr_col) if keep]
    return score, list_attr_se, our_selected_attr
def q4():
    """Compare plain linear regression with a 5-feature RFE variant.

    Works on the module-level ``df`` with ``"Overall"`` as the target. Prints
    r2, MSE and RMSE on the test split plus the five largest coefficients for
    both models.

    Returns:
        List with the names of the five features chosen by RFE.
    """
    def report_errors(actual, predicted):
        # Same two error lines are printed for both models.
        print('MSE', mse(actual, predicted))
        print('RMSE', mse(actual, predicted, squared=False))

    def print_top_coefficients(names, coefs):
        table = pd.DataFrame.from_dict(dict(zip(names, coefs)), orient='index', columns=['coef'])
        print(table.sort_values(by='coef', ascending=False).head(5))

    y = df["Overall"]
    X = df.copy().drop(columns=["Overall"])
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, shuffle=True)

    # Baseline: all features.
    reg = LinearRegression()
    reg.fit(X_train, y_train)
    print("Model r2 score Linear regression:", reg.score(X_test, y_test))
    report_errors(y_test, reg.predict(X_test))
    print_top_coefficients(X_train.columns, reg.coef_)

    # RFE down to five features.
    selector = RFE(estimator=reg, n_features_to_select=5, step=1, verbose=0)
    selector = selector.fit(X_train, y_train)
    selected_features5 = list(X_train.columns[selector.get_support()])
    print('\nMost important features RFE', selected_features5)
    print("\nModel r2 score RFE Linear regression selected features:", selector.score(X_test, y_test))
    report_errors(y_test, selector.predict(X_test))

    # Refit the plain regressor on the reduced matrix to read its coefficients.
    reg.fit(selector.transform(X_train), y_train)
    reduced_coefs = reg.coef_
    print_top_coefficients(selected_features5, reduced_coefs)
    return selected_features5
def selected_feature_from_random_forrest(X_train, X_test, y_train, y_test):
    """Return the support mask of a 10-feature RFE over a random forest.

    RFE drops 3 features per step. The R squared on the test set is printed.
    """
    selector = RFE(
        estimator=RandomForestRegressor(),
        n_features_to_select=10,
        step=3,
        verbose=1,
    )
    selector.fit(X_train, y_train)

    # R squared measured on the held-out data.
    test_r2 = selector.score(X_test, y_test)
    template = 'The model can explain {0:.1%} of the variance in the test set'
    print(template.format(test_r2))

    return selector.support_
def selected_feature_by_rfe(X_train, X_test, y_train, y_test):
    """Return the support mask of a 10-feature RFE over gradient boosting.

    RFE drops 3 features per step. The R squared on the test set is printed.
    """
    selector = RFE(
        estimator=GradientBoostingRegressor(),
        n_features_to_select=10,
        step=3,
        verbose=1,
    )
    selector.fit(X_train, y_train)

    # R squared measured on the held-out data.
    test_r2 = selector.score(X_test, y_test)
    template = 'The model can explain {0:.1%} of the variance in the test set'
    print(template.format(test_r2))

    return selector.support_
def RFE_SVM(x, y, numFeats):
    """Fit RFE over a linear SVC and report test accuracy and elapsed time.

    Args:
        x: Feature frame.
        y: Label frame; its first column is used as the target.
        numFeats: Number of features RFE should keep.

    Returns:
        Tuple ``(fitted_selector, test_accuracy)``.
    """
    # train_test_split has no random_state here, so the global seed fixes it.
    np.random.seed(7)
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.33)
    clf = SVC(kernel='linear', probability=True, max_iter=200)
    selector = RFE(estimator=clf, n_features_to_select=numFeats, step=5)
    # time.clock() was removed in Python 3.8.
    s_time = time.perf_counter()
    selector.fit(x_train, y_train.iloc[:, 0])
    acc = selector.score(x_test, y_test.iloc[:, 0])
    e_time = time.perf_counter()
    print('\n Total Time: ', e_time - s_time)
    print(' Accuracy:', acc)
    return selector, acc
def rfe(self, X_train, y_train, X_test, y_test, n_features = 300, step = 0.2, kernel = "linear"):
    """Recursive feature elimination over an SVR.

    Hyperparameter tuning was done in a different notebook.

    Args:
        X_train: Training feature DataFrame.
        y_train: Training targets (pandas object).
        X_test: Test features.
        y_test: Test targets.
        n_features: Number of features to keep.
        step: Features removed per iteration; a value below 1 is a fraction.
        kernel: SVR kernel.

    Returns:
        Dict ``{"Recursive Feature Elimination": [selected_names, importances]}``
        where every importance is 1.
    """
    estimator = SVR(kernel=kernel, C=0.01, gamma=1e-07)
    selector = RFE(estimator, n_features_to_select=n_features, step=step)
    selector = selector.fit(X_train.to_numpy(), y_train.to_numpy())

    # Score on plain arrays too, matching how the selector was fitted.
    X_eval = X_test.to_numpy() if hasattr(X_test, "to_numpy") else X_test
    print('Accuracy of RFE: {:.3f}'.format(selector.score(X_eval, y_test)))

    selected_features = X_train.columns[selector.support_].tolist()
    feature_importances = [1] * len(selected_features)
    dictionary = {"Recursive Feature Elimination": [selected_features, feature_importances]}
    return dictionary
def RFE_AdaBoost(x, y, numFeats):
    """Fit RFE over AdaBoost and report test accuracy and elapsed time.

    Args:
        x: Feature frame.
        y: Label frame; its first column is used as the target.
        numFeats: Number of features RFE should keep.

    Returns:
        Tuple ``(fitted_selector, test_accuracy)``.
    """
    # train_test_split has no random_state here, so the global seed fixes it.
    np.random.seed(7)
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.33)
    # The default base learner is used; ``base_estimator=None`` was only
    # spelling out that default and the keyword no longer exists in recent
    # scikit-learn releases.
    clf = AdaBoostClassifier(n_estimators=50, learning_rate=1.0)
    selector = RFE(estimator=clf, n_features_to_select=numFeats, step=5)
    # time.clock() was removed in Python 3.8.
    s_time = time.perf_counter()
    selector.fit(x_train, y_train.iloc[:, 0])
    acc = selector.score(x_test, y_test.iloc[:, 0])
    e_time = time.perf_counter()
    print('\n Total Time: ', e_time - s_time)
    print(' Accuracy:', acc)
    return selector, acc
def svm(train_set, label_set, test_set, ground_truth):
    """Train a linear SVC through 2-feature RFE and evaluate it.

    Both sets are row-normalised first. The selector's test score is printed.

    Args:
        train_set: Training features.
        label_set: Training labels.
        test_set: Test features.
        ground_truth: True labels of ``test_set``.

    Returns:
        Tuple ``(precision, recall, f1)`` on the test set.
    """
    train_set = Normalizer().fit_transform(train_set)
    test_set = Normalizer().fit_transform(test_set)
    svm_clf = SVC(C=0.2, kernel='linear')
    # Scores recorded by the author for other feature counts:
    # n=5: 0.6497, n=8: 0.66358, n=12: 0.66728, n=15: 0.66635
    rfe = RFE(estimator=svm_clf, n_features_to_select=2, step=1)
    rfe.fit(train_set, label_set)
    y = rfe.predict(test_set)
    print(rfe.score(test_set, ground_truth))
    # Metrics take (y_true, y_pred); the reversed order swaps precision and
    # recall.
    p = precision_score(ground_truth, y)
    r = recall_score(ground_truth, y)
    f = f1_score(ground_truth, y)
    return p, r, f
# Classifiers to compare: naive Bayes, grid-searched SVC, grid-searched k-NN.
gnb = GaussianNB()
parameters = {'kernel': ('linear', 'rbf'), 'C': [0.001, 1, 100], 'gamma': [0.001, 0.0001]}
svr = svm.SVC()
clf = GridSearchCV(estimator=svr, param_grid=parameters, cv=2)
parameters = {'n_neighbors': [4, 12, 20], 'algorithm': ('auto', 'ball_tree', 'kd_tree', 'brute')}
kvr = KNeighborsClassifier()
knn = GridSearchCV(estimator=kvr, param_grid=parameters)

# Recursive feature elimination down to 10 features with a linear SVC.
svmselector = svm.SVC(kernel='linear')
selector = RFE(svmselector, n_features_to_select=10, step=1)
selector = selector.fit(xTrain, yTrain)
print("\tTraining accuracy: " + str(selector.score(xTrain, yTrain) * 100))
print("\tTesting accuracy: " + str(selector.score(xTest, yTest) * 100))

# Report the selected feature mask, the selected indices and the ranking.
chosen = selector.support_
print(chosen)
for idx, val in enumerate(chosen.tolist()):
    if val:
        print(idx)
print(selector.ranking_)

# Training
gnb.fit(xTrain, yTrain)
clf.fit(xTrain, yTrain)
knn.fit(xTrain, yTrain)
X = sc_X.fit_transform(X)
y = sc_y.fit_transform(y.reshape(len(y), 1)).reshape(len(y))

# Feature elimination: fit RFE for every feature count and keep the subset
# with the highest adjusted R squared.
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import RFE
adj_R2 = []
feature_set = []
max_adj_R2_so_far = 0
selected_features = None
n = len(X)
k = len(X[0])
for i in range(1, k + 1):
    selector = RFE(LinearRegression(), n_features_to_select=i, verbose=1)
    selector = selector.fit(X, y)
    current_R2 = selector.score(X, y)
    current_adj_R2 = 1 - (n - 1) * (1 - current_R2) / (n - i - 1)
    adj_R2.append(current_adj_R2)
    feature_set.append(selector.support_)
    if max_adj_R2_so_far < current_adj_R2:
        max_adj_R2_so_far = current_adj_R2
        selected_features = selector.support_
    print('End of iteration no. {}'.format(i))
if selected_features is None:
    # No subset reached an adjusted R squared above 0: use the best one seen
    # instead of failing on an undefined name.
    selected_features = feature_set[adj_R2.index(max(adj_R2))]
X_sub = X[:, selected_features]

# Split into train and test sets.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_sub, y, random_state=0)
data_values = np.loadtxt("train.csv", delimiter=',', usecols=range(15, 16))
print("Number of training samples: " + str(data_features.shape[0]))

# Transform the training data
#scaler = MinMaxScaler(feature_range=(1, 2)).fit(data_features)
#data_features = scaler.transform(data_features)
data_features = transform_data(data_features)

# Fit the regression model on the first 600 rows; the remaining rows are held
# out for evaluation.
alphas = np.logspace(-5, 3, 30)
ridge_regressor = linear_model.RidgeCV(alphas=alphas, normalize=True, fit_intercept=True)
rfe = RFE(estimator=ridge_regressor, n_features_to_select=14, step=1)
rfe.fit(data_features[0:600], data_values[0:600])
print("alpha: " + str(rfe.estimator_.alpha_))
print("intercept: " + str(rfe.estimator_.intercept_))
print("R-square: " + str(rfe.score(data_features[600:], data_values[600:])))

# Compute MSE on the held-out rows
train_pred = rfe.predict(data_features)
mse = ((train_pred[600:] - data_values[600:])**2).mean(axis=0)
print("MSE: " + str(mse))

# Visualize predicted against actual values for the held-out rows
plt.plot(train_pred[600:], label='Predicted')
plt.plot(data_values[600:], label='Actual')
plt.legend(loc='upper right')
plt.show()

# Load the test data
validate_test = np.loadtxt("validate_and_test.csv", delimiter=',', usecols=range(1, 15))
validate_test_ind = np.loadtxt("validate_and_test.csv", delimiter=',', usecols=range(0, 1))
# Features are every value of a row except the last one.
X = [residual[:-1] for residual in data]
n_split = 1800
X_train, X_test = X[:n_split], X[n_split:]
Y_train, Y_test = y[:n_split], y[n_split:]

numFeatures = 40
model = ExtraTreesClassifier()
#model.fit(X_train, Y_train)
rfe = RFE(model, n_features_to_select=numFeatures)
rfe = rfe.fit(X_train, Y_train)
temp = rfe.score(X_test, Y_test)
predictionOfPrelim = rfe.predict(prelimData)
featureRanking = rfe.ranking_
#Best ExtraTrees Accuracy is: [400, 0.98902777777777773, 40]
print("ExtraTrees Accuracy is: ", temp)

prelimClasses = np.loadtxt("prelim-class.txt")
assert len(prelimClasses) == len(predictionOfPrelim)
# 1 where the prediction matches the known class, 0 otherwise.
h = [1 if expected == predicted else 0
     for expected, predicted in zip(prelimClasses, predictionOfPrelim)]
# Script fragment: runs cross-validated RFE (RFECV) on `rf`, then a fixed
# 8-feature RFE whose train/test scores, ranking and fit plots are reported,
# and finally starts a forecasting loop.
# NOTE(review): the statements below share one physical line, so as written
# everything after the first `#` is a comment -- restore the line breaks.
# NOTE(review): RFE(rf,8) passes the feature count positionally; recent
# scikit-learn releases appear to require n_features_to_select=8 -- confirm
# against the installed version.
# NOTE(review): the 'Coeff of Test: ' message prints selector2.ranking_
# (feature ranks), not coefficients.
# NOTE(review): the final loop calls selector2.transform(Xfcast) with the same
# argument on every iteration; the rest of its body is not visible here.
selector1 = RFECV(rf,cv=5); selector1 = selector1.fit(X,y); print("Features selected by RFECV ", selector1.n_features_); # RFECV yields a model with low number (4~5) of features with K-fold cross validation with 5 folds. # Since our sample size is small, additional features are selected to avoid over-fitting on the model data. # 8 Features were selected using RFE. selector2 = RFE(rf,8); # Recursive feature elimination to select 8 best features. selector2 = selector2.fit(X,y); print (selector2.n_features_) for i,j in enumerate(selector2.support_): if j == True: print(features[i]) predictor = selector2.estimator_; print('Variance score Train: %.2f' % selector2.score(X,y)); print('Variance score Test: %.2f' % selector2.score(Xtest,ytest)); print('Coeff of Test: ', selector2.ranking_); print('No of Features selected by RFE = %.2f' %sum(selector2.support_)); plotfit(selector2,X,y, title = 'Training fit'); plotfit(selector2,Xtest,ytest,c='blue', title = 'Test fit'); # Forecast, # Since lagged variables are selected for our model, forecasting is done iteratively # by using the predicted values at time t as lagged variables for time t+1,t+2... # In practice however, this is not required as the true price will be known before prediction. yfcast = []; for i in list(range(len(Xfcast))): Xfcast_trim = selector2.transform(Xfcast);
# Script fragment: fits a 3-feature RFE on X/y, then refits RFE for every
# feature count from 1 to the number of columns and records
# [count, r2, mean squared error, comma-joined selected names] in `results`.
# NOTE(review): the statements below share one physical line, so as written
# everything after the first `#` is a comment -- restore the line breaks.
# NOTE(review): `selcetor` looks like a typo for `selector`; the later reads
# use `selector`, which presumably still works because fit() updates the
# object it is called on -- confirm.
# NOTE(review): RFE here is presumably "recursive feature elimination", not
# "forward elimination" as the inline comment says.
# NOTE(review): RFE(estimator,3,step=1) passes the feature count positionally;
# recent scikit-learn releases appear to require n_features_to_select=3.
# NOTE(review): the trailing ''' string is not closed within this fragment.
from sklearn.metrics import mean_squared_error features = X.columns.values results = [] selector = RFE(estimator,3,step=1) #recursive forward elimination selcetor = selector.fit(X,y) selector.support_ # Out[61]: array([ True, True, True, False, False, False]) # It seems the first predictors are okay but not the the last three. # one can see if droping the last three will improve the model with all 6. selector.ranking_ # Out[65]: array([2, 1, 3, 6, 5, 4]) for i in range(1,len(X.iloc[0])+1): selector = RFE(estimator, n_features_to_select=i, step=1) selector.fit(X,y) r2 = selector.score(X,y) selected_features = features[selector.support_] msr = mean_squared_error(y, selector.predict(X)) results.append([i, r2, msr, ','.join(selected_features)]) results ''' results Out[68]: [[1, 0.47017552557905884, 2.5448877365932985, 'Email'], [2, 0.8987844810699489, 0.48616503259788457, 'Internet,Email'], [3, 0.9008606156394956, 0.47619280658599406, 'Internet,Email,Blog'], [4, 0.9051564044419049, 0.45555899148299284,
# Very similar idea to forward selection, but done recursively. The method is
# greedy: it removes features step by step until NUM_FEATURES remain.
# NUM_FEATURES is somewhat arbitrary; the idea should come from observing the
# scatter plots and correlations.
NUM_FEATURES = 16
model = LinearRegression()

# Each data set gets its own selector. Refitting a single RFE object would
# overwrite the champions fit before its score is computed.
champsFit = RFE(model, n_features_to_select=NUM_FEATURES).fit(champsX, champsY)
print("For Champions:")
print("Num Features:", champsFit.n_features_)
print("Selected Features:", champsFit.support_)
print("Feature Ranking:", champsFit.ranking_)

rfe = RFE(model, n_features_to_select=NUM_FEATURES)
runnersFit = rfe.fit(runnersX, runnersY)
print("For Runner Ups:")
print("Num Features:", runnersFit.n_features_)
print("Selected Features:", runnersFit.support_)
print("Feature Ranking:", runnersFit.ranking_)

# Calculate the score for the selected features, each with its own model.
champsScore = champsFit.score(champsX, champsY)
runnersScore = runnersFit.score(runnersX, runnersY)
print("Model Champs Score with selected features is: ", champsScore)
print("Model Runer up Score with selected features is: ", runnersScore)

# Results: run in a terminal to see them. They are not very good because
# these two data sets are more of a classification problem; linear regression
# is a poor fit here.
# Script fragment that starts inside a search (presumably loops over `feat`
# and `c` that are not visible here): it turns predicted spreads into bets,
# measures how often the bet matches the game result, keeps the best
# C/feature count, then refits a LogisticRegression with the best C and
# prints spreads, predicted spreads and bets for `matchups`.
# NOTE(review): the statements below share one physical line that begins with
# `#`, so as written the whole line is a comment -- restore the line breaks.
# NOTE(review): the `print x` statements are Python 2 syntax.
# NOTE(review): `.ix` was removed from pandas in 1.0 -- confirm the pandas
# version or switch to `.iloc`.
# NOTE(review): `game_result` uses a strict `>`, so an exact tie yields 0,
# matching the "tie counts as a loss for the home team" remark.
# If the actual line for the home team is lower than the predicted line then you would take the away team, otherwise take the home team bet_vector = np.array(np.where(predicted_spreads > spreads,0,1)) # Create the actual result vector where a tie counts as a loss for the home team game_result = np.array(np.where(home_score.ix[:,0] + predicted_spreads[:] > away_score.ix[:,0], 1, 0)) # Check to see where the bet_vector equals the actual game result with the spread included result = np.array(np.where(bet_vector == game_result,1,0)) prob_result = float(np.sum(result)) / len(result) # print 'Number of features =', feat, 'C =',c,' Percent correct =',prob_result if prob_result > prob_val: prob_val = prob_result C_val = c feat_val = feat print 'Score =',selector.score(X_test,y_test) # print prob_val, C_val, feat clf = linear_model.LogisticRegression(C=C_val,random_state=42) clf = clf.fit(X_train,y_train) probabilities = clf.predict_proba(scaler.transform(matchups)) vfunc = np.vectorize(spread_conversion) predicted_spreads = np.apply_along_axis(vfunc,0,probabilities[:,0]) bet_vector = np.array(np.where(predicted_spreads > spreads,0,1)) print spreads print predicted_spreads print bet_vector