print('In-sample R2 score for XGB:')
print(r2_score(y_train, model.predict(dtrain)))
print("OOS R2 score for XGB:")
r2 = r2_score(dvalid.get_label(), model.predict(dvalid))
print(r2)

'''Train the stacked models then predict the test data'''
stacked_pipeline = make_pipeline(
    StackingEstimator(estimator=LassoLarsCV(normalize=True)),
    StackingEstimator(
        estimator=GradientBoostingRegressor(learning_rate=0.001, loss="huber",
                                            max_depth=3, max_features=0.55,
                                            min_samples_leaf=18,
                                            min_samples_split=14,
                                            subsample=0.7)),
    #StackingEstimator(estimator=BayesianRidge()),
    #StackingEstimator(estimator=ElasticNetCV()),
    #StackingEstimator(estimator=HuberRegressor()),
    #StackingEstimator(estimator=LassoCV(eps=0.001, n_alphas=100, alphas=None, fit_intercept=True, normalize=True, precompute='auto', max_iter=1000, tol=0.0001, copy_X=True, cv=None, verbose=False, n_jobs=1, positive=False, random_state=None, selection='cyclic')),
    #StackingEstimator(estimator=LassoLarsIC()),
    #StackingEstimator(estimator=LinearRegression()),
    #StackingEstimator(estimator=OrthogonalMatchingPursuitCV()),
    #StackingEstimator(estimator=RANSACRegressor()),
    #StackingEstimator(estimator=RidgeCV()),
    #LassoLarsCV()  # .6
    #LinearRegression()  # .6
    #ElasticNetCV()  # worse
# Calculate R squared on the test set r_squared = lcv.score(X_test, y_test) print('The model explains {0:.1%} of the test set variance'.format(r_squared)) # Create a mask for coefficients not equal to zero lcv_mask = lcv.coef_ != 0 print('{} features out of {} selected'.format(sum(lcv_mask), len(lcv_mask))) -------------------------------------------------- # Exercise_11 #1 from sklearn.feature_selection import RFE from sklearn.ensemble import GradientBoostingRegressor # Select 10 features with RFE on a GradientBoostingRegressor, drop 3 features on each step rfe_gb = RFE(estimator=GradientBoostingRegressor(), n_features_to_select=10, step=3, verbose=1) rfe_gb.fit(X_train, y_train) #2 from sklearn.feature_selection import RFE from sklearn.ensemble import GradientBoostingRegressor # Select 10 features with RFE on a GradientBoostingRegressor, drop 3 features on each step rfe_gb = RFE(estimator=GradientBoostingRegressor(), n_features_to_select=10, step=3, verbose=1) rfe_gb.fit(X_train, y_train) # Calculate the R squared on the test set r_squared = rfe_gb.score(X_test, y_test) print('The model can explain {0:.1%} of the variance in the test set'.format(r_squared)) #3
def feature_selection(self, X, y, method):
    """
    purpose: select features
    input:  X: train data
            y: label
            method: the method used
    return:
    """
    X_indices = np.arange(X.shape[-1])
    score = []
    # Removing features with low variance
    # correlation coefficient
    # SelectKBest(lambda X,Y: np.array(map(lambda x: pearsonr(x, Y), X.T)).T, k=2).fit_transform(data, target)
    # mutual information
    # SelectKBest(lambda X, Y: array(map(lambda x: mic(x, Y), X.T)).T, k=2).fit_transform(data, target)
    # Univariate feature selection (for classification)
    if method == 'chi-squared':
        skb = SelectKBest(chi2)
        skb.fit_transform(X, y)
        score = skb.scores_
    # Univariate feature selection (for regression)
    if method == 'f_regression':
        skb = SelectKBest(f_regression)
        skb.fit_transform(X, y)
        score = skb.scores_
    # L1-based feature selection (for classification)
    if method == 'LinearSVC':
        lsvc = LinearSVC(C=0.01, penalty="l1", dual=False).fit(X, y)
        sfm = SelectFromModel(lsvc, prefit=True)
        X_new = sfm.transform(X)
    # L1-based feature selection (for regression)
    elif method == 'LassoCV':
        lasso = LassoCV().fit(X, y)
        score = lasso.coef_
        sfm = SelectFromModel(lasso, threshold=0.25, prefit=True)
        X_new = sfm.transform(X)
    # Tree-based feature selection (for classification)
    elif method == 'ExtraTreesClassifier':
        clf = ExtraTreesClassifier()
        clf = clf.fit(X, y)
        print(clf.feature_importances_)
        sfm = SelectFromModel(clf, threshold=0.25, prefit=True)
        X_new = sfm.transform(X)
    # Tree-based feature selection (for regression)
    elif method == 'ExtraTreesRegressor':
        clf = ExtraTreesRegressor()
        clf = clf.fit(X, y)
        score = clf.feature_importances_
        sfm = SelectFromModel(clf, threshold=0.25, prefit=True)
        X_new = sfm.transform(X)
    # Tree-based feature selection (for classification)
    elif method == 'GradientBoostingClassifier':
        clf = GradientBoostingClassifier(learning_rate=0.01)
        clf = clf.fit(X, y)
        score = clf.feature_importances_
        sfm = SelectFromModel(clf, threshold=0.25, prefit=True)
        X_new = sfm.transform(X)
    # Tree-based feature selection (for regression)
    elif method == 'GradientBoostingRegressor':
        clf = GradientBoostingRegressor(learning_rate=0.01)
        clf = clf.fit(X, y)
        score = clf.feature_importances_
        sfm = SelectFromModel(clf, threshold=0.25, prefit=True)
        X_new = sfm.transform(X)

    # Print the feature ranking
    indices = np.argsort(score)[::-1]
    print("Feature ranking:")
    for f in X_indices:
        print("feature %d: %s (%f)" % (indices[f], self.columns[indices[f]], score[indices[f]]))

    # draw plot
    plt.figure()
    # plt.bar(indices, score, width=0.2, color='r')
    plt.barh(indices, score, height=0.2, color='r')
    plt.title(method)
    plt.xlabel("score")
    plt.ylabel("feature")
    plt.grid(axis='x')
    plt.show()
def GenerateGradientBoostModel(X_train, Y_train): gradient_boost_reg = GradientBoostingRegressor(n_estimators=300, learning_rate=0.05, random_state=0) grad_boost_model = gradient_boost_reg.fit(X_train, Y_train) return grad_boost_model
df_regOutput = pd.DataFrame() df_booleanOutput = pd.DataFrame() df = pd.read_csv('data/2007_weather_preprocess.csv') #df, attributes = preprocess.preprocess(df) attributes = list(df.columns.values)[1:] attributes.remove('DateTime') attributes.remove('PredDelay') # Initialise Regressors regressors = { 'gbr_reg': GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=1, random_state=0, loss='ls'), 'ada_reg': AdaBoostRegressor(DecisionTreeRegressor(max_depth=4), n_estimators=300, random_state=np.random.RandomState(1)) } # Initialise Classifiers classifiers = { 'svm_clf': svm.SVC(), 'bernolli_rbm_clf': BernoulliRBM(n_components=2), 'decision_tree_clf': tree.DecisionTreeClassifier() }
def model_training_regressor(X, Y, test_ratio, verbose_mode, name): X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=test_ratio, shuffle=False) if name == "MLP": model = MLPRegressor(hidden_layer_sizes=(200, 50), activation='relu', solver='adam', alpha=0.0002, batch_size='auto', learning_rate='adaptive', learning_rate_init=0.01, power_t=0.5, max_iter=10000, shuffle=True, random_state=None, tol=0.0001, verbose=verbose_mode, warm_start=False, momentum=0.9, nesterovs_momentum=True, early_stopping=False, validation_fraction=0.1, beta_1=0.9, beta_2=0.999, epsilon=1e-08, n_iter_no_change=10).fit(X_train, y_train) return get_model_performance(model, X_test, y_test) elif name == "NaiveBayes": model = linear_model.BayesianRidge().fit(X_train, y_train) return get_model_performance(model, X_test, y_test) elif name == "SVM": model = svm.LinearSVR(random_state=0, tol=1e-5).fit(X_train, y_train) return get_model_performance(model, X_test, y_test) elif name == "DT": model = tree.DecisionTreeRegressor().fit(X_train, y_train) return get_model_performance(model, X_test, y_test) elif name == "KNN": model = neighbors.KNeighborsRegressor().fit(X_train, y_train) return get_model_performance(model, X_test, y_test) elif name == "RandomForest": model = RandomForestRegressor().fit(X_train, y_train) return get_model_performance(model, X_test, y_test) elif name == "Adaboost": model = AdaBoostRegressor().fit(X_train, y_train) return get_model_performance(model, X_test, y_test) elif name == "GradientBoost": model = GradientBoostingRegressor().fit(X_train, y_train) return get_model_performance(model, X_test, y_test) else: ret = dict() print("no available model") return ret
def train(self): if self.config['method'] == 'regression': print('Building regression model') print('Fetching data') self.get_df_reg() print('Data Fetched') print('Splitting data') df_x = self.df_reg.iloc[:, 3:] df_y = self.df_reg.iloc[:, 1] x_train, x_test, y_train, y_test = train_test_split(df_x, df_y, test_size=0.2, random_state=1) print('Data splitted') print('Size of x_train', x_train.shape) print('Size of y_train', y_train.shape) print('Size of x_test', x_test.shape) print('Size of y_test', y_test.shape) if self.config['model'] == 'svr': print('Support vector regressor') model = SVR(kernel=self.config['svr_kernel']) if self.config['model'] == 'knr': print('K-nearest neighbors regressor') model = KNeighborsRegressor(n_jobs=12) if self.config['model'] == 'dtr': print('Decision tree regressor') model = DecisionTreeRegressor() if self.config['model'] == 'rf': print('Random forest regressor') model = RandomForestRegressor(n_jobs=12) if self.config['model'] == 'et': print('Extra trees regressor') model = ExtraTreesRegressor(n_jobs=12) if self.config['model'] == 'gbr': print('Gradient boosting regressor') model = GradientBoostingRegressor() try: model except BaseException: print('Invalid model configuration. Check config.ini') return model.fit(x_train, y_train) pred = pd.Series(model.predict(df_x)) self.df_reg.insert(2, 'Predicted_current', pred) print('R^2 score', model.score(x_test, y_test)) print('Converting to binary classification') y_test_list, y_pred_list, _, _ = self.to_bin_cl( x_test, y_test, model) _, _, bin_y, bin_y_pred = self.to_bin_cl(df_x, df_y, model) conf_mat = confusion_matrix(y_true=y_test_list, y_pred=y_pred_list) print('Converted to binary classification') self.df_reg.insert(3, 'Actual_class', bin_y) self.df_reg.insert(4, 'Predicted_class', bin_y_pred) print('Confusion matrix:\n', conf_mat) p = conf_mat[0][0] / (conf_mat[0][0] + conf_mat[1][0]) r = conf_mat[0][0] / (conf_mat[0][0] + conf_mat[0][1]) print( 'Accuracy is', np.sum(np.array(y_test_list) == y_pred_list) / len(y_pred_list)) print('Precision is', p) print('Recall is', r) print('F1-score is', self.get_f_score(p, r, 1)) print('F0.5-score is', self.get_f_score(p, r, 0.5)) print('F2-score is', self.get_f_score(p, r, 2)) # joblib.dump(model,'models/'+self.config['model']+'.model') self.save_result()
y = 3 * X[:, 0]**2 + 0.05 * np.random.randn(100) from sklearn.tree import DecisionTreeRegressor tree_reg1 = DecisionTreeRegressor(max_depth=2) tree_reg1.fit(X, y) y2 = y - tree_reg1.predict(X) tree_reg2 = DecisionTreeRegressor(max_depth=2) tree_reg2.fit(X, y2) y3 = y2 - tree_reg2.predict(X) tree_reg3 = DecisionTreeRegressor(max_depth=2) tree_reg3.fit(X, y3) X_new = np.array([[0.8]]) y_pred = sum(tree.predict(X_new) for tree in (tree_reg1, tree_reg2, tree_reg3)) from sklearn.ensemble import GradientBoostingRegressor grbt = GradientBoostingRegressor(max_depth=2, n_estimators=3, learning_rate=1) grbt.fit(X, y) grbt.predict(X_new) grbt = GradientBoostingRegressor(max_depth=2, n_estimators=120)
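# The last line above constructs a 120-tree regressor but never fits it. A common follow-up
# (a minimal sketch, assuming a held-out validation split X_val/y_val that is NOT part of the
# original snippet) is to fit once and scan staged_predict to find the best number of trees:
import numpy as np
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

# Hypothetical validation split for illustration only.
X_tr, X_val, y_tr, y_val = train_test_split(X, y, random_state=42)

grbt.fit(X_tr, y_tr)
# staged_predict yields predictions after 1, 2, ..., n_estimators trees.
errors = [mean_squared_error(y_val, y_pred) for y_pred in grbt.staged_predict(X_val)]
best_n = int(np.argmin(errors)) + 1

grbt_best = GradientBoostingRegressor(max_depth=2, n_estimators=best_n)
grbt_best.fit(X_tr, y_tr)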
# assign to training data and test data for yyyy, month, tmax, tmin in data: if tmax.startswith("Missing"): max_test_X.append([int(yyyy), month_dict[month], float(tmin)]) elif tmin.startswith("Missing"): min_test_X.append([int(yyyy), month_dict[month], float(tmax)]) else: max_train_X.append([int(yyyy), month_dict[month], float(tmin)]) max_train_y.append(float(tmax)) min_train_X.append([int(yyyy), month_dict[month], float(tmax)]) min_train_y.append(float(tmin)) # training gbr_max = GradientBoostingRegressor() gbr_max.fit(max_train_X, max_train_y) gbr_min = GradientBoostingRegressor() gbr_min.fit(min_train_X, min_train_y) # predict #print(max_train_X) #print(max_test_X) index_max = 0 index_min = 0 for yyyy, month, tmax, tmin in data: if tmax.startswith("Missing"): y = gbr_max.predict([max_test_X[index_max]]) print("%.1f" % y) index_max += 1
LinearRegressionModel = LinearRegressionModel.fit(train_X, train_y_ln)

RidgeModel = Ridge(normalize=True)
RidgeModel = RidgeModel.fit(train_X, train_y_ln)

LassoModel = Lasso().fit(train_X, train_y_ln)

# Non-linear models
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor

DecisionTreeModel = DecisionTreeRegressor().fit(train_X, train_y_ln)
RandomForestModel = RandomForestRegressor().fit(train_X, train_y_ln)
GradientBoostingModel = GradientBoostingRegressor().fit(train_X, train_y_ln)

f = open('./models/LinearRegressionModel.pkl', 'xb')
pickle.dump(LinearRegressionModel, f)
f.close()
f = open('./models/RidgeModel.pkl', 'xb')
pickle.dump(RidgeModel, f)
f.close()
f = open('./models/LassoModel.pkl', 'xb')
pickle.dump(LassoModel, f)
f.close()
f = open('./models/DecisionTreeModel.pkl', 'wb')
pickle.dump(DecisionTreeModel, f)
print("RMSE for Test data = "+str(RMSE_test_RF)) # In[86]: print(r2_score(y_train, pred_train_RF)) #train print(r2_score(y_test, pred_test_RF)) #test # # Gradient Boosting : # In[87]: fit_GB = GradientBoostingRegressor().fit(X_train, y_train) # In[88]: #prediction on train data pred_train_GB = fit_GB.predict(X_train) #prediction on test data pred_test_GB = fit_GB.predict(X_test) # In[89]:
### 2.RF
# Random Forest generally gets its best results with max_features set near the square root of the number of features.
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification

rf = RandomForestClassifier(max_depth=2, random_state=0)
rf.fit(train_x, train_y)
y_pre = rf.predict(val_x)
y_pre[y_pre > 0.5] = 1
y_pre[y_pre < 0.5] = 0

### 3.GBDT
from sklearn.ensemble import GradientBoostingRegressor
gbdt = GradientBoostingRegressor(loss='ls', learning_rate=0.1, n_estimators=100, max_depth=3)
gbdt.fit(train_x, train_y)
y_pre = gbdt.predict(val_x)
y_pre[y_pre > 0.5] = 1
y_pre[y_pre < 0.5] = 0

### 4.knn
from sklearn import neighbors
knn = neighbors.KNeighborsClassifier(n_neighbors=8, leaf_size=30, p=3)
knn.fit(x, y)

### 5.svm
# http://blog.csdn.net/u013709270/article/details/53365744 (multi-class classification)
X_train, X_test, y_train, y_test = train_test_split(x_df, y_df, test_size=0.2, random_state=0)

categorical = ['normalizeHolidayName', 'isPaidTimeOff']
numerical = ['vendorID', 'passengerCount', 'tripDistance', 'hour_of_day', 'day_of_week',
             'day_of_month', 'month_num', 'snowDepth', 'precipTime', 'precipDepth', 'temperature']

numeric_transformations = [([f], Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])) for f in numerical]

categorical_transformations = [([f], OneHotEncoder(handle_unknown='ignore', sparse=False)) for f in categorical]

transformations = numeric_transformations + categorical_transformations

clf = Pipeline(steps=[('preprocessor', DataFrameMapper(transformations)),
                      ('regressor', GradientBoostingRegressor())])

clf.fit(X_train, y_train)

y_predict = clf.predict(X_test)
y_actual = y_test.values.flatten().tolist()
rmse = math.sqrt(mean_squared_error(y_actual, y_predict))
print('The RMSE score on test data for GradientBoostingRegressor: ', rmse)

# ## Global Explanation Using TabularExplainer
#
# **Global Model Explanation** is a holistic understanding of how the model makes decisions. It provides you with insights on what features are most important and their relative strengths in making model predictions.
#
# [TabularExplainer](https://docs.microsoft.com/en-us/python/api/azureml-explain-model/azureml.explain.model.tabularexplainer?view=azure-ml-py) uses one of three explainers: TreeExplainer, DeepExplainer, or KernelExplainer, and automatically selects the most appropriate one for our use case. You can learn more about the underlying model explainers at [Azure Model Interpretability](https://docs.microsoft.com/en-us/azure/machine-learning/service/machine-learning-interpretability-explainability).
#
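# A minimal sketch of the global-explanation step described above, not the notebook's exact code.
# The import path depends on the installed interpretability package (interpret-community vs. the
# older azureml-explain-model), so treat the import and arguments as assumptions.
from interpret.ext.blackbox import TabularExplainer

# Explain the underlying regressor, passing the same sklearn-pandas transformations so the
# explanation is expressed in terms of the raw input columns.
explainer = TabularExplainer(clf.steps[-1][1],
                             initialization_examples=X_train,
                             features=X_train.columns,
                             transformations=transformations)

# Compute a global explanation on the held-out data and list overall feature importances.
global_explanation = explainer.explain_global(X_test)
print(global_explanation.get_feature_importance_dict())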
cv=kf)) return (rmse) #LASSO Regression: lasso = make_pipeline(RobustScaler(), Lasso(alpha=0.0005, random_state=1)) #Elastic Net Regression ENet = make_pipeline(RobustScaler(), ElasticNet(alpha=0.0005, l1_ratio=.9, random_state=3)) #Kernel Ridge Regression : KRR = KernelRidge(alpha=0.6, kernel='polynomial', degree=2, coef0=2.5) #Gradient Boosting Regression: GBoost = GradientBoostingRegressor(n_estimators=3000, learning_rate=0.05, max_depth=4, max_features='sqrt', min_samples_leaf=15, min_samples_split=10, loss='huber', random_state=5) #XGBoost: model_xgb = xgb.XGBRegressor(colsample_bytree=0.4603, gamma=0.0468, learning_rate=0.05, max_depth=3, min_child_weight=1.7817, n_estimators=2200, reg_alpha=0.4640, reg_lambda=0.8571, subsample=0.5213, silent=1, random_state=7,
def BuildGBRT(train_samples, dev_samples, test_samples, model_path, n_calls, cast_to_zero=True, optimizer='gp', measurement_time='day', measurement_unit='$m^3/s$'): if not os.path.exists(model_path): os.makedirs(model_path) sMin = train_samples.min(axis=0) sMax = train_samples.max(axis=0) norm = pd.concat([sMax, sMin], axis=1) norm = pd.DataFrame(norm.values, columns=['sMax', 'sMin'], index=train_samples.columns.values) norm.to_csv(model_path + 'norm.csv') joblib.dump(norm, model_path + 'norm.pkl') train_samples = 2 * (train_samples - sMin) / (sMax - sMin) - 1 dev_samples = 2 * (dev_samples - sMin) / (sMax - sMin) - 1 test_samples = 2 * (test_samples - sMin) / (sMax - sMin) - 1 cal_samples = pd.concat([train_samples, dev_samples], axis=0) cal_samples = cal_samples.sample(frac=1) train_y = train_samples['Y'] train_x = train_samples.drop('Y', axis=1) dev_y = dev_samples['Y'] dev_x = dev_samples.drop('Y', axis=1) test_y = test_samples['Y'] test_x = test_samples.drop('Y', axis=1) cal_y = cal_samples['Y'] cal_x = cal_samples.drop('Y', axis=1) predictor_columns = list(train_x.columns) joblib.dump(predictor_columns, model_path + 'predictor_columns.pkl') # Get the feature num n_features = cal_x.shape[1] reg = GradientBoostingRegressor(n_estimators=100, random_state=0) # The list hyper-parameters we want space = [ Integer(1, 25, name='max_depth'), Real(10**-5, 10**0, 'log-uniform', name='learning_rate'), Integer(1, n_features, name='max_features'), Integer(2, 100, name='min_samples_split'), Integer(1, 100, name='min_samples_leaf'), ] @use_named_args(space) def objective(**params): reg.set_params(**params) return -np.mean( cross_val_score(reg, cal_x, cal_y, cv=10, n_jobs=-1, scoring='neg_mean_squared_error')) start = time.process_time() if optimizer == 'gp': res = gp_minimize(objective, space, n_calls=n_calls, random_state=0, verbose=True, n_jobs=-1) elif optimizer == 'fr_bt': res = forest_minimize(objective, space, n_calls=n_calls, base_estimator='ET', random_state=0, verbose=True, n_jobs=-1) elif optimizer == 'fr_rf': res = forest_minimize(objective, space, n_calls=n_calls, base_estimator='RF', random_state=0, verbose=True, n_jobs=-1) elif optimizer == 'dm': res = dummy_minimize(objective, space, n_calls=n_calls) end = time.process_time() time_cost = end - start dump(res, model_path + 'tune_history.pkl', store_objective=False) # returned_results = load(model_path+'tune_history.pkl') DIMENSION_GBRT = [ 'max depth', 'learning rate', 'max features', 'min samples split', 'min samples leaf' ] plot_objective_(res, dimensions=DIMENSION_GBRT, fig_savepath=model_path + 'objective.png') plot_evaluations_(res, dimensions=DIMENSION_GBRT, fig_savepath=model_path + 'evaluation.png') plot_convergence_(res, fig_savepath=model_path + 'convergence.png') # logger.info('Best score=%.4f'%res.fun) # logger.info("""Best parameters: # - max_depth=%d # - learning_rate=%.6f # - max_features=%d # - min_samples_split=%d # - min_samples_leaf=%d""" % (res.x[0], res.x[1], res.x[2], res.x[3], # res.x[4])) # logger.info('Time cost:{}'.format(time_cost)) params_dict = { 'max_depth': res.x[0], 'learning_rate': res.x[1], 'max_features': res.x[2], 'min_samples_split': res.x[3], 'min_samples_leaf': res.x[4], 'time_cost': time_cost, 'n_calls': n_calls, } params_df = pd.DataFrame(params_dict, index=[0]) params_df.to_csv(model_path + 'optimized_params.csv') GBR = GradientBoostingRegressor(max_depth=res.x[0], learning_rate=res.x[1], max_features=res.x[2], min_samples_split=res.x[3], min_samples_leaf=res.x[4]).fit( cal_x, cal_y) 
joblib.dump(GBR, model_path + 'model.pkl') GBR = joblib.load(model_path + 'model.pkl') train_predictions = GBR.predict(train_x) dev_predictions = GBR.predict(dev_x) test_predictions = GBR.predict(test_x) train_y = (train_y.values).flatten() dev_y = (dev_y.values).flatten() test_y = (test_y.values).flatten() sMax = sMax[sMax.shape[0] - 1] sMin = sMin[sMin.shape[0] - 1] train_y = np.multiply(train_y + 1, sMax - sMin) / 2 + sMin dev_y = np.multiply(dev_y + 1, sMax - sMin) / 2 + sMin test_y = np.multiply(test_y + 1, sMax - sMin) / 2 + sMin train_predictions = np.multiply(train_predictions + 1, sMax - sMin) / 2 + sMin dev_predictions = np.multiply(dev_predictions + 1, sMax - sMin) / 2 + sMin test_predictions = np.multiply(test_predictions + 1, sMax - sMin) / 2 + sMin if cast_to_zero: train_predictions[train_predictions < 0.0] = 0.0 dev_predictions[dev_predictions < 0.0] = 0.0 test_predictions[test_predictions < 0.0] = 0.0 dump_pred_results( path=model_path + 'opt_pred.csv', train_y=train_y, train_predictions=train_predictions, dev_y=dev_y, dev_predictions=dev_predictions, test_y=test_y, test_predictions=test_predictions, time_cost=time_cost, ) plot_rela_pred(train_y, train_predictions, measurement_time=measurement_time, measurement_unit=measurement_unit, fig_savepath=model_path + 'TRAIN-PRED.png') plot_rela_pred(dev_y, dev_predictions, measurement_time=measurement_time, measurement_unit=measurement_unit, fig_savepath=model_path + "DEV-PRED.png") plot_rela_pred(test_y, test_predictions, measurement_time=measurement_time, measurement_unit=measurement_unit, fig_savepath=model_path + "TEST-PRED.png") plot_error_distribution(test_y, test_predictions, fig_savepath=model_path + "TEST-ERROR-DSTRI.png") plt.show() plt.close('all')
oof_test = np.zeros((ntest, ))
oof_test_skf = np.empty((NFOLDS, ntest))

for i, (train_index, test_index) in enumerate(kf):
    x_tr = x_train[train_index]
    y_tr = y_train[train_index]  # take 4 of the 5 training folds to fit the model
    x_te = x_train[test_index]   # the remaining fold is used for prediction
    clf.fit(x_tr, y_tr)
    # predict the held-out training fold (the other 4 folds trained the model);
    # after 5 iterations every training sample has an out-of-fold prediction
    oof_train[test_index] = clf.predict(x_te)
    oof_test_skf[i, :] = clf.predict(x_test)  # predict the full test set with this fold's model

oof_test[:] = oof_test_skf.mean(axis=0)  # each fold model predicted the test set once; take the mean of the 5 predictions
return oof_train.reshape(-1, 1), oof_test.reshape(-1, 1)

et = ExtraTreeRegressor()
rr = RandomForestRegressor()
NN = KNeighborsRegressor()

x_train = train
x_test = test

et_oof_train, et_oof_test = get_oof(et, x_train, y_train, x_test)  # Extra Trees
rr_oof_train, rr_oof_test = get_oof(rr, x_train, y_train, x_test)
nn_oof_train, nn_oof_test = get_oof(NN, x_train, y_train, x_test)

x_train = np.concatenate((et_oof_train, rr_oof_train, nn_oof_train), axis=1)
x_test = np.concatenate((et_oof_test, rr_oof_test, nn_oof_test), axis=1)

gb = GradientBoostingRegressor().fit(x_train, y_train)
predictions = gb.predict(x_test)
def gbdt(x_train, y_train, x_test):
    model = GradientBoostingRegressor()
    model.fit(x_train, y_train)  # fit the gradient boosting model
    predicted = model.predict(x_test)
    return predicted
#param_grid = {
#    'loss': ['ls', 'lad', 'huber'],
#    'learning_rate': [0.01, 0.02, 0.05, 0.1, 0.2],
#    'n_estimators': [100, 200, 400, 800, 1000],
#    'max_depth': [3, 4, 5, 6],
#    'alpha': [0.7, 0.8, 0.9]}
##fit_params = {'categorical_feature':[2,3,4,5,6]}
#
#gbm = GridSearchCV(gbdt, param_grid)
#
#gbm.fit(X_train, y_train)
#print('Best parameters found by grid search are:', gbm.best_params_)

## Train the model
gbdt = GradientBoostingRegressor(loss='ls', learning_rate=0.2, n_estimators=1000, subsample=1,
                                 min_samples_split=2, min_samples_leaf=1, max_depth=3, alpha=0.7,
                                 verbose=0)
gbdt.fit(X_train, y_train)

# Plot the feature importance distribution
feature_importance = gbdt.feature_importances_
plt.figure()
plt.scatter(np.arange(1, len(feature_importance) + 1), feature_importance, c='r', zorder=10)
plt.plot(np.arange(1, len(feature_importance) + 1), feature_importance)
plt.xlabel('Feature index')
plt.ylabel('Feature importance')

## Show the fit on the training portion
plt.figure()
filename = "blogData_train.csv" train_data = pd.read_csv(filename, header=None) #train_data = train_data.iloc[np.random.permutation(len(train_data))] train_output = train_data[len(train_data.columns) - 1] del train_data[len(train_data.columns) - 1] filename = "blogData_test-2012.02.01.00_00.csv" test_data = pd.read_csv(filename, header=None) #test_data = test_data.iloc[np.random.permutation(len(test_data))] test_output = test_data[len(test_data.columns) - 1] del test_data[len(test_data.columns) - 1] reg = LinearRegression() rf = RandomForestRegressor() gradBoost = GradientBoostingRegressor() ada = AdaBoostRegressor() #n_estimators=500 regressors = [reg, rf, gradBoost, ada] regressor_names = [ "Linear Regression", "Random Forests", "Gradient Boosting", "Adaboost" ] #regressors = ada #regressor_names = "Adaboost" for regressor, regressor_name in zip(regressors, regressor_names): regressor.fit(train_data, train_output)
@author: sandra_chang
"""
from sklearn import datasets, metrics
from sklearn.model_selection import train_test_split, KFold, RandomizedSearchCV
from sklearn.ensemble import GradientBoostingRegressor
from scipy.stats import uniform
import numpy as np

wine = datasets.load_wine()

# Split into training/test sets
x_train, x_test, y_train, y_test = train_test_split(wine.data, wine.target, test_size=0.25, random_state=4)

# Build the model
clf = GradientBoostingRegressor(random_state=7)
clf.fit(x_train, y_train)
y_pred = clf.predict(x_test)
print(metrics.mean_squared_error(y_test, y_pred))

# Set up the hyperparameter combinations to search
n_estimators = np.arange(20, 200, 20)
max_depth = np.arange(1, 7)
#param_grid = dict(C=uniform(loc=0, scale=4), penalty=['l2', 'l1'])
param_grid = dict(n_estimators=n_estimators, max_depth=max_depth)

## Build the search object with the model and the parameter dictionary (n_jobs=-1 uses all CPU cores in parallel)
random_search = RandomizedSearchCV(clf, param_grid, scoring="neg_mean_squared_error", n_jobs=-1, verbose=1, cv=3)
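# The snippet stops after constructing the search object. A typical continuation (a sketch,
# not necessarily the original script's next lines) is to run the search, inspect the best
# hyperparameters, and refit with them:
random_search.fit(x_train, y_train)
print("Best (negated) MSE:", random_search.best_score_)
print("Best params:", random_search.best_params_)

# Refit a regressor with the best hyperparameters and re-evaluate on the test set.
clf_best = GradientBoostingRegressor(random_state=7, **random_search.best_params_)
clf_best.fit(x_train, y_train)
print(metrics.mean_squared_error(y_test, clf_best.predict(x_test)))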
def development_models_pred_test(self, plotTest): ''' plotTest: Parameter to make the distribution plot of test or not This function develops the models and makes the predictions ''' def __get_mape(y_true, y_pred): """ Compute mean absolute percentage error (MAPE) """ y_true, y_pred = np.array(y_true), np.array(y_pred) return round(np.mean(np.abs((y_true - y_pred) / y_true)) * 100, 4) estimatorXGB = { 'random_state': [22], 'max_depth': stats.randint(1, 100), 'max_leaves': stats.randint(1, 100), 'learning_rate': stats.uniform(0.1, 0.8), 'min_child_weight': stats.randint(1, 100), 'subsample': stats.uniform(0.1, 1), 'n_estimators': stats.randint(1, 100) } model_xgbRandomSearch = RandomizedSearchCV( XGBRegressor(), estimatorXGB, scoring='neg_mean_squared_error', n_jobs=5, cv=5, random_state=22).fit(self.X_train_scaled, self.y_train_scaled) estimatorLGB = { 'random_state': [22], 'max_depth': stats.randint(1, 50), 'num_leaves': stats.randint(1, 25), 'max_leaves': stats.randint(1, 25), 'learning_rate': stats.uniform(0.1, 1), 'min_child_weight': stats.randint(1, 50), 'subsample': stats.uniform(0.1, 1), 'n_estimators': stats.randint(1, 100) } model_lgbRandomSearch = RandomizedSearchCV(LGBMRegressor(), estimatorLGB, scoring='r2', n_jobs=5, cv=5, random_state=22).fit( self.X_train_scaled, self.y_train_scaled) estimatorLR = {'n_jobs': stats.randint(1, 5)} model_lrRandomSearch = RandomizedSearchCV(LinearRegression(), estimatorLR, scoring='r2', n_jobs=5, cv=5, random_state=22).fit( self.X_train_scaled, self.y_train_scaled) estimatorGradBoost = { 'random_state': [22], 'n_estimators': stats.randint(1, 100), 'max_depth': stats.randint(1, 50), 'learning_rate': stats.uniform(0.1, 1), 'min_weight_fraction_leaf': stats.uniform(0.1, 1), 'min_samples_split': stats.randint(1, 100) } model_GradBoostRandomSearch = RandomizedSearchCV( GradientBoostingRegressor(), estimatorGradBoost, scoring='neg_mean_squared_error', n_jobs=5, cv=5, random_state=22).fit(self.X_train_scaled, self.y_train_scaled) estimatorRandForest = { 'random_state': [22], 'n_estimators': stats.randint(1, 100), 'max_depth': stats.randint(1, 50), 'min_samples_split': stats.randint(1, 100), 'min_samples_leaf': stats.randint(1, 100), 'max_leaf_nodes': stats.randint(1, 100) } model_RandForestRandomSearch = RandomizedSearchCV( RandomForestRegressor(), estimatorRandForest, scoring='neg_mean_squared_error', n_jobs=5, cv=5, random_state=22).fit(self.X_train_scaled, self.y_train_scaled) pred_train_xgb_scaled = model_xgbRandomSearch.predict( self.X_train_scaled) pred_train_xgb = pred_train_xgb_scaled * math.sqrt( self.scaler.var_[0]) + self.scaler.mean_[0] pred_train_lgb_scaled = model_lgbRandomSearch.predict( self.X_train_scaled) pred_train_lgb = pred_train_lgb_scaled * math.sqrt( self.scaler.var_[0]) + self.scaler.mean_[0] pred_train_lr_scaled = model_lrRandomSearch.predict( self.X_train_scaled) pred_train_lr = pred_train_lr_scaled * math.sqrt( self.scaler.var_[0]) + self.scaler.mean_[0] pred_train_GradBoost_scaled = model_GradBoostRandomSearch.predict( self.X_train_scaled) pred_train_GradBoost = pred_train_GradBoost_scaled * math.sqrt( self.scaler.var_[0]) + self.scaler.mean_[0] pred_train_RandForest_scaled = model_RandForestRandomSearch.predict( self.X_train_scaled) pred_train_RandForest = pred_train_RandForest_scaled * math.sqrt( self.scaler.var_[0]) + self.scaler.mean_[0] models = [ model_xgbRandomSearch, model_lgbRandomSearch, model_lrRandomSearch, model_GradBoostRandomSearch, model_RandForestRandomSearch ] namesModels = ['xgb', 'lgb', 'lr', 
'GradBoost', 'RandForest'] def __pred_test(test, models, namesModels): for i, model in enumerate(models): var = 'pred_' + namesModels[i] pred = model.predict(self.X_test_scaled) test[var + '_scaled'] = pred test[var] = test[var + '_scaled'] * test[ self.varPredict + '_std'] + test[self.varPredict + '_mean'] test.drop([var + '_scaled'], axis=1, inplace=True) return test test_copy = self.test.copy() test_copy = __pred_test(test_copy, models, namesModels) varsPred = [ elem for elem in test_copy.columns if elem.__contains__('pred') ] test_copy['pred_ensamble'] = test_copy[varsPred].mean(axis=1) dfMetricsTrainTest = pd.DataFrame({ 'model': namesModels, 'RMSE': [ round( math.sqrt(mean_squared_error(self.y_train, pred_train_xgb)), 3), round( math.sqrt(mean_squared_error(self.y_train, pred_train_lgb)), 3), round( math.sqrt(mean_squared_error(self.y_train, pred_train_lr)), 3), round( math.sqrt( mean_squared_error(self.y_train, pred_train_GradBoost)), 3), round( math.sqrt( mean_squared_error(self.y_train, pred_train_RandForest)), 3) ], 'MAPE (%)': [ __get_mape(self.y_train, pred_train_xgb), __get_mape(self.y_train, pred_train_lgb), __get_mape(self.y_train, pred_train_lr), __get_mape(self.y_train, pred_train_GradBoost), __get_mape(self.y_train, pred_train_RandForest) ], 'RMSE_pred_test': [ round( math.sqrt( mean_squared_error(test_copy[[self.varPredict]], test_copy[['pred_xgb']])), 3), round( math.sqrt( mean_squared_error(test_copy[[self.varPredict]], test_copy[['pred_lgb']])), 3), round( math.sqrt( mean_squared_error(test_copy[[self.varPredict]], test_copy[['pred_lr']])), 3), round( math.sqrt( mean_squared_error(test_copy[[self.varPredict]], test_copy[['pred_GradBoost']])), 3), round( math.sqrt( mean_squared_error(test_copy[[self.varPredict]], test_copy[['pred_RandForest']])), 3) ], 'MAPE_pred_test (%)': [ __get_mape(test_copy[[self.varPredict]], test_copy[['pred_xgb']]), __get_mape(test_copy[[self.varPredict]], test_copy[['pred_lgb']]), __get_mape(test_copy[[self.varPredict]], test_copy[['pred_lr']]), __get_mape(test_copy[[self.varPredict]], test_copy[['pred_GradBoost']]), __get_mape(test_copy[[self.varPredict]], test_copy[['pred_RandForest']]) ] }) if plotTest == True: rcParams['figure.figsize'] = 10, 8 # width 10, height 8 ax = test_copy.plot(x='Date', y=[self.varPredict] + varsPred + ['pred_ensamble'], style=['g-', 'y-', 'b-'], grid=True) ax.legend(['test'] + varsPred + ['pred_ensamble']) ax.set_xlabel("Date") ax.set_ylabel("USD") ax.set_title("Zoom in to test set") fileSave = self.path + '/' + self.idYahoo + '/output/metricas/metrics_train_' + self.varPredict + '_ml.csv' dfMetricsTrainTest.to_csv(fileSave, sep=';', index=False) self.testPred, self.dfMetricsTrainTest, self.models = test_copy, dfMetricsTrainTest, models trainingModel_ml.__saveModels(self, models, self.scaler, self.path, self.idYahoo)
model = GridSearchCV(SVR(kernel='rbf'), cv=5, param_grid={"C": c_param, "gamma": gamma_param}) model_name = "SVR" elif selected_model == Model.KRR: model = GridSearchCV(KernelRidge(kernel='rbf', gamma=0.1), cv=5, param_grid={"alpha": [1e0, 0.1, 1e-2, 1e-3], "gamma": numpy.logspace(-2, 2, 5)}) model_name = "KRR" elif selected_model == Model.REGRESSION_TREE: model = DecisionTreeRegressor(criterion="mse") model_name = "REGRESSION_TREE" elif selected_model == Model.RANDOM_FOREST: model = RandomForestRegressor(criterion="mse", n_estimators=20, min_samples_split=4, min_weight_fraction_leaf=0.01) model_name = "FOREST" elif selected_model == Model.EXTRA_TREE_REGRESSOR: model = ExtraTreesRegressor(criterion="mse") model_name = "EXTRA_TREE_REGRESSOR" elif selected_model == Model.GRADIENT_BOOSTING_REGRESSOR: model = GradientBoostingRegressor(loss="lad", n_estimators=200) model_name = "GRADIENT_BOOSTING_REGRESSOR" elif selected_model == Model.BAGGING_REGRESSOR: model = BaggingRegressor(oob_score=True) model_name = "BAGGING_REGRESSOR" elif selected_model == Model.ADABOOST_REGRESSOR: model = AdaBoostRegressor(loss="linear") model_name = "ADABOOST_REGRESSOR" else: Support.colored_print("No method selected!", "red") sys.exit(0) Support.colored_print("Method selected: " + model_name, "green") Support.colored_print("Training...", "green") t0 = time.time() model.fit(X[:train_size], y[:train_size])
print(features_40_percent_sale_price_corr)
model_eval_helper(features_40_percent_sale_price_corr, LinearRegression())

# Again, we should have scaled our data before we trained our linear regression model. But since we won't use linear regression models from now on, we will skip scaling the data.

# ### GradientBoostingRegressor
#
# Now we'll use GradientBoostingRegressor as a more sophisticated model. Ensemble methods like GradientBoostingRegressor usually perform extremely well in Kaggle competitions. Furthermore, we don't have to worry about feature scaling or having too many features.

# In[92]:

from sklearn.ensemble import GradientBoostingRegressor

# In[93]:

reg = GradientBoostingRegressor(n_estimators=200, max_depth=2)
reg

# In[94]:

model_eval_helper(final_features, model=reg)

# In[95]:

df_importances = pd.DataFrame(reg.feature_importances_, index=final_features, columns=["Importance"])
df_importances.sort_values("Importance", ascending=False, inplace=True)
print(df_importances)
'SVR': { 'model': SVR(), 'param': { 'clf__C': [0.1, 1, 10, 100], 'clf__gamma': [1, 0.1, 0.01, 0.001], 'clf__kernel': ['rbf', 'poly', 'sigmoid'], }, }, # 'XGB':{ "model":XGBRegressor(), # "param":{"clf__learning_rate": [0.05,1,5],'clf__n_estimators': [100,50], # # "clf__max_depth": [5,10,15] # }, # }, 'GradientBoost': { "model": GradientBoostingRegressor(), "param": { "clf__model__n_estimators": [500, 600, 700, 800, 1000], # "clf__max_depth": [2, 3, 4] }, }, 'decisionTree': { "model": GradientBoostingRegressor(), "param": { "clf__criterion": ['mse', 'mae'], 'clf__min_samples_leaf': [5, 10, 15, 20, 25], 'clf__max_depth': [6, 9, 12, 15, 20], }, }, }
Y_train, cv=kfold, scoring=scoring) results.append(cv_results) names.append(name) msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std()) print(msg) # ensembles ensembles = [] ensembles.append(('ScaledAB', Pipeline([('Scaler', StandardScaler()), ('AB', AdaBoostRegressor())]))) ensembles.append(('ScaledGBM', Pipeline([('Scaler', StandardScaler()), ('GBM', GradientBoostingRegressor())]))) ensembles.append(('ScaledRF', Pipeline([('Scaler', StandardScaler()), ('RF', RandomForestRegressor())]))) ensembles.append(('ScaledET', Pipeline([('Scaler', StandardScaler()), ('ET', ExtraTreesRegressor())]))) results = [] names = [] for name, model in ensembles: kfold = KFold(n_splits=num_folds, random_state=seed) cv_results = cross_val_score(model, X_train, Y_train, cv=kfold, scoring=scoring)
import numpy as np
import matplotlib.pyplot as plt

import sklearn
from random import shuffle
from sklearn.datasets import fetch_california_housing
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble.partial_dependence import plot_partial_dependence
from sklearn.linear_model import LinearRegression

# Download data
tmp = fetch_california_housing()

num_samples = tmp['data'].shape[0]
feature_names = tmp['feature_names']
y = tmp['target']
X = tmp['data']

clf = GradientBoostingRegressor(loss="ls")
clf.fit(X, y)

plt.close("all")
plt.figure(figsize=[10, 10])
ax = plt.gca()
plot_partial_dependence(clf, X, feature_names, feature_names, n_cols=3, ax=ax)
plt.tight_layout()
plt.show()

clf2 = LinearRegression()
clf2.fit(X, y)

MSE_boosting = np.mean((y - clf.predict(X))**2)
MSE_LR = np.mean((y - clf2.predict(X))**2)
dat1 = df.loc[:, ['CRIM', 'ZN', 'INDUS', 'NOX', 'RM', 'AGE', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT']]

X_train, X_test, y_train, y_test = train_test_split(dat1, target, test_size=0.2, random_state=42)
y_train = y_train.values.ravel()

models = []
models.append(('SVR', SVR()))
models.append(('KNN', KNeighborsRegressor()))
models.append(('DT', DecisionTreeRegressor()))
models.append(('RF', RandomForestRegressor()))
models.append(('l', Lasso()))
models.append(('EN', ElasticNet()))
models.append(('R', Ridge()))
models.append(('BR', BayesianRidge()))
models.append(('GBR', GradientBoostingRegressor()))
models.append(('AB', AdaBoostRegressor()))
models.append(('ET', ExtraTreesRegressor()))
models.append(('BgR', BaggingRegressor()))

scoring = 'neg_mean_squared_error'
results = []
names = []
for name, model in models:
    kfold = model_selection.KFold(n_splits=10, random_state=42)
    cv_results = model_selection.cross_val_score(model, X_train, y_train, cv=kfold, scoring=scoring)
    results.append(cv_results)
    names.append(name)
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)
def gradient_boosting(self):
    model = GradientBoostingRegressor()
    return self.fiting_model(model)
data = data.drop(['origin','destination','train_type','train_class','fare'],1) data = pd.concat([one_hot_encoding, data], axis=1) data = data.astype(np.float) X = data.iloc[:, [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20 ,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38, 39,40,41,42,43,44,46,47,48,49,50,51]] Y = data.iloc[:, [45]] #split X_train1, X_test1, y_train1, y_test1 = train_test_split(X.values, Y.values, test_size=.9, random_state=42) X_train, X_test, y_train, y_test = train_test_split(X_train1, y_train1, test_size=.1, random_state=42) #model model = GradientBoostingRegressor(n_estimators=100) model.fit(X_train, y_train) y_pred = model.predict(X_test) mse = mean_squared_error(y_test, y_pred) print(mse) R2 = r2_score(y_test, y_pred) print(R2) score_train = model.score(X_train, y_train) print(score_train) score_test = model.score(X_test,y_test) print(score_test) #learning_curve #thanks to scikit-learn for the https://scikit-learn.org/stable/auto_examples/model_selection/plot_learning_curve.html cv = ShuffleSplit(n_splits=10, test_size=0.1, random_state=42) def plot_curve():
    else:
        break

selected_features_BE = cols
X = df[selected_features_BE]

X = df[["CRuns", "OrtCWalks", "CWalks"]]

from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X = scaler.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=46)

gbm_cv_model_best_params_ = {
    'learning_rate': 0.01,
    'loss': 'lad',
    'max_depth': 5,
    'n_estimators': 500,
    'subsample': 0.5
}

gbm_tuned = GradientBoostingRegressor(**gbm_cv_model_best_params_).fit(X_train, y_train)
y_pred = gbm_tuned.predict(X_test)
gbm_final = np.sqrt(mean_squared_error(y_test, y_pred))
print(gbm_final)

import pickle
pickle.dump(gbm_tuned, open('regression_model.pkl', 'wb'))
print("Model saved")
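# Since the snippet ends by pickling the tuned model, a short usage sketch (assuming the same
# 'regression_model.pkl' file and features scaled with the same StandardScaler) shows how the
# saved regressor would be loaded back for prediction:
import pickle

with open('regression_model.pkl', 'rb') as f:
    loaded_model = pickle.load(f)

# New inputs must be transformed with the scaler fitted at training time before predicting.
print(loaded_model.predict(X_test)[:5])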