def main(param=""): # obtain config if isinstance(param, str): param = JobConfig.load_from_file(param) data_guest = param["data_guest"] data_host = param["data_host"] idx = param["idx"] label_name = param["label_name"] # prepare data df_guest = pd.read_csv(data_guest, index_col=idx) df_host = pd.read_csv(data_host, index_col=idx) df = df_guest.join(df_host, rsuffix='host') y = df[label_name] X = df.drop(label_name, axis=1) clf = GradientBoostingRegressor(random_state=0, n_estimators=50, learning_rate=0.1) clf.fit(X, y) y_predict = clf.predict(X) result = { "mean_absolute_error": mean_absolute_error(y, y_predict), } print(result) return {}, result
def test_gradient_boosting_estimator_with_smooth_quantile_loss():
    """End-to-end check that a Booster trained with SmoothQuantileLossFunction
    beats sklearn's GradientBoostingRegressor(loss='quantile') on synthetic
    lognormal data — lower quantile loss, higher R^2 — and that its empirical
    coverage is close to the requested quantile level.

    NOTE: the sequence of np.random calls after seed(0) is order-sensitive;
    do not reorder them.
    """
    np.random.seed(0)
    m = 15000  # number of samples
    n = 10     # number of features
    p = .8     # target quantile level
    X = np.random.normal(size=(m,n))
    beta = np.random.normal(size=n)
    mu = np.dot(X, beta)
    # Lognormal response driven by a linear signal in X.
    y = np.random.lognormal(mu)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.33333333333333)
    # Smoothed quantile loss for training; exact quantile loss for evaluation.
    loss_function = SmoothQuantileLossFunction(1, p, .0001)
    q_loss = QuantileLossFunction(1, p)
    model = Booster(BaggingRegressor(Earth(max_degree=2, verbose=False, use_fast=True, max_terms=10)),
                    loss_function, n_estimators=150,
                    stopper=stop_after_n_iterations_without_percent_improvement_over_threshold(3, .01),
                    verbose=True)
    # Predicting before fit must raise NotFittedError.
    assert_raises(NotFittedError, lambda : model.predict(X_train))
    model.fit(X_train, y_train)
    prediction = model.predict(X_test)
    # Reference model: sklearn's quantile GBM at the same alpha.
    model2 = GradientBoostingRegressor(loss='quantile', alpha=p)
    model2.fit(X_train, y_train)
    prediction2 = model2.predict(X_test)
    # The smooth-quantile Booster should dominate on both metrics.
    assert_less(q_loss(y_test, prediction), q_loss(y_test, prediction2))
    assert_greater(r2_score(y_test,prediction), r2_score(y_test,prediction2))
    # Empirical coverage should be within 5 points of the nominal level p.
    q = np.mean(y_test <= prediction)
    assert_less(np.abs(q-p), .05)
    assert_greater(model.score_, 0.)
    assert_approx_equal(model.score(X_train, y_train), model.score_)
def main(config="../../config.yaml", param="./gbdt_config_reg.yaml"): # obtain config if isinstance(param, str): param = JobConfig.load_from_file(param) data_guest = param["data_guest"] data_host = param["data_host"] idx = param["idx"] label_name = param["label_name"] print('config is {}'.format(config)) if isinstance(config, str): config = JobConfig.load_from_file(config) data_base_dir = config["data_base_dir"] print('data base dir is', data_base_dir) else: data_base_dir = config.data_base_dir # prepare data df_guest = pd.read_csv(os.path.join(data_base_dir, data_guest), index_col=idx) df_host = pd.read_csv(os.path.join(data_base_dir, data_host), index_col=idx) df = df_guest.join(df_host, rsuffix='host') y = df[label_name] X = df.drop(label_name, axis=1) clf = GradientBoostingRegressor(random_state=0, n_estimators=50) clf.fit(X, y) y_predict = clf.predict(X) result = {"mean_absolute_error": mean_absolute_error(y, y_predict)} print(result) return {}, result
def run(self):
    """Collect GradientBoostingRegressor hyper-parameters from the dialog widgets.

    Returns the parameter dict and the subset that differs from sklearn's
    defaults (as computed by self.getChangedValues).
    """
    # Map the human-readable loss label to sklearn's identifier; an
    # unrecognized label passes through unchanged.
    loss_names = {
        'Least Squares': 'ls',
        'Least Absolute Deviation': 'lad',
        'Huber': 'huber',
        'Quantile': 'quantile',
    }
    loss = self.lossComboBox.currentText()
    loss = loss_names.get(loss, loss)

    params = {
        'loss': loss,
        'learning_rate': self.learningDoubleSpinBox.value(),
        'n_estimators': self.numEstSpinBox.value(),
        'subsample': self.subsampleDoubleSpinBox.value(),
        'criterion': 'friedman_mse',
        'min_samples_split': self.min_n_splitSpinBox.value(),
        'min_samples_leaf': self.min_n_leafSpinBox.value(),
        'min_weight_fraction_leaf': self.min_fractionDoubleSpinBox.value(),
        'max_depth': self.max_depthSpinBox.value(),
        'min_impurity_decrease': self.min_imp_decDoubleSpinBox.value(),
        'random_state': 1,
        'alpha': self.alphaDoubleSpinBox.value()
    }
    return params, self.getChangedValues(params, GradientBoostingRegressor())
def train_model(data):
    """Fit a linear-kernel SVR mapping (c, cb) features to delta and return it.

    :param data: iterable of [c, cb, delta] records.
    :return: the fitted svm.SVR model.

    NOTE(review): half_len is set to len(data), so data[:half_len] is the
    WHOLE dataset despite the name — confirm whether len(data) // 2 was
    intended before changing it.
    """
    half_len = len(data)
    # Split each [c, cb, delta] record into a feature pair and a target.
    X = []
    y = []
    for [c, cb, delta] in data[:half_len]:
        X.append([c, cb])
        y.append(delta)
    # Fix: the original constructed eight other candidate estimators
    # (rbf/poly SVRs, BayesianRidge, LinearRegression, ElasticNet, GBR, ...)
    # that were never used, and assigned clf twice; only the linear-kernel
    # SVR was ever trained, so only it is kept.
    clf = svm.SVR(kernel='linear')
    clf.fit(X, y)
    return clf
def main(param=""): # obtain config if isinstance(param, str): param = JobConfig.load_from_file(param) data_guest = param["data_guest"] data_host = param["data_host"] data_test = param["data_test"] idx = param["idx"] label_name = param["label_name"] # prepare data df_guest = pd.read_csv(data_guest, index_col=idx) df_host = pd.read_csv(data_host, index_col=idx) df_test = pd.read_csv(data_test, index_col=idx) df = pd.concat([df_guest, df_host], axis=0) y = df[label_name] X = df.drop(label_name, axis=1) X = df.drop(label_name, axis=1) X_guest = df_guest.drop(label_name, axis=1) y_guest = df_guest[label_name] clf = GradientBoostingRegressor(n_estimators=50) clf.fit(X, y) y_predict = clf.predict(X_guest) result = { "mean_squared_error": mean_squared_error(y_guest, y_predict), "mean_absolute_error": mean_absolute_error(y_guest, y_predict) } print(result) return {}, result
def tune_gbr(self):
    """Grid-search GradientBoostingRegressor hyper-parameters on the training
    set and print the best parameters and score.

    Fix: the previous grid used SVR parameters ('kernel', 'C', 'gamma'),
    which GradientBoostingRegressor does not accept — GridSearchCV.fit would
    raise an invalid-parameter error. Replaced with a genuine GBR grid.
    """
    parameters = {
        'n_estimators': [50, 100, 200],
        'learning_rate': [0.05, 0.1, 0.2],
        'max_depth': [2, 3, 4],
    }
    clf = GridSearchCV(GradientBoostingRegressor(), parameters, verbose=2)
    clf.fit(self.X_train, self.y_train)
    print(clf.best_params_)
    print(clf.best_score_)
def GDBT_ALL(trainFileName, testFileName):
    """Train a LAD gradient-boosting model on the labelled file and score
    every item in the evaluation file.

    :return: list of [item, 'all', prediction] rows, predictions clamped at 0
             and formatted to four decimals.
    """
    train_X, train_y, _ = ld.LoadData_DATA_LABEL_ITEM(trainFileName)
    Eval_X, items = ld.LoadData_DATA_ITEM(testFileName)

    model = GradientBoostingRegressor(loss='lad', n_estimators=40,
                                      learning_rate=0.1, max_depth=3)
    model.fit(train_X, train_y)
    pred_y = model.predict(Eval_X)

    # Clamp negative predictions to zero and format to four decimals.
    return [[items[i], 'all', '%.4f' % max(pred, 0)]
            for i, pred in enumerate(pred_y)]
def getModels():
    """Return the name -> estimator mapping of candidate regression models."""
    return {
        'dt': DecisionTreeRegressor(max_depth=50),
        'rf1': RandomForestRegressor(),
        'rf2': RandomForestRegressor(n_estimators=128, max_depth=15),
        'gbr': GradientBoostingRegressor(n_estimators=128, max_depth=5,
                                         learning_rate=1.0),
        # 'abr': AdaBoostRegressor(n_estimators=128),
    }
def model_build(train_set):
    """Fit a GradientBoostingRegressor on feature columns 6..10 against the
    'label' column and return the fitted model."""
    features = train_set.iloc[:, 6:11]
    target = train_set['label']

    model = GradientBoostingRegressor()
    model.fit(features, target)
    # Show how much the model relies on each of the feature columns.
    print(model.feature_importances_)
    return model
def train_model():
    """Print the mean cross-validation score of three candidate regressors
    trained on the module-level train_x/train_y arrays."""
    global train_x, train_y, test_x
    # Same estimators, same order, same printed output as before.
    for estimator in (GradientBoostingRegressor(), MLPRegressor(),
                      RandomForestRegressor()):
        print(cross_val_score(estimator, train_x, train_y).mean())
def create_models():
    """Return the name -> estimator mapping of candidate regression models."""
    return {
        'BayesianRidge': BayesianRidge(),
        # 'LinearRegression': LinearRegression(),
        'ElasticNet': ElasticNet(),
        'SVR(rbf)': SVR(kernel='rbf'),
        'SVR(linear)': SVR(kernel='linear'),
        'Lasso': Lasso(),
        'GBR': GradientBoostingRegressor(n_estimators=300, max_depth=3),
    }
def test_argument_names():
    """Code generated with explicit argument_names must accept the
    DataFrame's column names as keyword arguments and reproduce the
    model's predictions."""
    boston = load_boston()
    X = DataFrame(boston['data'], columns=boston['feature_names'])
    y = boston['target']

    model = GradientBoostingRegressor(verbose=True).fit(X, y)
    generated = sklearn2code(model, ['predict'], numpy_flat,
                             argument_names=X.columns)
    module = exec_module('boston_housing_module', generated)

    # Calling predict(**X) exercises the named-argument interface.
    assert_array_almost_equal(model.predict(X), module.predict(**X))
def predict_using_local_model(self):
    """Train a GradientBoostingRegressor on this instance's training data and
    return uint8 predictions for its test set, printing score and timing."""
    gbr = GradientBoostingRegressor()
    gbr.fit(self.train_x, self.train_y)
    # Fix: the score call used bare train_x/train_y, which are undefined in
    # this scope (NameError, or silently stale globals); use the instance's
    # own data, matching the fit() call above.
    print('Accuracy of gbr, on the training set: ' +
          str(gbr.score(self.train_x, self.train_y)))
    start_time = time.time()
    predictions = gbr.predict(self.test_x)
    predict_time = time.time() - start_time
    print('Prediction time for gbr is ' + str(predict_time) + '\n')
    predictions = predictions.astype('uint8')
    return predictions
def train_model(self):
    """Cross-validate GradientBoostingRegressor over a small max_leaf_nodes
    sweep on the module-level train_x/train_y, printing each candidate and
    the collected scores.

    Tried other models such as an MLP neural-network regressor and random
    forest trees, but GBR performed best.
    """
    global train_x, train_y, test_x
    cvscore = []
    # Fix: the candidate list was named `range`, shadowing the builtin of the
    # same name for the rest of the function.
    leaf_node_candidates = [4, 5, 6, 7, 8]
    for max_leaf_nodes in leaf_node_candidates:
        print(max_leaf_nodes)
        gbr = GradientBoostingRegressor(max_leaf_nodes=max_leaf_nodes)
        cv_score = cross_val_score(
            gbr, train_x, train_y, scoring='neg_mean_squared_error').mean()
        cvscore.append(cv_score)
    print(cvscore)
def prediction():
    """Fit a GradientBoostingRegressor on the global training arrays and
    return uint8 predictions for test_x, printing score, timing, and output."""
    global train_x, train_y, test_x
    model = GradientBoostingRegressor()
    model.fit(train_x, train_y)
    print('Accuracy of gbr, on the training set: ' +
          str(model.score(train_x, train_y)))

    started = time.time()
    predictions = model.predict(test_x)
    elapsed = time.time() - started
    print('Prediction time for gbr is ' + str(elapsed) + '\n')

    predictions = predictions.astype('uint8')
    print(predictions)
    return predictions
def __init__(self, data, label, task, model_name='lgb', eval_metric=None, importance_threshold=0.0):
    '''
    :param data: DataFrame holding the features and the label column
    :param label: label column name
    :param task: task type, one of [regression, classification]
    :param model_name: one of ['gbdt', 'xgb', 'lgb']
    :param eval_metric: explicit evaluation metric; overrides the per-model default
    :param importance_threshold: features with importance below this threshold are dropped
    '''
    self.data = data
    self.label = label
    self.task = task
    self.model_name = model_name
    self._importance_threshold = importance_threshold
    self.model = None
    # Choose the model and a default evaluation metric from the task and
    # the label's cardinality.
    self.eval_metric = None
    if model_name == 'lgb':
        if self.task == 'classification':
            self.model = lgb.LGBMClassifier(**lgb_params)
            # NOTE(review): both branches set 'logloss'; the multi-class case
            # likely wants 'multi_logloss' — confirm before changing.
            if self.data[self.label].unique().shape[0] == 2:
                self.eval_metric = 'logloss'
            else:
                self.eval_metric = 'logloss'
        elif self.task == 'regression':
            self.model = lgb.LGBMRegressor(**lgb_params)
            self.eval_metric = 'l2'
        else:
            raise ValueError('Task must be either "classification" or "regression"')
    elif model_name == 'xgb':
        if self.task == 'classification':
            self.model = xgb.XGBClassifier(**xgb_params)
            if self.data[self.label].unique().shape[0] == 2:
                self.eval_metric = 'logloss'
            else:
                self.eval_metric = 'mlogloss'
        elif self.task == 'regression':
            self.model = xgb.XGBRegressor(**xgb_params)
            self.eval_metric = 'rmse'
        else:
            raise ValueError('Task must be either "classification" or "regression"')
    else:  # gbdt
        if self.task == 'classification':
            self.model = GradientBoostingClassifier(**gbdt_params)
        elif self.task == 'regression':
            self.model = GradientBoostingRegressor(**gbdt_params)
        else:
            raise ValueError('Task must be either "classification" or "regression"')
    # Fix: the condition was inverted (`if not eval_metric`), which discarded
    # an explicitly supplied metric and clobbered the default with None when
    # the caller passed nothing. An explicit metric now takes precedence.
    if eval_metric:
        self.eval_metric = eval_metric
def GDBT_ALL_train(trainFileName, testFileName):
    """Train a LAD gradient-boosting model and pair each test item's clamped
    prediction with its clamped ground truth.

    :return: list of [item, 'all', prediction, truth] rows formatted to two
             decimals, with both values clamped at zero.
    """
    train_X, train_y, _ = ld.loadData_all(trainFileName)
    test_X, test_y, items = ld.loadData_all(testFileName)

    model = GradientBoostingRegressor(loss='lad', n_estimators=40,
                                      learning_rate=0.1, max_depth=3)
    model.fit(train_X, train_y)
    pred_y = model.predict(test_X)

    res = []
    for i, pred in enumerate(pred_y):
        res.append([
            items[i], 'all',
            '%.2f' % max(pred, 0),
            '%.2f' % max(test_y[i], 0)
        ])
    return res
def main(train, test, filepath):
    """Sliding-window one-step-ahead forecast: for each test step, refit a
    gradient-boosting model on the most recent TRAIN_SIZE points and predict
    the next one, then plot predictions against actual values."""
    if not filepath:
        click.echo("need filepath")
        return
    X, Y = get_data(filepath)
    if not train or not test:
        click.echo("need train or test size")
        return

    # Sizes are scaled by 96 — presumably 96 samples per day; confirm
    # against the data source.
    TRAIN_SIZE = 96 * int(train)
    TEST_SIZE = 96 * int(test)
    X_train = X[:TRAIN_SIZE]
    Y_train = Y[:TRAIN_SIZE]
    X_test = X[TRAIN_SIZE:]
    Y_test = Y[TRAIN_SIZE:]

    # GBR chosen over the other estimators tried in earlier experiments
    # (SVR, decision/extra trees, xgboost, random forest).
    clf = GradientBoostingRegressor(n_estimators=100, max_depth=1)

    predict_list = []
    for offset in range(TEST_SIZE):
        # Window of the TRAIN_SIZE indices ending just before the target.
        window_X = [[x] for x in range(offset, TRAIN_SIZE + offset)]
        clf.fit(window_X, Y[offset:TRAIN_SIZE + offset])
        next_index = np.array([TRAIN_SIZE + 1 + offset]).reshape(1, -1)
        predict_list.append(clf.predict(next_index))

    origin_data = Y_test
    prediction_xs = [x for x in range(TRAIN_SIZE + 1, TRAIN_SIZE + TEST_SIZE + 1)]
    plt.plot(prediction_xs, predict_list,
             linestyle='-', color='red', label='prediction model')
    plt.plot(X_test, Y_test, linestyle='-', color='blue', label='actual model')
    plt.legend(loc=1, prop={'size': 12})
    plt.show()
def _fit_and_report(name, base_estimator, train, test, grid, outputs):
    """Fit one MultiOutputRegressor around base_estimator, print its metrics,
    and return (actual, prediction) for the test split.

    NOTE(review): `pd` here is a project module exposing
    training_testing_data(), not pandas — confirm the alias at the imports.
    """
    input_train, input_test, output_train, actual = pd.training_testing_data(
        train, test, grid, outputs)
    print('You are training on %d samples' % (len(input_train)))
    print('You are testing on %d samples' % (len(input_test)))
    model = MultiOutputRegressor(base_estimator)
    model.fit(input_train, output_train)
    prediction = model.predict(input_test)
    print(name)
    print(r'$R^{2}$: %.5f' % (r2_score(actual, prediction)))
    print('MSE: %.5f' % (mean_squared_error(actual, prediction)))
    print('RMSE: %.5f' % (np.sqrt(mean_squared_error(actual, prediction))))
    return actual, prediction


def multi_output_regression(train, test, grid, outputs):
    """Compare MLP, gradient-boosting, and random-forest multi-output
    regressors on the same data split.

    Decomposition: the three identical train/evaluate/report stanzas were
    collapsed into the _fit_and_report helper; printed output and return
    values are unchanged.
    """
    actual, prediction_mlp = _fit_and_report(
        'Multi-Layer Perceptron',
        MLPRegressor(solver='adam', learning_rate='adaptive', max_iter=500,
                     early_stopping=True),
        train, test, grid, outputs)
    actual, prediction_gbr = _fit_and_report(
        'Gradient Boosting Regressor',
        GradientBoostingRegressor(loss='huber'),
        train, test, grid, outputs)
    actual, prediction_rfr = _fit_and_report(
        'Random Forest Regressor',
        RandomForestRegressor(),
        train, test, grid, outputs)
    return actual, prediction_gbr, prediction_mlp, prediction_rfr
def parameter_choose(train_set): """ 模型最佳参数选择,根据对应的训练集选择最佳模型参数 :param train_set: 训练集 :return: 无 """ X = train_set.iloc[:, 6:11] Y = train_set['label'] param_test = {'n_estimators': range(10, 81, 10)} gsearch = GridSearchCV( estimator=GradientBoostingRegressor(learning_rate=1), param_grid=param_test, iid=True, cv=5) gsearch.fit(X, Y) print(gsearch.cv_results_) print(gsearch.best_params_, gsearch.best_score_)
def model_build(train_set, weight=None): """ 模型建立,根据训练集,构建GBDT模型 :param train_set: 训练集 :param weight: 训练集label权重列表 :return: 训练完成的model """ X = train_set.iloc[:, 6:11] Y = train_set['label'] #print(X.head(5)) #print(Y.head(5)) model = GradientBoostingRegressor() #model = GradientBoostingClassifier() if not weight: model.fit(X, Y) print(model.feature_importances_) #print(model) return model
def gradient_boosting(train, test, label):
    """Train a Huber-loss gradient-boosting model on (train, label) and return
    exp-transformed predictions for test.

    Prints the training-set RMSE. np.exp on the output suggests `label`
    holds log-transformed targets — confirm against the calling pipeline.
    """
    gb = GradientBoostingRegressor(n_estimators=300, learning_rate=0.05,
                                   max_depth=3, max_features='sqrt',
                                   min_samples_leaf=15, min_samples_split=10,
                                   loss='huber')
    # Fix: DataFrame/Series.as_matrix() was deprecated in pandas 0.23 and
    # removed in 1.0; to_numpy() is the supported equivalent.
    gb.fit(train, label.to_numpy().ravel())

    # In-sample score on the training data.
    y_prediction = gb.predict(train)
    y_test = label
    print("Gradient Boosting score on training set: ", rmse(y_test, y_prediction))

    y_prediction = gb.predict(test)
    y_prediction = np.exp(y_prediction)
    return y_prediction
def compare_algorithms(datasetName, data, target):
    """Grid-search sklearn's GradientBoostingRegressor and the local
    MyGradientBoostingRegressor over the same parameter grid, then print the
    best parameters, mean fit times, and test-set R2/MSE/MAE for both."""
    X_train, X_test, y_train, y_test = train_test_split(data, target,
                                                        test_size=0.2,
                                                        random_state=1)
    params = {
        'n_estimators': [10, 20, 30, 40],
        'loss': ['ls', 'huber'],
        'min_samples_leaf': [6],
        'max_depth': [3, 4, 5, 6]
    }
    print("\n\nTraining GBRT on %s..." % datasetName)

    # Reference (sklearn) implementation.
    reference_search = GridSearchCV(GradientBoostingRegressor(), params,
                                    cv=5, n_jobs=-1)
    reference_search.fit(X_train, y_train)
    print("Best params original: %s" % reference_search.best_params_)
    print("Avg train time original: %s seconds" %
          reference_search.cv_results_["mean_fit_time"][reference_search.best_index_])
    best_reference = reference_search.best_estimator_

    # Custom implementation under test.
    custom_search = GridSearchCV(MyGradientBoostingRegressor(), params,
                                 cv=5, n_jobs=-1)
    custom_search.fit(X_train, y_train)
    print("Best params mine: %s" % custom_search.best_params_)
    print("Avg train time mine: %s seconds" %
          custom_search.cv_results_["mean_fit_time"][custom_search.best_index_])
    best_custom = custom_search.best_estimator_

    reference_predictions = best_reference.predict(X_test)
    custom_predictions = best_custom.predict(X_test)
    print("The dataset: %s with %s train instances" % (datasetName, data.shape[0]))
    print("Original GradientBoostingRegressor R2: %s\tMSE: %s\tMAE: %s" %
          (r2_score(y_test, reference_predictions),
           mean_squared_error(y_test, reference_predictions),
           mean_absolute_error(y_test, reference_predictions)))
    print("My GradientBoostingRegressor R2: %s\tMSE: %s\tMAE: %s" %
          (r2_score(y_test, custom_predictions),
           mean_squared_error(y_test, custom_predictions),
           mean_absolute_error(y_test, custom_predictions)))
def model_build(train_set, weight=None): """ 模型建立,根据训练集,构建GBDT模型 :param train_set: 训练集 :param weight: 训练集label权重列表 :return: 训练完成的model """ X = train_set.iloc[:, 1:] print(len(X)) Y = train_set['label'] print(len(Y)) #print(X.head(5)) #print(Y.head(5)) model = GradientBoostingRegressor() #model = GradientBoostingClassifier() #model = logistic_regression_path(X, Y) model.fit(X, Y) print(model.feature_importances_) #print(model) return model
def trainmodels():
    """Cross-validate five regressors on the module-level X/y and publish the
    fitted models, their CV scores, and their in-sample predictions through
    module-level globals."""
    global n_folds, model_br, model_dic, model_etc, model_gbr, model_lr, model_names,\
        model_svr, cv_score_list, pre_y_list
    n_folds = 6  # number of cross-validation folds
    model_br = BayesianRidge()               # Bayesian ridge regression
    model_lr = LinearRegression()            # ordinary linear regression
    model_etc = ElasticNet()                 # elastic-net regression
    model_svr = SVR()                        # support-vector regression
    model_gbr = GradientBoostingRegressor()  # gradient-boosting regression
    model_names = [
        'BayesianRidge', 'LinearRegression', 'ElasticNet', 'SVR', 'GBR'
    ]
    model_dic = [model_br, model_lr, model_etc, model_svr, model_gbr]
    cv_score_list = []  # one CV-score array per model
    pre_y_list = []     # one in-sample prediction array per model
    for model in model_dic:
        # Cross-validate first, then refit on all data for the in-sample
        # prediction (same order of operations as before).
        scores = cross_val_score(model, X, y, cv=n_folds)
        cv_score_list.append(scores)
        pre_y_list.append(model.fit(X, y).predict(X))
def run_experiment(writer, name, generate_data):
    """Generate train/test data for one experiment, optionally dump it to
    disk, and evaluate each candidate model family on it."""
    np.random.seed(SEED)
    data = DataHolder(generate_data(TRAIN_SIZE), generate_data(TEST_SIZE))
    if DUMP_FILES:
        data.dump(name)

    # Candidate model families, evaluated in this fixed order.
    candidates = [
        svr_grid(),
        RandomForestRegressor(n_estimators=100),
        GradientBoostingRegressor(n_estimators=100, learning_rate=0.1,
                                  max_depth=10, random_state=0,
                                  verbose=VERBOSE),
        KerasRegressor(build_fn=neural_network_regression, data=data),
    ]
    for candidate in candidates:
        eval_data(writer, name, candidate, data)
def regular_model(self):
    """Cross-validate a suite of standard regressors on the training split and
    print them ranked by mean test score (best first)."""
    model_br = BayesianRidge()
    model_lr = LinearRegression()
    model_etc = ElasticNet()
    model_las = Lasso()
    model_rid = Ridge()
    model_sgd = SGDRegressor()
    model_svr = SVR()
    model_gbr = GradientBoostingRegressor()
    model_rfr = RandomForestRegressor()
    model_names = ['BayesianRidge', 'LinearRegression', 'ElasticNet', 'Lasso', 'Ridge',
                   'SGDRegressor', 'SVR', 'GradientBoostingRegressor', 'RandomForestRegressor']
    model_dic = [model_br, model_lr, model_etc, model_las, model_rid, model_sgd,
                 model_svr, model_gbr, model_rfr]
    result_dict = {}
    for i, clf in enumerate(model_dic):
        # Fix: sklearn's cross_validate returns a dict of arrays; the
        # previous code stored those dicts and then sorted(..., key=x[1]),
        # which raises TypeError because dicts are unorderable. Store the
        # scalar mean test score instead so the ranking works.
        scores = cross_validate(clf, self.X_train, self.y_train)
        result_dict[model_names[i]] = scores['test_score'].mean()
    result_dict = sorted(result_dict.items(), key=lambda x: x[1], reverse=True)
    print(result_dict)
#!/usr/bin/env python
"""Train a GradientBoostingRegressor on a shuffled Boston-housing split and
print its test-set R^2 score."""
import pandas as pd
from sklearn.model_selection import train_test_split
# Fix: sklearn.ensemble.gradient_boosting was a private module, deprecated in
# scikit-learn 0.22 and removed in 0.24; import from the public package.
from sklearn.ensemble import GradientBoostingRegressor
from sklearn import datasets
from sklearn.utils import shuffle
import numpy as np

# NOTE(review): datasets.load_boston() was removed in scikit-learn 1.2 —
# confirm the pinned sklearn version or migrate to another dataset.
boston = datasets.load_boston()
X, Y = shuffle(boston.data, boston.target, random_state=13)
X = X.astype(np.float32)

# Hold out the final 10% of the shuffled rows as the test split.
offset = int(X.shape[0] * 0.9)
X_train, Y_train = X[:offset], Y[:offset]
X_test, Y_test = X[offset:], Y[offset:]

regressor = GradientBoostingRegressor(n_estimators=120, learning_rate=0.2,
                                      max_depth=2, random_state=0, loss='ls')
regressor.fit(X_train, Y_train)
score = regressor.score(X_test, Y_test)
print(score)
# SVM metrics (y_pred_svm / Y_test are produced earlier in the script).
rsqrd_svm = r2_score(Y_test, y_pred_svm)
mae_svm = mean_absolute_error(Y_test, y_pred_svm)

#RF Algorithm
from sklearn.ensemble import RandomForestRegressor
regressor_rf = RandomForestRegressor(n_estimators=20, random_state=0)
regressor_rf.fit(X_train, Y_train)
y_pred_rf = regressor_rf.predict(X_test)
rms_rf = sqrt(mean_squared_error(Y_test, y_pred_rf))
rsqrd_rf = r2_score(Y_test, y_pred_rf)
mae_rf = mean_absolute_error(Y_test, y_pred_rf)

#GB Algorithm
# Fix: sklearn.ensemble.gradient_boosting was a private module, deprecated in
# scikit-learn 0.22 and removed in 0.24; import from the public package.
from sklearn.ensemble import GradientBoostingRegressor
regressor_gb = GradientBoostingRegressor(learning_rate=0.5, n_estimators=400, loss='ls')
regressor_gb.fit(X_train, Y_train)
y_pred_gb = regressor_gb.predict(X_test)
rms_gb = sqrt(mean_squared_error(Y_test, y_pred_gb))
rsqrd_gb = r2_score(Y_test, y_pred_gb)
mae_gb = mean_absolute_error(Y_test, y_pred_gb)

#Multiple Linear Regression
from sklearn.linear_model import LinearRegression
regressor_lr = LinearRegression()
regressor_lr.fit(X_train, Y_train)
y_pred_lr = regressor_lr.predict(X_test)
rms_lr = sqrt(mean_squared_error(Y_test, y_pred_lr))
rsqrd_lr = r2_score(Y_test, y_pred_lr)
mae_lr = mean_absolute_error(Y_test, y_pred_lr)