def gbdt_lr(para):
    print("gbdt_lr")
    x_train, x_train_lr, x_test = para[0], para[1], para[2]
    y_train, y_train_lr, y_test = para[3], para[4], para[5]
    maxleafnodes = 11
    # Fit the gradient-boosted trees on the first part of the training data.
    gbc = GBDT(max_leaf_nodes=maxleafnodes - 1, n_estimators=600,
               min_samples_leaf=5, max_depth=3, learning_rate=0.02,
               subsample=0.2, max_features=0.1)
    gbc.fit(x_train, y_train)
    # One-hot encode the leaf indices returned by apply().
    ohe = OHE()
    ohe.fit(gbc.apply(x_train)[:, :])
    li = gbc.apply(x_train_lr)[:, :]
    x_train_lr_gbc = ohe.transform(li)
    # x_train_lr_gbc = myTransform(li, max_leaf_nodes=maxleafnodes)
    li = gbc.apply(x_test)[:, :]
    x_test_gbc = ohe.transform(li)
    # x_test_gbc = myTransform(li, max_leaf_nodes=maxleafnodes)
    del li
    # Train the linear model on the encoded leaf features.
    lr = sgd(n_iter=50)
    lr.fit(x_train_lr_gbc, y_train_lr)
    yp = lr.predict(x_test_gbc)
    print("GBDT+SGD: " + str(auc(y_test, yp)))
    return gbc, yp
def check_boston(presort, loss, subsample):
    # Check consistency on dataset boston house prices with least squares
    # and least absolute deviation.
    ones = np.ones(len(boston.target))
    last_y_pred = None
    for sample_weight in None, ones, 2 * ones:
        clf = GradientBoostingRegressor(n_estimators=100, loss=loss,
                                        max_depth=4, subsample=subsample,
                                        min_samples_split=2, random_state=1,
                                        presort=presort)

        assert_raises(ValueError, clf.predict, boston.data)
        clf.fit(boston.data, boston.target, sample_weight=sample_weight)
        leaves = clf.apply(boston.data)
        assert_equal(leaves.shape, (506, 100))

        y_pred = clf.predict(boston.data)
        mse = mean_squared_error(boston.target, y_pred)
        assert_less(mse, 6.0)

        if last_y_pred is not None:
            assert_array_almost_equal(last_y_pred, y_pred)

        last_y_pred = y_pred
def test_regression_dataset(loss, subsample):
    # Check consistency on regression dataset with least squares
    # and least absolute deviation.
    ones = np.ones(len(y_reg))
    last_y_pred = None
    for sample_weight in [None, ones, 2 * ones]:
        reg = GradientBoostingRegressor(
            n_estimators=100,
            loss=loss,
            max_depth=4,
            subsample=subsample,
            min_samples_split=2,
            random_state=1,
        )
        reg.fit(X_reg, y_reg, sample_weight=sample_weight)
        leaves = reg.apply(X_reg)
        assert leaves.shape == (500, 100)

        y_pred = reg.predict(X_reg)
        mse = mean_squared_error(y_reg, y_pred)
        assert mse < 0.04

        if last_y_pred is not None:
            # FIXME: We temporarily bypass this test. This is due to the fact
            # that GBRT with and without `sample_weight` do not use the same
            # implementation of the median during the initialization with the
            # `DummyRegressor`. In the future, we should make sure that both
            # implementations should be the same. See PR #17377 for more.
            # assert_allclose(last_y_pred, y_pred)
            pass

        last_y_pred = y_pred
def test_boston():
    # Check consistency on dataset boston house prices with least squares
    # and least absolute deviation.
    for loss in ("ls", "lad", "huber"):
        for subsample in (1.0, 0.5):
            last_y_pred = None
            for i, sample_weight in enumerate(
                    (None, np.ones(len(boston.target)),
                     2 * np.ones(len(boston.target)))):
                clf = GradientBoostingRegressor(n_estimators=100, loss=loss,
                                                max_depth=4, subsample=subsample,
                                                min_samples_split=1,
                                                random_state=1)

                assert_raises(ValueError, clf.predict, boston.data)
                clf.fit(boston.data, boston.target,
                        sample_weight=sample_weight)
                leaves = clf.apply(boston.data)
                assert_equal(leaves.shape, (506, 100))

                y_pred = clf.predict(boston.data)
                mse = mean_squared_error(boston.target, y_pred)
                assert mse < 6.0, "Failed with loss %s and mse = %.4f" % (loss, mse)

                if last_y_pred is not None:
                    np.testing.assert_array_almost_equal(
                        last_y_pred, y_pred,
                        err_msg='pred_%d does not match last pred_%d for loss %r '
                                'and subsample %r. ' % (i, i - 1, loss, subsample))

                last_y_pred = y_pred
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import RidgeCV
from sklearn.preprocessing import OneHotEncoder


class myStackingFeaturesRegressor(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.estimator = None
        self.lgb = GradientBoostingRegressor(loss='ls', alpha=0.9,
                                             n_estimators=100,
                                             learning_rate=0.01, max_depth=8,
                                             subsample=0.8,
                                             min_samples_split=9,
                                             max_leaf_nodes=10)
        self.grd_enc = OneHotEncoder()
        self.lr = RidgeCV()
        self.classes_ = [-1, 1]

    def fit(self, X, y=None, **fit_params):
        # Fit the GBDT, one-hot encode its leaf indices, then fit the ridge
        # model on the encoded leaves.
        self.lgb.fit(X, y)
        self.grd_enc.fit(self.lgb.apply(X))
        self.lr.fit(self.grd_enc.transform(self.lgb.apply(X)), y)
        return self

    def predict(self, X):
        return self.lr.predict(self.grd_enc.transform(self.lgb.apply(X)))
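A brief, hypothetical usage of the class above on synthetic data (the data and the scikit-learn version caveat are assumptions, not part of the original snippet):

from sklearn.datasets import make_regression

# Assumes a scikit-learn version that still accepts loss='ls',
# as the constructor above does.
X, y = make_regression(n_samples=500, n_features=8, noise=5.0, random_state=0)
stacker = myStackingFeaturesRegressor()
stacker.fit(X, y)
print(stacker.predict(X[:5]))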
def test_gbm_regressor_backupsklearn(backend='auto'):
    df = pd.read_csv("./open_data/simple.txt", delim_whitespace=True)
    X = np.array(df.iloc[:, :df.shape[1] - 1], dtype='float32', order='C')
    y = np.array(df.iloc[:, df.shape[1] - 1], dtype='float32', order='C')
    import h2o4gpu
    Solver = h2o4gpu.GradientBoostingRegressor

    # Run h2o4gpu version of Gradient Boosting Regression
    gbm = Solver(backend=backend, random_state=1234)
    print("h2o4gpu fit()")
    gbm.fit(X, y)

    # Run Sklearn version of Gradient Boosting Regression
    from sklearn.ensemble import GradientBoostingRegressor
    gbm_sk = GradientBoostingRegressor(random_state=1234, max_depth=3)
    print("Scikit fit()")
    gbm_sk.fit(X, y)

    if backend == "sklearn":
        assert (gbm.predict(X) == gbm_sk.predict(X)).all()
        # staged predictions should match element-wise
        assert np.allclose(list(gbm.staged_predict(X)),
                           list(gbm_sk.staged_predict(X)))
        assert gbm.score(X, y) == gbm_sk.score(X, y)
        assert (gbm.apply(X) == gbm_sk.apply(X)).all()

        print("Estimators")
        print(gbm.estimators_)
        print(gbm_sk.estimators_)

        print("loss")
        print(gbm.loss_)
        print(gbm_sk.loss_)
        assert gbm.loss_.__dict__ == gbm_sk.loss_.__dict__

        print("init_")
        print(gbm.init)
        print(gbm_sk.init)

        print("Feature importance")
        print(gbm.feature_importances_)
        print(gbm_sk.feature_importances_)
        assert (gbm.feature_importances_ == gbm_sk.feature_importances_).all()

        print("train_score_")
        print(gbm.train_score_)
        print(gbm_sk.train_score_)
        assert (gbm.train_score_ == gbm_sk.train_score_).all()
def on_data(context):
    context.Num = context.Num + 1
    if context.Num < context.Len:
        # fewer than Len trading days so far: move on to the next trading day
        return
    if datetime.datetime.strftime(context.now, '%Y-%m-%d') not in context.month_begin:
        # rebalance monthly, at the start of each month
        return

    # Fetch data:
    KData = get_reg_kdata(reg_idx=context.reg_kdata[0], length=context.Len,
                          fill_up=True, df=True)
    FData = get_reg_factor(reg_idx=context.reg_factor[0],
                           target_indices=[x for x in range(300)],
                           length=context.Len, df=True)  # factor data

    # Feature construction:
    Fcode = context.FactorCode  # factor codes used as feature names

    # Data containers (the close price drives the label, Fcode are the feature columns):
    FactorData = pd.DataFrame(columns=(['idx', 'benefit'] + Fcode))  # training features and labels
    FactorDataTest = pd.DataFrame(columns=(['idx'] + Fcode))  # prediction features

    # Align the K-line data by target index
    tempIdx = KData[KData['time'] == KData['time'][0]]['target_idx'].reset_index(drop=True)

    # Process the data per instrument:
    for i in range(300):
        # Build the training features and label; temporary one-row containers:
        FactorData0 = pd.DataFrame(np.full([1, len(Fcode) + 2], np.nan),
                                   columns=(['idx', 'benefit'] + Fcode))  # one training row
        FactorDataTest0 = pd.DataFrame(np.full([1, len(Fcode) + 1], np.nan),
                                       columns=(['idx'] + Fcode))  # one prediction row
        # Align the factor data and extract the rows of the current instrument
        FData0 = FData[FData['target_idx'] == tempIdx[i]].reset_index(drop=True)

        # Per factor:
        for FC in context.FactorCode:
            # extract the values of factor FC for the current instrument
            FCData = FData0[FData0['factor'] == FC]['value'].reset_index(drop=True)
            FactorData0[FC] = FCData[0]  # factor value at the start of last month

        # Build the label: extract last month's K-line panel for the instrument
        close = np.array(KData[KData['target_idx'] == tempIdx[i]]['close'])
        # return of the instrument over the last month
        benefit = (close[context.Len - 1] - close[0]) / close[0]
        FactorData0['benefit'] = benefit
        # idx: index of the current instrument in the training set
        FactorData0['idx'] = tempIdx[i]
        # append to the training samples
        FactorData = FactorData.append(FactorData0, ignore_index=True)

        # Build the prediction features: set the instrument index
        FactorDataTest0['idx'] = tempIdx[i]
        # per factor, same procedure as for the training features
        for FC in context.FactorCode:
            FCData = FData0[FData0['factor'] == FC]['value'].reset_index(drop=True)
            FactorDataTest0[FC] = FCData[context.Len - 1]
        # append to the prediction samples
        FactorDataTest = FactorDataTest.append(FactorDataTest0, ignore_index=True)

    """
    Column layout of the training and prediction sets:
    FactorData DataFrame:      idx | benefit | Factor 1 | Factor 2 | ...
        benefit is the label and the factors from the start of last month are
        the features (single-factor test here, so only one feature).
    FactorDataTest DataFrame:  idx | Factor 1 | Factor 2 | ...
        The factors at the start of this month are the prediction features.
    """

    # Data cleaning: drop rows with missing values
    FactorData = FactorData.dropna(axis=0, how='any').reset_index(drop=True)
    FactorDataTest = FactorDataTest.dropna(axis=0, how='any').reset_index(drop=True)
    Idx = FactorDataTest['idx']  # indices of the remaining instruments

    # Preprocess each factor
    for Factor in context.FactorCode:
        FactorData = filter_MAD(FactorData, Factor, 5)  # clip outliers by median absolute deviation
        FactorData[Factor] = preprocessing.scale(FactorData[Factor])  # standardize
        FactorDataTest = filter_MAD(FactorDataTest, Factor, 5)
        FactorDataTest[Factor] = preprocessing.scale(FactorDataTest[Factor])

    # print(FactorData.head(1))
    # print(FactorDataTest.head(1))

    # Build the training and prediction matrices: rows = samples, columns = factors
    X = np.ones([FactorData.shape[0], len(Fcode)])
    Xtest = np.ones([FactorDataTest.shape[0], len(Fcode)])
    for i in range(X.shape[1]):
        X[:, i] = FactorData[Fcode[i]]
        Xtest[:, i] = FactorDataTest[Fcode[i]]

    # Training labels: whether last month's return was positive
    Y = (np.array(FactorData['benefit']).astype(float) > 0)

    SVM = svm.SVR(gamma='scale')
    gbr = GradientBoostingRegressor()
    gbr.fit(X, Y)
    # One-hot encode the GBDT leaf indices and use them as the new features
    enc = OneHotEncoder()
    enc.fit(gbr.apply(X))
    X = enc.transform(gbr.apply(X)).toarray()
    Xtest = enc.transform(gbr.apply(Xtest)).toarray()

    # Model training:
    SVM.fit(X, Y)
    y = SVM.predict(Xtest)

    # Trading logic:
    positions = context.account().positions['volume_long']  # long position volumes
    valid_cash = context.account(account_idx=0).cash['valid_cash'][0]  # available cash
    P = context.cash_rate / (sum(y > 0) + 1)  # cash fraction per instrument; +1 guards against division by zero
    # high and low percentiles of the predicted returns
    low_return, high_return = np.percentile(y, [context.down_pos, context.upper_pos])

    for i in range(len(Idx)):
        position = positions.iloc[Idx[i]]
        # if position == 0 and y[i] == True and valid_cash > 0:  # prediction True (return > 0): buy
        #     print('open position')
        if position == 0 and y[i] > high_return and valid_cash > 0:
            # order size; +1 guards against division by zero
            # print(valid_cash, P, KData['close'][Idx[i]])
            # the size could be tuned down a bit; it is sometimes too large, sometimes too small
            Num = int(math.floor(valid_cash * P / 100 / (KData['close'][Idx[i]] + 1)) * 100)
            # keep the order size reasonable and a multiple of 100
            if Num < 1000:
                Num *= 10
            if Num > 100000:
                Num = int(Num / 10)
            Num -= Num % 100
            if Num <= 0:  # do not open a position
                continue
            print("Opening position, volume: {}".format(Num))
            order_id = order_volume(account_idx=0, target_idx=int(Idx[i]), volume=Num,
                                    side=1, position_effect=1, order_type=2,
                                    price=0)  # open a position with the given volume
            # set a stop-loss on order_id, 10 points away, executed as a market order when triggered
            # stop_loss_by_order(target_order_id=order_id, stop_type=1, stop_gap=10, order_type=2)
        # elif position > 0 and y[i] == False:  # prediction False (return < 0): sell
        elif position > 0 and y[i] < low_return:
            # holding the stock and its predicted return is below the low percentile: close the position
            print("Closing position, volume: {}".format(position / 10))
            order_volume(account_idx=0, target_idx=int(Idx[i]), volume=int(position / 10),
                         side=2, position_effect=2, order_type=2,
                         price=0)  # close the position with the given volume
x_test = test[["Age", "Fare", "SibSp", "Parch"]].fillna(0)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
gbdt_train_X, gbdt_train_y = train[["Age", "Fare", "SibSp", "Parch"]], train["Survived"]
##clf = svm.SVC(gamma=0.001, C=100)
##,"Sex",'Age','SibSp'"Pclass"]
##clf = RandomForestClassifier(100)
##clf.fit(X_train, y_train)
##print(accuracy_score(clf.predict(X_test), y_test))
##temp = clf.predict(x_test)

##DataSet Conduct
gbr = GradientBoostingRegressor()  # x[i0] are the training inputs, y[i0] the training targets
gbr.fit(gbdt_train_X, gbdt_train_y)  # train the GBDT model
enc = OneHotEncoder()
enc.fit(gbr.apply(gbdt_train_X))  # convert the leaf-position codes into 0/1 indicators
new_feature_train = enc.transform(gbr.apply(gbdt_train_X))
new_feature_train = new_feature_train.toarray()
##For Adjust
print(len(new_feature_train[0]))

enc1 = OneHotEncoder()
enc1.fit(train[["Pclass", "Sex", "IsAlone", "IsChild", "IsStrong"]])
new_feature_train1 = enc1.transform(train[["Pclass", "Sex", "IsAlone", "IsChild", "IsStrong"]])
new_feature_train1 = new_feature_train1.toarray()
new_train = np.concatenate([new_feature_train1, new_feature_train], axis=1)

new_feature_test = enc.transform(gbr.apply(x_test))
new_feature_test = new_feature_test.toarray()
##For Adjust
print(len(new_feature_test[0]))
new_feature_test1 = enc1.transform(test[["Pclass", "Sex", "IsAlone", "IsChild", "IsStrong"]])
# X, y = make_regression(random_state=0)
# X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
train_x, train_y, data = generator_data(data)
reg = GradientBoostingRegressor(loss='ls', learning_rate=0.1, random_state=0,
                                verbose=1, n_estimators=300, max_depth=3)
print(reg)
# for i in range(10):
reg.fit(train_x, train_y)
print('the whole parameter of the model : ', reg.get_params())
# GradientBoostingRegressor(random_state=0)

a = reg.apply(train_x)
print(a)
print(a.shape)

pre = reg.predict(train_x)
# print('Predict regression target for x :', pre)
# print(pre.shape)

r = reg.score(train_x, train_y)
print('Return the coefficient of determination R2 of the prediction : ', r)

loss = reg.loss_(train_y, pre)
print('loss is : ', loss)

re_index(observed_v=train_y, predicted_v=pre)

feature_importance = reg.feature_importances_
print(sum(feature_importance[1:6]), sum(feature_importance))
grd = GradientBoostingRegressor(
    loss='huber',
    learning_rate=0.04,
    n_estimators=params[run - 1],  # 100 is enough using this learning_rate
    max_depth=6,
    subsample=0.7,
    max_features=0.7,
    min_samples_leaf=1,
    verbose=0,
    random_state=2015,
)
grd_enc = OneHotEncoder()
result = {}
grd.fit(X_train, labels_train)
grd_enc.fit(grd.apply(X_train))
"""
etr = ExtraTreesRegressor(n_estimators=param_space_reg_skl_etr['n_estimators'],
                          max_features=param_space_reg_skl_etr['max_features'],
                          n_jobs=param_space_reg_skl_etr['n_jobs'],
                          random_state=param_space_reg_skl_etr['random_state'])
etr.fit(grd_enc.transform(grd.apply(X_train)), labels_train)
Y_train = etr.predict(grd_enc.transform(grd.apply(X_train)))
Y = etr.predict(grd_enc.transform(grd.apply(X_valid)))
cutpoints = [2.8, 3.8, 4.5, 4.9, 5.5, 6.2, 6.8]
res = minimize(minimize_quadratic_weighted_kappa, cutpoints, (Y_train, labels_train),
               method='Nelder-Mead')
cutpoints = np.sort(res.x)
kappa = minimize_quadratic_weighted_kappa(cutpoints, Y, labels_valid)
"""
ridge = Ridge(alpha=2500)
ridge.fit(grd_enc.transform(grd.apply(X_train)), labels_train)
LinReg_model.fit(train_data[features], Y)
linReg_score = cross_val_score(LinReg_model, train_data[features], Y, cv=10,
                               scoring='r2').mean()
print("R2 score using Linear Regression is ", linReg_score * 100)
print("Linear reg coef", LinReg_model.coef_)

##Random Forest Regressor
##
##RanForest_model = RandomForestRegressor(random_state=0)
##RanForest_model.fit(train_data[features], Y)
##ranForest_score = cross_val_score(RanForest_model, train_data[features], Y, cv=10, scoring='r2').mean()
##print("R2 score using Random Forest Regression is ", ranForest_score * 100)

##Gradient Boosting Regressor
GradBoost_model = GradientBoostingRegressor(max_depth=3, random_state=0,
                                            learning_rate=0.1, n_estimators=200)
GradBoost_model.fit(train_data[features], Y)
GradBoost_model.apply(train_data[features])
gradBoost_score = cross_val_score(GradBoost_model, train_data[features], Y,
                                  cv=10, scoring='r2').mean()
print("Feature Importance ", GradBoost_model.feature_importances_)
print("R2 score using Gradient Boosting Regressor is ", gradBoost_score * 100)
""" grd = GradientBoostingRegressor( loss = 'huber', learning_rate=0.04, n_estimators=params[run-1],# 100 is enough using this learning_rate max_depth=6, subsample=0.7, max_features=0.7, min_samples_leaf=1, verbose=0, random_state=2015, ) grd_enc = OneHotEncoder() result ={} grd.fit(X_train,labels_train) grd_enc.fit(grd.apply(X_train)) """ etr = ExtraTreesRegressor(n_estimators=param_space_reg_skl_etr['n_estimators'], max_features=param_space_reg_skl_etr['max_features'], n_jobs=param_space_reg_skl_etr['n_jobs'], random_state=param_space_reg_skl_etr['random_state']) etr.fit(grd_enc.transform(grd.apply(X_train)),labels_train) Y_train = etr.predict(grd_enc.transform(grd.apply(X_train))) Y = etr.predict(grd_enc.transform(grd.apply(X_valid))) cutpoints = [2.8,3.8,4.5,4.9,5.5,6.2,6.8] res = minimize(minimize_quadratic_weighted_kappa,cutpoints,(Y_train,labels_train),method='Nelder-Mead') cutpoints = np.sort(res.x) kappa=minimize_quadratic_weighted_kappa(cutpoints,Y,labels_valid) """ ridge = Ridge(alpha=2500) ridge.fit(grd_enc.transform(grd.apply(X_train)),labels_train)
def test_GradientBoost():
    X1 = np.arange(0, 10, 0.1)
    X2 = np.arange(10, 20, 0.1)
    y = np.sin(X1).ravel() + np.cos(X2).ravel()
    X_df = pd.DataFrame(np.array([X1, X2]).T, columns=['x1', 'x2'])

    gbr_regr = GradientBoostingRegressor(n_estimators=5000, max_depth=3)
    gbr_regr.fit(X_df, y)

    with StopWatch("LucidEnsemble Gradient Boost construction"):
        lucid_gbr = make_LucidEnsemble(
            gbr_regr, feature_names=X_df.columns, print_precision=3)

    with StopWatch("Scikit-learn Gradient Boost prediction"):
        gbr_pred = gbr_regr.predict(X_df)

    with StopWatch("Lucid Gradient Boost (non-compressed) prediction"):
        lucid_gbr_pred = lucid_gbr.predict(X_df)

    ######################################################
    # test prediction outputted from LucidEnsemble
    np.testing.assert_almost_equal(lucid_gbr_pred, gbr_pred)
    assert np.all(gbr_regr.apply(X_df) == lucid_gbr.apply(X_df))

    with StopWatch("Compression of Lucid Gradient Boost"):
        compressed_lucid_gbr = lucid_gbr.compress()
    print("{} unique nodes and {} # of estimators".format(
        compressed_lucid_gbr.n_leaves, len(lucid_gbr)))

    with StopWatch("Lucid Gradient Boost (compressed) prediction"):
        cgbr_pred = compressed_lucid_gbr.predict(X_df)

    ######################################################
    # test the compressed prediction
    np.testing.assert_almost_equal(cgbr_pred, gbr_pred)

    # test comparison, compare the leaves of two
    # LucidEnsembles made from the same arguments
    lucid_gbr2 = make_LucidEnsemble(
        gbr_regr, feature_names=X_df.columns, print_precision=3)
    compressed_lucid_gbr2 = lucid_gbr2.compress()
    assert set(compressed_lucid_gbr.leaves) == set(compressed_lucid_gbr2.leaves)

    script_dir = os.path.dirname(__name__)

    ######################################################
    # test pickling functionality
    pickle_path = os.path.join(script_dir, 'lucid_gbr.pkl')
    with open(pickle_path, 'wb') as fh:
        pickle.dump(lucid_gbr, fh)
    with open(pickle_path, 'rb') as fh:
        lucid_gbr_pickle = pickle.load(fh)
    np.testing.assert_almost_equal(lucid_gbr_pickle.predict(X_df), lucid_gbr_pred)
    os.remove(pickle_path)

    pickle_path = os.path.join(script_dir, 'compressed_lucid_gbr.pkl')
    with open(pickle_path, 'wb') as fh:
        pickle.dump(compressed_lucid_gbr, fh)
    with open(pickle_path, 'rb') as fh:
        compressed_lucid_gbr_pickle = pickle.load(fh)
    np.testing.assert_almost_equal(
        compressed_lucid_gbr_pickle.predict(X_df), cgbr_pred)
    os.remove(pickle_path)
gbrt.fit(x_train, y_train)
pred = gbrt.predict(x_test)
k = 0
for i in range(len(pred)):
    k = k + abs(pred[i] - y_test[i]) / y_test[i]
print(1 - k / len(pred))

# sort importances
indices = np.argsort(gbrt.feature_importances_)
# plot as bar chart
plt.barh(np.arange(len(names)), gbrt.feature_importances_[indices])
plt.yticks(np.arange(len(names)) + 0.25, np.array(names)[indices])
_ = plt.xlabel('Relative importance')
# plt.show()
print(gbrt.feature_importances_[indices])
# print(gbdt.score(x_test, y_test))  # score on test data (accuracy)

print('###################################')
print(gbrt.apply(np.array(x_test)))
for i in range(len(pred)):
    print(pred[i], gbrt.apply(np.array(x_test))[i][0])
'''
for i in range(len(y_train)):
    print(gbdt.fit_transform(x_train, y_train)[i], y_train[i])
'''
# Preliminary conclusion: apply() reports, as a vector, which leaf node each
# sample falls into in every tree, and that vector of leaf indices can be used
# as a new feature.
# fit_transform() transforms the training-sample features, acting somewhat
# like a dimensionality reduction.
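To tie together the pattern these snippets share, here is a minimal, self-contained sketch of the leaf-encoding idea summarized in the comments above; the dataset, estimator settings, and variable names are illustrative assumptions, not taken from any snippet:

from sklearn.datasets import make_regression
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import RidgeCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

# Synthetic regression data; separate folds for the trees and the linear
# model limit leakage from the leaf features.
X, y = make_regression(n_samples=1000, n_features=10, noise=10.0, random_state=0)
X_tree, X_lin, y_tree, y_lin = train_test_split(X, y, test_size=0.5, random_state=0)

gbr = GradientBoostingRegressor(n_estimators=100, max_depth=3, random_state=0)
gbr.fit(X_tree, y_tree)

# apply() returns an (n_samples, n_estimators) array of leaf indices,
# one column per tree; one-hot encoding it yields sparse indicator features.
enc = OneHotEncoder(handle_unknown="ignore")
enc.fit(gbr.apply(X_tree))

ridge = RidgeCV()
ridge.fit(enc.transform(gbr.apply(X_lin)), y_lin)
print("R^2 on the linear-model fold:",
      ridge.score(enc.transform(gbr.apply(X_lin)), y_lin))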