Example #1
def gbdt_lr(para):
    print("gbdt_lr")
    x_train = para[0]
    x_train_lr = para[1]
    x_test = para[2]
    y_train = para[3]
    y_train_lr = para[4]
    y_test = para[5]
    maxleafnodes = 11
    gbc = GBDT(max_leaf_nodes=maxleafnodes - 1,
               n_estimators=600,
               min_samples_leaf=5,
               max_depth=3,
               learning_rate=0.02,
               subsample=0.2,
               max_features=0.1)
    gbc.fit(x_train, y_train)
    ohe = OHE()
    ohe.fit(gbc.apply(x_train)[:, :])
    li = gbc.apply(x_train_lr)[:, :]
    x_train_lr_gbc = ohe.transform(li)
    #x_train_lr_gbc=myTransform(li,max_leaf_nodes=maxleafnodes)
    li = gbc.apply(x_test)[:, :]
    x_test_gbc = ohe.transform(li)
    #x_test_gbc=myTransform(li,max_leaf_nodes=maxleafnodes)
    del li
    lr = sgd(n_iter=50)
    lr.fit(x_train_lr_gbc, y_train_lr)
    yp = lr.predict(x_test_gbc)
    print("GBDT+SGD: " + str(auc(y_test, yp)))
    return (gbc, yp)
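For reference, a minimal self-contained sketch of the same leaf-feature pipeline using only public scikit-learn names; the synthetic data, split sizes and parameter values below are illustrative and not taken from the example above.

import numpy as np
from sklearn.datasets import make_classification
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

# Synthetic binary-classification data, split three ways: one part fits the
# GBDT, one fits the linear model on the encoded leaves, one is held out.
X, y = make_classification(n_samples=3000, random_state=0)
X_gbdt, X_rest, y_gbdt, y_rest = train_test_split(X, y, test_size=0.5, random_state=0)
X_lr, X_test, y_lr, y_test = train_test_split(X_rest, y_rest, test_size=0.5, random_state=0)

gbc = GradientBoostingClassifier(n_estimators=100, random_state=0)
gbc.fit(X_gbdt, y_gbdt)

# For a classifier, apply() returns (n_samples, n_estimators, n_classes);
# the trailing axis is dropped before one-hot encoding the leaf indices.
enc = OneHotEncoder()
enc.fit(gbc.apply(X_gbdt)[:, :, 0])

lr = LogisticRegression(max_iter=1000)
lr.fit(enc.transform(gbc.apply(X_lr)[:, :, 0]), y_lr)

proba = lr.predict_proba(enc.transform(gbc.apply(X_test)[:, :, 0]))[:, 1]
print("GBDT leaves + LogisticRegression AUC:", roc_auc_score(y_test, proba))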
Example #2
def check_boston(presort, loss, subsample):
    # Check consistency on dataset boston house prices with least squares
    # and least absolute deviation.
    ones = np.ones(len(boston.target))
    last_y_pred = None
    for sample_weight in None, ones, 2 * ones:
        clf = GradientBoostingRegressor(n_estimators=100,
                                        loss=loss,
                                        max_depth=4,
                                        subsample=subsample,
                                        min_samples_split=2,
                                        random_state=1,
                                        presort=presort)

        assert_raises(ValueError, clf.predict, boston.data)
        clf.fit(boston.data, boston.target,
                sample_weight=sample_weight)
        leaves = clf.apply(boston.data)
        assert_equal(leaves.shape, (506, 100))

        y_pred = clf.predict(boston.data)
        mse = mean_squared_error(boston.target, y_pred)
        assert_less(mse, 6.0)

        if last_y_pred is not None:
            assert_array_almost_equal(last_y_pred, y_pred)

        last_y_pred = y_pred
def test_regression_dataset(loss, subsample):
    # Check consistency on regression dataset with least squares
    # and least absolute deviation.
    ones = np.ones(len(y_reg))
    last_y_pred = None
    for sample_weight in [None, ones, 2 * ones]:
        reg = GradientBoostingRegressor(
            n_estimators=100,
            loss=loss,
            max_depth=4,
            subsample=subsample,
            min_samples_split=2,
            random_state=1,
        )

        reg.fit(X_reg, y_reg, sample_weight=sample_weight)
        leaves = reg.apply(X_reg)
        assert leaves.shape == (500, 100)

        y_pred = reg.predict(X_reg)
        mse = mean_squared_error(y_reg, y_pred)
        assert mse < 0.04

        if last_y_pred is not None:
            # FIXME: We temporarily bypass this test. This is due to the fact
            # that GBRT with and without `sample_weight` do not use the same
            # implementation of the median during the initialization with the
            # `DummyRegressor`. In the future, we should make sure that both
            # implementations should be the same. See PR #17377 for more.
            # assert_allclose(last_y_pred, y_pred)
            pass

        last_y_pred = y_pred
def test_boston():
    # Check consistency on dataset boston house prices with least squares
    # and least absolute deviation.
    for loss in ("ls", "lad", "huber"):
        for subsample in (1.0, 0.5):
            last_y_pred = None
            for i, sample_weight in enumerate(
                    (None, np.ones(len(boston.target)),
                     2 * np.ones(len(boston.target)))):
                clf = GradientBoostingRegressor(n_estimators=100, loss=loss,
                                                max_depth=4, subsample=subsample,
                                                min_samples_split=1,
                                                random_state=1)

                assert_raises(ValueError, clf.predict, boston.data)
                clf.fit(boston.data, boston.target,
                        sample_weight=sample_weight)
                leaves = clf.apply(boston.data)
                assert_equal(leaves.shape, (506, 100))
                
                y_pred = clf.predict(boston.data)
                mse = mean_squared_error(boston.target, y_pred)
                assert mse < 6.0, "Failed with loss %s and " \
                    "mse = %.4f" % (loss, mse)

                if last_y_pred is not None:
                    np.testing.assert_array_almost_equal(
                        last_y_pred, y_pred,
                        err_msg='pred_%d doesnt match last pred_%d for loss %r and subsample %r. '
                        % (i, i - 1, loss, subsample))

                last_y_pred = y_pred
Example #6
class myStackingFeaturesRegressor(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.estimator = None
        self.lgb = GradientBoostingRegressor(loss='ls',
                                             alpha=0.9,
                                             n_estimators=100,
                                             learning_rate=0.01,
                                             max_depth=8,
                                             subsample=0.8,
                                             min_samples_split=9,
                                             max_leaf_nodes=10)
        self.grd_enc = OneHotEncoder()
        self.lr = RidgeCV()
        self.classes_ = [-1, 1]

    def fit(self, X, y=None, **fit_params):
        self.lgb.fit(X, y)
        self.grd_enc.fit(self.lgb.apply(X))
        self.lr.fit(self.grd_enc.transform(self.lgb.apply(X)), y)
        return self  # scikit-learn estimators are expected to return self from fit()

    def predict(self, X):
        return self.lr.predict(self.grd_enc.transform(self.lgb.apply(X)))
def test_gbm_regressor_backupsklearn(backend='auto'):
    df = pd.read_csv("./open_data/simple.txt", delim_whitespace=True)
    X = np.array(df.iloc[:, :df.shape[1] - 1], dtype='float32', order='C')
    y = np.array(df.iloc[:, df.shape[1] - 1], dtype='float32', order='C')
    import h2o4gpu
    Solver = h2o4gpu.GradientBoostingRegressor

    # Run the h2o4gpu version of Gradient Boosting Regression
    gbm = Solver(backend=backend, random_state=1234)
    print("h2o4gpu fit()")
    gbm.fit(X, y)

    # Run the scikit-learn version of Gradient Boosting Regression
    from sklearn.ensemble import GradientBoostingRegressor
    gbm_sk = GradientBoostingRegressor(random_state=1234, max_depth=3)
    print("Scikit fit()")
    gbm_sk.fit(X, y)

    if backend == "sklearn":
        assert (gbm.predict(X) == gbm_sk.predict(X)).all() == True
        print(
            (a == b
             for a, b in zip(gbm.staged_predict(X), gbm_sk.staged_predict(X))))
        assert np.allclose(list(gbm.staged_predict(X)),
                           list(gbm_sk.staged_predict(X)))
        assert (gbm.score(X, y) == gbm_sk.score(X, y)).all() == True
        assert (gbm.apply(X) == gbm_sk.apply(X)).all() == True

        print("Estimators")
        print(gbm.estimators_)
        print(gbm_sk.estimators_)

        print("loss")
        print(gbm.loss_)
        print(gbm_sk.loss_)
        assert gbm.loss_.__dict__ == gbm_sk.loss_.__dict__

        print("init_")
        print(gbm.init)
        print(gbm_sk.init)

        print("Feature importance")
        print(gbm.feature_importances_)
        print(gbm_sk.feature_importances_)
        assert (gbm.feature_importances_ == gbm_sk.feature_importances_
                ).all() == True

        print("train_score_")
        print(gbm.train_score_)
        print(gbm_sk.train_score_)
        assert (gbm.train_score_ == gbm_sk.train_score_).all() == True
Example #9
def test_boston():
    # Check consistency on dataset boston house prices with least squares
    # and least absolute deviation.
    for loss in ("ls", "lad", "huber"):
        for subsample in (1.0, 0.5):
            last_y_pred = None
            for i, sample_weight in enumerate(
                (None, np.ones(len(boston.target)),
                 2 * np.ones(len(boston.target)))):
                clf = GradientBoostingRegressor(n_estimators=100,
                                                loss=loss,
                                                max_depth=4,
                                                subsample=subsample,
                                                min_samples_split=1,
                                                random_state=1)

                assert_raises(ValueError, clf.predict, boston.data)
                clf.fit(boston.data,
                        boston.target,
                        sample_weight=sample_weight)
                leaves = clf.apply(boston.data)
                assert_equal(leaves.shape, (506, 100))

                y_pred = clf.predict(boston.data)
                mse = mean_squared_error(boston.target, y_pred)
                assert mse < 6.0, "Failed with loss %s and " \
                    "mse = %.4f" % (loss, mse)

                if last_y_pred is not None:
                    np.testing.assert_array_almost_equal(
                        last_y_pred,
                        y_pred,
                        err_msg=
                        'pred_%d doesnt match last pred_%d for loss %r and subsample %r. '
                        % (i, i - 1, loss, subsample))

                last_y_pred = y_pred
Example #10
def on_data(context):
    context.Num = context.Num + 1
    if context.Num < context.Len:  # if the number of trading days is less than Len+1, move on to the next trading day of the backtest
        return
    if datetime.datetime.strftime(
            context.now,
            '%Y-%m-%d') not in context.month_begin:  # rebalance monthly, at the beginning of each month
        return

    # Fetch data:
    KData = get_reg_kdata(reg_idx=context.reg_kdata[0],
                          length=context.Len,
                          fill_up=True,
                          df=True)
    FData = get_reg_factor(reg_idx=context.reg_factor[0],
                           target_indices=[x for x in range(300)],
                           length=context.Len,
                           df=True)  # fetch factor data

    # Feature construction:
    Fcode = context.FactorCode  # the label does not need a factor code

    # Data storage variables:
    # the close field provides the label; Fcode lists the factor (feature) columns
    FactorData = pd.DataFrame(columns=(['idx', 'benefit'] +
                                       Fcode))  # training features and labels
    FactorDataTest = pd.DataFrame(columns=(['idx'] + Fcode))  # prediction features

    # Align the K-line (candlestick) data indices
    tempIdx = KData[KData['time'] == KData['time']
                    [0]]['target_idx'].reset_index(drop=True)

    # Process the data per instrument:
    for i in range(300):
        # Build the training features and training label:
        # temporary storage variables:
        FactorData0 = pd.DataFrame(np.full([1, len(Fcode) + 2], np.nan),
                                   columns=(['idx', 'benefit'] + Fcode))
        # stores the prediction feature sample
        FactorDataTest0 = pd.DataFrame(np.full([1, len(Fcode) + 1], np.nan),
                                       columns=(['idx'] + Fcode))

        # Align the factor data indices and extract the current instrument's factor data
        FData0 = FData[FData['target_idx'] == tempIdx[i]].reset_index(
            drop=True)

        # Process the data per feature:
        for FC in context.FactorCode:
            # extract the part of the current instrument matching factor FC
            FCData = FData0[FData0['factor'] == FC]['value'].reset_index(
                drop=True)
            FactorData0[FC] = FCData[0]  # factor value at the start of the previous month

        # Build the label:
        # extract the previous month's K-line panel data of the current instrument
        close = np.array(KData[KData['target_idx'] == tempIdx[i]]['close'])
        # compute the instrument's return over the previous month
        benefit = (close[context.Len - 1] - close[0]) / close[0]

        FactorData0['benefit'] = benefit
        # idx: index of the current instrument within the training sample set
        FactorData0['idx'] = tempIdx[i]
        # merge into the training sample set
        FactorData = FactorData.append(FactorData0, ignore_index=True)

        # Build the prediction feature set: set the instrument index
        FactorDataTest0['idx'] = tempIdx[i]
        # process per feature, same procedure as for the training features
        for FC in context.FactorCode:
            FCData = FData0[FData0['factor'] == FC]['value'].reset_index(
                drop=True)
            FactorDataTest0[FC] = FCData[context.Len - 1]

        # merge the test data
        FactorDataTest = FactorDataTest.append(FactorDataTest0,
                                               ignore_index=True)
    """
    训练集和测试集的表头字段如下
    FactorData DataFrame:
    idx  |  benefit |  Factor 1 | Factor 2| ....
    benefit 作为标签,上月初Factor作为特征,此处是单因子测试,只有一个特征
    FactorDataTest DataFrame: 
    idx | Factor 1 | Factor 2 | ...
    本月初的因子作为预测特征
    """

    # Data cleaning:
    FactorData = FactorData.dropna(axis=0,
                                   how='any').reset_index(drop=True)  # drop rows with NaNs
    FactorDataTest = FactorDataTest.dropna(axis=0, how='any').reset_index(
        drop=True)  # drop rows with NaNs
    Idx = FactorDataTest['idx']  # indices of the remaining instruments

    # Preprocess each feature
    for Factor in context.FactorCode:
        FactorData = filter_MAD(FactorData, Factor, 5)  # median de-extreming (MAD method)
        FactorData[Factor] = preprocessing.scale(FactorData[Factor])  # standardize

        FactorDataTest = filter_MAD(FactorDataTest, Factor, 5)  # median de-extreming (MAD method)
        FactorDataTest[Factor] = preprocessing.scale(
            FactorDataTest[Factor])  # standardize

    # print(FactorData.head(1))
    # print(FactorDataTest.head(1))

    # Build the training and prediction feature matrices: rows (samples) x columns (features)
    X = np.ones([FactorData.shape[0], len(Fcode)])
    Xtest = np.ones([FactorDataTest.shape[0], len(Fcode)])

    # fill the features into the numpy arrays
    for i in range(X.shape[1]):
        X[:, i] = FactorData[Fcode[i]]
        Xtest[:, i] = FactorDataTest[Fcode[i]]

    # Training labels: whether the (floating-point) return over the month is positive
    Y = (np.array(FactorData['benefit']).astype(float) > 0)

    SVM = svm.SVR(gamma='scale')

    gbr = GradientBoostingRegressor()
    gbr.fit(X, Y)
    enc = OneHotEncoder()
    enc.fit(gbr.apply(X))

    new_X = enc.transform(gbr.apply(X))
    new_X = new_X.toarray()

    X = new_X

    new_Xtest = enc.transform(gbr.apply(Xtest))
    new_Xtest = new_Xtest.toarray()
    Xtest = new_Xtest

    # Model training:
    SVM.fit(X, Y)

    y = SVM.predict(Xtest)
    # Trading setup:
    positions = context.account().positions['volume_long']  # long position volumes
    valid_cash = context.account(account_idx=0).cash['valid_cash'][0]  # available cash

    P = context.cash_rate / (sum(y > 0) + 1)  # available-cash fraction per instrument; + 1 avoids a zero denominator

    # get the high and low percentiles of the predicted returns
    low_return, high_return = np.percentile(
        y, [context.down_pos, context.upper_pos])

    for i in range(len(Idx)):
        position = positions.iloc[Idx[i]]
        # if position == 0 and y[i] == True and valid_cash > 0:  # if the prediction is True (return > 0), buy
        # print('open position')
        if position == 0 and y[i] > high_return and valid_cash > 0:
            # order size; + 1 avoids a zero denominator
            # print(valid_cash, P, KData['close'][Idx[i]])  # this size could be reduced a bit; sometimes too large, sometimes too small
            Num = int(
                math.floor(valid_cash * P / 100 /
                           (KData['close'][Idx[i]] + 1)) * 100)

            # keep the order size reasonable (not too large or too small) and a multiple of 100
            if Num < 1000:
                Num *= 10
            if Num > 100000:
                Num = int(Num / 10)
                Num -= Num % 100
            if Num <= 0:  # do not open a position
                continue

            print("开仓数量为:{}".format(Num))
            order_id = order_volume(account_idx=0,
                                    target_idx=int(Idx[i]),
                                    volume=Num,
                                    side=1,
                                    position_effect=1,
                                    order_type=2,
                                    price=0)  # open a position with the specified volume
            # set a stop loss on order order_id, 10 points away; when triggered, submit a market order
            # stop_loss_by_order(target_order_id=order_id, stop_type=1, stop_gap=10, order_type=2)
        # elif position > 0 and y[i] == False:  # prediction is False (return < 0), sell
        elif position > 0 and y[i] < low_return:  # holding and this stock's return is below the lower (60%) percentile: close the position and sell
            print("Closing position, volume: {}".format(position / 10))
            order_volume(account_idx=0,
                         target_idx=int(Idx[i]),
                         volume=int(position / 10),
                         side=2,
                         position_effect=2,
                         order_type=2,
                         price=0)  # close the position with the specified volume
Example #11
x_test=test[["Age","Fare","SibSp","Parch"]].fillna(0)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
gbdt_train_X,gbdt_train_y=train[["Age","Fare","SibSp","Parch"]],train["Survived"]
##clf =svm.SVC(gamma=0.001,C=100)
##,"Sex",'Age','SibSp'"Pclass"]
##clf=RandomForestClassifier(100)
##clf.fit(X_train, y_train)
##print(accuracy_score(
##    clf.predict(X_test),y_test))
##temp=clf.predict(x_test)

## Dataset construction
gbr = GradientBoostingRegressor()  # x[i0] is the training input, y[i0] the training target
gbr.fit(gbdt_train_X, gbdt_train_y)  # train the GBDT model
enc = OneHotEncoder()
enc.fit(gbr.apply(gbdt_train_X))  # turn leaf positions into one-hot (0/1) codes
new_feature_train=enc.transform(gbr.apply(gbdt_train_X))
new_feature_train=new_feature_train.toarray()
##For Adjust
print(len(new_feature_train[0]))
enc1= OneHotEncoder()
enc1.fit(train[["Pclass","Sex","IsAlone","IsChild","IsStrong"]])
new_feature_train1=enc1.transform(train[["Pclass","Sex","IsAlone","IsChild","IsStrong"]])
new_feature_train1=new_feature_train1.toarray()
new_train=np.concatenate([new_feature_train1,new_feature_train],axis=1)

new_feature_test=enc.transform(gbr.apply(x_test))
new_feature_test=new_feature_test.toarray()
##For Adjust
print(len(new_feature_test[0]))
new_feature_test1=enc1.transform(test[["Pclass","Sex","IsAlone","IsChild","IsStrong"]])
Example #12
# X, y = make_regression(random_state=0)
# X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

train_x, train_y, data = generator_data(data)

reg = GradientBoostingRegressor(loss='ls',
                                learning_rate=0.1,
                                random_state=0,
                                verbose=1,
                                n_estimators=300,
                                max_depth=3)
print(reg)
# for i in range(10):
reg.fit(train_x,train_y)

print('the whole parameter of the model : ',reg.get_params())
# GradientBoostingRegressor(random_state=0)
a=reg.apply(train_x)
print(a)
print(a.shape)
pre=reg.predict(train_x)
# print('Predict regression target for x :', pre)
# print(pre.shape)
r=reg.score(train_x, train_y)
print('Return the coefficient of determination R2 of the prediction : ',r)

loss=reg.loss_(train_y,pre)
print('loss is : ',loss)

re_index(observed_v=train_y,predicted_v=pre)

feature_importance=reg.feature_importances_
print(sum(feature_importance[1:6]),sum(feature_importance))
Example #13
 grd = GradientBoostingRegressor(
     loss='huber',
     learning_rate=0.04,
     n_estimators=params[run - 1],  # 100 is enough using this learning_rate
     max_depth=6,
     subsample=0.7,
     max_features=0.7,
     min_samples_leaf=1,
     verbose=0,
     random_state=2015,
 )
 grd_enc = OneHotEncoder()
 result = {}
 grd.fit(X_train, labels_train)
 grd_enc.fit(grd.apply(X_train))
 """
 etr = ExtraTreesRegressor(n_estimators=param_space_reg_skl_etr['n_estimators'],
                                   max_features=param_space_reg_skl_etr['max_features'],
                                   n_jobs=param_space_reg_skl_etr['n_jobs'],
                                   random_state=param_space_reg_skl_etr['random_state'])
 etr.fit(grd_enc.transform(grd.apply(X_train)),labels_train)
 Y_train = etr.predict(grd_enc.transform(grd.apply(X_train)))
 Y = etr.predict(grd_enc.transform(grd.apply(X_valid)))
 cutpoints = [2.8,3.8,4.5,4.9,5.5,6.2,6.8]
 res = minimize(minimize_quadratic_weighted_kappa,cutpoints,(Y_train,labels_train),method='Nelder-Mead')
 cutpoints = np.sort(res.x)
 kappa=minimize_quadratic_weighted_kappa(cutpoints,Y,labels_valid)
 """
 ridge = Ridge(alpha=2500)
 ridge.fit(grd_enc.transform(grd.apply(X_train)), labels_train)
LinReg_model.fit(train_data[features], Y)
linReg_score = cross_val_score(LinReg_model,
                               train_data[features],
                               Y,
                               cv=10,
                               scoring='r2').mean()
print("R2 score using Linear Regression is ", linReg_score * 100)
print("Linear reg coef", LinReg_model.coef_)
##Random Forest Regressor
##
##RanForest_model = RandomForestRegressor( random_state=0)
##RanForest_model.fit(train_data[features], Y)
##ranForest_score = cross_val_score(RanForest_model, train_data[features], Y, cv=10,scoring='r2').mean()
##print("R2 score using Random Forest Regression is ",ranForest_score*100)

##Gradient Boosting Regressor

GradBoost_model = GradientBoostingRegressor(max_depth=3,
                                            random_state=0,
                                            learning_rate=0.1,
                                            n_estimators=200)
GradBoost_model.fit(train_data[features], Y)
GradBoost_model.apply(train_data[features])
gradBoost_score = cross_val_score(GradBoost_model,
                                  train_data[features],
                                  Y,
                                  cv=10,
                                  scoring='r2').mean()
print("Feature Importance ", GradBoost_model.feature_importances_)
print("R2 score using Gradient Boosting Regressor is ", gradBoost_score * 100)
 """
 grd = GradientBoostingRegressor(
     loss = 'huber',
     learning_rate=0.04,
     n_estimators=params[run-1],# 100 is enough using this learning_rate
     max_depth=6,
     subsample=0.7,
     max_features=0.7,
     min_samples_leaf=1,
     verbose=0,
     random_state=2015,
 ) 
 grd_enc = OneHotEncoder()
 result ={}
 grd.fit(X_train,labels_train)
 grd_enc.fit(grd.apply(X_train))
 """
 etr = ExtraTreesRegressor(n_estimators=param_space_reg_skl_etr['n_estimators'],
                                   max_features=param_space_reg_skl_etr['max_features'],
                                   n_jobs=param_space_reg_skl_etr['n_jobs'],
                                   random_state=param_space_reg_skl_etr['random_state'])
 etr.fit(grd_enc.transform(grd.apply(X_train)),labels_train)
 Y_train = etr.predict(grd_enc.transform(grd.apply(X_train)))
 Y = etr.predict(grd_enc.transform(grd.apply(X_valid)))
 cutpoints = [2.8,3.8,4.5,4.9,5.5,6.2,6.8]
 res = minimize(minimize_quadratic_weighted_kappa,cutpoints,(Y_train,labels_train),method='Nelder-Mead')
 cutpoints = np.sort(res.x)
 kappa=minimize_quadratic_weighted_kappa(cutpoints,Y,labels_valid)
 """  
 ridge = Ridge(alpha=2500)
 ridge.fit(grd_enc.transform(grd.apply(X_train)),labels_train)
Example #16
def test_GradientBoost():

    X1 = np.arange(0, 10, 0.1)
    X2 = np.arange(10, 20, 0.1)

    y = np.sin(X1).ravel() + np.cos(X2).ravel()
    X_df = pd.DataFrame(np.array([X1, X2]).T, columns=['x1', 'x2'])

    gbr_regr = GradientBoostingRegressor(n_estimators=5000, max_depth=3)
    gbr_regr.fit(X_df, y)
    with StopWatch("LucidEnsemble Gradient Boost construction"):
        lucid_gbr = make_LucidEnsemble(gbr_regr,
                                       feature_names=X_df.columns,
                                       print_precision=3)

    with StopWatch("Scikit-learn Gradient Boost prediction"):
        gbr_pred = gbr_regr.predict(X_df)

    with StopWatch("Lucid Gradient Boost (non-compressed) prediction"):
        lucid_gbr_pred = lucid_gbr.predict(X_df)

    ######################################################
    # test prediction outputted from LucidEnsemble
    np.testing.assert_almost_equal(lucid_gbr_pred, gbr_pred)
    assert (np.all(gbr_regr.apply(X_df) == lucid_gbr.apply(X_df)))

    with StopWatch("Compression of Lucid Gradient Boost"):
        compressed_lucid_gbr = lucid_gbr.compress()
    print("{} unique nodes and {} # of estimators".format(
        compressed_lucid_gbr.n_leaves, len(lucid_gbr)))

    with StopWatch("Lucid Gradient Boost (compressed) prediction"):
        cgbr_pred = compressed_lucid_gbr.predict(X_df)

    ######################################################
    # test the compressed prediction
    np.testing.assert_almost_equal(cgbr_pred, gbr_pred)

    # test comparison, compare the leaves of two
    # LucidEnsembles made from the same arguments
    lucid_gbr2 = make_LucidEnsemble(gbr_regr,
                                    feature_names=X_df.columns,
                                    print_precision=3)
    compressed_lucid_gbr2 = lucid_gbr2.compress()

    assert (set(compressed_lucid_gbr.leaves) == set(
        compressed_lucid_gbr2.leaves))

    script_dir = os.path.dirname(__name__)
    ######################################################
    # test pickling functionality
    pickle_path = os.path.join(script_dir, 'lucid_gbr.pkl')
    with open(pickle_path, 'wb') as fh:
        pickle.dump(lucid_gbr, fh)
    with open(pickle_path, 'rb') as fh:
        lucid_gbr_pickle = pickle.load(fh)
        np.testing.assert_almost_equal(lucid_gbr_pickle.predict(X_df),
                                       lucid_gbr_pred)
    os.remove(pickle_path)

    pickle_path = os.path.join(script_dir, 'compressed_lucid_gbr.pkl')
    with open(pickle_path, 'wb') as fh:
        pickle.dump(compressed_lucid_gbr, fh)
    with open(pickle_path, 'rb') as fh:
        compressed_lucid_gbr_pickle = pickle.load(fh)
        np.testing.assert_almost_equal(
            compressed_lucid_gbr_pickle.predict(X_df), cgbr_pred)
    os.remove(pickle_path)
Example #17
gbrt.fit(x_train, y_train)

pred = gbrt.predict(x_test)

k = 0
for i in range(len(pred)):
    k = k + abs(pred[i] - y_test[i]) / (y_test[i])
print(1 - k / len(pred))

# sort importances
indices = np.argsort(gbrt.feature_importances_)
# plot as bar chart
plt.barh(np.arange(len(names)), gbrt.feature_importances_[indices])
plt.yticks(np.arange(len(names)) + 0.25, np.array(names)[indices])
_ = plt.xlabel('Relative importance')
#plt.show()

print(gbrt.feature_importances_[indices])

#print(gbdt.score(x_test,y_test))  # score on test data (accuracy)
print('###################################')
print(gbrt.apply(np.array(x_test)))
for i in range(len(pred)):
    print(pred[i], gbrt.apply(np.array(x_test))[i][0])
'''
for i in range(len(y_train)):
	print(gbdt.fit_transform(x_train,y_train)[i],y_train[i])
'''

# Preliminary conclusion: apply() expresses, as a vector, which leaf node each
# sample falls into, which can serve as a new feature.
# fit_transform() transforms the features of the training samples, acting
# somewhat like dimensionality reduction.
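The conclusion above can be illustrated with a short, self-contained regression sketch (the synthetic data and parameter values are illustrative): GradientBoostingRegressor.apply() returns one leaf index per tree for every sample, and one-hot encoding those indices produces a new feature matrix that a downstream linear model can use.

import numpy as np
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import Ridge
from sklearn.preprocessing import OneHotEncoder

rng = np.random.RandomState(0)
X = rng.uniform(size=(200, 3))
y = X[:, 0] + np.sin(6 * X[:, 1]) + 0.1 * rng.normal(size=200)

gbr = GradientBoostingRegressor(n_estimators=50, max_depth=3, random_state=0)
gbr.fit(X, y)

# For a regressor, apply() returns an array of shape (n_samples, n_estimators)
# holding the index of the leaf each sample ends up in for every tree.
leaves = gbr.apply(X)
enc = OneHotEncoder()
leaf_features = enc.fit_transform(leaves)  # sparse one-hot leaf indicators

ridge = Ridge(alpha=1.0)
ridge.fit(leaf_features, y)
print(leaves.shape, leaf_features.shape, ridge.score(leaf_features, y))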