Code Example #1
    def xgb_regressor(self, assign=True, **kwargs):
        """
        有监督学习回归器,默认使用:
                        GBR(n_estimators=100)
        通过**kwargs即关键字参数透传GBR(**kwargs),即:
                        GBR(**kwargs)

        注意导入使用:
            try:
                from xgboost.sklearn import XGBRegressor as GBR
            except ImportError:
                from sklearn.ensemble import GradientBoostingRegressor as GBR

        :param assign: 是否保存实例后的回归器对象,默认True,self.reg = reg
        :param kwargs: 有参数情况下初始化: GBR(n_estimators=100)
                       无参数情况下初始化: GBR(**kwargs)

        :return: 实例化的GBR对象
        """
        if kwargs:
            reg = GBR(**kwargs)
        else:
            reg = GBR(n_estimators=100)
        if assign:
            self.reg = reg
        return reg
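A minimal usage sketch for the method above, assuming a hypothetical host class MLStub (the original owning class is not shown in the snippet); the try/except import mirrors the docstring.

try:
    from xgboost.sklearn import XGBRegressor as GBR
except ImportError:
    from sklearn.ensemble import GradientBoostingRegressor as GBR


class MLStub:
    # Hypothetical stand-in for the class that owns xgb_regressor.
    def xgb_regressor(self, assign=True, **kwargs):
        reg = GBR(**kwargs) if kwargs else GBR(n_estimators=100)
        if assign:
            self.reg = reg
        return reg


ml = MLStub()
default_reg = ml.xgb_regressor()                              # GBR(n_estimators=100)
tuned_reg = ml.xgb_regressor(n_estimators=300, max_depth=4)   # GBR(**kwargs)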
Code Example #2
def main():
    features = [
        "response_excitedness", "response_happiness", "mode", "time_signature",
        "acousticness", "danceability", "energy", "instrumentalness",
        "liveness", "loudness", "speechiness", "valence", "tempo"
    ]

    data = link_features_mood(get_responses=True)

    train_set = []
    for song in data:
        print(song)  # debug output for the current track
        row = [song[feature] for feature in features]
        train_set.append(row)
    train_set = np.array(train_set).astype(float)

    energy = [elem[1] for elem in train_set]
    happiness = [elem[2] for elem in train_set]
    train_data = [elem[5:] for elem in train_set]

    excited_est = GBR(n_estimators=50, max_depth=3)
    excited_est.fit(train_data, energy)

    happy_est = GBR(n_estimators=50, max_depth=3)
    happy_est.fit(train_data, happiness)

    dump(excited_est, 'Retrained-Energy.joblib')
    dump(happy_est, 'Retrained-Happiness.joblib')
Code Example #3
File: param_tuner.py Project: Lambda-Rec/CI
 def _make_cate_predictions(self, trial: optuna.Trial,
                            i: int) -> np.ndarray:
     """Make predictions of CATE by a sampled set of hyperparameters."""
     # hyperparameters
     # for control model
     eta_con = trial.suggest_loguniform('eta_control', 1e-5, 1e-1)
     min_leaf_con = trial.suggest_int('min_samples_leaf_control', 1, 20)
     max_depth_con = trial.suggest_int('max_depth_control', 1, 20)
     subsample_con = trial.suggest_uniform('sub_sample_control', 0.1, 1.0)
     control_params = {
         'n_estimators': 100,
         'learning_rate': eta_con,
         'min_samples_leaf': min_leaf_con,
         'max_depth': max_depth_con,
         'subsample': subsample_con,
         'random_state': 12345
     }
     # for treated model
     eta_trt = trial.suggest_loguniform('eta_treat', 1e-5, 1e-1)
     min_leaf_trt = trial.suggest_int('min_samples_leaf_treat', 1, 20)
     max_depth_trt = trial.suggest_int('max_depth_treat', 1, 20)
     subsample_trt = trial.suggest_uniform('sub_sample_treat', 0.1, 1.0)
     treated_params = {
         'n_estimators': 100,
         'learning_rate': eta_trt,
         'min_samples_leaf': min_leaf_trt,
         'max_depth': max_depth_trt,
         'subsample': subsample_trt,
         'random_state': 12345
     }
     # for overall model
     eta_ova = trial.suggest_loguniform('eta_overall', 1e-5, 1e-1)
     min_leaf_ova = trial.suggest_int('min_samples_leaf_overall', 1, 20)
     max_depth_ova = trial.suggest_int('max_depth_overall', 1, 20)
     subsample_ova = trial.suggest_uniform('sub_sample_overall', 0.1, 1.0)
     overall_params = {
         'n_estimators': 100,
         'learning_rate': eta_ova,
         'min_samples_leaf': min_leaf_ova,
         'max_depth': max_depth_ova,
         'subsample': subsample_ova,
         'random_state': 12345
     }
     # define DAL model
     meta_learner = DAL(controls_model=GBR(**control_params),
                        treated_model=GBR(**treated_params),
                        overall_model=GBR(**overall_params))
     meta_learner.fit(X=self.Xtr[i], T=self.Ttr[i], Y=self.Ytr[i])
     return meta_learner.effect(X=self.Xval[i])
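A sketch of how a per-trial helper like this is usually driven from an Optuna study; the objective body, the fold index, and the MSE criterion below are assumptions, not part of the original snippet.

import numpy as np
import optuna

# `tuner` stands in for an instance of the class that owns _make_cate_predictions,
# and `cate_true_val` for validation-fold ground truth; both are placeholders.
def objective(trial: optuna.Trial) -> float:
    cate_pred = tuner._make_cate_predictions(trial, i=0)
    return float(np.mean((cate_pred - cate_true_val) ** 2))  # assumed MSE criterion

study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=50)
print(study.best_params)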
Code Example #4
def main(TRAIN_RATIO):
    ''' This script is used to train the Gradient Boosting Regressor. 
    The .joblib files (trained data) are used on the server for
    mood analysis. This file is only used offline.
    :param TRAIN_RATIO: Ratio used to train/test the data'''

    data = np.array(np.loadtxt(open("analyzed_tracks_1.csv", "rb"),
                               delimiter=",", skiprows=1, usecols=(1, 2, 3, 4, 5, 6, 7, 8, 9,
                                                                   10, 11, 12, 13, 14, 15)))

    trainset = []
    testset = []

    for item in data:
        if(np.random.uniform(0, 1) <= TRAIN_RATIO):
            trainset.append(item)
        else:
            testset.append(item)

    energy = [elem[0] for elem in trainset]
    happiness = [elem[1] for elem in trainset]
    traindata = [elem[4:] for elem in trainset]
    testdata = [elem[4:] for elem in testset]
    testE = [elem[0] for elem in testset]
    testH = [elem[1] for elem in testset]

    E_est = GBR(n_estimators=50, max_depth=3)
    E_est.fit(traindata, energy)
    H_est = GBR(n_estimators=50, max_depth=3)
    H_est.fit(traindata, happiness)

    # Create .joblib files to reuse the trained algorithm later.
    dump(E_est, 'Trained-Energy.joblib')
    dump(H_est, 'Trained-Happiness.joblib')

    E_pred = E_est.predict(testdata)
    H_pred = H_est.predict(testdata)

    # Determine absolute difference between predictions and actual values.
    E_sum = 0
    H_sum = 0

    for i in range(len(testE)):
        difE = abs(testE[i]-E_pred[i])
        difH = abs(testH[i]-H_pred[i])
        E_sum += difE
        H_sum += difH
    print(E_sum, H_sum)
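The docstring notes that the .joblib files are reused on the server; a minimal loading sketch, assuming joblib and a hypothetical feature_row with the same columns as traindata (elem[4:]).

from joblib import load

energy_model = load('Trained-Energy.joblib')
happiness_model = load('Trained-Happiness.joblib')
# feature_row is a hypothetical single sample shaped like one row of traindata.
predicted_energy = energy_model.predict([feature_row])[0]
predicted_happiness = happiness_model.predict([feature_row])[0]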
Code Example #5
File: gbr.py Project: xuedong/mangaki
    def fit(self, X, y):
        if self.T is None:
            self.load_tags()

        self.als = MangakiALS(self.nb_components)
        try:
            self.als.load(self.als.get_backup_filename())
        except Exception:  # fall back to fitting ALS from scratch if the backup cannot be loaded
            self.als.set_parameters(self.nb_users, self.nb_works)
            self.als.fit(X, y)
            self.als.compute_all_errors(X, y, X, y)

        self.chrono.save('fit ALS model')

        X_full = self.prepare_features(X, self.als.U, self.als.VT.T)

        self.chrono.save('build features')

        self.gbr = GBR(n_estimators=self.nb_estimators)
        self.gbr.fit(X_full, y)
        logging.debug('feature_importances=%s',
                      str(self.gbr.feature_importances_))
        logging.debug('train_score=%s', str(self.gbr.train_score_))

        self.chrono.save('fit GBR model')
Code Example #6
def find_best_feature(feature_name, cv_fold, train_data, train_label):
    # To find the best feature combination, average the CV scores of three models (LGBMRegressor, XGBRegressor and GBR); the mean represents the score for this feature set.
    get_ans_face = feature_name
    new_lgb_model = lgb.LGBMRegressor(n_estimators=300, random_state=1)
    cv_model = cv(new_lgb_model,
                  train_data[get_ans_face],
                  train_label,
                  cv=cv_fold,
                  scoring='r2')
    new_lgb_model.fit(train_data[get_ans_face], train_label)
    m1 = cv_model.mean()

    new_xgb_model1 = xgb.XGBRegressor(n_estimators=300, random_state=1)
    cv_model = cv(new_xgb_model1,
                  train_data[get_ans_face].values,
                  train_label,
                  cv=cv_fold,
                  scoring='r2')
    new_xgb_model1.fit(train_data[get_ans_face].values, train_label)
    m2 = cv_model.mean()

    new_gbc_model = GBR(n_estimators=310)
    cv_model = cv(new_gbc_model,
                  train_data[get_ans_face].values,
                  train_label,
                  cv=cv_fold,
                  scoring='r2')
    new_gbc_model.fit(train_data[get_ans_face].values, train_label)
    m3 = cv_model.mean()
    return (m1 + m2 + m3) / 3
Code Example #7
def train_model():
    data = get_data()
    X_train, X_test, y_train, y_test = split_data(data)
    X_train, y_train = remove_county_state(X_train, y_train)
    X_test, y_test = remove_county_state(X_test, y_test)

    # data preprocessing (removing mean and scaling to unit variance with StandardScaler)
    pipeline = make_pipeline(StandardScaler(), GBR())

    # set hyperparameters
    hyperparameters = {
        'gradientboostingregressor__n_estimators': [100, 600, 700, 800],
        'gradientboostingregressor__max_depth': [3, 4, 5, 10, 20],
        'gradientboostingregressor__min_samples_split': [3, 4, 5, 10, 20],
        'gradientboostingregressor__learning_rate': [0.01, 0.05, 0.1],
        'gradientboostingregressor__loss': ['ls'],
    }

    # tune model via pipeline
    clf = GridSearchCV(pipeline, hyperparameters, cv=3)

    clf.fit(X_train, y_train)

    pred = clf.predict(X_test)
    # print('feature importances:', clf.feature_importances_)
    print('r2 score:', r2_score(y_test, pred))
    print('mse:', mean_squared_error(y_test, pred))
    print('*' * 20)
    print('best params:', clf.best_params_)
    print('best grid:', clf.best_estimator_)
    print('^' * 20)
    eval_model(clf.best_estimator_, X_train, y_train, X_test, y_test)
    print('#' * 20)
    print('score', clf.score(X_test, y_test))
    return clf
Code Example #8
File: SKLearnSuite.py Project: Igerald/Python-Tools
 def Gradient(self,Results='',TestSet=False):
     G = GBR(loss='ls', learning_rate=0.1, n_estimators=100, subsample=1.0, criterion='friedman_mse', min_samples_split=2,
             min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_depth=3, min_impurity_split=None, init=None, random_state=None,
             max_features=None, alpha=0.9, verbose=0, max_leaf_nodes=None, warm_start=False, presort='auto')
     if TestSet==False:
         GResult = G.fit(self.X,np.ravel(self.y,1))
         if Results==True:
             print(str(GResult.score(self.X,np.ravel(self.y,1))) + '\n' + str(GResult.get_params()))
         plt.plot(G.fit(self.X,np.ravel(self.y,1)).predict(self.X))
         y = np.array(self.y[self.DVCols])
         plt.plot(y,'ro')
         plt.show()
     else:
         x_train = self.X[:len(self.X)//2]
         y_train = np.ravel(self.y,1)[:len(self.y)//2]
         x_test = self.X[len(self.X)//2:]
         y_test = np.ravel(self.y,1)[len(self.y)//2:]
         GResult = G.fit(x_train,y_train)
         if Results==True:
             print(str(GResult.score(self.X,np.ravel(self.y,1))) + '\n' + str(GResult.get_params()))
         GRPredict = GResult.predict(x_test)
         plt.plot(GRPredict,polyval(polyfit(GRPredict,y_test.reshape(-1),1),GRPredict),'r-',label='predicted')
         plt.plot(GRPredict,y_test.reshape(-1),'bo')
         plt.legend()
         plt.show()
Code Example #9
def gradient_boosting_regressor(trainX, y_train):
    model = GBR(n_estimators=300,
                learning_rate=0.1,
                max_depth=8,
                random_state=777,
                loss='ls')
    model.fit(trainX.iloc[:, ~trainX.columns.str.match("y")], y_train)
    return model
Code Example #10
def model(X_train,
          y_train,
          X_test=np.array([]),
          y_test=np.array([]),
          method="LR"):
    # X_train: model inputs for training
    # X_test: model inputs for testing
    # y_train: outputs for X_train
    # y_test: outputs for X_test
    # method: model type; the default is linear regression

    if method == "LR":
        lr = LR()
    elif method == "Ridge":
        lr = Ridge()
    elif method == "Lasso":
        lr = Lasso()
    elif method == "MLPRegressor":
        lr = MLPRegressor()
    elif method == "SVR":
        lr = SVR()
    elif method == "KNR":
        lr = KNR()
    elif method == "RFR":
        lr = RFR()
    elif method == "GBR":
        lr = GBR()
    else:
        print("unknown method")
        return False

#    lr = MLPRegressor( hidden_layer_sizes=[5], activation ="relu")
#    lr = MLPRegressor()
#    lr=SVR()
#    lr=KNR()
#
#    lr=Ridge(alpha=alpha.x)
#    lr=Ridge()
#    lr=Lasso(alpha=0.001)

#    lr=Lasso()

#    lr=RFR(n_estimators=5, max_features=2, max_depth=2, random_state=2)
#    lr=RFR()

#    lr=GBR()

    lr = lr.fit(X_train, y_train[:, 0])
    y_mod_train = lr.predict(X_train)
    c_train = CCC(y_train, y_mod_train[:, np.newaxis])

    c_test = -1
    if len(y_test) > 0:
        y_mod_test = lr.predict(X_test)
        c_test = CCC(y_test, y_mod_test[:, np.newaxis])

    return (lr, c_train, c_test)
Code Example #11
File: gptest.py Project: Aprilara/backup
def gbdtcv(n_estimators, min_samples_split,  max_depth):
    val = cross_val_score(
        GBR(n_estimators=int(n_estimators),
            min_samples_split=int(min_samples_split),
            max_depth=int(max_depth),
            random_state=2
        ),
        X_tr, y_tr, cv=2
    ).mean()
    return val
Code Example #12
File: GBDT_sklearn.py Project: KingJames777/Ensemble
def bestParas(X_train, y_train):
    pipeline = make_pipeline(preprocessing.StandardScaler(), GBR())
    hyperparameters = {
        'gradientboostingregressor__learning_rate': [0.1, 0.2],
        'gradientboostingregressor__max_depth': [3, 6, 9],
        'gradientboostingregressor__n_estimators': [50, 80],
        'gradientboostingregressor__subsample': [0.8, 0.9, 1.0]
    }
    gbr = GridSearchCV(pipeline, hyperparameters, cv=10).fit(X_train, y_train)
    return gbr
Code Example #13
def __ensemble_test(type, X_train, X_test, y_train, y_test):
    if type.lower() == 'gbr':
        reg = GBR(n_estimators=100, random_state=1)
    elif type.lower() == 'rfr':
        reg = RFR(n_estimators=100, random_state=1)
    elif type.lower() == 'abr':
        reg = ABR(n_estimators=100, random_state=1)
    elif type.lower() == 'etr':
        reg = ETR(n_estimators=100, random_state=1)
    else:
        raise ValueError('unknown ensemble type: %s' % type)
    reg.fit(X_train, y_train)
    return reg, reg.score(X_test, y_test), reg.feature_importances_
Code Example #14
    def GBDT(self, n, step):
        best_params = {
            'n_estimators': 1000,
            'max_depth': 10,
            'min_samples_split': 2,
            'learning_rate': 0.01,
            'loss': 'huber'
        }
        params_high = {
            'n_estimators': 1000,
            'max_depth': 10,
            'min_samples_split': 2,
            'learning_rate': 0.01,
            'loss': 'huber'
        }
        model_low = GBR()
        print(self.txl.shape, self.tyllog.shape, self.vxl.shape)
        model_low.fit(self.txl, self.tyllog)
        self.y_pre_train_log = model_low.predict(self.txl).reshape(-1, 1)
        self.y_pre_train = [
            10**x for x in model_low.predict(self.txl).reshape(-1, 1)
        ]
        self.y_pre_valid_log = model_low.predict(self.vxl).reshape(-1, 1)
        self.y_pre_valid = [
            10**x for x in model_low.predict(self.vxl).reshape(-1, 1)
        ]

        model_high = GBR()
        model_high.fit(self.txh, self.tyhlog)
        self.y_pre_train_log = np.r_[
            self.y_pre_train_log,
            model_high.predict(self.txh).reshape(-1, 1)]
        self.y_pre_train = np.r_[
            self.y_pre_train,
            np.exp(model_high.predict(self.txh).reshape(-1, 1))]
        self.y_pre_valid_log = np.r_[
            self.y_pre_valid_log,
            model_high.predict(self.txh).reshape(-1, 1)]
        self.y_pre_valid = np.r_[
            self.y_pre_valid,
            np.exp(model_high.predict(self.vxh).reshape(-1, 1))]
Code Example #15
 def GradientBoosting_regression(self, n, step):
     params = {
         'n_estimators': n,
         'max_depth': step,
         'min_samples_split': 2,
         'learning_rate': 0.01,
         'loss': 'lad'
     }
     gbr = GBR(**params)
     gbr.fit(self.train_X, self.train_y)
     self.y_pre_train = gbr.predict(self.train_X)
     self.y_pre_test = gbr.predict(self.test_X)
Code Example #16
 def train(self,
           zone,
           num,
           hidden_layer_size=(4,),
           n_jobs=1,
           kernel='rbf',
           n_components=15,
           n_estimators=50,
           loss='linear',
           learning_rate=1.0,
           host='127.0.0.1'):
     f = fd(host)
     input_set = f.getTrainData(zone)
     x_train, x_test, y_train, y_test, scaler, pca = self.read_dataset(
         input_set, n_components)
     if num == 1:
         #Linear Regression
         clf = LinearRegression(n_jobs=n_jobs)
         clf.fit(x_train, y_train)
         # storeObj(clf,zone,clf.score(x_test,y_test),'Linear Regression')
         return clf, clf.score(x_test,
                               y_test), 'Linear Regression', scaler, pca
     elif num == 2:
         # SVR sigmoid
         clf = svm.SVR(kernel=kernel)
         clf.fit(x_train, y_train)
         # storeObj(clf, zone, clf.score(x_test, y_test), 'SVR'+','+kernel)
         return clf, clf.score(x_test, y_test), 'SVR' + kernel, scaler, pca
     elif num == 3:
         #Neural Net
         clf = mlpr(hidden_layer_sizes=hidden_layer_size)
         clf.fit(x_train, y_train)
         size_str = ''
         for i in hidden_layer_size:
             size_str += '-> {}'.format(i)
         # storeObj(clf, zone, clf.score(x_test, y_test), 'NeuralNet'+' hidden layer size'+hidden_layer_size)
         return clf, clf.score(
             x_test, y_test), 'NeuralNet hidden_size' + size_str, scaler, pca
     elif num == 4:
         #Gradient Boosting Regressor
         clf = GBR(loss=loss,
                   n_estimators=n_estimators,
                   learning_rate=learning_rate)
         clf.fit(x_train, y_train)
         # storeObj(clf, zone, clf.score(x_test, y_test), 'Gradient Boosting Regressor')
         return clf, clf.score(
             x_test, y_test), 'Gradient Boosted Regressor', scaler, pca
     elif num == 5:
         clf = ABR()
         clf.fit(x_train, y_train)
         # storeObj(clf, zone, clf.score(x_test, y_test), 'AdaBoost Regressor')
         return clf, clf.score(x_test,
                               y_test), 'AdaBoost Regressor', scaler, pca
Code Example #17
def ratio_test():
    df = pd.DataFrame(columns=['ratio', 'score'])
    for i in range(1, 100):
        X_train, X_test, y_train, y_test, predict_X, features = pre.raw_preprocessing(
            i / 100)
        reg = GBR(random_state=1)
        reg.fit(X_train, y_train)
        df = df.append(pd.DataFrame(
            [[1 - i / 100, reg.score(X_test, y_test)]], columns=df.columns),
                       ignore_index=True)
    plt.plot(df['ratio'], df['score'], 'k.-')
    plt.xlabel('train_set_ratio')
    plt.ylabel('score')
    plt.savefig('ratio_score.png')
    df.to_csv('ratio.csv', index=None)
Code Example #18
def goal(hyper):

    modelo = GBR(n_estimators=int(hyper['n_estimators']),
                 learning_rate=hyper['learning_rate'],
                 subsample=hyper['subsample'],
                 alpha=hyper['alpha'],
                 validation_fraction=hyper['validation_fraction'])

    eval_set = [(X_train, y_train), (X_test, y_test)]

    modelo.fit(X_train, y_train)

    y_pred = modelo.predict(X_test)

    rmse = mse(y_test, y_pred)**0.5

    return {'loss': rmse, 'status': STATUS_OK}
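A sketch of wiring this objective into hyperopt's fmin; the search-space bounds below are illustrative assumptions, not the values used in the original project.

from hyperopt import Trials, fmin, hp, tpe

# Illustrative search space over the parameters goal() reads from `hyper`.
space = {
    'n_estimators': hp.quniform('n_estimators', 100, 1000, 50),
    'learning_rate': hp.loguniform('learning_rate', -5, 0),
    'subsample': hp.uniform('subsample', 0.5, 1.0),
    'alpha': hp.uniform('alpha', 0.1, 0.9),
    'validation_fraction': hp.uniform('validation_fraction', 0.1, 0.3),
}

trials = Trials()
best = fmin(fn=goal, space=space, algo=tpe.suggest, max_evals=50, trials=trials)
print(best)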
Code Example #19
File: GBDT_sklearn.py Project: KingJames777/Ensemble
def trainMSEtest(data, index_train, index_test):
    X = data[:, :-1]
    y = data[:, -1]
    X_train = X[index_train]
    X_test = X[index_test]
    y_train = y[index_train]
    y_test = y[index_test]
    gbr = GBR(loss='ls',
              max_depth=6,
              n_estimators=80,
              subsample=0.6,
              learning_rate=0.08).fit(X_train, y_train)
    pred_test = gbr.predict(X_test)
    pred_train = gbr.predict(X_train)
    print(around(pred_test[:30]), '\n', y_test[:30])
    print('Test MSE:', mse(y_test, pred_test))
    print('Train MSE:', mse(y_train, pred_train))
Code Example #20
def train_cali_metrics(pred_probs, multi_labels, thres = 0.5):
    cal_features = []
    cal_labels = []
    for probs, ml in zip(pred_probs, multi_labels):
        prob_feat = get_prob_feat(probs, thres)
        #prior_feat = get_prior_feat(probs, thres) / float(priors_total)
        card_feat = get_card_feat(probs, thres)
        label_feat = get_label_feat(probs, thres)
        feature = [prob_feat, card_feat] + label_feat
        cal_label = check_same(probs, ml, 0.5)

        cal_features.append(feature)
        cal_labels.append(cal_label)
    
    gb = GBR(loss='ls', learning_rate=0.1, min_samples_leaf=5, n_estimators=100)
    gb.fit(cal_features, cal_labels)
    return gb
Code Example #21
def TrainModel_GBR(x, y):
    x_train, x_test, y_train, y_test = train_test_split(x,
                                                        y,
                                                        test_size=0.3,
                                                        random_state=0)
    #clf = GBR()
    clf = GBR(alpha=0.9, criterion='friedman_mse', init=None,
              learning_rate=0.03, loss='huber', max_depth=15,
              max_features='sqrt', max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=10, min_samples_split=40,
              min_weight_fraction_leaf=0.0, n_estimators=300,
              presort='auto', random_state=10, subsample=0.8, verbose=0,
              warm_start=False)
    clf.fit(x_train, y_train)
    y_pred = clf.predict(x_test)
    print("MSE:", metrics.mean_squared_error(y_test, y_pred))
    return clf
Code Example #22
File: test.py Project: Beathmart/ml
def test_gbdt():
    import numpy as np
    import pandas as pd
    from sklearn.datasets import load_boston
    dataset = load_boston()
    X, y, features = dataset['data'], dataset['target'], dataset[
        'feature_names']
    X = pd.DataFrame(X, columns=features)
    y = pd.DataFrame(y, columns=['target'])
    data = pd.concat([X, y], axis=1)

    features = data.columns[:-1]
    target = data.columns[-1]

    from sklearn.model_selection import train_test_split
    X_train, X_vali, y_train, y_vali = train_test_split(X,
                                                        y,
                                                        test_size=0.2,
                                                        random_state=25)
    print('X_train shape: ', X_train.shape)
    print('X_vali shape: ', X_vali.shape)
    print('y_train shape: ', y_train.shape)
    print('y_vali shape: ', y_vali.shape)

    from sklearn.tree import DecisionTreeRegressor as DTR
    dtr = DTR(max_depth=5)
    dtr.fit(X_train, y_train.values.reshape(-1))
    print('sklearn dtr score: ', dtr.score(X_vali, y_vali))

    from sklearn.ensemble import GradientBoostingRegressor as GBR
    import xgboost as xgb
    gbr = GBR(max_depth=5)
    gbr.fit(X_train, y_train)
    print('sklearn gbr score: ', gbr.score(X_vali, y_vali))

    from ml.tree import DecisionTreeRegressor
    mydtr = DecisionTreeRegressor(max_depth=5)
    mydtr.fit(X_train, y_train)
    print('my dtr score: ', mydtr.score(X_vali, y_vali))

    from ml.ensemble import GradientBoostingRegressor
    mygbr = GradientBoostingRegressor()
    mygbr.fit(X_train, y_train)
    print('my gbr score: ', mygbr.score(X_vali, y_vali))
Code Example #23
    def __init__(self, n_estimators = 2000, period = 12, savepath = 'Results', modeltype = None):

        """
        parameters
        """
        self.n_estimators = n_estimators
        self.timegap = period * 5
        self.savepath=savepath
        self.modeltype= modeltype

        # model save path
        self.weightspath = self.savepath + '/{}_GBR_model_{}_estimators.joblib'.format(self.modeltype, self.n_estimators)

        # containers for predictions
        self.train_pred = None
        self.test_pred = None

        # Design the model
        self.model = GBR(loss='ls', learning_rate=0.1,
                         validation_fraction=0.1, n_iter_no_change=300,
                         n_estimators=self.n_estimators)
Code Example #24
def gbr(data_dir, model_dir, features):
    X_train, X_test, y_train, y_test, predict_X, features = pre.drop_preprocessing(
        data_dir, features)
    os.chdir(model_dir)
    gbr = GBR(subsample=1, random_state=1)
    grid = GridSearchCV(estimator=gbr,
                        param_grid={
                            'loss': ['ls', 'lad', 'huber', 'quantile'],
                            'n_estimators': range(50, 311, 20)
                        },
                        cv=5)
    grid.fit(X_train, y_train)
    print(grid.best_params_)
    print(grid.best_estimator_.score(X_test, y_test))

    joblib.dump(
        grid.best_estimator_, 'gbr_%d_%.4f.m' %
        (len(features), grid.best_estimator_.score(X_test, y_test)))

    df = pd.DataFrame(columns=['pbe_bandgap', 'ml_bandgap'])
    df['pbe_bandgap'] = y_test
    df['ml_bandgap'] = grid.best_estimator_.predict(X_test)
    print(df)
    return grid.best_estimator_
Code Example #25
def analyzeMetricNumericalShell(metric, training, excludeFeatures):
    print('\nModeling', metric)
    
    X = training[training.columns - excludeFeatures]
    Y = training[metric]
    
    # To reproduce results, fix the random seed
    seed(1)
    
    features = X.columns
#    print(X.info())

#     as an array
    x = np.asanyarray(X)
    y = np.asanyarray(Y)

    # regressor predictions
    linearP = runCVNumerical(x, y, LinR)
    gbrP = runCVNumerical(x, y, GBR )
#    forestP = fitRandomForestRegressor(x, y)
    forestP = runCVNumerical(x, y, RFR)
    treeP = runCVNumerical(x, y, DTR)

    # RMSEs
    linearErr = rmse(y, linearP) 
    gbrErr = rmse(y, gbrP) 
    forestErr = rmse(y, forestP) 
    treeErr = rmse(y, treeP) 

    mappings = [
        { 'name' : "Linear Regression", 
          'algo' : LinR
        },
        { 'name' : "Gradient Boosting Regressor", 
          'algo' : GBR
        },
        { 'name' : "Random Forest Regressor", 
          'algo' : RFR
        },
        { 'name' : "Decision Tree Regressor", 
          'algo' : DTR
        }
    ]
    errors = [
        { 'name' : "Linear Regression", 
          'accuracy' : linearErr
        },
        { 'name' : "Gradient Boosting Regressor", 
          'accuracy' : gbrErr
        },
        { 'name' : "Random Forest Regressor", 
          'accuracy' : forestErr 
        },
        { 'name' : "Decision Tree Regressor", 
          'accuracy' : treeErr 
        }
    ]

    errors = sorted(errors, key=lambda k: k['accuracy'])     
    
    for error in errors:
        print(error['name'], ' scores an RMSE value of ', error['accuracy'])        
        
        
    theBest = min(errors, key=lambda x:x['accuracy'])['name']
    bestAlgo = next(d for (index, d) in enumerate(mappings) if d["name"] == theBest)['algo']
    bestErr = min(errors, key=lambda x:x['accuracy'])['accuracy']
    print('\nBest performer:')
    print(theBest, ' with an RMSE of ', bestErr)
    
    if theBest == 'Gradient Boosting Regressor':
        best = GBR(n_estimators=1000, learning_rate = 0.09, loss = 'ls', random_state = 652100, max_depth=2, subsample=0.8)
        fit = best.fit(X, Y)
        gbrPreds = runCVNumerical(x, y, GBR, n_estimators=1000, learning_rate = 0.09, loss = 'ls', 
                              random_state = 652100, max_depth=2, subsample=0.8 )
        gbrFinalErr = rmse(y, gbrPreds)
        print('test rmse by GBR = ', gbrFinalErr)
    elif theBest == 'Random Forest Regressor':
        best = RFR(n_estimators=800, random_state = 652100, max_features=0.2)
        fit = best.fit(X, Y)
        gbrPreds = runCVNumerical(x, y, RFR, n_estimators=800, random_state = 652100, max_features=0.2 )
        gbrFinalErr = rmse(y, gbrPreds)
        print('test rmse by RFR = ', gbrFinalErr)
  
    return ({ 'features' : features, 'predictions' : gbrPreds, 'model' : best })
Code Example #26
def analyzeMetricNumericalRMSE(metric, training, excludeFeatures):
    print('\nModeling', metric)
    
    X = training[training.columns - excludeFeatures]
    Y = training[metric]
    
    # To reproduce results, fix the random seed
    seed(1)
    
    features = X.columns
    print(X.info())

#     as an array
    x = np.asanyarray(X)
    y = np.asanyarray(Y)

    # regressor predictions
    linearP = runCVNumerical(x, y, LinR)
    gbrP = runCVNumerical(x, y, GBR )
#    forestP = fitRandomForestRegressor(x, y)
    forestP = runCVNumerical(x, y, RFR)
    treeP = runCVNumerical(x, y, DTR)

    # RMSEs
    linearErr = rmse(y, linearP) 
    gbrErr = rmse(y, gbrP) 
    forestErr = rmse(y, forestP) 
    treeErr = rmse(y, treeP) 

    mappings = [
        { 'name' : "Linear Regression", 
          'algo' : LinR
        },
        { 'name' : "Gradient Boosting Regressor", 
          'algo' : GBR
        },
        { 'name' : "Random Forest Regressor", 
          'algo' : RFR
        },
        { 'name' : "Decision Tree Regressor", 
          'algo' : DTR
        }
    ]
    errors = [
        { 'name' : "Linear Regression", 
          'accuracy' : linearErr
        },
        { 'name' : "Gradient Boosting Regressor", 
          'accuracy' : gbrErr
        },
        { 'name' : "Random Forest Regressor", 
          'accuracy' : forestErr 
        },
        { 'name' : "Decision Tree Regressor", 
          'accuracy' : treeErr 
        }
    ]

    errors = sorted(errors, key=lambda k: k['accuracy'])     
    
    for error in errors:
        print(error['name'], ' scores an RMSE value of ', error['accuracy'])        
        
        
    theBest = min(errors, key=lambda x:x['accuracy'])['name']
    bestAlgo = next(d for (index, d) in enumerate(mappings) if d["name"] == theBest)['algo']
    bestErr = min(errors, key=lambda x:x['accuracy'])['accuracy']
    print('\nBest performer:')
    print(theBest, ' with an RMSE of ', bestErr)

    best = GBR(n_estimators=1000, learning_rate = 0.09, loss = 'ls', random_state = 652100, max_depth=2, subsample=0.8)
    fit = best.fit(X, Y)
    gbrPreds = runCVNumerical(x, y, GBR, n_estimators=1000, learning_rate = 0.09, loss = 'ls', 
                              random_state = 652100, max_depth=2, subsample=0.8 )
    gbrFinalErr = rmse(y, gbrPreds)
    print('test rmse = ', gbrFinalErr)
    fi = best.feature_importances_ 
    sum_fi = {}
    for j in range( len( features ) ):
        if features[j] in sum_fi:
            sum_fi[features[j]] = sum_fi[features[j]] + fi[j] 
        else:
            sum_fi[features[j]] = fi[j] 
    
    sum_fi_list = [[key, sum_fi[key]] for key in sum_fi]
    sum_fi_list.sort( key = lambda x : x[1], reverse = True )
    
    return ({ 'features' : features, 'predictions' : gbrPreds, 'model' : best, 'importance' : sum_fi_list })
Code Example #27
def analyzeMetricNumerical(metric, training, excludeFeatures, tuning):
#    metric = 'Code 2'
#    print('Analyzing', metric, 'with\n', training.columns)
    
    X = training[training.columns - excludeFeatures]
    Y = training[metric]
    
    # To reproduce results, fix the random seed
    seed(1)
    
    features = X.columns
#    print(X.info())

#     as an array
    x = np.asanyarray(X)
    y = np.asanyarray(Y)

    # regressor predictions
    linearP = runCVNumerical(x, y, LinR)
    gbrP = runCVNumerical(x, y, GBR )
#    forestP = fitRandomForestRegressor(x, y)
    forestP = runCVNumerical(x, y, RFR)
    treeP = runCVNumerical(x, y, DTR)

    # RMSLEs
    linearErr = rmsle(y, linearP) 
    gbrErr = rmsle(y, gbrP) 
    forestErr = rmsle(y, forestP) 
    treeErr = rmsle(y, treeP) 

    mappings = [
        { 'name' : "Linear Regression", 
          'algo' : LinR
        },
        { 'name' : "Gradient Boosting Regressor", 
          'algo' : GBR
        },
        { 'name' : "Random Forest Regressor", 
          'algo' : RFR
        },
        { 'name' : "Decision Tree Regressor", 
          'algo' : DTR
        }
    ]
    errors = [
        { 'name' : "Linear Regression", 
          'accuracy' : linearErr
        },
        { 'name' : "Gradient Boosting Regressor", 
          'accuracy' : gbrErr
        },
        { 'name' : "Random Forest Regressor", 
          'accuracy' : forestErr 
        },
        { 'name' : "Decision Tree Regressor", 
          'accuracy' : treeErr 
        }
    ]
    
    print(errors, '\n\n')
    theBest = min(errors, key=lambda x:x['accuracy'])['name']
    bestAlgo = next(d for (index, d) in enumerate(mappings) if d["name"] == theBest)['algo']
    bestErr = min(errors, key=lambda x:x['accuracy'])['accuracy']
    print('Best performer = ', theBest, bestErr, bestAlgo)

#    theBest = 'Gradient Boosting Regressor'
#    bestAlgo = GBR

    # get the best performing algorithm
    if theBest == "Decision Tree Regressor":
        best = bestAlgo()
    else:
        if tuning:
            print('plot_partial_dependence...')
            best = GBR(n_estimators=300) # , subsample=0.8, max_features=8
#            fit_partial = best.fit(training, y)
#            fig, axs = plot_partial_dependence(fit_partial, training,
#                                   features=range(len(training.columns)),
#                                   feature_names=training.columns,
#                                   n_cols=2)
#            fig.set_size_inches(13,25)
#            plt.subplots_adjust(top=1.5)
#            fig.show()
            
            fit = best.fit(X, Y)            
#            fig2, axs2 = plot_partial_dependence(fit, X,
#                                   features=range(len(X.columns)),
#                                   feature_names=X.columns,
#                                   n_cols=2)
#            fig2.set_size_inches(13,20)
#            plt.subplots_adjust(top=1.5)
#            fig2.show()
        else:
            print('no tuning...')
            best = bestAlgo(n_estimators=300)
    #        search = GridSearchCV(best, param_grid, verbose=2)
            fit = best.fit(X, Y)
    #        print(fit.best_params_)
#            y_hat = best.predict(X) 
        
#    best = GBR(n_estimators=200)
#    fit = best.fit(X, Y)

    return ({ 'fit' : fit, 'features' : features, 'model' : best})
Code Example #28
    df1.drop([variable], axis=1, inplace=True)

Tips = df1['tip_amount']  #only y
df2 = df1.drop(['tip_amount'], axis=1)  #all the x
X_train, X_test, Y_train, Y_test = train_test_split(df2, Tips, test_size=0.20)

# Check shape
print(X_train.shape)
print(X_test.shape)
print(Y_train.shape)
print(Y_test.shape)

#Train your model
gbr = GBR(loss='ls',
          learning_rate=0.1,
          n_estimators=100,
          subsample=1,
          max_depth=6,
          verbose=1)
est = gbr.fit(X_train, Y_train)
#prediction experiment
y_test_pred = est.predict(X_test)
mean_squared_error(y_test_pred, Y_test)

#plot the train loss vs. iteration
n = np.arange(100) + 1
plt.plot(n, est.train_score_, 'r-')
plt.ylabel('Training Loss')
plt.xlabel('Iteration')

#Lowest mean square error is 0.1866
#Cross-validation and grid search could be used to tune the parameters in the GradientBoostingRegressor model.
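As the closing comment suggests, grid search over the same estimator is straightforward; a minimal sketch, where the parameter grid is an illustrative assumption rather than a tuned result.

from sklearn.model_selection import GridSearchCV

# Illustrative grid; values are assumptions, not tuned results.
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 6, 9],
    'learning_rate': [0.05, 0.1],
    'subsample': [0.8, 1.0],
}
search = GridSearchCV(GBR(loss='ls'), param_grid, cv=5,
                      scoring='neg_mean_squared_error')
search.fit(X_train, Y_train)
print(search.best_params_, -search.best_score_)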
Code Example #29
File: regression.py Project: youngsikwon/ngboost
        ngb_nll += [-forecast.logpdf(y_test.flatten()).mean()]
        
        #print(np.sqrt(mean_squared_error(forecast.loc, y_test)))
        #for idx, y_p, y_t in zip(test_index, list(forecast.loc), y_test):
        #    print(idx, y_t, y_p, np.abs(y_p - y_t))

        if args.verbose or True:
            print("[%d/%d] BestIter=%d RMSE: Val=%.4f Test=%.4f NLL: Test=%.4f" % (itr+1, args.n_splits,
                                                                                   best_itr, np.sqrt(val_rmse[best_itr-1]),
                                                                                   np.sqrt(mean_squared_error(forecast.loc, y_test)),
                                                                                   ngb_nll[-1]))

        #logger.tick(forecast, y_test)

        gbr = GBR(n_estimators=args.n_est,
                  learning_rate=args.lr,
                  subsample=args.minibatch_frac,
                  verbose=args.verbose)
        gbr.fit(X_train, y_train.flatten())
        y_pred = gbr.predict(X_test)
        forecast = NormalFixedVar(y_pred.reshape((1, -1)))

        y_gbm += list(y_pred.flatten())
        gbm_rmse += [np.sqrt(mean_squared_error(y_pred.flatten(), y_test.flatten()))]
    
        if args.verbose or True:
            print("[%d/%d] GBM RMSE=%.4f" % (itr+1, args.n_splits,
                                             np.sqrt(mean_squared_error(y_pred.flatten(), y_test.flatten()))))
        #gbrlog.tick(forecast, y_test)

    print('== RMSE GBM=%.4f +/- %.4f, NGB=%.4f +/- %.4f, NLL NGB=%.4f +/ %.4f' % (np.mean(gbm_rmse), np.std(gbm_rmse),
                                                                                  np.mean(ngb_rmse), np.std(ngb_rmse),
Code Example #30
# -*- coding: UTF-8 -*-
from sklearn.ensemble import GradientBoostingRegressor as GBR
import pandas as pd

trainData = pd.read_csv(r"K:\python\lesson6_experiment2\NSW_TRAIN.csv")
testData = pd.read_csv(r"K:\python\lesson6_experiment2\NSW_TEST.csv")
# X_train = trainData.loc[:, ['WEEK','HOLIDAY','Min_T','Max_T','AVG_T','RAIN']]  # 6 feature columns
X_train = trainData.loc[:, ['WEEK', 'Min_T', 'Max_T', 'AVG_T', 'RAIN']]  # 5 feature columns
y_train = trainData.iloc[:, -2]  # average load is the target
X_test = testData.loc[:, ['WEEK', 'Min_T', 'Max_T', 'AVG_T', 'RAIN']]
y_test = testData.iloc[:, -2]
gbr = GBR()
gbr.fit(X_train, y_train)
pre = gbr.predict(X_test)
score = gbr.score(X_test, y_test)
print(gbr.feature_importances_)
pre_ele = pd.DataFrame(pre, columns=['pre_avg'])
real_ele = pd.DataFrame(y_test)
after = pd.concat([pre_ele, real_ele], axis=1)
after['error'] = abs(after['AVG_ELE'] - after['pre_avg']) / after['pre_avg']
error_count = after[after['error'] < 0.05].shape[0]
print(error_count)