Example #1
def main(param=""):
    # obtain config
    if isinstance(param, str):
        param = JobConfig.load_from_file(param)

    data_guest = param["data_guest"]
    data_host = param["data_host"]
    data_test = param["data_test"]
    idx = param["idx"]
    label_name = param["label_name"]

    # prepare data
    df_guest = pd.read_csv(data_guest, index_col=idx)
    df_host = pd.read_csv(data_host, index_col=idx)
    df_test = pd.read_csv(data_test, index_col=idx)

    df = pd.concat([df_guest, df_host], axis=0)
    y = df[label_name]
    X = df.drop(label_name, axis=1)
    X_guest = df_guest.drop(label_name, axis=1)
    y_guest = df_guest[label_name]
    clf = GradientBoostingRegressor(n_estimators=50)
    clf.fit(X, y)
    y_predict = clf.predict(X_guest)

    result = {
        "mean_squared_error": mean_squared_error(y_guest, y_predict),
        "mean_absolute_error": mean_absolute_error(y_guest, y_predict)
    }
    print(result)
    return {}, result
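For reference, a minimal sketch of the configuration mapping this script expects (the key names come from the lookups above; the file paths and column names are hypothetical placeholders):

    param = {
        "data_guest": "data/guest.csv",   # hypothetical path
        "data_host": "data/host.csv",     # hypothetical path
        "data_test": "data/test.csv",     # hypothetical path
        "idx": "id",                      # assumed index column name
        "label_name": "y",                # assumed label column name
    }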
Example #2
def main(config="../../config.yaml", param="./gbdt_config_reg.yaml"):

    # obtain config
    if isinstance(param, str):
        param = JobConfig.load_from_file(param)

    data_guest = param["data_guest"]
    data_host = param["data_host"]
    idx = param["idx"]
    label_name = param["label_name"]

    print('config is {}'.format(config))
    if isinstance(config, str):
        config = JobConfig.load_from_file(config)
        data_base_dir = config["data_base_dir"]
        print('data base dir is', data_base_dir)
    else:
        data_base_dir = config.data_base_dir

    # prepare data
    df_guest = pd.read_csv(os.path.join(data_base_dir, data_guest),
                           index_col=idx)
    df_host = pd.read_csv(os.path.join(data_base_dir, data_host),
                          index_col=idx)
    df = df_guest.join(df_host, rsuffix='host')
    y = df[label_name]
    X = df.drop(label_name, axis=1)
    clf = GradientBoostingRegressor(random_state=0, n_estimators=50)
    clf.fit(X, y)

    y_predict = clf.predict(X)

    result = {"mean_absolute_error": mean_absolute_error(y, y_predict)}
    print(result)
    return {}, result
Example #3
def GDBT_ST(trainFileName,testFilename):
    trainData = ld.LoadData_DATA_ST(trainFileName)
    testData = ld.LoadData_DATA_ST(testFilename)
    
    store = ['1','2','3','4','5']
    res = []
    
    for i in store:
        train_X = [];train_y = []
        context = trainData[i]
        for array in context:
            array = [float(x) for x in array[2:]]
            train_X.append((array[2:-1]))
            train_y.append(array[-1])
            
        test_X = [];items = []
        context = testData[i]
        for array in context:
            items.append((array[0],array[1]))
            array = [float(x) for x in array[2:] ]
            test_X.append((array[2:]))
            
        clf = GradientBoostingRegressor(loss='lad', n_estimators=50, learning_rate=0.1, max_depth=3).\
                    fit(train_X,train_y)
        pred_y = clf.predict(test_X)
         
        for j in range(len(pred_y)):  # renamed from i, which shadowed the outer store loop variable
            res.append([items[j][0], items[j][1], '%.4f' % max(pred_y[j], 0)])
    return res
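Note: scikit-learn 1.0 renamed the loss spellings used in several of these examples ('lad' became 'absolute_error', 'ls' became 'squared_error'). An equivalent modern constructor for the call above would be:

    clf = GradientBoostingRegressor(loss='absolute_error', n_estimators=50,
                                    learning_rate=0.1, max_depth=3)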
Example #4
def test_gradient_boosting_estimator_with_smooth_quantile_loss():
    np.random.seed(0)
    m = 15000
    n = 10
    p = .8
    X = np.random.normal(size=(m,n))
    beta = np.random.normal(size=n)
    mu = np.dot(X, beta)
    y = np.random.lognormal(mu)
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.33333333333333)
    loss_function = SmoothQuantileLossFunction(1, p, .0001)
    q_loss = QuantileLossFunction(1, p)
    model = Booster(BaggingRegressor(Earth(max_degree=2, verbose=False, use_fast=True, max_terms=10)), 
                                      loss_function, n_estimators=150, 
                                      stopper=stop_after_n_iterations_without_percent_improvement_over_threshold(3, .01), verbose=True)
    assert_raises(NotFittedError, lambda : model.predict(X_train))
    
    model.fit(X_train, y_train)
    
    prediction = model.predict(X_test)
    model2 = GradientBoostingRegressor(loss='quantile', alpha=p)
    model2.fit(X_train, y_train)
    prediction2 = model2.predict(X_test)
    assert_less(q_loss(y_test, prediction), q_loss(y_test, prediction2))
    assert_greater(r2_score(y_test,prediction), r2_score(y_test,prediction2))
    q = np.mean(y_test <= prediction)
    assert_less(np.abs(q-p), .05)
    assert_greater(model.score_, 0.)
    assert_approx_equal(model.score(X_train, y_train), model.score_)
Example #5
def main(param=""):
    # obtain config
    if isinstance(param, str):
        param = JobConfig.load_from_file(param)
    data_guest = param["data_guest"]
    data_host = param["data_host"]

    idx = param["idx"]
    label_name = param["label_name"]

    # prepare data
    df_guest = pd.read_csv(data_guest, index_col=idx)
    df_host = pd.read_csv(data_host, index_col=idx)
    df = df_guest.join(df_host, rsuffix='host')
    y = df[label_name]
    X = df.drop(label_name, axis=1)

    clf = GradientBoostingRegressor(random_state=0,
                                    n_estimators=50,
                                    learning_rate=0.1)
    clf.fit(X, y)

    y_predict = clf.predict(X)

    result = {
        "mean_absolute_error": mean_absolute_error(y, y_predict),
    }
    print(result)
    return {}, result
Example #6
class GradientBoostingRegressorImpl():
    def __init__(self,
                 loss='ls',
                 learning_rate=0.1,
                 n_estimators=100,
                 subsample=1.0,
                 criterion='friedman_mse',
                 min_samples_split=2,
                 min_samples_leaf=1,
                 min_weight_fraction_leaf=0.0,
                 max_depth=3,
                 min_impurity_decrease=0.0,
                 min_impurity_split=None,
                 init=None,
                 random_state=None,
                 max_features=None,
                 alpha=0.9,
                 verbose=0,
                 max_leaf_nodes=None,
                 warm_start=False,
                 presort='auto',
                 validation_fraction=0.1,
                 n_iter_no_change=None,
                 tol=0.0001):
        self._hyperparams = {
            'loss': loss,
            'learning_rate': learning_rate,
            'n_estimators': n_estimators,
            'subsample': subsample,
            'criterion': criterion,
            'min_samples_split': min_samples_split,
            'min_samples_leaf': min_samples_leaf,
            'min_weight_fraction_leaf': min_weight_fraction_leaf,
            'max_depth': max_depth,
            'min_impurity_decrease': min_impurity_decrease,
            'min_impurity_split': min_impurity_split,
            'init': init,
            'random_state': random_state,
            'max_features': max_features,
            'alpha': alpha,
            'verbose': verbose,
            'max_leaf_nodes': max_leaf_nodes,
            'warm_start': warm_start,
            'presort': presort,
            'validation_fraction': validation_fraction,
            'n_iter_no_change': n_iter_no_change,
            'tol': tol
        }

    def fit(self, X, y=None):
        self._sklearn_model = SKLModel(**self._hyperparams)
        if (y is not None):
            self._sklearn_model.fit(X, y)
        else:
            self._sklearn_model.fit(X)
        return self

    def predict(self, X):
        return self._sklearn_model.predict(X)
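A minimal usage sketch for the wrapper above, assuming SKLModel is bound to sklearn's GradientBoostingRegressor from the same era as this wrapper (the presort and min_impurity_split keys target an older scikit-learn; they were removed in later releases):

    import numpy as np

    rng = np.random.RandomState(0)
    X = rng.normal(size=(100, 4))
    y = 2.0 * X[:, 0] + X[:, 1]
    model = GradientBoostingRegressorImpl(n_estimators=50).fit(X, y)
    preds = model.predict(X)  # delegates to the wrapped, fitted sklearn estimator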
Example #7
def GradientBoosted(X_train, X_test, y_train, y_test):
    mod = GradientBoostingRegressor()
    mod.fit(X_train, y_train)
    print "Done training"
    gb_labels = mod.predict(X_test)
    print "Done testing"
    gb_score = mod.score(X_test, y_test)
    return gb_score, gb_labels
Example #8
def rfr_the_loss(hub, molid):
    X, y = regress_the_loss_from_coocurrences_Xy(hub, molid)
    # rfr = RandomForestRegressor(n_estimators=800, n_jobs=8, oob_score=True, random_state=0)
    rfr = GradientBoostingRegressor(n_estimators=100)
    rfr.fit(X, y)
    # print rfr.oob_score_
    # print rfr.oob_improvement_
    influential = np.argsort(-rfr.feature_importances_)[:20]
    print('\t%s' % '\n\t'.join(X.columns[influential]))
Example #9
def GDBT_ALL(trainFileName, testFileName):
    train_X, train_y, _ = ld.LoadData_DATA_LABEL_ITEM(trainFileName)
    Eval_X, items = ld.LoadData_DATA_ITEM(testFileName)
    clf = GradientBoostingRegressor(loss='lad', n_estimators=40, learning_rate=0.1, max_depth=3).\
            fit(train_X, train_y)
    pred_y = clf.predict(Eval_X)
    res = []
    for i in range(len(Eval_X)):
        res.append([items[i], 'all', '%.4f' % max(pred_y[i], 0)])
    return res
Example #10
def GDBT_ALL(trainFileName,testFileName):
    train_X, train_y, _ = ld.LoadData_DATA_LABEL_ITEM(trainFileName)
    Eval_X, items = ld.LoadData_DATA_ITEM(testFileName)
    clf = GradientBoostingRegressor(loss='lad', n_estimators=40, learning_rate=0.1, max_depth=3).\
            fit(train_X, train_y)
    pred_y = clf.predict(Eval_X)
    res = []
    for i in range(len(Eval_X)):
        res.append([items[i],'all','%.4f'%max(pred_y[i],0)])
    return res
Example #11
def GDBT_ALL_train(trainFileName,testFileName):
    train_X, train_y, _ = ld.loadData_all(trainFileName)
    test_X, test_y,items = ld.loadData_all(testFileName)
    clf = GradientBoostingRegressor(loss='lad', n_estimators=40, learning_rate=0.1, max_depth=3).\
            fit(train_X, train_y)
    pred_y = clf.predict(test_X)
    res = []
    for i in range(len(test_X)):
        res.append([items[i],'all','%.2f'%max(pred_y[i],0),'%.2f'%max(test_y[i],0)])
    return res
Example #12
def model_build(train_set):
    X = train_set.iloc[:, 6:11]
    Y = train_set['label']
    #print(X.head(5))
    #print(Y.head(5))
    model = GradientBoostingRegressor()
    #model = GradientBoostingClassifier()
    model.fit(X, Y)
    print(model.feature_importances_)
    #print(model)
    return model
Example #13
def test_argument_names():
    boston = load_boston()
    X = DataFrame(boston['data'], columns=boston['feature_names'])
    y = boston['target']
    model = GradientBoostingRegressor(verbose=True).fit(X, y)
    code = sklearn2code(model, ['predict'],
                        numpy_flat,
                        argument_names=X.columns)
    boston_housing_module = exec_module('boston_housing_module', code)
    assert_array_almost_equal(model.predict(X),
                              boston_housing_module.predict(**X))
Example #14
def NonlinReg(coeff, regressor='GBR', features=4, interval=0, length=1):
    '''
    NonlinReg: non-linear regression model.

    coeff: input sequence produced by WT (wavelet transformation function)

    regressor: non-linear regressor, 'GBR' by default

    features: number of past values used to predict, 4 by default

    interval: prediction lag, 0 by default

    length: number of future steps to extrapolate, 1 by default
    '''
    X, Y = [], []
    for i in range(len(coeff[0])):
        if i + features + interval < len(coeff[0]):
            X.append(coeff[0][i:i + features])
            Y.append(coeff[0][i + features + interval])
    X = np.array(X)
    Y = np.array(Y)

    if regressor == 'GBR':
        gbr = GBR(learning_rate=0.1, n_estimators=80, max_depth=2).fit(X, Y)

        X_ = copy.deepcopy(X)
        Y_ = copy.deepcopy(Y)
        for i in range(length):
            X_ = np.concatenate(
                (X_,
                 np.array([
                     np.concatenate(
                         (X_[-1][-features + 1:], Y_[[-interval - 1]]))
                 ])))
            Y_ = np.concatenate((Y_, gbr.predict(X_[-1].reshape(1, -1))))  # predict expects a 2-D array

    if regressor == 'SVR':
        svr = svm.SVR(kernel='rbf', C=100, gamma=3).fit(X, Y)

        X_ = copy.deepcopy(X)
        Y_ = copy.deepcopy(Y)
        for i in range(length):
            X_ = np.concatenate(
                (X_,
                 np.array([
                     np.concatenate(
                         (X_[-1][-features + 1:], Y_[[-interval - 1]]))
                 ])))
            Y_ = np.concatenate((Y_, svr.predict(X_[-1].reshape(1, -1))))  # predict expects a 2-D array

    return Y_
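A minimal call sketch for NonlinReg, assuming coeff is a sequence whose first element is a coefficient series from the wavelet transform; the input here is synthetic:

    import numpy as np

    series = np.sin(np.linspace(0, 10, 200))  # stand-in for a WT coefficient series
    forecast = NonlinReg([series], regressor='GBR', features=4, interval=0, length=5)
    print(forecast[-5:])                      # the 5 extrapolated values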
Example #15
    def __init__(self, data, label, task, model_name='lgb', eval_metric=None, importance_threshold=0.0):
        '''
        :param data: DataFrame
        :param label: label name
        :param task: task type, one of [regression, classification]
        :param model_name: one of ['gbdt', 'xgb', 'lgb']
        :param importance_threshold: features with importance below this threshold are dropped
        '''
        self.data = data
        self.label = label
        self.task = task
        self.model_name = model_name
        self._importance_threshold = importance_threshold

        self.model = None
        # choose the evaluation metric based on the task and the label values
        self.eval_metric = None

        if model_name == 'lgb':
            if self.task == 'classification':
                self.model = lgb.LGBMClassifier(**lgb_params)
                if self.data[self.label].unique().shape[0] == 2:
                    self.eval_metric = 'logloss'
                else:
                    self.eval_metric = 'multi_logloss'  # the original set 'logloss' in both branches, which looks like a copy-paste slip
            elif self.task == 'regression':
                self.model = lgb.LGBMRegressor(**lgb_params)
                self.eval_metric = 'l2'
            else:
                raise ValueError('Task must be either "classification" or "regression"')
        elif model_name == 'xgb':
            if self.task == 'classification':
                self.model = xgb.XGBClassifier(**xgb_params)
                if self.data[self.label].unique().shape[0] == 2:
                    self.eval_metric = 'logloss'
                else:
                    self.eval_metric = 'mlogloss'
            elif self.task == 'regression':
                self.model = xgb.XGBRegressor(**xgb_params)
                self.eval_metric = 'rmse'
            else:
                raise ValueError('Task must be either "classification" or "regression"')
        else: # gbdt
            if self.task == 'classification':
                self.model = GradientBoostingClassifier(**gbdt_params)
            elif self.task == 'regression':
                self.model = GradientBoostingRegressor(**gbdt_params)
            else:
                raise ValueError('Task must be either "classification" or "regression"')
        if eval_metric:  # only override the task-specific defaults when a metric was explicitly passed
            self.eval_metric = eval_metric
Example #16
def GDBT_ALL_train(trainFileName, testFileName):
    train_X, train_y, _ = ld.loadData_all(trainFileName)
    test_X, test_y, items = ld.loadData_all(testFileName)
    clf = GradientBoostingRegressor(loss='lad', n_estimators=40, learning_rate=0.1, max_depth=3).\
            fit(train_X, train_y)
    pred_y = clf.predict(test_X)
    res = []
    for i in range(len(test_X)):
        res.append([
            items[i], 'all',
            '%.2f' % max(pred_y[i], 0),
            '%.2f' % max(test_y[i], 0)
        ])
    return res
Example #17
def main(train, test, filepath):
    if not filepath:
        click.echo("need filepath")
        return

    X, Y = get_data(filepath)

    if not train or not test:
        click.echo("need train or test size")
        return

    TRAIN_SIZE = 96 * int(train)
    TEST_SIZE = 96 * int(test)

    X_train = X[:TRAIN_SIZE]
    Y_train = Y[:TRAIN_SIZE]
    X_test = X[TRAIN_SIZE:]
    Y_test = Y[TRAIN_SIZE:]

    #clf = SVR(kernel='rbf', C=1e3, gamma=0.00001)
    clf = GradientBoostingRegressor(n_estimators=100, max_depth=1)
    #clf = DecisionTreeRegressor(max_depth=25)
    #clf = ExtraTreesRegressor(n_estimators=2000,max_depth=14)
    #clf = xgb.XGBRegressor(n_estimators=2000,max_depth=25)
    #clf = RandomForestRegressor(n_estimators=1000,max_depth=26,n_jobs=7)

    #clf.fit(X_train,Y_train)
    #y_pred = clf.predict(X_test)
    #plt.plot(X_test, y_pred, linestyle='-', color='red')
    predict_list = []
    for i in range(TEST_SIZE):
        X = [[x] for x in range(i, TRAIN_SIZE + i)]
        clf.fit(X, Y[i:TRAIN_SIZE + i])
        y_pred = clf.predict(np.array([TRAIN_SIZE + 1 + i]).reshape(1, -1))
        predict_list.append(y_pred)

    #print("mean_squared_error:%s"%mean_squared_error(Y_test, predict_list))
    #print("sqrt of mean_squared_error:%s"%np.sqrt(mean_squared_error(Y_test, predict_list)))
    origin_data = Y_test
    #print("origin data:%s"%origin_data)
    plt.plot([x for x in range(TRAIN_SIZE + 1, TRAIN_SIZE + TEST_SIZE + 1)],
             predict_list,
             linestyle='-',
             color='red',
             label='prediction model')
    plt.plot(X_test, Y_test, linestyle='-', color='blue', label='actual model')
    plt.legend(loc=1, prop={'size': 12})
    plt.show()
Example #18
 def fit(self, X, y=None):
     self._sklearn_model = SKLModel(**self._hyperparams)
     if (y is not None):
         self._sklearn_model.fit(X, y)
     else:
         self._sklearn_model.fit(X)
     return self
Example #19
    def run(self):
        loss = self.lossComboBox.currentText()
        if loss == 'Least Squares':
            loss = 'ls'
        if loss == 'Least Absolute Deviation':
            loss = 'lad'
        if loss == 'Huber':
            loss = 'huber'
        if loss == 'Quantile':
            loss = 'quantile'

        params = {
            'loss': loss,
            'learning_rate': self.learningDoubleSpinBox.value(),
            'n_estimators': self.numEstSpinBox.value(),
            'subsample': self.subsampleDoubleSpinBox.value(),
            'criterion': 'friedman_mse',
            'min_samples_split': self.min_n_splitSpinBox.value(),
            'min_samples_leaf': self.min_n_leafSpinBox.value(),
            'min_weight_fraction_leaf': self.min_fractionDoubleSpinBox.value(),
            'max_depth': self.max_depthSpinBox.value(),
            'min_impurity_decrease': self.min_imp_decDoubleSpinBox.value(),
            'random_state': 1,
            'alpha': self.alphaDoubleSpinBox.value()
        }
        return params, self.getChangedValues(params,
                                             GradientBoostingRegressor())
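The params mapping built above can be splatted straight into the estimator's constructor; a one-line sketch:

    model = GradientBoostingRegressor(**params)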
Example #20
def train_model(data):
    half_len = len(data) // 2  # the name and the "train" comment below imply training on the first half only

    # train
    X = []
    y = []
    for [c, cb, delta] in data[:half_len]:
        X.append([c, cb])
        y.append(delta)

    svr_rbf_general = svm.SVR(kernel='rbf')
    svr_linear_general = svm.SVR(kernel='linear')
    svr_rbf = svm.SVR(kernel='rbf', C=1e3, gamma=0.1)
    svr_lin = svm.SVR(kernel='linear', C=1e3)
    svr_poly = svm.SVR(kernel='poly', C=1e3, degree=2)

    model_br = BayesianRidge()
    model_lr = LinearRegression()
    model_etc = ElasticNet()
    model_svr = SVR()
    model_gbr = GradientBoostingRegressor()

    # clf = svr_linear_general
    clf = svr_linear_general
    clf.fit(X, y)

    return clf
Example #21
    def tune_gbr(self):
        # the original grid (kernel/C/gamma) belongs to an SVR and would make
        # GridSearchCV raise on GradientBoostingRegressor, so a boosting grid is used here
        parameters = {'n_estimators': [50, 100, 200],
                      'learning_rate': [0.05, 0.1, 0.2],
                      'max_depth': [2, 3, 4]}
        clf = GridSearchCV(GradientBoostingRegressor(), parameters, verbose=2)
        clf.fit(self.X_train, self.y_train)
        print(clf.best_params_)
        print(clf.best_score_)
Example #22
    def iterative_fit(self, X, y, sample_weight=None, n_iter=1, refit=False):

        from sklearn.ensemble import GradientBoostingRegressor as GBR
        # Special fix for gradient boosting!
        if isinstance(X, np.ndarray):
            X = np.ascontiguousarray(X, dtype=X.dtype)
        if refit:
            self.estimator = None

        if self.estimator is None:
            self.learning_rate = float(self.learning_rate)
            self.n_estimators = int(self.n_estimators)
            self.subsample = float(self.subsample)
            self.min_samples_split = int(self.min_samples_split)
            self.min_samples_leaf = int(self.min_samples_leaf)
            self.min_weight_fraction_leaf = float(
                self.min_weight_fraction_leaf)
            if check_none(self.max_depth):
                self.max_depth = None
            else:
                self.max_depth = int(self.max_depth)
            self.max_features = float(self.max_features)
            if check_none(self.max_leaf_nodes):
                self.max_leaf_nodes = None
            else:
                self.max_leaf_nodes = int(self.max_leaf_nodes)
            self.min_impurity_decrease = float(self.min_impurity_decrease)
            self.verbose = int(self.verbose)

            self.estimator = GBR(
                loss=self.loss,
                learning_rate=self.learning_rate,
                n_estimators=n_iter,
                subsample=self.subsample,
                min_samples_split=self.min_samples_split,
                min_samples_leaf=self.min_samples_leaf,
                min_weight_fraction_leaf=self.min_weight_fraction_leaf,
                max_depth=self.max_depth,
                criterion=self.criterion,
                max_features=self.max_features,
                max_leaf_nodes=self.max_leaf_nodes,
                random_state=self.random_state,
                verbose=self.verbose,
                warm_start=True,
            )

        else:
            self.estimator.n_estimators += n_iter
            self.estimator.n_estimators = min(self.estimator.n_estimators,
                                              self.n_estimators)

        self.estimator.fit(X, y, sample_weight=sample_weight)

        # Apparently this if is necessary
        if self.estimator.n_estimators >= self.n_estimators:
            self.fully_fit_ = True

        return self
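The incremental pattern above relies on sklearn's warm_start flag: raising n_estimators on an already-fitted estimator and calling fit again trains only the additional stages. A standalone sketch of the same idea on synthetic data:

    import numpy as np
    from sklearn.ensemble import GradientBoostingRegressor

    rng = np.random.RandomState(0)
    X = rng.normal(size=(200, 5))
    y = X @ rng.normal(size=5)

    est = GradientBoostingRegressor(n_estimators=10, warm_start=True)
    est.fit(X, y)             # fits the first 10 stages
    est.n_estimators += 10
    est.fit(X, y)             # fits only the 10 new stages on top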
Example #23
def model_build(train_set, weight=None):
    """
    模型建立,根据训练集,构建GBDT模型
    :param train_set: 训练集
    :param weight: 训练集label权重列表
    :return: 训练完成的model
    """
    X = train_set.iloc[:, 6:11]
    Y = train_set['label']
    #print(X.head(5))
    #print(Y.head(5))
    model = GradientBoostingRegressor()
    #model = GradientBoostingClassifier()
    if not weight:
        model.fit(X, Y)
    else:
        model.fit(X, Y, sample_weight=weight)  # the original never fit the model when a weight list was passed
    print(model.feature_importances_)
    #print(model)
    return model
Example #24
def getModels():
    models = {}
    models['dt'] = DecisionTreeRegressor(max_depth=50)
    models['rf1'] = RandomForestRegressor()
    models['rf2'] = RandomForestRegressor(n_estimators=128, max_depth=15)
    models['gbr'] = GradientBoostingRegressor(n_estimators=128,
                                              max_depth=5,
                                              learning_rate=1.0)
    # models['abr'] = AdaBoostRegressor(n_estimators=128)
    return models
Example #25
def model_build(train_set, weight=None):
    """
    模型建立,根据训练集,构建GBDT模型
    :param train_set: 训练集
    :param weight: 训练集label权重列表
    :return: 训练完成的model
    """
    X = train_set.iloc[:, 1:]
    print(len(X))
    Y = train_set['label']
    print(len(Y))
    #print(X.head(5))
    #print(Y.head(5))
    model = GradientBoostingRegressor()
    #model = GradientBoostingClassifier()
    #model = logistic_regression_path(X, Y)
    model.fit(X, Y)
    print(model.feature_importances_)
    #print(model)
    return model
Example #26
def create_models():
    models = {
        'BayesianRidge': BayesianRidge(),
        # 'LinearRegression': LinearRegression(),
        'ElasticNet': ElasticNet(),
        'SVR(rbf)': SVR(kernel='rbf'),
        'SVR(linear)': SVR(kernel='linear'),
        'Lasso': Lasso(),
        'GBR': GradientBoostingRegressor(n_estimators=300, max_depth=3),
    }
    return models
Example #27
def train_model():
    global train_x, train_y, test_x
    gbr = GradientBoostingRegressor()
    cv_score = cross_val_score(gbr, train_x, train_y).mean()
    print(cv_score)
    nn = MLPRegressor()
    cv_score = cross_val_score(nn, train_x, train_y).mean()
    print(cv_score)
    rft = RandomForestRegressor()
    cv_score = cross_val_score(rft, train_x, train_y).mean()
    print(cv_score)
Example #28
 def train_model(self):
     #  Tried other model such MLP neural network regressor and random forest trees, but GBR performed best
     global train_x, train_y, test_x
     cvscore = []
     leaf_node_counts = [4, 5, 6, 7, 8]  # renamed from `range`, which shadowed the builtin
     for i in leaf_node_counts:
         print(i)
         gbr = GradientBoostingRegressor(max_leaf_nodes=i)
         cv_score = cross_val_score(
             gbr, train_x, train_y,
             scoring='neg_mean_squared_error').mean()
         cvscore.append(cv_score)
     print(cvscore)
Example #29
 def __init__(self,
              loss='ls',
              learning_rate=0.1,
              n_estimators=100,
              subsample=1.0,
              criterion='friedman_mse',
              min_samples_split=2,
              min_samples_leaf=1,
              min_weight_fraction_leaf=0.0,
              max_depth=3,
              min_impurity_decrease=0.0,
              min_impurity_split=None,
              init=None,
              random_state=None,
              max_features=None,
              alpha=0.9,
              verbose=0,
              max_leaf_nodes=None,
              warm_start=False,
              presort='auto',
              validation_fraction=0.1,
              n_iter_no_change=None,
              tol=0.0001):
     self._hyperparams = {
         'loss': loss,
         'learning_rate': learning_rate,
         'n_estimators': n_estimators,
         'subsample': subsample,
         'criterion': criterion,
         'min_samples_split': min_samples_split,
         'min_samples_leaf': min_samples_leaf,
         'min_weight_fraction_leaf': min_weight_fraction_leaf,
         'max_depth': max_depth,
         'min_impurity_decrease': min_impurity_decrease,
         'min_impurity_split': min_impurity_split,
         'init': init,
         'random_state': random_state,
         'max_features': max_features,
         'alpha': alpha,
         'verbose': verbose,
         'max_leaf_nodes': max_leaf_nodes,
         'warm_start': warm_start,
         'presort': presort,
         'validation_fraction': validation_fraction,
         'n_iter_no_change': n_iter_no_change,
         'tol': tol
     }
     self._wrapped_model = SKLModel(**self._hyperparams)
Example #30
def multi_output_regression(train, test, grid, outputs):

    # Multi-Layer Perceptron Regressor
    input_train, input_test, output_train, actual = pd.training_testing_data(
        train, test, grid, outputs)
    print('You are training on %d samples' % (len(input_train)))
    print('You are testing on %d samples' % (len(input_test)))
    multi_output_mlp = MultiOutputRegressor(
        MLPRegressor(solver='adam',
                     learning_rate='adaptive',
                     max_iter=500,
                     early_stopping=True))
    multi_output_mlp.fit(input_train, output_train)
    prediction_mlp = multi_output_mlp.predict(input_test)
    print('Multi-Layer Perceptron')
    print(r'$R^{2}$: %.5f' % (r2_score(actual, prediction_mlp)))
    print('MSE: %.5f' % (mean_squared_error(actual, prediction_mlp)))
    print('RMSE: %.5f' % (np.sqrt(mean_squared_error(actual, prediction_mlp))))

    # Gradient Boosting Regressor
    input_train, input_test, output_train, actual = pd.training_testing_data(
        train, test, grid, outputs)
    print('You are training on %d samples' % (len(input_train)))
    print('You are testing on %d samples' % (len(input_test)))
    multi_output_gbr = MultiOutputRegressor(
        GradientBoostingRegressor(loss='huber'))
    multi_output_gbr.fit(input_train, output_train)
    prediction_gbr = multi_output_gbr.predict(input_test)
    print('Gradient Boosting Regressor')
    print(r'$R^{2}$: %.5f' % (r2_score(actual, prediction_gbr)))
    print('MSE: %.5f' % (mean_squared_error(actual, prediction_gbr)))
    print('RMSE: %.5f' % (np.sqrt(mean_squared_error(actual, prediction_gbr))))

    # Random Forest Regressor
    input_train, input_test, output_train, actual = pd.training_testing_data(
        train, test, grid, outputs)
    print('You are training on %d samples' % (len(input_train)))
    print('You are testing on %d samples' % (len(input_test)))
    multi_output_rfr = MultiOutputRegressor(RandomForestRegressor())
    multi_output_rfr.fit(input_train, output_train)
    prediction_rfr = multi_output_rfr.predict(input_test)
    print('Random Forest Regressor')
    print(r'$R^{2}$: %.5f' % (r2_score(actual, prediction_rfr)))
    print('MSE: %.5f' % (mean_squared_error(actual, prediction_rfr)))
    print('RMSE: %.5f' % (np.sqrt(mean_squared_error(actual, prediction_rfr))))

    return actual, prediction_gbr, prediction_mlp, prediction_rfr
Example #31
 def predict_using_local_model(self):
     gbr = GradientBoostingRegressor()
     gbr.fit(self.train_x, self.train_y)
      print('Accuracy of gbr, on the training set: ' +
            str(gbr.score(self.train_x, self.train_y)))
     start_time = time.time()
     predictions = gbr.predict(self.test_x)
     predict_time = time.time() - start_time
     print('Prediction time for gbr is ' + str(predict_time) + '\n')
     predictions = predictions.astype('uint8')
     return predictions
Example #32
def parameter_choose(train_set):
    """
    模型最佳参数选择,根据对应的训练集选择最佳模型参数
    :param train_set: 训练集
    :return: 无
    """
    X = train_set.iloc[:, 6:11]
    Y = train_set['label']
    param_test = {'n_estimators': range(10, 81, 10)}
    gsearch = GridSearchCV(
        estimator=GradientBoostingRegressor(learning_rate=1),
        param_grid=param_test,
        cv=5)  # note: the iid argument was dropped here; it was removed in scikit-learn 0.24
    gsearch.fit(X, Y)
    print(gsearch.cv_results_)
    print(gsearch.best_params_, gsearch.best_score_)
Example #33
def prediction():
    global train_x, train_y, test_x
    gbr = GradientBoostingRegressor()
    gbr.fit(train_x, train_y)
    print('Accuracy of gbr, on the training set: ' +
          str(gbr.score(train_x, train_y)))
    start_time = time.time()
    predictions = gbr.predict(test_x)
    predict_time = time.time() - start_time
    print('Prediction time for gbr is ' + str(predict_time) + '\n')
    predictions = predictions.astype('uint8')
    print(predictions)
    return predictions
Example #34
def compare_algorithms(datasetName, data, target):
    X_train, X_test, y_train, y_test = train_test_split(data,
                                                        target,
                                                        test_size=0.2,
                                                        random_state=1)
    params = {
        'n_estimators': [10, 20, 30, 40],
        'loss': ['ls', 'huber'],
        'min_samples_leaf': [6],
        'max_depth': [3, 4, 5, 6]
    }

    print("\n\nTraining GBRT on %s..." % datasetName)
    clf = GridSearchCV(GradientBoostingRegressor(), params, cv=5, n_jobs=-1)
    clf.fit(X_train, y_train)
    print("Best params original: %s" % clf.best_params_)
    print("Avg train time original: %s seconds" %
          clf.cv_results_["mean_fit_time"][clf.best_index_])
    bestOriginal = clf.best_estimator_

    myclf = GridSearchCV(MyGradientBoostingRegressor(),
                         params,
                         cv=5,
                         n_jobs=-1)
    myclf.fit(X_train, y_train)
    print("Best params mine: %s" % myclf.best_params_)
    print("Avg train time mine: %s seconds" %
          myclf.cv_results_["mean_fit_time"][myclf.best_index_])
    bestMine = myclf.best_estimator_

    originalPredictions = bestOriginal.predict(X_test)
    myPredictions = bestMine.predict(X_test)
    print("The dataset: %s with %s train instances" %
          (datasetName, data.shape[0]))
    print("Original GradientBoostingRegressor R2: %s\tMSE: %s\tMAE: %s" %
          (r2_score(y_test, originalPredictions),
           mean_squared_error(y_test, originalPredictions),
           mean_absolute_error(y_test, originalPredictions)))
    print("My GradientBoostingRegressor R2: %s\tMSE: %s\tMAE: %s" %
          (r2_score(y_test, myPredicttions),
           mean_squared_error(y_test, myPredicttions),
           mean_absolute_error(y_test, myPredicttions)))
Example #35
def trainmodels():
    global n_folds,model_br,model_dic,model_etc,model_gbr,model_lr,model_names,\
           model_svr,cv_score_list,pre_y_list
    n_folds = 6  # number of cross-validation folds
    model_br = BayesianRidge()  # Bayesian ridge regression model
    model_lr = LinearRegression()  # ordinary linear regression model
    model_etc = ElasticNet()  # elastic-net regression model
    model_svr = SVR()  # support vector regression model
    model_gbr = GradientBoostingRegressor()  # gradient boosting regression model
    model_names = [
        'BayesianRidge', 'LinearRegression', 'ElasticNet', 'SVR', 'GBR'
    ]  # names of the different models
    model_dic = [model_br, model_lr, model_etc, model_svr,
                 model_gbr]  # the regression model objects
    cv_score_list = []  # cross-validation score lists
    pre_y_list = []  # y values predicted by each regression model
    for model in model_dic:  # iterate over the regression models
        scores = cross_val_score(model, X, y,
                                 cv=n_folds)  # run cross-validation for each model
        cv_score_list.append(scores)  # store the cross-validation scores
        pre_y_list.append(model.fit(X, y).predict(X))  # store each model's in-sample predictions
Example #36
        return sum((tree.predict(X) for tree in self.trees))

    def fit(self, X, y):
        for m in range(self.n_boosting_steps):
            residuals = y - self.predict(X)
            new_tree = Node(X, residuals)
            new_tree.fit(max_tree_size=self.max_tree_size)
            self.trees.append(new_tree)

if __name__ == '__main__':

    from sklearn.model_selection import train_test_split
    from sklearn.metrics import mean_squared_error
    from sklearn.datasets import load_boston

    boston = load_boston()
    X_train, X_test, y_train, y_test = train_test_split(boston.data,
                                                        boston.target,
                                                        test_size=0.33)

    from sklearn.ensemble import GradientBoostingRegressor
    sk_gbrt = GradientBoostingRegressor(n_estimators=20)
    sk_gbrt.fit(X_train, y_train)
    print("sklearn test MSE", mean_squared_error(y_test, sk_gbrt.predict(X_test)))

    mart = MART(10, 15)
    mart.fit(X_train, y_train)
    print("mart test MSE", mean_squared_error(y_test, mart.predict(X_test)))


Example #37
#     train_X = [];train_y = []
#     context = trainData[i]
#     for array in context:
#         array = [float(x) for x in array[2:] ]
#         train_X.append((array[2:-1]))
#         train_y.append(array[-1])
#     test_X = [];test_y = [];items = []
#     context = testData[i]
#     for array in context:
#         items.append((array[0],array[1]))
#         array = [float(x) for x in array[2:] ]
#         test_X.append((array[2:-1]))
#         test_y.append(array[-1])
                        
    n_estimators = 1000
    clf1 = GradientBoostingRegressor(loss='lad', n_estimators=n_estimators, learning_rate=0.01, max_depth=3, verbose=0).\
            fit(train_X, train_y)
    test_score1 = np.zeros((n_estimators,), dtype=np.float64)
    for i, pred_y in enumerate(clf1.staged_predict(test_X)):
        print(i, clf1.feature_importances_)
        test_score1[i] = clf1.loss_(test_y, pred_y)

#     clf2 = GradientBoostingRegressor(loss='lad', n_estimators=n_estimators, learning_rate=0.1, max_depth=2, verbose=0).\
#             fit(train_X, train_y)
#     test_score2 = np.zeros((n_estimators,), dtype=np.float64)
#     for i, pred_y in enumerate(clf2.staged_predict(test_X)):
#         test_score2[i] = clf2.loss_(test_y, pred_y)
#     
#     clf3 = GradientBoostingRegressor(loss='lad', n_estimators=n_estimators, learning_rate=0.1, max_depth=2, verbose=0, subsample=0.5).\
#             fit(train_X, train_y)
#     test_score3 = np.zeros((n_estimators,), dtype=np.float64)
#     for i, pred_y in enumerate(clf3.staged_predict(test_X)):
Example #38
for year in [2007, 2009, 2011, 2013]:

    X_train,X_test, y_train, y_test, y_train_numMosquitos, y_test_numMosquitos = year_train_test_split(
        train_for_loo,
        'WnvPresent',
        year)      

    X_train.to_csv("data_per_year/" + str(year) + "X_train.csv", index=False)
    X_test.to_csv("data_per_year/" + str(year) + "X_test.csv", index=False)
    y_train.to_csv("data_per_year/" + str(year) + "y_train.csv", index=False)
    y_test.to_csv("data_per_year/" + str(year) + "y_test.csv", index=False)


    if predict_num_mosquitos:
        reg = GradientBoostingRegressor(n_estimators=40)

        reg.fit(X_train.drop(['NumMosquitos'], axis=1), y_train_numMosquitos.astype(float))
        predicted_mosquitos = reg.predict(X_test)
        X_test['NumMosquitos'] = predicted_mosquitos
        print("Accuracy is", metrics.r2_score(y_test_numMosquitos, predicted_mosquitos))

    clf.fit(X_train.drop(['NumMosquitos'], axis=1), y_train)

    y_pred = clf.predict_proba(X_test)[:, 1]
    # print(y_pred)

    # y_pred = clf.predict_proba(X_test) # For xgbwrapper best score: 57.2
    #         y_pred = clf.predict_proba(X_test)
    # y_pred = clf.predict(X_test)
Example #39
from sklearn.ensemble import GradientBoostingClassifier
from BinReader import BinReader
import numpy as np
from sklearn.ensemble import GradientBoostingRegressor

(data, label, items) = BinReader.readData(r'F:\AliRecommendHomeworkData\1212新版\train1217.expand.norm.bin')

X_train = np.array(data)
label = [item[0] for item in label]
y_train = np.array(label)
est = GradientBoostingRegressor(n_estimators=150, learning_rate=0.1, max_depth=3, random_state=0, loss='ls', verbose=1).fit(X_train, y_train)
print('testing...')

reader = BinReader(r'F:\AliRecommendHomeworkData\1212新版\test18.expand.norm.bin')
reader.open()
result = [0] * reader.LineCount
for i in range(reader.LineCount):
    (x, userid, itemid, label) = reader.readline()
    x[0] = 1
    y = est.predict([x])[0]
    result[i] = (userid, itemid, y)
    if i % 10000 == 0:
        print('%d/%d' % (i, reader.LineCount))

result.sort(key=lambda x: x[2], reverse=True)
result = result[:7000]


print('Writing results...')
with open('result.csv', 'w') as f:
    for item in result:
        f.write('%s,%s\n' % (item[0], item[1]))  # assumed output format; the original snippet ends mid-loop