Example #1
 def fit_model(self, X_train, y_train, X_test, y_test):
     clf = XGBRegressor(learning_rate=self.learning_rate,
                        n_estimators=self.n_estimators,
                        max_depth=self.max_depth,
                        min_child_weight=self.min_child_weight,
                        gamma=self.gamma,
                        subsample=self.subsample,
                        colsample_bytree=self.colsample_bytree,
                        objective=self.objective,
                        nthread=self.nthread,
                        scale_pos_weight=self.scale_pos_weight,
                        reg_alpha=self.reg_alpha,
                        reg_lambda=self.reg_lambda,
                        seed=self.seed)
     clf.fit(X_train, y_train)
     # y_pre = clf.predict(X_test)
     # y_pro = clf.predict_proba(X_test)[:, 1]
     # print("pred_leaf=T  AUC Score : %f" % metrics.roc_auc_score(y_test, y_pro))
     # print("pred_leaf=T  Accuracy : %.4g" % metrics.accuracy_score(y_test, y_pre))
     new_feature = clf.apply(X_train)
     X_train_new = self.mergeToOne(X_train, new_feature)
     new_feature_test = clf.apply(X_test)
     X_test_new = self.mergeToOne(X_test, new_feature_test)
     print("Training set sample number remains the same")
     return X_train_new, y_train, X_test_new, y_test
Example #2
def cbd_model(cbd_df,cbd_finalinput):
    '''
    function that creates model from the cbd dataframe and returns the predicted
    number of crimes for the next three days
    '''

    X_cbd=cbd_df[['year', 'month', 'day', 'tmax', 'tmin', 'consumer_price_index',
       'gdp_millions_2007', 'seasonally_adjusted_unemployment',
       'unadjusted_unemployment', 'Possession, cocaine ',
       'Heroin, possession ', 'Heroin Price Canada',
       'day_segment_1200pm-1159pm', 'day_of_week_Monday',
       'day_of_week_Saturday', 'day_of_week_Sunday', 'day_of_week_Thursday',
       'day_of_week_Tuesday', 'day_of_week_Wednesday']]
    y_cbd=cbd_df['number_of_crimes']


    scaler = StandardScaler()
    scaler.fit(X_cbd)  # Don't cheat - fit only on training data
    X_cbd = scaler.transform(X_cbd)
    cbd_input_scaled = scaler.transform(cbd_finalinput)
    xgb=XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)
    xgb.fit(X_cbd,y_cbd)
    predict_cbd=xgb.predict(cbd_input_scaled)

    return predict_cbd
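A hypothetical call of cbd_model, assuming cbd_df (historical rows containing the feature columns above plus 'number_of_crimes') and cbd_finalinput (three future rows with the same feature columns) have already been assembled elsewhere:

# Hypothetical usage; cbd_df and cbd_finalinput are assumed to exist.
predicted_crimes = cbd_model(cbd_df, cbd_finalinput)
print(predicted_crimes)  # three predicted daily crime counts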
Example #3
def XGB_reg_evaluation(individual, evaluation_method='roll_win'):
    '''
    evaluation_method : can be roll_win, mse
    '''

    if evaluation_method == 'roll_win':
        trainNumber = individual[6]  # the train num
        param = {
            'eta': individual[0],
            'silent': True,
            'objective': "reg:linear",
            'nthread': -1,
            'min_child_weight': individual[1],
            'max_depth': individual[2],
            'subsample': individual[3],
            'colsample_bylevel': individual[4],
            'seed': 0
        }
        roll_win_mseValue = 0
        for i in range(N_validation):
            trainingX, trainingY = trainX[(trainNum - (i + 1) * window - trainNumber):(trainNum - (i + 1) * window),:],\
                                          trainY[(trainNum - (i + 1) * window - trainNumber):(trainNum - (i + 1) * window)]

            testingX, testingY= trainX[(trainNum - (i + 1) * window):(trainNum - i * window),:], \
                                       trainY[(trainNum - (i + 1) * window):(trainNum - i * window)]
            dtrain = xgb.DMatrix(data=trainingX, label=trainingY)
            bst = xgb.train(params=param,
                            dtrain=dtrain,
                            num_boost_round=individual[5])
            testingX = xgb.DMatrix(testingX)
            roll_win_mseValue += sum(
                (testingY - bst.predict(testingX))**2) / window
        roll_win_mseValue /= N_validation
        return (roll_win_mseValue, )

    if evaluation_method == 'mse':
        ### The cross validation evaluation
        N_SPLITS = N_splits
        kf = KFold(n_splits=N_SPLITS)
        cv_mseValue = 0
        fc = XGBRegressor(learning_rate=individual[0],
                          n_estimators=individual[5],
                          silent=True,
                          objective="reg:linear",
                          nthread=-1,
                          gamma=0,
                          min_child_weight=individual[1],
                          max_depth=individual[2],
                          subsample=individual[3],
                          colsample_bylevel=individual[4],
                          seed=0)
        for train, test in kf.split(trainX):
            fc.fit(trainX[train, :], trainY[train])
            cv_mseValue += sum(
                (trainY[test] - fc.predict(trainX[test, :]))**2) / len(test)
        cv_mseValue = cv_mseValue / N_SPLITS
        return (cv_mseValue, )

    print "There is no evaluation method for %s" % evaluation_method
    raise Exception("evaluation_method is not valid")
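A hypothetical sketch of how the positional individual encoding above maps to hyperparameters, assuming the module-level globals the function uses (trainX, trainY, trainNum, window, N_validation, N_splits) are defined elsewhere:

# Hypothetical candidate; the index-to-parameter mapping follows the function body above.
individual = [
    0.05,  # [0] eta / learning_rate
    3,     # [1] min_child_weight
    6,     # [2] max_depth
    0.8,   # [3] subsample
    0.8,   # [4] colsample_bylevel
    200,   # [5] num_boost_round / n_estimators
    500,   # [6] rolling-window training size (only used by 'roll_win')
]
score, = XGB_reg_evaluation(individual, evaluation_method='roll_win')
print("rolling-window MSE:", score)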
Example #4
def go(data_dict, feats_to_use, params={"seed": 0, "silent": False, "n_jobs": -1},
       parameter_tuning=False):
    
    '''
    if with_gpu:
        xgb = XGBRegressor(seed=0, silent=False, tree_method='gpu_hist', n_gpus=-1)
    else:
        xgb = XGBRegressor(seed=0, silent=False, n_jobs=-1)
    '''
    X_train=data_dict['X_train'][feats_to_use].copy()
    y_train=data_dict['y_train'].copy()
    X_test=data_dict['X_test'][feats_to_use].copy()
    X_val=data_dict['X_val'][feats_to_use].copy()
    y_val=data_dict['y_val'].copy()

    
    
    if parameter_tuning:
        fit_params={
        "early_stopping_rounds":10, 
        "eval_metric" : "rmse", 
        "eval_set" : [(X_val,y_val)]}
        xgb=XGBRegressor() 
        train_val_features=pd.concat([X_train,X_val])
        train_val_labels=pd.concat([y_train,y_val])
        test_fold = np.zeros(train_val_features.shape[0])   # initialize all index to 0
        test_fold[:X_train.shape[0]] = -1   # set index of training set to -1, indicating not to use it in validation
        
        ps=PredefinedSplit(test_fold=test_fold)
        X_train=data_dict['X_train'][feats_to_use]
        y_train=data_dict['y_train']
        X_test=data_dict['X_test'][feats_to_use]
        grid=GridSearchCV(xgb,params,fit_params=fit_params,scoring=RMSE , cv=ps, verbose=32, n_jobs=-1)
        start=time.time()
        grid.fit(train_val_features,train_val_labels)
        elapsed=time.time()-start
        print (elapsed)
        print ('best params:',grid.best_params_)
        print ('best score:',grid.best_score_)

        return grid.best_params_, grid.best_estimator_
        
    else:
        xgb=XGBRegressor(**params)
        print (xgb)
    
        print ('start xgboost training')
        start=time.time()
        eval_set=[(X_val,y_val)]
        xgb.fit(X_train,y_train, eval_set=eval_set,eval_metric='rmse',early_stopping_rounds=30)
        elapsed=time.time()-start
        print (elapsed)
        data_dict['y_pred']=np.exp(xgb.predict(X_test))-1

        #generate submission
        data_dict['X_test']['item_cnt_month']=data_dict['y_pred']
        test=pd.read_csv('test.csv')
        submission=pd.merge(test,data_dict['X_test'], 
            on=['shop_id','item_id'],how='left')[['ID','item_cnt_month']]
        return submission, xgb
Example #5
class my_model:
    def __init__(self, d):
        self.linear_reg = linear_model.Ridge()
        self.xgb_reg = XGBRegressor(max_depth=7)
        self.d = d

    def fit(self, X, y):
        self.linear_reg.fit(X[:, 0].reshape(-1, 1), y)
        self.l_reg_res = self.linear_reg.predict(X[:, 0].reshape(-1, 1))
        self.xgb_reg.fit(X[:, 1:], y - self.l_reg_res)
        X_nn = np.hstack([
            X,
            self.xgb_reg.predict(X[:, 1:]).reshape(-1, 1),
            self.l_reg_res.reshape(-1, 1)
        ])
        return X_nn

    def predict(self, X):
        if isinstance(X[0, -1], str):
            for i in range(X.shape[0]):
                X[i, -1] = self.d[X[i, -1]]
            X = X.astype(np.float64, copy=False)

        X_nn_final = np.hstack([
            X,
            self.xgb_reg.predict(X[:, 1:]).reshape(-1, 1),
            self.linear_reg.predict(X[:, 1].reshape(-1, 1)).reshape(-1, 1)
        ])
        return X_nn_final
Example #6
def run_xgb(output_train, df_X_train, df_Y_train, output_test, df_X_test,
            df_Y_test):
    xgb_estimator = XGBRegressor()
    param_grid = {
        'nthread': [4],  #when use hyperthread, xgboost may become slower
        'objective': ['reg:linear'],
        'learning_rate': [.03, 0.05, .07],  #so called `eta` value
        'max_depth': [5, 6, 7],
        'min_child_weight': [4],
        'silent': [1],
        'subsample': [0.7],
        'colsample_bytree': [0.7],
        'n_estimators': [30]
    }

    opt_pars = {"score": None, "alpha": None}
    xgb_grid = GridSearchCV(xgb_estimator, param_grid)
    xgb_grid.fit(df_X_train, df_Y_train.cnt)
    r2_train = xgb_grid.best_score_
    opt_pars = xgb_grid.best_params_
    # n_estimators = 30,max_features='log2',bootstrap=True,  max_depth=None
    xgb_opt = XGBRegressor(random_state=1).set_params(**opt_pars)
    xgb_opt.fit(df_X_train, df_Y_train.cnt)
    r2_train = xgb_opt.score(df_X_train, df_Y_train.cnt)
    r2_test = xgb_opt.score(df_X_test, df_Y_test.cnt)
    result = df_proc.compare_results("XGBoost", xgb_opt, output_train,
                                     df_X_train, output_test, df_X_test)
    return {
        "r2": [r2_train, r2_test],
        "R2": [result[1], result[2]],
        "plot": result[0]
    }
Example #7
class XGBWrapper_regr(object):
    """
    A wrapper for xgboost model so that we will have a single api for various models.
    """

    def __init__(self):
        self.model = XGBRegressor()

    def fit(self, X_train, y_train, X_valid=None, y_valid=None, X_holdout=None, y_holdout=None, params=None):

        self.model = self.model.set_params(**params)
        
        eval_set = [(X_train, y_train)]
        if X_valid is not None:
            eval_set.append((X_valid, y_valid))
        if X_holdout is not None:
            eval_set.append((X_holdout, y_holdout))

        self.model.fit(X=X_train, y=y_train,
                       eval_set=eval_set, eval_metric='rmse',
                       verbose=params['verbose'], early_stopping_rounds=params['early_stopping_rounds'])

        scores = self.model.evals_result()
        self.best_score_ = {k: {m: m_v[-1] for m, m_v in v.items()} for k, v in scores.items()}
#         self.best_score_ = {k: {m: n if m != 'cappa' else -n for m, n in v.items()} for k, v in self.best_score_.items()}

        self.feature_importances_ = self.model.feature_importances_
    
    def predict(self, X_test):
        return self.model.predict(X_test)
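A minimal usage sketch for XGBWrapper_regr with synthetic data. This is hypothetical: the params dict must carry 'verbose' and 'early_stopping_rounds' because fit() reads them directly, and it assumes an xgboost version whose sklearn fit() still accepts eval_metric and early_stopping_rounds, as the wrapper does.

# Minimal sketch, assuming the imports used by the wrapper above are available.
import numpy as np
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split

X, y = make_regression(n_samples=2000, n_features=20, noise=0.1, random_state=0)
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=0)

params = {
    'n_estimators': 500,
    'learning_rate': 0.05,
    'max_depth': 6,
    'verbose': 100,               # read by XGBWrapper_regr.fit
    'early_stopping_rounds': 50,  # read by XGBWrapper_regr.fit
}

wrapper = XGBWrapper_regr()
wrapper.fit(X_train, y_train, X_valid=X_valid, y_valid=y_valid, params=params)
print(wrapper.best_score_)
preds = wrapper.predict(X_valid)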
Example #8
 def xgboostmodel(self):
     df = pd.read_csv(datafile, encoding='utf-8', index_col=0)
     print(df.shape)
     traindata = df.iloc[:, :].values
     x = traindata[:, :-1]
     y = traindata[:, -1]
     x_train, x_test, y_train, y_test = train_test_split(
         x, y, train_size=0.7)  # list
     if self.params is None:
         params = {'max_depth': 80, 'n_estimators': 512}
     else:
         params = self.params
     raw_model = XGBRegressor(max_depth=128,
                              n_estimators=768,
                              learning_rate=0.01,
                              silent=False)
     raw_model.fit(x_train, y_train)
     raw_model.save_model(self.model_file)
     pred = raw_model.predict(x_test)
     self.true = y_test
     self.pred = pred
     self.show_save_figure(fig_path=self.fig_path,
                           modelname=self.job_name,
                           detal_idx=500)
     t_mean = self.cal_mean(self.true)
     p_mean = self.cal_mean(self.pred)
     self.save_result(self.result_path, true_mean=t_mean, pred_mean=p_mean)
Example #9
def train_xg_boost(params):
    xg_model = XGBRegressor(n_estimators=int(params['n_estimators']), learning_rate=params['eta'], n_jobs=-1,
                            max_depth=int(params['max_depth']), gamma=params['gamma'], colsample_bytree=params['colsample_bytree'],
                            min_child_weight=params['min_child_weight'], reg_alpha=params['xg_reg_alpha'], subsample=params['subsample'],
                            reg_lambda=params['xg_reg_lambda']
                            )

    # In[ ]:
    xg_model.fit(X_train.values, y_train.values)
    training_values = xg_model.predict(X_train.values)
    print(training_values)
    training_rmse = math.sqrt(mean_squared_error(y_train, training_values))
    print("training_rmse", training_rmse)
    validation_values = xg_model.predict(X_validtn.values)
    validation_rmse = math.sqrt(mean_squared_error(y_validtn, validation_values))
    print("validation_rmse", validation_rmse)
    """test_submission = pd.DataFrame()
    test_submission["Score"] = xg_model.predict(combined_test_data)
    test_submission.to_excel('submission4.xlsx', index=False)"""

    return {
        'loss': validation_rmse,
        'status': STATUS_OK,
        'eval_time': time.time(),
    }
Example #10
class XGBRegressorMetaPrim(primitive):
    def __init__(self, random_state=0):
        super(XGBRegressorMetaPrim, self).__init__(name='XGBRegressorMeta')
        self.hyperparams = []
        self.type = 'ensemble'
        self.description = "XGBoost is an optimized distributed gradient boosting library designed to be highly efficient, flexible and portable."
        self.hyperparams_run = {'default': True}
        self.random_state = random_state
        self.model = XGBRegressor(random_state=self.random_state, n_jobs=5)
        self.accept_type = 'xgb'

    def can_accept(self, data):
        return self.can_accept_c(data, 'Regression')

    def is_needed(self, data):
        # data = handle_data(data)
        return True

    def fit(self, data):
        data = handle_data(data)
        self.model.fit(data['X'], data['Y'])

    def produce(self, data):
        output = handle_data(data)
        output['predictions'] = self.model.predict(output['X'])
        output['X'] = pd.DataFrame(output['predictions'], columns=[self.name+"Pred"])
        final_output = {0: output}
        return final_output
Example #11
    def __train_model(self, features):
        combo_list = [
            ['available_year_avg', 'min_nights_year_avg', 'price_year_avg']
        #     ['available_winter_avg', 'min_nights_winter_avg', 'price_winter_avg'],
        #     ['available_spring_avg', 'min_nights_spring_avg', 'price_spring_avg'],
        #     ['available_summer_avg', 'min_nights_summer_avg', 'price_summer_avg']
        ]
        for combo in combo_list:
            X_base = features.drop([
                'price_year_avg', 'price_winter_avg', 'price_spring_avg', 'price_summer_avg', 'price_fall_avg',
                'available_year_avg', 'available_winter_avg', 'available_spring_avg', 'available_summer_avg', 'available_fall_avg',
                'min_nights_year_avg', 'min_nights_winter_avg', 'min_nights_spring_avg', 'min_nights_summer_avg', 'min_nights_fall_avg'
            ], axis=1)
            X_base[combo[0]] = features[combo[0]]
            X_base[combo[1]] = features[combo[1]]
            y = features[combo[2]]
            X_train, X_test, y_train, y_test = train_test_split(X_base, y, test_size=.25, random_state=42, shuffle=True)

            model = XGBRegressor(
                objective='reg:squarederror',
                learning_rate=0.1,
                max_depth=8,
                n_estimators=200,
                cv=5,
                n_jobs=-1
            )
            model.fit(X_train, y_train)
            self.logger.info('Gradient boost model:')
            self.logger.info(f'Target label: {combo[2]}')
            self.logger.info(f'R^2: {model.score(X_test, y_test)}')
            self.logger.info(f'MAE: {mean_absolute_error(y_test, model.predict(X_test))}')
            return model
Example #12
def over_sample(train, test, feat):
    predictors = [x for x in train.columns if x not in ['ID', 'y']]
    groups = list(train[feat].unique())
    result = None
    for name in groups:
        train_temp = pd.concat([train, train[train[feat] == name]])
        test_temp = test[test[feat] == name]
        model = XGBRegressor(max_depth=4,
                             learning_rate=0.0045,
                             n_estimators=1250,
                             silent=True,
                             objective='reg:linear',
                             nthread=-1,
                             min_child_weight=1,
                             max_delta_step=0,
                             subsample=0.93,
                             seed=27)
        model.fit(train_temp[predictors], train_temp['y'])
        pred = model.predict(test_temp[predictors])
        if result is None:
            result = pd.DataFrame({'ID': test_temp['ID'].values, 'y': pred})
        else:
            result = pd.concat([
                result,
                pd.DataFrame({
                    'ID': test_temp['ID'].values,
                    'y': pred
                })
            ])
    result.sort_values('ID', inplace=True)

    return result
Example #13
def generate_XGB_model(train_df):
    train_df.drop(['conversionTime'], axis=1, inplace=True)
    print('Train And Fix Missing App Count Value...')
    train_df, xgb_appcount = train_model_for_appcounts(train_df)
    joblib.dump(xgb_appcount, 'XGB_missing.model')
    '''print 'Train And Fix Missing Age Value...'
    train_df, xgb_age = train_model_for_age(train_df)
    joblib.dump(xgb_age, 'XGB_age.model')'''
    train_df.drop(['marriageStatus', 'haveBaby', 'sitesetID', 'positionType'],
                  axis=1,
                  inplace=True)
    print('Done')
    print(train_df.info())
    print(train_df.describe())
    print(train_df.isnull().sum())
    train_np = train_df.values
    y = train_np[:, 0]
    X = train_np[:, 1:]
    print('Train Xgboost Model...')
    start_time = datetime.datetime.now()
    xbg_clf = XGBRegressor(n_estimators=100,
                           max_depth=6,
                           objective="binary:logistic",
                           silent=False)
    xbg_clf.fit(X, y)
    end_time = datetime.datetime.now()
    print('Training Done..., Time Cost: %d' % ((end_time - start_time).seconds))
    model_df = pd.DataFrame({
        'columns': list(train_df.columns)[1:],
        'values': xbg_clf.feature_importances_
    })
    print(model_df)
    return xbg_clf
Example #14
def get_ntree():
    rmse_t_total, rmse_v_total = [], []
    for ntree in range(10, 500, 10):
        xgb_base = XGBRegressor(objective='reg:linear',
                                n_estimators=ntree,
                                random_state=1234,
                                silent=0,
                                booster='gbtree',
                                eval_metric='rmse')
        rmse_t_1, rmse_v_1 = [], []
        print('current ntree = %s' % ntree)
        for train, test in get_cv(y=y_train, n_splits=5, random_state=42):
            X_t, y_t = X_train[train], y_train[train]
            X_v, y_v = X_train[test], y_train[test]
            xgb_base.fit(X_t, y_t)
            y_t_pre = xgb_base.predict(X_t)
            y_v_pre = xgb_base.predict(X_v)
            rmse_t_each = np.sqrt(mean_squared_error(y_t, y_t_pre))
            rmse_v_each = np.sqrt(mean_squared_error(y_v, y_v_pre))
            rmse_t_1.append(rmse_t_each)
            rmse_v_1.append(rmse_v_each)
        rmse_t = np.mean(rmse_t_1)
        rmse_v = np.mean(rmse_v_1)
        rmse_t_total.append(rmse_t)
        rmse_v_total.append(rmse_v)

    return rmse_t_total, rmse_v_total
Example #15
    def xgboost_single_pred(self):

        x_train = self.x_train
        y_train = self.y_train

        x_test = self.x_test
        y_test = self.y_test

        self.y_pred_all_xgb = []
        y_train = list(y_train)
        xgboost_clf = XGBRegressor(learning_rate=0.1, n_estimators=75)

        for i in range(len(x_test)):
            xgboost_clf.fit(x_train, y_train)
            x_test_one = x_test.iloc[i:i + 1]
            y_test_one = xgboost_clf.predict(x_test_one)
            self.y_pred_all_xgb.append(list(y_test_one)[0])
            x_train = pd.concat([x_train, x_test_one])
            y_train.append(y_test[i])

        xgboost_mse = mean_squared_error(self.y_test, self.y_pred_all_xgb)
        xgboost_rmse = np.sqrt(xgboost_mse)
        y_pred_all_xgb = pd.DataFrame(list(self.y_pred_all_xgb))
        ratio_single_xgb = pd.DataFrame(list(self.y_test)) / y_pred_all_xgb
        return xgboost_rmse, y_pred_all_xgb, ratio_single_xgb
Example #16
 def fit_model_split(self, X_train, y_train, X_test, y_test):
     # X_train_1 is used to build the model; X_train_2 is combined with the new features to form a new training set
     X_train_1, X_train_2, y_train_1, y_train_2 = train_test_split(
         X_train, y_train, test_size=0.6, random_state=0)
     clf = XGBRegressor(learning_rate=self.learning_rate,
                        n_estimators=self.n_estimators,
                        max_depth=self.max_depth,
                        min_child_weight=self.min_child_weight,
                        gamma=self.gamma,
                        subsample=self.subsample,
                        colsample_bytree=self.colsample_bytree,
                        objective=self.objective,
                        nthread=self.nthread,
                        scale_pos_weight=self.scale_pos_weight,
                        reg_alpha=self.reg_alpha,
                        reg_lambda=self.reg_lambda,
                        seed=self.seed)
     clf.fit(X_train_1, y_train_1)
     # y_pre = clf.predict(X_train_2)
     # y_pro = clf.predict_proba(X_train_2)[:, 1]
     # print("pred_leaf=T  AUC Score : %f" % metrics.roc_auc_score(y_train_2, y_pro))
     # print("pred_leaf=T  Accuracy : %.4g" % metrics.accuracy_score(y_train_2, y_pre))
     new_feature = clf.apply(X_train_2)
     X_train_new2 = self.mergeToOne(X_train_2, new_feature)
     new_feature_test = clf.apply(X_test)
     X_test_new = self.mergeToOne(X_test, new_feature_test)
     print("Training set sample size is 0.4 smaller than before")
     return X_train_new2, y_train_2, X_test_new, y_test
Example #17
    def fit_model(self, data, target, test):
        clf = XGBRegressor(learning_rate=self.learning_rate,
                           n_estimators=self.n_estimators,
                           max_depth=self.max_depth,
                           min_child_weight=self.min_child_weight,
                           gamma=self.gamma,
                           subsample=self.subsample,
                           colsample_bytree=self.colsample_bytree,
                           objective=self.objective,
                           nthread=self.nthread,
                           scale_pos_weight=self.scale_pos_weight,
                           reg_alpha=self.reg_alpha,
                           reg_lambda=self.reg_lambda,
                           seed=self.seed)
        data = np.array(data).astype(float)
        scaler = MinMaxScaler()
        temp = scaler.fit(data)
        data = scaler.transform(data)
        test = scaler.transform(test)
        target = scaler.fit_transform(target)

        clf.fit(data, target)
        new_feature = clf.apply(data)
        new_test = clf.apply(test)
        X_train_new = self.mergeToOne(pd.DataFrame(data), new_feature)
        X_test_new = self.mergeToOne(pd.DataFrame(test), new_test)
        X_train_new = pd.DataFrame(X_train_new)
        X_test_new = pd.DataFrame(X_test_new)
        return X_train_new, target, X_test_new
Example #18
def FI_xgb_sklearn():
    X, y = load_traindata(encodetype='le')
    cols = list(X.columns)

    rndcol = np.random.randn(X.shape[0])
    X = np.column_stack((X, rndcol))
    cols.append('random')

    xgb1 = XGBRegressor(learning_rate=0.01,
                        n_estimators=3320,
                        max_depth=3,
                        min_child_weight=4,
                        colsample_bytree=0.8,
                        subsample=0.8,
                        importance_type='total_gain',
                        objective='reg:linear',
                        n_jobs=-1,
                        random_state=0,
                        seed=27,
                        silent=True)

    xgb1.fit(X, y)

    imp = sorted(list(zip(cols, xgb1.feature_importances_)),
                 key=lambda t: abs(t[1]),
                 reverse=True)
    imp = pd.DataFrame(imp, columns=['Feature', 'Importance'])
    rnd_idx = np.argwhere(imp['Feature'] == 'random')[0][0]
    print(imp.iloc[:rnd_idx + 1, :])
    return imp
Example #19
def xgt_regressor(lr, max_d, estimators, X_train, X_test, y_train, y_test, obj):
    rmse = 10000
    for i in lr:
        for j in max_d:
            for k in estimators:

                clf=XGBRegressor(learning_rate=i,
                                n_estimators=k,
                                max_depth=j, 
                                min_child_weight=1, 
                                gamma=1,
                                subsample=0.5,
                                colsample_bytree=0.8,
                                objective=obj,
                                nthread=4,
                                scale_pos_weight=1, 
                                missing=np.nan)
                clf.fit(X_train,y_train)
                y_pred=clf.predict(X_test)
                
                a,b,c = pred_eval(y_pred,y_test)

                if a < rmse:
                    b_lr = i
                    b_d = j
                    b_e = k
                    
                    rmse = a
                    clf_b = clf
        
    return clf_b, (b_lr, b_d, b_e)
Example #20
def xgbt_base_rmse_mode(train_input, train_target, test_input, test_target):
    param = {
        'n_estimators': 10,
        'learning_rate': 0.01,
    }

    adj_params = {
        'n_estimators': [10, 50, 100, 200, 300, 400, 500, 1000],
        'learning_rate': [0.01, 0.1, 1]
    }

    xgbt = XGBRegressor(**param)
    cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
    cscv = GridSearchCV(xgbt,
                        adj_params,
                        scoring='neg_mean_absolute_error',
                        cv=cv,
                        n_jobs=-1)
    cscv.fit(train_input, train_target)
    xgbt = XGBRegressor(**cscv.best_params_)
    xgbt.fit(train_input, train_target.ravel())
    predicted = xgbt.predict(test_input)
    xgbt_base_rmse = np.sqrt(metrics.mean_squared_error(
        test_target, predicted))
    print("xgbt_base_rmse: ", xgbt_base_rmse)
    #print ("RMSE:", np.sqrt(metrics.mean_squared_error(test_target, predicted)))
    return xgbt_base_rmse
Example #21
def learn_model(X_train, y_train, X_valid, y_valid):
    t1 = time()
    model = XGBRegressor(max_depth=7, n_estimators=500)
    model.fit(X_train, y_train, eval_metric="rmse", eval_set=[(X_train, y_train), (X_valid, y_valid)], verbose=True, early_stopping_rounds=10)
    t2 = time()
    print('Total of training time: ', t2 - t1)
    return model
Example #22
    def apply_model(self,
                    feature_names,
                    cv_type=None,
                    search_type=None,
                    scorer=None):
        self.cv_type = cv_type
        self.search_type = search_type
        self.scorer = scorer

        if cv_type is None:
            print("Seed:", self.seed)
            x_gb = XGBRegressor(learning_rate=0.1,
                                n_estimators=300,
                                random_state=self.seed)
            x_gb.fit(self.X_train, self.y_train)
        else:
            if search_type == "random":
                parameter_grid = self._get_random_parameter_grid()
                print("Seed:", self.seed)
                x_gb_search = RandomizedSearchCV(
                    estimator=XGBRegressor(n_estimators=300,
                                           random_state=self.seed),
                    param_distributions=parameter_grid,
                    scoring=self._get_scorer(),
                    n_iter=150,
                    cv=self._get_cv(),
                    iid=False,
                    random_state=self.seed,
                    n_jobs=-1)
                x_gb_search.fit(self.X_train, self.y_train)
                x_gb = x_gb_search.best_estimator_

                mean_test_score = x_gb_search.cv_results_["mean_test_score"]
                std_test_score = x_gb_search.cv_results_["std_test_score"]
                print(max(mean_test_score),
                      std_test_score[np.argmax(mean_test_score)])
                print(x_gb)
            elif search_type == "smac":
                smac = self._get_smac()
                try:
                    incumbent = smac.optimize()
                finally:
                    incumbent = smac.solver.incumbent
                x_gb = XGBRegressor(
                    learning_rate=0.1,
                    n_estimators=300,
                    seed=self.seed,
                    max_depth=incumbent["max_depth"],
                    min_child_weight=incumbent["min_child_weight"],
                    gamma=incumbent["gamma"],
                    subsample=incumbent["subsample"],
                    colsample_bytree=incumbent["colsample_bytree"])
                x_gb.fit(self.X_train, self.y_train)
                print(x_gb)
            else:
                raise ValueError("search_type must be either random or smac.")

        train_prediction, test_prediction = self.apply_predict(x_gb)
        return train_prediction, test_prediction
Example #23
def xgb_regressor(x, y):
    model = XGBRegressor(n_estimators=33,
                         learning_rate=0.1,
                         subsample=0.6,
                         max_depth=1,
                         objective='rank:pairwise',
                         random_state=0)
    model.fit(x, y)
    return model
Example #24
def get_feat_imp(train,ID='id',target='price_doc'):

    predictors = [x for x in train.columns if x not in [ID,target]]
    model = XGBRegressor( max_depth=5, learning_rate=0.05, n_estimators=385,
                          silent=True, objective='reg:linear', nthread=-1, min_child_weight=1,
                          max_delta_step=0, subsample=0.93, seed=27)
    model.fit(train[predictors],train[target])
    feat_imp = pd.Series(model.booster().get_fscore(),index=predictors).sort_values(ascending=False)
    return feat_imp
Example #25
def cv_test(train, cv=5):

    t0 = time.time()
    target = 'y'
    predictors = [x for x in train.columns if x not in ['ID', 'y']]
    train_X = train[predictors]
    train_Y = train[target]

    mean_r2 = []
    kf = KFold(n_splits=cv, shuffle=True, random_state=520)
    for i, (train_index, test_index) in enumerate(kf.split(train_X)):

        x_train = train_X.iloc[train_index]
        x_test = train_X.iloc[test_index]
        y_train = train_Y.iloc[train_index]
        y_test = train_Y.iloc[test_index]

        lgb_model = LGBMRegressor(boosting_type='gbdt',
                                  num_leaves=10,
                                  max_depth=4,
                                  learning_rate=0.005,
                                  n_estimators=675,
                                  max_bin=25,
                                  subsample_for_bin=50000,
                                  min_split_gain=0,
                                  min_child_weight=5,
                                  min_child_samples=10,
                                  subsample=0.995,
                                  subsample_freq=1,
                                  colsample_bytree=1,
                                  reg_alpha=0,
                                  reg_lambda=0,
                                  seed=0,
                                  nthread=-1,
                                  silent=True)
        xgb_model = XGBRegressor(max_depth=4,
                                 learning_rate=0.0045,
                                 n_estimators=1250,
                                 silent=True,
                                 objective='reg:linear',
                                 nthread=-1,
                                 min_child_weight=1,
                                 max_delta_step=0,
                                 subsample=0.93,
                                 seed=27)
        xgb_model.fit(x_train, y_train)

        pred = xgb_model.predict(x_test)
        from sklearn.metrics import r2_score
        score = r2_score(y_test, pred)
        mean_r2.append(score)
        print('{0}: r2:{1}\n\n'.format(i + 1, score))

    print('r2 mean: %s' % (np.array(mean_r2).mean()))
    print('Done in %.1fs!' % (time.time() - t0))

    return None
Example #26
def run():
    # Load data set
    X_train, Y_train, X_test, submission_file_content = load_data()
    Y_train = np.log(Y_train + 200)

    # Cross validation
    cross_validation_iterator = ShuffleSplit(n_splits=CROSS_VALIDATION_NUM,
                                             test_size=0.1,
                                             random_state=0)
    for cross_validation_index, (train_index, valid_index) in enumerate(
            cross_validation_iterator.split(X_train), start=1):
        print("Working on {}/{} ...".format(cross_validation_index,
                                            CROSS_VALIDATION_NUM))

        submission_file_path = os.path.join(
            SUBMISSION_FOLDER_PATH,
            "submission_{}.csv".format(cross_validation_index))

        if os.path.isfile(submission_file_path):
            continue

        model = XGBRegressor(learning_rate=0.01,
                             max_depth=12,
                             n_estimators=N_ESTIMATORS,
                             silent=False,
                             objective="reg:linear",
                             gamma=1,
                             min_child_weight=1,
                             subsample=0.8,
                             colsample_bytree=0.5,
                             reg_alpha=1,
                             seed=cross_validation_index,
                             nthread=-1)

        model.fit(X_train[train_index],
                  Y_train[train_index],
                  eval_set=[(X_train[valid_index], Y_train[valid_index])],
                  eval_metric=lambda y_predicted, y_true:
                  ("actual_mae",
                   mean_absolute_error(np.exp(y_true.get_label()),
                                       np.exp(y_predicted))),
                  early_stopping_rounds=EARLY_STOPPING_ROUNDS,
                  verbose=True)

        # Perform the testing procedure
        Y_test = model.predict(X_test)

        # Save submission to disk
        if not os.path.isdir(SUBMISSION_FOLDER_PATH):
            os.makedirs(SUBMISSION_FOLDER_PATH)
        submission_file_content[LABEL_COLUMN_NAME] = np.exp(Y_test) - 200
        submission_file_content.to_csv(submission_file_path, index=False)

    # Perform ensembling
    ensemble_predictions()

    print("All done!")
Example #27
def model(df, alpha):
    X = df
    y = df.pop('y')

    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=.3,
                                                        random_state=42)

    upper = _random_search(
        X_train, y_train,
        XGBOOSTQUANTILE(quant_alpha=1 - alpha / 2,
                        quant_delta=1,
                        quant_thres=6,
                        quant_var=3.2), {
                            'quant_delta': uniform(.01, 12),
                            'quant_thres': uniform(1, 12),
                            'quant_var': uniform(1, 12)
                        }).predict(X_test)

    lower = _random_search(
        X_train, y_train,
        XGBOOSTQUANTILE(quant_alpha=alpha / 2,
                        quant_delta=1,
                        quant_thres=6,
                        quant_var=3.2), {
                            'quant_delta': uniform(.01, 12),
                            'quant_thres': uniform(1, 12),
                            'quant_var': uniform(1, 12)
                        }).predict(X_test)

    median = _random_search(
        X_train, y_train,
        XGBOOSTQUANTILE(quant_alpha=.5,
                        quant_delta=1,
                        quant_thres=6,
                        quant_var=3.2), {
                            'quant_delta': uniform(.01, 12),
                            'quant_thres': uniform(1, 12),
                            'quant_var': uniform(1, 12)
                        }).predict(X_test)

    xgbls = XGBRegressor()
    xgbls.fit(X_train, y_train)
    mean = xgbls.predict(X_test)

    return pd.concat([
        X_test.reset_index(drop=True),
        y_test.reset_index(drop=True),
        pd.DataFrame(upper, columns=['upper_bound']),
        pd.DataFrame(lower, columns=['lower_bound']),
        pd.DataFrame(mean, columns=['mean']),
        pd.DataFrame(median, columns=['median'])
    ],
                     axis=1)
Example #28
def xgb_regression(X_train, y_train, X_val, y_val, X_test, y_test, args):
    if y_test.shape[-1] == 1:
        model = XGBRegressor(
            learning_rate=0.1,
            max_depth=4,  # 4
            min_child_weight=10,
            gamma=1,  # 1
            subsample=0.8,
            colsample_bytree=0.8,
            reg_alpha=0.8,
            objective='reg:linear',
            n_estimators=2000,
            tree_method='gpu_hist',
            n_gpus=-1
        )
        model.fit(X_train, y_train, eval_set=[(X_val, y_val)], eval_metric='rmse',
                  early_stopping_rounds=300)
        y_pred = model.predict(X_test)
        y_test = y_test.astype('float')
        MSE = mean_squared_error(y_test, y_pred)
        RMSE = MSE ** 0.5
        return RMSE
    else:
        RMSEs = []
        if len(y_train.shape) == 3:
            y_train = [x[0] for x in y_train]
            y_val = [x[0] for x in y_val]
            y_test = [x[0] for x in y_test]
            y_train = pd.DataFrame(y_train)
            y_val = pd.DataFrame(y_val)
            y_test = pd.DataFrame(y_test)
        for i in range(y_test.shape[1]):
            if float(max(y_val[i])) == 0 or float(max(y_train[i])) == 0 or float(max(y_test[i])) == 0:
                continue
            model = XGBRegressor(
                learning_rate=0.1,
                max_depth=4,  # 4
                min_child_weight=10,
                gamma=1,  # 1
                subsample=0.8,
                colsample_bytree=0.8,
                reg_alpha=0.8,
                objective='reg:linear',
                n_estimators=2000,
                tree_method='gpu_hist',
                n_gpus=-1
            )
            model.fit(X_train, [float(k) for k in y_train[i]], eval_set=[(X_val, [float(k) for k in y_val[i]])], eval_metric='rmse',
                      early_stopping_rounds=300)
            y_pred = model.predict(X_test)
            y_test = y_test.astype('float')
            MSE = mean_squared_error(y_test[i], y_pred)
            RMSE = MSE ** 0.5
            RMSEs.append(RMSE)
        return np.mean(RMSEs)
Example #29
def run_xgb(**args):
    print("building xgb model:")
    xgb_model = XGBRegressor()
    xgb_model.fit(args["training_data"], args["training_label"])
    output = xgb_model.predict(args["test_data"])
    pickle.dump(xgb_model, open("xgb_testmodel.p", "wb"))

    output = list(map(lambda e: round(e), output))
    print(output)
    pickle.dump(output, open("xgb_output.p", "wb"))
    return output
Example #30
def fit_model(X, y, model_name):
    print('Model Fitting started: ', datetime.now())
    start_time = time.time()

    classifier = XGBRegressor(objective='reg:squarederror', n_jobs=8, n_estimators= 1000, verbosity= 3)
    classifier.fit(X, y)
    pickle.dump(classifier, open("models/{}.pickle.dat".format(model_name), 'wb'))

    print('Duration Fitting: ', (time.time() - start_time))

    return classifier
Example #31
def xgb(x_train, y_train, x_val, y_val):
    xgb = XGBRegressor(n_estimators=1000,
                       max_depth=10,
                       learning_rate=0.01,
                       subsample=0.8,
                       colsample_bytree=0.8,
                       random_state=2000)
    xgb.fit(x_train, y_train)
    result = xgb.predict(x_val)
    score = mean_absolute_error(result, y_val)
    return score
Example #32
def run():
    # Load data set
    X_train, Y_train, X_test, submission_file_content = load_data()
    Y_train = np.log(Y_train + 200)

    # Cross validation
    cross_validation_iterator = ShuffleSplit(n_splits=CROSS_VALIDATION_NUM, test_size=0.1, random_state=0)
    for cross_validation_index, (train_index, valid_index) in enumerate(cross_validation_iterator.split(X_train), start=1):
        print("Working on {}/{} ...".format(cross_validation_index, CROSS_VALIDATION_NUM))

        submission_file_path = os.path.join(SUBMISSION_FOLDER_PATH, "submission_{}.csv".format(cross_validation_index))

        if os.path.isfile(submission_file_path):
            continue

        model = XGBRegressor(
            learning_rate=0.01,
            max_depth=12,
            n_estimators=N_ESTIMATORS,
            silent=False,
            objective="reg:linear",
            gamma=1,
            min_child_weight=1,
            subsample=0.8,
            colsample_bytree=0.5,
            reg_alpha=1,
            seed=cross_validation_index,
            nthread=-1)

        model.fit(X_train[train_index], Y_train[train_index], eval_set=[(X_train[valid_index], Y_train[valid_index])],
            eval_metric=lambda y_predicted, y_true:("actual_mae", mean_absolute_error(np.exp(y_true.get_label()), np.exp(y_predicted))),
            early_stopping_rounds=EARLY_STOPPING_ROUNDS, verbose=True)

        # Perform the testing procedure
        Y_test = model.predict(X_test)

        # Save submission to disk
        if not os.path.isdir(SUBMISSION_FOLDER_PATH):
            os.makedirs(SUBMISSION_FOLDER_PATH)
        submission_file_content[LABEL_COLUMN_NAME] = np.exp(Y_test) - 200
        submission_file_content.to_csv(submission_file_path, index=False)

    # Perform ensembling
    ensemble_predictions()

    print("All done!")
Example #33
class Xgb:
    def __init__(self, df, target_column='', id_column='', target_type='binary', categorical_columns=[], drop_columns=[], numeric_columns=[], num_training_rounds=500, verbose=1, sample_fraction=1.0, n_samples=1, early_stopping_rounds=None, prefix='xgb_model', scoring=None):
        """
        input params:
        - df (DataFrame): dataframe of training data
        - target_column (string): name of target column
        - id_column (string): name of id column
        - target_type (string): 'linear' or 'binary'
        - categorical_columns (list): list of column names of categorical data. Will perform one-hot encoding
        - drop_columns (list): list of columns to drop
        - numeric_columns (list): list of columns to convert to numeric
        - verbose (bool): verbosity of printouts
        """
        # checks for sampling
        sample_fraction = float(sample_fraction)
        if sample_fraction > 1:
            sample_fraction = 1.0
        if sample_fraction * n_samples > 1:
            n_samples = round(1.0/sample_fraction)
        if sample_fraction <= 0:
            print('sample_fraction 0 or negative, switching to 0.1')
            sample_fraction = 0.1
        # if sample_fraction results in a sample smaller than one row
        if round(sample_fraction * len(df)) == 0:
            sample_fraction = 1.0/len(df)
        # check if data is dataframe
        if type(df) == pd.core.frame.DataFrame:
            self.df = df
            self.early_stopping_rounds = early_stopping_rounds
            if target_column:
                self.target_column = target_column
                self.id_column = id_column
                self.target_type = target_type
                self.categorical_columns = categorical_columns
                self.numeric_columns = numeric_columns
                self.drop_columns = drop_columns
                self.verbose = verbose
                self.sample_fraction = sample_fraction
                self.n_samples = n_samples
                self.num_training_rounds = num_training_rounds
                self.prefix = prefix
                # init the classifier:
                if self.target_type == 'binary':
                    self.scoring = 'auc'
                    self.clf = XGBClassifier(
                        learning_rate =0.1,
                        n_estimators = num_training_rounds,
                        subsample = 0.8,
                        colsample_bytree = 0.8,
                        objective = 'binary:logistic',
                        scale_pos_weight = 1,
                        seed = 123)
                elif self.target_type == 'multiclass':
                    self.scoring = 'merror'
                    self.clf = XGBClassifier(
                        learning_rate =0.1,
                        n_estimators = num_training_rounds,
                        subsample = 0.8,
                        colsample_bytree = 0.8,
                        objective = 'multi:softmax',
                        scale_pos_weight = 1,
                        seed = 123)
                elif self.target_type == 'linear':
                    self.scoring = 'rmse'
                    self.clf = XGBRegressor(
                            n_estimators = num_training_rounds,
                            objective = 'reg:linear'
                            )
                # if preferred scoring metric is stated:
                if scoring:
                    self.scoring = scoring
            else:
                print('please provide target column name')
        else:
            print('please provide pandas dataframe')

    def train(self):
        print('#### preprocessing ####')
        self.df = self.preprocess(self.df)

        print('#### training ####')
        self.predictors = [x for x in self.df.columns if x not in [self.target_column, self.id_column]]
        xgb_param = self.clf.get_xgb_params()

        # if subsampling
        if self.sample_fraction == 1.0:
            df_list = [self.df]
        else:
            df_list = self.random_sample(df=self.df, fraction=self.sample_fraction, n_samples=self.n_samples)
        print(df_list)
        for idx, current_df in enumerate(df_list):
            print('ITERATION ' + str(idx) + ' of ' + str(self.n_samples) +', sample_fraction=' + str(self.sample_fraction))
            xgtrain  = xgb.DMatrix(current_df[self.predictors], label=current_df[self.target_column], missing=np.nan)
            try:
                cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=self.clf.get_params()['n_estimators'], nfold=5,
                    metrics=[self.scoring], early_stopping_rounds=self.early_stopping_rounds, show_progress=self.verbose)
            except:
                try:
                    cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=self.clf.get_params()['n_estimators'], nfold=5,
                        metrics=[self.scoring], early_stopping_rounds=self.early_stopping_rounds, verbose_eval=self.verbose)
                except:
                    xgb_param['num_class'] = len(current_df[self.target_column].unique())
                    cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=self.clf.get_params()['n_estimators'], nfold=5,
                        metrics=[self.scoring], early_stopping_rounds=self.early_stopping_rounds)
            self.clf.set_params(n_estimators=cvresult.shape[0])
            self.clf.fit(current_df[self.predictors], current_df[self.target_column], eval_metric=self.scoring)

            # Predict training set:
            train_df_predictions = self.clf.predict(current_df[self.predictors])

            if self.target_type == 'binary' or self.target_type == 'multiclass':
                train_df_predprob = self.clf.predict_proba(current_df[self.predictors])[:,1]
                print("Accuracy : %.4g" % metrics.accuracy_score(current_df[self.target_column].values, train_df_predictions))
                if self.target_type == 'binary':
                    print("AUC Score (Train): %f" % metrics.roc_auc_score(current_df[self.target_column], train_df_predprob))
            elif self.target_type == 'linear':
                print("Mean squared error: %f" % metrics.mean_squared_error(current_df[self.target_column].values, train_df_predictions))
                print("Root mean squared error: %f" % np.sqrt(metrics.mean_squared_error(current_df[self.target_column].values, train_df_predictions)))
            filename = self.prefix + '_' + str(idx) + '.pkl'
            self.save(filename)

    def predict(self, test_df, return_multi_outputs=False, return_mean_std=False):
        print('### predicting ###')
        print('## preprocessing test set')
        if self.id_column in test_df:
            ids = test_df[self.id_column]
        if self.target_column in test_df.columns:
            targets = test_df[self.target_column]
        self.test_df = self.preprocess(test_df, train=False)
        if self.id_column in test_df:
            self.test_df[self.id_column] = ids
        if self.target_column in test_df.columns:
            self.test_df[self.target_column] = targets
        for col in self.predictors:
            if col not in self.test_df.columns:
                self.test_df[col] = np.nan

        # prediction
        print('## predicting from test set')
        output_list = []
        output = None
        for idx, ns in enumerate(range(self.n_samples)):
            if self.n_samples == 1:
                xgb = self
                if self.target_type == 'binary':
                    output = xgb.clf.predict_proba(self.test_df[self.predictors])[:,1]
                elif self.target_type == 'linear':
                    output = xgb.clf.predict(self.test_df[self.predictors])
            else:
                try:
                    filename = self.prefix + '_' + str(idx) + '.pkl'
                    xgb = self.load(filename)
                    if self.target_type == 'binary':
                        output = xgb.clf.predict_proba(self.test_df[self.predictors])[:,1]
                    elif self.target_type == 'linear':
                        output = xgb.clf.predict(self.test_df[self.predictors])
                    output_list.append(list(output))
                except IOError:
                    print('no file found, skipping')
        # average the outputs if n_samples is more than one
        if self.n_samples == 1:
            self.output = output
            try:
                self.multi_outputs = [list(output)]
            except:
                self.multi_outputs = None
        else:
            self.output = np.mean(output_list, axis=0)
            self.multi_outputs = output_list
        if return_multi_outputs:
            return self.multi_outputs
        elif return_mean_std:
            return (self.output, np.std(output_list, axis=0))
        else:
            return self.output

    def feature_importance(self, num_print=10, display=True):
        feature_importance = sorted(list(self.clf.booster().get_fscore().items()), key = operator.itemgetter(1), reverse=True)

        impt = pd.DataFrame(feature_importance)
        impt.columns = ['feature', 'importance']
        print(impt[:num_print])
        if display:
            impt[:num_print].plot("feature", "importance", kind="barh", color=sns.color_palette("deep", 3))


    def preprocess(self, df, train=True):
        # one hot encoding of categorical variables
        print('## one hot encoding of categorical variables')
        for col in self.categorical_columns:
            if self.verbose:
                print('one hot encoding: ', col)
            df = pd.concat([df, pd.get_dummies(df[col]).rename(columns=lambda x: col+'_'+str(x))], axis=1)
            df = df.drop([col], axis=1)

        # if training, determine columns to be removed
        if train:
            # drop columns that are too sparse to be informative
            self.cols_to_remove = []
            print('## dropping columns below sparsity threshold')
            for col in df.columns:
                nan_cnt = 0
                for x in df[col]:
                    try:
                        if np.isnan(x):
                            nan_cnt += 1
                    except:
                        pass
                if nan_cnt/float(len(df[col])) > 0.6: # arbitrary cutoff, if more than 60% missing then drop
                    if self.verbose:
                        print('will drop', col)
                    self.cols_to_remove.append(col)

            # drop columns that have no standard deviation (not informative)
            print('## dropping columns with no variation')
            for col in df.columns:
                if df[col].dtype == 'int64' or df[col].dtype == 'float64':
                    if df[col].std() == 0:
                        print('will drop', col)
                        self.cols_to_remove.append(col)
        if self.verbose and self.cols_to_remove:
            print('dropping the following columns:', self.cols_to_remove)
        df = df.drop(self.cols_to_remove, axis=1)

        if self.verbose:
            print('## DataFrame shape is now:', df.shape)

        # convert to numerical where possible
        #print('## converting numerical data to numeric dtype')
        #df = df.convert_objects(convert_numeric=True)

        # convert columns specified to be int and float
        for col in self.numeric_columns:
            if self.verbose:
                print('converting', col)
            df[col] = pd.to_numeric(df[col], errors='coerce')
            if self.verbose:
                print(df[col].dtype)
        df = df.drop(self.drop_columns, axis=1)

        # drop all those that are object type
        print('## dropping non-numerical columns')
        for col in df.columns:
            if df[col].dtype == 'int64' or df[col].dtype == 'float64' or df[col].dtype == 'bool':
                pass
            else:
                if self.verbose:
                    print('dropping because not int, float, or bool:', col)
                df = df.drop([col], axis=1)
        return df

    def _to_int(self, num):
        try:
            return int(num)
        except:
            return

    def _to_float(self, num):
        try:
            return float(num)
        except:
            return

    def random_sample(self, df, fraction=0.2, n_samples=None):
        """
        splits into random samples
        - n_samples: how many samples you want returned (default = All)
        - fraction : what fraction of data to include in the sample (default = 0.2)
        """
        print('constructing random samples')
        num_rows = len(df)
        len_sample = round(fraction * num_rows)
        # create list of slice index lists
        indices = list(range(0, num_rows))
        slice_list = []
        tmp_idx_list = []
        while len(indices) > 0:
            while len(tmp_idx_list) < len_sample and len(indices) > 0:
                idx = indices.pop(random.randrange(len(indices)))
                tmp_idx_list.append(idx)
            slice_list.append(tmp_idx_list)
            tmp_idx_list = []
        # get slices
        sample_list = []
        for s in range(n_samples):
            try:
                sample_list.append(df.loc[slice_list[s],:])
            except:
                pass
        return sample_list

    def write_csv(self, filename, include_actual=False):
        """
        write results to csv
        - include actual: if actual values are known for test set, and we want to print them
        """
        with open(filename, 'w', newline='') as csvfile:
            writer = csv.writer(csvfile)
            headers = [self.id_column, self.target_column]
            if include_actual:
                headers.append('actual')
            writer.writerow(headers)
            try:
                for idx, value in enumerate(self.output):
                    test_id = self.test_df[self.id_column][idx]
                    test_output = self.output[idx]
                    to_write = [test_id, test_output]
                    if include_actual:
                        to_write.append(self.test_df[self.target_column][idx])
                    writer.writerow(to_write)
                print('results written to ' + filename)
            except:
                print('write_csv failed')

    def save(self, filename='xgb.pkl'):
        joblib.dump(self, filename)

    def load(self, model_file='xgb.pkl'):
        xgb = joblib.load(model_file)
        return xgb
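
# A brief persistence sketch for the class above (hedged; `model` and `test_df` are
# placeholder names for a trained Xgb instance and a test DataFrame):
#
#     model.save('xgb.pkl')             # joblib-pickles the whole fitted wrapper
#     restored = model.load('xgb.pkl')  # returns the restored Xgb instance
#     restored.predict(test_df)
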
pd.Series(rf_age_valid_pred).isnull().sum()
statistics.mean(error_df.sqerr)

len(rf_age_valid_pred)
len(age_valid_Y)
pd.crosstab(pd.Series(rf_age_valid_pred).apply(lambda x: round(x)),age_valid_Y)

## XGB for age prediction
from xgboost.sklearn import XGBRegressor

xgb = XGBRegressor(max_depth=6, learning_rate=0.2, n_estimators=100,
                    objective='reg:linear', subsample=0.5, colsample_bytree=0.5, seed=321)

eval_set = [(mvt_train_X.drop(['age', 'gender'], axis=1), age_train_Y),
            (mvt_valid_X.drop(['age', 'gender'], axis=1), age_valid_Y)]
xgb.fit(mvt_train_X.drop(['age', 'gender'], axis=1), age_train_Y,
        eval_set=eval_set, eval_metric='rmse', early_stopping_rounds=10, verbose=1)
xgb_age_valid_pred = xgb.predict(mvt_valid_X.drop(['age', 'gender'], axis=1))
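
# A small hedged evaluation sketch for the XGB validation predictions above (assumes
# xgb_age_valid_pred and age_valid_Y are defined exactly as in this snippet):
from sklearn import metrics
import numpy as np
xgb_age_rmse = np.sqrt(metrics.mean_squared_error(age_valid_Y, xgb_age_valid_pred))
print('XGB age validation RMSE: %.4f' % xgb_age_rmse)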


## ADAboost for age prediction
from sklearn.ensemble import AdaBoostRegressor
ada = AdaBoostRegressor(n_estimators=50, learning_rate=0.1, loss='linear', random_state=321)
ada.fit(mvt_train_X.drop(['age', 'gender'], axis=1), age_train_Y.values)
ada_age_valid_pred = ada.predict(mvt_valid_X.drop(['age', 'gender'], axis=1))

len(ada_age_valid_pred)
len(age_valid_Y)

error_df = pd.DataFrame(pd.Series(ada_age_valid_pred), columns=['pred'])
error_df.reset_index(inplace=True)
act_df = pd.DataFrame(age_valid_Y)
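
# A hedged continuation sketch (the 'age' column name is an assumption, mirroring the
# error_df.sqerr pattern used for the random-forest predictions earlier in this script):
#
#     act_df.reset_index(inplace=True)
#     error_df['actual'] = act_df['age']
#     error_df['sqerr'] = (error_df['pred'] - error_df['actual']) ** 2
#     print(error_df['sqerr'].mean())
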
gsearch7.best_params_
#{'reg_alpha': 0.09}
xgb1.set_params(reg_alpha=0.09)

# parameters of the algorithm
xgb1=XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.7, gamma=0, learning_rate=0.5, max_delta_step=0,
       max_depth=7, min_child_weight=4, missing=None, n_estimators=1000,
       n_jobs=1, nthread=4, objective='reg:gamma', random_state=0,
       reg_alpha=0.09, reg_lambda=1, scale_pos_weight=1, seed=1048,
       silent=True, subsample=0.86)
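# note: the 'reg:gamma' objective assumes strictly positive target values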
###################

# fit the algorithm on the data

xgb1.fit(train_x, train_y, eval_metric='rmse')

### train vs test
pre = xgb1.predict(test_x)
plt.figure(figsize=(16,9))
plt.style.use('ggplot')
plt.plot(pre,label='predict_load1')
plt.plot(np.array(test_y),label='test_load1')
plt.title('Test_prediction  MAE=%s' % str(np.sum(abs(pre-test_y))/len(pre)))
plt.legend(loc='upper left')
plt.savefig("D:\\load Forecasting\\plot\\TraiVsTest.jpg")


##### new data
pre1 = xgb1.predict(load1_test.drop('load1', axis=1))
plt.figure(figsize=(16,9))
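
# A hedged sketch of the new-data plot this figure is being prepared for (mirrors the
# train-vs-test plot above; the label and output path are illustrative assumptions):
#
#     plt.plot(pre1, label='predict_load1_new')
#     plt.legend(loc='upper left')
#     plt.savefig("D:\\load Forecasting\\plot\\NewData.jpg")
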
Example #36
0
class Xgb:
    def __init__(self, df, target_column='', id_column='', target_type='binary', categorical_columns=[], drop_columns=[], numeric_columns=[], num_training_rounds=500, verbose=1, early_stopping_rounds=None):
        """
        input params:
        - df (DataFrame): dataframe of training data
        - target_column (string): name of target column
        - id_column (string): name of id column
        - target_type (string): 'linear' or 'binary'
        - categorical_columns (list): list of column names of categorical data. Will perform one-hot encoding
        - drop_columns (list): list of columns to drop
        - numeric_columns (list): list of columns to convert to numeric
        - num_training_rounds (int): maximum number of boosting rounds (cross-validation may settle on fewer)
        - verbose (bool): verbosity of printouts
        - early_stopping_rounds (int or None): passed to xgb.cv to stop early when the metric stops improving
        """
        if type(df) == pd.core.frame.DataFrame:
            self.df = df
            self.early_stopping_rounds = early_stopping_rounds
            if target_column:
                self.target_column = target_column
                self.id_column = id_column
                self.target_type = target_type
                self.categorical_columns = categorical_columns
                self.numeric_columns = numeric_columns
                self.drop_columns = drop_columns
                self.verbose = verbose
                self.num_training_rounds = num_training_rounds
                # init the classifier
                if self.target_type == 'binary':
                    self.scoring = 'auc'
                    self.clf = XGBClassifier(
                        learning_rate=0.1,
                        n_estimators=num_training_rounds,
                        subsample=0.8,
                        colsample_bytree=0.8,
                        objective='binary:logistic',
                        scale_pos_weight=1,
                        seed=123)
                elif self.target_type == 'linear':
                    self.scoring = 'rmse'
                    self.clf = XGBRegressor(
                        n_estimators=num_training_rounds,
                        objective='reg:linear')
            else:
                print('please provide target column name')
        else:
            print('please provide pandas dataframe')

    def train(self):
        print('#### preprocessing ####')
        self.df = self.preprocess(self.df)

        print('#### training ####')
        self.predictors = [x for x in self.df.columns if x not in [self.target_column, self.id_column]]
        xgb_param = self.clf.get_xgb_params()

        xgtrain  = xgb.DMatrix(self.df[self.predictors], label=self.df[self.target_column], missing=np.nan)
        try:
            cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=self.clf.get_params()['n_estimators'], nfold=5,
                metrics=[self.scoring], early_stopping_rounds=self.early_stopping_rounds, show_progress=self.verbose)
        except:
            try:
                cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=self.clf.get_params()['n_estimators'], nfold=5,
                    metrics=[self.scoring], early_stopping_rounds=self.early_stopping_rounds, verbose_eval=self.verbose)
            except:
                cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=self.clf.get_params()['n_estimators'], nfold=5,
                    metrics=[self.scoring], early_stopping_rounds=self.early_stopping_rounds)
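        # use the cross-validated result to set the effective number of boosting rounds,
        # then refit the sklearn wrapper on the full training data with that setting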
        self.clf.set_params(n_estimators=cvresult.shape[0])
        self.clf.fit(self.df[self.predictors], self.df[self.target_column],eval_metric=self.scoring)

        #Predict training set:
        train_df_predictions = self.clf.predict(self.df[self.predictors])

        if self.target_type == 'binary':
            train_df_predprob = self.clf.predict_proba(self.df[self.predictors])[:,1]
            print("Accuracy : %.4g" % metrics.accuracy_score(self.df[self.target_column].values, train_df_predictions))
            print("AUC Score (Train): %f" % metrics.roc_auc_score(self.df[self.target_column], train_df_predprob))
        elif self.target_type == 'linear':
            print("Mean squared error: %f" % metrics.mean_squared_error(self.df[self.target_column].values, train_df_predictions))
            print("Root mean squared error: %f" % np.sqrt(metrics.mean_squared_error(self.df[self.target_column].values, train_df_predictions)))

    def predict(self, test_df):
        print('### predicting ###')
        print('## preprocessing test set')
        if self.id_column in test_df:
            ids = test_df[self.id_column]
        if self.target_column in test_df.columns:
            targets = test_df[self.target_column]
        self.test_df = self.preprocess(test_df, train=False)
        if self.id_column in test_df:
            self.test_df[self.id_column] = ids
        if self.target_column in test_df.columns:
            self.test_df[self.target_column] = targets
        for col in self.predictors:
            if col not in self.test_df.columns:
                self.test_df[col] = np.nan

        if self.target_type == 'binary':
            self.output = self.clf.predict_proba(self.test_df[self.predictors])[:,1]
        elif self.target_type == 'linear':
            self.output = self.clf.predict(self.test_df[self.predictors])
        return self.output

    def feature_importance(self, num_print=10, display=True):
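        # note: newer xgboost releases expose the underlying booster via self.clf.get_booster() rather than booster()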
        feature_importance = sorted(list(self.clf.booster().get_fscore().items()), key=operator.itemgetter(1), reverse=True)

        impt = pd.DataFrame(feature_importance)
        impt.columns = ['feature', 'importance']
        print(impt[:num_print])
        if display:
            impt[:num_print].plot("feature", "importance", kind="barh", color=sns.color_palette("deep", 3))


    def preprocess(self, df, train=True):
        # one hot encoding of categorical variables
        print('## one hot encoding of categorical variables')
        for col in self.categorical_columns:
            if self.verbose:
                print('one hot encoding: ', col)
            df = pd.concat([df, pd.get_dummies(df[col]).rename(columns=lambda x: col+'_'+str(x))], axis=1)
            df = df.drop([col], axis=1)

        # if training, determine columns to be removed
        if train:
            # drop columns that are too sparse to be informative
            self.cols_to_remove = []
            print('## dropping columns below sparsity threshold')
            for col in df.columns:
                nan_cnt = 0
                for x in df[col]:
                    try:
                        if np.isnan(x):
                            nan_cnt += 1
                    except:
                        pass
                if nan_cnt/float(len(df[col])) > 0.6: # arbitrary cutoff, if more than 60% missing then drop
                    if self.verbose:
                        print('will drop', col)
                    self.cols_to_remove.append(col)

            # drop columns that have no standard deviation (not informative)
            print('## dropping columns with no variation')
            for col in df.columns:
                if df[col].dtype == 'int64' or df[col].dtype == 'float64':
                    if df[col].std() == 0:
                        print('will drop', col)
                        self.cols_to_remove.append(col)
        # drop the columns identified above (this should happen regardless of verbosity)
        if self.cols_to_remove:
            if self.verbose:
                print('dropping the following columns:', self.cols_to_remove)
            df = df.drop(self.cols_to_remove, axis=1)

        if self.verbose:
            print('## DataFrame shape is now:', df.shape)

        # convert to numerical where possible
        #print('## converting numerical data to numeric dtype')
        #df = df.convert_objects(convert_numeric=True)

        # convert columns specified to be int and float
        for col in self.numeric_columns:
            if col not in self.cols_to_remove:
                if self.verbose:
                    print('converting', col)
                df[col] = pd.to_numeric(df[col], errors='coerce')
                if self.verbose:
                    print(df[col].dtype)

        # drop those marked for dropping
        df = df.drop(self.drop_columns, axis=1)

        # drop all those that are object type
        print('## dropping non-numerical columns')
        for col in df.columns:
            if df[col].dtype == 'int64' or df[col].dtype == 'float64' or df[col].dtype == 'bool':
                pass
            else:
                if self.verbose:
                    print('dropping because not int, float, or bool:', col)
                df = df.drop([col], axis=1)
        return df

    def _to_int(self, num):
        try:
            return int(num)
        except (TypeError, ValueError):
            return None

    def _to_float(self, num):
        try:
            return float(num)
        except (TypeError, ValueError):
            return None

    def write_csv(self, filename, include_actual=False):
        """
        write results to csv
        - include actual: if actual values are known for test set, and we want to print them
        """
        with open(filename, 'w', newline='') as csvfile:
            writer = csv.writer(csvfile)
            headers = [self.id_column, self.target_column]
            if include_actual:
                headers.append('actual')
            writer.writerow(headers)
            for idx, value in enumerate(self.output):
                test_id = self.test_df[self.id_column][idx]
                test_output = self.output[idx]
                to_write = [test_id, test_output]
                if include_actual:
                    to_write.append(self.test_df[self.target_column][idx])
                writer.writerow(to_write)

    def save(self, filename='xgb.pkl'):
        joblib.dump(self, filename)
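
    # A minimal usage sketch for this Xgb wrapper (hedged; 'target', 'id' and the DataFrames
    # train_df / test_df are placeholder names, not part of the example above):
    #
    #     model = Xgb(train_df, target_column='target', id_column='id', target_type='linear')
    #     model.train()
    #     predictions = model.predict(test_df)
    #     model.write_csv('predictions.csv')
    #     model.save('xgb.pkl')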