Example #1
def run_xgb(**args):
    print("building xgb model:")
    xgb_model = XGBRegressor()
    xgb_model.fit(args["training_data"], args["training_label"])
    output = xgb_model.predict(args["test_data"])
    pickle.dump(xgb_model, open("xgb_testmodel.p", "wb"))

    output = list(map(lambda e: round(e), output))
    print(output)
    pickle.dump(output, open("xgb_output.p", "wb"))
    return output
def run():
    # Load data set
    X_train, Y_train, X_test, submission_file_content = load_data()
    Y_train = np.log(Y_train + 200)

    # Cross validation
    cross_validation_iterator = ShuffleSplit(n_splits=CROSS_VALIDATION_NUM, test_size=0.1, random_state=0)
    for cross_validation_index, (train_index, valid_index) in enumerate(cross_validation_iterator.split(X_train), start=1):
        print("Working on {}/{} ...".format(cross_validation_index, CROSS_VALIDATION_NUM))

        submission_file_path = os.path.join(SUBMISSION_FOLDER_PATH, "submission_{}.csv".format(cross_validation_index))

        if os.path.isfile(submission_file_path):
            continue

        model = XGBRegressor(
            learning_rate=0.01,
            max_depth=12,
            n_estimators=N_ESTIMATORS,
            silent=False,
            objective="reg:linear",
            gamma=1,
            min_child_weight=1,
            subsample=0.8,
            colsample_bytree=0.5,
            reg_alpha=1,
            seed=cross_validation_index,
            nthread=-1)

        model.fit(X_train[train_index], Y_train[train_index], eval_set=[(X_train[valid_index], Y_train[valid_index])],
            eval_metric=lambda y_predicted, y_true:("actual_mae", mean_absolute_error(np.exp(y_true.get_label()), np.exp(y_predicted))),
            early_stopping_rounds=EARLY_STOPPING_ROUNDS, verbose=True)

        # Perform the testing procedure
        Y_test = model.predict(X_test)

        # Save submission to disk
        if not os.path.isdir(SUBMISSION_FOLDER_PATH):
            os.makedirs(SUBMISSION_FOLDER_PATH)
        submission_file_content[LABEL_COLUMN_NAME] = np.exp(Y_test) - 200
        submission_file_content.to_csv(submission_file_path, index=False)

    # Perform ensembling
    ensemble_predictions()

    print("All done!")
Example #3
 def __init__(self, df, target_column='', id_column='', target_type='binary', categorical_columns=[], drop_columns=[], numeric_columns=[], num_training_rounds=500, verbose=1, early_stopping_rounds=None):
     """
     input params:
     - df (DataFrame): dataframe of training data
     - target_column (string): name of target column
     - id_column (string): name of id column
     - target_type (string): 'linear' or 'binary'
     - categorical_columns (list): list of column names of categorical data. Will perform one-hot encoding
     - drop_columns (list): list of columns to drop
     - numeric_columns (list): list of columns to convert to numeric
     - verbose (bool): verbosity of printouts
     """
     if type(df) == pd.core.frame.DataFrame:
         self.df = df
         self.early_stopping_rounds = early_stopping_rounds
         if target_column:
             self.target_column = target_column
             self.id_column = id_column
             self.target_type = target_type
             self.categorical_columns = categorical_columns
             self.numeric_columns = numeric_columns
             self.drop_columns = drop_columns
             self.verbose = verbose
             self.num_training_rounds = num_training_rounds
             # init the classifier
             if self.target_type == 'binary':
                 self.scoring = 'auc'
                 self.clf = XGBClassifier(
                     learning_rate =0.1,
                     n_estimators = num_training_rounds,
                     subsample = 0.8,
                     colsample_bytree = 0.8,
                     objective = 'binary:logistic',
                     scale_pos_weight = 1,
                     seed = 123)
             elif self.target_type == 'linear':
                 self.scoring = 'rmse'
                 self.clf = XGBRegressor(
                         n_estimators = num_training_rounds,
                         objective = 'reg:linear'
                         )
         else:
             print('please provide target column name')
     else:
         print('please provide pandas dataframe')
X_test_scaled = X_test_scaled_new

trials = Trials()
algo = partial(tpe.suggest, n_startup_jobs=10)
best = fmin(lasso_f, lasso_space, algo=tpe.suggest, max_evals=2, trials=trials)
best_nodes = parse_lasso_nodes(trials, lasso_space_nodes)
save_inter_params(trials, lasso_space_nodes, best_nodes, "tmdb_box_office_prediction")
rsg = train_lasso_model(best_nodes, X_train_scaled, Y_train)

Y_pred = rsg.predict(X_test_scaled)
data = {"id":data_test["id"], "revenue":Y_pred}
output = pd.DataFrame(data = data)            
output.to_csv("lasso_predicton.csv", index=False)
"""

rfc_model = XGBRegressor(random_state=42).fit(X_train_scaled, Y_train)
perm = PermutationImportance(rfc_model,
                             random_state=42).fit(X_train_scaled, Y_train)
feature_importances1 = perm.feature_importances_  # importance weight returned for each feature
feature_importances_std = perm.feature_importances_std_
feature_importances2 = np.where(feature_importances1 > 0)  # record the column indices of the features kept (positive importance)
X_train_scaled_new = X_train_scaled[
    X_train_scaled.columns[feature_importances2]]
X_test_scaled_new = X_test_scaled[X_test_scaled.columns[feature_importances2]]
X_train_scaled = X_train_scaled_new
X_test_scaled = X_test_scaled_new

trials = Trials()
algo = partial(tpe.suggest, n_startup_jobs=10)
best = fmin(xgb_f, xgb_space, algo=tpe.suggest, max_evals=1,
            trials=trials)  # roughly 1,012,000,000 parameter combinations in total
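
# xgb_f and xgb_space are defined elsewhere in this script. A minimal sketch of what such a
# hyperopt objective typically looks like (the parameter names in xgb_space_sketch and the use
# of 5-fold negative MSE as the loss are assumptions, not taken from the original code):
from hyperopt import hp, STATUS_OK
from sklearn.model_selection import cross_val_score

xgb_space_sketch = {
    "max_depth": hp.choice("max_depth", [3, 5, 7, 9]),
    "learning_rate": hp.uniform("learning_rate", 0.01, 0.3),
    "subsample": hp.uniform("subsample", 0.5, 1.0),
    "colsample_bytree": hp.uniform("colsample_bytree", 0.5, 1.0),
}

def xgb_f_sketch(params):
    model = XGBRegressor(n_estimators=300, **params)
    # sklearn returns negative MSE, so flip the sign to get a loss for hyperopt to minimise
    mse = -cross_val_score(model, X_train_scaled, Y_train,
                           scoring="neg_mean_squared_error", cv=5).mean()
    return {"loss": mse, "status": STATUS_OK}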
Example #5

from xgboost.sklearn import XGBRegressor, XGBClassifier

# regression example
data = np.array([
    [5, 20, 1.1],
    [7, 30, 1.3],
    [21, 55, 1.7],
    [30, 60, 1.8],
    [26, 40, 1.6],
])

xgb = XGBRegressor(n_estimators=n_estimators,
                   learning_rate=LR,
                   max_depth=MAX_DEPTH,
                   min_child_weight=min_child_weight,
                   base_score=base_score,
                   gamma=GAMMA)
xgb.fit(data[:, :-1], data[:, -1])
print("xgboost:", xgb.predict(data[0, :-1].reshape(1, -1)))

my_xgb_tree = XGBoostModel(target='regression',
                           n_estimators=n_estimators,
                           lr=LR,
                           max_depth=MAX_DEPTH,
                           min_child_weight=min_child_weight,
                           reg_lambda=1,
                           reg_alpha=0,
                           base_score=base_score,
                           gamma=GAMMA)
my_xgb_tree.fit(data)
Example #6
X_train = all_data[:train.shape[0]]
X_test = all_data[train.shape[0]:]

X_test.head()

y = np.log1p(train.SalePrice)

xTrain, xTest, yTrain, yTest = train_test_split(X_train,
                                                y,
                                                test_size=0.2,
                                                random_state=0)
#d_Train = xgb.DMatrix(xTrain, label=yTrain)
#d_Test = xgb.DMatrix(xTest, label=yTest)

mod1 = XGBRegressor(n_estimators=1000, learning_rate=0.05)
mod1.fit(xTrain,
         yTrain,
         early_stopping_rounds=100,
         eval_set=[(xTest, yTest)],
         verbose=True)
predictions = mod1.predict(xTest)

print("MSE = " + str(mean_squared_error(predictions, yTest)))

print(test.head())

print(predictions)
print(predictions.dtype)

predictions2 = np.expm1(mod1.predict(X_test))  # invert the log1p transform applied to SalePrice above
Example #7
def predict(X, Y, test_X, best_algo):
    # PCA
    pred_test_Ys = []
    if 'xgboost' in best_algo or 'lasso' in best_algo:
        pca = PCA(n_components=40)
        pca.fit(X)
        print("pca explained variance ratio: {}...".format(
            sum(pca.explained_variance_ratio_)))
        pca_X = pca.transform(X)
        pca_test_X = pca.transform(test_X)

    if 'lasso' in best_algo:
        # lasso
        model = Lasso(alpha=1.0)
        model.fit(pca_X, Y)
        pred_test_Y = model.predict(pca_test_X)
        pred_test_Ys.append(pred_test_Y)

    if 'xgboost' in best_algo:
        # Xgboost
        model = XGBRegressor(
            learning_rate=0.01,  # default is 0.3
            n_estimators=500,  # number of trees
            max_depth=3,
            # min_child_weight=1,
            # gamma=0,
            # subsample=0.8,
            # colsample_bytree=0.8,
            # scale_pos_weight=1
        )
        model.fit(pca_X, Y)
        pred_test_Y = model.predict(pca_test_X)
        pred_test_Ys.append(pred_test_Y)

    if 'stepwise' in best_algo:
        # stepwise forward selection (by p value)
        all_feature_indices = set(range(X.shape[1]))
        selected_feature_indices = [0, 1, 2, 3, 4, 5]

        def mp_get_pvalue(ind):
            """get p value for newly added feature, which is the last feature"""
            model = sm.OLS(Y, X[:,
                                list(selected_feature_indices) + [ind]]).fit()
            pvalue = model.pvalues[-1]
            return pvalue

        def get_pvalue(Y, X):
            """get p value for newly added feature, which is the last feature"""
            model = sm.OLS(Y, X).fit()
            pvalue = model.pvalues[-1]
            return pvalue

        while len(selected_feature_indices) < MAX_N_FEATURE_SELECT:
            unselected_feature_indices = all_feature_indices - set(
                selected_feature_indices)
            unselected_feature_indice1_pvalue0 = 100  # some random large p-value
            selected_feature_index = 0  # some random index

            # multi-processing (doesn't seem to speed up, moreover, costs a lot more time)
            # import time
            # pool = multiprocessing.Pool(4)
            # unselected_feature_indices_list = list(unselected_feature_indices)
            # start_time = time.time()
            # unselected_feature_pvalues = pool.map(mp_get_pvalue, unselected_feature_indices_list)
            # print("takes {}...".format(time.time() - start_time))
            # selected_feature_index = unselected_feature_indices_list[int(np.argmin(unselected_feature_pvalues))]
            # selected_feature_indices += [selected_feature_index]

            # construct array of pvalues
            unselected_feature_indices_list = list(unselected_feature_indices)
            unselected_feature_pvalues = [
                get_pvalue(Y, X[:, list(selected_feature_indices + [ind])])
                for ind in tqdm(unselected_feature_indices_list)
            ]
            selected_feature_index = unselected_feature_indices_list[int(
                np.argmin(unselected_feature_pvalues))]
            selected_feature_indices += [selected_feature_index]

        model = sm.OLS(Y, X[:, selected_feature_indices]).fit()
        pred_test_Y = model.predict(test_X[:, selected_feature_indices])
        pred_test_Ys.append(pred_test_Y)

    if 'knn' in best_algo:
        model = KNeighborsRegressor(n_neighbors=5)
        model.fit(X, Y)
        pred_train_Y = model.predict(X)
        factor = np.mean(
            [Y[i] / pred_train_Y[i] for i in range(len(pred_train_Y))])
        pred_test_Y = factor * model.predict(test_X)
        pred_test_Ys.append(pred_test_Y)

    if 'ensemble' in best_algo:
        n_algo = len(pred_test_Ys)
        pred_test_Y = np.sum(np.array(pred_test_Ys), axis=0) / n_algo

    return pred_test_Y
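
# Usage sketch for predict(): best_algo is any combination of the names checked above, with
# 'ensemble' averaging whatever predictions were collected. The arrays below are dummies,
# only meant to illustrate the expected call shape:
X_demo = np.random.rand(100, 60)
Y_demo = np.random.rand(100)
test_X_demo = np.random.rand(20, 60)
pred_demo = predict(X_demo, Y_demo, test_X_demo, best_algo=['xgboost', 'lasso', 'ensemble'])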
Example #8
df_train['DaysFromJan2016']= df_train['DaysFromJan2016'].dt.days
df_train = df_train.drop('Date', axis = 1)
df_train.head()

X_train, X_test, y_train, y_test = train_test_split(df_train.drop('Price', axis = 1), df_train['Price'], test_size = 0.3, random_state = 100)

"""<h1 id="Model-Building">Model Building<a class="anchor-link" href="#Model-Building">¶</a></h1><hr/>
<p><img src="https://slideplayer.com/slide/15204316/92/images/1/What+is+Regression+Analysis.jpg"/></p>

##1. Xgboost

#### For test eval
"""

from xgboost.sklearn import XGBRegressor
model = XGBRegressor(objective="reg:linear", random_state=42) 
t0  = time.time()
model.fit(X_train, y_train)
print ("fitting time:", round(time.time()-t0, 3), "s")

y_pred = model.predict(X_test)

mse=metrics.mean_squared_error(y_test, y_pred)
mse

"""#### For test eval"""

y_pred = model.predict(X_train)

mse=metrics.mean_squared_error(y_train, y_pred)
mse
Example #9
num_rounds = cv.shape[0] - 1
print('Best rounds: ', num_rounds)

params = {
    'n_estimators': num_rounds,
    'learning_rate': 0.01,
    'objective': 'reg:linear',
    'subsample': 0.8,
    'colsample_bytree': 0.5,
    'min_child_weight': 1.1,
    'max_depth': 4,
    'silent': 1,
}

model = XGBRegressor(**params)

print('Starting Cross Validation...')
score = cross_val_score(model, train_x1, train_y1, cv=5)
print('Score: ', score)
print('Mean CV scores: ', np.mean(score))

print('Training...')
model.fit(train_x1, train_y1)
print('Predicting...')

########################################################################
#lasso
#######################################################################
from sklearn.linear_model import Lasso, ElasticNet
best_alpha = 0.0015
Example #10
# ## INTERJECTION: Change models?

# At this point I thought it would be wise to try a different modelling technique. Even with the best will in the world, I was running into issues with the ever-reliable random forest: there was huge noise in my cross-validation score, which made choosing the right data-engineering steps a bit of a nightmare, and RF regression is tediously slow. Enter XGBoost.
# 
# I won't explain how XGBoost works, as there is literature online that explains it better than I ever will. Suffice it to say that, like RF, it combines many trees, but unlike RF it builds them sequentially rather than at random.

# In[ ]:


from xgboost.sklearn import XGBRegressor


# In[ ]:


xgb_test = XGBRegressor(learning_rate=0.05,n_estimators=500,max_depth=3,colsample_bytree=0.4)
cv_score = cross_val_score(xgb_test, train_med.drop(['SalePrice','Id'], axis = 1), train_med['SalePrice'], cv = 5, n_jobs = -1)


# In[ ]:


print('CV Score is: '+ str(np.mean(cv_score)))
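
# In[ ]:


# For comparison with the random forest mentioned above, the equivalent check might look like
# this (a sketch; these RF settings are illustrative rather than the ones used earlier):
from sklearn.ensemble import RandomForestRegressor

rf_test = RandomForestRegressor(n_estimators=500, random_state=0)
rf_cv_score = cross_val_score(rf_test, train_med.drop(['SalePrice', 'Id'], axis=1),
                              train_med['SalePrice'], cv=5, n_jobs=-1)
print('RF CV Score is: ' + str(np.mean(rf_cv_score)))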


# This is quite a bit stronger than RF; submitting yields 0.13031, which puts us firmly in the top 50%. There is a while to go yet, but we are moving in the right direction. From here on, let's use XGBoost as our regression method.

# ## 4. Categoric to Numeric

# This is interesting. Some of the fields describing the quality of the property are 'secretly' ordinal. Case in point: the field BsmtCond, which holds graded quality ratings. Perhaps turning these into their numeric counterparts will improve performance, as we will be able to mine out better trends. A sketch of such a mapping is shown below.
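
# In[ ]:


# A sketch of the mapping meant above, assuming the usual quality codes these fields use
# (Ex/Gd/TA/Fa/Po, with missing meaning "none"); the exact list of columns to map would need
# checking against the data:
quality_map = {'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1}
ordinal_cols = ['BsmtCond', 'BsmtQual', 'ExterQual', 'ExterCond', 'KitchenQual']  # illustrative
for col in ordinal_cols:
    train_med[col] = train_med[col].map(quality_map).fillna(0)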
Example #11
def cross_validation(dtrain,ytrain,predictors):
    # after each parameter is tuned, re-determine the new num_rounds
    dtrain = dtrain[predictors]
    xgb_model = XGBRegressor(
                learning_rate= 0.5,
                max_depth = 20,
                n_estimators = 100,
                min_child_weight = 1,
                gamma = 0,
                objective='reg:linear',
                nthread=4,
                )
    modelfit(xgb_model,dtrain,ytrain)
    print('tuning learning rate...')
    params = {'learning_rate':[0.01,0.015,0.025,0.05,0.1]}
    gsearch = GridSearchCV(estimator = xgb_model,param_grid = params, scoring = 'neg_mean_squared_error',n_jobs = 4,iid=False,cv=5)
    gsearch.fit(dtrain.values,ytrain)
    xgb_model.set_params(learning_rate = gsearch.best_params_['learning_rate'])
    print(gsearch.best_params_)

    print('tuning max_depth...')
    params = { 'max_depth':[3,5,7,9]}
    print(xgb_model.get_params()['n_estimators'])
    gsearch = GridSearchCV(estimator = xgb_model,param_grid = params, scoring='neg_mean_squared_error',n_jobs=4,iid=False, cv=5)
    gsearch.fit(dtrain.values,ytrain)
    xgb_model.set_params(max_depth = gsearch.best_params_['max_depth'])
    print(gsearch.best_params_)
    #choose best num_round
    modelfit(xgb_model,dtrain,ytrain)
    print(xgb_model.get_params()['n_estimators'])
    
    print('tuning min_child_weight...')
    param_child_weight = {'min_child_weight':[1,3,5,7]}
    gsearch = GridSearchCV(estimator = xgb_model,param_grid = param_child_weight, scoring='neg_mean_squared_error',n_jobs=4,iid=False, cv=5)
    gsearch.fit(dtrain.values,ytrain)
    xgb_model.set_params(min_child_weight = gsearch.best_params_['min_child_weight'])
    print(xgb_model.get_params())
    modelfit(xgb_model,dtrain.values,ytrain)
    print(xgb_model.get_params()['n_estimators'])

    print('tuning gamma...')
    param_gamma = {'gamma':[0.05,0.1,0.3,0.5,0.7,0.9,1]}
    gsearch = GridSearchCV(estimator = xgb_model,param_grid = param_gamma, scoring='neg_mean_squared_error',n_jobs=4,iid=False, cv=5)
    gsearch.fit(dtrain.values,ytrain)
    xgb_model.set_params(gamma = gsearch.best_params_['gamma'])
    print(xgb_model.get_params())
    modelfit(xgb_model,dtrain.values,ytrain)
    print(xgb_model.get_params()['n_estimators'])

    #print('tuning colsample_bylevel')
    #param_colsample_bylevel = {'colsample_bylevel':[0.6,0.8,1]}
    #gsearch = GridSearchCV(estimator = xgb_model,param_grid = param_colsample_bylevel, scoring='neg_mean_squared_error',n_jobs=4,iid=False, cv=5)
    #gsearch.fit(dtrain.values,ytrain)
    #xgb_model.set_params(colsample_bylevel = gsearch.best_params_['colsample_bylevel'])
    #tuning colsample_bytree
    print(xgb_model.get_params())
    modelfit(xgb_model,dtrain.values,ytrain)
    print('num_rounds after tuning colsample_bylevel:%f'%xgb_model.get_params()['n_estimators'])

    print('tuning colsample_bytree...')
    param_colsample_bytree = {'colsample_bytree':[0.6,0.7,0.8,1]}
    gsearch = GridSearchCV(estimator = xgb_model,param_grid = param_colsample_bytree, scoring='neg_mean_squared_error',n_jobs=4,iid=False, cv=5)
    gsearch.fit(dtrain.values,ytrain)
    xgb_model.set_params(colsample_bytree = gsearch.best_params_['colsample_bytree'])
    print(xgb_model.get_params())
    modelfit(xgb_model,dtrain.values,ytrain)
    print('num_rounds after tuning colsample_bytree:%f'%xgb_model.get_params()['n_estimators'])
    # save and return model
    cur_time = time.strftime("%Y-%m-%d-%H-%M",time.localtime())
    pickle.dump(xgb_model,open('../models/autogridsearch_xgb_'+cur_time+'.model','wb'))
    cv_score(xgb_model,dtrain.values,ytrain)
    return xgb_model
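
# modelfit() and cv_score() are defined elsewhere in this script. A minimal sketch of the usual
# pattern behind modelfit(): run xgb.cv with early stopping to re-estimate n_estimators after a
# parameter change, then refit the sklearn wrapper (the details below are assumptions):
import xgboost as xgb

def modelfit_sketch(alg, X, y, cv_folds=5, early_stopping_rounds=50):
    xgb_param = alg.get_xgb_params()
    dmatrix = xgb.DMatrix(X, label=y)
    cvresult = xgb.cv(xgb_param, dmatrix,
                      num_boost_round=alg.get_params()['n_estimators'],
                      nfold=cv_folds, metrics='rmse',
                      early_stopping_rounds=early_stopping_rounds)
    # keep only the number of boosting rounds that survived early stopping
    alg.set_params(n_estimators=cvresult.shape[0])
    alg.fit(X, y, eval_metric='rmse')
    return alg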
Example #12
class Xgb:
    def __init__(self, df, target_column='', id_column='', target_type='binary', categorical_columns=[], drop_columns=[], numeric_columns=[], num_training_rounds=500, verbose=1, sample_fraction=1.0, n_samples=1, early_stopping_rounds=None, prefix='xgb_model', scoring=None):
        """
        input params:
        - df (DataFrame): dataframe of training data
        - target_column (string): name of target column
        - id_column (string): name of id column
        - target_type (string): 'linear' or 'binary'
        - categorical_columns (list): list of column names of categorical data. Will perform one-hot encoding
        - drop_columns (list): list of columns to drop
        - numeric_columns (list): list of columns to convert to numeric
        - verbose (bool): verbosity of printouts
        """
        # checks for sampling
        sample_fraction = float(sample_fraction)
        if sample_fraction > 1:
            sample_fraction = 1.0
        if sample_fraction * n_samples > 1:
            n_samples = round(1.0/sample_fraction)
        if sample_fraction <= 0:
            print('sample_fraction 0 or negative, switching to 0.1')
            sample_fraction = 0.1
        # if sample_fraction results in a sample with fewer than one row
        if round(sample_fraction * len(df)) == 0:
            sample_fraction = 1.0/len(df)
        # check if data is dataframe
        if type(df) == pd.core.frame.DataFrame:
            self.df = df
            self.early_stopping_rounds = early_stopping_rounds
            if target_column:
                self.target_column = target_column
                self.id_column = id_column
                self.target_type = target_type
                self.categorical_columns = categorical_columns
                self.numeric_columns = numeric_columns
                self.drop_columns = drop_columns
                self.verbose = verbose
                self.sample_fraction = sample_fraction
                self.n_samples = n_samples
                self.num_training_rounds = num_training_rounds
                self.prefix = prefix
                # init the classifier:
                if self.target_type == 'binary':
                    self.scoring = 'auc'
                    self.clf = XGBClassifier(
                        learning_rate =0.1,
                        n_estimators = num_training_rounds,
                        subsample = 0.8,
                        colsample_bytree = 0.8,
                        objective = 'binary:logistic',
                        scale_pos_weight = 1,
                        seed = 123)
                elif self.target_type == 'multiclass':
                    self.scoring = 'merror'
                    self.clf = XGBClassifier(
                        learning_rate =0.1,
                        n_estimators = num_training_rounds,
                        subsample = 0.8,
                        colsample_bytree = 0.8,
                        objective = 'multi:softmax',
                        scale_pos_weight = 1,
                        seed = 123)
                elif self.target_type == 'linear':
                    self.scoring = 'rmse'
                    self.clf = XGBRegressor(
                            n_estimators = num_training_rounds,
                            objective = 'reg:linear'
                            )
                # if preferred scoring metric is stated:
                if scoring:
                    self.scoring = scoring
            else:
                print('please provide target column name')
        else:
            print('please provide pandas dataframe')

    def train(self):
        print('#### preprocessing ####')
        self.df = self.preprocess(self.df)

        print('#### training ####')
        self.predictors = [x for x in self.df.columns if x not in [self.target_column, self.id_column]]
        xgb_param = self.clf.get_xgb_params()

        # if subsampling
        if self.sample_fraction == 1.0:
            df_list = [self.df]
        else:
            df_list = self.random_sample(df=self.df, fraction=self.sample_fraction, n_samples=self.n_samples)
        print(df_list)
        for idx, current_df in enumerate(df_list):
            print('ITERATION ' + str(idx) + ' of ' + str(self.n_samples) +', sample_fraction=' + str(self.sample_fraction))
            xgtrain  = xgb.DMatrix(current_df[self.predictors], label=current_df[self.target_column], missing=np.nan)
            try:
                cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=self.clf.get_params()['n_estimators'], nfold=5,
                    metrics=[self.scoring], early_stopping_rounds=self.early_stopping_rounds, show_progress=self.verbose)
            except:
                try:
                    cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=self.clf.get_params()['n_estimators'], nfold=5,
                        metrics=[self.scoring], early_stopping_rounds=self.early_stopping_rounds, verbose_eval=self.verbose)
                except:
                    xgb_param['num_class'] = len(current_df[self.target_column].unique())
                    cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=self.clf.get_params()['n_estimators'], nfold=5,
                        metrics=[self.scoring], early_stopping_rounds=self.early_stopping_rounds)
            self.clf.set_params(n_estimators=cvresult.shape[0])
            self.clf.fit(current_df[self.predictors], current_df[self.target_column], eval_metric=self.scoring)

            #Predict training set:
            train_df_predictions = self.clf.predict(current_df[self.predictors])

            if self.target_type == 'binary' or self.target_type == 'multiclass':
                train_df_predprob = self.clf.predict_proba(current_df[self.predictors])[:,1]
                print("Accuracy : %.4g" % metrics.accuracy_score(current_df[self.target_column].values, train_df_predictions))
                if self.target_type == 'binary':
                    print("AUC Score (Train): %f" % metrics.roc_auc_score(current_df[self.target_column], train_df_predprob))
            elif self.target_type == 'linear':
                print("Mean squared error: %f" % metrics.mean_squared_error(current_df[self.target_column].values, train_df_predictions))
                print("Root mean squared error: %f" % np.sqrt(metrics.mean_squared_error(current_df[self.target_column].values, train_df_predictions)))
            filename = self.prefix + '_' + str(idx) + '.pkl'
            self.save(filename)

    def predict(self, test_df, return_multi_outputs=False, return_mean_std=False):
        print('### predicting ###')
        print('## preprocessing test set')
        if self.id_column in test_df:
            ids = test_df[self.id_column]
        if self.target_column in test_df.columns:
            targets = test_df[self.target_column]
        self.test_df = self.preprocess(test_df, train=False)
        if self.id_column in test_df:
            self.test_df[self.id_column] = ids
        if self.target_column in test_df.columns:
            self.test_df[self.target_column] = targets
        for col in self.predictors:
            if col not in self.test_df.columns:
                self.test_df[col] = np.nan

        # prediction
        print('## predicting from test set')
        output_list = []
        output = None
        for idx, ns in enumerate(range(self.n_samples)):
            if self.n_samples == 1:
                xgb = self
                if self.target_type == 'binary':
                    output = xgb.clf.predict_proba(self.test_df[self.predictors])[:,1]
                elif self.target_type == 'linear':
                    output = xgb.clf.predict(self.test_df[self.predictors])
            else:
                try:
                    filename = self.prefix + '_' + str(idx) + '.pkl'
                    xgb = self.load(filename)
                    if self.target_type == 'binary':
                        output = xgb.clf.predict_proba(self.test_df[self.predictors])[:,1]
                    elif self.target_type == 'linear':
                        output = xgb.clf.predict(self.test_df[self.predictors])
                    output_list.append(list(output))
                except IOError:
                    print('no file found, skipping')
        # average the outputs if n_samples is more than one
        if self.n_samples == 1:
            self.output = output
            try:
                self.multi_outputs = [list(output)]
            except:
                self.multi_outputs = None
        else:
            self.output = np.mean(output_list, axis=0)
            self.multi_outputs = output_list
        if return_multi_outputs:
            return self.multi_outputs
        elif return_mean_std:
            return (self.output, np.std(output_list, axis=0))
        else:
            return self.output

    def feature_importance(self, num_print=10, display=True):
        feature_importance = sorted(list(self.clf.booster().get_fscore().items()), key = operator.itemgetter(1), reverse=True)

        impt = pd.DataFrame(feature_importance)
        impt.columns = ['feature', 'importance']
        print(impt[:num_print])
        if display:
            impt[:num_print].plot("feature", "importance", kind="barh", color=sns.color_palette("deep", 3))


    def preprocess(self, df, train=True):
        # one hot encoding of categorical variables
        print('## one hot encoding of categorical variables')
        for col in self.categorical_columns:
            if self.verbose:
                print('one hot encoding: ', col)
            df = pd.concat([df, pd.get_dummies(df[col]).rename(columns=lambda x: col+'_'+str(x))], axis=1)
            df = df.drop([col], axis=1)

        # if training, determine columns to be removed
        if train:
            # drop columns that are too sparse to be informative
            self.cols_to_remove = []
            print('## dropping columns below sparsity threshold')
            for col in df.columns:
                nan_cnt = 0
                for x in df[col]:
                    try:
                        if np.isnan(x):
                            nan_cnt += 1
                    except:
                        pass
                if nan_cnt/float(len(df[col])) > 0.6: # arbitrary cutoff, if more than 60% missing then drop
                    if self.verbose:
                        print('will drop', col)
                    self.cols_to_remove.append(col)

            # drop columns that have no standard deviation (not informative)
            print('## dropping columns with no variation')
            for col in df.columns:
                if df[col].dtype == 'int64' or df[col].dtype == 'float64':
                    if df[col].std() == 0:
                        print('will drop', col)
                        self.cols_to_remove.append(col)
        # drop the flagged columns regardless of verbosity; only the printout is verbose-gated
        if self.cols_to_remove:
            if self.verbose:
                print('dropping the following columns:', self.cols_to_remove)
            df = df.drop(self.cols_to_remove, axis=1)

        if self.verbose:
            print('## DataFrame shape is now:', df.shape)

        # convert to numerical where possible
        #print('## converting numerical data to numeric dtype')
        #df = df.convert_objects(convert_numeric=True)

        # convert columns specified to be int and float
        for col in self.numeric_columns:
            if self.verbose:
                print('converting', col)
            df[col] = pd.to_numeric(df[col], errors='coerce')
            if self.verbose:
                print(df[col].dtype)
        df = df.drop(self.drop_columns, axis=1)

        # drop all those that are object type
        print('## dropping non-numerical columns')
        for col in df.columns:
            if df[col].dtype == 'int64' or df[col].dtype == 'float64' or df[col].dtype == 'bool':
                pass
            else:
                if self.verbose:
                    print('dropping because not int, float, or bool:', col)
                df = df.drop([col], axis=1)
        return df

    def _to_int(self, num):
        try:
            return int(num)
        except:
            return

    def _to_float(self, num):
        try:
            return float(num)
        except:
            return

    def random_sample(self, df, fraction=0.2, n_samples=None):
        """
        splits into random samples
        - n_samples: how many samples you want returned (default = All)
        - fraction : what fraction of data to include in the sample (default = 0.2)
        """
        print('constructing random samples')
        num_rows = len(df)
        len_sample = round(fraction * num_rows)
        # create list of slice index lists
        indices = list(range(num_rows))  # must be a list so indices can be popped below (Python 3)
        slice_list = []
        tmp_idx_list = []
        while len(indices) > 0:
            while len(tmp_idx_list) < len_sample and len(indices) > 0:
                idx = indices.pop(random.randrange(len(indices)))
                tmp_idx_list.append(idx)
            slice_list.append(tmp_idx_list)
            tmp_idx_list = []
        # get slices (n_samples defaults to all available slices)
        if n_samples is None:
            n_samples = len(slice_list)
        sample_list = []
        for s in range(n_samples):
            try:
                sample_list.append(df.loc[slice_list[s],:])
            except:
                pass
        return sample_list

    def write_csv(self, filename, include_actual=False):
        """
        write results to csv
        - include actual: if actual values are known for test set, and we want to print them
        """
        with open(filename, 'w', newline='') as csvfile:  # text mode with no extra newlines for csv.writer (Python 3)
            writer = csv.writer(csvfile)
            headers = [self.id_column, self.target_column]
            if include_actual:
                headers.append('actual')
            writer.writerow(headers)
            try:
                for idx, value in enumerate(self.output):
                    test_id = self.test_df[self.id_column][idx]
                    test_output = self.output[idx]
                    to_write = [test_id, test_output]
                    if include_actual:
                        to_write.append(self.test_df[self.target_column][idx])
                    writer.writerow(to_write)
                print('results written to ' + filename)
            except:
                print('write_csv failed')

    def save(self, filename='xgb.pkl'):
        joblib.dump(self, filename)

    def load(self, model_file='xgb.pkl'):
        xgb = joblib.load(model_file)
        return xgb
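
# Usage sketch for the wrapper above (column names and CSV paths are illustrative only):
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

model = Xgb(train_df,
            target_column='target',
            id_column='id',
            target_type='linear',
            categorical_columns=['cat_a', 'cat_b'],
            num_training_rounds=500,
            early_stopping_rounds=50)
model.train()
predictions = model.predict(test_df)
model.write_csv('xgb_submission.csv')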
Example #13
 'colsample_bytree':np.arange(0.1,1.0,0.1)
}
# param_grid = {
#  'max_depth':[7,8],
#  'min_child_weight':[4,5]
# }


#gsearch1 = GridSearchCV(estimator = XGBClassifier(
#       learning_rate =0.1, n_estimators=140, max_depth=9,
#       min_child_weight=1, gamma=0, subsample=0.8,colsample_bytree=0.8,
#       objective= 'binary:logistic', nthread=4,scale_pos_weight=1, seed=27),
#       param_grid=param_grid,cv=10)


gsearch1 = GridSearchCV(estimator = XGBRegressor(
       learning_rate =0.2, 
       objective= 'binary:logistic', 
       booster= 'gbtree',
        eta=0.2,
        max_depth=4,  # 4 3
        colsample_bytree=0.7,  #0.8
        subsample= 0.7,
        min_child_weight=1,  # 2 3
        silent= 0,
        eval_metric='error',),
       param_grid=param_grid,cv=10)
gsearch1.fit(np.array(x_train),np.array(y_train))
print(gsearch1.best_params_,gsearch1.best_score_)

    print(best)

    best = {'alpha': 2.0, 'colsample_bytree': 0.9, 'eta': 0.04, 'gamma': 0.0,
            'lambda': 1.8, 'max_depth': 9, 'min_child_weight': 4.0, 'n_estimators': 968.0, 'subsample': 0.55}

    # launch prediction with this parameters
    from xgboost.sklearn import XGBRegressor
    clf = XGBRegressor(
        learning_rate=float(best['eta']),
        max_depth=int(best['max_depth']),
        min_child_weight=float(best['min_child_weight']),
        subsample=float(best['subsample']),
        gamma=float(best['gamma']),
        colsample_bytree=float(best['colsample_bytree']),
        n_estimators=int(best['n_estimators']),
        reg_alpha=float(best['alpha']),
        reg_lambda=float(best['lambda']),
        objective='reg:linear',
        eval_metric='mae',
        nthread=-1,
        booster='gbtree',
        tree_method='exact',
        silent=1
    )
    # test
    clf.fit(X, y)
    y_hat = clf.predict(X_test)
    dscores = metrics_regression(y_test, y_hat, X.shape[1])
    tf = t.since('test')
    print('\nBayesian tuning - test:  bias = %.3f  mae = %.3f  r2 = %.3f (time: %s)' %
          (dscores['bias'], dscores['mae'], dscores['r2'], format_duration(tf)))
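
# metrics_regression(), t.since() and format_duration() come from this project's own utilities
# and are not shown here. A plausible sketch of metrics_regression (treating 'bias' as the mean
# signed error and ignoring the n_features argument are assumptions):
import numpy as np
from sklearn.metrics import mean_absolute_error, r2_score

def metrics_regression_sketch(y_true, y_pred, n_features):
    return {
        'bias': float(np.mean(np.asarray(y_pred) - np.asarray(y_true))),  # mean signed error
        'mae': mean_absolute_error(y_true, y_pred),
        'r2': r2_score(y_true, y_pred),
    }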
    def run(self, train_data_path):
        """Takes argument 'train_data_path'.
        train_data_path: Training data path.
        
        Performs the model selection process in the specified order.
        Any number of required models can be added to this method body and cross-validated.
        These can be saved as-is, or ensembling can be applied.
        """
        #Loading training data
        dtrain = pd.read_csv(train_data_path)
        #gets predictors
        predictor_vars = self.get_predictors(dtrain)

        #Model I
        xgboost = XGBRegressor(learning_rate=0.06,
                               n_estimators=1000,
                               max_depth=2,
                               min_child_weight=2,
                               gamma=0,
                               subsample=0.4,
                               colsample_bytree=0.2,
                               objective='reg:linear',
                               nthread=-1,
                               scale_pos_weight=1,
                               seed=27,
                               reg_alpha=77)

        #Model II
        xgboost2 = XGBRegressor(learning_rate=0.04,
                                n_estimators=1500,
                                max_depth=2,
                                min_child_weight=0,
                                gamma=0,
                                subsample=0.7,
                                colsample_bytree=0.2,
                                objective='reg:linear',
                                nthread=-1,
                                scale_pos_weight=1,
                                seed=99,
                                reg_alpha=1.7)

        #Model III
        xgboost3 = XGBRegressor(learning_rate=0.02,
                                n_estimators=1200,
                                max_depth=3,
                                min_child_weight=2,
                                gamma=0,
                                subsample=0.65,
                                colsample_bytree=0.2,
                                objective='reg:linear',
                                nthread=-1,
                                scale_pos_weight=1,
                                seed=585,
                                reg_alpha=5000)

        #Model IV
        lightgbm = LGBMRegressor(objective='regression',
                                 num_leaves=4,
                                 min_data_in_leaf=5,
                                 learning_rate=0.02,
                                 n_estimators=3000,
                                 max_bin=320,
                                 bagging_fraction=0.85,
                                 bagging_freq=10,
                                 bagging_seed=9,
                                 feature_fraction=0.2,
                                 feature_fraction_seed=9,
                                 data_random_seed=9,
                                 reg_alpha=0.55,
                                 reg_lambda=0.3,
                                 verbose=-1)

        #Model V
        lightgbm2 = LGBMRegressor(objective='regression',
                                  num_leaves=4,
                                  min_data_in_leaf=3,
                                  learning_rate=0.01,
                                  n_estimators=4000,
                                  max_bin=295,
                                  bagging_fraction=0.5,
                                  bagging_freq=10,
                                  bagging_seed=24,
                                  feature_fraction=0.2,
                                  feature_fraction_seed=24,
                                  data_random_seed=24,
                                  reg_alpha=10,
                                  reg_lambda=0.7,
                                  verbose=-1)

        #Ensembling all the five models
        ens_model = EnsembleRegressor(
            [xgboost, xgboost2, xgboost3, lightgbm, lightgbm2])

        #Performs cross validation on the ensembled model.
        self.cross_validate(cv=5,
                            model=ens_model,
                            X=dtrain[predictor_vars],
                            y=dtrain[self.target_var],
                            n_jobs=1)
        #CV Score is: 0.92528287952747 all predictors

        #Saving the final model.
        self.finalize_and_save(ens_model, self.output_file_path,
                               dtrain[predictor_vars], dtrain[self.target_var])
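
# EnsembleRegressor is defined elsewhere in this project. A minimal sketch, assuming it is a
# simple averaging ensemble with a scikit-learn style fit/predict interface:
import numpy as np
from sklearn.base import BaseEstimator, RegressorMixin, clone

class EnsembleRegressorSketch(BaseEstimator, RegressorMixin):
    def __init__(self, regressors):
        self.regressors = regressors

    def fit(self, X, y):
        # fit a fresh clone of every base model on the full training data
        self.fitted_ = [clone(reg).fit(X, y) for reg in self.regressors]
        return self

    def predict(self, X):
        # average the base-model predictions
        return np.mean([reg.predict(X) for reg in self.fitted_], axis=0)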
Example #16
    print(" Score (Train): %f" % mean_squared_error(train_Y.values, dtrain_predictions))
    #Predict on testing data:
    dtest_predictions = alg.predict(test_X)
    print("Score (Test): %f" % mean_squared_error(test_Y.values, dtest_predictions))




XGBmodel = XGBRegressor(booster='gbtree',
                    objective= 'reg:linear',
                    eval_metric='rmse',
                    gamma = 0.1,
                    min_child_weight= 1.1,
                    max_depth= 5,
                    subsample= 0.7,
                    colsample_bytree= 0.7,
                    tree_method= 'exact',
                    learning_rate=0.1,
                    n_estimators=300,
                    nthread=4,
                    scale_pos_weight=1,
                    seed=27
                    )


modelfit(XGBmodel)

#adjust parameters

param_test1 = {
    'reg_alpha':[0, 0.001, 0.005, 0.01, 0.05]
Example #17
 def __init__(self, df, target_column='', id_column='', target_type='binary', categorical_columns=[], drop_columns=[], numeric_columns=[], num_training_rounds=500, verbose=1, sample_fraction=1.0, n_samples=1, early_stopping_rounds=None, prefix='xgb_model', scoring=None):
     """
     input params:
     - df (DataFrame): dataframe of training data
     - target_column (string): name of target column
     - id_column (string): name of id column
     - target_type (string): 'linear' or 'binary'
     - categorical_columns (list): list of column names of categorical data. Will perform one-hot encoding
     - drop_columns (list): list of columns to drop
     - numeric_columns (list): list of columns to convert to numeric
     - verbose (bool): verbosity of printouts
     """
     # checks for sampling
     sample_fraction = float(sample_fraction)
     if sample_fraction > 1:
         sample_fraction = 1.0
     if sample_fraction * n_samples > 1:
         n_samples = round(1.0/sample_fraction)
     if sample_fraction <= 0:
         print('sample_fraction 0 or negative, switching to 0.1')
         sample_fraction = 0.1
      # if sample_fraction results in a sample with fewer than one row
     if round(sample_fraction * len(df)) == 0:
         sample_fraction = 1.0/len(df)
     # check if data is dataframe
     if type(df) == pd.core.frame.DataFrame:
         self.df = df
         self.early_stopping_rounds = early_stopping_rounds
         if target_column:
             self.target_column = target_column
             self.id_column = id_column
             self.target_type = target_type
             self.categorical_columns = categorical_columns
             self.numeric_columns = numeric_columns
             self.drop_columns = drop_columns
             self.verbose = verbose
             self.sample_fraction = sample_fraction
             self.n_samples = n_samples
             self.num_training_rounds = num_training_rounds
             self.prefix = prefix
             # init the classifier:
             if self.target_type == 'binary':
                 self.scoring = 'auc'
                 self.clf = XGBClassifier(
                     learning_rate =0.1,
                     n_estimators = num_training_rounds,
                     subsample = 0.8,
                     colsample_bytree = 0.8,
                     objective = 'binary:logistic',
                     scale_pos_weight = 1,
                     seed = 123)
             elif self.target_type == 'multiclass':
                 self.scoring = 'merror'
                 self.clf = XGBClassifier(
                     learning_rate =0.1,
                     n_estimators = num_training_rounds,
                     subsample = 0.8,
                     colsample_bytree = 0.8,
                     objective = 'multi:softmax',
                     scale_pos_weight = 1,
                     seed = 123)
             elif self.target_type == 'linear':
                 self.scoring = 'rmse'
                 self.clf = XGBRegressor(
                         n_estimators = num_training_rounds,
                         objective = 'reg:linear'
                         )
             # if preferred scoring metric is stated:
             if scoring:
                 self.scoring = scoring
         else:
             print('please provide target column name')
     else:
         print('please provide pandas dataframe')
Example #18
class Xgb:
    def __init__(self, df, target_column='', id_column='', target_type='binary', categorical_columns=[], drop_columns=[], numeric_columns=[], num_training_rounds=500, verbose=1, early_stopping_rounds=None):
        """
        input params:
        - df (DataFrame): dataframe of training data
        - target_column (string): name of target column
        - id_column (string): name of id column
        - target_type (string): 'linear' or 'binary'
        - categorical_columns (list): list of column names of categorical data. Will perform one-hot encoding
        - drop_columns (list): list of columns to drop
        - numeric_columns (list): list of columns to convert to numeric
        - verbose (bool): verbosity of printouts
        """
        if type(df) == pd.core.frame.DataFrame:
            self.df = df
            self.early_stopping_rounds = early_stopping_rounds
            if target_column:
                self.target_column = target_column
                self.id_column = id_column
                self.target_type = target_type
                self.categorical_columns = categorical_columns
                self.numeric_columns = numeric_columns
                self.drop_columns = drop_columns
                self.verbose = verbose
                self.num_training_rounds = num_training_rounds
                # init the classifier
                if self.target_type == 'binary':
                    self.scoring = 'auc'
                    self.clf = XGBClassifier(
                        learning_rate =0.1,
                        n_estimators = num_training_rounds,
                        subsample = 0.8,
                        colsample_bytree = 0.8,
                        objective = 'binary:logistic',
                        scale_pos_weight = 1,
                        seed = 123)
                elif self.target_type == 'linear':
                    self.scoring = 'rmse'
                    self.clf = XGBRegressor(
                            n_estimators = num_training_rounds,
                            objective = 'reg:linear'
                            )
            else:
                print('please provide target column name')
        else:
            print('please provide pandas dataframe')

    def train(self):
        print('#### preprocessing ####')
        self.df = self.preprocess(self.df)

        print('#### training ####')
        self.predictors = [x for x in self.df.columns if x not in [self.target_column, self.id_column]]
        xgb_param = self.clf.get_xgb_params()

        xgtrain  = xgb.DMatrix(self.df[self.predictors], label=self.df[self.target_column], missing=np.nan)
        try:
            cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=self.clf.get_params()['n_estimators'], nfold=5,
                metrics=[self.scoring], early_stopping_rounds=self.early_stopping_rounds, show_progress=self.verbose)
        except:
            try:
                cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=self.clf.get_params()['n_estimators'], nfold=5,
                    metrics=[self.scoring], early_stopping_rounds=self.early_stopping_rounds, verbose_eval=self.verbose)
            except:
                cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=self.clf.get_params()['n_estimators'], nfold=5,
                    metrics=[self.scoring], early_stopping_rounds=self.early_stopping_rounds)
        self.clf.set_params(n_estimators=cvresult.shape[0])
        self.clf.fit(self.df[self.predictors], self.df[self.target_column],eval_metric=self.scoring)

        #Predict training set:
        train_df_predictions = self.clf.predict(self.df[self.predictors])

        if self.target_type == 'binary':
            train_df_predprob = self.clf.predict_proba(self.df[self.predictors])[:,1]
            print("Accuracy : %.4g" % metrics.accuracy_score(self.df[self.target_column].values, train_df_predictions))
            print("AUC Score (Train): %f" % metrics.roc_auc_score(self.df[self.target_column], train_df_predprob))
        elif self.target_type == 'linear':
            print("Mean squared error: %f" % metrics.mean_squared_error(self.df[self.target_column].values, train_df_predictions))
            print("Root mean squared error: %f" % np.sqrt(metrics.mean_squared_error(self.df[self.target_column].values, train_df_predictions)))

    def predict(self, test_df):
        print('### predicting ###')
        print('## preprocessing test set')
        if self.id_column in test_df:
            ids = test_df[self.id_column]
        if self.target_column in test_df.columns:
            targets = test_df[self.target_column]
        self.test_df = self.preprocess(test_df, train=False)
        if self.id_column in test_df:
            self.test_df[self.id_column] = ids
        if self.target_column in test_df.columns:
            self.test_df[self.target_column] = targets
        for col in self.predictors:
            if col not in self.test_df.columns:
                self.test_df[col] = np.nan

        if self.target_type == 'binary':
            self.output = self.clf.predict_proba(self.test_df[self.predictors])[:,1]
        elif self.target_type == 'linear':
            self.output = self.clf.predict(self.test_df[self.predictors])
        return self.output

    def feature_importance(self, num_print=10, display=True):
        feature_importance = sorted(list(self.clf.booster().get_fscore().items()), key = operator.itemgetter(1), reverse=True)

        impt = pd.DataFrame(feature_importance)
        impt.columns = ['feature', 'importance']
        print(impt[:num_print])
        if display:
            impt[:num_print].plot("feature", "importance", kind="barh", color=sns.color_palette("deep", 3))


    def preprocess(self, df, train=True):
        # one hot encoding of categorical variables
        print('## one hot encoding of categorical variables')
        for col in self.categorical_columns:
            if self.verbose:
                print('one hot encoding: ', col)
            df = pd.concat([df, pd.get_dummies(df[col]).rename(columns=lambda x: col+'_'+str(x))], axis=1)
            df = df.drop([col], axis=1)

        # if training, determine columns to be removed
        if train:
            # drop columns that are too sparse to be informative
            self.cols_to_remove = []
            print('## dropping columns below sparsity threshold')
            for col in df.columns:
                nan_cnt = 0
                for x in df[col]:
                    try:
                        if np.isnan(x):
                            nan_cnt += 1
                    except:
                        pass
                if nan_cnt/float(len(df[col])) > 0.6: # arbitrary cutoff, if more than 60% missing then drop
                    if self.verbose:
                        print('will drop', col)
                    self.cols_to_remove.append(col)

            # drop columns that have no standard deviation (not informative)
            print('## dropping columns with no variation')
            for col in df.columns:
                if df[col].dtype == 'int64' or df[col].dtype == 'float64':
                    if df[col].std() == 0:
                        print('will drop', col)
                        self.cols_to_remove.append(col)
        # drop the flagged columns regardless of verbosity; only the printout is verbose-gated
        if self.cols_to_remove:
            if self.verbose:
                print('dropping the following columns:', self.cols_to_remove)
            df = df.drop(self.cols_to_remove, axis=1)

        if self.verbose:
            print('## DataFrame shape is now:', df.shape)

        # convert to numerical where possible
        #print('## converting numerical data to numeric dtype')
        #df = df.convert_objects(convert_numeric=True)

        # convert columns specified to be int and float
        for col in self.numeric_columns:
            if col not in self.cols_to_remove:
                if self.verbose:
                    print('converting', col)
                df[col] = pd.to_numeric(df[col], errors='coerce')
                if self.verbose:
                    print(df[col].dtype)

        # drop those marked for dropping
        df = df.drop(self.drop_columns, axis=1)

        # drop all those that are object type
        print('## dropping non-numerical columns')
        for col in df.columns:
            if df[col].dtype == 'int64' or df[col].dtype == 'float64' or df[col].dtype == 'bool':
                pass
            else:
                if self.verbose:
                    print('dropping because not int, float, or bool:', col)
                df = df.drop([col], axis=1)
        return df

    def _to_int(self, num):
        try:
            return int(num)
        except:
            return

    def _to_float(self, num):
        try:
            return float(num)
        except:
            return

    def write_csv(self, filename, include_actual=False):
        """
        write results to csv
        - include actual: if actual values are known for test set, and we want to print them
        """
        with open(filename, 'w', newline='') as csvfile:  # text mode with no extra newlines for csv.writer (Python 3)
            writer = csv.writer(csvfile)
            headers = [self.id_column, self.target_column]
            if include_actual:
                headers.append('actual')
            writer.writerow(headers)
            for idx, value in enumerate(self.output):
                test_id = self.test_df[self.id_column][idx]
                test_output = self.output[idx]
                to_write = [test_id, test_output]
                if include_actual:
                    to_write.append(self.test_df[self.target_column][idx])
                writer.writerow(to_write)

    def save(self, filename='xgb.pkl'):
        joblib.dump(self, filename)
Example #19
class XGBoostModel:
    def __init__(self):
        self.model = None

    def train(self, X_train, X_test, y_train, y_test):
        '''
        Trains the model on the training split and evaluates it on the test split.
        The fitted model is pickled under generated/gxboost_model.pickle.
        RMSE and MAE are printed via evaluate(); nothing is returned.
        :param X_train, X_test, y_train, y_test: training and test feature matrices and targets
        '''
        print('Training is starting...')
        eval_set = [(X_train, y_train), (X_test, y_test)]

        self.model = XGBRegressor(max_depth=7,
                                  objective='reg:squarederror',
                                  gamma=0,
                                  learning_rate=0.03,
                                  subsample=1,
                                  colsample_bytree=0.9,
                                  min_child_weight=10)

        self.model.fit(X_train,
                       y_train,
                       eval_set=eval_set,
                       eval_metric="rmse",
                       early_stopping_rounds=500)

        predictions = self.predict(X_test)

        with open('generated/gxboost_model.pickle', 'wb') as file:
            pickle.dump(self.model, file)

        self.evaluate(y_test, X_test)

    def predict(self, X_test):
        predictions = self.model.predict(X_test)
        return predictions

    def grid_search(self, X_train, X_test, y_train, y_test):
        grid_param = {
            'max_depth': [n for n in range(2, 10)],
            'gamma': np.arange(0, 0.5, 0.1),
            'learning_rate': [0.0001, 0.001, 0.01, 0.1],
            'subsample': np.arange(0.5, 0.9, 0.1),
            'colsample_bytree': np.arange(0.5, 0.9, 0.1),
            'min_child_weight': [1, 3, 5, 7]
        }

        model = XGBRegressor(max_depth=7,
                             objective='reg:squarederror',
                             gamma=0,
                             learning_rate=0.03,
                             subsample=1,
                             colsample_bytree=0.9,
                             min_child_weight=10)

        gd_sr = GridSearchCV(estimator=model,
                             param_grid=grid_param,
                             scoring='neg_mean_squared_error',
                             cv=5,
                             n_jobs=-1)

        gd_sr.fit(X_train, y_train)

        best_parameters = gd_sr.best_params_
        print(best_parameters)

    def evaluate(self, y_test, X_test):
        print('#' * 15 + ' Model Evaluation ' + '#' * 15)
        print()

        predictions = self.predict(X_test)
        predictions = MLProcessing.invert_scaling(predictions)
        y_test = MLProcessing.invert_scaling(np.array(y_test))

        rmse = np.sqrt(mean_squared_error(y_test, predictions))
        mae = mean_absolute_error(y_test, predictions)

        print('RMSE: {} - MAE: {}'.format(rmse, mae))

        print()
        print('#' * 48)
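
# Usage sketch for the class above (the feature matrix and target below are placeholders; the
# data is assumed to be scaled the same way MLProcessing.invert_scaling undoes):
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

xgb_model = XGBoostModel()
xgb_model.train(X_train, X_test, y_train, y_test)   # fits, pickles and evaluates the model
future_preds = xgb_model.predict(X_test)            # predictions on the scaled test features
# xgb_model.grid_search(X_train, X_test, y_train, y_test)  # optional hyper-parameter search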
Example #20
    def fit(self, inputs_train, labels_train, fit_options={}):
        xgb_reg = XGBRegressor(random_state=self.options['seed'])

        print('Starting with low learning rate and tuning: '
              'max_depth, min_child_weight, n_estimators')

        params = {
            "learning_rate": [0.1],  # np.arange(0.05,0.45,0.05), #eta
            # np.arange(2,14,2),
            "max_depth": self.options['max_depth'],
            # np.arange(1,7,6),
            "min_child_weight": self.options['min_child_weight'],
            # np.arange(10,80,10),
            "n_estimators": self.options['n_estimators'],
            "colsample_bytree": [0.8],
            "subsample": [0.8],
            "gamma": [0],
        }

        GSCV = GridSearchCV(
            xgb_reg,
            params,
            cv=self.options['cv'],
            scoring=self.options['scoring'],
            n_jobs=self.options['n_jobs'],
            verbose=self.options['verbose'],
            return_train_score=True)

        GSCV.fit(inputs_train, labels_train)

        print('best_params_:', GSCV.best_params_)
        print('best_score_:', GSCV.best_score_)

        print('Tuning: gamma')
        params = {
            "learning_rate": [0.1],  # np.arange(0.05,0.45,0.05), #eta
            "max_depth": [GSCV.best_params_['max_depth']],
            "min_child_weight": [GSCV.best_params_['min_child_weight']],
            "n_estimators": [GSCV.best_params_['n_estimators']],
            "colsample_bytree": [0.8],
            "subsample": [0.8],
            # np.arange(0.05,0.45,0.05),
            "gamma": self.options['gamma'],
        }

        GSCV = GridSearchCV(
            xgb_reg,
            params,
            cv=self.options['cv'],
            scoring=self.options['scoring'],
            n_jobs=self.options['n_jobs'],
            verbose=self.options['verbose'],
            return_train_score=True)

        GSCV.fit(inputs_train, labels_train)

        print('best_params_:', GSCV.best_params_)
        print('best_score_:', GSCV.best_score_)

        print('Tuning: colsample_bytree, subsample')

        params = {
            "learning_rate": [0.1],  # np.arange(0.05,0.45,0.05), #eta
            "max_depth": [GSCV.best_params_['max_depth']],
            "min_child_weight": [GSCV.best_params_['min_child_weight']],
            "n_estimators": [GSCV.best_params_['n_estimators']],
            "gamma": [GSCV.best_params_['gamma']],

            # np.arange(0.60, 0.95, 0.05),
            "colsample_bytree": self.options['colsample_bytree'],
            # np.arange(0.60, 0.95, 0.05),
            "subsample": self.options['subsample'],
        }

        GSCV = GridSearchCV(
            xgb_reg,
            params,
            cv=self.options['cv'],
            scoring=self.options['scoring'],
            n_jobs=self.options['n_jobs'],
            verbose=self.options['verbose'],
            return_train_score=True)

        GSCV.fit(inputs_train, labels_train)

        print('best_params_:', GSCV.best_params_)
        print('best_score_:', GSCV.best_score_)

        print('Tuning: reg_alpha, reg_lambda')

        params = {
            "learning_rate": [0.1],  # np.arange(0.05,0.45,0.05), #eta
            "max_depth": [GSCV.best_params_['max_depth']],
            "min_child_weight": [GSCV.best_params_['min_child_weight']],
            "n_estimators": [GSCV.best_params_['n_estimators']],
            "gamma": [GSCV.best_params_['gamma']],
            "colsample_bytree": [GSCV.best_params_['colsample_bytree']],
            "subsample": [GSCV.best_params_['subsample']],

            # ,[1e-5, 1e-2, 0.1, 1, 10], #alpha
            "reg_alpha": self.options['reg_alpha'],
            # [1e-5, 1e-2, 0.1, 1, 10],#lambda
            "reg_lambda": self.options['reg_lambda'],
        }

        GSCV = GridSearchCV(
            xgb_reg,
            params,
            cv=self.options['cv'],
            scoring=self.options['scoring'],
            n_jobs=self.options['n_jobs'],
            verbose=self.options['verbose'],
            return_train_score=True)

        GSCV.fit(inputs_train, labels_train)

        print('best_params_:', GSCV.best_params_)
        print('best_score_:', GSCV.best_score_)

        print('Tuning: learning_rate')

        params = {
            # np.arange(0.025,0.150,0.025), #np.arange(0.05,0.45,0.05), #eta
            "learning_rate": self.options['learning_rate'],
            "max_depth": [GSCV.best_params_['max_depth']],
            "min_child_weight": [GSCV.best_params_['min_child_weight']],
            "n_estimators": [GSCV.best_params_['n_estimators']],
            "gamma": [GSCV.best_params_['gamma']],
            "colsample_bytree": [GSCV.best_params_['colsample_bytree']],
            "subsample": [GSCV.best_params_['subsample']],
            "reg_alpha": [GSCV.best_params_['reg_alpha']],  # alpha
            "reg_lambda": [GSCV.best_params_['reg_lambda']]  # lambda
        }

        GSCV = GridSearchCV(
            xgb_reg,
            params,
            cv=self.options['cv'],
            scoring=self.options['scoring'],
            n_jobs=self.options['n_jobs'],
            verbose=self.options['verbose'],
            return_train_score=True)

        GSCV.fit(inputs_train, labels_train)

        print('best_params_:', GSCV.best_params_)
        print('best_score_:', GSCV.best_score_)

        print('Final model')

        # Regression
        regressor = XGBRegressor(random_state=self.options['seed'])
        regressor.set_params(**GSCV.best_params_)
        trained_regressor = regressor.fit(inputs_train, labels_train)
        self.regressor = trained_regressor
        self.feature_importances_ = self.regressor.feature_importances_
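        # --- Added note (not in the original snippet): at this point GSCV.best_params_
        # holds every hyper-parameter fixed by the staged searches above, so the final
        # regressor is refit once on the full training data with that combined
        # configuration; predictions are then served from self.regressor.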
    # Fit the algorithm on the data
    print('Start training')
    alg.fit(train_data, train_target)
    print('Start prediction')
    dtrain_predictions = alg.predict(test_data)
    daochu(dtrain_predictions)  # export the predictions
    print('Feature Importance')
    get_xgb_feat_importances(alg)
    # Print Feature Importance:
    # feat_imp = pd.Series(get_xgb_feat_importances(alg)(1)).sort_values(ascending=False)
    # feat_imp.plot(kind='bar', title='Feature Importances')
    # plt.ylabel('Feature Importance Score')
    # plt.show()
    # get_xgb_feat_importances(alg)


xgb1 = XGBRegressor(learning_rate=0.01,
                    n_estimators=1000,
                    max_depth=6,
                    min_child_weight=6,
                    gamma=0,
                    subsample=0.8,
                    colsample_bytree=0.8,
                    reg_alpha=0.005,
                    objective='reg:linear',
                    nthread=14,
                    scale_pos_weight=3,
                    seed=2016)
modelfit(xgb1, train_data, train_target, test_data)
Beispiel #22
0
    # Load the train and test data sets
    dataSet = train_data.iloc[:, :-3]
    labelSet = train_data.iloc[:, -3:]

    y_label = []
    for i in range(test_features.shape[1]):
        train_text, test_text, train_labels, test_labels = train_test_split(
            dataSet.iloc[:, i], labelSet, test_size=0.33, random_state=23333)

        train_text = np.mat(train_text).T
        test_text = np.mat(test_text).T
        # ==============================================================================
        # Model selection
        # ==============================================================================
        model_lscv = LassoCV()
        model_xgb = XGBRegressor()
        model_rfg = RandomForestRegressor()
        model_gb = GradientBoostingRegressor()

        preds1_test1, preds1_test2 = model_select(model_lscv)
        preds2_test1, preds2_test2 = model_select(model_xgb)
        preds3_test1, preds3_test2 = model_select(model_rfg)
        preds4_test1, preds4_test2 = model_select(model_gb)

        pred1 = (preds1_test1 + preds2_test1 + preds3_test1 + preds4_test1) / 4
        pred2 = (preds1_test2 + preds2_test2 + preds3_test2 + preds4_test2) / 4
        print('----- Feature %s -----' % i)
        print('Fit score1 + score2: Pearson correlation on the test set is {}'.format(
            pearsonr(list(test_labels.iloc[:, -1]), list(pred1))[0]))
        print('Only fit score: Pearson correlation on the test set is {}'.format(
            pearsonr(list(test_labels.iloc[:, -1]), list(pred2))[0]))
Beispiel #23
0
#from sklearn.ensemble import GradientBoostingRegressor # 1
#model = GradientBoostingRegressor() # 2

#from sklearn.ensemble import RandomForestClassifier
#model = RandomForestClassifier(n_estimators = 200, max_depth = 50)

import xgboost as xgb
from xgboost.sklearn import XGBRegressor

model = XGBRegressor(learning_rate=0.1,
                     n_estimators=310,
                     max_depth=5,
                     min_child_weight=2,
                     gamma=0.2,
                     subsample=0.85,
                     colsample_bytree=0.65,
                     objective='reg:linear',
                     nthread=4,
                     scale_pos_weight=1,
                     seed=27,
                     reg_alpha=84)

model.fit(x_train, y_train)
#model.fit(x_data, y_data)
y_test = model.predict(x_test)

#####################################################################

# Uncomment the two lines below to roughly check the accuracy of
# your model on the validation data
#y_pred = forest.predict(x_valid)
Beispiel #24
0
def train(X, Y):
    """search params and try different algs in ALGOS_TO_RUN;
    save a recipe, which contains the best algo (xgboost, ensemble, lasso, etc.)
     for the city"""
    # K-fold crossvalidation
    kfold = KFold(n_splits=K_FOLD_SPLITS)
    train_Ys, valid_Ys, train_metrics, valid_metrics = {}, {}, {}, {}
    train_ensemble, valid_ensemble = [], []

    for algo in ALGOS_TO_RUN:
        train_Ys[algo] = []
        valid_Ys[algo] = []
        train_metrics[algo] = []
        valid_metrics[algo] = []

    for kf_id, (train_indices, valid_indices) in enumerate(kfold.split(X)):
        if kf_id >= K_FOLDS_TO_RUN:
            break
        train_X = X[train_indices]
        val_X = X[valid_indices]
        train_Y = Y[train_indices]
        val_Y = Y[valid_indices]
        train_ensemble1 = []
        valid_ensemble1 = []

        # PCA
        if 'xgboost' in ALGOS_TO_RUN or 'lasso' in ALGOS_TO_RUN:
            pca = PCA(n_components=40)
            pca.fit(train_X)
            print("pca explained variance ratio: {}...".format(
                sum(pca.explained_variance_ratio_)))
            pca_train_X = pca.transform(train_X)
            pca_val_X = pca.transform(val_X)

        if 'lasso' in ALGOS_TO_RUN:
            # lasso
            model = Lasso(alpha=1.0)
            model.fit(pca_train_X, train_Y)
            pred_train_Y = model.predict(pca_train_X)
            pred_valid_Y = model.predict(pca_val_X)

            fold_train_mae = np.mean(abs(train_Y - pred_train_Y))
            fold_val_mae = np.mean(abs(val_Y - pred_valid_Y))
            train_Ys['lasso'].append(train_Y)
            valid_Ys['lasso'].append(val_Y)
            train_metrics['lasso'].append(fold_train_mae)
            valid_metrics['lasso'].append(fold_val_mae)
            train_ensemble1.append(train_Y - pred_train_Y)
            valid_ensemble1.append(val_Y - pred_valid_Y)

        if 'xgboost' in ALGOS_TO_RUN:
            # Xgboost
            model = XGBRegressor(
                learning_rate=0.01,  # default is 0.3
                n_estimators=500,  # number of trees
                max_depth=3,
                # min_child_weight=1,
                # gamma=0,
                # subsample=0.8,
                # colsample_bytree=0.8,
                # scale_pos_weight=1
            )
            model.fit(pca_train_X, train_Y)
            pred_train_Y = model.predict(pca_train_X)
            pred_valid_Y = model.predict(pca_val_X)

            fold_train_mae = np.mean(abs(train_Y - pred_train_Y))
            fold_val_mae = np.mean(abs(val_Y - pred_valid_Y))
            train_Ys['xgboost'].append(train_Y)
            valid_Ys['xgboost'].append(val_Y)
            train_metrics['xgboost'].append(fold_train_mae)
            valid_metrics['xgboost'].append(fold_val_mae)
            train_ensemble1.append(train_Y - pred_train_Y)
            valid_ensemble1.append(val_Y - pred_valid_Y)

        if 'stepwise' in ALGOS_TO_RUN:
            # stepwise forward selection (by p value)
            all_feature_indices = set(range(train_X.shape[1]))
            selected_feature_indices = [0, 1, 2, 3, 4, 5]

            def mp_get_pvalue(ind):
                """get p value for newly added feature, which is the last feature"""
                model = sm.OLS(
                    train_Y,
                    train_X[:, list(selected_feature_indices) + [ind]]).fit()
                pvalue = model.pvalues[-1]
                return pvalue

            def get_pvalue(Y, X):
                """get p value for newly added feature, which is the last feature"""
                model = sm.OLS(Y, X).fit()
                pvalue = model.pvalues[-1]
                return pvalue

            while len(selected_feature_indices) < MAX_N_FEATURE_SELECT:
                unselected_feature_indices = all_feature_indices - set(
                    selected_feature_indices)
                unselected_feature_indice1_pvalue0 = 100  # some random large p-value
                selected_feature_index = 0  # some random index

                # multi-processing (doesn't seem to speed up, moreover, costs a lot more time)
                # import time
                # pool = multiprocessing.Pool(4)
                # unselected_feature_indices_list = list(unselected_feature_indices)
                # start_time = time.time()
                # unselected_feature_pvalues = pool.map(mp_get_pvalue, unselected_feature_indices_list)
                # print("takes {}...".format(time.time() - start_time))
                # selected_feature_index = unselected_feature_indices_list[int(np.argmin(unselected_feature_pvalues))]
                # selected_feature_indices += [selected_feature_index]

                # construct array of pvalues
                unselected_feature_indices_list = list(
                    unselected_feature_indices)
                unselected_feature_pvalues = [
                    get_pvalue(
                        train_Y,
                        train_X[:, list(selected_feature_indices + [ind])])
                    for ind in tqdm(unselected_feature_indices_list)
                ]
                selected_feature_index = unselected_feature_indices_list[int(
                    np.argmin(unselected_feature_pvalues))]
                selected_feature_indices += [selected_feature_index]

            model = sm.OLS(train_Y, train_X[:, selected_feature_indices]).fit()
            pred_train_Y = model.predict(train_X[:, selected_feature_indices])
            pred_valid_Y = model.predict(val_X[:, selected_feature_indices])
            # print("avg Y: {}, train mae is: {}, val mae is: {}".format(np.mean(Y),
            #                                                            np.mean(abs(train_Y - pred_train_Y)),
            #                                                            np.mean(abs(val_Y - pred_valid_Y))))
            fold_train_mae = np.mean(abs(train_Y - pred_train_Y))
            fold_val_mae = np.mean(abs(val_Y - pred_valid_Y))
            train_Ys['stepwise'].append(train_Y)
            valid_Ys['stepwise'].append(val_Y)
            train_metrics['stepwise'].append(fold_train_mae)
            valid_metrics['stepwise'].append(fold_val_mae)
            train_ensemble1.append(train_Y - pred_train_Y)
            valid_ensemble1.append(val_Y - pred_valid_Y)

        if 'knn' in ALGOS_TO_RUN:
            model = KNeighborsRegressor(n_neighbors=5)
            model.fit(train_X, train_Y)
            pred_train_Y = model.predict(train_X)
            factor = np.mean([
                train_Y[i] / pred_train_Y[i] for i in range(len(pred_train_Y))
            ])
            pred_train_Y = factor * pred_train_Y
            pred_valid_Y = factor * model.predict(val_X)

            fold_train_mae = np.mean(abs(train_Y - pred_train_Y))
            fold_val_mae = np.mean(abs(val_Y - pred_valid_Y))
            train_Ys['knn'].append(train_Y)
            valid_Ys['knn'].append(val_Y)
            train_metrics['knn'].append(fold_train_mae)
            valid_metrics['knn'].append(fold_val_mae)
            train_ensemble1.append(train_Y - pred_train_Y)
            valid_ensemble1.append(val_Y - pred_valid_Y)

        train_ensemble.append(
            np.mean(
                abs(
                    np.sum(np.array(train_ensemble1), axis=0) /
                    len(ALGOS_TO_RUN))))
        valid_ensemble.append(
            np.mean(
                abs(
                    np.sum(np.array(valid_ensemble1), axis=0) /
                    len(ALGOS_TO_RUN))))

    for k in train_metrics:
        print("{}: avg train_Y: {}, avg val_Y: {}, train mae: {}, val mae: {}".
              format(k, np.mean(train_Ys[k]), np.mean(valid_Ys[k]),
                     np.mean(train_metrics[k]), np.mean(valid_metrics[k])))
    print("ensemble: train mae: {}, val mae: {}".format(
        np.mean(train_ensemble), np.mean(valid_ensemble)))
    algos = [k for k in train_metrics
             ] + ['ensemble-{}'.format('-'.join(train_metrics.keys()))]
    algos_scores = [np.mean(valid_metrics[k])
                    for k in train_metrics] + [np.mean(valid_ensemble)]
    return algos[int(np.argmin(algos_scores))]
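
# --- Added usage sketch (not in the original snippet): assuming X and Y are numpy
# arrays and the module-level constants (K_FOLD_SPLITS, ALGOS_TO_RUN, ...) are set,
# train() returns the name of the best-scoring recipe, e.g. 'xgboost' or
# 'ensemble-lasso-xgboost', which the caller can persist for the city.
# best_algo = train(X, Y)
# print('chosen recipe:', best_algo)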
Beispiel #25
0
    def __init__(self, d):
        self.linear_reg = linear_model.Ridge()
        self.xgb_reg = XGBRegressor(max_depth=7)
        self.d = d
Beispiel #26
0
    def get_best_model(self):
        rgr = None
        params = None
        if self.model_name_ == "xgboost":
            rgr = XGBRegressor(colsample_bytree=0.2,
                               gamma=0.0,
                               learning_rate=0.5,
                               max_depth=6,
                               min_child_weight=1.5,
                               n_estimators=2000,
                               reg_alpha=0.9,
                               reg_lambda=0.6,
                               subsample=0.2,
                               seed=42,
                               silent=1)

            #rgr = XGBRegressor(learning_rate=0.1, n_estimators=140, max_depth=6, min_child_weight=1, gamma=0, subsample=0.8,
            #                    colsample_bytree=0.8, objective="reg:squarederror", scale_pos_weight=1, seed=10)
            params = {}
        elif self.model_name_ == "lightgbm":
            rgr = LGBMRegressor(boosting_type='gbdt',
                                objective='regression',
                                n_estimators=300,
                                metric={'l2', 'l1'},
                                num_leaves=31,
                                learning_rate=0.05,
                                feature_fraction=0.9,
                                bagging_fraction=0.8,
                                bagging_freq=5,
                                verbose=0)
            params = {}
        elif self.model_name_ == "svr":
            rgr = SVR(
                kernel='rbf',
                degree=3,
                gamma='auto',
                coef0=0.0,
                tol=1e-3,
                C=1.0,
                #epsilon=0.1,
                epsilon=1.0,
                shrinking=True,
                cache_size=200,
                verbose=False,
                max_iter=-1)
            params = {}

        X_train, X_train_tmp, y_train, y_train_tmp = None, None, None, None
        for i in range(1, 5):
            file_name = "train_{0}.csv".format(i)
            X_train_tmp, y_train_tmp = HandlerFeatures(
                file_name).get_train_features()
            if X_train is None:
                X_train = X_train_tmp[:]
                y_train = y_train_tmp[:]
            else:
                X_train = np.append(X_train, X_train_tmp, axis=0)
                y_train = np.append(y_train, y_train_tmp, axis=0)

        grid = GridSearchCV(estimator=rgr,
                            param_grid=params,
                            cv=5,
                            scoring=None,
                            iid=False,
                            n_jobs=-1)
        grid.fit(X_train, y_train)

        self.best_model_ = grid.best_estimator_
Beispiel #27
0
xgb_params = {
    'max_depth': 100,
    'random_state': 10,
    'n_estimators': 1500,
    'learning_rate': 0.1,
    'silent': False,
    'booster': 'gbtree',
    'min_child_weight': 57,
    'gamma': 1.45,
    'alpha': 0.0,
    'subsample': 0.67,
    'colsample_bytree': 0.054,
    'colsample_bylevel': 0.5,
    'metric': 'rmse'
}
model = CrossValRegressor(XGBRegressor(**xgb_params), n_split=10)
_, _ = model.fit(X_train.values,
                 np.log1p(Y_train.values),
                 X_val.values,
                 np.log1p(Y_val.values),
                 eval_metric=RMSE)
model.save_models('test_regressionxgbcv.pkl')
del model

with open('test_regressionxgbcv.pkl', 'rb') as f:
    model = pickle.load(f)
preds = model.predict(X_val.values)
print(RMSE(np.log1p(Y_val.values), preds))
model_single = XGBRegressor(**xgb_params)
model_single.fit(X_train,
                 np.log1p(Y_train))

from xgboost.sklearn import XGBRegressor

params = {
    'learning_rate': 0.1,
    'n_estimators': 1000,
    'subsample': 0.8,
    'colsample_bytree': 0.6,
    'max_depth': 12,
    'min_child_weight': 1,
    'reg_alpha': 1,
    'gamma': 0
}

regressor = XGBRegressor(learning_rate=params['learning_rate'], n_estimators=params['n_estimators'],
                             booster='gbtree', objective='reg:linear', n_jobs=-1, subsample=params['subsample'],
                             colsample_bytree=params['colsample_bytree'], random_state=0,
                             max_depth=params['max_depth'], gamma=params['gamma'],
                             min_child_weight=params['min_child_weight'], reg_alpha=params['reg_alpha'])

regressor.fit(X_train, y_train, verbose=True, early_stopping_rounds=10, eval_set=eval_set)

# use the trained model to predict on the test data
test_df['prediction'] = regressor.predict(test_df[train_feature].values)
test_df.iloc[:5]

df2 = pd.merge(df2, test_df[['date','prediction']], on=['date'],how='left')
df2.iloc[-5:]

# calculate the final speed
df2['imputationa1'] = df2['speed'].isnull()
df2['speed'] = df2['speed'].fillna(value=df2['prediction'])
Beispiel #29
0
def run_find(x_train, y_train, i, x_predict):

    # Find a suitable number of estimators before parameter tuning

    clf = XGBRegressor(
        objective='reg:linear',
        learning_rate=0.1,  # [default 0.3] smaller values reduce overfitting; typical range 0.01-0.2
        gamma=0,  # minimum loss reduction required to split a node; larger values make the algorithm more conservative
        subsample=0.8,  # row sampling ratio, 0.5-1; too small underfits, too large overfits
        colsample_bytree=0.8,  # fraction of features used to build each tree
        reg_alpha=1,  # L1 regularization term on weights (xgboost default is 0)
        reg_lambda=1,  # [default 1] L2 regularization term on weights
        max_depth=10,  # [default 6] maximum tree depth, typically 3-10, used to control overfitting
        min_child_weight=1,  # [default 1] minimum sum of instance weight in a child; larger values avoid fitting local patterns but can underfit
    )
    nums, fscore = modelfit(clf,
                            x_train,
                            y_train,
                            cv_folds=5,
                            early_stopping_rounds=50,
                            feval=evalerror)
    print('test_estimators:', nums)
    clf.set_params(n_estimators=nums)

    # 1. First tune the two most important parameters: max_depth and min_child_weight
    ## Coarse search:
    param_test1 = {
        'max_depth': [i for i in range(3, 12, 2)],
        'min_child_weight': [i for i in range(1, 10, 2)]
    }
    best_params, best_score = find_params(param_test1, clf, x_train, y_train)
    print('model', i, ':')
    print(best_params, ':best_score:', best_score)

    ## Fine search:
    max_d = best_params['max_depth']
    min_cw = best_params['min_child_weight']
    param_test2 = {
        'max_depth': [max_d - 1, max_d, max_d + 1],
        'min_child_weight': [min_cw - 1, min_cw, min_cw + 1]
    }
    best_params, best_score = find_params(param_test2, clf, x_train, y_train)
    clf.set_params(max_depth=best_params['max_depth'],
                   min_child_weight=best_params['min_child_weight'])
    print('model', i, ':')
    print(best_params, ':best_score:', best_score)

    # 2. Tune gamma:
    ## Coarse search:
    param_test3 = {'gamma': [i / 10.0 for i in range(0, 10, 2)]}
    best_params, best_score = find_params(param_test3, clf, x_train, y_train)
    print('model', i, ':')
    print(best_params, ':best_score:', best_score)

    ## Fine search:
    b_gamma = best_params['gamma']
    param_test4 = {'gamma': [b_gamma, b_gamma + 0.1, b_gamma + 0.2]}
    best_params, best_score = find_params(param_test4, clf, x_train, y_train)
    clf.set_params(gamma=best_params['gamma'])
    print('model', i, ':')
    print(best_params, ':best_score:', best_score)

    # 3. Tune subsample and colsample_bytree
    ## Coarse search
    param_test5 = {
        'subsample': [i / 10.0 for i in range(6, 10)],
        'colsample_bytree': [i / 10.0 for i in range(6, 10)]
    }
    best_params, best_score = find_params(param_test5, clf, x_train, y_train)
    print('model', i, ':')
    print(best_params, ':best_score:', best_score)

    ## Fine search
    b_subsample = best_params['subsample']
    b_colsample_bytree = best_params['colsample_bytree']
    param_test6 = {
        'subsample': [b_subsample - 0.05, b_subsample, b_subsample + 0.05],
        'colsample_bytree': [
            b_colsample_bytree - 0.05, b_colsample_bytree,
            b_colsample_bytree + 0.05
        ]
    }
    best_params, best_score = find_params(param_test6, clf, x_train, y_train)
    clf.set_params(subsample=best_params['subsample'],
                   colsample_bytree=best_params['colsample_bytree'])
    print('model', i, ':')
    print(best_params, ':best_score:', best_score)

    # 4. Tune reg_alpha and reg_lambda
    ## Coarse search
    param_test7 = {
        'reg_alpha': [1e-5, 1e-2, 0.1, 1, 2],
        'reg_lambda': [1e-5, 1e-2, 0.1, 1, 2]
    }
    best_params, best_score = find_params(param_test7, clf, x_train, y_train)
    print('model', i, ':')
    print(best_params, ':best_score:', best_score)

    ## Fine search
    b_alp = best_params['reg_alpha']
    b_lam = best_params['reg_lambda']
    param_test8 = {
        'reg_alpha': [b_alp, 2 * b_alp, 3 * b_alp],
        'reg_lambda': [b_lam, 2 * b_lam, 3 * b_lam]
    }
    best_params, best_score = find_params(param_test8, clf, x_train, y_train)
    clf.set_params(reg_alpha=best_params['reg_alpha'],
                   reg_lambda=best_params['reg_lambda'])
    print('model', i, ':')
    print(best_params, ':best_score:', best_score)

    # 5. Lower learning_rate and increase the number of boosting rounds
    clf.set_params(learning_rate=0.01)
    nums, fscore = modelfit(clf,
                            x_train,
                            y_train,
                            cv_folds=5,
                            early_stopping_rounds=50,
                            feval=evalerror)
    clf.set_params(n_estimators=nums)

    clf.fit(x_train, y_train)
    y_predict = clf.predict(x_predict)

    return y_predict, fscore
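
# --- Added sketch (not part of the original example): a self-contained stand-in for
# the coarse max_depth / min_child_weight step above, using plain GridSearchCV on
# synthetic data in place of the project-specific find_params() helper.
import numpy as np
from sklearn.model_selection import GridSearchCV
from xgboost.sklearn import XGBRegressor

_rng = np.random.RandomState(0)
_X = _rng.rand(200, 5)
_y = _X @ _rng.rand(5) + 0.1 * _rng.randn(200)
_grid = GridSearchCV(XGBRegressor(learning_rate=0.1, n_estimators=50),
                     {'max_depth': [3, 5, 7], 'min_child_weight': [1, 3, 5]},
                     scoring='neg_mean_squared_error', cv=3)
_grid.fit(_X, _y)
print(_grid.best_params_, _grid.best_score_)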
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.impute import SimpleImputer
from xgboost.sklearn import XGBRegressor
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score


data = pd.read_csv('train.csv')

data.dropna(axis=0, subset=['SalePrice'], inplace=True)
y = data.SalePrice
X = data.drop(['SalePrice'], axis=1).select_dtypes(exclude=['object'])

# select XGBRegressor
my_model = XGBRegressor(n_estimators=750, learning_rate=0.02)
# make pipeline
my_pipeline = make_pipeline(SimpleImputer(), my_model)

scores = cross_val_score(my_pipeline, X, y, scoring='neg_mean_absolute_error', cv=5, n_jobs=2)

print('Mean Absolute Error: %.2f' % (-1 * scores.mean()))
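
# --- Added sketch (not part of the original example): once the cross-validated MAE
# looks reasonable, the same pipeline can be refit on all of X and applied to the
# competition test file; 'test.csv' is an assumption mirroring 'train.csv' above.
my_pipeline.fit(X, y)
test_data = pd.read_csv('test.csv').select_dtypes(exclude=['object'])
test_predictions = my_pipeline.predict(test_data)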
Y_pred = rfr.predict(X_test)

print("Accuracy:", rfr.score(X_test, Y_test))

rootsqerr = sqrt(mean_squared_error(Y_test, Y_pred))

print("RMSE:", rootsqerr)

####################################
# XGBoostRegressor
####################################

print("XGBoost Regressor...")

xgb1 = XGBRegressor()
parameters = {
    'nthread': [1, 2, 3, 4],
    'objective': ['reg:linear'],
    'learning_rate': [.03, 0.05, .07],
    'max_depth': [5, 6, 7],
    'min_child_weight': [4],
    'silent': [1],
    'subsample': [0.7],
    'colsample_bytree': [0.7],
    'n_estimators': [100, 200, 500]
}

xgb_grid = GridSearchCV(xgb1, parameters, cv=5, n_jobs=5, verbose=True)

xgb_grid.fit(X_train, Y_train)
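
# --- Added sketch (not in the original snippet): after fitting, the search object
# exposes the winning configuration, its CV score, and a refit estimator.
print(xgb_grid.best_params_)
print(xgb_grid.best_score_)
best_xgb = xgb_grid.best_estimator_  # already refit on the full training data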
Beispiel #32
0
    dense1_layer_model = Model(inputs=model.input,
                               outputs=model.get_layer('fc1').output)

    model.summary()
    return model,dense1_layer_model

# CNN model
model, dense1_layer_model = get_model()
# CNN-extracted features fed to a gradient-boosted regressor
dense1_train = dense1_layer_model.predict(X_tr)
nn_train = pd.DataFrame(dense1_train, columns=['nn_%d' % column for column in range(10)])

dense1_test = dense1_layer_model.predict(X_te)
nn_test = pd.DataFrame(dense1_test, columns=['nn_%d' % column for column in range(10)])

clf = XGBRegressor().fit(nn_train,y_train)
y_pre = clf.predict(nn_test)

print('CNN features + XGBRegressor:')
print(mean_squared_error(y_pre,y_test))

# Original features
# =================================================================
def get_original_feature(train,test,seq_len):
    predictor = [column for column in train.columns 
                 if column not in ['year','lunar_year',
                                   'lunar_xun','djz','date','count1','virtual_date']]
    train = train[predictor]
    test = test[predictor]
    train = train[seq_len:]
    test = test[seq_len:]
Beispiel #33
0
labels = labels[marks]
# x_test = loadtxt('train_feature.txt', delimiter=' ')
seed = 10
test_size = 0.3

max_idx = list()
max_score = 0

for i in range(1, 7):
    sub_idxs = list(combinations(idx, i))
    for sub_idx in sub_idxs:
        # print(list(sub_idx), raw_x.shape)
        x = raw_x[:, list(sub_idx)]
        # print(len(x))
        x_train, x_test, y_train, y_test = train_test_split(
            x, labels, test_size=test_size, random_state=seed)
        # model = XGBClassifier(learning_rate=0.01,
        #                       # seed=seed,
        #                       max_depth=10,
        #                       silent=1)
        model = XGBRegressor(max_depth=10)
        model.fit(x_train, y_train)
        y_pre = model.predict(x_test)
        predictions = [round(value) for value in y_pre]
        accuracy = accuracy_score(y_test, predictions)
        if accuracy > max_score:
            max_score = accuracy
            max_idx = np.copy(sub_idx)

print(max_score, max_idx)
warnings.filterwarnings('ignore')

# 1. Data

dataset = load_diabetes()
x = dataset.data
y = dataset.target

x_train , x_test,y_train ,y_test = train_test_split( x, y, train_size = 0.8, random_state=104)

kfold = KFold(n_splits=5, shuffle=True)  # (shuffle=False: sequential order)
st = datetime.datetime.now()

parameters = [
    {'n_estimators': [100, 200, 300], 'learning_rate': [0.1, 0.3, 0.001, 0.01], 'max_depth': [4, 5, 6]},
    {'n_estimators': [90, 100, 110], 'learning_rate': [0.1, 0.001, 0.01], 'max_depth': [4, 5, 6], 'colsample_bytree': [0.6, 0.9, 1]},
    {'n_estimators': [100, 110], 'learning_rate': [0.1, 0.5, 0.001], 'max_depth': [4, 5, 6], 'colsample_bytree': [0.6, 0.9, 1], 'colsample_bylevel': [0.6, 0.7, 0.9]}
]

# 2. Build the model

model = RandomizedSearchCV(XGBRegressor(eval_metric='mlogloss'), parameters, cv=kfold)

score = cross_val_score(model, x_train, y_train, cv=kfold)


print(score)

et = datetime.datetime.now()

print(et-st)
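
# --- Added note (not in the original snippet): cross_val_score above nests the
# randomized search inside each outer fold, so the chosen hyper-parameters are never
# printed; fitting the search object itself would expose them, roughly like this:
# model.fit(x_train, y_train)
# print('best_params_:', model.best_params_)
# print('test R^2:', model.score(x_test, y_test))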
Beispiel #35
0

def save_result(data, columns):
    """
    :param data: np.array
    :param columns: list
    """
    index = []
    for i in range(len(data)):
        index.append(i + 1461)
    data = np.column_stack((index, data))
    df = pd.DataFrame(data, columns=columns)
    df['Id'] = df['Id'].astype('int')
    df.to_csv("result.csv", index=False)


# Model: XGBRegressor (RandomForestRegressor alternative commented out below)
rf = XGBRegressor()
#rf = RandomForestRegressor()
for i in range(Config.max_step + 1):
    # features, labels = train.next_batch(Config.batch_size)
    rf.fit(train.df, train.labels)
    if i % 20 == 0:
        print("Step: %d" % i)
        _pre = rf.predict(test.df)
        N = len(_pre)
        print(np.c_[_pre, result['SalePrice']])
        print(np.sqrt(np.sum(np.log10(result['SalePrice'] / _pre) ** 2) / N))
result = rf.predict(test.df)
save_result(result, ['Id', 'SalePrice'])

error_df[error_df.pred.isnull()]


pd.Series(rf_age_valid_pred).isnull().sum()
statistics.mean(error_df.sqerr)

len(rf_age_valid_pred)
len(age_valid_Y)
pd.crosstab(pd.Series(rf_age_valid_pred).apply(lambda x: round(x)),age_valid_Y)

## XGB for age prediction
from xgboost.sklearn import XGBRegressor

xgb = XGBRegressor(max_depth=6, learning_rate=0.2, n_estimators=100,
                    objective='reg:linear', subsample=0.5, colsample_bytree=0.5, seed=321)

eval_set  = [(mvt_train_X.drop(['age', 'gender'], axis=1), age_train_Y), (mvt_valid_X.drop(['age', 'gender'], axis=1),age_valid_Y)]
xgb.fit(mvt_train_X.drop(['age', 'gender'], axis=1), age_train_Y, eval_set = eval_set, eval_metric= 'rmse',early_stopping_rounds= 10, verbose=1)
xgb_age_valid_pred = xgb.predict(mvt_valid_X.drop(['age', 'gender'], axis=1))


## AdaBoost for age prediction
from sklearn.ensemble import AdaBoostRegressor
ada = AdaBoostRegressor(n_estimators=50,learning_rate=0.1,loss='linear', random_state=321)
ada.fit(mvt_train_X.drop(['age', 'gender'], axis=1), age_train_Y.values,)
ada_age_valid_pred = ada.predict(mvt_valid_X.drop(['age', 'gender'], axis=1))

len(ada_age_valid_pred)
len(age_valid_Y)
    print("Test RMSE : %.4g" % mean_squared_error(y_test.values, dtest_prediction) ** 0.5)

    # feat_imp = pd.Series(alg.get_booster().get_fscore()).sort_values(ascending=False)
    # feat_imp.plot(kind='bar', title='Feature Importance')
    # plt.ylabel('Feature Importance Score')
    # plt.show()
    plot_importance(alg)
    plt.show()

    importances = alg.feature_importances_

    return dtrain_prediction, dtest_prediction


# Initialize the model
xgb0 = XGBRegressor(random_state=10, importance_type='gain')
start = time()
dtrain_prediction, dtest_prediction = modelfit(xgb0, X_train, y_train, X_val, y_val)
end = time()
print("the model fit time: %.4f" % (end-start))

train_out = pd.DataFrame(list(zip(y_train.values.flatten(), pd.Series(dtrain_prediction))),
                         index=y_train.index, columns=['y_true', 'y_pred'])
test_out = pd.DataFrame(list(zip(y_val.values.flatten(), pd.Series(dtest_prediction))),
                        index=y_val.index, columns=['y_true', 'y_pred'])


# Step 4: feature selection
model = SelectFromModel(xgb0, prefit=True)
selection_X_train = model.transform(X_train)
selection_X_val = model.transform(X_val)
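
# --- Added sketch (not in the original snippet): the reduced matrices from
# SelectFromModel can be fed back into a fresh regressor to check whether the
# selected features preserve the validation error.
from sklearn.metrics import mean_squared_error

selection_model = XGBRegressor(random_state=10, importance_type='gain')
selection_model.fit(selection_X_train, y_train)
selection_pred = selection_model.predict(selection_X_val)
print("selected-feature RMSE: %.4g" % (mean_squared_error(y_val, selection_pred) ** 0.5))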
Beispiel #38
0
target_raw = boston.target

X_train, X_test, y_train, y_test = train_test_split(data_raw,
                                                    target_raw,
                                                    test_size=0.1,
                                                    random_state=33)
feature_names = boston.feature_names

# =============================================================================

# =============================================================================
# Build and train the model

# Wrapped as a scikit-learn style model
# https://xgboost.readthedocs.io/en/latest/python/python_api.html#module-xgboost.sklearn
xgb = XGBRegressor()

xgb.fit(
    X_train,
    y_train,
    early_stopping_rounds=10,  # stop early when the metric has not improved for 10 rounds
    eval_set=[(X_train, y_train), (X_test, y_test)],  # evaluation sets, scored each round
    eval_metric=['rmse']
    # evaluation metrics: 'error', 'logloss', 'rmse', 'auc', etc.
    # http://xgboost.readthedocs.io/en/latest/parameter.html
)

# =============================================================================

# =============================================================================
# Predict and evaluate
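
# --- Added sketch (not in the original, which breaks off here): score the fitted
# model on the held-out split with RMSE, matching the eval_metric used above.
import numpy as np
from sklearn.metrics import mean_squared_error

y_pred = xgb.predict(X_test)
print('test RMSE: %.4f' % np.sqrt(mean_squared_error(y_test, y_pred)))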
split_num=int(38070*0.95)
load1_train=load1.iloc[0:split_num,:]
load1_test=load1.iloc[split_num:38070,:]
load1_test2=load1[37089-48:37089]
train_x, test_x, train_y, test_y = train_test_split(load1_train.drop('load1',1), load1_train['load1'],train_size=0.9, random_state=133)
Dtrain = xgb.DMatrix(train_x, train_y)  # needed by xgb.cv below
Dtest = xgb.DMatrix(test_x, test_y)

####################################################### CV parameter tuning
xgb1 = XGBRegressor(
    learning_rate=0.1,
    n_estimators=1000,
    max_depth=5,
    min_child_weight=1,
    gamma=0,
    subsample=0.8,
    colsample_bytree=0.8,
    objective='reg:gamma',
    nthread=4,
    scale_pos_weight=1,
    seed=1024)

##### parameter 1: max_depth

xgb_param = xgb1.get_xgb_params()
cvresult = xgb.cv(xgb_param, Dtrain, num_boost_round=xgb1.get_params()['n_estimators'],
                  nfold=5, metrics='rmse', early_stopping_rounds=50)
xgb1.set_params(n_estimators=cvresult.shape[0])
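
# --- Added sketch (not in the original, which stops here): with n_estimators fixed by
# the CV result above, the max_depth / min_child_weight search announced in the header
# could follow the same GridSearchCV pattern used in the earlier examples.
from sklearn.model_selection import GridSearchCV

param_test1 = {'max_depth': [3, 5, 7, 9], 'min_child_weight': [1, 3, 5]}
gsearch1 = GridSearchCV(estimator=xgb1, param_grid=param_test1,
                        scoring='neg_mean_squared_error', cv=5, n_jobs=-1)
gsearch1.fit(train_x, train_y)
print(gsearch1.best_params_, gsearch1.best_score_)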