Example #1
def __init__(self, model, test_file):
    # "cd" is the project's data-preparation helper (clean_dataframe / X_Y_split).
    self.model = model
    self.test_file = cd.clean_dataframe(test_file)
    self.x_actual, self.y_actual = cd.X_Y_split(self.test_file)
    # Flatten the target column to a 1-D array for metric calculations.
    self.y_actual = self.y_actual.values.ravel()
    # Generate predictions and store them alongside the cleaned test data.
    self.predictions = self.make_predictions()
    self.test_file["predictions"] = self.predictions
Example #2
def __init__(self, params, dataframe):
    self.params = params
    self.dataframe = dataframe
    self.model = xgb.XGBRegressor(**params)
    # Hold out 30% of the data for validation.
    self.train_data, self.validation_data = train_test_split(
        self.dataframe, test_size=0.3, random_state=100)
    train_x, train_y = cd.X_Y_split(self.train_data)
    validation_x, validation_y = cd.X_Y_split(self.validation_data)
    # DMatrix versions of both splits for xgboost's native API.
    self.dtrain = xgb.DMatrix(data=train_x,
                              label=train_y,
                              feature_names=train_x.columns)
    self.dvalidation = xgb.DMatrix(data=validation_x,
                                   label=validation_y,
                                   feature_names=validation_x.columns)
    # Evaluation sets in both formats: (DMatrix, name) pairs for xgb.train
    # and (X, y) pairs for the scikit-learn wrapper's fit(eval_set=...).
    self.eval_matrix = [(self.dtrain, "train"),
                        (self.dvalidation, "validation")]
    self.eval_set = [(train_x, train_y), (validation_x, validation_y)]
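The class's own training method is not included in this snippet. A hypothetical sketch of how the attributes prepared above could be consumed, covering both the scikit-learn wrapper and xgboost's native API (the method names and num_boost_round value are illustrative):

def train_sklearn_api(self):
    # Hypothetical sketch: fit the scikit-learn wrapper on the training split
    # and monitor every (X, y) pair collected in eval_set.
    train_x, train_y = self.eval_set[0]
    self.model.fit(train_x, train_y,
                   eval_set=self.eval_set,
                   verbose=False)

def train_native_api(self):
    # Hypothetical sketch: train with xgboost's native API using the DMatrix
    # pairs in eval_matrix; 100 boosting rounds is an arbitrary example value.
    booster = xgb.train(self.params, self.dtrain,
                        num_boost_round=100,
                        evals=self.eval_matrix)
    return booster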
Example #3
from sklearn.model_selection import KFold, RandomizedSearchCV


def tune_all(data, estimator, param_grid, n_iter=10, n_splits=5):
    # Randomized search over the whole parameter grid with k-fold CV;
    # "cd" is the project's data-preparation helper.
    train_x, train_y = cd.X_Y_split(data)
    kfold = KFold(n_splits=n_splits)
    param_search = RandomizedSearchCV(estimator,
                                      param_grid,
                                      n_iter=n_iter,
                                      scoring="neg_mean_squared_error",
                                      cv=kfold)
    # verbose=0 is forwarded to the estimator's fit() to silence training output.
    grid_result = param_search.fit(train_x, train_y, verbose=0)
    print("Best: %f using %s" %
          (grid_result.best_score_, grid_result.best_params_))
    return grid_result
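A hypothetical call of tune_all for an XGBoost regressor; the search space is illustrative, and data stands in for a dataframe already prepared with the project's cleaning helpers:

import xgboost as xgb

param_grid = {
    "max_depth": [3, 5, 7, 9],
    "learning_rate": [0.01, 0.05, 0.1, 0.3],
    "n_estimators": [100, 300, 500],
}
estimator = xgb.XGBRegressor(objective="reg:squarederror")
result = tune_all(data, estimator, param_grid, n_iter=20, n_splits=5)
best_model = result.best_estimator_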
Example #4
def add_evalset(self, dataframe):
    """
    Add an extra dataset for validation during training.
    The dataframe is cleaned with clean_dataframe before being split
    and appended to the evaluation sets.
    """
    dataframe = cd.clean_dataframe(dataframe)
    new_val_x, new_val_y = cd.X_Y_split(dataframe)
    new_val_mat = xgb.DMatrix(data=new_val_x,
                              label=new_val_y,
                              feature_names=new_val_x.columns)
    # Track the new set in both evaluation formats (native API and sklearn API).
    self.eval_matrix.append((new_val_mat, "validation_2"))
    self.eval_set.append((new_val_x, new_val_y))
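A brief hypothetical usage sketch; trainer is an instance of the surrounding class and extra_df is a second raw validation dataframe (both names are placeholders):

trainer.add_evalset(extra_df)
# The extra hold-out set is now monitored together with train/validation.
train_x, train_y = trainer.eval_set[0]
trainer.model.fit(train_x, train_y,
                  eval_set=trainer.eval_set,
                  verbose=False)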
Example #5
import xgboost as xgb
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold, GridSearchCV, RandomizedSearchCV


def tune_parameter(data,
                   parameter,
                   param_range,
                   save_plot=False,
                   randomized=False,
                   save_path=None,
                   n_iter=None,
                   n_splits=5,
                   estimator=None):
    """
    Function to tune a parameter using either gridsearch or randomized search with possibility of cross validation.
    Input:
        - data = dataset to be used tuning, usually the training dataset.
        - parameter = string of parameter to be tuned. (works with XGBoost for now)
        - param_range = parameter search space
        - estimator = model to be tuned if existing already, if not a new default XGBRegressor model wil be created
    """

    train_x, train_y = cd.X_Y_split(data)
    param_grid = {parameter: list(param_range)}
    # shuffle=True is needed for random_state to have an effect in current scikit-learn.
    kfold = KFold(n_splits=n_splits, shuffle=True, random_state=7)
    if estimator is None:
        estimator = xgb.XGBRegressor(objective="reg:squarederror")
    if randomized:
        assert n_iter is not None, "Missing number of iterations"
        param_search = RandomizedSearchCV(estimator,
                                          param_grid,
                                          n_iter=n_iter,
                                          scoring="neg_mean_squared_error",
                                          cv=kfold)
    else:
        param_search = GridSearchCV(estimator,
                                    param_grid,
                                    verbose=0,
                                    cv=kfold,
                                    scoring="neg_mean_squared_error")
    grid_result = param_search.fit(train_x, train_y, verbose=0)

    print("Best: %f using %s" %
          (grid_result.best_score_, grid_result.best_params_))

    means = grid_result.cv_results_['mean_test_score']
    stds = grid_result.cv_results_['std_test_score']
    params = grid_result.cv_results_['params']

    # for mean, stdev, param in zip(means, stds, params):
    #     print("%f (%f) with: %r" % (mean, stdev, param))

    if randomized:
        # Plot against the parameter values actually sampled by the randomized search.
        param_range = [list(i.values())[0] for i in params]

    fig, ax = plt.subplots()
    # Scores are negative MSE, so negate them before plotting the error.
    ax.errorbar(param_range, -1 * means, yerr=stds)
    ax.set_title("XGBoost %s vs MSE" % parameter)
    ax.set_xlabel('%s' % parameter)
    ax.set_ylabel('MSE')
    if save_plot:
        if save_path:
            fig.savefig("%s/%s.png" % (save_path, parameter))
        else:
            fig.savefig("%s.png" % parameter)
    return grid_result
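A hypothetical call that grid-searches max_depth and saves the resulting error plot; the range, the "plots" directory, and the train_df placeholder are example values only:

result = tune_parameter(train_df,
                        parameter="max_depth",
                        param_range=range(2, 11),
                        save_plot=True,
                        save_path="plots")
print(result.best_params_)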