def grid_search_para(train_data,
                     label,
                     best_para=0,
                     grid_param=0,
                     is_search_estimator=False,
                     search_lr=0.1,
                     scoring='roc_auc',
                     search_estimators=100,
                     iid=False,
                     cv=skfold):
    """Run one tuning step for an XGBoost model and return the updated params.

    Two modes, selected by ``is_search_estimator``:

    * ``False`` - grid-search ``grid_param`` with ``GridSearchCV`` starting
      from ``best_para`` and merge the winning values into ``best_para``.
      ``best_para`` must be a dict and ``grid_param`` a valid parameter grid
      in this mode (the ``0`` defaults are only placeholders).
    * ``True`` - search the number of boosting rounds via ``xgb_cv`` using
      ``search_lr`` as learning rate and ``search_estimators`` as the upper
      bound. If ``best_para`` is ``0`` the parameters of a default
      ``XGBRegressor(booster="dart")`` are used as the starting point.

    Args:
        train_data: Training features passed to ``fit`` / ``xgb_cv``.
        label: Training targets.
        best_para: Dict of current best parameters (mutated in place), or
            ``0`` to start from defaults in estimator-search mode.
        grid_param: Parameter grid for ``GridSearchCV``.
        is_search_estimator: Select the ``n_estimators`` search mode.
        search_lr: Learning rate used while searching ``n_estimators``.
        scoring: Scoring passed to ``GridSearchCV``.
        search_estimators: Initial ``n_estimators`` for the estimator search.
        iid: Forwarded to ``GridSearchCV``.
        cv: Cross-validation splitter forwarded to ``GridSearchCV``.

    Returns:
        The updated ``best_para`` dict.
    """
    if not is_search_estimator:
        print("search other parameters")
        # Set the objective and thread count BEFORE building the estimator.
        # Previously they were written into the dict only after construction,
        # so the grid search ran without them while the returned dict
        # claimed they had been used.
        best_para['objective'] = 'binary:logistic'
        best_para['nthread'] = 8
        xgb_ = XGBRegressor(**best_para)
        grid_search = GridSearchCV(estimator=xgb_,
                                   param_grid=grid_param,
                                   scoring=scoring,
                                   iid=iid,
                                   cv=cv)
        grid_search.fit(train_data, label)
        best_para.update(grid_search.best_params_)
    else:
        print("search n_estimators parameters")
        xgb_ = XGBRegressor(booster="dart")
        if best_para == 0:
            # No starting point supplied: begin from the estimator defaults.
            best_para = xgb_.get_params()
        best_para['n_estimators'] = search_estimators
        best_para['learning_rate'] = search_lr
        xgb_ = XGBRegressor(**best_para)
        best_estimator = xgb_cv(xgb_, train_data, label)
        best_para['n_estimators'] = best_estimator

    return best_para
def grid_search(parameters,
                X_train_res,
                y_train_res,
                X_test,
                y_test,
                useTrainCV=False):
    """Grid-search an XGBRegressor, refit with the best params, report AUC.

    The search uses 10-fold shuffled ``StratifiedKFold`` and ``roc_auc``
    scoring. The winning parameters are used to build a fresh regressor;
    ``parameters`` must therefore contain every key listed in
    ``tuned_names`` below. With ``useTrainCV`` the number of boosting rounds
    is additionally chosen by ``cv`` with early stopping.

    Args:
        parameters: Parameter grid for ``GridSearchCV``.
        X_train_res: Training features.
        y_train_res: Training targets.
        X_test: Evaluation features (also used as ``eval_set`` while fitting).
        y_test: Evaluation targets.
        useTrainCV: When true, tune ``n_estimators`` via ``cv``.

    Returns:
        None. Results are printed.
    """
    folds = StratifiedKFold(n_splits=10, shuffle=True, random_state=10)
    searcher = GridSearchCV(XGBRegressor(),
                            parameters,
                            scoring='roc_auc',
                            n_jobs=1,
                            cv=folds,
                            verbose=1)
    best_params = searcher.fit(X_train_res, y_train_res).best_params_
    print("Best params: %s" % (best_params))

    # Rebuild the model from the winning value of each tuned parameter.
    tuned_names = ('objective', 'learning_rate', 'max_depth', 'n_estimators',
                   'min_child_weight', 'gamma', 'colsample_bytree',
                   'subsample', 'reg_alpha')
    xg_reg = XGBRegressor(**{name: best_params[name] for name in tuned_names})

    if useTrainCV:
        # Let cross-validation with early stopping pick the round count.
        train_matrix = DMatrix(X_train_res, label=y_train_res)
        cv_history = cv(xg_reg.get_xgb_params(),
                        train_matrix,
                        num_boost_round=xg_reg.get_params()['n_estimators'],
                        folds=folds,
                        metrics='auc',
                        early_stopping_rounds=20)
        n_rounds = cv_history.shape[0]
        xg_reg.set_params(n_estimators=n_rounds)
        print("Best number of estimators: %i" % (n_rounds))

    xg_reg.fit(X_train_res,
               y_train_res,
               eval_metric="error",
               eval_set=[(X_test, y_test)],
               verbose=False)

    train_scores = xg_reg.predict(X_train_res)
    print("AUC train: %f" % (roc_auc_score(y_train_res, train_scores)))
    test_scores = xg_reg.predict(X_test)
    print("AUC test: %f" % (roc_auc_score(y_test, test_scores)))
Exemple #3
0
    def get_params(self, deep=True):
        '''
        Return the estimator parameters as seen by ``XGBRegressor``.

        A hack to make it work through the XGB code. They use the base class 0
        to retrieve the parameters. Since base class 0 is overwritten by a
        mixin, the class (and its bases) are temporarily swapped to
        ``XGBRegressor`` while the parameters are collected.

        The original class and bases are always restored, even if the
        delegated call raises, and the instance is restored to its actual
        class rather than a hard-coded one so subclasses keep working.
        '''
        orig_class = self.__class__
        # ``__bases__`` is an immutable tuple, so keeping a reference is
        # enough to restore it later.
        orig_bases = orig_class.__bases__
        try:
            orig_class.__bases__ = (XGBRegressor, )
            self.__class__ = XGBRegressor
            return XGBRegressor.get_params(self, deep=deep)
        finally:
            self.__class__ = orig_class
            orig_class.__bases__ = orig_bases
class XGBaseline(BaseEstimator, RegressorMixin):
    """XGBoost mean regressor with a constant predictive standard deviation.

    The standard deviation is the spread of the training residuals, so every
    prediction is reported with the same uncertainty.
    """

    def __init__(self, **kwargs):
        # All keyword arguments are forwarded to the underlying regressor.
        self.xgb_mean = XGBRegressor(**kwargs)

    def fit(self, X, y):
        """Fit the mean model and record the std of its training residuals."""
        self.xgb_mean.fit(X, y)
        errors = y - self.xgb_mean.predict(X)
        self.std = np.std(errors)
        return self

    def predict(self, X, y=None):
        """Return ``(pred_mean, pred_std)``; ``y`` is accepted but unused."""
        pred_mean = self.xgb_mean.predict(X)
        pred_std = self.std * np.ones(len(pred_mean))
        return pred_mean, pred_std

    def get_params(self, deep=True):
        """Return the parameters of the wrapped regressor.

        ``deep`` is forwarded instead of being silently ignored.
        """
        return self.xgb_mean.get_params(deep=deep)

    def set_params(self, **params):
        """Set parameters on the wrapped regressor and return ``self``."""
        self.xgb_mean.set_params(**params)
        return self
class XGBLogLikelihood(BaseEstimator, RegressorMixin):
    """Two-stage XGBoost model predicting a mean and a standard deviation.

    ``xgb_mean`` is fitted on the targets. ``xgb_log_var`` is fitted on the
    residuals of the mean model using the custom ``ll_objective`` and its
    output is interpreted as a log-variance.
    """

    def __init__(self, **kwargs):
        self.xgb_mean = XGBRegressor(**kwargs)
        # Same hyper-parameters for the variance model, except that its
        # objective is always the custom log-likelihood objective.
        kwargs["objective"] = ll_objective
        self.xgb_log_var = XGBRegressor(**kwargs)

    def fit(self, X, y):
        """Fit the mean model, then the log-variance model on its residuals."""
        self.xgb_mean.fit(X, y)
        errors = y - self.xgb_mean.predict(X)
        self.xgb_log_var.fit(X, errors)
        return self

    def predict(self, X, y=None):
        """Return ``(pred_mean, pred_std)``; ``y`` is accepted but unused."""
        pred_mean = self.xgb_mean.predict(X)
        # std = exp(log_var / 2)
        pred_std = np.exp(self.xgb_log_var.predict(X) / 2)
        return pred_mean, pred_std

    def get_params(self, deep=True):
        """Return the parameters of the mean model.

        ``deep`` is forwarded instead of being silently ignored.
        """
        return self.xgb_mean.get_params(deep=deep)

    def set_params(self, **params):
        """Set parameters on both models and return ``self``.

        An ``objective`` entry only applies to the mean model. The
        log-variance model keeps ``ll_objective``, as enforced in
        ``__init__``; previously a caller-supplied objective overwrote it.
        """
        self.xgb_mean.set_params(**params)
        log_var_params = dict(params)
        if "objective" in log_var_params:
            log_var_params["objective"] = ll_objective
        self.xgb_log_var.set_params(**log_var_params)
        return self
Exemple #6
0
    model = XGBRegressor(
        n_estimators=70,
        learning_rate=0.15,
        reg_alpha=10,
        max_depth=3,
        missing=np.nan,
        subsample=0.7,
        reg_lambda=100,
        n_jobs=-1,
        gamma=2,
        min_child_weight=1,
        #     nthread = -1,
        seed=555)
    model.fit(np.array(x), np.array(y))
    print('model fitted!')
    print(model.get_params())

    # x_test, y_test = make_val_set(test_rdd)
    # y_pred = model.predict(np.array(x_test))
    # rmse = np.sqrt(mean_squared_error(y_pred, y_test))
    # print('oob rmse is : ', rmse)

    y_pred = model.predict(np.array(x_test))
    to_save = list(
        map(lambda x: (x[0][0], x[0][1], x[1]), zip(test_rdd.collect(),
                                                    y_pred)))
    write_csv(to_save, output_path)

    # y_pred = model.predict(np.array(x_train))
    # rmse = np.sqrt(mean_squared_error(y_pred, y_train))
    # print('in sample rmse is : ', rmse)
Exemple #7
0
def _xgb_regression_train(table,
                          feature_cols,
                          label_col,
                          max_depth=3,
                          learning_rate=0.1,
                          n_estimators=100,
                          silent=True,
                          objectibe='reg:linear',
                          booster='gbtree',
                          n_jobs=1,
                          nthread=None,
                          gamma=0,
                          min_child_weight=1,
                          max_delta_step=0,
                          subsample=1,
                          colsample_bytree=1,
                          colsample_bylevel=1,
                          reg_alpha=0,
                          reg_lambda=1,
                          scale_pos_weight=1,
                          base_score=0.5,
                          random_state=0,
                          seed=None,
                          missing=None,
                          sample_weight=None,
                          eval_set=None,
                          eval_metric=None,
                          early_stopping_rounds=None,
                          verbose=True,
                          xgb_model=None,
                          sample_weight_eval_set=None):
    """Train an XGBoost regressor on ``table`` and build its model/report dict.

    Args:
        table: Data container indexed by column name (``table[feature_cols]``
            and ``table[label_col]`` are used as features and target).
        feature_cols: Names of the feature columns.
        label_col: Name of the target column.
        objectibe: The XGBoost objective. The misspelled name is kept for
            backward compatibility with existing callers.
        sample_weight, eval_set, eval_metric, early_stopping_rounds, verbose,
        xgb_model, sample_weight_eval_set: Forwarded to ``XGBRegressor.fit``.
        Remaining keyword arguments: Forwarded to the ``XGBRegressor``
            constructor under the same name.

    Returns:
        ``{'model': out_model}`` where ``out_model`` holds the fitted
        regressor, its parameters, feature importances, the importance plot
        and a markdown report under ``'_repr_brtc_'``.
    """
    validate(greater_than_or_equal_to(max_depth, 1, 'max_depth'),
             greater_than_or_equal_to(learning_rate, 0.0, 'learning_rate'),
             greater_than_or_equal_to(n_estimators, 1, 'n_estimators'))

    # Pass everything by keyword: positional binding depends on the exact
    # parameter order of the installed xgboost version and silently
    # mis-assigns values when that order changes.
    regressor = XGBRegressor(max_depth=max_depth,
                             learning_rate=learning_rate,
                             n_estimators=n_estimators,
                             silent=silent,
                             objective=objectibe,
                             booster=booster,
                             n_jobs=n_jobs,
                             nthread=nthread,
                             gamma=gamma,
                             min_child_weight=min_child_weight,
                             max_delta_step=max_delta_step,
                             subsample=subsample,
                             colsample_bytree=colsample_bytree,
                             colsample_bylevel=colsample_bylevel,
                             reg_alpha=reg_alpha,
                             reg_lambda=reg_lambda,
                             scale_pos_weight=scale_pos_weight,
                             base_score=base_score,
                             random_state=random_state,
                             seed=seed,
                             missing=missing)
    regressor.fit(table[feature_cols],
                  table[label_col],
                  sample_weight=sample_weight,
                  eval_set=eval_set,
                  eval_metric=eval_metric,
                  early_stopping_rounds=early_stopping_rounds,
                  verbose=verbose,
                  xgb_model=xgb_model,
                  sample_weight_eval_set=sample_weight_eval_set)

    # json
    get_param = regressor.get_params()
    feature_importance = regressor.feature_importances_
    plot_importance(regressor)
    plt.tight_layout()
    fig_plot_importance = plt2MD(plt)
    # Clear the figure so the next plot does not draw on top of this one.
    plt.clf()

    out_model = _model_dict('xgb_regression_model')
    out_model['feature_cols'] = feature_cols
    out_model['label_col'] = label_col
    out_model['parameters'] = get_param
    out_model['feature_importance'] = feature_importance
    out_model['regressor'] = regressor
    out_model['plot_importance'] = fig_plot_importance

    # report
    get_param_list = [['feature_cols', feature_cols], ['label_col', label_col]]
    get_param_list.extend([key, value] for key, value in get_param.items())
    get_param_df = pd.DataFrame(data=get_param_list,
                                columns=['parameter', 'value'])
    # One row, one column per feature.
    feature_importance_df = pd.DataFrame(data=feature_importance,
                                         index=feature_cols).T

    rb = BrtcReprBuilder()
    rb.addMD(
        strip_margin("""
    | ## XGB Regression Result
    |
    | ### Plot Importance
    | {image_importance}
    |
    | ### Feature Importance
    | {table_feature_importance}
    |
    | ### Parameters
    | {table_parameter}
    |
    """.format(image_importance=fig_plot_importance,
               table_feature_importance=pandasDF2MD(feature_importance_df, 20),
               table_parameter=pandasDF2MD(get_param_df))))
    out_model['_repr_brtc_'] = rb.get()

    return {'model': out_model}
Exemple #8
0
    pred_array[1, 3] = month_map[mon_map[(temp + 2) % 12]]  #set month
    pred_array[1, 2] = df.iloc[-1]['Year'] + 1  #set year
else:
    pred_array[1, 3] = df.iloc[-1]['Month'] + 2  #set month
    pred_array[1, 2] = df.iloc[-1]['Year']  #set year

# Third future row: month/year three months after the last row of ``df``.
# When the offset runs past December the month wraps around and the year
# is incremented.
# NOTE(review): ``temp`` is presumably the last observed month number
# (1-12); it is defined above this fragment - confirm.
if temp + 3 > 12:
    pred_array[2, 3] = month_map[mon_map[(temp + 3) % 12]]  #set month
    pred_array[2, 2] = df.iloc[-1]['Year'] + 1  #set year
else:
    pred_array[2, 3] = df.iloc[-1]['Month'] + 3  #set month
    pred_array[2, 2] = df.iloc[-1]['Year']  #set year

# Column 4 of each future row is the last 'date' value plus 1, 2 and 3.
# NOTE(review): nothing here bounds the value to a valid day of month -
# verify what 'date' encodes.
pred_array[0, 4] = df.iloc[-1]['date'] + 1  #set date
pred_array[1, 4] = df.iloc[-1]['date'] + 2
pred_array[2, 4] = df.iloc[-1]['date'] + 3

df1 = df[df['APMC'] == int(apmc)]  #to get the district name
dname = df1.iloc[0]['district_name']
# The same district value is written into column 5 of every future row.
pred_array[:, 5] = dname
# Single-row buffer reused as the model input for each prediction.
op = np.array([[0, 0, 0, 0, 0, 0]])

# Predict each of the three future rows, one at a time.
for i in range(0, 3):
    op[0] = pred_array[i]
    # y=y.reshape(-1,len(x))
    print('Input for prediction: ', np.array(op))
    result_array = clf.predict(np.array(op))
    print('Output for prediction for future ', i, ' month: ', result_array)

# Dump the hyper-parameters of the model used above.
print(clf.get_params())
# Hold out 20% of the data for evaluation (fixed seed for reproducibility).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)


# MinMaxScaler gives better results than StandardScaler
# The scaler is fitted on the training split only and then applied to the
# test split.
scaler = MinMaxScaler()
train_scaled = scaler.fit_transform(X_train)
test_scaled = scaler.transform(X_test)


# Baseline model with default hyper-parameters.
model = XGBRegressor()
model.fit(train_scaled, y_train)

# NOTE(review): for a regressor ``score`` is presumably R^2 rather than a
# classification accuracy, despite the label printed below - confirm.
print("Accuracy on train data: ", round(model.score(train_scaled, y_train)*100, 2), "%")
print("Accuracy on test data: ", round(model.score(test_scaled, y_test)*100, 2), "%")
print("Parameters: ", model.get_params())
print("MAE: ", mean_absolute_error(y_test, model.predict(test_scaled)))
# TODO: the grid can be improved
# The grid tries every integer n_estimators in [1100, 1500), i.e. 400
# candidates, each fitted for 3 CV folds.
gridParams = {
     "n_estimators": np.arange(1100, 1500)
    }

grid = GridSearchCV(model, gridParams,
                    verbose=1,
                    cv=3,
                    n_jobs=5)
grid.fit(train_scaled, y_train)

print("Best params:", grid.best_params_)
print("Best score:", grid.best_score_)
Exemple #10
0
def _xgb_regression_train(table, feature_cols, label_col, max_depth=3, learning_rate=0.1, n_estimators=100,
                          silent=True, objectibe='reg:linear', booster='gbtree', n_jobs=1, nthread=None, gamma=0, min_child_weight=1,
                          max_delta_step=0, subsample=1, colsample_bytree=1, colsample_bylevel=1, reg_alpha=0, reg_lambda=1,
                          scale_pos_weight=1, base_score=0.5, random_state=None, seed=None, missing=None,
                          sample_weight=None, eval_set=None, eval_metric=None, early_stopping_rounds=None, verbose=True,
                          xgb_model=None, sample_weight_eval_set=None, importance_type='gain'):
    """Train an XGBoost regressor on ``table`` and build its model/report dict.

    Args:
        table: Input data; features and their names are extracted with
            ``check_col_type(table, feature_cols)``.
        feature_cols: Names of the feature columns.
        label_col: Name of the target column.
        objectibe: The XGBoost objective. The misspelled name is kept for
            backward compatibility with existing callers.
        random_state: Seed for the regressor; a random one is drawn when
            ``None``.
        sample_weight, eval_set, eval_metric, early_stopping_rounds, verbose,
        xgb_model, sample_weight_eval_set: Forwarded to ``XGBRegressor.fit``.
        Remaining keyword arguments: Forwarded to the ``XGBRegressor``
            constructor under the same name.

    Returns:
        ``{'model': out_model}`` where ``out_model`` holds the fitted
        regressor, its parameters, feature importances (array and table),
        the importance plot and a markdown report under ``'_repr_brtc_'``.
    """
    if random_state is None:
        random_state = randint(-2**31, 2**31-1)

    regressor = XGBRegressor(max_depth=max_depth,
                             learning_rate=learning_rate,
                             n_estimators=n_estimators,
                             silent=silent,
                             objective=objectibe,
                             booster=booster,
                             n_jobs=n_jobs,
                             nthread=nthread,
                             gamma=gamma,
                             min_child_weight=min_child_weight,
                             max_delta_step=max_delta_step,
                             subsample=subsample,
                             colsample_bytree=colsample_bytree,
                             colsample_bylevel=colsample_bylevel,
                             reg_alpha=reg_alpha,
                             reg_lambda=reg_lambda,
                             scale_pos_weight=scale_pos_weight,
                             base_score=base_score,
                             random_state=random_state,
                             seed=seed,
                             missing=missing,
                             importance_type=importance_type)
    feature_names, features = check_col_type(table, feature_cols)
    label = table[label_col]
    # Keyword arguments, like the constructor call above: positional binding
    # silently mis-assigns values when the installed xgboost version has a
    # different ``fit`` parameter order.
    regressor.fit(features, label,
                  sample_weight=sample_weight,
                  eval_set=eval_set,
                  eval_metric=eval_metric,
                  early_stopping_rounds=early_stopping_rounds,
                  verbose=verbose,
                  xgb_model=xgb_model,
                  sample_weight_eval_set=sample_weight_eval_set)

    # json
    get_param = regressor.get_params()
    feature_importance = regressor.feature_importances_
    fig_plot_importance = _plot_feature_importances(feature_cols, regressor)

    out_model = _model_dict('xgb_regression_model')
    out_model['feature_cols'] = feature_cols
    out_model['label_col'] = label_col
    out_model['parameters'] = get_param
    out_model['feature_importance'] = feature_importance
    out_model['regressor'] = regressor
    out_model['plot_importance'] = fig_plot_importance

    # report
    get_param_list = [['feature_cols', feature_names], ['label_col', label_col]]
    get_param_list.extend([key, value] for key, value in get_param.items())
    get_param_df = pd.DataFrame(data=get_param_list, columns=['parameter', 'value'])
    # One row, one column per feature name.
    feature_importance_df = pd.DataFrame(data=feature_importance, index=feature_names).T

    rb = BrtcReprBuilder()
    rb.addMD(strip_margin("""
    | ## XGB Regression Result
    |
    | ### Plot Feature Importance
    | {image_importance}
    |
    | ### Normalized Feature Importance Table
    | {table_feature_importance}
    |
    | ### Parameters
    | {table_parameter}
    |
    """.format(image_importance=fig_plot_importance,
               table_feature_importance=pandasDF2MD(feature_importance_df, 20),
               table_parameter=pandasDF2MD(get_param_df)
               )))
    out_model['_repr_brtc_'] = rb.get()
    # NOTE(review): this table is keyed by ``feature_cols`` while the report
    # above uses ``feature_names``; presumably both have the same length -
    # verify against ``check_col_type``.
    feature_importance_table = pd.DataFrame(
        [[name, feature_importance[i]] for i, name in enumerate(feature_cols)],
        columns=['feature_name', 'importance'])
    out_model['feature_importance_table'] = feature_importance_table
    return {'model': out_model}
Exemple #11
0
class RaceStrategyModel(object):
    """Lap-time prediction models for regular, pit-stop and safety-car laps.

    Three ``XGBRegressor`` models are trained on separate subsets of the lap
    dataset. One race of the selected season is held out as the test race,
    and a mean/std pit-stop model is computed for its circuit.
    """

    def __init__(self, year: int, verbose=False, n_cores=1):
        """Create untrained models.

        Args:
            year: Season (2014-2019) from which the test race is drawn.
            verbose: Print progress messages.
            n_cores: Number of threads given to each XGBoost model.

        Raises:
            ValueError: If ``year`` is outside 2014-2019.
        """
        print("XGB using {} threads".format(n_cores))
        self.regular_model = XGBRegressor(n_jobs=n_cores)
        self.pit_model = XGBRegressor(n_jobs=n_cores)
        self.safety_model = XGBRegressor(n_jobs=n_cores)
        self.test_race = None
        self.scaler = None
        self.test_race_pit_model = None
        self.dummy_columns = None
        self.n_cores = n_cores

        # Seasons are stored as one-hot dataset columns "year_1".."year_6".
        year_columns = {
            2014: "year_1",
            2015: "year_2",
            2016: "year_3",
            2017: "year_4",
            2018: "year_5",
            2019: "year_6",
        }
        if year not in year_columns:
            raise ValueError("No race available for year " + str(year))

        self.year = year_columns[year]
        self.verbose = verbose

    def split_train_test(self, df: pd.DataFrame, split_fraction: float):
        """Split the dataset randomly but keeping whole races together.

        Test races are drawn from the races of ``self.year``. With
        ``split_fraction == 0`` exactly one race is held out.

        Races are sampled WITHOUT replacement, so the test set always
        contains the requested number of distinct races.

        Returns:
            ``(train_data, test_data)``; test rows are grouped per race in
            the order the races were drawn.
        """
        races = df[df[self.year] == 1]['raceId'].unique()

        if split_fraction != 0:
            split_size = int(round(split_fraction * len(races)))
        else:
            # Leave only one race out from the training
            split_size = 1

        test_races = np.random.choice(races, size=split_size, replace=False)

        # ``DataFrame.append`` was removed from pandas; collect the per-race
        # frames and concatenate them once instead.
        test_frames = [df.loc[df['raceId'] == race] for race in test_races]
        if test_frames:
            test_data = pd.concat(test_frames)
        else:
            test_data = pd.DataFrame(columns=df.columns)

        train_data = df[~df['raceId'].isin(test_races)]
        return train_data, test_data

    def normalize_dataset(self, df):
        """Scale the non-binary columns of ``df`` to ``[-1, 1]``.

        The one-hot / flag / label columns listed in ``zero_one`` are set
        aside and re-attached untouched.

        When no scaler exists yet, this call only FITS a new scaler and
        returns the data unscaled; later calls apply the fitted scaler.
        ``__process_dataset`` relies on this by calling the method twice.
        """
        data = df.copy()
        # Columns that must not be normalized
        zero_one = [
            'battle', 'drs', "circuitId_1", "circuitId_2", "circuitId_3",
            "circuitId_4", "circuitId_6", "circuitId_7", "circuitId_9",
            "circuitId_10", "circuitId_11", "circuitId_13", "circuitId_14",
            "circuitId_15", "circuitId_17", "circuitId_18", "circuitId_22",
            "circuitId_24", "circuitId_32", "circuitId_34", "circuitId_69",
            "circuitId_70", "circuitId_71", "circuitId_73", "tyre_1", "tyre_2",
            "tyre_3", "tyre_4", "tyre_5", "tyre_6", "year_1", "year_2",
            "year_3", "year_4", "year_5", "year_6", "nextLap", 'pit', 'safety',
            "unnorm_lap"
        ]

        temp_df = data[zero_one].copy()
        data.drop(zero_one, axis=1, inplace=True)

        if not self.scaler:
            # First call: fit only, leave the values as they are.
            self.scaler = MinMaxScaler(feature_range=(-1, 1))
            self.scaler.fit(data)
            scaled = data
        else:
            scaled = self.scaler.transform(data)
        data.loc[:, :] = scaled
        data = data.join(temp_df)

        return data

    def __process_dataset(self, dataset):
        """Pre-process the dataset to obtain training data and its labels.

        Side effects: sets ``self.test_race``, ``self.test_race_pit_model``,
        ``self.dummy_columns`` and (through ``normalize_dataset``)
        ``self.scaler``.

        Returns:
            ``(train_data, labels)``, two dicts keyed by ``'regular'``,
            ``'safety'`` and ``'pit'``.
        """

        # Discard wet and suspended races
        old_races = len(dataset['raceId'].unique())
        dataset = discard_wet(dataset)
        dataset = discard_suspended_races(dataset)
        new_races = len(dataset['raceId'].unique())
        if self.verbose:
            print(
                "{} wet and suspended races were discarded".format(old_races -
                                                                   new_races))

        # Eliminate the last lap from the training data, as it has 0 target.
        # Copy so the column assignments below act on an independent frame.
        dataset = dataset[dataset['nextLap'] > 0].copy()

        # Express the next lap target as a delta to the pole lap
        dataset['nextLap'] = (dataset['nextLap'] - dataset['pole'])

        # NOTE(review): these duplicated columns are dropped again right
        # below, so they never reach the models; kept because the integer
        # casts still run on the source columns.
        dataset['base'] = dataset['pole'].astype(int)
        dataset['true'] = dataset['milliseconds'].astype(int)
        dataset['true_cumulative'] = dataset['cumulative'].astype(int)

        # Remove the duplicated unnormalized columns from the train data
        dataset = dataset.drop(columns=['base', 'true', 'true_cumulative'])
        # With no scaler yet this call only fits it (see normalize_dataset),
        # so the test race and the pit-stop model use unscaled values.
        dataset = self.normalize_dataset(dataset)

        _, self.test_race = self.split_train_test(dataset, split_fraction=0)
        self.__compute_pitstop_model(dataset)

        self.dummy_columns = dataset.columns
        # Second call: the scaler exists now, so the data is transformed.
        train_data = self.normalize_dataset(dataset)

        # Remove columns used only to identify the laps in testing
        train_data = train_data.drop(
            columns=['unnorm_lap', "raceId", "driverId", "race_length"])

        # Split the dataset into three separate datasets, one per each model to be trained
        train_pit = deepcopy(train_data.loc[train_data['pit'] != 0])
        train_safety = deepcopy(train_data.loc[(train_data['safety'] != 0)
                                               & (train_data['pit'] == 0)])
        train_regular = deepcopy(train_data.loc[(train_data['pit'] == 0)
                                                & (train_data['safety'] == 0)])

        # Remove features related to pit and safety in the "regular" laps model
        train_regular = train_regular.drop(
            columns=['safety', 'pit', 'pit-cost', 'pitstop-milliseconds'])

        # Extract the target labels
        labels_pit = train_pit.pop('nextLap')
        labels_safety = train_safety.pop('nextLap')
        labels_regular = train_regular.pop('nextLap')

        train_data = {
            'regular': train_regular,
            'safety': train_safety,
            'pit': train_pit
        }
        labels = {
            'regular': labels_regular,
            'safety': labels_safety,
            'pit': labels_pit
        }

        return train_data, labels

    def __compute_pitstop_model(self, full_dataset: pd.DataFrame):
        """Compute mean/std of pit-stop durations on the test race's circuit.

        Stops under safety car and regular stops are modelled separately.
        An empty sample yields ``(0, 0)``. The result is stored in
        ``self.test_race_pit_model``.
        """

        circuit = get_current_circuit(self.test_race)

        stop_laps = full_dataset[(full_dataset['pitstop-milliseconds'] > 0) & (
            full_dataset[circuit] == 1)].sort_values('lap')

        pit_times = stop_laps[stop_laps['safety'] ==
                              0]['pitstop-milliseconds'].values
        pit_safety_times = stop_laps[
            stop_laps['safety'] > 0]['pitstop-milliseconds'].values

        safety_mean = np.mean(
            pit_safety_times) if len(pit_safety_times) > 0 else 0
        safety_std = np.std(
            pit_safety_times) if len(pit_safety_times) > 0 else 0

        mean = np.mean(pit_times) if len(pit_times) > 0 else 0
        std = np.std(pit_times) if len(pit_times) > 0 else 0

        self.test_race_pit_model = {
            'regular': (mean, std),
            'safety': (safety_mean, safety_std)
        }

    def train(self):
        """ Train the regression models """
        if self.verbose:
            print('Training models...')
        # Force a fresh scaler to be fitted on the reloaded dataset.
        self.scaler = None
        if self.verbose:
            print("Model uses {} cores".format(self.n_cores))

        dataset = load_dataset()
        datasets, labels = self.__process_dataset(dataset)

        self.regular_model.fit(datasets['regular'], labels['regular'])
        self.pit_model.fit(datasets['pit'], labels['pit'])
        self.safety_model.fit(datasets['safety'], labels['safety'])

        if self.verbose:
            print('Done!\n')

    def resplit(self):
        """Draw a new test race and rebuild the per-lap lookup table.

        Sets ``self._test_race``, ``self.race_id`` and ``self.laps_database``
        (keyed by ``(driverId, lap)``; missing keys map to ``None``).
        """
        # TODO fix the invalidation of scaler to avoid the normalization of test races
        self.scaler = None
        dataset = load_dataset()
        self.__process_dataset(dataset)
        self._test_race = fix_data_types(self.test_race)
        self.laps_database = defaultdict(lambda: None)
        self.race_id = self.test_race["raceId"].values[0]

        for i in range(self.test_race["lap"].count()):
            # Double brackets keep each lap as a one-row DataFrame.
            row = self.test_race.iloc[[i]]
            self.laps_database[(row["driverId"].values[0],
                                row["lap"].values[0])] = row

    def load(self):
        """ Restore prediction models from previously pickled files to avoid retraining

        Prints an error and exits the process with status -1 as soon as one
        of the expected files is missing.
        """

        if self.verbose:
            print("Loading prediction models from pickled files...")

        model_dir = "./envs/race_strategy_model/pickled_models/"
        for name, model in (("regular", self.regular_model),
                            ("safety", self.safety_model),
                            ("pit", self.pit_model)):
            model_path = model_dir + name + ".model"
            if not os.path.isfile(model_path):
                print("ERROR: {}.model is missing".format(name))
                exit(-1)
            model.load_model(model_path)

        scaler_path = model_dir + "scaler.pickle"
        if not os.path.isfile(scaler_path):
            print("ERROR: scaler.pickle is missing")
            exit(-1)
        # NOTE: unpickling executes arbitrary code; only load scaler files
        # produced by ``save`` from a trusted location.
        with open(scaler_path, 'rb') as scaler_file:
            self.scaler = pickle.load(scaler_file)

        if self.verbose:
            print("Done!\n")

        print(self.regular_model.get_params())

    def save(self):
        """ Pickle the model objects to avoid retraining """

        model_dir = './envs/race_strategy_model/pickled_models/'
        for model, name in zip(
            [self.regular_model, self.safety_model, self.pit_model],
            ['regular', 'safety', 'pit']):
            model.save_model(model_dir + '{}.model'.format(name))

        with open(model_dir + 'scaler.pickle', 'wb') as savefile:
            pickle.dump(self.scaler, savefile)

    def predict(self, state, lap_type):
        """Predict the next lap for ``state`` with the model for ``lap_type``.

        Args:
            state: DataFrame of lap features. It is not modified.
            lap_type: ``'regular'``, ``'pit'``, or anything else for the
                safety-car model.
        """
        if lap_type == 'regular':
            # The regular model is trained without the pit/safety features.
            # ``drop`` returns a new frame, so its result must be the one
            # that is passed on.
            features = state.drop(
                columns=['safety', 'pit', 'pit-cost', 'pitstop-milliseconds'])
            return self.regular_model.predict(features)
        elif lap_type == 'pit':
            return self.pit_model.predict(state)
        else:
            return self.safety_model.predict(state)

    def get_prediction_model(self, state: str):
        """Return the model for ``state`` ('regular', 'safety' or 'pit').

        Raises:
            ValueError: For any other ``state``.
        """
        if state == 'regular':
            return self.regular_model
        if state == 'safety':
            return self.safety_model
        if state == 'pit':
            return self.pit_model
        else:
            raise ValueError(
                "The specified state is not valid, allowed model states are 'regular', 'safety' and 'pit'"
            )
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor

from GradienBoosting import format_output
from read_data import x_train_split, x_val, y_train_split, y_val, x_test, x_train_aug, x_test_aug


# Linear-booster XGBoost regressor.
# NOTE(review): ``max_depth`` is presumably ignored by the "gblinear"
# booster - confirm against the xgboost documentation.
model = XGBRegressor(max_depth=5,
                      learning_rate=0.02,
                      objective='reg:linear',
                      n_estimators=300,
                     booster="gblinear")

# List the names of all tunable parameters.
print(model.get_params().keys())

# Report RMSE on the validation split while fitting.
eval_set = [(x_val, y_val)]
model.fit(x_train_split, y_train_split, eval_metric="rmse", eval_set=eval_set, verbose=True)

# NOTE(review): whether ``feature_importances_`` is available for the
# "gblinear" booster may depend on the installed xgboost version - verify.
print(model.feature_importances_)

# Mean squared error on the validation split.
y_pred = model.predict(x_val)
print(mean_squared_error(y_val,y_pred))

# Despite its name, ``y_test`` holds the model's predictions for the test
# inputs; they are formatted and written out as the submission file.
y_test = model.predict(x_test)
y_test = format_output(y_test)
y_test.to_csv("submission/result_xgboost.csv")


                  n_estimators=750,
                  max_depth=5,
                  min_child_weight=1,
                  gamma=0,
                  subsample=0.8,
                  colsample_bytree=0.8,
                  objective='reg:gamma',
                  nthread=4,
                  scale_pos_weight=1,
                  seed=27)

# Choose the number of boosting rounds with xgboost's built-in 10-fold
# cross-validation and early stopping on MAE. The current ``n_estimators``
# of ``gb`` is the upper bound on rounds.
xgb_param = gb.get_xgb_params()
xgtrain = xgb.DMatrix(df[features].values, label=df['SPEED_AVG'].values)
cvresult = xgb.cv(xgb_param,
                  xgtrain,
                  num_boost_round=gb.get_params()['n_estimators'],
                  nfold=10,
                  metrics='mae',
                  early_stopping_rounds=50)
# The CV result has one row per boosting round that was kept.
gb.set_params(n_estimators=cvresult.shape[0])

# NOTE(review): the CV above runs on ``df[features]`` while the final fit
# uses ``x_train`` / ``y_train`` - presumably subsets of the same data;
# confirm.
gb.fit(x_train, y_train, eval_metric='mae')


def mean_absolute_percentage_error(y_true, y_pred):
    """Return the mean absolute percentage error of ``y_pred``, in percent.

    Both arguments are converted to NumPy arrays, so lists and other
    array-likes are accepted. Each error is taken relative to the matching
    ``y_true`` entry; a zero in ``y_true`` therefore yields ``inf``/``nan``
    through NumPy's division rather than raising.
    """
    actual = np.array(y_true)
    predicted = np.array(y_pred)
    relative_errors = np.abs((actual - predicted) / actual)
    return np.mean(relative_errors) * 100


# In-sample predictions: `gb` is evaluated on the same x_train it was fitted on.
predictions = gb.predict(x_train)
Exemple #14
0
def _expand_search_range(name, candidates, best, floor):
    """Choose the grid for a follow-up search of one integer parameter.

    When ``best`` sits on the upper edge of ``candidates`` the grid moves up
    (``max, max + 2, max + 4``); when it sits on the lower edge the grid moves
    down in steps of two. Values below ``floor`` are dropped and the current
    best value is always kept, so the returned grid is never empty and never
    contains values the booster cannot accept.

    Returns:
        ``(grid, moved)`` where ``moved`` is False if ``best`` is an interior
        value, in which case ``candidates`` is returned unchanged.
    """
    highest, lowest = max(candidates), min(candidates)
    if best == highest:
        print('Best {} at max limit of initial range...'.format(name))
        return list(range(highest, highest + 6, 2)), True
    if best == lowest:
        print('Best {} at min limit of initial range...'.format(name))
        lower_values = [v for v in range(lowest - 6, lowest, 2) if v >= floor]
        return lower_values + [lowest], True
    return candidates, False


def _run_grid_search(estimator, param_grid, X_train, y_train, scoring, cv,
                     banner=None):
    """Fit one GridSearchCV, print its outcome and return ``best_params_``."""
    # NOTE(review): `iid` was removed from GridSearchCV in scikit-learn 0.24;
    # it is kept because the rest of this file passes it too — confirm the
    # targeted scikit-learn version.
    search = GridSearchCV(estimator=estimator,
                          param_grid=param_grid,
                          scoring=scoring,
                          n_jobs=4,
                          iid=False,
                          cv=cv)
    search.fit(X_train, y_train)
    if banner is not None:
        print(banner)
    print('Best params: {}'.format(search.best_params_))
    # The score is reported as an RMSE, i.e. sqrt of the negated scorer value.
    print('Best score: {}'.format(np.sqrt(-search.best_score_)))
    return search.best_params_


def runXGBRegressorTuning(X_train,
                          X_test,
                          y_train,
                          y_test,
                          scoring='neg_mean_squared_error',
                          cv=5,
                          initial_max_depth=[3, 5, 7, 9],
                          initial_min_child_weight=[1, 3, 5],
                          objective='reg:linear',
                          learning_rate=0.1,
                          n_estimators=140,
                          max_depth=5,
                          min_child_weight=1,
                          reg_alpha=0,
                          reg_lambda=0,
                          gamma=0,
                          subsample=0.8,
                          colsample_bytree=0.8):
    """Tune an ``XGBRegressor`` stage by stage and fit the final model.

    Each stage grid-searches a small group of parameters while the others
    stay fixed at the best values found so far:

    1. ``max_depth`` / ``min_child_weight`` over the initial ranges, followed
       by either a shifted search (if an optimum lies on the edge of its
       range) or a +/-1 refinement around the optimum.
    2. ``gamma``.
    3. ``subsample`` / ``colsample_bytree``.
    4. ``reg_alpha``, coarse and then around the coarse optimum.
    5. ``n_estimators`` through ``xgb.cv`` with early stopping.

    Args:
        X_train, y_train: Data used for every search and for the final fit.
        X_test, y_test: Only used as the second evaluation set of the final
            fit.
        scoring: Scorer handed to ``GridSearchCV``. The printed scores are
            ``sqrt(-best_score_)``, so it must be a negated squared-error
            scorer for them to be meaningful.
        cv: Passed both to ``GridSearchCV(cv=...)`` and to
            ``xgb.cv(nfold=...)``, so it should be an integer fold count.
        initial_max_depth, initial_min_child_weight: Integer candidates for
            the first search. The default lists are never mutated.
        objective, learning_rate, n_estimators, max_depth, min_child_weight,
        reg_alpha, reg_lambda, gamma, subsample, colsample_bytree: Starting
            values for the estimator parameters.

    Returns:
        Tuple ``(model, params, evals_result, warnings)``: the fitted
        ``XGBRegressor``, its ``get_params()`` dict, its ``evals_result()``
        and a dict mapping parameter names to messages for optima that ended
        on the edge of their search range.
    """
    xgb_param_dict = dict(learning_rate=learning_rate,
                          n_estimators=n_estimators,
                          max_depth=max_depth,
                          min_child_weight=min_child_weight,
                          reg_alpha=reg_alpha,
                          gamma=gamma,
                          subsample=subsample,
                          colsample_bytree=colsample_bytree,
                          objective=objective,
                          reg_lambda=reg_lambda,
                          nthread=4,
                          scale_pos_weight=1,
                          seed=27)
    tuning_warnings = {}

    def tune(param_grid, banner=None):
        """Search ``param_grid`` and fold the winners into the parameters."""
        best = _run_grid_search(XGBRegressor(**xgb_param_dict), param_grid,
                                X_train, y_train, scoring, cv, banner)
        xgb_param_dict.update(best)
        return best

    # Tune max depth and min child weight - strongest bearing on model tuning
    best = tune({
        'max_depth': initial_max_depth,
        'min_child_weight': initial_min_child_weight
    })

    # Decide on new search ranges if an optimum was found at the limit of its
    # initial range. max_depth must stay >= 1, min_child_weight >= 0.
    depth_grid, depth_moved = _expand_search_range(
        'max_depth', initial_max_depth, best['max_depth'], floor=1)
    weight_grid, weight_moved = _expand_search_range(
        'min_child_weight', initial_min_child_weight,
        best['min_child_weight'], floor=0)

    if depth_moved or weight_moved:
        # At least one optimum sat on an edge: search the shifted ranges.
        tune({'max_depth': depth_grid, 'min_child_weight': weight_grid})
    else:
        # Both optima are interior values: check either side of them.
        depth = xgb_param_dict['max_depth']
        weight = xgb_param_dict['min_child_weight']
        tune(
            {
                'max_depth': [depth - 1, depth, depth + 1],
                'min_child_weight': [weight - 1, weight, weight + 1]
            },
            banner='Fine-tuned max_depth and min_child_weight parameters...\n')

    # Tune gamma
    gamma_grid = {'gamma': [i / 10.0 for i in range(0, 5)]}
    tune(gamma_grid, banner='Fine-tuned gamma parameters...\n')
    if xgb_param_dict['gamma'] == max(gamma_grid['gamma']):
        tuning_warnings['gamma'] = (
            'gamma: Optimal parameter {} at max of search range'.format(
                xgb_param_dict['gamma']))

    # Tune subsample and colsample_bytree
    sampling_grid = {
        'subsample': [i / 10.0 for i in range(6, 10)],
        'colsample_bytree': [i / 10.0 for i in range(6, 10)]
    }
    tune(sampling_grid,
         banner='Tuned subsample and colsample_bytree parameters...\n')
    for name, candidates in sampling_grid.items():
        chosen = xgb_param_dict[name]
        if chosen == max(candidates):
            edge = 'max'
        elif chosen == min(candidates):
            edge = 'min'
        else:
            continue
        tuning_warnings[name] = (
            '{}: Optimal parameter {} at {} of search range'.format(
                name, chosen, edge))

    # Tune regularisation parameters: coarse search first ...
    tune({'reg_alpha': [1e-5, 1e-2, 0.1, 1, 100]},
         banner='Tuned regularisation parameters...\n')

    # ... then a finer search around the coarse optimum.
    coarse_alpha = float(xgb_param_dict['reg_alpha'])
    tune(
        {
            'reg_alpha': [
                coarse_alpha / 10, coarse_alpha / 2, coarse_alpha,
                coarse_alpha * 2, coarse_alpha * 5
            ]
        },
        banner='Tuned regularisation parameters...\n')

    # Pick the number of boosting rounds with xgboost's native CV and early
    # stopping. xgb.cv needs a DMatrix that carries the labels, and only the
    # booster parameters (not the sklearn wrapper's full parameter set).
    xgb_model = XGBRegressor(**xgb_param_dict)
    cvresult = xgb.cv(xgb_model.get_xgb_params(),
                      xgb.DMatrix(X_train, label=y_train),
                      num_boost_round=xgb_param_dict['n_estimators'],
                      nfold=cv,
                      metrics='rmse',
                      early_stopping_rounds=50,
                      verbose_eval=False)

    # Set the model to the optimal number of estimators wrt early stopping round limit
    xgb_param_dict['n_estimators'] = cvresult.shape[0]

    # Learn final XGBoost model
    xgb_model = XGBRegressor(**xgb_param_dict)
    xgb_model.fit(X_train,
                  y_train,
                  eval_set=[(X_train, y_train), (X_test, y_test)],
                  eval_metric='rmse',
                  verbose=True)

    return (xgb_model, xgb_model.get_params(), xgb_model.evals_result(),
            tuning_warnings)
Exemple #15
0
# Optional diagnostics, only produced when the PLOTS flag is set.
if PLOTS == True:
    # plot feature importance
    importance = model.feature_importances_
    plot_feature_importance(importance, cols_to_use)

    # plot loss curves
    # The per-round RMSE history is read as loss['validation_N']['rmse'].
    # Presumably validation_0 is the training set and validation_1 the
    # held-out set, matching the legend labels below — TODO confirm against
    # the eval_set order used when the model was fitted.
    loss = model.evals_result()
    epochs = len(loss['validation_0']['rmse'])
    x_axis = range(0, epochs)
    plt.plot(x_axis, loss['validation_0']['rmse'], label='Train')
    plt.plot(x_axis, loss['validation_1']['rmse'], label='Test')
    plt.legend()
    plt.ylabel('RMSE')
    plt.show()

###################
# predictions and export
###################

score = model.best_score
features = cols_to_use
params = model.get_params()
# Predictions are clipped to the closed interval [0, 20].
pred_val = model.predict(X_val).clip(0, 20)
pred_test = model.predict(X_test).clip(0, 20)
# IDs of the rows with date_block_num == 34 (presumably the test period —
# verify); they are paired with pred_test to build the submission.
ids = np.array(df.loc[df['date_block_num'] == 34, 'ID'])
submission = make_submission(ids, pred_test)

# Artifacts are only exported when the DEBUG flag is off.
if DEBUG == False:
    export_model(OUT_FOLDER, score, features, params, pred_val, pred_test,
                 submission)
    'gamma': 0,
    'importance_type': 'gain',
    'learning_rate': 0.1,
    'max_delta_step': 0,
    'max_depth': 6,
    'min_child_weight': 1,
    'n_estimators': 1450,
    'n_jobs': 1,
    'nthread': None,
    'objective': 'reg:squarederror',
    'random_state': 0,
    'reg_alpha': 0,
    'reg_lambda': 1,
    'scale_pos_weight': 1,
    'seed': None,
    'silent': None,
    'subsample': 1,
    'verbosity': 1
}
# Fit an XGBRegressor with the parameter dict defined above, report timing
# and the test-set score, then persist the fitted model.
# NOTE(review): the variable is named `rfr` and the log label says
# 'RFR_params', but the estimator is an XGBRegressor.
rfr = XGBRegressor(**params)
rfr.fit(X_train, y_train)
# `start_time` is set outside the visible part of this script.
print('fitted', '--- %s seconds ---' % (time.time() - start_time))
y_pred = rfr.predict(X_test)
print('R^2=', rfr.score(X_test, y_test))
print('RFR_params:', rfr.get_params())
print('Finished', time.ctime())
# save model
# The file name encodes n_estimators and max_depth.
# NOTE(review): joblib.dump does not write JSON, so the '.json' suffix is
# presumably misleading — confirm before reading it with a JSON loader.
joblib.dump(rfr, datadir + 'JLmodel_' + \
            str(rfr.get_params()['n_estimators']) + '_' + \
            str(rfr.get_params()['max_depth']) + '.json')
Exemple #17
0
# Step 1: find the number of boosting rounds for a fixed learning rate of 0.1
# using xgboost's native cross-validation with early stopping.
alg = XGBRegressor(
    learning_rate=0.1,
    n_estimators=1000,
    max_depth=5,
    min_child_weight=1,
    gamma=0,
    subsample=0.8,
    colsample_bytree=0.8,
    objective='reg:squarederror',
    seed=27)

xgb_param = alg.get_xgb_params()

cvresult = xgb.cv(xgb_param,
                  xgtrain,
                  num_boost_round=alg.get_params()['n_estimators'],
                  nfold=5,
                  metrics='rmse',
                  early_stopping_rounds=50)

# The row count of the CV result is used as the number of estimators.
n_estimators = cvresult.shape[0]

# Step 2: grid-search max_depth and min_child_weight with that round count.
param_test1 = {
    'max_depth': range(3, 10, 2),
    'min_child_weight': range(1, 6, 2)
}

# GridSearchCV always maximises its score. mean_squared_error is a loss, so
# the scorer must be built with greater_is_better=False; otherwise the search
# would pick the parameters with the HIGHEST error. As a consequence
# best_score_ is the negated MSE.
gsearch1 = GridSearchCV(
    estimator=XGBRegressor(learning_rate=0.1,
                           n_estimators=n_estimators,
                           max_depth=5,
                           min_child_weight=1,
                           gamma=0,
                           subsample=0.8,
                           colsample_bytree=0.8,
                           objective='reg:squarederror',
                           nthread=4,
                           seed=27),
    param_grid=param_test1,
    scoring=make_scorer(mean_squared_error, greater_is_better=False),
    n_jobs=4,
    iid=False,
    cv=5)
gsearch1.fit(train_df, target)
# Bare expression: only displays a value in an interactive/notebook session.
gsearch1.best_params_, gsearch1.best_score_