Example 1
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor

# `proc`, `measurement` and `forecast` are assumed to be module-level objects
# provided elsewhere in the project.


def run_xgb(args, steps_out):
    # Parameters to predict:
    param_list = ['speed', 'cos_wind_dir', 'sin_wind_dir']

    predict = pd.DataFrame(columns=['speed', 'cos_wind_dir', 'sin_wind_dir'])
    true = pd.DataFrame(columns=['speed', 'cos_wind_dir', 'sin_wind_dir'])
    baseline = pd.DataFrame(columns=['speed', 'cos_wind_dir', 'sin_wind_dir'])

    for param in param_list:
        x_df, y_df, x, y = proc.prepare_x_y(measurement, forecast,
                                            args.steps_in, steps_out, param)
        X_train, X_test, y_train, y_test = train_test_split(x,
                                                            y,
                                                            test_size=0.2,
                                                            shuffle=False)
        xg = XGBRegressor(max_depth=args.max_depth,
                          n_estimators=args.n_estimators,
                          colsample_bytree=args.colsample_bytree,
                          min_child_weight=args.min_child_weight,
                          subsample=args.subsample,
                          learning_rate=args.lr)
        xg.fit(X_train, y_train)
        y_hat = xg.predict(X_test)

        predict[param] = pd.Series(y_hat)
        true[param] = pd.Series(np.array(y_test).reshape(-1))
        baseline[param] = x_df[param + '_forecast'][-len(y_hat):]

    # reset the index so baseline lines up with predict and true
    # (the original index is kept as an 'index' column)
    baseline.reset_index(inplace=True)
    return predict, true, baseline
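
A minimal sketch of how run_xgb might be invoked. The CLI wrapper below is hypothetical: the flag names mirror the attributes read from args above, but the default values are illustrative assumptions, not settings from the original project.

import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--steps_in', type=int, default=24)
parser.add_argument('--max_depth', type=int, default=5)
parser.add_argument('--n_estimators', type=int, default=100)
parser.add_argument('--colsample_bytree', type=float, default=0.8)
parser.add_argument('--min_child_weight', type=int, default=1)
parser.add_argument('--subsample', type=float, default=0.8)
parser.add_argument('--lr', type=float, default=0.1)
args = parser.parse_args()

# predict e.g. 6 steps ahead (value illustrative)
predict, true, baseline = run_xgb(args, steps_out=6)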
Example 2
import pandas as pd
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor

# `proc`, `measurement` and `forecast` are assumed to be module-level objects.


def run_regression(steps_in, steps_out):
    # Parameters to predict:
    param_list = ['speed', 'cos_wind_dir', 'sin_wind_dir']

    predict = pd.DataFrame(columns=['speed', 'cos_wind_dir', 'sin_wind_dir'])
    true = pd.DataFrame(columns=['speed', 'cos_wind_dir', 'sin_wind_dir'])
    baseline = pd.DataFrame(columns=['speed', 'cos_wind_dir', 'sin_wind_dir'])

    for param in param_list:
        x_df, y_df, x, y = proc.prepare_x_y(measurement, forecast, steps_in,
                                            steps_out, param)
        X_train, X_test, y_train, y_test = train_test_split(x,
                                                            y,
                                                            test_size=0.2,
                                                            shuffle=False)
        xg = XGBRegressor(max_depth=5)
        xg.fit(X_train, y_train)
        y_hat = xg.predict(X_test)

        predict[param] = pd.Series(y_hat)
        true[param] = pd.Series(y_test.flatten())
        baseline[param] = x_df[param + '_forecast'][-len(y_hat):]

    # reset the index so baseline lines up with predict and true
    # (the original index is kept as an 'index' column)
    baseline.reset_index(inplace=True)
    return predict, true, baseline
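
Every example calls proc.prepare_x_y, which is not included in these snippets. The stub below is a guessed reconstruction, assuming measurement and forecast are DataFrames sharing a time index, and that the helper builds steps_in lagged measurement columns plus the matching forecast column as features, with the measured value steps_out steps ahead as the target.

import pandas as pd


def prepare_x_y(measurement, forecast, steps_in, steps_out, param):
    # Hypothetical reconstruction: lagged measurements plus the numerical
    # forecast as features; the value `steps_out` steps ahead as target.
    df = pd.DataFrame({param: measurement[param],
                       param + '_forecast': forecast[param]})
    for lag in range(1, steps_in + 1):
        df[param + '_lag' + str(lag)] = df[param].shift(lag)
    df['target'] = df[param].shift(-steps_out)
    df = df.dropna()
    x_df = df.drop(columns=['target'])
    y_df = df[['target']]
    return x_df, y_df, x_df.to_numpy(), y_df.to_numpy()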
Example 3
import pickle

from imblearn.over_sampling import SMOTE
from sklearn.model_selection import GridSearchCV, KFold, StratifiedKFold
from xgboost import XGBClassifier, XGBRegressor

# `proc` and `grid_params` are assumed to be defined at module level.


def train_xgb(measurement, forecast, steps_in, steps_out):
    # progress message
    print('running xgb for steps_out=', steps_out)
    # Parameters to train; the full list would be
    # ['speed', 'cos_wind_dir', 'sin_wind_dir', 'scenario', 'dangerous']
    param_list = ['scenario', 'dangerous']

    for param in param_list:
        print(param)
        # train on the entire data set (no train/test split here)
        x_df, y_df, x, y = proc.prepare_x_y(measurement, forecast, steps_in,
                                            steps_out, param)

        # grid search: regressor for continuous targets,
        # classifier for categorical ones
        if param in ['speed', 'cos_wind_dir', 'sin_wind_dir']:
            xgb_model = XGBRegressor()
            splitter = KFold(n_splits=4, shuffle=True)
            score = 'neg_mean_absolute_error'
        elif param in ['scenario', 'dangerous']:
            xgb_model = XGBClassifier()
            splitter = StratifiedKFold(n_splits=4, shuffle=True)
            score = 'accuracy'

            if param == 'dangerous':
                # oversample the minority class for the binary task
                # (note: resampling before CV leaks synthetic samples
                # into the validation folds)
                sm = SMOTE(sampling_strategy=0.6, random_state=0)
                x, y = sm.fit_resample(x, y)
                score = 'roc_auc'

        grid = GridSearchCV(xgb_model,
                            param_grid=grid_params,
                            scoring=score,
                            cv=splitter.split(x, y))
        grid.fit(x, y)
        best_model = grid.best_estimator_

        # print grid-search results
        print('gridsearch result for param:', param)
        print(grid.best_params_)

        # save the best model to a pickle file
        with open('trained_models/' + str(param) + '_t_' + str(steps_out),
                  'wb') as f:
            pickle.dump(best_model, f)
    return
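
Both grid-search examples read a grid_params dict that never appears in these snippets; it is presumably defined at module level. A plausible grid for the XGBoost case might look like the following (values are illustrative, not the project's; the random-forest example would need keys valid for RandomForest*, e.g. max_depth and n_estimators only).

grid_params = {
    'max_depth': [3, 5, 7],
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.05, 0.1],
    'min_child_weight': [1, 5],
    'subsample': [0.8, 1.0],
}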
Example 4
import pickle

import numpy as np
import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.model_selection import (GridSearchCV, KFold, StratifiedKFold,
                                     train_test_split)

# `proc`, `measurement`, `forecast` and `grid_params` are assumed to be
# module-level objects.


def run_rf(steps_in, steps_out):
    # progress message
    print('running random forest for steps_out=', steps_out)
    # Parameters to predict:
    param_list = [
        'scenario', 'dangerous', 'speed', 'cos_wind_dir', 'sin_wind_dir'
    ]

    result_columns = [
        'speed', 'cos_wind_dir', 'sin_wind_dir', 'scenario', 'dangerous',
        'dangerous_proba'
    ]
    predict_test = pd.DataFrame(columns=result_columns)
    predict_train = pd.DataFrame(columns=result_columns)

    for param in param_list:
        x_df, y_df, x, y = proc.prepare_x_y(measurement, forecast, steps_in,
                                            steps_out, param)
        x_train, x_test, y_train, y_test = train_test_split(x,
                                                            y,
                                                            test_size=0.2,
                                                            shuffle=False)

        # grid search: regressor for continuous targets,
        # classifier for categorical ones
        if param in ['speed', 'cos_wind_dir', 'sin_wind_dir']:
            rf_model = RandomForestRegressor()
            splitter = KFold(n_splits=4, shuffle=True)
            score = 'neg_mean_absolute_error'  # MAE
        elif param in ['scenario', 'dangerous']:
            rf_model = RandomForestClassifier()
            splitter = StratifiedKFold(n_splits=4, shuffle=True)
            score = 'accuracy'
            # SMOTE oversampling for the binary 'dangerous' task
            # (note: resampling the training set before CV leaks
            # synthetic samples into the validation folds)
            if param == 'dangerous':
                sm = SMOTE(sampling_strategy=0.6, random_state=0)
                x_train, y_train = sm.fit_resample(x_train, y_train)
                score = 'roc_auc'

        grid = GridSearchCV(rf_model,
                            param_grid=grid_params,
                            scoring=score,
                            cv=splitter.split(x_train, y_train))
        grid.fit(x_train, y_train)

        print('gridsearch result for param:', param)
        print(grid.best_params_)

        # save the best parameters
        with open('results/params/rf_' + param + '_' + str(steps_out) + '.pkl',
                  'wb') as f:
            pickle.dump(grid.best_params_, f)
        best_model = grid.best_estimator_

        # save the best model to a pickle file
        with open('results/trained_models/rf_' + str(param) + '_' +
                  str(steps_out) + '.pkl', 'wb') as f:
            pickle.dump(best_model, f)

        # record predictions on the train and held-out test sets
        predict_test[param] = pd.Series(best_model.predict(x_test))
        predict_train[param] = pd.Series(best_model.predict(x_train))

        if param == 'dangerous':
            predict_test['dangerous_proba'] = pd.Series(
                best_model.predict_proba(x_test)[:, 1])
            predict_train['dangerous_proba'] = pd.Series(
                best_model.predict_proba(x_train)[:, 1])

            # record the ground truth and the numerical-forecast baseline
            predict_test['true'] = pd.Series(np.array(y_test).reshape(-1))
            predict_train['true'] = pd.Series(np.array(y_train).reshape(-1))
            predict_test['baseline'] = x_df['dangerous_forecast'][-len(y_test):]
            predict_train['baseline'] = x_df['dangerous_forecast'][:len(y_train)]

    return predict_train, predict_test
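
A sketch of how a model pickled by run_rf might be loaded back for inference. The path layout mirrors the code above; the helper name and the steps_out value are illustrative assumptions.

import pickle


def load_rf(param, steps_out):
    # Hypothetical loader for a model saved by run_rf above.
    path = ('results/trained_models/rf_' + str(param) + '_' +
            str(steps_out) + '.pkl')
    with open(path, 'rb') as f:
        return pickle.load(f)


# e.g. probability of the 'dangerous' class for a new feature matrix x_new:
# model = load_rf('dangerous', steps_out=6)
# proba = model.predict_proba(x_new)[:, 1]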