def run_xgb(args, steps_out):
    # Parameter list:
    param_list = ['speed', 'cos_wind_dir', 'sin_wind_dir']
    predict = pd.DataFrame(columns=['speed', 'cos_wind_dir', 'sin_wind_dir'])
    true = pd.DataFrame(columns=['speed', 'cos_wind_dir', 'sin_wind_dir'])
    baseline = pd.DataFrame(columns=['speed', 'cos_wind_dir', 'sin_wind_dir'])
    for param in param_list:
        # one XGBoost regressor per target parameter
        x_df, y_df, x, y = proc.prepare_x_y(measurement, forecast,
                                            args.steps_in, steps_out, param)
        X_train, X_test, y_train, y_test = train_test_split(x, y,
                                                            test_size=0.2,
                                                            shuffle=False)
        xg = XGBRegressor(max_depth=args.max_depth,
                          n_estimators=args.n_estimators,
                          colsample_bytree=args.colsample_bytree,
                          min_child_weight=args.min_child_weight,
                          subsample=args.subsample,
                          learning_rate=args.lr)
        xg.fit(X_train, y_train)
        y_hat = xg.predict(X_test)
        predict[param] = pd.Series(y_hat)
        true[param] = pd.Series(np.array(y_test).reshape(-1))
        baseline[param] = x_df[param + '_forecast'][-len(y_hat):]
    # reset index
    baseline.reset_index(inplace=True)
    return predict, true, baseline
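# Illustrative usage sketch (not part of the original module): run_xgb expects an
# argparse-style namespace carrying the attributes it reads (steps_in, max_depth,
# n_estimators, colsample_bytree, min_child_weight, subsample, lr). The flag names and
# default values below are placeholders, not the project's tuned settings.
#
# import argparse
#
# parser = argparse.ArgumentParser()
# parser.add_argument('--steps_in', type=int, default=24)
# parser.add_argument('--max_depth', type=int, default=5)
# parser.add_argument('--n_estimators', type=int, default=100)
# parser.add_argument('--colsample_bytree', type=float, default=1.0)
# parser.add_argument('--min_child_weight', type=int, default=1)
# parser.add_argument('--subsample', type=float, default=1.0)
# parser.add_argument('--lr', type=float, default=0.1)
# args = parser.parse_args()
#
# predict, true, baseline = run_xgb(args, steps_out=6)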
def run_regression(steps_in, steps_out):
    # Parameter list:
    param_list = ['speed', 'cos_wind_dir', 'sin_wind_dir']
    predict = pd.DataFrame(columns=['speed', 'cos_wind_dir', 'sin_wind_dir'])
    true = pd.DataFrame(columns=['speed', 'cos_wind_dir', 'sin_wind_dir'])
    baseline = pd.DataFrame(columns=['speed', 'cos_wind_dir', 'sin_wind_dir'])
    for param in param_list:
        # XGBoost regressor with fixed max_depth (default settings otherwise)
        x_df, y_df, x, y = proc.prepare_x_y(measurement, forecast,
                                            steps_in, steps_out, param)
        X_train, X_test, y_train, y_test = train_test_split(x, y,
                                                            test_size=0.2,
                                                            shuffle=False)
        xg = XGBRegressor(max_depth=5)
        xg.fit(X_train, y_train)
        y_hat = xg.predict(X_test)
        predict[param] = pd.Series(y_hat)
        true[param] = pd.Series(np.array(y_test).reshape(-1))
        baseline[param] = x_df[param + '_forecast'][-len(y_hat):]
    # reset index
    baseline.reset_index(inplace=True)
    return predict, true, baseline
def train_xgb(measurement, forecast, steps_in, steps_out):
    # flag message
    print('running xgb for steps_out=', steps_out)
    # Parameter list:
    param_list = ['scenario', 'dangerous']
    # ['speed','cos_wind_dir','sin_wind_dir','scenario','dangerous']
    for param in param_list:
        print(param)
        # train on the entire data
        x_df, y_df, x, y = proc.prepare_x_y(measurement, forecast,
                                            steps_in, steps_out, param)
        # gridsearch
        if param in ['speed', 'cos_wind_dir', 'sin_wind_dir']:
            xgb_model = XGBRegressor()
            splitter = KFold(n_splits=4, shuffle=True)
            score = 'neg_mean_absolute_error'
        if param in ['scenario', 'dangerous']:
            xgb_model = XGBClassifier()
            splitter = StratifiedKFold(n_splits=4, shuffle=True)
            score = 'accuracy'
        if param == 'dangerous':
            sm = SMOTE(sampling_strategy=0.6, random_state=0)
            x, y = sm.fit_resample(x, y)
            score = 'roc_auc'
        grid = GridSearchCV(xgb_model,
                            param_grid=grid_params,
                            scoring=score,
                            cv=splitter.split(x, y))
        grid.fit(x, y)
        best_model = grid.best_estimator_
        # print grid parameters
        print('gridsearch result for param: ', param)
        print(grid.best_params_)
        # save model into a pickle file
        pickle.dump(best_model,
                    open('trained_models/' + str(param) + '_t_' + str(steps_out), 'wb'))
    return
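# Note: the `grid_params` dict passed to GridSearchCV above is assumed to be defined at
# module level elsewhere in this file. A minimal sketch of a grid that is valid for both
# XGBRegressor and XGBClassifier might look like the following (values are illustrative,
# not the project's tuned grid):
#
# grid_params = {
#     'max_depth': [3, 5, 7],
#     'n_estimators': [100, 300],
#     'learning_rate': [0.05, 0.1],
# }
#
# A model pickled by train_xgb can later be restored for inference, e.g. (steps_out=6 is
# a placeholder):
#
# with open('trained_models/dangerous_t_6', 'rb') as f:
#     model = pickle.load(f)
# y_hat = model.predict(x)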
def run_rf(steps_in, steps_out):
    # flag message
    print('running random forest for steps_out=', steps_out)
    # Parameter list:
    param_list = ['scenario', 'dangerous', 'speed', 'cos_wind_dir',
                  'sin_wind_dir']  # ['scenario','dangerous']
    predict_test = pd.DataFrame(columns=['speed', 'cos_wind_dir', 'sin_wind_dir',
                                         'scenario', 'dangerous', 'dangerous_proba'])
    predict_train = pd.DataFrame(columns=['speed', 'cos_wind_dir', 'sin_wind_dir',
                                          'scenario', 'dangerous', 'dangerous_proba'])
    for param in param_list:
        x_df, y_df, x, y = proc.prepare_x_y(measurement, forecast,
                                            steps_in, steps_out, param)
        x_train, x_test, y_train, y_test = train_test_split(x, y,
                                                            test_size=0.2,
                                                            shuffle=False)
        # gridsearch
        if param in ['speed', 'cos_wind_dir', 'sin_wind_dir']:
            rf_model = RandomForestRegressor()
            splitter = KFold(n_splits=4, shuffle=True)
            score = 'neg_mean_absolute_error'  # MAE
        if param in ['scenario', 'dangerous']:
            rf_model = RandomForestClassifier()
            splitter = StratifiedKFold(n_splits=4, shuffle=True)
            score = 'accuracy'
        # SMOTE for binary classification
        if param == 'dangerous':
            sm = SMOTE(sampling_strategy=0.6, random_state=0)
            x_train, y_train = sm.fit_resample(x_train, y_train)
            score = 'roc_auc'
        grid = GridSearchCV(rf_model,
                            param_grid=grid_params,
                            scoring=score,
                            cv=splitter.split(x_train, y_train))
        grid.fit(x_train, y_train)
        print('gridsearch result for param: ', param)
        print(grid.best_params_)
        # save best parameters:
        pickle.dump(grid.best_params_,
                    open('results/params/rf_' + param + '_' + str(steps_out) + '.pkl', 'wb'))
        best_model = grid.best_estimator_
        # save model into a pickle file
        pickle.dump(best_model,
                    open('results/trained_models/rf_' + str(param) + '_' +
                         str(steps_out) + '.pkl', 'wb'))
        # record results
        predict_test[param] = pd.Series(best_model.predict(x_test))
        predict_train[param] = pd.Series(best_model.predict(x_train))
        if param == 'dangerous':
            predict_test['dangerous_proba'] = pd.Series(
                best_model.predict_proba(x_test)[:, 1])
            predict_train['dangerous_proba'] = pd.Series(
                best_model.predict_proba(x_train)[:, 1])
            # record baseline and truth
            predict_test['true'] = pd.Series(np.array(y_test).reshape(-1))
            predict_train['true'] = pd.Series(np.array(y_train).reshape(-1))
            predict_test['baseline'] = x_df['dangerous_forecast'][-len(y_test):]
            predict_train['baseline'] = x_df['dangerous_forecast'][:len(y_train)]
    return predict_train, predict_test
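# Illustrative evaluation sketch (not part of the original module): assuming run_rf
# populates the 'true' and 'dangerous_proba' columns as above, its test-set frame can be
# scored with ROC AUC for the binary 'dangerous' target. The steps_in/steps_out values
# are placeholders.
#
# from sklearn.metrics import roc_auc_score
#
# predict_train, predict_test = run_rf(steps_in=24, steps_out=6)
# print('dangerous AUC:', roc_auc_score(predict_test['true'],
#                                       predict_test['dangerous_proba']))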