def ensemble(root_path,
             original_series,
             station,
             predictor,
             predict_pattern,
             variables,
             decomposer=None,
             wavelet_level='db10-2'):
    lags_dict = variables['lags_dict']
    full_len = variables['full_len']
    train_len = variables['train_len']
    dev_len = variables['dev_len']
    test_len = variables['test_len']
    logger.info('Ensemble forecasting results...')
    logger.info('Root path:{}'.format(root_path))
    logger.info('original series:\n{}'.format(original_series))
    logger.info('Station:{}'.format(station))
    logger.info('Decomposer:{}'.format(decomposer))   
    logger.info('Lags dict:{}'.format(lags_dict))
    logger.info('Predictor:{}'.format(predictor))
    logger.info('Predict pattern:{}'.format(predict_pattern))
    logger.info('Training length:{}'.format(train_len))
    logger.info('Development length:{}'.format(dev_len))
    logger.info('Testing length:{}'.format(test_len))
    logger.info('Entire length:{}'.format(full_len))
    logger.info('Wavelet and decomposition level of WA:{}'.format(wavelet_level))
    
    original = original_series
    if decomposer=='dwt' or decomposer=='modwt':
        models_path = root_path+'/'+station+'_'+decomposer+'/projects/'+predictor+'/'+wavelet_level+'/'+predict_pattern+'/'
    elif decomposer is None:
        models_path = root_path+'/'+station+'/projects/'+predictor+'/'+predict_pattern+'/'
    else:
        models_path = root_path+'/'+station+'_'+decomposer+'/projects/'+predictor+'/'+predict_pattern+'/'
    logger.info("Model path:{}".format(models_path))

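    # Single-step patterns: scan every result CSV in the history folder and
    # keep the model with the smallest development-set MSE.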
    if 'multi_step' not in predict_pattern:
        models_history = models_path+'history/'
        optimal_model = ''
        min_dev_mse = np.inf
        for file_ in os.listdir(models_history):
            if '.csv' in file_ and 'optimized_params' not in file_:
                logger.info('read model results:{}'.format(file_))
                dev_mse = pd.read_csv(models_history+file_)['dev_mse'][0]
                if dev_mse < min_dev_mse:
                    min_dev_mse = dev_mse
                    optimal_model = file_
        logger.info('Optimal model:{}'.format(optimal_model))
        logger.info('Minimum MSE={}'.format(min_dev_mse))
        optimal_model = pd.DataFrame([optimal_model],columns=['optimal_model'])
        optimal_results = pd.read_csv(models_history+optimal_model['optimal_model'][0])
        if predictor=='esvr' or predictor=='gbrt':
            optimal_params = pd.read_csv(models_history+optimal_model['optimal_model'][0].split('.csv')[0]+'_optimized_params.csv')
            optimal_results = pd.concat([optimal_model,optimal_params,optimal_results],axis=1)
        elif predictor=='lstm':
            optimal_results = pd.concat([optimal_model,optimal_results],axis=1)
        optimal_results.to_csv(models_path+'optimal_model_results.csv')
        plot_rela_pred(optimal_results['train_y'],optimal_results['train_pred'],models_path+'train_pred.png')
        plot_rela_pred(optimal_results['dev_y'][0:dev_len],optimal_results['dev_pred'][0:dev_len],models_path+'dev_pred.png')
        plot_rela_pred(optimal_results['test_y'][0:test_len],optimal_results['test_pred'][0:test_len],models_path+'test_pred.png')
Example #2
def ensemble(root_path,
             original_series,
             station,
             predictor,
             predict_pattern,
             variables,
             decomposer=None,
             wavelet_level='db10-2',
             framework='WDDFF'):

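    # Pick the lag table for the decomposer/framework pair: DWT, and MODWT
    # under TSDP, store lags per wavelet-level key; MODWT under WDDFF uses none.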
    if decomposer == 'modwt':
        if framework == 'TSDP':
            lags_dict = variables['lags_dict'][wavelet_level]
        else:
            lags_dict = None
    elif decomposer == 'dwt':
        lags_dict = variables['lags_dict'][wavelet_level]
    else:
        lags_dict = variables['lags_dict']
    full_len = variables['full_len']
    train_len = variables['train_len']
    dev_len = variables['dev_len']
    test_len = variables['test_len']
    logger.info('Ensemble forecasting results...')
    logger.info('Root path:{}'.format(root_path))
    logger.info('original series:\n{}'.format(original_series))
    logger.info('Station:{}'.format(station))
    logger.info('Decomposer:{}'.format(decomposer))
    logger.info('Lags dict:{}'.format(lags_dict))
    logger.info('Predictor:{}'.format(predictor))
    logger.info('Predict pattern:{}'.format(predict_pattern))
    logger.info('Training length:{}'.format(train_len))
    logger.info('Development length:{}'.format(dev_len))
    logger.info('Testing length:{}'.format(test_len))
    logger.info('Entire length:{}'.format(full_len))
    logger.info(
        'Wavelet and decomposition level of WA:{}'.format(wavelet_level))

    if decomposer == 'modwt':
        models_path = root_path + '/' + station + '_' + decomposer + '/projects/' + predictor + '-' + framework.lower() + '/' + wavelet_level + '/' + predict_pattern + '/'
    elif decomposer == 'dwt':
        models_path = root_path + '/' + station + '_' + decomposer + '/projects/' + predictor + '/' + wavelet_level + '/' + predict_pattern + '/'
    elif decomposer is None:
        models_path = root_path + '/' + station + '/projects/' + predictor + '/' + predict_pattern + '/'
    else:
        models_path = root_path + '/' + station + '_' + decomposer + '/projects/' + predictor + '/' + predict_pattern + '/'
    logger.info("Model path:{}".format(models_path))

    if 'multi_step' not in predict_pattern:
        models_history = models_path + 'history/'
        optimal_model = ''
        min_dev_mse = np.inf
        for file_ in os.listdir(models_history):
            if '.csv' in file_ and 'optimized_params' not in file_:
                logger.info('read model results:{}'.format(file_))
                dev_mse = pd.read_csv(models_history + file_)['dev_mse'][0]
                if dev_mse < min_dev_mse:
                    min_dev_mse = dev_mse
                    optimal_model = file_
        logger.info('Optimal model:{}'.format(optimal_model))
        logger.info('Minimum MSE={}'.format(min_dev_mse))
        res = load(models_history +
                   (optimal_model.split('.csv')[0] + '_result.pkl'))
        dump(res, models_path + 'result.pkl')
        optimal_model = pd.DataFrame([optimal_model],
                                     columns=['optimal_model'])
        optimal_results = pd.read_csv(models_history +
                                      optimal_model['optimal_model'][0])
        if predictor == 'esvr' or predictor == 'gbrt':
            optimal_params = pd.read_csv(
                models_history +
                optimal_model['optimal_model'][0].split('.csv')[0] +
                '_optimized_params.csv')
            optimal_results = pd.concat(
                [optimal_model, optimal_params, optimal_results], axis=1)
        elif predictor == 'lstm':
            optimal_results = pd.concat([optimal_model, optimal_results],
                                        axis=1)
        optimal_results.to_csv(models_path + 'optimal_model_results.csv')
        plot_rela_pred(optimal_results['train_y'],
                       optimal_results['train_pred'],
                       models_path + 'train_pred.png')
        plot_rela_pred(optimal_results['dev_y'][0:dev_len],
                       optimal_results['dev_pred'][0:dev_len],
                       models_path + 'dev_pred.png')
        plot_rela_pred(optimal_results['test_y'][0:test_len],
                       optimal_results['test_pred'][0:test_len],
                       models_path + 'test_pred.png')
    else:
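        # Multi-step patterns: select the optimal model for each decomposed
        # subsignal (s1, s2, ...) independently, then sum their predictions.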
        logger.info('Number of subsignals:{}'.format(len(lags_dict)))
        for i in range(len(lags_dict)):
            model_path = models_path + 's' + str(i + 1) + '/'
            models_history = model_path + 'history/'
            optimal_model = ''
            min_dev_mse = np.inf
            for file_ in os.listdir(models_history):
                if '.csv' in file_ and 'optimized_params' not in file_:
                    logger.info('read model results:{}'.format(file_))
                    dev_mse = pd.read_csv(models_history + file_)['dev_mse'][0]
                    if dev_mse < min_dev_mse:
                        min_dev_mse = dev_mse
                        optimal_model = file_
            logger.info('Optimal model:{}'.format(optimal_model))
            logger.info('Minimum MSE={}'.format(min_dev_mse))
            res = load(models_history +
                       (optimal_model.split('.csv')[0] + '_result.pkl'))
            dump(res, model_path + 'result.pkl')
            optimal_model = pd.DataFrame([optimal_model],
                                         columns=['optimal_model'])
            optimal_results = pd.read_csv(models_history +
                                          optimal_model['optimal_model'][0])
            if predictor == 'esvr' or predictor == 'gbrt':
                optimal_params = pd.read_csv(
                    models_history +
                    optimal_model['optimal_model'][0].split('.csv')[0] +
                    '_optimized_params.csv')
                optimal_results = pd.concat(
                    [optimal_model, optimal_params, optimal_results], axis=1)
            elif predictor == 'lstm':
                optimal_results = pd.concat([optimal_model, optimal_results],
                                            axis=1)
            optimal_results.to_csv(model_path + 'optimal_model_results.csv')
            plot_rela_pred(optimal_results['train_y'],
                           optimal_results['train_pred'],
                           model_path + 'train_pred.png')
            plot_rela_pred(optimal_results['dev_y'][0:dev_len],
                           optimal_results['dev_pred'][0:dev_len],
                           model_path + 'dev_pred.png')
            plot_rela_pred(optimal_results['test_y'][0:test_len],
                           optimal_results['test_pred'][0:test_len],
                           model_path + 'test_pred.png')
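        # Ensemble stage: trim the first max(lag) training samples so all
        # subsignal predictions align, then sum them to reconstruct the series.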
        train_len_ = train_len - max(lags_dict.values())
        train_sum_pred = pd.DataFrame()
        dev_sum_pred = pd.DataFrame()
        test_sum_pred = pd.DataFrame()
        time_cost_sum = 0.0
        for i in range(len(lags_dict)):
            model_path = models_path + 's' + str(i + 1) + '/'
            results = pd.read_csv(model_path + 'optimal_model_results.csv')
            time_cost_sum = time_cost_sum + results['time_cost'][0]
            train_pred = results['train_pred']
            train_pred = train_pred[train_pred.shape[0] - train_len_:]
            train_pred = train_pred.reset_index(drop=True)
            dev_pred = results['dev_pred'][0:dev_len]
            test_pred = results['test_pred'][0:test_len]
            train_sum_pred = pd.concat([train_sum_pred, train_pred], axis=1)
            dev_sum_pred = pd.concat([dev_sum_pred, dev_pred], axis=1)
            test_sum_pred = pd.concat([test_sum_pred, test_pred], axis=1)
        train_sum_pred = train_sum_pred.sum(axis=1)
        dev_sum_pred = dev_sum_pred.sum(axis=1)
        test_sum_pred = test_sum_pred.sum(axis=1)
        train_sum_pred[train_sum_pred < 0.0] = 0.0
        dev_sum_pred[dev_sum_pred < 0.0] = 0.0
        test_sum_pred[test_sum_pred < 0.0] = 0.0
        original_series = original_series.reset_index(drop=True)
        train_y = original_series[train_len - train_len_:train_len]
        dev_y = original_series[train_len:train_len + dev_len]
        test_y = original_series[train_len + dev_len:]
        train_y = train_y.reset_index(drop=True)
        dev_y = dev_y.reset_index(drop=True)
        test_y = test_y.reset_index(drop=True)

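        # NSE is computed with sklearn's r2_score (same formula);
        # NRMSE = RMSE / mean(observations).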
        train_nse = r2_score(train_y.values, train_sum_pred.values)
        train_mse = mean_squared_error(train_y.values, train_sum_pred.values)
        train_nrmse = math.sqrt(
            mean_squared_error(train_y.values, train_sum_pred.values)) / (
                sum(train_y.values) / len(train_y.values))
        train_ppts = PPTS(train_y.values, train_sum_pred.values, 5)

        dev_nse = r2_score(dev_y.values, dev_sum_pred.values)
        dev_mse = mean_squared_error(dev_y.values, dev_sum_pred.values)
        dev_nrmse = math.sqrt(
            mean_squared_error(dev_y.values, dev_sum_pred.values)) / (
                sum(dev_y.values) / len(dev_y.values))
        dev_ppts = PPTS(dev_y.values, dev_sum_pred.values, 5)

        test_nse = r2_score(test_y.values, test_sum_pred.values)
        test_mse = mean_squared_error(test_y.values, test_sum_pred.values)
        test_nrmse = math.sqrt(
            mean_squared_error(test_y.values, test_sum_pred.values)) / (
                sum(test_y.values) / len(test_y.values))
        test_ppts = PPTS(test_y.values, test_sum_pred.values, 5)

        metrics = {
            'train_nse': train_nse,
            'train_mse': train_mse,
            'train_nrmse': train_nrmse,
            'train_ppts': train_ppts,
            'dev_nse': dev_nse,
            'dev_mse': dev_mse,
            'dev_nrmse': dev_nrmse,
            'dev_ppts': dev_ppts,
            'test_nse': test_nse,
            'test_mse': test_mse,
            'test_nrmse': test_nrmse,
            'test_ppts': test_ppts,
            'time_cost': time_cost_sum,
        }
        metrics_df = pd.DataFrame(metrics, index=[0])
        logger.info('Multi-step ensemble metrics:\n{}'.format(metrics_df))
        train_results = pd.concat([train_y, train_sum_pred], axis=1)
        train_results = pd.DataFrame(train_results.values,
                                     columns=['train_y', 'train_pred'])
        dev_results = pd.concat([dev_y, dev_sum_pred], axis=1)
        dev_results = pd.DataFrame(dev_results.values,
                                   columns=['dev_y', 'dev_pred'])
        test_results = pd.concat([test_y, test_sum_pred], axis=1)
        test_results = pd.DataFrame(test_results.values,
                                    columns=['test_y', 'test_pred'])
        optimal_results = pd.concat(
            [train_results, dev_results, test_results, metrics_df], axis=1)
        optimal_results.to_csv(models_path + 'optimal_results.csv')
        plot_rela_pred(train_y, train_sum_pred, models_path + 'train_pred.png')
        plot_rela_pred(dev_y, dev_sum_pred, models_path + 'dev_pred.png')
        plot_rela_pred(test_y, test_sum_pred, models_path + 'test_pred.png')
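A minimal usage sketch for ensemble (station name, lengths, and lag values below are hypothetical; variables must carry the keys accessed above):

# Hypothetical call; assumes `series` is the original pandas Series.
variables = {
    'lags_dict': {'db10-2': {'D1': 3, 'D2': 5, 'A2': 7}},  # lags per subsignal
    'full_len': 1000, 'train_len': 700, 'dev_len': 150, 'test_len': 150,
}
ensemble(root_path='.', original_series=series, station='StationA',
         predictor='esvr', predict_pattern='one_step_1_ahead_forecast',
         variables=variables, decomposer='dwt', wavelet_level='db10-2')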
def BuildDNN(train_samples,
             dev_samples,
             test_samples,
             norm_id,
             model_path,
             lags=None,
             seed=None,
             batch_size=512,
             n_epochs=5,
             max_trials=5,
             executions_per_trial=3,
             max_hidden_layers=3,
             min_units=16,
             max_units=64,
             unit_step=16,
             min_droprate=0.0,
             max_droprate=0.5,
             droprate_step=0.05,
             min_learnrate=1e-4,
             max_learnrate=1e-1,
             n_tune_epochs=5,
             cast_to_zero=True,
             early_stop=True,
             early_stop_patience=10,
             retrain=False,
             warm_up=False,
             initial_epoch=None,
             measurement_time='day',
             measurement_unit='$m^3/s$'):
    if not os.path.exists(model_path):
        os.makedirs(model_path)

    setting_info = {
        "model_path": model_path,
        "lags": lags,
        "seed": seed,
        "batch_size": batch_size,
        "n_epoch": n_epochs,
        "max_trials": max_trials,
        "executions_per_trial": executions_per_trial,
        "max_hidden_layers": max_hidden_layers,
        "min_units": min_units,
        "max_units": max_units,
        "unit_step": unit_step,
        "min_droprate": min_droprate,
        "max_droprate": max_droprate,
        "droprate_step": droprate_step,
        "min_learnrate": min_learnrate,
        "max_learnrate": max_learnrate,
        "n_tune_epochs": n_tune_epochs,
        "cast_to_zero": cast_to_zero,
        "early_stop": early_stop,
        "early_stop_patience": early_stop_patience,
        "retrain": retrain,
    }

    with open(model_path + 'setting.json', 'w') as outfile:
        json.dump(setting_info, outfile)

    sMin = norm_id['series_min']
    sMax = norm_id['series_max']
    # sMin = train_samples.min(axis=0)
    # sMax = train_samples.max(axis=0)
    # train_samples = 2*(train_samples-sMin)/(sMax-sMin)-1
    # dev_samples = 2*(dev_samples-sMin)/(sMax-sMin)-1
    # test_samples = 2*(test_samples-sMin)/(sMax-sMin)-1
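    # Samples are assumed to be pre-normalized to [-1, 1]; norm_id supplies the
    # series min/max used to invert the scaling after prediction. Train and dev
    # sets are shuffled together and re-split so the tuner sees a randomized
    # calibration split.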
    cal_samples = pd.concat([train_samples, dev_samples], axis=0)
    cal_samples = cal_samples.sample(frac=1)
    cal_samples = cal_samples.reset_index(drop=True)
    train_samples = cal_samples.iloc[:train_samples.shape[0]]
    dev_samples = cal_samples.iloc[train_samples.shape[0]:]
    X = cal_samples
    y = (cal_samples.pop('Y')).values
    train_x = train_samples
    train_y = train_samples.pop('Y')
    train_y = train_y.values
    dev_x = dev_samples
    dev_y = dev_samples.pop('Y')
    dev_y = dev_y.values
    test_x = test_samples
    test_y = test_samples.pop('Y')
    test_y = test_y.values

    # Config path to save optimal results
    opt_path = model_path + 'optimal/'
    cp_path = model_path + 'optimal/checkpoints/'
    if not os.path.exists(cp_path):
        os.makedirs(cp_path)
    # restore only the latest checkpoint after every update
    checkpoint_path = cp_path + 'cp.h5'
    checkpoint_dir = os.path.dirname(checkpoint_path)
    # Define callbacks
    cp_callback = keras.callbacks.ModelCheckpoint(checkpoint_path,
                                                  save_best_only=True,
                                                  mode='min',
                                                  save_weights_only=True,
                                                  verbose=1)
    reduce_lr = keras.callbacks.ReduceLROnPlateau(monitor='val_loss',
                                                  min_lr=0.00001,
                                                  factor=0.2,
                                                  verbose=1,
                                                  patience=10,
                                                  mode='min')
    early_stopping = keras.callbacks.EarlyStopping(
        monitor='val_loss',
        mode='min',
        verbose=1,
        patience=early_stop_patience,
        restore_best_weights=True)

    def build_model(hp):
        input_shape = (train_x.shape[1], )
        model = keras.Sequential()
        num_layers = hp.Int('num_layers',
                            min_value=1,
                            max_value=max_hidden_layers,
                            step=1,
                            default=1)
        for i in range(num_layers):
            units = hp.Int('units_' + str(i),
                           min_value=min_units,
                           max_value=max_units,
                           step=unit_step)
            dropout_rate = hp.Float('drop_rate_' + str(i),
                                    min_value=min_droprate,
                                    max_value=max_droprate,
                                    step=droprate_step)
            if i == 0:
                model.add(
                    layers.Dense(units=units,
                                 activation='relu',
                                 input_shape=input_shape))
            else:
                model.add(layers.Dense(units=units, activation='relu'))
            model.add(
                layers.Dropout(rate=dropout_rate, noise_shape=None, seed=seed))
        model.add(layers.Dense(1))
        model.compile(optimizer=keras.optimizers.Adam(
            hp.Float('learning_rate',
                     min_value=min_learnrate,
                     max_value=max_learnrate,
                     sampling='LOG',
                     default=1e-2)),
                      loss='mean_squared_error',
                      metrics=['mean_absolute_error', 'mean_squared_error'])
        return model

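    # Bayesian hyperparameter search over layer count, units per layer,
    # dropout rate, and (log-scaled) learning rate; the best trial's
    # hyperparameters are used to rebuild the model below.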
    tuner = BayesianOptimization(build_model,
                                 objective='mean_squared_error',
                                 max_trials=max_trials,
                                 executions_per_trial=executions_per_trial,
                                 directory=model_path,
                                 project_name='BayesianOpt')

    tuner.search_space_summary()
    start = time.process_time()
    tuner.search(x=train_x,
                 y=train_y,
                 epochs=n_tune_epochs,
                 validation_data=(dev_x, dev_y),
                 callbacks=[early_stopping])
    end = time.process_time()
    time_cost = end - start
    tuner.results_summary()
    best_hps = tuner.oracle.get_best_trials(num_trials=1)[0].hyperparameters
    model = build_model(best_hps)

    if retrain or not os.path.exists(checkpoint_path):
        history = model.fit(X,
                            y,
                            epochs=n_epochs,
                            batch_size=batch_size,
                            validation_data=(X, y),
                            verbose=1,
                            callbacks=[
                                cp_callback,
                                early_stopping,
                            ])
        hist = pd.DataFrame(history.history)
        hist.to_csv(opt_path + 'PARAMS-CAL-HISTORY.csv')
        plot_history(history, opt_path + 'MAE-HISTORY.png',
                     opt_path + 'MSE-HISTORY.png')
    else:
        model.load_weights(checkpoint_path)

    train_predictions = model.predict(train_x).flatten()
    dev_predictions = model.predict(dev_x).flatten()
    test_predictions = model.predict(test_x).flatten()
    sMax = sMax[sMax.shape[0] - 1]
    sMin = sMin[sMin.shape[0] - 1]
    train_y = np.multiply(train_y + 1, sMax - sMin) / 2 + sMin
    dev_y = np.multiply(dev_y + 1, sMax - sMin) / 2 + sMin
    test_y = np.multiply(test_y + 1, sMax - sMin) / 2 + sMin
    train_predictions = np.multiply(train_predictions + 1,
                                    sMax - sMin) / 2 + sMin
    dev_predictions = np.multiply(dev_predictions + 1, sMax - sMin) / 2 + sMin
    test_predictions = np.multiply(test_predictions + 1,
                                   sMax - sMin) / 2 + sMin
    if cast_to_zero:
        train_predictions[train_predictions < 0.0] = 0.0
        dev_predictions[dev_predictions < 0.0] = 0.0
        test_predictions[test_predictions < 0.0] = 0.0
    dump_pred_results(
        path=opt_path + 'opt_pred.csv',
        train_y=train_y,
        train_predictions=train_predictions,
        dev_y=dev_y,
        dev_predictions=dev_predictions,
        test_y=test_y,
        test_predictions=test_predictions,
        time_cost=time_cost,
    )
    plot_rela_pred(train_y,
                   train_predictions,
                   measurement_time=measurement_time,
                   measurement_unit=measurement_unit,
                   fig_savepath=opt_path + 'TRAIN-PRED.png')
    plot_rela_pred(dev_y,
                   dev_predictions,
                   measurement_time=measurement_time,
                   measurement_unit=measurement_unit,
                   fig_savepath=opt_path + "DEV-PRED.png")
    plot_rela_pred(test_y,
                   test_predictions,
                   measurement_time=measurement_time,
                   measurement_unit=measurement_unit,
                   fig_savepath=opt_path + "TEST-PRED.png")
    plot_error_distribution(test_y,
                            test_predictions,
                            fig_savepath=opt_path + 'TEST-ERROR-DSTRI.png')
    plt.show()
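The walk-forward ARIMA fragment below assumes `history`, `test`, `order`, `model_path`, and a `start` timestamp are prepared upstream; a hedged sketch of that setup (split size and order are assumptions):

# Assumed setup for the walk-forward loop below (hypothetical values).
import time
from statsmodels.tsa.arima_model import ARIMA  # legacy API matching fit(disp=0)

order = (7, 0, 1)                            # hypothetical (p, d, q)
train, test = series[:-120], series[-120:]   # assumes `series` is a 1-D sequence
history = list(train)
start = time.process_time()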
test_pred = list()
for t in range(len(test)):
    model = ARIMA(history, order=order)
    model_fit = model.fit(disp=0)
    output = model_fit.forecast()
    yhat = output[0][0]  # legacy forecast() returns (forecast, stderr, conf_int)
    test_pred.append(yhat)
    obs = test[t]
    history.append(obs)
    print('predicted=%f, expected=%f' % (yhat, obs))
end = time.process_time()
time_cost = end - start

# plot_rela_pred(train,train_pred,fig_savepath=model_path  + 'arima'+str(order)+'_train_pred.png')
# plot_rela_pred(dev,dev_pred,fig_savepath=model_path  + "arima"+str(order)+"_dev_pred.png")
plot_rela_pred(test,
               test_pred,
               fig_savepath=model_path + "arima" + str(order) +
               "_test_pred.png")

dump_pred_results(
    path=model_path + 'arima' + str(order) + '_results.csv',
    # train_y = train,
    # train_predictions=train_pred,
    # dev_y = dev,
    # dev_predictions = dev_pred,
    test_y=test,
    test_predictions=test_pred,
    time_cost=time_cost,
)
Example #5
def BuildSVR(train_samples,
             dev_samples,
             test_samples,
             model_path,
             n_calls,
             cast_to_zero=True,
             optimizer='gp',
             measurement_time='day',
             measurement_unit='$m^3/s$'):
    with start_action(action_type="Initialize Model Path") as action:
        if not os.path.exists(model_path):
            action.log(message_type="The model path does not exist!")
            os.makedirs(model_path)
            action.log(message_type="The model path is initialized.")
      
    sMin = train_samples.min(axis=0)
    sMax = train_samples.max(axis=0)
    norm = pd.concat([sMax,sMin],axis=1)
    norm = pd.DataFrame(norm.values,columns=['sMax','sMin'],index=train_samples.columns.values)
    norm.to_csv(model_path+'norm.csv')
    joblib.dump(norm,model_path+'norm.pkl')
    train_samples = 2*(train_samples-sMin)/(sMax-sMin)-1
    dev_samples = 2*(dev_samples-sMin)/(sMax-sMin)-1
    test_samples = 2*(test_samples-sMin)/(sMax-sMin)-1
    cal_samples = pd.concat([train_samples,dev_samples],axis=0)
    cal_samples = cal_samples.sample(frac=1)
    train_y = train_samples['Y']
    train_x = train_samples.drop('Y', axis=1)
    dev_y = dev_samples['Y']
    dev_x = dev_samples.drop('Y', axis=1)
    test_y = test_samples['Y']
    test_x = test_samples.drop('Y', axis=1)
    cal_y = cal_samples['Y']
    cal_x = cal_samples.drop('Y', axis=1)
    
    
    predictor_columns = list(train_x.columns)
    joblib.dump(predictor_columns, model_path+'predictor_columns.pkl')
    
    reg = SVR(tol=1e-6)
    # Set the space of hyper-parameters for tuning them
    space = [
        # Penalty parameter `C` of the error term
        Real(0.1, 200, name='C'),   
        # `epsilon` in epsilon-SVR model. It specifies the epsilon-tube
        # within which no penalty is associated in the training loss
        # function with points predicted within a distance epsilon from the actual value.
        Real(10**-6, 10**0, name='epsilon'),    
        # kernel coefficient for 'rbf','poly' and 'sigmoid'
        Real(10**-6, 10**0, name='gamma'),  
    ]
    # Define an objective function of hyper-parameters tuning
    @use_named_args(space)
    def objective(**params):
        reg.set_params(**params)
        return -np.mean(cross_val_score(reg,cal_x,cal_y,cv=10,n_jobs=-1,scoring='neg_mean_squared_error'))
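    # The objective returns the mean 10-fold CV MSE (sign flipped from
    # sklearn's neg_mean_squared_error), which the optimizer minimizes.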
    # Tune the hyper-parameters using Bayesian optimization based on a Gaussian process
    start = time.process_time()
    if optimizer=='gp':
        res = gp_minimize(objective,space,n_calls=n_calls ,random_state=0,verbose=True,n_jobs=-1)
    elif optimizer=='fr_et':
        res = forest_minimize(objective,space,n_calls=n_calls,base_estimator='ET',random_state=0,verbose=True,n_jobs=-1)
    elif optimizer=='fr_rf':
        res = forest_minimize(objective,space,n_calls=n_calls,base_estimator='RF',random_state=0,verbose=True,n_jobs=-1)
    elif optimizer=='dm':
        res = dummy_minimize(objective,space,n_calls=n_calls)
    end = time.process_time()
    time_cost = end-start
    dump(res,model_path+'tune_history.pkl',store_objective=False)
    # returned_results = load(model_path+'tune_history.pkl')
    DIMENSION_ESVR = ['C','epsilon','gamma']
    # Visualizing the results of hyper-parameaters tuning
    plot_objective_(res,dimensions=DIMENSION_ESVR,fig_savepath=model_path+'objective.png')
    plot_evaluations_(res,dimensions=DIMENSION_ESVR,fig_savepath=model_path+'evaluation.png')
    plot_convergence_(res,fig_savepath=model_path+'convergence.png')
    # Plot the optimal hyperparameters
    # logger.info('Best score=%.4f'%res.fun)
    # logger.info(""" Best parameters:
    #  -C = %.8f
    #  -epsilon = %.8f
    #  -gamma = %.8f
    #  """%(res.x[0],res.x[1],res.x[2]))
    # logger.info('Time cost:{} seconds'.format(time_cost))
    # Construct the optimal hyperparameters to restore them
    params_dict={
        'C':res.x[0],
        'epsilon':res.x[1],
        'gamma':res.x[2],
        'time_cost':time_cost,
        'n_calls':n_calls,
    }
    # Transform the optimal hyperparameters dict to pandas DataFrame and restore it
    params_df = pd.DataFrame(params_dict,index=[0])
    params_df.to_csv(model_path +'optimized_params.csv')
    # Initialize a SVR with the optimal hyperparameters
    esvr = SVR(C=res.x[0], epsilon=res.x[1], gamma=res.x[2]).fit(cal_x,cal_y)
    joblib.dump(esvr,model_path+'model.pkl')

    # Load the optimized model
    esvr = joblib.load(model_path+'model.pkl')
    # Do prediction with the optimal model
    train_predictions = esvr.predict(train_x)
    dev_predictions = esvr.predict(dev_x)
    test_predictions = esvr.predict(test_x)
    train_y=(train_y.values).flatten()
    dev_y=(dev_y.values).flatten()
    test_y=(test_y.values).flatten()
    sMax = sMax.iloc[-1]
    sMin = sMin.iloc[-1]
    train_y = np.multiply(train_y + 1, sMax - sMin) / 2 + sMin
    dev_y = np.multiply(dev_y + 1, sMax - sMin) / 2 + sMin
    test_y = np.multiply(test_y + 1, sMax - sMin) / 2 + sMin
    train_predictions = np.multiply(train_predictions + 1, sMax - sMin) / 2 + sMin
    dev_predictions = np.multiply(dev_predictions + 1, sMax - sMin) / 2 + sMin
    test_predictions = np.multiply(test_predictions + 1, sMax - sMin) / 2 + sMin
    if cast_to_zero:
        train_predictions[train_predictions<0.0]=0.0
        dev_predictions[dev_predictions<0.0]=0.0
        test_predictions[test_predictions<0.0]=0.0
    dump_pred_results(
        path = model_path+'opt_pred.csv',
        train_y = train_y,
        train_predictions=train_predictions,
        dev_y = dev_y,
        dev_predictions = dev_predictions,
        test_y = test_y,
        test_predictions = test_predictions,
        time_cost = time_cost,
    )
    plot_rela_pred(train_y,train_predictions,measurement_time=measurement_time,measurement_unit=measurement_unit,fig_savepath=model_path  + 'TRAIN-PRED.png')
    plot_rela_pred(dev_y,dev_predictions,measurement_time=measurement_time,measurement_unit=measurement_unit,fig_savepath=model_path  + "DEV-PRED.png")
    plot_rela_pred(test_y,test_predictions,measurement_time=measurement_time,measurement_unit=measurement_unit,fig_savepath=model_path  + "TEST-PRED.png")
    plot_error_distribution(test_y,test_predictions,fig_savepath=model_path+"TEST-ERROR-DSTRI.png")
    plt.show()
    plt.close('all')
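A minimal, hypothetical call to BuildSVR (column names and split sizes are assumptions; each sample frame must contain a 'Y' target column):

# Hypothetical usage with a toy DataFrame of lagged predictors and target 'Y'.
import numpy as np
import pandas as pd
rng = np.random.default_rng(0)
df = pd.DataFrame(rng.random((500, 4)), columns=['X1', 'X2', 'X3', 'Y'])
BuildSVR(df.iloc[:350], df.iloc[350:425], df.iloc[425:],
         model_path='./svr_demo/', n_calls=30)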
def BuildGBRT(train_samples,
              dev_samples,
              test_samples,
              model_path,
              n_calls,
              cast_to_zero=True,
              optimizer='gp',
              measurement_time='day',
              measurement_unit='$m^3/s$'):
    if not os.path.exists(model_path):
        os.makedirs(model_path)

    sMin = train_samples.min(axis=0)
    sMax = train_samples.max(axis=0)
    norm = pd.concat([sMax, sMin], axis=1)
    norm = pd.DataFrame(norm.values,
                        columns=['sMax', 'sMin'],
                        index=train_samples.columns.values)
    norm.to_csv(model_path + 'norm.csv')
    joblib.dump(norm, model_path + 'norm.pkl')

    train_samples = 2 * (train_samples - sMin) / (sMax - sMin) - 1
    dev_samples = 2 * (dev_samples - sMin) / (sMax - sMin) - 1
    test_samples = 2 * (test_samples - sMin) / (sMax - sMin) - 1
    cal_samples = pd.concat([train_samples, dev_samples], axis=0)
    cal_samples = cal_samples.sample(frac=1)
    train_y = train_samples['Y']
    train_x = train_samples.drop('Y', axis=1)
    dev_y = dev_samples['Y']
    dev_x = dev_samples.drop('Y', axis=1)
    test_y = test_samples['Y']
    test_x = test_samples.drop('Y', axis=1)
    cal_y = cal_samples['Y']
    cal_x = cal_samples.drop('Y', axis=1)

    predictor_columns = list(train_x.columns)
    joblib.dump(predictor_columns, model_path + 'predictor_columns.pkl')

    # Get the feature num
    n_features = cal_x.shape[1]
    reg = GradientBoostingRegressor(n_estimators=100, random_state=0)
    # The list hyper-parameters we want
    space = [
        Integer(1, 25, name='max_depth'),
        Real(10**-5, 10**0, 'log-uniform', name='learning_rate'),
        Integer(1, n_features, name='max_features'),
        Integer(2, 100, name='min_samples_split'),
        Integer(1, 100, name='min_samples_leaf'),
    ]

    @use_named_args(space)
    def objective(**params):
        reg.set_params(**params)
        return -np.mean(
            cross_val_score(reg,
                            cal_x,
                            cal_y,
                            cv=10,
                            n_jobs=-1,
                            scoring='neg_mean_squared_error'))

    start = time.process_time()
    if optimizer == 'gp':
        res = gp_minimize(objective,
                          space,
                          n_calls=n_calls,
                          random_state=0,
                          verbose=True,
                          n_jobs=-1)
    elif optimizer == 'fr_et':
        res = forest_minimize(objective,
                              space,
                              n_calls=n_calls,
                              base_estimator='ET',
                              random_state=0,
                              verbose=True,
                              n_jobs=-1)
    elif optimizer == 'fr_rf':
        res = forest_minimize(objective,
                              space,
                              n_calls=n_calls,
                              base_estimator='RF',
                              random_state=0,
                              verbose=True,
                              n_jobs=-1)
    elif optimizer == 'dm':
        res = dummy_minimize(objective, space, n_calls=n_calls)
    end = time.process_time()
    time_cost = end - start

    dump(res, model_path + 'tune_history.pkl', store_objective=False)
    # returned_results = load(model_path+'tune_history.pkl')
    DIMENSION_GBRT = [
        'max depth', 'learning rate', 'max features', 'min samples split',
        'min samples leaf'
    ]
    plot_objective_(res,
                    dimensions=DIMENSION_GBRT,
                    fig_savepath=model_path + 'objective.png')
    plot_evaluations_(res,
                      dimensions=DIMENSION_GBRT,
                      fig_savepath=model_path + 'evaluation.png')
    plot_convergence_(res, fig_savepath=model_path + 'convergence.png')

    # logger.info('Best score=%.4f'%res.fun)
    # logger.info("""Best parameters:
    # - max_depth=%d
    # - learning_rate=%.6f
    # - max_features=%d
    # - min_samples_split=%d
    # - min_samples_leaf=%d""" % (res.x[0], res.x[1], res.x[2], res.x[3],
    #                             res.x[4]))
    # logger.info('Time cost:{}'.format(time_cost))

    params_dict = {
        'max_depth': res.x[0],
        'learning_rate': res.x[1],
        'max_features': res.x[2],
        'min_samples_split': res.x[3],
        'min_samples_leaf': res.x[4],
        'time_cost': time_cost,
        'n_calls': n_calls,
    }

    params_df = pd.DataFrame(params_dict, index=[0])
    params_df.to_csv(model_path + 'optimized_params.csv')

    GBR = GradientBoostingRegressor(max_depth=res.x[0],
                                    learning_rate=res.x[1],
                                    max_features=res.x[2],
                                    min_samples_split=res.x[3],
                                    min_samples_leaf=res.x[4]).fit(
                                        cal_x, cal_y)

    joblib.dump(GBR, model_path + 'model.pkl')

    GBR = joblib.load(model_path + 'model.pkl')
    train_predictions = GBR.predict(train_x)
    dev_predictions = GBR.predict(dev_x)
    test_predictions = GBR.predict(test_x)
    train_y = (train_y.values).flatten()
    dev_y = (dev_y.values).flatten()
    test_y = (test_y.values).flatten()
    sMax = sMax.iloc[-1]
    sMin = sMin.iloc[-1]
    train_y = np.multiply(train_y + 1, sMax - sMin) / 2 + sMin
    dev_y = np.multiply(dev_y + 1, sMax - sMin) / 2 + sMin
    test_y = np.multiply(test_y + 1, sMax - sMin) / 2 + sMin
    train_predictions = np.multiply(train_predictions + 1,
                                    sMax - sMin) / 2 + sMin
    dev_predictions = np.multiply(dev_predictions + 1, sMax - sMin) / 2 + sMin
    test_predictions = np.multiply(test_predictions + 1,
                                   sMax - sMin) / 2 + sMin
    if cast_to_zero:
        train_predictions[train_predictions < 0.0] = 0.0
        dev_predictions[dev_predictions < 0.0] = 0.0
        test_predictions[test_predictions < 0.0] = 0.0
    dump_pred_results(
        path=model_path + 'opt_pred.csv',
        train_y=train_y,
        train_predictions=train_predictions,
        dev_y=dev_y,
        dev_predictions=dev_predictions,
        test_y=test_y,
        test_predictions=test_predictions,
        time_cost=time_cost,
    )
    plot_rela_pred(train_y,
                   train_predictions,
                   measurement_time=measurement_time,
                   measurement_unit=measurement_unit,
                   fig_savepath=model_path + 'TRAIN-PRED.png')
    plot_rela_pred(dev_y,
                   dev_predictions,
                   measurement_time=measurement_time,
                   measurement_unit=measurement_unit,
                   fig_savepath=model_path + "DEV-PRED.png")
    plot_rela_pred(test_y,
                   test_predictions,
                   measurement_time=measurement_time,
                   measurement_unit=measurement_unit,
                   fig_savepath=model_path + "TEST-PRED.png")
    plot_error_distribution(test_y,
                            test_predictions,
                            fig_savepath=model_path + "TEST-ERROR-DSTRI.png")
    plt.show()
    plt.close('all')
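Both builders invert the [-1, 1] min-max scaling with y = (v + 1)(max - min) / 2 + min; a small helper capturing that shared step (hypothetical name, not part of the original code):

def denormalize(v, s_min, s_max):
    # Invert 2 * (x - s_min) / (s_max - s_min) - 1 back to original units.
    return (v + 1.0) * (s_max - s_min) / 2.0 + s_min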