def plot_decompositions(
        signal,
        figsize=None,
        save_path=None,
        measurement_time='month',
        measurement_unit="$10^8m^3$",
):
    cols = signal.columns.values
    logger.info('cols={}'.format(cols))
    T = signal.shape[0]
    # np.float was removed in NumPy 1.24; the builtin float is used instead.
    t = np.arange(start=1, stop=T + 1, step=1, dtype=float) / T
    freqs = t - 0.5 - 1 / T
    if figsize is None:
        figsize = (7.48, 1 * len(cols))
    plt.figure(figsize=figsize)
    for i in range(len(cols)):
        subsignal = signal[cols[i]].values
        # Left column: the sub-signal in the time domain.
        plt.subplot(len(cols), 2, 2 * i + 1)
        plt.title(cols[i])
        plt.plot(subsignal, c='b')
        if i == len(cols) - 1:
            plt.xlabel('Time(' + measurement_time + ')')
        else:
            plt.xticks([])
        plt.ylabel(r"Streamflow(" + measurement_unit + ")")
        # Right column: the amplitude spectrum of the sub-signal.
        plt.subplot(len(cols), 2, 2 * i + 2)
        plt.title(cols[i])
        plt.plot(freqs, abs(fft(subsignal)), c='b', lw=0.8, zorder=0)
        if i == len(cols) - 1:
            plt.xlabel('Frequency(1/' + measurement_time + ')')
        else:
            plt.xticks([])
        plt.ylabel('Amplitude')
    plt.tight_layout()
    # save_path was accepted but never used in the original; save the figure
    # when a path is given (mirrors the convention of plot_rela_pred below).
    if save_path is not None:
        plt.savefig(save_path, format='PNG', dpi=300)

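# --- Hedged usage sketch (not part of the original source) ---
# Minimal example of driving plot_decompositions with a synthetic
# two-sub-signal table; the column names 'IMF1'/'IMF2' and the file name
# are assumptions for illustration only.
def _demo_plot_decompositions():
    rng = np.random.RandomState(0)
    demo = pd.DataFrame({
        'IMF1': np.sin(np.linspace(0, 20 * np.pi, 240)),
        'IMF2': 0.1 * rng.randn(240),
    })
    plot_decompositions(demo, save_path='demo_decomposition.png')
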
def dum_pred_results(path, train_y, train_predictions, dev_y,
                     dev_predictions, test_y, test_predictions,
                     time_cost=None):
    """
    Dump real records (labels) and predictions as well as evaluation
    criteria (metrics R2, RMSE, MAE, MAPE, PPTS, time_cost) to csv.

    Args:
        path: The local disk path to dump data into.
        train_y: records of training set with numpy array type.
        train_predictions: predictions of training set with numpy array type.
        dev_y: records of development set with numpy array type.
        dev_predictions: predictions of development set with numpy array type.
        test_y: records of testing set with numpy array type.
        test_predictions: predictions of testing set with numpy array type.
        time_cost: Time cost for profiling.

    Return:
        A csv file dumped to `path`.
    """
    logger.info('Dump records, predictions and evaluation criteria...')
    logger.info('Compute Nash-Sutcliffe efficiency (NSE)...')
    train_nse = r2_score(train_y, train_predictions)
    dev_nse = r2_score(dev_y, dev_predictions)
    test_nse = r2_score(test_y, test_predictions)
    logger.info('Compute Mean Square Error (MSE)...')
    train_mse = mean_squared_error(y_true=train_y, y_pred=train_predictions)
    dev_mse = mean_squared_error(y_true=dev_y, y_pred=dev_predictions)
    test_mse = mean_squared_error(y_true=test_y, y_pred=test_predictions)
    logger.info('Compute normalized root mean square error (NRMSE)...')
    train_nrmse = math.sqrt(mean_squared_error(
        train_y, train_predictions)) / (sum(train_y) / len(train_y))
    dev_nrmse = math.sqrt(mean_squared_error(
        dev_y, dev_predictions)) / (sum(dev_y) / len(dev_y))
    test_nrmse = math.sqrt(mean_squared_error(
        test_y, test_predictions)) / (sum(test_y) / len(test_y))
    logger.info('Compute mean absolute error (MAE)...')
    train_mae = mean_absolute_error(train_y, train_predictions)
    dev_mae = mean_absolute_error(dev_y, dev_predictions)
    test_mae = mean_absolute_error(test_y, test_predictions)
    logger.info('Compute mean absolute percentage error (MAPE)...')
    train_mape = np.mean(np.abs((train_y - train_predictions) / train_y)) * 100
    dev_mape = np.mean(np.abs((dev_y - dev_predictions) / dev_y)) * 100
    test_mape = np.mean(np.abs((test_y - test_predictions) / test_y)) * 100
    logger.info('Compute peak percentage of threshold statistic (PPTS)...')
    train_ppts = PPTS(train_y, train_predictions, 5)
    dev_ppts = PPTS(dev_y, dev_predictions, 5)
    test_ppts = PPTS(test_y, test_predictions, 5)
    logger.info('Dumping the model results.')
    dump_train_dev_test_to_csv(
        path=path,
        train_y=train_y, train_pred=train_predictions,
        train_nse=train_nse, train_mse=train_mse, train_nrmse=train_nrmse,
        train_mae=train_mae, train_mape=train_mape, train_ppts=train_ppts,
        dev_y=dev_y, dev_pred=dev_predictions,
        dev_nse=dev_nse, dev_mse=dev_mse, dev_nrmse=dev_nrmse,
        dev_mae=dev_mae, dev_mape=dev_mape, dev_ppts=dev_ppts,
        test_y=test_y, test_pred=test_predictions,
        test_nse=test_nse, test_mse=test_mse, test_nrmse=test_nrmse,
        test_mae=test_mae, test_mape=test_mape, test_ppts=test_ppts,
        time_cost=time_cost,
    )

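# --- Hedged sketch (not in the original source) ---
# PPTS is imported from tools.metrics_ and its body is not shown in this
# file. The sketch below is an assumed reference implementation based on
# the common literature definition of the peak percentage of threshold
# statistic: sort the records in descending order, keep the top gamma
# percent of (record, prediction) pairs, and average their absolute
# percentage errors.
def _ppts_sketch(y_true, y_pred, gamma):
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    order = np.argsort(y_true)[::-1]                   # records, largest first
    n_peak = int(math.ceil(len(y_true) * gamma / 100.0))
    idx = order[:n_peak]                               # indices of the top-gamma% peaks
    return np.mean(np.abs((y_true[idx] - y_pred[idx]) / y_true[idx])) * 100
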
def ensemble(root_path,
             original_series,
             station,
             predictor,
             predict_pattern,
             variables,
             decomposer=None,
             wavelet_level='db10-2',
             framework='WDDFF'):
    if decomposer == 'modwt':
        if framework == 'TSDP':
            lags_dict = variables['lags_dict'][wavelet_level]
        else:
            lags_dict = None
    elif decomposer == 'dwt':
        lags_dict = variables['lags_dict'][wavelet_level]
    else:
        lags_dict = variables['lags_dict']
    full_len = variables['full_len']
    train_len = variables['train_len']
    dev_len = variables['dev_len']
    test_len = variables['test_len']
    logger.info('Ensemble forecasting results...')
    logger.info('Root path:{}'.format(root_path))
    logger.info('original series:\n{}'.format(original_series))
    logger.info('Station:{}'.format(station))
    logger.info('Decomposer:{}'.format(decomposer))
    logger.info('Lags dict:{}'.format(lags_dict))
    logger.info('Predictor:{}'.format(predictor))
    logger.info('Predict pattern:{}'.format(predict_pattern))
    logger.info('Training length:{}'.format(train_len))
    logger.info('Development length:{}'.format(dev_len))  # was logged as test_len by mistake
    logger.info('Testing length:{}'.format(test_len))
    logger.info('Entire length:{}'.format(full_len))
    logger.info('Wavelet and decomposition level of WA:{}'.format(wavelet_level))
    if decomposer == 'modwt':
        models_path = root_path + '/' + station + '_' + decomposer + '/projects/' + \
            predictor + '-' + framework.lower() + '/' + wavelet_level + '/' + \
            predict_pattern + '/'
    elif decomposer == 'dwt':
        models_path = root_path + '/' + station + '_' + decomposer + '/projects/' + \
            predictor + '/' + wavelet_level + '/' + predict_pattern + '/'
    elif decomposer is None:
        models_path = root_path + '/' + station + '/projects/' + predictor + '/' + \
            predict_pattern + '/'
    else:
        models_path = root_path + '/' + station + '_' + decomposer + '/projects/' + \
            predictor + '/' + predict_pattern + '/'
    logger.info("Model path:{}".format(models_path))
    if 'multi_step' not in predict_pattern:
        models_history = models_path + 'history/'
        optimal_model = ''
        min_dev_mse = np.inf
        # Keep the tuning-history model with the smallest development MSE.
        for file_ in os.listdir(models_history):
            if '.csv' in file_ and 'optimized_params' not in file_:
                logger.info('read model results:{}'.format(file_))
                dev_mse = pd.read_csv(models_history + file_)['dev_mse'][0]
                if dev_mse < min_dev_mse:
                    min_dev_mse = dev_mse
                    optimal_model = file_
        logger.info('Optimal model:{}'.format(optimal_model))
        logger.info('Minimum MSE={}'.format(min_dev_mse))
        res = load(models_history + (optimal_model.split('.csv')[0] + '_result.pkl'))
        dump(res, models_path + 'result.pkl')
        optimal_model = pd.DataFrame([optimal_model], columns=['optimal_model'])
        optimal_results = pd.read_csv(models_history + optimal_model['optimal_model'][0])
        if predictor == 'esvr' or predictor == 'gbrt':
            optimal_params = pd.read_csv(
                models_history + optimal_model['optimal_model'][0].split('.csv')[0] +
                '_optimized_params.csv')
            optimal_results = pd.concat(
                [optimal_model, optimal_params, optimal_results], axis=1)
        elif predictor == 'lstm':
            optimal_results = pd.concat([optimal_model, optimal_results], axis=1)
        optimal_results.to_csv(models_path + 'optimal_model_results.csv')
        # dev_len and test_len replace the undefined data_part mapping used
        # here in the original.
        plot_rela_pred(optimal_results['train_y'], optimal_results['train_pred'],
                       models_path + 'train_pred.png')
        plot_rela_pred(optimal_results['dev_y'][0:dev_len],
                       optimal_results['dev_pred'][0:dev_len],
                       models_path + 'dev_pred.png')
        plot_rela_pred(optimal_results['test_y'][0:test_len],
                       optimal_results['test_pred'][0:test_len],
                       models_path + 'test_pred.png')
    else:
        # Select the optimal model for each sub-signal s1..sn.
        for i in range(len(lags_dict)):
            logger.debug('Number of sub-signals:{}'.format(len(lags_dict)))
            model_path = models_path + 's' + str(i + 1) + '/'
            models_history = model_path + 'history/'
            optimal_model = ''
            min_dev_mse = np.inf
            for file_ in os.listdir(models_history):
                if '.csv' in file_ and 'optimized_params' not in file_:
                    logger.info('read model results:{}'.format(file_))
                    dev_mse = pd.read_csv(models_history + file_)['dev_mse'][0]
                    if dev_mse < min_dev_mse:
                        min_dev_mse = dev_mse
                        optimal_model = file_
            logger.info('Optimal model:{}'.format(optimal_model))
            logger.info('Minimum MSE={}'.format(min_dev_mse))
            res = load(models_history + (optimal_model.split('.csv')[0] + '_result.pkl'))
            dump(res, model_path + 'result.pkl')
            optimal_model = pd.DataFrame([optimal_model], columns=['optimal_model'])
            optimal_results = pd.read_csv(models_history + optimal_model['optimal_model'][0])
            if predictor == 'esvr' or predictor == 'gbrt':
                optimal_params = pd.read_csv(
                    models_history + optimal_model['optimal_model'][0].split('.csv')[0] +
                    '_optimized_params.csv')
                optimal_results = pd.concat(
                    [optimal_model, optimal_params, optimal_results], axis=1)
            elif predictor == 'lstm':
                optimal_results = pd.concat([optimal_model, optimal_results], axis=1)
            optimal_results.to_csv(model_path + 'optimal_model_results.csv')
            plot_rela_pred(optimal_results['train_y'], optimal_results['train_pred'],
                           model_path + 'train_pred.png')
            plot_rela_pred(optimal_results['dev_y'][0:dev_len],
                           optimal_results['dev_pred'][0:dev_len],
                           model_path + 'dev_pred.png')
            plot_rela_pred(optimal_results['test_y'][0:test_len],
                           optimal_results['test_pred'][0:test_len],
                           model_path + 'test_pred.png')
        # The first max-lag samples of the training set are consumed by the
        # autoregressive inputs.
        train_len_ = train_len - max(lags_dict.values())
        train_sum_pred = pd.DataFrame()
        dev_sum_pred = pd.DataFrame()
        test_sum_pred = pd.DataFrame()
        time_cost_sum = 0.0
        for i in range(len(lags_dict)):
            model_path = models_path + 's' + str(i + 1) + '/'
            results = pd.read_csv(model_path + 'optimal_model_results.csv')
            time_cost_sum = time_cost_sum + results['time_cost'][0]
            train_pred = results['train_pred']
            train_pred = train_pred[train_pred.shape[0] - train_len_:]
            train_pred = train_pred.reset_index(drop=True)
            dev_pred = results['dev_pred'][0:dev_len]
            test_pred = results['test_pred'][0:test_len]
            train_sum_pred = pd.concat([train_sum_pred, train_pred], axis=1)
            dev_sum_pred = pd.concat([dev_sum_pred, dev_pred], axis=1)
            test_sum_pred = pd.concat([test_sum_pred, test_pred], axis=1)
        # Reconstruct the streamflow forecast by summing the sub-signal
        # predictions, then clip physically invalid negative flows.
        train_sum_pred = train_sum_pred.sum(axis=1)
        dev_sum_pred = dev_sum_pred.sum(axis=1)
        test_sum_pred = test_sum_pred.sum(axis=1)
        train_sum_pred[train_sum_pred < 0.0] = 0.0
        dev_sum_pred[dev_sum_pred < 0.0] = 0.0
        test_sum_pred[test_sum_pred < 0.0] = 0.0
        original_series = original_series.reset_index(drop=True)
        train_y = original_series[train_len - train_len_:train_len]
        dev_y = original_series[train_len:train_len + dev_len]
        test_y = original_series[train_len + dev_len:]
        train_y = train_y.reset_index(drop=True)
        dev_y = dev_y.reset_index(drop=True)
        test_y = test_y.reset_index(drop=True)
        train_nse = r2_score(train_y.values, train_sum_pred.values)
        train_mse = mean_squared_error(train_y.values, train_sum_pred.values)
        train_nrmse = math.sqrt(mean_squared_error(
            train_y.values, train_sum_pred.values)) / (
                sum(train_y.values) / len(train_y.values))
        train_ppts = PPTS(train_y.values, train_sum_pred.values, 5)
        dev_nse = r2_score(dev_y.values, dev_sum_pred.values)
        dev_mse = mean_squared_error(dev_y.values, dev_sum_pred.values)
        dev_nrmse = math.sqrt(mean_squared_error(
            dev_y.values, dev_sum_pred.values)) / (
                sum(dev_y.values) / len(dev_y.values))
        dev_ppts = PPTS(dev_y.values, dev_sum_pred.values, 5)
        test_nse = r2_score(test_y.values, test_sum_pred.values)
        test_mse = mean_squared_error(test_y.values, test_sum_pred.values)
        test_nrmse = math.sqrt(mean_squared_error(
            test_y.values, test_sum_pred.values)) / (
                sum(test_y.values) / len(test_y.values))
        test_ppts = PPTS(test_y.values, test_sum_pred.values, 5)
        metrics = {
            'train_nse': train_nse,
            'train_mse': train_mse,
            'train_nrmse': train_nrmse,
            'train_ppts': train_ppts,
            'dev_nse': dev_nse,
            'dev_mse': dev_mse,
            'dev_nrmse': dev_nrmse,
            'dev_ppts': dev_ppts,
            'test_nse': test_nse,
            'test_mse': test_mse,
            'test_nrmse': test_nrmse,
            'test_ppts': test_ppts,
            'time_cost': time_cost_sum,
        }
        metrics_df = pd.DataFrame(metrics, index=[0])
        logger.info('Ensemble metrics:\n{}'.format(metrics_df))
        train_results = pd.concat([train_y, train_sum_pred], axis=1)
        train_results = pd.DataFrame(train_results.values, columns=['train_y', 'train_pred'])
        dev_results = pd.concat([dev_y, dev_sum_pred], axis=1)
        dev_results = pd.DataFrame(dev_results.values, columns=['dev_y', 'dev_pred'])
        test_results = pd.concat([test_y, test_sum_pred], axis=1)
        test_results = pd.DataFrame(test_results.values, columns=['test_y', 'test_pred'])
        optimal_results = pd.concat(
            [train_results, dev_results, test_results, metrics_df], axis=1)
        optimal_results.to_csv(models_path + 'optimal_results.csv')
        plot_rela_pred(train_y, train_sum_pred, models_path + 'train_pred.png')
        plot_rela_pred(dev_y, dev_sum_pred, models_path + 'dev_pred.png')
        plot_rela_pred(test_y, test_sum_pred, models_path + 'test_pred.png')

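# --- Hedged sketch (not in the original source) ---
# The multi-step branch above reconstructs the streamflow forecast by
# summing the per-sub-signal predictions and clipping negatives. A toy
# numpy illustration of that additive reconstruction:
def _demo_additive_ensemble():
    sub_preds = [np.array([1.0, 2.0, -0.5]),   # e.g. IMF1 predictions
                 np.array([0.5, -0.1, 0.2])]   # e.g. IMF2 predictions
    total = np.sum(sub_preds, axis=0)          # element-wise sum over sub-signals
    total[total < 0.0] = 0.0                   # negative flows are physically invalid
    return total                               # -> [1.5, 1.9, 0.0]
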
def plot_cv_error(data_path, labels, mode='avg'):
    logger.info('Plot cross validation NRMSE...')
    logger.info('Data path:{}'.format(data_path))
    logger.info('Labels:{}'.format(labels))
    if isinstance(data_path, str):
        data_path = [data_path]
        labels = [labels]
    plt.figure(figsize=(7.48, 7.48))
    plt.xlabel('CV')
    # The curves are read from the 'dev_nrmse'/'test_nrmse' columns, so the
    # axis is labeled NRMSE (the original labeled it MSE).
    plt.ylabel('NRMSE')
    for path, label in zip(data_path, labels):
        logger.info('Read cv results of {}'.format(path))
        dev_cv = {}
        test_cv = {}
        for file_ in os.listdir(path):
            if '.csv' in file_ and 'seed' not in file_ and 'optimized_params' not in file_:
                logger.info('cv-file:{}'.format(file_))
                cv = int(re.findall(r"(?<=cv)\d+", file_)[0])
                logger.info('cv={}'.format(cv))
                data = pd.read_csv(path + file_)
                dev_metrics = data['dev_nrmse'][0]
                test_metrics = data['test_nrmse'][0]
                logger.info('Development metrics={}'.format(dev_metrics))
                dev_cv[cv] = dev_metrics
                test_cv[cv] = test_metrics
        logger.debug('Development cv dict before sort:{}'.format(dev_cv))
        logger.debug('Testing cv dict before sort:{}'.format(test_cv))
        # Sort by fold index so the curves are plotted in order.
        dev_cv = dict(sorted(dev_cv.items()))
        test_cv = dict(sorted(test_cv.items()))
        logger.info('Cross validation development dict:{}'.format(dev_cv))
        logger.info('Cross validation folds:{}'.format(dev_cv.keys()))
        logger.info('Cross validation NRMSE:{}'.format(dev_cv.values()))
        plt.plot(list(dev_cv.keys()), list(dev_cv.values()),
                 marker='o', label=label + '-dev')
        plt.plot(list(test_cv.keys()), list(test_cv.values()),
                 marker='o', label=label + '-test')
    plt.legend()
    plt.tight_layout()

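# --- Hedged usage sketch (not in the original source) ---
# 'path/to/history/' is a placeholder; point it at a directory of cv*.csv
# tuning results produced by this project to reproduce the plot.
def _demo_plot_cv_error():
    plot_cv_error(data_path='path/to/history/', labels='esvr')
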
print(len(cal_pred))
train_pred = cal_pred[0:train_len]
dev_pred = cal_pred[train_len:]
print(model_fit.summary())
print(len(train_pred))
print(len(dev_pred))
# Inspect the in-sample residuals before forecasting the test horizon.
residuals = pd.DataFrame(model_fit.resid)
residuals.plot()
plt.show()
residuals.plot(kind='kde')
plt.show()
print(residuals.describe())
if os.path.exists(model_path + 'arima' + str(order) + '_results.csv'):
    logger.info("The arima" + str(order) + " was already tuned")
# Rolling one-step-ahead forecast over the test horizon: refit on the
# growing history and forecast a single step each time.
history = [x for x in cal]
test_pred = list()
for t in range(len(test)):
    model = ARIMA(history, order=order)
    model_fit = model.fit(disp=0)
    output = model_fit.forecast()
    yhat = output[0]
    test_pred.append(yhat)
    obs = test[t]
    history.append(obs)
    print('predicted=%f, expected=%f' % (yhat, obs))
end = time.process_time()
time_cost = end - start

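# --- Hedged sketch (not in the original source) ---
# Self-contained illustration of the rolling one-step ARIMA forecast used
# above, written against the current statsmodels API
# (statsmodels.tsa.arima.model.ARIMA; the legacy class used above still
# took disp=0 in fit()). The toy series and order are assumptions.
def _demo_rolling_arima():
    from statsmodels.tsa.arima.model import ARIMA as ArimaModel
    rng = np.random.RandomState(42)
    series = np.cumsum(rng.randn(60))      # toy random-walk series
    cal, test = list(series[:48]), series[48:]
    preds = []
    for obs in test:
        fit = ArimaModel(cal, order=(1, 1, 0)).fit()
        preds.append(fit.forecast()[0])    # one-step-ahead forecast
        cal.append(obs)                    # roll the calibration window forward
    return preds
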
def plot_rela_pred(records,
                   predictions,
                   fig_savepath,
                   measurement_time='month',
                   measurement_unit="$10^8m^3$",
                   figsize=(7.48, 3),
                   format='PNG',
                   dpi=300):
    """
    Plot the relation between the records and the predictions.

    Args:
        records: the actual measured records.
        predictions: the predictions obtained by the model.
        fig_savepath: the path where the figure will be saved.
    """
    logger.info('Plot predictions and correlations...')
    # Convert records and predictions independently. The original used
    # elif here, so predictions were never converted whenever records was
    # already a DataFrame/Series.
    if isinstance(records, (pd.DataFrame, pd.Series)):
        records = records.values
    if isinstance(predictions, (pd.DataFrame, pd.Series)):
        predictions = predictions.values
    length = records.size
    t = np.linspace(start=1, stop=length, num=length)
    plt.figure(figsize=figsize)
    ax1 = plt.subplot2grid((1, 5), (0, 0), colspan=3)
    ax2 = plt.subplot2grid((1, 5), (0, 3), colspan=2, aspect='equal')
    ax1.set_xlabel('Time(' + measurement_time + ')')
    ax1.set_ylabel(r'Streamflow(' + measurement_unit + ')')
    ax1.plot(t, records, '-', color='blue', label='Records', linewidth=1.0)
    ax1.plot(t, predictions, '--', color='red', label='Predictions', linewidth=1.0)
    ax1.legend(loc=0, shadow=False, frameon=False)
    logger.info('records=\n{}'.format(records))
    logger.info('predictions=\n{}'.format(predictions))
    # Common axis limits so the scatter panel is square around the 1:1 line.
    xymin = min(predictions.min(), records.min())
    xymax = max(predictions.max(), records.max())
    logger.info('xymin={}'.format(xymin))
    logger.info('xymax={}'.format(xymax))
    xx = np.arange(start=xymin, stop=xymax + 1, step=1.0)
    coeff = np.polyfit(predictions, records, 1)
    linear_fit = coeff[0] * xx + coeff[1]
    ax2.set_xlabel(r'Predictions(' + measurement_unit + ')')
    ax2.set_ylabel(r'Records(' + measurement_unit + ')')
    ax2.plot(predictions, records, 'o', markerfacecolor='w',
             markeredgecolor='blue', markersize=6.5)
    ax2.plot(xx, linear_fit, '--', color='red', label='Linear fit', linewidth=1.0)
    ax2.plot([xymin, xymax], [xymin, xymax], '-', color='black',
             label='Ideal fit', linewidth=1.0)
    ax2.set_xlim([xymin, xymax])
    ax2.set_ylim([xymin, xymax])
    ax2.legend(loc=0, shadow=False, frameon=False)
    plt.tight_layout()
    plt.savefig(fig_savepath, format=format, dpi=dpi)

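# --- Hedged usage sketch (not in the original source) ---
# Runnable toy example: compare a noisy "prediction" against synthetic
# records. The output file name is an illustrative placeholder.
def _demo_plot_rela_pred():
    rng = np.random.RandomState(1)
    records = 10 + 2 * np.sin(np.linspace(0, 6 * np.pi, 120)) + rng.rand(120)
    predictions = records + 0.5 * rng.randn(120)
    plot_rela_pred(records, predictions, 'demo_rela_pred.png')
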
def dump_pred_results(path,
                      train_y=None,
                      train_predictions=None,
                      dev_y=None,
                      dev_predictions=None,
                      test_y=None,
                      test_predictions=None,
                      time_cost=None):
    """
    Dump real records (labels) and predictions as well as evaluation
    criteria (metrics R2, RMSE, MAE, MAPE, PPTS, time_cost) to csv.

    Args:
        path: The local disk path to dump data into.
        train_y: records of training set with numpy array type.
        train_predictions: predictions of training set with numpy array type.
        dev_y: records of development set with numpy array type.
        dev_predictions: predictions of development set with numpy array type.
        test_y: records of testing set with numpy array type.
        test_predictions: predictions of testing set with numpy array type.
        time_cost: Time cost for profiling.

    Return:
        A csv file dumped to `path`.
    """
    logger.info('Dump records, predictions and evaluation criteria...')
    logger.info('Compute Nash-Sutcliffe efficiency (NSE)...')
    # The guards below restore the commented-out intent of the original:
    # subsets that were not supplied produce None metrics instead of a
    # crash ('is None' is used because '== None' is ambiguous on arrays).
    if train_y is None or train_predictions is None:
        train_nse = train_mse = train_nrmse = train_mae = train_mape = train_ppts = None
    else:
        train_nse = r2_score(train_y, train_predictions)
        train_mse = mean_squared_error(y_true=train_y, y_pred=train_predictions)
        train_nrmse = math.sqrt(mean_squared_error(
            train_y, train_predictions)) / (sum(train_y) / len(train_y))
        train_mae = mean_absolute_error(train_y, train_predictions)
        train_mape = np.mean(np.abs((train_y - train_predictions) / train_y)) * 100
        train_ppts = PPTS(train_y, train_predictions, 5)
    if dev_y is None or dev_predictions is None:
        dev_nse = dev_mse = dev_nrmse = dev_mae = dev_mape = dev_ppts = None
    else:
        dev_nse = r2_score(dev_y, dev_predictions)
        dev_mse = mean_squared_error(y_true=dev_y, y_pred=dev_predictions)
        dev_nrmse = math.sqrt(mean_squared_error(
            dev_y, dev_predictions)) / (sum(dev_y) / len(dev_y))
        dev_mae = mean_absolute_error(dev_y, dev_predictions)
        dev_mape = np.mean(np.abs((dev_y - dev_predictions) / dev_y)) * 100
        dev_ppts = PPTS(dev_y, dev_predictions, 5)
    if test_y is None or test_predictions is None:
        test_nse = test_mse = test_nrmse = test_mae = test_mape = test_ppts = None
    else:
        test_nse = r2_score(test_y, test_predictions)
        test_mse = mean_squared_error(y_true=test_y, y_pred=test_predictions)
        test_nrmse = math.sqrt(mean_squared_error(
            test_y, test_predictions)) / (sum(test_y) / len(test_y))
        test_mae = mean_absolute_error(test_y, test_predictions)
        test_mape = np.mean(np.abs((test_y - test_predictions) / test_y)) * 100
        test_ppts = PPTS(test_y, test_predictions, 5)
    dump_train_dev_test_to_csv(
        path=path,
        train_y=train_y, train_pred=train_predictions,
        train_nse=train_nse, train_mse=train_mse, train_nrmse=train_nrmse,
        train_mae=train_mae, train_mape=train_mape, train_ppts=train_ppts,
        dev_y=dev_y, dev_pred=dev_predictions,
        dev_nse=dev_nse, dev_mse=dev_mse, dev_nrmse=dev_nrmse,
        dev_mae=dev_mae, dev_mape=dev_mape, dev_ppts=dev_ppts,
        test_y=test_y, test_pred=test_predictions,
        test_nse=test_nse, test_mse=test_mse, test_nrmse=test_nrmse,
        test_mae=test_mae, test_mape=test_mape, test_ppts=test_ppts,
        time_cost=time_cost,
    )

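# --- Hedged sketch (not in the original source) ---
# The metric suite on a tiny toy example, to make the formulas concrete:
# NSE is computed with sklearn's r2_score, NRMSE is the RMSE divided by
# the mean record, and MAPE is the mean absolute percentage error.
def _demo_metric_suite():
    y = np.array([2.0, 4.0, 6.0])
    yhat = np.array([2.5, 3.5, 6.5])
    nse = r2_score(y, yhat)                                    # 1 - 0.75/8 = 0.90625
    nrmse = math.sqrt(mean_squared_error(y, yhat)) / y.mean()  # 0.5 / 4 = 0.125
    mape = np.mean(np.abs((y - yhat) / y)) * 100               # about 15.28%
    return nse, nrmse, mape
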
def ensemble(root_path, original_series, station, predictor, predict_pattern,
             variables, decomposer=None, wavelet_level='db10-2'):
    lags_dict = variables['lags_dict']
    full_len = variables['full_len']
    train_len = variables['train_len']
    dev_len = variables['dev_len']
    test_len = variables['test_len']
    logger.info('Ensemble forecasting results...')
    logger.info('Root path:{}'.format(root_path))
    logger.info('original series:\n{}'.format(original_series))
    logger.info('Station:{}'.format(station))
    logger.info('Decomposer:{}'.format(decomposer))
    logger.info('Lags dict:{}'.format(lags_dict))
    logger.info('Predictor:{}'.format(predictor))
    logger.info('Predict pattern:{}'.format(predict_pattern))
    logger.info('Training length:{}'.format(train_len))
    logger.info('Development length:{}'.format(dev_len))  # was logged as test_len by mistake
    logger.info('Testing length:{}'.format(test_len))
    logger.info('Entire length:{}'.format(full_len))
    logger.info('Wavelet and decomposition level of WA:{}'.format(wavelet_level))
    original = original_series
    if decomposer == 'dwt' or decomposer == 'modwt':
        models_path = root_path + '/' + station + '_' + decomposer + '/projects/' + \
            predictor + '/' + wavelet_level + '/' + predict_pattern + '/'
    elif decomposer is None:
        models_path = root_path + '/' + station + '/projects/' + predictor + '/' + \
            predict_pattern + '/'
    else:
        models_path = root_path + '/' + station + '_' + decomposer + '/projects/' + \
            predictor + '/' + predict_pattern + '/'
    logger.info("Model path:{}".format(models_path))
    if 'multi_step' not in predict_pattern:
        models_history = models_path + 'history/'
        optimal_model = ''
        min_dev_mse = np.inf
        for file_ in os.listdir(models_history):
            if '.csv' in file_ and 'optimized_params' not in file_:
                logger.info('read model results:{}'.format(file_))
                dev_mse = pd.read_csv(models_history + file_)['dev_mse'][0]
                if dev_mse < min_dev_mse:
                    min_dev_mse = dev_mse
                    optimal_model = file_
        logger.info('Optimal model:{}'.format(optimal_model))
        logger.info('Minimum MSE={}'.format(min_dev_mse))
        optimal_model = pd.DataFrame([optimal_model], columns=['optimal_model'])
        optimal_results = pd.read_csv(models_history + optimal_model['optimal_model'][0])
        if predictor == 'esvr' or predictor == 'gbrt':
            optimal_params = pd.read_csv(
                models_history + optimal_model['optimal_model'][0].split('.csv')[0] +
                '_optimized_params.csv')
            optimal_results = pd.concat(
                [optimal_model, optimal_params, optimal_results], axis=1)
        elif predictor == 'lstm':
            optimal_results = pd.concat([optimal_model, optimal_results], axis=1)
        optimal_results.to_csv(models_path + 'optimal_model_results.csv')
        # dev_len and test_len replace the undefined data_part mapping used
        # here in the original.
        plot_rela_pred(optimal_results['train_y'], optimal_results['train_pred'],
                       models_path + 'train_pred.png')
        plot_rela_pred(optimal_results['dev_y'][0:dev_len],
                       optimal_results['dev_pred'][0:dev_len],
                       models_path + 'dev_pred.png')
        plot_rela_pred(optimal_results['test_y'][0:test_len],
                       optimal_results['test_pred'][0:test_len],
                       models_path + 'test_pred.png')

def read_long_leading_time(station,
                           decomposer,
                           mode='pearson',
                           pearson_threshold=0.2,
                           wavelet_level="db10-2"):
    logger.info('reading long lead time model results...')
    logger.info('station:{}'.format(station))
    logger.info('decomposer:{}'.format(decomposer))
    logger.info('mode:{}'.format(mode))
    logger.info('pearson threshold:{}'.format(pearson_threshold))
    logger.info('wavelet level:{}'.format(wavelet_level))
    records = []
    predictions = []
    nse = []
    nrmse = []
    ppts = []
    if decomposer == 'modwt':
        m1 = read_two_stage(
            station=station,
            decomposer=decomposer,
            predict_pattern="single_hybrid_1_ahead_mi_ts0.1",
        )
    else:
        m1 = read_two_stage(
            station=station,
            decomposer=decomposer,
            predict_pattern="one_step_1_ahead_forecast_pacf",
        )
    records.append(m1['test_y'])
    predictions.append(m1['test_pred'])
    nse.append(m1['test_nse'])
    nrmse.append(m1['test_nrmse'])
    ppts.append(m1['test_ppts'])
    # Read the models for the longer lead times (the 1-month-ahead result
    # was already appended above).
    leading_times = [3, 5, 7, 9]
    for leading_time in leading_times:
        if decomposer == 'modwt':
            model_path = root_path + "\\" + station + "_" + decomposer + \
                "\\projects\\esvr-wddff\\" + wavelet_level + "\\"
        elif decomposer == "dwt":
            model_path = root_path + "\\" + station + "_" + decomposer + \
                "\\projects\\esvr\\" + wavelet_level + "\\"
        else:
            model_path = root_path + "\\" + station + "_" + decomposer + \
                "\\projects\\esvr\\"
        print("Reading mode:{}".format(mode))
        if mode == 'pacf':
            model_path = model_path + "one_step_" + str(leading_time) + \
                "_ahead_forecast_pacf//"
        elif mode == 'pearson':
            model_path = model_path + "one_step_" + str(leading_time) + \
                "_ahead_forecast_pearson" + str(pearson_threshold) + "//"
        elif mode == 'mi':
            model_path = model_path + "single_hybrid_" + str(leading_time) + \
                "_ahead_mi_ts0.1//"
        logger.info('model path:{}'.format(model_path))
        results = pd.read_csv(model_path + 'optimal_model_results.csv')
        test_pred = (results['test_pred'][0:120]).values.flatten()
        test_y = (results['test_y'][0:120]).values.flatten()
        records.append(test_y)
        predictions.append(test_pred)
        nse.append(results['test_nse'][0])
        nrmse.append(results['test_nrmse'][0])
        ppts.append(results['test_ppts'][0])
    results = {
        'records': records,
        'predictions': predictions,
        'nse': nse,
        'nrmse': nrmse,
        'ppts': ppts,
    }
    logger.info('results.records:{}'.format(pd.DataFrame(results)['records']))
    logger.info('results.predictions:{}'.format(pd.DataFrame(results)['predictions']))
    return results

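# --- Hedged usage sketch (not in the original source) ---
# 'Huaxian' and 'vmd' mirror station/decomposer names used elsewhere in
# this project; adjust them to a station/decomposer pair that exists on
# disk before running.
def _demo_read_long_leading_time():
    results = read_long_leading_time(station='Huaxian', decomposer='vmd', mode='pacf')
    return results['nse']   # NSE for lead times 1, 3, 5, 7, 9
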
import pandas as pd
import numpy as np
import math
from statistics import mean
from sklearn.decomposition import PCA
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error, mean_squared_log_error
import os

# __file__ must not be quoted: os.path.abspath('__file__') resolves relative
# to the working directory, not to this module.
root_path = os.path.dirname(os.path.abspath(__file__))
import sys
sys.path.append(root_path)
from tools.metrics_ import PPTS, mean_absolute_percentage_error
from config.globalLog import logger

logger.info('results_reader')


def read_two_stage(station,
                   decomposer,
                   predict_pattern,
                   wavelet_level="db10-2",
                   framework='WDDFF'):
    if decomposer == 'modwt':
        model_path = root_path + "\\" + station + "_" + decomposer + \
            "\\projects\\esvr-" + framework.lower() + "\\" + wavelet_level + \
            "\\" + predict_pattern + "\\"
    elif decomposer == "dwt":
        model_path = root_path + "\\" + station + "_" + decomposer + \
            "\\projects\\esvr\\" + wavelet_level + "\\" + predict_pattern + "\\"
    else:
        model_path = root_path + "\\" + station + "_" + decomposer + \
            "\\projects\\esvr\\" + predict_pattern + "\\"
    results = pd.read_csv(model_path + 'optimal_model_results.csv')
    test_pred = (results['test_pred'][0:120]).values.flatten()
    test_y = (results['test_y'][0:120]).values.flatten()
    # The body is truncated in the source. Callers such as
    # read_long_leading_time expect at least the keys below, so a minimal
    # return is restored here (an assumption; the metric values are taken
    # from the columns of optimal_model_results.csv).
    return {
        'test_y': test_y,
        'test_pred': test_pred,
        'test_nse': results['test_nse'][0],
        'test_nrmse': results['test_nrmse'][0],
        'test_ppts': results['test_ppts'][0],
    }

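# --- Hedged usage sketch (not in the original source) ---
# The arguments mirror the predict pattern used by read_long_leading_time
# above; 'Huaxian'/'vmd' are example names that must exist on disk.
def _demo_read_two_stage():
    m = read_two_stage(station='Huaxian', decomposer='vmd',
                       predict_pattern='one_step_1_ahead_forecast_pacf')
    return m['test_nse']
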
# cb_ax = fig.add_axes([0.85, 0.06, 0.05, 0.38])  # [x, y, width, height]
# cbar = fig.colorbar(im, cax=cb_ax)
# cbar.set_ticks(np.arange(0, 1.1, 0.5))
# cbar.set_label(r"$Corr_{i,j}$")
# cbar.set_ticklabels(['low', 'medium', 'high'])
# plt.savefig(graphs_path + "Fig.9.Pearson corr of Huaxian.tif", format="TIFF", dpi=1200)
# plt.savefig(graphs_path + "Fig.9.Pearson corr of Huaxian.pdf", format="PDF", dpi=1200)
# plt.show()
fig = plt.figure(figsize=(7.4861, 1.7))
for i in range(len(corrs)):
    ax = plt.subplot(1, 5, i + 1)
    ax.set_title(titles[i], fontsize=6)
    sign_num = corrs[i].shape[1]
    logger.info('Number of sub-signals:{}'.format(sign_num))
    ticks = list(range(sign_num))
    logger.info('ticks:{}'.format(ticks))
    # Build the axis labels according to the decomposer's naming convention.
    labels = []
    for j in ticks:
        if titles[i].find('VMD') >= 0:
            labels.append(r'$IMF_{' + str(j + 1) + '}$')
        elif titles[i].find('EEMD') >= 0:
            if j == sign_num - 1:
                labels.append(r'$R$')  # the EEMD residue
            else:
                labels.append(r'$IMF_{' + str(j + 1) + '}$')
        elif titles[i].find('MODWT') >= 0:
            if j == sign_num - 1:
                labels.append(r'$V_{' + str(j) + '}$')  # smooth coefficients
            else:
                # The source is truncated here; MODWT detail coefficients
                # are conventionally labeled W_j (an assumption).
                labels.append(r'$W_{' + str(j + 1) + '}$')