def get_return_levels_and_unc_using_bootstrap(rconfig, varname="STFL"):
    """
    Compute GEV-based return levels and their bootstrap uncertainties.

    Returns an ExtremeProperties object ``result`` where
    ``result.return_lev_dict[extr_type][return_period]`` is a 2D (nx, ny)
    field of return levels for the given simulation and
    ``result.std_dict[extr_type][return_period]`` holds the corresponding
    standard deviations estimated by non-parametric bootstrap.

    Results are cached on disk per (months, return period, extreme type,
    varname); cached periods are loaded instead of recomputed.

    :param rconfig: run configuration (data path, start/end years, ...)
    :param varname: variable to analyze (default "STFL" -- streamflow)
    """
    result = ExtremeProperties()

    proc_pool = Pool(processes=20)
    all_bootstrap_indices = None

    try:
        for extr_type, months in ExtremeProperties.extreme_type_to_month_of_interest.items():
            result.return_lev_dict[extr_type] = {}
            result.std_dict[extr_type] = {}

            return_periods = ExtremeProperties.extreme_type_to_return_periods[extr_type].copy()

            # Load cached return periods and drop them from the to-do list
            for return_period in list(return_periods):
                cache_file = get_cache_file_name(rconfig, months=months,
                                                 ret_period=return_period,
                                                 extreme_type=extr_type,
                                                 varname=varname)
                p = Path(cache_file)
                if p.is_file():
                    return_periods.remove(return_period)
                    with p.open("rb") as f:
                        cache_levs, cache_stds = pickle.load(f)
                    print("Using cache from {}".format(str(p)))
                    result.return_lev_dict[extr_type][return_period] = cache_levs
                    result.std_dict[extr_type][return_period] = cache_stds

            # Everything for this extreme type was cached -- nothing to do
            if len(return_periods) == 0:
                continue

            # 3D array (year, x, y) of annual extremes for each grid point
            t0 = time.perf_counter()
            ext_values = analysis.get_annual_extrema(
                rconfig=rconfig, varname=varname, months_of_interest=months,
                n_avg_days=ExtremeProperties.extreme_type_to_n_agv_days[extr_type],
                high_flow=ExtremeProperties.high == extr_type)
            print("Got extreme values for {}-{} in {}s".format(
                rconfig.start_year, rconfig.end_year, time.perf_counter() - t0))

            nx, ny = ext_values.shape[1:]

            # Initialize output fields with -1 so unfilled cells stay visible
            result.return_lev_dict[extr_type].update(
                {k: -np.ones((nx, ny)) for k in return_periods})
            result.std_dict[extr_type].update(
                {k: -np.ones((nx, ny)) for k in return_periods})

            # Negative values mark missing/masked data -- clamp them to 0
            ext_values = np.where(ext_values >= 0, ext_values, 0)

            # Generate the bootstrap resampling indices once (fixed seed for
            # reproducibility) and reuse them for all extreme types / points.
            # np.random.randint(0, n, n) is the documented replacement for the
            # removed np.random.random_integers(0, n - 1, n) and yields the
            # identical sequence for the same seed.
            if all_bootstrap_indices is None:
                np.random.seed(seed=ExtremeProperties.seed)
                nyears = ext_values.shape[0]
                all_bootstrap_indices = np.array([
                    np.random.randint(0, nyears, nyears)
                    for _ in range(ExtremeProperties.nbootstrap)
                ])

            # Fit the GEV distribution row by row, parallelizing over y
            for i in range(nx):
                input_data = zip(ext_values[:, i, :].transpose(),
                                 itt.repeat(extr_type, ny),
                                 itt.repeat(return_periods, ny),
                                 itt.repeat(all_bootstrap_indices, ny))

                ret_level_and_std_pairs = proc_pool.map(
                    do_gevfit_for_a_point_single_arg, input_data)

                ret_levels, std_deviations = zip(*ret_level_and_std_pairs)

                for return_period in return_periods:
                    result.return_lev_dict[extr_type][return_period][i, :] = [
                        ret_levels[j][return_period] for j in range(ny)
                    ]
                    result.std_dict[extr_type][return_period][i, :] = [
                        std_deviations[j][return_period] for j in range(ny)
                    ]

                # Show the progress
                if i % 10 == 0:
                    print("progress {}/{}".format(i, nx))

            # Save the computed return levels and standard deviations to the
            # cache.  varname is passed here as well so the cache is written
            # under the same key it is looked up with above (the original
            # omitted it, producing a read/write cache-key mismatch).
            for return_period in return_periods:
                cache_file = get_cache_file_name(rconfig, months=months,
                                                 ret_period=return_period,
                                                 extreme_type=extr_type,
                                                 varname=varname)
                p = Path(cache_file)
                to_save = [
                    result.return_lev_dict[extr_type][return_period],
                    result.std_dict[extr_type][return_period]
                ]
                with p.open("wb") as f:
                    pickle.dump(to_save, f)
    finally:
        # Release the worker processes instead of leaking them
        proc_pool.close()
        proc_pool.join()

    return result
# NOTE(review): this definition duplicates an identical function earlier in
# the file; Python keeps the last one defined.  Consider removing one copy.
def get_return_levels_and_unc_using_bootstrap(rconfig, varname="STFL"):
    """
    Compute GEV-based return levels and their bootstrap uncertainties.

    Returns an ExtremeProperties object ``result`` where
    ``result.return_lev_dict[extr_type][return_period]`` is a 2D (nx, ny)
    field of return levels for the given simulation and
    ``result.std_dict[extr_type][return_period]`` holds the corresponding
    standard deviations estimated by non-parametric bootstrap.

    Results are cached on disk per (months, return period, extreme type,
    varname); cached periods are loaded instead of recomputed.

    :param rconfig: run configuration (data path, start/end years, ...)
    :param varname: variable to analyze (default "STFL" -- streamflow)
    """
    result = ExtremeProperties()

    proc_pool = Pool(processes=20)
    all_bootstrap_indices = None

    try:
        for extr_type, months in ExtremeProperties.extreme_type_to_month_of_interest.items():
            result.return_lev_dict[extr_type] = {}
            result.std_dict[extr_type] = {}

            return_periods = ExtremeProperties.extreme_type_to_return_periods[extr_type].copy()

            # Load cached return periods and drop them from the to-do list
            for return_period in list(return_periods):
                cache_file = get_cache_file_name(rconfig, months=months,
                                                 ret_period=return_period,
                                                 extreme_type=extr_type,
                                                 varname=varname)
                p = Path(cache_file)
                if p.is_file():
                    return_periods.remove(return_period)
                    with p.open("rb") as f:
                        cache_levs, cache_stds = pickle.load(f)
                    print("Using cache from {}".format(str(p)))
                    result.return_lev_dict[extr_type][return_period] = cache_levs
                    result.std_dict[extr_type][return_period] = cache_stds

            # Everything for this extreme type was cached -- nothing to do
            if len(return_periods) == 0:
                continue

            # 3D array (year, x, y) of annual extremes for each grid point
            t0 = time.perf_counter()
            ext_values = analysis.get_annual_extrema(
                rconfig=rconfig, varname=varname, months_of_interest=months,
                n_avg_days=ExtremeProperties.extreme_type_to_n_agv_days[extr_type],
                high_flow=ExtremeProperties.high == extr_type)
            print("Got extreme values for {}-{} in {}s".format(
                rconfig.start_year, rconfig.end_year, time.perf_counter() - t0))

            nx, ny = ext_values.shape[1:]

            # Initialize output fields with -1 so unfilled cells stay visible
            result.return_lev_dict[extr_type].update(
                {k: -np.ones((nx, ny)) for k in return_periods})
            result.std_dict[extr_type].update(
                {k: -np.ones((nx, ny)) for k in return_periods})

            # Negative values mark missing/masked data -- clamp them to 0
            ext_values = np.where(ext_values >= 0, ext_values, 0)

            # Generate the bootstrap resampling indices once (fixed seed for
            # reproducibility) and reuse them for all extreme types / points.
            # np.random.randint(0, n, n) is the documented replacement for the
            # removed np.random.random_integers(0, n - 1, n) and yields the
            # identical sequence for the same seed.
            if all_bootstrap_indices is None:
                np.random.seed(seed=ExtremeProperties.seed)
                nyears = ext_values.shape[0]
                all_bootstrap_indices = np.array([
                    np.random.randint(0, nyears, nyears)
                    for _ in range(ExtremeProperties.nbootstrap)
                ])

            # Fit the GEV distribution row by row, parallelizing over y
            for i in range(nx):
                input_data = zip(ext_values[:, i, :].transpose(),
                                 itt.repeat(extr_type, ny),
                                 itt.repeat(return_periods, ny),
                                 itt.repeat(all_bootstrap_indices, ny))

                ret_level_and_std_pairs = proc_pool.map(
                    do_gevfit_for_a_point_single_arg, input_data)

                ret_levels, std_deviations = zip(*ret_level_and_std_pairs)

                for return_period in return_periods:
                    result.return_lev_dict[extr_type][return_period][i, :] = [
                        ret_levels[j][return_period] for j in range(ny)
                    ]
                    result.std_dict[extr_type][return_period][i, :] = [
                        std_deviations[j][return_period] for j in range(ny)
                    ]

                # Show the progress
                if i % 10 == 0:
                    print("progress {}/{}".format(i, nx))

            # Save the computed return levels and standard deviations to the
            # cache.  varname is passed here as well so the cache is written
            # under the same key it is looked up with above (the original
            # omitted it, producing a read/write cache-key mismatch).
            for return_period in return_periods:
                cache_file = get_cache_file_name(rconfig, months=months,
                                                 ret_period=return_period,
                                                 extreme_type=extr_type,
                                                 varname=varname)
                p = Path(cache_file)
                to_save = [
                    result.return_lev_dict[extr_type][return_period],
                    result.std_dict[extr_type][return_period]
                ]
                with p.open("wb") as f:
                    pickle.dump(to_save, f)
    finally:
        # Release the worker processes instead of leaking them
        proc_pool.close()
        proc_pool.join()

    return result
def main(hdf_folder="/home/huziy/skynet3_rech1/hdf_store", start_year=1980, end_year=2010):
    """
    Compare modelled vs. observed streamflow return levels at selected stations.

    For each extreme type and return period, fits a GEV distribution to the
    observed (CEHQ station) and simulated annual extremes, scatter-plots
    model vs. observation on log-log axes, prints mean bias and Pearson
    correlation per simulation, and saves the figure as an EPS file.

    :param hdf_folder: directory containing the simulation hdf5 files
    :param start_year: first year of the comparison period (inclusive)
    :param end_year: last year of the comparison period (inclusive)
    """
    prepare()

    # One marker per label (obs + simulations), assigned by position below
    all_markers = ["*", "s", "p", "+", "x", "d", "h"]

    # Colors too light to read on a white background are excluded
    excluded = ["white", "w", "aliceblue", "azure"]
    excluded.extend([ci for ci in colors.cnames if "yellow" in ci])

    all_colors = ["k", "b", "r", "g", "m"] + sorted([ci for ci in colors.cnames if ci not in excluded])

    # Station ids to get from the CEHQ database
    ids_with_lakes_upstream = [
        "104001", "093806", "093801", "081002", "081007", "080718"
    ]

    selected_ids = ids_with_lakes_upstream

    filedir = Path(hdf_folder)
    # Label -> path of the simulation output file (commented entries are
    # configurations deliberately left out of this comparison)
    sim_name_to_file_path = OrderedDict([
        # ("CRCM5-LI", filedir.joinpath("quebec_0.1_crcm5-hcd-r.hdf5").as_posix()),

        ("ERAI-CRCM5-L", filedir.joinpath("quebec_0.1_crcm5-hcd-rl.hdf5").as_posix()),

        # ("CanESM2-CRCM5-NL", filedir.joinpath("cc-canesm2-driven/quebec_0.1_crcm5-r-cc-canesm2-1980-2010.hdf5").as_posix()),

        ("CanESM2-CRCM5-L", filedir.joinpath("cc-canesm2-driven/quebec_0.1_crcm5-hcd-rl-cc-canesm2-1980-2010.hdf5").as_posix()),

        # ("CanESM2-CRCM5-LI", filedir.joinpath("cc-canesm2-driven/quebec_0.1_crcm5-hcd-rl-intfl-cc-canesm2-1980-2010.hdf5").as_posix()),
    ])

    obs_label = "Obs."
    labels = [obs_label, ] + list(sim_name_to_file_path.keys())

    label_to_marker = dict(zip(labels, all_markers))
    label_to_color = dict(zip(labels, all_colors))

    # Get the list of stations to do the comparison with
    start_date = datetime(start_year, 1, 1)
    end_date = datetime(end_year, 12, 31)

    stations = cehq_station.read_station_data(
        start_date=start_date, end_date=end_date, selected_ids=selected_ids
    )

    # Get geophysical fields from one of the model simulations
    path0 = list(sim_name_to_file_path.values())[0]
    lons2d, lats2d, basemap = analysis.get_basemap_from_hdf(file_path=path0)
    flow_directions = analysis.get_array_from_file(path=path0, var_name=infovar.HDF_FLOW_DIRECTIONS_NAME)
    lake_fraction = analysis.get_array_from_file(path=path0, var_name=infovar.HDF_LAKE_FRACTION_NAME)

    accumulation_area_km2 = analysis.get_array_from_file(path=path0, var_name=infovar.HDF_ACCUMULATION_AREA_NAME)
    area_m2 = analysis.get_array_from_file(path=path0, var_name=infovar.HDF_CELL_AREA_NAME_M2)

    # Try to read cell areas im meters if it is not Ok then try in km2
    if area_m2 is not None:
        cell_area_km2 = area_m2 * 1.0e-6
    else:
        cell_area_km2 = analysis.get_array_from_file(path=path0, var_name=infovar.HDF_CELL_AREA_NAME_KM2)

    # Create a cell manager if it is not provided
    cell_manager = CellManager(flow_directions, accumulation_area_km2=accumulation_area_km2,
                               lons2d=lons2d, lats2d=lats2d)

    # Get the list of the corresponding model points
    # (stations whose drainage area differs from the model cell by more than
    # 10% are filtered out by the helper)
    station_to_modelpoint = cell_manager.get_model_points_for_stations(
        station_list=stations, lake_fraction=lake_fraction, drainaige_area_reldiff_limit=0.1)

    # plot_utils.apply_plot_params(font_size=10, width_cm=20, height_cm=18)
    fig = plt.figure()

    # Subplot grid: one row per extreme type, one column per return period
    ncols = max([len(rp_list) for et, rp_list in ExtremeProperties.extreme_type_to_return_periods.items()])
    nrows = len(ExtremeProperties.extreme_types)
    gs = GridSpec(nrows, ncols)

    ext_type_to_rp_to_ax = OrderedDict()
    ax_with_legend = None

    # label -> axes -> list of per-station values (x = obs, y = model)
    label_to_ax_to_xdata = {}
    label_to_ax_to_ydata = {}
    for row, ext_type in enumerate(ExtremeProperties.extreme_types):
        ext_type_to_rp_to_ax[ext_type] = OrderedDict()
        for col, rperiod in enumerate(ExtremeProperties.extreme_type_to_return_periods[ext_type]):
            ax = fig.add_subplot(gs[row, col])
            ext_type_to_rp_to_ax[ext_type][rperiod] = ax

            if col == 0:
                ax.set_ylabel(ext_type)

            # The legend is attached to the bottom-right panel
            if row == nrows - 1 and col == ncols - 1:
                ax_with_legend = ax

            # Set axes labels
            if row == nrows - 1:
                ax.set_xlabel("Observations")

            # NOTE(review): overrides the ext_type ylabel set above for col 0
            if col == 0:
                ax.set_ylabel("Model")

            for label in sim_name_to_file_path:
                if label not in label_to_ax_to_xdata:
                    label_to_ax_to_xdata[label] = {ax: []}
                    label_to_ax_to_ydata[label] = {ax: []}
                else:
                    label_to_ax_to_xdata[label][ax] = []
                    label_to_ax_to_ydata[label][ax] = []

            ax.set_xscale("log")
            ax.set_yscale("log")

    print("Initial list of stations:")

    # sim label -> a matplotlib handle, used for building the legend
    sim_label_to_handle = {}
    for s in stations:
        print("{0}".format(s))
        assert isinstance(s, Station)

        print(len([y for y in s.get_list_of_complete_years() if start_year <= y <= end_year]))
        df_ext_obs = extreme_commons.get_annual_extrema(ts_times=s.dates, ts_vals=s.values,
                                                        start_year=start_year, end_year=end_year)
        mp = station_to_modelpoint[s]
        assert isinstance(mp, ModelPoint)

        # presumably the index holds the years with valid observations --
        # TODO confirm against extreme_commons.get_annual_extrema
        years_of_interest = df_ext_obs.index

        label_to_extrema_model = {}

        # label -> ext_type -> [return period -> ret level, return period -> std]
        label_to_return_levels = OrderedDict(
            [(obs_label, OrderedDict())]
        )
        for sim_label, sim_path in sim_name_to_file_path.items():
            label_to_return_levels[sim_label] = OrderedDict()
            label_to_extrema_model[sim_label] = OrderedDict()

        # Calculate the return levels and standard deviations
        for ext_type in ExtremeProperties.extreme_types:

            return_periods = ExtremeProperties.extreme_type_to_return_periods[ext_type]

            # fit GEV distribution and apply non-parametric bootstrap to get std
            label_to_return_levels[obs_label][ext_type] = gevfit.do_gevfit_for_a_point(df_ext_obs[ext_type].values,
                                                                                       extreme_type=ext_type,
                                                                                       return_periods=return_periods)
            return_levels_obs, rl_stds_obs = label_to_return_levels[obs_label][ext_type]

            # get annual extremas for the model output at the points colose to the stations
            for sim_label, sim_path in sim_name_to_file_path.items():
                label_to_return_levels[sim_label] = OrderedDict()

                ext_field = analysis.get_annual_extrema(
                    rconfig=RunConfig(data_path=sim_path, start_year=start_year, end_year=end_year),
                    varname="STFL",
                    months_of_interest=ExtremeProperties.extreme_type_to_month_of_interest[ext_type],
                    n_avg_days=ExtremeProperties.extreme_type_to_n_agv_days[ext_type],
                    high_flow=ext_type == ExtremeProperties.high)

                # Select only those years when obs are available
                ts_data = [v for y, v in zip(range(start_year, end_year + 1), ext_field[:, mp.ix, mp.jy])
                           if y in years_of_interest]
                ts_data = np.array(ts_data)

                return_levels, rl_stds = gevfit.do_gevfit_for_a_point(ts_data, extreme_type=ext_type,
                                                                      return_periods=return_periods)

                # Do the plotting
                for rp in return_periods:
                    ax = ext_type_to_rp_to_ax[ext_type][rp]
                    ax.set_title("T = {rp}-year".format(rp=rp))

                    # h = ax.errorbar(return_levels_obs[rp], return_levels[rp],
                    #                 marker=label_to_marker[sim_label], color=label_to_color[sim_label], label=sim_label,
                    #                 xerr=rl_stds_obs[rp] * 1.96, yerr=rl_stds[rp] * 1.96)

                    h = ax.scatter(return_levels_obs[rp], return_levels[rp],
                                   marker=label_to_marker[sim_label], color=label_to_color[sim_label],
                                   label=sim_label)

                    # save the data for maybe further calculation of the correlation coefficients
                    label_to_ax_to_xdata[sim_label][ax].append(return_levels_obs[rp])
                    label_to_ax_to_ydata[sim_label][ax].append(return_levels[rp])

                    sim_label_to_handle[sim_label] = h

    # Calculate the biases
    for sim_label in sim_name_to_file_path:
        for ext_type in ExtremeProperties.extreme_types:
            ret_periods = ExtremeProperties.extreme_type_to_return_periods[ext_type]
            for rp in ret_periods:
                ax = ext_type_to_rp_to_ax[ext_type][rp]
                # One value per station accumulated during plotting above
                mod = np.asarray(label_to_ax_to_ydata[sim_label][ax])
                obs = np.asarray(label_to_ax_to_xdata[sim_label][ax])

                # Mean relative bias of model return levels w.r.t. observations
                bias = np.mean((mod - obs)/obs)
                corr, pv = stats.pearsonr(mod, obs)
                print("({sim_label}) Mean bias for {rp}-year {ext_type}-flow return level is: {bias}; corr={corr:.2f}; corr_pval={corr_pval:2g}".format(
                    sim_label=sim_label, rp=rp, bias=bias, corr=corr, corr_pval=pv, ext_type=ext_type
                ))

    sfmt = ScalarFormatter(useMathText=True)
    sfmt.set_powerlimits((-2, 2))
    # Draw a dashed 1:1 reference line on every panel
    for et, rp_to_ax in ext_type_to_rp_to_ax.items():
        for rp, ax in rp_to_ax.items():
            xmin, xmax = ax.get_xlim()
            ymin, ymax = ax.get_ylim()

            x1 = min(xmin, ymin)
            x2 = min(xmax, ymax)

            ax.plot([x1, x2], [x1, x2], "k--")
            # ax.xaxis.set_major_locator(MaxNLocator(nbins=5))
            # ax.yaxis.set_major_locator(MaxNLocator(nbins=5))
            # ax.xaxis.set_major_formatter(sfmt)
            # ax.yaxis.set_major_formatter(sfmt)

    sim_labels = list(sim_name_to_file_path.keys())
    ax_with_legend.legend([sim_label_to_handle[sl] for sl in sim_labels], sim_labels,
                          bbox_to_anchor=(1, -0.25), borderaxespad=0.0, loc="upper right",
                          ncol=2, scatterpoints=1, numpoints=1)

    # Save the plot
    img_file = "{}.eps".format("_".join(sorted(label_to_marker.keys())))
    img_file = img_folder.joinpath(img_file)

    fig.tight_layout()
    with img_file.open("wb") as f:
        fig.savefig(f, bbox_inches="tight")