Example #1

import itertools as itt
import pickle
import time
from collections import OrderedDict
from datetime import datetime
from multiprocessing import Pool
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
from matplotlib import colors
from matplotlib.gridspec import GridSpec
from matplotlib.ticker import ScalarFormatter
from scipy import stats

# Project-specific names (analysis, cehq_station, extreme_commons, gevfit, infovar,
# CellManager, ExtremeProperties, ModelPoint, RunConfig, Station,
# do_gevfit_for_a_point_single_arg, get_cache_file_name, img_folder, prepare)
# are assumed to be imported from the surrounding package.
def get_return_levels_and_unc_using_bootstrap(rconfig, varname="STFL"):
    """
    return the extreme properties object result
        where result.return_lev_dict are all the return levels for a given simulation
              result.std_dict - are all the standard deviations from the bootstrap
    :param rconfig:
    :param varname:
    """
    result = ExtremeProperties()

    proc_pool = Pool(processes=20)
    all_bootstrap_indices = None

    for extr_type, months in ExtremeProperties.extreme_type_to_month_of_interest.items():

        result.return_lev_dict[extr_type] = {}
        result.std_dict[extr_type] = {}

        return_periods = ExtremeProperties.extreme_type_to_return_periods[
            extr_type].copy()

        # Do not do the calculations for the cached return periods
        cached_periods = []
        for return_period in list(return_periods):
            # Construct the name of the cache file
            cache_file = get_cache_file_name(rconfig,
                                             months=months,
                                             ret_period=return_period,
                                             extreme_type=extr_type,
                                             varname=varname)

            p = Path(cache_file)

            if p.is_file():
                cached_periods.append(return_period)
                return_periods.remove(return_period)

                with p.open("rb") as cache_f:
                    cache_levs, cache_stds = pickle.load(cache_f)
                print("Using cache from {}".format(p))

                result.return_lev_dict[extr_type][return_period] = cache_levs
                result.std_dict[extr_type][return_period] = cache_stds

        # Do not do anything if the return levels for all periods are cached
        # for this type of extreme events
        if len(return_periods) == 0:
            continue

        # 3D array of annual extremes for each grid point
        t0 = time.perf_counter()
        ext_values = analysis.get_annual_extrema(
            rconfig=rconfig,
            varname=varname,
            months_of_interest=months,
            n_avg_days=ExtremeProperties.extreme_type_to_n_agv_days[extr_type],
            high_flow=ExtremeProperties.high == extr_type)

        print("Got extreme values for {}-{} in {}s".format(
            rconfig.start_year, rconfig.end_year,
            time.clock() - t0))

        nx, ny = ext_values.shape[1:]

        result.return_lev_dict[extr_type].update(
            {k: -np.ones((nx, ny))
             for k in return_periods})
        result.std_dict[extr_type].update(
            {k: -np.ones((nx, ny))
             for k in return_periods})

        # Replace negative (presumably missing/masked) values with zeros before fitting
        ext_values = np.where(ext_values >= 0, ext_values, 0)

        if all_bootstrap_indices is None:
            # One fixed set of resampling indices, shape (nbootstrap, nyears),
            # reused for every grid point so that results are reproducible
            np.random.seed(seed=ExtremeProperties.seed)
            nyears = ext_values.shape[0]
            all_bootstrap_indices = np.array([
                np.random.randint(0, nyears, size=nyears)
                for _ in range(ExtremeProperties.nbootstrap)
            ])

        # Probably needs to be optimized ...
        for i in range(nx):
            input_data = zip(ext_values[:, i, :].transpose(),
                             itt.repeat(extr_type, ny),
                             itt.repeat(return_periods, ny),
                             itt.repeat(all_bootstrap_indices, ny))

            ret_level_and_std_pairs = proc_pool.map(
                do_gevfit_for_a_point_single_arg, input_data)

            ret_levels, std_deviations = zip(*ret_level_and_std_pairs)

            for return_period in return_periods:
                result.return_lev_dict[extr_type][return_period][i, :] = [
                    ret_levels[j][return_period] for j in range(ny)
                ]
                result.std_dict[extr_type][return_period][i, :] = [
                    std_deviations[j][return_period] for j in range(ny)
                ]

            # Show the progress
            if i % 10 == 0:
                print("progress {}/{}".format(i, nx))

        # Save the computed return levels and standard deviations to the cache file
        for return_period in return_periods:
            # Construct the name of the cache file
            cache_file = get_cache_file_name(rconfig,
                                             months=months,
                                             ret_period=return_period,
                                             extreme_type=extr_type,
                                             varname=varname)

            p = Path(cache_file)

            to_save = [
                result.return_lev_dict[extr_type][return_period],
                result.std_dict[extr_type][return_period]
            ]

            with p.open("wb") as cache_f:
                pickle.dump(to_save, cache_f)

    proc_pool.close()
    proc_pool.join()

    return result
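
A minimal usage sketch (assumptions: RunConfig is constructed as in main() below,
the HDF5 path is hypothetical, and a 10-year return period is among
ExtremeProperties.extreme_type_to_return_periods for high flows):

    rc = RunConfig(data_path="/path/to/quebec_0.1_crcm5-hcd-rl.hdf5",
                   start_year=1980, end_year=2010)
    props = get_return_levels_and_unc_using_bootstrap(rc, varname="STFL")

    # 2D fields (nx, ny) of high-flow return levels and their bootstrap stds
    levs_10y = props.return_lev_dict[ExtremeProperties.high][10]
    stds_10y = props.std_dict[ExtremeProperties.high][10]
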
def main(hdf_folder="/home/huziy/skynet3_rech1/hdf_store", start_year=1980, end_year=2010):
    prepare()

    all_markers = ["*", "s", "p", "+", "x", "d", "h"]

    excluded = ["white", "w", "aliceblue", "azure"]
    excluded.extend([ci for ci in colors.cnames if "yellow" in ci])

    all_colors = ["k", "b", "r", "g", "m"] + sorted([ci for ci in colors.cnames if ci not in excluded])

    # Station ids to get from the CEHQ database
    ids_with_lakes_upstream = [
        "104001", "093806", "093801", "081002", "081007", "080718"
    ]

    selected_ids = ids_with_lakes_upstream

    filedir = Path(hdf_folder)
    sim_name_to_file_path = OrderedDict([
        # ("CRCM5-LI", filedir.joinpath("quebec_0.1_crcm5-hcd-r.hdf5").as_posix()),

        ("ERAI-CRCM5-L", filedir.joinpath("quebec_0.1_crcm5-hcd-rl.hdf5").as_posix()),

        # ("CanESM2-CRCM5-NL", filedir.joinpath("cc-canesm2-driven/quebec_0.1_crcm5-r-cc-canesm2-1980-2010.hdf5").as_posix()),

        ("CanESM2-CRCM5-L",
         filedir.joinpath("cc-canesm2-driven/quebec_0.1_crcm5-hcd-rl-cc-canesm2-1980-2010.hdf5").as_posix()),

        # ("CanESM2-CRCM5-LI", filedir.joinpath("cc-canesm2-driven/quebec_0.1_crcm5-hcd-rl-intfl-cc-canesm2-1980-2010.hdf5").as_posix()),


    ])

    obs_label = "Obs."
    labels = [obs_label, ] + list(sim_name_to_file_path.keys())

    label_to_marker = dict(zip(labels, all_markers))
    label_to_color = dict(zip(labels, all_colors))

    # Get the list of stations to do the comparison with
    start_date = datetime(start_year, 1, 1)
    end_date = datetime(end_year, 12, 31)
    stations = cehq_station.read_station_data(
        start_date=start_date, end_date=end_date, selected_ids=selected_ids
    )

    # Get geophysical fields from one of the model simulations
    path0 = list(sim_name_to_file_path.values())[0]
    lons2d, lats2d, basemap = analysis.get_basemap_from_hdf(file_path=path0)
    flow_directions = analysis.get_array_from_file(path=path0, var_name=infovar.HDF_FLOW_DIRECTIONS_NAME)
    lake_fraction = analysis.get_array_from_file(path=path0, var_name=infovar.HDF_LAKE_FRACTION_NAME)

    accumulation_area_km2 = analysis.get_array_from_file(path=path0, var_name=infovar.HDF_ACCUMULATION_AREA_NAME)
    area_m2 = analysis.get_array_from_file(path=path0, var_name=infovar.HDF_CELL_AREA_NAME_M2)

    # Try to read cell areas in m**2; if they are not available, fall back to km**2
    if area_m2 is not None:
        cell_area_km2 = area_m2 * 1.0e-6
    else:
        cell_area_km2 = analysis.get_array_from_file(path=path0, var_name=infovar.HDF_CELL_AREA_NAME_KM2)

    # Create a cell manager if it is not provided
    cell_manager = CellManager(flow_directions, accumulation_area_km2=accumulation_area_km2,
                               lons2d=lons2d, lats2d=lats2d)

    # Get the list of the corresponding model points
    station_to_modelpoint = cell_manager.get_model_points_for_stations(
        station_list=stations,
        lake_fraction=lake_fraction,
        drainaige_area_reldiff_limit=0.1)

    # plot_utils.apply_plot_params(font_size=10, width_cm=20, height_cm=18)
    fig = plt.figure()

    ncols = max([len(rp_list) for et, rp_list in ExtremeProperties.extreme_type_to_return_periods.items()])
    nrows = len(ExtremeProperties.extreme_types)
    gs = GridSpec(nrows, ncols)

    ext_type_to_rp_to_ax = OrderedDict()
    ax_with_legend = None

    label_to_ax_to_xdata = {}
    label_to_ax_to_ydata = {}
    for row, ext_type in enumerate(ExtremeProperties.extreme_types):
        ext_type_to_rp_to_ax[ext_type] = OrderedDict()
        for col, rperiod in enumerate(ExtremeProperties.extreme_type_to_return_periods[ext_type]):
            ax = fig.add_subplot(gs[row, col])
            ext_type_to_rp_to_ax[ext_type][rperiod] = ax


            if row == nrows - 1 and col == ncols - 1:
                ax_with_legend = ax

            # Set axes labels
            if row == nrows - 1:
                ax.set_xlabel("Observations")

            if col == 0:
                ax.set_ylabel("Model")

            for label in sim_name_to_file_path:
                label_to_ax_to_xdata.setdefault(label, {})[ax] = []
                label_to_ax_to_ydata.setdefault(label, {})[ax] = []

            ax.set_xscale("log")
            ax.set_yscale("log")

    print("Initial list of stations:")

    sim_label_to_handle = {}
    for s in stations:
        print("{0}".format(s))
        assert isinstance(s, Station)

        n_complete = len([y for y in s.get_list_of_complete_years() if start_year <= y <= end_year])
        print("Complete years in {}-{}: {}".format(start_year, end_year, n_complete))
        df_ext_obs = extreme_commons.get_annual_extrema(ts_times=s.dates, ts_vals=s.values,
                                                        start_year=start_year, end_year=end_year)
        mp = station_to_modelpoint[s]

        assert isinstance(mp, ModelPoint)

        years_of_interest = df_ext_obs.index

        label_to_extrema_model = {}

        # label -> ext_type -> [return period -> ret level, return period -> std]
        label_to_return_levels = OrderedDict(
            [(obs_label, OrderedDict())]
        )
        for sim_label, sim_path in sim_name_to_file_path.items():
            label_to_return_levels[sim_label] = OrderedDict()
            label_to_extrema_model[sim_label] = OrderedDict()

        # Calculate the return levels and standard deviations
        for ext_type in ExtremeProperties.extreme_types:

            return_periods = ExtremeProperties.extreme_type_to_return_periods[ext_type]

            # fit GEV distribution and apply non-parametric bootstrap to get std
            label_to_return_levels[obs_label][ext_type] = gevfit.do_gevfit_for_a_point(df_ext_obs[ext_type].values,
                                                                                       extreme_type=ext_type,
                                                                                       return_periods=return_periods)
            return_levels_obs, rl_stds_obs = label_to_return_levels[obs_label][ext_type]

            # Get annual extrema from the model output at the points close to the stations
            for sim_label, sim_path in sim_name_to_file_path.items():

                ext_field = analysis.get_annual_extrema(
                    rconfig=RunConfig(data_path=sim_path, start_year=start_year, end_year=end_year),
                    varname="STFL", months_of_interest=ExtremeProperties.extreme_type_to_month_of_interest[ext_type],
                    n_avg_days=ExtremeProperties.extreme_type_to_n_agv_days[ext_type],
                    high_flow=ext_type == ExtremeProperties.high)

                # Select only those years when obs are available
                ts_data = [v for y, v in zip(range(start_year, end_year + 1), ext_field[:, mp.ix, mp.jy]) if
                           y in years_of_interest]
                ts_data = np.array(ts_data)
                return_levels, rl_stds = gevfit.do_gevfit_for_a_point(ts_data, extreme_type=ext_type,
                                                                      return_periods=return_periods)

                # Do the plotting
                for rp in return_periods:
                    ax = ext_type_to_rp_to_ax[ext_type][rp]
                    ax.set_title("T = {rp}-year".format(rp=rp))

                    # h = ax.errorbar(return_levels_obs[rp], return_levels[rp],
                    # marker=label_to_marker[sim_label], color=label_to_color[sim_label], label=sim_label,
                    #                 xerr=rl_stds_obs[rp] * 1.96, yerr=rl_stds[rp] * 1.96)

                    h = ax.scatter(return_levels_obs[rp], return_levels[rp],
                                   marker=label_to_marker[sim_label], color=label_to_color[sim_label], label=sim_label)

                    # Save the data for a later calculation of the correlation coefficients
                    label_to_ax_to_xdata[sim_label][ax].append(return_levels_obs[rp])
                    label_to_ax_to_ydata[sim_label][ax].append(return_levels[rp])

                    sim_label_to_handle[sim_label] = h

    # Calculate the biases
    for sim_label in sim_name_to_file_path:
        for ext_type in ExtremeProperties.extreme_types:
            ret_periods = ExtremeProperties.extreme_type_to_return_periods[ext_type]
            for rp in ret_periods:

                ax = ext_type_to_rp_to_ax[ext_type][rp]
                mod = np.asarray(label_to_ax_to_ydata[sim_label][ax])
                obs = np.asarray(label_to_ax_to_xdata[sim_label][ax])

                bias = np.mean((mod - obs)/obs)
                corr, pv = stats.pearsonr(mod, obs)
                print("({sim_label}) Mean bias for {rp}-year {ext_type}-flow return level is: {bias}; corr={corr:.2f}; corr_pval={corr_pval:2g}".format(
                    sim_label=sim_label, rp=rp, bias=bias, corr=corr, corr_pval=pv,
                    ext_type=ext_type
                ))




    sfmt = ScalarFormatter(useMathText=True)
    sfmt.set_powerlimits((-2, 2))
    for et, rp_to_ax in ext_type_to_rp_to_ax.items():
        for rp, ax in rp_to_ax.items():
            xmin, xmax = ax.get_xlim()
            ymin, ymax = ax.get_ylim()
            # Draw the 1:1 reference line spanning the full range of both axes
            x1 = min(xmin, ymin)
            x2 = max(xmax, ymax)
            ax.plot([x1, x2], [x1, x2], "k--")
            # ax.xaxis.set_major_locator(MaxNLocator(nbins=5))
            # ax.yaxis.set_major_locator(MaxNLocator(nbins=5))
            # ax.xaxis.set_major_formatter(sfmt)
            # ax.yaxis.set_major_formatter(sfmt)

    sim_labels = list(sim_name_to_file_path.keys())
    ax_with_legend.legend([sim_label_to_handle[sl] for sl in sim_labels], sim_labels,
                          bbox_to_anchor=(1, -0.25), borderaxespad=0.0, loc="upper right",
                          ncol=2, scatterpoints=1, numpoints=1)

    # Save the plot
    img_file = "{}.eps".format("_".join(sorted(label_to_marker.keys())))
    img_file = img_folder.joinpath(img_file)

    fig.tight_layout()
    with img_file.open("wb") as f:
        fig.savefig(f, bbox_inches="tight")