Example #1
def plot_cats_best_prms_1d(dbs_dir, n_cpus):
    '''Plot every best kfold parameter set for all catchments.'''

    cats_dbs = glob(os.path.join(dbs_dir, 'cat_*.hdf5'))

    assert cats_dbs
    n_cats = len(cats_dbs)
    n_cpus = min(n_cats, n_cpus)

    cats_paths_gen = (cat_db for cat_db in cats_dbs)

    if (n_cpus > 1) and (n_cats > 1):
        mp_pool = ProcessPool(n_cpus)
        mp_pool.restart(True)

        print(list(mp_pool.uimap(plot_cat_best_prms_1d, cats_paths_gen)))

        mp_pool.clear()
        mp_pool.close()
        mp_pool.join()

    else:
        for cat_paths in cats_paths_gen:
            plot_cat_best_prms_1d(cat_paths)

    return
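The pool lifecycle in Example #1 (ProcessPool, restart, uimap, clear/close/join, with a serial fallback) recurs in most of the examples below. A minimal sketch of that pattern as a reusable helper, assuming only that pathos is installed and that the worker is a picklable single-argument callable (run_pool is a hypothetical name, not part of the original code):

from pathos.multiprocessing import ProcessPool


def run_pool(worker, args, n_cpus):
    # Hypothetical helper mirroring the pattern above.
    args = list(args)
    n_cpus = min(n_cpus, len(args))

    if (n_cpus > 1) and (len(args) > 1):
        mp_pool = ProcessPool(n_cpus)
        mp_pool.restart(True)

        # uimap yields results in arbitrary order; list() forces completion.
        results = list(mp_pool.uimap(worker, args))

        mp_pool.clear()
        mp_pool.close()
        mp_pool.join()

    else:
        results = [worker(arg) for arg in args]

    return results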
Example #2
def plot_cats_prm_vecs(dbs_dir, n_cpus):
    '''Plot final parameter set from kfold for every catchment along with
    objective function value distribution.
    '''

    cats_dbs = glob(os.path.join(dbs_dir, 'cat_*.hdf5'))

    assert cats_dbs

    n_cats = len(cats_dbs)
    n_cpus = min(n_cats, n_cpus)

    opt_res_gen = (cat_db for cat_db in cats_dbs)

    if (n_cpus > 1) and (n_cats > 1):
        mp_pool = ProcessPool(n_cpus)
        mp_pool.restart(True)

        print(list(mp_pool.uimap(plot_cat_prm_vecs, opt_res_gen)))

        mp_pool.clear()
        mp_pool.close()
        mp_pool.join()

    else:
        for opt_res in opt_res_gen:
            plot_cat_prm_vecs(opt_res)

    return
Example #3
    def _prep_anomaly_bjs_mp(anoms_arr, bjs_arr, n_cpus, fig_out_dir):

        assert anoms_arr.shape == bjs_arr.shape

        _idxs = ret_mp_idxs(anoms_arr.shape[1], n_cpus)
        _idxs_list = [_idxs[i:i + 2] for i in range(n_cpus)]

        _anoms_gen = ((anoms_arr[:, _idxs_list[i][0]:_idxs_list[i][1]])
                      for i in range(n_cpus))

        _bjs_gen = ((bjs_arr[:, _idxs_list[i][0]:_idxs_list[i][1]])
                    for i in range(n_cpus))

        mp_pool = ProcessPool(n_cpus)
        mp_pool.restart(True)

        try:
            print(
                list(
                    mp_pool.uimap(Anomaly._plot_anomaly_bjs_cdf, _idxs_list,
                                  _anoms_gen, _bjs_gen,
                                  [fig_out_dir] * n_cpus)))

            mp_pool.clear()

        except Exception as msg:
            mp_pool.close()
            mp_pool.join()
            print('Error in _plot_anomaly_bjs_cdf:', msg)
        return
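ret_mp_idxs is not shown in this snippet; from its use above it appears to return n_cpus + 1 monotonically increasing boundary indices that split a length (here anoms_arr.shape[1]) into roughly equal chunks, so that _idxs[i:i + 2] yields the (start, stop) slice for worker i. A hedged reimplementation under that assumption:

import numpy as np


def ret_mp_idxs(n_vals, n_cpus):
    # Assumed behaviour: n_cpus + 1 boundaries covering [0, n_vals].
    return np.linspace(0, n_vals, n_cpus + 1, endpoint=True, dtype=np.int64)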
Example #4
def plot_cats_kfold_effs(dbs_dir, hgs_db_path, compare_ann_cyc_flag, n_cpus):
    '''Plot the k-fold efficiency results.'''

    cats_dbs = glob(os.path.join(dbs_dir, 'cat_*.hdf5'))

    assert cats_dbs

    n_cats = len(cats_dbs)
    n_cpus = min(n_cats, n_cpus)

    const_args = (compare_ann_cyc_flag, hgs_db_path)
    cats_paths_gen = ((cat_db, const_args) for cat_db in cats_dbs)

    if (n_cpus > 1) and (n_cats > 1):
        mp_pool = ProcessPool(n_cpus)
        mp_pool.restart(True)

        print(list(mp_pool.uimap(plot_cat_kfold_effs, cats_paths_gen)))

        mp_pool.clear()
        mp_pool.close()
        mp_pool.join()

    else:
        for cat_paths in cats_paths_gen:
            plot_cat_kfold_effs(cat_paths)

    return
Example #5
def plot_cats_vars_errors(dbs_dir, err_var_labs, n_cpus):

    cats_dbs = glob(os.path.join(dbs_dir, 'cat_*.hdf5'))

    assert cats_dbs

    n_cats = len(cats_dbs)
    n_cpus = min(n_cats, n_cpus)

    cats_paths_gen = ((cat_db, err_var_labs) for cat_db in cats_dbs)

    if (n_cpus > 1) and (n_cats > 1):
        mp_pool = ProcessPool(n_cpus)
        mp_pool.restart(True)

        print(list(mp_pool.uimap(plot_cat_vars_errors, cats_paths_gen)))

        mp_pool.clear()
        mp_pool.close()
        mp_pool.join()

    else:
        for cat_paths in cats_paths_gen:
            plot_cat_vars_errors(cat_paths)

    return
Example #6
def plot_cats_qsims(dbs_dir, n_cpus=1):
    '''Plot discharge simulations for every catchment for every
    kfold using its prm_vecs.'''

    cats_dbs = glob(os.path.join(dbs_dir, 'cat_*.hdf5'))

    assert cats_dbs

    n_cats = len(cats_dbs)
    n_cpus = min(n_cats, n_cpus)

    plot_gen = (cat_db for cat_db in cats_dbs)

    if (n_cpus > 1) and (n_cats > 1):
        mp_pool = ProcessPool(n_cpus)
        mp_pool.restart(True)

        print(list(mp_pool.uimap(plot_cat_qsims, plot_gen)))

        mp_pool.clear()
        mp_pool.close()
        mp_pool.join()

    else:
        for plot_args in plot_gen:
            plot_cat_qsims(plot_args)

    return
Example #7
def pairs_construction(seqs: List[List[Union[str, int]]], window_size: int = 2,
                       drop_duplicates: bool = True,
                       n_jobs: int = 4, **kwargs):
    """
    Helper function to make pairs from sequences in parallel
    Parameters
    ----------
    seqs : input sequences of nodes
    window_size : int, default is 2
    drop_duplicates : bool, default is True
        Delete pairs where both elements are the same
    n_jobs : int, default is 4
        Number of workers to be created in parallel pool

    Returns
    -------
    List of pairs of nodes as <cur_vertex, context_vertex>
    """
    set_new_config(window_size=window_size, **kwargs)
    local_logger = logging.getLogger(f"{__name__}")
    max_processes = max(n_jobs, os.cpu_count())
    pairs_pool = ProcessPool(nodes=max_processes)
    pairs_pool.terminate()
    pairs_pool.restart()
    local_logger.info("Started making pairs from the sequences.")
    pairs = pairs_pool.map(_make_pairs, seqs)
    local_logger.info(f"Total number of raw sampled pairs is {len(pairs)}")
    if drop_duplicates:
        pairs = [item for sublist in pairs for item in sublist if item[0] != item[1]]
    else:
        pairs = [item for sublist in pairs for item in sublist]
    pairs = [item for item in pairs if (item[0] != -3) & (item[1] != -3)]
    pairs_pool.terminate()
    pairs_pool.restart()
    return pairs
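_make_pairs is defined elsewhere; it receives each sequence as its only argument, with the window size taken from the module-level CONFIG set by set_new_config. A purely illustrative sketch of what such a pair maker might look like (hypothetical, with window_size passed explicitly for clarity):

def _make_pairs(seq, window_size=2):
    # Hypothetical: emit (cur_vertex, context_vertex) for every neighbour
    # within window_size positions on either side of each element.
    pairs = []
    for i, cur in enumerate(seq):
        lo = max(0, i - window_size)
        hi = min(len(seq), i + window_size + 1)
        for j in range(lo, hi):
            if j != i:
                pairs.append((cur, seq[j]))
    return pairs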
Example #8
def plot_cats_hbv_sim(dbs_dir,
                      water_bal_step_size,
                      full_flag=False,
                      wat_bal_flag=False,
                      show_warm_up_steps_flag=False,
                      n_cpus=1):
    '''Plot hbv simulations for every catchment for every kfold.'''

    cats_dbs = glob(os.path.join(dbs_dir, 'cat_*.hdf5'))

    assert cats_dbs

    n_cats = len(cats_dbs)
    n_cpus = min(n_cats, n_cpus)

    const_args = (water_bal_step_size, full_flag, wat_bal_flag,
                  show_warm_up_steps_flag)

    plot_gen = ((cat_db, const_args) for cat_db in cats_dbs)

    if (n_cpus > 1) and (n_cats > 1):
        mp_pool = ProcessPool(n_cpus)
        mp_pool.restart(True)

        print(list(mp_pool.uimap(plot_cat_hbv_sim, plot_gen)))

        mp_pool.clear()
        mp_pool.close()
        mp_pool.join()

    else:
        for plot_args in plot_gen:
            plot_cat_hbv_sim(plot_args)

    return
Example #9
def parallelize_simulations(simulation_execs: List[Callable],
                            var_dict_list: List[VarDictType],
                            states_lists: List[StatesListsType],
                            configs_structs: List[ConfigsType],
                            env_processes_list: List[EnvProcessesType],
                            Ts: List[range], SimIDs, Ns: List[int],
                            ExpIDs: List[int], SubsetIDs, SubsetWindows,
                            configured_n):

    print(f'Execution Mode: parallelized')
    params = list(
        zip(simulation_execs, var_dict_list, states_lists, configs_structs,
            env_processes_list, Ts, SimIDs, Ns, SubsetIDs, SubsetWindows))

    len_configs_structs = len(configs_structs)

    unique_runs = Counter(SimIDs)
    sim_count = max(unique_runs.values())
    highest_divisor = int(len_configs_structs / sim_count)

    new_configs_structs, new_params = [], []
    for count in range(len(params)):
        if count == 0:
            new_params.append(params[count:highest_divisor])
            new_configs_structs.append(configs_structs[count:highest_divisor])
        elif count > 0:
            new_params.append(params[count * highest_divisor:(count + 1) *
                                     highest_divisor])
            new_configs_structs.append(
                configs_structs[count * highest_divisor:(count + 1) *
                                highest_divisor])

    def threaded_executor(params):
        if len_configs_structs > 1:
            tp = TPool()
            results = tp.map(
                lambda t: t[0](t[1], t[2], t[3], t[4], t[5], t[6], t[7], t[8],
                               t[9], configured_n), params)
            tp.close()
        else:
            t = params[0]
            results = t[0](t[1], t[2], t[3], t[4], t[5], t[6], t[7], t[8],
                           t[9], configured_n)
        return results

    pp = PPool()
    results = flatten(
        list(pp.map(lambda params: threaded_executor(params), new_params)))
    # results = flatten(list(map(lambda params: threaded_executor(params), new_params)))
    pp.close()
    pp.join()
    pp.clear()
    pp.restart()

    return results
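flatten is not defined in this snippet; given how the per-process result lists are combined, it is presumably a one-level list flattener. A sketch under that assumption:

def flatten(list_of_lists):
    # Assumed helper: collapse one level of nesting into a single list.
    return [item for sublist in list_of_lists for item in sublist]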
Example #10
    def threaded_contents_to_text(
        content_series,
        processes=None,
        none_content='raise',
    ):
        """Threaded version of content_to_text method

        It takes as input a series whose index is the uid of the products,
        and the values are the content (in the form of bytes) of the
        documents.
        processes argument is the number of processes to launch. If omitted,
        it defaults to the number of cpu cores on the machine.
        none_content arg can be 'raise' (default) or 'to_empty'
        """
        processer = partial(
            PDFDecoder.content_to_text,
            none_content=none_content,
        )
        processes = processes if processes else cpu_count()
        print(f'Launching {processes} processes.')
        in_ds = content_series.apply(BytesIO)

        # Pool with context manager does not seem to work due to issue 38501 of
        # standard python library. It hangs when running tests through pytest
        # see: https://bugs.python.org/issue38501
        # Below content should be tested again whenever this issue is closed
        #
        # with Pool(nodes=processes) as pool:
        #     tuples = (list(in_ds.index),
        #               pool.map(processer, in_ds))
        #
        # End of block

        # This temporary solution should be removed when tests mentioned above
        # are successful.
        # This just closes each pool after execution or exception.
        try:
            pool = Pool(nodes=processes)
            pool.restart(force=True)
            tuples = (list(in_ds.index), pool.map(processer, in_ds))
        except Exception:
            pool.close()
            raise
        pool.close()
        # End of block

        ds = pd.Series(tuples[1], index=tuples[0])
        return (ds)
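A minimal usage sketch, assuming threaded_contents_to_text is exposed as a static method of PDFDecoder and that pdf_bytes_by_uid is a {uid: bytes} mapping built elsewhere (both names are hypothetical):

import pandas as pd

content_series = pd.Series(pdf_bytes_by_uid)
texts = PDFDecoder.threaded_contents_to_text(content_series, processes=4)
print(texts.head())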
Example #11
    def threaded_texts_to_blocks(text_series,
                                 processes=None,
                                 split_func=lambda x: x.split('\n\n'),
                                 return_type='along_index'):
        """Threaded version of text_to_blocks_series method

        It takes as input a series whose index is the uid of the products,
        and the values are the text content of the documents.
        processes argument is the number of processes to launch. If omitted,
        it defaults to the number of cpu cores on the machine.
        As for text_to_blocks_series function, return_type can be 'along_axis'
        or 'list_like'.
        """
        processer = partial(PDFDecoder.text_to_blocks_series,
                            split_func=split_func,
                            return_type=return_type)
        processes = processes if processes else cpu_count()
        print(f'Launching {processes} processes.')

        # Pool with context manager does not seem to work due to issue 38501 of
        # standard python library. It hangs when running tests through pytest
        # see: https://bugs.python.org/issue38501
        # Below content should be tested again whenever this issue is closed
        #
        # with Pool(nodes=processes) as pool:
        #     ds_list = pool.map(processer, text_series, text_series.index)
        #
        # End of block

        # This temporary solution should be removed when tests mentioned above
        # are successful.
        # This just closes each pool after execution or exception.
        try:
            pool = Pool(nodes=processes)
            pool.restart(force=True)
            ds_list = pool.map(processer, text_series, text_series.index)
        except Exception:
            pool.close()
            raise
        pool.close()
        # End of block

        ds = pd.concat(ds_list, axis=0)
        return (ds)
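And a corresponding hedged call for the block splitter, where texts is a pandas Series of already-extracted document texts (hypothetical input):

blocks = PDFDecoder.threaded_texts_to_blocks(
    texts,
    processes=2,
    split_func=lambda x: x.split('\n'),  # split on single newlines instead of blank lines
    return_type='along_index')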
Example #12
def plot_cats_prm_vecs_evo(dbs_dir,
                           save_obj_flag,
                           save_png_flag,
                           save_gif_flag,
                           anim_secs,
                           n_cpus=1):
    '''Plot the evolution of parameter vectors and convex hull for every
    catchment for every kfold.
    '''

    cats_dbs = glob(os.path.join(dbs_dir, 'cat_*.hdf5'))

    assert cats_dbs

    n_cats = len(cats_dbs)
    n_cpus = min(n_cats, n_cpus)

    opt_res_gen = ((cat_db, save_obj_flag, save_png_flag, save_gif_flag,
                    anim_secs) for cat_db in cats_dbs)

    if (n_cpus > 1) and (n_cats > 1):
        mp_pool = ProcessPool(n_cpus)
        mp_pool.restart(True)

        print(list(mp_pool.uimap(plot_cat_prm_vecs_evo, opt_res_gen)))
        mp_pool.clear()
        mp_pool.close()
        mp_pool.join()

    else:
        for opt_res in opt_res_gen:
            plot_cat_prm_vecs_evo(opt_res)

    return
Example #13
    def _cmpt_lim_phsrand_obj_vals(self, phs_red_rate, idxs_sclr):

        beg_tm = default_timer()

        _ = phs_red_rate
        _ = idxs_sclr

        self._sett_lim_phsrand_dir.mkdir(exist_ok=True)

        ptrb_ratios = np.linspace(self._sett_lim_phsrand_ptrb_lbd,
                                  self._sett_lim_phsrand_ptrb_ubd,
                                  self._sett_lim_phsrand_n_ptrb_vals,
                                  endpoint=True)

        ptrb_obj_vals = np.empty((self._sett_lim_phsrand_n_ptrb_vals,
                                  self._sett_lim_phsrand_iters_per_atpt))

        n_cpus = min(self._sett_lim_phsrand_n_ptrb_vals,
                     self._sett_misc_n_cpus)

        ubd_sclr = 1.2
        search_attempts = 0
        ress = []
        sel_stat_ftn = getattr(np, self._alg_lim_phsrand_sel_stat)

        if self._vb:
            print('Attempt,', 'Perturb ratio,', '   Minimum,', '      Mean,',
                  '   Maximum')

        if n_cpus > 1:
            self._lock = Manager().Lock()

            mp_pool = ProcessPool(n_cpus)
            mp_pool.restart(True)

            for i in range(0, self._sett_lim_phsrand_n_ptrb_vals, n_cpus):

                end_idx = min(self._sett_lim_phsrand_n_ptrb_vals, n_cpus + i)

                assert i < end_idx, 'This was not supposed to happen!'

                search_attempts += end_idx - i

                # Don't use ret_mp_idxs, it will be inefficient.
                args_gen = ((j, ptrb_ratios[j]) for j in range(i, end_idx))

                ptrb_obj_vals_iter = (list(
                    mp_pool.imap(self._cmpt_lim_phsrand_obj_vals_single,
                                 args_gen)))

                ress.extend(ptrb_obj_vals_iter)

                if np.any([
                        sel_stat_ftn(ptrb_obj_vals_iter[k][1]) >=
                    (self._sett_lim_phsrand_obj_ubd * ubd_sclr)
                        for k in range(len(ptrb_obj_vals_iter))
                ]):

                    break

            mp_pool.close()
            mp_pool.join()

            self._lock = None

            mp_pool = None

        else:
            self._lock = Lock()

            for j in range(self._sett_lim_phsrand_n_ptrb_vals):
                search_attempts += 1

                ress.append(
                    self._cmpt_lim_phsrand_obj_vals_single(
                        (j, ptrb_ratios[j])))

                if (sel_stat_ftn(ress[-1][1]) >=
                    (self._sett_lim_phsrand_obj_ubd * ubd_sclr)):

                    break

            self._lock = None

        take_idxs = []
        for res in ress:
            take_idxs.append(res[0])
            ptrb_obj_vals[take_idxs[-1], :] = res[1]

        take_idxs.sort()
        take_idxs = np.array(take_idxs)

        ptrb_ratios = ptrb_ratios[take_idxs]
        ptrb_obj_vals = ptrb_obj_vals[take_idxs]

        res = ress = None

        assert np.all(
            np.isfinite(ptrb_ratios)), ('Invalid values in ptrb_ratios!')

        assert np.all(
            ptrb_ratios >= 0), ('Values less than zero in ptrb_ratios!')

        assert np.all(
            np.isfinite(ptrb_obj_vals)), ('Invalid values in ptrb_obj_vals!')

        assert np.all(
            ptrb_obj_vals >= 0), ('Values less than zero in ptrb_obj_vals!')

        self._alg_lim_phsrand_ptrb_ratios = ptrb_ratios
        self._alg_lim_phsrand_ptrb_obj_vals = ptrb_obj_vals

        self._set_lim_phsrand_ptrb_ratio()

        self._plot_lim_phsrand_obj_vals()

        end_tm = default_timer()

        if self._vb:
            print(f'Found perturbation ratio of '
                  f'{self._alg_lim_phsrand_ptrb_ratio:5.3E} in '
                  f'{end_tm - beg_tm:0.1f} '
                  f'seconds using {search_attempts} attempts.')

        return
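The multiprocessing branch above dispatches the perturbation ratios to the pool n_cpus at a time and stops issuing further chunks once the selected statistic crosses the upper bound. A stripped-down sketch of that control flow with a stand-in worker and threshold (not the original objective computation):

from pathos.multiprocessing import ProcessPool


def chunked_search(worker, values, n_cpus, stop_thresh):
    # Dispatch values in chunks of n_cpus; stop early once any result in the
    # current chunk reaches stop_thresh, mirroring the loop structure above.
    pool = ProcessPool(n_cpus)
    pool.restart(True)

    results = []
    for i in range(0, len(values), n_cpus):
        out = list(pool.imap(worker, values[i:i + n_cpus]))
        results.extend(out)

        if any(res >= stop_thresh for res in out):
            break

    pool.close()
    pool.join()
    return results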
Example #14
def main():

    main_dir = Path(
        r'P:\Synchronize\IWS\Testings\fourtrans_practice\multisite_phs_spec_corr'
    )

    os.chdir(main_dir)

    interp_var = 'temp'

    ft_type = 'mag'

    #==========================================================================
    if interp_var == 'temp':
        # MEAN TEMPERATURE
        in_data_file = os.path.join(f'temperature_{ft_type}_spec_df.csv')

        in_vgs_file = os.path.join(r'temperature_cftns.csv')

        in_stns_coords_file = os.path.join(os.path.dirname(in_data_file),
                                           r'temperature_avg_coords.csv')

        out_dir = r'temperature_kriging'
        var_units = u'\u2103'  # 'centigrade'
        var_name = 'temperature'
        out_krig_net_cdf_file = f'kriging_1km_{ft_type}.nc'

        # interpolated values
        # can be int, float, 'min_in'/'max_in' or None
        # min_var_val = 'min_in'
        # max_var_val = 'max_in'
#         min_var_val = None
#         max_var_val = None

#==========================================================================

#==========================================================================
    elif interp_var == 'ppt':
        # PRECIPITATION
        in_data_file = os.path.join(f'precipitation_{ft_type}_spec_df.csv')

        in_vgs_file = os.path.join(r'precipitation_cftns.csv')

        in_stns_coords_file = os.path.join(os.path.dirname(in_data_file),
                                           r'precipitation_coords.csv')

        out_dir = r'precipitation_kriging'
        var_units = 'mm'
        var_name = 'precipitation'
        out_krig_net_cdf_file = f'kriging_1km_{ft_type}.nc'

        # interpolated values
        # can be int, float, 'min_in'/'max_in' or None
        # min_var_val = 'min_in'
        # max_var_val = 'max_in'
#         min_var_val = None
#         max_var_val = None

#==========================================================================
    else:
        raise ValueError(f'Invalid value for interp_var: {interp_var}!')

    # assuming in_drift_raster and in_stns_coords_file and in_bounds_shp_file
    # have the same coordinate system
    # assuming the rasters in in_drift_rasters_list have the same cell sizes,
    # bounds and NDVs; basically they are copies of each other except for the
    # drift values
    in_drift_rasters_list = ([
        r'P:\Synchronize\IWS\QGIS_Neckar\raster\lower_de_gauss_z3_1km.tif'
    ])

    #     in_bounds_shp_file = (
    #         os.path.join(r'P:\Synchronize\IWS\QGIS_Neckar\raster',
    #                      r'taudem_out_spate_rockenau\watersheds.shp'))

    in_bounds_shp_file = (os.path.join(
        r'P:\Synchronize\IWS\QGIS_Neckar\raster\taudem_out_spate_rockenau\watersheds.shp'
    ))

    align_ras_file = in_drift_rasters_list[0]

    out_figs_dir = os.path.join(out_dir, 'krige_figs')

    x_coords_lab = 'X'
    y_coords_lab = 'Y'
    time_dim_lab = 'freq'
    nc_mode = 'w'

    #     min_ppt_thresh = 1.0

    idw_exp = 5
    n_cpus = 1
    buffer_dist = 20e3
    sec_buffer_dist = 2e3

    in_sep = str(';')

    ord_krige_flag = True
    sim_krige_flag = True
    edk_krige_flag = True
    idw_flag = True
    plot_figs_flag = True

    #     ord_krige_flag = False
    sim_krige_flag = False
    edk_krige_flag = False
    idw_flag = False
    plot_figs_flag = False

    os.chdir(main_dir)

    if not os.path.exists(out_dir):
        os.mkdir(out_dir)

    if (not os.path.exists(out_figs_dir)) and plot_figs_flag:
        os.mkdir(out_figs_dir)

#     print('min_var_val:', min_var_val)
#     print('max_var_val:', max_var_val)
    print('idw_exp:', idw_exp)
    print('n_cpus:', n_cpus)
    print('nc_mode:', nc_mode)
    print('var_name:', var_name)
    print('out_dir:', out_dir)
    print('in_bounds_shp_file:', in_bounds_shp_file)
    print('out_krig_net_cdf_file:', out_krig_net_cdf_file)

    assert any([ord_krige_flag, sim_krige_flag, edk_krige_flag, idw_flag])

    #==========================================================================
    # read the data frames
    #==========================================================================
    in_data_df = pd.read_csv(in_data_file,
                             sep=in_sep,
                             index_col=0,
                             encoding='utf-8')

    in_vgs_df = pd.read_csv(in_vgs_file,
                            sep=in_sep,
                            index_col=0,
                            encoding='utf-8')

    in_stns_coords_df = pd.read_csv(in_stns_coords_file,
                                    sep=in_sep,
                                    index_col=0,
                                    encoding='utf-8')

    all_stns = in_data_df.columns.intersection(in_stns_coords_df.index)
    assert all_stns.shape[0]

    in_data_df = in_data_df.loc[:, all_stns]
    in_stns_coords_df = in_stns_coords_df.loc[all_stns, :]

    #==========================================================================
    # Get stations that are around/in the bounds_shp only
    #==========================================================================

    bds_vec = ogr.Open(in_bounds_shp_file)
    assert bds_vec

    bds_lyr = bds_vec.GetLayer(0)

    feat_buffs_list = []
    feat_sec_buffs_list = []
    for feat in bds_lyr:  # just to get the names of the catchments
        geom = feat.GetGeometryRef().Clone()
        assert geom

        feat_buffs_list.append(geom.Buffer(buffer_dist))
        feat_sec_buffs_list.append(geom.Buffer(sec_buffer_dist))

    bds_vec.Destroy()

    assert feat_buffs_list and feat_sec_buffs_list

    print(len(feat_buffs_list), 'polygons in the in_bounds_shp_file...')

    fin_stns = []
    for poly in feat_buffs_list:
        for stn in all_stns:
            if stn in fin_stns:
                continue

            curr_pt = cnvt_to_pt(*in_stns_coords_df.loc[stn,
                                                        ['X', 'Y']].values)

            if chk_cntmt(curr_pt, poly):
                fin_stns.append(stn)

    assert fin_stns

    print('%d stations out of %d within buffer zone of in_bounds_shp_file' %
          (len(fin_stns), in_stns_coords_df.shape[0]))

    fin_stns = np.unique(fin_stns)
    in_data_df = in_data_df.loc[:, fin_stns]
    in_stns_coords_df = in_stns_coords_df.loc[fin_stns, :]

    #==========================================================================
    # Read the DEM
    #==========================================================================

    #     if edk_krige_flag:
    #         in_drift_arr_list = []
    #         _rows_list = []
    #         _cols_list = []
    #
    #         for in_drift_raster in in_drift_rasters_list:
    #             in_drift_ds = gdal.Open(in_drift_raster)
    #
    #             assert in_drift_ds, 'GDAL cannot open %s' % in_drift_raster
    #
    #             drift_rows = in_drift_ds.RasterYSize
    #             drift_cols = in_drift_ds.RasterXSize
    #
    #             drift_geotransform = in_drift_ds.GetGeoTransform()
    #
    #             _drift_x_min = drift_geotransform[0]
    #             _drift_y_max = drift_geotransform[3]
    #
    #             drift_band = in_drift_ds.GetRasterBand(1)
    #             drift_ndv = drift_band.GetNoDataValue()
    #
    #             cell_width = drift_geotransform[1]
    #             cell_height = abs(drift_geotransform[5])
    #
    #             _drift_x_max = _drift_x_min + (drift_cols * cell_width)
    #             _drift_y_min = _drift_y_max - (drift_rows * cell_height)
    #
    #             _arr = in_drift_ds.ReadAsArray()
    #
    #             in_drift_arr_list.append(_arr)
    #             _rows_list.append(_arr.shape[0])
    #             _cols_list.append(_arr.shape[1])
    #
    #         assert all(_ == _rows_list[0] for _ in _rows_list), (
    #             'Drift raster have unequal number of rows!')
    #
    #         assert all(_ == _cols_list[0] for _ in _cols_list), (
    #             'Drift raster have unequal number of columns!')

    #==========================================================================
    # Read the bounding shapefile
    #==========================================================================
    #     sf = shp.Reader(in_bounds_shp_file)
    #     polys_list = [i.__geo_interface__ for i in sf.iterShapes()]

    ((fin_x_min, fin_x_max, fin_y_min, fin_y_max),
     cell_width) = get_aligned_shp_bds_and_cell_size(in_bounds_shp_file,
                                                     align_ras_file)

    cell_height = cell_width

    fin_x_min -= 2 * cell_width
    fin_x_max += 2 * cell_width
    fin_y_min -= 2 * cell_height
    fin_y_max += 2 * cell_height

    #     if edk_krige_flag:
    #         assert fin_x_min > _drift_x_min
    #         assert fin_x_max < _drift_x_max
    #         assert fin_y_min > _drift_y_min
    #         assert fin_y_max < _drift_y_max
    #
    #         min_col = int(max(0, (fin_x_min - _drift_x_min) / cell_width))
    #         max_col = int(ceil((fin_x_max - _drift_x_min) / cell_width))
    #
    #         min_row = int(max(0, (_drift_y_max - fin_y_max) / cell_height))
    #         max_row = int(ceil((_drift_y_max - fin_y_min) / cell_height))
    #
    #     else:
    min_col = 0
    max_col = int(ceil((fin_x_max - fin_x_min) / cell_width))

    min_row = 0
    max_row = int(ceil((fin_y_max - fin_y_min) / cell_height))

    #==========================================================================
    # Calculate coordinates at which to krige
    #==========================================================================

    assert 0 <= min_col <= max_col, (min_col, max_col)
    assert 0 <= min_row <= max_row, (min_row, max_row)

    strt_x_coord = fin_x_min + (0.5 * cell_width)
    end_x_coord = strt_x_coord + ((max_col - min_col) * cell_width)

    strt_y_coord = fin_y_max - (0.5 * cell_height)
    end_y_coord = strt_y_coord - ((max_row - min_row) * cell_height)

    krige_x_coords = np.linspace(strt_x_coord, end_x_coord,
                                 (max_col - min_col + 1))

    krige_y_coords = np.linspace(strt_y_coord, end_y_coord,
                                 (max_row - min_row + 1))

    krige_x_coords_mesh, krige_y_coords_mesh = np.meshgrid(
        krige_x_coords, krige_y_coords)

    krige_coords_orig_shape = krige_x_coords_mesh.shape

    #     if plot_figs_flag:
    #         # xy coords for pcolormesh
    #         pcolmesh_x_coords = np.linspace(
    #             fin_x_min, fin_x_max, (max_col - min_col + 1))
    #
    #         pcolmesh_y_coords = np.linspace(
    #             fin_y_max, fin_y_min, (max_row - min_row + 1))
    #
    #         krige_x_coords_plot_mesh, krige_y_coords_plot_mesh = (
    #             np.meshgrid(pcolmesh_x_coords, pcolmesh_y_coords))
    #
    #     else:
    #         krige_x_coords_plot_mesh, krige_y_coords_plot_mesh = None, None

    krige_x_coords_mesh = krige_x_coords_mesh.ravel()
    krige_y_coords_mesh = krige_y_coords_mesh.ravel()

    #     print('\n\n')
    #     print('#' * 10)
    #
    #     _beg_t = timeit.default_timer()
    #
    #     print(krige_x_coords_mesh.shape[0],
    #           'cells to interpolate per step before intersection!')
    #
    fin_cntn_idxs = np.ones(krige_x_coords_mesh.shape[0], dtype=bool)
    #     fin_cntn_idxs = np.zeros(krige_x_coords_mesh.shape[0], dtype=bool)
    #     ogr_pts = np.vectorize(cnvt_to_pt)(krige_x_coords_mesh, krige_y_coords_mesh)
    #
    #     for poly in feat_sec_buffs_list:
    #         curr_cntn_idxs = np.vectorize(chk_cntmt)(ogr_pts, poly)
    #         fin_cntn_idxs = fin_cntn_idxs | curr_cntn_idxs
    #
    #     print(fin_cntn_idxs.sum(),
    #           'cells to interpolate per step after intersection!')
    #
    #     _end_t = timeit.default_timer()
    #     _tot_t = _end_t - _beg_t
    #
    #     print(f'Took {_tot_t:0.4f} seconds!')
    #     print('#' * 10)
    #
    #     krige_x_coords_mesh = krige_x_coords_mesh[fin_cntn_idxs]
    #     krige_y_coords_mesh = krige_y_coords_mesh[fin_cntn_idxs]

    #     if edk_krige_flag:
    #         drift_vals_list = []
    #
    #         krige_cols = np.arange(min_col, max_col + 1, dtype=int)
    #         krige_rows = np.arange(min_row, max_row + 1, dtype=int)
    #
    #         assert krige_x_coords.shape[0] == krige_cols.shape[0]
    #         assert krige_y_coords.shape[0] == krige_rows.shape[0]
    #
    #         (krige_drift_cols_mesh,
    #          krige_drift_rows_mesh) = np.meshgrid(krige_cols, krige_rows)
    #
    #         krige_drift_cols_mesh = krige_drift_cols_mesh.ravel()
    #         krige_drift_rows_mesh = krige_drift_rows_mesh.ravel()
    #
    #         krige_drift_cols_mesh = krige_drift_cols_mesh[fin_cntn_idxs]
    #         krige_drift_rows_mesh = krige_drift_rows_mesh[fin_cntn_idxs]
    #
    #         for _drift_arr in in_drift_arr_list:
    #             _drift_vals = _drift_arr[
    #                 krige_drift_rows_mesh, krige_drift_cols_mesh]
    #
    #             drift_vals_list.append(_drift_vals)
    #
    # #         drift_vals_arr = np.array(drift_vals_list, dtype=float)
    #
    #         drift_df_cols = list(range(len(in_drift_rasters_list)))
    #         in_stns_drift_df = pd.DataFrame(
    #             index=in_stns_coords_df.index,
    #             columns=drift_df_cols,
    #             dtype=float)
    #
    #         for stn in in_stns_drift_df.index:
    #             stn_x = in_stns_coords_df.loc[stn, x_coords_lab]
    #             stn_y = in_stns_coords_df.loc[stn, y_coords_lab]
    #
    #             stn_col = int((stn_x - _drift_x_min) / cell_width)
    #             stn_row = int((_drift_y_max - stn_y) / cell_height)
    #
    #             for col, _arr in zip(drift_df_cols, in_drift_arr_list):
    #                 try:
    #                     _ = _arr[stn_row, stn_col]
    #                     if not np.isclose(drift_ndv, _):
    #                         in_stns_drift_df.loc[stn, col] = _
    #
    #                 except IndexError:
    #                     pass
    #
    #         in_stns_drift_df.dropna(inplace=True)

    #==========================================================================
    # Open NC
    #==========================================================================
    out_nc = nc.Dataset(os.path.join(out_dir, out_krig_net_cdf_file),
                        mode=str(nc_mode))

    if nc_mode == 'w':
        out_nc.set_auto_mask(False)
        out_nc.createDimension(x_coords_lab, krige_x_coords.shape[0])
        out_nc.createDimension(y_coords_lab, krige_y_coords.shape[0])
        out_nc.createDimension(time_dim_lab, in_data_df.shape[0])

        x_coords_nc = out_nc.createVariable(x_coords_lab,
                                            'd',
                                            dimensions=x_coords_lab)

        x_coords_nc[:] = krige_x_coords

        y_coords_nc = out_nc.createVariable(y_coords_lab,
                                            'd',
                                            dimensions=y_coords_lab)

        y_coords_nc[:] = krige_y_coords

        time_nc = out_nc.createVariable(time_dim_lab,
                                        'i8',
                                        dimensions=time_dim_lab)

        time_nc[:] = np.arange(in_data_df.shape[0])

    else:
        raise RuntimeError('Not configured for this option!')

        time_nc = out_nc.variables[time_dim_lab]
        krige_y_coords = y_coords_nc[:]
        krige_x_coords = x_coords_nc[:]

    #==========================================================================
    # MP stuff
    #==========================================================================
    mp_cond = False

    if ((n_cpus > 1) and (in_data_df.shape[0] > (n_cpus + 1))):
        idxs = np.linspace(0,
                              in_data_df.shape[0], (n_cpus) + 1,
                              endpoint=True,
                              dtype=int)

        idxs = np.unique(idxs)
        print('MP idxs:', idxs)

        if idxs.shape[0] == 1:
            idxs = np.concatenate((np.array([0]), idxs))

        mp_cond = True

    else:
        idxs = [0, in_data_df.shape[0]]

    #==========================================================================
    # Krige
    #==========================================================================
    if ord_krige_flag:
        print('\n\n')
        print('#' * 10)

        _beg_t = timeit.default_timer()

        print('Ordinary Kriging...')

        if 'OK' not in out_nc.variables:
            ok_nc = out_nc.createVariable('OK',
                                          'd',
                                          dimensions=(time_dim_lab,
                                                      y_coords_lab,
                                                      x_coords_lab),
                                          fill_value=False)

        else:
            ok_nc = out_nc.variables['OK']

        ok_vars_gen = ((in_data_df.iloc[idxs[i]:idxs[i + 1]],
                        in_stns_coords_df, in_vgs_df.loc[ft_type][0],
                        krige_x_coords_mesh, krige_y_coords_mesh,
                        krige_coords_orig_shape, (idxs[i],
                                                  idxs[i + 1]), fin_cntn_idxs)
                       for i in range(n_cpus))

        if mp_cond:
            ok_krige_flds = np.full(
                (in_data_df.shape[0], krige_coords_orig_shape[0],
                 krige_coords_orig_shape[1]),
                np.nan,
                dtype=np.float32)

            mp_ress = []

            try:
                mp_pool = ProcessPool(n_cpus)
                mp_pool.restart(True)

                mp_ress = list(mp_pool.uimap(ordinary_kriging, ok_vars_gen))

                mp_pool.clear()

            except Exception as msg:
                mp_pool.close()
                mp_pool.join()
                print('Error in ordinary_kriging:', msg)

            for mp_res in mp_ress:
                if (len(mp_res) != 3) and (not isinstance(mp_res, list)):
                    print('\n', mp_res, '\n')
                    continue

                [strt_index, end_index, sub_ok_krige_flds] = mp_res
                ok_krige_flds[strt_index:end_index] = sub_ok_krige_flds

                # free memory
                mp_res[2], sub_ok_krige_flds = None, None

            ok_nc[:] = ok_krige_flds

        else:
            [strt_index, end_index,
             ok_krige_flds] = ordinary_kriging(next(ok_vars_gen))

            ok_nc[:] = ok_krige_flds

        ok_nc.units = var_units
        ok_nc.standard_name = var_name + ' (ordinary kriging)'

        ok_krige_flds = None

        _end_t = timeit.default_timer()
        _tot_t = _end_t - _beg_t

        print(f'Took {_tot_t:0.4f} seconds!')
        print('#' * 10)


#     if sim_krige_flag:
#         print('\n\n')
#         print('#' * 10)
#
#         _beg_t = timeit.default_timer()
#
#         print('Simple Kriging...')
#         if 'SK' not in out_nc.variables:
#             sk_nc = out_nc.createVariable(
#                 'SK',
#                 'd',
#                 dimensions=(time_dim_lab, y_coords_lab, x_coords_lab),
#                 fill_value=False)
#
#         else:
#             sk_nc = out_nc.variables['SK']
#
#         sk_vars_gen = ((in_data_df.iloc[idxs[i]:idxs[i + 1]],
#                         in_stns_coords_df,
#                         in_vgs_df.iloc[idxs[i]:idxs[i + 1]],
#                         min_ppt_thresh,
#                         var_name,
#                         krige_x_coords_mesh,
#                         krige_y_coords_mesh,
#                         krige_coords_orig_shape,
#                         min_var_val,
#                         max_var_val,
#                         (idxs[i], idxs[i + 1]),
#                         plot_figs_flag,
#                         krige_x_coords_plot_mesh,
#                         krige_y_coords_plot_mesh,
#                         var_units,
#                         polys_list,
#                         out_figs_dir,
#                         fin_cntn_idxs) for i in range(n_cpus))
#
#         if mp_cond:
#             sk_krige_flds = np.full(
#                 (in_data_df.shape[0],
#                  krige_coords_orig_shape[0],
#                  krige_coords_orig_shape[1]),
#                 np.nan,
#                 dtype=np.float32)
#
#             mp_ress = []
#
#             try:
#                 mp_pool = ProcessPool(n_cpus)
#                 mp_pool.restart(True)
#
#                 mp_ress = list(mp_pool.uimap(simple_kriging, sk_vars_gen))
#
#                 mp_pool.clear()
#
#             except Exception as msg:
#                 mp_pool.close()
#                 mp_pool.join()
#                 print('Error in simple_kriging:', msg)
#
#             for mp_res in mp_ress:
#                 if (len(mp_res) != 3) and (not isinstance(list)):
#                     print('\n', mp_res, '\n')
#                     continue
#
#                 [strt_index, end_index, sub_sk_krige_flds] = mp_res
#                 sk_krige_flds[strt_index:end_index] = sub_sk_krige_flds
#
#                 # free memory
#                 mp_res[2], sub_sk_krige_flds = None, None
#
#             sk_nc[:] = sk_krige_flds
#
#         else:
#             [strt_index,
#              end_index,
#              sk_krige_flds] = simple_kriging(next(sk_vars_gen))
#
#             sk_nc[:] = sk_krige_flds
#
#         sk_nc.units = var_units
#         sk_nc.standard_name = var_name + ' (simple kriging)'
#
#         sk_krige_flds = None
#
#         _end_t = timeit.default_timer()
#         _tot_t = _end_t - _beg_t
#
#         print(f'Took {_tot_t:0.4f} seconds!')
#         print('#' * 10)
#
#     if edk_krige_flag:
#         print('\n\n')
#         print('#' * 10)
#
#         _beg_t = timeit.default_timer()
#
#         print('External Drift Kriging...')
#         if 'EDK' not in out_nc.variables:
#             edk_nc = out_nc.createVariable(
#                 'EDK',
#                 'd',
#                 dimensions=(time_dim_lab, y_coords_lab, x_coords_lab),
#                 fill_value=False)
#
#         else:
#             edk_nc = out_nc.variables['EDK']
#
#         edk_vars_gen = ((in_data_df.iloc[idxs[i]:idxs[i + 1]],
#                          in_stns_drift_df,
#                          in_stns_coords_df,
#                          in_vgs_df.iloc[idxs[i]:idxs[i + 1]],
#                          min_ppt_thresh,
#                          var_name,
#                          krige_x_coords_mesh,
#                          krige_y_coords_mesh,
#                          drift_vals_arr,
#                          krige_coords_orig_shape,
#                          drift_ndv,
#                          min_var_val,
#                          max_var_val,
#                          (idxs[i], idxs[i + 1]),
#                          plot_figs_flag,
#                          krige_x_coords_plot_mesh,
#                          krige_y_coords_plot_mesh,
#                          var_units,
#                          polys_list,
#                          out_figs_dir,
#                          fin_cntn_idxs) for i in range(n_cpus))
#
#         if mp_cond:
#             edk_krige_flds = np.full(
#                 (in_data_df.shape[0],
#                  krige_coords_orig_shape[0],
#                  krige_coords_orig_shape[1]),
#                 np.nan,
#                 dtype=np.float32)
#
#             mp_ress = []
#
#             try:
#                 mp_pool = ProcessPool(n_cpus)
#                 mp_pool.restart(True)
#
#                 mp_ress = list(mp_pool.uimap(
#                     external_drift_kriging, edk_vars_gen))
#
#                 mp_pool.clear()
#
#             except Exception as msg:
#                 mp_pool.close()
#                 mp_pool.join()
#                 print('Error in external_drift_kriging:', msg)
#
#             for mp_res in mp_ress:
#                 if (len(mp_res) != 3) and (not isinstance(list)):
#                     print('\n', mp_res, '\n')
#                     continue
#
#                 [strt_index, end_index, sub_edk_krige_flds] = mp_res
#                 edk_krige_flds[strt_index:end_index] = sub_edk_krige_flds
#
#                 print('sub_min:', np.nanmin(sub_edk_krige_flds))
#                 print('sub_max:', np.nanmax(sub_edk_krige_flds))
#
#                 # free memory
#                 mp_res[2], sub_edk_krige_flds = None, None
#
#         else:
#             [strt_index,
#              end_index,
#              edk_krige_flds] = external_drift_kriging(next(edk_vars_gen))
#
#         edk_nc[:] = edk_krige_flds
#
#         edk_nc.units = var_units
#         edk_nc.standard_name = var_name + ' (external drift kriging)'
#
#         edk_krige_flds = None
#
#         _end_t = timeit.default_timer()
#         _tot_t = _end_t - _beg_t
#
#         print(f'Took {_tot_t:0.4f} seconds!')
#         print('#' * 10)
#
#     #==========================================================================
#     # IDW
#     #==========================================================================
#     if idw_flag:
#         print('\n\n')
#         print('#' * 10)
#
#         _beg_t = timeit.default_timer()
#
#         print('Inverse Distance Weighting...')
#         if 'IDW' not in out_nc.variables:
#             idw_nc = out_nc.createVariable(
#                 'IDW',
#                 'd',
#                  dimensions=(time_dim_lab, y_coords_lab, x_coords_lab),
#                  fill_value=False)
#
#         else:
#             idw_nc = out_nc.variables['IDW']
#
#         idw_vars_gen = ((in_data_df.iloc[idxs[i]:idxs[i + 1]],
#                         in_stns_coords_df,
#                         min_ppt_thresh,
#                         idw_exp,
#                         var_name,
#                         krige_x_coords_mesh,
#                         krige_y_coords_mesh,
#                         krige_coords_orig_shape,
#                         min_var_val,
#                         max_var_val,
#                         (idxs[i], idxs[i + 1]),
#                         plot_figs_flag,
#                         krige_x_coords_plot_mesh,
#                         krige_y_coords_plot_mesh,
#                         var_units,
#                         polys_list,
#                         out_figs_dir,
#                         fin_cntn_idxs) for i in range(n_cpus))
#
#         if mp_cond:
#             idw_flds = np.full(
#                 (in_data_df.shape[0],
#                  krige_coords_orig_shape[0],
#                  krige_coords_orig_shape[1]),
#                 np.nan,
#                 dtype=np.float32)
#
#             mp_ress = []
#             try:
#                 mp_pool = ProcessPool(n_cpus)
#                 mp_pool.restart(True)
#
#                 mp_ress = list(mp_pool.uimap(
#                     inverse_distance_wtng, idw_vars_gen))
#
#                 mp_pool.clear()
#
#             except Exception as msg:
#                 mp_pool.close()
#                 mp_pool.join()
#                 print('Error in inverse_distance_wtng:', msg)
#
#             for mp_res in mp_ress:
#                 if (len(mp_res) != 3) and (not isinstance(list)):
#                     print('\n', mp_res, '\n')
#                     continue
#
#                 [strt_index, end_index, sub_idw_flds] = mp_res
#                 idw_flds[strt_index:end_index] = sub_idw_flds
#
#                 # free memory
#                 mp_res[2], sub_idw_flds = None, None
#
#         else:
#             [strt_index,
#              end_index,
#              idw_flds] = inverse_distance_wtng(next(idw_vars_gen))
#
#         idw_nc[:] = idw_flds
#
#         idw_nc.units = var_units
#         idw_nc.standard_name = (
#             var_name + ' (IDW (exp=%0.3f))' % float(idw_exp))
#
#         idw_flds = None
#
#         _end_t = timeit.default_timer()
#         _tot_t = _end_t - _beg_t
#
#         print(f'Took {_tot_t:0.4f} seconds!')
#         print('#' * 10)

    out_nc.Author = 'Faizan IWS Uni-Stuttgart'
    out_nc.Source = out_nc.filepath()
    out_nc.close()
    return
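Each kriging worker above returns a (start_index, end_index, sub_fields) triple for its block of time steps, and the driver writes these back into a preallocated array. A stand-alone hedged sketch of that reassembly step (assemble_blocks is a hypothetical helper, not from the original module):

import numpy as np


def assemble_blocks(n_steps, grid_shape, block_results):
    # block_results: iterable of (start_index, end_index, sub_fields) triples.
    full = np.full(
        (n_steps, grid_shape[0], grid_shape[1]), np.nan, dtype=np.float32)

    for strt_index, end_index, sub_flds in block_results:
        full[strt_index:end_index] = sub_flds

    return full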
Example #15
        dat = MyDataset(cg.s, cg.ss, cg.r, cg.a, total,
                        [(*NUM_GRID, NUM_CHANNEL), (1, ), (1, ), (1, )]).new()
        dat = dat.prefetch(tf.data.AUTOTUNE)

        print('Processing Data Complete.')
        print("Training...")

        with tqdm.tqdm(total=10) as pbar:
            prog_callback = ProgCallback(pbar)

            hist = critic.fit(dat,
                              epochs=10,
                              verbose=0,
                              callbacks=[prog_callback])

        print(hist.history['loss'])

        print("Training Complete.")

        with open(f'ddrive/{gen}.txt', 'wb') as f:
            f.write(
                base64.b85encode(
                    lzma.compress(pickle.dumps([
                        arr.astype(np.float16) for arr in critic.get_weights()
                    ]),
                                  preset=9)))

        pool.close()
        pool.join()
        pool.restart()
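The snippet above pickles the critic weights, lzma-compresses them and base85-encodes the result before writing. A hedged counterpart for reading such a file back (load_weights is a hypothetical helper, assuming the file was written by the block above):

import base64
import lzma
import pickle

import numpy as np


def load_weights(path):
    # Reverse of the dump above: base85-decode, decompress, unpickle,
    # then cast back to float32 for use with set_weights.
    with open(path, 'rb') as f:
        weights = pickle.loads(lzma.decompress(base64.b85decode(f.read())))

    return [arr.astype(np.float32) for arr in weights]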
Example #16
def graph_sampling(graph: FSN, strategy: Optional[str] = "MetaDiff",
                   n_jobs: Optional[int] = 4,
                   use_cache: Optional[bool] = True, **kwargs) \
        -> List[List[Union[str, int]]]:
    """
    Sampling the sequences of nodes from FSN w.r.t. chosen strategy
    Parameters
    ----------
    graph : FSN object
        Graph to be processed
    strategy : str, default is 'MetaDiff'
        Walking strategy to be used
    n_jobs : int, default is 4
        Number of workers to be created in parallel pool
    use_cache : bool, default is True
        To use the previously cached files

    Returns
    -------
    Sampled sequences of BP nodes
    """
    set_new_config(**kwargs)
    local_logger = logging.getLogger(f"{__name__}")
    if use_cache and os.path.isfile(CONFIG.WORK_FOLDER[0] + "sampled_sequences_cached.pkl"):
        local_logger.info("Loading sequences from cache... wait...")
        try:
            with open(CONFIG.WORK_FOLDER[0] + "sampled_sequences_cached.pkl", "rb") as file:
                res = pickle.load(file)
            local_logger.info(f"Total number of raw sampled sequences is {len(res)}")
            local_logger.info(f"Average length of sequences is {sum(map(len, res)) / float(len(res))}")
            return res
        except FileNotFoundError:
            local_logger.info("File not found... Recalculate \n")
            pass
        except Exception as e:
            local_logger.error(f"Unexpected error: {e}")
    local_logger.info("Sampling sequences... wait...")
    max_processes = max(n_jobs, os.cpu_count())
    global walk
    if strategy in strategy_to_class.keys():
        walk = strategy_to_class[strategy](G=graph, walk_length=CONFIG.WALKS_LENGTH,
                                           direction=CONFIG.DIRECTION,
                                           pressure=CONFIG.PRESSURE, allow_back=CONFIG.ALLOW_BACK)
    else:
        raise KeyError(
            f"The given strategy {strategy} is unknown. The following ones are implemented: {strategy_to_class.keys()}")
    sampling_pool = ProcessPool(nodes=max_processes)
    local_logger.info("Created a Pool with " + str(max_processes) + " processes ")
    # required to restart pool to update CONFIG inside the parallel part
    sampling_pool.terminate()
    sampling_pool.restart()
    BPs = graph.get_BPs()
    n_BPs = len(BPs)
    sampled = list()
    try:
        with tqdm(total=n_BPs) as pbar:
            for i, res in enumerate(sampling_pool.uimap(wrappedWalk, BPs)):
                sampled.append(res)
                pbar.update()
    except KeyboardInterrupt:
        print('Got ^C while pool mapping, terminating the pool')
        sampling_pool.terminate()
    res = list(itertools.chain(*sampled))
    sampling_pool.terminate()
    sampling_pool.restart()
    local_logger.info("Caching sampled sequences!")
    if use_cache:
        with open(CONFIG.WORK_FOLDER[0] + "sampled_sequences_cached.pkl", "wb") as file:
            pickle.dump(res, file)
    local_logger.info(f"Total number of raw sampled sequences is {len(res)}")
    local_logger.info(f"Average length of sequences is {sum(map(len, res)) / float(len(res))}")
    return res
Example #17
    def test_repeatability(self):
        import matplotlib.pyplot as mpl
        import numpy as np
        from pathos.multiprocessing import ProcessPool
        from itertools import product

        start, end = (10, 10), (350, 250)
        repeats = 2
        equal_paths = []
        rdrs = np.linspace(-100, 100, 10)
        jgs = [0]  # np.linspace(0, 5000, 2)
        jls = np.linspace(0, 50, 2)

        def make_path(start, end, rdr, jg, jl):
            algo = RiskJumpPointSearchAStar(ManhattanRiskHeuristic(
                self.large_diag_environment, risk_to_dist_ratio=rdr),
                                            jump_gap=jg,
                                            jump_limit=jl)
            return algo.find_path(self.large_diag_environment, start, end)

        def run_params(rdr, jg, jl):
            paths = [
                make_path(start, end, rdr, jg, jl) for _ in range(repeats)
            ]
            equal_paths.append(all([p == paths[0] for p in paths]))
            if not paths[0]:
                return [rdr, np.inf, jl, jg]
            risk_sum = sum([
                self.large_diag_environment.grid[n[0], n[1]] for n in paths[0]
            ])
            return [rdr, risk_sum, jl, jg]

        pool = ProcessPool(nodes=8)
        pool.restart(force=True)
        params = np.array(list(product(rdrs, jgs, jls)))
        risk_sums = pool.map(run_params, params[:, 0], params[:, 1], params[:,
                                                                            2])
        pool.close()

        # risk_sums = []
        # for rdr, jg, jl in product(rdrs, jgs, jls):
        #     paths = [make_path(start, end, rdr, jg, jl) for _ in range(repeats)]
        #     equal_paths.append(all([p == paths[0] for p in paths]))
        #     if not paths[0]:
        #         risk_sums.append([rdr, np.inf, jl, jg])
        #         continue
        #     risk_sum = sum([n.n for n in paths[0]])
        #     risk_sums.append([rdr, risk_sum, jl, jg])
        #
        #     fig = mpl.figure()
        #     ax = fig.add_subplot(111)
        #     for path in paths:
        #         ax.plot([n.x for n in path], [n.y for n in path], color='red')
        #     im = ax.imshow(self.large_diag_environment.grid)
        #     fig.colorbar(im, ax=ax, label='Population')
        #     ax.set_title(f'Risk JPS A* with RDR={rdr:.4g}, JL={jl} \n Risk sum={risk_sum:.4g}')
        #     fig.show()

        risk_sums = np.array(risk_sums)

        jl_fig = mpl.figure()
        ax = jl_fig.add_subplot(111)
        sc = ax.scatter(risk_sums[:, 0], risk_sums[:, 1], c=risk_sums[:, 2])
        ax.set_yscale('symlog')
        ax.set_xlabel('Risk-Distance Ratio')
        ax.set_ylabel('Path Risk sum')
        ax.set_title('R JPS+ A* Jump Limits')
        jl_fig.colorbar(sc, ax=ax, label='Jump Limit')
        jl_fig.show()

        jg_fig = mpl.figure()
        ax = jg_fig.add_subplot(111)
        sc = ax.scatter(risk_sums[:, 0], risk_sums[:, 1], c=risk_sums[:, 3])
        ax.set_yscale('symlog')
        ax.set_xlabel('Risk-Distance Ratio')
        ax.set_ylabel('Path Risk sum')
        ax.set_title('R JPS+ A* Jump Gaps')
        jg_fig.colorbar(sc, ax=ax, label='Jump Gap')
        jg_fig.show()

        self.assertTrue(all(equal_paths), 'Paths are not generated repeatably')
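pool.map above fans run_params out over three parallel argument columns, one per parameter. A minimal sketch of that multi-iterable map in pathos, where each worker call receives one element from each sequence:

from pathos.multiprocessing import ProcessPool


def add3(a, b, c):
    return a + b + c


pool = ProcessPool(nodes=2)
pool.restart(force=True)
print(pool.map(add3, [1, 2, 3], [10, 20, 30], [100, 200, 300]))  # [111, 222, 333]
pool.close()
pool.join()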
Example #18
def build_pt(sampler_class, pe_method, force_method, numdim = 5, masses = 1.0, \
    nT = 10, nproc = 1, Tmin = 1.0, Tmax = 100.0, max_iteration = 500, iters_to_swap = 1, \
    iters_to_waypoint = 5, iters_to_setdt = 10, iters_to_writestate = 1, run_token = 1, \
    dt = 1.0e-4, traj_len = 100, num_traj = 10, absxmax = 1.0e2, initial_rand_bounds = 1.0e2, \
    dt_max = None, min_rate = 0.6, max_rate = 0.7, gaussianprior_std = None):
    """Builds an instance of ParallelTempering. Reads restart file if it exists, or initialises a 
    fresh run.
    
    Args:
        sampler_class : Sampler class from module sampling. Eg. sampling.Hmc .
        pe_method : A method for evaluating the potential energy.
        force_method : A method for evaluating the forces.
        numdim (int) : The number of dimensions of the configuration space ('parameter space').
            (Default: 5)
        masses (single float or numpy array of floats, with length 1 or length numdim): specifies the
            masses associated with each dimension of the configuration space ('parameter space'). 
            (Default: 1.0)
        nT (int) : Number of temperatures to use. (Default: 10)
        nproc (int) : Number of processors to use. (Default: 1)
        Tmin (float) : Lowest temperature in ladder of temperatures. (Default 1.0)
        Tmax (float) : Maximum temperature in ladder of temperatures. (Default 100.0).
        max_iteration (int) : Max number of iterations to run. (Default 500).
        iters_to_swap (int) : Configuration swaps between neighbouring temperatures are attempted
            every iters_to_swap iterations. (Default 1).
        iters_to_waypoint (int) : Restart information is written after every iters_to_waypoint 
            iterations. (Default 5). 
        iters_to_setdt (int) : The step sizes (or equivalently time steps) are updated after every 
            iters_to_setdt iterations. (Default 10).
        iters_to_writestate (int) : The latest potential energy values and coordinates are written
            out after every iters_to_writestate iterations. (Default 1).
        run_token (int) : An integer for labelling the restart and output files for this calculation.
            (Default 1).
        dt (float) : Initial time step (or step size). This will be updated algorithmically, but a 
            good starting point saves time. (Default 1.0e-4).
        traj_len (int) : The number of time steps in a single trajectory. (Default 100).
        num_traj (int) : The number of trajectories run per iteration, per sampler. (Default 10).
        absxmax (single float or numpy array of floats, with length 1 or length numdim) : During the 
            main calculation, the sampler is restricted to a region x in [-absxmax,absxmax]. 
            (Default: 1.0e2).
        initial_rand_bounds : The same as absxmax, but applied only during random initialisation of the
            sampler's coordinate (parameters). This enables initialisation into a particular region, 
            which might for example, be most likely to contain the global minimum. (Default: 1.0e2).
        dt_max (float) : maximum step size (time step). (Default: median(absxmax), which is set in 
            module sampling.)
        min_rate (float) : minimum acceptance rate of trajectories. Used for setting step size (time 
            step). (Default: 0.6. The optimal acceptance rate for HMC on a multivariate Gaussian is 0.65
            http://www.mcmchandbook.net/HandbookChapter5.pdf, section 5.4.4.3).
        max_rate (float) : maximum acceptance rate of trajectories. Used for setting step size (time 
            step). (Default 0.7. The optimal acceptance rate for HMC on a multivariate Gaussian is 0.65
            http://www.mcmchandbook.net/HandbookChapter5.pdf, section 5.4.4.3).
        gaussianprior_std (single float or numpy array of floats, with length 1 or length numdim) : If 
            this is set to a real value then an additional term is applied to (H)MC acceptance/rejection 
            such that the target distribution is proportional to a multivariate Gaussian with this 
            standard deviation for each dimension. (Default: None.)

    Return:
        ParallelTempering class object

    """

    # CHECK FOR RESTART FILE AND DO RESTART IF PRESENT
    restrtfl = "restart_pt_" + str(run_token) + ".txt"
    if os.path.isfile("./" + restrtfl):  # read restart data from restart file
        didrestart = True

        print "Restarting from file ", restrtfl, time.ctime()
        nT, Tmin, Tmax, iteration, num_traj, samplers, walkers = \
            read_waypoint(restrtfl, sampler_class, pe_method, force_method)

    else:
        didrestart = False
        iteration = 0
        # a list of new walkers (which are class objects)
        samplers = build_samplers( sampler_class, pe_method, force_method, nT, Tmin, Tmax, dt, \
            traj_len, absxmax, dt_max, min_rate, max_rate, gaussianprior_std )

        print "Start initialise walkers ", time.ctime()
        walkers = np.asarray([])

        sampling.NewWalker.masses = masses
        sampling.NewWalker.numdim = numdim
        temp_pool = ProcessPool(nodes=nproc)

        # temporarily pass initial_rand_bounds through samplers, since pathos multiprocessing is
        # restrictive with arguments
        for sampler in samplers:
            sampler.random_init_bounds = initial_rand_bounds

        outs = sampling.apply_pool(temp_pool, initialise_walker, samplers)
        for i in range(len(outs)):
            walkers = np.append(walkers, outs[i][0])
            samplers[i] = outs[i][1]

        temp_pool.terminate()  # close pool
        temp_pool.restart()  # restart so the pool can be reused later
        print("Done initialise walkers", time.ctime())

    coutfl = "ptconfsout_" + str(run_token) + ".txt"
    ptoutfl = "ptout_" + str(run_token) + ".txt"

    thispt = ParallelTempering(samplers, walkers, num_traj, nT, nproc, Tmin, Tmax, iteration, \
        max_iteration, iters_to_swap, iters_to_waypoint, iters_to_setdt, iters_to_writestate, run_token, coutfl,\
        ptoutfl, restrtfl )

    if (not didrestart):
        thispt.set_dt_all(thispt.pt_pool, step_fac=0.1)

    return thispt
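
A minimal usage sketch, assuming the project's sampling module (with sampling.Hmc, as named in the docstring) is importable; the harmonic potential and all argument values below are purely illustrative:

import numpy as np

import sampling  # the project's sampling module (assumed importable)


def pe_method(x):
    # illustrative harmonic potential energy; any callable returning a float works
    return 0.5 * np.sum(np.asarray(x) ** 2)


def force_method(x):
    # forces are the negative gradient of the potential above
    return -np.asarray(x)


# hypothetical call: the sampler class comes from the docstring, all numbers are placeholders
pt = build_pt(sampling.Hmc, pe_method, force_method, numdim=5,
              nT=8, nproc=4, Tmin=1.0, Tmax=50.0, max_iteration=200,
              run_token=1)
# pt is now a ParallelTempering object ready to drive the run
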
Example #19
0
                        plot_figs_flag, krige_x_coords_plot_mesh,
                        krige_y_coords_plot_mesh, var_units, polys_list,
                        out_figs_dir, fin_cntn_idxs) for i in range(n_cpus))

        if mp_cond:
            ok_krige_flds = np.full(
                (fin_date_range.shape[0], krige_coords_orig_shape[0],
                 krige_coords_orig_shape[1]),
                np.nan,
                dtype=np.float32)

            mp_ress = []

            try:
                mp_pool = ProcessPool(n_cpus)
                mp_pool.restart(True)

                mp_ress = list(mp_pool.uimap(ordinary_kriging, ok_vars_gen))

                mp_pool.clear()

            except Exception as msg:
                mp_pool.close()
                mp_pool.join()
                print('Error in ordinary_kriging:', msg)

            for mp_res in mp_ress:
                if (len(mp_res) != 3) and (not isinstance(mp_res, list)):
                    print('\n', mp_res, '\n')
                    continue
Example #20
0
def plot_cats_prms_transfer_perfs(dbs_dir, n_cpus=1):
    '''Plot catchments' performances using parameters transferred from other
    catchments.
    '''

    cats_dbs = glob(os.path.join(dbs_dir, 'cat_*.hdf5'))

    assert cats_dbs

    n_cats = len(cats_dbs)
    n_cpus = min(n_cats, n_cpus)

    kf_prms_dict = {}
    cats_vars_dict = {}
    for cat_db in cats_dbs:
        with h5py.File(cat_db, 'r') as db:
            kfolds = db['data'].attrs['kfolds']
            cat = db.attrs['cat']

            cv_flag = db['data'].attrs['cv_flag']

            if cv_flag:
                print('plot_prm_trans_perfs not possible with cv_flag!')
                return

            f_var_infos = db['cdata/aux_var_infos'][...]
            prms_idxs = db['cdata/use_prms_idxs'][...]
            f_vars = db['cdata/aux_vars'][...]
            prms_flags = db['cdata/all_prms_flags'][...]
            bds_arr = db['cdata/bds_arr'][...]

            cat_vars_dict = {}
            cat_vars_dict['f_var_infos'] = f_var_infos
            cat_vars_dict['prms_idxs'] = prms_idxs
            cat_vars_dict['f_vars'] = f_vars
            cat_vars_dict['prms_flags'] = prms_flags
            cat_vars_dict['bds_arr'] = bds_arr

            cats_vars_dict[cat] = cat_vars_dict

            for i in range(1, kfolds + 1):
                kf_str = f'kf_{i:02d}'
                cd_db = db[f'calib/{kf_str}']

                opt_prms = cd_db['opt_prms'][...]

                if i not in kf_prms_dict:
                    kf_prms_dict[i] = {}

                kf_prms_dict[i][cat] = {}

                kf_prms_dict[i][cat]['opt_prms'] = opt_prms

    const_args = (kf_prms_dict, cats_vars_dict)
    plot_gen = ((cat_db, const_args) for cat_db in cats_dbs)

    if (n_cpus > 1) and (n_cats > 1):
        mp_pool = ProcessPool(n_cpus)
        mp_pool.restart(True)

        print(list(mp_pool.uimap(plot_cat_prms_transfer_perfs, plot_gen)))

        mp_pool.clear()
        mp_pool.close()
        mp_pool.join()

    else:
        for plot_args in plot_gen:
            plot_cat_prms_transfer_perfs(plot_args)

    return
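
A short driving sketch; the directory path is a placeholder, and the inspection block only echoes the HDF5 groups and attributes that the function reads above:

import os
from glob import glob

import h5py

dbs_dir = r'path_to/dbs_dir'  # placeholder: directory holding the cat_*.hdf5 databases

# echo the layout assumed by plot_cats_prms_transfer_perfs for the first database found
cat_db = glob(os.path.join(dbs_dir, 'cat_*.hdf5'))[0]
with h5py.File(cat_db, 'r') as db:
    print(db.attrs['cat'], db['data'].attrs['kfolds'], db['data'].attrs['cv_flag'])
    print(list(db['cdata']))  # aux_var_infos, use_prms_idxs, aux_vars, all_prms_flags, bds_arr
    print(list(db['calib']))  # kf_01, kf_02, ...

plot_cats_prms_transfer_perfs(dbs_dir, n_cpus=4)
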
Example #21
0
def regex_sentencize(docs,
                     max_sentence_length=None,
                     min_sentence_length=None,
                     n_threads=1,
                     reg_split=r"((?:\s*\n)+\s*)",
                     reg_token=r"[\w*]+|[^\w\s\n*]",
                     text_col="text",
                     doc_id_col="doc_id",
                     with_tqdm=False,
                     verbose=0):
    """
    Simple split MIMIC docs into sentences:
    - sentences bounds are found when multiple newline occurs
    - sentences too long are cut into `max_sentence_length` length sentences
      by splitting each sentence into the tokens using a dumb regexp.
    Parameters
    ----------
    docs: pd.DataFrame
    max_sentence_length: int
    with_tqdm: bool
    verbose: int
    doc_id_col: str
    text_col: str

    Returns
    -------
    (np.ndarray, np.ndarray, np.ndarray)
    """
    n_threads = min(n_threads, len(docs))
    if n_threads > 1:
        text_chunks = np.array_split(np.arange(len(docs)), n_threads)
        pool = ProcessPool(nodes=n_threads)
        pool.restart(force=True)
        results = [
            pool.apipe(regex_sentencize,
                       docs.iloc[chunk],
                       max_sentence_length,
                       1,
                       with_tqdm=False) for chunk in text_chunks
        ]
        results = [r.get() for r in results]
        pool.close()
        return pd.concat(results, ignore_index=True)

    reg_split = re.compile(reg_split)
    reg_token = re.compile(reg_token)
    doc_ids = []
    sentence_idx_list = []
    begins = []
    ends = []
    sentences = []
    max_size = 0
    min_size = 10000000
    for doc_id, txt in zip(
            docs[doc_id_col],
        (tqdm(docs[text_col], desc="Splitting docs into sentences")
         if with_tqdm else docs[text_col])):
        idx = 0
        queued_spans = []
        sentence_idx = 0
        for i, part in enumerate(reg_split.split(txt)):
            if i % 2 == 0:  # we're in a sentence
                queued_spans.extend([(m.start() + idx, m.end() + idx)
                                     for m in reg_token.finditer(part)])
                if max_sentence_length is None:
                    max_sentence_length_ = len(queued_spans)
                else:
                    max_sentence_length_ = max_sentence_length
                while len(queued_spans) > max_sentence_length_:
                    b = queued_spans[0][0]
                    e = queued_spans[max_sentence_length_ - 1][1]
                    doc_ids.append(doc_id)
                    sentence_idx_list.append(sentence_idx)
                    begins.append(b)
                    ends.append(e)

                    max_size, min_size = max(max_size,
                                             max_sentence_length_), min(
                                                 min_size,
                                                 max_sentence_length_)
                    queued_spans = queued_spans[max_sentence_length_:]
                    sentences.append(txt[b:e])
                    sentence_idx += 1
                if min_sentence_length is not None and len(
                        queued_spans) < min_sentence_length:
                    idx += len(part)
                    continue
                if len(queued_spans):
                    b = queued_spans[0][0]
                    e = queued_spans[-1][1]
                    doc_ids.append(doc_id)
                    sentence_idx_list.append(sentence_idx)
                    begins.append(b)
                    ends.append(e)
                    max_size, min_size = max(max_size, len(queued_spans)), min(
                        min_size, len(queued_spans))
                    queued_spans = []
                    sentences.append(txt[b:e])
                    sentence_idx += 1
            if part is not None:
                idx += len(part)
    if verbose:
        print("Sentence size: max = {}, min = {}".format(max_size, min_size))
    df = pd.DataFrame({
        doc_id_col: doc_ids,
        "sentence_idx": sentence_idx_list,
        "begin": begins,
        "end": ends,
        "text": sentences,
    }).astype({doc_id_col: docs[doc_id_col].dtype})
    df = df.merge(docs[[doc_id_col] + [
        col for col in docs.columns if col not in df.columns and col != "text"
    ]])
    df["sentence_id"] = join_cols(df[[doc_id_col, "sentence_idx"]], "/")
    return df
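
A minimal usage sketch with a toy DataFrame using the default column names; it assumes the package's join_cols helper (used above for the sentence_id column) is importable in the same scope:

import pandas as pd

docs = pd.DataFrame({
    "doc_id": [1, 2],
    "text": ["First sentence.\n\nSecond sentence of doc one.",
             "A single short document."],
})

# single-threaded call; returns one row per sentence with character offsets
sentences = regex_sentencize(docs, max_sentence_length=50, n_threads=1)
print(sentences[["doc_id", "sentence_idx", "begin", "end", "text"]])
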
Example #22
0
    def set(self, walkers, message_prefix, adjust_step_factor=0.9):
        """Updates the stepsize to achieve a trajectory acceptance rate in or as close as possible
        to the range [self.sampler.min_rate, self.sampler.max_rate], with stepsize in the range
        [10^-50, self.sampler.dt_max]. 
        
        Args:
            walkers : This MUST be an array or list of NewWalker class objects. These are NOT updated
                by this method.
            message_prefix (str/None) : if message_prefix is not None, then a message is printed
                describing the change in dt. If message_prefix is None, then no message is printed.
            adjust_step_factor (float) : self.sampler.dt is updated by * or / by this value.

        Return:
            duration (float) : duration of call to set in seconds. Can be useful for checking the 
                fraction of time spent updating step lengths.

        """
        start_time = time.time()
        if (self.nproc > 1):
            set_pool = ProcessPool(nodes=self.nproc)
        else:
            set_pool = None

        steplength_store = self.sampler.dt
        steplength_in = self.sampler.dt
        # protects against possible future bugs that would be hard to detect

        walk_n_walkers = int(self.nproc * np.ceil(float(self.min_num_data_point)/self.nproc))
        # rounds up to next multiple of self.nproc for maximum usage of compute

        walkers_clone = copy.deepcopy(walkers)  # expensive, but prevents this routine overwriting 
                                                # walkers

        first_time = True # we will make at least two tries. Logical flag ensures this.

        # Step size calibration loop:
        while True:

            # collect statistics on trajectory acceptance rate
            run_outputs = apply_pool(set_pool, self.run, np.random.choice(walkers_clone, \
                size=walk_n_walkers))
            results = list(map(itemgetter(1), run_outputs))
            del run_outputs

            # The total number of accepted/rejected moves for this step size
            rate = float(np.sum(results))/walk_n_walkers

            if (rate>=self.sampler.min_rate and rate<=self.sampler.max_rate):
                # If the total acceptance rate is within the desired range, return this stepsize
                self.print_dt_change(steplength_in, self.sampler.dt, message_prefix)
                break
            else: # update the stepsize to get closer to the desired range
                if( not first_time ): # dodge this the first time round - no rate_store saved yet
                    # Check whether rate and rate_store are on different sides 
                    # of interval
                    if ((min(rate,rate_store) < self.sampler.min_rate) and (max(rate,rate_store) > self.sampler.max_rate)):
                        # We previously obtained an acceptance rate on one side of the desired range 
                        # and now find an acceptance rate on the other side. We return the step size 
                        # that gave an acceptance rate closest to the middle of the desired range.

                        target = 0.5*(self.sampler.min_rate+self.sampler.max_rate) # middle of range
                        if (abs(rate-target)<abs(rate_store-target)):
                            # take current step length
                            self.print_dt_change(steplength_in, self.sampler.dt, \
                                message_prefix)
                            break
                        else:
                            # take saved step length
                            self.sampler.dt = steplength_store
                            rate = rate_store
                            self.print_dt_change(steplength_in, self.sampler.dt, \
                                message_prefix)
                            break

                else: # this is the first time - no rate_store saved yet
                    first_time = False

                # save current step length and acceptance rate
                steplength_store = self.sampler.dt
                rate_store = rate

                # update step length
                if rate < self.sampler.min_rate:
                    exp = 1.0
                elif rate >= self.sampler.max_rate:
                    exp = -1.0

                # try to adjust
                self.sampler.dt *= adjust_step_factor**exp

                # Check that step size is neither larger than max allowed value nor smaller than 
                # 10^-50 (useful for detecting errors).
                # Error check:
                if (self.sampler.dt < 1.0e-50):
                    if (message_prefix is not None):
                        prfx = message_prefix + " stepsizes got stepsize= '%e': too small. Is everything correct?\n" % \
                        (self.sampler.dt)
                    else:
                        prfx = " stepsizes got stepsize= '%e': too small. Is everything correct?\n" % \
                        (self.sampler.dt)

                    exit_error(prfx, 25)

                # If the update demands a step size larger than dt_max, set dt to dt_max then break.
                if (self.sampler.dt>self.sampler.dt_max):
                    self.sampler.dt = self.sampler.dt_max
                    self.print_dt_change(steplength_in, self.sampler.dt, \
                        message_prefix)
                    break

        # close pool
        if (set_pool is not None):
            set_pool.terminate()
            set_pool.restart()

        end_time = time.time()
        duration = end_time - start_time
        return duration
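
The step-size calibration above does not depend on the surrounding class; a compact sketch of the same idea, assuming a measure_rate(dt) callable that plays the role of the pooled self.run calls:

def calibrate_dt(measure_rate, dt, min_rate=0.6, max_rate=0.7,
                 adjust_step_factor=0.9, dt_max=1.0, dt_min=1.0e-50):
    """Shrink or grow dt until measure_rate(dt) falls in [min_rate, max_rate].

    measure_rate(dt) is assumed to return the observed acceptance rate for a
    trial step size dt; every other name here is illustrative only.
    """
    prev_dt, prev_rate = dt, None
    while True:
        rate = measure_rate(dt)
        if min_rate <= rate <= max_rate:
            return dt

        if prev_rate is not None and (min(rate, prev_rate) < min_rate and
                                      max(rate, prev_rate) > max_rate):
            # the last update overshot the window: keep whichever dt gave a
            # rate closest to the middle of the target range
            target = 0.5 * (min_rate + max_rate)
            return dt if abs(rate - target) < abs(prev_rate - target) else prev_dt

        prev_dt, prev_rate = dt, rate
        # acceptance too low means dt is too large, so shrink it (and vice versa)
        dt *= adjust_step_factor ** (1.0 if rate < min_rate else -1.0)

        if dt < dt_min:
            raise RuntimeError('step size %e fell below %e' % (dt, dt_min))
        if dt > dt_max:
            return dt_max
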