def plot_cats_hbv_sim(
        dbs_dir,
        water_bal_step_size,
        full_flag=False,
        wat_bal_flag=False,
        show_warm_up_steps_flag=False,
        n_cpus=1):

    '''Plot hbv simulations for every catchment for every kfold.'''

    cats_dbs = glob(os.path.join(dbs_dir, 'cat_*.hdf5'))

    assert cats_dbs

    n_cats = len(cats_dbs)
    n_cpus = min(n_cats, n_cpus)

    const_args = (
        water_bal_step_size, full_flag, wat_bal_flag, show_warm_up_steps_flag)

    plot_gen = ((cat_db, const_args) for cat_db in cats_dbs)

    if (n_cpus > 1) and (n_cats > 1):
        mp_pool = ProcessPool(n_cpus)
        mp_pool.restart(True)

        print(list(mp_pool.uimap(plot_cat_hbv_sim, plot_gen)))

        mp_pool.clear()
        mp_pool.close()
        mp_pool.join()

    else:
        for plot_args in plot_gen:
            plot_cat_hbv_sim(plot_args)

    return
def plot_cats_qsims(dbs_dir, n_cpus=1):

    '''Plot discharge simulations for every catchment for every kfold
    using its prm_vecs.'''

    cats_dbs = glob(os.path.join(dbs_dir, 'cat_*.hdf5'))

    assert cats_dbs

    n_cats = len(cats_dbs)
    n_cpus = min(n_cats, n_cpus)

    plot_gen = (cat_db for cat_db in cats_dbs)

    if (n_cpus > 1) and (n_cats > 1):
        mp_pool = ProcessPool(n_cpus)
        mp_pool.restart(True)

        print(list(mp_pool.uimap(plot_cat_qsims, plot_gen)))

        mp_pool.clear()
        mp_pool.close()
        mp_pool.join()

    else:
        for plot_args in plot_gen:
            plot_cat_qsims(plot_args)

    return
def plot_cats_prm_vecs(dbs_dir, n_cpus):

    '''Plot the final parameter set from every kfold for every catchment
    along with the objective function value distribution.
    '''

    cats_dbs = glob(os.path.join(dbs_dir, 'cat_*.hdf5'))

    assert cats_dbs

    n_cats = len(cats_dbs)
    n_cpus = min(n_cats, n_cpus)

    opt_res_gen = (cat_db for cat_db in cats_dbs)

    if (n_cpus > 1) and (n_cats > 1):
        mp_pool = ProcessPool(n_cpus)
        mp_pool.restart(True)

        print(list(mp_pool.uimap(plot_cat_prm_vecs, opt_res_gen)))

        mp_pool.clear()
        mp_pool.close()
        mp_pool.join()

    else:
        for opt_res in opt_res_gen:
            plot_cat_prm_vecs(opt_res)

    return
def plot_cats_vars_errors(dbs_dir, err_var_labs, n_cpus):

    cats_dbs = glob(os.path.join(dbs_dir, 'cat_*.hdf5'))

    assert cats_dbs

    n_cats = len(cats_dbs)
    n_cpus = min(n_cats, n_cpus)

    cats_paths_gen = ((cat_db, err_var_labs) for cat_db in cats_dbs)

    if (n_cpus > 1) and (n_cats > 1):
        mp_pool = ProcessPool(n_cpus)
        mp_pool.restart(True)

        print(list(mp_pool.uimap(plot_cat_vars_errors, cats_paths_gen)))

        mp_pool.clear()
        mp_pool.close()
        mp_pool.join()

    else:
        for cat_paths in cats_paths_gen:
            plot_cat_vars_errors(cat_paths)

    return
def plot_cats_kfold_effs(dbs_dir, hgs_db_path, compare_ann_cyc_flag, n_cpus):

    '''Plot the k-fold efficiency results.'''

    cats_dbs = glob(os.path.join(dbs_dir, 'cat_*.hdf5'))

    assert cats_dbs

    n_cats = len(cats_dbs)
    n_cpus = min(n_cats, n_cpus)

    const_args = (compare_ann_cyc_flag, hgs_db_path)

    cats_paths_gen = ((cat_db, const_args) for cat_db in cats_dbs)

    if (n_cpus > 1) and (n_cats > 1):
        mp_pool = ProcessPool(n_cpus)
        mp_pool.restart(True)

        print(list(mp_pool.uimap(plot_cat_kfold_effs, cats_paths_gen)))

        mp_pool.clear()
        mp_pool.close()
        mp_pool.join()

    else:
        for cat_paths in cats_paths_gen:
            plot_cat_kfold_effs(cat_paths)

    return
def plot_cats_best_prms_1d(dbs_dir, n_cpus):

    '''Plot every best kfold parameter set for all catchments.'''

    cats_dbs = glob(os.path.join(dbs_dir, 'cat_*.hdf5'))

    assert cats_dbs

    n_cats = len(cats_dbs)
    n_cpus = min(n_cats, n_cpus)

    cats_paths_gen = (cat_db for cat_db in cats_dbs)

    if (n_cpus > 1) and (n_cats > 1):
        mp_pool = ProcessPool(n_cpus)
        mp_pool.restart(True)

        print(list(mp_pool.uimap(plot_cat_best_prms_1d, cats_paths_gen)))

        mp_pool.clear()
        mp_pool.close()
        mp_pool.join()

    else:
        for cat_paths in cats_paths_gen:
            plot_cat_best_prms_1d(cat_paths)

    return
def _prep_anomaly_bjs_mp(anoms_arr, bjs_arr, n_cpus, fig_out_dir):

    assert anoms_arr.shape == bjs_arr.shape

    _idxs = ret_mp_idxs(anoms_arr.shape[1], n_cpus)
    _idxs_list = [_idxs[i:i + 2] for i in range(n_cpus)]

    _anoms_gen = (
        (anoms_arr[:, _idxs_list[i][0]:_idxs_list[i][1]])
        for i in range(n_cpus))

    _bjs_gen = (
        (bjs_arr[:, _idxs_list[i][0]:_idxs_list[i][1]])
        for i in range(n_cpus))

    mp_pool = ProcessPool(n_cpus)
    mp_pool.restart(True)

    try:
        print(
            list(
                mp_pool.uimap(
                    Anomaly._plot_anomaly_bjs_cdf,
                    _idxs_list,
                    _anoms_gen,
                    _bjs_gen,
                    [fig_out_dir] * n_cpus)))

        mp_pool.clear()

    except Exception as msg:
        mp_pool.close()
        mp_pool.join()
        print('Error in _plot_anomaly_bjs_cdf:', msg)

    return
class BatchRunnerMP(BatchRunner):
    """ Child class of BatchRunner, extended with multiprocessing support. """

    def __init__(self, model_cls, nr_processes=2, **kwargs):
        """ Create a new BatchRunnerMP for a given model with the given
        parameters.

        Args:
            model_cls: The class of model to batch-run.
            nr_processes: the number of separate processes the BatchRunner
                should start, all running in parallel.
            kwargs: the kwargs required for the parent BatchRunner class
        """
        if not pathos_support:
            raise MPSupport
        super().__init__(model_cls, **kwargs)
        self.pool = ProcessPool(nodes=nr_processes)

    def run_all(self):
        """
        Run the model at all parameter combinations and store results,
        overrides run_all from BatchRunner.
        """
        # register the process pool and init a queue
        job_queue = []

        param_names, param_ranges = zip(*self.variable_parameters.items())
        run_count = count()
        total_iterations = self.iterations
        for param_range in param_ranges:
            total_iterations *= len(param_range)

        with tqdm(total=total_iterations,
                  disable=not self.display_progress) as pbar:
            for param_values in product(*param_ranges):
                kwargs = dict(zip(param_names, param_values))
                kwargs.update(self.fixed_parameters)

                # make a new process and add it to the queue
                for i in range(self.iterations):
                    job_queue.append(
                        self.pool.uimap(self._run_single_model,
                                        (param_values, ),
                                        (next(run_count), ),
                                        (kwargs, )))

            # empty the queue
            results = []
            for task in job_queue:
                for model_vars, agent_vars in list(task):
                    results.append((model_vars, agent_vars))
                pbar.update()

            # store the results
            for model_vars, agent_vars in results:
                if self.model_reporters:
                    for model_key, model_val in model_vars.items():
                        self.model_vars[model_key] = model_val
                if self.agent_reporters:
                    for agent_key, reports in agent_vars.items():
                        self.agent_vars[agent_key] = reports
class BatchRunnerMP(BatchRunner):
    """ Child class of BatchRunner, extended with multiprocessing support. """

    def __init__(self, model_cls, nr_processes=2, **kwargs):
        """ Create a new BatchRunnerMP for a given model with the given
        parameters.

        Args:
            model_cls: The class of model to batch-run.
            nr_processes: the number of separate processes the BatchRunner
                should start, all running in parallel.
            kwargs: the kwargs required for the parent BatchRunner class
        """
        if not pathos_support:
            raise MPSupport
        super().__init__(model_cls, **kwargs)
        self.pool = ProcessPool(nodes=nr_processes)

    def run_all(self):
        """
        Run the model at all parameter combinations and store results,
        overrides run_all from BatchRunner.
        """
        run_count = count()
        total_iterations, all_kwargs, all_param_values = self._make_model_args()

        # register the process pool and init a queue
        job_queue = []
        with tqdm(total=total_iterations,
                  disable=not self.display_progress) as pbar:
            for i, kwargs in enumerate(all_kwargs):
                param_values = all_param_values[i]
                for _ in range(self.iterations):
                    # make a new process and add it to the queue
                    job_queue.append(
                        self.pool.uimap(self.run_iteration,
                                        (kwargs,),
                                        (param_values,),
                                        (next(run_count),)))

            # empty the queue
            results = []
            for task in job_queue:
                for model_vars, agent_vars in list(task):
                    results.append((model_vars, agent_vars))
                pbar.update()

            # store the results
            for model_vars, agent_vars in results:
                if self.model_reporters:
                    for model_key, model_val in model_vars.items():
                        self.model_vars[model_key] = model_val
                if self.agent_reporters:
                    for agent_key, reports in agent_vars.items():
                        self.agent_vars[agent_key] = reports
def run_all(self, processes=8):
    """ Run the model at all parameter combinations and store results.

    Args:
        processes (int): number of processes to start
    """
    # Register the process pool and init a queue
    pool = ProcessPool(nodes=processes)
    job_queue = []

    if self.variable_parameters:
        param_names, param_sets = self.generate_samples()
    else:
        param_names = self.param_names
        param_sets = self.param_sets

    run_count = count()
    total_iterations = self.iterations * self.l_param_sets

    with tqdm(total=total_iterations,
              disable=not self.display_progress) as pbar:
        for param_values in param_sets:
            kwargs = dict(zip(param_names, param_values))
            kwargs.update(self.fixed_parameters)

            # Make a new process and add it to the queue
            for i in range(self.iterations):
                job_queue.append(
                    pool.uimap(self.iter,
                               (kwargs, ),
                               (param_values, ),
                               (next(run_count), )))

        # Empty the process queue
        results = []
        for task in job_queue:
            for model_vars, agent_vars in list(task):
                results.append((model_vars, agent_vars))
            pbar.update()

        for model_vars, agent_vars in results:
            if self.model_reporters:
                for model_key, model_val in model_vars.items():
                    self.model_vars[model_key] = model_val
            if self.agent_reporters:
                for agent_key, reports in agent_vars.items():
                    self.agent_vars[agent_key] = reports
def demonstrate(games, parallel=True):

    def evaluate(game):
        s = State()
        a = MinimaxAgent(max_depth=6, max_width=6)
        ss = []
        pp = []
        for x, y in game:
            d = a._get_dist(s)
            if len(d) != 1 or (d[0][0] >= 0 and d[0][1] >= 0):
                ss.append(s.featurize())
                pp.append(util.dist_to_prob(d))
            s.move(x, y)
        sys.stdout.write("=")
        sys.stdout.flush()
        return (np.array(ss), np.array(pp))

    if parallel:
        pool = ProcessPool(nodes=7)
        results = list(pool.uimap(evaluate, games))
    else:
        results = list(map(evaluate, games))

    states = np.concatenate(list(map(lambda t: t[0], results)), axis=0)
    probs = np.concatenate(list(map(lambda t: t[1], results)), axis=0)
    return states, probs
def plot_cats_prm_vecs_evo(
        dbs_dir,
        save_obj_flag,
        save_png_flag,
        save_gif_flag,
        anim_secs,
        n_cpus=1):

    '''Plot the evolution of parameter vectors and convex hull for every
    catchment for every kfold.
    '''

    cats_dbs = glob(os.path.join(dbs_dir, 'cat_*.hdf5'))

    assert cats_dbs

    n_cats = len(cats_dbs)
    n_cpus = min(n_cats, n_cpus)

    opt_res_gen = (
        (cat_db, save_obj_flag, save_png_flag, save_gif_flag, anim_secs)
        for cat_db in cats_dbs)

    if (n_cpus > 1) and (n_cats > 1):
        mp_pool = ProcessPool(n_cpus)
        mp_pool.restart(True)

        print(list(mp_pool.uimap(plot_cat_prm_vecs_evo, opt_res_gen)))

        mp_pool.clear()
        mp_pool.close()
        mp_pool.join()

    else:
        for opt_res in opt_res_gen:
            plot_cat_prm_vecs_evo(opt_res)

    return
def main(): main_dir = Path( r'P:\Synchronize\IWS\Testings\fourtrans_practice\multisite_phs_spec_corr' ) os.chdir(main_dir) interp_var = 'temp' ft_type = 'mag' #========================================================================== if interp_var == 'temp': # MEAN TEMPERATURE in_data_file = os.path.join(f'temperature_{ft_type}_spec_df.csv') in_vgs_file = os.path.join(r'temperature_cftns.csv') in_stns_coords_file = os.path.join(os.path.dirname(in_data_file), r'temperature_avg_coords.csv') out_dir = r'temperature_kriging' var_units = u'\u2103' # 'centigrade' var_name = 'temperature' out_krig_net_cdf_file = f'kriging_1km_{ft_type}.nc' # interpolated values # can be int, float, 'min_in'/'max_in' or None # min_var_val = 'min_in' # max_var_val = 'max_in' # min_var_val = None # max_var_val = None #========================================================================== #========================================================================== elif interp_var == 'ppt': # PRECIPITATION in_data_file = os.path.join(f'precipitation_{ft_type}_spec_df.csv') in_vgs_file = os.path.join(r'precipitation_cftns.csv') in_stns_coords_file = os.path.join(os.path.dirname(in_data_file), r'precipitation_coords.csv') out_dir = r'precipitation_kriging' var_units = 'mm' var_name = 'precipitation' out_krig_net_cdf_file = f'kriging_1km_{ft_type}.nc' # interpolated values # can be int, float, 'min_in'/'max_in' or None # min_var_val = 'min_in' # max_var_val = 'max_in' # min_var_val = None # max_var_val = None #========================================================================== else: raise ValueError(f'Invalid value for interp_var: {interp_var}!') out_krig_net_cdf_file = out_krig_net_cdf_file # assuming in_drift_raster and in_stns_coords_file and in_bounds_shp_file # have the same coordinates system # assuming in_drift_rasters_list have the same cell sizes, bounds and NDVs # basically they are copies of each other except for the drift values in_drift_rasters_list = ([ r'P:\Synchronize\IWS\QGIS_Neckar\raster\lower_de_gauss_z3_1km.tif' ]) # in_bounds_shp_file = ( # os.path.join(r'P:\Synchronize\IWS\QGIS_Neckar\raster', # r'taudem_out_spate_rockenau\watersheds.shp')) in_bounds_shp_file = (os.path.join( r'P:\Synchronize\IWS\QGIS_Neckar\raster\taudem_out_spate_rockenau\watersheds.shp' )) align_ras_file = in_drift_rasters_list[0] out_figs_dir = os.path.join(out_dir, 'krige_figs') x_coords_lab = 'X' y_coords_lab = 'Y' time_dim_lab = 'freq' nc_mode = 'w' # min_ppt_thresh = 1.0 idw_exp = 5 n_cpus = 1 buffer_dist = 20e3 sec_buffer_dist = 2e3 in_sep = str(';') ord_krige_flag = True sim_krige_flag = True edk_krige_flag = True idw_flag = True plot_figs_flag = True # ord_krige_flag = False sim_krige_flag = False edk_krige_flag = False idw_flag = False plot_figs_flag = False os.chdir(main_dir) if not os.path.exists(out_dir): os.mkdir(out_dir) if (not os.path.exists(out_figs_dir)) and plot_figs_flag: os.mkdir(out_figs_dir) # print('min_var_val:', min_var_val) # print('max_var_val:', max_var_val) print('idw_exp:', idw_exp) print('n_cpus:', n_cpus) print('nc_mode:', nc_mode) print('var_name:', var_name) print('out_dir:', out_dir) print('in_bounds_shp_file:', in_bounds_shp_file) print('out_krig_net_cdf_file:', out_krig_net_cdf_file) assert any([ord_krige_flag, sim_krige_flag, edk_krige_flag, idw_flag]) #========================================================================== # read the data frames #========================================================================== in_data_df = pd.read_csv(in_data_file, sep=in_sep, 
index_col=0, encoding='utf-8') in_vgs_df = pd.read_csv(in_vgs_file, sep=in_sep, index_col=0, encoding='utf-8') in_stns_coords_df = pd.read_csv(in_stns_coords_file, sep=in_sep, index_col=0, encoding='utf-8') all_stns = in_data_df.columns.intersection(in_stns_coords_df.index) assert all_stns.shape[0] in_data_df = in_data_df.loc[:, all_stns] in_stns_coords_df = in_stns_coords_df.loc[all_stns, :] #========================================================================== # Get stations that are around/in the bounds_shp only #========================================================================== bds_vec = ogr.Open(in_bounds_shp_file) assert bds_vec bds_lyr = bds_vec.GetLayer(0) feat_buffs_list = [] feat_sec_buffs_list = [] for feat in bds_lyr: # just to get the names of the catchments geom = feat.GetGeometryRef().Clone() assert geom feat_buffs_list.append(geom.Buffer(buffer_dist)) feat_sec_buffs_list.append(geom.Buffer(sec_buffer_dist)) bds_vec.Destroy() assert feat_buffs_list and feat_sec_buffs_list print(len(feat_buffs_list), 'polygons in the in_bounds_shp_file...') fin_stns = [] for poly in feat_buffs_list: for stn in all_stns: if stn in fin_stns: continue curr_pt = cnvt_to_pt(*in_stns_coords_df.loc[stn, ['X', 'Y']].values) if chk_cntmt(curr_pt, poly): fin_stns.append(stn) assert fin_stns print('%d stations out of %d within buffer zone of in_bounds_shp_file' % (len(fin_stns), in_stns_coords_df.shape[0])) fin_stns = np.unique(fin_stns) in_data_df = in_data_df.loc[:, fin_stns] in_stns_coords_df = in_stns_coords_df.loc[fin_stns, :] #========================================================================== # Read the DEM #========================================================================== # if edk_krige_flag: # in_drift_arr_list = [] # _rows_list = [] # _cols_list = [] # # for in_drift_raster in in_drift_rasters_list: # in_drift_ds = gdal.Open(in_drift_raster) # # assert in_drift_ds, 'GDAL cannot open %s' % in_drift_raster # # drift_rows = in_drift_ds.RasterYSize # drift_cols = in_drift_ds.RasterXSize # # drift_geotransform = in_drift_ds.GetGeoTransform() # # _drift_x_min = drift_geotransform[0] # _drift_y_max = drift_geotransform[3] # # drift_band = in_drift_ds.GetRasterBand(1) # drift_ndv = drift_band.GetNoDataValue() # # cell_width = drift_geotransform[1] # cell_height = abs(drift_geotransform[5]) # # _drift_x_max = _drift_x_min + (drift_cols * cell_width) # _drift_y_min = _drift_y_max - (drift_rows * cell_height) # # _arr = in_drift_ds.ReadAsArray() # # in_drift_arr_list.append(_arr) # _rows_list.append(_arr.shape[0]) # _cols_list.append(_arr.shape[1]) # # assert all(_ == _rows_list[0] for _ in _rows_list), ( # 'Drift raster have unequal number of rows!') # # assert all(_ == _cols_list[0] for _ in _cols_list), ( # 'Drift raster have unequal number of columns!') #========================================================================== # Read the bounding shapefile #========================================================================== # sf = shp.Reader(in_bounds_shp_file) # polys_list = [i.__geo_interface__ for i in sf.iterShapes()] ((fin_x_min, fin_x_max, fin_y_min, fin_y_max), cell_width) = get_aligned_shp_bds_and_cell_size(in_bounds_shp_file, align_ras_file) cell_height = cell_width fin_x_min -= 2 * cell_width fin_x_max += 2 * cell_width fin_y_min -= 2 * cell_height fin_y_max += 2 * cell_height # if edk_krige_flag: # assert fin_x_min > _drift_x_min # assert fin_x_max < _drift_x_max # assert fin_y_min > _drift_y_min # assert fin_y_max < _drift_y_max # # min_col = 
int(max(0, (fin_x_min - _drift_x_min) / cell_width)) # max_col = int(ceil((fin_x_max - _drift_x_min) / cell_width)) # # min_row = int(max(0, (_drift_y_max - fin_y_max) / cell_height)) # max_row = int(ceil((_drift_y_max - fin_y_min) / cell_height)) # # else: min_col = 0 max_col = int(ceil((fin_x_max - fin_x_min) / cell_width)) min_row = 0 max_row = int(ceil((fin_y_max - fin_y_min) / cell_height)) #========================================================================== # Calculate coordinates at which to krige #========================================================================== assert 0 <= min_col <= max_col, (min_col, max_col) assert 0 <= min_row <= max_row, (min_row, max_row) strt_x_coord = fin_x_min + (0.5 * cell_width) end_x_coord = strt_x_coord + ((max_col - min_col) * cell_width) strt_y_coord = fin_y_max - (0.5 * cell_height) end_y_coord = strt_y_coord - ((max_row - min_row) * cell_height) krige_x_coords = np.linspace(strt_x_coord, end_x_coord, (max_col - min_col + 1)) krige_y_coords = np.linspace(strt_y_coord, end_y_coord, (max_row - min_row + 1)) krige_x_coords_mesh, krige_y_coords_mesh = np.meshgrid( krige_x_coords, krige_y_coords) krige_coords_orig_shape = krige_x_coords_mesh.shape # if plot_figs_flag: # # xy coords for pcolormesh # pcolmesh_x_coords = np.linspace( # fin_x_min, fin_x_max, (max_col - min_col + 1)) # # pcolmesh_y_coords = np.linspace( # fin_y_max, fin_y_min, (max_row - min_row + 1)) # # krige_x_coords_plot_mesh, krige_y_coords_plot_mesh = ( # np.meshgrid(pcolmesh_x_coords, pcolmesh_y_coords)) # # else: # krige_x_coords_plot_mesh, krige_y_coords_plot_mesh = None, None krige_x_coords_mesh = krige_x_coords_mesh.ravel() krige_y_coords_mesh = krige_y_coords_mesh.ravel() # print('\n\n') # print('#' * 10) # # _beg_t = timeit.default_timer() # # print(krige_x_coords_mesh.shape[0], # 'cells to interpolate per step before intersection!') # fin_cntn_idxs = np.ones(krige_x_coords_mesh.shape[0], dtype=bool) # fin_cntn_idxs = np.zeros(krige_x_coords_mesh.shape[0], dtype=bool) # ogr_pts = np.vectorize(cnvt_to_pt)(krige_x_coords_mesh, krige_y_coords_mesh) # # for poly in feat_sec_buffs_list: # curr_cntn_idxs = np.vectorize(chk_cntmt)(ogr_pts, poly) # fin_cntn_idxs = fin_cntn_idxs | curr_cntn_idxs # # print(fin_cntn_idxs.sum(), # 'cells to interpolate per step after intersection!') # # _end_t = timeit.default_timer() # _tot_t = _end_t - _beg_t # # print(f'Took {_tot_t:0.4f} seconds!') # print('#' * 10) # # krige_x_coords_mesh = krige_x_coords_mesh[fin_cntn_idxs] # krige_y_coords_mesh = krige_y_coords_mesh[fin_cntn_idxs] # if edk_krige_flag: # drift_vals_list = [] # # krige_cols = np.arange(min_col, max_col + 1, dtype=int) # krige_rows = np.arange(min_row, max_row + 1, dtype=int) # # assert krige_x_coords.shape[0] == krige_cols.shape[0] # assert krige_y_coords.shape[0] == krige_rows.shape[0] # # (krige_drift_cols_mesh, # krige_drift_rows_mesh) = np.meshgrid(krige_cols, krige_rows) # # krige_drift_cols_mesh = krige_drift_cols_mesh.ravel() # krige_drift_rows_mesh = krige_drift_rows_mesh.ravel() # # krige_drift_cols_mesh = krige_drift_cols_mesh[fin_cntn_idxs] # krige_drift_rows_mesh = krige_drift_rows_mesh[fin_cntn_idxs] # # for _drift_arr in in_drift_arr_list: # _drift_vals = _drift_arr[ # krige_drift_rows_mesh, krige_drift_cols_mesh] # # drift_vals_list.append(_drift_vals) # # # drift_vals_arr = np.array(drift_vals_list, dtype=float) # # drift_df_cols = list(range(len(in_drift_rasters_list))) # in_stns_drift_df = pd.DataFrame( # index=in_stns_coords_df.index, # 
columns=drift_df_cols, # dtype=float) # # for stn in in_stns_drift_df.index: # stn_x = in_stns_coords_df.loc[stn, x_coords_lab] # stn_y = in_stns_coords_df.loc[stn, y_coords_lab] # # stn_col = int((stn_x - _drift_x_min) / cell_width) # stn_row = int((_drift_y_max - stn_y) / cell_height) # # for col, _arr in zip(drift_df_cols, in_drift_arr_list): # try: # _ = _arr[stn_row, stn_col] # if not np.isclose(drift_ndv, _): # in_stns_drift_df.loc[stn, col] = _ # # except IndexError: # pass # # in_stns_drift_df.dropna(inplace=True) #========================================================================== # Open NC #========================================================================== out_nc = nc.Dataset(os.path.join(out_dir, out_krig_net_cdf_file), mode=str(nc_mode)) if nc_mode == 'w': out_nc.set_auto_mask(False) out_nc.createDimension(x_coords_lab, krige_x_coords.shape[0]) out_nc.createDimension(y_coords_lab, krige_y_coords.shape[0]) out_nc.createDimension(time_dim_lab, in_data_df.shape[0]) x_coords_nc = out_nc.createVariable(x_coords_lab, 'd', dimensions=x_coords_lab) x_coords_nc[:] = krige_x_coords y_coords_nc = out_nc.createVariable(y_coords_lab, 'd', dimensions=y_coords_lab) y_coords_nc[:] = krige_y_coords time_nc = out_nc.createVariable(time_dim_lab, 'i8', dimensions=time_dim_lab) time_nc[:] = np.arange(in_data_df.shape[0]) else: raise RuntimeError('Not configured for this option!') time_nc = out_nc.variables[time_dim_lab] krige_y_coords = y_coords_nc[:] krige_x_coords = x_coords_nc[:] #========================================================================== # MP stuff #========================================================================== mp_cond = False if ((n_cpus > 1) and (in_data_df.shape[0] > (n_cpus + 1))): idxs = pd.np.linspace(0, in_data_df.shape[0], (n_cpus) + 1, endpoint=True, dtype=int) idxs = np.unique(idxs) print('MP idxs:', idxs) if idxs.shape[0] == 1: idxs = np.concatenate((np.array([0]), idxs)) mp_cond = True else: idxs = [0, in_data_df.shape[0]] #========================================================================== # Krige #========================================================================== if ord_krige_flag: print('\n\n') print('#' * 10) _beg_t = timeit.default_timer() print('Ordinary Kriging...') if 'OK' not in out_nc.variables: ok_nc = out_nc.createVariable('OK', 'd', dimensions=(time_dim_lab, y_coords_lab, x_coords_lab), fill_value=False) else: ok_nc = out_nc.variables['OK'] ok_vars_gen = ((in_data_df.iloc[idxs[i]:idxs[i + 1]], in_stns_coords_df, in_vgs_df.loc[ft_type][0], krige_x_coords_mesh, krige_y_coords_mesh, krige_coords_orig_shape, (idxs[i], idxs[i + 1]), fin_cntn_idxs) for i in range(n_cpus)) if mp_cond: ok_krige_flds = np.full( (in_data_df.shape[0], krige_coords_orig_shape[0], krige_coords_orig_shape[1]), np.nan, dtype=np.float32) mp_ress = [] try: mp_pool = ProcessPool(n_cpus) mp_pool.restart(True) mp_ress = list(mp_pool.uimap(ordinary_kriging, ok_vars_gen)) mp_pool.clear() except Exception as msg: mp_pool.close() mp_pool.join() print('Error in ordinary_kriging:', msg) for mp_res in mp_ress: if (len(mp_res) != 3) and (not isinstance(list)): print('\n', mp_res, '\n') continue [strt_index, end_index, sub_ok_krige_flds] = mp_res ok_krige_flds[strt_index:end_index] = sub_ok_krige_flds # free memory mp_res[2], sub_ok_krige_flds = None, None ok_nc[:] = ok_krige_flds else: [strt_index, end_index, ok_krige_flds] = ordinary_kriging(next(ok_vars_gen)) ok_nc[:] = ok_krige_flds ok_nc.units = var_units ok_nc.standard_name = var_name + ' (ordinary 
kriging)' ok_krige_flds = None _end_t = timeit.default_timer() _tot_t = _end_t - _beg_t print(f'Took {_tot_t:0.4f} seconds!') print('#' * 10) # if sim_krige_flag: # print('\n\n') # print('#' * 10) # # _beg_t = timeit.default_timer() # # print('Simple Kriging...') # if 'SK' not in out_nc.variables: # sk_nc = out_nc.createVariable( # 'SK', # 'd', # dimensions=(time_dim_lab, y_coords_lab, x_coords_lab), # fill_value=False) # # else: # sk_nc = out_nc.variables['SK'] # # sk_vars_gen = ((in_data_df.iloc[idxs[i]:idxs[i + 1]], # in_stns_coords_df, # in_vgs_df.iloc[idxs[i]:idxs[i + 1]], # min_ppt_thresh, # var_name, # krige_x_coords_mesh, # krige_y_coords_mesh, # krige_coords_orig_shape, # min_var_val, # max_var_val, # (idxs[i], idxs[i + 1]), # plot_figs_flag, # krige_x_coords_plot_mesh, # krige_y_coords_plot_mesh, # var_units, # polys_list, # out_figs_dir, # fin_cntn_idxs) for i in range(n_cpus)) # # if mp_cond: # sk_krige_flds = np.full( # (in_data_df.shape[0], # krige_coords_orig_shape[0], # krige_coords_orig_shape[1]), # np.nan, # dtype=np.float32) # # mp_ress = [] # # try: # mp_pool = ProcessPool(n_cpus) # mp_pool.restart(True) # # mp_ress = list(mp_pool.uimap(simple_kriging, sk_vars_gen)) # # mp_pool.clear() # # except Exception as msg: # mp_pool.close() # mp_pool.join() # print('Error in simple_kriging:', msg) # # for mp_res in mp_ress: # if (len(mp_res) != 3) and (not isinstance(list)): # print('\n', mp_res, '\n') # continue # # [strt_index, end_index, sub_sk_krige_flds] = mp_res # sk_krige_flds[strt_index:end_index] = sub_sk_krige_flds # # # free memory # mp_res[2], sub_sk_krige_flds = None, None # # sk_nc[:] = sk_krige_flds # # else: # [strt_index, # end_index, # sk_krige_flds] = simple_kriging(next(sk_vars_gen)) # # sk_nc[:] = sk_krige_flds # # sk_nc.units = var_units # sk_nc.standard_name = var_name + ' (simple kriging)' # # sk_krige_flds = None # # _end_t = timeit.default_timer() # _tot_t = _end_t - _beg_t # # print(f'Took {_tot_t:0.4f} seconds!') # print('#' * 10) # # if edk_krige_flag: # print('\n\n') # print('#' * 10) # # _beg_t = timeit.default_timer() # # print('External Drift Kriging...') # if 'EDK' not in out_nc.variables: # edk_nc = out_nc.createVariable( # 'EDK', # 'd', # dimensions=(time_dim_lab, y_coords_lab, x_coords_lab), # fill_value=False) # # else: # edk_nc = out_nc.variables['EDK'] # # edk_vars_gen = ((in_data_df.iloc[idxs[i]:idxs[i + 1]], # in_stns_drift_df, # in_stns_coords_df, # in_vgs_df.iloc[idxs[i]:idxs[i + 1]], # min_ppt_thresh, # var_name, # krige_x_coords_mesh, # krige_y_coords_mesh, # drift_vals_arr, # krige_coords_orig_shape, # drift_ndv, # min_var_val, # max_var_val, # (idxs[i], idxs[i + 1]), # plot_figs_flag, # krige_x_coords_plot_mesh, # krige_y_coords_plot_mesh, # var_units, # polys_list, # out_figs_dir, # fin_cntn_idxs) for i in range(n_cpus)) # # if mp_cond: # edk_krige_flds = np.full( # (in_data_df.shape[0], # krige_coords_orig_shape[0], # krige_coords_orig_shape[1]), # np.nan, # dtype=np.float32) # # mp_ress = [] # # try: # mp_pool = ProcessPool(n_cpus) # mp_pool.restart(True) # # mp_ress = list(mp_pool.uimap( # external_drift_kriging, edk_vars_gen)) # # mp_pool.clear() # # except Exception as msg: # mp_pool.close() # mp_pool.join() # print('Error in external_drift_kriging:', msg) # # for mp_res in mp_ress: # if (len(mp_res) != 3) and (not isinstance(list)): # print('\n', mp_res, '\n') # continue # # [strt_index, end_index, sub_edk_krige_flds] = mp_res # edk_krige_flds[strt_index:end_index] = sub_edk_krige_flds # # print('sub_min:', 
np.nanmin(sub_edk_krige_flds)) # print('sub_max:', np.nanmax(sub_edk_krige_flds)) # # # free memory # mp_res[2], sub_edk_krige_flds = None, None # # else: # [strt_index, # end_index, # edk_krige_flds] = external_drift_kriging(next(edk_vars_gen)) # # edk_nc[:] = edk_krige_flds # # edk_nc.units = var_units # edk_nc.standard_name = var_name + ' (external drift kriging)' # # edk_krige_flds = None # # _end_t = timeit.default_timer() # _tot_t = _end_t - _beg_t # # print(f'Took {_tot_t:0.4f} seconds!') # print('#' * 10) # # #========================================================================== # # IDW # #========================================================================== # if idw_flag: # print('\n\n') # print('#' * 10) # # _beg_t = timeit.default_timer() # # print('Inverse Distance Weighting...') # if 'IDW' not in out_nc.variables: # idw_nc = out_nc.createVariable( # 'IDW', # 'd', # dimensions=(time_dim_lab, y_coords_lab, x_coords_lab), # fill_value=False) # # else: # idw_nc = out_nc.variables['IDW'] # # idw_vars_gen = ((in_data_df.iloc[idxs[i]:idxs[i + 1]], # in_stns_coords_df, # min_ppt_thresh, # idw_exp, # var_name, # krige_x_coords_mesh, # krige_y_coords_mesh, # krige_coords_orig_shape, # min_var_val, # max_var_val, # (idxs[i], idxs[i + 1]), # plot_figs_flag, # krige_x_coords_plot_mesh, # krige_y_coords_plot_mesh, # var_units, # polys_list, # out_figs_dir, # fin_cntn_idxs) for i in range(n_cpus)) # # if mp_cond: # idw_flds = np.full( # (in_data_df.shape[0], # krige_coords_orig_shape[0], # krige_coords_orig_shape[1]), # np.nan, # dtype=np.float32) # # mp_ress = [] # try: # mp_pool = ProcessPool(n_cpus) # mp_pool.restart(True) # # mp_ress = list(mp_pool.uimap( # inverse_distance_wtng, idw_vars_gen)) # # mp_pool.clear() # # except Exception as msg: # mp_pool.close() # mp_pool.join() # print('Error in inverse_distance_wtng:', msg) # # for mp_res in mp_ress: # if (len(mp_res) != 3) and (not isinstance(list)): # print('\n', mp_res, '\n') # continue # # [strt_index, end_index, sub_idw_flds] = mp_res # idw_flds[strt_index:end_index] = sub_idw_flds # # # free memory # mp_res[2], sub_idw_flds = None, None # # else: # [strt_index, # end_index, # idw_flds] = inverse_distance_wtng(next(idw_vars_gen)) # # idw_nc[:] = idw_flds # # idw_nc.units = var_units # idw_nc.standard_name = ( # var_name + ' (IDW (exp=%0.3f))' % float(idw_exp)) # # idw_flds = None # # _end_t = timeit.default_timer() # _tot_t = _end_t - _beg_t # # print(f'Took {_tot_t:0.4f} seconds!') # print('#' * 10) out_nc.Author = 'Faizan IWS Uni-Stuttgart' out_nc.Source = out_nc.filepath() out_nc.close() return
def graph_sampling(graph: FSN, strategy: Optional[str] = "MetaDiff", n_jobs: Optional[int] = 4, use_cache: Optional[bool] = True, **kwargs) \ -> List[List[Union[str, int]]]: """ Sampling the sequences of nodes from FSN w.r.t. chosen strategy Parameters ---------- graph : FSN object Graph to be processed strategy : str, default is 'MetaDiff' Walking strategy to be used n_jobs : int, default is 4 Number of workers to be created in parallel pool use_cache : bool, default is True To use the previously cached files Returns ------- Sampled sequences of BP nodes """ set_new_config(**kwargs) local_logger = logging.getLogger(f"{__name__}") if use_cache and os.path.isfile(CONFIG.WORK_FOLDER[0] + "sampled_sequences_cached.pkl"): local_logger.info("Loading sequences from cache... wait...") try: with open(CONFIG.WORK_FOLDER[0] + "sampled_sequences_cached.pkl", "rb") as file: res = pickle.load(file) local_logger.info(f"Total number of raw sampled sequences is {len(res)}") local_logger.info(f"Average length of sequences is {sum(map(len, res)) / float(len(res))}") return res except FileNotFoundError: local_logger.info("File not found... Recalculate \n") pass except Exception as e: local_logger.error(f"Unexpected error: {e}") local_logger.info("Sampling sequences... wait...") max_processes = max(n_jobs, os.cpu_count()) global walk if strategy in strategy_to_class.keys(): walk = strategy_to_class[strategy](G=graph, walk_length=CONFIG.WALKS_LENGTH, direction=CONFIG.DIRECTION, pressure=CONFIG.PRESSURE, allow_back=CONFIG.ALLOW_BACK) else: raise KeyError( f"The given strategy {strategy} is unknown. The following ones are implemented: {strategy_to_class.keys()}") sampling_pool = ProcessPool(nodes=max_processes) local_logger.info("Created a Pool with " + str(max_processes) + " processes ") # required to restart pool to update CONFIG inside the parallel part sampling_pool.terminate() sampling_pool.restart() BPs = graph.get_BPs() n_BPs = len(BPs) sampled = list() try: with tqdm(total=n_BPs) as pbar: for i, res in enumerate(sampling_pool.uimap(wrappedWalk, BPs)): sampled.append(res) pbar.update() except KeyboardInterrupt: print('Got ^C while pool mapping, terminating the pool') sampling_pool.terminate() res = list(itertools.chain(*sampled)) sampling_pool.terminate() sampling_pool.restart() local_logger.info("Cashing sampled sequences!") if use_cache: with open(CONFIG.WORK_FOLDER[0] + "sampled_sequences_cached.pkl", "wb") as file: pickle.dump(res, file) local_logger.info(f"Total number of raw sampled sequences is {len(res)}") local_logger.info(f"Average length of sequences is {sum(map(len, res)) / float(len(res))}") return res
def get_stn_comb_freqs(self, obs_vals_df):

    sim_beg_time = default_timer()

    n_extra_cpus_per_comb = self._n_cpus_extra // self._n_cpus

    if n_extra_cpus_per_comb:
        sim_chunks_idxs = ret_mp_idxs(self._n_sims, n_extra_cpus_per_comb)

        sub_mp_pool = ProcessPool(n_extra_cpus_per_comb)

        self._vb = False

    else:
        sim_chunks_idxs = np.array([0, self._n_sims])

        sub_mp_pool = None

    sim_chunks_idxs[-1] += 1  # for simulation zero

    sim_chunk_gen = ((
        obs_vals_df,
        (sim_chunks_idxs[i], sim_chunks_idxs[i + 1]),)
        for i in range(sim_chunks_idxs.shape[0] - 1))

    if sub_mp_pool is not None:
        list(sub_mp_pool.uimap(self._get_stn_comb_freqs, sim_chunk_gen))
        sub_mp_pool.clear()

    else:
        list(map(self._get_stn_comb_freqs, sim_chunk_gen))

    if self._vb_old:
        if self._vb_old and not self._vb:
            print_sl()

            print(
                f'INFO: Finished computing sub-station combinations in '
                f'{default_timer() - sim_beg_time:0.3f} seconds.')

            print_el()

    if self._save_sim_cdfs_flag or self._save_sim_acorrs_flag:
        if self._vb_old:
            if self._vb_old and not self._vb:
                print_sl()

                print(
                    f'INFO: Computing/writing extra sub-station '
                    f'information to HDF5...')

        comb_str = str(tuple(obs_vals_df.columns))

        stats_gen = ((
            obs_vals_df.columns[i],
            comb_str,)
            for i in range(obs_vals_df.shape[1]))

        if sub_mp_pool is not None:
            list(sub_mp_pool.uimap(self._write_stats_to_hdf5, stats_gen))
            sub_mp_pool.clear()

        else:
            list(map(self._write_stats_to_hdf5, stats_gen))

        if self._vb_old:
            print(f'INFO: Done!')
            print_el()

    if sub_mp_pool is not None:
        sub_mp_pool.join()
        sub_mp_pool = None

    return
'''
Variation uses pathos dependency

Windows OS: Hangs
Mac OS:
Linux: Debian (unclear because windows app but operated in same manner)

Cloud-based:
Repl.it: Works
Ideone.com: Fails-multiprocess error
'''
from multiprocess import freeze_support
from pathos.multiprocessing import ProcessPool

if __name__ == "__main__":
    freeze_support()

    pool = ProcessPool(nodes=4)
    results = pool.uimap(pow, [1, 2, 3, 4], [5, 6, 7, 8])

    print("...")
    print(list(results))
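# A minimal, self-contained sketch (not taken from any of the projects excerpted
# here) of the pathos API these snippets rely on: `uimap` returns a lazy iterator
# whose results arrive in completion order, whereas `map` blocks and preserves the
# input order. The worker function and pool size below are illustrative assumptions;
# `close`/`join` show the usual cleanup.
from pathos.multiprocessing import ProcessPool


def square(x):
    # Trivial worker used only for illustration.
    return x * x


if __name__ == '__main__':
    pool = ProcessPool(nodes=2)

    ordered = pool.map(square, range(5))            # [0, 1, 4, 9, 16], input order kept
    unordered = list(pool.uimap(square, range(5)))  # same values, completion order

    print(ordered)
    print(sorted(unordered))

    # Release the worker processes once all results have been consumed.
    pool.close()
    pool.join()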
                    out_figs_dir,
                    fin_cntn_idxs) for i in range(n_cpus))

    if mp_cond:
        ok_krige_flds = np.full(
            (fin_date_range.shape[0],
             krige_coords_orig_shape[0],
             krige_coords_orig_shape[1]),
            np.nan,
            dtype=np.float32)

        mp_ress = []

        try:
            mp_pool = ProcessPool(n_cpus)
            mp_pool.restart(True)

            mp_ress = list(mp_pool.uimap(ordinary_kriging, ok_vars_gen))

            mp_pool.clear()

        except Exception as msg:
            mp_pool.close()
            mp_pool.join()
            print('Error in ordinary_kriging:', msg)

        for mp_res in mp_ress:
            if (len(mp_res) != 3) and (not isinstance(mp_res, list)):
                print('\n', mp_res, '\n')
                continue

            [strt_index, end_index, sub_ok_krige_flds] = mp_res
            ok_krige_flds[strt_index:end_index] = sub_ok_krige_flds
        else:
            run_length += 1

            if run_length >= RUN_LENGTH_TO_SWITCH:
                # Switch label we're applying and reset counter
                current = SIGNIFICANT if (
                    current == NOT_SIGNIFCANT) else NOT_SIGNIFCANT
                run_length = 0

        res[i] = current

    return res


if __name__ == "__main__":
    # Batch process + multiprocessing
    from tqdm import tqdm
    from pathos.multiprocessing import ProcessPool

    pool = ProcessPool(nodes=3)
    job_queue = []

    for trial_num in range(1, 256):
        job_queue.append(pool.uimap(process_trial, (trial_num, )))

    # Code block to process R & p values for all trials
    with tqdm(total=255, unit='trials') as pbar:
        for task in job_queue:
            list(task)
            pbar.update()
            f.write(text)

        return url, num_pages

    except KeyboardInterrupt:
        print('Exiting')
        raise KeyboardInterrupt

    except:
        traceback.print_exc()


def get_number_pages(soup):
    tds = soup.findAll('td', class_='vbmenu_control')

    for td in tds:
        text = td.text.strip()
        if text.startswith('Page'):
            tokens = text.split()
            return int(tokens[-1])

    return 1


def rest():
    time.sleep(random.random() * 2)


if __name__ == '__main__':
    pool = Pool(nodes=NUM_PROC)

    with open(input_file) as f:
        links = (row['link'] for row in csv.DictReader(f))
        results = pool.uimap(scrape_thread, links)

        for r in results:
            print(r)
            rest()
                'links': urls
            }

            json.dump(data, f, indent=4, sort_keys=True)

        add_completed_url(url)
        return True

    except:
        print('Problem while parsing: {}'.format(url))
        traceback.print_exc()
        add_problem_url(url)
        return False


if __name__ == '__main__':
    total_url_count = len(URLS)
    completed_urls_count = len(COMPLETED_URLS)
    problem_urls_count = len(PROBLEM_URLS)
    preprocessed_count = completed_urls_count + problem_urls_count

    unprocessed_urls = (URLS - COMPLETED_URLS) - PROBLEM_URLS

    pool = Pool(nodes=NUM_PROC)
    results = pool.uimap(get_content, unprocessed_urls)

    for i, success in enumerate(results):
        i += preprocessed_count
        sys.stderr.write('\rdone {0:%} ({1:d}/{2:d}) bad:{3}'.format(
            i / total_url_count, i, total_url_count, problem_urls_count))
        sys.stderr.flush()

        if success:
            completed_urls_count += 1
        else:
            problem_urls_count += 1
class SuperPool: def __init__(self, n_cpu=-1): """Process pool for applying functions multi-threaded with progress bars. Arguments: n_cpu -- The number of processes to spawn. Defaults to the number of threads (logical cores) on your system. Usage: >>> pool = mlc.SuperPool() # By default, the cpu count is used >>> def f(x): ... return x ** 2 >>> res = pool.map(f, range(1000)) # Apply function f to every value in y [mlcrate] 8 CPUs: 100%|████████████████████████| 1000/1000 [00:00<00:00, 1183.78it/s] """ from multiprocessing import cpu_count from pathos.multiprocessing import ProcessPool import tqdm self.tqdm = tqdm if n_cpu == -1: n_cpu = cpu_count() self.n_cpu = n_cpu self.pool = ProcessPool(n_cpu) def __del__(self): self.pool.close() def map(self, func, array, chunksize=16, description=""): """Map a function over array using the pool and return [func(a) for a in array]. Arguments: func -- The function to apply. Can be a lambda function array -- Any iterable to which the function should be applied over chunksize (default: 16) -- The size of a "chunk" which is sent to a CPU core for processing in one go. Larger values should speed up processing when using very fast functions, while smaller values will give a more granular progressbar. description (optional) -- Text to be displayed next to the progressbar. Returns: res -- A list of values returned from the function. """ res = [] def func_tracked(args): x, i = args return func(x), i array_tracked = zip(array, range(len(array))) desc = "{} CPUs{}".format( self.n_cpu, " - {}".format(description) if description else "") for out in self.tqdm.tqdm( self.pool.uimap(func_tracked, array_tracked, chunksize=chunksize), total=len(array), desc=desc, smoothing=0.05, ): res.append(out) # Sort based on i but return only the actual function result actual_res = [r[0] for r in sorted(res, key=lambda r: r[1])] return actual_res def exit(self): """Close the processes and wait for them to clean up.""" self.pool.close() self.pool.join()
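# A brief usage sketch for the SuperPool wrapper above, assuming the class is in
# scope (its own docstring imports it via an `mlc` alias). Because pathos serialises
# callables with dill rather than pickle, even a lambda can be mapped, as the
# docstring notes. The pool size, toy function, and description text are
# illustrative assumptions.
if __name__ == '__main__':
    pool = SuperPool(n_cpu=2)

    # map() shows a progress bar and returns results in input order.
    squares = pool.map(lambda x: x ** 2, range(10), description='squares')
    print(squares)  # [0, 1, 4, 9, ..., 81]

    # Shut the worker processes down once finished.
    pool.exit()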
print('Beginning... ')

full_df = pd.read_csv('../../../../fundamentals.csv',
                      low_memory=False,
                      header=[0, 1])
print('csv in memory')

metrics = set([x[1] for x in list(full_df)[1:]])
date_col = list(full_df)[0]

pool = Pool(nodes=NPROC)
func_args = list(metrics)
print('finished creating args')

try:
    outputs = pool.uimap(process_fundamental, func_args)
except Exception as e:
    print(e)

for metric, data in zip(func_args, outputs):
    fundamentals[metric] = data

# TODO: Export data into db for faster access
# models.Base.metadata.create_all(db.engine)
session = db.create_session()

with open('universe.csv') as f:
    data = f.read().replace("'", '').replace(' ', '').replace('\n', '').split(',')
    universe = data

for fundamental in func_args:
    current = fundamentals[fundamental]
class WorkManager(object): """ Class to in charge of managing the tasks and distributing them to the workers. They can be local (using other cores) or remote using other nodes in the local cluster """ def __init__(self, ncpus='autodetect', ppservers=None, silent=False): if ncpus == 'autodetect': from pathos.helpers import cpu_count self.ncpus = cpu_count() else: self.ncpus = ncpus if ppservers: self._ppservers = ppservers self.sessions = [ppServer(srv) for srv in ppservers] self.ppservers = tuple([i.local_server for i in self.sessions]) from pathos.parallel import ParallelPool as PPPool self.pool = PPPool(ncpus=self.ncpus, ppservers=self.ppservers) self.mode = 'cluster' from pathos.parallel import stats as pp_stats self.pp_stats = pp_stats else: from pathos.multiprocessing import ProcessPool as MPPool self.pool = MPPool(self.ncpus) self.mode = 'multicore' self.stats = {} self.silent = silent def __del__(self): del self.pool def process(self, task, items, timeout=90000): if not isinstance(task, Task): raise TypeError("task argument needs to be an 'Task' instance") # --- Call the Local initialialization task.initializeLocal() # --- Schedule all the jobs .... if self.mode == 'cluster': from ostap.utils.progress_bar import ProgressBar with ProgressBar(max_value=len(items), silent=self.silent) as bar: jobs = self.pool.uimap(_ppfunction, zip([task for i in items], items)) ##jobs = [self.server.submit(_prefunction, (_ppfunction, task, item), (), ('ROOT','Ostap.ParallelPathos')) for item in items] ##jobs = [self.server.submit(_prefunction, (_ppfunction, task, item), (), ('Ostap.Parallel','time')) for item in items] ##jobs = [self.server.submit(_prefunction, (_ppfunction, task, item), (_ppfunction,), ('Ostap','time')) for item in items] for result, stat in jobs: bar += 1 task._mergeResults(result) self._mergeStatistics(stat) self._printStatistics() self.pp_stats() elif self.mode == 'multicore': start = time.time() from ostap.utils.progress_bar import ProgressBar with ProgressBar(max_value=len(items), silent=self.silent) as bar: jobs = self.pool.uimap(_ppfunction, zip([task for i in items], items)) for result, stat in jobs: bar += 1 task._mergeResults(result) self._mergeStatistics(stat) end = time.time() self._printStatistics() logger.info('Time elapsed since server creation %f' % (end - start)) # --- Call the Local Finalize task.finalize() def _printStatistics(self): njobs = 0 for stat in self.stats.values(): njobs += stat.njob logger.info('Job execution statistics:') logger.info( 'job count | % of all jobs | job time sum | time per job | job server' ) for name, stat in self.stats.items(): logger.info( ' %d | %6.2f | %8.3f | %8.3f | %s' % (stat.njob, 100. * stat.njob / njobs, stat.time, stat.time / stat.njob, name)) def _mergeStatistics(self, stat): if stat.name not in self.stats: self.stats[stat.name] = Statistics() s = self.stats[stat.name] s.time += stat.time s.njob += 1
def plot_cats_prms_transfer_perfs(dbs_dir, n_cpus=1):

    '''Plot catchment performances using parameters taken from other
    catchments.
    '''

    cats_dbs = glob(os.path.join(dbs_dir, 'cat_*.hdf5'))

    assert cats_dbs

    n_cats = len(cats_dbs)
    n_cpus = min(n_cats, n_cpus)

    kf_prms_dict = {}
    cats_vars_dict = {}

    for cat_db in cats_dbs:
        with h5py.File(cat_db, 'r') as db:
            kfolds = db['data'].attrs['kfolds']
            cat = db.attrs['cat']

            cv_flag = db['data'].attrs['cv_flag']

            if cv_flag:
                print('plot_prm_trans_perfs not possible with cv_flag!')
                return

            f_var_infos = db['cdata/aux_var_infos'][...]
            prms_idxs = db['cdata/use_prms_idxs'][...]
            f_vars = db['cdata/aux_vars'][...]
            prms_flags = db['cdata/all_prms_flags'][...]
            bds_arr = db['cdata/bds_arr'][...]

            cat_vars_dict = {}
            cat_vars_dict['f_var_infos'] = f_var_infos
            cat_vars_dict['prms_idxs'] = prms_idxs
            cat_vars_dict['f_vars'] = f_vars
            cat_vars_dict['prms_flags'] = prms_flags
            cat_vars_dict['bds_arr'] = bds_arr

            cats_vars_dict[cat] = cat_vars_dict

            for i in range(1, kfolds + 1):
                kf_str = f'kf_{i:02d}'
                cd_db = db[f'calib/{kf_str}']

                opt_prms = cd_db['opt_prms'][...]

                if i not in kf_prms_dict:
                    kf_prms_dict[i] = {}

                kf_prms_dict[i][cat] = {}
                kf_prms_dict[i][cat]['opt_prms'] = opt_prms

    const_args = (kf_prms_dict, cats_vars_dict)

    plot_gen = ((cat_db, const_args) for cat_db in cats_dbs)

    if (n_cpus > 1) and (n_cats > 1):
        mp_pool = ProcessPool(n_cpus)
        mp_pool.restart(True)

        print(list(mp_pool.uimap(plot_cat_prms_transfer_perfs, plot_gen)))

        mp_pool.clear()
        mp_pool.close()
        mp_pool.join()

    else:
        for plot_args in plot_gen:
            plot_cat_prms_transfer_perfs(plot_args)

    return
            return int(td.text.split()[-1])

    return 1


def save_html(url):
    fpath = forum_pages_path.format(
        url.replace('https://www.', '').replace('/', '_'))

    if os.path.exists(fpath):
        print('Skipping {}, already saved.'.format(url))
        return

    try:
        time.sleep(random() * 2)
        text = requests.get(url).text

        with open(fpath, 'wt') as f:
            f.write(text)

        print('Saved {}'.format(url))
    except:
        traceback.print_exc()
        print('Problem when trying to save:', url)


if __name__ == '__main__':
    pool = Pool(nodes=NUM_PROC)

    for row in forum_data:
        url, title, categories = row['url'], row['title'], row['categories']

        soup = bsoup(requests.get(url).text, features='html5')

        print('Total pages for {}: {}'.format(url, get_total_pages(soup)))

        total_pages = get_total_pages(soup)
        forum_id = get_forum_id(url)

        urls = [base_forum_url.format(forum_id=forum_id, page_num=page_num)
                for page_num in range(1, total_pages + 1)]

        results = pool.uimap(save_html, urls)
        list(results)

        del soup
def run_all(self, processors=1, iseed=1): """Run the model for all trials in the designed experiment and store results. Model constructor is assumed to take args (seed, collect_stepwise_data, trial kwargs). Args: processors (int, default=1): Number of cpu cores to use for batch run. iseed (int, default=1): Initial seed for replication 1 of each trial. Seeds for subsequent replications will be PNRG by this class. """ pool = ProcessPool(nodes=processors) job_queue = [] # Generator for initial model seeds. Models use these seeds to manage # their own RNGs. brng = PCG64(iseed) randomgen = RandomGenerator(brng) param_names = self.design.columns param_names = param_names[(param_names != 'replications') & (param_names != 'Trial')] total_iterations = self.design.replications.sum() self.manifest = [] # Records what seed was used where for row in self.design.itertuples(): kwargs = {key: getattr(row, key) for key in param_names} # Reset model seed generator for next design point brng.seed(iseed) for rep in range(1, row.replications + 1): # Advance model seed for next replication model_seed = randomgen.randint(10000000) model_key = ( row.Trial, rep, ) self.manifest.append((model_key, model_seed)) job_queue.append( pool.uimap(self._run_single_replication, (model_seed, ), (kwargs, ), (model_key, ))) with tqdm(total=total_iterations, desc='Total', unit='dp', disable=not self.display_progress) as pbar_total: # empty the queue results = [] for task in job_queue: for model_vars, agent_vars, stepwise_vars in list(task): results.append((model_vars, agent_vars, stepwise_vars)) pbar_total.update() if self.data_handler: return # Results already stored to database, nothing more to record # store the results in batchrunner # FUTURE: rework this module to only support external data_handler. # Rationale: Best practice to treat each replication atomically. Key # benefit of having all replications in memory is to do experiment-wide # analysis, and a data analysis module can read in all per-replication # files. for model_vars, agent_vars, stepwise_vars in results: if self.model_reporters: for model_key, model_val in model_vars.items(): self.model_vars[model_key] = model_val if self.agent_reporters: for agent_key, reports in agent_vars.items(): self.agent_vars[agent_key] = reports if self.collect_stepwise_data: for stepwise_key, stepwise_val in stepwise_vars.items(): self.stepwise_vars[stepwise_key] = stepwise_val