def _parallel_resource_map(self):
    """Map all resource gids to exclusion gids in parallel.

    Returns
    -------
    lats : np.ndarray
        2D un-projected latitude array of tech exclusion points.
        0's if no res point found. Shape is equal to exclusions shape.
    lons : np.ndarray
        2D un-projected longitude array of tech exclusion points.
        0's if no res point found. Shape is equal to exclusions shape.
    ind_all : np.ndarray
        Index values of the NN resource point. -1 if no res point found.
        2D integer array with shape equal to the exclusions extent shape.
    """

    gids = np.array(list(range(self._n_sc)), dtype=np.uint32)
    gid_chunks = np.array_split(gids, int(np.ceil(len(gids) / 2)))

    # init full output arrays
    ind_all, coords_all = self._init_out_arrays()

    n_finished = 0
    futures = {}
    loggers = [__name__, 'reV']
    with SpawnProcessPool(max_workers=self._max_workers,
                          loggers=loggers) as exe:

        # iterate through split executions, submitting each to worker
        for i, gid_set in enumerate(gid_chunks):
            # submit executions and append to futures list
            futures[exe.submit(self.map_resource_gids,
                               gid_set,
                               self._excl_fpath,
                               self._res_fpath,
                               self.distance_upper_bound,
                               self._map_chunk)] = i

        for future in as_completed(futures):
            n_finished += 1
            logger.info('Parallel TechMapping futures collected: '
                        '{} out of {}'.format(n_finished, len(futures)))

            i = futures[future]
            result = future.result()

            res = self._map_chunk
            with SupplyCurveExtent(self._excl_fpath, resolution=res) as sc:
                for j, gid in enumerate(gid_chunks[i]):
                    i_out_arr = sc.get_flat_excl_ind(gid)
                    ind_all[i_out_arr] = result[0][j]
                    coords_all[i_out_arr, :] = result[1][j]

    ind_all = ind_all.reshape(self._excl_shape)
    lats = coords_all[:, 0].reshape(self._excl_shape)
    lons = coords_all[:, 1].reshape(self._excl_shape)

    return lats, lons, ind_all
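
# Hedged usage sketch (not part of the reV API): illustrates how the flat
# index and coordinate arrays gathered from the workers above map back to the
# 2D exclusion shape. The shape, pixel range, and lat/lon values below are
# invented purely for demonstration.
import numpy as np

excl_shape = (4, 6)                                   # hypothetical raster shape
n_pixels = excl_shape[0] * excl_shape[1]

ind_all = np.full(n_pixels, -1, dtype=np.int32)       # -1 == no resource point
coords_all = np.zeros((n_pixels, 2), dtype=np.float32)

# pretend one worker returned results for flat pixels 5..10
worker_pixels = np.arange(5, 11)
ind_all[worker_pixels] = np.arange(len(worker_pixels))
coords_all[worker_pixels, 0] = 40.0                   # latitude placeholder
coords_all[worker_pixels, 1] = -105.0                 # longitude placeholder

ind_2d = ind_all.reshape(excl_shape)
lats = coords_all[:, 0].reshape(excl_shape)
lons = coords_all[:, 1].reshape(excl_shape)
print(ind_2d, lats.shape, lons.shape)
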
def _run_parallel(self):
    """Run offshore gen aggregation and ORCA econ compute in parallel."""

    futures = {}
    loggers = [__name__, 'reV']
    with SpawnProcessPool(max_workers=self._max_workers,
                          loggers=loggers) as exe:

        iterator = self.meta_out_offshore.iterrows()
        for i, (ifarm, meta) in enumerate(iterator):

            row = self._offshore_data.loc[ifarm, :]
            farm_gid, res_gid = self._get_farm_gid(ifarm)

            self._check_dist(meta, row)

            if farm_gid is not None:
                cf_ilocs = np.where(self._i == ifarm)[0]
                meta = self.meta_source_offshore.iloc[cf_ilocs]
                system_inputs = self._get_system_inputs(res_gid)
                site_data = row.to_dict()

                self._check_sys_inputs(system_inputs, site_data)

                future = exe.submit(self._get_farm_data, self._gen_fpath,
                                    meta, system_inputs, site_data,
                                    site_gid=farm_gid)

                futures[future] = i

        for fi, future in enumerate(as_completed(futures)):
            logger.info('Completed {} out of {} offshore compute futures.'
                        .format(fi + 1, len(futures)))
            i = futures[future]
            gen_data = future.result()
            for k, v in gen_data.items():
                if isinstance(v, (np.ndarray, list, tuple)):
                    self._out[k][:, i] = v
                else:
                    self._out[k][i] = v
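
# Hedged sketch (illustrative only): how the gen_data handling above fills a
# pre-allocated output dict, writing time-series results column-wise and
# scalar results element-wise. The dataset names, shapes, and values are
# fabricated for this example.
import numpy as np

n_timesteps, n_farms = 8760, 3
out = {'cf_profile': np.zeros((n_timesteps, n_farms)),   # 2D: (time, farm)
       'lcoe_fcr': np.zeros(n_farms)}                     # 1D: (farm,)

gen_data = {'cf_profile': np.random.rand(n_timesteps),    # array -> column i
            'lcoe_fcr': 42.0}                             # scalar -> element i
i = 1
for k, v in gen_data.items():
    if isinstance(v, (np.ndarray, list, tuple)):
        out[k][:, i] = v
    else:
        out[k][i] = v
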
def _parallel_run(self, max_workers=None, pool_size=(os.cpu_count() * 2),
                  timeout=1800, **kwargs):
    """Execute parallel compute.

    Parameters
    ----------
    max_workers : None | int
        Number of workers. None will default to cpu count.
    pool_size : int
        Number of futures to submit to a single process pool for
        parallel futures.
    timeout : int | float
        Number of seconds to wait for parallel run iteration to complete
        before returning zeros.
    kwargs : dict
        Keyword arguments to self.run().
    """

    logger.debug('Running parallel execution with max_workers={}'
                 .format(max_workers))
    i = 0
    N, pc_chunks = self._pre_split_pc(pool_size=pool_size)

    for j, pc_chunk in enumerate(pc_chunks):
        logger.debug('Starting process pool for points control '
                     'iteration {} out of {}'
                     .format(j + 1, len(pc_chunks)))

        failed_futures = False
        chunks = {}
        futures = []
        loggers = [__name__, 'reV.gen', 'reV.econ', 'reV']
        with SpawnProcessPool(max_workers=max_workers,
                              loggers=loggers) as exe:
            for pc in pc_chunk:
                future = exe.submit(self.run, pc, **kwargs)
                futures.append(future)
                chunks[future] = pc

            for future in futures:
                i += 1
                try:
                    result = future.result(timeout=timeout)
                except TimeoutError:
                    failed_futures = True
                    sites = chunks[future].project_points.sites
                    result = self._handle_failed_future(future, i, sites,
                                                        timeout)

                self.out = result

                mem = psutil.virtual_memory()
                m = ('Parallel run at iteration {0} out of {1}. '
                     'Memory utilization is {2:.3f} GB out of {3:.3f} GB '
                     'total ({4:.1f}% used, intended limit of {5:.1f}%)'
                     .format(i, N, mem.used / 1e9, mem.total / 1e9,
                             100 * mem.used / mem.total,
                             100 * self.mem_util_lim))
                logger.info(m)

            if failed_futures:
                logger.info('Forcing pool shutdown after failed futures.')
                exe.shutdown(wait=False)
                logger.info('Forced pool shutdown complete.')

    self.flush()
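
# Hedged sketch: the memory-utilization message built above, reproduced as a
# standalone snippet. psutil.virtual_memory() is the real call used in the
# method; the fractional limit below is a placeholder standing in for
# self.mem_util_lim.
import psutil

mem = psutil.virtual_memory()
mem_util_lim = 0.4   # hypothetical fractional memory limit
print('Memory utilization is {:.3f} GB out of {:.3f} GB total '
      '({:.1f}% used, intended limit of {:.1f}%)'
      .format(mem.used / 1e9, mem.total / 1e9,
              100 * mem.used / mem.total, 100 * mem_util_lim))
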
def run_parallel(self, agg_method='mean', excl_area=0.0081,
                 max_workers=None, chunk_point_len=1000):
    """
    Aggregate in parallel

    Parameters
    ----------
    agg_method : str
        Aggregation method, either mean or sum/aggregate
    excl_area : float
        Area of an exclusion cell (square km).
    max_workers : int | None
        Number of cores to run summary on. None is all available cpus.
    chunk_point_len : int
        Number of SC points to process on a single parallel worker.

    Returns
    -------
    agg_out : dict
        Aggregated values for each aggregation dataset
    """
    chunks = np.array_split(
        self._gids, int(np.ceil(len(self._gids) / chunk_point_len)))

    logger.info('Running supply curve point aggregation for '
                'points {} through {} at a resolution of {} '
                'on {} cores in {} chunks.'
                .format(self._gids[0], self._gids[-1], self._resolution,
                        max_workers, len(chunks)))

    n_finished = 0
    futures = []
    dsets = self._agg_dsets + ('meta', )
    agg_out = {ds: [] for ds in dsets}
    loggers = [__name__, 'reV.supply_curve.points']
    with SpawnProcessPool(max_workers=max_workers, loggers=loggers) as exe:
        # iterate through split executions, submitting each to worker
        for gid_set in chunks:
            # submit executions and append to futures list
            futures.append(exe.submit(
                self.run_serial,
                self._excl_fpath, self._h5_fpath, self._tm_dset,
                *self._agg_dsets,
                agg_method=agg_method,
                excl_dict=self._excl_dict,
                area_filter_kernel=self._area_filter_kernel,
                min_area=self._min_area,
                check_excl_layers=self._check_excl_layers,
                resolution=self._resolution,
                excl_area=excl_area,
                gids=gid_set,
                gen_index=self._gen_index))

        # gather results
        for future in futures:
            n_finished += 1
            logger.info('Parallel aggregation futures collected: '
                        '{} out of {}'.format(n_finished, len(chunks)))
            for k, v in future.result().items():
                if v:
                    agg_out[k].extend(v)

    return agg_out
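
# Hedged sketch: how the gid list above is split into worker chunks with
# np.array_split. Pure numpy; the gid count and chunk length are invented
# for illustration.
import numpy as np

gids = np.arange(2500)
chunk_point_len = 1000
chunks = np.array_split(gids, int(np.ceil(len(gids) / chunk_point_len)))
print([len(c) for c in chunks])   # -> [834, 833, 833]
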
def run_parallel(self, sc_point_method, args=None, kwargs=None,
                 max_workers=None, chunk_point_len=1000):
    """
    Aggregate with sc_point_method in parallel

    Parameters
    ----------
    sc_point_method : method
        Method to apply to each supply curve point.
    args : list | None
        List of positional args for sc_point_method
    kwargs : dict | None
        Dict of kwargs for sc_point_method
    max_workers : int | None
        Number of cores to run summary on. None is all available cpus.
    chunk_point_len : int
        Number of SC points to process on a single parallel worker.

    Returns
    -------
    summary : list
        List of outputs from sc_point_method.
    """
    chunks = np.array_split(
        self._gids, int(np.ceil(len(self._gids) / chunk_point_len)))

    logger.info('Running supply curve point aggregation for '
                'points {} through {} at a resolution of {} '
                'on {} cores in {} chunks.'
                .format(self._gids[0], self._gids[-1], self._resolution,
                        max_workers, len(chunks)))

    n_finished = 0
    futures = []
    output = []
    loggers = [__name__, 'reV.supply_curve.points']
    with SpawnProcessPool(max_workers=max_workers, loggers=loggers) as exe:
        # iterate through split executions, submitting each to worker
        for gid_set in chunks:
            # submit executions and append to futures list
            futures.append(exe.submit(
                self.run_serial,
                sc_point_method,
                self._excl_fpath,
                self._tm_dset,
                excl_dict=self._excl_dict,
                area_filter_kernel=self._area_filter_kernel,
                min_area=self._min_area,
                check_excl_layers=self._check_excl_layers,
                resolution=self._resolution,
                gids=gid_set,
                args=args,
                kwargs=kwargs))

        # gather results
        for future in as_completed(futures):
            n_finished += 1
            logger.info('Parallel aggregation futures collected: '
                        '{} out of {}'.format(n_finished, len(chunks)))
            output += future.result()

    return output
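
# Hedged sketch: the as_completed gather loop used above, shown with a plain
# concurrent.futures ProcessPoolExecutor standing in for SpawnProcessPool and
# a trivial worker function. It highlights that results are collected in
# completion order, not submission order. Not reV-specific.
from concurrent.futures import ProcessPoolExecutor, as_completed


def _square_all(values):
    """Toy worker: square every value in a chunk."""
    return [v * v for v in values]


if __name__ == '__main__':
    chunks = [[1, 2], [3, 4], [5, 6]]
    output = []
    with ProcessPoolExecutor(max_workers=2) as exe:
        futures = [exe.submit(_square_all, c) for c in chunks]
        for n_finished, future in enumerate(as_completed(futures), 1):
            output += future.result()
            print('{} out of {} futures collected'
                  .format(n_finished, len(futures)))
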
def compute_statistics(self, dataset, sites=None, diurnal=False,
                       month=False, combinations=False, max_workers=None,
                       chunks_per_worker=5, lat_lon_only=True):
    """
    Compute statistics

    Parameters
    ----------
    dataset : str
        Dataset to extract stats for
    sites : list | slice, optional
        Subset of sites to extract, by default None or all sites
    diurnal : bool, optional
        Extract diurnal stats, by default False
    month : bool, optional
        Extract monthly stats, by default False
    combinations : bool, optional
        Extract all combinations of temporal stats, by default False
    max_workers : None | int, optional
        Number of workers to use, if 1 run in serial, if None use all
        available cores, by default None
    chunks_per_worker : int, optional
        Number of chunks to extract on each worker, by default 5
    lat_lon_only : bool, optional
        Only append lat, lon coordinates to stats, by default True

    Returns
    -------
    res_stats : pandas.DataFrame
        DataFrame of desired statistics at desired time intervals
    """
    if max_workers is None:
        max_workers = os.cpu_count()

    slices = self._get_slices(dataset, sites,
                              chunks_per_slice=chunks_per_worker)
    if len(slices) == 1:
        max_workers = 1

    if max_workers > 1:
        msg = ('Extracting {} for {} in parallel using {} workers'
               .format(list(self.statistics), dataset, max_workers))
        logger.info(msg)

        loggers = [__name__, 'rex']
        with SpawnProcessPool(max_workers=max_workers,
                              loggers=loggers) as exe:
            futures = []
            for sites_slice in slices:
                future = exe.submit(self._extract_stats,
                                    self.res_h5, self.statistics, dataset,
                                    res_cls=self.res_cls,
                                    hsds=self._hsds,
                                    time_index=self.time_index,
                                    sites_slice=sites_slice,
                                    diurnal=diurnal,
                                    month=month,
                                    combinations=combinations)
                futures.append(future)

            res_stats = []
            for i, future in enumerate(as_completed(futures)):
                res_stats.append(future.result())
                logger.debug('Completed {} out of {} workers'
                             .format((i + 1), len(futures)))
    else:
        msg = ('Extracting {} for {} in serial'
               .format(self.statistics.keys(), dataset))
        logger.info(msg)
        res_stats = []
        for i, sites_slice in enumerate(slices):
            res_stats.append(
                self._extract_stats(self.res_h5, self.statistics, dataset,
                                    res_cls=self.res_cls,
                                    hsds=self._hsds,
                                    time_index=self.time_index,
                                    sites_slice=sites_slice,
                                    diurnal=diurnal,
                                    month=month,
                                    combinations=combinations))
            logger.debug('Completed {} out of {} sets of sites'
                         .format((i + 1), len(slices)))

    gc.collect()
    log_mem(logger)
    res_stats = pd.concat(res_stats)

    if lat_lon_only:
        meta = self.lat_lon
    else:
        meta = self.meta

    res_stats = meta.join(res_stats.sort_index(), how='inner')

    return res_stats
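
# Hedged sketch: the gather/merge step above in miniature. Partial stats
# frames (indexed by site gid) returned by the workers are concatenated and
# then joined onto coordinate metadata on the site index. All values and
# column names here are fabricated for illustration.
import pandas as pd

part1 = pd.DataFrame({'mean': [5.1, 6.2]}, index=[0, 1])   # sites 0-1
part2 = pd.DataFrame({'mean': [4.8]}, index=[2])           # site 2
res_stats = pd.concat([part1, part2])

meta = pd.DataFrame({'latitude': [40.0, 40.1, 40.2],
                     'longitude': [-105.0, -105.1, -105.2]})
res_stats = meta.join(res_stats.sort_index(), how='inner')
print(res_stats)
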
def compute(self, dset1, dset2, bins1, bins2, sites=None,
            max_workers=None, chunks_per_worker=5):
    """
    Compute joint probability distribution between given datasets using
    given bins for all sites.

    Parameters
    ----------
    dset1 : str
        Dataset 1 to generate joint probability distribution for
    dset2 : str
        Dataset 2 to generate joint probability distribution for
    bins1 : tuple
        (start, stop, step) for dataset 1 bins. The stop value is
        inclusive, so (0, 6, 2) would yield three bins with edges
        (0, 2, 4, 6). If the stop value is not perfectly divisible by
        the step, the last bin will overshoot the stop value.
    bins2 : tuple
        (start, stop, step) for dataset 2 bins. The stop value is
        inclusive, so (0, 6, 2) would yield three bins with edges
        (0, 2, 4, 6). If the stop value is not perfectly divisible by
        the step, the last bin will overshoot the stop value.
    sites : list | slice, optional
        Subset of sites to extract, by default None or all sites
    max_workers : None | int, optional
        Number of workers to use, if 1 run in serial, if None use all
        available cores, by default None
    chunks_per_worker : int, optional
        Number of chunks to extract on each worker, by default 5

    Returns
    -------
    jpd : pandas.DataFrame
        DataFrame of joint probability distribution between given
        datasets with given bins
    """
    if max_workers is None:
        max_workers = os.cpu_count()

    slices = self._get_slices(dset1, dset2, sites,
                              chunks_per_slice=chunks_per_worker)
    if len(slices) == 1:
        max_workers = 1

    jpd = {}
    if max_workers > 1:
        msg = ('Computing the joint probability distribution between {} '
               'and {} in parallel using {} workers'
               .format(dset1, dset2, max_workers))
        logger.info(msg)

        loggers = [__name__, 'rex']
        with SpawnProcessPool(max_workers=max_workers,
                              loggers=loggers) as exe:
            futures = []
            for sites_slice in slices:
                future = exe.submit(self.compute_joint_pd,
                                    self.res_h5, dset1, dset2,
                                    bins1, bins2,
                                    res_cls=self.res_cls,
                                    hsds=self._hsds,
                                    sites_slice=sites_slice)
                futures.append(future)

            for i, future in enumerate(as_completed(futures)):
                jpd.update(future.result())
                logger.debug('Completed {} out of {} workers'
                             .format((i + 1), len(futures)))
    else:
        msg = ('Computing the joint probability distribution between {} '
               'and {} in serial.'.format(dset1, dset2))
        logger.info(msg)
        for i, sites_slice in enumerate(slices):
            jpd.update(
                self.compute_joint_pd(self.res_h5, dset1, dset2,
                                      bins1, bins2,
                                      res_cls=self.res_cls,
                                      hsds=self._hsds,
                                      sites_slice=sites_slice))
            logger.debug('Completed {} out of {} sets of sites'
                         .format((i + 1), len(slices)))

    gc.collect()
    log_mem(logger)

    bins1 = self._make_bins(*bins1)
    bins2 = self._make_bins(*bins2)
    index = np.meshgrid(bins1[:-1], bins2[:-1], indexing='ij')
    index = np.array(index).T.reshape(-1, 2).astype(np.int16)
    index = pd.MultiIndex.from_arrays(index.T, names=(dset1, dset2))
    jpd = pd.DataFrame({k: v.flatten(order='F') for k, v in jpd.items()},
                       index=index).sort_index(axis=1)

    return jpd
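
# Hedged sketch: how the joint-distribution MultiIndex above is assembled
# from the bin edges. np.arange stands in for the class's _make_bins helper,
# and the bin ranges and dataset names are made up for this example.
import numpy as np
import pandas as pd

bins1 = np.arange(0, 6 + 2, 2)    # edges (0, 2, 4, 6) for dataset 1
bins2 = np.arange(0, 10 + 5, 5)   # edges (0, 5, 10) for dataset 2

# left bin edges only, paired across both datasets
index = np.meshgrid(bins1[:-1], bins2[:-1], indexing='ij')
index = np.array(index).T.reshape(-1, 2).astype(np.int16)
index = pd.MultiIndex.from_arrays(index.T,
                                  names=('windspeed', 'temperature'))
print(index)
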