def _parallel_resource_map(self):
    """Map all resource gids to exclusion gids in parallel.

    Returns
    -------
    lats : np.ndarray
        2D un-projected latitude array of tech exclusion points.
        0's if no res point found. Shape is equal to exclusions shape.
    lons : np.ndarray
        2D un-projected longitude array of tech exclusion points.
        0's if no res point found. Shape is equal to exclusions shape.
    ind_all : np.ndarray
        Index values of the NN resource point. -1 if no res point found.
        2D integer array with shape equal to the exclusions extent shape.
    """

    gids = np.array(list(range(self._n_sc)), dtype=np.uint32)
    gid_chunks = np.array_split(gids, int(np.ceil(len(gids) / 2)))

    # init full output arrays
    ind_all, coords_all = self._init_out_arrays()

    n_finished = 0
    futures = {}
    loggers = [__name__, 'reV']
    with SpawnProcessPool(max_workers=self._max_workers,
                          loggers=loggers) as exe:

        # iterate through split executions, submitting each to worker
        for i, gid_set in enumerate(gid_chunks):
            # submit executions and append to futures list
            futures[exe.submit(self.map_resource_gids,
                               gid_set,
                               self._excl_fpath,
                               self._res_fpath,
                               self.distance_upper_bound,
                               self._map_chunk)] = i

        for future in as_completed(futures):
            n_finished += 1
            logger.info('Parallel TechMapping futures collected: '
                        '{} out of {}'.format(n_finished, len(futures)))

            i = futures[future]
            result = future.result()

            res = self._map_chunk
            with SupplyCurveExtent(self._excl_fpath, resolution=res) as sc:
                for j, gid in enumerate(gid_chunks[i]):
                    i_out_arr = sc.get_flat_excl_ind(gid)
                    ind_all[i_out_arr] = result[0][j]
                    coords_all[i_out_arr, :] = result[1][j]

    ind_all = ind_all.reshape(self._excl_shape)
    lats = coords_all[:, 0].reshape(self._excl_shape)
    lons = coords_all[:, 1].reshape(self._excl_shape)

    return lats, lons, ind_all
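
# Hedged usage sketch (not part of the reV API): illustrates how the flat
# index and coordinate arrays gathered from the workers above map back to the
# 2D exclusion shape. The shape, pixel range, and lat/lon values below are
# invented purely for demonstration.
import numpy as np

excl_shape = (4, 6)                                   # hypothetical raster shape
n_pixels = excl_shape[0] * excl_shape[1]

ind_all = np.full(n_pixels, -1, dtype=np.int32)       # -1 == no resource point
coords_all = np.zeros((n_pixels, 2), dtype=np.float32)

# pretend one worker returned results for flat pixels 5..10
worker_pixels = np.arange(5, 11)
ind_all[worker_pixels] = np.arange(len(worker_pixels))
coords_all[worker_pixels, 0] = 40.0                   # latitude placeholder
coords_all[worker_pixels, 1] = -105.0                 # longitude placeholder

ind_2d = ind_all.reshape(excl_shape)
lats = coords_all[:, 0].reshape(excl_shape)
lons = coords_all[:, 1].reshape(excl_shape)
print(ind_2d, lats.shape, lons.shape)
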
def _run_parallel(self):
    """Run offshore gen aggregation and ORCA econ compute in parallel."""

    futures = {}
    loggers = [__name__, 'reV']
    with SpawnProcessPool(max_workers=self._max_workers,
                          loggers=loggers) as exe:

        iterator = self.meta_out_offshore.iterrows()
        for i, (ifarm, meta) in enumerate(iterator):

            row = self._offshore_data.loc[ifarm, :]
            farm_gid, res_gid = self._get_farm_gid(ifarm)

            self._check_dist(meta, row)

            if farm_gid is not None:
                cf_ilocs = np.where(self._i == ifarm)[0]
                meta = self.meta_source_offshore.iloc[cf_ilocs]
                system_inputs = self._get_system_inputs(res_gid)
                site_data = row.to_dict()

                self._check_sys_inputs(system_inputs, site_data)

                future = exe.submit(self._get_farm_data, self._gen_fpath,
                                    meta, system_inputs, site_data,
                                    site_gid=farm_gid)

                futures[future] = i

        for fi, future in enumerate(as_completed(futures)):
            logger.info('Completed {} out of {} offshore compute futures.'
                        .format(fi + 1, len(futures)))
            i = futures[future]
            gen_data = future.result()
            for k, v in gen_data.items():
                if isinstance(v, (np.ndarray, list, tuple)):
                    self._out[k][:, i] = v
                else:
                    self._out[k][i] = v
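
# Hedged sketch (illustrative only): how the gen_data handling above fills a
# pre-allocated output dict, writing time-series results column-wise and
# scalar results element-wise. The dataset names, shapes, and values are
# fabricated for this example.
import numpy as np

n_timesteps, n_farms = 8760, 3
out = {'cf_profile': np.zeros((n_timesteps, n_farms)),   # 2D: (time, farm)
       'lcoe_fcr': np.zeros(n_farms)}                     # 1D: (farm,)

gen_data = {'cf_profile': np.random.rand(n_timesteps),    # array -> column i
            'lcoe_fcr': 42.0}                             # scalar -> element i
i = 1
for k, v in gen_data.items():
    if isinstance(v, (np.ndarray, list, tuple)):
        out[k][:, i] = v
    else:
        out[k][i] = v
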
def _parallel_run(self, max_workers=None, pool_size=(os.cpu_count() * 2),
                  timeout=1800, **kwargs):
    """Execute parallel compute.

    Parameters
    ----------
    max_workers : None | int
        Number of workers. None will default to cpu count.
    pool_size : int
        Number of futures to submit to a single process pool for
        parallel futures.
    timeout : int | float
        Number of seconds to wait for parallel run iteration to complete
        before returning zeros.
    kwargs : dict
        Keyword arguments to self.run().
    """

    logger.debug('Running parallel execution with max_workers={}'
                 .format(max_workers))
    i = 0
    N, pc_chunks = self._pre_split_pc(pool_size=pool_size)

    for j, pc_chunk in enumerate(pc_chunks):
        logger.debug('Starting process pool for points control '
                     'iteration {} out of {}'
                     .format(j + 1, len(pc_chunks)))

        failed_futures = False
        chunks = {}
        futures = []
        loggers = [__name__, 'reV.gen', 'reV.econ', 'reV']
        with SpawnProcessPool(max_workers=max_workers,
                              loggers=loggers) as exe:
            for pc in pc_chunk:
                future = exe.submit(self.run, pc, **kwargs)
                futures.append(future)
                chunks[future] = pc

            for future in futures:
                i += 1
                try:
                    result = future.result(timeout=timeout)
                except TimeoutError:
                    failed_futures = True
                    sites = chunks[future].project_points.sites
                    result = self._handle_failed_future(future, i, sites,
                                                        timeout)

                self.out = result

                mem = psutil.virtual_memory()
                m = ('Parallel run at iteration {0} out of {1}. '
                     'Memory utilization is {2:.3f} GB out of {3:.3f} GB '
                     'total ({4:.1f}% used, intended limit of {5:.1f}%)'
                     .format(i, N, mem.used / 1e9, mem.total / 1e9,
                             100 * mem.used / mem.total,
                             100 * self.mem_util_lim))
                logger.info(m)

            if failed_futures:
                logger.info('Forcing pool shutdown after failed futures.')
                exe.shutdown(wait=False)
                logger.info('Forced pool shutdown complete.')

    self.flush()
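
# Hedged sketch: the memory-utilization message built above, reproduced as a
# standalone snippet. psutil.virtual_memory() is the real call used in the
# method; the fractional limit below is a placeholder standing in for
# self.mem_util_lim.
import psutil

mem = psutil.virtual_memory()
mem_util_lim = 0.4   # hypothetical fractional memory limit
print('Memory utilization is {:.3f} GB out of {:.3f} GB total '
      '({:.1f}% used, intended limit of {:.1f}%)'
      .format(mem.used / 1e9, mem.total / 1e9,
              100 * mem.used / mem.total, 100 * mem_util_lim))
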
def run_parallel(self, agg_method='mean', excl_area=0.0081,
                 max_workers=None, chunk_point_len=1000):
    """
    Aggregate in parallel

    Parameters
    ----------
    agg_method : str
        Aggregation method, either mean or sum/aggregate
    excl_area : float
        Area of an exclusion cell (square km).
    max_workers : int | None
        Number of cores to run summary on. None is all available cpus.
    chunk_point_len : int
        Number of SC points to process on a single parallel worker.

    Returns
    -------
    agg_out : dict
        Aggregated values for each aggregation dataset
    """
    chunks = np.array_split(
        self._gids, int(np.ceil(len(self._gids) / chunk_point_len)))

    logger.info('Running supply curve point aggregation for '
                'points {} through {} at a resolution of {} '
                'on {} cores in {} chunks.'
                .format(self._gids[0], self._gids[-1], self._resolution,
                        max_workers, len(chunks)))

    n_finished = 0
    futures = []
    dsets = self._agg_dsets + ('meta', )
    agg_out = {ds: [] for ds in dsets}
    loggers = [__name__, 'reV.supply_curve.points']
    with SpawnProcessPool(max_workers=max_workers, loggers=loggers) as exe:
        # iterate through split executions, submitting each to worker
        for gid_set in chunks:
            # submit executions and append to futures list
            futures.append(exe.submit(
                self.run_serial,
                self._excl_fpath, self._h5_fpath, self._tm_dset,
                *self._agg_dsets,
                agg_method=agg_method,
                excl_dict=self._excl_dict,
                area_filter_kernel=self._area_filter_kernel,
                min_area=self._min_area,
                check_excl_layers=self._check_excl_layers,
                resolution=self._resolution,
                excl_area=excl_area,
                gids=gid_set,
                gen_index=self._gen_index))

        # gather results
        for future in futures:
            n_finished += 1
            logger.info('Parallel aggregation futures collected: '
                        '{} out of {}'.format(n_finished, len(chunks)))
            for k, v in future.result().items():
                if v:
                    agg_out[k].extend(v)

    return agg_out
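
# Hedged sketch: how the gid list above is split into worker chunks with
# np.array_split. Pure numpy; the gid count and chunk length are invented
# for illustration.
import numpy as np

gids = np.arange(2500)
chunk_point_len = 1000
chunks = np.array_split(gids, int(np.ceil(len(gids) / chunk_point_len)))
print([len(c) for c in chunks])   # -> [834, 833, 833]
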
def run_parallel(self, sc_point_method, args=None, kwargs=None,
                 max_workers=None, chunk_point_len=1000):
    """
    Aggregate with sc_point_method in parallel

    Parameters
    ----------
    sc_point_method : method
        Method to apply to each supply curve point.
    args : list | None
        List of positional args for sc_point_method
    kwargs : dict | None
        Dict of kwargs for sc_point_method
    max_workers : int | None
        Number of cores to run summary on. None is all available cpus.
    chunk_point_len : int
        Number of SC points to process on a single parallel worker.

    Returns
    -------
    summary : list
        List of outputs from sc_point_method.
    """
    chunks = np.array_split(
        self._gids, int(np.ceil(len(self._gids) / chunk_point_len)))

    logger.info('Running supply curve point aggregation for '
                'points {} through {} at a resolution of {} '
                'on {} cores in {} chunks.'
                .format(self._gids[0], self._gids[-1], self._resolution,
                        max_workers, len(chunks)))

    n_finished = 0
    futures = []
    output = []
    loggers = [__name__, 'reV.supply_curve.points']
    with SpawnProcessPool(max_workers=max_workers, loggers=loggers) as exe:
        # iterate through split executions, submitting each to worker
        for gid_set in chunks:
            # submit executions and append to futures list
            futures.append(exe.submit(
                self.run_serial,
                sc_point_method,
                self._excl_fpath,
                self._tm_dset,
                excl_dict=self._excl_dict,
                area_filter_kernel=self._area_filter_kernel,
                min_area=self._min_area,
                check_excl_layers=self._check_excl_layers,
                resolution=self._resolution,
                gids=gid_set,
                args=args,
                kwargs=kwargs))

        # gather results
        for future in as_completed(futures):
            n_finished += 1
            logger.info('Parallel aggregation futures collected: '
                        '{} out of {}'.format(n_finished, len(chunks)))
            output += future.result()

    return output
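
# Hedged sketch: the as_completed gather loop used above, shown with a plain
# concurrent.futures ProcessPoolExecutor standing in for SpawnProcessPool and
# a trivial worker function. It highlights that results are collected in
# completion order, not submission order. Not reV-specific.
from concurrent.futures import ProcessPoolExecutor, as_completed


def _square_all(values):
    """Toy worker: square every value in a chunk."""
    return [v * v for v in values]


if __name__ == '__main__':
    chunks = [[1, 2], [3, 4], [5, 6]]
    output = []
    with ProcessPoolExecutor(max_workers=2) as exe:
        futures = [exe.submit(_square_all, c) for c in chunks]
        for n_finished, future in enumerate(as_completed(futures), 1):
            output += future.result()
            print('{} out of {} futures collected'
                  .format(n_finished, len(futures)))
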
def compute_statistics(self, dataset, sites=None, diurnal=False,
                       month=False, combinations=False, max_workers=None,
                       chunks_per_worker=5, lat_lon_only=True):
    """
    Compute statistics

    Parameters
    ----------
    dataset : str
        Dataset to extract stats for
    sites : list | slice, optional
        Subset of sites to extract, by default None or all sites
    diurnal : bool, optional
        Extract diurnal stats, by default False
    month : bool, optional
        Extract monthly stats, by default False
    combinations : bool, optional
        Extract all combinations of temporal stats, by default False
    max_workers : None | int, optional
        Number of workers to use, if 1 run in serial, if None use all
        available cores, by default None
    chunks_per_worker : int, optional
        Number of chunks to extract on each worker, by default 5
    lat_lon_only : bool, optional
        Only append lat, lon coordinates to stats, by default True

    Returns
    -------
    res_stats : pandas.DataFrame
        DataFrame of desired statistics at desired time intervals
    """
    if max_workers is None:
        max_workers = os.cpu_count()

    slices = self._get_slices(dataset, sites,
                              chunks_per_slice=chunks_per_worker)
    if len(slices) == 1:
        max_workers = 1

    if max_workers > 1:
        msg = ('Extracting {} for {} in parallel using {} workers'
               .format(list(self.statistics), dataset, max_workers))
        logger.info(msg)

        loggers = [__name__, 'rex']
        with SpawnProcessPool(max_workers=max_workers,
                              loggers=loggers) as exe:
            futures = []
            for sites_slice in slices:
                future = exe.submit(self._extract_stats,
                                    self.res_h5, self.statistics, dataset,
                                    res_cls=self.res_cls,
                                    hsds=self._hsds,
                                    time_index=self.time_index,
                                    sites_slice=sites_slice,
                                    diurnal=diurnal,
                                    month=month,
                                    combinations=combinations)
                futures.append(future)

            res_stats = []
            for i, future in enumerate(as_completed(futures)):
                res_stats.append(future.result())
                logger.debug('Completed {} out of {} workers'
                             .format((i + 1), len(futures)))
    else:
        msg = ('Extracting {} for {} in serial'
               .format(self.statistics.keys(), dataset))
        logger.info(msg)
        res_stats = []
        for i, sites_slice in enumerate(slices):
            res_stats.append(
                self._extract_stats(self.res_h5, self.statistics, dataset,
                                    res_cls=self.res_cls,
                                    hsds=self._hsds,
                                    time_index=self.time_index,
                                    sites_slice=sites_slice,
                                    diurnal=diurnal,
                                    month=month,
                                    combinations=combinations))
            logger.debug('Completed {} out of {} sets of sites'
                         .format((i + 1), len(slices)))

    gc.collect()
    log_mem(logger)
    res_stats = pd.concat(res_stats)

    if lat_lon_only:
        meta = self.lat_lon
    else:
        meta = self.meta

    res_stats = meta.join(res_stats.sort_index(), how='inner')

    return res_stats
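
# Hedged sketch: the gather/merge step above in miniature. Partial stats
# frames (indexed by site gid) returned by the workers are concatenated and
# then joined onto coordinate metadata on the site index. All values and
# column names here are fabricated for illustration.
import pandas as pd

part1 = pd.DataFrame({'mean': [5.1, 6.2]}, index=[0, 1])   # sites 0-1
part2 = pd.DataFrame({'mean': [4.8]}, index=[2])           # site 2
res_stats = pd.concat([part1, part2])

meta = pd.DataFrame({'latitude': [40.0, 40.1, 40.2],
                     'longitude': [-105.0, -105.1, -105.2]})
res_stats = meta.join(res_stats.sort_index(), how='inner')
print(res_stats)
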
def compute(self, dset1, dset2, bins1, bins2, sites=None,
            max_workers=None, chunks_per_worker=5):
    """
    Compute joint probability distribution between given datasets using
    given bins for all sites.

    Parameters
    ----------
    dset1 : str
        Dataset 1 to generate joint probability distribution for
    dset2 : str
        Dataset 2 to generate joint probability distribution for
    bins1 : tuple
        (start, stop, step) for dataset 1 bins. The stop value is
        inclusive, so (0, 6, 2) would yield three bins with edges
        (0, 2, 4, 6). If the stop value is not perfectly divisible by
        the step, the last bin will overshoot the stop value.
    bins2 : tuple
        (start, stop, step) for dataset 2 bins. The stop value is
        inclusive, so (0, 6, 2) would yield three bins with edges
        (0, 2, 4, 6). If the stop value is not perfectly divisible by
        the step, the last bin will overshoot the stop value.
    sites : list | slice, optional
        Subset of sites to extract, by default None or all sites
    max_workers : None | int, optional
        Number of workers to use, if 1 run in serial, if None use all
        available cores, by default None
    chunks_per_worker : int, optional
        Number of chunks to extract on each worker, by default 5

    Returns
    -------
    jpd : pandas.DataFrame
        DataFrame of joint probability distribution between given
        datasets with given bins
    """
    if max_workers is None:
        max_workers = os.cpu_count()

    slices = self._get_slices(dset1, dset2, sites,
                              chunks_per_slice=chunks_per_worker)
    if len(slices) == 1:
        max_workers = 1

    jpd = {}
    if max_workers > 1:
        msg = ('Computing the joint probability distribution between {} '
               'and {} in parallel using {} workers'
               .format(dset1, dset2, max_workers))
        logger.info(msg)

        loggers = [__name__, 'rex']
        with SpawnProcessPool(max_workers=max_workers,
                              loggers=loggers) as exe:
            futures = []
            for sites_slice in slices:
                future = exe.submit(self.compute_joint_pd,
                                    self.res_h5, dset1, dset2,
                                    bins1, bins2,
                                    res_cls=self.res_cls,
                                    hsds=self._hsds,
                                    sites_slice=sites_slice)
                futures.append(future)

            for i, future in enumerate(as_completed(futures)):
                jpd.update(future.result())
                logger.debug('Completed {} out of {} workers'
                             .format((i + 1), len(futures)))
    else:
        msg = ('Computing the joint probability distribution between {} '
               'and {} in serial.'.format(dset1, dset2))
        logger.info(msg)
        for i, sites_slice in enumerate(slices):
            jpd.update(
                self.compute_joint_pd(self.res_h5, dset1, dset2,
                                      bins1, bins2,
                                      res_cls=self.res_cls,
                                      hsds=self._hsds,
                                      sites_slice=sites_slice))
            logger.debug('Completed {} out of {} sets of sites'
                         .format((i + 1), len(slices)))

    gc.collect()
    log_mem(logger)

    bins1 = self._make_bins(*bins1)
    bins2 = self._make_bins(*bins2)
    index = np.meshgrid(bins1[:-1], bins2[:-1], indexing='ij')
    index = np.array(index).T.reshape(-1, 2).astype(np.int16)
    index = pd.MultiIndex.from_arrays(index.T, names=(dset1, dset2))
    jpd = pd.DataFrame({k: v.flatten(order='F') for k, v in jpd.items()},
                       index=index).sort_index(axis=1)

    return jpd
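
# Hedged sketch: how the joint-distribution MultiIndex above is assembled
# from the bin edges. np.arange stands in for the class's _make_bins helper,
# and the bin ranges and dataset names are made up for this example.
import numpy as np
import pandas as pd

bins1 = np.arange(0, 6 + 2, 2)    # edges (0, 2, 4, 6) for dataset 1
bins2 = np.arange(0, 10 + 5, 5)   # edges (0, 5, 10) for dataset 2

# left bin edges only, paired across both datasets
index = np.meshgrid(bins1[:-1], bins2[:-1], indexing='ij')
index = np.array(index).T.reshape(-1, 2).astype(np.int16)
index = pd.MultiIndex.from_arrays(index.T,
                                  names=('windspeed', 'temperature'))
print(index)
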