def split_sources(self, sources=None, src_filter=None, maxweight=None,
                  concurrent_tasks=None):
    """
    Split a set of sources of the same source group; light sources
    (i.e. with weight <= maxweight) are not split.

    :param sources: sources of the same source group
    :param src_filter: SourceFilter instance
    :param maxweight: weight used to decide if a source is light
    :param concurrent_tasks: used to compute the maxweight if not given
    :yields: blocks of sources of weight around maxweight
    """
    if sources is None:
        sources = self.get_sources()
    if src_filter is None:
        src_filter = self.src_filter
    if maxweight is None:
        maxweight = self.get_maxweight(concurrent_tasks)
    light = [src for src in sources if src.weight <= maxweight]
    for block in block_splitter(
            light, maxweight, weight=operator.attrgetter('weight')):
        yield block
    heavy = [src for src in sources if src.weight > maxweight]
    for src in heavy:
        srcs = split_filter_source(src, src_filter)
        for block in block_splitter(
                srcs, maxweight, weight=operator.attrgetter('weight')):
            yield block

def split_in_blocks(self, maxweight, sources):
    """
    Split a set of sources in blocks of weight up to maxweight; heavy
    sources (i.e. with weight > maxweight) are split.

    :param maxweight: maximum weight of a block
    :param sources: sources of the same source group
    :yields: blocks of sources of weight around maxweight
    """
    sources.sort(key=weight)
    # yield light sources in blocks
    light = [src for src in sources if src.weight <= maxweight]
    for block in block_splitter(light, maxweight, weight):
        yield block
    # yield heavy sources in blocks
    heavy = [src for src in sources if src.weight > maxweight]
    for src in heavy:
        srcs = [s for s in source.split_source(src)
                if self.src_filter.get_close_sites(s) is not None]
        for block in block_splitter(srcs, maxweight, weight):
            yield block

def split_sources(self, sources, src_filter, maxweight=MAXWEIGHT):
    """
    Split a set of sources of the same source group; light sources
    (i.e. with weight <= maxweight) are not split.

    :param sources: sources of the same source group
    :param src_filter: SourceFilter instance
    :param maxweight: weight used to decide if a source is light
    :yields: blocks of sources of weight around maxweight
    """
    light = [src for src in sources if src.weight <= maxweight]
    self.add_infos(light)
    for block in block_splitter(
            light, maxweight, weight=operator.attrgetter('weight')):
        yield block
    heavy = [src for src in sources if src.weight > maxweight]
    self.add_infos(heavy)
    for src in heavy:
        srcs = sourceconverter.split_filter_source(src, src_filter)
        if len(srcs) > 1:
            logging.info('Splitting %s "%s" in %d sources',
                         src.__class__.__name__, src.source_id, len(srcs))
        for block in block_splitter(
                srcs, maxweight, weight=operator.attrgetter('weight')):
            yield block

def build_starmap(self, ssm, sitecol, assetcol, riskmodel, imts,
                  trunc_level, correl_model, min_iml, monitor):
    """
    :param ssm: CompositeSourceModel containing a single source model
    :param sitecol: a SiteCollection instance
    :param assetcol: an AssetCollection instance
    :param riskmodel: a RiskModel instance
    :param imts: a list of Intensity Measure Types
    :param trunc_level: truncation level
    :param correl_model: correlation model
    :param min_iml: vector of minimum intensities, one per IMT
    :param monitor: a Monitor instance
    :returns: a pair (starmap, dictionary)
    """
    ruptures_by_grp = AccumDict()
    num_ruptures = 0
    num_events = 0
    allargs = []
    grp_trt = {}
    # collect the sources
    maxweight = ssm.get_maxweight(self.oqparam.concurrent_tasks)
    logging.info('Using a maxweight of %d', maxweight)
    for src_group in ssm.src_groups:
        grp_trt[src_group.id] = trt = src_group.trt
        gsims = ssm.gsim_lt.values[trt]
        for block in block_splitter(src_group, maxweight, getweight):
            allargs.append((block, self.sitecol, gsims, monitor))
    # collect the ruptures
    for dic in parallel.starmap(self.compute_ruptures, allargs):
        ruptures_by_grp += dic
        [rupts] = dic.values()
        num_ruptures += len(rupts)
        num_events += dic.num_events
    ruptures_by_grp.num_events = num_events
    save_ruptures(self, ruptures_by_grp)
    # determine the realizations
    rlzs_assoc = ssm.info.get_rlzs_assoc(
        count_ruptures=lambda grp: len(ruptures_by_grp.get(grp.id, [])))
    allargs = []
    # prepare the risk inputs
    ruptures_per_block = self.oqparam.ruptures_per_block
    for src_group in ssm.src_groups:
        for rupts in block_splitter(
                ruptures_by_grp[src_group.id], ruptures_per_block):
            trt = grp_trt[rupts[0].grp_id]
            ri = riskinput.RiskInputFromRuptures(
                trt, imts, sitecol, rupts, trunc_level,
                correl_model, min_iml)
            allargs.append((ri, riskmodel, rlzs_assoc, assetcol, monitor))
    taskname = '%s#%d' % (losses_by_taxonomy.__name__, ssm.sm_id + 1)
    smap = starmap(losses_by_taxonomy, allargs, name=taskname)
    attrs = dict(num_ruptures={
        sg_id: len(rupts) for sg_id, rupts in ruptures_by_grp.items()},
        num_events=num_events,
        num_rlzs=len(rlzs_assoc.realizations),
        sm_id=ssm.sm_id)
    return smap, attrs

def gen_args(self, src_groups, oq, monitor):
    """
    Used in the case of large source model logic trees.

    :param src_groups: a list of SourceGroup instances
    :param oq: a :class:`openquake.commonlib.oqvalidation.OqParam` instance
    :param monitor: a :class:`openquake.baselib.performance.Monitor`
    :yields: (sources, sites, gsims, monitor) tuples
    """
    ngroups = len(src_groups)
    maxweight = self.csm.get_maxweight(oq.concurrent_tasks)
    logging.info('Using a maxweight of %d', maxweight)
    nheavy = nlight = 0
    self.infos = {}
    for sg in src_groups:
        logging.info('Sending source group #%d of %d (%s, %d sources)',
                     sg.id + 1, ngroups, sg.trt, len(sg.sources))
        gsims = self.rlzs_assoc.gsims_by_grp_id[sg.id]
        if oq.poes_disagg:  # only for disaggregation
            monitor.sm_id = self.rlzs_assoc.sm_ids[sg.id]
        monitor.seed = self.rlzs_assoc.seed
        monitor.samples = self.rlzs_assoc.samples[sg.id]
        light = [src for src in sg.sources if src.weight <= maxweight]
        for block in block_splitter(
                light, maxweight, weight=operator.attrgetter('weight')):
            for src in block:
                self.infos[sg.id, src.source_id] = source.SourceInfo(src)
            yield block, self.sitecol, gsims, monitor
            nlight += 1
        heavy = [src for src in sg.sources if src.weight > maxweight]
        if not heavy:
            continue
        with self.monitor('split/filter heavy sources', autoflush=True):
            for src in heavy:
                sites = self.ss_filter.affected(src)
                self.infos[sg.id, src.source_id] = source.SourceInfo(src)
                sources = split_filter_source(
                    src, sites, self.ss_filter, self.random_seed)
                if len(sources) > 1:
                    logging.info('Splitting %s "%s" in %d sources',
                                 src.__class__.__name__, src.source_id,
                                 len(sources))
                for block in block_splitter(
                        sources, maxweight,
                        weight=operator.attrgetter('weight')):
                    yield block, sites, gsims, monitor
                    nheavy += 1
    logging.info('Sent %d light and %d heavy tasks', nlight, nheavy)

def apply(cls, task, args, concurrent_tasks=cpu_count * 3,
          maxweight=None, weight=lambda item: 1,
          key=lambda item: 'Unspecified', distribute=None,
          progress=logging.info):
    r"""
    Apply a task to a tuple of the form (sequence, \*other_args)
    by first splitting the sequence in chunks, according to the weight
    of the elements and possibly to a key (see
    :func:`openquake.baselib.general.split_in_blocks`).

    :param task: a task to run in parallel
    :param args: the arguments to be passed to the task function
    :param concurrent_tasks: hint about how many tasks to generate
    :param maxweight: if not None, used to split the tasks
    :param weight: function to extract the weight of an item in arg0
    :param key: function to extract the kind of an item in arg0
    :param distribute: if not given, inferred from OQ_DISTRIBUTE
    :param progress: logging function to use (default logging.info)
    :returns: an :class:`IterResult` object
    """
    arg0 = args[0]  # this is assumed to be a sequence
    mon = args[-1]
    args = args[1:-1]
    if maxweight:  # block_splitter is lazy
        task_args = ((blk,) + args for blk in block_splitter(
            arg0, maxweight, weight, key))
    else:  # split_in_blocks is eager
        task_args = [(blk,) + args for blk in split_in_blocks(
            arg0, concurrent_tasks or 1, weight, key)]
    return cls(task, task_args, mon, distribute, progress).submit_all()

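# The two branches of `apply` above can be reproduced directly with the
# underlying splitters. A runnable sketch, assuming `openquake.baselib`
# is installed; the toy sequence and `weight` function are mine, not
# taken from the engine:

from openquake.baselib.general import block_splitter, split_in_blocks


def weight(i):  # toy weight function (an assumption of this sketch)
    return i + 1


seq = list(range(20))

# maxweight given -> lazy block_splitter: block weights stay near 30
lazy_blocks = list(block_splitter(seq, 30, weight))
print([b.weight for b in lazy_blocks])

# no maxweight -> eager split_in_blocks: roughly 4 blocks (the hint)
eager_blocks = list(split_in_blocks(seq, 4, weight))
print(len(eager_blocks), [b.weight for b in eager_blocks])
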
def get_rupture_getters(dstore, ct=0, slc=slice(None), srcfilter=None):
    """
    :param dstore: a :class:`openquake.commonlib.datastore.DataStore`
    :param ct: number of concurrent tasks
    :returns: a list of RuptureGetters
    """
    full_lt = dstore['full_lt']
    rlzs_by_gsim = full_lt.get_rlzs_by_gsim()
    rup_array = dstore['ruptures'][slc]
    if len(rup_array) == 0:
        raise NotFound('There are no ruptures in %s' % dstore)
    rup_array.sort(order=['trt_smr', 'n_occ'])
    scenario = 'scenario' in dstore['oqparam'].calculation_mode
    proxies = [RuptureProxy(rec, scenario) for rec in rup_array]
    maxweight = rup_array['n_occ'].sum() / (ct / 2 or 1)
    rgetters = []
    for block in general.block_splitter(
            proxies, maxweight, operator.itemgetter('n_occ'),
            key=operator.itemgetter('trt_smr')):
        trt_smr = block[0]['trt_smr']
        rbg = rlzs_by_gsim[trt_smr]
        rg = RuptureGetter(block, dstore.filename, trt_smr,
                           full_lt.trt_by(trt_smr), rbg)
        rgetters.append(rg)
    return rgetters

def gen_rupture_getters(dstore, slc=slice(None), concurrent_tasks=1,
                        filename=None):
    """
    :yields: RuptureGetters
    """
    try:
        e0s = dstore['eslices'][:, 0]
    except KeyError:
        e0s = None
    if dstore.parent:
        dstore = dstore.parent
    csm_info = dstore['csm_info']
    trt_by_grp = csm_info.grp_by("trt")
    samples = csm_info.get_samples_by_grp()
    rlzs_by_gsim = csm_info.get_rlzs_by_gsim_grp()
    rup_array = dstore['ruptures'][slc]
    maxweight = numpy.ceil(len(rup_array) / (concurrent_tasks or 1))
    nr, ne = 0, 0
    for grp_id, arr in general.group_array(rup_array, 'grp_id').items():
        if not rlzs_by_gsim[grp_id]:
            # this may happen if a source model has no sources, like
            # in event_based_risk/case_3
            continue
        for block in general.block_splitter(arr, maxweight):
            if e0s is None:
                e0 = numpy.zeros(len(block), U32)
            else:
                e0 = e0s[nr: nr + len(block)]
            rgetter = RuptureGetter(
                numpy.array(block), filename or dstore.filename, grp_id,
                trt_by_grp[grp_id], samples[grp_id], rlzs_by_gsim[grp_id],
                e0)
            yield rgetter
            nr += len(block)
            ne += rgetter.num_events

def apply(cls, task, task_args, concurrent_tasks=executor.num_tasks_hint,
          maxweight=None, weight=lambda item: 1,
          key=lambda item: 'Unspecified', name=None):
    r"""
    Apply a task to a tuple of the form (sequence, \*other_args)
    by first splitting the sequence in chunks, according to the weight
    of the elements and possibly to a key (see
    :func:`openquake.baselib.general.split_in_blocks`).

    :param task: a task to run in parallel
    :param task_args: the arguments to be passed to the task function
    :param concurrent_tasks: hint about how many tasks to generate
    :param maxweight: if not None, used to split the tasks
    :param weight: function to extract the weight of an item in arg0
    :param key: function to extract the kind of an item in arg0
    :param name: name of the task to be used in the log
    """
    arg0 = task_args[0]  # this is assumed to be a sequence
    args = task_args[1:]
    if maxweight:
        chunks = block_splitter(arg0, maxweight, weight, key)
    else:
        chunks = split_in_blocks(arg0, concurrent_tasks or 1, weight, key)
    return cls(task, [(chunk,) + args for chunk in chunks], name)

def classical_split_filter(srcs, srcfilter, gsims, params, monitor):
    """
    Split the given sources, filter the subsources and then compute
    the PoEs. Yield back subtasks if the split sources contain more
    than maxweight ruptures.
    """
    # first check if we are sampling the sources
    ss = int(os.environ.get('OQ_SAMPLE_SOURCES', 0))
    if ss:
        splits, stime = split_sources(srcs)
        srcs = readinput.random_filtered_sources(splits, srcfilter, ss)
        yield classical(srcs, srcfilter, gsims, params, monitor)
        return
    sources = []
    with monitor("filtering/splitting sources"):
        for src, _sites in srcfilter(srcs):
            if src.num_ruptures >= params['maxweight']:
                splits, stime = split_sources([src])
                sources.extend(srcfilter.filter(splits))
            else:
                sources.append(src)
    blocks = list(block_splitter(sources, params['maxweight'],
                                 operator.attrgetter('num_ruptures')))
    if blocks:
        # yield the first blocks (if any) and compute the last block
        # in core; NB: the last block is usually the smallest one
        for block in blocks[:-1]:
            yield classical, block, srcfilter, gsims, params
        yield classical(blocks[-1], srcfilter, gsims, params, monitor)

def _send_sources(self, smap):
    oq = self.oqparam
    opt = self.oqparam.optimize_same_id_sources
    nrup = operator.attrgetter('num_ruptures')
    param = dict(
        truncation_level=oq.truncation_level,
        imtls=oq.imtls,
        filter_distance=oq.filter_distance,
        reqv=oq.get_reqv(),
        pointsource_distance=oq.pointsource_distance,
        maxweight=min(self.csm.get_maxweight(nrup, oq.concurrent_tasks),
                      base.RUPTURES_PER_BLOCK))
    logging.info('Max ruptures per task = %(maxweight)d', param)
    num_tasks = 0
    num_sources = 0
    if self.csm.has_dupl_sources and not opt:
        logging.warning('Found %d duplicated sources',
                        self.csm.has_dupl_sources)
    for trt, sources in self.csm.get_trt_sources():
        gsims = self.csm.info.gsim_lt.get_gsims(trt)
        num_sources += len(sources)
        if hasattr(sources, 'atomic') and sources.atomic:
            smap.submit(sources, self.src_filter, gsims, param,
                        func=classical)
            yield sources
            num_tasks += 1
        else:  # regroup the sources in blocks
            for block in block_splitter(sources, param['maxweight'], nrup):
                smap.submit(block, self.src_filter, gsims, param)
                yield block
                num_tasks += 1
    logging.info('Sent %d sources in %d tasks', num_sources, num_tasks)

def submit_sources(self, sitecol, siteidx=0):
    """
    Submit the light sources and then the (split) heavy sources.
    Only the sources affecting the sitecol are considered. Also,
    set the .seed attribute of each source.
    """
    rlzs_assoc = self.csm.info.get_rlzs_assoc()
    for kind in ('light', 'heavy'):
        sources = list(self.get_sources(kind, sitecol))
        if not sources:
            continue
        # set a seed for each split source; the seed is used
        # only by the event based calculator, but it is set anyway
        for src in sources:
            self.csm.filtered_weight += src.weight
        nblocks = 0
        for block in block_splitter(
                sources, self.maxweight,
                operator.attrgetter('weight'),
                operator.attrgetter('trt_model_id')):
            sent = self.tm.submit(block, sitecol, siteidx, rlzs_assoc,
                                  self.monitor.new())
            self.source_chunks.append(
                (len(block), block.weight, sum(sent.values())))
            nblocks += 1
        logging.info('Sent %d sources in %d block(s)',
                     len(sources), nblocks)

def classical_split_filter(sources, rlzs_by_gsim, params, monitor):
    """
    Compute the PoEs from filtered sources.
    """
    minw = params['min_weight']
    maxw = params['max_weight'] / 2
    blocks = list(block_splitter(sources, maxw, get_weight))
    if not blocks:
        yield {'pmap': {}, 'extra': {}}
        return
    heavy = []
    light = list(blocks[-1])
    for block in blocks[:-1]:
        if block.weight < minw:  # extend light sources
            light.extend(block)
        else:  # heavy block, turn it into a subtask
            heavy.append(int(block.weight))
            yield classical, block, rlzs_by_gsim, params
    if heavy:
        msg = 'produced %d subtasks with weights %s' % (len(heavy), heavy)
        try:
            logs.dbcmd('log', monitor.calc_id, datetime.utcnow(), 'DEBUG',
                       'classical_split_filter#%d' % monitor.task_no, msg)
        except Exception:
            # a foreign key error in case of `oq run` is expected
            print(msg)
    yield classical(light, rlzs_by_gsim, params, monitor)

def gen_args(self, tiles):
    """
    Yield (sources, sitecol, siteidx, rlzs_assoc, monitor) by
    looping on the tiles and on the source blocks.
    """
    siteidx = 0
    for i, sitecol in enumerate(tiles, 1):
        if len(tiles) > 1:
            logging.info('Processing tile %d', i)
        tile = Tile(sitecol, self.maximum_distance)
        for kind in ('light', 'heavy'):
            if self.filter_sources:
                logging.info('Filtering %s sources', kind)
            sources = list(self.get_sources(kind, tile))
            if not sources:
                continue
            for src in sources:
                self.csm.filtered_weight += src.weight
            nblocks = 0
            for block in block_splitter(
                    sources, self.maxweight,
                    operator.attrgetter('weight'),
                    operator.attrgetter('src_group_id')):
                yield (block, sitecol, siteidx, self.rlzs_assoc,
                       self.monitor.new())
                nblocks += 1
            logging.info('Sent %d sources in %d block(s)',
                         len(sources), nblocks)
        siteidx += len(sitecol)

def gen_rupture_getters(dstore, slc=slice(None), concurrent_tasks=1,
                        hdf5cache=None):
    """
    :yields: RuptureGetters
    """
    if dstore.parent:
        dstore = dstore.parent
    csm_info = dstore['csm_info']
    trt_by_grp = csm_info.grp_by("trt")
    samples = csm_info.get_samples_by_grp()
    rlzs_by_gsim = csm_info.get_rlzs_by_gsim_grp()
    rup_array = dstore['ruptures'][slc]
    maxweight = numpy.ceil(len(rup_array) / (concurrent_tasks or 1))
    nr, ne = 0, 0
    for grp_id, arr in general.group_array(rup_array, 'grp_id').items():
        if not rlzs_by_gsim[grp_id]:
            # this may happen if a source model has no sources, like
            # in event_based_risk/case_3
            continue
        for block in general.block_splitter(arr, maxweight):
            rgetter = RuptureGetter(
                hdf5cache or dstore.filename, numpy.array(block), grp_id,
                trt_by_grp[grp_id], samples[grp_id], rlzs_by_gsim[grp_id])
            rgetter.weight = getattr(block, 'weight', len(block))
            yield rgetter
            nr += len(block)
            ne += rgetter.num_events
    logging.info('Read %d ruptures and %d events', nr, ne)

def _gen_riskinputs(self, kind, eps, num_events):
    rinfo_dt = numpy.dtype([('sid', U16), ('num_assets', U16)])
    rinfo = []
    assets_by_site = self.assetcol.assets_by_site()
    dstore = self.can_read_parent() or self.datastore
    for sid, assets in enumerate(assets_by_site):
        if len(assets) == 0:
            continue
        # build the riskinputs
        if kind == 'poe':  # hcurves, shape (R, N)
            getter = getters.PmapGetter(dstore, self.rlzs_assoc, [sid])
            getter.num_rlzs = self.R
        else:  # gmf
            getter = getters.GmfDataGetter(dstore, [sid], self.R)
        if dstore is self.datastore:
            # read the hazard data in the controller node
            getter.init()
        else:
            # the datastore must be closed to avoid the HDF5 fork bug
            assert dstore.hdf5 == (), '%s is not closed!' % dstore
        for block in general.block_splitter(
                assets, self.oqparam.assets_per_site_limit):
            # dictionary of epsilons for the reduced assets
            reduced_eps = {ass.ordinal: eps[ass.ordinal]
                           for ass in block
                           if eps is not None and len(eps)}
            yield riskinput.RiskInput(getter, [block], reduced_eps)
            rinfo.append((sid, len(block)))
            if len(block) >= TWO16:
                logging.error('There are %d assets on site #%d!',
                              len(block), sid)
    self.datastore['riskinput_info'] = numpy.array(rinfo, rinfo_dt)

def classical_split_filter(srcs, srcfilter, gsims, params, monitor):
    """
    Split the given sources, filter the subsources and then compute
    the PoEs. Yield back subtasks if the split sources contain more
    than maxweight ruptures.
    """
    # first check if we are sampling the sources
    ss = int(os.environ.get('OQ_SAMPLE_SOURCES', 0))
    if ss:
        splits, stime = split_sources(srcs)
        srcs = random_filtered_sources(splits, srcfilter, ss)
        yield classical(srcs, srcfilter, gsims, params, monitor)
        return
    # NB: splitting all the sources improves the distribution
    # significantly, compared to splitting only the big sources
    sources = []
    with monitor("filtering/splitting sources"):
        for src, _sites in srcfilter(srcs):
            splits, _stime = split_sources([src])
            sources.extend(srcfilter.filter(splits))
    if sources:
        sources.sort(key=weight)
        totsites = len(srcfilter.sitecol)
        mw = 1000 if totsites <= params['max_sites_disagg'] else 50000
        mweight = max(mw, sum(src.weight for src in sources) /
                      params['task_multiplier'])
        blocks = list(block_splitter(sources, mweight, weight))
        for block in blocks[:-1]:
            yield classical, block, srcfilter, gsims, params
        yield classical(blocks[-1], srcfilter, gsims, params, monitor)

def submit_sources(self, sitecol, siteidx=0):
    """
    Submit the light sources and then the (split) heavy sources.
    Only the sources affecting the sitecol are considered.
    """
    tile = Tile(sitecol, self.maximum_distance)
    for kind in ('light', 'heavy'):
        if self.filter_sources:
            logging.info('Filtering %s sources', kind)
        sources = list(self.get_sources(kind, tile))
        if not sources:
            continue
        for src in sources:
            self.csm.filtered_weight += src.weight
        nblocks = 0
        for block in block_splitter(
                sources, self.maxweight,
                operator.attrgetter('weight'),
                operator.attrgetter('trt_model_id')):
            sent = self.tm.submit(block, sitecol, siteidx,
                                  self.rlzs_assoc, self.monitor.new())
            self.source_chunks.append(
                (len(block), block.weight, sum(sent.values())))
            nblocks += 1
        logging.info('Sent %d sources in %d block(s)',
                     len(sources), nblocks)

def gen_args(self, ruptures_by_grp):
    """
    :param ruptures_by_grp: a dictionary of EBRupture objects
    :yields: the arguments for compute_gmfs_and_curves
    """
    oq = self.oqparam
    monitor = self.monitor(self.core_task.__name__)
    imts = list(oq.imtls)
    min_iml = calc.fix_minimum_intensity(oq.minimum_intensity, imts)
    correl_model = oq.get_correl_model()
    try:
        csm_info = self.csm.info
    except AttributeError:  # no csm
        csm_info = self.datastore['csm_info']
    samples_by_grp = csm_info.get_samples_by_grp()
    for grp_id in ruptures_by_grp:
        ruptures = ruptures_by_grp[grp_id]
        if not ruptures:
            continue
        rlzs_by_gsim = self.rlzs_assoc.get_rlzs_by_gsim(grp_id)
        for block in block_splitter(ruptures, oq.ruptures_per_block):
            samples = samples_by_grp[grp_id]
            getter = GmfGetter(rlzs_by_gsim, block, self.sitecol,
                               imts, min_iml, oq.truncation_level,
                               correl_model, samples)
            yield getter, oq, monitor

def from_sources(self, par, monitor):
    """
    Prefilter the composite source model and store the source_info
    """
    self.R = self.csm.info.get_num_rlzs()
    num_rlzs = {grp_id: sum(
        len(rlzs) for rlzs in self.rlzs_by_gsim_grp[grp_id].values())
        for grp_id in self.rlzs_by_gsim_grp}
    param = {'ruptures_per_block': RUPTURES_PER_BLOCK}
    param['filter_distance'] = self.oqparam.filter_distance
    param['ses_per_logic_tree_path'] = self.oqparam.ses_per_logic_tree_path
    param['gsims_by_trt'] = self.csm.gsim_lt.values
    param['pointsource_distance'] = self.oqparam.pointsource_distance
    logging.info('Building ruptures')
    ires = parallel.Starmap.apply(
        build_ruptures,
        (self.csm.get_sources(), self.src_filter, param, monitor),
        concurrent_tasks=self.oqparam.concurrent_tasks,
        weight=operator.attrgetter('num_ruptures'),
        key=operator.attrgetter('src_group_id'))

    def weight(ebr):
        return numpy.sqrt(num_rlzs[ebr.grp_id] * ebr.multiplicity *
                          len(ebr.sids))

    for ruptures in block_splitter(
            self._store_ruptures(ires), BLOCKSIZE,
            weight, operator.attrgetter('grp_id')):
        ebr = ruptures[0]
        rlzs_by_gsim = self.rlzs_by_gsim_grp[ebr.grp_id]
        par = par.copy()
        par['samples'] = self.samples_by_grp[ebr.grp_id]
        yield ruptures, self.src_filter, rlzs_by_gsim, par, monitor
    self.setting_events()
    if self.oqparam.ground_motion_fields:
        logging.info('Building GMFs')

def get_rupture_getters(dstore, ct=0, slc=slice(None), srcfilter=None):
    """
    :param dstore: a :class:`openquake.commonlib.datastore.DataStore`
    :param ct: number of concurrent tasks
    :returns: a list of RuptureGetters
    """
    full_lt = dstore['full_lt']
    rlzs_by_gsim = full_lt.get_rlzs_by_gsim()
    rup_array = dstore['ruptures'][slc]
    if len(rup_array) == 0:
        raise NotFound('There are no ruptures in %s' % dstore)
    rup_array.sort(order='trt_smr')  # avoid generating too many tasks
    scenario = 'scenario' in dstore['oqparam'].calculation_mode
    if srcfilter is None:
        proxies = [RuptureProxy(rec, None, scenario) for rec in rup_array]
    elif len(rup_array) <= 1000:  # do not parallelize
        proxies = weight_ruptures(rup_array, srcfilter, full_lt.trt_by,
                                  scenario)
    else:  # parallelize the weighting of the ruptures
        proxies = parallel.Starmap.apply(
            weight_ruptures,
            (rup_array, srcfilter, full_lt.trt_by, scenario),
            concurrent_tasks=ct).reduce(acc=[])
    maxweight = sum(proxy.weight for proxy in proxies) / (ct or 1)
    rgetters = []
    for block in general.block_splitter(
            proxies, maxweight, operator.attrgetter('weight'),
            key=operator.itemgetter('trt_smr')):
        trt_smr = block[0]['trt_smr']
        rg = RuptureGetter(block, dstore.filename, trt_smr,
                           full_lt.trt_by(trt_smr),
                           rlzs_by_gsim[trt_smr])
        rgetters.append(rg)
    return rgetters

def gen_args(self, tiles):
    """
    Yield (sources, sitecol, siteidx, rlzs_assoc, monitor) by
    looping on the tiles and on the source blocks.
    """
    siteidx = 0
    for i, sitecol in enumerate(tiles, 1):
        if len(tiles) > 1:
            logging.info('Processing tile %d', i)
        tile = Tile(sitecol, self.maximum_distance)
        for kind in ('light', 'heavy'):
            if self.filter_sources:
                logging.info('Filtering %s sources', kind)
            sources = list(self.get_sources(kind, tile))
            if not sources:
                continue
            for src in sources:
                self.csm.filtered_weight += src.weight
            nblocks = 0
            for block in block_splitter(
                    sources, self.maxweight,
                    operator.attrgetter('weight'),
                    operator.attrgetter('trt_model_id')):
                yield (block, sitecol, siteidx, self.rlzs_assoc,
                       self.monitor.new())
                nblocks += 1
            logging.info('Sent %d sources in %d block(s)',
                         len(sources), nblocks)
        siteidx += len(sitecol)

def gen_rupture_getters(dstore, slc=slice(None), concurrent_tasks=1,
                        hdf5cache=None):
    """
    :yields: RuptureGetters
    """
    if dstore.parent:
        dstore = dstore.parent
    csm_info = dstore['csm_info']
    trt_by_grp = csm_info.grp_by("trt")
    samples = csm_info.get_samples_by_grp()
    rlzs_by_gsim = csm_info.get_rlzs_by_gsim_grp()
    rup_array = dstore['ruptures'][slc]
    maxweight = numpy.ceil(len(rup_array) / (concurrent_tasks or 1))
    nr, ne, first_event = 0, 0, 0
    for grp_id, arr in general.group_array(rup_array, 'grp_id').items():
        if not rlzs_by_gsim[grp_id]:
            # this may happen if a source model has no sources, like
            # in event_based_risk/case_3
            continue
        for block in general.block_splitter(arr, maxweight):
            rgetter = RuptureGetter(
                hdf5cache or dstore.filename, numpy.array(block), grp_id,
                trt_by_grp[grp_id], samples[grp_id], rlzs_by_gsim[grp_id],
                first_event)
            rgetter.weight = getattr(block, 'weight', len(block))
            first_event += rgetter.num_events
            yield rgetter
            nr += len(block)
            ne += rgetter.num_events
    logging.info('Read %d ruptures and %d events', nr, ne)

def _gen_riskinputs_gmf(self, dstore):
    if 'gmf_data' not in dstore:  # needed for case_shakemap
        dstore.close()
        dstore = self.datastore
    if 'gmf_data' not in dstore:
        raise InvalidFile('Did you forget gmfs_csv in %s?' %
                          self.oqparam.inputs['job_ini'])
    with self.monitor('reading GMFs'):
        rlzs = dstore['events']['rlz_id']
        gmf_df = dstore.read_df('gmf_data', 'sid')
        by_sid = dict(list(gmf_df.groupby(gmf_df.index)))
    logging.info('Grouped the GMFs by site ID')
    for sid, assets in enumerate(self.assetcol.assets_by_site()):
        if len(assets) == 0:
            continue
        try:
            df = by_sid[sid]
        except KeyError:
            getter = getters.ZeroGetter(sid, rlzs, self.R)
        else:
            df['rlzs'] = rlzs[df.eid.to_numpy()]
            getter = getters.GmfDataGetter(sid, df, len(rlzs), self.R)
        if len(dstore['gmf_data/gmv_0']) == 0:
            raise RuntimeError(
                'There are no GMFs available: perhaps you did set '
                'ground_motion_fields=False or a large minimum_intensity')
        for block in general.block_splitter(
                assets, self.oqparam.assets_per_site_limit):
            yield riskinput.RiskInput(sid, getter, numpy.array(block))
            if len(block) >= TWO16:
                logging.error('There are %d assets on site #%d!',
                              len(block), sid)

def gen_rupture_getters(dstore, srcfilter, slc=slice(None)):
    """
    :yields: filtered RuptureGetters
    """
    full_lt = dstore['full_lt']
    trt_by_grp = full_lt.trt_by_grp
    samples = full_lt.get_samples_by_grp()
    rlzs_by_gsim = full_lt.get_rlzs_by_gsim_grp()
    rup_array = dstore['ruptures'][slc]
    ct = dstore['oqparam'].concurrent_tasks or 1
    items = list(general.group_array(rup_array, 'grp_id').items())
    items.sort(key=lambda it: len(it[1]))
    maxweight = None
    while items:
        grp_id, rups = items.pop()  # from the largest group
        if not rlzs_by_gsim[grp_id]:
            # this may happen if a source model has no sources, like
            # in event_based_risk/case_3
            continue
        trt = trt_by_grp[grp_id]
        proxies = list(_gen(rups, srcfilter, trt, samples[grp_id]))
        if not maxweight:
            maxweight = sum(p.weight for p in proxies) / ct
        blocks = list(general.block_splitter(
            proxies, maxweight, operator.attrgetter('weight')))
        logging.info('Group %d: %d ruptures -> %d task(s)',
                     grp_id, len(rups), len(blocks))
        for block in blocks:
            rgetter = RuptureGetter(block, dstore.filename, grp_id, trt,
                                    samples[grp_id], rlzs_by_gsim[grp_id])
            yield rgetter

def execute(self):
    """
    Parallelize on the riskinputs and return a dictionary of results.
    Requires a `.core_task` to be defined with signature
    (riskinputs, crmodel, param, monitor).
    """
    if not hasattr(self, 'riskinputs'):  # in the reportwriter
        return
    ct = self.oqparam.concurrent_tasks or 1
    maxw = sum(ri.weight for ri in self.riskinputs) / ct
    smap = parallel.Starmap(self.core_task.__func__,
                            h5=self.datastore.hdf5)
    smap.monitor.save('crmodel', self.crmodel)
    for block in general.block_splitter(self.riskinputs, maxw,
                                        get_weight, sort=True):
        for ri in block:
            # we must use eager reading for performance reasons:
            # concurrent reading on the workers would be extra-slow;
            # also, I could not get lazy reading to work with
            # the SWMR mode for event_based_risk
            if not isinstance(ri.hazard_getter, getters.PmapGetter):
                ri.hazard_getter.init()
        smap.submit((block, self.param))
    return smap.reduce(self.combine)

def test_block_splitter_with_generator(self):
    # Test the block with a data set of unknown length
    # (such as a generator)
    data = range(10)
    expected = [[0, 1, 2], [3, 4, 5], [6, 7, 8], [9]]
    actual = [x for x in block_splitter(data, 3)]
    self.assertEqual(expected, actual)

def apply(cls, task, task_args, concurrent_tasks=executor.num_tasks_hint,
          maxweight=None, weight=lambda item: 1,
          key=lambda item: 'Unspecified', name=None):
    r"""
    Apply a task to a tuple of the form (sequence, \*other_args)
    by first splitting the sequence in chunks, according to the weight
    of the elements and possibly to a key (see
    :func:`openquake.baselib.general.split_in_blocks`).
    Then reduce the results with an aggregation function.

    The chunks which are generated internally can be seen directly
    (useful for debugging purposes) by looking at the attribute
    `._chunks`, right after the `apply` function has been called.

    :param task: a task to run in parallel
    :param task_args: the arguments to be passed to the task function
    :param concurrent_tasks: hint about how many tasks to generate
    :param maxweight: if not None, used to split the tasks
    :param weight: function to extract the weight of an item in arg0
    :param key: function to extract the kind of an item in arg0
    :param name: name of the task to be used in the log
    """
    arg0 = task_args[0]  # this is assumed to be a sequence
    args = task_args[1:]
    if maxweight:
        chunks = block_splitter(arg0, maxweight, weight, key)
    else:
        chunks = split_in_blocks(arg0, concurrent_tasks or 1, weight, key)
    return cls.starmap(task, [(chunk,) + args for chunk in chunks], name)

def _gen_riskinputs(self, kind, eps, num_events):
    assets_by_site = self.assetcol.assets_by_site()
    dstore = self.can_read_parent() or self.datastore
    for sid, assets in enumerate(assets_by_site):
        if len(assets) == 0:
            continue
        # build the riskinputs
        if kind == 'poe':  # hcurves, shape (R, N)
            getter = PmapGetter(dstore, self.rlzs_assoc, [sid])
            getter.num_rlzs = self.R
        else:  # gmf
            getter = GmfDataGetter(dstore, [sid], self.R,
                                   self.oqparam.imtls)
        if dstore is self.datastore:
            # read the hazard data in the controller node
            getter.init()
        else:
            # the datastore must be closed to avoid the HDF5 fork bug
            assert dstore.hdf5 == (), '%s is not closed!' % dstore
        for block in general.block_splitter(assets, 1000):
            # dictionary of epsilons for the reduced assets
            reduced_eps = {ass.ordinal: eps[ass.ordinal]
                           for ass in block
                           if eps is not None and len(eps)}
            yield riskinput.RiskInput(getter, [block], reduced_eps)

def apply(cls, task, args, concurrent_tasks=cpu_count * 3,
          maxweight=None, weight=lambda item: 1,
          key=lambda item: 'Unspecified', name=None, distribute=None):
    r"""
    Apply a task to a tuple of the form (sequence, \*other_args)
    by first splitting the sequence in chunks, according to the weight
    of the elements and possibly to a key (see
    :func:`openquake.baselib.general.split_in_blocks`).

    :param task: a task to run in parallel
    :param args: the arguments to be passed to the task function
    :param concurrent_tasks: hint about how many tasks to generate
    :param maxweight: if not None, used to split the tasks
    :param weight: function to extract the weight of an item in arg0
    :param key: function to extract the kind of an item in arg0
    :param name: name of the task to be used in the log
    :param distribute: if not given, inferred from OQ_DISTRIBUTE
    :returns: an :class:`IterResult` object
    """
    arg0 = args[0]  # this is assumed to be a sequence
    args = args[1:]
    if maxweight:
        chunks = block_splitter(arg0, maxweight, weight, key)
    else:
        chunks = split_in_blocks(arg0, concurrent_tasks or 1, weight, key)
    task_args = [(ch,) + args for ch in chunks]
    return cls(task, task_args, name, distribute).submit_all()

def gen_rupture_getters(dstore, srcfilter, ct):
    """
    :param dstore: a :class:`openquake.baselib.datastore.DataStore`
    :param srcfilter: a :class:`openquake.hazardlib.calc.filters.SourceFilter`
    :param ct: number of concurrent tasks
    :yields: filtered RuptureGetters
    """
    full_lt = dstore['full_lt']
    trt_by_grp = full_lt.trt_by_grp
    samples = full_lt.get_samples_by_grp()
    rlzs_by_gsim = full_lt.get_rlzs_by_gsim_grp()
    rup_array = dstore['ruptures'][()]
    items = list(general.group_array(rup_array, 'grp_id').items())
    items.sort(key=lambda item: len(item[1]))  # other weights were much worse
    maxweight = None
    while items:
        grp_id, rups = items.pop()  # from the largest group
        if not rlzs_by_gsim[grp_id]:
            # this may happen if a source model has no sources, like
            # in event_based_risk/case_3
            continue
        trt = trt_by_grp[grp_id]
        proxies = list(_gen(rups, srcfilter, trt, samples[grp_id]))
        if not maxweight:
            maxweight = sum(p.weight for p in proxies) / (ct // 2 or 1)
        blocks = list(general.block_splitter(
            proxies, maxweight, operator.attrgetter('weight')))
        logging.info('Group %d: %d ruptures -> %d task(s)',
                     grp_id, len(rups), len(blocks))
        for block in blocks:
            rgetter = RuptureGetter(block, dstore.filename, grp_id, trt,
                                    samples[grp_id], rlzs_by_gsim[grp_id])
            yield rgetter

def start_tasks(self, sm_id, ruptures_by_grp, sitecol, assetcol,
                riskmodel, imts, trunc_level, correl_model, min_iml,
                monitor):
    """
    :param sm_id: source model ordinal
    :param ruptures_by_grp: dictionary of ruptures by src_group_id
    :param sitecol: a SiteCollection instance
    :param assetcol: an AssetCollection instance
    :param riskmodel: a RiskModel instance
    :param imts: a list of Intensity Measure Types
    :param trunc_level: truncation level
    :param correl_model: correlation model
    :param min_iml: vector of minimum intensities, one per IMT
    :param monitor: a Monitor instance
    :returns: an IterResult instance
    """
    csm_info = self.csm_info.get_info(sm_id)
    grp_ids = sorted(csm_info.get_sm_by_grp())
    rlzs_assoc = csm_info.get_rlzs_assoc(
        count_ruptures=lambda grp: len(ruptures_by_grp.get(grp.id, [])))
    num_events = sum(ebr.multiplicity for grp in ruptures_by_grp
                     for ebr in ruptures_by_grp[grp])
    seeds = self.oqparam.random_seed + numpy.arange(num_events)
    allargs = []
    # prepare the risk inputs
    ruptures_per_block = self.oqparam.ruptures_per_block
    start = 0
    ignore_covs = self.oqparam.ignore_covs
    for grp_id in grp_ids:
        rlzs_by_gsim = rlzs_assoc.get_rlzs_by_gsim(grp_id)
        samples = rlzs_assoc.samples[grp_id]
        for rupts in block_splitter(ruptures_by_grp.get(grp_id, []),
                                    ruptures_per_block):
            if ignore_covs or not self.riskmodel.covs:
                eps = None
            elif self.oqparam.asset_correlation:
                eps = EpsilonMatrix1(num_events, self.oqparam.master_seed)
            else:
                n_events = sum(ebr.multiplicity for ebr in rupts)
                eps = EpsilonMatrix0(len(self.assetcol),
                                     seeds[start:start + n_events])
                start += n_events
            getter = riskinput.GmfGetter(
                grp_id, rlzs_by_gsim, rupts, sitecol, imts, min_iml,
                trunc_level, correl_model, samples)
            ri = riskinput.RiskInputFromRuptures(getter, eps)
            allargs.append((ri, riskmodel, assetcol, monitor))
    self.vals = self.assetcol.values()
    taskname = '%s#%d' % (event_based_risk.__name__, sm_id + 1)
    ires = Starmap(event_based_risk, allargs, name=taskname).submit_all()
    ires.num_ruptures = {sg_id: len(rupts)
                         for sg_id, rupts in ruptures_by_grp.items()}
    ires.num_events = num_events
    ires.num_rlzs = len(rlzs_assoc.realizations)
    ires.sm_id = sm_id
    return ires

def execute(self):
    oq = self.oqparam
    self.set_param(num_taxonomies=self.assetcol.num_taxonomies_by_site(),
                   maxweight=oq.ebrisk_maxweight /
                   (oq.concurrent_tasks or 1))
    parent = self.datastore.parent
    if parent:
        hdf5path = parent.filename
        grp_indices = parent['ruptures'].attrs['grp_indices']
        nruptures = len(parent['ruptures'])
    else:
        hdf5path = self.datastore.hdf5cache()
        grp_indices = self.datastore['ruptures'].attrs['grp_indices']
        nruptures = len(self.datastore['ruptures'])
        with hdf5.File(hdf5path, 'r+') as cache:
            self.datastore.hdf5.copy('weights', cache)
            self.datastore.hdf5.copy('ruptures', cache)
            self.datastore.hdf5.copy('rupgeoms', cache)
    self.init_logic_tree(self.csm_info)
    smap = parallel.Starmap(self.core_task.__func__,
                            monitor=self.monitor())
    trt_by_grp = self.csm_info.grp_by("trt")
    samples = self.csm_info.get_samples_by_grp()
    rlzs_by_gsim_grp = self.csm_info.get_rlzs_by_gsim_grp()
    ruptures_per_block = numpy.ceil(nruptures / (oq.concurrent_tasks or 1))
    for grp_id, rlzs_by_gsim in rlzs_by_gsim_grp.items():
        start, stop = grp_indices[grp_id]
        for indices in general.block_splitter(range(start, stop),
                                              ruptures_per_block):
            rgetter = getters.RuptureGetter(
                hdf5path, list(indices), grp_id, trt_by_grp[grp_id],
                samples[grp_id], rlzs_by_gsim)
            smap.submit(rgetter, self.src_filter, self.param)
    return smap.reduce(self.agg_dicts, numpy.zeros(self.N))

def gen_task_queue(self):
    """
    Build a task queue to be attached to the Starmap instance
    """
    oq = self.oqparam
    gsims_by_trt = self.csm_info.get_gsims_by_trt()
    trt_sources = self.csm.get_trt_sources(optimize_dupl=True)
    del self.csm  # save memory

    def srcweight(src):
        trt = src.tectonic_region_type
        g = len(gsims_by_trt[trt])
        m = (oq.maximum_distance(trt) / 300) ** 2
        return src.weight * g * m

    totweight = sum(sum(srcweight(src) for src in sources)
                    for trt, sources, atomic in trt_sources)
    param = dict(truncation_level=oq.truncation_level,
                 imtls=oq.imtls, filter_distance=oq.filter_distance,
                 reqv=oq.get_reqv(),
                 maximum_distance=oq.maximum_distance,
                 pointsource_distance=oq.pointsource_distance,
                 shift_hypo=oq.shift_hypo, max_weight=oq.max_weight,
                 max_sites_disagg=oq.max_sites_disagg)
    srcfilter = self.src_filter(self.datastore.tempname)
    if oq.calculation_mode == 'preclassical' and self.N == 1:
        f1 = f2 = ruptures_by_mag_dist
    elif oq.calculation_mode == 'preclassical':
        f1 = f2 = preclassical
    elif oq.split_by_magnitude:
        f1 = f2 = classical
    else:
        f1, f2 = classical, classical_split_filter
    C = oq.concurrent_tasks or 1
    for trt, sources, atomic in trt_sources:
        param['effect'] = self.effect.get(trt)
        gsims = gsims_by_trt[trt]
        if atomic:
            # do not split atomic groups
            nb = 1
            yield f1, (sources, srcfilter, gsims, param)
        else:  # regroup the sources in blocks
            if oq.split_by_magnitude:
                sources = split_by_mag(sources)
            blocks = list(block_splitter(sources, totweight / C, srcweight))
            nb = len(blocks)
            for block in blocks:
                logging.debug('Sending %d sources with weight %d',
                              len(block), block.weight)
                yield f2, (block, srcfilter, gsims, param)
        nr = sum(src.weight for src in sources)
        logging.info('TRT = %s', trt)
        logging.info('max_dist=%d km, gsims=%d, ruptures=%d, blocks=%d',
                     oq.maximum_distance(trt), len(gsims), nr, nb)

def split_sources(csm, sources, src_filter, maxweight):
    """
    Fast replacement of CompositeSourceModel.split_sources
    """
    csm.add_infos(sources)
    return general.block_splitter(
        sources, maxweight, weight=operator.attrgetter('weight'))

def test_split_with_weight(self):
    weights = dict([('a', 11), ('b', 10), ('c', 100), ('d', 15),
                    ('e', 20), ('f', 5), ('g', 30), ('h', 17),
                    ('i', 25)])
    blocks = list(block_splitter('abcdefghi', 50, weights.get))
    self.assertEqual(
        repr(blocks),
        "[<WeightedSequence ['a', 'b'], weight=21>, "
        "<WeightedSequence ['c'], weight=100>, "
        "<WeightedSequence ['d', 'e', 'f'], weight=40>, "
        "<WeightedSequence ['g', 'h'], weight=47>, "
        "<WeightedSequence ['i'], weight=25>]")

def test_block_splitter(self):
    expected = [
        [0, 1, 2],
        [3, 4, 5],
        [6, 7, 8],
        [9],
    ]
    actual = [x for x in block_splitter(self.DATA, 3)]
    self.assertEqual(expected, actual)

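# A quick interactive sketch of the behavior pinned down by the tests in
# this collection (assuming `openquake.baselib` is installed; the
# expected outputs are taken straight from the tests themselves):

from openquake.baselib.general import block_splitter

# uniform weights: blocks of at most 3 items, as in test_block_splitter
print([list(b) for b in block_splitter(range(10), 3)])
# -> [[0, 1, 2], [3, 4, 5], [6, 7, 8], [9]]

# explicit weights: a new block starts once max_weight would be
# exceeded, so a single over-weight item ('c', weight 100) gets a block
# of its own, exactly as in test_split_with_weight below
weights = {'a': 11, 'b': 10, 'c': 100}
for block in block_splitter('abc', 50, weights.get):
    print(list(block), block.weight)
# -> ['a', 'b'] 21
# -> ['c'] 100
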
def split(cls, src, block_size):
    """
    Split the given fault source into MultiRuptureSources depending on
    the given block size.
    """
    for i, ruptures in enumerate(
            block_splitter(src.iter_ruptures(), block_size)):
        yield cls(ruptures, '%s-%s' % (src.source_id, i),
                  src.tectonic_region_type, src.trt_model_id)

def build_starmap(self, sm_id, ruptures_by_grp, sitecol, assetcol,
                  riskmodel, imts, trunc_level, correl_model, min_iml,
                  monitor):
    """
    :param sm_id: source model ordinal
    :param ruptures_by_grp: dictionary of ruptures by src_group_id
    :param sitecol: a SiteCollection instance
    :param assetcol: an AssetCollection instance
    :param riskmodel: a RiskModel instance
    :param imts: a list of Intensity Measure Types
    :param trunc_level: truncation level
    :param correl_model: correlation model
    :param min_iml: vector of minimum intensities, one per IMT
    :param monitor: a Monitor instance
    :returns: a pair (starmap, dictionary of attributes)
    """
    csm_info = self.csm_info.get_info(sm_id)
    grp_ids = sorted(csm_info.get_sm_by_grp())
    rlzs_assoc = csm_info.get_rlzs_assoc(
        count_ruptures=lambda grp: len(ruptures_by_grp.get(grp.id, [])))
    num_events = sum(ebr.multiplicity for grp in ruptures_by_grp
                     for ebr in ruptures_by_grp[grp])
    seeds = self.oqparam.random_seed + numpy.arange(num_events)
    allargs = []
    # prepare the risk inputs
    ruptures_per_block = self.oqparam.ruptures_per_block
    start = 0
    grp_trt = csm_info.grp_trt()
    ignore_covs = self.oqparam.ignore_covs
    for grp_id in grp_ids:
        for rupts in block_splitter(
                ruptures_by_grp.get(grp_id, []), ruptures_per_block):
            if ignore_covs or not self.riskmodel.covs:
                eps = None
            elif self.oqparam.asset_correlation:
                eps = EpsilonMatrix1(num_events, self.oqparam.master_seed)
            else:
                n_events = sum(ebr.multiplicity for ebr in rupts)
                eps = EpsilonMatrix0(
                    len(self.assetcol), seeds[start: start + n_events])
                start += n_events
            ri = riskinput.RiskInputFromRuptures(
                grp_trt[grp_id], rlzs_assoc, imts, sitecol, rupts,
                trunc_level, correl_model, min_iml, eps)
            allargs.append((ri, riskmodel, assetcol, monitor))
    self.vals = self.assetcol.values()
    taskname = '%s#%d' % (event_based_risk.__name__, sm_id + 1)
    smap = starmap(event_based_risk, allargs, name=taskname)
    attrs = dict(num_ruptures={
        sg_id: len(rupts) for sg_id, rupts in ruptures_by_grp.items()},
        num_events=num_events,
        num_rlzs=len(rlzs_assoc.realizations),
        sm_id=sm_id)
    return smap, attrs

def test_block_splitter_with_iter(self):
    # Test the block with a data set of unknown length
    data = iter(range(10))
    expected = [
        [0, 1, 2],
        [3, 4, 5],
        [6, 7, 8],
        [9],
    ]
    actual = [x for x in block_splitter(data, 3)]
    self.assertEqual(expected, actual)

def split(src, chunksize=MINWEIGHT):
    """
    Split a complex fault source in chunks
    """
    for i, block in enumerate(block_splitter(
            src.iter_ruptures(), chunksize,
            key=operator.attrgetter('mag'))):
        rup = block[0]
        source_id = '%s:%d' % (src.source_id, i)
        amfd = mfd.ArbitraryMFD([rup.mag], [rup.mag_occ_rate])
        rcs = RuptureCollectionSource(
            source_id, src.name, src.tectonic_region_type, amfd, block)
        yield rcs

def actual_data(self, job):
    damage_states = list(models.DmgState.objects.filter(
        risk_calculation=job).order_by('lsi'))
    data = list(block_splitter(
        models.DamageData.objects.filter(
            dmg_state__risk_calculation=job).order_by(
            'exposure_data', 'dmg_state'),
        len(damage_states)))
    # this is a test with 5 damage states
    # no_damage, slight, moderate, extreme, complete
    # NB: you can print the actual values with the command
    # print [[round(col.fraction, 8) for col in row] for row in data]
    return [[col.fraction for col in row] for row in data]

def supertask(text, monitor):
    # a supertask spawning subtasks of kind get_length
    with monitor('waiting'):
        time.sleep(.1)
    for block in general.block_splitter(text, max_weight=10):
        items = [(k, len(list(grp)))
                 for k, grp in itertools.groupby(block)]
        if len(items) == 1:
            # for instance items = [('i', 1)]
            k, v = items[0]
            yield get_length(k * v, monitor)
            return
        # for instance items = [('a', 4), ('e', 4), ('i', 2)]
        for k, v in items:
            yield get_length, k * v

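# The pattern in `supertask` is the same yield-protocol visible in the
# classical_split_filter variants earlier: yielding a computed value
# returns it to the controller, while yielding a (callable, args...)
# tuple asks the framework to schedule a fresh subtask. A minimal
# hypothetical supertask with the same shape; `process_block` and
# `my_supertask` are placeholders of mine, not engine functions:


def process_block(block, monitor=None):
    # placeholder task: just count the items in the block
    return len(block)


def my_supertask(blocks, monitor):
    for block in blocks[:-1]:
        yield process_block, block  # scheduled as a separate subtask
    # compute the last block in-core, as classical_split_filter does
    yield process_block(blocks[-1], monitor)
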
def _gen_riskinputs(self, kind):
    rinfo_dt = numpy.dtype([('sid', U16), ('num_assets', U16)])
    rinfo = []
    assets_by_site = self.assetcol.assets_by_site()
    for sid, assets in enumerate(assets_by_site):
        if len(assets) == 0:
            continue
        getter = self.get_getter(kind, sid)
        for block in general.block_splitter(
                assets, self.oqparam.assets_per_site_limit):
            yield riskinput.RiskInput(getter, numpy.array(block))
            rinfo.append((sid, len(block)))
            if len(block) >= TWO16:
                logging.error('There are %d assets on site #%d!',
                              len(block), sid)
    self.datastore['riskinput_info'] = numpy.array(rinfo, rinfo_dt)

def block_splitter(self, sources, weight=get_weight, key=lambda src: 1):
    """
    :param sources: a list of sources
    :param weight: a weight function (default .weight)
    :param key: None or 'src_group_id'
    :returns: an iterator over blocks of sources
    """
    ct = self.oqparam.concurrent_tasks or 1
    maxweight = self.csm.get_maxweight(weight, ct, source.MINWEIGHT)
    if not hasattr(self, 'logged'):
        if maxweight == source.MINWEIGHT:
            logging.info('Using minweight=%d', source.MINWEIGHT)
        else:
            logging.info('Using maxweight=%d', maxweight)
        self.logged = True
    return general.block_splitter(sources, maxweight, weight, key)

def actual_data(self, job):
    damage_states = list(models.DmgState.objects.filter(
        risk_calculation=job).order_by('lsi'))
    outs = models.Output.objects.filter(oq_job=job).order_by('id')
    rows = []
    for out in outs:
        data = [[col.fraction for col in row]
                for row in block_splitter(
                    models.DamageData.objects.filter(
                        damage=out.damage).order_by(
                        'exposure_data', 'dmg_state'),
                    len(damage_states))]
        rows.append(data)
    # this is a test with 5 damage states
    # no_damage, slight, moderate, extreme, complete
    return rows

def test_split_with_kind(self):
    Source = namedtuple("Source", "typology, weight")
    s1 = Source("point", 1)
    s2 = Source("point", 1)
    s3 = Source("area", 2)
    s4 = Source("area", 4)
    s5 = Source("area", 4)
    blocks = list(
        block_splitter([s1, s2, s3, s4, s5],
                       max_weight=6,
                       weight=attrgetter("weight"),
                       kind=attrgetter("typology")))
    self.assertEqual(list(map(len, blocks)), [2, 2, 1])
    self.assertEqual([b.weight for b in blocks], [2, 6, 4])
    blocks = list(
        split_in_blocks([s1, s2, s3, s4, s5],
                        hint=6,
                        weight=attrgetter("weight"),
                        key=attrgetter("typology")))
    self.assertEqual(list(map(len, blocks)), [2, 1, 1, 1])
    self.assertEqual([b.weight for b in blocks], [2, 2, 4, 4])

def split(self, maxweight):
    """
    :yields: RuptureGetters with weight <= maxweight
    """
    # NB: can be called only after .set_weights() has been called
    idx = {ri: i for i, ri in enumerate(self.rup_indices)}
    fe = self.first_event
    for rup_indices in general.block_splitter(
            self.rup_indices, maxweight,
            lambda ri: self.weights[idx[ri]]):
        if rup_indices:
            # some indices may have weight 0 and are discarded
            rgetter = self.__class__(
                self.filename, list(rup_indices), self.grp_id, self.trt,
                self.samples, self.rlzs_by_gsim, fe)
            fe += rgetter.num_events
            rgetter.weight = sum([self.weights[idx[ri]]
                                  for ri in rup_indices])
            yield rgetter

def execute(self):
    oq = self.oqparam
    self.set_param(
        num_taxonomies=self.assetcol.num_taxonomies_by_site(),
        maxweight=oq.ebrisk_maxweight / (oq.concurrent_tasks or 1),
        epspath=cache_epsilons(
            self.datastore, oq, self.assetcol, self.riskmodel, self.E))
    parent = self.datastore.parent
    if parent:
        hdf5path = parent.filename
        grp_indices = parent['ruptures'].attrs['grp_indices']
        nruptures = len(parent['ruptures'])
    else:
        hdf5path = self.datastore.hdf5cache()
        grp_indices = self.datastore['ruptures'].attrs['grp_indices']
        nruptures = len(self.datastore['ruptures'])
        with hdf5.File(hdf5path, 'r+') as cache:
            self.datastore.hdf5.copy('weights', cache)
            self.datastore.hdf5.copy('ruptures', cache)
            self.datastore.hdf5.copy('rupgeoms', cache)
    self.init_logic_tree(self.csm_info)
    smap = parallel.Starmap(
        self.core_task.__func__, monitor=self.monitor())
    trt_by_grp = self.csm_info.grp_by("trt")
    samples = self.csm_info.get_samples_by_grp()
    rlzs_by_gsim_grp = self.csm_info.get_rlzs_by_gsim_grp()
    ruptures_per_block = numpy.ceil(nruptures / (oq.concurrent_tasks or 1))
    first_event = 0
    for grp_id, rlzs_by_gsim in rlzs_by_gsim_grp.items():
        start, stop = grp_indices[grp_id]
        for indices in general.block_splitter(
                range(start, stop), ruptures_per_block):
            rgetter = getters.RuptureGetter(
                hdf5path, list(indices), grp_id, trt_by_grp[grp_id],
                samples[grp_id], rlzs_by_gsim, first_event)
            first_event += rgetter.num_events
            smap.submit(rgetter, self.src_filter, self.param)
    self.events_per_sid = []
    self.gmf_nbytes = 0
    res = smap.reduce(self.agg_dicts, numpy.zeros(self.N))
    logging.info('Produced %s of GMFs', general.humansize(self.gmf_nbytes))
    return res

def test_split_with_kind(self):
    Source = namedtuple('Source', 'typology, weight')
    s1 = Source('point', 1)
    s2 = Source('point', 1)
    s3 = Source('area', 2)
    s4 = Source('area', 4)
    s5 = Source('area', 4)
    blocks = list(
        block_splitter([s1, s2, s3, s4, s5],
                       max_weight=6,
                       weight=attrgetter('weight'),
                       kind=attrgetter('typology')))
    self.assertEqual(list(map(len, blocks)), [2, 2, 1])
    self.assertEqual([b.weight for b in blocks], [2, 6, 4])
    blocks = list(
        split_in_blocks([s1, s2, s3, s4, s5],
                        hint=6,
                        weight=attrgetter('weight'),
                        key=attrgetter('typology')))
    self.assertEqual(list(map(len, blocks)), [1, 1, 1, 2])
    self.assertEqual([b.weight for b in blocks], [2, 4, 4, 2])

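# The two variants of test_split_with_kind above expect different
# packings from split_in_blocks, so the exact grouping is version
# dependent; what both variants pin down is the invariant that items
# with different kinds never share a block. A small sketch relying only
# on that invariant (assuming `openquake.baselib` is installed):

from collections import namedtuple
from operator import attrgetter

from openquake.baselib.general import split_in_blocks

Source = namedtuple('Source', 'typology, weight')
srcs = [Source('point', 1), Source('point', 1),
        Source('area', 2), Source('area', 4), Source('area', 4)]
for block in split_in_blocks(srcs, hint=6, weight=attrgetter('weight'),
                             key=attrgetter('typology')):
    # every block is homogeneous in typology, whatever the version
    assert len({s.typology for s in block}) == 1
    print([s.typology for s in block], block.weight)
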
def export_dmg_per_asset_csv(key, output, target):
    """
    Classical Damage Per Asset in CSV format
    """
    dest = _get_result_export_dest(target, output)
    damage_states = list(models.DmgState.objects.filter(
        risk_calculation=output.oq_job).order_by('lsi'))
    data = block_splitter(
        models.DamageData.objects.filter(
            dmg_state__risk_calculation=output.oq_job).order_by(
            'exposure_data', 'dmg_state'),
        len(damage_states))
    with FileWrapper(dest, mode='wb') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(['asset_ref'] +
                        [ds.dmg_state for ds in damage_states])
        for row in data:
            asset = row[0].exposure_data
            fractions = [rec.fraction for rec in row]
            writer.writerow(
                [asset.asset_ref] +
                map(writers.scientificformat, fractions))
    return dest

def test_block_splitter_block_size_lt_zero(self):
    gen = block_splitter(self.DATA, -1)
    with self.assertRaises(ValueError):
        next(gen)

def test_block_splitter_block_size_gt_data_len(self):
    expected = [self.DATA]
    actual = [x for x in block_splitter(self.DATA, 11)]
    self.assertEqual(expected, actual)