def extract_cols(datagrp, sel, slc, columns):
    """
    Extract the given columns from a data group, optionally keeping only
    the rows matching a selection.

    :param datagrp: something like an HDF5 data group
    :param sel: dictionary column name -> value specifying a selection;
        when empty, all the rows of the considered slices are returned
    :param slc: a slice object specifying the rows considered; if both
        its start and stop are None, the rows are processed in the slices
        produced by general.gen_slices(0, nrows, MAX_ROWS)
    :param columns: the full list of column names
    :returns: a dictionary col -> array of values
    """
    first = columns[0]
    nrows = len(datagrp[first])
    if slc.start is None and slc.stop is None:  # split in slices
        slcs = general.gen_slices(0, nrows, MAX_ROWS)
    else:
        slcs = [slc]
    acc = general.AccumDict(accum=[])  # col -> arrays
    for rows in slcs:
        if sel:
            # read each selection column only once for this slice
            dic = {col: datagrp[col][rows] for col in sel}
            ok = slice(None)
            for col in sel:
                if isinstance(ok, slice):  # first selection
                    ok = is_ok(dic[col], sel[col])
                else:  # other selections
                    ok &= is_ok(dic[col], sel[col])
            for col in columns:
                # reuse the data already read for the selection instead
                # of reading the same rows from datagrp a second time
                values = dic[col] if col in dic else datagrp[col][rows]
                acc[col].append(values[ok])
        else:  # avoid making unneeded copies
            for col in columns:
                acc[col].append(datagrp[col][rows])
    return {k: numpy.concatenate(decode_lol(vs)) for k, vs in acc.items()}
def get_poes(self, mean_std, cmaker, ctx):
    """
    Calculate and return probabilities of exceedance (PoEs) of one or more
    intensity measure levels (IMLs) for one or more pairs
    "site -- rupture".

    :param mean_std:
        An array of shape (2, M, N) with mean and standard deviations
        for the sites and intensity measure types
    :param cmaker:
        A ContextMaker instance; its ``loglevels`` and
        ``truncation_level`` attributes are used
    :param ctx:
        Context object used to compute mean_std; its ``adjustment``
        attribute is read only when the instance has ``weights_signs``
    :returns:
        array of PoEs of shape (N, L)
    :raises ValueError:
        If the truncation level is not ``None`` and is negative
    """
    levels = cmaker.loglevels
    trunclevel = cmaker.truncation_level
    num_sites = mean_std.shape[2]  # mean_std is (2, M, N)
    num_levels = levels.size
    chunksize = int(numpy.ceil(ONE_MB / num_levels / 8))
    poes = numpy.zeros((num_sites, num_levels))
    if trunclevel is not None and trunclevel < 0:
        raise ValueError('truncation level must be zero, positive number '
                         'or None')

    if hasattr(self, 'weights_signs'):
        # weighted average of the PoEs obtained by shifting the means
        weights, signs = zip(*self.weights_signs)
        outs = []
        for sign in signs:
            shifted = numpy.array(mean_std)  # work on a copy
            for m in range(len(levels)):
                shifted[0, m] += sign * ctx.adjustment
            outs.append(_get_poes(shifted, levels, trunclevel))
        poes[:] = numpy.average(outs, weights=weights, axis=0)
    elif hasattr(self, "mixture_model"):
        # weighted sum of the PoEs obtained by rescaling the stddevs
        factors = self.mixture_model["factors"]
        mix_weights = self.mixture_model["weights"]
        for factor, weight in zip(factors, mix_weights):
            scaled = numpy.array(mean_std)  # work on a copy
            scaled[1] *= factor  # multiply stddev by factor
            poes[:] += weight * _get_poes(scaled, levels, trunclevel)
    else:
        # regular case: split large arrays in slices < 1 MB to fit
        # inside the CPU cache
        for sl in gen_slices(0, num_sites, chunksize):
            poes[sl] = _get_poes(mean_std[:, :, sl], levels, trunclevel)

    imt_weight = getattr(self, 'weight', None)  # ImtWeight or None
    # a zero weight is set by the engine when parsing the gsim logictree;
    # when 0 ignore the contribution: see _build_trts_branches
    ignored = [imt for imt in levels
               if imt_weight and imt_weight.dic.get(imt) == 0]
    for imt in ignored:
        poes[:, levels(imt)] = 0
    return poes
def split_df(df, cond=True, maxsize=1000):
    """
    Yield a dataframe whole, or piece by piece when it is too large.

    :param df: a large dataframe
    :param cond: boolean condition for splitting; if false, never split
    :param maxsize: split dataframes larger than maxsize
    :yields:
        the dataframe itself if it has at most maxsize rows or cond is
        false; otherwise df[slc] for each slice generated by
        gen_slices(0, len(df), maxsize)
    """
    size = len(df)
    if size > maxsize and cond:
        for piece in gen_slices(0, size, maxsize):
            yield df[piece]
    else:
        yield df
def __iter__(self):
    """
    Iterate over the source, splitting it in blocks if it is too large.

    If the source has at most BLOCKSIZE magnitudes it is yielded as it
    is; otherwise new instances of the same class are yielded, one per
    slice generated by gen_slices, each with source_id suffixed by
    ``:<block index>``.
    """
    nrup = len(self.mags)
    if nrup > BLOCKSIZE:
        # split in blocks of BLOCKSIZE ruptures each
        for blockno, rows in enumerate(gen_slices(0, nrup, BLOCKSIZE)):
            block_id = '%s:%d' % (self.source_id, blockno)
            block = self.__class__(
                block_id,
                self.name,
                self.tectonic_region_type,
                self.rupture_idxs[rows],
                self.pmfs[rows],
                self.mags[rows],
                self.rakes[rows])
            block.set_sections(self.sections)
            block.num_ruptures = block.count_ruptures()
            yield block
    else:  # already split
        yield self
def full_disaggregation(self):
    """
    Run the disaggregation phase.

    Select the realizations to disaggregate for each site, build and
    save the bin edges, then submit the ``compute_disagg`` tasks.

    :returns:
        the task results reduced with ``self.agg_result``
        (sid -> trti -> 8D array)
    :raises NotImplementedError:
        if the composite source model contains an atomic source group
    :raises RuntimeError:
        if the composite source model has no sources
    """
    oq = self.oqparam
    tl = oq.truncation_level
    src_filter = self.src_filter()
    if hasattr(self, 'csm'):
        for sg in self.csm.src_groups:
            if sg.atomic:
                raise NotImplementedError(
                    'Atomic groups are not supported yet')
        if not self.csm.get_sources():
            raise RuntimeError('All sources were filtered away!')

    csm_info = self.datastore['csm_info']
    self.poes_disagg = oq.poes_disagg or (None, )
    self.imts = list(oq.imtls)
    self.ws = [rlz.weight for rlz in self.rlzs_assoc.realizations]
    self.pgetter = getters.PmapGetter(
        self.datastore, self.ws, self.sitecol.sids)

    # build array rlzs (N, Z)
    if oq.rlz_index is None:
        Z = oq.num_rlzs_disagg
        rlzs = numpy.zeros((self.N, Z), int)
        if self.R > 1:
            # with a single realization rlzs is left filled with zeros
            for sid in self.sitecol.sids:
                curves = numpy.array(
                    [pc.array for pc in self.pgetter.get_pcurves(sid)])
                mean = getters.build_stat_curve(
                    curves, oq.imtls, stats.mean_curve, self.ws)
                # presumably the realizations closest to the mean curve;
                # only the first Z are kept
                rlzs[sid] = util.closest_to_ref(curves, mean.array)[:Z]
        self.datastore['best_rlzs'] = rlzs
    else:
        # the same user-specified realizations are used for every site
        Z = len(oq.rlz_index)
        rlzs = numpy.zeros((self.N, Z), int)
        for z in range(Z):
            rlzs[:, z] = oq.rlz_index[z]
    assert Z <= self.R, (Z, self.R)
    self.Z = Z
    self.rlzs = rlzs

    if oq.iml_disagg:
        # no hazard curves are needed
        self.poe_id = {None: 0}
        curves = [[None for z in range(Z)] for s in range(self.N)]
        self.ok_sites = set(self.sitecol.sids)
    else:
        self.poe_id = {poe: i for i, poe in enumerate(oq.poes_disagg)}
        curves = [
            self.get_curve(sid, rlzs[sid]) for sid in self.sitecol.sids
        ]
        self.ok_sites = set(self.check_poes_disagg(curves, rlzs))
    self.iml4 = _iml4(rlzs, oq.iml_disagg, oq.imtls,
                      self.poes_disagg, curves)
    if oq.disagg_by_src:
        self.build_disagg_by_src(rlzs)

    # build eps_edges (computed once; it was previously computed twice)
    eps_edges = numpy.linspace(-tl, tl, oq.num_epsilon_bins + 1)

    # associate an index to each tectonic region type
    trts = tuple(csm_info.trts)
    trt_num = {trt: i for i, trt in enumerate(trts)}
    self.trts = trts

    # build mag_edges
    min_mag = csm_info.min_mag
    max_mag = csm_info.max_mag
    mag_edges = oq.mag_bin_width * numpy.arange(
        int(numpy.floor(min_mag / oq.mag_bin_width)),
        int(numpy.ceil(max_mag / oq.mag_bin_width) + 1))

    # build dist_edges
    maxdist = max(oq.maximum_distance(trt) for trt in trts)
    dist_edges = oq.distance_bin_width * numpy.arange(
        0, int(numpy.ceil(maxdist / oq.distance_bin_width) + 1))

    # build lon_edges, lat_edges per sid
    bbs = src_filter.get_bounding_boxes(mag=max_mag)
    lon_edges, lat_edges = {}, {}  # by sid
    for sid, bb in zip(self.sitecol.sids, bbs):
        lon_edges[sid], lat_edges[sid] = disagg.lon_lat_bins(
            bb, oq.coordinate_bin_width)
    self.bin_edges = mag_edges, dist_edges, lon_edges, lat_edges, eps_edges
    self.save_bin_edges()

    self.imldict = {}  # sid, rlz, poe, imt -> iml
    for s in self.sitecol.sids:
        for z, rlz in enumerate(rlzs[s]):
            logging.info('Site #%d, disaggregating for rlz=#%d', s, rlz)
            for p, poe in enumerate(self.poes_disagg):
                for m, imt in enumerate(oq.imtls):
                    self.imldict[s, rlz, poe, imt] = self.iml4[s, m, p, z]

    # submit disagg tasks
    gid = self.datastore['rup/grp_id'][()]
    indices_by_grp = get_indices(gid)  # grp_id -> [(start, stop),...]
    blocksize = len(gid) // (oq.concurrent_tasks or 1) + 1
    # NB: removing the blocksize causes slow disaggregation tasks
    allargs = []
    # pass the parent datastore to the tasks when there is one
    dstore = (self.datastore.parent if self.datastore.parent
              else self.datastore)
    for grp_id, trt in csm_info.trt_by_grp.items():
        trti = trt_num[trt]
        rlzs_by_gsim = self.rlzs_assoc.get_rlzs_by_gsim(grp_id)
        cmaker = ContextMaker(
            trt, rlzs_by_gsim,
            {'truncation_level': oq.truncation_level,
             'maximum_distance': src_filter.integration_distance,
             'filter_distance': oq.filter_distance,
             'imtls': oq.imtls})
        for start, stop in indices_by_grp[grp_id]:
            # one task per block of at most blocksize indices
            for slc in gen_slices(start, stop, blocksize):
                allargs.append((dstore, slc, self.sitecol, oq, cmaker,
                                self.iml4, trti, self.bin_edges))
    results = parallel.Starmap(
        compute_disagg, allargs, h5=self.datastore.hdf5
    ).reduce(self.agg_result, AccumDict(accum={}))
    return results  # sid -> trti-> 8D array
def full_disaggregation(self):
    """
    Run the disaggregation phase.

    Select the realization to disaggregate for each site, build and
    save the bin edges, then submit the ``compute_disagg`` tasks.

    :returns:
        the task results reduced with ``self.agg_result``
        (sid -> trti -> 7D array)
    :raises NotImplementedError:
        if the composite source model contains an atomic source group
    :raises RuntimeError:
        if the composite source model has no sources
    """
    oq = self.oqparam
    tl = oq.truncation_level
    src_filter = self.src_filter()
    if hasattr(self, 'csm'):
        for sg in self.csm.src_groups:
            if sg.atomic:
                raise NotImplementedError(
                    'Atomic groups are not supported yet')
        if not self.csm.get_sources():
            raise RuntimeError('All sources were filtered away!')

    csm_info = self.datastore['csm_info']
    self.poes_disagg = oq.poes_disagg or (None, )
    self.imts = list(oq.imtls)

    # one realization index per site
    if oq.rlz_index is None:
        try:
            rlzs = self.datastore['best_rlz'][()]
        except KeyError:
            # nothing stored: fall back to realization 0 for all sites
            rlzs = numpy.zeros(self.N, int)
    else:
        rlzs = [oq.rlz_index] * self.N

    if oq.iml_disagg:
        self.poe_id = {None: 0}
        curves = [None] * len(self.sitecol)  # no hazard curves are needed
        self.ok_sites = set(self.sitecol.sids)
    else:
        self.poe_id = {poe: i for i, poe in enumerate(oq.poes_disagg)}
        curves = [self.get_curve(sid, rlzs) for sid in self.sitecol.sids]
        self.ok_sites = set(self.check_poes_disagg(curves, rlzs))
    self.iml2s = _iml2s(rlzs, oq.iml_disagg, oq.imtls,
                        self.poes_disagg, curves)
    if oq.disagg_by_src:
        self.build_disagg_by_src()

    # build eps_edges (computed once; it was previously computed twice)
    eps_edges = numpy.linspace(-tl, tl, oq.num_epsilon_bins + 1)

    # associate an index to each tectonic region type
    trts = tuple(csm_info.trts)
    trt_num = {trt: i for i, trt in enumerate(trts)}
    self.trts = trts

    # build mag_edges
    min_mag = csm_info.min_mag
    max_mag = csm_info.max_mag
    mag_edges = oq.mag_bin_width * numpy.arange(
        int(numpy.floor(min_mag / oq.mag_bin_width)),
        int(numpy.ceil(max_mag / oq.mag_bin_width) + 1))

    # build dist_edges
    maxdist = max(oq.maximum_distance(trt, max_mag) for trt in trts)
    dist_edges = oq.distance_bin_width * numpy.arange(
        0, int(numpy.ceil(maxdist / oq.distance_bin_width) + 1))

    # build lon_edges, lat_edges per sid
    bbs = src_filter.get_bounding_boxes(mag=max_mag)
    lon_edges, lat_edges = {}, {}  # by sid
    for sid, bb in zip(self.sitecol.sids, bbs):
        lon_edges[sid], lat_edges[sid] = disagg.lon_lat_bins(
            bb, oq.coordinate_bin_width)
    self.bin_edges = mag_edges, dist_edges, lon_edges, lat_edges, eps_edges
    self.save_bin_edges()

    self.imldict = {}  # sid, rlzi, poe, imt -> iml
    for s in self.sitecol.sids:
        iml2 = self.iml2s[s]
        r = rlzs[s]
        logging.info('Site #%d, disaggregating for rlz=#%d', s, r)
        for p, poe in enumerate(self.poes_disagg):
            for m, imt in enumerate(oq.imtls):
                self.imldict[s, r, poe, imt] = iml2[m, p]

    # submit disagg tasks
    gid = self.datastore['rup/grp_id'][()]
    indices_by_grp = get_indices(gid)  # grp_id -> [(start, stop),...]
    blocksize = len(gid) // (oq.concurrent_tasks or 1) + 1
    allargs = []
    for grp_id, trt in csm_info.trt_by_grp.items():
        trti = trt_num[trt]
        rlzs_by_gsim = self.rlzs_assoc.get_rlzs_by_gsim(grp_id)
        cmaker = ContextMaker(
            trt, rlzs_by_gsim,
            {'truncation_level': oq.truncation_level,
             'maximum_distance': src_filter.integration_distance,
             'filter_distance': oq.filter_distance,
             'imtls': oq.imtls})
        for start, stop in indices_by_grp[grp_id]:
            # one task per block of at most blocksize indices
            for slc in gen_slices(start, stop, blocksize):
                allargs.append((self.datastore, slc, cmaker,
                                self.iml2s, trti, self.bin_edges))
    results = parallel.Starmap(
        compute_disagg, allargs, h5=self.datastore.hdf5
    ).reduce(self.agg_result, AccumDict(accum={}))
    return results  # sid -> trti-> 7D array