def get_map(self, binning, **kwargs):
    """Evaluate the spline on `binning` and return the result.

    Parameters
    ----------
    binning : MultiDimBinning, Sequence, or Mapping
        A sequence is passed to `MultiDimBinning` as its `dimensions`
        argument; a mapping is unpacked into `MultiDimBinning` as keyword
        arguments.
    **kwargs
        Forwarded unchanged to `self._eval_spl`.

    Returns
    -------
    Whatever `self._eval_spl` returns when called with the spline, the
    binning, and `name=self.name`.

    Raises
    ------
    ValueError
        If `binning` is none of the accepted types.

    """
    if isinstance(binning, MultiDimBinning):
        pass
    elif isinstance(binning, Sequence):
        binning = MultiDimBinning(dimensions=binning)
    elif isinstance(binning, Mapping):
        binning = MultiDimBinning(**binning)
    else:
        raise ValueError('Do not know what to do with `binning`=%s of'
                         ' type %s' % (binning, type(binning)))

    # The validation hook is optional
    if self._validate_spl is not None:
        self._validate_spl(binning)

    return self._eval_spl(self.spline, binning, name=self.name, **kwargs)
def keep_inbounds(self, binning):
    """Cut out any events that fall outside `binning`.

    Note that events that fall exactly on an outer edge are kept.

    Parameters
    ----------
    binning : OneDimBinning or MultiDimBinning
        Anything `OneDimBinning(...)` or `MultiDimBinning(...)` accepts.

    Returns
    -------
    cut_data : EventsPi
        The result of `self.apply_cut` for the combined inbounds criteria.

    """
    # Best-effort coercion to a single dimension. Anything that cannot be
    # interpreted that way is handed to `MultiDimBinning` as-is below. Only
    # ordinary exceptions are suppressed so that KeyboardInterrupt and
    # SystemExit still propagate.
    try:
        binning = OneDimBinning(binning)
    except Exception:  # pylint: disable=broad-except
        pass
    if isinstance(binning, OneDimBinning):
        binning = [binning]
    binning = MultiDimBinning(binning)

    # One criteria string per dimension, all of which must hold
    bin_edge_cuts = " & ".join(str(dim.inbounds_criteria) for dim in binning)

    return self.apply_cut(bin_edge_cuts)
def compute_binning_constants(self):
    """Compute some constants related to the binning.

    Just for illustrating a few properties of the binning one might want to
    evaluate. Everything is derived from `self.input_binning` and stored as
    attributes on `self`: `ecz_binning`, `e_centers`, `cz_centers`,
    `num_czbins`, `num_ebins`, `e_dim_num`, `cz_dim_num` and
    `extra_dim_nums`.

    """
    # Get the energy/coszen (ONLY) weighted centers here, since these
    # are actually used in the oscillations computation. All other
    # dimensions are ignored. Since these won't change so long as the
    # binning doesn't change, attach these to self.
    self.ecz_binning = MultiDimBinning([
        self.input_binning.true_energy.to('GeV'),
        self.input_binning.true_coszen.to('dimensionless'),
    ])
    e_centers, cz_centers = self.ecz_binning.weighted_centers
    self.e_centers = e_centers.magnitude
    self.cz_centers = cz_centers.magnitude

    self.num_czbins = self.input_binning.true_coszen.num_bins
    self.num_ebins = self.input_binning.true_energy.num_bins

    self.e_dim_num = self.input_binning.names.index('true_energy')
    self.cz_dim_num = self.input_binning.names.index('true_coszen')

    # Illustrate how to find input binning dimensions which the transforms
    # created by this service will not depend on: every dimension index
    # except those of energy and coszen.
    ecz_dim_nums = (self.e_dim_num, self.cz_dim_num)
    self.extra_dim_nums = [
        dim_num for dim_num in range(self.input_binning.num_dims)
        if dim_num not in ecz_dim_nums
    ]
def _flatten_to_2d(in_map):
    """Collapse a Map with an even number of dimensions into a 2D Map.

    The first half of the dimensions is merged into one axis and the second
    half into the other. Each merged axis is named by joining the original
    dimension names with a space and is given a linear binning with domain
    [0, number of merged bins].

    Parameters
    ----------
    in_map : Map
        Must have an even number of dimensions.

    Returns
    -------
    Map
        Same name as `in_map`, with the reshaped histogram.

    """
    assert isinstance(in_map, Map)
    shape = in_map.shape
    names = in_map.binning.names
    dims = len(shape)
    assert dims % 2 == 0
    half = dims // 2

    # `np.product` was removed in NumPy 2.0; `np.prod` is the supported name
    nbins_a = np.prod(shape[:half])
    nbins_b = np.prod(shape[half:])

    names_a = ' '.join(names[:half])
    names_b = ' '.join(names[half:])

    binning = MultiDimBinning([
        OneDimBinning(name=names_a, num_bins=nbins_a, is_lin=True,
                      domain=[0, nbins_a]),
        OneDimBinning(name=names_b, num_bins=nbins_b, is_lin=True,
                      domain=[0, nbins_b]),
    ])
    hist = in_map.hist.reshape(nbins_a, nbins_b)
    return Map(name=in_map.name, hist=hist, binning=binning)
def binned_to_array(self, key, src_representation, dest_representation):
    """Augment binned data to array data.

    The values stored under `key` are read in `src_representation` and
    handed to `lookup` together with the per-dimension sample arrays.

    Parameters
    ----------
    key : str
    src_representation : binning
        Iterated over dimension by dimension; must provide `is_irregular`
        and `names`.
    dest_representation
        Representation in which the sample arrays are read when
        `src_representation` is irregular.

    Returns
    -------
    Result of `lookup(sample, weights, hist_binning)`

    """
    logging.trace('Transforming %s binned to array data' % (key))

    self.representation = src_representation
    weights = self[key]

    if src_representation.is_irregular:
        logging.trace(
            f"Container `{self.name}`: irregular lookup for {key}")
        self.representation = dest_representation
        sample = [self[name] for name in src_representation.names]
        return lookup(sample, weights, src_representation)

    logging.trace(
        f"Container `{self.name}`: regularized lookup for {key}")
    sample = []
    lookup_dims = []
    for dim in src_representation:
        if dim.is_log:
            # Read the variable in "log_events" and pair it with a binning
            # over the natural log of the original domain
            self.representation = "log_events"
            sample.append(self[dim.name])
            lookup_dim = OneDimBinning(dim.name,
                                       domain=np.log(dim.domain.m),
                                       num_bins=dim.num_bins)
        else:
            self.representation = "events"
            sample.append(self[dim.name])
            lookup_dim = dim
        lookup_dims.append(lookup_dim)

    return lookup(sample, weights, MultiDimBinning(lookup_dims))
def test_container():
    """Exercise array <-> binned translation of a Container and print the
    intermediate results; nothing is asserted."""
    n_evts = 10000
    x = np.arange(n_evts, dtype=FTYPE)
    y = np.arange(n_evts, dtype=FTYPE)
    # Random weights, kept in FTYPE by multiplying in place
    w = np.ones(n_evts, dtype=FTYPE)
    w *= np.random.rand(n_evts)

    container = Container('test')
    for var_name, values in (('x', x), ('y', y), ('w', w)):
        container.add_array_data(var_name, values)

    binning = MultiDimBinning([
        OneDimBinning(name=dim_name, num_bins=10, is_lin=True,
                      domain=[0, 100])
        for dim_name in ('x', 'y')
    ])

    print(container.get_binned_data('x', binning).get('host'))
    print(Container.unroll_binning('x', binning).get('host'))

    # array
    print('original array')
    print(container.get_array_data('w').get('host'))

    # binned
    container.array_to_binned('w', binning)
    print('binned')
    print(container.get_binned_data('w').get('host'))
    print(container.get_hist('w'))

    # augment
    print('augmented again')
    container.binned_to_array('w')
    print(container.get_array_data('w').get('host'))
def setup_function(self):
    """Derive `self.regularized_apply_mode` from `self.apply_mode`.

    Linear dimensions are deep-copied unchanged. Every other dimension is
    replaced by a binning of the same name defined in log-space: irregular
    dimensions via the log of their bin edges, the remaining ones via the
    log of their domain with the same number of bins.

    """
    assert isinstance(self.apply_mode, MultiDimBinning), \
        f"KDE stage needs a binning as `apply_mode`, but is {self.apply_mode}"

    def _regularize(dim):
        """Return the binning to use for `dim` in the regularized space."""
        if dim.is_lin:
            return deepcopy(dim)
        # We don't compute the log of the variable just yet, this will be
        # done later during `apply_function` using the representation
        # mechanism. Here only the binning itself is moved to log-space.
        if dim.is_irregular:
            return OneDimBinning(dim.name, bin_edges=np.log(dim.bin_edges.m))
        return OneDimBinning(dim.name, domain=np.log(dim.domain.m),
                             num_bins=dim.num_bins)

    self.regularized_apply_mode = MultiDimBinning(
        [_regularize(dim) for dim in self.apply_mode])
    logging.debug("Using regularized binning:\n"
                  + repr(self.regularized_apply_mode))
def test_histogram():
    """Unit tests for `histogram` function.

    Correctness is defined as matching the histogram produced by
    numpy.histogramdd.
    """
    all_num_bins = [2, 3, 4]
    n_evts = 10000
    rand = np.random.RandomState(seed=0)

    # Draw order matters for reproducibility: weights first, then one
    # sample array per dimension
    weights = rand.rand(n_evts).astype(FTYPE)
    binning = []
    sample = []
    for dim_idx, num_bins in enumerate(all_num_bins):
        binning.append(
            OneDimBinning(name=f'dim{dim_idx}', num_bins=num_bins,
                          is_lin=True, domain=[0, num_bins]))
        sample.append(rand.rand(n_evts).astype(FTYPE) * num_bins)

    bin_edges = [dim.edge_magnitudes for dim in binning]

    # Weighted sums per bin
    test = histogram(sample, weights, MultiDimBinning(binning),
                     averaged=False)
    ref, _ = np.histogramdd(sample=sample, bins=bin_edges, weights=weights)
    ref = ref.astype(FTYPE).ravel()
    assert recursiveEquality(test, ref), f'\ntest:\n{test}\n\nref:\n{ref}'

    # Weighted averages per bin: weighted sums divided by raw counts
    test_avg = histogram(sample, weights, MultiDimBinning(binning),
                         averaged=True)
    ref_counts, _ = np.histogramdd(sample=sample, bins=bin_edges,
                                   weights=None)
    ref_counts = ref_counts.astype(FTYPE).ravel()
    ref_avg = (ref / ref_counts).astype(FTYPE)
    assert recursiveEquality(test_avg, ref_avg), \
        f'\ntest_avg:\n{test_avg}\n\nref_avg:\n{ref_avg}'

    logging.info('<< PASS : test_histogram >>')
def test_container():
    """Unit tests for Container class."""
    # NOTE: Right now the numbers are tuned so that the weights are identical
    # per bin. If you change binning that's likely not the case anymore and you
    # inevitably end up with averaged values over bins, which are then not
    # equal to the individual weights anymore when those are not identical per
    # bin
    n_evts = 10000
    w = np.tile(np.arange(100, dtype=FTYPE) + 0.5, (100, 1)).T.ravel()

    container = Container('test', 'events')
    container['x'] = np.linspace(0, 100, n_evts, dtype=FTYPE)
    container['y'] = np.linspace(0, 100, n_evts, dtype=FTYPE)
    container['w'] = w

    binning = MultiDimBinning([
        OneDimBinning(name=dim_name, num_bins=100, is_lin=True,
                      domain=[0, 100])
        for dim_name in ('x', 'y')
    ])

    logging.trace('Testing container and translation methods')

    # Binned view of a coordinate is compared against the bin midpoints
    container.representation = binning
    bx = container['x']
    mid_x, mid_y = binning.midpoints[0].m, binning.midpoints[1].m
    m = np.meshgrid(mid_x, mid_y)[1].ravel()
    assert np.allclose(bx, m, **ALLCLOSE_KW), f'test:\n{bx}\n!= ref:\n{m}'

    # array repr
    container.representation = 'events'
    array_weights = container['w']
    assert np.allclose(array_weights, w, **ALLCLOSE_KW), \
        f'test:\n{array_weights}\n!= ref:\n{w}'

    # binned repr
    container.representation = binning
    diag = np.diag(np.arange(100) + 0.5)
    bd = container['w']
    h = container.get_hist('w')
    assert np.allclose(bd, diag.ravel(), **ALLCLOSE_KW), \
        f'test:\n{bd}\n!= ref:\n{diag.ravel()}'
    assert np.allclose(h[0], diag, **ALLCLOSE_KW), \
        f'test:\n{h[0]}\n!= ref:\n{diag}'
    assert h[1] == binning, f'test:\n{h[1]}\n!= ref:\n{binning}'

    # augment to array repr again
    container.representation = 'events'
    a = container['w']
    assert np.allclose(a, w, **ALLCLOSE_KW), f'test:\n{a}\n!= ref:\n{w}'
def __init__(self, input_names, output_name, input_binning=None,
             output_binning=None, tex=None, hash=None,
             error_method=None):  # pylint: disable=redefined-builtin
    """Store names, binnings and bookkeeping attributes.

    Parameters
    ----------
    input_names : string or iterable of strings
        A single string is wrapped into a one-element list.
    output_name : string
    input_binning, output_binning : None, MultiDimBinning, Sequence, or Mapping
        A sequence is passed positionally to `MultiDimBinning`; any other
        non-None, non-MultiDimBinning value is unpacked as keyword
        arguments.
    tex : stored as given
    hash : stored as given
    error_method : stored as given if truthy, else stored as None

    """
    def _as_binning(spec):
        """Coerce a binning specification; None and MultiDimBinning pass
        through untouched."""
        if spec is None or isinstance(spec, MultiDimBinning):
            return spec
        if isinstance(spec, Sequence):
            return MultiDimBinning(spec)
        return MultiDimBinning(**spec)

    # Always store a list of names for uniform interfacing
    if isinstance(input_names, basestring):
        self._input_names = [input_names]
    else:
        self._input_names = list(input_names)

    assert isinstance(output_name, basestring)
    self._output_name = output_name

    self._input_binning = _as_binning(input_binning)
    self._output_binning = _as_binning(output_binning)

    self._tex = tex
    self._hash = hash
    self._error_method = error_method if error_method else None
def validate_calc_grid(calc_grid):
    """Check whether a multi-dimensional binning is suitable for use as the
    grid on which oscillations are calculated for event-by-event reweighting.

    Parameters
    ----------
    calc_grid : anything `MultiDimBinning(...)` accepts

    Raises
    ------
    ValueError
        If the dimension names are not exactly "true_energy" and
        "true_coszen".

    """
    required_names = {'true_energy', 'true_coszen'}
    dim_names = set(MultiDimBinning(calc_grid).names)
    if dim_names != required_names:
        raise ValueError('Oscillation grid must contain "true_energy" and'
                         ' "true_coszen" dimensions, and no more! Got "%s".'
                         % dim_names)
def pipeline_cfg_from_states(state_dict):
    """Recover a pipeline cfg containing PISA objects from a raw state.

    When a pipeline configuration is stored to JSON, the PISA objects turn
    into their serialized states. This function walks the dictionary and
    rebuilds `MultiDimBinning` and `ParamSet` objects from those states.

    Parameters
    ----------
    state_dict : mapping
        A "pipeline" entry is kept under that key, with its "output_key"
        turned into a tuple and its "output_binning" into a
        `MultiDimBinning`. Every other key is split on "__" into a tuple
        key.

    Returns
    -------
    pipeline_cfg : collections.OrderedDict
        Deep copies of the input entries with the objects restored.

    """
    # TODO: Make this a core functionality of PISA
    # Some objects have a `from_state` method, some take the unpacked state
    # dict as input, some take the state... hence the manual handling below.
    binning_keys = ("calc_mode", "apply_mode", "node_mode")

    pipeline_cfg = collections.OrderedDict()
    for stage_key, raw_state in state_dict.items():
        stage_cfg = copy.deepcopy(raw_state)

        if stage_key == "pipeline":
            pipeline_cfg[stage_key] = stage_cfg
            stage_cfg["output_key"] = tuple(stage_cfg["output_key"])
            stage_cfg["output_binning"] = MultiDimBinning(
                **stage_cfg["output_binning"])
            continue

        # undo what we did in `serialize_pipeline_cfg` by splitting the keys
        # into tuples
        pipeline_cfg[tuple(stage_key.split("__"))] = stage_cfg

        # Only serialized binnings (mappings) are converted; anything else
        # stored under these keys is left as it is
        for mode_key in binning_keys:
            if mode_key not in stage_cfg:
                continue
            mode_state = stage_cfg[mode_key]
            if isinstance(mode_state, collections.abc.Mapping):
                stage_cfg[mode_key] = MultiDimBinning(**mode_state)

        if "params" in stage_cfg:
            stage_cfg["params"] = ParamSet(stage_cfg["params"])
        # if any stage takes any other arguments that we didn't think of
        # here, they won't work
    return pipeline_cfg
def _flatten_to_1d(in_map):
    """Collapse a Map of any dimensionality into a 1D Map.

    The single output dimension is named by concatenating all original
    dimension names (no separator) and has a linear binning with domain
    [0, total number of bins].

    Parameters
    ----------
    in_map : Map

    Returns
    -------
    Map
        Same name as `in_map`, with the flattened histogram.

    """
    assert isinstance(in_map, Map)

    bin_name = ''.join(in_map.binning.names)
    # `np.product` was removed in NumPy 2.0; `np.prod` is the supported name
    num_bins = np.prod(in_map.shape)

    binning = MultiDimBinning([
        OneDimBinning(name=bin_name, num_bins=num_bins, is_lin=True,
                      domain=[0, num_bins])
    ])
    hist = in_map.hist.flatten()
    return Map(name=in_map.name, hist=hist, binning=binning)
def array_to_binned(self, key, src_representation, dest_representation):
    """Histogram data array into binned data

    Parameters
    ----------
    key : str
    src_representation : str
        Must be one of `self.array_representations`.
    dest_representation : MultiDimBinning

    Returns
    -------
    hist
        Result of `histogram(..., averaged=True)`, i.e. the averaged variant
        of the histogram is always requested.

    Notes
    -----
    right now, CPU-only

    """
    # TODO: make work for n-dim
    logging.trace('Transforming %s array to binned data' % (key))

    assert src_representation in self.array_representations
    assert isinstance(dest_representation, MultiDimBinning)

    if dest_representation.is_irregular:
        self.representation = src_representation
        sample = [self[name] for name in dest_representation.names]
        hist_binning = dest_representation
    else:
        sample = []
        hist_dims = []
        for dim in dest_representation:
            if dim.is_log:
                # Read the variable in "log_events" and histogram it over
                # the natural log of the original domain
                self.representation = "log_events"
                sample.append(self[dim.name])
                hist_dim = OneDimBinning(dim.name,
                                         domain=np.log(dim.domain.m),
                                         num_bins=dim.num_bins)
            else:
                self.representation = "events"
                sample.append(self[dim.name])
                hist_dim = dim
            hist_dims.append(hist_dim)
        hist_binning = MultiDimBinning(hist_dims)

    # The weights are always read in the source representation
    self.representation = src_representation
    weights = self[key]

    return histogram(sample, weights, hist_binning, averaged=True)
def setup_function(self):
    """Load the ad-hoc scale factors and attach them to every container.

    The JSON file found via `self.scale_file` is expected to hold, under
    `self.variable`, a "binning" entry (keyword arguments for
    `MultiDimBinning`) and a "scales" entry. The data representation is set
    to that binning and the scales are stored in each container as
    "adhoc_scale_factors".

    """
    scale_file = find_resource(self.scale_file)
    logging.info("Loading scaling factors from : %s", scale_file)

    scaling_dict = from_json(scale_file)
    variable_spec = scaling_dict[self.variable]
    scale_binning = MultiDimBinning(**variable_spec["binning"])
    scale_factors = np.array(variable_spec["scales"], dtype=FTYPE)

    logging.info(f"Binning for ad-hoc systematic: \n {str(scale_binning)}")
    logging.info(
        f"scaling factors of ad-hoc systematic:\n {str(scale_factors)}")

    self.data.representation = scale_binning
    for container in self.data:
        container["adhoc_scale_factors"] = scale_factors
def test_standard_plots(xsec_file, outdir='./'):
    """Compute cross-section maps on a 150-bin logarithmic true-energy
    binning and plot them as PNG files into `outdir`.

    Parameters
    ----------
    xsec_file
        Passed to `genie.get_combined_xsec`.
    outdir : string

    """
    from pisa.utils.plotter import Plotter

    xsec = genie.get_combined_xsec(xsec_file)

    energy_dim = OneDimBinning(
        name='true_energy',
        tex=r'E_\nu',
        num_bins=150,
        domain=(1E-1, 1E3)*ureg.GeV,
        is_log=True,
    )
    e_bins = MultiDimBinning([energy_dim])
    xsec.compute_maps(e_bins)

    logging.info('Making plots for genie xsec_maps')
    plot_obj = Plotter(
        outdir=outdir,
        stamp='Cross-Section',
        fmt='png',
        log=True,
        size=(12, 8),
        label=r'Cross-Section ($m^{2}$)',
    )
    maps = xsec.return_mapset()
    plot_obj.plot_xsec(maps, ylim=(1E-43, 1E-37))
def test_per_e_plot(xsec_file, outdir='./'):
    """Compute cross-section maps on a 200-bin logarithmic true-energy
    binning, divide them by the bin widths, and plot them as PNG files into
    `outdir`.

    Parameters
    ----------
    xsec_file
        Passed to `genie.get_combined_xsec`.
    outdir : string

    """
    from pisa.utils.plotter import Plotter

    xsec = genie.get_combined_xsec(xsec_file)

    energy_dim = OneDimBinning(
        name='true_energy',
        tex=r'E_\nu',
        num_bins=200,
        domain=(1E-1, 1E3)*ureg.GeV,
        is_log=True,
    )
    e_bins = MultiDimBinning([energy_dim])
    xsec.compute_maps(e_bins)

    # Scale by the inverse width of each energy bin
    inverse_widths = 1/e_bins.true_energy.bin_widths.magnitude
    xsec.scale_maps(inverse_widths)

    logging.info('Making plots for genie xsec_maps per energy')
    plot_obj = Plotter(
        outdir=outdir,
        stamp='Cross-Section / Energy',
        fmt='png',
        log=False,
        size=(12, 8),
        label=r'Cross-Section / Energy ($m^{2} GeV^{-1}$)',
    )
    maps = xsec.return_mapset()
    plot_obj.plot_xsec(maps, ylim=(3.5E-41, 3E-40))
def digitize(self, kinds, binning, binning_cols=None):
    """Wrapper for numpy's digitize function.

    Parameters
    ----------
    kinds : string or sequence
        Unless it contains 'muons' or 'noise', it is parsed with
        `self._parse_flavint_groups`. Only the first entry is used.
    binning : OneDimBinning or MultiDimBinning
        Plain sequences/iterables are recognized but not supported.
    binning_cols : None, string, or sequence of strings
        Columns to digitize; must be a subset of the binning's dimension
        names. If None, all dimensions of `binning` are used.

    Returns
    -------
    hist_idxs : numpy ndarray
        The per-column results of `np.digitize`, stacked and transposed so
        that each column corresponds to one entry of `binning_cols`
        (assuming each data column is one-dimensional).

    Raises
    ------
    TypeError
        If `binning` is of an unhandled type.
    NotImplementedError
        If `binning` is a plain sequence or iterable.

    """
    if isinstance(kinds, basestring):
        kinds = [kinds]
    if 'muons' not in kinds and 'noise' not in kinds:
        kinds = self._parse_flavint_groups(kinds)
    kinds = kinds[0]

    if isinstance(binning_cols, basestring):
        binning_cols = [binning_cols]

    # TODO: units of columns, and convert bin edges if necessary
    if isinstance(binning, OneDimBinning):
        binning = MultiDimBinning([binning])
    elif isinstance(binning, MultiDimBinning):
        pass
    elif (isinstance(binning, Iterable)
          and not isinstance(binning, Sequence)):
        binning = list(binning)
    elif isinstance(binning, Sequence):
        pass
    else:
        raise TypeError('Unhandled type %s for `binning`.' % type(binning))

    if isinstance(binning, Sequence):
        raise NotImplementedError(
            'Simle sequences not handled at this time. Please specify a'
            ' OneDimBinning or MultiDimBinning object for `binning`.')

    if binning_cols is None:
        binning_cols = binning.names
    else:
        assert set(binning_cols).issubset(set(binning.names))

    # TODO: units support for Data will mean we can do `m_as(...)` here!
    kind_data = self[kinds]
    hist_idxs = [
        np.digitize(kind_data[colname], binning[colname].bin_edges.m)
        for colname in binning_cols
    ]
    return np.vstack(hist_idxs).T
def histogram_np(sample, weights, binning, apply_weights=True):  # pylint: disable=missing-docstring
    """Helper function for numpy histograms.

    Parameters
    ----------
    sample
        Passed to `np.histogramdd` as `sample`.
    weights : None or numpy ndarray
        If two-dimensional, one histogram is produced per column.
    binning : anything `MultiDimBinning(...)` accepts
    apply_weights : bool
        If False, the histograms are filled unweighted.

    Returns
    -------
    flat_hist : numpy ndarray of dtype FTYPE
        The raveled histogram; for two-dimensional weights, the raveled
        histograms of all columns stacked along axis 1.

    """
    binning = MultiDimBinning(binning)
    bin_edges = [edges.magnitude for edges in binning.bin_edges]

    def _raveled_hist(column_weights):
        """Histogram `sample` with the given weights and flatten it."""
        used_weights = column_weights if apply_weights else None
        hist, _ = np.histogramdd(sample=sample, weights=used_weights,
                                 bins=bin_edges)
        return hist.ravel()

    if weights is not None and weights.ndim == 2:
        # that means it's 1-dim data instead of scalars
        flat_hist = np.stack(
            [_raveled_hist(weights[:, i]) for i in range(weights.shape[1])],
            axis=1,
        )
    else:
        flat_hist = _raveled_hist(weights)

    return flat_hist.astype(FTYPE)
def keepInbounds(self, binning):
    """Cut out any events that fall outside `binning`. Note that events that
    fall exactly on an outer edge are kept.

    Parameters
    ----------
    binning : OneDimBinning or MultiDimBinning

    Returns
    -------
    remaining_events : Events
        `self` unmodified if every inbounds criterion is already listed in
        `self.metadata['cuts']`; otherwise the result of `self.applyCut`
        with its 'cuts' metadata replaced by the individual cut strings.

    """
    # Best-effort coercion to a single dimension
    try:
        binning = OneDimBinning(binning)
    except Exception:
        pass
    if isinstance(binning, OneDimBinning):
        binning = [binning]
    binning = MultiDimBinning(binning)

    current_cuts = self.metadata['cuts']
    new_cuts = [dim.inbounds_criteria for dim in binning]
    unapplied_cuts = [cut for cut in new_cuts if cut not in current_cuts]

    if len(unapplied_cuts) == 0:
        logging.debug(
            "All inbounds criteria '%s' have already been"
            " applied. Returning events unmodified.", new_cuts)
        return self

    all_cuts = deepcopy(current_cuts) + unapplied_cuts

    # Create a single cut from all unapplied cuts and do the cutting
    parenthesized = ['(%s)' % cut for cut in unapplied_cuts]
    remaining_events = self.applyCut(keep_criteria=' & '.join(parenthesized))

    # Replace the combined 'cuts' string with individual cut strings
    remaining_events.metadata['cuts'] = all_cuts
    return remaining_events
def pisa2_map_to_pisa3_map(pisa2_map, ebins_name='ebins',
                           czbins_name='czbins', is_log=True, is_lin=True):
    """Convert a PISA 2 map dict into a PISA 3 `Map`.

    Parameters
    ----------
    pisa2_map : dict
        Must contain exactly the keys 'map', 'ebins' and 'czbins'. The
        'ebins' edges are multiplied by `ureg.GeV`; the 'czbins' edges are
        used as given.
    ebins_name : string
        Name given to the energy dimension.
    czbins_name : string
        Name given to the coszen dimension.
    is_log : bool
        Passed as `is_log` to the energy dimension.
    is_lin : bool
        Passed as `is_lin` to the coszen dimension.

    Returns
    -------
    Map
        Named 'pisa2equivalent', with `pisa2_map['map']` as its histogram.

    Raises
    ------
    ValueError
        If `pisa2_map` does not have exactly the expected keys.

    """
    expected_keys = ['map', 'ebins', 'czbins']
    if sorted(pisa2_map.keys()) != sorted(expected_keys):
        raise ValueError(
            'PISA 2 map should be a dict containining entries: %s'
            % expected_keys)

    # The TeX labels must be raw strings: in a normal string literal `\n`
    # and `\t` would become a newline and a tab instead of the start of
    # `\nu` and `\theta`.
    ebins = OneDimBinning(name=ebins_name,
                          bin_edges=pisa2_map['ebins'] * ureg.GeV,
                          is_log=is_log,
                          tex=r'E_{\nu}')
    czbins = OneDimBinning(name=czbins_name,
                           bin_edges=pisa2_map['czbins'],
                           is_lin=is_lin,
                           tex=r'\cos\theta_Z')
    bins = MultiDimBinning([ebins, czbins])
    return Map(name='pisa2equivalent', hist=pisa2_map['map'], binning=bins)
def test_histogram():
    """Run `histogram` with `averaged=True` on a 10x10 linear binning over
    SmartArray inputs and compare the reshaped result against an all-zero
    array."""
    n_evts = 100

    x = SmartArray(np.arange(n_evts, dtype=FTYPE))
    y = SmartArray(np.arange(n_evts, dtype=FTYPE))
    # Unit weights
    w = SmartArray(np.ones(n_evts, dtype=FTYPE))

    binning = MultiDimBinning([
        OneDimBinning(name=dim_name, num_bins=10, is_lin=True,
                      domain=[0, 100])
        for dim_name in ('x', 'y')
    ])

    averaged = True
    histo = histogram([x, y], w, binning, averaged)

    expected = np.zeros(shape=(10, 10))
    assert np.array_equal(histo.reshape(10, 10), expected)
def create_pseudo_data(toymc_params, seed=None):
    """Create pseudo data consisting of a gaussian peak on top of a uniform
    background.

    Parameters
    ----------
    toymc_params
        Object providing `binning`, `mu`, `sigma`, `nsig`,
        `nbackground_low`, `nbackground_high` and `nbkg`.
    seed : None or int
        If not None, numpy's global random state is seeded with it.

    Returns
    -------
    MapSet
        Contains a single map named 'total' holding the histogrammed counts,
        with errors set to the square root of the counts.

    """
    if seed is not None:
        np.random.seed(seed)

    binning = toymc_params.binning

    # Gaussian signal peak (drawn first), then uniform background
    signal = np.random.normal(
        loc=toymc_params.mu,
        scale=toymc_params.sigma,
        size=toymc_params.nsig,
    )
    background = np.random.uniform(
        high=toymc_params.nbackground_high,
        low=toymc_params.nbackground_low,
        size=toymc_params.nbkg,
    )

    counts_data, _ = np.histogram(
        np.concatenate([signal, background]),
        bins=binning.bin_edges.magnitude,
    )

    # Convert data histogram into a pisa map
    data_map = Map(name='total',
                   binning=MultiDimBinning([binning]),
                   hist=counts_data)

    # Set the errors as the sqrt of the counts
    data_map.set_errors(error_hist=np.sqrt(counts_data))

    return MapSet([data_map])
def parse_pipeline_config(config):
    """Parse pipeline config.

    Parameters
    ----------
    config : string or PISAConfigParser
        A string is loaded via `from_file`.

    Returns
    -------
    stage_dicts : OrderedDict
        Keys are (stage_name, service_name) tuples and values are OrderedDicts
        with keys the argnames and values the arguments' values. Some known arg
        values are parsed out fully into Python objects, while the rest remain
        as strings that must be used or parsed elsewhere. An additional
        'detector_name' key holds the `detector_name` option of the
        'pipeline' section (None if that option is absent).

    Raises
    ------
    TypeError
        If `config` is neither a string nor a PISAConfigParser.
    NoSectionError
        If the config has no 'binning' section.
    IOError
        If a stage listed in the pipeline order has no section.
    ValueError
        If a param that already exists in a previous stage is given
        additional specs in a later section.

    """
    # Note: imports placed here to avoid circular imports
    from pisa.core.binning import MultiDimBinning, OneDimBinning
    from pisa.core.param import ParamSelector

    if isinstance(config, basestring):
        config = from_file(config)
    elif isinstance(config, PISAConfigParser):
        pass
    else:
        raise TypeError(
            '`config` must either be a string or PISAConfigParser. Got %s '
            'instead.' % type(config))

    if not config.has_section('binning'):
        raise NoSectionError(
            "Could not find 'binning'. Only found sections: %s"
            % config.sections())

    # Create binning objects
    binning_dict = {}
    for name, value in config['binning'].items():
        if name.endswith('.order'):
            order = split(config.get('binning', name))
            binning, _ = split(name, sep='.')
            bins = []
            for bin_name in order:
                try:
                    def_raw = config.get('binning', binning + '.' + bin_name)
                except Exception:
                    dims_defined = [
                        split(dim, sep='.')[1]
                        for dim in config['binning'].keys()
                        if dim.startswith(binning + '.')
                        and not dim.endswith('.order')
                    ]
                    logging.error(
                        "Failed to find definition of '%s' dimension of '%s'"
                        " binning entry. Only found definition(s) of: %s",
                        bin_name, binning, dims_defined)
                    del dims_defined
                    raise
                # NOTE(review): `eval` executes arbitrary code found in the
                # config file; only parse configs from trusted sources.
                try:
                    kwargs = eval(def_raw)  # pylint: disable=eval-used
                except Exception:
                    logging.error(
                        "Failed to evaluate definition of '%s' dimension of"
                        " '%s' binning entry:\n'%s'",
                        bin_name, binning, def_raw)
                    raise
                try:
                    bins.append(OneDimBinning(bin_name, **kwargs))
                except Exception:
                    logging.error(
                        "Failed to instantiate new `OneDimBinning` from '%s'"
                        " dimension of '%s' binning entry with definition:\n"
                        "'%s'\n", bin_name, binning, kwargs)
                    raise
            binning_dict[binning] = MultiDimBinning(bins)

    # Pipeline section
    section = 'pipeline'

    # Get and parse the order of the stages (and which services implement them)
    order = [split(x, STAGE_SEP) for x in split(config.get(section, 'order'))]

    param_selections = []
    if config.has_option(section, 'param_selections'):
        param_selections = split(config.get(section, 'param_selections'))

    detector_name = None
    if config.has_option(section, 'detector_name'):
        detector_name = config.get(section, 'detector_name')

    # Parse [stage.<stage_name>] sections and store to stage_dicts
    stage_dicts = OrderedDict()
    for stage, service in order:
        old_section_header = 'stage%s%s' % (STAGE_SEP, stage)
        new_section_header = '%s%s%s' % (stage, STAGE_SEP, service)
        if config.has_section(old_section_header):
            logging.warning(
                '"%s" is an old-style section header, in the future use "%s"'
                % (old_section_header, new_section_header))
            section = old_section_header
        elif config.has_section(new_section_header):
            section = new_section_header
        else:
            raise IOError(
                'missing section in cfg for stage "%s" service "%s"'
                % (stage, service))

        # Instantiate dict to store args to pass to this stage
        service_kwargs = OrderedDict()

        param_selector = ParamSelector(selections=param_selections)
        service_kwargs['params'] = param_selector

        n_params = 0
        for fullname in config.options(section):
            try:
                value = config.get(section, fullname)
            except Exception:
                logging.error(
                    'Unable to obtain value of option "%s" in section "%s".'
                    % (fullname, section))
                raise

            # See if this matches a param specification
            param_match = PARAM_RE.match(fullname)
            if param_match is not None:
                n_params += 1

                param_match_dict = param_match.groupdict()
                param_subfields = param_match_dict['subfields'].split('.')

                # Figure out what the dotted fields represent...
                infodict = interpret_param_subfields(subfields=param_subfields)

                # If field is an attr, skip since these are located manually
                if infodict['attr'] is not None:
                    continue

                # Check if this param already exists in a previous stage; if
                # so, make sure there are no specs for this param, but just a
                # link to the previous param object that is already
                # instantiated.
                for kw in stage_dicts.values():
                    # Stage did not get a `params` argument from config
                    # (`in` instead of `dict.has_key`, which does not exist
                    # on Python 3)
                    if 'params' not in kw:
                        continue

                    # Retrieve the param from the ParamSelector
                    try:
                        param = kw['params'].get(
                            name=infodict['pname'],
                            selector=infodict['selector'])
                    except KeyError:
                        continue

                    # Make sure there are no other specs (in this section) for
                    # the param defined in the previous section
                    for a in PARAM_ATTRS:
                        if config.has_option(section, '%s.%s' % (fullname, a)):
                            raise ValueError("Parameter spec. '%s' of '%s' "
                                             "found in section '%s', but "
                                             "parameter exists in previous "
                                             "stage!" % (a, fullname, section))

                    break

                # Param *not* found in a previous stage (i.e., no explicit
                # `break` encountered in `for` loop above); therefore must
                # instantiate it.
                else:
                    param = parse_param(
                        config=config,
                        section=section,
                        selector=infodict['selector'],
                        fullname=fullname,
                        pname=infodict['pname'],
                        value=value)

                param_selector.update(param, selector=infodict['selector'])

            # If it's not a param spec but contains 'binning', assume it's a
            # binning spec for CAKE stages
            elif 'binning' in fullname:
                service_kwargs[fullname] = binning_dict[value]

            # it's gonna be a PI stage
            elif '_specs' in fullname:
                value = parse_string_literal(value)
                # is it None?
                if value is None:
                    service_kwargs[fullname] = value
                # is it evts?
                elif value in ['evnts', 'events']:
                    service_kwargs[fullname] = 'events'
                # so it gotta be a binning
                else:
                    service_kwargs[fullname] = binning_dict[value]

            # it's a list of in/output names
            elif fullname.endswith('_names'):
                value = split(value)
                service_kwargs[fullname] = value

            # Otherwise it's some other stage instantiation argument; identify
            # this by its full name and try to interpret and instantiate a
            # Python object using the string
            else:
                try:
                    value = parse_quantity(value)
                    value = value.nominal_value * value.units
                except ValueError:
                    value = parse_string_literal(value)
                service_kwargs[fullname] = value

        # If no params actually specified in config, remove 'params' from the
        # service's keyword args
        if n_params == 0:
            service_kwargs.pop('params')

        # Store the service's kwargs to the stage_dicts
        stage_dicts[(stage, service)] = service_kwargs

    stage_dicts['detector_name'] = detector_name

    return stage_dicts
def test_BinnedTensorTransform():
    """Unit tests for BinnedTensorTransform class"""
    binning = MultiDimBinning([
        dict(name='energy', is_log=True, domain=(1, 80) * ureg.GeV,
             num_bins=10),
        dict(name='coszen', is_lin=True, domain=(-1, 0), num_bins=5),
    ])

    # One random map with Poisson errors per input flavor
    input_maps = []
    for flavor in ('nue', 'numu'):
        flavor_map = Map(name=flavor, binning=binning,
                         hist=np.random.random(binning.shape))
        flavor_map.set_poisson_errors()
        input_maps.append(flavor_map)
    inputs = MapSet(name='inputs', maps=input_maps)

    binning_kw = dict(input_binning=binning, output_binning=binning)
    xform0 = BinnedTensorTransform(
        input_names='nue', output_name='nue',
        xform_array=2 * np.ones(binning.shape), **binning_kw)
    xform1 = BinnedTensorTransform(
        input_names=['numu'], output_name='numu',
        xform_array=3 * np.ones(binning.shape), **binning_kw)
    stacked = np.stack(
        [2 * np.ones(binning.shape), 3 * np.ones(binning.shape)], axis=0)
    xform2 = BinnedTensorTransform(
        input_names=['nue', 'numu'], output_name='nue_numu',
        xform_array=stacked, **binning_kw)

    assert np.all((xform2 + 2).xform_array - xform2.xform_array == 2)

    # JSON round trip of each individual transform
    testdir = tempfile.mkdtemp()
    try:
        for i, t in enumerate([xform0, xform1, xform2]):
            t_file = os.path.join(testdir, str(i) + '.json')
            t.to_json(t_file)
            t_ = BinnedTensorTransform.from_json(t_file)
            assert t_ == t, 't=\n%s\nt_=\n%s' % (t, t_)
    finally:
        shutil.rmtree(testdir, ignore_errors=True)

    logging.info('<< PASS : test_BinnedTensorTransform >>')

    xforms = TransformSet(name='scaling',
                          transforms=[xform0, xform1, xform2],
                          hash=9)

    # The hash is readable and settable
    assert xforms.hash == 9
    xforms.hash = -20
    assert xforms.hash == -20

    _ = xforms.apply(inputs)

    # TODO: get this working above, then test here!
    #xforms2 = xforms * 2

    # JSON round trip of the transform set
    testdir = tempfile.mkdtemp()
    try:
        for i, t in enumerate([xforms]):
            t_filename = os.path.join(testdir, str(i) + '.json')
            t.to_json(t_filename)
            t_ = TransformSet.from_json(t_filename)
            assert t_ == t, 't=\n%s\nt_=\n%s' % (t.transforms, t_.transforms)
    finally:
        shutil.rmtree(testdir, ignore_errors=True)

    logging.info('<< PASS : test_TransformSet >>')
def binning(self):
    """Return the "binning" entry of `self._reference_state`.

    If the stored entry is not already a binning (per `is_binning`), it is
    unpacked as keyword arguments into `MultiDimBinning`.
    """
    stored = self._reference_state["binning"]
    if is_binning(stored):
        return stored
    return MultiDimBinning(**stored)
def histogram(self, kinds, binning, binning_cols=None, weights_col=None,
              errors=False, name=None, tex=None, **kwargs):
    """Histogram the events of all `kinds` specified, with `binning` and
    optionally applying `weights`.

    Parameters
    ----------
    kinds : string, sequence of NuFlavInt, or NuFlavIntGroup
    binning : OneDimBinning, MultiDimBinning or sequence of arrays
        (one array per binning dimension). Note that plain sequences are
        recognized but not supported at this time.
    binning_cols : string or sequence of strings
        Bin only these dimensions, ignoring other dimensions in `binning`
    weights_col : None or string
        Column to use for weighting the events
    errors : bool
        Whether to attach errors to the resulting Map
    name : None or string
        Name to give to resulting Map. If None, a default is derived from
        `kinds` and `weights_col`.
    tex : None or string
        TeX label to give to the resulting Map. If None, default is
        derived from the `name` specified or the derived default.
    **kwargs : Keyword args passed to Map object

    Returns
    -------
    Map
        Holds the output of `np.histogramdd`; if `errors` is True, the
        histogram is a `unp.uarray` whose uncertainties are the square root
        of the histogram filled with squared weights.

    Raises
    ------
    TypeError
        If `binning` is of an unhandled type.
    NotImplementedError
        If `binning` is a plain sequence or iterable.

    """
    # TODO: make able to take integer for `binning` and--in combination
    # with units in the Data columns--generate an appropriate
    # MultiDimBinning object, attach this and return the package as a Map.

    if isinstance(kinds, basestring):
        kinds = [kinds]
    if 'muons' not in kinds and 'noise' not in kinds:
        kinds = self._parse_flavint_groups(kinds)
    kinds = kinds[0]

    if isinstance(binning_cols, basestring):
        binning_cols = [binning_cols]
    assert weights_col is None or isinstance(weights_col, basestring)

    # TODO: units of columns, and convert bin edges if necessary
    if isinstance(binning, OneDimBinning):
        binning = MultiDimBinning([binning])
    elif isinstance(binning, MultiDimBinning):
        pass
    elif (isinstance(binning, Iterable)
          and not isinstance(binning, Sequence)):
        binning = list(binning)
    elif isinstance(binning, Sequence):
        pass
    else:
        raise TypeError('Unhandled type %s for `binning`.' % type(binning))

    if isinstance(binning, Sequence):
        raise NotImplementedError(
            'Simle sequences not handled at this time. Please specify a'
            ' OneDimBinning or MultiDimBinning object for `binning`.')

    # TODO: units support for Data will mean we can do `m_as(...)` here!
    bin_edges = [edges.magnitude for edges in binning.bin_edges]
    if binning_cols is None:
        binning_cols = binning.names
    else:
        assert set(binning_cols).issubset(set(binning.names))

    # Extract the columns' data into a list of array(s) for histogramming
    kind_data = self[kinds]
    sample = [kind_data[colname] for colname in binning_cols]
    err_weights = None
    hist_weights = None
    if weights_col is not None:
        hist_weights = kind_data[weights_col]
        if errors:
            err_weights = np.square(hist_weights)

    hist, _ = np.histogramdd(sample=sample, weights=hist_weights,
                             bins=bin_edges)
    if errors:
        sumw2, _ = np.histogramdd(sample=sample, weights=err_weights,
                                  bins=bin_edges)
        hist = unp.uarray(hist, np.sqrt(sumw2))

    if name is None:
        if tex is None:
            # `kinds` may not provide a `tex` attribute; fall back to its
            # plain string form. Only ordinary exceptions are caught so
            # that KeyboardInterrupt/SystemExit still propagate.
            # TODO: specify specific exception(s)
            try:
                tex = kinds.tex
            except Exception:  # pylint: disable=broad-except
                tex = r'{0}'.format(kinds)
            if weights_col is not None:
                tex += r', \; {\rm weights} =' + text2tex(weights_col)

        name = str(kinds)
        if weights_col is not None:
            name += ', weights=' + weights_col

    if tex is None:
        tex = text2tex(name)

    return Map(name=name, hist=hist, binning=binning, tex=tex, **kwargs)
def test_Events():
    """Unit tests for Events class"""
    from pisa.utils.flavInt import NuFlavInt

    def lowest(flavint, column):
        """Smallest value of `column` for `flavint` in the current events."""
        return np.min(events[flavint][column])

    def highest(flavint, column):
        """Largest value of `column` for `flavint` in the current events."""
        return np.max(events[flavint][column])

    # An empty object must be constructible
    events = Events()

    # Load events from a PISA events HDF5 file
    events = Events(
        'events/events__vlvnt__toy_1_to_80GeV_spidx1.0_cz-1_to_1_1e2evts_set0'
        '__unjoined__with_fluxes_honda-2015-spl-solmin-aa.hdf5'
    )

    # A simple cut must be honored by every flavor/interaction type
    events = events.applyCut('(true_coszen <= 0.5) & (true_energy <= 70)')
    for flavint in events.flavints:
        assert highest(flavint, 'true_coszen') <= 0.5
        assert highest(flavint, 'true_energy') <= 70

    # "Inbounds" cut defined by a single OneDimBinning
    energy_dim = OneDimBinning(name='true_energy', num_bins=80, is_log=True,
                               domain=[10, 60] * ureg.GeV)
    events = events.keepInbounds(energy_dim)
    for flavint in events.flavints:
        assert lowest(flavint, 'true_energy') >= 10
        assert highest(flavint, 'true_energy') <= 60

    # "Inbounds" cut defined by a MultiDimBinning
    energy_dim = OneDimBinning(name='true_energy', num_bins=80, is_log=True,
                               domain=[20, 50] * ureg.GeV)
    coszen_dim = OneDimBinning(name='true_coszen', num_bins=40, is_lin=True,
                               domain=[-0.8, 0])
    both_dims = MultiDimBinning([energy_dim, coszen_dim])
    events = events.keepInbounds(both_dims)
    for flavint in events.flavints:
        assert lowest(flavint, 'true_energy') >= 20
        assert highest(flavint, 'true_energy') <= 50
        assert lowest(flavint, 'true_coszen') >= -0.8
        assert highest(flavint, 'true_coszen') <= 0

    # Remove a field from one flav/int so that a cut on that field fails
    # there; the cut must then not end up applied anywhere (i.e., it is
    # rolled back)
    tau_nc_events = events['nutaunc']
    tau_nc_events.pop('true_energy')
    events['nutaunc'] = tau_nc_events

    cut_was_rejected = False
    try:
        events = events.applyCut('(true_energy >= 30) & (true_energy <= 40)')
    except Exception:
        cut_was_rejected = True
    if not cut_was_rejected:
        raise Exception('Should not have been able to apply the cut!')

    # Every flav/int that still has the field must be untouched by the cut
    for flavint in events.flavints:
        if not (flavint == NuFlavInt('nutaunc')):
            assert lowest(flavint, 'true_energy') < 30

    logging.info(
        '<< PASS : test_Events >> (note:'
        ' "[ ERROR] Events object is in an inconsistent state. Reverting cut'
        ' for all flavInts." message above **is expected**.)'
    )
def get_hist(
    sample,
    binning,
    weights=None,
    bw_method="scott",
    adaptive=True,
    alpha=0.3,
    use_cuda=False,
    coszen_reflection=0.25,
    coszen_name="coszen",
    oversample=1,
    bootstrap=False,
    bootstrap_niter=10,
):
    """Helper function for histograms from KDE.

    For description of args see kde_histogramdd().

    Evaluates the KDE at the (weighted) bin centers of `binning`, handles
    the reflections at the coszen edges, multiplies by the bin volumes and
    rescales by the overall normalization (number of events, or sum of
    `weights` if given).

    The reshaping of the evaluated KDE only uses the first two entries of
    `binning.shape`, i.e. a two-dimensional binning (one of the dimensions
    being `coszen_name`) is expected. `sample` is left unmodified.

    Returns
    -------
    hist : numpy.ndarray
        If `bootstrap` is False.
    hist, errors : tuple of numpy.ndarray
        If `bootstrap` is True.

    Raises
    ------
    ValueError
        If `bootstrap` is requested together with `oversample` > 1.

    ToDo:
    ----
    * Handle zenith like coszen? Or better: Define set of variables to
      perform reflection on and reflection parameters (e.g. `reflect_fract`
      or somesuch to stand in for `coszen_reflection` and `reflect_dims` as
      standin for `coszen_name`; also need some way to specify whether to
      reflect about lower and/or upper edge); each such parameter can
      either be a single value, or a sequence with one value per variable.
    * Any good reason for 0.25 and 'scott' defaults? If not, don't define a
      default and force the user to explicitly set this when function is
      called.

    """
    if bootstrap and oversample > 1:
        # Because the errors within a bin are highly correlated, they could
        # not just be added in quadrature to create an oversampled histogram
        # with errors.
        raise ValueError("Bootstrapping cannot be combined with oversampling.")

    # the KDE implementation expects an empty weights array instead of `None`
    if weights is None:
        weights = []

    # Get the overall normalization here, because the KDE will be normalized
    # to one and we'll need to rescale in the end
    if len(weights) == 0:
        norm = sample.shape[0]
    else:
        norm = np.sum(weights)

    binning = binning.oversample(oversample)

    # Flip around to satisfy the kde implementation
    x = sample.T

    # Must have same amount of dimensions as binning dimensions
    assert x.shape[0] == len(binning)

    # coszen is expected to be one of the binning dimensions
    cz_bin = binning.index(coszen_name)

    # Swap out cz bin to first place (index 0)
    if cz_bin != 0:
        # Also swap binning:
        new_binning = [binning[coszen_name]]
        new_binning.extend(b for b in binning if b.name != coszen_name)
        binning = MultiDimBinning(new_binning)
        # `sample.T` is a view onto the caller's array: copy before swapping
        # rows so that the caller's `sample` is not reordered in place.
        x = x.copy()
        x[[0, cz_bin]] = x[[cz_bin, 0]]

    # Check if edge needs to be reflected
    reflect_lower = binning[coszen_name].bin_edges[0] == -1
    reflect_upper = binning[coszen_name].bin_edges[-1] == 1

    # Get the kernel weights
    kde_kwargs = dict(
        weights=weights,
        bw_method=bw_method,
        adaptive=adaptive,
        alpha=alpha,
        use_cuda=use_cuda,
    )
    if bootstrap:
        kernel_weights_adaptive = bootstrap_kde(
            x, niter=bootstrap_niter, **kde_kwargs
        )
    else:
        kernel_weights_adaptive = gaussian_kde(x, **kde_kwargs)

    # Get the bin centers, where we're going to evaluate the KDEs, and extend
    # the bin range for reflection
    bin_points = []
    for b in binning:
        c = unp.nominal_values(b.weighted_centers)
        if b.name == coszen_name:
            # how many bins to add for reflection
            n_reflect = int(len(c) * coszen_reflection)
            if reflect_lower:
                c0 = 2 * c[0] - c[1:n_reflect + 1][::-1]
            else:
                c0 = []
            if reflect_upper:
                c1 = 2 * c[-1] - c[-n_reflect - 1:-1][::-1]
            else:
                c1 = []
            c = np.concatenate([c0, c, c1])
        bin_points.append(c)

    # Shape including reflection edges
    megashape = (
        binning.shape[0]
        + (int(reflect_upper) + int(reflect_lower)) * n_reflect,
        binning.shape[1],
    )

    # Shape of the zero padding that completes a mirrored edge to the full
    # (un-extended) coszen range
    minishape = (binning.shape[0] - n_reflect, binning.shape[1])

    # Create a set of points
    grid = np.meshgrid(*bin_points, indexing="ij")
    points = np.array([g.ravel() for g in grid])

    # Evaluate KDEs at given points
    if bootstrap:
        hist, errors = kernel_weights_adaptive(points)
        # variances can simply be added together when we apply reflections,
        # we take the root afterwards
        variances = errors**2
    else:
        hist = kernel_weights_adaptive(points)

    # Reshape 1d array into nd
    hist = hist.reshape(megashape)
    if bootstrap:
        variances = variances.reshape(megashape)

    def apply_reflection(hist_):
        """Fold the reflection edges of `hist_` back into the coszen range."""
        # Cut off the reflection edges, mirror them, fill up remaining space
        # with zeros and add to histo
        if reflect_lower:
            lower_edge = hist_[0:n_reflect, :]
            lower_edge = np.flipud(
                np.concatenate([np.zeros(minishape), lower_edge])
            )
            hist_ = hist_[n_reflect:, :]
        else:
            lower_edge = 0
        if reflect_upper:
            # Explicit positive index instead of `-n_reflect`, which would
            # select the wrong rows if `n_reflect` is 0.
            split = hist_.shape[0] - n_reflect
            upper_edge = hist_[split:, :]
            upper_edge = np.flipud(
                np.concatenate([upper_edge, np.zeros(minishape)])
            )
            hist_ = hist_[:split, :]
        else:
            upper_edge = 0
        return hist_ + upper_edge + lower_edge

    hist = apply_reflection(hist)
    if bootstrap:
        variances = apply_reflection(variances)
        errors = np.sqrt(variances)

    # Bin volumes
    volume = binning.bin_volumes(attach_units=False)
    hist = hist * volume
    if bootstrap:
        errors = errors * volume

    # Downsample, not applicable when bootstrapping
    if oversample != 1:
        for i, b in enumerate(binning):
            hist = np.add.reduceat(
                hist,
                np.arange(0, len(b.bin_edges) - 1, oversample),
                axis=i,
            )

    # Swap back the axes
    if cz_bin != 0:
        hist = np.swapaxes(hist, 0, cz_bin)
        if bootstrap:
            errors = np.swapaxes(errors, 0, cz_bin)

    if bootstrap:
        return hist * norm, errors * norm
    return hist * norm
def kde_histogramdd(sample, binning, weights=None, bw_method="scott",
                    adaptive=True, alpha=0.3, use_cuda=False,
                    coszen_reflection=0.25, coszen_name="coszen",
                    oversample=1, stack_pid=True, bootstrap=False,
                    bootstrap_niter=10):
    """Run kernel density estimation (KDE) for an array of data points, and
    then evaluate them on a histogram-like grid to effectively produce a
    histogram-like output.

    Handles reflection at coszen edges, and will expect coszen to be in the
    binning

    Based on Sebastian Schoenen's KDE implementation:
    http://code.icecube.wisc.edu/svn/sandbox/schoenen/kde

    Parameters
    ----------
    sample : array
        Shape (N_evts, vars), with vars in the right order corresponding to
        the binning order.

    binning : MultiDimBinning
        A coszen dimension is expected

    weights : None or array
        One weight per event, i.e. of length N_evts

    bw_method: string
        'scott' or 'silverman' (see kde module)

    adaptive : bool
        (see kde module)

    alpha : float
        A parameter for the KDEs (see kde module)

    use_cuda : bool
        Run on GPU (only works with <= 2d)

    coszen_reflection : float
        Part (number between 0 and 1) of binning that is reflected at the
        coszen -1 and 1 edges

    coszen_name : string
        Binning name to identify the coszen bin that needs to undergo special
        treatment for reflection

    oversample : int
        Evaluate KDE at more points per bin, takes longer, but is more
        accurate

    stack_pid : bool
        Treat each pid bin separately, not as another dimension of the KDEs
        Only supported for two additional dimensions, pid binning must be
        named `pid`

    bootstrap : bool
        Use the ``bootstrap_kde`` class to produce error estimates on the KDE
        histograms. Slow, not recommended during fits.

    bootstrap_niter : int
        Number of bootstrap iterations.

    Returns
    -------
    histogram : numpy.ndarray
        With `stack_pid`, the axes follow the dimension order of `binning`.
    histogram, errors : tuple of numpy.ndarray
        If `bootstrap` is True.

    Raises
    ------
    ValueError
        If `weights` is given and its length differs from the number of
        events in `sample`.

    ToDo:
    -----
    * Maybe return Map with binnings attached instead of nd-array?
    * Generalize to handle any dimensions with any reflection criteria

    """
    if weights is not None and len(weights) != sample.shape[0]:
        raise ValueError(
            "Length of sample (%s) and weights (%s) incompatible"
            % (sample.shape[0], len(weights)))

    if not stack_pid:
        return get_hist(sample=sample, binning=binning, weights=weights,
                        bw_method=bw_method, adaptive=adaptive, alpha=alpha,
                        use_cuda=use_cuda,
                        coszen_reflection=coszen_reflection,
                        coszen_name=coszen_name, oversample=oversample,
                        bootstrap=bootstrap, bootstrap_niter=bootstrap_niter)

    # treat pid bins separately
    # assuming we're dealing with 2d apart from PID
    bin_names = list(binning.names)
    bin_edges = [b.bin_edges.m for b in binning]
    pid_bin = bin_names.index("pid")
    other_bins = [0, 1, 2]
    other_bins.pop(pid_bin)
    bin_names.pop(pid_bin)
    assert len(bin_names) == 2
    pid_bin_edges = bin_edges[pid_bin]

    d2d_binning = MultiDimBinning([b for b in binning if b.name != "pid"])

    if weights is not None:
        # Boolean-mask indexing below requires an array, not a plain list
        weights = np.asarray(weights)

    columns = sample.T
    pid_values = columns[pid_bin]

    pid_stack = []
    pid_stack_errors = []
    for lower, upper in zip(pid_bin_edges[:-1], pid_bin_edges[1:]):
        # Events in this pid bin (lower edge inclusive, upper edge exclusive)
        mask_pid = (pid_values >= lower) & (pid_values < upper)
        data = np.array([
            columns[other_bins[0]][mask_pid],
            columns[other_bins[1]][mask_pid],
        ])

        if weights is None:
            weights_pid = None
        else:
            weights_pid = weights[mask_pid]

        hist_kwargs = dict(sample=data.T, weights=weights_pid,
                           binning=d2d_binning, coszen_name=coszen_name,
                           use_cuda=use_cuda, bw_method=bw_method,
                           alpha=alpha, oversample=oversample,
                           coszen_reflection=coszen_reflection,
                           adaptive=adaptive, bootstrap=bootstrap,
                           bootstrap_niter=bootstrap_niter)
        if bootstrap:
            hist, errors = get_hist(**hist_kwargs)
            pid_stack.append(hist)
            pid_stack_errors.append(errors)
        else:
            pid_stack.append(get_hist(**hist_kwargs))

    # Stacking puts the pid axis last: (other dim 0, other dim 1, pid)
    hist = np.dstack(pid_stack)
    if bootstrap:
        errors = np.dstack(pid_stack_errors)

    if pid_bin != 2:
        # Move (rather than swap) the pid axis back to its position in
        # `binning`, so that the two other axes keep their relative order
        # also when pid is the first dimension.
        hist = np.moveaxis(hist, 2, pid_bin)
        if bootstrap:
            errors = np.moveaxis(errors, 2, pid_bin)

    if bootstrap:
        return hist, errors
    return hist