def __init__(self, params, output_names, events_file, output_binning=None,
             output_events=True, error_method=None, debug_mode=None,
             disk_cache=None, memcache_deepcopy=True,
             transforms_cache_depth=20, outputs_cache_depth=20):
    """Set up the stage and immediately compute its outputs.

    `output_names` is a comma-separated string. Entries containing
    'muons' or 'noise' switch on the respective flag and are kept verbatim;
    an entry containing 'all_nu' selects one group per member of
    ALL_NUFLAVINTS; every other entry is interpreted as a NuFlavIntGroup.

    Raises
    ------
    AssertionError
        If `output_events` is not a bool.
    """
    self.sample_hash = None

    expected_params = ('dataset', 'keep_criteria')
    # TODO -> kwargs???
    self.events_file = events_file

    want_nu = want_muons = want_noise = False
    passthrough_names = []
    nu_groups = []
    for requested in output_names.replace(' ', '').split(','):
        if 'muons' in requested:
            want_muons = True
            passthrough_names.append(requested)
            continue
        if 'noise' in requested:
            want_noise = True
            passthrough_names.append(requested)
            continue
        want_nu = True
        if 'all_nu' in requested:
            # Replaces (rather than extends) whatever was collected so far
            nu_groups = [NuFlavIntGroup(f) for f in ALL_NUFLAVINTS]
        else:
            nu_groups.append(NuFlavIntGroup(requested))

    self.neutrinos = want_nu
    self.muons = want_muons
    self.noise = want_noise
    self._output_nu_groups = nu_groups

    # Neutrino group names always follow the muon/noise names
    clean_outnames = passthrough_names
    if want_nu:
        clean_outnames = clean_outnames + [str(grp) for grp in nu_groups]

    if not isinstance(output_events, bool):
        raise AssertionError(
            'output_events must be of type bool, instead it is supplied '
            'with type {0}'.format(type(output_events))
        )
    if output_events:
        # TODO Implement MapSet option or remove
        output_binning = None
    self.output_events = output_events

    super(events_to_data, self).__init__(
        use_transforms=False,
        params=params,
        expected_params=expected_params,
        output_names=clean_outnames,
        error_method=error_method,
        debug_mode=debug_mode,
        disk_cache=disk_cache,
        memcache_deepcopy=memcache_deepcopy,
        outputs_cache_depth=outputs_cache_depth,
        transforms_cache_depth=transforms_cache_depth,
        output_binning=output_binning
    )

    self._compute_outputs()
def get_xs_ratio_value(self, flavintgroup0, flavintgroup1, energy, gamma=0):
    """Get ratio of combined cross sections for `flavintgroup0` to combined
    cross sections for `flavintgroup1`, weighted by E^{-`gamma`}.

    Parameters
    ----------
    flavintgroup0, flavintgroup1 : NuFlavIntGroup or convertible thereto
    energy : numeric or sequence thereof
        Energy (or energies) at which to evaluate total cross section, in
        units of GeV
    gamma : numeric
        Spectral index; the ratio is multiplied by `energy`**(-`gamma`).
        With the default of 0 no weighting is applied.

    Returns
    -------
    Ratio of combined cross sections flavintgroup0 / flavintgroup1 evaluated
    at each energy. Shape of returned value matches that of passed `energy`
    parameter.

    """
    import numpy as np

    flavintgroup0 = NuFlavIntGroup(flavintgroup0)
    flavintgroup1 = NuFlavIntGroup(flavintgroup1)

    self._define_interpolant(flavintgroup=flavintgroup0)
    self._define_interpolant(flavintgroup=flavintgroup1)

    xs_ratio_vals = (self._interpolants[flavintgroup0](energy)
                     / self._interpolants[flavintgroup1](energy))

    # Special case to avoid multiplying by array of ones
    if gamma == 0:
        return xs_ratio_vals

    # Plain lists/tuples do not support `**`, and integer NumPy arrays refuse
    # negative integer powers, so do the weighting on a float array.
    energy_vals = np.asarray(energy, dtype=float)
    return xs_ratio_vals * energy_vals**(-gamma)
def joined_string(self):
    """Return a short label describing only the joined flavints.

    The label is 'unjoined' when the metadata lists no joined groups;
    otherwise it is the sorted groups' strings, each prefixed by '_G_',
    appended to 'joined'.
    """
    groups = [NuFlavIntGroup(spec)
              for spec in self.metadata['flavints_joined']]
    groups.sort()
    if not groups:
        return 'unjoined'
    labels = [str(grp) for grp in groups]
    return 'joined_G_' + '_G_'.join(labels)
def _compute_outputs(self, inputs=None):
    """Load the sample's events and return them, or histograms of them.

    The sample config is read from the `data_sample_config` param and the
    events are loaded via `load_sample_events`. If `self.output_events` is
    set, the loaded data object is returned as-is; otherwise a MapSet with
    one histogram per requested neutrino group (plus 'muons' and 'noise',
    if requested) is returned. `inputs` is not used.

    Raises
    ------
    NotImplementedError
        If the `keep_criteria` param is not None.
    """
    logging.debug('Entering sample._compute_outputs')

    self.config = from_file(self.params['data_sample_config'].value)
    sample_name = self.config.get('general', 'name')
    logging.trace('{0} sample sample_hash = '
                  '{1}'.format(sample_name, self.sample_hash))
    self.load_sample_events()

    if self.params['keep_criteria'].value is not None:
        # TODO(shivesh)
        raise NotImplementedError(
            'needs check to make sure this works in a DistributionMaker'
        )
        # Not reached until the check above is removed
        self._data.applyCut(self.params['keep_criteria'].value)
        self._data.update_hash()

    if self.output_events:
        return self._data

    maps = []
    if self.neutrinos:
        grouped_nu = self._data.transform_groups(self._output_nu_groups)
        maps.extend(
            grouped_nu.histogram(
                kinds=key,
                binning=self.output_binning,
                weights_col='pisa_weight',
                errors=True,
                name=str(NuFlavIntGroup(key)),
            )
            for key in grouped_nu.keys()
        )

    # Muons first, then noise
    background_kinds = (
        ('muons', self.muons, r'\rm{muons}'),
        ('noise', self.noise, r'\rm{noise}'),
    )
    for kind, requested, tex_label in background_kinds:
        if not requested:
            continue
        maps.append(self._data.histogram(
            kinds=kind,
            binning=self.output_binning,
            weights_col='pisa_weight',
            errors=True,
            name=kind,
            tex=tex_label
        ))

    mapset_name = self.config.get('general', 'name')
    return MapSet(maps=maps, name=mapset_name)
def __init__(self, params, input_binning, output_binning, input_names,
             transform_groups, error_method=None, debug_mode=None,
             disk_cache=None, transforms_cache_depth=20,
             outputs_cache_depth=20):
    """Derive the stage's output names and initialise the base stage.

    `input_names` may be a comma-separated string (spaces are ignored) or
    an iterable of names. Each input name is combined with each of the
    output channels ('cc', 'nc'). If `transform_groups` is None these
    combinations are the output names; otherwise the output names are the
    (unique) transform groups that share at least one flavint with them.
    """
    self.xsec = None
    # Hash of GENIE spline file
    self.xsec_hash = None

    expected_params = ('xsec_file', 'livetime', 'ice_p', 'fid_vol',
                       'mr_h20', 'x_energy_scale')

    if isinstance(input_names, basestring):
        input_names = input_names.replace(' ', '').split(',')

    self.output_channels = ('cc', 'nc')
    all_names = [
        '%s_%s' % (flav, channel)
        for flav, channel in product(input_names, self.output_channels)
    ]

    if transform_groups is not None:
        transform_groups = flavintGroupsFromString(transform_groups)
        output_names = []
        for group in transform_groups:
            member_names = set(str(member) for member in group.flavints)
            if member_names.isdisjoint(all_names):
                continue
            group_name = str(group)
            if group_name not in output_names:
                output_names.append(group_name)
    else:
        output_names = all_names

    self.transform_groups = [NuFlavIntGroup(out_name)
                             for out_name in output_names]

    super(genie, self).__init__(
        use_transforms=True,
        params=params,
        expected_params=expected_params,
        input_names=input_names,
        output_names=output_names,
        error_method=error_method,
        disk_cache=disk_cache,
        outputs_cache_depth=outputs_cache_depth,
        transforms_cache_depth=transforms_cache_depth,
        input_binning=input_binning,
        output_binning=output_binning,
        debug_mode=debug_mode
    )

    self.include_attrs_for_hashes('transform_groups')
def parse_event_type_names(names, return_flags=False):
    """Normalise a collection of event type names.

    Parameters
    ----------
    names : str or iterable of str
        A string is first broken up with `split`. Any entry containing
        'all_nu' is replaced by the string form of every member of
        ALL_NUFLAVINTS. All resulting names are lower-cased.
    return_flags : bool
        If True, also return which categories were found.

    Returns
    -------
    parsed_names : list of str
        Returned on its own if `return_flags` is False.
    (parsed_names, muons, noise, neutrinos) : tuple
        Returned if `return_flags` is True; the three flags are bools.

    Raises
    ------
    ValueError
        If a name neither contains 'muons' or 'noise' nor starts with 'nu'.
    """
    # Split into list if that has not already been done
    if isinstance(names, str):
        names = split(names)

    # Expand the 'all_nu' shorthand, leaving every other entry untouched
    expanded = []
    for entry in names:
        if 'all_nu' not in entry:
            expanded.append(entry)
            continue
        expanded += [str(NuFlavIntGroup(f)) for f in ALL_NUFLAVINTS]
    event_types = [entry.lower() for entry in expanded]

    # Work out which categories are present
    found_muons = found_noise = found_nu = False
    for event_type in event_types:
        if 'muons' in event_type:
            found_muons = True
            continue
        if 'noise' in event_type:
            found_noise = True
            continue
        if not event_type.startswith("nu"):
            raise ValueError(
                "Unrecognised event type '%s' found" % event_type
            )
        found_nu = True

    if not return_flags:
        return event_types
    return event_types, found_muons, found_noise, found_nu
def _compute_outputs(self, inputs=None):
    """Compute histograms for output channels.

    Parameters
    ----------
    inputs : Data
        Events to reweight. Stored as `self._data`; its metadata
        'weight_hash' is deep-copied into `self.weight_hash`.

    Returns
    -------
    The reweighted data object itself if `self.output_events` is set;
    otherwise a MapSet with one histogram per requested neutrino group
    plus, if requested, 'muons' and 'noise'.

    Raises
    ------
    AssertionError
        If `inputs` is not a Data object.

    """
    logging.debug('Entering fit._compute_outputs')
    if not isinstance(inputs, Data):
        raise AssertionError('inputs is not a Data object, instead is '
                             'type {0}'.format(type(inputs)))
    self.weight_hash = deepcopy(inputs.metadata['weight_hash'])
    logging.trace('{0} fit weight_hash = '
                  '{1}'.format(inputs.metadata['name'], self.weight_hash))
    logging.trace('{0} fit fit_hash = '
                  '{1}'.format(inputs.metadata['name'], self.fit_hash))
    self._data = inputs
    self.reweight()

    if self.output_events:
        return self._data

    outputs = []
    if self.neutrinos:
        trans_nu_data = self._data.transform_groups(
            self._output_nu_groups
        )
        # `keys()` rather than the Python-2-only `iterkeys()`, so this runs
        # on both Python 2 and Python 3
        for fig in trans_nu_data.keys():
            outputs.append(
                trans_nu_data.histogram(
                    kinds=fig,
                    binning=self.output_binning,
                    weights_col='pisa_weight',
                    errors=True,
                    name=str(NuFlavIntGroup(fig)),
                )
            )

    if self.muons:
        outputs.append(
            self._data.histogram(
                kinds='muons',
                binning=self.output_binning,
                weights_col='pisa_weight',
                errors=True,
                name='muons',
                tex=text2tex('muons')
            )
        )

    if self.noise:
        outputs.append(
            self._data.histogram(
                kinds='noise',
                binning=self.output_binning,
                weights_col='pisa_weight',
                errors=True,
                name='noise',
                tex=text2tex('noise')
            )
        )

    return MapSet(maps=outputs, name=self._data.metadata['name'])
def _define_interpolant(self, flavintgroup=None):
    """Make sure interpolants exist in `self._interpolants`.

    With `flavintgroup` given, only the interpolant for that group is
    ensured; with None, one interpolant per entry of `self.flavints` is
    ensured. Groups that already have an interpolant are left untouched.

    """
    if flavintgroup is not None:
        wanted_groups = [NuFlavIntGroup(flavintgroup)]
    else:
        wanted_groups = [NuFlavIntGroup(fi) for fi in self.flavints]

    for group in wanted_groups:
        if group not in self._interpolants:
            group_xs = self._combine_xs(group)
            self._interpolants[group] = interp1d(
                x=self.energy,
                y=group_xs,
                kind='linear',
                copy=False,
                bounds_error=True,
                fill_value=0
            )
def validate_params(self, params):
    """Check the `pid_events` param.

    Its value must be a string or an Events object, and the events it
    refers to must have exactly the four joined groupings listed below.

    Raises
    ------
    ValueError
        If the joined groupings of the events differ from the required ones.
    """
    pid_events = params.pid_events.value

    # Check type of pid_events
    assert isinstance(pid_events, (basestring, Events))

    # Check the groupings of the pid_events file
    events = Events(params.pid_events.value)
    required_specs = (
        'nue_cc + nuebar_cc',
        'numu_cc + numubar_cc',
        'nutau_cc + nutaubar_cc',
        'nuall_nc + nuallbar_nc',
    )
    should_be_joined = [NuFlavIntGroup(spec) for spec in required_specs]
    should_be_joined.sort()

    are_joined = [NuFlavIntGroup(spec)
                  for spec in events.metadata['flavints_joined']]
    are_joined.sort()

    if are_joined == should_be_joined:
        return
    raise ValueError('Events passed have %s joined groupings but'
                     ' it is required to have %s joined groupings.'
                     % (are_joined, should_be_joined))
def load_pid_energy_param(source):
    """Load the energy-dependent PID parameterisation.

    Parameters
    ----------
    source : string or mapping
        A string is passed to `from_file` to obtain the mapping; a mapping
        is used directly.

    Returns
    -------
    pid_energy_param_dict : OrderedDict
        Keys are `NuFlavIntGroup`s. Each value is an OrderedDict that maps
        a PID signature to a callable.

    Raises
    ------
    TypeError
        If `source` is neither a string nor a mapping, or if a signature's
        parameterisation is neither a string nor a callable.
    ValueError
        If a parameterisation string does not evaluate to a callable.

    """
    # Obtain the raw specification
    if isinstance(source, Mapping):
        raw_spec = source
    elif isinstance(source, str):
        raw_spec = from_file(source)
    else:
        raise TypeError('`source` must either be string or mapping; got %s'
                        ' instead.' % type(source))

    # One sub-dict per flavintgroup, with signatures as keys and callables
    # as values
    pid_energy_param_dict = OrderedDict()
    for group_label, signature_specs in raw_spec.items():
        per_signature = OrderedDict()
        pid_energy_param_dict[NuFlavIntGroup(group_label)] = per_signature

        for signature, spec in signature_specs.items():
            if callable(spec):
                per_signature[signature] = spec
                continue

            if not isinstance(spec, str):
                raise TypeError(
                    'Group %s PID signature %s parameterization is a "%s"'
                    ' but must be a string or callable.'
                    % (group_label, signature, type(spec))
                )

            # NOTE(review): `eval` executes whatever the source provides;
            # only load parameterisations from trusted files.
            evaluated = eval(spec)
            if not callable(evaluated):
                raise ValueError(
                    'Group %s PID signature %s param spec "%s" does'
                    ' not evaluate to a callable.'
                    % (group_label, signature, spec)
                )
            per_signature[signature] = evaluated

    return pid_energy_param_dict
def __init__(self, params, output_binning, output_names, output_events=True,
             error_method=None, debug_mode=None, disk_cache=None,
             memcache_deepcopy=True, transforms_cache_depth=20,
             outputs_cache_depth=20, fix_truth_variable_names=False):
    """Parse the requested outputs, initialise the base stage and compute
    the outputs.

    Parameters
    ----------
    output_names
        Passed to `parse_event_type_names`, which returns the cleaned names
        together with the muons / noise / neutrinos flags.
    output_events : bool
        If True, `output_binning` is discarded (set to None).
    fix_truth_variable_names : bool
        Stored on the instance; see the comment where it is set.

    Raises
    ------
    AssertionError
        If `output_events` is not a bool.
    """
    # Hash of event sample
    self.sample_hash = None

    expected_params = (
        'data_sample_config', 'dataset', 'keep_criteria',
    )

    output_names, self.muons, self.noise, self.neutrinos = \
        parse_event_type_names(output_names, return_flags=True)

    # Only neutrino names describe flavor/interaction groups; 'muons' and
    # 'noise' entries are tracked through their flags and must not be
    # handed to NuFlavIntGroup.
    self._output_nu_groups = [
        NuFlavIntGroup(name) for name in output_names
        if 'muons' not in name and 'noise' not in name
    ]

    if not isinstance(output_events, bool):
        raise AssertionError(
            'output_events must be of type bool, instead it is supplied '
            'with type {0}'.format(type(output_events)))
    if output_events:
        output_binning = None
    self.output_events = output_events

    super(sample, self).__init__(
        use_transforms=False,
        params=params,
        expected_params=expected_params,
        output_names=output_names,
        error_method=error_method,
        debug_mode=debug_mode,
        disk_cache=disk_cache,
        memcache_deepcopy=memcache_deepcopy,
        outputs_cache_depth=outputs_cache_depth,
        transforms_cache_depth=transforms_cache_depth,
        output_binning=output_binning
    )

    # User can specify that truth variables have their names prefixed with
    # "true_"
    self.fix_truth_variable_names = fix_truth_variable_names
    self.truth_variables = ["energy", "coszen"]
    self.truth_variable_prefix = "true_"

    self._compute_outputs()
def _combine_xs(self, flavintgroup):
    """Combine the cross sections of all flavints in `flavintgroup`.

    The CC members are averaged, the NC members are averaged, and the two
    averages are added. When both CC and NC members are present they *must*
    come from the same flavor(s): combining e.g. numu CC with numubar NC is
    rejected by an assertion, whereas numu and numubar CC together with
    numu and numubar NC is allowed.

    Notes
    -----
    Does not yet implement *Ngen/spectrum-weighted* averages, which are
    necessary when combining cross sections of disparate
    flavor/interaction types from different Monte Carlo simulation runs.

    """
    group = NuFlavIntGroup(flavintgroup)

    # A single flavint needs no combining
    if len(group.flavints) == 1:
        return self[group.flavints[0]]

    cc_members = group.cc_flavints
    nc_members = group.nc_flavints
    if cc_members and nc_members:
        assert group.cc_flavs == group.nc_flavs, \
            'Combining CC and NC but CC flavors do not match NC flavors'

    # An absent interaction type contributes 0 to the total
    averages = []
    for trace_fmt, members in (('cc_flavints = %s', cc_members),
                               ('nc_flavints = %s', nc_members)):
        if not members:
            averages.append(0)
            continue
        logging.trace(trace_fmt % (members,))
        summed = np.sum([self[key] for key in members], axis=0)
        averages.append(summed / len(members))

    tot_xs = averages[0] + averages[1]
    logging.trace('mean(tot_xs) = %s' % (np.mean(tot_xs),))
    return tot_xs
def get_xs_value(self, flavintgroup, energy):
    """Evaluate the (combined) cross section of `flavintgroup` at `energy`.

    Parameters
    ----------
    flavintgroup : NuFlavIntGroup or convertible thereto
    energy : numeric or sequence thereof
        Energy (or energies) at which to evaluate total cross section, in
        units of GeV

    Returns
    -------
    Combined cross section for flavor/interaction types in units of m^2,
    evaluated at each energy. Shape of returned value matches that of
    passed `energy` parameter.

    """
    group = NuFlavIntGroup(flavintgroup)
    already_defined = group in self._interpolants
    if not already_defined:
        self._define_interpolant(flavintgroup=group)
    interpolant = self._interpolants[group]
    return interpolant(energy)
def test_CrossSections(outdir=None):
    """Unit tests for CrossSections class

    Parameters
    ----------
    outdir : None or string
        Directory into which plots are saved. If None, a temporary
        directory is created and removed again when the test finishes.

    Raises
    ------
    ValueError
        If the averaged ratio of (numu_cc + numu_nc) to itself is not 1.

    """
    from shutil import rmtree
    from tempfile import mkdtemp

    remove_dir = False
    if outdir is None:
        remove_dir = True
        outdir = mkdtemp()

    try:
        # "Standard" location of cross sections file in PISA; retrieve 2.6.4
        # for testing purposes
        pisa_xs_file = 'cross_sections/cross_sections.json'
        xs = CrossSections(ver='genie_2.6.4', xsec=pisa_xs_file)

        # Location of the root file to use (not included in PISA at the
        # moment)
        root_xs_file = find_resource(os.path.join(
            'cross_sections', 'genie_xsec_H2O.root'
        ))

        # Load the XS from ROOT if the file is available. The comparison
        # against the XS stored in PISA is currently disabled.
        if os.path.isfile(root_xs_file):
            xs_from_root = CrossSections.new_from_root(root_xs_file,
                                                       ver='genie_2.6.4')
            logging.info('Found and loaded ROOT source cross sections file %s',
                         root_xs_file)
            #assert xs_from_root.allclose(xs, rtol=1e-7)

        # Check XS ratio for numu_cc to numu_cc + numu_nc (user must inspect)
        kg0 = NuFlavIntGroup('numu_cc')
        kg1 = NuFlavIntGroup('numu_nc')
        logging.info(
            r'\int_1^80 xs(numu_cc) E^{-1} dE = %e',
            xs.get_xs_ratio_integral(kg0, None, e_range=[1, 80], gamma=1)
        )
        logging.info(
            '(int E^{-gamma} * (sigma_numu_cc)/int(sigma_(numu_cc+numu_nc)) dE)'
            ' / (int E^{-gamma} dE) = %e',
            xs.get_xs_ratio_integral(kg0, kg0+kg1, e_range=[1, 80],
                                     gamma=1, average=True)
        )

        # Check that XS ratio for numu_cc+numu_nc to the same is 1.0
        int_val = xs.get_xs_ratio_integral(kg0+kg1, kg0+kg1,
                                           e_range=[1, 80],
                                           gamma=1, average=True)
        if not recursiveEquality(int_val, 1):
            raise ValueError('Integral of nc + cc should be 1.0; get %e'
                             ' instead.' % int_val)

        # Plot all cross sections stored in PISA xs file
        try:
            alldata = from_file(pisa_xs_file)
            xs_versions = alldata.keys()
            for ver in xs_versions:
                xs = CrossSections(ver=ver, xsec=pisa_xs_file)
                xs.plot(save=os.path.join(
                    outdir, 'pisa_' + ver + '_nuxCCNC_H2O_cross_sections.pdf'
                ))
        except ImportError as exc:
            logging.debug('Could not plot; possible that matplotlib not'
                          ' installed. ImportError: %s', exc)

    finally:
        if remove_dir:
            rmtree(outdir)
def get_xs_ratio_integral(self, flavintgroup0, flavintgroup1, e_range,
                          gamma=0, average=False):
    """Energy-spectrum-weighted integral of (possibly a ratio of)
    (possibly-combined) flavor/interaction type cross sections.

    Parameters
    ----------
    flavintgroup0 : NuFlavIntGroup or convertible thereto
        Flavor(s)/interaction type(s) for which to combine cross sections
        for numerator of ratio

    flavintgroup1 : None, NuFlavIntGroup or convertible thereto
        Flavor(s)/interaction type(s) for which to combine cross sections
        for denominator of ratio. If None is passed, the denominator of
        the "ratio" is effectively 1.

    e_range
        Range of energy over which to integrate (GeV); its minimum and
        maximum are used as the integration limits.

    gamma : float >= 0
        Power law spectral index used for weighting the integral,
        E^{-`gamma`}. Note that `gamma` should be >= 0.

    average : bool
        If True, return the average of the cross section (ratio)
        If False, return the integral of the cross section (ratio)

    See also
    --------
    See _combine_xs for details on how flavints are combined.

    """
    lower, upper = min(e_range), max(e_range)

    assert lower > 0, '`e_range` must lie strictly above 0'
    assert upper > lower, \
        'max(`e_range`) must be strictly larger than min(`e_range`)'
    assert gamma >= 0, '`gamma` must be >= 0'

    groups = [NuFlavIntGroup(flavintgroup0)]
    if flavintgroup1 is not None:
        groups.append(NuFlavIntGroup(flavintgroup1))

    # Interpolants are needed to get the xs at the range's endpoints
    for group in groups:
        self._define_interpolant(flavintgroup=group)

    # Energy samples are taken from the numerator's interpolant
    sample_energy = self._interpolants[groups[0]].x
    sample_xs = [self._interpolants[group].y for group in groups]
    for samples in sample_xs:
        logging.trace('mean(xs_data) = %e' % np.mean(samples))

    # Select the samples lying strictly inside the requested range
    inside = (sample_energy > lower) & (sample_energy < upper)

    # Cross sections exactly at the endpoints
    endpoint_xs = [self._interpolants[group]((lower, upper))
                   for group in groups]
    for at_lower, at_upper in ((ends[0], ends[1]) for ends in endpoint_xs):
        logging.trace('xs_emin = %e, xs_emax = %e' % (at_lower, at_upper))

    # Put the endpoints around the interior samples
    energy = np.concatenate(([lower], sample_energy[inside], [upper]))
    padded_xs = [
        np.concatenate(([ends[0]], samples[inside], [ends[1]]))
        for ends, samples in zip(endpoint_xs, sample_xs)
    ]
    xs = padded_xs[0]
    if len(padded_xs) != 1:
        xs = xs / padded_xs[1]

    # Weight xsec (or ratio) by energy spectrum
    unweighted = gamma == 0
    if unweighted:
        spectrum = None
        wtd_xs = xs
    else:
        spectrum = energy**(-gamma)
        wtd_xs = xs*spectrum
    logging.trace('mean(wtd_xs) = %e' % np.mean(wtd_xs))

    # Integrate via trapezoidal rule
    wtd_xs_integral = np.trapz(y=wtd_xs, x=energy)
    logging.trace('wtd_xs_integral = %e' % wtd_xs_integral)

    if not average:
        return wtd_xs_integral

    # The average is the integral divided by the integral of the weight
    # function over the same energy interval
    if unweighted:
        # Trivial case: the weight function is 1
        norm = upper - lower
    else:
        # Otherwise use trapezoidal rule to approximate integral
        norm = np.trapz(y=spectrum, x=energy)
    xs_average = wtd_xs_integral / norm
    logging.trace('xs_average = %e' %(xs_average))
    return xs_average
def load_reco_param(source):
    """Load reco parameterisation (energy-dependent) from file or dictionary.

    Parameters
    ----------
    source : string or mapping
        Source of the parameterization. If string, treat as file path or
        resource location and load from the file; this must yield a mapping.
        If `source` is a mapping, it is used directly (and is not modified).
        See notes below on format.

    Returns
    -------
    reco_params : OrderedDict
        Keys are `NuFlavIntGroup`s and values are dicts mapping each reco
        dimension to a list of distribution property dicts. Each of these
        has a 'dist', a 'fraction' and a 'kwargs' key: 'dist' holds the
        `scipy.stats` distribution, 'fraction' holds a callable, and
        'kwargs' holds a dict whose values are callables or scalars.

    Raises
    ------
    TypeError
        If `source`, a dimension, 'dist', 'fraction', 'kwargs' or a kwarg
        spec has an unsupported type.
    ValueError
        If a dimension is not recognised, a required key is missing, or
        'dist' is an empty string.
    AttributeError
        If 'dist' does not name an attribute of `scipy.stats`.

    Notes
    -----
    The mapping passed via `source` or loaded therefrom must have the format:
        {
            <flavintgroup_string>:
                {
                    <dimension_string>:[
                        {
                            "dist": dist_id,
                            "fraction": val,
                            "kwargs": {
                                "loc": val,
                                "scale": val,
                                ...
                            }
                        },
                    ...
                    ]
                },
            <flavintgroup_string>:
                ...
        }

    `flavintgroup_string`s must be parsable by
    pisa.utils.flavInt.NuFlavIntGroup. Note that the `transform_groups`
    defined in a pipeline config file using this must match the groupings
    defined above.

    `dimension_string`s denote the observables/dimensions whose reco error
    distribution is parameterised (`"energy"` or `"coszen"`).

    `dist_id` needs to be a string identifying a probability
    distribution/statistical function provided by `scipy.stats`.
    No implicit assumptions about the distribution will be made if the
    `"dist"` key is missing.

    `"fraction"` holds the relative weight of the distribution. For a given
    dimension, the sum of all fractions present must be 1.

    Valid kwargs for distributions must at least include `"loc"` and
    `"scale"` - these will be passed into the respective `scipy.stats`
    function.

    `val`s can be one of the following:
        - Callable with one argument
        - String such that `eval(val)` yields a callable with one argument
        - (kwargs only) a scalar

    Strings are passed to `eval`, so parameterisations must only be loaded
    from trusted sources.

    """
    if not (source is None or isinstance(source, (basestring, Mapping))):
        raise TypeError('`source` must be string, mapping, or None')

    if isinstance(source, basestring):
        orig_dict = from_file(source)
    elif isinstance(source, Mapping):
        orig_dict = source
    else:
        raise TypeError('Cannot load reco parameterizations from a %s'
                        % type(source))

    valid_dimensions = ('coszen', 'energy')
    required_keys = ('dist', 'fraction', 'kwargs')

    # Build dict of parameterizations (each a callable) per flavintgroup
    reco_params = OrderedDict()
    for flavint_key, dim_dict in orig_dict.items():
        flavintgroup = NuFlavIntGroup(flavint_key)
        reco_params[flavintgroup] = {}
        for dimension in dim_dict:
            dim_dist_list = []

            if not isinstance(dimension, basestring):
                raise TypeError("The dimension needs to be given as a string!"
                                " Allowed: %s." % (valid_dimensions,))

            if dimension not in valid_dimensions:
                raise ValueError("Dimension '%s' not recognised!" % dimension)

            for dist_dict in dim_dict[dimension]:
                dist_spec_dict = {}

                # allow reading in even if kwargs not present - computation of
                # transform will fail because "loc" and "scale" hard-coded
                # requirement
                for required in required_keys:
                    if required not in dist_dict:
                        raise ValueError("Found distribution property dict "
                                         "without required '%s' key for "
                                         "%s - %s!"
                                         % (required, flavintgroup, dimension))

                for k in dist_dict:
                    if k not in required_keys:
                        logging.warning("Unrecognised key in distribution"
                                        " property dict: '%s'" % k)

                dist_spec = dist_dict['dist']

                if not isinstance(dist_spec, basestring):
                    raise TypeError(" The resolution function needs to be"
                                    " given as a string!")

                if not dist_spec:
                    raise ValueError("Empty string found for resolution"
                                     " function!")

                try:
                    dist = getattr(stats, dist_spec.lower())
                except AttributeError:
                    # Report the scipy version to help diagnose distributions
                    # that only exist in other releases
                    try:
                        import scipy
                        sp_ver_str = scipy.__version__
                    except Exception:
                        sp_ver_str = "N/A"
                    raise AttributeError("'%s' is not a valid distribution"
                                         " from scipy.stats (your scipy"
                                         " version: '%s')."
                                         % (dist_spec.lower(), sp_ver_str))
                logging.debug("Found %s - %s resolution function: '%s'"
                              % (flavintgroup, dimension, dist.name))

                dist_spec_dict['dist'] = dist

                frac = dist_dict['fraction']

                if isinstance(frac, basestring):
                    frac_func = eval(frac)
                elif callable(frac):
                    frac_func = frac
                else:
                    raise TypeError(
                        "Expected 'fraction' to be either a string"
                        " that can be interpreted by eval or a callable."
                        " Got '%s'." % type(frac))

                dist_spec_dict['fraction'] = frac_func

                kwargs = dist_dict['kwargs']

                if not isinstance(kwargs, dict):
                    raise TypeError(
                        "'kwargs' must hold a dictionary. Got '%s' instead."
                        % type(kwargs))

                # Collect the evaluated kwargs in a new dict so that the
                # mapping supplied by the caller is left untouched
                evaluated_kwargs = {}
                for kwarg, kwarg_spec in kwargs.items():
                    if isinstance(kwarg_spec, basestring):
                        kwarg_eval = eval(kwarg_spec)
                    elif callable(kwarg_spec) or isscalar(kwarg_spec):
                        kwarg_eval = kwarg_spec
                    else:
                        raise TypeError(
                            "Expected kwarg '%s' spec to be either a string"
                            " that can be interpreted by eval, a callable or"
                            " a scalar. Got '%s'."
                            % (kwarg, type(kwarg_spec)))

                    evaluated_kwargs[kwarg] = kwarg_eval

                dist_spec_dict['kwargs'] = evaluated_kwargs

                dim_dist_list.append(dist_spec_dict)

            reco_params[flavintgroup][dimension] = dim_dist_list

    return reco_params
def __init__(self, params, particles, transform_groups,
             sum_grouped_flavints, input_binning, output_binning,
             memcache_deepcopy, transforms_cache_depth, outputs_cache_depth,
             input_names=None, error_method=None, debug_mode=None):
    """Work out input/output names, initialise the base stage and set up
    the bookkeeping attributes used for the parameterisations.

    Raises
    ------
    NotImplementedError
        If `particles` is 'muons'.
    ValueError
        If `particles` is neither 'neutrinos' nor 'muons' (only reachable
        when assertions are disabled).
    """
    assert particles in ('neutrinos', 'muons')
    # Whether stage is instantiated to process neutrinos or muons
    self.particles = particles

    # Particle/interaction types to group for computing transforms
    self.transform_groups = flavintGroupsFromString(transform_groups)

    assert isinstance(sum_grouped_flavints, bool)
    self.sum_grouped_flavints = sum_grouped_flavints

    is_nu = particles == 'neutrinos'

    # All of the following params (and no more) must be passed via the
    # `params` argument.
    expected_params = [
        'aeff_energy_paramfile',
        'aeff_coszen_paramfile',
        'livetime',
        'aeff_scale',
    ]
    if is_nu:
        expected_params.append('nutau_cc_norm')

    if isinstance(input_names, str):
        input_names = input_names.replace(' ', '').split(',')
    elif input_names is None and is_nu:
        input_names = ('nue', 'nuebar', 'numu', 'numubar',
                       'nutau', 'nutaubar')

    if self.particles == 'muons':
        raise NotImplementedError
    if self.particles != 'neutrinos':
        raise ValueError('Particle type `%s` is not valid'
                         % self.particles)

    # TODO: if sum_grouped_flavints, then the output names should be
    # e.g. 'nue_cc_nuebar_cc' and 'nue_nc_nuebar_nc' if nue and nuebar
    # are grouped... (?)
    if self.sum_grouped_flavints:
        output_names = [str(grp) for grp in self.transform_groups]
    else:
        output_names = [str(flavint)
                        for flavint in NuFlavIntGroup(input_names)]

    logging.trace('transform_groups = %s', self.transform_groups)
    logging.trace('output_names = %s', ' :: '.join(output_names))

    super().__init__(
        use_transforms=True,
        params=params,
        expected_params=expected_params,
        input_names=input_names,
        output_names=output_names,
        error_method=error_method,
        memcache_deepcopy=memcache_deepcopy,
        outputs_cache_depth=outputs_cache_depth,
        transforms_cache_depth=transforms_cache_depth,
        input_binning=input_binning,
        output_binning=output_binning,
        debug_mode=debug_mode
    )

    for attr_name in ('particles', 'transform_groups'):
        self.include_attrs_for_hashes(attr_name)

    # Input energy-binning weighted centers
    self.ecen = self.input_binning.true_energy.weighted_centers.magnitude

    # Whether the stage has true_coszen input binning
    self.has_cz = False

    # Input coszen-binning weighted centers (or None if no coszen dim)
    self.czcen = None

    if 'true_coszen' in self.input_binning.names:
        self.has_cz = True
        coszen_centers = self.input_binning.true_coszen.weighted_centers
        self.czcen = coszen_centers.m_as('dimensionless')

    self._param_hashes = {'energy': None, 'coszen': None}
    self.aeff_params = {'energy': {}}
    if self.has_cz:
        self.aeff_params['coszen'] = None
def compute_transforms(service):
    """Compute effective area transforms, taking aeff systematics into account.

    Systematics are: `aeff_scale`, `livetime`, and `nutau_cc_norm`

    Each of the service's nominal transforms is scaled by
    `aeff_scale` * `livetime` (the latter in seconds). For neutrinos,
    transforms whose output group contains nutau_cc or nutaubar_cc are
    additionally scaled by `nutau_cc_norm`.

    Parameters
    ----------
    service
        Object providing `params`, `particles`, `nominal_transforms` and,
        for neutrinos, `transform_groups`. If it has a
        `sum_grouped_flavints` attribute, that is forwarded as `sum_inputs`
        of each new transform; otherwise False is used.

    Returns
    -------
    TransformSet of the scaled BinnedTensorTransform objects

    Raises
    ------
    ValueError
        If `nutau_cc_norm` != 1 while nutau CC / nutaubar CC are grouped
        together with other flav/ints.

    """
    aeff_scale = service.params.aeff_scale.m_as('dimensionless')
    livetime_s = service.params.livetime.m_as('sec')
    base_scale = aeff_scale * livetime_s

    logging.trace('livetime = %s --> %s sec',
                  service.params.livetime.value, livetime_s)

    if service.particles == 'neutrinos':
        # NOTE(review): the grouping check below is assumed to run only the
        # first time (inside the `hasattr` guard), with the flag cached on
        # the service afterwards -- confirm against the original layout.
        if not hasattr(service, 'nutau_cc_norm_must_be_one'):
            service.nutau_cc_norm_must_be_one = False
            """If any flav/ints besides nutau_cc and nutaubar_cc are grouped with one or both of those for transforms, then a `nutau_cc_norm` != 1 cannot be applied."""

            nutaucc_and_nutaubarcc = set(NuFlavIntGroup('nutau_cc+nutaubar_cc'))
            for group in service.transform_groups:
                # If nutau_cc, nutaubar_cc, or both are the group and other flavors
                # are present, nutau_cc_norm must be one!
                group_set = set(group)
                if group_set.intersection(nutaucc_and_nutaubarcc) and \
                        group_set.difference(nutaucc_and_nutaubarcc):
                    service.nutau_cc_norm_must_be_one = True

        nutau_cc_norm = service.params.nutau_cc_norm.m_as('dimensionless')
        if nutau_cc_norm != 1 and service.nutau_cc_norm_must_be_one:
            raise ValueError(
                '`nutau_cc_norm` = %e but can only be != 1 if nutau CC and'
                ' nutaubar CC are separated from other flav/ints.'
                ' Transform groups are: %s'
                % (nutau_cc_norm, service.transform_groups)
            )

    # Not every service defines `sum_grouped_flavints`; default to False
    if hasattr(service, 'sum_grouped_flavints'):
        sum_grouped_flavints = service.sum_grouped_flavints
    else:
        sum_grouped_flavints = False

    new_transforms = []
    for transform in service.nominal_transforms:
        this_scale = base_scale
        if service.particles == 'neutrinos':
            out_nfig = NuFlavIntGroup(transform.output_name)
            if 'nutau_cc' in out_nfig or 'nutaubar_cc' in out_nfig:
                this_scale *= nutau_cc_norm

        # Skip the multiplication when the overall scale is exactly 1
        if this_scale != 1:
            aeff_transform = transform.xform_array * this_scale
        else:
            aeff_transform = transform.xform_array

        new_xform = BinnedTensorTransform(
            input_names=transform.input_names,
            output_name=transform.output_name,
            input_binning=transform.input_binning,
            output_binning=transform.output_binning,
            xform_array=aeff_transform,
            sum_inputs=sum_grouped_flavints
        )
        new_transforms.append(new_xform)

    return TransformSet(new_transforms)
def _compute_transforms(self):
    """Generate reconstruction "smearing kernels" by histogramming true and
    reconstructed variables from a Monte Carlo events file.

    The resulting transform is a 2N-dimensional histogram, where N is the
    dimensionality of the input binning. The transform maps the truth bin
    counts to the reconstructed bin counts.

    I.e., for the case of 1D input binning, the ith element of the
    reconstruction kernel will be a map showing the distribution of events
    over all the reco space from truth bin i. This will be normalised to the
    total number of events in truth bin i.

    Params read: `e_res_scale`, `cz_res_scale`, `e_reco_bias`,
    `cz_reco_bias`, `res_scale_ref`, `reco_events`,
    `transform_events_keep_criteria` and `reco_weights_name`.

    Returns
    -------
    TransformSet
        One BinnedTensorTransform per output (when `sum_grouped_flavints` is
        set) or per input name belonging to each transform group.

    Notes
    -----
    The histograms are weighted by the column named by the
    `reco_weights_name` param (passed as `weights_col`).

    The 'reco_energy' and 'reco_coszen' columns of `self.events` for the
    representative flavint of each transform group are overwritten in place
    with the scaled/biased values before histogramming.

    """
    e_res_scale = self.params.e_res_scale.value.m_as('dimensionless')
    cz_res_scale = self.params.cz_res_scale.value.m_as('dimensionless')
    e_reco_bias = self.params.e_reco_bias.value.m_as('GeV')
    cz_reco_bias = self.params.cz_reco_bias.value.m_as('dimensionless')
    res_scale_ref = self.params.res_scale_ref.value.strip().lower()
    # Only scaling resolutions about zero error is supported so far
    assert res_scale_ref in ['zero']  # TODO: , 'mean', 'median']

    self.load_events(self.params.reco_events)
    self.cut_events(self.params.transform_events_keep_criteria)

    # Computational units must be the following for compatibility with
    # events file
    comp_units = dict(true_energy='GeV', true_coszen=None,
                      true_azimuth='rad', reco_energy='GeV',
                      reco_coszen=None, reco_azimuth='rad', pid=None)

    # Select only the units in the input/output binning for conversion
    # (can't pass more than what's actually there)
    in_units = {
        dim: unit for dim, unit in comp_units.items()
        if dim in self.input_binning
    }
    out_units = {
        dim: unit for dim, unit in comp_units.items()
        if dim in self.output_binning
    }

    # These binnings will be in the computational units defined above
    input_binning = self.input_binning.to(**in_units)
    output_binning = self.output_binning.to(**out_units)

    xforms = []
    for xform_flavints in self.transform_groups:
        logging.debug("Working on %s reco kernels" % xform_flavints)

        # The first flavint of the group is used to address the group's
        # events
        repr_flavint = xform_flavints[0]

        true_energy = self.events[repr_flavint]['true_energy']
        true_coszen = self.events[repr_flavint]['true_coszen']
        reco_energy = self.events[repr_flavint]['reco_energy']
        reco_coszen = self.events[repr_flavint]['reco_coszen']
        e_reco_err = reco_energy - true_energy
        cz_reco_err = reco_coszen - true_coszen

        # Scale the reco error about zero and add the bias; the result
        # replaces the stored reco columns.
        # NOTE(review): this mutates `self.events` in place; whether repeated
        # calls compound the scaling depends on `load_events`/`cut_events`
        # reloading the data -- confirm.
        if self.params.res_scale_ref.value.strip().lower() == 'zero':
            self.events[repr_flavint]['reco_energy'] = (
                true_energy + e_reco_err * e_res_scale + e_reco_bias)
            self.events[repr_flavint]['reco_coszen'] = (
                true_coszen + cz_reco_err * cz_res_scale + cz_reco_bias)

        # True (input) + reco {+ PID} (output)-dimensional histogram
        # is the basis for the transformation
        reco_kernel = self.events.histogram(
            kinds=xform_flavints,
            binning=input_binning * output_binning,
            weights_col=self.params.reco_weights_name.value,
            errors=(self.error_method not in [None, False]))
        # Extract just the numpy array to work with
        reco_kernel = reco_kernel.hist

        # This takes into account the correct kernel normalization:
        # What this means is that we have to normalise the reco map
        # to the number of events in the truth bin.
        #
        # I.e., we have N events from the truth bin which then become
        # spread out over the whole map due to reconstruction.
        # The normalisation is dividing this map by N.
        #
        # Previously this was hard-coded for 2 dimensions, but I have tried
        # to generalise it to arbitrary dimensionality.

        # Truth-only (N-dimensional) histogram will be used for
        # normalization (so transform is in terms of fraction-of-events in
        # input--i.e. truth--bin). Sum over the input dimensions.
        true_event_counts = self.events.histogram(
            kinds=xform_flavints,
            binning=input_binning,
            weights_col=self.params.reco_weights_name.value,
            errors=(self.error_method not in [None, False]))
        # Extract just the numpy array to work with
        true_event_counts = true_event_counts.hist

        # If there weren't any events in the input (true_*) bin, make this
        # bin have no effect -- i.e., populate all output bins
        # corresponding to the input bin with zeros via `nan_to_num`.
        with np.errstate(divide='ignore', invalid='ignore'):
            true_event_counts[true_event_counts == 0] = np.nan
            norm_factors = 1.0 / true_event_counts
            norm_factors = np.nan_to_num(norm_factors)

        # Numpy broadcasts lower-dimensional things to higher dimensions
        # from last dimension to first; if we simply mult the reco_kernel
        # by norm_factors, this will apply the normalization to the
        # __output__ dimensions rather than the input dimensions. Add
        # "dummy" dimensions to norm_factors where we want the "extra
        # dimensions": at the end.
        # (`dim` itself is unused; one trailing axis is appended per output
        # dimension.)
        for dim in self.output_binning:
            norm_factors = np.expand_dims(norm_factors, axis=-1)

        # Apply the normalization to the kernels
        reco_kernel *= norm_factors

        # Sanity checks: no negative entries, and the fractions leaving any
        # truth bin sum to at most 1 (up to rounding)
        assert np.all(reco_kernel >= 0), \
            'number of elements less than 0 = %d' \
            % np.sum(reco_kernel < 0)
        sum_over_axes = tuple(range(-len(self.output_binning), 0))
        totals = np.sum(reco_kernel, axis=sum_over_axes)
        assert np.all(
            totals <= 1 + 1e-14), 'max = ' + str(np.max(totals) - 1)

        # Now populate this transform to each input for which it applies.

        if self.sum_grouped_flavints:
            # Collect every input whose flavints overlap this group; these
            # inputs are summed by the transform
            xform_input_names = []
            for input_name in self.input_names:
                input_flavs = NuFlavIntGroup(input_name)
                if len(set(xform_flavints).intersection(input_flavs)) > 0:
                    xform_input_names.append(input_name)

            for output_name in self.output_names:
                if output_name not in xform_flavints:
                    continue
                xform = BinnedTensorTransform(
                    input_names=xform_input_names,
                    output_name=output_name,
                    input_binning=self.input_binning,
                    output_binning=self.output_binning,
                    xform_array=reco_kernel,
                    sum_inputs=self.sum_grouped_flavints)
                xforms.append(xform)
        else:
            # NOTES:
            # * Output name is same as input name
            # * Use `self.input_binning` and `self.output_binning` so maps
            #   are returned in user-defined units (rather than
            #   computational units, which are attached to the non-`self`
            #   versions of these binnings).
            for input_name in self.input_names:
                if input_name not in xform_flavints:
                    continue
                xform = BinnedTensorTransform(
                    input_names=input_name,
                    output_name=input_name,
                    input_binning=self.input_binning,
                    output_binning=self.output_binning,
                    xform_array=reco_kernel,
                )
                xforms.append(xform)

    return TransformSet(transforms=xforms)
def load_neutrino_events(config, dataset):
    """Load all neutrino events of `dataset` and combine them into one Data.

    Two layouts are handled:

    * the generator-level dataset ('neutrinos' + SEP + 'gen_lvl'), whose
      file list is described by a separate config named by the
      `gen_cfg_file` option of the `dataset` section;
    * any other dataset, whose files are located from the 'general' and
      'neutrinos' sections of `config` (for the 'nominal' dataset, the file
      prefix is the one shared by the nominal run of every systematic).

    Parameters
    ----------
    config : config-parser-like object
        Must provide `get(section, option)` and `items(section)`.
    dataset : string
        Name of the dataset to load.

    Returns
    -------
    Data
        Sum of the per-flavour event containers, with metadata
        {'name': <sample name>, 'sample': dataset}.

    Raises
    ------
    AssertionError
        If the nominal runs of the systematics point to more than one file
        prefix.

    """
    per_flavour = []

    if dataset == 'neutrinos%sgen_lvl' % SEP:
        gen_cfg = from_file(config.get(dataset, 'gen_cfg_file'))
        name = gen_cfg.get('general', 'name')
        datadir = gen_cfg.get('general', 'datadir')
        event_types = split(gen_cfg.get('general', 'event_type'))
        weights = split(gen_cfg.get('general', 'weights'))
        weight_units = gen_cfg.get('general', 'weight_units')
        keep_keys = split(gen_cfg.get('general', 'keep_keys'))
        aliases = gen_cfg.items('aliases')

        logging.info('Extracting neutrino dataset "{0}" from generator '
                     'level sample "{1}"'.format(dataset, name))

        for position, event_type in enumerate(event_types):
            flavints = NuFlavIntGroup(event_type).flavints
            events_file = datadir + gen_cfg.get(event_type, 'filename')
            per_flavour.append(
                sample.load_from_nu_file(
                    events_file, flavints, weights[position],
                    weight_units, keep_keys, aliases
                )
            )
    else:
        name = config.get('general', 'name')
        flavours = split(config.get('neutrinos', 'flavours'))
        weights = split(config.get('neutrinos', 'weights'))
        weight_units = config.get('neutrinos', 'weight_units')
        sys_list = split(config.get('neutrinos', 'sys_list'))
        base_prefix = config.get('neutrinos', 'baseprefix')
        keep_keys = split(config.get('neutrinos', 'keep_keys'))
        aliases = config.items('neutrinos%saliases' % SEP)

        logging.info('Extracting neutrino dataset "{0}" from sample '
                     '"{1}"'.format(dataset, name))

        # The literal string 'None' in the config means "no base prefix"
        if base_prefix == 'None':
            base_prefix = ''

        def nominal_file_prefix(systematic):
            """File prefix of the nominal run of one systematic."""
            section = 'neutrinos%s%s' % (SEP, systematic)
            nominal_run = config.get(section, 'nominal')
            return config.get(section + SEP + nominal_run, 'file_prefix')

        for position, flavour in enumerate(flavours):
            code = int(flavour)
            # Particle and antiparticle are stored in the same file
            flavints = NuFlavIntGroup(code, -code).flavints

            if dataset == 'nominal':
                prefixes = [nominal_file_prefix(s) for s in sys_list]
                if len(set(prefixes)) > 1:
                    raise AssertionError(
                        'Choice of nominal file is ambigous. Nominal '
                        'choice of systematic parameters must coincide '
                        'with one and only one file. Options found are: '
                        '{0}'.format(prefixes)
                    )
                file_prefix = flavour + prefixes[0]
            else:
                file_prefix = flavour + config.get(dataset, 'file_prefix')

            events_file = path.join(
                config.get('general', 'datadir'), base_prefix + file_prefix
            )
            per_flavour.append(
                sample.load_from_nu_file(
                    events_file, flavints, weights[position],
                    weight_units, keep_keys, aliases
                )
            )

    return Data(
        reduce(add, per_flavour),
        metadata={'name': name, 'sample': dataset}
    )
def __init__(self, params, particles, transform_groups,
             sum_grouped_flavints, input_binning, output_binning,
             memcache_deepcopy, transforms_cache_depth, outputs_cache_depth,
             input_names=None, error_method=None, debug_mode=None):
    """Set up the stage: record the particle type and transform grouping,
    work out the expected params and the input/output names, then hand
    everything to the parent class initializer.

    Parameters
    ----------
    params
        Passed through to the parent class; must contain exactly the
        expected params assembled below.
    particles : string
        'neutrinos' or 'muons' (muons raise NotImplementedError).
    transform_groups : string
        Parsed with `flavintGroupsFromString`.
    sum_grouped_flavints : bool
        If true, one output is produced per transform group; otherwise one
        per flavint of the inputs.
    input_binning, output_binning, memcache_deepcopy,
    transforms_cache_depth, outputs_cache_depth, error_method, debug_mode
        Passed through unchanged to the parent class.
    input_names : None, string, or sequence of strings
        Comma-separated string is split (spaces removed). If None and
        `particles` is 'neutrinos', the six nu/nubar flavours are used.

    Raises
    ------
    NotImplementedError
        If `particles` is 'muons'.
    ValueError
        If `self.particles` is neither 'neutrinos' nor 'muons'.

    """
    assert particles in ['neutrinos', 'muons']

    # Whether stage is instantiated to process neutrinos or muons
    self.particles = particles

    # Particle/interaction types to group for computing transforms
    self.transform_groups = flavintGroupsFromString(transform_groups)

    self.sum_grouped_flavints = sum_grouped_flavints

    is_neutrinos = particles == 'neutrinos'

    # All of the following params (and no more) must be passed via the
    # `params` argument.
    expected_params = [
        'aeff_events',
        'livetime',
        'aeff_scale',
        'aeff_e_smooth_factor',
        'aeff_cz_smooth_factor',
        'transform_events_keep_criteria',
    ]
    if is_neutrinos:
        expected_params.append('nutau_cc_norm')

    if isinstance(input_names, str):
        input_names = input_names.replace(' ', '').split(',')
    elif input_names is None and is_neutrinos:
        input_names = ('nue', 'nuebar', 'numu', 'numubar',
                       'nutau', 'nutaubar')

    # Define the names of objects expected in inputs and produced as
    # outputs
    if self.particles == 'muons':
        raise NotImplementedError
    if self.particles != 'neutrinos':
        raise ValueError('Particle type `%s` is not valid'
                         % self.particles)

    if self.sum_grouped_flavints:
        named_objects = self.transform_groups
    else:
        named_objects = NuFlavIntGroup(input_names)
    output_names = [str(obj) for obj in named_objects]

    logging.trace('transform_groups = %s' % self.transform_groups)
    logging.trace('output_names = %s' % ' :: '.join(output_names))

    # The parent class initializer does the bulk of the setup
    super().__init__(
        use_transforms=True,
        params=params,
        expected_params=expected_params,
        input_names=input_names,
        output_names=output_names,
        error_method=error_method,
        memcache_deepcopy=memcache_deepcopy,
        outputs_cache_depth=outputs_cache_depth,
        transforms_cache_depth=transforms_cache_depth,
        input_binning=input_binning,
        output_binning=output_binning,
        debug_mode=debug_mode
    )

    # Can do these now that binning has been set up in call to Stage's init
    for attr in ('particles', 'transform_groups'):
        self.include_attrs_for_hashes(attr)
def load_aeff_param(source):
    """Load aeff parameterisation (energy- or coszen-dependent) from file or
    dictionary.

    Parameters
    ----------
    source : string or mapping
        Source of the parameterization. If string, treat as file path or
        resource location and load from the file; this must yield a mapping.
        If `source` is a mapping, it is used directly. See notes below on
        format.

    Returns
    -------
    aeff_params : OrderedDict
        Keys are NuFlavIntGroup objects (built from the keys of the source
        mapping) and values are the callables that produce aeff when called
        with energy or coszen values.

    Raises
    ------
    TypeError
        If `source` is neither a string nor a mapping (this includes None),
        if a string spec does not evaluate to a callable, or if a spec is of
        an unsupported type.
    ValueError
        If a mapping spec lacks "aeff" or lacks both "energy" and "coszen".

    Notes
    -----
    The mapping passed via `source` or loaded therefrom must have the format:
        {
            <flavintgroup_string>: val,
            <flavintgroup_string>: val,
            ...
        }

    `flavintgroup_string`s must be parsable by
    pisa.utils.flavInt.NuFlavIntGroup. Note that the `transform_groups`
    defined in a pipeline config file using this must match the groupings
    defined above.

    `val`s can be one of the following:
        - Callable with one argument
        - String such that `eval(val)` yields a callable with one argument
        - Mapping with the format:
            {
                <"energy" or "coszen">: [sequence of values],
                "aeff": [sequence of values]
            }
          the two sequences are used to form a linear interpolant callable
          that maps energy or coszen values to aeff values. If both "energy"
          and "coszen" are present, "energy" is used. Outside the tabulated
          range the interpolant returns 0.

    """
    # A single type check: the original accepted None in a first guard only
    # to reject it (also with TypeError) immediately afterwards.
    if isinstance(source, str):
        orig_dict = from_file(source)
    elif isinstance(source, Mapping):
        orig_dict = source
    else:
        raise TypeError(
            '`source` must be string or mapping; cannot load aeff'
            ' parameterizations from a %s' % type(source)
        )

    # Build dict of parameterizations (each a callable) per flavintgroup
    aeff_params = OrderedDict()
    for flavint_key, param_spec in orig_dict.items():
        flavintgroup = NuFlavIntGroup(flavint_key)

        if isinstance(param_spec, str):
            # NOTE(security): `eval` executes arbitrary code; only load
            # parameterization files from trusted locations.
            param_func = eval(param_spec)
            if not callable(param_func):
                raise TypeError(
                    'Parameterization string for flavintgroup %s must'
                    ' evaluate to a callable; got "%s".'
                    % (flavintgroup, type(param_func))
                )

        elif callable(param_spec):
            param_func = param_spec

        elif isinstance(param_spec, Mapping):
            is_energy = 'energy' in param_spec
            is_coszen = 'coszen' in param_spec

            if 'aeff' not in param_spec or not (is_energy or is_coszen):
                raise ValueError(
                    'Expected keys of "aeff" and either "energy" or'
                    ' "coszen" to construct a spline. Got %s instead.'
                    ' Aeff param spec source: %s, flavintgroup %s'
                    % (param_spec.keys(), source, flavintgroup)
                )

            # "energy" takes precedence if both variables are given
            var = 'energy' if is_energy else 'coszen'
            x_vals = param_spec[var]
            aeff_vals = param_spec['aeff']

            # TODO: Could potentially add interp1d options to config
            # Zero effective area outside the tabulated range
            param_func = interp1d(x_vals, aeff_vals, kind='linear',
                                  bounds_error=False, fill_value=0)

        else:
            raise TypeError(
                'Expected parameterization spec to be a callable, a string'
                ' that can be interpreted by eval, or a mapping of values'
                ' from which to construct a spline. Got "%s".'
                % type(param_spec)
            )

        aeff_params[flavintgroup] = param_func

    return aeff_params
def histogram_set(self, binning, nu_weights_col, mu_weights_col,
                  noise_weights_col, mapset_name, errors=False):
    """Uses the `histogram` method but returns the set of all histograms
    for everything in the Data object.

    Parameters
    ----------
    binning : OneDimBinning, MultiDimBinning
        The definition of the binning for the histograms.
    nu_weights_col : None or string
        The column in the Data object by which to weight the neutrino
        histograms. Specify None for unweighted histograms.
    mu_weights_col : None or string
        The column in the Data object by which to weight the muon
        histograms. Specify None for unweighted histograms.
    noise_weights_col : None or string
        The column in the Data object by which to weight the noise
        histograms. Specify None for unweighted histograms.
    mapset_name : string
        The name by which the resulting MapSet will be identified.
    errors : boolean
        A flag for whether to calculate errors on the histograms or not.
        This defaults to False.

    Returns
    -------
    MapSet : A MapSet containing all of the Maps for everything in this
             Data object.

    Raises
    ------
    TypeError
        If `binning` is not a binning object, if any weights column is
        neither None nor a string, or if `errors` is not a bool.

    """
    if not isinstance(binning, (MultiDimBinning, OneDimBinning)):
        raise TypeError('binning should be either MultiDimBinning or '
                        'OneDimBinning object. Got %s.' % type(binning))

    # All three weights columns obey the same "None or string" contract
    # (the noise column was previously not validated at all)
    weights_cols = (
        ('nu_weights_col', nu_weights_col),
        ('mu_weights_col', mu_weights_col),
        ('noise_weights_col', noise_weights_col),
    )
    for arg_name, col in weights_cols:
        if col is not None and not isinstance(col, basestring):
            raise TypeError('%s should be a string. Got %s'
                            % (arg_name, type(col)))

    if not isinstance(errors, bool):
        raise TypeError('flag for whether to calculate errors or not '
                        'should be a boolean. Got %s.' % type(errors))

    outputs = []
    if self.contains_neutrinos:
        for fig in self.iterkeys():
            outputs.append(
                self.histogram(kinds=fig,
                               binning=binning,
                               weights_col=nu_weights_col,
                               errors=errors,
                               name=str(NuFlavIntGroup(fig))))

    if self.contains_muons:
        outputs.append(
            self.histogram(kinds='muons',
                           binning=binning,
                           weights_col=mu_weights_col,
                           errors=errors,
                           name='muons',
                           tex=r'\rm{muons}'))

    if self.contains_noise:
        # Fixed: noise was previously weighted by `mu_weights_col`
        outputs.append(
            self.histogram(kinds='noise',
                           binning=binning,
                           weights_col=noise_weights_col,
                           errors=errors,
                           name='noise',
                           tex=r'\rm{noise}'))

    return MapSet(maps=outputs, name=mapset_name)
def histogram(self, kinds, binning, binning_cols=None, weights_col=None,
              errors=False, name=None, tex=None):
    """Histogram the events of the `kinds` specified using `binning`,
    optionally weighting each event by the `weights_col` column.

    Parameters
    ----------
    kinds : string, sequence of NuFlavInt, or NuFlavIntGroup
        Converted to a NuFlavIntGroup if not one already; the events are
        looked up via its first element.
    binning : OneDimBinning or MultiDimBinning
        Plain sequences/iterables of arrays are recognized but currently
        rejected with NotImplementedError.
    binning_cols : None, string or sequence of strings
        Bin only these dimensions, which must be a subset of the names in
        `binning`. If None, all names in `binning` are used.
    weights_col : None or string
        Column to use for weighting the events
    errors : bool
        Whether to attach errors (square root of the sum of squared
        weights) to the histogram
    name : None or string
        Name to give to resulting Map. If None, a default is derived from
        `kinds` and `weights_col`.
    tex : None or string
        TeX label to give to the resulting Map. If None, a default is
        derived from `kinds` and `weights_col`, or from `name` if that was
        specified.

    Returns
    -------
    Map

    Raises
    ------
    TypeError
        If `binning` is of an unhandled type.
    NotImplementedError
        If `binning` is a plain sequence (or an iterable converted to one).

    """
    # TODO: make able to take integer for `binning` and--in combination
    # with units in the Events columns--generate an appropriate
    # MultiDimBinning object, attach this and return the package as a Map.

    if not isinstance(kinds, NuFlavIntGroup):
        kinds = NuFlavIntGroup(kinds)
    if isinstance(binning_cols, basestring):
        binning_cols = [binning_cols]
    assert weights_col is None or isinstance(weights_col, basestring)

    # TODO: units of columns, and convert bin edges if necessary
    if isinstance(binning, OneDimBinning):
        binning = MultiDimBinning([binning])
    elif not isinstance(binning, MultiDimBinning):
        if isinstance(binning, Sequence):
            pass
        elif isinstance(binning, Iterable):
            binning = list(binning)
        else:
            raise TypeError('Unhandled type %s for `binning`.'
                            % type(binning))

    if isinstance(binning, Sequence):
        raise NotImplementedError(
            'Simle sequences not handled at this time. Please specify a'
            ' OneDimBinning or MultiDimBinning object for `binning`.')
        #assert len(binning_cols) == len(binning)
        #bin_edges = binning

    # TODO: units support for Events will mean we can do `m_as(...)` here!
    bin_edges = [dim_edges.magnitude for dim_edges in binning.bin_edges]
    if binning_cols is None:
        binning_cols = binning.names
    else:
        assert set(binning_cols).issubset(set(binning.names))

    # Pull the columns to be histogrammed out of the events
    first_flavint = kinds[0]
    columns = [self[first_flavint][col] for col in binning_cols]

    weights = None
    if weights_col is not None:
        weights = self[first_flavint][weights_col]

    hist, _ = np.histogramdd(sample=columns, weights=weights,
                             bins=bin_edges)
    if errors:
        # Without a weights column the squared weights are left as None,
        # so `sumw2` is simply the unweighted histogram
        squared_weights = None if weights is None else np.square(weights)
        sumw2, _ = np.histogramdd(sample=columns, weights=squared_weights,
                                  bins=bin_edges)
        hist = unp.uarray(hist, np.sqrt(sumw2))

    if name is None:
        if tex is None:
            tex = kinds.tex
            if weights_col is not None:
                tex += r', \; {\rm weights=' + text2tex(weights_col) + r'}'

        name = str(kinds)
        if weights_col is not None:
            name += ', weights=' + weights_col

    if tex is None:
        tex = text2tex(name)

    return Map(name=name, hist=hist, binning=binning, tex=tex)
def _compute_nominal_transforms(self):
    """Compute new PID transforms.

    The PID energy parameterizations are loaded from the file named by the
    `pid_energy_paramfile` param. For each transform group, every PID
    signature's parameterization is evaluated at `self.ebin_centers`,
    broadcast from the 'reco_energy' dimension across 'reco_coszen', and
    written into that signature's PID bin of the transform array.

    Returns
    -------
    TransformSet
        Transforms for every transform group: one per matching output name
        (summing the overlapping inputs) when `sum_grouped_flavints` is set,
        otherwise one per input name belonging to the group.

    """
    logging.debug('Updating pid.param PID histograms...')

    self.load_pid_energy_param(self.params.pid_energy_paramfile.value)

    nominal_transforms = []
    for xform_flavints in self.transform_groups:
        logging.debug('Working on %s PID', xform_flavints)

        xform_array = np.empty(self.transform_output_binning.shape)

        signature_funcs = self.pid_energy_param_dict[xform_flavints]
        for signature, sig_param_func in signature_funcs.items():
            # PID probability vs. energy, evaluated at the energy bins'
            # (weighted) centers
            pid_vs_energy = sig_param_func(self.ebin_centers)

            # The parameterization does not depend on reco_coszen, so
            # replicate the 1d array along that dimension
            pid_full = self.transform_output_binning.broadcast(
                pid_vs_energy, from_dim='reco_energy', to_dims='reco_coszen'
            )

            # Fill the slice belonging to this signature's PID bin
            target = self.transform_output_binning.indexer(pid=signature)
            xform_array[target] = pid_full

        if not self.sum_grouped_flavints:
            # One transform per input in the group; output keeps the
            # input's name
            for input_name in self.input_names:
                if input_name in xform_flavints:
                    nominal_transforms.append(
                        BinnedTensorTransform(
                            input_names=input_name,
                            output_name=input_name,
                            input_binning=self.input_binning,
                            output_binning=self.transform_output_binning,
                            xform_array=xform_array,
                        )
                    )
            continue

        # Inputs having any flavint in common with this group get summed
        xform_input_names = []
        for input_name in self.input_names:
            input_flavs = NuFlavIntGroup(input_name)
            if set(xform_flavints).intersection(input_flavs):
                xform_input_names.append(input_name)

        for output_name in self.output_names:
            if output_name in xform_flavints:
                nominal_transforms.append(
                    BinnedTensorTransform(
                        input_names=xform_input_names,
                        output_name=str(xform_flavints),
                        input_binning=self.input_binning,
                        output_binning=self.transform_output_binning,
                        xform_array=xform_array,
                        sum_inputs=self.sum_grouped_flavints
                    )
                )

    return TransformSet(transforms=nominal_transforms)
def _calculate_fit_coeffs(data, params, fit_binning, nu_params=None,
                          mu_params=None):
    """Calculate the fit coefficients for each systematic, flavint, bin
    for a polynomial.

    For every discrete systematic listed in the sample config, a template is
    produced for each of its runs, histogrammed in `fit_binning`, and
    divided bin by bin by the histogram of the nominal run (bins where the
    nominal is zero are set to zero). In every bin a polynomial in
    (run value - nominal value) is then fitted to these ratios.

    Parameters
    ----------
    data
        Not used by this function.
    params
        Indexable by name; the entries read are `discr_sys_sample_config`,
        `poly_degree`, `force_through_nominal`, `pipeline_config` and
        `stop_after_stage`.
    fit_binning
        Binning in which the templates are histogrammed and fitted.
    nu_params : None or sequence of strings
        Neutrino systematic param names; the first three characters of each
        are stripped before comparison with the config's `sys_list`. None
        skips the neutrino fits.
    mu_params : None or sequence of strings
        As `nu_params`, for muons.

    Returns
    -------
    sys_fit_coeffs : OrderedDict
        Keyed by systematic name. Values are a MapSet (neutrinos; one Map
        per flavint group) or a Map (muons) binned in `fit_binning` plus a
        'coeff' dimension holding the polynomial coefficients. If the same
        systematic name occurs for both, the results are wrapped together
        in a MapSet.

    Raises
    ------
    AssertionError
        If the systematics in the sample config do not match the params, if
        a template is not a Data object, if the flavint groups differ
        between runs, or if a muon template contains no muons.

    """
    logging.debug('Calculating fit coefficients')

    config = from_file(params['discr_sys_sample_config'].value)

    degree = int(params['poly_degree'].value)
    force_through_nominal = params['force_through_nominal'].value

    if force_through_nominal:
        # Constant term fixed to 1, i.e. the ratio is exactly 1 at nominal
        def fit_func(vals, *poly_coeffs):
            return np.polynomial.polynomial.polyval(
                vals, [1.] + list(poly_coeffs))
    else:
        def fit_func(vals, *poly_coeffs):
            return np.polynomial.polynomial.polyval(
                vals, list(poly_coeffs))
        # add free param for constant term
        degree += 1

    template_maker = Pipeline(params['pipeline_config'].value)
    dataset_param = template_maker.params['dataset']

    def parse(string):
        return string.replace(' ', '').split(',')

    def check_sys_list(param_names, sys_list):
        """Require that the params and the config list the same
        systematics."""
        if set(param_names) != set(sys_list):
            raise AssertionError(
                'Systematics list listed in the sample config file does '
                'not match the params in the pipeline config file\n {0} '
                '!= {1}'.format(set(param_names), set(sys_list)))

    def load_template(ev_sys, run, sys_name):
        """Point the pipeline at one run's dataset and return its Data."""
        logging.info('Loading run {0} of systematic '
                     '{1}'.format(run, sys_name))
        dataset_param.value = ev_sys + '|' + run
        template_maker.update_params(dataset_param)
        template = template_maker.get_outputs(
            idx=int(params['stop_after_stage'].m))
        if not isinstance(template, Data):
            raise AssertionError(
                'Template output is not a Data object, instead is '
                'type {0}'.format(type(template)))
        return template

    def ratio_to_nominal(run_map, nom_map):
        """Bin-wise run/nominal ratio; zero wherever the nominal is zero."""
        mask = ~(nom_map.hist == 0.)
        div = np.zeros(run_map.shape)
        with np.errstate(divide='ignore', invalid='ignore'):
            div[mask] = \
                unp.nominal_values(run_map.hist[mask]) /\
                unp.nominal_values(nom_map.hist[mask])
        return div

    def fit_all_bins(run_hists, delta_runs, combined_binning):
        """Fit `fit_func` in every bin of `fit_binning`.

        `run_hists` holds one ratio histogram per run, in the same order
        as `delta_runs`. Returns the coefficients as an array with the
        shape of `combined_binning`.
        """
        pvals_hist = np.empty(
            tuple(int(n) for n in combined_binning.shape), dtype=object)
        for idx in np.ndindex(fit_binning.shape):
            y_values = [unp.nominal_values(h[idx]) for h in run_hists]
            y_sigma = [unp.std_devs(h[idx]) for h in run_hists]

            fit_kwargs = dict(p0=np.ones(degree))
            # Only pass uncertainties if at least one is non-zero
            if np.any(y_sigma):
                fit_kwargs['sigma'] = y_sigma
            popt, _ = curve_fit(fit_func, delta_runs, y_values,
                                **fit_kwargs)
            # perr = np.sqrt(np.diag(pcov))
            # pvals = unp.uarray(popt, perr)
            pvals_hist[idx] = popt
        return np.array(pvals_hist.tolist())

    def store(sys_name, result):
        """Record a result, combining with any earlier one of that name."""
        if sys_name in sys_fit_coeffs:
            sys_fit_coeffs[sys_name] = MapSet(
                [sys_fit_coeffs[sys_name], result])
        else:
            sys_fit_coeffs[sys_name] = result

    sys_fit_coeffs = OrderedDict()

    if nu_params is not None:
        sys_list = parse(config.get('neutrinos', 'sys_list'))
        # Strip the three-character prefix; builds a new list, so the
        # caller's sequence is left untouched
        nu_params = [param_name[3:] for param_name in nu_params]
        check_sys_list(nu_params, sys_list)

        for sys_name in sys_list:
            ev_sys = 'neutrinos|' + sys_name
            # `runs` is stored bracketed in the config; drop the brackets
            runs = parse(config.get(ev_sys, 'runs')[1:-1])
            nominal = config.get(ev_sys, 'nominal')

            mapset_dict = OrderedDict()
            flavint_groups = None
            for run in runs:
                template = load_template(ev_sys, run, sys_name)
                if flavint_groups is None:
                    flavint_groups = template.flavint_groups
                elif set(flavint_groups) != set(template.flavint_groups):
                    raise AssertionError(
                        'Mismatch of flavint_groups - ({0}) does not '
                        'match flavint_groups '
                        '({1})'.format(flavint_groups,
                                       template.flavint_groups))

                outputs = []
                for fig in template.keys():
                    outputs.append(
                        template.histogram(
                            kinds=fig,
                            binning=fit_binning,
                            weights_col='pisa_weight',
                            errors=False,
                            name=str(NuFlavIntGroup(fig))))
                mapset_dict[run] = MapSet(outputs, name=run)

            nom_mapset = mapset_dict[nominal]
            fracdiff_mapset_dict = OrderedDict()
            for run in runs:
                mapset = []
                for flavintg_map in mapset_dict[run]:
                    # TODO(shivesh): error propagation?
                    flavintg = flavintg_map.name
                    div = ratio_to_nominal(flavintg_map,
                                           nom_mapset[flavintg])
                    mapset.append(
                        Map(name=flavintg,
                            binning=flavintg_map.binning,
                            hist=div))
                fracdiff_mapset_dict[run] = MapSet(mapset)

            delta_runs = \
                np.array([float(x) for x in runs]) - float(nominal)

            coeff_binning = OneDimBinning(name='coeff', num_bins=degree,
                                          is_lin=True, domain=[-1, 1])
            combined_binning = fit_binning + coeff_binning

            params_mapset = []
            # Keys of the last template loaded (groups were checked to be
            # the same for all runs)
            for fig in template.keys():
                run_hists = [fracdiff_mapset_dict[run][fig].hist
                             for run in fracdiff_mapset_dict]
                pvals_hist = fit_all_bins(run_hists, delta_runs,
                                          combined_binning)
                params_mapset.append(
                    Map(name=fig,
                        binning=combined_binning,
                        hist=pvals_hist))
            store(sys_name, MapSet(params_mapset, name=sys_name))

    if mu_params is not None:
        sys_list = parse(config.get('muons', 'sys_list'))
        mu_params = [param_name[3:] for param_name in mu_params]
        check_sys_list(mu_params, sys_list)

        for sys_name in sys_list:
            ev_sys = 'muons|' + sys_name
            runs = parse(config.get(ev_sys, 'runs')[1:-1])
            nominal = config.get(ev_sys, 'nominal')

            map_dict = OrderedDict()
            for run in runs:
                template = load_template(ev_sys, run, sys_name)
                if not template.contains_muons:
                    raise AssertionError(
                        'Template output does not contain muons')

                map_dict[run] = template.histogram(
                    kinds='muons',
                    binning=fit_binning,
                    # NOTE: weights cancel in fraction
                    weights_col=None,
                    errors=False,
                    name='muons')

            nom_map = map_dict[nominal]
            fracdiff_map_dict = OrderedDict()
            for run in runs:
                fracdiff_map_dict[run] = Map(
                    name='muons',
                    binning=nom_map.binning,
                    hist=ratio_to_nominal(map_dict[run], nom_map))

            delta_runs = \
                np.array([float(x) for x in runs]) - float(nominal)

            coeff_binning = OneDimBinning(name='coeff', num_bins=degree,
                                          is_lin=True, domain=[-1, 1])
            combined_binning = fit_binning + coeff_binning

            # Fixed: fit the muon ratios. The original loop read the
            # neutrino `fracdiff_mapset_dict[run][fig]` here, which is
            # wrong data (or undefined when `nu_params` is None).
            run_hists = [fracdiff_map_dict[run].hist
                         for run in fracdiff_map_dict]
            pvals_hist = fit_all_bins(run_hists, delta_runs,
                                      combined_binning)
            store(sys_name, Map(name='muons',
                                binning=combined_binning,
                                hist=pvals_hist))

    return sys_fit_coeffs
def makeEventsFile(data_files, detector, proc_ver, cut, outdir,
                   run_settings=None, data_proc_params=None, join=None,
                   cust_cuts=None, extract_fields=EXTRACT_FIELDS,
                   output_fields=OUTPUT_FIELDS):
    r"""Take the simulated and reconstructed HDF5 file(s) (as converted from I3
    by icecube.hdfwriter.I3HDFTableService) as input and write out a simplified
    PISA-standard-format HDF5 file for use in aeff, reco, and/or PID stages.

    Parameters
    ----------
    data_files : dict
        File paths for finding data files for each run, formatted as:
            {
                <string run>: <list of file paths>,
                <string run>: <list of file paths>,
                ...
                <string run>: <list of file paths>,
            }

    detector : string
        Name of the detector (e.g. IceCube, DeepCore, PINGU, etc.) as found in
        e.g. mc_sim_run_settings.json and data_proc_params.json files.

    proc_ver
        Version of processing applied to the events, as found in e.g.
        data_proc_params.json.

    cut
        Name of a standard cut to use; must be specified in the relevant
        detector/processing version node of the data processing parameters
        (file from which the data_proc_params object was instantiated)

    outdir
        Directory path in which to store resulting files; will be generated if
        it does not already exist (including any parent directories that do not
        exist)

    run_settings : string or MCSimRunSettings
        Resource location of mc_sim_run_settings.json or an MCSimRunSettings
        object instantiated therefrom.

    data_proc_params : string or DataProcParams
        Resource location of data_proc_params.json or a DataProcParams object
        instantiated therefrom.

    join
        String specifying any flavor/interaction types (flavInts) to join
        together. Separate flavInts with commas (',') and separate groups with
        semicolons (';'). E.g. an acceptable string is:
            'numucc+numubarcc; nuall bar NC, nuall NC'

    cust_cuts
        dict with a single DataProcParams cut specification or list of same
        (see help for DataProcParams for detailed description of cut spec)

    extract_fields : None or iterable of strings
        Field names to extract from source HDF5 file. If None, extract all
        fields.

    output_fields : None or iterable of strings
        Fields to include in the generated PISA-standard-format events HDF5
        file; note that if 'weighted_aeff' is not present, effective area will
        not be computed. If None, all fields will be written.

    Raises
    ------
    AssertionError
        If `run_settings` / `data_proc_params` do not match `detector` and
        `proc_ver`, if a data file contains more than one run, if run
        geometries differ, or if the flavint groupings overlap.

    Notes
    -----
    Compute "weighted_aeff" field:

    Within each int type (CC or NC), ngen should be added together;
    events recorded of that int type then get their one_weight divided by the
    total *for that int type only* to obtain the "weighted_aeff" for that
    event (even if int types are being grouped/joined together).

    This has the effect that within a group, ...
    ... and within an interaction type, effective area is a weighted
    average of that of the flavors being combined. E.g. for CC,

                    \sum_{run x}\sum_{flav y} (Aeff_{x,y} * ngen_{x,y})
        Aeff_CC = ----------------------------------------------------- ,
                         \sum_{run x}\sum_{flav y} (ngen_{x,y})

    ... and then across interaction types, the results of the above for
    each int type need to be summed together, i.e.:

        Aeff_total = Aeff_CC + Aeff_NC

    Note that each grouping of flavors is calculated with the above math
    completely independently from other flavor groupings specified.

    See Justin Lanfranchi's presentation on the PINGU Analysis call,
    2015-10-21, for more details:
      https://wikispaces.psu.edu/download/attachments/282040606/meff_report_jllanfranchi_v05_2015-10-21.pdf

    """
    if isinstance(run_settings, str):
        run_settings = DetMCSimRunsSettings(find_resource(run_settings),
                                            detector=detector)
    assert isinstance(run_settings, DetMCSimRunsSettings)
    assert run_settings.detector == detector

    if isinstance(data_proc_params, str):
        data_proc_params = DataProcParams(
            detector=detector,
            proc_ver=proc_ver,
            data_proc_params=find_resource(data_proc_params))
    assert data_proc_params.detector == detector
    assert data_proc_params.proc_ver == proc_ver

    runs = sorted(data_files.keys())

    # NOTE: An assumption of the logic below is that all MC is generated using
    # the same cross sections version.
    #
    # TODO: It would be possible to combine runs with different cross sections
    # so long as each (flavor, interaction type) cross sections are
    # weighted-averaged together using weights
    #   N_gen_{n,flav+inttype} * E_x^{-gamma_n} /
    #       ( \int_{E_min_n}^{E_max_n} E^{-\gamma_n} dE )
    # where E_x are the energy sample points specified in the cross sections
    # (and hence these must also be identical across all cross sections that
    # get combined, unless interpolation is performed).

    detector_geom = run_settings[runs[0]]['geom']

    # Create Events object to store data
    evts = Events()
    evts.metadata.update({
        'detector': run_settings.detector,
        'proc_ver': data_proc_params.proc_ver,
        'geom': detector_geom,
        'runs': runs,
    })

    cuts = []
    if isinstance(cust_cuts, dict):
        cust_cuts = [cust_cuts]
    if cut is not None:
        evts.metadata['cuts'].append(cut)
        cuts.append(cut)
    if cust_cuts is not None:
        for ccut in cust_cuts:
            evts.metadata['cuts'].append('custom: ' + ccut['pass_if'])
            cuts.append(ccut)

    orig_outdir = outdir
    outdir = expand(outdir)
    logging.info('Output dir spec\'d: %s', orig_outdir)
    if outdir != orig_outdir:
        logging.info('Output dir expands to: %s', outdir)
    mkdir(outdir)

    detector_label = str(data_proc_params.detector)
    proc_label = 'proc_' + str(data_proc_params.proc_ver)

    # What flavints to group together
    if join is None or join == '':
        grouped = []
        ungrouped = [NuFlavIntGroup(k) for k in ALL_NUFLAVINTS]
        groups_label = 'unjoined'
        logging.info('Events in the following groups will be joined together:'
                     ' (none)')
    else:
        grouped, ungrouped = xlateGroupsStr(join)
        evts.metadata['flavints_joined'] = [str(g) for g in grouped]
        groups_label = 'joined_G_' + '_G_'.join([str(g) for g in grouped])
        logging.info(
            'Events in the following groups will be joined together: %s',
            '; '.join([str(g) for g in grouped]))

    # Find any flavints not included in the above groupings
    flavint_groupings = grouped + ungrouped
    if len(ungrouped) == 0:
        # Only affects the log message below; `flavint_groupings` is already
        # built from the real (empty) list.
        ungrouped = ['(none)']
    logging.info('Events of the following flavints will NOT be joined '
                 'together: %s', '; '.join([str(k) for k in ungrouped]))

    # Enforce that flavints composing groups are mutually exclusive
    for grp_n, flavintgrp0 in enumerate(flavint_groupings[:-1]):
        for flavintgrp1 in flavint_groupings[grp_n + 1:]:
            assert len(set(flavintgrp0).intersection(set(flavintgrp1))) == 0

    flavintgrp_names = [str(flavintgrp) for flavintgrp in flavint_groupings]

    # Instantiate storage for all intermediate destination fields;
    # The data structure looks like:
    #   extracted_data[group #][interaction type][field name] = list of data
    if extract_fields is None:
        extracted_data = [{inttype: {} for inttype in ALL_NUINT_TYPES}
                          for _ in flavintgrp_names]
    else:
        extracted_data = [{
            inttype: {field: [] for field in extract_fields}
            for inttype in ALL_NUINT_TYPES
        } for _ in flavintgrp_names]

    # Instantiate generated-event counts for destination fields; count
    # CC separately from NC because aeff's for CC & NC add, whereas
    # aeffs intra-CC should be weighted-averaged (as for intra-NC)
    ngen = [{inttype: {} for inttype in ALL_NUINT_TYPES}
            for _ in flavintgrp_names]

    # Loop through all of the files, retrieving the events, filtering,
    # and recording the number of generated events pertinent to
    # calculating aeff
    filecount = {}
    # Reset so that the geometry recorded from here on is that of the first
    # run whose file is actually read; later runs are checked against it.
    detector_geom = None
    bad_files = []
    for run, fnames in data_files.items():
        file_count = 0
        for fname in fnames:
            # Retrieve data from all nodes specified in the processing
            # settings file
            logging.trace('Trying to get data from file %s', fname)
            try:
                data = data_proc_params.get_data(fname,
                                                 run_settings=run_settings)
            except (ValueError, KeyError, IOError):
                logging.warning('Bad file encountered: %s', fname)
                bad_files.append(fname)
                continue

            file_count += 1

            # Check to make sure only one run is present in the data
            runs_in_data = set(data['run'])
            assert len(runs_in_data) == 1, 'Must be just one run in data'

            filecount[run] = filecount.get(run, 0) + 1
            rs_run = run_settings[run]

            # Record geom; check that geom is consistent with other runs
            if detector_geom is None:
                detector_geom = rs_run['geom']
            assert rs_run['geom'] == detector_geom, \
                'All runs\' geometries must match!'

            # Loop through all flavints spec'd for run
            for run_flavint in rs_run['flavints']:
                barnobar = run_flavint.bar_code
                int_type = run_flavint.intType

                # Retrieve this-interaction-type- & this-barnobar-only events
                # that also pass cuts. (note that cut names are strings)
                intonly_cut_data = data_proc_params.apply_cuts(
                    data,
                    cuts=cuts + [str(int_type), str(barnobar)],
                    return_fields=extract_fields)

                # Record the generated count and data for this run/flavor for
                # each group to which it's applicable
                for grp_n, flavint_group in enumerate(flavint_groupings):
                    if run_flavint not in flavint_group:
                        continue

                    # Instantiate a field for particles and antiparticles,
                    # keyed by the output of the bar_code property for each
                    if run not in ngen[grp_n][int_type]:
                        ngen[grp_n][int_type][run] = {
                            NuFlav(12).bar_code: 0,
                            NuFlav(-12).bar_code: 0,
                        }

                    # Record count only if it hasn't already been recorded
                    if ngen[grp_n][int_type][run][barnobar] == 0:
                        # Note that one_weight includes cc/nc:total fraction,
                        # so DO NOT specify the full flavint here, only flav
                        # (since one_weight does NOT take bar/nobar fraction,
                        # it must be included here in the ngen computation)
                        flav_ngen = run_settings.get_num_gen(
                            run=run, barnobar=barnobar)
                        ngen[grp_n][int_type][run][barnobar] = flav_ngen

                    # Append the data. Note that extracted_data is:
                    # extracted_data[group n][int_type][extract field name] =
                    #   list
                    if extract_fields is None:
                        for f in intonly_cut_data.keys():
                            if f not in extracted_data[grp_n][int_type]:
                                extracted_data[grp_n][int_type][f] = []
                            extracted_data[grp_n][int_type][f].extend(
                                intonly_cut_data[f])
                    else:
                        for f in extract_fields:
                            extracted_data[grp_n][int_type][f].extend(
                                intonly_cut_data[f])
        logging.info('File count for run %s: %d', run, file_count)
    # NOTE(review): hard-coded location, written even when `bad_files` is
    # empty; kept as-is since other tooling may read it.
    to_file(bad_files, '/tmp/bad_files.json')

    # Effective area is computed when all fields are written and one_weight is
    # available, or when 'weighted_aeff' is explicitly requested. The explicit
    # `is not None` guard avoids a membership test against None.
    compute_aeff = (
        (output_fields is None
         and (extract_fields is None or 'one_weight' in extract_fields))
        or (output_fields is not None and 'weighted_aeff' in output_fields)
    )
    if compute_aeff:
        fmtfields = (' ' * 12 + 'flavint_group',
                     'int type',
                     ' run',
                     'part/anti',
                     'part/anti count',
                     'aggregate count')
        fmt_n = [len(f) for f in fmtfields]
        fmt = ' '.join([r'%' + str(n) + r's' for n in fmt_n])
        lines = ' '.join(['-' * n for n in fmt_n])
        # One positional argument per placeholder in `fmt`
        logging.info(fmt, *fmtfields)
        logging.info(lines)
        for grp_n, flavint_group in enumerate(flavint_groupings):
            for int_type in {fi.intType for fi in flavint_group.flavints}:
                ngen_it_tot = 0
                for run, run_counts in ngen[grp_n][int_type].items():
                    for barnobar, barnobar_counts in run_counts.items():
                        ngen_it_tot += barnobar_counts
                        logging.info(
                            fmt, flavint_group.simple_str(), int_type,
                            str(run), barnobar, int(barnobar_counts),
                            int(ngen_it_tot))

                # Convert data to numpy array
                group_data = extracted_data[grp_n][int_type]
                if extract_fields is None:
                    fields_to_convert = list(group_data.keys())
                else:
                    fields_to_convert = extract_fields
                for field in fields_to_convert:
                    group_data[field] = np.array(group_data[field])

                # Generate weighted_aeff field for this group / int type's data
                group_data['weighted_aeff'] = (
                    group_data['one_weight'] / ngen_it_tot * CMSQ_TO_MSQ
                )

    # Report file count per run
    for run, count in filecount.items():
        logging.info('Files read, run %s: %d', run, count)
        ref_num_i3_files = run_settings[run]['num_i3_files']
        if count != ref_num_i3_files:
            logging.warning(
                'Run %s, Number of files read (%d) != number of '
                'source I3 files (%d), which may indicate an error.',
                run, count, ref_num_i3_files)

    # Generate output data
    for flavint in ALL_NUFLAVINTS:
        int_type = flavint.intType
        for grp_n, flavint_group in enumerate(flavint_groupings):
            if flavint not in flavint_group:
                logging.trace('flavint %s not in flavint_group %s, passing.',
                              flavint, flavint_group)
                continue
            logging.trace(
                'flavint %s **IS** in flavint_group %s, storing.',
                flavint, flavint_group)
            if output_fields is None:
                evts[flavint] = extracted_data[grp_n][int_type]
            else:
                evts[flavint] = {
                    f: extracted_data[grp_n][int_type][f]
                    for f in output_fields
                }

    # Generate file name
    numerical_runs = []
    alphanumerical_runs = []
    for run in runs:
        try:
            run_number = int(run)
        except ValueError:
            alphanumerical_runs.append(str(run))
        else:
            numerical_runs.append(run_number)

    run_labels = []
    if len(numerical_runs) > 0:
        run_labels.append(list2hrlist(numerical_runs))
    if len(alphanumerical_runs) > 0:
        run_labels += sorted(alphanumerical_runs)
    run_label = 'runs_' + ','.join(run_labels)
    geom_label = '' + detector_geom
    fname = 'events__' + '__'.join([
        detector_label,
        geom_label,
        run_label,
        proc_label,
        groups_label,
    ]) + '.hdf5'

    outfpath = os.path.join(outdir, fname)
    logging.info('Writing events to %s', outfpath)

    # Save data to output file
    evts.save(outfpath)
def populate_transforms(service, xform_flavints, xform_array):
    """Build the BinnedTensorTransform objects that apply one aeff transform
    array to a service's inputs, honoring `service.sum_grouped_flavints`.

    Rebinning is assumed to happen outside the transform, so every transform
    created here uses `service.input_binning` for both its input and its
    output binning. The stage's own output binning may still differ, since a
    rebinning can take place after the transform is applied and before maps
    are emitted.

    Because of these assumptions about input/output names and binning, this
    function should only be applied to aeff services (unless very carefully
    considered).

    Parameters
    ----------
    service : Stage
        The aeff service; its `input_names`, `output_names`, `input_binning`
        and `sum_grouped_flavints` attributes are read.

    xform_flavints
        Flavints the transform array applies to. Input names whose flavints
        are disjoint from it, and output names not contained in it, are
        skipped.

    xform_array : numpy.ndarray
        Raw transform array

    Returns
    -------
    transforms : list of BinnedTensorTransform

    """
    def _new_transform(inputs, output):
        # Same binning on both sides: see the note on rebinning above.
        return BinnedTensorTransform(
            input_names=inputs,
            output_name=output,
            input_binning=service.input_binning,
            output_binning=service.input_binning,
            xform_array=xform_array,
            sum_inputs=service.sum_grouped_flavints
        )

    transforms = []

    if service.sum_grouped_flavints:
        # Combining grouped flavints: one transform per matching output, with
        # every contributing input attached to it. The summation of the input
        # maps is done by the transform itself when it is applied.
        contributing_inputs = [
            name for name in service.input_names
            if not set(NuFlavIntGroup(name)).isdisjoint(xform_flavints)
        ]
        for out_name in service.output_names:
            if out_name in xform_flavints:
                logging.trace(' inputs: %s, output: %s, xform: %s',
                              contributing_inputs, out_name, xform_flavints)
                transforms.append(
                    _new_transform(contributing_inputs, out_name)
                )
        return transforms

    # Not combining grouped flavints: the transform is copied for each input
    # flavor, even if it was computed from a combination of flavors.
    for in_name in service.input_names:
        # aeff "splits" neutrino flavors into flavor+interaction types, so
        # the outputs must be checked for being covered by the input.
        if set(NuFlavIntGroup(in_name)).isdisjoint(xform_flavints):
            continue
        for out_name in service.output_names:
            if (out_name in NuFlavIntGroup(in_name)
                    and out_name in xform_flavints):
                logging.trace(' input: %s, output: %s, xform: %s',
                              in_name, out_name, xform_flavints)
                transforms.append(_new_transform(in_name, out_name))

    return transforms
def __init__(self, params, output_binning, input_names, output_names,
             output_events=True, error_method=None, debug_mode=None,
             disk_cache=None, memcache_deepcopy=True,
             outputs_cache_depth=20):
    """Set up hashes, parameter groups and output names, then initialise the
    base stage.

    `input_names` and `output_names` are comma-separated strings and must be
    equal. Names containing 'muons' or 'noise' are passed through; 'all_nu'
    expands to one group per entry of ALL_NUFLAVINTS; anything else is parsed
    as a NuFlavIntGroup. When `output_events` is True, no output binning is
    handed to the base class; a copy of the requested binning is kept in
    `self.fit_binning` either way.

    Raises
    ------
    AssertionError
        If input and output names differ, if `output_events` is not a bool,
        or if the `smoothing` param is neither None nor 'gauss'.

    """
    # Hash of input event sample
    self.sample_hash = None
    # Hash of event sample
    self.weight_hash = None
    # Hash of fit sample
    self.fit_hash = None
    # Hash of fit coefficients
    self.fitcoeffs_hash = None
    # Hash of cached fit coefficients
    self.fitcoeffs_cache_hash = None

    self.fit_params = (
        'pipeline_config',
        'discr_sys_sample_config',
        'stop_after_stage',
        'poly_degree',
        'force_through_nominal',
        'smoothing',
    )
    self.nu_params = ('nu_dom_eff', 'nu_hole_ice')
    self.mu_params = ('mu_dom_eff', 'mu_hole_ice')
    self.other_params = ('cache_fit', )

    # Systematics params are only expected for the event types requested
    expected_params = self.fit_params + self.other_params
    if ('all_nu' in input_names) or ('neutrinos' in input_names):
        expected_params = expected_params + self.nu_params
    if 'muons' in input_names:
        expected_params = expected_params + self.mu_params

    self.neutrinos = False
    self.muons = False
    self.noise = False

    if input_names != output_names:
        raise AssertionError(
            'Input names must match output names for this '
            'stage\n{0}(input names) != {1}(output '
            'names)'.format(input_names, output_names))

    requested_names = output_names.replace(' ', '').split(',')
    clean_outnames = []
    self._output_nu_groups = []
    for requested in requested_names:
        if 'muons' in requested:
            self.muons = True
            clean_outnames.append(requested)
            continue
        if 'noise' in requested:
            self.noise = True
            clean_outnames.append(requested)
            continue
        # Everything else is a neutrino specification
        self.neutrinos = True
        if 'all_nu' in requested:
            self._output_nu_groups = \
                [NuFlavIntGroup(f) for f in ALL_NUFLAVINTS]
        else:
            self._output_nu_groups.append(NuFlavIntGroup(requested))

    if self.neutrinos:
        clean_outnames.extend([str(f) for f in self._output_nu_groups])

    if not isinstance(output_events, bool):
        raise AssertionError(
            'output_events must be of type bool, instead it is supplied '
            'with type {0}'.format(type(output_events)))

    # Keep the requested binning for the fit before it is possibly dropped
    self.fit_binning = deepcopy(output_binning)
    if output_events:
        output_binning = None
    self.output_events = output_events

    super().__init__(
        use_transforms=False,
        params=params,
        expected_params=expected_params,
        input_names=clean_outnames,
        output_names=clean_outnames,
        error_method=error_method,
        debug_mode=debug_mode,
        disk_cache=disk_cache,
        memcache_deepcopy=memcache_deepcopy,
        outputs_cache_depth=outputs_cache_depth,
        output_binning=output_binning,
    )

    smoothing = self.params['smoothing'].value
    if smoothing is not None and smoothing != 'gauss':
        raise AssertionError(
            'Parameter "smoothing" accepts "none" or "gauss" as '
            'input, instead got {0} as '
            'input'.format(self.params['smoothing'].value))

    self.include_attrs_for_hashes('sample_hash')
def _compute_transforms(self):
    """Generate reconstruction "smearing kernels" by reading in a set of
    parameterisation functions from a json file. This should have the same
    dimensionality as the input binning i.e. if you have energy and
    coszenith input binning then the kernels provided should have both
    energy and coszenith resolution functions.

    Any superposition of distributions from scipy.stats is supported.

    Returns
    -------
    TransformSet
        One BinnedTensorTransform per matching (input, output) combination
        for each entry of `self.transform_groups`.

    Raises
    ------
    ValueError
        If no reco parameterization is given, if its keys do not match the
        stage's transform groups, or if a kernel has entries below
        -EQUALITY_PREC or sums to more than 1 + EQUALITY_PREC for any input
        bin.

    """
    res_scale_ref = self.params.res_scale_ref.value.strip().lower()
    assert res_scale_ref in ['zero']  # TODO: , 'mean', 'median']

    reco_param_source = self.params.reco_paramfile.value

    if reco_param_source is None:
        raise ValueError(
            'non-None reco parameterization params.reco_paramfile'
            ' must be provided')

    reco_param_hash = hash_obj(reco_param_source)

    # Only reload and re-evaluate the parameterizations if the source changed
    if (self._reco_param_hash is None
            or reco_param_hash != self._reco_param_hash):
        reco_param = load_reco_param(reco_param_source)

        # Transform groups are implicitly defined by the contents of the
        # reco paramfile's keys
        implicit_transform_groups = reco_param.keys()

        # Make sure these match transform groups specified for the stage
        if set(implicit_transform_groups) != set(self.transform_groups):
            raise ValueError(
                'Transform groups (%s) defined implicitly by'
                ' %s reco parameterizations do not match those'
                ' defined as the stage\'s `transform_groups` (%s).'
                % (implicit_transform_groups, reco_param_source,
                   self.transform_groups))

        self.param_dict = reco_param
        self._reco_param_hash = reco_param_hash

        self.eval_dict = self.evaluate_reco_param()
        self.reco_scales_and_biases_applicable()

    # everything seems to be fine, so rescale and shift distributions
    eval_dict = self.scale_and_shift_reco_dists()

    # Computational units must be the following for compatibility with
    # events file
    comp_units = dict(true_energy='GeV', true_coszen=None,
                      true_azimuth='rad', reco_energy='GeV',
                      reco_coszen=None, reco_azimuth='rad', pid=None)

    # Select only the units in the input/output binning for conversion
    # (can't pass more than what's actually there)
    in_units = {
        dim: unit
        for dim, unit in comp_units.items() if dim in self.input_binning
    }
    out_units = {
        dim: unit
        for dim, unit in comp_units.items() if dim in self.output_binning
    }

    # These binnings will be in the computational units defined above
    input_binning = self.input_binning.to(**in_units)
    output_binning = self.output_binning.to(**out_units)

    # Read centers/edges from the *converted* binnings so the magnitudes are
    # in computational units regardless of the units the stage was given.
    en_centers_in = input_binning['true_energy'].weighted_centers.magnitude
    cz_centers_in = input_binning['true_coszen'].weighted_centers.magnitude
    en_edges_out = output_binning['reco_energy'].bin_edges.magnitude
    cz_edges_out = output_binning['reco_coszen'].bin_edges.magnitude

    n_e_in = len(en_centers_in)
    n_cz_in = len(cz_centers_in)
    n_e_out = len(en_edges_out) - 1
    n_cz_out = len(cz_edges_out) - 1

    # Output bin counts above are taken before the coszen edges are extended
    if self.coszen_flipback:
        cz_edges_out, flipback_mask, keep = \
            self.extend_binning_for_coszen(ext_low=-3., ext_high=+3.)

    xforms = []
    for xform_flavints in self.transform_groups:
        logging.debug("Working on %s reco kernel...", xform_flavints)

        this_params = eval_dict[xform_flavints]
        reco_kernel = np.zeros((n_e_in, n_cz_in, n_e_out, n_cz_out))
        for (i, j) in itertools.product(range(n_e_in), range(n_cz_in)):
            e_kern_cdf = self.make_cdf(bin_edges=en_edges_out,
                                       enval=en_centers_in[i],
                                       enindex=i,
                                       czval=None,
                                       czindex=j,
                                       dist_params=this_params['energy'])
            cz_kern_cdf = self.make_cdf(bin_edges=cz_edges_out,
                                        enval=en_centers_in[i],
                                        enindex=i,
                                        czval=cz_centers_in[j],
                                        czindex=j,
                                        dist_params=this_params['coszen'])

            if self.coszen_flipback:
                cz_kern_cdf = perform_coszen_flipback(
                    cz_kern_cdf, flipback_mask, keep)

            reco_kernel[i, j] = np.outer(e_kern_cdf, cz_kern_cdf)

        # Sanity check of reco kernels - intolerable negative values?
        logging.trace(" Ensuring reco kernel sanity...")
        kern_neg_invalid = reco_kernel < -EQUALITY_PREC
        if np.any(kern_neg_invalid):
            raise ValueError("Detected intolerable negative entries in"
                             " reco kernel! Min.: %.15e"
                             % np.min(reco_kernel))

        # Set values numerically compatible with zero to zero. This must be
        # an in-place assignment: `np.where` returns a new array and leaves
        # its input untouched.
        reco_kernel[np.abs(reco_kernel) < EQUALITY_PREC] = 0

        # Total probability per input bin, summed over all output dimensions
        sum_over_axes = tuple(range(-len(self.output_binning), 0))
        totals = np.sum(reco_kernel, axis=sum_over_axes)
        totals_large = totals > (1 + EQUALITY_PREC)
        if np.any(totals_large):
            raise ValueError("Detected overflow in reco kernel! Max.:"
                             " %0.15e" % (np.max(totals)))

        if self.input_binning.basenames[0] == "coszen":
            # The reconstruction kernel has been set up with energy as its
            # first dimension, so swap axes if it is applied to an input
            # binning where 'coszen' is the first
            logging.trace(" Swapping kernel dimensions since 'coszen' has"
                          " been requested as the first.")
            reco_kernel = np.swapaxes(reco_kernel, 0, 1)
            reco_kernel = np.swapaxes(reco_kernel, 2, 3)

        if self.sum_grouped_flavints:
            # Combining grouped flavints: one transform per matching output,
            # fed by every input that overlaps with this transform group.
            xform_input_names = [
                input_name for input_name in self.input_names
                if not set(NuFlavIntGroup(input_name)).isdisjoint(
                    xform_flavints)
            ]

            for output_name in self.output_names:
                if output_name not in xform_flavints:
                    continue
                xform = BinnedTensorTransform(
                    input_names=xform_input_names,
                    output_name=output_name,
                    input_binning=self.input_binning,
                    output_binning=self.output_binning,
                    xform_array=reco_kernel,
                    sum_inputs=self.sum_grouped_flavints)
                xforms.append(xform)
        # If *not* combining grouped flavints:
        # Copy the transform for each input flavor, regardless if the
        # transform is computed from a combination of flavors.
        else:
            for input_name in self.input_names:
                if set(NuFlavIntGroup(input_name)).isdisjoint(
                        xform_flavints):
                    continue
                for output_name in self.output_names:
                    if (output_name not in NuFlavIntGroup(input_name)
                            or output_name not in xform_flavints):
                        continue
                    logging.trace(' input: %s, output: %s, xform: %s',
                                  input_name, output_name, xform_flavints)

                    xform = BinnedTensorTransform(
                        input_names=input_name,
                        output_name=output_name,
                        input_binning=self.input_binning,
                        output_binning=self.output_binning,
                        xform_array=reco_kernel,
                        sum_inputs=self.sum_grouped_flavints)
                    xforms.append(xform)

    return TransformSet(transforms=xforms)