def __init__(self, val=None):
    self.metadata = OrderedDict([
        ('detector', ''),
        ('geom', ''),
        ('runs', []),
        ('proc_ver', ''),
        ('cuts', []),
        ('flavints_joined', []),
    ])
    meta = OrderedDict()
    data = FlavIntData()
    if isinstance(val, (str, h5py.Group)):
        data = hdf.from_hdf(val)
        meta = getattr(data, 'attrs', OrderedDict())
    elif isinstance(val, Events):
        meta = deepcopy(val.metadata)
        data = deepcopy(val)
    elif isinstance(val, Mapping):
        data = deepcopy(val)
        if hasattr(val, 'metadata'):
            meta = deepcopy(val.metadata)
        elif hasattr(val, 'attrs'):
            meta = deepcopy(val.attrs)

    # Convert any numpy arrays in the metadata to plain lists
    for key, val_ in meta.items():
        if hasattr(val_, 'tolist') and callable(val_.tolist):
            meta[key] = val_.tolist()

    self.metadata.update(meta)
    self.validate(data)
    self.update(data)
    self.update_hash()
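# Usage sketch for the constructor above (the file name is hypothetical; any
# Events-format HDF5 file, e.g. one produced by makeEventsFile, should work):
#
#     evts = Events('events__deepcore__IC86__runs_12550__proc_v5__unjoined.hdf5')
#     print(evts.metadata['detector'], evts.metadata['runs'])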
def apply_cuts(self, data, cuts, boolean_op='&', return_fields=None):
    """Perform `cuts` on `data` and return a dict containing
    `return_fields` from events that pass the cuts.

    Parameters
    ----------
    data : single-level dict or FlavIntData object
    cuts : string or dict, or sequence thereof
    boolean_op : string
    return_fields : string or sequence thereof

    """
    if isinstance(data, FlavIntData):
        outdata = FlavIntData()
        for flavint in data.flavints:
            outdata[flavint] = self.apply_cuts(
                data[flavint], cuts=cuts, boolean_op=boolean_op,
                return_fields=return_fields
            )
        return outdata

    if isinstance(cuts, (str, dict)):
        cuts = [cuts]

    # Default is to return all fields
    if return_fields is None:
        return_fields = data.keys()

    # If no cuts specified, return all data from specified fields
    if len(cuts) == 0:
        return self.subselect(data, return_fields)

    cut_strings = set()
    cut_fields = set()
    for cut in cuts:
        if isinstance(cut, dict):
            self.validate_cut_spec(cut)
        elif cut.lower() in self['cuts']:
            cut = self['cuts'][cut.lower()]
        else:
            raise ValueError('Unrecognized or invalid cut: "%s"' % str(cut))
        cut_strings.add(cut['pass_if'])
        cut_fields.update(cut['fields'])

    # Combine cut criteria strings together with boolean operation
    cut_string = boolean_op.join(['(' + cs + ')' for cs in cut_strings])

    # Load the fields necessary for the cut into the global namespace
    for field in set(cut_fields):
        globals()[field] = data[field]

    # Evaluate cuts, returning a boolean array
    try:
        bool_idx = eval(cut_string)  # pylint: disable=eval-used
    except Exception:
        logging.error('Failed to evaluate `cut_string` "%s"', cut_string)
        raise

    # Return specified (or all) fields, indexed by boolean array
    return {f: np.array(data[f])[bool_idx] for f in return_fields}
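# Usage sketch for `apply_cuts` (illustrative only; the field name
# 'reco_energy' and the thresholds are hypothetical). A cut spec is a dict
# whose 'fields' entry names the keys of `data` that its 'pass_if' expression
# references; 'pass_if' is eval'd to produce a boolean mask:
#
#     cut = {'fields': ['reco_energy'],
#            'pass_if': '(reco_energy >= 1) & (reco_energy <= 80)'}
#     passed = data_proc_params.apply_cuts(data, cuts=[cut],
#                                          return_fields=['reco_energy'])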
@staticmethod
def subselect(data, fields, indices=None):
    """Return a copy of `data` containing only `fields`, optionally
    sub-indexed by `indices`."""
    if isinstance(data, FlavIntData):
        outdata = FlavIntData()
        for flavint in data.flavints:
            outdata[flavint] = DataProcParams.subselect(data[flavint],
                                                        fields=fields,
                                                        indices=indices)
        return outdata
    if isinstance(data, Mapping):
        if indices is None:
            return {k: v for k, v in data.items() if k in fields}
        return {k: v[indices] for k, v in data.items() if k in fields}
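# Sketch of `subselect` on a plain dict (toy values; assumes numpy arrays as
# field values):
#
#     d = {'energy': np.array([1., 5., 20.]),
#          'coszen': np.array([-1., 0., 1.])}
#     DataProcParams.subselect(d, fields=['energy'], indices=[0, 2])
#     # -> {'energy': array([ 1., 20.])}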
def __init__(self, val=None):
    self.metadata = OrderedDict([
        ('detector', ''),
        ('geom', ''),
        ('runs', []),
        ('proc_ver', ''),
        ('cuts', []),
        ('flavints_joined', []),
    ])
    meta = {}
    data = FlavIntData()
    if isinstance(val, (str, h5py.Group)):
        data, meta = self.__load(val)
    elif isinstance(val, Events):
        meta = deepcopy(val.metadata)
        data = deepcopy(val)
    elif isinstance(val, dict):
        data = deepcopy(val)
    self.metadata.update(meta)
    self.validate(data)
    self.update(data)
    self.update_hash()
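# Behavior sketch for this constructor (the owning class is not shown here,
# so `ThisClass` is a placeholder): an existing Events instance is deep-copied
# along with its metadata, while a plain dict supplies data only and the
# metadata stays at the defaults above:
#
#     copied = ThisClass(existing_events)    # data + metadata deep-copied
#     fresh = ThisClass({'nue_cc': {...}})   # data only; default metadata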
def validate_xsec(energy, xsec):
    """Validate cross sections"""
    # TODO: different validation based on cross sections version string

    # Make sure the basics are present
    xsec = FlavIntData(xsec)

    # No NaN's in energy
    assert not np.any(np.isnan(energy))
    # Energy spans at least 1-100 GeV
    assert np.min(energy) <= 1
    assert np.max(energy) >= 100

    # All event flavints need to be present
    for k in ALL_NUFLAVINTS:
        # Uses "standard" PISA indexing scheme
        x = xsec[k]
        # Arrays are same lengths
        assert len(x) == len(energy)
        # No NaN's
        assert np.sum(np.isnan(x)) == 0
        # Max xsec/energy value is in range for units of [m^2/GeV]
        assert np.max(x/energy) < 40e-42, np.max(x/energy)
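# Sketch of what `validate_xsec` enforces (toy numbers; assumes keys in a form
# FlavIntData accepts): energies must span at least [1, 100] GeV and
# xsec/energy must stay below 40e-42 m^2/GeV for every flavint:
#
#     energy = np.logspace(0, 3, 200)  # 1 GeV .. 1 TeV
#     toy = {str(k): 1e-42 * energy for k in ALL_NUFLAVINTS}
#     CrossSections.validate_xsec(energy, toy)  # AssertionError if invalid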
def load_root_file(fpath, ver, tot_sfx='_tot', o_sfx='_o16', h_sfx='_h1',
                   plt_sfx='_plot'):
    """Load cross sections from root file, where graphs are first-level in
    hierarchy. This is still crude and not very flexible, but at least it's
    recorded here for posterity.

    Requires ROOT and the ROOT python module to be installed

    Parameters
    ----------
    fpath : string
        Path to ROOT file
    ver : string
        Necessary to differentiate among different file formats that Ken has
        sent out
    tot_sfx : string (default = '_tot')
        Suffix for finding total cross sections in ROOT file (if these fields
        are found, the oxygen/hydrogen fields are skipped)
    o_sfx : string (default = '_o16')
        Suffix for finding oxygen-16 cross sections in ROOT file
    h_sfx : string (default = '_h1')
        Suffix for finding hydrogen-1 cross sections in ROOT file
    plt_sfx : string (default = '_plot')
        Suffix for plots containing cross sections per GeV in ROOT file

    Returns
    -------
    xsec : :class:`pisa.utils.flavInt.FlavIntData`
        Object containing the loaded cross sections

    """
    import ROOT

    def extractData(f, key):
        """Extract x and y info from (already-opened) ROOT TFile."""
        try:
            g = ROOT.gDirectory.Get(key)
            x = np.array(g.GetX())
            y = np.array(g.GetY())
        except AttributeError:
            raise ValueError('Possibly missing file "%s" or missing key'
                             ' "%s" within that file?' % (f, key))
        return x, y

    rfile = ROOT.TFile(fpath)  # pylint: disable=no-member
    try:
        energy = None
        xsec = FlavIntData()
        for flavint in ALL_NUFLAVINTS:
            if ver == 'genie_2.6.4':
                # Expected to contain xsect per atom; summing 2*Hydrogen and
                # 1*Oxygen yields total cross section for water molecule.
                # Format as found in, e.g., "genie_2.6.4_simplified.root"
                key = str(flavint) + o_sfx
                o16_e, o16_xs = extractData(rfile, key)
                key = str(flavint) + h_sfx
                h1_e, h1_xs = extractData(rfile, key)
                tot_xs = h1_xs*2 + o16_xs*1
                assert np.all(h1_e == o16_e)
                ext_e = o16_e
            elif ver == 'genie_2.8.6':
                # Expected to contain xsect-per-nucleon-per-energy, so
                # multiplying by energy and by # of nucleons (18) yields
                # cross sections per molecule. Format as found in, e.g.,
                # "genie_2.8.6_simplified.root"
                key = str(flavint) + plt_sfx
                ext_e, fract_xs = extractData(rfile, key)
                tot_xs = fract_xs * ext_e * 18
            else:
                raise ValueError('Invalid or not implemented `ver`: "%s"'
                                 % ver)
            if energy is None:
                energy = ext_e
            assert np.all(ext_e == energy)

            # Note that units in the ROOT files are [1e-38 cm^2] but PISA
            # requires units of [m^2], so this conversion is made here.
            xsec[flavint] = tot_xs * 1e-38 * 1e-4
    finally:
        rfile.Close()

    CrossSections.validate_xsec(energy, xsec)

    return energy, xsec
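# Usage sketch (the path is hypothetical; requires ROOT). The `ver` string
# selects between the per-atom and per-nucleon parsing logic above:
#
#     energy, xsec = CrossSections.load_root_file(
#         'cross_sections/genie_2.6.4_simplified.root', ver='genie_2.6.4')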
def get_combined_xsec(fpath, ver=None):
    """Load the cross-section values from a ROOT file and instantiate a
    CombinedSpline object."""
    # NOTE: ROOT import here as it is optional but still want to import
    # module for e.g. building docs
    import ROOT

    fpath = find_resource(fpath)
    logging.info('Loading GENIE ROOT cross-section file %s', fpath)

    # Name of neutrino flavours in the ROOT file.
    flavs = ('nu_e', 'nu_mu', 'nu_tau', 'nu_e_bar', 'nu_mu_bar', 'nu_tau_bar')

    rfile = ROOT.TFile.Open(fpath, 'read')  # pylint: disable=no-member
    xsec_splines = FlavIntData()
    for flav in flavs:
        for int_ in ALL_NUINT_TYPES:
            xsec_splines[flav, int_] = {}
            for part in ('O16', 'H1'):
                str_repr = flav + '_' + part + '/' + 'tot_' + str(int_)
                xsec_splines[flav + str(int_)][part] = \
                    ROOT.gDirectory.Get(str_repr)  # pylint: disable=no-member
    rfile.Close()

    def eval_spl(spline, binning, out_units=ureg.m**2, x_energy_scale=1,
                 **kwargs):
        init_names = ['true_energy']
        init_units = [ureg.GeV]

        if set(binning.names) != set(init_names):
            raise ValueError('Input binning names {0} do not match '
                             'instantiation binning names '
                             '{1}'.format(binning.names, init_names))
        if set(map(str, binning.units)) != set(map(str, init_units)):
            for name in init_names:
                binning[name].to(init_units)

        bin_centers = [x.m for x in binning.weighted_centers][0]
        nu_O16, nu_H1 = [], []
        for e_val in bin_centers:
            nu_O16.append(spline['O16'].Eval(e_val))
            nu_H1.append(spline['H1'].Eval(e_val))
        nu_O16, nu_H1 = map(np.array, (nu_O16, nu_H1))

        # Weight by the approximate mass fractions of O16 and H1 in a water
        # molecule; ROOT file values are in units of [1e-38 cm^2]
        nu_xsec = ((0.8879 * nu_O16) + (0.1121 * nu_H1)) * 1E-38 * ureg.cm**2
        nu_xsec_hist = nu_xsec.to(out_units).magnitude
        return Map(hist=nu_xsec_hist, binning=binning, **kwargs)

    def validate_spl(binning):
        if np.all(binning.true_energy.midpoints.m > 1E3):
            raise ValueError('Energy values out of range in binning '
                             '{0}'.format(binning.true_energy))

    inXSec = []
    for flav in flavs:
        for int_ in ALL_NUINT_TYPES:
            flavint = NuFlavInt(flav + str(int_))
            xsec = Spline(name=str(flavint), spline=xsec_splines[flavint],
                          eval_spl=eval_spl, validate_spl=validate_spl)
            inXSec.append(xsec)

    return CombinedSpline(inXSec, interactions=True, ver=ver)
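# Usage sketch (the resource path is hypothetical; requires ROOT):
#
#     combined = get_combined_xsec('cross_sections/genie_xsec_water.root',
#                                  ver='genie_2.8.6')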
def makeEventsFile(data_files, detector, proc_ver, cut, outdir,
                   run_settings=None, data_proc_params=None, join=None,
                   cust_cuts=None, extract_fields=EXTRACT_FIELDS,
                   output_fields=OUTPUT_FIELDS):
    r"""Take the simulated and reconstructed HDF5 file(s) (as converted from
    I3 by icecube.hdfwriter.I3HDFTableService) as input and write out a
    simplified PISA-standard-format HDF5 file for use in aeff, reco, and/or
    PID stages.

    Parameters
    ----------
    data_files : dict
        File paths for finding data files for each run, formatted as:
            {
                <string run>: <list of file paths>,
                <string run>: <list of file paths>,
                ...
                <string run>: <list of file paths>,
            }

    detector : string
        Name of the detector (e.g. IceCube, DeepCore, PINGU, etc.) as found
        in e.g. mc_sim_run_settings.json and data_proc_params.json files.

    proc_ver
        Version of processing applied to the events, as found in e.g.
        data_proc_params.json.

    cut
        Name of a standard cut to use; must be specified in the relevant
        detector/processing version node of the data processing parameters
        (file from which the data_proc_params object was instantiated)

    outdir
        Directory path in which to store resulting files; will be generated
        if it does not already exist (including any parent directories that
        do not exist)

    run_settings : string or MCSimRunSettings
        Resource location of mc_sim_run_settings.json or an MCSimRunSettings
        object instantiated therefrom.

    data_proc_params : string or DataProcParams
        Resource location of data_proc_params.json or a DataProcParams object
        instantiated therefrom.

    join
        String specifying any flavor/interaction types (flavInts) to join
        together. Separate flavInts with commas (',') and separate groups
        with semicolons (';'). E.g. an acceptable string is:
            'numucc+numubarcc; nuall bar NC, nuall NC'

    cust_cuts
        dict with a single DataProcParams cut specification or list of same
        (see help for DataProcParams for detailed description of cut spec)

    extract_fields : None or iterable of strings
        Field names to extract from source HDF5 file. If None, extract all
        fields.

    output_fields : None or iterable of strings
        Fields to include in the generated PISA-standard-format events HDF5
        file; note that if 'weighted_aeff' is not present, effective area
        will not be computed. If None, all fields will be written.

    Notes
    -----
    Compute "weighted_aeff" field:

    Within each int type (CC or NC), ngen should be added together; events
    recorded of that int type then get their one_weight divided by the total
    *for that int type only* to obtain the "weighted_aeff" for that event
    (even if int types are being grouped/joined together).

    This has the effect that within a group, ... and within an interaction
    type, effective area is a weighted average of that of the flavors being
    combined. E.g. for CC,

                  \sum_{run x} \sum_{flav y} (Aeff_{x,y} * ngen_{x,y})
        Aeff_CC = ----------------------------------------------------- ,
                  \sum_{run x} \sum_{flav y} (ngen_{x,y})

    ... and then across interaction types, the results of the above for each
    int type need to be summed together, i.e.:

        Aeff_total = Aeff_CC + Aeff_NC

    Note that each grouping of flavors is calculated with the above math
    completely independently from other flavor groupings specified.
    See Justin Lanfranchi's presentation on the PINGU Analysis call,
    2015-10-21, for more details:
    https://wikispaces.psu.edu/download/attachments/282040606/meff_report_jllanfranchi_v05_2015-10-21.pdf

    """
    if isinstance(run_settings, str):
        run_settings = DetMCSimRunsSettings(
            find_resource(run_settings),
            detector=detector
        )
    assert isinstance(run_settings, DetMCSimRunsSettings)
    assert run_settings.detector == detector

    if isinstance(data_proc_params, str):
        data_proc_params = DataProcParams(
            detector=detector,
            proc_ver=proc_ver,
            data_proc_params=find_resource(data_proc_params)
        )
    assert data_proc_params.detector == detector
    assert data_proc_params.proc_ver == proc_ver

    runs = sorted(data_files.keys())

    all_flavs = []
    flavs_by_run = {}
    run_norm_factors = {}
    bin_edges = set()

    runs_by_flavint = FlavIntData()
    for flavint in runs_by_flavint.flavints:
        runs_by_flavint[flavint] = []

    #ngen_flavint_by_run = {run:FlavIntData() for run in runs}
    ##ngen_per_flav_by_run = {run:FlavIntData() for run in runs}
    #eint_per_flav_by_run = {run:FlavIntData() for run in runs}
    #for run in runs:
    #    flavints_in_run = run_settings.get_flavints(run=run)
    #    e_range = run_settings.get_energy_range(run)
    #    gamma = run_settings.get_spectral_index(run)
    #    for flavint in flavints_in_run:
    #        runs_by_flavint[flavint].append(run)
    #        ngen_flav = run_settings.get_num_gen(
    #            run=run, flav_or_flavint=flavint,
    #            include_physical_fract=True
    #        )
    #        #runs_by_flavint[flavint].append(run)
    #        #this_flav = flavint.
    #        #xsec_fract_en_wtd_avg[run][flavint] = \
    #        ngen_flavint_by_run[run][flavint] = \
    #            xsec.get_xs_ratio_integral(
    #                flavintgrp0=flavint,
    #                flavintgrp1=flavint.flav,
    #                e_range=e_range,
    #                gamma=gamma,
    #                average=True
    #            )
    #    xsec_ver = run_settings.get_xsec_version(run=run)
    #    if xsec_ver_ref is None:
    #        xsec_ver_ref = xsec_ver
    #    # An assumption of below logic is that all MC is generated using the
    #    # same cross sections version.
    #    #
    #    # TODO / NOTE:
    #    # It would be possible to combine runs with different cross sections
    #    # so long as each (flavor, interaction type) cross sections are
    #    # weighted-averaged together using weights
    #    #     N_gen_{n,flav+inttype} * E_x^{-gamma_n} /
    #    #         ( \int_{E_min_n}^{E_max_n} E^{-\gamma_n} dE )
    #    # where E_x are the energy sample points specified in the cross
    #    # sections (and hence these must also be identical across all cross
    #    # sections that get combined, unless interpolation is performed).
    #    assert xsec_ver == xsec_ver_ref
    #    #ngen_weighted_energy_integral[str(run)] = powerLawIntegral(
    #    #flavs_by_run[run] = run_settings.flavs(run)
    ##flavs_present =

    detector_geom = run_settings[runs[0]]['geom']

    # Create Events object to store data
    evts = Events()
    evts.metadata.update({
        'detector': run_settings.detector,
        'proc_ver': data_proc_params.proc_ver,
        'geom': detector_geom,
        'runs': runs,
    })

    cuts = []
    if isinstance(cust_cuts, dict):
        cust_cuts = [cust_cuts]
    if cut is not None:
        evts.metadata['cuts'].append(cut)
        cuts.append(cut)
    if cust_cuts is not None:
        for ccut in cust_cuts:
            evts.metadata['cuts'].append('custom: ' + ccut['pass_if'])
            cuts.append(ccut)

    orig_outdir = outdir
    outdir = expand(outdir)
    logging.info('Output dir spec\'d: %s', orig_outdir)
    if outdir != orig_outdir:
        logging.info('Output dir expands to: %s', outdir)
    mkdir(outdir)

    detector_label = str(data_proc_params.detector)
    proc_label = 'proc_' + str(data_proc_params.proc_ver)

    # What flavints to group together
    if join is None or join == '':
        grouped = []
        ungrouped = [NuFlavIntGroup(k) for k in ALL_NUFLAVINTS]
        groups_label = 'unjoined'
        logging.info('Events in the following groups will be joined'
                     ' together: (none)')
    else:
        grouped, ungrouped = xlateGroupsStr(join)
        evts.metadata['flavints_joined'] = [str(g) for g in grouped]
        groups_label = 'joined_G_' + '_G_'.join([str(g) for g in grouped])
        logging.info('Events in the following groups will be joined'
                     ' together: ' + '; '.join([str(g) for g in grouped]))

    # Find any flavints not included in the above groupings
    flavint_groupings = grouped + ungrouped
    if len(ungrouped) == 0:
        ungrouped = ['(none)']
    logging.info('Events of the following flavints will NOT be joined'
                 ' together: ' + '; '.join([str(k) for k in ungrouped]))

    # Enforce that flavints composing groups are mutually exclusive
    for grp_n, flavintgrp0 in enumerate(flavint_groupings[:-1]):
        for flavintgrp1 in flavint_groupings[grp_n + 1:]:
            assert len(set(flavintgrp0).intersection(set(flavintgrp1))) == 0

    flavintgrp_names = [str(flavintgrp) for flavintgrp in flavint_groupings]

    # Instantiate storage for all intermediate destination fields;
    # the data structure looks like:
    #   extracted_data[group #][interaction type][field name] = list of data
    if extract_fields is None:
        extracted_data = [
            {inttype: {} for inttype in ALL_NUINT_TYPES}
            for _ in flavintgrp_names
        ]
    else:
        extracted_data = [
            {inttype: {field: [] for field in extract_fields}
             for inttype in ALL_NUINT_TYPES}
            for _ in flavintgrp_names
        ]

    # Instantiate generated-event counts for destination fields; count CC
    # separately from NC because aeff's for CC & NC add, whereas aeffs
    # intra-CC should be weighted-averaged (as for intra-NC)
    ngen = [
        {inttype: {} for inttype in ALL_NUINT_TYPES}
        for _ in flavintgrp_names
    ]

    # Loop through all of the files, retrieving the events, filtering, and
    # recording the number of generated events pertinent to calculating aeff
    filecount = {}
    detector_geom = None
    bad_files = []
    for run, fnames in data_files.items():
        file_count = 0
        for fname in fnames:
            # Retrieve data from all nodes specified in the processing
            # settings file
            logging.trace('Trying to get data from file %s', fname)
            try:
                data = data_proc_params.get_data(
                    fname, run_settings=run_settings
                )
            except (ValueError, KeyError, IOError):
                logging.warning('Bad file encountered: %s', fname)
                bad_files.append(fname)
                continue

            file_count += 1

            # Check to make sure only one run is present in the data
            runs_in_data = set(data['run'])
            assert len(runs_in_data) == 1, 'Must be just one run in data'

            #run = int(data['run'][0])
            if run not in filecount:
                filecount[run] = 0
            filecount[run] += 1
            rs_run = run_settings[run]

            # Record geom; check that geom is consistent with other runs
            if detector_geom is None:
                detector_geom = rs_run['geom']
            assert rs_run['geom'] == detector_geom, \
                'All runs\' geometries must match!'

            # Loop through all flavints spec'd for run
            for run_flavint in rs_run['flavints']:
                barnobar = run_flavint.bar_code
                int_type = run_flavint.intType

                # Retrieve this-interaction-type- & this-barnobar-only
                # events that also pass cuts. (note that cut names are
                # strings)
                intonly_cut_data = data_proc_params.apply_cuts(
                    data,
                    cuts=cuts + [str(int_type), str(barnobar)],
                    return_fields=extract_fields
                )

                # Record the generated count and data for this run/flavor
                # for each group to which it's applicable
                for grp_n, flavint_group in enumerate(flavint_groupings):
                    if run_flavint not in flavint_group:
                        continue

                    # Instantiate a field for particles and antiparticles,
                    # keyed by the output of the bar_code property for each
                    if run not in ngen[grp_n][int_type]:
                        ngen[grp_n][int_type][run] = {
                            NuFlav(12).bar_code: 0,
                            NuFlav(-12).bar_code: 0,
                        }

                    # Record count only if it hasn't already been recorded
                    if ngen[grp_n][int_type][run][barnobar] == 0:
                        # Note that one_weight includes cc/nc:total fraction,
                        # so DO NOT specify the full flavint here, only flav
                        # (since one_weight does NOT take bar/nobar fraction,
                        # it must be included here in the ngen computation)
                        flav_ngen = run_settings.get_num_gen(
                            run=run, barnobar=barnobar
                        )
                        ngen[grp_n][int_type][run][barnobar] = flav_ngen

                    # Append the data. Note that extracted_data is:
                    #   extracted_data[group n][int_type][field name] = list
                    if extract_fields is None:
                        for f in intonly_cut_data.keys():
                            if f not in extracted_data[grp_n][int_type]:
                                extracted_data[grp_n][int_type][f] = []
                            extracted_data[grp_n][int_type][f].extend(
                                intonly_cut_data[f]
                            )
                    else:
                        for f in extract_fields:
                            extracted_data[grp_n][int_type][f].extend(
                                intonly_cut_data[f]
                            )
        logging.info('File count for run %s: %d', run, file_count)
    to_file(bad_files, '/tmp/bad_files.json')

    if ((output_fields is None
         and (extract_fields is None or 'one_weight' in extract_fields))
            or 'weighted_aeff' in output_fields):
        fmtfields = (' ' * 12 + 'flavint_group',
                     'int type',
                     ' run',
                     'part/anti',
                     'part/anti count',
                     'aggregate count')
        fmt_n = [len(f) for f in fmtfields]
        fmt = ' '.join([r'%' + str(n) + r's' for n in fmt_n])
        lines = ' '.join(['-' * n for n in fmt_n])
        logging.info(fmt, *fmtfields)
        logging.info(lines)
        for grp_n, flavint_group in enumerate(flavint_groupings):
            for int_type in set([fi.intType
                                 for fi in flavint_group.flavints]):
                ngen_it_tot = 0
                for run, run_counts in ngen[grp_n][int_type].items():
                    for barnobar, barnobar_counts in run_counts.items():
                        ngen_it_tot += barnobar_counts
                        logging.info(fmt, flavint_group.simple_str(),
                                     int_type, str(run), barnobar,
                                     int(barnobar_counts), int(ngen_it_tot))

                # Convert data to numpy array
                if extract_fields is None:
                    for field in extracted_data[grp_n][int_type].keys():
                        extracted_data[grp_n][int_type][field] = \
                            np.array(extracted_data[grp_n][int_type][field])
                else:
                    for field in extract_fields:
                        extracted_data[grp_n][int_type][field] = \
                            np.array(extracted_data[grp_n][int_type][field])

                # Generate weighted_aeff field for this group / int type's
                # data
                extracted_data[grp_n][int_type]['weighted_aeff'] = \
                    extracted_data[grp_n][int_type]['one_weight'] \
                    / ngen_it_tot * CMSQ_TO_MSQ

    # Report file count per run
    for run, count in filecount.items():
        logging.info('Files read, run %s: %d', run, count)
        ref_num_i3_files = run_settings[run]['num_i3_files']
        if count != ref_num_i3_files:
            logging.warning(
                'Run %s, Number of files read (%d) != number of '
                'source I3 files (%d), which may indicate an error.',
                run, count, ref_num_i3_files
            )

    # Generate output data
    for flavint in ALL_NUFLAVINTS:
        int_type = flavint.intType
        for grp_n, flavint_group in enumerate(flavint_groupings):
            if flavint not in flavint_group:
                logging.trace('flavint %s not in flavint_group %s, passing.',
                              flavint, flavint_group)
                continue
            logging.trace('flavint %s **IS** in flavint_group %s, storing.',
                          flavint, flavint_group)
            if output_fields is None:
                evts[flavint] = extracted_data[grp_n][int_type]
            else:
                evts[flavint] = {f: extracted_data[grp_n][int_type][f]
                                 for f in output_fields}

    # Generate file name
    numerical_runs = []
    alphanumerical_runs = []
    for run in runs:
        try:
            int(run)
            numerical_runs.append(int(run))
        except ValueError:
            alphanumerical_runs.append(str(run))
    run_labels = []
    if len(numerical_runs) > 0:
        run_labels.append(list2hrlist(numerical_runs))
    if len(alphanumerical_runs) > 0:
        run_labels += sorted(alphanumerical_runs)
    run_label = 'runs_' + ','.join(run_labels)
    geom_label = str(detector_geom)
    fname = 'events__' + '__'.join([
        detector_label,
        geom_label,
        run_label,
        proc_label,
        groups_label,
    ]) + '.hdf5'

    outfpath = os.path.join(outdir, fname)
    logging.info('Writing events to %s', outfpath)

    # Save data to output file
    evts.save(outfpath)
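# Usage sketch for makeEventsFile (all argument values are illustrative:
# paths, run number, detector, and proc_ver are hypothetical; the `join`
# string is the example given in the docstring above):
#
#     makeEventsFile(
#         data_files={'12550': ['/data/12550/part0.hdf5',
#                               '/data/12550/part1.hdf5']},
#         detector='deepcore',
#         proc_ver='5digit',
#         cut='analysis',
#         outdir='/tmp/events',
#         run_settings='events/mc_sim_run_settings.json',
#         data_proc_params='events/data_proc_params.json',
#         join='numucc+numubarcc; nuall bar NC, nuall NC',
#     )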