def _get_reco_kernels(self, reco_vbwkde_evts_file=None, evts_dict=None,
                      reco_vbwkde_make_plots=False, **kwargs):
    """Given a reco events resource (resource file name or dictionary),
    retrieve data from it then serialize and hash the data. If the object
    attribute kernels were computed from the same source data, simply return
    those. Otherwise, compute the kernels anew and return them.

    Arguments
    ---------
    NOTE: One--and only one--of the two arguments must be specified.

    reco_vbwkde_evts_file : str (or dict)
        Name or path to file containing event reco info. See doc for
        __init__ method for details about contents. If a dict is passed
        in, it is automatically populated to evts_dict (see below).

    evts_dict : dict
        Dictionary containing event reco info. Allows user to pass in a
        non-string-object to avoid re-loading a file to check whether the
        contents have changed each time. See doc for __init__ method for
        details about the dictionary's format.

    reco_vbwkde_make_plots : bool
        Whether to produce diagnostic plots while building the kernels.

    Returns
    -------
    kernels : dict
        VBWKDE-based reco kernels (also cached on self.kernels).
    """
    if not isinstance(reco_vbwkde_make_plots, bool):
        raise ValueError(
            "Option reco_vbwkde_make_plots must be specified and of bool type"
        )

    # Reco scales are meaningless for VBWKDE (resolutions come directly
    # from the events), so anything other than exactly 1 is an error.
    for reco_scale in ['e_reco_scale', 'cz_reco_scale']:
        if reco_scale in kwargs and kwargs[reco_scale] != 1:
            raise ValueError('%s = %.2f, must be 1.0 for RecoServiceVBWKDE!'
                             % (reco_scale, kwargs[reco_scale]))

    REMOVE_SIM_DOWNGOING = True

    if (reco_vbwkde_evts_file is not None) and (evts_dict is not None):
        raise TypeError(
            'One--and only one--of {reco_vbwkde_evts_file|evts_dict} ' +
            'may be specified'
        )

    # A dict passed via reco_vbwkde_evts_file is really an evts_dict; move
    # it over so the type dispatch below takes the dict branch.
    # BUG FIX: the original cleared `evts_dict` (the variable it had just
    # assigned) instead of `reco_vbwkde_evts_file`, so passing a dict here
    # always fell through to the TypeError below.
    if isinstance(reco_vbwkde_evts_file, dict):
        evts_dict = reco_vbwkde_evts_file
        reco_vbwkde_evts_file = None

    if isinstance(reco_vbwkde_evts_file, str):
        logging.info('Constructing VBWKDEs from event true & reco ' +
                     'info in file: %s' % reco_vbwkde_evts_file)
        fpath = find_resource(reco_vbwkde_evts_file)
        eventsdict = hdf.from_hdf(fpath)
        # Hash the file contents so identical source data is detected even
        # across separate calls.
        new_hash = utils.hash_file(fpath)
    elif isinstance(evts_dict, dict):
        eventsdict = evts_dict
        new_hash = utils.hash_obj(eventsdict)
    else:
        # BUG FIX: original message read "must beprovided" (missing space
        # between concatenated fragments).
        raise TypeError('A {reco_vbwkde_evts_file|evts_dict} must be ' +
                        'provided, where the former must be a str ' +
                        'and the latter must be a dict.')

    # Cache hit: kernels already computed from identical source data.
    if (self.kernels is not None) and (new_hash == self.reco_events_hash):
        return self.kernels

    self.kernels = self.all_kernels_from_events(
        eventsdict=eventsdict,
        remove_sim_downgoing=REMOVE_SIM_DOWNGOING,
        make_plots=reco_vbwkde_make_plots
    )
    self.reco_events_hash = new_hash

    return self.kernels
def all_kernels_from_events(self, eventsdict, remove_sim_downgoing,
                            make_plots=False):
    """Build VBWKDE-based PISA reco kernels for every flavor/interaction
    type from a reco events dictionary.

    MC data is grouped by flavor & interaction type; if two groups carry
    byte-identical event data, the already-computed kernels are deep-copied
    rather than recomputed.

    Arguments
    ---------
    eventsdict : dict
        Dictionary containing event reco info. See docstr for __init__
        for details.
    remove_sim_downgoing : bool
        Whether to remove MC-true downgoing events prior to computing
        resolutions.
    make_plots : bool
        Forwarded to single_kernel_set.

    Returns
    -------
    kernels : dict
        Per-flavor/per-int-type kernels plus 'ebins' and 'czbins' entries.
    """
    all_flavs = ['nue', 'nue_bar', 'numu', 'numu_bar',
                 'nutau', 'nutau_bar']
    all_ints = ['cc', 'nc']

    kernels = {flav: {} for flav in all_flavs}
    kernels['ebins'] = self.ebins
    kernels['czbins'] = self.czbins

    # Maps hash of (true, reco) event arrays -> (flav, int_type) whose
    # kernels were computed from that exact data.
    computed_datahashes = {}

    for flav, int_type in itertools.product(all_flavs, all_ints):
        logging.info("Working on %s/%s kernels" % (flav, int_type))
        evts = eventsdict[flav][int_type]
        e_true = evts['true_energy']
        e_reco = evts['reco_energy']
        cz_true = evts['true_coszen']
        cz_reco = evts['reco_coszen']

        if remove_sim_downgoing:
            logging.info("Removing simulated downgoing "
                         "events in KDE construction.")
            # Keep only MC-true upgoing events (cos(zenith) < 0).
            upgoing = np.where(cz_true < 0.0)
            e_true = e_true[upgoing]
            e_reco = e_reco[upgoing]
            cz_true = cz_true[upgoing]
            cz_reco = cz_reco[upgoing]

        datahash = utils.hash_obj((e_true.tolist(), e_reco.tolist(),
                                   cz_true.tolist(), cz_reco.tolist()))

        previous = computed_datahashes.get(datahash)
        if previous is not None:
            ref_flav, ref_int_type = previous
            logging.info(" > Found duplicate source data; "
                         "copying kernels already computed for "
                         "%s/%s to %s/%s."
                         % (ref_flav, ref_int_type, flav, int_type))
            kernels[flav][int_type] = copy.deepcopy(
                kernels[ref_flav][ref_int_type]
            )
            continue

        kernels[flav][int_type] = self.single_kernel_set(
            e_true=e_true, cz_true=cz_true,
            e_reco=e_reco, cz_reco=cz_reco,
            flav=flav, int_type=int_type,
            make_plots=make_plots, out_dir=None
        )
        computed_datahashes[datahash] = (flav, int_type)

    return kernels
def all_kernels_from_events(self, eventsdict, remove_sim_downgoing,
                            make_plots=False):
    """Given a reco events dictionary, retrieve reco/true information
    from it, group MC data by flavor & interaction type, and return
    VBWKDE-based PISA reco kernels for all flavors/types. Checks are
    performed if duplicate data has already been computed, in which case
    a (deep) copy of the already-computed kernels are populated.

    Arguments
    ---------
    eventsdict : dict
        Dictionary containing event reco info. See docstr for __init__
        for details.
    remove_sim_downgoing : bool
        Whether to remove MC-true downgoing events prior to computing
        resolutions.
    make_plots : bool
        Forwarded to single_kernel_set.

    Returns
    -------
    kernels : dict
        Nested dict keyed first by flavor then by int type, plus 'ebins'
        and 'czbins' entries copied from this object.
    """
    all_flavs = \
        ['nue', 'nue_bar', 'numu', 'numu_bar', 'nutau', 'nutau_bar']
    all_ints = ['cc', 'nc']
    # All (flavor, interaction-type) combinations to process.
    flav_ints = itertools.product(all_flavs, all_ints)

    kernels = {f: {} for f in all_flavs}
    kernels['ebins'] = self.ebins
    kernels['czbins'] = self.czbins

    # Maps hash of the event arrays -> (flav, int_type) already computed
    # from that exact data, so duplicates are copied rather than redone.
    computed_datahashes = {}
    for flav, int_type in flav_ints:
        logging.info("Working on %s/%s kernels" % (flav, int_type))
        e_true = eventsdict[flav][int_type]['true_energy']
        e_reco = eventsdict[flav][int_type]['reco_energy']
        cz_true = eventsdict[flav][int_type]['true_coszen']
        cz_reco = eventsdict[flav][int_type]['reco_coszen']

        if remove_sim_downgoing:
            logging.info("Removing simulated downgoing " +
                         "events in KDE construction.")
            # Keep only MC-true upgoing events: cos(zenith) < 0.
            keep_inds = np.where(cz_true < 0.0)
            e_true = e_true[keep_inds]
            e_reco = e_reco[keep_inds]
            cz_true = cz_true[keep_inds]
            cz_reco = cz_reco[keep_inds]

        # Hash the (listified) arrays to detect byte-identical source data.
        datahash = utils.hash_obj((e_true.tolist(), e_reco.tolist(),
                                   cz_true.tolist(), cz_reco.tolist()))
        if datahash in computed_datahashes:
            ref_flav, ref_int_type = computed_datahashes[datahash]
            logging.info(" > Found duplicate source data; " +
                         "copying kernels already computed for " +
                         "%s/%s to %s/%s."
                         % (ref_flav, ref_int_type, flav, int_type))
            # Deep copy so later mutation of one entry can't affect the other.
            kernels[flav][int_type] = copy.deepcopy(
                kernels[ref_flav][ref_int_type])
            continue

        kernels[flav][int_type] = self.single_kernel_set(
            e_true=e_true, cz_true=cz_true, e_reco=e_reco, cz_reco=cz_reco,
            flav=flav, int_type=int_type, make_plots=make_plots,
            out_dir=None)
        computed_datahashes[datahash] = (flav, int_type)

    return kernels
def _get_reco_kernels(self, reco_vbwkde_evts_file=None, evts_dict=None,
                      reco_vbwkde_make_plots=False, **kwargs):
    """Given a reco events resource (resource file name or dictionary),
    retrieve data from it then serialize and hash the data. If the object
    attribute kernels were computed from the same source data, simply return
    those. Otherwise, compute the kernels anew and return them.

    Arguments
    ---------
    NOTE: One--and only one--of the two arguments must be specified.

    reco_vbwkde_evts_file : str (or dict)
        Name or path to file containing event reco info. See doc for
        __init__ method for details about contents. If a dict is passed
        in, it is automatically populated to evts_dict (see below).

    evts_dict : dict
        Dictionary containing event reco info. Allows user to pass in a
        non-string-object to avoid re-loading a file to check whether the
        contents have changed each time. See doc for __init__ method for
        details about the dictionary's format.

    reco_vbwkde_make_plots : bool
        Whether to produce diagnostic plots while building the kernels.

    Returns
    -------
    kernels : dict
        VBWKDE-based reco kernels (also cached on self.kernels).
    """
    if not isinstance(reco_vbwkde_make_plots, bool):
        raise ValueError(
            "Option reco_vbwkde_make_plots must be specified and of bool type"
        )

    # Reco scales are meaningless for VBWKDE (resolutions come directly
    # from the events), so anything other than exactly 1 is an error.
    for reco_scale in ['e_reco_scale', 'cz_reco_scale']:
        if reco_scale in kwargs and kwargs[reco_scale] != 1:
            raise ValueError(
                '%s = %.2f, must be 1.0 for RecoServiceVBWKDE!'
                % (reco_scale, kwargs[reco_scale]))

    REMOVE_SIM_DOWNGOING = True

    if (reco_vbwkde_evts_file is not None) and (evts_dict is not None):
        raise TypeError(
            'One--and only one--of {reco_vbwkde_evts_file|evts_dict} ' +
            'may be specified')

    # A dict passed via reco_vbwkde_evts_file is really an evts_dict; move
    # it over so the type dispatch below takes the dict branch.
    # BUG FIX: the original cleared `evts_dict` (the variable it had just
    # assigned) instead of `reco_vbwkde_evts_file`, so passing a dict here
    # always fell through to the TypeError below.
    if isinstance(reco_vbwkde_evts_file, dict):
        evts_dict = reco_vbwkde_evts_file
        reco_vbwkde_evts_file = None

    if isinstance(reco_vbwkde_evts_file, str):
        logging.info('Constructing VBWKDEs from event true & reco ' +
                     'info in file: %s' % reco_vbwkde_evts_file)
        fpath = find_resource(reco_vbwkde_evts_file)
        eventsdict = hdf.from_hdf(fpath)
        # Hash the file contents so identical source data is detected even
        # across separate calls.
        new_hash = utils.hash_file(fpath)
    elif isinstance(evts_dict, dict):
        eventsdict = evts_dict
        new_hash = utils.hash_obj(eventsdict)
    else:
        # BUG FIX: original message read "must beprovided" (missing space
        # between concatenated fragments).
        raise TypeError('A {reco_vbwkde_evts_file|evts_dict} must be ' +
                        'provided, where the former must be a str ' +
                        'and the latter must be a dict.')

    # Cache hit: kernels already computed from identical source data.
    if (self.kernels is not None) and (new_hash == self.reco_events_hash):
        return self.kernels

    self.kernels = self.all_kernels_from_events(
        eventsdict=eventsdict,
        remove_sim_downgoing=REMOVE_SIM_DOWNGOING,
        make_plots=reco_vbwkde_make_plots)
    self.reco_events_hash = new_hash

    return self.kernels
def store_recursively(fhandle, node, path=None, node_hashes=None):
    """Recursively store a nested-dict `node` into an open HDF5 file.

    Dict nodes become HDF5 groups; leaf nodes become datasets. Leaf data
    already stored (detected by hash) is hardlinked to the existing
    dataset rather than duplicated.

    Arguments
    ---------
    fhandle
        Open, writable h5py file handle.
    node
        Dict (stored as a group, recursing into values) or a leaf value
        (stored as a dataset). `None` leaves are converted to np.nan.
    path : list of str or None
        Path components accumulated so far; None (-> []) at top level.
    node_hashes : dict or None
        Maps hash of already-stored leaf data -> its full path, used for
        hardlinking duplicates; None (-> {}) at top level.
    """
    if path is None:
        path = []
    if node_hashes is None:
        node_hashes = {}
    full_path = '/' + '/'.join(path)
    if isinstance(node, dict):
        logging.trace(" creating Group `%s`" % full_path)
        try:
            fhandle.create_group(full_path)
        except ValueError:
            # Group already exists; that's fine.
            pass
        # FIX: sorted(node.keys()) instead of py2-only node.iterkeys();
        # identical behavior under both Python 2 and 3.
        for key in sorted(node.keys()):
            key_str = str(key)
            if not isinstance(key, str):
                # FIX: added missing space after the backtick in the
                # original message ("`foo`for use ...").
                logging.warn('Stringifying key `' + key_str +
                             '` for use as name in HDF5 file')
            val = node[key]
            new_path = path + [key_str]
            store_recursively(fhandle=fhandle, node=val, path=new_path,
                              node_hashes=node_hashes)
    else:
        # Check whether identical leaf data was already stored
        node_hash = utils.hash_obj(node)
        if node_hash in node_hashes:
            logging.trace(" creating hardlink for Dataset: `%s` -> `%s`"
                          % (full_path, node_hashes[node_hash]))
            # Hardlink the matching existing dataset
            fhandle[full_path] = fhandle[node_hashes[node_hash]]
            return

        # For now, convert None to np.nan since h5py appears to not
        # handle None
        if node is None:
            node = np.nan
            logging.warn(" encountered `None` at node `%s`; converting to"
                         " np.nan" % full_path)

        # "Scalar datasets don't support chunk/filter options". Shuffling
        # is a good idea otherwise since subsequent compression will
        # generally benefit; shuffling requires chunking. Compression is
        # not done here since it is slow.
        if np.isscalar(node):
            shuffle = False
            chunks = None
        else:
            shuffle = True
            chunks = True

        # Record this node's hash -> path so later duplicates hardlink to
        # it. NOTE(review): stored for scalars too, even though the
        # hardlink optimization mainly matters for larger datasets.
        node_hashes[node_hash] = full_path

        # TODO: Treat strings as follows? Would this break compatibility
        # with pytables/Pandas? What are benefits? Leaving out for now.
        # if isinstance(node, basestr):
        #     dtype = h5py.special_dtype(vlen=str)
        #     fh.create_dataset(k,data=v,dtype=dtype)

        logging.trace(" creating dataset at node `%s`" % full_path)
        try:
            fhandle.create_dataset(name=full_path, data=node, chunks=chunks,
                                   compression=None, shuffle=shuffle,
                                   fletcher32=False)
        except TypeError:
            # h5py raises TypeError for scalar data with chunk/shuffle
            # options; retry with those disabled.
            try:
                shuffle = False
                chunks = None
                fhandle.create_dataset(name=full_path, data=node,
                                       chunks=chunks, compression=None,
                                       shuffle=shuffle, fletcher32=False)
            # FIX: narrowed bare `except:` (which also caught
            # KeyboardInterrupt/SystemExit) to `except Exception:`;
            # still logs context and re-raises.
            except Exception:
                logging.error(' full_path: ' + full_path)
                logging.error(' chunks   : ' + str(chunks))
                logging.error(' shuffle  : ' + str(shuffle))
                logging.error(' node     : ' + str(node))
                raise