def main():
    """Main function for calling summarize_clsim_table as a script"""
    t0 = time()
    args = parse_args()
    kwargs = vars(args)
    table_fpaths = []
    for fpath in kwargs.pop('table-fpaths'):
        table_fpaths.extend(glob(expand(fpath)))
    for fpath in table_fpaths:
        kwargs['table_fpath'] = fpath
        summarize_clsim_table(**kwargs)
    total_time = time() - t0
    if len(table_fpaths) > 1:
        avg = np.round(total_time / len(table_fpaths), 3)
        wstderr('Average time to summarize tables: {} s/table\n'.format(avg))
def summarize_clsim_table(table_fpath, table=None, save_summary=True,
                          outdir=None):
    """
    Parameters
    ----------
    table_fpath : string
        Path to table (or just the table's filename if `outdir` is specified)

    table : mapping, optional
        If the table has already been loaded, it can be passed here to avoid
        re-loading the table.

    save_summary : bool
        Whether to save the table summary to disk.

    outdir : string, optional
        If `save_summary` is True, write the summary to this directory. If
        `outdir` is not specified and `save_summary` is True, the summary will
        be written to the same directory that contains `table_fpath`.

    Returns
    -------
    table
        See `load_clsim_table` for details of the data structure

    summary : OrderedDict

    """
    t_start = time()
    if save_summary:
        from pisa.utils.jsons import from_json, to_json

    table_fpath = expand(table_fpath)
    srcdir, clsim_fname = dirname(table_fpath), basename(table_fpath)
    invalid_fname = False
    try:
        fname_info = interpret_clsim_table_fname(clsim_fname)
    except ValueError:
        invalid_fname = True
        fname_info = {}

    if outdir is None:
        outdir = srcdir
    outdir = expand(outdir)
    mkdir(outdir)

    if invalid_fname:
        metapath = None
    else:
        metaname = CLSIM_TABLE_METANAME_PROTO[-1].format(
            hash_val=fname_info['hash_val'])
        metapath = join(outdir, metaname)

    if metapath and isfile(metapath):
        meta = from_json(metapath)
    else:
        meta = dict()

    if table is None:
        table = load_clsim_table(table_fpath)

    summary = OrderedDict()
    for key in table.keys():
        if key == 'table':
            continue
        summary[key] = table[key]
    if fname_info:
        for key in ('hash_val', 'string', 'depth_idx', 'seed'):
            summary[key] = fname_info[key]
    # TODO: Add hole ice info when added to tray_kw_to_hash
    if meta:
        summary['n_events'] = meta['tray_kw_to_hash']['NEvents']
        summary['ice_model'] = meta['tray_kw_to_hash']['IceModel']
        summary['tilt'] = not meta['tray_kw_to_hash']['DisableTilt']
        for key, val in meta.items():
            if key.endswith('_binning_kw'):
                summary[key] = val
    elif 'fname_version' in fname_info and fname_info['fname_version'] == 1:
        summary['n_events'] = fname_info['n_events']
        summary['ice_model'] = 'spice_mie'
        summary['tilt'] = False
        summary['r_binning_kw'] = dict(min=0.0, max=400.0, n_bins=200, power=2)
        summary['costheta_binning_kw'] = dict(min=-1, max=1, n_bins=40)
        summary['t_binning_kw'] = dict(min=0.0, max=3000.0, n_bins=300)
        summary['costhetadir_binning_kw'] = dict(min=-1, max=1, n_bins=20)
        summary['deltaphidir_binning_kw'] = dict(min=0.0, max=np.pi, n_bins=20)

    # Save marginal distributions and info to file

    norm = (
        1
        / table['n_photons']
        / (SPEED_OF_LIGHT_M_PER_NS / table['phase_refractive_index']
           * np.mean(np.diff(table['t_bin_edges'])))
        #* table['angular_acceptance_fract']
        * (len(table['costheta_bin_edges']) - 1)
    )
    summary['norm'] = norm

    dim_names = ('r', 'costheta', 't', 'costhetadir', 'deltaphidir')
    n_dims = len(table['table_shape'])
    assert n_dims == len(dim_names)

    # Apply norm to underflow and overflow so magnitudes can be compared
    # relative to plotted marginal distributions
    for flow, idx in product(('underflow', 'overflow'), iter(range(n_dims))):
        summary[flow][idx] = summary[flow][idx] * norm

    wstderr('Finding marginal distributions...\n')
    wstderr(' masking off zeros in table...')
    t0 = time()
    nonzero_table = np.ma.masked_equal(table['table'], 0)
    wstderr(' ({} ms)\n'.format(np.round((time() - t0) * 1e3, 3)))

    t0_marg = time()
    summary['dimensions'] = OrderedDict()
    for keep_axis, ax_name in zip(tuple(range(n_dims)), dim_names):
        remove_axes = list(range(n_dims))
        remove_axes.pop(keep_axis)
        remove_axes = tuple(remove_axes)
        axis = OrderedDict()

        wstderr(' mean across non-{} axes...'.format(ax_name))
        t0 = time()
        axis['mean'] = norm * np.asarray(
            np.mean(table['table'], axis=remove_axes))
        wstderr(' ({} s)\n'.format(np.round(time() - t0, 3)))

        wstderr(' median across non-{} axes...'.format(ax_name))
        t0 = time()
        axis['median'] = norm * np.asarray(
            np.ma.median(nonzero_table, axis=remove_axes))
        wstderr(' ({} s)\n'.format(np.round(time() - t0, 3)))

        wstderr(' max across non-{} axes...'.format(ax_name))
        t0 = time()
        axis['max'] = norm * np.asarray(
            np.max(table['table'], axis=remove_axes))
        wstderr(' ({} s)\n'.format(np.round(time() - t0, 3)))

        summary['dimensions'][ax_name] = axis
    wstderr(' Total time to find marginal distributions: {} s\n'.format(
        np.round(time() - t0_marg, 3)))

    if save_summary:
        ext = None
        base_fname = clsim_fname
        while ext not in ('', '.fits'):
            base_fname, ext = splitext(base_fname)
            ext = ext.lower()
        outfpath = join(outdir, base_fname + '_summary.json.bz2')
        to_json(summary, outfpath)
        print('saved summary to "{}"'.format(outfpath))

    wstderr('Time to summarize table: {} s\n'.format(
        np.round(time() - t_start, 3)))

    return table, summary
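# Illustrative usage sketch (not part of the original module): summarize a
# single table from Python rather than via the command-line `main`. The table
# path and output directory below are hypothetical.
def _example_summarize_table():
    table, summary = summarize_clsim_table(
        table_fpath='/data/retro_tables/clsim_table_example.fits',
        outdir='/data/retro_table_summaries',
    )
    # summary['norm'] is the constant applied to the marginal distributions
    # stored under summary['dimensions']
    return summary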
def load_ckv_table_compr(fpath, mmap):
    """Load a Cherenkov table from disk.

    Parameters
    ----------
    fpath : string
        Path to directory containing the table's .npy files.

    mmap : bool
        Whether to memory map the table (if it's stored in a directory
        containing .npy files).

    Returns
    -------
    table : OrderedDict
        Items are
            - 'n_photons' :
            - 'group_refractive_index' :
            - 'phase_refractive_index' :
            - 'r_bin_edges' :
            - 'costheta_bin_edges' :
            - 't_bin_edges' :
            - 'costhetadir_bin_edges' :
            - 'deltaphidir_bin_edges' :
            - 'ckv_template_map' : np.ndarray
            - 't_indep_ckv_table' : np.ndarray (if available)

    """
    fpath = expand(fpath)
    table = OrderedDict()

    if mmap:
        mmap_mode = 'r'
    else:
        mmap_mode = None

    if DEBUG:
        wstderr('Loading ckv table from {} ...\n'.format(fpath))

    if isfile(fpath):
        assert basename(fpath) == 'ckv_table.npy'
        fpath = dirname(fpath)

    t0 = time()
    indir = fpath

    for key in CKV_TABLE_KEYS + ['t_indep_ckv_table']:
        fpath = join(indir, key + '.npy')
        if DEBUG:
            wstderr(' loading {} from "{}" ...'.format(key, fpath))
        t1 = time()
        if isfile(fpath):
            # Pass `mmap_mode` so the `mmap` argument actually takes effect
            table[key] = np.load(fpath, mmap_mode=mmap_mode)
        elif key != 't_indep_ckv_table':
            raise ValueError(
                'Could not find file "{}" for loading table key "{}"'
                .format(fpath, key)
            )
        if DEBUG:
            wstderr(' ({} ms)\n'.format(np.round((time() - t1)*1e3, 3)))

    if DEBUG:
        wstderr(' Total time to load: {} s\n'.format(np.round(time() - t0, 3)))

    return table
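# Illustrative usage sketch (not part of the original module): memory-map a
# Cherenkov table stored as a directory of .npy files; the directory path is
# hypothetical.
def _example_load_ckv_table():
    ckv_table = load_ckv_table_compr('~/retro_tables/ckv_example_dir', mmap=True)
    # With mmap=True the arrays are opened with numpy's mmap_mode='r', so the
    # large table payloads are not read into memory until accessed
    return ckv_table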
def load_clsim_table_minimal(fpath, mmap=False, include_overflow=False):
    """Load a CLSim table from disk (optionally compressed with zstd).

    Similar to the `load_clsim_table` function but the full table, including
    under/overflow bins, is kept and no normalization or further processing is
    performed on the table data besides populating the output OrderedDict.

    Parameters
    ----------
    fpath : string
        Path to file to be loaded. If the file has extension 'zst', 'zstd', or
        'zstandard', the file will be decompressed using the
        `python-zstandard` Python library before passing to `fits` for
        interpreting.

    mmap : bool, optional
        Whether to memory map the table

    include_overflow : bool, optional
        By default, overflow bins (if present) are removed

    Returns
    -------
    table : OrderedDict

    """
    t0 = time()
    table = OrderedDict()

    fpath = expand(fpath)

    if DEBUG:
        wstderr('Loading table from {} ...\n'.format(fpath))

    if isdir(fpath):
        indir = fpath
        if mmap:
            mmap_mode = 'r'
        else:
            mmap_mode = None
        for rel_fpath in listdir(indir):
            key, ext = splitext(rel_fpath)
            abs_fpath = join(indir, rel_fpath)

            if not (isfile(abs_fpath) and ext == '.npy'):
                continue

            if DEBUG:
                wstderr(' loading {} from "{}" ...'.format(key, abs_fpath))
            t1 = time()
            val = np.load(abs_fpath, mmap_mode=mmap_mode)

            # Pull "small" things (less than 10 MiB) into memory so we don't
            # have too many file handles open due to memory mapping
            if mmap and val.nbytes < 10 * 1024**2:
                val = np.copy(val)

            table[key] = val

            if DEBUG:
                wstderr(' ({} ms)\n'.format(np.round((time() - t1) * 1e3, 3)))

    elif isfile(fpath):
        from astropy.io import fits
        fobj = get_decompressd_fobj(fpath)
        pf_table = None
        try:
            pf_table = fits.open(fobj, mode='readonly', memmap=mmap)

            header = pf_table[0].header  # pylint: disable=no-member
            table['table_shape'] = np.array(pf_table[0].data.shape, dtype=int)  # pylint: disable=no-member
            table['group_refractive_index'] = set_explicit_dtype(
                force_little_endian(header['_i3_n_group']))
            table['phase_refractive_index'] = set_explicit_dtype(
                force_little_endian(header['_i3_n_phase']))
            n_dims = len(table['table_shape'])

            new_style = False
            axnames = [None] * n_dims
            binning = [None] * n_dims
            for key in header.keys():
                if not key.startswith('_i3_ax_'):
                    continue
                new_style = True
                axnum = header[key]
                axname = key[len('_i3_ax_'):]
                be0 = header['_i3_{}_min'.format(axname)]
                be1 = header['_i3_{}_max'.format(axname)]
                n_bins = header['_i3_{}_n_bins'.format(axname)]
                power = header.get('_i3_{}_power'.format(axname), 1)
                bin_edges = force_little_endian(pf_table[axnum + 1].data)  # pylint: disable=no-member
                assert np.isclose(bin_edges[0], be0), '%f .. %f' % (be0, bin_edges[0])
                assert np.isclose(bin_edges[-1], be1), '%f .. %f' % (be1, bin_edges[-1])
                assert len(bin_edges) == n_bins + 1, '%d vs. %d' % (
                    len(bin_edges), n_bins + 1)
                assert np.allclose(
                    bin_edges,
                    powerspace(start=be0, stop=be1, num=n_bins + 1,
                               power=power),
                )
                axnames[axnum] = axname
                binning[axnum] = bin_edges

            if not new_style:
                if n_dims == 5:
                    axnames = [
                        'r', 'costheta', 't', 'costhetadir', 'deltaphidir'
                    ]
                elif n_dims == 6:
                    axnames = [
                        'r', 'costheta', 'phi', 't', 'costhetadir',
                        'deltaphidir'
                    ]
                else:
                    raise NotImplementedError(
                        '{}-dimensional table not handled for old-style CLSim'
                        ' tables'.format(n_dims))
                binning = [
                    force_little_endian(pf_table[i + 1].data).flat
                    for i in range(len(axnames))
                ]  # pylint: disable=no-member

            for axnum, (axname, bin_edges) in enumerate(zip(axnames, binning)):
                assert axname is not None, 'missing axis %d name' % axnum
                assert bin_edges is not None, 'missing axis %d binning' % axnum

            dtype = np.dtype([(axname, np.float64, dim.size)
                              for axname, dim in zip(axnames, binning)])
            table['binning'] = np.array(tuple(binning), dtype=dtype)

            for keyroot in GENERIC_KEYS:
                keyname = '_i3_' + keyroot
                if keyname in header:
                    val = force_little_endian(header[keyname])
                    if keyroot in (
                        't_is_residual_time',
                        'disable_tilt',
                        'disable_anisotropy',
                    ):
                        val = np.bool8(val)
                    else:
                        val = set_explicit_dtype(val)
                    table[keyroot] = val

            # Get string values from keys that have a prefix preceded by the
            # value all in the key (I3 software had issues saving strings as
            # values in the header "dict" so the workaround was to store the
            # string value in this way)
            for infix in INFIX_KEYS:
                keyroot = '_i3_' + infix + '_'
                for keyname in header.keys():
                    if not keyname.startswith(keyroot):
                        continue
                    val = keyname[len(keyroot):]
                    table[infix] = np.string0(val)

            if include_overflow:
                slicer = (slice(None),) * n_dims
            else:
                slicer = (slice(1, -1),) * n_dims
            table['table'] = force_little_endian(pf_table[0].data[slicer])  # pylint: disable=no-member

            wstderr(' (load took {} s)\n'.format(np.round(time() - t0, 3)))

        except:
            wstderr('ERROR: Failed to load "{}"\n'.format(fpath))
            raise

        finally:
            del pf_table
            if hasattr(fobj, 'close'):
                fobj.close()
            del fobj

    else:  # fpath is neither dir nor file
        raise ValueError('Table does not exist at path "{}"'.format(fpath))

    if 'step_length' not in table:
        table['step_length'] = 1

    if 't_is_residual_time' not in table:
        table['t_is_residual_time'] = True

    if DEBUG:
        wstderr(' Total time to load: {} s\n'.format(np.round(time() - t0, 3)))

    return table
def load_clsim_table(fpath, angular_acceptance_fract, quantum_efficiency):
    """Load a CLSim table from disk (optionally compressed with zstd).

    Parameters
    ----------
    fpath : string
        Path to file to be loaded. If the file has extension 'zst', 'zstd', or
        'zstandard', the file will be decompressed using the
        `python-zstandard` Python library before passing to `fits` for
        interpreting.

    angular_acceptance_fract : scalar
        Used (together with `quantum_efficiency`) to compute the table
        normalization.

    quantum_efficiency : scalar

    Returns
    -------
    table : OrderedDict
        Items include
            - 'table_shape' : tuple of int
            - 'table' : np.ndarray
            - 't_indep_table' : np.ndarray
            - 'n_photons' :
            - 'group_refractive_index' :
            - 'phase_refractive_index' :

        If the table is 5D, items also include
            - 'r_bin_edges' :
            - 'costheta_bin_edges' :
            - 't_bin_edges' :
            - 'costhetadir_bin_edges' :
            - 'deltaphidir_bin_edges' :
            - 'table_norm'

    """
    table = OrderedDict()

    assert isfile(fpath)

    table = load_clsim_table_minimal(fpath=fpath, include_overflow=True)

    if 'is_normed' not in table:
        table['is_normed'] = False
    is_normed = table['is_normed']

    if not is_normed:
        table['table_norm'] = get_table_norm(
            angular_acceptance_fract=angular_acceptance_fract,
            quantum_efficiency=quantum_efficiency,
            step_length=table['step_length'],
            **{k: table[k] for k in TABLE_NORM_KEYS if k != 'step_length'}
        )
        table['t_indep_table_norm'] = (
            quantum_efficiency * angular_acceptance_fract
        )

    wstderr('Interpreting table...\n')
    t0 = time()
    n_dims = len(table['table_shape'])

    # Cut off first and last bin in each dimension (underflow and
    # overflow bins)
    slice_wo_overflow = (slice(1, -1),) * n_dims
    wstderr(' slicing to remove underflow/overflow bins...')
    t0 = time()
    table_wo_overflow = table['table'][slice_wo_overflow]
    wstderr(' ({} ms)\n'.format(np.round((time() - t0) * 1e3)))

    wstderr(' slicing and summarizing underflow and overflow...')
    t0 = time()
    underflow, overflow = [], []
    for n in range(n_dims):
        sl = tuple([slice(1, -1)] * n + [0] + [slice(1, -1)] * (n_dims - 1 - n))
        underflow.append(table['table'][sl].sum())

        sl = tuple([slice(1, -1)] * n + [-1] + [slice(1, -1)] * (n_dims - 1 - n))
        overflow.append(table['table'][sl].sum())
    wstderr(' ({} ms)\n'.format(np.round((time() - t0) * 1e3)))

    table['table'] = table_wo_overflow
    table['underflow'] = np.array(underflow)
    table['overflow'] = np.array(overflow)

    return table
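# Illustrative usage sketch (not part of the original module): load a table and
# attach normalization constants; the path and efficiency values below are
# hypothetical.
def _example_load_clsim_table():
    table = load_clsim_table(
        fpath='/data/retro_tables/clsim_table_example.fits',
        angular_acceptance_fract=0.338,  # hypothetical value
        quantum_efficiency=0.25,         # hypothetical value
    )
    # Underflow/overflow bins have been stripped from table['table'] and their
    # per-dimension sums stored in table['underflow'] and table['overflow']
    return table['table_norm']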
def get_all_stats(
    outdir,
    min_pulses_per_event,
    overwrite=False,
    only_sets=None,
    processes=None,
    verbosity=0,
):
    """Get stats for all data and MC sets.

    Parameters
    ----------
    outdir : string

    min_pulses_per_event : int >= 0

    overwrite : bool, optional
        Whether to overwrite any existing stats files

    only_sets : string, iterable thereof, or None, optional
        If specified, string(s) must be keys of `MC_NAME_DIRINFOS` and/or
        `DATA_NAME_DIRINFOS` dicts.

    processes : None or int > 0, optional

    verbosity : int >= 0, optional

    Returns
    -------
    stats : OrderedDict
        Keys are dataset names and values are OrderedDicts containing the
        stats for the corresponding datasets.

    """
    outdir = expand(outdir)
    if isinstance(only_sets, string_types):
        only_sets = [only_sets]

    to_process = chain.from_iterable(
        [MC_NAME_DIRINFOS.items(), DATA_NAME_DIRINFOS.items()])

    if only_sets is not None:
        only_sets = [s.split("/") for s in only_sets]
        new_to_process = []
        for set_name, subsets_list in to_process:
            new_subsets_list = []
            for only_set in only_sets:
                if set_name != only_set[0]:
                    continue
                if len(only_set) == 1:
                    new_subsets_list = subsets_list
                    break
                else:
                    for subset in subsets_list:
                        if subset["id"] == only_set[1]:
                            new_subsets_list.append(subset)
            if len(new_subsets_list) > 0:
                new_to_process.append((set_name, new_subsets_list))
        to_process = new_to_process
        #((key, val) for key, val in to_process if key in only_sets)

    print(to_process)
    mkdir(outdir)
    stats = OrderedDict()
    for name, dirinfos in to_process:
        t0 = time.time()
        this_stats = OrderedDict()
        for dirinfo in dirinfos:
            augmented_name = "{}.{}".format(name, dirinfo["id"])
            outfile = join(outdir, "stats_{}.npz".format(augmented_name))
            if isfile(outfile) and not overwrite:
                contents = OrderedDict(
                    [(k, v) for k, v in np.load(outfile).items()])
                if verbosity >= 1:
                    wstderr(
                        'loaded stats for set "{}" from file "{}" ({} sec)\n'
                        .format(augmented_name, outfile, time.time() - t0))
            else:
                contents = get_stats(
                    min_pulses_per_event=min_pulses_per_event,
                    dirinfo=dirinfo,
                    processes=processes,
                    verbosity=verbosity,
                )
                #np.savez_compressed(outfile, **contents)
                np.savez(outfile, **contents)
                if verbosity >= 1:
                    wstderr(
                        'saved stats for set "{}" to file "{}" ({} sec)\n'
                        .format(name, outfile, time.time() - t0))

            if name == "data":
                stats[dirinfo["id"]] = contents
            else:
                for key, vals in contents.items():
                    if key not in this_stats:
                        this_stats[key] = []
                    this_stats[key].append(vals)
            del contents

        if name != "data":
            stats[name] = OrderedDict(
                [(k, np.concatenate(v)) for k, v in this_stats.items()])

    return stats
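# Illustrative usage sketch (not part of the original module): restrict the run
# to a single MC set and subset using the "set/subset" form that `only_sets`
# entries are split on; all names and paths here are hypothetical.
def _example_get_all_stats():
    return get_all_stats(
        outdir='~/retro_pulse_stats',
        min_pulses_per_event=8,
        overwrite=False,
        only_sets=['genie/140000'],  # hypothetical set name and subset id
        processes=4,
        verbosity=1,
    )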
def combine_tdi_tiles(
    source_dir,
    dest_dir,
    table_hash,
    gcd,
    bin_edges_file,
    tile_spec_file,
):
    """Combine individual time-independent tiles (one produced per DOM) into a
    single TDI table.

    Parameters
    ----------
    source_dir : str
    dest_dir : str
    table_hash : str
    gcd : str
    bin_edges_file : str
    tile_spec_file : str

    """
    source_dir = expand(source_dir)
    dest_dir = expand(dest_dir)
    gcd = expand(gcd)
    bin_edges_file = expand(bin_edges_file)
    tile_spec_file = expand(tile_spec_file)
    mkdir(dest_dir)
    assert isdir(source_dir)
    assert isfile(bin_edges_file)
    assert isfile(tile_spec_file)

    gcd = extract_gcd(gcd)

    bin_edges = load_pickle(bin_edges_file)
    x_edges = bin_edges['x']
    y_edges = bin_edges['y']
    z_edges = bin_edges['z']
    ctdir_edges = bin_edges['costhetadir']
    phidir_edges = bin_edges['phidir']

    n_x = len(x_edges) - 1
    n_y = len(y_edges) - 1
    n_z = len(z_edges) - 1
    n_ctdir = len(ctdir_edges) - 1
    n_phidir = len(phidir_edges) - 1

    n_dir_bins = n_ctdir * n_phidir

    x_bw = (x_edges.max() - x_edges.min()) / n_x
    y_bw = (y_edges.max() - y_edges.min()) / n_y
    z_bw = (z_edges.max() - z_edges.min()) / n_z
    bin_vol = x_bw * y_bw * z_bw

    ctdir_min = ctdir_edges.min()
    ctdir_max = ctdir_edges.max()

    phidir_min = phidir_edges.min()
    phidir_max = phidir_edges.max()

    with open(tile_spec_file, 'r') as f:
        tile_specs = [l.strip() for l in f.readlines()]

    table = np.zeros(shape=(n_x, n_y, n_z, n_ctdir, n_phidir), dtype=np.float32)

    # Slice all table dimensions to exclude {under,over}flow bins
    central_slice = (slice(1, -1),) * 5

    angsens_model = None
    ice_model = None
    disable_tilt = None
    disable_anisotropy = None
    n_phase = None
    n_group = None

    tiles_info = []

    for tile_spec in tile_specs:
        info = None
        try:
            fields = tile_spec.split()

            info = OrderedDict()

            info['tbl_idx'] = int(fields[0])
            info['string'] = int(fields[1])
            info['dom'] = int(fields[2])
            info['seed'] = int(fields[3])
            info['n_events'] = int(fields[4])

            info['x_min'] = float(fields[5])
            info['x_max'] = float(fields[6])
            info['n_x'] = int(fields[7])

            info['y_min'] = float(fields[8])
            info['y_max'] = float(fields[9])
            info['n_y'] = int(fields[10])

            info['z_min'] = float(fields[11])
            info['z_max'] = float(fields[12])
            info['n_z'] = int(fields[13])

            info['n_ctdir'] = int(fields[14])
            info['n_phidir'] = int(fields[15])

            tiles_info.append(info)

            tile_fpath = glob(
                join(
                    source_dir,
                    'clsim_table_set'
                    '_{table_hash}'
                    '_tile_{tbl_idx}'
                    '_string_{string}'
                    '_dom_{dom}'
                    '_seed_{seed}'
                    '_n_{n_events}'
                    '.fits'.format(table_hash=table_hash, **info)
                )
            )[0]
            try:
                fits_table = fits.open(tile_fpath, mode='readonly', memmap=True)
            except:
                wstderr('Failed on tile_fpath "{}"'.format(tile_fpath))
                raise

            primary = fits_table[0]
            header = primary.header  # pylint: disable=no-member
            keys = header.keys()

            this_gcd_i3_md5 = extract_meta_from_keys(keys, '_i3_gcd_i3_md5_')
            assert this_gcd_i3_md5 == gcd['source_gcd_i3_md5'], \
                'this: {}, ref: {}'.format(this_gcd_i3_md5, gcd['source_gcd_i3_md5'])

            this_angsens_model = extract_meta_from_keys(keys, '_i3_angsens_')
            if angsens_model is None:
                angsens_model = this_angsens_model
                _, avg_angsens = load_angsens_model(angsens_model)
            else:
                assert this_angsens_model == angsens_model

            this_table_hash = extract_meta_from_keys(keys, '_i3_hash_')
            assert this_table_hash == table_hash

            this_ice_model = extract_meta_from_keys(keys, '_i3_ice_')
            if ice_model is None:
                ice_model = this_ice_model
            else:
                assert this_ice_model == ice_model

            this_disable_anisotropy = header['_i3_disable_anisotropy']
            if disable_anisotropy is None:
                disable_anisotropy = this_disable_anisotropy
            else:
                assert this_disable_anisotropy == disable_anisotropy

            this_disable_tilt = header['_i3_disable_tilt']
            if disable_tilt is None:
                disable_tilt = this_disable_tilt
            else:
                assert this_disable_tilt == disable_tilt

            this_n_phase = header['_i3_n_phase']
            if n_phase is None:
                n_phase = this_n_phase
            else:
                assert this_n_phase == n_phase

            this_n_group = header['_i3_n_group']
            if n_group is None:
                n_group = this_n_group
            else:
                assert this_n_group == n_group

            assert info['n_ctdir'] == n_ctdir
            assert info['n_phidir'] == n_phidir

            assert np.isclose(header['_i3_costhetadir_min'], ctdir_min)
            assert np.isclose(header['_i3_costhetadir_max'], ctdir_max)

            assert np.isclose(header['_i3_phidir_min'], phidir_min)
            assert np.isclose(header['_i3_phidir_max'], phidir_max)

            n_photons = header['_i3_n_photons']
            n_dir_bins = info['n_ctdir'] * info['n_phidir']

            this_x_bw = (info['x_max'] - info['x_min']) / info['n_x']
            this_y_bw = (info['y_max'] - info['y_min']) / info['n_y']
            this_z_bw = (info['z_max'] - info['z_min']) / info['n_z']

            assert this_x_bw == x_bw
            assert this_y_bw == y_bw
            assert this_z_bw == z_bw

            assert np.any(np.isclose(info['x_min'], x_edges))
            assert np.any(np.isclose(info['x_max'], x_edges))

            assert np.any(np.isclose(info['y_min'], y_edges))
            assert np.any(np.isclose(info['y_max'], y_edges))

            assert np.any(np.isclose(info['z_min'], z_edges))
            assert np.any(np.isclose(info['z_max'], z_edges))

            quantum_efficiency = 0.25 * gcd['rde'][info['string'] - 1, info['dom'] - 1]
            norm = n_dir_bins * quantum_efficiency * avg_angsens / (n_photons * bin_vol)
            if np.isnan(norm):
                print('\nTile {} norm is nan!'.format(info['tbl_idx']))
                print(' quantum_efficiency = {}, n_photons = {}'.format(
                    quantum_efficiency, n_photons))
            elif norm == 0:
                print('\nTile {} norm is 0'.format(info['tbl_idx']))

            x_start = np.digitize(info['x_min'] + x_bw / 2, x_edges) - 1
            x_stop = np.digitize(info['x_max'] - x_bw / 2, x_edges)

            y_start = np.digitize(info['y_min'] + y_bw / 2, y_edges) - 1
            y_stop = np.digitize(info['y_max'] - y_bw / 2, y_edges)

            z_start = np.digitize(info['z_min'] + z_bw / 2, z_edges) - 1
            z_stop = np.digitize(info['z_max'] - z_bw / 2, z_edges)

            # NOTE: comparison excludes norm = 0 _and_ norm = NaN
            if norm > 0:
                assert not np.isnan(norm)
                table[x_start:x_stop, y_start:y_stop, z_start:z_stop, :, :] += (
                    norm * primary.data[central_slice]  # pylint: disable=no-member
                )
        except:
            wstderr('Failed on tile_spec {}'.format(tile_spec))
            if info is not None:
                wstderr('Info:\n{}'.format(info))
            raise
        wstderr('.')

    wstderr('\n')

    metadata = OrderedDict()
    metadata['table_hash'] = table_hash
    metadata['disable_tilt'] = disable_tilt
    metadata['disable_anisotropy'] = disable_anisotropy
    metadata['gcd'] = gcd
    metadata['angsens_model'] = angsens_model
    metadata['ice_model'] = ice_model
    metadata['n_phase'] = n_phase
    metadata['n_group'] = n_group
    metadata['tiles_info'] = tiles_info

    outdir = join(
        dest_dir,
        'tdi_table_{}_tilt_{}_anisotropy_{}'.format(
            table_hash,
            'off' if disable_tilt else 'on',
            'off' if disable_anisotropy else 'on',
        )
    )
    mkdir(outdir)

    name = 'tdi_table.npy'
    outfpath = join(outdir, name)
    wstdout('saving table to "{}"\n'.format(outfpath))
    np.save(outfpath, table)

    #outfpath = join(outdir, 'tdi_bin_edges.json')
    #wstdout('saving bin edges to "{}"\n'.format(outfpath))
    #json.dump(
    #    bin_edges,
    #    file(outfpath, 'w'),
    #    sort_keys=False,
    #    indent=2,
    #)
    outfpath = join(outdir, 'tdi_bin_edges.pkl')
    wstdout('saving bin edges to "{}"\n'.format(outfpath))
    pickle.dump(
        bin_edges,
        open(outfpath, 'wb'),
        protocol=pickle.HIGHEST_PROTOCOL,
    )

    #outfpath = join(outdir, 'tdi_metadata.json')
    #wstdout('saving metadata to "{}"\n'.format(outfpath))
    #json.dump(
    #    metadata,
    #    file(outfpath, 'w'),
    #    sort_keys=False,
    #    indent=2,
    #)
    outfpath = join(outdir, 'tdi_metadata.pkl')
    wstdout('saving metadata to "{}"\n'.format(outfpath))
    pickle.dump(
        metadata,
        open(outfpath, 'wb'),
        protocol=pickle.HIGHEST_PROTOCOL,
    )
def get_stats(dirinfo, min_pulses_per_event, processes=None, verbosity=0):
    """
    Parameters
    ----------
    dirinfo : dict
        Must contain keys / vals
            "id" : string
            "path" : string
            "n_files" : int

    min_pulses_per_event : int >= 0

    processes : None or int > 0, optional

    verbosity : int >= 0

    """
    if isinstance(dirinfo, string_types):
        dirinfo = [dirinfo]
    elif isinstance(dirinfo, Mapping):
        dirinfo = [dirinfo]

    #pulses_filter = fixed_charge_filter
    #pulses_filter = quantize_min_q_filter
    #pulses_filter = irregular_quantize_min_q_filter
    pulses_filter = pulse_integration_filter
    #pulses_filter = None

    #emax = 100
    emax = np.inf

    pool = Pool(processes=processes)
    results = []
    for root_dirinfo in dirinfo:
        root_dir = expand(root_dirinfo["path"])
        n_files = root_dirinfo["n_files"]

        for dirpath, dirs_, files in walk(root_dir, followlinks=True):
            dirs_.sort(key=nsort_key_func)
            if "events.npy" in files:
                results.append(
                    pool.apply_async(
                        process_dir,
                        tuple(),
                        dict(
                            dirpath=dirpath,
                            n_files=n_files,
                            pulses_filter=pulses_filter,
                            emax=emax,
                            min_pulses_per_event=min_pulses_per_event,
                            verbosity=verbosity,
                        ),
                    )
                )
    pool.close()
    pool.join()

    stats = deepcopy(STATS_PROTO)
    for result in results:
        result = result.get()
        for key in result.keys():
            stats[key].extend(result[key])

    # Concatenate and cull
    new_stats = OrderedDict()
    for stat_name in stats.keys():
        vals = stats[stat_name]
        if len(vals) == 0:
            if verbosity >= 1:
                wstderr('Not using stat "{}" for dirs {}\n'.format(
                    stat_name, dirinfo))
        elif np.isscalar(vals[0]):
            new_stats[stat_name] = np.array(vals)
        else:
            new_stats[stat_name] = np.concatenate(vals)
    stats = new_stats

    return stats
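# Illustrative usage sketch (not part of the original module): the id, path,
# and n_files values are hypothetical but follow the keys `get_stats` expects
# in `dirinfo`.
def _example_get_stats():
    dirinfo = dict(
        id='140000',                          # hypothetical subset id
        path='~/oscNext/pass2/genie/140000',  # hypothetical root directory
        n_files=1000,                         # hypothetical file count used for weighting
    )
    return get_stats(dirinfo=dirinfo, min_pulses_per_event=8, processes=4)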
def process_dir(
    dirpath,
    n_files,
    min_pulses_per_event,
    pulses_filter,
    emax,
    verbosity=0,
):
    """
    Parameters
    ----------
    dirpath : string
    n_files : int > 0
    min_pulses_per_event : int >= 0
    pulses_filter : None or callable, optional
    emax : 0 <= scalar <= np.inf
    verbosity : int >= 0

    Returns
    -------
    stats : OrderedDict
        Keys are taken from STATS_PROTO, values are numpy arrays

    """
    stats = deepcopy(STATS_PROTO)

    events = np.load(join(dirpath, "events.npy"), mmap_mode="r")
    if len(events) == 0:
        return stats

    mask_vals = deepcopy(events["L5_oscNext_bool"])
    if np.count_nonzero(mask_vals) == 0:
        return stats

    if verbosity >= 2:
        wstderr(".")

    if isfile(join(dirpath, "truth.npy")):
        truth = np.load(join(dirpath, "truth.npy"), mmap_mode="r")
        weights = truth["weight"]
        use_weights = True
    else:
        weights = np.ones(shape=len(events))
        use_weights = False

    if np.isfinite(emax) and emax > 0:
        recos = np.load(
            join(dirpath, "recos", "retro_crs_prefit.npy"),
            mmap_mode="r",
        )
        with np.errstate(invalid='ignore'):
            mask_vals &= recos["energy"]["median"] <= emax
        if np.count_nonzero(mask_vals) == 0:
            return stats

    pulses = load_pickle(
        join(dirpath, "pulses", "{}.pkl".format(PULSE_SERIES_NAME)))

    for mask_val, event_pulses, weight in zip(mask_vals, pulses, weights):
        if not mask_val:
            continue

        if callable(pulses_filter):
            event_pulses = pulses_filter(event_pulses)
            if len(event_pulses) == 0:
                continue

        if use_weights:
            normed_weight = weight / n_files

        # qtot is sum of charge of all hits on all DOMs
        event_pulses_ = []
        tmp_hits_per_dom = []
        tmp_charge_per_dom = []
        tmp_time_diffs_within_dom = []
        tmp_weight_per_dom = []
        for omkey, dom_pulses in event_pulses:
            event_pulses_.append(dom_pulses)
            tmp_hits_per_dom.append(len(dom_pulses))
            tmp_charge_per_dom.append(dom_pulses["charge"].sum())
            #stats["time_diffs_between_hits"].append(
            #    np.concatenate([[0.], np.diff(np.sort(dom_pulses["time"]))])
            #)
            tmp_time_diffs_within_dom.append(
                dom_pulses["time"] - dom_pulses["time"].min())
            if use_weights:
                tmp_weight_per_dom.append(normed_weight)

        event_pulses = np.concatenate(event_pulses_)

        # TODO: move min_pulses_per_event before qmin processing
        # TODO: small-pulse agglomeration filter
        if len(event_pulses) < min_pulses_per_event:
            continue

        # Number of DOMs with hits in this event (one entry per DOM was
        # collected in `event_pulses_` above)
        stats["doms_per_event"].append(len(event_pulses_))
        stats["hits_per_dom"].extend(tmp_hits_per_dom)
        stats["charge_per_dom"].extend(tmp_charge_per_dom)
        stats["time_diffs_within_dom"].extend(tmp_time_diffs_within_dom)
        if use_weights:
            stats["weight_per_dom"].extend(tmp_weight_per_dom)

        charge = event_pulses["charge"]
        stats["charge_per_hit"].append(charge)
        stats["charge_per_event"].append(charge.sum())
        stats["hits_per_event"].append(len(event_pulses))
        stats["time_diffs_within_event"].append(
            event_pulses["time"] - event_pulses["time"].min())
        if use_weights:
            stats["weight_per_event"].append(normed_weight)
            stats["weight_per_hit"].append(
                np.full(shape=len(event_pulses), fill_value=normed_weight))

    return stats
def combine_tables(table_fpaths, outdir=None, overwrite=False):
    """Combine multiple tables together into a single table.

    All tables specified must have the same binnings defined. Tables should
    also be produced using different random seeds (if all else besides
    n_photons is equal); if corresponding metadata files can be found in the
    same directories as the CLSim tables, this will be enforced prior to
    loading and combining the actual tables together.

    Parameters
    ----------
    table_fpaths : string or iterable thereof
        Each string is glob-expanded

    outdir : string, optional
        Directory to which to save the combined table; if not specified, the
        resulting table will be returned but not saved to disk.

    overwrite : bool
        Overwrite an existing table. If a table is found at the output path
        and `overwrite` is False, the function simply returns without raising
        an exception.

    Returns
    -------
    combined_table

    """
    t_start = time()

    # Get all input table filepaths, including glob expansion

    orig_table_fpaths = deepcopy(table_fpaths)
    if isinstance(table_fpaths, string_types):
        table_fpaths = [table_fpaths]
    table_fpaths_tmp = []
    for fpath in table_fpaths:
        table_fpaths_tmp.extend(glob(expand(fpath)))
    table_fpaths = sorted(table_fpaths_tmp, key=nsort_key_func)

    if not table_fpaths:
        raise ValueError(
            "Found no tables given `table_fpaths` = {}".format(orig_table_fpaths)
        )

    wstderr(
        'Found {} tables to combine:\n {}\n'.format(
            len(table_fpaths), '\n '.join(table_fpaths)
        )
    )

    # Create the output directory

    if outdir is not None:
        outdir = expand(outdir)
        mkdir(outdir)

    # Combine the tables

    combined_table = None
    table_keys = None
    for fpath in table_fpaths:
        table = load_clsim_table_minimal(fpath, mmap=True)

        base = basename(fpath)
        rootname, ext = splitext(base)
        if ext.lstrip('.') in COMPR_EXTENSIONS:
            base = rootname
        if 'source_tables' not in table:
            table['source_tables'] = np.array([base], dtype=np.string0)

        if combined_table is None:
            combined_table = table
            table_keys = set(table.keys())

            # Formulate output file paths and check if they exist (do on first
            # table to avoid finding out we are going to overwrite a file
            # before loading all the source tables)
            if outdir is not None:
                output_fpaths = OrderedDict(
                    (
                        (k, join(outdir, k + '.npy'))
                        for k in sorted(table_keys.difference(NO_WRITE_KEYS))
                    )
                )
                if not overwrite:
                    for fp in output_fpaths.values():
                        if isfile(fp):
                            raise IOError(
                                'File at {} already exists, NOT overwriting'.format(fp)
                            )
                wstderr(
                    'Output files will be written to:\n {}\n'.format(
                        '\n '.join(output_fpaths.values())
                    )
                )

            continue

        # Make sure keys are the same

        new_table_keys = set(table.keys())
        missing_keys = sorted(
            table_keys
            .difference(new_table_keys)
            .difference(NO_VALIDATE_KEYS)
        )
        additional_keys = sorted(
            new_table_keys
            .difference(table_keys)
            .difference(NO_VALIDATE_KEYS)
        )
        if missing_keys or additional_keys:
            raise ValueError(
                'Table is missing keys {} and/or has additional keys {}'.format(
                    missing_keys, additional_keys
                )
            )

        # Validate keys that should be equal

        for key in sorted(table_keys.difference(NO_VALIDATE_KEYS)):
            if not np.array_equal(table[key], combined_table[key]):
                raise ValueError('Unequal "{}" in file {}'.format(key, fpath))

        # Add values from keys that should be summed

        for key in SUM_KEYS:
            if key not in table:
                continue
            combined_table[key] += table[key]

        # Concatenate and sort new source table(s) in source_tables array

        combined_table['source_tables'] = np.sort(
            np.concatenate([combined_table['source_tables'], table['source_tables']])
        )

        # Make sure to clear table from memory since these can be quite large

        del table

    # Save the data to npy files on disk (in a sub-directory for all of this
    # table's files)

    if outdir is not None:
        wstderr('Writing files:\n')

        len_longest_fpath = np.max([len(p) for p in output_fpaths.values()])
        for key in sorted(table_keys.difference(NO_WRITE_KEYS)):
            fpath = output_fpaths[key]
            wstderr(' {} ...'.format(fpath.ljust(len_longest_fpath)))
            t0 = time()
            np.save(fpath, combined_table[key])
            wstderr(' ({:12.3f} s)\n'.format(time() - t0))

    wstderr(
        'Total time to combine tables: {} s\n'.format(np.round(time() - t_start, 3))
    )

    return combined_table
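# Illustrative usage sketch (not part of the original module): combine all
# tables matching a glob pattern into one table written to `outdir`; the paths
# are hypothetical.
def _example_combine_tables():
    combined = combine_tables(
        table_fpaths='~/retro_tables/clsim_table_set_*_seed_*.fits',
        outdir='~/retro_tables/combined_example',
        overwrite=False,
    )
    return sorted(combined.keys())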
def load_clsim_table_minimal(fpath, step_length=None, mmap=False):
    """Load a CLSim table from disk (optionally compressed with zstd).

    Similar to the `load_clsim_table` function but the full table, including
    under/overflow bins, is kept and no normalization or further processing is
    performed on the table data besides populating the output OrderedDict.

    Parameters
    ----------
    fpath : string
        Path to file to be loaded. If the file has extension 'zst', 'zstd', or
        'zstandard', the file will be decompressed using the
        `python-zstandard` Python library before passing to `pyfits` for
        interpreting.

    mmap : bool, optional
        Whether to memory map the table (if it's stored in a directory
        containing .npy files).

    Returns
    -------
    table : OrderedDict
        Items include
            - 'table_shape' : tuple of int
            - 'table' : np.ndarray
            - 't_indep_table' : np.ndarray (if available)
            - 'n_photons' :
            - 'phase_refractive_index' :
            - 'r_bin_edges' :
            - 'costheta_bin_edges' :
            - 't_bin_edges' :
            - 'costhetadir_bin_edges' :
            - 'deltaphidir_bin_edges' :

    """
    table = OrderedDict()

    fpath = expand(fpath)

    if DEBUG:
        wstderr('Loading table from {} ...\n'.format(fpath))

    if isdir(fpath):
        t0 = time()
        indir = fpath
        if mmap:
            mmap_mode = 'r'
        else:
            mmap_mode = None
        for key in MY_CLSIM_TABLE_KEYS + ['t_indep_table']:
            fpath = join(indir, key + '.npy')
            if DEBUG:
                wstderr(' loading {} from "{}" ...'.format(key, fpath))
            t1 = time()
            if isfile(fpath):
                table[key] = np.load(fpath, mmap_mode=mmap_mode)
            elif key != 't_indep_table':
                raise ValueError(
                    'Could not find file "{}" for loading table key "{}"'
                    .format(fpath, key)
                )
            if DEBUG:
                wstderr(' ({} ms)\n'.format(np.round((time() - t1)*1e3, 3)))
        if step_length is not None and 'step_length' in table:
            assert step_length == table['step_length']
        if DEBUG:
            wstderr(' Total time to load: {} s\n'.format(np.round(time() - t0, 3)))
        return table

    if not isfile(fpath):
        raise ValueError('Table does not exist at path "{}"'.format(fpath))

    if mmap:
        print('WARNING: Cannot memory map a fits or compressed fits file;'
              ' ignoring `mmap=True`.')

    import pyfits
    t0 = time()
    fobj = get_decompressd_fobj(fpath)
    try:
        pf_table = pyfits.open(fobj)

        table['table_shape'] = pf_table[0].data.shape  # pylint: disable=no-member
        table['n_photons'] = force_little_endian(
            pf_table[0].header['_i3_n_photons']  # pylint: disable=no-member
        )
        table['group_refractive_index'] = force_little_endian(
            pf_table[0].header['_i3_n_group']  # pylint: disable=no-member
        )
        table['phase_refractive_index'] = force_little_endian(
            pf_table[0].header['_i3_n_phase']  # pylint: disable=no-member
        )
        if step_length is not None:
            table['step_length'] = step_length

        n_dims = len(table['table_shape'])
        if n_dims == 5:
            # Space-time dimensions
            table['r_bin_edges'] = force_little_endian(
                pf_table[1].data  # meters  # pylint: disable=no-member
            )
            table['costheta_bin_edges'] = force_little_endian(
                pf_table[2].data  # pylint: disable=no-member
            )
            table['t_bin_edges'] = force_little_endian(
                pf_table[3].data  # nanoseconds  # pylint: disable=no-member
            )

            # Photon directionality
            table['costhetadir_bin_edges'] = force_little_endian(
                pf_table[4].data  # pylint: disable=no-member
            )
            table['deltaphidir_bin_edges'] = force_little_endian(
                pf_table[5].data  # pylint: disable=no-member
            )
        else:
            raise NotImplementedError(
                '{}-dimensional table not handled'.format(n_dims)
            )

        table['table'] = force_little_endian(pf_table[0].data)  # pylint: disable=no-member

        wstderr(' (load took {} s)\n'.format(np.round(time() - t0, 3)))

    finally:
        del pf_table
        if hasattr(fobj, 'close'):
            fobj.close()
        del fobj

    return table
def combine_clsim_tables(table_fpaths, outdir=None, overwrite=False,
                         step_length=1.0):
    """Combine multiple CLSim-produced tables together into a single table.

    All tables specified must have the same binnings defined. Tables should
    also be produced using different random seeds; if corresponding metadata
    files can be found in the same directories as the CLSim tables, this will
    be enforced prior to loading and combining the actual tables together.

    Parameters
    ----------
    table_fpaths : string or iterable thereof
        Each string is glob-expanded

    outdir : string, optional
        Directory to which to save the combined table; if not specified, the
        resulting table will be returned but not saved to disk.

    overwrite : bool
        Overwrite an existing table. If a table is found at the output path
        and `overwrite` is False, the function simply returns.

    step_length : float > 0 in units of meters
        Needed for computing the normalization to apply to the `table` in
        order to generate the `t_indep_table` (if the latter doesn't already
        exist). Note that normalization constants due to `n_photons`,
        `quantum_efficiency`, and `angular_acceptance_fract` as well as
        normalization depending (only) upon radial bin (i.e. 1/r^2 geometric
        factor) are _not_ applied to the tables. The _only_ normalization
        applied (and _only_ to `t_indep_table`) is the multiple-counting
        factor that is a function of `step_length` and whichever of the time
        or radial bin dimensions is smaller.

    Returns
    -------
    combined_table

    """
    t_start = time()

    # Get all input table filepaths, including glob expansion

    if isinstance(table_fpaths, basestring):
        table_fpaths = [table_fpaths]
    table_fpaths_tmp = []
    for fpath in table_fpaths:
        table_fpaths_tmp.extend(glob(expand(fpath)))
    table_fpaths = sorted(table_fpaths_tmp)

    wstderr('Found {} tables to combine:\n {}\n'.format(
        len(table_fpaths), '\n '.join(table_fpaths)))

    # Formulate output filenames and check if they exist

    output_fpaths = None
    if outdir is not None:
        outdir = expand(outdir)
        mkdir(outdir)
        output_fpaths = OrderedDict(
            ((k, join(outdir, k + '.npy')) for k in ALL_KEYS))
        output_fpaths['source_tables'] = join(outdir, 'source_tables.txt')
        if not overwrite:
            # Check the paths (dict values), not the key names
            for fpath in output_fpaths.values():
                if isfile(fpath):
                    raise IOError('File {} exists'.format(fpath))
        wstderr('Output files will be written to:\n {}\n'.format(
            '\n '.join(output_fpaths.values())))

    # Combine the tables

    combined_table = None
    for fpath in table_fpaths:
        table = load_clsim_table_minimal(fpath, step_length=step_length,
                                         mmap=True)

        if combined_table is None:
            combined_table = table
            continue

        if set(table.keys()) != set(SUM_KEYS + VALIDATE_KEYS):
            raise ValueError(
                'Table keys {} do not match expected keys {}'.format(
                    sorted(table.keys()), sorted(ALL_KEYS)))

        for key in VALIDATE_KEYS:
            if not np.array_equal(table[key], combined_table[key]):
                raise ValueError('Unequal {} in file {}'.format(key, fpath))

        for key in SUM_KEYS:
            combined_table[key] += table[key]

        del table

    # Force quantum_efficiency and angular_acceptance_fract to 1 (these should
    # be handled by the user at the time the table is used to represent a
    # particular DOM or subgroup of DOMs). Operate on the combined table, as
    # the per-file `table` has been deleted above.
    t_indep_table, _ = generate_time_indep_table(
        table=combined_table,
        quantum_efficiency=1,
        angular_acceptance_fract=1)
    combined_table['t_indep_table'] = t_indep_table

    # Save the data to npy files on disk (in a sub-directory for all of this
    # table's files)
    if outdir is not None:
        basenames = []
        for fpath in table_fpaths:
            base = basename(fpath)
            rootname, ext = splitext(base)
            if ext.lstrip('.') in COMPR_EXTENSIONS:
                base = rootname
            basenames.append(base)

        wstderr('Writing files:\n')

        for key in ALL_KEYS:
            fpath = output_fpaths[key]
            wstderr(' {} ...'.format(fpath))
            t0 = time()
            np.save(fpath, combined_table[key])
            wstderr(' ({} ms)\n'.format(np.round((time() - t0) * 1e3, 3)))

        fpath = output_fpaths['source_tables']
        wstderr(' {} ...'.format(fpath))
        t0 = time()
        with open(fpath, 'w') as fobj:
            fobj.write('\n'.join(sorted(basenames)))
        wstderr(' ({} ms)\n'.format(np.round((time() - t0) * 1e3, 3)))

    wstderr('Total time to combine tables: {} s\n'.format(
        np.round(time() - t_start, 3)))

    return combined_table
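# Illustrative usage sketch (not part of the original module): combine CLSim
# tables produced with different seeds; the paths are hypothetical and
# `step_length` should match the value used when the tables were generated.
def _example_combine_clsim_tables():
    combined = combine_clsim_tables(
        table_fpaths='~/retro_tables/clsim_table_*_seed_*.fits',
        outdir='~/retro_tables/combined_clsim_example',
        step_length=1.0,
    )
    return sorted(combined.keys())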