def create_mask_h5py(input_path, selection_config, key='events', start=None, end=None, mode="r"):
    '''
    Build a boolean selection mask for rows [start, end) of the columns in
    ``input_path[key]`` by AND-ing all cuts in ``selection_config``.

    Parameters
    ----------
    input_path: str
        Path to the hdf5 input file.
    selection_config: dict
        Mapping of column name -> (operator string, value). The operator
        string is looked up in the module-level ``OPERATORS`` table.
    key: str
        Name of the hdf5 group that holds the event columns.
    start, end: int or None
        Row range to evaluate; ``None`` means from the beginning / to the end.
    mode: str
        File mode used to open the hdf5 file.

    Returns
    -------
    np.ndarray
        Boolean array with ``end - start`` entries, True where the row
        passes every cut.
    '''
    # BUGFIX: forward `mode` to h5py.File as well. Previously the file was
    # opened with h5py's default mode while `mode` was only used for
    # h5py_get_n_rows — inconsistent, and older h5py defaults to 'a'
    # (writable), which can modify or even create the input file.
    with h5py.File(input_path, mode=mode) as infile:
        n_events = h5py_get_n_rows(input_path, key=key, mode=mode)
        start = start or 0
        # BUGFIX: test `end is not None` instead of truthiness so an
        # explicit end=0 selects an empty range instead of the full file.
        end = min(n_events, end) if end is not None else n_events

        n_selected = end - start
        mask = np.ones(n_selected, dtype=bool)

        for name, (operator, value) in selection_config.items():
            before = mask.sum()
            mask = np.logical_and(
                mask, OPERATORS[operator](infile[key][name][start:end], value)
            )
            after = mask.sum()
            log.debug('Cut "{} {} {}" removed {} events'.format(
                name, operator, value, before - after
            ))

    return mask
def read_telescope_data_chunked(path, config, chunksize, columns, feature_generation_config=None):
    '''
    Reads data from hdf5 file given as PATH and yields dataframes for each chunk
    '''
    n_rows = h5py_get_n_rows(path, config.telescope_events_key)

    # A falsy chunksize means "one chunk covering everything".
    if not chunksize:
        chunksize = n_rows
        n_chunks = 1
    else:
        n_chunks = int(np.ceil(n_rows / chunksize))

    log.info('Splitting data into {} chunks'.format(n_chunks))

    for chunk_index in range(n_chunks):
        first = chunk_index * chunksize
        last = min(n_rows, (chunk_index + 1) * chunksize)

        df = read_telescope_data(
            path, config=config, columns=columns, first=first, last=last
        )
        # Re-index so rows keep their absolute position in the file.
        df.index = np.arange(first, last)

        if feature_generation_config:
            feature_generation(df, feature_generation_config, inplace=True)

        yield df, first, last
def main(configuration_path, input_path, output_path, chunksize, key, verbose):
    '''
    Apply cuts given in CONFIGURATION_PATH to the data in INPUT_PATH and
    write the result to OUTPUT_PATH.

    example:
    ```
    selection:
        numPixelInShower: ['>=', 10]
        numIslands: ['<=', 5]
        Width: ['<=', 50]
    ```
    '''
    logging.basicConfig(level=logging.DEBUG if verbose else logging.INFO)
    log = logging.getLogger()

    with open(configuration_path) as f:
        config = yaml.safe_load(f)

    selection = config.get('selection', {})

    if chunksize is None:
        # Single-pass mode: build one mask over the whole file.
        n_events = h5py_get_n_rows(input_path, key=key, mode='r')
        mask = create_mask_h5py(input_path, selection, key=key)
        log.info('Before cuts: {}, after cuts: {}'.format(n_events, mask.sum()))

        with h5py.File(input_path, mode='r') as infile, h5py.File(output_path, 'w') as outfile:
            group = outfile.create_group(key)

            for name, dataset in infile[key].items():
                if dataset.ndim == 1:
                    group.create_dataset(name, data=dataset[mask], maxshape=(None, ))
                elif dataset.ndim == 2:
                    # BUGFIX: use the dataset's real second dimension instead
                    # of a hard-coded 2 — h5py rejects a maxshape smaller
                    # than the data shape, so width != 2 columns crashed.
                    group.create_dataset(
                        name,
                        data=dataset[mask, :],
                        maxshape=(None, dataset.shape[1]),
                    )
                else:
                    log.warning('Skipping not 1d or 2d column {}'.format(name))
    else:
        apply_cuts_h5py_chunked(
            input_path, output_path, selection, chunksize=chunksize, key=key
        )

    # Carry the run-wise metadata over unchanged.
    with h5py.File(input_path, mode='r') as infile, h5py.File(output_path, 'r+') as outfile:
        if 'runs' in infile.keys():
            log.info('Copying runs group to outputfile')
            infile.copy('/runs', outfile['/'])
def apply_cuts_h5py_chunked(
    input_path,
    output_path,
    selection_config,
    key='events',
    chunksize=100000,
    progress=True,
):
    '''
    Apply cuts defined in selection config to input_path and write result to
    outputpath. Apply cuts to chunksize events at a time.

    Parameters
    ----------
    input_path: str
        Path to the hdf5 input file.
    output_path: str
        Path of the hdf5 file to create (opened with mode 'w').
    selection_config: dict
        Mapping of column name -> (operator string, value), see
        ``create_mask_h5py``.
    key: str
        Name of the hdf5 group holding the event columns.
    chunksize: int
        Number of events processed per chunk.
    progress: bool
        Show a tqdm progress bar over the chunks.
    '''
    n_events = h5py_get_n_rows(input_path, key=key, mode="r")
    n_chunks = int(np.ceil(n_events / chunksize))
    log.debug('Using {} chunks of size {}'.format(n_chunks, chunksize))

    with h5py.File(input_path, 'r') as infile, h5py.File(output_path, 'w') as outfile:
        group = outfile.create_group(key)

        for chunk in tqdm(range(n_chunks), disable=not progress, total=n_chunks):
            start = chunk * chunksize
            end = min(n_events, (chunk + 1) * chunksize)

            mask = create_mask_h5py(
                input_path, selection_config, key=key, start=start, end=end
            )

            for name, dataset in infile[key].items():
                if chunk == 0:
                    # First chunk: create resizable output datasets.
                    if dataset.ndim == 1:
                        group.create_dataset(
                            name, data=dataset[start:end][mask], maxshape=(None, )
                        )
                    elif dataset.ndim == 2:
                        # BUGFIX: use the dataset's real second dimension
                        # instead of a hard-coded 2 — h5py rejects a maxshape
                        # smaller than the data shape, so width != 2 columns
                        # crashed here.
                        group.create_dataset(
                            name,
                            data=dataset[start:end, :][mask, :],
                            maxshape=(None, dataset.shape[1]),
                        )
                    else:
                        log.warning('Skipping not 1d or 2d column {}'.format(name))
                else:
                    # BUGFIX: columns skipped on the first chunk were never
                    # created, so group[name] raised KeyError on later chunks.
                    if name not in group:
                        continue

                    n_old = group[name].shape[0]
                    n_new = mask.sum()
                    group[name].resize(n_old + n_new, axis=0)

                    if dataset.ndim == 1:
                        group[name][n_old:n_old + n_new] = dataset[start:end][mask]
                    else:
                        group[name][n_old:n_old + n_new, :] = dataset[start:end][mask, :]
def main(configuration_path, input_path, output_path, chunksize, key, verbose):
    '''
    Apply cuts given in CONFIGURATION_PATH to the data in INPUT_PATH and
    write the result to OUTPUT_PATH.

    example:
    ```
    selection:
        numPixelInShower: ['>=', 10]
        numIslands: ['<=', 5]
        Width: ['<=', 50]
    ```
    '''
    logging.basicConfig(level=logging.DEBUG if verbose else logging.INFO)
    log = logging.getLogger()

    with open(configuration_path) as f:
        config = yaml.safe_load(f)

    selection = config.get('selection', {})

    if chunksize is None:
        # Single-pass mode: build one mask over the whole file.
        n_events = h5py_get_n_rows(input_path, key=key, mode='r')
        mask = create_mask_h5py(input_path, selection, key=key)
        log.info('Before cuts: {}, after cuts: {}'.format(n_events, mask.sum()))

        with h5py.File(input_path, mode='r') as infile, h5py.File(output_path, 'w') as outfile:
            group = outfile.create_group(key)

            for name, dataset in infile[key].items():
                if dataset.ndim == 1:
                    group.create_dataset(name, data=dataset[mask], maxshape=(None, ))
                elif dataset.ndim == 2:
                    # BUGFIX: use the dataset's real second dimension instead
                    # of a hard-coded 2 — h5py rejects a maxshape smaller
                    # than the data shape, so width != 2 columns crashed.
                    group.create_dataset(
                        name,
                        data=dataset[mask, :],
                        maxshape=(None, dataset.shape[1]),
                    )
                else:
                    log.warning('Skipping not 1d or 2d column {}'.format(name))
    else:
        apply_cuts_h5py_chunked(
            input_path, output_path, selection, chunksize=chunksize, key=key
        )

    # Carry the run-wise metadata over unchanged.
    with h5py.File(input_path, mode='r') as infile, h5py.File(output_path, 'r+') as outfile:
        if 'runs' in infile.keys():
            log.info('Copying runs group to outputfile')
            infile.copy('/runs', outfile['/'])
def __init__(
    self,
    path,
    aict_config,
    chunksize,
    columns,
    feature_generation_config=None,
):
    '''
    Set up chunked reading of the telescope events in ``path``.

    A falsy ``chunksize`` means the whole table is read as a single chunk.
    '''
    self.path = path
    self.aict_config = aict_config
    self.columns = columns
    self.feature_generation_config = feature_generation_config

    self.n_rows = h5py_get_n_rows(path, aict_config.telescope_events_key)

    # Either split into ceil(n_rows / chunksize) pieces, or read everything
    # at once when no chunksize was given.
    self.chunksize = chunksize if chunksize else self.n_rows
    self.n_chunks = int(np.ceil(self.n_rows / chunksize)) if chunksize else 1

    log.info('Splitting data into {} chunks'.format(self.n_chunks))

    self.current_chunk = 0
def create_mask_h5py(input_path, selection_config, key='events', start=None, end=None, mode="r"):
    '''
    Build a boolean selection mask for rows [start, end) of the columns in
    ``input_path[key]`` by AND-ing all cuts in ``selection_config``.

    Parameters
    ----------
    input_path: str
        Path to the hdf5 input file.
    selection_config: dict
        Mapping of column name -> (operator string, value). The operator
        string is looked up in the module-level ``OPERATORS`` table.
    key: str
        Name of the hdf5 group that holds the event columns.
    start, end: int or None
        Row range to evaluate; ``None`` means from the beginning / to the end.
    mode: str
        File mode used to open the hdf5 file.

    Returns
    -------
    np.ndarray
        Boolean array with ``end - start`` entries, True where the row
        passes every cut.
    '''
    # BUGFIX: forward `mode` to h5py.File as well. Previously the file was
    # opened with h5py's default mode while `mode` was only used for
    # h5py_get_n_rows — inconsistent, and older h5py defaults to 'a'
    # (writable), which can modify or even create the input file.
    with h5py.File(input_path, mode=mode) as infile:
        n_events = h5py_get_n_rows(input_path, key=key, mode=mode)
        start = start or 0
        # BUGFIX: test `end is not None` instead of truthiness so an
        # explicit end=0 selects an empty range instead of the full file.
        end = min(n_events, end) if end is not None else n_events

        n_selected = end - start
        mask = np.ones(n_selected, dtype=bool)

        for name, (operator, value) in selection_config.items():
            before = mask.sum()
            mask = np.logical_and(
                mask, OPERATORS[operator](infile[key][name][start:end], value)
            )
            after = mask.sum()
            log.debug('Cut "{} {} {}" removed {} events'.format(
                name, operator, value, before - after
            ))

    return mask
def apply_cuts_h5py_chunked(
    input_path,
    output_path,
    selection_config,
    key='events',
    chunksize=100000,
    progress=True,
):
    '''
    Apply cuts defined in selection config to input_path and write result to
    outputpath. Apply cuts to chunksize events at a time.

    Parameters
    ----------
    input_path: str
        Path to the hdf5 input file.
    output_path: str
        Path of the hdf5 file to create (opened with mode 'w').
    selection_config: dict
        Mapping of column name -> (operator string, value), see
        ``create_mask_h5py``.
    key: str
        Name of the hdf5 group holding the event columns.
    chunksize: int
        Number of events processed per chunk.
    progress: bool
        Show a tqdm progress bar over the chunks.
    '''
    n_events = h5py_get_n_rows(input_path, key=key, mode="r")
    n_chunks = int(np.ceil(n_events / chunksize))
    log.debug('Using {} chunks of size {}'.format(n_chunks, chunksize))

    with h5py.File(input_path, 'r') as infile, h5py.File(output_path, 'w') as outfile:
        group = outfile.create_group(key)

        for chunk in tqdm(range(n_chunks), disable=not progress, total=n_chunks):
            start = chunk * chunksize
            end = min(n_events, (chunk + 1) * chunksize)

            mask = create_mask_h5py(
                input_path, selection_config, key=key, start=start, end=end
            )

            for name, dataset in infile[key].items():
                if chunk == 0:
                    # First chunk: create resizable output datasets.
                    if dataset.ndim == 1:
                        group.create_dataset(
                            name, data=dataset[start:end][mask], maxshape=(None, )
                        )
                    elif dataset.ndim == 2:
                        # BUGFIX: use the dataset's real second dimension
                        # instead of a hard-coded 2 — h5py rejects a maxshape
                        # smaller than the data shape, so width != 2 columns
                        # crashed here.
                        group.create_dataset(
                            name,
                            data=dataset[start:end, :][mask, :],
                            maxshape=(None, dataset.shape[1]),
                        )
                    else:
                        log.warning('Skipping not 1d or 2d column {}'.format(name))
                else:
                    # BUGFIX: columns skipped on the first chunk were never
                    # created, so group[name] raised KeyError on later chunks.
                    if name not in group:
                        continue

                    n_old = group[name].shape[0]
                    n_new = mask.sum()
                    group[name].resize(n_old + n_new, axis=0)

                    if dataset.ndim == 1:
                        group[name][n_old:n_old + n_new] = dataset[start:end][mask]
                    else:
                        group[name][n_old:n_old + n_new, :] = dataset[start:end][mask, :]