Example #1
def create_mask_h5py(input_path,
                     selection_config,
                     key='events',
                     start=None,
                     end=None,
                     mode="r"):

    with h5py.File(input_path, mode=mode) as infile:

        n_events = h5py_get_n_rows(input_path, key=key, mode=mode)
        start = start or 0
        end = min(n_events, end) if end else n_events

        n_selected = end - start
        mask = np.ones(n_selected, dtype=bool)

        for name, (operator, value) in selection_config.items():

            before = mask.sum()
            mask = np.logical_and(
                mask, OPERATORS[operator](infile[key][name][start:end], value))
            after = mask.sum()
            log.debug('Cut "{} {} {}" removed {} events'.format(
                name, operator, value, before - after))

    return mask
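The snippet relies on module-level names that are not shown: h5py, np, log, OPERATORS and h5py_get_n_rows. A minimal sketch of what they could look like, assuming OPERATORS maps the operator strings from the selection config to element-wise comparison functions and h5py_get_n_rows returns the length of the datasets under a group; both definitions are assumptions based on how the names are used above.

import logging
import operator

import h5py
import numpy as np

log = logging.getLogger(__name__)

# Assumed mapping from operator strings (e.g. '>=', '<=') to comparison
# functions that work element-wise on numpy arrays.
OPERATORS = {
    '<': operator.lt,
    '<=': operator.le,
    '==': operator.eq,
    '!=': operator.ne,
    '>': operator.gt,
    '>=': operator.ge,
}


def h5py_get_n_rows(path, key='events', mode='r'):
    '''Assumed helper: number of rows of the datasets stored under key.'''
    with h5py.File(path, mode=mode) as f:
        # All datasets in the group are expected to share the same length,
        # so the first one is representative.
        first_dataset = next(iter(f[key].values()))
        return first_dataset.shape[0]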
Example #2
def read_telescope_data_chunked(path,
                                config,
                                chunksize,
                                columns,
                                feature_generation_config=None):
    '''
    Reads data from the HDF5 file given as ``path`` and yields a dataframe for each chunk.
    '''
    n_rows = h5py_get_n_rows(path, config.telescope_events_key)
    if chunksize:
        n_chunks = int(np.ceil(n_rows / chunksize))
    else:
        n_chunks = 1
        chunksize = n_rows
    log.info('Splitting data into {} chunks'.format(n_chunks))

    for chunk in range(n_chunks):

        start = chunk * chunksize
        end = min(n_rows, (chunk + 1) * chunksize)

        df = read_telescope_data(path,
                                 config=config,
                                 columns=columns,
                                 first=start,
                                 last=end)
        df.index = np.arange(start, end)

        if feature_generation_config:
            feature_generation(df, feature_generation_config, inplace=True)

        yield df, start, end
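A short usage sketch for the generator above; the file name, config object and column list are placeholders for illustration only.

# Hypothetical inputs; 'config' is whatever object carries telescope_events_key.
columns = ['width', 'length', 'size']

for df, start, end in read_telescope_data_chunked(
        'telescope_events.hdf5', config, chunksize=10000, columns=columns):
    # Each chunk arrives as a pandas DataFrame indexed by its global row range.
    print('rows {} to {}: {} events'.format(start, end, len(df)))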
Example #3
def main(configuration_path, input_path, output_path, chunksize, key, verbose):
    '''
    Apply cuts given in CONFIGURATION_PATH to the data in INPUT_PATH and
    write the result to OUTPUT_PATH.

    example:
    ```
    selection:
        numPixelInShower: ['>=', 10]
        numIslands: ['<=', 5]
        Width: ['<=', 50]
    ```
    '''
    logging.basicConfig(level=logging.DEBUG if verbose else logging.INFO)
    log = logging.getLogger()

    with open(configuration_path) as f:
        config = yaml.safe_load(f)

    selection = config.get('selection', {})

    if chunksize is None:
        n_events = h5py_get_n_rows(input_path, key=key, mode='r')

        mask = create_mask_h5py(input_path, selection, key=key)
        log.info('Before cuts: {}, after cuts: {}'.format(
            n_events, mask.sum()))

        with h5py.File(input_path, mode='r') as infile, \
                h5py.File(output_path, 'w') as outfile:
            group = outfile.create_group(key)

            for name, dataset in infile[key].items():

                if dataset.ndim == 1:
                    group.create_dataset(name,
                                         data=dataset[mask],
                                         maxshape=(None, ))
                elif dataset.ndim == 2:
                    group.create_dataset(name,
                                         data=dataset[mask, :],
                                         maxshape=(None, 2))
                else:
                    log.warning('Skipping column {}: only 1d and 2d datasets are supported'.format(name))
    else:
        apply_cuts_h5py_chunked(input_path,
                                output_path,
                                selection,
                                chunksize=chunksize,
                                key=key)

    with h5py.File(input_path, mode='r') as infile, \
            h5py.File(output_path, 'r+') as outfile:
        if 'runs' in infile.keys():
            log.info('Copying runs group to output file')
            infile.copy('/runs', outfile['/'])
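For reference, the selection block shown in the docstring lives in a YAML file passed as CONFIGURATION_PATH. A small sketch of how yaml.safe_load turns it into the dict that create_mask_h5py consumes (the file name is a placeholder):

import yaml

# Contents of a hypothetical cuts.yaml:
#
# selection:
#     numPixelInShower: ['>=', 10]
#     numIslands: ['<=', 5]
#     Width: ['<=', 50]

with open('cuts.yaml') as f:
    config = yaml.safe_load(f)

selection = config.get('selection', {})
# selection == {'numPixelInShower': ['>=', 10],
#               'numIslands': ['<=', 5],
#               'Width': ['<=', 50]}
# Each value unpacks as (operator, value) inside create_mask_h5py.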
Example #4
def apply_cuts_h5py_chunked(
        input_path,
        output_path,
        selection_config,
        key='events',
        chunksize=100000,
        progress=True,
        ):
    '''
    Apply the cuts defined in selection_config to the data in input_path and
    write the result to output_path, processing chunksize events at a time.
    '''

    n_events = h5py_get_n_rows(input_path, key=key, mode="r")
    n_chunks = int(np.ceil(n_events / chunksize))
    log.debug('Using {} chunks of size {}'.format(n_chunks, chunksize))

    with h5py.File(input_path, 'r') as infile, h5py.File(output_path, 'w') as outfile:
        group = outfile.create_group(key)

        for chunk in tqdm(range(n_chunks), disable=not progress, total=n_chunks):
            start = chunk * chunksize
            end = min(n_events, (chunk + 1) * chunksize)

            mask = create_mask_h5py(
                input_path, selection_config, key=key, start=start, end=end
            )

            for name, dataset in infile[key].items():
                if chunk == 0:
                    if dataset.ndim == 1:
                        group.create_dataset(name, data=dataset[start:end][mask], maxshape=(None, ))
                    elif dataset.ndim == 2:
                        group.create_dataset(
                            name, data=dataset[start:end, :][mask, :], maxshape=(None, 2)
                        )
                    else:
                        log.warning('Skipping column {}: only 1d and 2d datasets are supported'.format(name))

                else:

                    n_old = group[name].shape[0]
                    n_new = mask.sum()
                    group[name].resize(n_old + n_new, axis=0)

                    if dataset.ndim == 1:
                        group[name][n_old:n_old + n_new] = dataset[start:end][mask]
                    elif dataset.ndim == 2:
                        group[name][n_old:n_old + n_new, :] = dataset[start:end][mask, :]
                    else:
                        log.warning('Skipping column {}: only 1d and 2d datasets are supported'.format(name))
Example #5
def main(configuration_path, input_path, output_path, chunksize, key, verbose):
    '''
    Apply cuts given in CONFIGURATION_PATH to the data in INPUT_PATH and
    write the result to OUTPUT_PATH.

    example:
    ```
    selection:
        numPixelInShower: ['>=', 10]
        numIslands: ['<=', 5]
        Width: ['<=', 50]
    ```
    '''
    logging.basicConfig(level=logging.DEBUG if verbose else logging.INFO)
    log = logging.getLogger()

    with open(configuration_path) as f:
        config = yaml.safe_load(f)

    selection = config.get('selection', {})

    if chunksize is None:
        n_events = h5py_get_n_rows(input_path, key=key, mode='r')

        mask = create_mask_h5py(input_path, selection, key=key)
        log.info('Before cuts: {}, after cuts: {}'.format(n_events, mask.sum()))

        with h5py.File(input_path, mode='r') as infile, h5py.File(output_path, 'w') as outfile:
            group = outfile.create_group(key)

            for name, dataset in infile[key].items():

                if dataset.ndim == 1:
                    group.create_dataset(name, data=dataset[mask], maxshape=(None, ))
                elif dataset.ndim == 2:
                    group.create_dataset(
                        name, data=dataset[mask, :], maxshape=(None, 2)
                    )
                else:
                    log.warning('Skipping column {}: only 1d and 2d datasets are supported'.format(name))
    else:
        apply_cuts_h5py_chunked(
            input_path, output_path, selection, chunksize=chunksize, key=key
        )

    with h5py.File(input_path, mode='r') as infile, h5py.File(output_path, 'r+') as outfile:
        if 'runs' in infile.keys():
            log.info('Copying runs group to output file')
            infile.copy('/runs', outfile['/'])
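A direct-call sketch for the chunked branch that main takes when a chunksize is given; the file names and the cut are placeholders.

apply_cuts_h5py_chunked(
    'gammas.hdf5',
    'gammas_selected.hdf5',
    {'Width': ('<=', 50)},
    key='events',
    chunksize=50000,
)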
Example #6
    def __init__(
        self,
        path,
        aict_config,
        chunksize,
        columns,
        feature_generation_config=None,
    ):
        self.aict_config = aict_config
        self.columns = columns
        self.feature_generation_config = feature_generation_config
        self.n_rows = h5py_get_n_rows(path, aict_config.telescope_events_key)
        self.path = path
        if chunksize:
            self.chunksize = chunksize
            self.n_chunks = int(np.ceil(self.n_rows / chunksize))
        else:
            self.n_chunks = 1
            self.chunksize = self.n_rows
        log.info('Splitting data into {} chunks'.format(self.n_chunks))

        self.current_chunk = 0
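The snippet shows only __init__; a sketch of how the stored state could drive iteration, assuming read_telescope_data and feature_generation helpers like the ones used in Example #2 (an illustration, not the library's actual implementation):

    def __iter__(self):
        return self

    def __next__(self):
        # Stop once every chunk has been handed out.
        if self.current_chunk >= self.n_chunks:
            raise StopIteration

        start = self.current_chunk * self.chunksize
        end = min(self.n_rows, (self.current_chunk + 1) * self.chunksize)
        self.current_chunk += 1

        df = read_telescope_data(
            self.path,
            config=self.aict_config,
            columns=self.columns,
            first=start,
            last=end,
        )
        df.index = np.arange(start, end)

        if self.feature_generation_config:
            feature_generation(df, self.feature_generation_config, inplace=True)

        return df, start, end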
Example #7
def create_mask_h5py(input_path, selection_config, key='events', start=None, end=None, mode="r"):

    with h5py.File(input_path, mode=mode) as infile:

        n_events = h5py_get_n_rows(input_path, key=key, mode=mode)
        start = start or 0
        end = min(n_events, end) if end else n_events

        n_selected = end - start
        mask = np.ones(n_selected, dtype=bool)

        for name, (operator, value) in selection_config.items():

            before = mask.sum()
            mask = np.logical_and(
                mask, OPERATORS[operator](infile[key][name][start:end], value)
            )
            after = mask.sum()
            log.debug('Cut "{} {} {}" removed {} events'.format(
                name, operator, value, before - after
            ))

    return mask
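A usage sketch combining the function with a selection config like the one shown in the docstrings above; the file name is a placeholder.

selection_config = {
    'numPixelInShower': ('>=', 10),
    'Width': ('<=', 50),
}

# Boolean mask over all events in the file; True means the event passes every cut.
mask = create_mask_h5py('events.hdf5', selection_config, key='events')
print('{} of {} events pass the cuts'.format(mask.sum(), len(mask)))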
Example #8
def apply_cuts_h5py_chunked(
    input_path,
    output_path,
    selection_config,
    key='events',
    chunksize=100000,
    progress=True,
):
    '''
    Apply the cuts defined in selection_config to the data in input_path and
    write the result to output_path, processing chunksize events at a time.
    '''

    n_events = h5py_get_n_rows(input_path, key=key, mode="r")
    n_chunks = int(np.ceil(n_events / chunksize))
    log.debug('Using {} chunks of size {}'.format(n_chunks, chunksize))

    with h5py.File(input_path, 'r') as infile, \
            h5py.File(output_path, 'w') as outfile:
        group = outfile.create_group(key)

        for chunk in tqdm(range(n_chunks), disable=not progress):
            start = chunk * chunksize
            end = min(n_events, (chunk + 1) * chunksize)

            mask = create_mask_h5py(input_path,
                                    selection_config,
                                    key=key,
                                    start=start,
                                    end=end)

            for name, dataset in infile[key].items():
                if chunk == 0:
                    if dataset.ndim == 1:
                        group.create_dataset(name,
                                             data=dataset[start:end][mask],
                                             maxshape=(None, ))
                    elif dataset.ndim == 2:
                        group.create_dataset(
                            name,
                            data=dataset[start:end, :][mask, :],
                            maxshape=(None, 2))
                    else:
                        log.warning(
                            'Skipping column {}: only 1d and 2d datasets are supported'.format(name))

                else:

                    n_old = group[name].shape[0]
                    n_new = mask.sum()
                    group[name].resize(n_old + n_new, axis=0)

                    if dataset.ndim == 1:
                        group[name][n_old:n_old + n_new] = dataset[start:end][mask]
                    elif dataset.ndim == 2:
                        group[name][n_old:n_old + n_new, :] = dataset[start:end][mask, :]
                    else:
                        log.warning(
                            'Skipping column {}: only 1d and 2d datasets are supported'.format(name))
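The chunked writer relies on resizable HDF5 datasets: the first chunk creates each dataset with maxshape=(None, ...), and every later chunk grows it with resize before writing into the new tail. A stripped-down sketch of that pattern in isolation (file and dataset names are placeholders):

import h5py
import numpy as np

chunks = [np.arange(5), np.arange(5, 12)]

with h5py.File('appended.hdf5', 'w') as f:
    for i, data in enumerate(chunks):
        if i == 0:
            # Unlimited first axis so the dataset can grow later.
            f.create_dataset('values', data=data, maxshape=(None,))
        else:
            n_old = f['values'].shape[0]
            f['values'].resize(n_old + len(data), axis=0)
            f['values'][n_old:] = data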