Example #1
def find_nearest_doms(data_dir_path=get_project_root()+'/data/oscnext-genie-level5-v01-01-pass2/',
                      multiprocess=True,
                      d_name='dom_geom.pickle'):
    
    # * Load precalculated geometry dictionary
    d_geom = pickle.load(open(data_dir_path+d_name, 'rb'))

    # * For each entry, calculate distances to all other DOMs
    # * Extract coordinates and pair with ID
    dom_ids = [dom_id for dom_id in d_geom]
    coords = {key: items['coordinates'] for key, items in d_geom.items()}
    own_coords = [items['coordinates'] for key, items in d_geom.items()]
    
    print(get_time(), 'Calculation of nearest DOMs begun...')
    if multiprocess:
        # * prepare for multiprocessing - we loop over DOM IDs
        coords_list = [coords]*len(dom_ids)
        packed = [pack for pack in zip(dom_ids, own_coords, coords_list)]

        with Pool() as p:
            dicts = p.map(find_nearest_doms_multi, packed)
    else:
        raise ValueError('Only multiprocessing implemented!')
    print(get_time(), 'Calculation finished!')
    
    # * Update the geometry dictionary with the closest DOMs
    for dom_id, d in zip(dom_ids, dicts):
        d_geom[dom_id].update(d)
    
    return d_geom
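
The per-DOM worker find_nearest_doms_multi is referenced above but not listed. A minimal sketch of what such a worker could look like, assuming it returns a dictionary with the IDs and distances of the k closest DOMs (the key names and k=10 are assumptions, not taken from the source):

import numpy as np

def find_nearest_doms_multi(pack, k=10):
    # Unpack: this DOM's ID, its coordinates, and the coordinates of all DOMs.
    dom_id, own_coord, coords = pack

    # Euclidean distance from this DOM to every other DOM.
    other_ids = [other for other in coords if other != dom_id]
    dists = np.array([np.linalg.norm(coords[other] - own_coord) for other in other_ids])

    # Keep the k closest, sorted by distance (assumed keys).
    order = np.argsort(dists)[:k]
    return {'nearest_doms': [other_ids[i] for i in order],
            'nearest_distances': dists[order].tolist()}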
Example #2
def move_tars():
    """Scripts used to move tarballs of rpickled data from HEP to gpulab.

    Script must be run on gpulab - cannot ssh from HEP to gpulab, only other way around.

    Uses rsync to move tarballs. WHere, to and how many must be hardcoded for now.
    """

    # * Setup - where to load data, how many events
    n_pickle_dirs = 1131
    data_dir = get_project_root() + '/data/oscnext-genie-level5-v01-01-pass2/'
    if not Path(data_dir).exists():
        Path(data_dir).mkdir()
        print(get_time(), 'Created directory %s' % (data_dir))
    from_ = '[email protected]:/groups/hep/bjoernhm/CubeML/data/oscnext-genie-level5-v01-01-pass2/tarballs/'
    to_ = data_dir + 'tarballs/'
    if not Path(to_).exists():
        Path(to_).mkdir()
        print(get_time(), 'Created directory %s' % (to_))

    from_tarballs = [from_ + str(i) + '.tar' for i in range(n_pickle_dirs)]
    to_list = [to_ + str(i) + '.tar' for i in range(n_pickle_dirs)]

    # * Zip and multiprocess
    packed = [entry for entry in zip(from_tarballs, to_list)]
    with Pool() as p:
        p.map(move_tar, packed)

    print(get_time(), 'Finished copying tarballs!')
Example #3
def make_geom_dict(data_dir_path=get_project_root()+'/data/oscnext-genie-level5-v01-01-pass2/',
                   multiprocess=True,
                   d_name='dom_geom.pickle'):
    
    print(get_time(), 'Making geometry dictionary...')
    shelve_path = data_dir_path+'shelve/oscnext-genie-level5-v01-01-pass2'
    
    # * Get filenames
    with shelve.open(shelve_path) as f:
        filenames = [key for key in f]

    # * Prepare for multiprocessing
    path_list = [shelve_path]*len(filenames)
    packed = [entry for entry in zip(filenames, path_list)]
    
    # * Multiprocess
    if multiprocess:
        with Pool() as p:
            all_dicts = p.map(find_unique_ids, packed)

        # * Combine dictionaries
        print(get_time(), 'Combining dictionaries...')
        dom_geom_dict = {}
        for d in all_dicts:
            dom_geom_dict.update(d)
        print(get_time(), 'Dictionaries combined!')
        
    else:
        dom_geom_dict = {}
        for pack in packed:
            dom_geom_dict.update(find_unique_ids(pack))
    
    return dom_geom_dict
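
make_geom_dict returns the geometry dictionary but does not save it, while find_nearest_doms (Example #1) expects it as a pickle named dom_geom.pickle in the data directory. A usage sketch tying the two together (caching the dictionary as a pickle is an assumption about the intended workflow):

import pickle

data_dir = get_project_root() + '/data/oscnext-genie-level5-v01-01-pass2/'

# Build the DOM geometry dictionary and cache it where find_nearest_doms expects it.
d_geom = make_geom_dict(data_dir_path=data_dir)
with open(data_dir + 'dom_geom.pickle', 'wb') as f:
    pickle.dump(d_geom, f)

# Augment each DOM entry with its nearest neighbours.
d_geom = find_nearest_doms(data_dir_path=data_dir)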
Example #4
def create_transformed_db(old_db, new_db, transformers, chunksize=100000):
    # Expects new_db to be a copy of old_db.
    # If a key from old_db is in transformers:
    # load --> transform --> save in new_db.

    db_tables = old_db.tables
    tables = ['sequential', 'scalar', 'meta']
    primary_keys = ['row', 'event_no', 'event_no']

    # Since different DBs start at different indices, find the beginning
    id0 = int(old_db.ids[0])

    for table, primary_key in zip(tables, primary_keys):
        for var in db_tables[table]:
            if var in transformers:
                print('')
                if var in [
                        'dom_x', 'dom_y', 'dom_z', 'dom_charge', 'dom_time',
                        'dom_pulse_width'
                ]:
                    continue
                print(get_time(), 'Transforming', var, '...')
                # We found a variable that needs transforming. Transform it!
                transformer = transformers[var]

                # Loop over the primary key in the table
                i = 0
                if primary_key == 'row':
                    start = 0
                else:
                    start = id0

                # Keep transforming until all have been transformed
                while True:
                    _from = start + i * chunksize
                    _to = start + (i + 1) * chunksize
                    indices = [str(e) for e in np.arange(_from, _to)]
                    print(get_time(),
                          'Transforming %s - %s' % (indices[0], indices[-1]))
                    # print(indices[:10], old_db.ids[:10])
                    fetched = old_db.read(table, var, primary_key, indices)
                    n_fetched = len(fetched)
                    transformed = np.squeeze(
                        transformer.transform(fetched.reshape(-1, 1)))

                    # Write to new db
                    new_db.write(table,
                                 var,
                                 indices[:n_fetched],
                                 transformed,
                                 primary_key=primary_key)
                    # Check if we reached the end
                    if n_fetched < chunksize:
                        print(get_time(),
                              'Transformation of %s finished.' % (var))
                        break
                    else:
                        i += 1
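
create_transformed_db only requires that transformers maps a variable name to an object with a scikit-learn-style transform method operating on a 2D array. A minimal sketch of how such a dictionary could be fitted (the choice of RobustScaler and the variable name are assumptions):

import numpy as np
from sklearn.preprocessing import RobustScaler

def fit_example_transformers(samples):
    # samples: dict mapping variable name -> 1D numpy array of raw values.
    transformers = {}
    for var, values in samples.items():
        scaler = RobustScaler()
        # scikit-learn expects a 2D array of shape (n_samples, n_features).
        scaler.fit(np.asarray(values).reshape(-1, 1))
        transformers[var] = scaler
    return transformers

# Usage sketch:
# transformers = fit_example_transformers({'true_primary_energy': energies})
# create_transformed_db(old_db, new_db, transformers)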
Example #5
def save_png_pgf(path, f, width=1.0, height=1.0):
    FOTW = get_frac_of_textwidth(keyword='single_fig')
    w = get_figure_width(frac_of_textwidth=FOTW)
    h = get_figure_height(width=w)
    f.set_size_inches(w * width, h * height)

    name = path.split('/')[-1]
    f.savefig(path + '.png', bbox_inches='tight')
    print(get_time(), 'Saved %s.png' % (name))
    f.savefig(path + '.pgf', bbox_inches='tight')
    print(get_time(), 'Saved %s.pgf' % (name))
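
A usage sketch for save_png_pgf, assuming matplotlib with the Agg backend (writing the .pgf file additionally requires a working LaTeX installation; the helpers used inside save_png_pgf are the project's own):

import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt

fig, ax = plt.subplots()
ax.plot([0, 1, 2], [0, 1, 4])
ax.set_xlabel('x')
ax.set_ylabel('y')

# Writes <path>.png and <path>.pgf side by side; the .pgf export needs LaTeX.
save_png_pgf('/tmp/example_figure', fig, width=1.0, height=0.8)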
Example #6
def transform_events(db_path, ids, feature_dicts, transformers, n_nearest_data, geom_features, n_cpus=cpu_count()):
    """Transforms events.

    For each ID, the data induced by feature_dicts is calculated and/or transformed and placed in a dictionary under event ID --> transformed. Furthermore, meta-information and masks are saved as well.
    
    Parameters
    ----------
    db_path : str
        Full path to the Shelve database
    ids : list
        List of ids to process
    feature_dicts : dict
        Dictionary of dictionaries specifying e.g. which transformer to use
    transformers : dict
        Dictionary containing info on which transformer to use
    n_nearest_data : dict
        Dictionary containing the geometry data to transform and add to the event.
    geom_features : dict
        Dictionary containing the information required to transform the n_nearest_data
    
    Returns
    -------
    dict
        Dictionary containing transformed events.
    """    
    # * Chunk ID's for multiprocessing
    n_chunks = n_cpus
    id_chunks = np.array_split(ids, n_chunks)
    
    # * Repack the n_nearest_data so that IDs matches
    n_nearest_chunks = [{event_id: n_nearest_data[event_id] for event_id in chunk} for chunk in id_chunks]

    # * Multiprocess - prep by zipping all the required stuff for each process
    db_path_list = [db_path]*n_chunks
    transformers_list = [transformers]*n_chunks
    feature_dicts_list = [feature_dicts]*n_chunks
    geom_features_list = [geom_features]*n_chunks
    packed = zip(id_chunks, n_nearest_chunks, db_path_list, transformers_list, feature_dicts_list, geom_features_list)
    with Pool(processes=n_cpus) as p:
        print(get_time(), 'Transforming events...')
        events_transformed = p.map(transform_events_multiprocess, packed)
        print(get_time(), 'Events transformed!')

    events_unpacked = {}
    for events in events_transformed:
        events_unpacked.update(events)   
    
    return events_unpacked
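
The worker transform_events_multiprocess is only visible here through the tuple it is handed. A skeleton showing how such a worker would unpack that tuple (the body is a placeholder; only the packing order is taken from the zip above):

def transform_events_multiprocess(pack):
    # Unpack in the same order as the zip(...) in transform_events.
    id_chunk, n_nearest_chunk, db_path, transformers, feature_dicts, geom_features = pack

    events_transformed = {}
    for event_id in id_chunk:
        # Placeholder: read the event from the shelve DB at db_path, apply the
        # transformers selected by feature_dicts, and merge in n_nearest_chunk[event_id].
        events_transformed[event_id] = {}
    return events_transformed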
Example #7
def find_dom_interval_passed_cands(pack):
    # Unpack
    ids, db, min_doms, max_doms, dom_mask = pack

    accepted = []

    # Split into chunks (at least one, so np.array_split gets a valid section count)
    n_chunks = max(1, len(ids) // CHUNK_SIZE)
    chunks = np.array_split(ids, n_chunks)

    # Loop over chunks
    for i_chunk, chunk in enumerate(chunks):

        # Retrieve the '<MASK_NAME>_event_length' -
        # this value is the number of DOMs in an event
        if dom_mask == 'SplitInIcePulses':
            len_key = 'split_in_ice_pulses_event_length'
        elif dom_mask == 'SRTInIcePulses':
            len_key = 'srt_in_ice_pulses_event_length'
        else:
            raise ValueError('Unknown dom_mask: %s' % dom_mask)

        data_dict = db.fetch_features(all_events=chunk,
                                      meta_features=[len_key])

        for event_id, event_dict in data_dict.items():
            n_doms = event_dict[len_key]
            if min_doms <= n_doms <= max_doms:
                accepted.append(int(event_id))

        # Print for sanity
        print(get_time(), 'Processed chunk %d of %d' % (i_chunk + 1, n_chunks))
        sys.stdout.flush()

    return accepted
Example #8
def find_particles(pack):
    # Unpack
    ids, db, particle_code = pack

    accepted = []

    # Split into chunks (at least one, so np.array_split gets a valid section count)
    n_chunks = max(1, len(ids) // CHUNK_SIZE)
    chunks = np.array_split(ids, n_chunks)

    # Loop over chunks
    for i_chunk, chunk in enumerate(chunks):

        # Retrieve the 'particle_code' from meta -
        # this value determines the particle
        code_name = 'particle_code'
        data_dict = db.fetch_features(all_events=chunk,
                                      meta_features=[code_name])
        for event_id, event_dict in data_dict.items():
            code = event_dict[code_name]
            if str(code) == particle_code:
                accepted.append(int(event_id))

        # Print for sanity
        print(get_time(), 'Processed chunk %d of %d' % (i_chunk + 1, n_chunks))
        sys.stdout.flush()

    return accepted
Example #9
def find_unique_ids(pack):
    # * Unpack and notify
    datafile, path = pack
    print(get_time(), 'Processing %s'%(datafile))
    sys.stdout.flush()

    # * all_doms will be a dictionary with dom_id: coordinates.
    all_doms = {}

    # * Retrieve DOM-ID and coordinates of each event
    # * len(keys) = N_events in file, len(keys[0]) = number of DOMs in event 0
    with shelve.open(path) as f:
        keys = f[datafile]['dom_key']
        dom_xs = f[datafile]['dom_x']
        dom_ys = f[datafile]['dom_y']
        dom_zs = f[datafile]['dom_z']
    
    for key, dom_x, dom_y, dom_z in zip(keys, dom_xs, dom_ys, dom_zs):
        # * Convert x, y, z into one coordinate entry as a np-array
        coords = [{'coordinates': np.array([x, y, z])} for x, y, z in zip(dom_x, dom_y, dom_z)]

        # * Update the dictionary over all events
        all_doms.update(zip(key, coords))
    
    return all_doms
Example #10
def find_particles(pack):
    # * Unpack
    dirs, particle_code = pack
    
    accepted = []
    i_file = 0
    
    # * Loop over the given directories
    for directory in dirs:

        # * Loop over the events in the subdirectory
        for file in directory.iterdir():
            
            # * Check each file.
            event = pickle.load(open(file, "rb" ))
            if particle_code == event['meta']['particle_code']:
                accepted.append(int(file.stem))

            # * Print for sanity
            i_file += 1
            if (i_file) % PRINT_EVERY == 0:
                print(get_time(), 'Subprocess: Processed %d'%(i_file))
                sys.stdout.flush()
    
    return accepted
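
The pack-based filters in Examples #10-#12 are written to be mapped over a multiprocessing Pool. A driver sketch, assuming the pickled events live in sub-directories under <data>/pickles and reusing the particle code '140000' seen in Example #25:

from multiprocessing import Pool, cpu_count
from pathlib import Path
import numpy as np

pickle_root = Path(get_project_root() + '/data/oscnext-genie-level5-v01-01-pass2/pickles')
dirs = sorted(pickle_root.iterdir())

# Give each worker its own slice of directories.
dir_chunks = np.array_split(dirs, cpu_count())
packed = [(list(chunk), '140000') for chunk in dir_chunks]

with Pool() as p:
    accepted_lists = p.map(find_particles, packed)
accepted = [event_id for accepted in accepted_lists for event_id in accepted]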
Example #11
def find_energy_interval_passed_cands(pack):
    # * Unpack
    dirs, min_energy, max_energy = pack
    
    accepted = []
    i_file = 0
    
    # * Loop over the given directories
    for directory in dirs:

        # * Loop over the events in the subdirectory
        for file in directory.iterdir():
            
            # * Check each file.
            event = pickle.load(open(file, "rb" ))
            energy = event['raw']['true_primary_energy']
            
            if min_energy <= energy <= max_energy:
                accepted.append(int(file.stem))

            # * Print for sanity
            i_file += 1
            if (i_file)%PRINT_EVERY == 0:
                print(get_time(), 'Subprocess: Processed %d'%(i_file))
                sys.stdout.flush()

    return accepted
Example #12
def find_dom_interval_passed_cands(pack):
    # * Unpack
    dirs, min_doms, max_doms, dom_mask, process_ID = pack
    
    accepted = []
    i_file = 0
    
    # * Loop over the given directories
    for directory in dirs:

        # * Loop over the events in the subdirectory
        for file in directory.iterdir():
            
            # * Check each file.
            event = pickle.load(open(file, "rb" ))
            dom_indices = event['masks'][dom_mask]
            
            n_doms = event['raw']['dom_charge'][dom_indices].shape[0]
            if min_doms <= n_doms <= max_doms:
                accepted.append(int(file.stem))

            # * Print for sanity
            i_file += 1
            if (i_file)%PRINT_EVERY == 0:
                print(get_time(), 'Subprocess %d: Processed %d'%(process_ID, i_file))
                sys.stdout.flush()

    return accepted
Example #13
def pickle_events(pack):
    # * Unpack - assumes multiprocessing
    fname, new_names, data_dir, particle_code, n_per_dir = pack
    print(get_time(), 'Pickling %s' % (Path(fname).name))

    # * Loop over events in file - each event is turned into a .pickle
    with h5.File(fname, 'r') as f:
        n_events = f['meta/events'][()]

        for i_event, new_name in zip(range(n_events), new_names):
            event = empty_pickle_event()

            # * Fill the pickle file.
            for group in event:
                for key, data in f[group].items():

                    # * Save in numpy.float32 format - this is the format used in the models anyway.
                    if group != 'masks':
                        event[group][key] = data[i_event].astype(np.float32)
                    else:
                        event[group][key] = data[i_event]

            # * Assign metavalues - where is event from, what kind of particle?
            event['meta'] = {}
            event['meta']['file'] = Path(fname).name
            event['meta']['index'] = i_event
            event['meta']['particle_code'] = particle_code

            # * Save it in subdirs - put n_per_dir in each directory
            dir_name = str(new_name // n_per_dir)
            new_name = data_dir + '/' + dir_name + '/' + str(
                new_name) + '.pickle'
            pickle.dump(event, open(new_name, 'wb'))
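
pickle_events relies on a helper empty_pickle_event that pre-creates the groups of an event dictionary. Judging from the other examples, which access event['raw'], event['masks'] and event['meta'], a minimal sketch could be (the exact group names are an assumption; 'meta' is filled in explicitly above):

def empty_pickle_event():
    # Group names mirror the groups read from the h5 file; 'meta' is added later in pickle_events.
    return {'raw': {}, 'masks': {}}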
Example #14
def save_thesis_pgf(path, f, save_pgf=False, png_name=None, pgf_name=None):
    if pgf_name is None:
        pgf = str(path.parent.stem)
    else:
        pgf = str(path.parent.stem) + '_' + pgf_name

    all_figs_path = str(path.parent.parent) + '/all_pgf/' + pgf + '.pgf'
    if png_name:
        f.savefig(str(path.parent) + '/' + png_name + '.png',
                  bbox_inches='tight')
    else:
        f.savefig(str(path.parent) + '/fig.png', bbox_inches='tight')
    print(get_time(), 'Saved .png')
    if save_pgf:
        f.savefig(all_figs_path, bbox_inches='tight')
        print(get_time(), pgf + ' saved.')
Example #15
def unpack_remove_tars():
    """Script to unpack .tars holding directories with pickled events.

    Uses multiprocessing to unpack tars with bash:
    > tar -xf <tar_location> -C <pickle_dir_location>.

    For now, the source (currently tarball_dir) and destination (pickle_dir) have to be hardcoded in the script.
    """
    # * Where are tars located?
    tarball_dir = get_project_root(
    ) + '/data/oscnext-genie-level5-v01-01-pass2/tarballs'
    tarballs = [path for path in Path(tarball_dir).iterdir()]

    # * Where should they be put?
    pickle_dir = get_project_root(
    ) + '/data/oscnext-genie-level5-v01-01-pass2/pickles/'
    if not Path(pickle_dir).exists():
        Path(pickle_dir).mkdir()

    # * Multiprocess
    available_cores = cpu_count()
    pickle_dir_list = [pickle_dir] * len(tarballs)
    packed = [entry for entry in zip(tarballs, pickle_dir_list)]

    with Pool(available_cores + 2) as p:
        p.map(unpack_tar_remove, packed)
    print(get_time(), 'Finished unpacking tarballs!')
Example #16
def make_tar(pack):
    pickle_dir, tar_dir = pack

    print(get_time(), 'Making tar of %s' % (pickle_dir))
    sys.stdout.flush()

    tar_path = tar_dir + '/' + pickle_dir.name + '.tar'
    subprocess.run(['tar', '-cf', tar_path, pickle_dir])
Example #17
def transform_features(pack):
    file, transformers, keys, prefix = pack
    start = time()
    name = Path(file).name

    with h5.File(file, 'a') as f:
        n_events = f['meta/events'][()]

        # * Loop over keys and do all transformations for the whole file.
        # * scikit-learn transformers expect 2D arrays, so we reshape into a 2D array and flatten again.
        d = {}
        for key in keys:

            # * For each key, check if already transformed - if yes, don't do it again
            if f['raw/' + key]:  # and prefix+'/'+key not in f:
                transformer = transformers[key]

                # * Prepare an empty dataset
                if f['raw/' + key][0].shape:
                    d[key] = [[]] * n_events

                    # * We must loop due to the sequential nature of DOM sequences
                    for i_event, event in enumerate(f['raw/' + key]):
                        d[key][i_event] = transformer.transform(
                            event.reshape(-1, 1)).flatten()

                else:
                    # * For non-sequential data, we can transform entire set in one go
                    d[key] = transformer.transform(f['raw/' + key][:].reshape(
                        -1, 1)).flatten()

        # * Now save
        for key, data in d.items():
            dataset_path = prefix + '/' + key

            # * Check if it is a DOM-variable or global event-variable
            if data[0].shape:

                # * If dataset already exists, delete it first
                if dataset_path in f:
                    del f[dataset_path]

                f.create_dataset(dataset_path,
                                 data=data,
                                 dtype=h5.special_dtype(vlen=data[0][0].dtype))

            else:

                # * If dataset already exists, delete it first
                if dataset_path in f:
                    del f[dataset_path]

                f.create_dataset(dataset_path, data=data, dtype=data[0].dtype)

    # * Print progress for sanity...
    finish_time = time() - start
    print(hf.get_time(), 'Finished %s in %.0f seconds' % (name, finish_time))
    print('Speed: %.0f Events per second\n' % (n_events / finish_time))
Example #18
def move_tar(pack):
    from_hep, to_gpu = pack
    if Path(to_gpu).exists():
        pass
    else:
        print(get_time(), 'Copying %s' % (from_hep))
        sys.stdout.flush()
        command = 'rsync'
        subprocess.run([command, from_hep, to_gpu])
Example #19
def inverse_low_E(
    name, 
    masked_ids, 
    db, 
    debug=False, 
    multiprocess=True,
    interpolator=None
    ):
    if interpolator is None:
        raise ValueError('Not implemented yet - interpolator must be supplied')
    
    # Loop over all events using multiprocessing
    print(get_time(), 'Assigning energy weights...')
    if multiprocess:
        weights = assign_energy_balanced_weights_multiprocess(
            masked_ids, db, interpolator, true_key=['true_primary_energy'], debug=debug
        )
    print(get_time(), 'Energy weights assigned!')

    return weights, interpolator
Example #20
def uniform_direction(
    name, 
    ids, 
    db, 
    multiprocess=True, 
    debug=False,
    interpolator=None
    ):

    # Get indices used for interpolator-calculation
    if not interpolator:
        n_events = min(len(ids), USE_N_EVENTS)
        event_ids = ids[:n_events]
        print(get_time(), 'Calculating direction bins..')
        x, counts = calc_uniform_direction_weights(event_ids, db)
        weights_unscaled = 1.0/np.array(counts)
        print(get_time(), 'Bins calculated!')
        
        print(get_time(), 'Fitting interpolator')
        interpolator = make_scaled_interpolator(weights_unscaled, counts, x)
        print(get_time(), 'Interpolator fitted!')

    # Loop over all events using multiprocessing
    print(get_time(), 'Assigning direction weights...')
    if multiprocess:
        weights_dict = assign_uniform_direction_weights_multiprocess(
            ids, db, interpolator, true_key=['true_primary_direction_z'], debug=debug
        )
    print(get_time(), 'Direction weights assigned!')

    return weights_dict, interpolator
Example #21
def energy_balanced(
    name, 
    ids, 
    db, 
    multiprocess=True, 
    debug=False,
    interpolator=None,
    alpha=1.0
    ):

    # Get indices used for interpolator-calculation
    if not interpolator:
        n_events = min(len(ids), USE_N_EVENTS)
        event_ids = ids[:n_events]
        print(get_time(), 'Calculating energy bins..')
        x, counts = calc_energy_balanced_weights(event_ids, db)
        weights_unscaled = np.power(1.0/np.array(counts), alpha)
        print(get_time(), 'Bins calculated!')
        
        print(get_time(), 'Fitting interpolator')
        # In this case, MAX 10 for better gradients
        interpolator = make_scaled_interpolator(weights_unscaled, counts, x)
        print(get_time(), 'Interpolator fitted!')

    # Loop over all events using multiprocessing
    print(get_time(), 'Assigning energy weights...')
    if multiprocess:
        weights = assign_energy_balanced_weights_multiprocess(
            ids, db, interpolator, true_key=['true_primary_energy'], debug=debug
        )
    print(get_time(), 'Energy weights assigned!')

    return weights, interpolator
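
make_scaled_interpolator is shared by the weighting schemes in Examples #19-#21 and #26 but is not listed. A sketch of what it could look like, based on the docstring in Example #26 (the average weight of an event should be 1); the use of scipy's interp1d and the exact scaling are assumptions:

import numpy as np
from scipy.interpolate import interp1d

def make_scaled_interpolator(weights_unscaled, counts, x):
    weights_unscaled = np.asarray(weights_unscaled, dtype=float)
    counts = np.asarray(counts, dtype=float)

    # Scale so that the count-weighted average weight equals 1.
    mean_weight = np.sum(weights_unscaled * counts) / np.sum(counts)
    weights = weights_unscaled / mean_weight

    # Interpolate the weight as a function of the binned variable (e.g. energy).
    return interp1d(x, weights, bounds_error=False,
                    fill_value=(weights[0], weights[-1]))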
Example #22
def get_n_nearest_data(db_path, id_chunk, geom_features, geom_dict_path, n_cpus=cpu_count()):
    """Finds and extracts data from the nearest n DOMs
    
    Parameters
    ----------
    db_path : str
        Absolute path to the Shelve-database
    id_chunk : list
        list of event IDs to extract data for
    geom_features : dict
        What geometry data to extract, e.g. nearest DOMs x-value
    geom_dict_path : str
        full path to geometry dictionary (dictionary containing nearest DOMs for each DOM)
    
    Returns
    -------
    dict
        Data of the nearest N DOMs for each event ID
    """    
    # * Chunk ID's for multiprocessing
    n_chunks = n_cpus
    id_chunks = np.array_split(id_chunk, n_chunks)

    # * Multiprocess
    db_list = [db_path]*n_chunks
    geom_features_list = [geom_features]*n_chunks
    geom_dict_list = [geom_dict_path]*n_chunks
    packed = zip(id_chunks, db_list, geom_features_list, geom_dict_list)
    with Pool(processes=n_cpus) as p:
        print(get_time(), 'Finding n nearest DOMs...')
        data = p.map(get_n_nearest_data_multiprocess, packed)
        print(get_time(), 'N nearest DOMs found!')
    
    # * Unpack
    all_events = {}
    for events in data:
        all_events.update(events)

    return all_events
Example #23
def fit_transformers(db_path, n_data, feature_dicts, n_cpus=cpu_count()):
    # * Assumes a RANDOMIZED DB!
    ids = [str(i) for i in range(n_data)]

    # * Load/calculate features, then transform
    keys = [key for key in feature_dicts]
    
    # * Multiprocess
    db_list = [db_path]*len(keys)
    ids_list = [ids]*len(keys)
    n_data_list = [n_data]*len(keys)
    packed = zip(ids_list, feature_dicts.items(), db_list, n_data_list)
    with Pool(processes=n_cpus) as p:
        print(get_time(), 'Fitting transformers...')
        transformers_list = p.map(load_and_fit_transformer, packed)
        print(get_time(), 'Transformers fitted!')

    # * Make a dictionary with the transformers
    transformers = {}
    for transformer in transformers_list:
        transformers.update(transformer)
    
    return transformers
Example #24
def feature_engineer(pack):
    """Calculates desired features for a h5-datafile and appends the new datasets to the file. Multiprocessing-friendly.
    
    Arguments:
        
        packed {tuple} -- a tuple containing:
            i_file {int} -- Filenumber i of N_FILES - to track progress
            file {str} -- absolute path to h5-datafile.
            N_FILES {int} -- Total number of files to process (via multi- or singleprocesing).
    """

    # * Unpack. One input is expected to be compatible with multiprocessing
    i_file, file, N_FILES = pack
    name = Path(file).name

    # * Print progress for our sanity..
    print(hf.get_time(),
          'Processing %s (file %d of %d)' % (name, i_file + 1, N_FILES))

    # * Retrieve wanted engineers - they have to be predefined in get_wanted_feature_engineers (for now)
    functions = get_wanted_feature_engineers()

    # * Now calculate the features on a per event basis.
    d = calc_features(functions, file)

    # * Append our calculations to the datafile
    with h5.File(file, 'a') as f:
        # * Make a 'raw/'-group if it doesn't exist
        if 'raw' not in f:
            raw = f.create_group("raw")

        # * Now make the datasets
        for key, data in d.items():
            dataset_path = 'raw/' + key
            # * Check if it is a DOM-variable or global event-variable
            if data[0].shape:
                # * If dataset already exists, delete it first
                if dataset_path in f:
                    del f[dataset_path]
                f.create_dataset(dataset_path,
                                 data=data,
                                 dtype=h5.special_dtype(vlen=data[0][0].dtype))

            else:
                # * If dataset already exists, delete it first
                if dataset_path in f:
                    del f[dataset_path]
                f.create_dataset(dataset_path, data=data, dtype=data[0].dtype)
Example #25
def move_pickle(pack):
    integer, hep_dir, gpu_dir = pack
    n_per_dir = 10000
    path = hep_dir + str(integer)
    name_range = range(integer * n_per_dir, (integer + 1) * n_per_dir)

    print(get_time(), 'Moving %s' % (path))
    sys.stdout.flush()
    command = 'scp'
    for name in name_range:
        from_ = '[email protected]:' + hep_dir + str(
            integer) + '/' + str(name) + '.pickle'
        to = gpu_dir + str(integer) + '/' + str(name) + '.pickle'
        subprocess.run([command, from_, to])

        event = pickle.load(open(to, "rb"))
        if event['meta']['particle_code'] != '140000':
            Path(to).unlink()
Example #26
def inverse_performance_muon_energy(
    name, 
    ids, 
    db, 
    multiprocess=True, 
    debug=False,
    interpolator=None
    ):
    """Given a pickled dataset, a weight is calculated for each event. The weight is calculated (using a quadratic spline) as 

    w = (icecube_performance)**-0.5.

    In other words, the inverse of Icecubes performance in each energy range. It can be chosen to only use a fraction of the dataset for the creation of the quadratic spline. If an event is not in the mask, it is assigned a nan as weight.

    The weights are normalized such that the average weight of an event in a batch is 1. 

    Arguments:
        masks {list} -- Masknames for the data to calculate weights on
        dataset_path {str} -- path to dataset
    
    Keyword Arguments:
        multiprocess {bool} -- Whether or not to use multiprocessing in calculating weights for each event (default: {True})
        from_frac {float} -- Lower limit of the amount of data to use to calculate the spline (default: {0.0})
        to_frac {float} -- Upper limit of the amount of data to use to calculate the spline (default: {1.0})
    
    Returns:
        dict -- Weights for each event
    """ 

    # Get indices used for interpolator-calculation
    if not interpolator:
        n_events = min(len(ids), USE_N_EVENTS)
        event_ids = ids[:n_events]
        print(get_time(), 'Calculating performance..')
        x, counts, retro_sigmas = calc_energy_performance_weights(event_ids, db)
        weights_unscaled = 1.0/np.array(retro_sigmas)
        print(get_time(), 'Performance calculated!')
        
        print(get_time(), 'Fitting interpolator')
        interpolator = make_scaled_interpolator(weights_unscaled, counts, x)
        print(get_time(), 'Interpolator fitted!')

    # Loop over all events using multiprocessing
    print(get_time(), 'Assigning energy weights...')
    if multiprocess:
        weights_dict = assign_energy_weights_multiprocess(
            ids, db, interpolator, debug=debug
        )
    print(get_time(), 'Energy weights assigned!')

    return weights_dict, interpolator
Example #27
def unpack_tar_remove(pack):
    tarball, path = pack

    command = 'tar'
    flags_tar = '-xf'
    flags_dir = '-C'

    # * The tar was created in a silly way - it is deeply nested in
    # * lustre/hpc/hep/bjoernhm/CubeML/data/oscnext-genie-level5-v01-01-pass2/pickles/.
    # * This is unwanted. Therefore, standing in ../pickles run:
    # * mv lustre/hpc/hep/bjoernhm/CubeML/data/oscnext-genie-level5-v01-01-pass2/pickles/* .

    subprocess.run([command, flags_tar, tarball, flags_dir, path])

    # * Remove the tarball
    tarball.unlink()

    print(get_time(), 'Unpacked and removed %s' % (tarball))
    sys.stdout.flush()
Example #28
def make_tars():
    """Script to pack pickle-directories with single events into .tars 

    Must hardcode where pickles are located and where tars should be put.
    """
    # * Setup - where to load data, how many events
    data_dir = get_project_root() + '/data/oscnext-genie-level5-v01-01-pass2/'
    from_ = data_dir + 'pickles'
    to_ = data_dir + 'tarballs'
    pickle_dirs = [path for path in Path(from_).iterdir()]

    # * Zip and multiprocess
    to_list = [to_] * len(pickle_dirs)
    packed = [entry for entry in zip(pickle_dirs, to_list)]

    available_cores = cpu_count()
    with Pool(available_cores + 2) as p:
        p.map(make_tar, packed)

    print(get_time(), 'Finished making tarballs!')
Example #29
def calc_weights_multiprocess(pack):
    indices, interpolator, key, path, n_per_dir, subprocess_id = pack

    weights = [-1] * len(indices)
    n_indices = len(indices)
    for i_index, index in enumerate(indices):
        # * Check each file.
        full_path = path + '/pickles/' + str(
            index // n_per_dir) + '/' + str(index) + '.pickle'

        event = pickle.load(open(full_path, "rb"))
        energy = event['raw']['true_primary_energy']
        weights[i_index] = interpolator(energy)

        if (i_index) % PRINT_EVERY == 0:
            print(
                get_time(), 'Subprocess %d: Processed %d of %d' %
                (subprocess_id, i_index, n_indices))
            sys.stdout.flush()

    return weights
Example #30
def find_energy_interval_passed_cands(pack):
    # Unpack
    ids, db, min_energy, max_energy = pack

    accepted = []
    energy_key = 'true_primary_energy'

    # Split into chunks (at least one, so np.array_split gets a valid section count)
    n_chunks = max(1, len(ids) // CHUNK_SIZE)
    chunks = np.array_split(ids, n_chunks)

    # Load transformer
    transformer_path = '/'.join(
        [PATH_DATA_OSCNEXT, 'sqlite_transformers.pickle'])
    transformers = joblib.load(open(transformer_path, 'rb'))
    transformer = transformers[energy_key]

    # Loop over chunks
    for i_chunk, chunk in enumerate(chunks):

        # Fetch energy
        data_dict = db.fetch_features(all_events=chunk,
                                      scalar_features=[energy_key])
        energies_transformed = np.array(
            [data_d[energy_key] for event_id, data_d in data_dict.items()])

        # inverse transform
        energies = np.squeeze(
            transformer.inverse_transform(energies_transformed.reshape(-1, 1)))

        # add or discard
        for event_id, energy in zip(data_dict.keys(), energies):
            if min_energy <= energy <= max_energy:
                accepted.append(int(event_id))

        # Print for sanity
        print(get_time(), 'Processed chunk %d of %d' % (i_chunk + 1, n_chunks))
        sys.stdout.flush()

    return accepted
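
Like the other db-based filters (Examples #7 and #8), find_energy_interval_passed_cands expects a pre-packed tuple. A driver sketch that chunks the event IDs across workers and concatenates the accepted lists (it assumes the db handle can be pickled and shared with the worker processes):

from multiprocessing import Pool, cpu_count
import numpy as np

def run_energy_filter(ids, db, min_energy, max_energy, n_workers=cpu_count()):
    # Each worker gets its own slice of IDs plus the shared arguments.
    id_chunks = np.array_split(ids, n_workers)
    packed = [(list(chunk), db, min_energy, max_energy) for chunk in id_chunks]

    with Pool(processes=n_workers) as p:
        accepted_lists = p.map(find_energy_interval_passed_cands, packed)

    return [event_id for accepted in accepted_lists for event_id in accepted]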