Example #1
def make_data():
    """Gather inverse-transformed true energies and SRT-cleaned pulse counts from the train/val/test DBs."""
    dbs = [PATH_TRAIN_DB, PATH_VAL_DB, PATH_TEST_DB]
    cleaned_key = 'srt_in_ice_pulses_event_length'
    energy_key = 'true_primary_energy'
    data_d = {cleaned_key: [], energy_key: []}
    transformer = joblib.load(
        open(PATH_DATA_OSCNEXT + '/sqlite_transformers.pickle',
             'rb'))[energy_key]
    for db_path in dbs:

        # Load seq lengths
        db = SqliteFetcher(db_path)
        data_dicts = db.fetch_features(all_events=db.ids,
                                       meta_features=[cleaned_key],
                                       scalar_features=[energy_key])
        data_d[cleaned_key].extend(
            [d[cleaned_key] for i, d in data_dicts.items()])
        data_d[energy_key].extend(
            [d[energy_key] for i, d in data_dicts.items()])

    data_d[energy_key] = np.squeeze(
        transformer.inverse_transform(
            np.array(data_d[energy_key]).reshape(-1, 1)))

    return data_d[energy_key], data_d[cleaned_key]
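The plotting code that consumes these two arrays is not part of the snippet. A minimal usage sketch, assuming matplotlib and numpy are available and that the inverse-transformed energies are log10 values (as the (0, 4) histogram range in Example #2 suggests):

import numpy as np
from matplotlib import pyplot as plt

# Hypothetical usage of make_data() above: 2D histogram of pulse count vs. energy.
log_energy, n_pulses = make_data()
plt.hist2d(log_energy, np.asarray(n_pulses), bins=(50, 50))
plt.xlabel('log10(true primary energy)')
plt.ylabel('SRT in-ice pulses event length')
plt.colorbar()
plt.savefig('energy_vs_length.png')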
Example #2
def make_data():
    """Histogram the true (inverse-transformed) energy of each neutrino flavour and pickle the result."""
    dbs = [PATH_TRAIN_DB, PATH_VAL_DB, PATH_TEST_DB]
    particles = ['electron_neutrino', 'muon_neutrino', 'tau_neutrino']
    suffixes = ['_train.pickle', '_val.pickle', '_test.pickle']
    key = 'true_primary_energy'
    transformer = joblib.load(
        open(PATH_DATA_OSCNEXT + '/sqlite_transformers.pickle', 'rb'))[key]
    data_d = {}
    i = 0
    for particle in particles:
        all_energies = np.array([])
        for db_path, suffix in zip(dbs, suffixes):

            # Load mask
            path = PATH_DATA_OSCNEXT + '/masks/' + particle + suffix
            mask = [str(e) for e in pickle.load(open(path, 'rb'))]

            # Load energy
            db = SqliteFetcher(db_path)
            data_dicts = db.fetch_features(all_events=mask,
                                           scalar_features=[key])
            energies_trans = np.array([d[key] for e, d in data_dicts.items()])

            # Inverse transform them
            energies = np.squeeze(
                transformer.inverse_transform(energies_trans.reshape(-1, 1)))

            # Add to all
            all_energies = np.append(all_energies, energies)
            print(i)
            i += 1
        data_d[particle] = all_energies

    # Decide on bin size with the Freedman-Diaconis rule (width = 2*IQR / n^(1/3))
    iqr = np.percentile(data_d['muon_neutrino'], 75) - np.percentile(
        data_d['muon_neutrino'], 25)
    n_data = data_d['muon_neutrino'].shape[0]
    bin_width = 2 * iqr / (n_data**0.3333)
    n_bins = int(4 / bin_width)

    hist_vals = {}
    for particle, data in data_d.items():
        hist_vals[particle], edges = np.histogram(data,
                                                  bins=n_bins,
                                                  range=(0.0, 4.0))
    hist_vals['edges'] = edges
    path = Path(os.path.realpath(__file__))

    # Save data
    with open(str(path.parent) + '/data.pickle', 'wb') as f:
        pickle.dump(hist_vals, f)
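A small companion sketch for reading the pickle written above and drawing the per-flavour histograms; the file name and dictionary layout follow the code above, the plotting itself is an assumption:

import pickle
from matplotlib import pyplot as plt

with open('data.pickle', 'rb') as f:
    hist_vals = pickle.load(f)

edges = hist_vals.pop('edges')
centers = 0.5 * (edges[:-1] + edges[1:])
for particle, counts in hist_vals.items():
    plt.step(centers, counts, where='mid', label=particle)
plt.xlabel('log10(true primary energy)')
plt.ylabel('events')
plt.legend()
plt.savefig('energy_histograms.png')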
Example #3
def make_data():
    """Return dom_charge values in physical units and as stored (transformed) in the DB."""
    db_path = PATH_TRAIN_DB
    key = 'dom_charge'
    transformer = joblib.load(
        open(PATH_DATA_OSCNEXT + '/sqlite_transformers.pickle', 'rb'))[key]
    db = SqliteFetcher(db_path)
    # Aim for roughly 1M pulses: at ~50 pulses per event that is about 20k events
    ids = [str(e) for e in range(20000)]

    all_data = db.fetch_features(all_events=ids, seq_features=[key])
    data_lists = [data[key] for event_id, data in all_data.items()]
    data_transformed = np.array(flatten_list_of_lists(data_lists))
    data = np.squeeze(
        transformer.inverse_transform(data_transformed.reshape(-1, 1)))

    return data, data_transformed
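flatten_list_of_lists is imported from the project's helper modules and not shown here. A minimal stand-in with the behaviour the call site needs (concatenating per-event pulse lists into one flat list) would be:

from itertools import chain

def flatten_list_of_lists(list_of_lists):
    # Concatenate a list of per-event sequences into a single flat list.
    return list(chain.from_iterable(list_of_lists))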
Example #4
def make_data():
    """Histogram SRT-cleaned vs. uncleaned pulse-series lengths over all DBs and pickle the result."""
    dbs = [PATH_TRAIN_DB, PATH_VAL_DB, PATH_TEST_DB]
    cleaned_key = 'srt_in_ice_pulses_event_length'
    uncleaned_key = 'split_in_ice_pulses_event_length'
    data_d = {cleaned_key: [], uncleaned_key: []}

    for db_path in dbs:

        # Load seq lengths
        db = SqliteFetcher(db_path)
        data_dicts = db.fetch_features(
            all_events=db.ids, meta_features=[cleaned_key, uncleaned_key])
        data_d[cleaned_key].extend(
            [d[cleaned_key] for i, d in data_dicts.items()])
        data_d[uncleaned_key].extend(
            [d[uncleaned_key] for i, d in data_dicts.items()])

    # Decide on bin size: one unit-wide bin per integer length, edges offset by 0.5
    maxlen = 200
    minlen = 0
    bins = maxlen - minlen + 1

    hist_vals = {}
    for key, data in data_d.items():
        data_clipped = np.clip(data, 0, maxlen)
        hist_vals[key], edges = np.histogram(data_clipped,
                                             bins=bins,
                                             range=(minlen - 0.5,
                                                    maxlen + 0.5))
    hist_vals['edges'] = edges

    path = Path(os.path.realpath(__file__))

    # Save data
    with open(str(path.parent) + '/data.pickle', 'wb') as f:
        pickle.dump(hist_vals, f)
Example #5
        Path(weights_dir).mkdir()

    for name in names:
        if args.interpolator:
            path = PATH_DATA_OSCNEXT + '/weights/' + name + '.pickle'
            interpolator = pickle.load(open(path, 'rb'))
        else:
            interpolator = None

        all_weights = {}
        for path, keyword in zip(
            [PATH_TRAIN_DB, PATH_VAL_DB], 
            ['train', 'val'],
        ):  
            # Get DB and mask
            db = SqliteFetcher(path)
            db_specific_masks = [e+'_'+keyword for e in args.masks]
            ids = load_pickle_mask(PATH_DATA_OSCNEXT, db_specific_masks)
            ids = [
                str(i) for i in ids
            ]
            
            # If developing, use less data
            if args.dev:
                USE_N_EVENTS = 1000
                PRINT_EVERY = 100
                ids = ids[:USE_N_EVENTS]
            
            # Calculate weights and potentially interpolator
            if not interpolator:
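load_pickle_mask is another project helper that lies outside this excerpt. Judging from the mask files written in Example #2 (one pickled list of event ids per mask name), a plausible but hypothetical sketch keeps the ids common to all requested masks; the real implementation may differ:

import pickle

def load_pickle_mask(data_dir, mask_names):
    # Hypothetical: load pickled id lists from <data_dir>/masks and intersect them.
    ids = None
    for name in mask_names:
        with open(data_dir + '/masks/' + name + '.pickle', 'rb') as f:
            mask = set(pickle.load(f))
        ids = mask if ids is None else ids & mask
    return sorted(ids)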
Example #6
import os
import pickle

import h5py as h5
import numpy as np
from matplotlib import pyplot as plt

from src.modules.reporting import *
from src.modules.constants import *
from src.modules.classes import SqliteFetcher
from src.modules.thesis_plotting import *

p = '/home/bjoernhm/CubeML/models/oscnext-genie-level5-v01-01-pass2/regression/full_reg/2020-04-16-11.34.16/data/predictions.h5'

with h5.File(p, 'r') as f:
    en = f['true_primary_energy'][:]
    ids = np.array([str(i) for i in f['index'][:]])

db = SqliteFetcher(PATH_VAL_DB)
db_ids = db.ids
overlap_ids = np.isin(ids, db_ids)
f_i = ids[overlap_ids]
true_e = db.fetch_features(
    all_events=f_i,
    scalar_features=['true_primary_energy']
)
e_t = np.array([d['true_primary_energy'] for i, d in true_e.items()])
e_p = en[overlap_ids]
error = e_p - e_t

e_p, error = sort_pairs(e_p, error)
bins = np.linspace(min(e_p), max(e_p), num=20)
e_p_bins, error_bins = bin_data(e_p, error, bins)
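sort_pairs and bin_data come from the project's helper/reporting modules and are not shown. Sketches with the behaviour the call site suggests (sort both arrays by predicted energy, then group them into the given bins); the real implementations may differ:

import numpy as np

def sort_pairs(x, y):
    # Hypothetical: sort both arrays by ascending x.
    order = np.argsort(x)
    return x[order], y[order]

def bin_data(x, y, bins):
    # Hypothetical: split x and y into groups by which bin each x falls into.
    idx = np.clip(np.digitize(x, bins), 1, len(bins) - 1)
    x_bins = [x[idx == i] for i in range(1, len(bins))]
    y_bins = [y[idx == i] for i in range(1, len(bins))]
    return x_bins, y_bins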
Example #7
    min_doms = args.min_doms
    max_doms = args.max_doms

    min_energy = args.min_energy
    max_energy = args.max_energy
    mask_dict = {
        'mask_name': mask_name,
        'min_doms': min_doms,
        'max_doms': max_doms,
        'min_energy': min_energy,
        'max_energy': max_energy
    }

    # If the mask directory doesn't exist, create it
    mask_dir = '/'.join([PATH_DATA_OSCNEXT, 'masks'])
    if not Path(mask_dir).exists():
        Path(mask_dir).mkdir()

    # Loop over different DBs
    for path, ext in zip([PATH_TRAIN_DB, PATH_VAL_DB],
                         ['_train.pickle', '_val.pickle']):

        db = SqliteFetcher(path)
        print(get_time(),
              '%s mask calculation begun.' % (mask_dict['mask_name']))
        mask, mask_name = make_mask(db, **mask_dict)
        mask_path = '/'.join([mask_dir, mask_name + ext])
        with open(mask_path, 'wb') as f:
            pickle.dump(mask, f)
        print(get_time(), 'Mask created at', mask_path, '\n')
        # Locate the model directory
        model = locate_model(model_dir)
        model_name = Path(model_dir).name

        print('')
        print(get_time(), 'Used model: %s' % (model_name))

        for path in [PATH_TRAIN_DB, PATH_VAL_DB]:
            preds, indices = calc_raw_predictions(
                model,
                n_predictions_wanted=args.n_predictions_wanted,
                db_path=path)

            predictions = {}
            for key in args.prediction_keys:
                predictions[key] = preds[key]

            indices = [str(entry) for entry in indices]
            db = SqliteFetcher(path)
            keys = [key for key in predictions]
            new_keys = [
                key + '_' + remove_dots_and_lines(model_name)
                for key in predictions
            ]
            predictions_newnames = convert_keys(predictions, keys, new_keys)
            print(get_time(), 'Saving to db...')
            for name, values in predictions_newnames.items():
                db.write('scalar', name, indices, values, astype='REAL')
            print(get_time(), 'Data saved.')
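remove_dots_and_lines and convert_keys are defined elsewhere in the project. Based only on their names and call sites, plausible stand-ins (assumptions, not the originals) are:

def remove_dots_and_lines(name):
    # Hypothetical: make a model name safe to embed in an SQL column name.
    return name.replace('.', '_').replace('-', '_')

def convert_keys(d, old_keys, new_keys):
    # Hypothetical: return a copy of d with old_keys renamed to new_keys.
    return {new: d[old] for old, new in zip(old_keys, new_keys)}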
Example #9
def make_data():
    """Build a summary table of each feature's mean and std before and after transformation."""
    seq_keys = [
        'dom_charge', 
        'dom_x', 
        'dom_y', 
        'dom_z', 
        'dom_time', 
        'dom_atwd',
        'dom_pulse_width'
    ]
    target_keys = [
        'true_primary_energy', 
        'true_primary_position_x', 
        'true_primary_position_y', 
        'true_primary_position_z', 
        'true_primary_time', 
        'true_primary_direction_x', 
        'true_primary_direction_y', 
        'true_primary_direction_z'
    ]
    db_path = PATH_TRAIN_DB
    key = 'dom_charge'
    transformers = joblib.load(
        open(PATH_DATA_OSCNEXT + '/sqlite_transformers.pickle', 'rb')
    )
    db = SqliteFetcher(db_path)
    # Use the first 1000 events for this summary table
    ids = [str(e) for e in range(1000)]

    all_data = db.fetch_features(
        all_events=ids, 
        seq_features=seq_keys, 
        scalar_features=target_keys
        )
    data_d = {key: [] for key in all_data['0']}
    for key in target_keys:
        data_d[key] = [data[key] for event_id, data in all_data.items()]
    for key in seq_keys:
        data_d[key].extend(
            flatten_list_of_lists(
                [data[key] for event_id, data in all_data.items()]
                )
            )
    # Calculate means and std's before and after transformation
    dicts = {}
    table = np.empty((5, len(seq_keys)+len(target_keys)), dtype=object)
    for i_key, key in enumerate(data_d):
        data = data_d[key]
        d = {}
        if key in transformers:
            # Name the transformation applied to this feature
            if isinstance(transformers[key], sklearn.preprocessing.QuantileTransformer):
                name = 'ToNormal'
            elif isinstance(transformers[key], sklearn.preprocessing.RobustScaler):
                name = 'LogRobust' if key == 'true_primary_energy' else 'Robust'
            else:
                name = 'Unknown'
            table[0, i_key] = name
            table[3, i_key] = r'%.2f'%(np.mean(data))
            table[4, i_key] = r'%.2f'%(np.std(data))
            data_pre = np.squeeze(
                transformers[key].inverse_transform(
                    np.array(data).reshape(-1, 1)
                )
            )
            if key == 'true_primary_energy':
                table[1, i_key] = r'%.2e'%(np.mean(10**data_pre))
                table[2, i_key] = r'%.2e'%(np.std(10**data_pre))
            else:
                table[1, i_key] = r'%.2e'%(np.mean(data_pre))
                table[2, i_key] = r'%.2e'%(np.std(data_pre))
        else:
            table[0, i_key] = 'None'
            table[1, i_key] = r'%.2f'%(np.mean(data))
            table[2, i_key] = r'%.2f'%(np.std(data))
            table[3, i_key] = r'-'
            table[4, i_key] = r'-'

    index = [r'Transformation', r'$\mu$, before', r'$\sigma$, before', r'$\mu$, after', r'$\sigma$, after']
    columns = []
    for col in [key for key in data_d]:
        split = col.split('_')
        new_col = r'\_'.join(split)
        columns.append(new_col)
    table_pd = pd.DataFrame(
        np.transpose(table),  # one row per feature, one column per statistic
        index=columns,
        columns=index)

    return table_pd
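The labels in the table are already LaTeX-escaped (\_, $\mu$, $\sigma$), so the DataFrame is presumably meant for LaTeX export. One way to render it, assuming pandas' standard to_latex:

# escape=False keeps the pre-escaped labels and math mode intact.
print(make_data().to_latex(escape=False))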
Example #10
import os
import pickle

import numpy as np
from matplotlib import pyplot as plt

from src.modules.helper_functions import *
from src.modules.reporting import *
from src.modules.constants import *
from src.modules.classes import *
from src.modules.thesis_plotting import *

# ! INSERT MASKNAMES HERE
masks = ['tau_neutrino']
ids = [str(e) for e in load_sqlite_mask(PATH_DATA_OSCNEXT, masks, 'val')]

# ! INSERT VARIABLENAME FOR INSPECTION HERE
scalar_var = ['energy_balanced_alpha70', 'true_primary_energy']
db = SqliteFetcher(PATH_VAL_DB)
events = db.fetch_features(all_events=ids[:10], scalar_features=scalar_var)

for event, data in events.items():
    print(event)
    print(data)
    print('')
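fetch_features evidently returns a dict keyed by event id with one feature dict per event. If a tabular view is preferred over the raw print-out, the same data can be loaded into pandas (assumed available):

import pandas as pd

# One row per event, one column per requested scalar feature.
df = pd.DataFrame.from_dict(events, orient='index')
print(df.describe())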
Example #11
from src.modules.db_utils import *
from src.modules.classes import SqliteFetcher
from src.modules.constants import *
import joblib

# Build a transformed copy of the validation database using the fitted transformers
old_db = SqliteFetcher(PATH_DATA_OSCNEXT + '/val_set_sqlite.db')
new_db = SqliteFetcher(PATH_DATA_OSCNEXT + '/val_transformed.db')
transformers = joblib.load(open(PATH_TRANSFORMERS, 'rb'))

create_transformed_db(old_db, new_db, transformers)
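After create_transformed_db has run, a quick, purely illustrative spot check could reuse fetch_features as in the other examples, assuming true_primary_energy exists in both databases and the event ids carry over:

# Compare one event's stored energy before and after the transformation.
event_id = old_db.ids[0]
before = old_db.fetch_features(all_events=[event_id],
                               scalar_features=['true_primary_energy'])
after = new_db.fetch_features(all_events=[event_id],
                              scalar_features=['true_primary_energy'])
print(before[event_id], after[event_id])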