Example #1
# assumed imports, matching the other examples in this collection:
from icae.tools import analysis
import icae.interactive_setup as interactive


class small_runs:
    name = "1e5"
    with_outliers = analysis.TrainingStability("Ribbles_w_outliers_1e5", None)
    with_outliers.load()

    without_outliers = analysis.TrainingStability("Ribbles_wo_outliers_1e5",
                                                  None)
    without_outliers.load()

    interactive.save_value("ensemble size 1e5 with outliers",
                           len(with_outliers.runs), ".1e")
    interactive.save_value("ensemble size 1e5 without outliers",
                           len(without_outliers.runs), ".1e")
Example #2
#%%
from icae.results.n01_toy.n01_generate_data import *

from icae import interactive_setup as interactive

print("Generating Allen (training set):")
gen = toy.Generator(distances_to_DOM, angles_to_DOM, photons_per_event)
base_size = int(5e6)
noise_factor = 0.01
interactive.save_value("noise factor in training set", noise_factor)

#%%
gen.generate_valid(base_size)
#%%
# plot examples for noise
print("Gaussian reshape noise:")

example_params = {"d": 25, "η": 0.1}
valid = gen.pdf_valid(**example_params)
means = np.linspace(0.2, 0.8, num=5)
stds = np.linspace(0.1, 0.8, num=5)
impacts = np.linspace(0.1, 1, num=5)
count_variations = len(means) * len(stds) * len(impacts)

for impact in tqdm(impacts, "Plotting examples"):
    for std in stds:
        for mean in means:
            times, pdf = gen.pdf_gaussian_reshaped(
                **example_params,
                mean=mean,
                std=std,
                impact=impact,  # assumed remaining argument; the excerpt is truncated here
            )
Example #3
import numpy as np


def weighted_avg_and_std(values, weights):
    """Weighted mean and standard deviation (function head reconstructed;
    only the return statement survived in the original excerpt)."""
    average = np.average(values, weights=weights)
    variance = np.average((values - average) ** 2, weights=weights)
    return (average, np.sqrt(variance))
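# Quick illustrative check (not in the original): uniform weights reproduce
# the unweighted population statistics.
_x = np.array([1.0, 2.0, 3.0, 4.0])
_m, _s = weighted_avg_and_std(_x, np.ones_like(_x))
assert np.isclose(_m, _x.mean()) and np.isclose(_s, _x.std())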


# re-weighted to 5% outliers; the per-element weights below give an exact
# 95/5 split only if the three groups are equally sized
weights = np.concatenate([
    np.repeat(0.025, len(data[0])),
    np.repeat(0.025, len(data[1])),
    np.repeat(0.95, len(data[2]))
])
d = np.concatenate(data)
mean, std = weighted_avg_and_std(d, weights)
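# Illustrative check of the weighting scheme (hypothetical group size, not in
# the original): with equal group sizes the first two groups carry 5% of the mass.
_n = 1000
assert np.isclose(2 * 0.025 * _n / (2 * 0.025 * _n + 0.95 * _n), 0.05)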

# peak position: centre of the highest histogram bin (`combined` and `bins`
# come from an earlier, truncated part of this script)
peak_index = np.argmax(combined)
peak = (bins[peak_index] + bins[peak_index + 1]) / 2

interactive.save_value("mean val loss reweighted", mean, ".1e")
interactive.save_value("std val loss reweighted", std, ".1e")
interactive.save_value("peak val loss reweighted", peak, ".1e")

# %%
# `names` and `val_losses` come from the truncated part of this script;
# `metrics` is scikit-learn's metrics module (import not shown)
truth = (names != "valid").astype("int")
pred = val_losses
fpr, tpr, _ = metrics.roc_curve(truth, pred)
auc = metrics.auc(fpr, tpr)

lw = 2
plt.plot(fpr,
         tpr,
         color="darkorange",
         lw=lw,
         label="ROC curve (area = %0.2f)" % auc)
Example #4
# assumed imports (not shown in this excerpt): interactive, training, TrainingStability
interactive.set_saved_value_yaml_root_by_filename(__file__)

#%%
# Stability Test


def gym_factory():
    m = training.best_model_factory()
    m.verbose = False
    m.max_validation_steps = -1  # presumably: no cap, validate on the full set
    return m


def auc_classes(model):
    names = model.data_val.dataset.df["MC_name"].values
    truth = (names != "valid").astype("int")
    return truth
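# Illustrative labelling convention (example values, not from the original):
# any MC_name other than "valid" counts as an outlier, e.g.
#   names = ["valid", "double_peak", "valid"]  ->  truth = [0, 1, 0]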


# run indefinitely, accumulating stability runs for both ensemble sizes
# (presumably stopped by hand)
while True:
    for name, batches in zip(["1e5", "1e6"], [1000, 10000]):
        interactive.save_value("waveforms used for training " + name,
                               batches * gym_factory().data_train.batch_size)
        r = TrainingStability("Ribbles_w_outliers_" + name,
                              gym_factory,
                              auc_classes,
                              batches=batches)
        r.run(20)  # presumably: train 20 independently initialised models

# %%
Example #5
import numpy as np  # assumed import
from ray import tune  # assumed import, used for tune.Analysis below
from tqdm import tqdm  # assumed import

import icae.interactive_setup as interactive  # assumed import
from icae.models.waveform.simple import ConvAE
from icae.tools.analysis import calc_auc, plot_auc, TrainingStability
import icae.tools.loss.EMD as EMD
from icae.tools.torch.gym import Gym
from icae.tools.dataset.MC import MCDataset
from icae.tools.hyperparam import mappings
from icae.tools.dataset.single import SingleWaveformPreprocessing

plt.set_plot_path(__file__)  # `plt` is icae's plotting wrapper; its import is not shown in this excerpt
interactive.set_saved_value_yaml_root_by_filename(__file__)

#%%
ana = tune.Analysis("~/ray_results/final-1/")
# %%
dfs = ana.trial_dataframes
interactive.save_value("number of configurations", len(dfs), ".1e")
#%%
plot_training_overview = False
if plot_training_overview:
    ax = None  # This plots everything on the same plot
    for d in tqdm(dfs.values()):
        d.auc.plot(ax=ax, legend=False)
    plt.show_and_save("training overview")

#%%
aucs = []
times = []
for d in tqdm(dfs.values()):
    aucs.extend(d.auc)
    times.extend(range(len(d.auc)))
times = np.asarray(times)  # a commented-out scaling, (times + 1) * hpo.waveforms_per_step, was left here
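# Illustrative aggregation (not in the original excerpt): mean AUC per
# training step across all trials.
aucs = np.asarray(aucs)
mean_auc_per_step = np.array([aucs[times == t].mean() for t in np.unique(times)])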
Example #6
from tqdm import tqdm
from box import Box

from icae.tools.config_loader import config
import icae.toy.waveformMC as toy
import icae.interactive_setup as interactive
import numpy as np  # assumed import, needed below
from scipy import stats  # assumed import, needed below

interactive.set_saved_value_yaml_root_by_filename(__file__)
out_file = config.root + config.MC.filename

#%%
distances_to_DOM = np.linspace(5, 50, num=30)  # m
angles_to_DOM = np.deg2rad(np.linspace(0, 180, num=30))
# scipy's uniform(loc, scale) spans [loc, loc + scale]; a 10..3000 photon
# range therefore needs scale = 2990, matching the values saved below
photons_per_event = stats.uniform(10, 2990)

interactive.save_value("min angle", 0)
interactive.save_value("max angle", 180)
interactive.save_value("min distance", min(distances_to_DOM))
interactive.save_value("max distance", max(distances_to_DOM))
interactive.save_value("min photons", 10)
interactive.save_value("max photons", 3000)
interactive.save_value(
    "unique parameter combinations",
    len(distances_to_DOM) * len(angles_to_DOM) * (3000 - 10),
)
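# i.e. 30 distances × 30 angles × 2990 photon counts = 2,691,000 combinations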
# %%

# generate three datasets:
#  - Allen: big dataset with few outliers (1%) as training data
#  - Betty: small dataset with 50% outliers as validation data
#  - Conrad: dataset with only double-peak waveforms to measure differentiation
# (a minimal generation sketch follows below)
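# Minimal sketch of the plan above. Only generate_valid() appears in these
# excerpts; generate_outliers() is hypothetical and therefore left commented out.
gen = toy.Generator(distances_to_DOM, angles_to_DOM, photons_per_event)
n_allen = int(5e6)
gen.generate_valid(int(n_allen * 0.99))        # Allen: 99% valid waveforms
# gen.generate_outliers(int(n_allen * 0.01))   # Allen: 1% outliers (hypothetical API)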
Example #7
# %%
# `small_runs` and `big_runs` are result-holder classes as in Example #1;
# imports (plt, scipy.stats, interactive) sit in the truncated part of this script
for run in [small_runs, big_runs]:
    data = [
        run.with_outliers.auc,
        # drop runs whose AUC was never computed (None entries)
        run.without_outliers.auc[run.without_outliers.auc != None]
    ]
    labels = ["with outliers", "without outliers"]
    plt.hist(data, bins=30, histtype="step", density=True, label=labels)
    plt.xlabel("AUC")
    plt.ylabel("frequency")
    plt.legend(loc="upper left")
    plt.show_and_save("training without outliers - AUC comparison " + run.name)
    KS_statistic, p_value = scipy.stats.ks_2samp(*data, mode='exact')
    interactive.save_value(
        "p-value null hypo AUC of training wo outliers is the same as training with outliers "
        + run.name, p_value, ".2e")
    interactive.save_value(
        "ks statistic null hypo AUC of training wo outliers is the same as training with outliers "
        + run.name, KS_statistic, ".1e")
    interactive.save_value("mean AUC with outliers " + run.name,
                           f"{data[0].mean():.2f} ")
    interactive.save_value("mean AUC without outliers " + run.name,
                           f"{data[1].mean():.2f}")

    data = [
        run.with_outliers.loss_val.mean(axis=1),
        run.without_outliers.loss_val.mean(axis=1)
    ]
    plt.hist(data, bins=30, histtype="step", density=True, label=labels)
    plt.xlabel("EMD mean validation loss")
Example #8
# The start of this excerpt is truncated; the dangling call below is closed
# with an assumed constructor (MCDataset appears in Example #5's imports).
dataset_val_MC = MCDataset(
    split=1,
)
# DataLoader is torch.utils.data.DataLoader (import not shown)
val_MC = DataLoader(dataset_val_MC, batch_size=12, num_workers=1)

#%%
normal_toy_runs = analysis.TrainingStability("Ribbles_w_outliers_1e6", None)
normal_toy_runs.load()

#%%
best_loss = np.argmin(normal_toy_runs.loss_train)
best_model = normal_toy_runs.model[best_loss]
model_toy = ConvAE()
model_toy.load_state_dict(best_model.to_dict())
model_toy.eval()

interactive.save_value("auc of best toy model on toy val",
                       normal_toy_runs.auc[best_loss])

#%%
model_MC = ConvAE({"latent_size": 6})
model_MC.load_state_dict(
    torch.load(
        config.root +
        "icae/models/trained/1e+06 samples 1.3e-02 loss latent_space 6 IceMC.pt"
    ))
model_MC.eval()


# %%
def compare(model1, model2, validation, max_iter=-1):
    for m in [model1, model2]:
        g = Gym(m)  # further constructor arguments and the rest of compare() are truncated in this excerpt
Example #9
# (excerpt begins mid-script: `means`, `stds`, `unique_seps`, `loss_valid`,
# `losses` and `separations` come from the truncated part; `pearsonr` below
# is scipy.stats.pearsonr, import not shown)
means = np.array(means)
stds = np.array(stds)
plt.plot(unique_seps, means, label="mean loss of outlier waveforms")
plt.fill_between(unique_seps,
                 means + stds,
                 means - stds,
                 alpha=0.3,
                 label="standard deviation of loss")

m = loss_valid.mean()
s = loss_valid.std(ddof=1)
plt.plot(unique_seps, [m] * len(unique_seps),
         label="mean loss of normal waveforms")
plt.fill_between(unique_seps, [m + s] * len(unique_seps),
                 [m - s] * len(unique_seps),
                 alpha=0.3)

plt.xlabel("peak separation in s")
plt.ylabel("EMD loss")
plt.legend(loc="upper left")
plt.show_and_save("peak separation vs loss")

# %%
r, p_val = pearsonr(losses, separations)
interactive.save_value("Pearson correlation of loss to peak separation", r,
                       ".2f")
interactive.save_value("Pearson p-value of loss to peak separation", p_val,
                       ".2e")

# %%
Example #10
def val_loss_and_auc(model: Gym):
    loss_func = lambda p, t: EMD.torch_auto(p, t, mean=False)
    val_losses = np.hstack(model.validation_loss(loss_func))  # restack batches
    names = model.data_val.dataset.df["MC_name"].values
    truth = (names != "valid").astype("int")
    # the reconstruction loss itself serves as the anomaly score
    pred = val_losses
    fpr, tpr, _ = metrics.roc_curve(truth, pred)
    auc = metrics.auc(fpr, tpr)
    return val_losses, auc
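# Illustrative use (hypothetical; gym_factory is defined in Example #4):
# losses, auc = val_loss_and_auc(gym_factory())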



r = TrainingStability("Ribbles_w_outliers_1e6", None)
r.load()

interactive.save_value("trained models used for stability estimates", len(r.model))
#%%
loss_val = r.loss_val.mean(axis=1)
cut = np.quantile(r.loss_train, 0.8)  # alternative: np.mean(loss_val) + np.std(loss_val) * 0.5
# cut2 = np.mean(r.auc) + np.std(loss_val) * 0.5
keep = r.loss_train < cut  # optionally also: & (r.auc > cut2)
interactive.save_value("excluded values from stability plot",
                       f"{100 * sum(~keep) / len(keep):0.1f}")

plt.hexbin(loss_val[keep], r.auc[keep], linewidths=0.2,
           gridsize=int(np.sqrt(len(r.auc[keep]))))
plt.xlabel("EMD validation loss")
plt.ylabel("AUC")
plt.colorbar(label="count")
plt.show_and_save(f"va training stability on {len(r.auc)} samples")

# %%
plt.hexbin(r.loss_train[keep], r.auc[keep], linewidths=0.2,
           gridsize=int(np.sqrt(len(r.auc[keep]))))
Example #11
# (excerpt begins mid-script: `low_loss`, `bins`, `data`, `equivalent_1e6`
# and `batch_size` are defined in the truncated part)
#plt.subplot(2, 1, 1)
plt.hist(low_loss, bins=bins)
plt.xlabel("EMD loss")
plt.ylabel("count")
plt.yscale("log")
#plt.xlim(0,max(low_loss)*1.01)

# plt.subplot(2, 1, 2)
# plt.hist(data[data > 0], bins=bins)
# plt.xlabel("EMD loss")
# plt.ylabel("count")
# plt.yscale("log")

plt.show_and_save("loss hist 1e6")

interactive.save_value("loss hist 1e6 number of models", len(data), ".1e")
interactive.save_value(
    "number of waveforms for loss hist 1e6 plot", equivalent_1e6 * batch_size, ".1e"
)
interactive.save_value(
    "percentage of bad models", len(data[data > 0.07]) / len(data) * 100, ".1f"
)

interactive.save_value(
    "median loss for 1e6", np.median(data), ".3f"
)

#%%
# remove failed models
#filter = (r.loss_val.mean()>0)&(r.loss_val.mean()<1)&(r.loss_val.mean()==r.loss_val.mean())
#r.loss_val = r.loss_val[filter]
Example #12
        # (excerpt begins inside a loop over training durations `i_t` and
        # latent sizes `i_l`; `argmedian` is presumably a project helper)
        best = argmedian(r.loss_train[indices])

        actual_times.append(np.mean([len(x) for x in r.losses_train[indices]]))
        aucs_best[i_t, i_l] = r.auc[indices][best]
        aucs_mean[i_t, i_l] = np.mean(r.auc[indices])
        aucs_std[i_t, i_l] = np.std(r.auc[indices], ddof=1) / np.sqrt(
            len(r.auc[indices]))

        mean_val_losses = r.loss_val[indices].mean(axis=1)
        val_loss_mean[i_t, i_l] = mean_val_losses.mean()
        train_loss_mean[i_t, i_l] = r.loss_train[indices].mean()
        val_loss_mean_std[i_t, i_l] = mean_val_losses.std(ddof=1) / np.sqrt(
            len(mean_val_losses))
        n_models.append(len(r.auc[indices]))

interactive.save_value("mean number of models data point", np.mean(n_models),
                       ".1e")
assert len(np.unique(actual_times)) == len(
    np.array(times)
), "models haven't been trained for as long as they should have been!"
# %%
for i, latent_dim in enumerate(latents):
    plt.errorbar(times,
                 aucs_mean[:, i],
                 yerr=aucs_std[:, i],
                 label=f"latent dim: {latent_dim}")
# TODO: insert line at 1e6 to indicate when training set is exhausted
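# Sketch of the TODO above (not in the original; assumes `plt` proxies
# matplotlib's axvline):
plt.axvline(1e6, color="gray", linestyle="--", label="training set exhausted")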

plt.ylabel("mean AUC")
plt.xlabel("waveforms used for training")
plt.xscale("log")
plt.ylim([0.5, 0.9])