def compare(model1, model2, validation, max_iter=-1):
    """Yield per-sample validation losses (EMD) for each of the two models."""
    for m in [model1, model2]:
        g = Gym(
            m,
            device,
            lambda i, t: EMD.torch_auto(i, t, False),  # per-sample loss (mean=False)
            None,  # no training loader needed; validation only
            validation,
            max_validation_steps=max_iter,
        )
        yield np.hstack(g.validation_loss())  # restack batches into one array
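# A minimal usage sketch of compare(); `model_a`, `model_b`, and `val_loader`
# are hypothetical stand-ins for two trained models and a validation DataLoader.
losses_a, losses_b = compare(model_a, model_b, val_loader)
print(f"median EMD: {np.median(losses_a):.3g} vs {np.median(losses_b):.3g}")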
def val_loss_and_auc(model: Gym):
    """Per-sample validation losses plus ROC AUC for anomalies vs. "valid" events."""
    loss_func = lambda p, t: EMD.torch_auto(p, t, mean=False)
    val_losses = np.hstack(model.validation_loss(loss_func))  # restack batches
    names = model.data_val.dataset.df["MC_name"].values
    truth = (names != "valid").astype("int")  # 1 = anomaly, 0 = valid
    pred = val_losses  # the reconstruction loss acts as the anomaly score
    fpr, tpr, _ = metrics.roc_curve(truth, pred)
    auc = metrics.auc(fpr, tpr)
    return val_losses, auc
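# Self-contained illustration of the labeling convention above, on synthetic
# scores only (no project code involved): anomalies should receive larger
# reconstruction losses, which pushes the AUC above 0.5.
import numpy as np
from sklearn import metrics

rng = np.random.default_rng(0)
valid_scores = rng.normal(1.0, 0.3, 1000)    # low reconstruction loss
anomaly_scores = rng.normal(2.0, 0.5, 200)   # higher reconstruction loss
truth = np.r_[np.zeros(1000), np.ones(200)]  # 0 = "valid", 1 = anomaly
pred = np.r_[valid_scores, anomaly_scores]
fpr, tpr, _ = metrics.roc_curve(truth, pred)
print(metrics.auc(fpr, tpr))  # well above 0.5 for separable score distributions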
def __init__(
    self,
    name,
    gym_factory,
    auc_classes=None,
    batches=1000,
    val_loss_func=lambda p, t: EMD.torch_auto(p, t, mean=False),
    num_workers=1,
    add_attributes=None,  # avoid a shared mutable default argument
    catch_ctrl_c=False,
):
    super().__init__(name, num_workers, catch_ctrl_c)
    self.gym_factory = gym_factory
    self.batches = batches
    self.val_loss_func = val_loss_func
    self.auc_classes = auc_classes
    self.add_attributes = add_attributes if add_attributes is not None else {}
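# Hypothetical instantiation; `ModelScan` stands in for whatever class defines
# the __init__ above, and `make_gym` is an assumed user-supplied factory.
scan = ModelScan(
    name="emd-scan",
    gym_factory=make_gym,           # callable returning a configured Gym
    auc_classes=["valid", "peak"],  # assumed label set
    batches=500,
    num_workers=4,
)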
def _setup(self, config):
    # Build the model from the configured factory.
    name = config.get("model_factory", None)
    self.model_factory = mappings.models[name]
    self.model = self.model_factory(config)
    # Controls how often workers report model performance.
    self.batches_per_step = config.get("batches_per_step", 1)
    self.max_validation_steps = config.get("max_validation_steps", 10)
    # GPU or CPU?
    use_cuda = config.get("use_gpu") and torch.cuda.is_available()
    print("CUDA:", use_cuda)
    self.device = torch.device("cuda" if use_cuda else "cpu")
    self.model.to(self.device)
    self.batch_size = config.get("batch_size", None)
    self.verbose = config.get("verbose", False)
    # Optimizer selection.
    optimizer = config.get("optimizer", "Adam")
    if optimizer == "SGD":
        self.optimizer = optim.SGD(
            self.model.parameters(),
            lr=config.get("lr", 0.01),
            momentum=config.get("momentum", 0.9),
        )
    elif optimizer == "Adam":
        self.optimizer = optim.Adam(self.model.parameters())
    else:
        raise NotImplementedError(f"optimizer {optimizer!r} is not supported")
    self.validation_loss_F = lambda p, t: EMD.torch_auto(p, t, mean=False)
    loss_name = config.get("training_loss", "mse_loss")
    self.train_loss_F = mappings.losses[loss_name]
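# A minimal config dict covering every key that _setup reads; the concrete
# values and the "conv_ae" factory name are assumptions, not taken from the repo.
config = {
    "model_factory": "conv_ae",   # key into mappings.models (hypothetical name)
    "batches_per_step": 1,
    "max_validation_steps": 10,
    "use_gpu": True,
    "batch_size": 64,
    "verbose": False,
    "optimizer": "SGD",           # or "Adam" (the default)
    "lr": 0.01,
    "momentum": 0.9,
    "training_loss": "mse_loss",  # key into mappings.losses
}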
best_loss = argmedian(runs.loss_train)  # index of the median-loss run
best_auc = np.argmax(runs.auc)          # index of the best-AUC run (not used below)
gym = training.best_model_factory()
model = gym.model
model.load_state_dict(runs.model[best_loss].to_dict())
model.eval()

# %%
losses = []
for d in tqdm(data):
    d = d["data"]
    d = d.to(device)
    pred = model(d)
    loss = EMD.torch_auto(pred, d, False).detach().flatten().cpu().numpy()
    losses.append(loss)
losses = np.hstack(losses)

# %%
unique_seps = np.unique(separations)
loss_valid = losses[separations == 0]  # separation 0 marks the "valid" class
aucs = []
for sep in unique_seps:
    if sep == 0:
        continue
    loss_peak = losses[separations == sep]
    classes = [0] * len(loss_valid) + [1] * len(loss_peak)
    auc = analysis.calc_auc(np.hstack((loss_valid, loss_peak)), classes)
    aucs.append(auc)
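# %%
# `argmedian` is used above but not defined in this excerpt; a plausible
# one-liner, assuming it should return the index of the median element
# (the repo's own version may differ).
def argmedian(x):
    x = np.asarray(x)
    return int(np.argsort(x)[len(x) // 2])  # upper median for even lengths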
def ks_test(observation_pdf, pdf):
    """Two-sample KS test between an observed waveform and its reconstruction."""
    # observ_cdf = np.cumsum(observation_pdf)
    # cdf = np.cumsum(pdf)
    ks_stat, p_value = stats.ks_2samp(observation_pdf.reshape(-1), pdf.reshape(-1))
    return p_value
    # return np.max(np.abs(observ_cdf - cdf))


all_ks = []
losses_val = []
val_classes = []
for d in tqdm(model.data_val):
    data = d["data"]
    mc_type = d["MC_type"]  # renamed from `type` to avoid shadowing the builtin
    data = data.to(model.device)
    pred = model.model(data).detach()
    losses_val.append(EMD.torch_auto(pred, data, False).cpu().numpy())
    pred = pred.cpu().numpy()
    data = data.cpu().numpy()
    val_classes.append(mc_type)
    ks = [ks_test(data[i], pred[i]) for i in range(len(pred))]
    all_ks.append(ks)
ks = np.hstack(all_ks)
losses_val = np.hstack(losses_val)
val_classes = np.hstack(val_classes)

# %%
calc_auc(losses_val, val_classes != 0)

# %%
# Negate the p-values: anomalies get *low* p-values, while the ROC convention
# expects higher scores for the anomalous class.
interactive.save_value("AUC for KS as metric", calc_auc(-ks, val_classes != 0), ".2f")

# %%
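# `calc_auc` is not defined in this excerpt; a sketch consistent with how it is
# called above (scores plus boolean/int classes -> AUC), assuming it wraps the
# same sklearn routines used in val_loss_and_auc.
import numpy as np
from sklearn import metrics

def calc_auc(scores, classes):
    fpr, tpr, _ = metrics.roc_curve(np.asarray(classes).astype(int), scores)
    return metrics.auc(fpr, tpr)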
except:  # fragment: the matching `try` precedes this excerpt; fall back to the class name
    modelname = model.model.__class__.__name__
name = f"{i}, Model:{modelname}, Loss: {lossname}"
print(f"Training {name}:")
loss = model.train_batches(steps)
x = np.linspace(0, (steps + 1) * dl_config["batch_size"], len(loss))
plt.plot(x, loss)
plt.xlabel("# of waveforms used for training")
plt.ylabel(f"loss {lossname}")
plt.xscale("log")
plt.figtext(0, 0, name)
plt.show_and_save(f"{name} + training")

loss_func = lambda p, t: EMD.torch_auto(p, t, mean=False)
val_losses = np.hstack(model.validation_loss(loss_func))  # restack batches
names = dataset_val.df["MC_name"].values
# First histogram only fixes the bin edges; redraw as a stacked histogram below.
_, bins, _ = plt.hist(val_losses, int(np.sqrt(len(val_losses))), label="everything")
plt.clf()
unames = np.unique(names)
data = [val_losses[names == name] for name in unames]
plt.hist(data, bins=bins, label=unames, stacked=True)
plt.xlabel("EMD loss")
plt.ylabel("frequency")
plt.legend()
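# %%
# `plt.show_and_save` is not part of the matplotlib API; the repo presumably
# monkey-patches a helper onto pyplot. A minimal sketch of what such a helper
# could look like (the "figures" folder name is an assumption):
import os
import matplotlib.pyplot as plt

def show_and_save(title, folder="figures"):
    # Persist the current figure under its title, then display it.
    os.makedirs(folder, exist_ok=True)
    plt.savefig(os.path.join(folder, f"{title}.png"), bbox_inches="tight")
    plt.show()

plt.show_and_save = show_and_save  # mirrors the call style used above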