def trainval(exp_dict, savedir_base, reset=False):
    # bookkeeping
    # ---------------

    # get experiment directory
    exp_id = hu.hash_dict(exp_dict)
    savedir = os.path.join(savedir_base, exp_id)

    if reset:
        # delete and backup experiment
        hc.delete_experiment(savedir, backup_flag=True)

    # create folder and save the experiment dictionary
    os.makedirs(savedir, exist_ok=True)
    hu.save_json(os.path.join(savedir, 'exp_dict.json'), exp_dict)
    pprint.pprint(exp_dict)
    print('Experiment saved in %s' % savedir)

    # set seed
    # ---------------
    seed = 42 + exp_dict['runs']
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

    # Dataset
    # -----------

    # train loader
    train_set = datasets.get_dataset(dataset_name=exp_dict["dataset"],
                                     train_flag=True,
                                     datadir=savedir_base,
                                     exp_dict=exp_dict)

    train_loader = torch.utils.data.DataLoader(train_set,
                                               drop_last=True,
                                               shuffle=True,
                                               batch_size=exp_dict["batch_size"])

    # val set
    val_set = datasets.get_dataset(dataset_name=exp_dict["dataset"],
                                   train_flag=False,
                                   datadir=savedir_base,
                                   exp_dict=exp_dict)

    # Model
    # -----------
    model = models.get_model(exp_dict["model"], train_set=train_set).cuda()

    # Choose loss and metric function
    loss_function = metrics.get_metric_function(exp_dict["loss_func"])

    # Compute fstar
    # -------------
    if exp_dict['opt'].get('fstar_flag'):
        ut.compute_fstar(train_set, loss_function, savedir_base, exp_dict)

    # Load Optimizer
    n_batches_per_epoch = len(train_set) / float(exp_dict["batch_size"])
    opt = optimizers.get_optimizer(opt_dict=exp_dict["opt"],
                                   params=model.parameters(),
                                   n_batches_per_epoch=n_batches_per_epoch)

    # Checkpoint
    # -----------
    model_path = os.path.join(savedir, 'model.pth')
    score_list_path = os.path.join(savedir, 'score_list.pkl')
    opt_path = os.path.join(savedir, 'opt_state_dict.pth')

    if os.path.exists(score_list_path):
        # resume experiment
        score_list = hu.load_pkl(score_list_path)
        model.load_state_dict(torch.load(model_path))
        opt.load_state_dict(torch.load(opt_path))
        s_epoch = score_list[-1]['epoch'] + 1
    else:
        # restart experiment
        score_list = []
        s_epoch = 0

    # Train & Val
    # ------------
    print('Starting experiment at epoch %d/%d' % (s_epoch, exp_dict['max_epoch']))

    for e in range(s_epoch, exp_dict['max_epoch']):
        # Set seed
        seed = e + exp_dict['runs']
        np.random.seed(seed)
        torch.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)

        score_dict = {}

        # Compute train loss over train set
        score_dict["train_loss"] = metrics.compute_metric_on_dataset(
            model, train_set, metric_name=exp_dict["loss_func"])

        # Compute val acc over val set
        score_dict["val_acc"] = metrics.compute_metric_on_dataset(
            model, val_set, metric_name=exp_dict["acc_func"])

        # Train over train loader
        model.train()
        print("%d - Training model with %s..." % (e, exp_dict["loss_func"]))

        # train and validate
        s_time = time.time()
        for batch in tqdm.tqdm(train_loader):
            images, labels = batch["images"].cuda(), batch["labels"].cuda()

            opt.zero_grad()

            # closure
            def closure():
                return loss_function(model, images, labels, backwards=True)

            opt.step(closure)

        e_time = time.time()

        # Record metrics
        score_dict["epoch"] = e
        score_dict["step_size"] = opt.state["step_size"]
        score_dict["step_size_avg"] = opt.state["step_size_avg"]
        score_dict["n_forwards"] = opt.state["n_forwards"]
        score_dict["n_backwards"] = opt.state["n_backwards"]
        score_dict["grad_norm"] = opt.state["grad_norm"]
        score_dict["batch_size"] = train_loader.batch_size
        score_dict["train_epoch_time"] = e_time - s_time

        score_list += [score_dict]

        # Report and save
        print(pd.DataFrame(score_list).tail())
        hu.save_pkl(score_list_path, score_list)
        hu.torch_save(model_path, model.state_dict())
        hu.torch_save(opt_path, opt.state_dict())
        print("Saved: %s" % savedir)

    print('Experiment completed')
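# A minimal sketch of how trainval might be invoked. The exp_dict keys are
# inferred from the function body above; the concrete values ("mnist",
# "mlp", "sgd_armijo", ...) and the savedir path are illustrative
# assumptions, not names this codebase is known to define.
def _example_trainval_usage():
    exp_dict = {
        "dataset": "mnist",              # assumed dataset name
        "model": "mlp",                  # assumed model name
        "loss_func": "softmax_loss",     # resolved via metrics.get_metric_function
        "acc_func": "softmax_accuracy",  # validation metric
        "opt": {"name": "sgd_armijo"},   # spec consumed by optimizers.get_optimizer
        "batch_size": 128,
        "max_epoch": 100,
        "runs": 0,                       # per-run seed is derived as 42 + runs
    }
    trainval(exp_dict, savedir_base="/tmp/experiments", reset=False)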
def trainval_svrg(exp_dict, savedir, datadir, metrics_flag=True):
    '''SVRG-specific training and validation loop.'''
    pprint.pprint(exp_dict)

    # Load Train Dataset
    train_set = datasets.get_dataset(dataset_name=exp_dict["dataset"],
                                     train_flag=True,
                                     datadir=datadir,
                                     exp_dict=exp_dict)

    train_loader = DataLoader(train_set,
                              drop_last=False,
                              shuffle=True,
                              batch_size=exp_dict["batch_size"])

    # Load Val Dataset
    val_set = datasets.get_dataset(dataset_name=exp_dict["dataset"],
                                   train_flag=False,
                                   datadir=datadir,
                                   exp_dict=exp_dict)

    # Load model
    model = models.get_model(exp_dict["model"], train_set=train_set).cuda()

    # Choose loss and metric function
    loss_function = metrics.get_metric_function(exp_dict["loss_func"])

    # lookup the learning rate
    lr = get_svrg_step_size(exp_dict)

    # Load Optimizer
    opt = get_svrg_optimizer(model, loss_function,
                             train_loader=train_loader,
                             lr=lr)

    # Resume from last saved state_dict
    if (not os.path.exists(savedir + "/run_dict.pkl") or
            not os.path.exists(savedir + "/score_list.pkl")):
        ut.save_pkl(savedir + "/run_dict.pkl", {"running": 1})
        score_list = []
        s_epoch = 0
    else:
        score_list = ut.load_pkl(savedir + "/score_list.pkl")
        model.load_state_dict(torch.load(savedir + "/model_state_dict.pth"))
        opt.load_state_dict(torch.load(savedir + "/opt_state_dict.pth"))
        s_epoch = score_list[-1]["epoch"] + 1

    for epoch in range(s_epoch, exp_dict["max_epoch"]):
        score_dict = {"epoch": epoch}

        if metrics_flag:
            # 1. Compute train loss over train set
            score_dict["train_loss"] = metrics.compute_metric_on_dataset(
                model, train_set, metric_name=exp_dict["loss_func"])

            # 2. Compute val acc over val set
            score_dict["val_acc"] = metrics.compute_metric_on_dataset(
                model, val_set, metric_name=exp_dict["acc_func"])

        # 3. Train over train loader
        model.train()
        print("%d - Training model with %s..." % (epoch, exp_dict["loss_func"]))

        s_time = time.time()
        for images, labels in tqdm.tqdm(train_loader):
            images, labels = images.cuda(), labels.cuda()

            opt.zero_grad()
            closure = lambda svrg_model: loss_function(
                svrg_model, images, labels, backwards=True)
            opt.step(closure)

        e_time = time.time()

        # Record step size and batch size
        score_dict["step_size"] = opt.state["step_size"]
        score_dict["batch_size"] = train_loader.batch_size
        score_dict["train_epoch_time"] = e_time - s_time

        # Add score_dict to score_list
        score_list += [score_dict]

        # Report and save
        print(pd.DataFrame(score_list).tail())
        ut.save_pkl(savedir + "/score_list.pkl", score_list)
        ut.torch_save(savedir + "/model_state_dict.pth", model.state_dict())
        ut.torch_save(savedir + "/opt_state_dict.pth", opt.state_dict())
        print("Saved: %s" % savedir)

    return score_list
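# A minimal sketch of why the SVRG closure above is parameterized by a model:
# the optimizer must evaluate the same minibatch on both the current iterate
# and the stored snapshot to form the variance-reduced gradient
#   g_i(w) - g_i(w_snapshot) + full_grad(w_snapshot).
# The names below (snapshot_model, full_grad) are assumptions about the
# optimizer's internals, not the actual get_svrg_optimizer implementation.
def _svrg_step_sketch(model, snapshot_model, full_grad, closure, lr):
    model.zero_grad()
    snapshot_model.zero_grad()
    closure(model)           # backward pass at the current iterate
    closure(snapshot_model)  # backward pass at the snapshot, same minibatch
    with torch.no_grad():
        for w, w0, mu in zip(model.parameters(),
                             snapshot_model.parameters(),
                             full_grad):
            # variance-reduced update
            w -= lr * (w.grad - w0.grad + mu)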
def newminimum(exp_id, savedir_base, datadir, name, exp_dict, metrics_flag=True):
    # bookkeeping
    # ---------------

    # get experiment directory
    old_modeldir = os.path.join(savedir_base, exp_id)
    savedir = os.path.join(savedir_base, exp_id, name)

    old_exp_dict = hu.load_json(os.path.join(old_modeldir, 'exp_dict.json'))

    # TODO: compare exp dicts for possible errors:
    # the optimizer has to be the same
    # same network, dataset

    # create folder and save the experiment dictionary
    os.makedirs(savedir, exist_ok=True)
    hu.save_json(os.path.join(savedir, 'exp_dict.json'), exp_dict)
    pprint.pprint(exp_dict)
    print('Experiment saved in %s' % savedir)

    # set seed
    # ---------------
    seed = 42 + exp_dict['runs']
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

    # Dataset
    # -----------

    # Load Train Dataset
    train_set = datasets.get_dataset(dataset_name=exp_dict["dataset"],
                                     train_flag=True,
                                     datadir=datadir,
                                     exp_dict=exp_dict)

    train_loader = torch.utils.data.DataLoader(train_set,
                                               drop_last=True,
                                               shuffle=True,
                                               batch_size=exp_dict["batch_size"])

    # Load Val Dataset
    val_set = datasets.get_dataset(dataset_name=exp_dict["dataset"],
                                   train_flag=False,
                                   datadir=datadir,
                                   exp_dict=exp_dict)

    # Model
    # -----------
    model = models.get_model(exp_dict["model"], train_set=train_set)

    # Choose loss and metric function
    loss_function = metrics.get_metric_function(exp_dict["loss_func"])

    # Load Optimizer
    n_batches_per_epoch = len(train_set) / float(exp_dict["batch_size"])
    opt = optimizers.get_optimizer(opt=exp_dict["opt"],
                                   params=model.parameters(),
                                   n_batches_per_epoch=n_batches_per_epoch)

    # Checkpoint
    # -----------
    model_path = os.path.join(savedir, 'model.pth')
    score_list_path = os.path.join(savedir, 'score_list.pkl')
    opt_path = os.path.join(savedir, 'opt_state_dict.pth')

    old_model_path = os.path.join(old_modeldir, 'model.pth')
    old_score_list_path = os.path.join(old_modeldir, 'score_list.pkl')
    old_opt_path = os.path.join(old_modeldir, 'opt_state_dict.pth')

    score_list = hu.load_pkl(old_score_list_path)
    model.load_state_dict(torch.load(old_model_path))
    opt.load_state_dict(torch.load(old_opt_path))
    s_epoch = score_list[-1]['epoch'] + 1

    # save current model state for comparison
    minimum = []
    for param in model.parameters():
        minimum.append(param.clone())

    # Train & Val
    # ------------
    print('Starting experiment at epoch %d/%d' % (s_epoch, exp_dict['max_epoch']))

    for epoch in range(s_epoch, exp_dict['max_epoch']):
        # Set seed
        np.random.seed(exp_dict['runs'] + epoch)
        torch.manual_seed(exp_dict['runs'] + epoch)
        # torch.cuda.manual_seed_all(exp_dict['runs'] + epoch)  # not needed since no cuda available

        score_dict = {"epoch": epoch}

        if metrics_flag:
            # 1. Compute train loss over train set
            score_dict["train_loss"] = metrics.compute_metric_on_dataset(
                model, train_set, metric_name='softmax_loss')
                # metric_name=exp_dict["loss_func"])
            # TODO: which loss should be used? (plain or with regularizer?)

            # 2. Compute val acc over val set
            score_dict["val_acc"] = metrics.compute_metric_on_dataset(
                model, val_set, metric_name=exp_dict["acc_func"])

        # 3. Train over train loader
        model.train()
        print("%d - Training model with %s..." % (epoch, exp_dict["loss_func"]))

        s_time = time.time()
        for images, labels in tqdm.tqdm(train_loader):
            # images, labels = images.cuda(), labels.cuda()  # no cuda available

            opt.zero_grad()
            # only works with the custom loss function, which takes the
            # stored minimum and a regularization coefficient
            loss = loss_function(model, images, labels, minimum, 0.1)
            loss.backward()
            opt.step()

        e_time = time.time()

        # Record metrics
        score_dict["step_size"] = opt.state["step_size"]
        score_dict["n_forwards"] = opt.state["n_forwards"]
        score_dict["n_backwards"] = opt.state["n_backwards"]
        score_dict["batch_size"] = train_loader.batch_size
        score_dict["train_epoch_time"] = e_time - s_time

        score_list += [score_dict]

        # Report and save
        print(pd.DataFrame(score_list).tail())
        hu.save_pkl(score_list_path, score_list)
        hu.torch_save(model_path, model.state_dict())
        hu.torch_save(opt_path, opt.state_dict())
        print("Saved: %s" % savedir)

        with torch.no_grad():
            print('Current distance: %f' % metrics.computedistance(minimum, model))

    print('Experiment completed')
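# The loop above passes `minimum` and a coefficient (0.1) into the loss, so
# the custom loss must combine the task objective with a term built from the
# distance between the current parameters and the stored minimum. A minimal
# sketch of one plausible form, assuming a cross-entropy task loss; the exact
# form and sign of the regularizer are assumptions, not the repository's
# actual custom loss.
def _regularized_loss_sketch(model, images, labels, minimum, coeff):
    logits = model(images)
    task_loss = torch.nn.functional.cross_entropy(logits, labels)

    # squared L2 distance between current parameters and the stored minimum
    dist = sum(((p - p0) ** 2).sum()
               for p, p0 in zip(model.parameters(), minimum))

    # Subtracting rewards moving away from the old minimum (consistent with
    # searching for a *new* minimum); adding would instead keep the iterate
    # close. The intended sign is an assumption.
    return task_loss - coeff * dist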
def trainval(exp_dict, savedir_base, reset, metrics_flag=True, datadir=None, cuda=False):
    # bookkeeping
    # ---------------

    # get experiment directory
    exp_id = hu.hash_dict(exp_dict)
    savedir = os.path.join(savedir_base, exp_id)

    if reset:
        # delete and backup experiment
        hc.delete_experiment(savedir, backup_flag=True)

    # create folder and save the experiment dictionary
    os.makedirs(savedir, exist_ok=True)
    hu.save_json(os.path.join(savedir, 'exp_dict.json'), exp_dict)
    pprint.pprint(exp_dict)
    print('Experiment saved in %s' % savedir)

    # set seed
    # ==================
    seed = 42 + exp_dict['runs']
    np.random.seed(seed)
    torch.manual_seed(seed)
    if cuda:
        device = 'cuda'
        torch.cuda.manual_seed_all(seed)
    else:
        device = 'cpu'
    print('Running on device: %s' % device)

    # Dataset
    # ==================
    train_set = datasets.get_dataset(dataset_name=exp_dict["dataset"],
                                     train_flag=True,
                                     datadir=datadir,
                                     exp_dict=exp_dict)

    train_loader = DataLoader(train_set,
                              drop_last=True,
                              shuffle=True,
                              sampler=None,
                              batch_size=exp_dict["batch_size"])

    # Load Val Dataset
    val_set = datasets.get_dataset(dataset_name=exp_dict["dataset"],
                                   train_flag=False,
                                   datadir=datadir,
                                   exp_dict=exp_dict)

    # Model
    # ==================
    use_backpack = exp_dict['opt'].get("backpack", False)

    model = models.get_model(exp_dict["model"],
                             train_set=train_set,
                             backpack=use_backpack).to(device=device)

    if use_backpack:
        assert exp_dict['opt']['name'] in ['nus_wrapper', 'adaptive_second']
        from backpack import extend
        model = extend(model)

    # Choose loss and metric function
    loss_function = metrics.get_metric_function(exp_dict["loss_func"])

    # Load Optimizer
    # ==============
    n_batches_per_epoch = len(train_set) / float(exp_dict["batch_size"])
    opt = optimizers.get_optimizer(opt=exp_dict["opt"],
                                   params=model.parameters(),
                                   n_batches_per_epoch=n_batches_per_epoch,
                                   n_train=len(train_set),
                                   train_loader=train_loader,
                                   model=model,
                                   loss_function=loss_function,
                                   exp_dict=exp_dict,
                                   batch_size=exp_dict["batch_size"])

    # Checkpointing
    # =============
    score_list_path = os.path.join(savedir, "score_list.pkl")
    model_path = os.path.join(savedir, "model_state_dict.pth")
    opt_path = os.path.join(savedir, "opt_state_dict.pth")

    if os.path.exists(score_list_path):
        # resume experiment
        score_list = ut.load_pkl(score_list_path)
        if use_backpack:
            model.load_state_dict(torch.load(model_path), strict=False)
        else:
            model.load_state_dict(torch.load(model_path))
        opt.load_state_dict(torch.load(opt_path))
        s_epoch = score_list[-1]["epoch"] + 1
    else:
        # restart experiment
        score_list = []
        s_epoch = 0

    # Start Training
    # ==============
    n_train = len(train_loader.dataset)
    n_batches = len(train_loader)
    batch_size = train_loader.batch_size

    for epoch in range(s_epoch, exp_dict["max_epoch"]):
        # Set seed
        seed = epoch + exp_dict['runs']
        np.random.seed(seed)
        torch.manual_seed(seed)
        if cuda:
            torch.cuda.manual_seed_all(seed)

        score_dict = {"epoch": epoch}

        # Validate
        # --------
        if metrics_flag:
            # 1. Compute train loss over train set
            score_dict["train_loss"] = metrics.compute_metric_on_dataset(
                model, train_set,
                metric_name=exp_dict["loss_func"],
                batch_size=exp_dict['batch_size'])

            # 2. Compute val acc over val set
            score_dict["val_acc"] = metrics.compute_metric_on_dataset(
                model, val_set,
                metric_name=exp_dict["acc_func"],
                batch_size=exp_dict['batch_size'])

        # Train
        # -----
        model.train()
        print("%d - Training model with %s..." % (epoch, exp_dict["loss_func"]))

        s_time = time.time()
        train_on_loader(model, train_set, train_loader, opt,
                        loss_function, epoch, use_backpack)
        e_time = time.time()

        # Record step size and batch size
        score_dict["step"] = opt.state.get("step", 0) / int(n_batches_per_epoch)
        score_dict["step_size"] = opt.state.get("step_size", {})
        score_dict["step_size_avg"] = opt.state.get("step_size_avg", {})
        score_dict["n_forwards"] = opt.state.get("n_forwards", {})
        score_dict["n_backwards"] = opt.state.get("n_backwards", {})
        score_dict["grad_norm"] = opt.state.get("grad_norm", {})
        score_dict["batch_size"] = batch_size
        score_dict["train_epoch_time"] = e_time - s_time
        score_dict.update(opt.state.get("gv_stats", {}))

        # Add score_dict to score_list
        score_list += [score_dict]

        # Report and save
        print(pd.DataFrame(score_list).tail())
        ut.save_pkl(score_list_path, score_list)
        ut.torch_save(model_path, model.state_dict())
        ut.torch_save(opt_path, opt.state_dict())
        print("Saved: %s" % savedir)

    return score_list
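# A minimal sketch of what the BackPACK integration above enables: once model
# and loss are wrapped with extend(), extra quantities such as per-example
# gradients are populated during the standard backward pass. The toy model
# and data are illustrative, not from this codebase.
def _backpack_batchgrad_sketch():
    from backpack import backpack, extend
    from backpack.extensions import BatchGrad

    toy_model = extend(torch.nn.Linear(10, 2))
    toy_loss = extend(torch.nn.CrossEntropyLoss())

    X, y = torch.randn(8, 10), torch.randint(0, 2, (8,))
    loss = toy_loss(toy_model(X), y)

    with backpack(BatchGrad()):
        loss.backward()

    # each parameter now carries .grad_batch: one gradient per example
    print(toy_model.weight.grad_batch.shape)  # torch.Size([8, 2, 10])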
def trainval(exp_dict, savedir, datadir, metrics_flag=True):
    # TODO: Do we get similar results with different seeds?
    # Set seed
    np.random.seed(42)
    torch.manual_seed(42)
    torch.cuda.manual_seed_all(42)

    pprint.pprint(exp_dict)

    # Load Train Dataset
    train_set = datasets.get_dataset(dataset_name=exp_dict["dataset"],
                                     train_flag=True,
                                     datadir=datadir,
                                     exp_dict=exp_dict)

    train_loader = DataLoader(train_set,
                              drop_last=True,
                              shuffle=True,
                              batch_size=exp_dict["batch_size"])

    # Load Val Dataset
    val_set = datasets.get_dataset(dataset_name=exp_dict["dataset"],
                                   train_flag=False,
                                   datadir=datadir,
                                   exp_dict=exp_dict)

    # Load model
    model = models.get_model(exp_dict["model"], train_set=train_set).cuda()

    # Choose loss and metric function
    loss_function = metrics.get_metric_function(exp_dict["loss_func"])

    # Load Optimizer
    n_batches_per_epoch = len(train_set) / float(exp_dict["batch_size"])
    opt = optimizers.get_optimizer(opt=exp_dict["opt"],
                                   params=model.parameters(),
                                   n_batches_per_epoch=n_batches_per_epoch)

    # Resume from last saved state_dict
    if (not os.path.exists(savedir + "/run_dict.pkl") or
            not os.path.exists(savedir + "/score_list.pkl")):
        ut.save_pkl(savedir + "/run_dict.pkl", {"running": 1})
        score_list = []
        s_epoch = 0
    else:
        score_list = ut.load_pkl(savedir + "/score_list.pkl")
        model.load_state_dict(torch.load(savedir + "/model_state_dict.pth"))
        opt.load_state_dict(torch.load(savedir + "/opt_state_dict.pth"))
        s_epoch = score_list[-1]["epoch"] + 1

    for epoch in range(s_epoch, exp_dict["max_epoch"]):
        # Set seed
        np.random.seed(epoch)
        torch.manual_seed(epoch)
        torch.cuda.manual_seed_all(epoch)

        score_dict = {"epoch": epoch}

        if metrics_flag:
            # 1. Compute train loss over train set
            score_dict["train_loss"] = metrics.compute_metric_on_dataset(
                model, train_set, metric_name=exp_dict["loss_func"])

            # 2. Compute val acc over val set
            score_dict["val_acc"] = metrics.compute_metric_on_dataset(
                model, val_set, metric_name=exp_dict["acc_func"])

        # 3. Train over train loader
        model.train()
        print("%d - Training model with %s..." % (epoch, exp_dict["loss_func"]))

        s_time = time.time()
        for images, labels in tqdm.tqdm(train_loader):
            images, labels = images.cuda(), labels.cuda()

            opt.zero_grad()

            if exp_dict["opt"]["name"] in exp_configs.ours_opt_list + ["l4"]:
                closure = lambda: loss_function(
                    model, images, labels, backwards=False)
                opt.step(closure)
            else:
                loss = loss_function(model, images, labels)
                loss.backward()
                opt.step()

        e_time = time.time()

        # Record step size and batch size
        score_dict["step_size"] = opt.state["step_size"]
        score_dict["n_forwards"] = opt.state["n_forwards"]
        score_dict["n_backwards"] = opt.state["n_backwards"]
        score_dict["batch_size"] = train_loader.batch_size
        score_dict["train_epoch_time"] = e_time - s_time

        # Add score_dict to score_list
        score_list += [score_dict]

        # Report and save
        print(pd.DataFrame(score_list).tail())
        ut.save_pkl(savedir + "/score_list.pkl", score_list)
        ut.torch_save(savedir + "/model_state_dict.pth", model.state_dict())
        ut.torch_save(savedir + "/opt_state_dict.pth", opt.state_dict())
        print("Saved: %s" % savedir)

    return score_list
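# A minimal sketch of why the closure above uses backwards=False: a
# line-search optimizer computes the gradient once, then calls the closure
# repeatedly to re-evaluate the minibatch loss at trial points (e.g. Armijo
# backtracking). This illustrates the pattern, not the repository's actual
# optimizers.
def _armijo_step_sketch(params, closure, init_step=1.0, c=0.1, beta=0.5):
    params = list(params)
    loss = closure()
    loss.backward()  # gradient at the current point
    grads = [p.grad.clone() for p in params]
    grad_norm_sq = sum((g ** 2).sum() for g in grads)

    step = init_step
    with torch.no_grad():
        for _ in range(100):  # backtrack until sufficient decrease
            for p, g in zip(params, grads):
                p -= step * g  # try a step of the current size
            if closure() <= loss - c * step * grad_norm_sq:
                return step    # Armijo condition satisfied; keep the step
            for p, g in zip(params, grads):
                p += step * g  # undo the trial step and shrink
            step *= beta
    return step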