def test_checkpoint(self):
    """Round-trip an experiment folder: create it, save/load artifacts, delete it."""
    results_root = '.results'

    # create exp folder
    config = {
        'model': {'name': 'mlp', 'n_layers': 30},
        'dataset': 'mnist',
        'batch_size': 1,
    }
    exp_folder = os.path.join(results_root, hu.hash_dict(config))
    hu.save_json(os.path.join(exp_folder, "exp_dict.json"), config)

    weights_file = os.path.join(exp_folder, "model.pth")
    hu.torch_save(weights_file, torch.zeros(10))
    hu.torch_load(weights_file)
    assert os.path.exists(exp_folder)

    # delete exp folder
    hc.delete_experiment(exp_folder)
    assert not os.path.exists(exp_folder)

    # check backup folder
    os.rmdir(results_root)
def test_checkpoint():
    """Exercise the save / load-checkpoint / delete round-trip for an experiment."""
    results_root = ".results"

    # create exp folder
    config = {
        "model": {"name": "mlp", "n_layers": 30},
        "dataset": "mnist",
        "batch_size": 1,
    }
    exp_folder = os.path.join(results_root, hu.hash_dict(config))
    hu.save_json(os.path.join(exp_folder, "exp_dict.json"), config)

    weights_file = os.path.join(exp_folder, "model.pth")
    hu.torch_save(weights_file, torch.zeros(10))
    hu.torch_load(weights_file)
    hc.load_checkpoint(config, results_root, fname="model.pth")
    assert os.path.exists(exp_folder)

    # delete exp folder
    hc.delete_experiment(exp_folder)
    assert not os.path.exists(exp_folder)

    # check backup folder
    os.rmdir(results_root)
def trainval(exp_dict: dict, savedir_base: str, reset: bool = False) -> None:
    """Train and validate a model under the configuration in ``exp_dict``.

    Checkpoints (model weights, optimizer state, score list) live in
    ``savedir_base/<hash(exp_dict)>`` and the run resumes automatically from
    the last saved epoch if a score list is found there.

    Args:
        exp_dict: experiment config; must carry 'model', 'dataset',
            'batch_size', 'runs', 'opt', 'loss_func', 'acc_func', 'max_epoch'.
        savedir_base: root folder for experiment checkpoints (also passed to
            the dataset loader as ``datadir``).
        reset: if True, delete the previous run of this experiment (a backup
            is kept) before starting fresh.
    """
    # bookkeeping
    # ---------------

    # get experiment directory (hash of the config names the folder)
    exp_id = hu.hash_dict(exp_dict)
    savedir = os.path.join(savedir_base, exp_id)

    if reset:
        # delete and backup experiment
        hc.delete_experiment(savedir, backup_flag=True)

    # create folder and save the experiment dictionary
    os.makedirs(savedir, exist_ok=True)
    hu.save_json(os.path.join(savedir, 'exp_dict.json'), exp_dict)
    pprint.pprint(exp_dict)
    print('Experiment saved in %s' % savedir)

    # set seed
    # ---------------
    # Offset by the run index so repeated runs of the same config differ.
    seed = 42 + exp_dict['runs']
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

    # Dataset
    # -----------
    # train loader
    train_set = datasets.get_dataset(dataset_name=exp_dict["dataset"],
                                     train_flag=True,
                                     datadir=savedir_base,
                                     exp_dict=exp_dict)

    train_loader = torch.utils.data.DataLoader(
        train_set,
        drop_last=True,
        shuffle=True,
        batch_size=exp_dict["batch_size"])

    # val set (iterated directly by compute_metric_on_dataset, no loader)
    val_set = datasets.get_dataset(dataset_name=exp_dict["dataset"],
                                   train_flag=False,
                                   datadir=savedir_base,
                                   exp_dict=exp_dict)

    # Model
    # -----------
    model = models.get_model(exp_dict["model"], train_set=train_set).cuda()

    # Choose loss and metric function
    loss_function = metrics.get_metric_function(exp_dict["loss_func"])

    # Compute fstar
    # -------------
    if exp_dict['opt'].get('fstar_flag'):
        ut.compute_fstar(train_set, loss_function, savedir_base, exp_dict)

    # Load Optimizer
    n_batches_per_epoch = len(train_set) / float(exp_dict["batch_size"])
    opt = optimizers.get_optimizer(opt_dict=exp_dict["opt"],
                                   params=model.parameters(),
                                   n_batches_per_epoch=n_batches_per_epoch)

    # Checkpoint
    # -----------
    model_path = os.path.join(savedir, 'model.pth')
    score_list_path = os.path.join(savedir, 'score_list.pkl')
    opt_path = os.path.join(savedir, 'opt_state_dict.pth')

    if os.path.exists(score_list_path):
        # resume experiment: restore scores, weights, optimizer state
        score_list = hu.load_pkl(score_list_path)
        model.load_state_dict(torch.load(model_path))
        opt.load_state_dict(torch.load(opt_path))
        s_epoch = score_list[-1]['epoch'] + 1
    else:
        # restart experiment
        score_list = []
        s_epoch = 0

    # Train & Val
    # ------------
    print('Starting experiment at epoch %d/%d' %
          (s_epoch, exp_dict['max_epoch']))

    for e in range(s_epoch, exp_dict['max_epoch']):
        # Set seed per epoch so a resumed run replays the same batch order
        seed = e + exp_dict['runs']
        np.random.seed(seed)
        torch.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)

        score_dict = {}

        # Compute train loss over train set (full pass, model in eval inside)
        score_dict["train_loss"] = metrics.compute_metric_on_dataset(
            model, train_set, metric_name=exp_dict["loss_func"])

        # Compute val acc over val set
        score_dict["val_acc"] = metrics.compute_metric_on_dataset(
            model, val_set, metric_name=exp_dict["acc_func"])

        # Train over train loader
        model.train()
        print("%d - Training model with %s..." % (e, exp_dict["loss_func"]))

        # train and validate
        s_time = time.time()
        for batch in tqdm.tqdm(train_loader):
            images, labels = batch["images"].cuda(), batch["labels"].cuda()

            opt.zero_grad()

            # closure: re-evaluates loss (and backprops) so line-search style
            # optimizers can call it multiple times per step. It is consumed
            # immediately by opt.step, so the per-iteration rebinding of
            # images/labels is safe despite late-binding closure capture.
            def closure():
                return loss_function(model, images, labels, backwards=True)

            opt.step(closure)

        e_time = time.time()

        # Record metrics
        # NOTE(review): these read opt.state keys directly — assumes the
        # optimizer is an SLS-style one that maintains step_size/step_size_avg/
        # n_forwards/n_backwards/grad_norm; confirm for every opt in configs.
        score_dict["epoch"] = e
        score_dict["step_size"] = opt.state["step_size"]
        score_dict["step_size_avg"] = opt.state["step_size_avg"]
        score_dict["n_forwards"] = opt.state["n_forwards"]
        score_dict["n_backwards"] = opt.state["n_backwards"]
        score_dict["grad_norm"] = opt.state["grad_norm"]
        score_dict["batch_size"] = train_loader.batch_size
        score_dict["train_epoch_time"] = e_time - s_time

        score_list += [score_dict]

        # Report and save
        print(pd.DataFrame(score_list).tail())
        hu.save_pkl(score_list_path, score_list)
        hu.torch_save(model_path, model.state_dict())
        hu.torch_save(opt_path, opt.state_dict())
        print("Saved: %s" % savedir)

    print('Experiment completed')
def trainval(exp_dict, savedir_base, reset=False):
    """Run a minimal 10-epoch train/val loop, checkpointing under ``savedir_base``."""
    # ----- bookkeeping -----
    run_hash = hu.hash_dict(exp_dict)
    savedir = os.path.join(savedir_base, run_hash)
    if reset:
        # wipe any previous run of this experiment (a backup copy is kept)
        hc.delete_experiment(savedir, backup_flag=True)

    os.makedirs(savedir, exist_ok=True)
    hu.save_json(os.path.join(savedir, 'exp_dict.json'), exp_dict)
    pprint.pprint(exp_dict)
    print('Experiment saved in %s' % savedir)

    # ----- data -----
    train_loader = datasets.get_loader(dataset_name=exp_dict['dataset'],
                                       datadir=savedir_base,
                                       split='train')
    val_loader = datasets.get_loader(dataset_name=exp_dict['dataset'],
                                     datadir=savedir_base,
                                     split='val')

    # ----- model -----
    model = models.get_model(model_name=exp_dict['model'])

    # ----- checkpoint -----
    model_path = os.path.join(savedir, 'model.pth')
    score_list_path = os.path.join(savedir, 'score_list.pkl')

    if os.path.exists(score_list_path):
        # resume: restore weights and continue after the last recorded epoch
        model.set_state_dict(hu.torch_load(model_path))
        score_list = hu.load_pkl(score_list_path)
        start_epoch = score_list[-1]['epoch'] + 1
    else:
        # fresh run
        score_list = []
        start_epoch = 0

    # ----- train & validate -----
    print('Starting experiment at epoch %d' % (start_epoch))

    for epoch in range(start_epoch, 10):
        # dict-literal evaluation order guarantees train runs before val
        epoch_scores = {
            'train_loss': model.train_on_loader(train_loader)['train_loss'],
            'val_acc': model.val_on_loader(val_loader)['val_acc'],
            'epoch': epoch,
        }
        score_list.append(epoch_scores)

        # report & persist checkpoint
        print(pd.DataFrame(score_list).tail())
        hu.torch_save(model_path, model.get_state_dict())
        hu.save_pkl(score_list_path, score_list)
        print('Checkpoint Saved: %s' % savedir)

    print('experiment completed')
def trainval(exp_dict, savedir_base, datadir_base, reset=False,
             num_workers=0, pin_memory=False, ngpu=1,
             cuda_deterministic=False):
    """Train/validate/test with per-epoch resampled train/val loaders.

    Checkpoints live in ``savedir_base/<hash(exp_dict)>``; the run resumes
    from the length of the saved score list. The best test-accuracy model is
    additionally saved as ``model_best.pth`` / ``score_list_best.pkl``.

    Args:
        exp_dict: experiment config ('dataset', 'dataset_size', 'batch',
            'model', 'niter', 'fixedSeed', 'mixTrainVal', ...).
        savedir_base: root folder for experiment checkpoints.
        datadir_base: root folder of the datasets.
        reset: if True, delete (with backup) any previous run first.
        num_workers / pin_memory: forwarded to the DataLoaders.
        ngpu: kept for interface compatibility (not used in this body).
        cuda_deterministic: if True, force deterministic cuDNN kernels.
    """
    # bookkeeping
    # ==================

    # get experiment directory
    exp_id = hu.hash_dict(exp_dict)
    savedir = os.path.join(savedir_base, exp_id)

    if reset:
        # delete and backup experiment
        hc.delete_experiment(savedir, backup_flag=True)

    # create folder and save the experiment dictionary
    # FIX: the folder was never created here although the comment promised it;
    # every sibling trainval variant calls os.makedirs before saving.
    os.makedirs(savedir, exist_ok=True)
    hu.save_json(os.path.join(savedir, 'exp_dict.json'), exp_dict)
    pprint.pprint(exp_dict)
    print('Experiment saved in %s' % savedir)

    if DEVICE.type == "cuda":
        if cuda_deterministic:
            cudnn.benchmark = False
            cudnn.deterministic = True
        else:
            cudnn.benchmark = True

    # Dataset
    # ==================
    trainset = get_dataset(exp_dict['dataset'], 'train',
                           exp_dict=exp_dict,
                           datadir_base=datadir_base,
                           n_samples=exp_dict['dataset_size']['train'],
                           transform_lvl=exp_dict['dataset']['transform_lvl'],
                           colorjitter=exp_dict['dataset'].get('colorjitter'))

    # NOTE(review): valset draws n_samples from dataset_size['train'] — looks
    # intentional (val is carved out of the train pool below), but confirm.
    valset = get_dataset(exp_dict['dataset'], 'validation',
                         exp_dict=exp_dict,
                         datadir_base=datadir_base,
                         n_samples=exp_dict['dataset_size']['train'],
                         transform_lvl=0,
                         val_transform=exp_dict['dataset']['val_transform'])

    testset = get_dataset(exp_dict['dataset'], 'test',
                          exp_dict=exp_dict,
                          datadir_base=datadir_base,
                          n_samples=exp_dict['dataset_size']['test'],
                          transform_lvl=0,
                          val_transform=exp_dict['dataset']['val_transform'])
    print("Dataset defined.")

    # define dataloaders ('bach' images are large, so test one at a time)
    if exp_dict['dataset']['name'] == 'bach':
        testloader = torch.utils.data.DataLoader(testset, batch_size=1,
                                                 shuffle=False,
                                                 num_workers=num_workers,
                                                 pin_memory=pin_memory)
    else:
        testloader = torch.utils.data.DataLoader(
            testset,
            batch_size=exp_dict['batch']['size'],
            shuffle=False,
            num_workers=num_workers,
            pin_memory=pin_memory)
    print("Testloader defined.")

    # Model
    # ==================
    model = get_model(exp_dict, trainset, device=DEVICE)
    print("Model loaded")

    model_path = os.path.join(savedir, 'model.pth')
    model_best_path = os.path.join(savedir, 'model_best.pth')
    score_list_path = os.path.join(savedir, 'score_list.pkl')

    # checkpoint management
    if os.path.exists(score_list_path):
        # resume experiment: one score entry per completed epoch
        model.load_state_dict(hu.torch_load(model_path))
        score_list = hu.load_pkl(score_list_path)
        s_epoch = len(score_list)
    else:
        # restart experiment
        score_list = []
        s_epoch = 0

    # define and log random seed for reproducibility
    # NOTE(review): seeding happens after model/dataset construction — any
    # randomness above (e.g. weight init) is not covered; confirm intended.
    assert('fixedSeed' in exp_dict)
    seed = exp_dict['fixedSeed']

    random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    np.random.seed(seed)
    print("Seed defined.")

    # Train & Val
    # ==================
    print("Starting experiment at epoch %d/%d" % (s_epoch, exp_dict['niter']))

    for epoch in range(s_epoch, exp_dict['niter']):
        s_time = time.time()

        # Sample new train val split every epoch
        trainloader, valloader = get_train_val_dataloader(
            exp_dict,
            trainset, valset,
            mixtrainval=exp_dict['mixTrainVal'],
            num_workers=num_workers,
            pin_memory=pin_memory)

        # Train & validate
        train_dict = model.train_on_loader(trainloader, valloader,
                                           epoch=epoch,
                                           exp_dict=exp_dict)

        # Test phase
        train_dict_2 = model.test_on_loader(trainloader)
        val_dict = model.test_on_loader(valloader)
        test_dict = model.test_on_loader(testloader)

        # Vis phase
        model.vis_on_loader('train', trainset,
                            savedir_images=os.path.join(savedir, 'images'),
                            epoch=epoch)

        score_dict = {}
        score_dict["epoch"] = epoch
        score_dict["test_acc"] = test_dict['acc']
        score_dict["val_acc"] = val_dict['acc']
        score_dict["train_acc"] = train_dict_2['acc']
        score_dict["train_loss"] = train_dict['loss']
        score_dict["time_taken"] = time.time() - s_time
        score_dict["netC_lr"] = train_dict['netC_lr']

        # log learned augmentation statistics when an augmenter net is present
        if exp_dict['model']['netA'] is not None:
            if 'transformations_mean' in train_dict:
                for i in range(len(train_dict['transformations_mean'])):
                    score_dict[str(i) + "_mean"] = \
                        train_dict['transformations_mean'][i].item()
            if 'transformations_std' in train_dict:
                for i in range(len(train_dict['transformations_std'])):
                    score_dict[str(i) + "_std"] = \
                        train_dict['transformations_std'][i].item()

        # Add to score_list and save checkpoint
        score_list += [score_dict]

        # Report & Save
        score_df = pd.DataFrame(score_list)
        print("\n", score_df.tail(), "\n")
        hu.torch_save(model_path, model.get_state_dict())
        hu.save_pkl(score_list_path, score_list)
        print("Checkpoint Saved: %s" % savedir)

        # Update best score (epoch 0 has no history, so it is always "best")
        if epoch == 0 or (score_dict["test_acc"] >=
                          score_df["test_acc"][:-1].max()):
            hu.save_pkl(os.path.join(savedir, "score_list_best.pkl"),
                        score_list)
            # FIX: use the model_best_path computed above instead of
            # rebuilding the identical path inline (it was otherwise unused).
            hu.torch_save(model_best_path, model.get_state_dict())
            print("Saved Best: %s" % savedir)

    print('experiment completed')
def trainval(exp_dict, savedir_base, data_root, reset=False, tensorboard=True):
    """Train and validate a model, optionally logging scalars to tensorboardX.

    Checkpoints live in ``savedir_base/<hash(exp_dict)>`` and the run resumes
    from the last recorded epoch if a score list exists there.

    Args:
        exp_dict: experiment config ('seed', 'episodic', 'batch_size',
            'max_epoch', ...).
        savedir_base: root folder for experiment checkpoints.
        data_root: root folder of the datasets.
        reset: if True, delete (with backup) any previous run first.
        tensorboard: truthy to write scalars to a SummaryWriter in savedir.
    """
    # bookkeeping
    # ---------------

    # get experiment directory
    exp_id = hu.hash_dict(exp_dict)
    savedir = os.path.join(savedir_base, exp_id)

    np.random.seed(exp_dict["seed"])
    torch.manual_seed(exp_dict["seed"])

    if reset:
        # delete and backup experiment
        hc.delete_experiment(savedir, backup_flag=True)

    # FIX: the original created the writer only when `tensorboard == 1` but
    # logged inside the loop under plain truthiness (`if tensorboard:`), so a
    # truthy value other than 1/True left writer=None and crashed on
    # writer.add_scalar. Normalize the flag once and use it everywhere.
    use_tensorboard = bool(tensorboard)
    writer = tensorboardX.SummaryWriter(savedir) if use_tensorboard else None

    # create folder and save the experiment dictionary
    os.makedirs(savedir, exist_ok=True)
    hu.save_json(os.path.join(savedir, "exp_dict.json"), exp_dict)
    pprint.pprint(exp_dict)
    print("Experiment saved in %s" % savedir)

    # Dataset
    # -----------
    train_dataset, val_dataset = get_dataset(['train', 'val'],
                                             data_root, exp_dict)

    # train and val loader
    # NOTE(review): `args` is a module-level namespace — confirm it is always
    # defined before trainval is called.
    if exp_dict["episodic"] == False:
        train_loader = DataLoader(train_dataset,
                                  batch_size=exp_dict['batch_size'],
                                  shuffle=True,
                                  num_workers=args.num_workers)
        val_loader = DataLoader(val_dataset,
                                batch_size=exp_dict['batch_size'],
                                shuffle=True,
                                num_workers=args.num_workers)
    else:  # to support episodes TODO: move inside each model
        from datasets.episodic_dataset import EpisodicDataLoader
        # identity collate keeps each episode intact instead of stacking
        train_loader = EpisodicDataLoader(train_dataset,
                                          batch_size=exp_dict['batch_size'],
                                          shuffle=True,
                                          collate_fn=lambda x: x,
                                          num_workers=args.num_workers)
        val_loader = EpisodicDataLoader(val_dataset,
                                        batch_size=exp_dict['batch_size'],
                                        shuffle=True,
                                        collate_fn=lambda x: x,
                                        num_workers=args.num_workers)

    # Model
    # -----------
    model = get_model(exp_dict, labelset=train_dataset.raw_labelset,
                      writer=writer)
    print("Model with:",
          sum(p.numel() for p in model.parameters() if p.requires_grad),
          "parameters")

    # Checkpoint
    # -----------
    model_path = os.path.join(savedir, "model.pth")
    score_list_path = os.path.join(savedir, "score_list.pkl")

    if os.path.exists(score_list_path):
        # resume experiment
        model.set_state_dict(hu.torch_load(model_path))
        score_list = hu.load_pkl(score_list_path)
        s_epoch = score_list[-1]['epoch'] + 1
    else:
        # restart experiment
        score_list = []
        s_epoch = 0

    # Train & Val
    # ------------
    print("Starting experiment at epoch %d" % (s_epoch))

    for e in range(s_epoch, exp_dict['max_epoch']):
        score_dict = {}

        # Train the model
        score_dict.update(model.train_on_loader(e, train_loader))

        # Validate the model
        score_dict.update(model.val_on_loader(e, val_loader))
        score_dict["epoch"] = e

        if use_tensorboard:
            for key, value in score_dict.items():
                writer.add_scalar(key, value, e)
            writer.flush()

        # Add to score_list and save checkpoint
        score_list += [score_dict]

        # Report & Save
        score_df = pd.DataFrame(score_list)
        print("\n", score_df.tail())
        hu.torch_save(model_path, model.get_state_dict())
        hu.save_pkl(score_list_path, score_list)
        print("Checkpoint Saved: %s" % savedir)

    print('experiment completed')

    # Cleanup
    if use_tensorboard:
        writer.close()
def trainval(exp_dict, savedir_base, reset=False, num_workers=0,
             run_ssl=False):
    """Train/validate/test a few-shot model; optionally run the SSL eval only.

    Checkpoints live in ``savedir_base/<hash(exp_dict)>``. When ``run_ssl`` is
    True the function only runs the test loop once (if no score list exists)
    and returns.

    Args:
        exp_dict: experiment config (dataset/transform/episode settings per
            split, 'model', 'batch_size', 'max_epoch', 'target_loss', ...).
        savedir_base: root folder for experiment checkpoints.
        reset: if True, delete (with backup) any previous run first.
        num_workers: DataLoader worker count.
        run_ssl: if True, only evaluate on the test set and return.
    """
    # bookkeeping
    # ---------------

    # get experiment directory
    exp_id = hu.hash_dict(exp_dict)
    savedir = os.path.join(savedir_base, exp_id)

    if reset:
        # delete and backup experiment
        hc.delete_experiment(savedir, backup_flag=True)

    # create folder and save the experiment dictionary
    os.makedirs(savedir, exist_ok=True)
    hu.save_json(os.path.join(savedir, 'exp_dict.json'), exp_dict)
    pprint.pprint(exp_dict)
    print('Experiment saved in %s' % savedir)

    # load datasets
    # ==========================
    train_set = datasets.get_dataset(
        dataset_name=exp_dict["dataset_train"],
        data_root=exp_dict["dataset_train_root"],
        split="train",
        transform=exp_dict["transform_train"],
        classes=exp_dict["classes_train"],
        support_size=exp_dict["support_size_train"],
        query_size=exp_dict["query_size_train"],
        n_iters=exp_dict["train_iters"],
        unlabeled_size=exp_dict["unlabeled_size_train"])

    val_set = datasets.get_dataset(
        dataset_name=exp_dict["dataset_val"],
        data_root=exp_dict["dataset_val_root"],
        split="val",
        transform=exp_dict["transform_val"],
        classes=exp_dict["classes_val"],
        support_size=exp_dict["support_size_val"],
        query_size=exp_dict["query_size_val"],
        n_iters=exp_dict["val_iters"],
        unlabeled_size=exp_dict["unlabeled_size_val"])

    test_set = datasets.get_dataset(
        dataset_name=exp_dict["dataset_test"],
        data_root=exp_dict["dataset_test_root"],
        split="test",
        transform=exp_dict["transform_val"],
        classes=exp_dict["classes_test"],
        support_size=exp_dict["support_size_test"],
        query_size=exp_dict["query_size_test"],
        n_iters=exp_dict["test_iters"],
        unlabeled_size=exp_dict["unlabeled_size_test"])

    # get dataloaders
    # ==========================
    train_loader = torch.utils.data.DataLoader(
        train_set,
        batch_size=exp_dict["batch_size"],
        shuffle=True,
        num_workers=num_workers,
        collate_fn=ut.get_collate(exp_dict["collate_fn"]),
        drop_last=True)
    # val/test iterate one episode at a time with an identity collate
    val_loader = torch.utils.data.DataLoader(
        val_set,
        batch_size=1,
        shuffle=False,
        num_workers=num_workers,
        collate_fn=lambda x: x,
        drop_last=True)
    test_loader = torch.utils.data.DataLoader(
        test_set,
        batch_size=1,
        shuffle=False,
        num_workers=num_workers,
        collate_fn=lambda x: x,
        drop_last=True)

    # create model and trainer
    # ==========================
    # Create model, opt, wrapper
    backbone = backbones.get_backbone(
        backbone_name=exp_dict['model']["backbone"], exp_dict=exp_dict)
    model = models.get_model(model_name=exp_dict["model"]['name'],
                             backbone=backbone,
                             n_classes=exp_dict["n_classes"],
                             exp_dict=exp_dict)

    if run_ssl:
        # runs the SSL experiments (test pass only, computed at most once)
        score_list_path = os.path.join(savedir, 'score_list.pkl')
        if not os.path.exists(score_list_path):
            test_dict = model.test_on_loader(test_loader, max_iter=None)
            hu.save_pkl(score_list_path, [test_dict])
        return

    # Checkpoint
    # -----------
    checkpoint_path = os.path.join(savedir, 'checkpoint.pth')
    score_list_path = os.path.join(savedir, 'score_list.pkl')

    if os.path.exists(score_list_path):
        # resume experiment
        model.load_state_dict(hu.torch_load(checkpoint_path))
        score_list = hu.load_pkl(score_list_path)
        s_epoch = score_list[-1]['epoch'] + 1
    else:
        # restart experiment
        score_list = []
        s_epoch = 0

    # Run training and validation
    for epoch in range(s_epoch, exp_dict["max_epoch"]):
        score_dict = {"epoch": epoch}
        score_dict.update(model.get_lr())

        # train
        score_dict.update(model.train_on_loader(train_loader))

        # validate
        score_dict.update(model.val_on_loader(val_loader))
        score_dict.update(model.test_on_loader(test_loader))

        # Add score_dict to score_list
        score_list += [score_dict]

        # Report
        score_df = pd.DataFrame(score_list)
        print(score_df.tail())

        # Save checkpoint
        hu.save_pkl(score_list_path, score_list)
        hu.torch_save(checkpoint_path, model.get_state_dict())
        print("Saved: %s" % savedir)

        # FIX: on the first epoch `score_df[...][:-1]` is empty, so its
        # max()/min() is NaN and every comparison was False — no "best"
        # checkpoint was ever written for epoch 0 (or at all for 1-epoch
        # runs). Mirror the `epoch == 0 or ...` guard used by the sibling
        # trainval in this codebase.
        if "accuracy" in exp_dict["target_loss"]:
            is_best = epoch == 0 or (
                score_dict[exp_dict["target_loss"]] >=
                score_df[exp_dict["target_loss"]][:-1].max())
        else:
            is_best = epoch == 0 or (
                score_dict[exp_dict["target_loss"]] <=
                score_df[exp_dict["target_loss"]][:-1].min())

        # Save best checkpoint
        if is_best:
            hu.save_pkl(os.path.join(savedir, "score_list_best.pkl"),
                        score_list)
            hu.torch_save(os.path.join(savedir, "checkpoint_best.pth"),
                          model.get_state_dict())
            print("Saved Best: %s" % savedir)

        # Check for end of training conditions
        if model.is_end_of_training():
            break
def trainval(exp_dict, savedir_base, reset=False):
    """Train and validate a model, checkpointing into ``savedir_base/<hash>``.

    Args:
        exp_dict: experiment config ('dataset', 'batch_size', 'max_epoch',
            optional 'seed', ...).
        savedir_base: root folder for experiment checkpoints.
        reset: if True, delete (with backup) any previous run first.
    """
    # bookkeeping
    # ---------------

    # get experiment directory
    exp_id = hu.hash_dict(exp_dict)
    savedir = os.path.join(savedir_base, exp_id)

    if reset:
        # delete and backup experiment
        hc.delete_experiment(savedir, backup_flag=True)

    # create folder and save the experiment dictionary
    os.makedirs(savedir, exist_ok=True)
    hu.save_json(os.path.join(savedir, "exp_dict.json"), exp_dict)
    print(exp_dict)
    print("Experiment saved in %s" % savedir)

    # Set Seed
    # -------
    # NOTE(review): if 'seed' is absent this is None — np.random.seed(None)
    # reseeds from the OS but torch.manual_seed(None) raises; confirm every
    # config carries a seed.
    seed = exp_dict.get('seed')
    np.random.seed(seed)
    torch.manual_seed(seed)

    # Dataset
    # -----------
    train_dataset = get_dataset('train', exp_dict['dataset'])
    val_dataset = get_dataset('test', exp_dict['dataset'])

    # FIX: the original passed
    #   collate_fn=lambda x: x if exp_dict['batch_size'] == 1 else default_collate
    # where the conditional sits INSIDE the lambda body, so for any
    # batch_size != 1 every batch was replaced by the `default_collate`
    # function object instead of being collated by it. Choose the collate
    # function once, outside the lambda (identity keeps episodes intact).
    collate = (lambda x: x) if exp_dict['batch_size'] == 1 else default_collate

    # train and val loader
    train_loader = DataLoader(train_dataset,
                              batch_size=exp_dict['batch_size'],
                              shuffle=True,
                              collate_fn=collate,  # to handle episodes
                              num_workers=args.num_workers)
    val_loader = DataLoader(val_dataset,
                            batch_size=exp_dict['batch_size'],
                            collate_fn=collate,
                            shuffle=True,
                            num_workers=args.num_workers)

    # Model
    # -----------
    model = get_model(exp_dict)

    # Checkpoint
    # -----------
    model_path = os.path.join(savedir, "model.pth")
    score_list_path = os.path.join(savedir, "score_list.pkl")

    if os.path.exists(score_list_path):
        # resume experiment
        model.set_state_dict(hu.torch_load(model_path))
        score_list = hu.load_pkl(score_list_path)
        s_epoch = score_list[-1]['epoch'] + 1
    else:
        # restart experiment
        score_list = []
        s_epoch = 0

    # Train & Val
    # ------------
    print("Starting experiment at epoch %d" % (s_epoch))

    for e in range(s_epoch, exp_dict['max_epoch']):
        score_dict = {}

        # Train the model
        score_dict.update(model.train_on_loader(train_loader))

        # Validate the model
        savepath = os.path.join(savedir_base, exp_dict['dataset']['name'])
        score_dict.update(model.val_on_loader(val_loader, savedir=savepath))
        model.on_train_end(savedir=savedir, epoch=e)
        score_dict["epoch"] = e

        # Add to score_list and save checkpoint
        score_list += [score_dict]

        # Report & Save
        score_df = pd.DataFrame(score_list)
        print("\n", score_df.tail())
        hu.torch_save(model_path, model.get_state_dict())
        hu.save_pkl(score_list_path, score_list)
        print("Checkpoint Saved: %s" % savedir)

    print('experiment completed')
def trainval(exp_dict, savedir_base, reset, metrics_flag=True, datadir=None,
             cuda=False):
    """Train/validate with an optionally BackPACK-extended model.

    Checkpoints live in ``savedir_base/<hash(exp_dict)>`` and the run resumes
    from the last recorded epoch when a score list exists.

    Args:
        exp_dict: experiment config ('model', 'dataset', 'batch_size', 'runs',
            'opt', 'loss_func', 'acc_func', 'max_epoch', ...).
        savedir_base: root folder for experiment checkpoints.
        reset: if True, delete (with backup) any previous run first.
        metrics_flag: if True, compute train loss / val acc each epoch.
        datadir: dataset root handed to the dataset factory.
        cuda: if True, run on GPU and seed all CUDA devices.

    Returns:
        The accumulated list of per-epoch score dicts.
    """
    # bookkeeping
    # ---------------

    # get experiment directory
    exp_id = hu.hash_dict(exp_dict)
    savedir = os.path.join(savedir_base, exp_id)

    if reset:
        # delete and backup experiment
        hc.delete_experiment(savedir, backup_flag=True)

    # create folder and save the experiment dictionary
    os.makedirs(savedir, exist_ok=True)
    hu.save_json(os.path.join(savedir, 'exp_dict.json'), exp_dict)
    # FIX: was `print(pprint.pprint(exp_dict))` — pprint.pprint already
    # prints and returns None, so the outer print emitted a stray "None".
    pprint.pprint(exp_dict)
    print('Experiment saved in %s' % savedir)

    # set seed
    # ==================
    # Offset by the run index so repeated runs of the same config differ.
    seed = 42 + exp_dict['runs']
    np.random.seed(seed)
    torch.manual_seed(seed)
    if cuda:
        device = 'cuda'
        torch.cuda.manual_seed_all(seed)
    else:
        device = 'cpu'
    print('Running on device: %s' % device)

    # Dataset
    # ==================
    train_set = datasets.get_dataset(dataset_name=exp_dict["dataset"],
                                     train_flag=True,
                                     datadir=datadir,
                                     exp_dict=exp_dict)

    train_loader = DataLoader(train_set,
                              drop_last=True,
                              shuffle=True,
                              sampler=None,
                              batch_size=exp_dict["batch_size"])

    # Load Val Dataset (iterated directly by compute_metric_on_dataset)
    val_set = datasets.get_dataset(dataset_name=exp_dict["dataset"],
                                   train_flag=False,
                                   datadir=datadir,
                                   exp_dict=exp_dict)

    # Model
    # ==================
    use_backpack = exp_dict['opt'].get("backpack", False)
    model = models.get_model(exp_dict["model"],
                             train_set=train_set,
                             backpack=use_backpack).to(device=device)

    if use_backpack:
        # only these optimizers consume BackPACK's extra quantities
        assert exp_dict['opt']['name'] in ['nus_wrapper', 'adaptive_second']
        from backpack import extend
        model = extend(model)

    # Choose loss and metric function
    loss_function = metrics.get_metric_function(exp_dict["loss_func"])

    # Load Optimizer
    # ==============
    n_batches_per_epoch = len(train_set) / float(exp_dict["batch_size"])
    opt = optimizers.get_optimizer(opt=exp_dict["opt"],
                                   params=model.parameters(),
                                   n_batches_per_epoch=n_batches_per_epoch,
                                   n_train=len(train_set),
                                   train_loader=train_loader,
                                   model=model,
                                   loss_function=loss_function,
                                   exp_dict=exp_dict,
                                   batch_size=exp_dict["batch_size"])

    # Checkpointing
    # =============
    score_list_path = os.path.join(savedir, "score_list.pkl")
    model_path = os.path.join(savedir, "model_state_dict.pth")
    opt_path = os.path.join(savedir, "opt_state_dict.pth")

    if os.path.exists(score_list_path):
        # resume experiment
        score_list = ut.load_pkl(score_list_path)
        if use_backpack:
            # extended modules add non-persistent entries; load leniently
            model.load_state_dict(torch.load(model_path), strict=False)
        else:
            model.load_state_dict(torch.load(model_path))
        opt.load_state_dict(torch.load(opt_path))
        s_epoch = score_list[-1]["epoch"] + 1
    else:
        # restart experiment
        score_list = []
        s_epoch = 0

    # Start Training
    # ==============
    # (removed unused locals n_train / n_batches; batch_size is still logged)
    batch_size = train_loader.batch_size

    for epoch in range(s_epoch, exp_dict["max_epoch"]):
        # Set seed per epoch so a resumed run replays the same batch order
        seed = epoch + exp_dict['runs']
        np.random.seed(seed)
        torch.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)

        score_dict = {"epoch": epoch}

        # Validate
        # --------
        if metrics_flag:
            # 1. Compute train loss over train set
            score_dict["train_loss"] = metrics.compute_metric_on_dataset(
                model, train_set,
                metric_name=exp_dict["loss_func"],
                batch_size=exp_dict['batch_size'])

            # 2. Compute val acc over val set
            score_dict["val_acc"] = metrics.compute_metric_on_dataset(
                model, val_set,
                metric_name=exp_dict["acc_func"],
                batch_size=exp_dict['batch_size'])

        # Train
        # -----
        model.train()
        print("%d - Training model with %s..." %
              (epoch, exp_dict["loss_func"]))

        s_time = time.time()
        train_on_loader(model, train_set, train_loader, opt, loss_function,
                        epoch, use_backpack)
        e_time = time.time()

        # Record step size and batch size (defensive .get: not every
        # optimizer maintains all of these state entries)
        score_dict["step"] = opt.state.get("step", 0) / \
            int(n_batches_per_epoch)
        score_dict["step_size"] = opt.state.get("step_size", {})
        score_dict["step_size_avg"] = opt.state.get("step_size_avg", {})
        score_dict["n_forwards"] = opt.state.get("n_forwards", {})
        score_dict["n_backwards"] = opt.state.get("n_backwards", {})
        score_dict["grad_norm"] = opt.state.get("grad_norm", {})
        score_dict["batch_size"] = batch_size
        score_dict["train_epoch_time"] = e_time - s_time
        score_dict.update(opt.state["gv_stats"])

        # Add score_dict to score_list
        score_list += [score_dict]

        # Report and save
        print(pd.DataFrame(score_list).tail())
        ut.save_pkl(score_list_path, score_list)
        ut.torch_save(model_path, model.state_dict())
        ut.torch_save(opt_path, opt.state_dict())
        print("Saved: %s" % savedir)

    return score_list
def trainval(exp_dict, savedir_base, data_root, reset=False, wandb='None',
             wandb_key='None'):
    """Train/validate/test with optional OOD evaluation and wandb logging.

    Checkpoints live in ``savedir_base/<hash(exp_dict)>``; training stops
    early after ``3 * exp_dict['patience']`` epochs without a new best
    validation accuracy.

    Args:
        exp_dict: experiment config ('model', 'lr', 'backbone', 'patience',
            'dataset', 'episodic', 'batch_size', 'max_epoch', ...).
        savedir_base: root folder for experiment checkpoints.
        data_root: root folder of the datasets.
        reset: if True, delete (with backup) any previous run first.
        wandb: wandb project name, or the literal string 'None' to disable.
        wandb_key: wandb API key, or 'None' to use the ambient login.
    """
    # bookkeeping
    # ---------------

    # get experiment directory
    exp_id = hu.hash_dict(exp_dict)
    savedir = os.path.join(savedir_base, exp_id)

    if reset:
        # delete and backup experiment
        hc.delete_experiment(savedir, backup_flag=True)

    # create folder and save the experiment dictionary
    os.makedirs(savedir, exist_ok=True)
    hu.save_json(os.path.join(savedir, "exp_dict.json"), exp_dict)
    print(exp_dict)
    print("Experiment saved in %s" % savedir)

    # human-readable run name for the wandb group
    model_name = exp_dict['model'] + \
        "_lr_" + str(exp_dict['lr']) + \
        "_hs_" + str(exp_dict['backbone']['hidden_size']) + \
        "_pa_" + str(exp_dict['patience'])
    if exp_dict['model'] == 'MAML':
        model_name += "_ilr_" + str(exp_dict['inner_lr']) + \
            "_nii_" + str(exp_dict['n_inner_iter'])
    # TODO add seed

    # FIX: was `wandb is not 'None'` / `wandb_key is not 'None'` — identity
    # comparison against a string literal is implementation-dependent and
    # raises a SyntaxWarning; use equality.
    if wandb != 'None':
        # https://docs.wandb.com/quickstart
        import wandb as logger
        if wandb_key != 'None':
            logger.login(key=wandb_key)
        logger.init(project=wandb, group=model_name)
        logger.config.update(exp_dict)

    # Dataset
    # -----------
    train_dataset = get_dataset('train', data_root, exp_dict)
    val_dataset = get_dataset('val', data_root, exp_dict)
    test_dataset = get_dataset('test', data_root, exp_dict)
    if 'ood' in exp_dict['dataset']['task']:
        ood_dataset = get_dataset('ood', data_root, exp_dict)
        ood = True
    else:
        ood = False

    # train and val loader
    if exp_dict["episodic"] == False:
        train_loader = DataLoader(train_dataset,
                                  batch_size=exp_dict['batch_size'],
                                  shuffle=True,
                                  num_workers=args.num_workers)
        val_loader = DataLoader(val_dataset,
                                batch_size=exp_dict['batch_size'],
                                shuffle=True,
                                num_workers=args.num_workers)
        test_loader = DataLoader(test_dataset,
                                 batch_size=exp_dict['batch_size'],
                                 shuffle=True,
                                 num_workers=args.num_workers)
        # FIX: the original only built ood_loader in the episodic branch,
        # so a non-episodic OOD task crashed with NameError below.
        if ood:
            ood_loader = DataLoader(ood_dataset,
                                    batch_size=exp_dict['batch_size'],
                                    shuffle=True,
                                    num_workers=args.num_workers)
    else:  # to support episodes TODO: move inside each model
        from datasets.episodic_dataset import EpisodicDataLoader
        # identity collate keeps each episode intact instead of stacking
        train_loader = EpisodicDataLoader(train_dataset,
                                          batch_size=exp_dict['batch_size'],
                                          shuffle=True,
                                          collate_fn=lambda x: x,
                                          num_workers=args.num_workers)
        val_loader = EpisodicDataLoader(val_dataset,
                                        batch_size=exp_dict['batch_size'],
                                        shuffle=True,
                                        collate_fn=lambda x: x,
                                        num_workers=args.num_workers)
        test_loader = EpisodicDataLoader(test_dataset,
                                         batch_size=exp_dict['batch_size'],
                                         shuffle=True,
                                         collate_fn=lambda x: x,
                                         num_workers=args.num_workers)
        if ood:
            ood_loader = EpisodicDataLoader(ood_dataset,
                                            batch_size=exp_dict['batch_size'],
                                            shuffle=True,
                                            collate_fn=lambda x: x,
                                            num_workers=args.num_workers)

    # Model
    # -----------
    model = get_model(exp_dict)

    # Checkpoint
    # -----------
    model_path = os.path.join(savedir, "model.pth")
    score_list_path = os.path.join(savedir, "score_list.pkl")

    if os.path.exists(score_list_path):
        # resume experiment
        model.set_state_dict(hu.torch_load(model_path))
        score_list = hu.load_pkl(score_list_path)
        s_epoch = score_list[-1]['epoch'] + 1
    else:
        # restart experiment
        score_list = []
        s_epoch = 0

    patience_counter = 0

    # Train & Val
    # ------------
    print("Starting experiment at epoch %d" % (s_epoch))

    for e in range(s_epoch, exp_dict['max_epoch']):
        score_dict = {}

        # Train the model
        score_dict.update(model.train_on_loader(train_loader))

        # Validate and Test the model
        score_dict.update(
            model.val_on_loader(val_loader,
                                mode='val',
                                savedir=os.path.join(
                                    savedir_base,
                                    exp_dict['dataset']['name'])))
        score_dict.update(model.val_on_loader(test_loader, mode='test'))
        if ood:
            score_dict.update(model.val_on_loader(ood_loader, mode='ood'))
        score_dict["epoch"] = e

        # Test error at best validation:
        if score_dict["val_accuracy"] > model.best_val:
            score_dict["test_accuracy_at_best_val"] = \
                score_dict["test_accuracy"]
            # FIX: 'ood_accuracy' only exists when an OOD split ran; the
            # unconditional read raised KeyError for non-OOD tasks.
            if ood:
                score_dict["ood_accuracy_at_best_val"] = \
                    score_dict["ood_accuracy"]
            model.best_val = score_dict["val_accuracy"]
            patience_counter = 0

        # Add to score_list and save checkpoint
        score_list += [score_dict]

        # Report & Save
        score_df = pd.DataFrame(score_list)
        print("\n", score_df.tail())
        hu.torch_save(model_path, model.get_state_dict())
        hu.save_pkl(score_list_path, score_list)
        print("Checkpoint Saved: %s" % savedir)

        if wandb != 'None':
            for key, values in score_dict.items():
                logger.log({key: values})

        patience_counter += 1

        # Patience:
        if patience_counter > exp_dict['patience'] * 3:
            print('training done, out of patience')
            break

    print('experiment completed')
def trainval(exp_dict, savedir_base, data_root, reset=False, test_only=False):
    """Train/validate a model, then evaluate it on train/val/test splits.

    Checkpoints live in ``savedir_base/<hash(exp_dict)>``. With
    ``test_only`` truthy, only the test split is loaded and evaluated.

    Args:
        exp_dict: experiment config ('seed', 'episodic', 'batch_size',
            'dataset', 'max_epoch', ...).
        savedir_base: root folder for experiment checkpoints.
        data_root: root folder of the datasets.
        reset: if True, delete (with backup) any previous run first.
        test_only: if truthy, skip training and run the test pass only.
    """
    # bookkeeping
    # ---------------

    # get experiment directory
    exp_id = hu.hash_dict(exp_dict)
    savedir = os.path.join(savedir_base, exp_id)

    np.random.seed(exp_dict["seed"])
    torch.manual_seed(exp_dict["seed"])

    if reset:
        # delete and backup experiment
        hc.delete_experiment(savedir, backup_flag=True)

    # create folder and save the experiment dictionary
    os.makedirs(savedir, exist_ok=True)
    hu.save_json(os.path.join(savedir, "exp_dict.json"), exp_dict)
    pprint.pprint(exp_dict)
    print("Experiment saved in %s" % savedir)

    # Dataset
    # -----------
    # train and val loader
    if exp_dict["episodic"] == False:
        if (int(test_only) == 0):
            train_dataset, val_dataset, test_dataset = get_dataset(
                ['train', 'val', 'test'], data_root, exp_dict)

            train_loader = DataLoader(train_dataset,
                                      batch_size=exp_dict['batch_size'],
                                      shuffle=True,
                                      num_workers=args.num_workers)
            val_loader = DataLoader(val_dataset,
                                    batch_size=exp_dict['batch_size'],
                                    shuffle=True,
                                    num_workers=args.num_workers)
            test_loader = DataLoader(test_dataset,
                                     batch_size=exp_dict['batch_size'],
                                     shuffle=True,
                                     num_workers=args.num_workers)
            if hasattr(train_dataset, "mask"):
                # assert((train_dataset.mask == val_dataset.mask).all())
                # assert((train_dataset.mask == test_dataset.mask).all())
                np.save(os.path.join(savedir, "mask.npy"),
                        train_dataset.mask)
        else:
            # FIX: data_root was missing from this call although every other
            # get_dataset call in this function passes it.
            test_dataset, = get_dataset(['test'], data_root, exp_dict)
            test_loader = DataLoader(test_dataset,
                                     batch_size=exp_dict['batch_size'],
                                     shuffle=True,
                                     num_workers=args.num_workers)
    else:  # to support episodes TODO: move inside each model
        from datasets.episodic_dataset import EpisodicDataLoader
        # FIX: this branch used train_dataset/val_dataset without ever
        # defining them (NameError), and defined no test_loader although
        # the post-training test phase below uses one. Load all three
        # splits like the non-episodic branch does.
        train_dataset, val_dataset, test_dataset = get_dataset(
            ['train', 'val', 'test'], data_root, exp_dict)
        # identity collate keeps each episode intact instead of stacking
        train_loader = EpisodicDataLoader(train_dataset,
                                          batch_size=exp_dict['batch_size'],
                                          shuffle=True,
                                          collate_fn=lambda x: x,
                                          num_workers=args.num_workers)
        val_loader = EpisodicDataLoader(val_dataset,
                                        batch_size=exp_dict['batch_size'],
                                        shuffle=True,
                                        collate_fn=lambda x: x,
                                        num_workers=args.num_workers)
        test_loader = EpisodicDataLoader(test_dataset,
                                         batch_size=exp_dict['batch_size'],
                                         shuffle=True,
                                         collate_fn=lambda x: x,
                                         num_workers=args.num_workers)

    # Model
    # -----------
    model = get_model(exp_dict)
    print("Parameters: ",
          sum([torch.numel(v) for v in model.parameters()]))

    # Checkpoint
    # -----------
    model_path = os.path.join(savedir, "model.pth")
    score_list_path = os.path.join(savedir, "score_list.pkl")

    if os.path.exists(score_list_path):
        # resume experiment
        print("Resuming from", model_path)
        model.set_state_dict(hu.torch_load(model_path))
        score_list = hu.load_pkl(score_list_path)
        s_epoch = score_list[-1]['epoch'] + 1
    else:
        # restart experiment
        score_list = []
        s_epoch = 0

    if int(test_only) == 0:
        # Train & Val
        # ------------
        print("Starting experiment at epoch %d" % (s_epoch))

        for e in range(s_epoch, exp_dict['max_epoch']):
            score_dict = {}

            # Train the model
            score_dict.update(model.train_on_loader(train_loader))

            # Validate the model
            score_dict.update(
                model.val_on_loader(val_loader,
                                    savedir=os.path.join(
                                        savedir_base,
                                        exp_dict['dataset']['name'])))
            score_dict["epoch"] = e

            # Add to score_list and save checkpoint
            score_list += [score_dict]

            # Report & Save
            score_df = pd.DataFrame(score_list)
            print("\n", score_df.tail())
            hu.torch_save(model_path, model.get_state_dict())
            hu.save_pkl(score_list_path, score_list)
            print("Checkpoint Saved: %s" % savedir)

            if model.is_end():
                print("Early stopping")
                break
        print('experiment completed')

        print("Testing...")
        score_dict = model.test_on_loader(train_loader, tag="train")
        score_dict.update(model.test_on_loader(val_loader, tag="val"))
        score_dict.update(model.test_on_loader(test_loader, tag="test"))

        # Report & Save
        score_list_path = os.path.join(savedir, "score_list_test.pkl")
        hu.save_pkl(score_list_path, score_dict)
    else:
        print("Testing...")
        score_dict = model.test_on_loader(test_loader, "test")

        # Report & Save
        score_list_path = os.path.join(savedir, "score_list_test.pkl")
        hu.save_pkl(score_list_path, score_dict)