import os

import tqdm

from haven import haven_examples as he
from haven import haven_wizard as hw


# 1. Define the trainval function
def trainval(exp_dict, savedir, args):
    """
    exp_dict: dictionary defining the hyperparameters of the experiment
    savedir: the directory where the experiment will be saved
    args: arguments passed through the command line
    """
    # 2. Create data loader and model
    train_loader = he.get_loader(name=exp_dict['dataset'], split='train',
                                 datadir=os.path.dirname(savedir),
                                 exp_dict=exp_dict)
    model = he.get_model(name=exp_dict['model'], exp_dict=exp_dict)

    # 3. Load the checkpoint if it exists
    chk_dict = hw.get_checkpoint(savedir)

    # 4. Run the main loop, resuming from the checkpointed epoch
    for epoch in tqdm.tqdm(range(chk_dict['epoch'], 3), desc="Running Experiment"):
        # 5. Train for one epoch
        train_dict = model.train_on_loader(train_loader, epoch=epoch)

        # 6. Get and save metrics
        score_dict = {'epoch': epoch,
                      'acc': train_dict['train_acc'],
                      'loss': train_dict['train_loss']}
        chk_dict['score_list'] += [score_dict]

        # Save the score list and visualizations in the checkpoint
        images = model.vis_on_loader(train_loader)
        hw.save_checkpoint(savedir, score_list=chk_dict['score_list'],
                           images=[images])

    print('Experiment done\n')
import os

import tqdm

from haven import haven_examples as he
from haven import haven_wizard as hw


def trainval(exp_dict, savedir, args):
    """
    exp_dict: dictionary defining the hyperparameters of the experiment
    savedir: the directory where the experiment will be saved
    args: arguments passed through the command line
    """
    # Create data loader and model
    train_loader = he.get_loader(
        name=exp_dict["dataset"],
        split="train",
        datadir=os.path.dirname(savedir),
        exp_dict=exp_dict,
    )
    model = he.get_model(name=exp_dict["model"], exp_dict=exp_dict)

    # Resume or initialize checkpoint
    chk_dict = hw.get_checkpoint(savedir)
    if "model_state_dict" in chk_dict and len(chk_dict["model_state_dict"]):
        model.set_state_dict(chk_dict["model_state_dict"])

    # Main training loop
    for epoch in tqdm.tqdm(range(chk_dict["epoch"], 3), desc="Running Experiment"):
        # Train for one epoch
        train_dict = model.train_on_loader(train_loader, epoch=epoch)

        # Get and save metrics
        score_dict = {"epoch": epoch,
                      "acc": train_dict["train_acc"],
                      "loss": train_dict["train_loss"]}
        chk_dict["score_list"] += [score_dict]

        # Save checkpoint; the model weights must be saved here for the
        # resume branch above to have a state dict to load from
        hw.save_checkpoint(savedir,
                           score_list=chk_dict["score_list"],
                           model_state_dict=model.get_state_dict())

    print("Experiment done\n")
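# Both trainval variants above are launched the same way. The sketch below is
# a minimal example of a main entry point using haven_wizard's run_wizard;
# the exp_list values ('mnist', 'linear', and the learning rates) are
# illustrative placeholders, not part of the snippets above.
if __name__ == "__main__":
    # Define a small grid of experiments, one dict per hyperparameter setting
    exp_list = []
    for lr in [1e-3, 1e-4]:
        exp_list += [{"lr": lr, "dataset": "mnist", "model": "linear"}]

    # Launch each experiment through the trainval function defined above
    hw.run_wizard(func=trainval, exp_list=exp_list)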
import os
import sys

import wandb

from haven import haven_wizard as hw

# Make the project root importable
path = os.path.dirname(os.path.dirname(os.path.realpath(__file__)))
sys.path.insert(0, path)

if __name__ == "__main__":
    savedir = "results/experiment_0"  # placeholder experiment directory
    loss = 0.5                        # placeholder metric value

    # First way: log the metrics directly with wandb
    score_dict = {"loss": loss}
    wandb.log(score_dict)

    # Second way: save through Haven's checkpointing and let it sync
    # the scores with wandb
    chk = hw.get_checkpoint(savedir)
    hw.save_checkpoint(savedir, score_dict=score_dict, wandb_config={})
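# A hedged sketch of the "first way" applied inside a training loop: after each
# epoch's metrics are computed, log them directly to wandb. wandb.init,
# wandb.log, and wandb.finish are the standard wandb calls; the project name
# and the dummy metrics are illustrative placeholders.
import wandb

wandb.init(project="haven-example")  # placeholder project name
for epoch in range(3):
    score_dict = {"epoch": epoch, "loss": 1.0 / (epoch + 1)}  # dummy metrics
    wandb.log(score_dict)  # one logged step per epoch
wandb.finish()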
import os

import numpy as np
import torch
from torch.utils.data import DataLoader

from haven import haven_utils as hu
from haven import haven_wizard as hw

# Project-local modules (assumed layout; adjust the import paths to your repo)
import datasets
import models
import samplers
import utils as ut


def trainval(exp_dict, savedir, args):
    """
    exp_dict: dictionary defining the hyperparameters of the experiment
    savedir: the directory where the experiment will be saved
    args: arguments passed through the command line
    """
    datadir = args.datadir

    # Set seed
    # ==================
    seed = 42
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

    # Dataset
    # ==================
    # train set
    train_set = datasets.get_dataset(dataset_dict=exp_dict["dataset"],
                                     split="train",
                                     datadir=datadir,
                                     exp_dict=exp_dict,
                                     dataset_size=exp_dict['dataset_size'])
    # val set
    val_set = datasets.get_dataset(dataset_dict=exp_dict["dataset"],
                                   split="val",
                                   datadir=datadir,
                                   exp_dict=exp_dict,
                                   dataset_size=exp_dict['dataset_size'])
    # test set
    test_set = datasets.get_dataset(dataset_dict=exp_dict["dataset"],
                                    split="test",
                                    datadir=datadir,
                                    exp_dict=exp_dict,
                                    dataset_size=exp_dict['dataset_size'])

    val_loader = DataLoader(val_set,
                            batch_size=exp_dict["batch_size"],
                            collate_fn=ut.collate_fn,
                            num_workers=args.num_workers,
                            drop_last=False)
    test_loader = DataLoader(test_set,
                             batch_size=1,
                             collate_fn=ut.collate_fn,
                             num_workers=args.num_workers)

    # Model
    # ==================
    model = models.get_model(model_dict=exp_dict['model'],
                             exp_dict=exp_dict,
                             train_set=train_set).cuda()

    chk_dict = hw.get_checkpoint(savedir)
    score_list = chk_dict['score_list']

    # Train & Val
    # ==================
    model.waiting = 0
    model.val_score_best = -np.inf

    # Choose the training sampler; both samplers draw len(val_set) samples
    # per epoch so that epoch lengths stay comparable
    sampler = exp_dict['dataset'].get('sampler', 'random')
    if sampler == 'random':
        train_sampler = torch.utils.data.RandomSampler(
            train_set, replacement=True, num_samples=len(val_set))
    elif sampler == 'balanced':
        train_sampler = samplers.BalancedSampler(
            train_set, n_samples=len(val_set))

    train_loader = DataLoader(train_set,
                              sampler=train_sampler,
                              collate_fn=ut.collate_fn,
                              batch_size=exp_dict["batch_size"],
                              drop_last=True,
                              num_workers=args.num_workers)

    for e in range(chk_dict['epoch'], exp_dict['max_epoch']):
        score_dict = {}

        # Train the model
        train_dict = model.train_on_loader(train_loader)

        # Validate the model
        val_dict = model.val_on_loader(val_loader,
                                       savedir_images=os.path.join(savedir, "images"),
                                       n_images=5)

        # Build the new score_dict for this epoch
        score_dict.update(val_dict)
        score_dict.update(train_dict)
        score_dict["epoch"] = e
        score_dict["waiting"] = model.waiting

        model.waiting += 1

        # Add to score_list and save checkpoint
        score_list += [score_dict]

        # Save best checkpoint whenever the validation score improves,
        # evaluating on the test set only at those points
        if score_dict["val_score"] >= model.val_score_best:
            test_dict = model.val_on_loader(test_loader,
                                            savedir_images=os.path.join(savedir, "images"),
                                            n_images=3)
            score_dict.update(test_dict)
            hu.save_pkl(os.path.join(savedir, "score_list_best.pkl"), score_list)
            hu.torch_save(os.path.join(savedir, "model_best.pth"),
                          model.get_state_dict())
            model.waiting = 0
            model.val_score_best = score_dict["val_score"]
            print("Saved Best: %s" % savedir)

        # Report & save
        hw.save_checkpoint(savedir, score_list=score_list)

        # Early stopping: quit after 100 epochs without improvement
        if model.waiting > 100:
            break

    print('Experiment completed at epoch %d' % e)
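# The trainval function above consumes exp_dict entries shaped like the hedged
# sketch below. The field names follow the keys the function actually reads
# ('dataset', 'dataset_size', 'batch_size', 'model', 'max_epoch'); every
# concrete value is an illustrative placeholder, not a configuration from the
# original code.
EXP_GROUPS = {
    "baseline": [
        {
            # 'dataset' must be a dict because trainval calls .get('sampler')
            "dataset": {"name": "pascal", "sampler": "balanced"},  # placeholders
            "dataset_size": {"train": "all", "val": "all"},        # placeholder sizes
            "batch_size": 8,
            "model": {"name": "semseg"},                           # placeholder model dict
            "max_epoch": 100,
        }
    ]
}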