def trainval(exp_dict, savedir, args):
    """
    exp_dict: dictionary defining the hyperparameters of the experiment
    savedir: the directory where the experiment will be saved
    args: arguments passed through the command line
    """
    # -- Datasets
    train_set = datasets.get_dataset(dataset_name=exp_dict["dataset"],
                                     train_flag=True,
                                     datadir=args.datadir)
    val_set = datasets.get_dataset(dataset_name=exp_dict["dataset"],
                                   train_flag=False,
                                   datadir=args.datadir)

    # -- Model
    model = models.Model(exp_dict, device=torch.device('cuda'))

    # -- Train & Val Loop
    score_list = []
    for e in range(0, 50):
        # Compute metrics
        score_dict = {"epoch": e}
        score_dict["train_loss"] = model.val_on_dataset(
            train_set, metric_name='softmax_loss')
        score_dict["val_acc"] = model.val_on_dataset(
            val_set, metric_name='softmax_acc')
        score_list += [score_dict]

        # Train model for one epoch
        model.train_on_dataset(train_set)

        # Visualize
        images = model.vis_on_dataset(
            val_set, fname=os.path.join(savedir, 'images', 'results.png'))

        # Report & Save
        score_df = pd.DataFrame(score_list)
        print("\n", score_df.tail(), "\n")
        hu.save_pkl(os.path.join(savedir, 'score_list.pkl'), score_list)
        hu.torch_save(os.path.join(savedir, 'model.pth'), model.state_dict())
        print("Checkpoint Saved: %s" % savedir)

    print('Experiment completed at epoch %d' % e)
def trainval():
    print("train")
    num_epochs = 50
    results = {}
    train_dl = datasets.get_dataset(dataroot="data",
                                    image_size=64,
                                    batch_size=32,
                                    num_workers=2)
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    model = models.Model(device)
    score_list = []
    for epoch in range(0, num_epochs):
        print(f'epoch {epoch} of {num_epochs}')
        lossD, lossG = model.train_on_dataset(train_dl)
        results["lossD"] = lossD
        results["lossG"] = lossG
        model.vis_on_dataset(fname=os.path.join(
            'training_image_results', f'epoch{epoch}_results.png'))
def get_stats(exp_dict): dataset_name = exp_dict['dataset']['name'] n_classes = exp_dict['dataset']['n_classes'] stat_list = [] print('') print(dataset_name, '-', 'n_classes: %d' % n_classes) print('===========') fname = '.tmp/covid_stats/%s_c%d.csv' % (dataset_name, n_classes) if not os.path.exists(fname): for split in ['train', 'val', 'test']: dataset = datasets.get_dataset(dataset_dict={'name': dataset_name}, datadir=None, split=split, exp_dict=exp_dict) loader = torch.utils.data.DataLoader(dataset, batch_size=1, num_workers=100, collate_fn=ut.collate_fn) for i, b in enumerate(tqdm.tqdm(loader)): u_list = np.unique(b['masks']) stat_dict = {'split': split} b['points'][b['points'] == 0] = 255 for c in range(n_classes): if c in u_list: stat_dict['class_%d' % c] = 1 else: stat_dict['class_%d' % c] = 0 for c in range(n_classes): if c == 0: continue stat_dict['n_regions_c%d' % c] = (b['points'] == c).sum().item() # stat_dict['n_regions_2'] = (b['points'] == 2).sum().item() stat_list += [stat_dict] stats = pd.DataFrame(stat_list).groupby('split').sum() stats.to_csv(fname) else: stats = pd.read_csv(fname) return stats
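# get_stats above computes per-split statistics once and caches them to CSV, reloading the
# cache on later calls. A minimal, self-contained sketch of that compute-or-load idiom
# (the inner loop below uses dummy rows rather than the repo's DataLoader):
import os
import pandas as pd

def load_or_compute_stats(fname, splits=('train', 'val', 'test')):
    if os.path.exists(fname):
        return pd.read_csv(fname)                 # reuse the cached result
    rows = []
    for split in splits:
        for i in range(3):                        # stand-in for iterating a loader
            rows += [{'split': split, 'n_regions': i}]
    stats = pd.DataFrame(rows).groupby('split').sum()
    os.makedirs(os.path.dirname(fname) or '.', exist_ok=True)
    stats.to_csv(fname)
    return stats

print(load_or_compute_stats('.tmp/example_stats.csv'))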
def train(cfg): print(cfg.pretty()) train_config_validator(cfg) fix_seed(cfg.seed) writer = SummaryWriter(log_dir='logs') controller = load_pretrained_weights( NAO(**cfg.controller).to(0), cfg.pretrained_model_path) dataset = get_dataset(writer=writer, seed=cfg.seed, **cfg.dataset) optimizer = get_optimizer(parameters=_get_target_parameters( controller, cfg.freeze_encoder_decoder), **cfg.optimizer) lr_scheduler = get_scheduler(optimizer=optimizer, **cfg.scheduler) end_of_epoch_hook = ModelSaverHook().end_of_epoch_hook get_trainer( controller=controller, dataset=dataset, optimizer=optimizer, lr_scheduler=lr_scheduler, writer=writer, end_of_epoch_hook=end_of_epoch_hook, **cfg.trainer, ).train()
def pretrain(cfg): print(cfg.pretty()) pretrain_config_validator(cfg) fix_seed(cfg.seed) controller = load_pretrained_weights( NAO(**cfg.controller).to(0), cfg.pretrained_model_path) models = {'trunk': controller} dataset = get_dataset(seed=cfg.seed, **cfg.dataset) optimizers = { 'trunk_optimizer': get_optimizer(parameters=models['trunk'].parameters(), **cfg.optimizer) } lr_schedulers = { 'trunk_scheduler_by_iteration': get_scheduler(optimizer=optimizers['trunk_optimizer'], **cfg.scheduler) } loss_funcs = { 'reconstruction_loss': torch.nn.NLLLoss(), 'metric_loss': get_loss(**cfg.loss) } mining_funcs = {"tuple_miner": get_miner(**cfg.miner)} visualizers = [umap.UMAP(**params) for params in cfg.visualizers] end_of_iteration_hook = TensorboardHook(visualizers).end_of_iteration_hook end_of_epoch_hook = ModelSaverHook().end_of_epoch_hook get_trainer( models=models, optimizers=optimizers, lr_schedulers=lr_schedulers, loss_funcs=loss_funcs, mining_funcs=mining_funcs, dataset=dataset, end_of_iteration_hook=end_of_iteration_hook, end_of_epoch_hook=end_of_epoch_hook, **cfg.trainer, ).train()
"model": { 'name': 'lcfcn', 'base': "fcn8_vgg16" }, "batch_size": 1, "max_epoch": 100, 'dataset_size': { 'train': 1, 'val': 1 }, 'optimizer': 'adam', 'lr': 1e-5 } train_set = datasets.get_dataset(dataset_dict=exp_dict['dataset'], datadir='/mnt/public/datasets/Trancos', split="test", exp_dict=exp_dict) model = models.get_model(model_dict=exp_dict['model'], exp_dict=exp_dict, train_set=train_set).cuda() batch = train_set[0] batch['images'] = batch['images'][None] batch['points'] = batch['points'][None] # train for several iterations for i in range(1000): loss = model.train_on_batch(batch) print(i, '- loss:', float(loss['train_loss'])) # visualize blobs and heatmap model.vis_on_batch(batch, savedir_image='result.png')
def trainval(exp_dict, savedir_base, datadir_base, reset=False): # bookkeeping stuff # ================== pprint.pprint(exp_dict) exp_id = hu.hash_dict(exp_dict) savedir = os.path.join(savedir_base, exp_id) if reset: hc.delete_and_backup_experiment(savedir) os.makedirs(savedir, exist_ok=True) hu.save_json(os.path.join(savedir, "exp_dict.json"), exp_dict) print("Experiment saved in %s" % savedir) # Dataset # ================== # load train and acrtive set train_set = datasets.get_dataset(dataset_name=exp_dict["dataset"], split="train", datadir_base=datadir_base, exp_dict=exp_dict) active_set = ActiveLearningDataset(train_set, random_state=42) # val set val_set = datasets.get_dataset(dataset_name=exp_dict["dataset"], split="val", datadir_base=datadir_base, exp_dict=exp_dict) val_loader = DataLoader(val_set, batch_size=exp_dict["batch_size"]) # Model # ================== model = models.get_model(model_name=exp_dict['model']['name'], exp_dict=exp_dict).cuda() model_path = os.path.join(savedir, "model.pth") score_list_path = os.path.join(savedir, "score_list.pkl") if os.path.exists(score_list_path): # resume experiment model.set_state_dict(hu.torch_load(model_path)) active_set.load_state_dict( hu.load_pkl(os.path.join(savedir, "active_set.pkl"))) score_list = hu.load_pkl(score_list_path) inner_s_epoch = score_list[-1]['inner_epoch'] + 1 s_cycle = score_list[-1]['cycle'] else: # restart experiment score_list = [] inner_s_epoch = 0 s_cycle = 0 # Train & Val # ================== print("Starting experiment at cycle %d epoch %d" % (s_cycle, inner_s_epoch)) for c in range(s_cycle, exp_dict['max_cycle']): # Set seed np.random.seed(c) torch.manual_seed(c) torch.cuda.manual_seed_all(c) if inner_s_epoch == 0: active_set.label_next_batch(model) hu.save_pkl(os.path.join(savedir, "active_set.pkl"), active_set.state_dict()) train_loader = DataLoader(active_set, sampler=samplers.get_sampler( exp_dict['sampler']['train'], active_set), batch_size=exp_dict["batch_size"]) # Visualize the model model.vis_on_loader(vis_loader, savedir=os.path.join(savedir, "images")) for e in range(inner_s_epoch, exp_dict['max_epoch']): # Validate only at the start of each cycle score_dict = {} if e == 0: score_dict.update(model.val_on_loader(val_loader)) # Train the model score_dict.update(model.train_on_loader(train_loader)) # Validate the model score_dict["epoch"] = len(score_list) score_dict["inner_epoch"] = e score_dict["cycle"] = c score_dict['n_ratio'] = active_set.n_labelled_ratio score_dict["n_train"] = len(train_loader.dataset) score_dict["n_pool"] = len(train_loader.dataset.pool) # Add to score_list and save checkpoint score_list += [score_dict] # Report & Save score_df = pd.DataFrame(score_list) print("\n", score_df.tail(), "\n") hu.torch_save(model_path, model.get_state_dict()) hu.save_pkl(score_list_path, score_list) print("Checkpoint Saved: %s" % savedir) inner_s_epoch = 0
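# The trainval loop above resumes from a (cycle, inner_epoch) pair stored in the last
# score_dict and labels a new batch only when a cycle starts at inner epoch 0. A
# stripped-down sketch of that bookkeeping; labelling and training are stubbed out, the
# dict keys mirror the ones above, everything else is hypothetical:
def run_cycles(score_list, max_cycle=3, max_epoch=2):
    if score_list:
        s_epoch = score_list[-1]['inner_epoch'] + 1
        s_cycle = score_list[-1]['cycle']
    else:
        s_epoch, s_cycle = 0, 0
    for c in range(s_cycle, max_cycle):
        if s_epoch == 0:
            pass                                  # label the next batch of pool samples here
        for e in range(s_epoch, max_epoch):
            score_list += [{'cycle': c, 'inner_epoch': e}]   # train + validate here
        s_epoch = 0                               # later cycles start from epoch 0
    return score_list

print(run_cycles([]))                                  # fresh run
print(run_cycles([{'cycle': 1, 'inner_epoch': 0}]))    # resumed mid-way through cycle 1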
def trainval(exp_dict, savedir, datadir, metrics_flag=True): # TODO: Do we get similar results with different seeds? # Set seed np.random.seed(42) torch.manual_seed(42) torch.cuda.manual_seed_all(42) pprint.pprint(exp_dict) # Load Train Dataset train_set = datasets.get_dataset(dataset_name=exp_dict["dataset"], train_flag=True, datadir=datadir, exp_dict=exp_dict) train_loader = DataLoader(train_set, drop_last=True, shuffle=True, batch_size=exp_dict["batch_size"]) # Load Val Dataset val_set = datasets.get_dataset(dataset_name=exp_dict["dataset"], train_flag=False, datadir=datadir, exp_dict=exp_dict) # Load model model = models.get_model(exp_dict["model"], train_set=train_set).cuda() # Choose loss and metric function loss_function = metrics.get_metric_function(exp_dict["loss_func"]) # Load Optimizer n_batches_per_epoch = len(train_set) / float(exp_dict["batch_size"]) opt = optimizers.get_optimizer(opt=exp_dict["opt"], params=model.parameters(), n_batches_per_epoch=n_batches_per_epoch) # Resume from last saved state_dict if (not os.path.exists(savedir + "/run_dict.pkl") or not os.path.exists(savedir + "/score_list.pkl")): ut.save_pkl(savedir + "/run_dict.pkl", {"running": 1}) score_list = [] s_epoch = 0 else: score_list = ut.load_pkl(savedir + "/score_list.pkl") model.load_state_dict(torch.load(savedir + "/model_state_dict.pth")) opt.load_state_dict(torch.load(savedir + "/opt_state_dict.pth")) s_epoch = score_list[-1]["epoch"] + 1 for epoch in range(s_epoch, exp_dict["max_epoch"]): # Set seed np.random.seed(epoch) torch.manual_seed(epoch) torch.cuda.manual_seed_all(epoch) score_dict = {"epoch": epoch} if metrics_flag: # 1. Compute train loss over train set score_dict["train_loss"] = metrics.compute_metric_on_dataset( model, train_set, metric_name=exp_dict["loss_func"]) # 2. Compute val acc over val set score_dict["val_acc"] = metrics.compute_metric_on_dataset( model, val_set, metric_name=exp_dict["acc_func"]) # 3. Train over train loader model.train() print("%d - Training model with %s..." % (epoch, exp_dict["loss_func"])) s_time = time.time() for images, labels in tqdm.tqdm(train_loader): images, labels = images.cuda(), labels.cuda() opt.zero_grad() if exp_dict["opt"]["name"] in exp_configs.ours_opt_list + ["l4"]: closure = lambda: loss_function( model, images, labels, backwards=False) opt.step(closure) else: loss = loss_function(model, images, labels) loss.backward() opt.step() e_time = time.time() # Record step size and batch size score_dict["step_size"] = opt.state["step_size"] score_dict["n_forwards"] = opt.state["n_forwards"] score_dict["n_backwards"] = opt.state["n_backwards"] score_dict["batch_size"] = train_loader.batch_size score_dict["train_epoch_time"] = e_time - s_time # Add score_dict to score_list score_list += [score_dict] # Report and save print(pd.DataFrame(score_list).tail()) ut.save_pkl(savedir + "/score_list.pkl", score_list) ut.torch_save(savedir + "/model_state_dict.pth", model.state_dict()) ut.torch_save(savedir + "/opt_state_dict.pth", opt.state_dict()) print("Saved: %s" % savedir) return score_list
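# In the branch above, the optimizer receives a closure that re-evaluates the loss without
# calling backward(), which is what line-search-style methods need in order to probe trial
# step sizes. A toy sketch of that contract (an illustrative backtracking step, not the
# repo's optimizers.get_optimizer):
import torch

class BacktrackingSGD(torch.optim.Optimizer):
    def __init__(self, params, lr=1.0, beta=0.5, max_trials=10):
        super().__init__(params, dict(lr=lr, beta=beta, max_trials=max_trials))

    @torch.no_grad()
    def step(self, closure):
        with torch.enable_grad():
            loss = closure()
            loss.backward()                       # gradient at the current point
        for group in self.param_groups:
            lr = group['lr']
            params = [p for p in group['params'] if p.grad is not None]
            original = [p.detach().clone() for p in params]
            for _ in range(group['max_trials']):
                for p, p0 in zip(params, original):
                    p.copy_(p0 - lr * p.grad)     # trial step from the original point
                if closure() < loss:              # accept the first step that lowers the loss
                    break
                lr *= group['beta']               # otherwise shrink the step size
        return loss

# usage on a tiny quadratic
w = torch.nn.Parameter(torch.tensor([3.0]))
opt = BacktrackingSGD([w])
for _ in range(5):
    opt.zero_grad()
    opt.step(lambda: (w ** 2).sum())
print(float(w))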
datadir = '/mnt/public/datasets/DeepFish/' score_list = [] for hash_id in hash_list: fname = os.path.join('/mnt/public/predictions/habitat/%s.pkl' % hash_id) exp_dict = hu.load_json( os.path.join(savedir_base, hash_id, 'exp_dict.json')) if os.path.exists(fname): print('FOUND:', fname) val_dict = hu.load_pkl(fname) else: train_set = datasets.get_dataset( dataset_dict=exp_dict["dataset"], split='train', datadir=datadir, exp_dict=exp_dict, dataset_size=exp_dict['dataset_size']) test_set = datasets.get_dataset( dataset_dict=exp_dict["dataset"], split='test', datadir=datadir, exp_dict=exp_dict, dataset_size=exp_dict['dataset_size']) test_loader = DataLoader(test_set, batch_size=1, collate_fn=ut.collate_fn, num_workers=0) pprint.pprint(exp_dict)
def trainval(exp_dict, savedir, args):
    """
    exp_dict: dictionary defining the hyperparameters of the experiment
    savedir: the directory where the experiment will be saved
    args: arguments passed through the command line
    """
    # set seed
    # ==================
    seed = 42
    np.random.seed(seed)
    torch.manual_seed(seed)
    if args.use_cuda:
        device = 'cuda'
        torch.cuda.manual_seed_all(seed)
        assert torch.cuda.is_available(), 'cuda is not available, please run with "-c 0"'
    else:
        device = 'cpu'
    print('Running on device: %s' % device)

    # Dataset
    # Load val set and train set
    val_set = datasets.get_dataset(dataset_name=exp_dict["dataset"],
                                   split="val",
                                   transform=exp_dict.get("transform"),
                                   datadir=args.datadir)
    train_set = datasets.get_dataset(dataset_name=exp_dict["dataset"],
                                     split="train",
                                     transform=exp_dict.get("transform"),
                                     datadir=args.datadir)

    # Load train loader, val loader, and vis loader
    train_loader = DataLoader(train_set,
                              sampler=RandomSampler(
                                  train_set, replacement=True,
                                  num_samples=max(min(500, len(train_set)), len(val_set))),
                              batch_size=exp_dict["batch_size"])
    val_loader = DataLoader(val_set, shuffle=False, batch_size=exp_dict["batch_size"])
    vis_loader = DataLoader(val_set,
                            sampler=ut.SubsetSampler(train_set, indices=[0, 1, 2]),
                            batch_size=1)

    # Create model, opt, wrapper
    model_original = models.get_model(exp_dict["model"], exp_dict=exp_dict).to(device)
    opt = torch.optim.Adam(model_original.parameters(), lr=1e-5, weight_decay=0.0005)
    model = wrappers.get_wrapper(exp_dict["wrapper"], model=model_original, opt=opt).to(device)

    score_list = []

    # Checkpointing
    # =============
    score_list_path = os.path.join(savedir, "score_list.pkl")
    model_path = os.path.join(savedir, "model_state_dict.pth")
    opt_path = os.path.join(savedir, "opt_state_dict.pth")

    if os.path.exists(score_list_path):
        # resume experiment
        score_list = hu.load_pkl(score_list_path)
        model.load_state_dict(torch.load(model_path))
        opt.load_state_dict(torch.load(opt_path))
        s_epoch = score_list[-1]["epoch"] + 1
    else:
        # restart experiment
        score_list = []
        s_epoch = 0

    # Run training and validation
    for epoch in range(s_epoch, exp_dict["max_epoch"]):
        score_dict = {"epoch": epoch}

        # visualize
        model.vis_on_loader(vis_loader, savedir=os.path.join(savedir, "images"))

        # validate
        score_dict.update(model.val_on_loader(val_loader))

        # train
        score_dict.update(model.train_on_loader(train_loader))

        # Add score_dict to score_list
        score_list += [score_dict]

        # Report and save
        print(pd.DataFrame(score_list).tail())
        hu.save_pkl(score_list_path, score_list)
        hu.torch_save(model_path, model.state_dict())
        hu.torch_save(opt_path, opt.state_dict())
        print("Saved in %s" % savedir)
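# The train loader above samples with replacement and caps the number of samples per epoch,
# so one "epoch" is a fixed budget of iterations rather than a full pass over the dataset.
# Minimal sketch of that idiom on a toy TensorDataset:
import torch
from torch.utils.data import DataLoader, RandomSampler, TensorDataset

train_set = TensorDataset(torch.arange(10_000).float().unsqueeze(1))
sampler = RandomSampler(train_set, replacement=True, num_samples=500)
loader = DataLoader(train_set, sampler=sampler, batch_size=50)

print(len(loader))        # 10 batches per epoch, regardless of dataset size
for (batch,) in loader:
    pass                  # training step would go here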
logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) model = eval(args.model)(dataset=args.dataset, device=args.device) # for fine-tuning a pre-trained model, we strip out the last fc layer if args.save_path: saved_dict = torch.load(args.save_path) del saved_dict["model.module.fc.weight"] del saved_dict["model.module.fc.bias"] model.load_state_dict(saved_dict, strict=False) model.train() train_loader = DataLoader(get_dataset(args.dataset, "train"), shuffle=True, batch_size=args.batch_size, num_workers=args.num_workers, pin_memory=False) optimizer = optim.SGD(model.parameters(), lr=args.lr, momentum=0.9, weight_decay=1e-4, nesterov=True) annealer = optim.lr_scheduler.CosineAnnealingLR(optimizer, args.num_epochs) loss_meter = meter.AverageValueMeter() time_meter = meter.TimeMeter(unit=False)
def trainval(exp_dict, savedir_base, datadir, reset=False, num_workers=0): # bookkeepting stuff # ================== pprint.pprint(exp_dict) exp_id = hu.hash_dict(exp_dict) savedir = os.path.join(savedir_base, exp_id) if reset: hc.delete_and_backup_experiment(savedir) os.makedirs(savedir, exist_ok=True) hu.save_json(os.path.join(savedir, "exp_dict.json"), exp_dict) print("Experiment saved in %s" % savedir) # set seed # ================== seed = 42 np.random.seed(seed) torch.manual_seed(seed) torch.cuda.manual_seed_all(seed) # Dataset # ================== # train set train_set = datasets.get_dataset(dataset_dict=exp_dict["dataset"], split="train", datadir=datadir, exp_dict=exp_dict, dataset_size=exp_dict['dataset_size']) # val set val_set = datasets.get_dataset(dataset_dict=exp_dict["dataset"], split="val", datadir=datadir, exp_dict=exp_dict, dataset_size=exp_dict['dataset_size']) # test set test_set = datasets.get_dataset(dataset_dict=exp_dict["dataset"], split="test", datadir=datadir, exp_dict=exp_dict, dataset_size=exp_dict['dataset_size']) # val_sampler = torch.utils.data.SequentialSampler(val_set) val_loader = DataLoader( val_set, # sampler=val_sampler, batch_size=1, collate_fn=ut.collate_fn, num_workers=num_workers) test_loader = DataLoader( test_set, # sampler=val_sampler, batch_size=1, collate_fn=ut.collate_fn, num_workers=num_workers) # Model # ================== model = models.get_model(model_dict=exp_dict['model'], exp_dict=exp_dict, train_set=train_set).cuda() # model.opt = optimizers.get_optim(exp_dict['opt'], model) model_path = os.path.join(savedir, "model.pth") score_list_path = os.path.join(savedir, "score_list.pkl") if os.path.exists(score_list_path): # resume experiment model.load_state_dict(hu.torch_load(model_path)) score_list = hu.load_pkl(score_list_path) s_epoch = score_list[-1]['epoch'] + 1 else: # restart experiment score_list = [] s_epoch = 0 # Train & Val # ================== print("Starting experiment at epoch %d" % (s_epoch)) model.waiting = 0 model.val_score_best = -np.inf train_sampler = torch.utils.data.RandomSampler(train_set, replacement=True, num_samples=2 * len(test_set)) train_loader = DataLoader(train_set, sampler=train_sampler, collate_fn=ut.collate_fn, batch_size=exp_dict["batch_size"], drop_last=True, num_workers=num_workers) for e in range(s_epoch, exp_dict['max_epoch']): # Validate only at the start of each cycle score_dict = {} test_dict = model.val_on_loader(test_loader, savedir_images=os.path.join( savedir, "images"), n_images=3) # Train the model train_dict = model.train_on_loader(train_loader) # Validate the model val_dict = model.val_on_loader(val_loader) score_dict["val_score"] = val_dict["val_score"] # Get new score_dict score_dict.update(train_dict) score_dict["epoch"] = e score_dict["waiting"] = model.waiting model.waiting += 1 # Add to score_list and save checkpoint score_list += [score_dict] # Save Best Checkpoint score_df = pd.DataFrame(score_list) if score_dict["val_score"] >= model.val_score_best: test_dict = model.val_on_loader(test_loader, savedir_images=os.path.join( savedir, "images"), n_images=3) score_dict.update(test_dict) hu.save_pkl(os.path.join(savedir, "score_list_best.pkl"), score_list) # score_df.to_csv(os.path.join(savedir, "score_best_df.csv")) hu.torch_save(os.path.join(savedir, "model_best.pth"), model.get_state_dict()) model.waiting = 0 model.val_score_best = score_dict["val_score"] print("Saved Best: %s" % savedir) # Report & Save score_df = pd.DataFrame(score_list) # score_df.to_csv(os.path.join(savedir, 
"score_df.csv")) print("\n", score_df.tail(), "\n") hu.torch_save(model_path, model.get_state_dict()) hu.save_pkl(score_list_path, score_list) print("Checkpoint Saved: %s" % savedir) if model.waiting > 100: break print('Experiment completed et epoch %d' % e)
def trainval_svrg(exp_dict, savedir, datadir, metrics_flag=True): ''' SVRG-specific training and validation loop. ''' pprint.pprint(exp_dict) # Load Train Dataset train_set = datasets.get_dataset(dataset_name=exp_dict["dataset"], train_flag=True, datadir=datadir, exp_dict=exp_dict) train_loader = DataLoader(train_set, drop_last=False, shuffle=True, batch_size=exp_dict["batch_size"]) # Load Val Dataset val_set = datasets.get_dataset(dataset_name=exp_dict["dataset"], train_flag=False, datadir=datadir, exp_dict=exp_dict) # Load model model = models.get_model(exp_dict["model"], train_set=train_set).cuda() # Choose loss and metric function loss_function = metrics.get_metric_function(exp_dict["loss_func"]) # lookup the learning rate lr = get_svrg_step_size(exp_dict) # Load Optimizer opt = get_svrg_optimizer(model, loss_function, train_loader=train_loader, lr=lr) # Resume from last saved state_dict if (not os.path.exists(savedir + "/run_dict.pkl") or not os.path.exists(savedir + "/score_list.pkl")): ut.save_pkl(savedir + "/run_dict.pkl", {"running": 1}) score_list = [] s_epoch = 0 else: score_list = ut.load_pkl(savedir + "/score_list.pkl") model.load_state_dict(torch.load(savedir + "/model_state_dict.pth")) opt.load_state_dict(torch.load(savedir + "/opt_state_dict.pth")) s_epoch = score_list[-1]["epoch"] + 1 for epoch in range(s_epoch, exp_dict["max_epoch"]): score_dict = {"epoch": epoch} if metrics_flag: # 1. Compute train loss over train set score_dict["train_loss"] = metrics.compute_metric_on_dataset( model, train_set, metric_name=exp_dict["loss_func"]) # 2. Compute val acc over val set score_dict["val_acc"] = metrics.compute_metric_on_dataset( model, val_set, metric_name=exp_dict["acc_func"]) # 3. Train over train loader model.train() print("%d - Training model with %s..." % (epoch, exp_dict["loss_func"])) s_time = time.time() for images, labels in tqdm.tqdm(train_loader): images, labels = images.cuda(), labels.cuda() opt.zero_grad() closure = lambda svrg_model: loss_function( svrg_model, images, labels, backwards=True) opt.step(closure) e_time = time.time() # Record step size and batch size score_dict["step_size"] = opt.state["step_size"] score_dict["batch_size"] = train_loader.batch_size score_dict["train_epoch_time"] = e_time - s_time # Add score_dict to score_list score_list += [score_dict] # Report and save print(pd.DataFrame(score_list).tail()) ut.save_pkl(savedir + "/score_list.pkl", score_list) ut.torch_save(savedir + "/model_state_dict.pth", model.state_dict()) ut.torch_save(savedir + "/opt_state_dict.pth", opt.state_dict()) print("Saved: %s" % savedir) return score_list
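# The SVRG closure above is evaluated both on the live model and on a stored snapshot, so
# the optimizer can form the variance-reduced gradient g_i(w) - g_i(w_snapshot) + full_grad.
# A self-contained numerical sketch on least squares (illustrative only, not the repo's
# get_svrg_optimizer):
import torch

torch.manual_seed(0)
n, d = 200, 5
X, w_true = torch.randn(n, d), torch.randn(d)
y = X @ w_true

def grad_i(w, i):
    # gradient of the single-example loss 0.5 * (x_i . w - y_i)^2
    return (X[i] @ w - y[i]) * X[i]

w, lr = torch.zeros(d), 0.01
for outer in range(20):
    w_snap = w.clone()
    full_grad = X.T @ (X @ w_snap - y) / n      # full gradient at the snapshot
    for _ in range(n):
        i = torch.randint(n, (1,)).item()
        w = w - lr * (grad_i(w, i) - grad_i(w_snap, i) + full_grad)

print(float(torch.norm(w - w_true)))            # shrinks toward 0 across outer loops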
def trainval(exp_dict, savedir_base, datadir, reset=False, num_workers=0): # bookkeepting stuff # ================== pprint.pprint(exp_dict) exp_id = hu.hash_dict(exp_dict) savedir = os.path.join(savedir_base, exp_id) if reset: hc.delete_and_backup_experiment(savedir) os.makedirs(savedir, exist_ok=True) hu.save_json(os.path.join(savedir, "exp_dict.json"), exp_dict) print("Experiment saved in %s" % savedir) # Dataset # ================== # train set train_set = datasets.get_dataset(dataset_dict=exp_dict["dataset"], split="train", datadir=datadir, exp_dict=exp_dict, dataset_size=exp_dict['dataset_size']) # val set val_set = datasets.get_dataset(dataset_dict=exp_dict["dataset"], split="val", datadir=datadir, exp_dict=exp_dict, dataset_size=exp_dict['dataset_size']) val_sampler = torch.utils.data.SequentialSampler(val_set) val_loader = DataLoader(val_set, sampler=val_sampler, batch_size=1, num_workers=num_workers) # Model # ================== model = models.get_model(model_dict=exp_dict['model'], exp_dict=exp_dict, train_set=train_set).cuda() # model.opt = optimizers.get_optim(exp_dict['opt'], model) model_path = os.path.join(savedir, "model.pth") score_list_path = os.path.join(savedir, "score_list.pkl") if os.path.exists(score_list_path): # resume experiment model.load_state_dict(hu.torch_load(model_path)) score_list = hu.load_pkl(score_list_path) s_epoch = score_list[-1]['epoch'] + 1 else: # restart experiment score_list = [] s_epoch = 0 # Train & Val # ================== print("Starting experiment at epoch %d" % (s_epoch)) train_sampler = torch.utils.data.RandomSampler(train_set, replacement=True, num_samples=2 * len(val_set)) train_loader = DataLoader(train_set, sampler=train_sampler, batch_size=exp_dict["batch_size"], drop_last=True, num_workers=num_workers) for e in range(s_epoch, exp_dict['max_epoch']): # Validate only at the start of each cycle score_dict = {} # Train the model train_dict = model.train_on_loader(train_loader) # Validate and Visualize the model val_dict = model.val_on_loader(val_loader, savedir_images=os.path.join( savedir, "images"), n_images=3) score_dict.update(val_dict) # model.vis_on_loader( # vis_loader, savedir=os.path.join(savedir, "images")) # Get new score_dict score_dict.update(train_dict) score_dict["epoch"] = len(score_list) # Add to score_list and save checkpoint score_list += [score_dict] # Report & Save score_df = pd.DataFrame(score_list) print("\n", score_df.tail(), "\n") hu.torch_save(model_path, model.get_state_dict()) hu.save_pkl(score_list_path, score_list) print("Checkpoint Saved: %s" % savedir) # Save Best Checkpoint if e == 0 or (score_dict.get("val_score", 0) > score_df["val_score"][:-1].fillna(0).max()): hu.save_pkl(os.path.join(savedir, "score_list_best.pkl"), score_list) hu.torch_save(os.path.join(savedir, "model_best.pth"), model.get_state_dict()) print("Saved Best: %s" % savedir) print('Experiment completed et epoch %d' % e)
def trainval(exp_dict, savedir, args): """ exp_dict: dictionary defining the hyperparameters of the experiment savedir: the directory where the experiment will be saved args: arguments passed through the command line """ datadir = args.datadir # set seed # ================== seed = 42 np.random.seed(seed) torch.manual_seed(seed) torch.cuda.manual_seed_all(seed) # Dataset # ================== # train set train_set = datasets.get_dataset(dataset_dict=exp_dict["dataset"], split="train", datadir=datadir, exp_dict=exp_dict, dataset_size=exp_dict['dataset_size']) # val set val_set = datasets.get_dataset(dataset_dict=exp_dict["dataset"], split="val", datadir=datadir, exp_dict=exp_dict, dataset_size=exp_dict['dataset_size']) # test set test_set = datasets.get_dataset(dataset_dict=exp_dict["dataset"], split="test", datadir=datadir, exp_dict=exp_dict, dataset_size=exp_dict['dataset_size']) # val_sampler = torch.utils.data.SequentialSampler(val_set) val_loader = DataLoader(val_set, # sampler=val_sampler, batch_size=exp_dict["batch_size"], collate_fn=ut.collate_fn, num_workers=args.num_workers, drop_last=False) test_loader = DataLoader(test_set, # sampler=val_sampler, batch_size=1, collate_fn=ut.collate_fn, num_workers=args.num_workers) # Model # ================== model = models.get_model(model_dict=exp_dict['model'], exp_dict=exp_dict, train_set=train_set).cuda() chk_dict = hw.get_checkpoint(savedir) score_list = chk_dict['score_list'] # Train & Val # ================== model.waiting = 0 model.val_score_best = -np.inf sampler = exp_dict['dataset'].get('sampler', 'random') if sampler == 'random': train_sampler = torch.utils.data.RandomSampler( train_set, replacement=True, num_samples=len(val_set)) elif sampler == 'balanced': train_sampler = samplers.BalancedSampler( train_set, n_samples=len(val_set)) train_loader = DataLoader(train_set, sampler=train_sampler, collate_fn=ut.collate_fn, batch_size=exp_dict["batch_size"], drop_last=True, num_workers=args.num_workers) for e in range(chk_dict['epoch'], exp_dict['max_epoch']): # Validate only at the start of each cycle score_dict = {} # Train the model train_dict = model.train_on_loader(train_loader) # Validate the model val_dict = model.val_on_loader(val_loader, savedir_images=os.path.join(savedir, "images"), n_images=5) score_dict.update(val_dict) # Get new score_dict score_dict.update(train_dict) score_dict["epoch"] = e score_dict["waiting"] = model.waiting model.waiting += 1 # Add to score_list and save checkpoint score_list += [score_dict] # Save Best Checkpoint score_df = pd.DataFrame(score_list) if score_dict["val_score"] >= model.val_score_best: test_dict = model.val_on_loader(test_loader, savedir_images=os.path.join(savedir, "images"), n_images=3) score_dict.update(test_dict) hu.save_pkl(os.path.join(savedir, "score_list_best.pkl"), score_list) # score_df.to_csv(os.path.join(savedir, "score_best_df.csv")) hu.torch_save(os.path.join(savedir, "model_best.pth"), model.get_state_dict()) model.waiting = 0 model.val_score_best = score_dict["val_score"] print("Saved Best: %s" % savedir) # Report & Save hw.save_checkpoint(savedir, score_list=score_list) if model.waiting > 100: break print('Experiment completed et epoch %d' % e)
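# The loop above keeps a `waiting` counter that is reset whenever the validation score
# improves and breaks once it exceeds a patience threshold, saving a separate "best"
# checkpoint at each improvement. A stripped-down sketch of that pattern (the scores
# below are made up):
def train_with_patience(val_scores, patience=3):
    best, best_epoch, waiting = float('-inf'), None, 0
    for epoch, score in enumerate(val_scores):
        waiting += 1
        if score >= best:
            best, best_epoch, waiting = score, epoch, 0   # save model_best.pth here
        if waiting > patience:
            break                                         # early stop
    return best, best_epoch

print(train_with_patience([0.1, 0.3, 0.25, 0.2, 0.2, 0.15, 0.4]))  # stops before epoch 6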
def train(exp_dict, savedir_base, reset, compute_fid=False):
    # Book keeping
    pprint.pprint(exp_dict)
    exp_id = hu.hash_dict(exp_dict)
    savedir = os.path.join(savedir_base, exp_id)
    if reset:
        ut.rmtree(savedir)
    os.makedirs(savedir, exist_ok=True)
    hu.save_json(os.path.join(savedir, 'exp_dict.json'), exp_dict)
    print('Experiment saved in %s' % savedir)

    device = \
        torch.device('cuda:' + exp_dict['gpu'] if torch.cuda.is_available() else 'cpu')

    # 1. Load dataset and loader
    train_set, test_set, num_channels, num_train_classes, num_test_classes = \
        datasets.get_dataset(exp_dict['dataset'],
                             dataset_path=savedir_base,
                             image_size=exp_dict['image_size'])
    train_loader, test_loader = \
        dataloaders.get_dataloader(exp_dict['dataloader'], train_set, test_set, exp_dict)

    # 2. Fetch model to train
    model = models.get_model(exp_dict['model'], num_train_classes, num_test_classes,
                             num_channels, device, exp_dict)

    # 3. Resume experiment or start from scratch
    score_list_path = os.path.join(savedir, 'score_list.pkl')
    if os.path.exists(score_list_path):
        # Resume experiment if it exists
        model_path = os.path.join(savedir, 'model_state_dict.pth')
        model.load_state_dict(hu.torch_load(model_path))
        score_list = hu.load_pkl(score_list_path)
        meta_dict_path = os.path.join(savedir, 'meta_dict.pkl')
        meta_dict = hu.load_pkl(meta_dict_path)
        print('Resuming experiment at episode %d epoch %d' %
              (meta_dict['episode'], meta_dict['epoch']))
    else:
        # Start experiment from scratch
        meta_dict = {'episode': 1, 'epoch': 1}
        score_list = []
        # Remove TensorBoard logs from previous runs
        ut.rmtree(os.path.join(savedir, 'tensorboard_logs'))
        print('Starting experiment at episode %d epoch %d' %
              (meta_dict['episode'], meta_dict['epoch']))

    # 4. Train and eval loop
    s_epoch = meta_dict['epoch']
    for e in range(s_epoch, exp_dict['num_epochs'] + 1):
        # 0. Initialize dicts
        score_dict = {'epoch': e}
        meta_dict['epoch'] = e

        # 1. Train on loader
        train_dict = model.train_on_loader(train_loader)

        # 1b. Compute FID
        if compute_fid == 1:
            if e % 20 == 0 or e == 1 or e == exp_dict['num_epochs']:
                print('Starting FID computation...')
                train_dict['fid'] = fid(model, train_loader.dataset,
                                        train_loader.sampler, savedir)

        score_dict.update(train_dict)

        # 2. Eval on loader
        eval_dict = model.val_on_loader(test_loader, savedir, e)
        score_dict.update(eval_dict)

        # 3. Report and save model state, optimizer state, and scores
        score_list += [score_dict]
        score_df = pd.DataFrame(score_list)
        print('\n', score_df.tail(), '\n')

        if e % 10 == 0:
            hu.torch_save(os.path.join(savedir, 'model_state_dict.pth'),
                          model.get_state_dict())
            hu.save_pkl(os.path.join(savedir, 'score_list.pkl'), score_list)
            hu.save_pkl(os.path.join(savedir, 'meta_dict.pkl'), meta_dict)
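# train() above derives the experiment folder from a hash of exp_dict, so re-running the
# same hyperparameters resumes the same directory. A rough stand-in for that idiom; the
# md5-of-sorted-json below is an illustration of how hu.hash_dict can be assumed to behave,
# not its exact implementation, and the paths are arbitrary:
import hashlib
import json
import os

def hash_dict(exp_dict):
    return hashlib.md5(json.dumps(exp_dict, sort_keys=True).encode()).hexdigest()

exp_dict = {'lr': 1e-3, 'model': 'fcn8_vgg16', 'batch_size': 1}
savedir = os.path.join('/tmp/results', hash_dict(exp_dict))
os.makedirs(savedir, exist_ok=True)
print(savedir)          # same exp_dict -> same savedir on every run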
def trainval(exp_dict, savedir, args): """ exp_dict: dictionary defining the hyperparameters of the experiment savedir: the directory where the experiment will be saved args: arguments passed through the command line """ # set seed # ================== seed = 42 np.random.seed(seed) torch.manual_seed(seed) #helen commented out the following lines to hard code in that the device was 'cpu' to resolve errors #if args.use_cuda: #device = 'cuda' #torch.cuda.manual_seed_all(seed) #assert torch.cuda.is_available(), 'cuda is not, available please run with "-c 0"' #else: device = 'cpu' print('Running on device: %s' % device) # Dataset # Load val set and train set val_set = datasets.get_dataset(dataset_name=exp_dict["dataset"], split="val", transform=exp_dict.get("transform"), datadir=args.datadir) train_set = datasets.get_dataset(dataset_name=exp_dict["dataset"], split="train", transform=exp_dict.get("transform"), datadir=args.datadir) # Load train loader, val loader, and vis loader train_loader = DataLoader(train_set, sampler=RandomSampler( train_set, replacement=True, num_samples=max(min(500, len(train_set)), len(val_set))), batch_size=exp_dict["batch_size"]) val_loader = DataLoader(val_set, shuffle=False, batch_size=exp_dict["batch_size"]) vis_loader = DataLoader(val_set, sampler=ut.SubsetSampler(train_set, indices=[0, 1, 2]), batch_size=1) # Create model, opt, wrapper model_original = models.get_model(exp_dict["model"], exp_dict=exp_dict).cuda() opt = torch.optim.Adam(model_original.parameters(), lr=1e-5, weight_decay=0.0005) model = wrappers.get_wrapper(exp_dict["wrapper"], model=model_original, opt=opt).cuda() score_list = [] # Checkpointing # ============= #score_list_path = os.path.join(savedir, "score_list.pkl") #helen commented out these three lines and hard coded the model and opt paths to resolve errors #model_path = os.path.join(savedir, "model_state_dict.pth") #opt_path = os.path.join(savedir, "opt_state_dict.pth") score_list_path = '/Users/helenpropson/Documents/git/marepesca/results/testresults/score_list.pkl' #helen added this model_path = '/Users/helenpropson/Documents/git/marepesca/results/testresults/model_state_dict.pth' #helen added this opt_path = '/Users/helenpropson/Documents/git/marepesca/results/testresults/opt_state_dict.pth' #helen added this #helen hard coded that the experiment would resume instead of restarting from epoch 0 #if os.path.exists(score_list_path): # resume experiment score_list = hu.load_pkl( score_list_path ) #helen changed this from ut.load_pkl to hu.load_pkl to resolve error model.load_state_dict(torch.load(model_path)) opt.load_state_dict(torch.load(opt_path)) s_epoch = score_list[-1]["epoch"] + 1 #else: # restart experiment #score_list = [] #s_epoch = 0 # *************** helen added this code im = Image.open("/Users/helenpropson/Documents/git/marepesca/tank.jpg") # im.show() #this line will display the image you are running the model on if uncommented mean = [0.485, 0.456, 0.406] std = [0.229, 0.224, 0.225] normalize_transform = transforms.Normalize(mean=mean, std=std) data_transform = transforms.Compose( [transforms.ToTensor(), normalize_transform]) #transformations we will use on our image im_new = data_transform( im) #transforms the image into a tensor and normalizes it im_final = im_new.unsqueeze( 0) #adds another dimension so image is the correct shape for the model print("now trying helen's code") #print statement for debugging #model.vis_on_batch_helen(im_final, f'im_new') #uncomment this line to run model on image # *************** this is 
the end of helen's code # Run training and validation for epoch in range(s_epoch, exp_dict["max_epoch"]): score_dict = {"epoch": epoch} # visualize model.vis_on_loader(vis_loader, savedir=os.path.join(savedir, "images")) print("after vis_on_loader" ) #helen add this print statement as an update while iterating # validate score_dict.update(model.val_on_loader(val_loader)) print("after validate") # train score_dict.update(model.train_on_loader(train_loader)) print("after train") # Add score_dict to score_list score_list += [score_dict] # Report and save print(pd.DataFrame(score_list).tail()) hu.save_pkl(score_list_path, score_list) hu.torch_save(model_path, model.state_dict()) hu.torch_save(opt_path, opt.state_dict()) print("Saved in %s" % savedir)
exp_dict = exp_configs.EXP_GROUPS[exp_group][0] dataset_name = exp_dict['dataset']['name'] n_classes = exp_dict['dataset']['n_classes'] stat_list = [] print('') print(dataset_name, '-', 'n_classes: %d' % n_classes) print('===========') fname = '.tmp/covid_stats/%s_c%d.csv' % (dataset_name, n_classes) if not os.path.exists(fname): for split in ['train', 'val', 'test']: dataset = datasets.get_dataset( dataset_dict={'name': dataset_name}, datadir=None, split=split, exp_dict=exp_dict) loader = torch.utils.data.DataLoader(dataset, batch_size=1, num_workers=100, collate_fn=ut.collate_fn) for i, b in enumerate(tqdm.tqdm(loader)): u_list = np.unique(b['masks']) stat_dict = {'split': split} b['points'][b['points'] == 0] = 255 for c in range(n_classes): if c in u_list: stat_dict['class_%d' % c] = 1 else:
# lcfcn loss with_affinity=True # hash_dir = '84ced18cf5c1fb3ad5820cc1b55a38fa' # point level # hash_dir = 'd7040c9534b08e765f48c6cb034b26b2' # LCFCN hash_dir = 'bcba046296675e9e3af5cd9f353d217b' savedir = '/mnt/public/predictions' datadir = '/mnt/public/datasets/DeepFish/' split = 'test' test_set = datasets.get_dataset(dataset_dict=exp_dict["dataset"], split=split, datadir=datadir, exp_dict=exp_dict, dataset_size=exp_dict['dataset_size']) test_loader = DataLoader( test_set, # sampler=val_sampler, batch_size=1, collate_fn=ut.collate_fn, num_workers=0) # Model # ================== model = models.get_model(model_dict=exp_dict['model'], exp_dict=exp_dict, train_set=test_set).cuda()
def newminimum(exp_id, savedir_base, datadir, name, exp_dict, metrics_flag=True): # bookkeeping # --------------- # get experiment directory old_modeldir = os.path.join(savedir_base, exp_id) savedir = os.path.join(savedir_base, exp_id, name) old_exp_dict = hu.load_json(os.path.join(old_modeldir, 'exp_dict.json')) # TODO: compare exp dict for possible errors: # optimizer have to be the same # same network, dataset # create folder and save the experiment dictionary os.makedirs(savedir, exist_ok=True) hu.save_json(os.path.join(savedir, 'exp_dict.json'), exp_dict) pprint.pprint(exp_dict) print('Experiment saved in %s' % savedir) # set seed # --------------- seed = 42 + exp_dict['runs'] np.random.seed(seed) torch.manual_seed(seed) torch.cuda.manual_seed_all(seed) # Dataset # ----------- # Load Train Dataset train_set = datasets.get_dataset(dataset_name=exp_dict["dataset"], train_flag=True, datadir=datadir, exp_dict=exp_dict) train_loader = torch.utils.data.DataLoader( train_set, drop_last=True, shuffle=True, batch_size=exp_dict["batch_size"]) # Load Val Dataset val_set = datasets.get_dataset(dataset_name=exp_dict["dataset"], train_flag=False, datadir=datadir, exp_dict=exp_dict) # Model # ----------- model = models.get_model(exp_dict["model"], train_set=train_set) # Choose loss and metric function loss_function = metrics.get_metric_function(exp_dict["loss_func"]) # Load Optimizer n_batches_per_epoch = len(train_set) / float(exp_dict["batch_size"]) opt = optimizers.get_optimizer(opt=exp_dict["opt"], params=model.parameters(), n_batches_per_epoch=n_batches_per_epoch) # Checkpoint # ----------- model_path = os.path.join(savedir, 'model.pth') score_list_path = os.path.join(savedir, 'score_list.pkl') opt_path = os.path.join(savedir, 'opt_state_dict.pth') old_model_path = os.path.join(old_modeldir, 'model.pth') old_score_list_path = os.path.join(old_modeldir, 'score_list.pkl') old_opt_path = os.path.join(old_modeldir, 'opt_state_dict.pth') score_list = hu.load_pkl(old_score_list_path) model.load_state_dict(torch.load(old_model_path)) opt.load_state_dict(torch.load(old_opt_path)) s_epoch = score_list[-1]['epoch'] + 1 # save current model state for comparison minimum = [] for param in model.parameters(): minimum.append(param.clone()) # Train & Val # ------------ print('Starting experiment at epoch %d/%d' % (s_epoch, exp_dict['max_epoch'])) for epoch in range(s_epoch, exp_dict['max_epoch']): # Set seed np.random.seed(exp_dict['runs'] + epoch) torch.manual_seed(exp_dict['runs'] + epoch) # torch.cuda.manual_seed_all(exp_dict['runs']+epoch) not needed since no cuda available score_dict = {"epoch": epoch} if metrics_flag: # 1. Compute train loss over train set score_dict["train_loss"] = metrics.compute_metric_on_dataset( model, train_set, metric_name='softmax_loss') # metric_name=exp_dict["loss_func"]) # TODO: which loss should be used? (normal or with reguralizer?) # 2. Compute val acc over val set score_dict["val_acc"] = metrics.compute_metric_on_dataset( model, val_set, metric_name=exp_dict["acc_func"]) # 3. Train over train loader model.train() print("%d - Training model with %s..." 
% (epoch, exp_dict["loss_func"])) s_time = time.time() for images, labels in tqdm.tqdm(train_loader): # images, labels = images.cuda(), labels.cuda() no cuda available opt.zero_grad() loss = loss_function(model, images, labels, minimum, 0.1) # only works for the custom loss function loss.backward() opt.step() e_time = time.time() # Record metrics score_dict["step_size"] = opt.state["step_size"] score_dict["n_forwards"] = opt.state["n_forwards"] score_dict["n_backwards"] = opt.state["n_backwards"] score_dict["batch_size"] = train_loader.batch_size score_dict["train_epoch_time"] = e_time - s_time score_list += [score_dict] # Report and save print(pd.DataFrame(score_list).tail()) hu.save_pkl(score_list_path, score_list) hu.torch_save(model_path, model.state_dict()) hu.torch_save(opt_path, opt.state_dict()) print("Saved: %s" % savedir) with torch.no_grad(): print('Current distance: %f' % metrics.computedistance(minimum, model)) print('Experiment completed')
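# The custom loss above receives the stored parameter snapshot (`minimum`) and a
# coefficient, which suggests a proximity term that penalises distance from the starting
# point while fitting the data. A hedged sketch of such a regularised loss; the exact form
# of the repo's loss_function is not shown here, and the toy model and data are made up:
import torch
import torch.nn.functional as F

def regularized_loss(model, images, labels, minimum, coeff):
    task_loss = F.cross_entropy(model(images), labels)
    prox = sum(((p - p0) ** 2).sum() for p, p0 in zip(model.parameters(), minimum))
    return task_loss + coeff * prox

# toy usage
model = torch.nn.Linear(4, 3)
minimum = [p.detach().clone() for p in model.parameters()]
images, labels = torch.randn(8, 4), torch.randint(3, (8,))
loss = regularized_loss(model, images, labels, minimum, 0.1)
loss.backward()
print(float(loss))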
def test(exp_dict, savedir_base, datadir, num_workers=0, model_path=None, scan_id=None):
    # bookkeeping stuff
    # ==================
    pprint.pprint(exp_dict)
    exp_id = hu.hash_dict(exp_dict)
    savedir = os.path.join(savedir_base, exp_id)
    os.makedirs(savedir, exist_ok=True)
    hu.save_json(os.path.join(savedir, "exp_dict.json"), exp_dict)
    print("Experiment saved in %s" % savedir)

    # Dataset
    # ==================
    # val set
    test_set = datasets.get_dataset(dataset_dict=exp_dict["dataset"],
                                    split="val",
                                    datadir=datadir,
                                    exp_dict=exp_dict,
                                    dataset_size=exp_dict['dataset_size'])
    if str(scan_id) != 'None':
        test_set.active_data = test_set.get_scan(scan_id)

    test_sampler = torch.utils.data.SequentialSampler(test_set)
    test_loader = DataLoader(test_set,
                             sampler=test_sampler,
                             batch_size=1,
                             collate_fn=ut.collate_fn,
                             num_workers=num_workers)

    # Model
    # ==================
    # chk = torch.load('best_model.ckpt')
    model = models.get_model_for_onnx_export(model_dict=exp_dict['model'],
                                             exp_dict=exp_dict,
                                             train_set=test_set).cuda()
    epoch = -1
    if str(model_path) != 'None':
        model.load_state_dict(hu.torch_load(model_path))
    else:
        try:
            exp_dict_train = copy.deepcopy(exp_dict)
            del exp_dict_train['test_mode']
            savedir_train = os.path.join(savedir_base, hu.hash_dict(exp_dict_train))
            model_path = os.path.join(savedir_train, "model_best.pth")
            score_list = hu.load_pkl(os.path.join(savedir_train, 'score_list_best.pkl'))
            epoch = score_list[-1]['epoch']
            print('Loaded model at epoch %d with score %.3f' %
                  (epoch, score_list[-1]['val_score']))
            model.load_state_dict(hu.torch_load(model_path))
        except:
            pass

    s_time = time.time()
    savedir_images = os.path.join(savedir, 'images')

    # delete image folder if exists
    if os.path.exists(savedir_images):
        shutil.rmtree(savedir_images)
    os.makedirs(savedir_images, exist_ok=True)

    # for i in range(20):
    #     score_dict = model.train_on_loader(test_loader)
    score_dict = model.val_on_loader(test_loader,
                                     savedir_images=savedir_images,
                                     n_images=30000,
                                     save_preds=True)
    score_dict['epoch'] = epoch
    score_dict["time"] = time.time() - s_time
    score_dict["saved_at"] = hu.time_to_montreal()

    # save test_score_list
    test_path = os.path.join(savedir, "score_list.pkl")
    if os.path.exists(test_path):
        test_score_list = [sd for sd in hu.load_pkl(test_path) if sd['epoch'] != epoch]
    else:
        test_score_list = []

    # append score_dict to last result
    test_score_list += [score_dict]
    hu.save_pkl(test_path, test_score_list)
    print('Final Score is ', str(score_dict["val_score"]) + "\n")
'lr': 1e-06, 'max_epoch': 100, 'model': { 'base': 'fcn8_vgg16', 'loss': 'point_level', 'n_channels': 3, 'n_classes': 2, 'name': 'semseg' }, 'num_channels': 1, 'optimizer': 'adam' } pprint.pprint(exp_dict) train_set = datasets.get_dataset(dataset_dict=exp_dict["dataset"], split="train", datadir='/mnt/public/datasets/DeepFish', exp_dict=exp_dict, dataset_size=exp_dict['dataset_size']) model_seam = resnet38_SEAM.Net().cuda() model_seam.load_state_dict( torch.load(os.path.join('/mnt/public/weights', 'resnet38_SEAM.pth'))) model_aff = resnet38_aff.Net().cuda() model_aff.load_state_dict(torch.load( os.path.join('/mnt/public/weights', 'resnet38_aff_SEAM.pth')), strict=False) # ut.generate_seam_segmentation(train_set, # path_base='/mnt/datasets/public/issam/seam', # # path_base='D:/Issam/SEAM_model/'
def trainval(exp_dict, savedir_base, datadir_base, reset=False, num_workers=0, pin_memory=False, ngpu=1, cuda_deterministic=False): # bookkeeping # ================== # get experiment directory exp_id = hu.hash_dict(exp_dict) savedir = os.path.join(savedir_base, exp_id) if reset: # delete and backup experiment hc.delete_experiment(savedir, backup_flag=True) # create folder and save the experiment dictionary hu.save_json(os.path.join(savedir, 'exp_dict.json'), exp_dict) pprint.pprint(exp_dict) print('Experiment saved in %s' % savedir) if DEVICE.type == "cuda": if cuda_deterministic: cudnn.benchmark = False cudnn.deterministic = True else: cudnn.benchmark = True # Dataset # ================== trainset = get_dataset(exp_dict['dataset'], 'train', exp_dict=exp_dict, datadir_base=datadir_base, n_samples=exp_dict['dataset_size']['train'], transform_lvl=exp_dict['dataset']['transform_lvl'], colorjitter=exp_dict['dataset'].get('colorjitter') ) valset = get_dataset(exp_dict['dataset'], 'validation', exp_dict=exp_dict, datadir_base=datadir_base, n_samples=exp_dict['dataset_size']['train'], transform_lvl=0, val_transform=exp_dict['dataset']['val_transform']) testset = get_dataset(exp_dict['dataset'], 'test', exp_dict=exp_dict, datadir_base=datadir_base, n_samples=exp_dict['dataset_size']['test'], transform_lvl=0, val_transform=exp_dict['dataset']['val_transform']) print("Dataset defined.") # define dataloaders if exp_dict['dataset']['name'] == 'bach': testloader = torch.utils.data.DataLoader(testset, batch_size=1, shuffle=False, num_workers=num_workers, pin_memory=pin_memory) else: testloader = torch.utils.data.DataLoader(testset, batch_size=exp_dict['batch']['size'], shuffle=False, num_workers=num_workers, pin_memory=pin_memory) print("Testloader defined.") # Model # ================== model = get_model(exp_dict, trainset, device=DEVICE) print("Model loaded") model_path = os.path.join(savedir, 'model.pth') model_best_path = os.path.join(savedir, 'model_best.pth') score_list_path = os.path.join(savedir, 'score_list.pkl') # checkpoint management if os.path.exists(score_list_path): # resume experiment model.load_state_dict(hu.torch_load(model_path)) score_list = hu.load_pkl(score_list_path) s_epoch = len(score_list) else: # restart experiment score_list = [] s_epoch = 0 # define and log random seed for reproducibility assert('fixedSeed' in exp_dict) seed = exp_dict['fixedSeed'] random.seed(seed) torch.manual_seed(seed) torch.cuda.manual_seed_all(seed) np.random.seed(seed) print("Seed defined.") # Train & Val # ================== print("Starting experiment at epoch %d/%d" % (s_epoch, exp_dict['niter'])) for epoch in range(s_epoch, exp_dict['niter']): s_time = time.time() # Sample new train val trainloader, valloader = get_train_val_dataloader(exp_dict, trainset, valset, mixtrainval=exp_dict['mixTrainVal'], num_workers=num_workers, pin_memory=pin_memory) # Train & validate train_dict = model.train_on_loader(trainloader, valloader, epoch=epoch, exp_dict=exp_dict) # Test phase train_dict_2 = model.test_on_loader(trainloader) val_dict = model.test_on_loader(valloader) test_dict = model.test_on_loader(testloader) # Vis phase model.vis_on_loader('train', trainset, savedir_images=os.path.join( savedir, 'images'), epoch=epoch) score_dict = {} score_dict["epoch"] = epoch score_dict["test_acc"] = test_dict['acc'] score_dict["val_acc"] = val_dict['acc'] score_dict["train_acc"] = train_dict_2['acc'] score_dict["train_loss"] = train_dict['loss'] score_dict["time_taken"] = time.time() - s_time score_dict["netC_lr"] = 
train_dict['netC_lr'] if exp_dict['model']['netA'] is not None: if 'transformations_mean' in train_dict: for i in range(len(train_dict['transformations_mean'])): score_dict[str( i) + "_mean"] = train_dict['transformations_mean'][i].item() if 'transformations_std' in train_dict: for i in range(len(train_dict['transformations_std'])): score_dict[str( i) + "_std"] = train_dict['transformations_std'][i].item() # Add to score_list and save checkpoint score_list += [score_dict] # Report & Save score_df = pd.DataFrame(score_list) print("\n", score_df.tail(), "\n") hu.torch_save(model_path, model.get_state_dict()) hu.save_pkl(score_list_path, score_list) print("Checkpoint Saved: %s" % savedir) # Update best score if epoch == 0 or (score_dict["test_acc"] >= score_df["test_acc"][:-1].max()): hu.save_pkl(os.path.join( savedir, "score_list_best.pkl"), score_list) hu.torch_save(os.path.join(savedir, "model_best.pth"), model.get_state_dict()) print("Saved Best: %s" % savedir) print('experiment completed')
if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument('-d', '--datadir', type=str, default='/mnt/public/datasets/DeepFish') parser.add_argument("-e", "--exp_config", default='loc') parser.add_argument("-uc", "--use_cuda", type=int, default=0) args = parser.parse_args() device = torch.device('cuda' if args.use_cuda else 'cpu') exp_dict = exp_configs.EXP_GROUPS[args.exp_config][0] train_set = datasets.get_dataset(dataset_name=exp_dict["dataset"], split="train", transform=exp_dict.get("transform"), datadir=args.datadir) # Create model, opt, wrapper model_original = models.get_model(exp_dict["model"], exp_dict=exp_dict).to('cpu') #.cuda() opt = torch.optim.Adam(model_original.parameters(), lr=1e-5, weight_decay=0.0005) model = wrappers.get_wrapper(exp_dict["wrapper"], model=model_original, opt=opt).to('cpu') #.cuda() if args.exp_config == 'loc': batch = torch.utils.data.dataloader.default_collate([train_set[3]])
'pascal': '/mnt/datasets/public/issam', 'kitti': '/mnt/datasets/public/issam' } if __name__ == "__main__": for exp_group in [ 'weakly_covid19_v1_c2', 'weakly_covid19_v2_mixed_c2', 'weakly_covid19_v2_sep_c2', 'weakly_covid19_v2_mixed_c3', 'weakly_covid19_v2_sep_c3', 'weakly_covid19_v3_mixed_c2' ]: exp_dict = exp_configs.EXP_GROUPS[exp_group][0] dataset_name = exp_dict['dataset']['name'] n_classes = exp_dict['dataset']['n_classes'] train_set = datasets.get_dataset(dataset_dict={'name': dataset_name}, datadir=None, split="test", exp_dict=exp_dict) for i, b in enumerate(train_set): if b['masks'].sum() == 0: print(i) continue break batch = ut.collate_fn([b]) image = batch['images'] gt = np.asarray(batch['masks'], np.float32) gt /= (gt.max() + 1e-8) image = F.interpolate(image, size=gt.shape[-2:], mode='bilinear',
argparser.add_argument("--data-parallel", action="store_true") argparser.add_argument("--use-val-set", action="store_true") argparser.add_argument("--focal", action="store_true") argparser.add_argument('--output-dir', type=str, default=os.getenv("PT_OUTPUT_DIR")) args = argparser.parse_args() logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) model = eval(args.model)(dataset=args.dataset, device=args.device, precision=args.precision, norm_layer=args.norm_layer, focal=args.focal) model = DataParallelWrapper(model) if args.data_parallel else model if not args.use_val_set: train_dataset = get_dataset(args.dataset, "train", args.precision) train_loader = get_dataloader(train_dataset, True, args.batch_size, args.num_workers) #_, subset_idxs = split_hold_out_set(train_dataset.targets, 10000) subset_idxs = np.random.choice(len(train_dataset), 10000, replace=False) train_subset_dataset = Subset(train_dataset, list(subset_idxs)) train_subset_loader = get_dataloader(train_subset_dataset, False, args.batch_size, args.num_workers) test_dataset = get_dataset(args.eval_dataset or args.dataset, "test", args.precision) test_loader = get_dataloader(test_dataset, False, args.batch_size, args.num_workers) eval_loaders_and_datasets = ((train_subset_loader, len(train_subset_dataset), "train"), (test_loader, len(test_dataset), "test")) else:
def trainval(exp_dict, savedir_base, reset=False): # bookkeeping # --------------- # get experiment directory exp_id = hu.hash_dict(exp_dict) savedir = os.path.join(savedir_base, exp_id) if reset: # delete and backup experiment hc.delete_experiment(savedir, backup_flag=True) # create folder and save the experiment dictionary os.makedirs(savedir, exist_ok=True) hu.save_json(os.path.join(savedir, 'exp_dict.json'), exp_dict) pprint.pprint(exp_dict) print('Experiment saved in %s' % savedir) # set seed # --------------- seed = 42 + exp_dict['runs'] np.random.seed(seed) torch.manual_seed(seed) torch.cuda.manual_seed_all(seed) # Dataset # ----------- # train loader train_set = datasets.get_dataset(dataset_name=exp_dict["dataset"], train_flag=True, datadir=savedir_base, exp_dict=exp_dict) train_loader = torch.utils.data.DataLoader( train_set, drop_last=True, shuffle=True, batch_size=exp_dict["batch_size"]) # val set val_set = datasets.get_dataset(dataset_name=exp_dict["dataset"], train_flag=False, datadir=savedir_base, exp_dict=exp_dict) # Model # ----------- model = models.get_model(exp_dict["model"], train_set=train_set).cuda() # Choose loss and metric function loss_function = metrics.get_metric_function(exp_dict["loss_func"]) # Compute fstar # ------------- if exp_dict['opt'].get('fstar_flag'): ut.compute_fstar(train_set, loss_function, savedir_base, exp_dict) # Load Optimizer n_batches_per_epoch = len(train_set) / float(exp_dict["batch_size"]) opt = optimizers.get_optimizer(opt_dict=exp_dict["opt"], params=model.parameters(), n_batches_per_epoch=n_batches_per_epoch) # Checkpoint # ----------- model_path = os.path.join(savedir, 'model.pth') score_list_path = os.path.join(savedir, 'score_list.pkl') opt_path = os.path.join(savedir, 'opt_state_dict.pth') if os.path.exists(score_list_path): # resume experiment score_list = hu.load_pkl(score_list_path) model.load_state_dict(torch.load(model_path)) opt.load_state_dict(torch.load(opt_path)) s_epoch = score_list[-1]['epoch'] + 1 else: # restart experiment score_list = [] s_epoch = 0 # Train & Val # ------------ print('Starting experiment at epoch %d/%d' % (s_epoch, exp_dict['max_epoch'])) for e in range(s_epoch, exp_dict['max_epoch']): # Set seed seed = e + exp_dict['runs'] np.random.seed(seed) torch.manual_seed(seed) torch.cuda.manual_seed_all(seed) score_dict = {} # Compute train loss over train set score_dict["train_loss"] = metrics.compute_metric_on_dataset( model, train_set, metric_name=exp_dict["loss_func"]) # Compute val acc over val set score_dict["val_acc"] = metrics.compute_metric_on_dataset( model, val_set, metric_name=exp_dict["acc_func"]) # Train over train loader model.train() print("%d - Training model with %s..." 
% (e, exp_dict["loss_func"])) # train and validate s_time = time.time() for batch in tqdm.tqdm(train_loader): images, labels = batch["images"].cuda(), batch["labels"].cuda() opt.zero_grad() # closure def closure(): return loss_function(model, images, labels, backwards=True) opt.step(closure) e_time = time.time() # Record metrics score_dict["epoch"] = e score_dict["step_size"] = opt.state["step_size"] score_dict["step_size_avg"] = opt.state["step_size_avg"] score_dict["n_forwards"] = opt.state["n_forwards"] score_dict["n_backwards"] = opt.state["n_backwards"] score_dict["grad_norm"] = opt.state["grad_norm"] score_dict["batch_size"] = train_loader.batch_size score_dict["train_epoch_time"] = e_time - s_time score_list += [score_dict] # Report and save print(pd.DataFrame(score_list).tail()) hu.save_pkl(score_list_path, score_list) hu.torch_save(model_path, model.state_dict()) hu.torch_save(opt_path, opt.state_dict()) print("Saved: %s" % savedir) print('Experiment completed')
def trainval(exp_dict, savedir_base, reset, metrics_flag=True, datadir=None, cuda=False): # bookkeeping # --------------- # get experiment directory exp_id = hu.hash_dict(exp_dict) savedir = os.path.join(savedir_base, exp_id) if reset: # delete and backup experiment hc.delete_experiment(savedir, backup_flag=True) # create folder and save the experiment dictionary os.makedirs(savedir, exist_ok=True) hu.save_json(os.path.join(savedir, 'exp_dict.json'), exp_dict) print(pprint.pprint(exp_dict)) print('Experiment saved in %s' % savedir) # set seed # ================== seed = 42 + exp_dict['runs'] np.random.seed(seed) torch.manual_seed(seed) if cuda: device = 'cuda' torch.cuda.manual_seed_all(seed) else: device = 'cpu' print('Running on device: %s' % device) # Dataset # ================== train_set = datasets.get_dataset(dataset_name=exp_dict["dataset"], train_flag=True, datadir=datadir, exp_dict=exp_dict) train_loader = DataLoader(train_set, drop_last=True, shuffle=True, sampler=None, batch_size=exp_dict["batch_size"]) # Load Val Dataset val_set = datasets.get_dataset(dataset_name=exp_dict["dataset"], train_flag=False, datadir=datadir, exp_dict=exp_dict) # Model # ================== use_backpack = exp_dict['opt'].get("backpack", False) model = models.get_model(exp_dict["model"], train_set=train_set, backpack=use_backpack).to(device=device) if use_backpack: assert exp_dict['opt']['name'] in ['nus_wrapper', 'adaptive_second'] from backpack import extend model = extend(model) # Choose loss and metric function loss_function = metrics.get_metric_function(exp_dict["loss_func"]) # Load Optimizer # ============== n_batches_per_epoch = len(train_set) / float(exp_dict["batch_size"]) opt = optimizers.get_optimizer(opt=exp_dict["opt"], params=model.parameters(), n_batches_per_epoch=n_batches_per_epoch, n_train=len(train_set), train_loader=train_loader, model=model, loss_function=loss_function, exp_dict=exp_dict, batch_size=exp_dict["batch_size"]) # Checkpointing # ============= score_list_path = os.path.join(savedir, "score_list.pkl") model_path = os.path.join(savedir, "model_state_dict.pth") opt_path = os.path.join(savedir, "opt_state_dict.pth") if os.path.exists(score_list_path): # resume experiment score_list = ut.load_pkl(score_list_path) if use_backpack: model.load_state_dict(torch.load(model_path), strict=False) else: model.load_state_dict(torch.load(model_path)) opt.load_state_dict(torch.load(opt_path)) s_epoch = score_list[-1]["epoch"] + 1 else: # restart experiment score_list = [] s_epoch = 0 # Start Training # ============== n_train = len(train_loader.dataset) n_batches = len(train_loader) batch_size = train_loader.batch_size for epoch in range(s_epoch, exp_dict["max_epoch"]): # Set seed seed = epoch + exp_dict['runs'] np.random.seed(seed) torch.manual_seed(seed) torch.cuda.manual_seed_all(seed) score_dict = {"epoch": epoch} # Validate # -------- if metrics_flag: # 1. Compute train loss over train set score_dict["train_loss"] = metrics.compute_metric_on_dataset( model, train_set, metric_name=exp_dict["loss_func"], batch_size=exp_dict['batch_size']) # 2. Compute val acc over val set score_dict["val_acc"] = metrics.compute_metric_on_dataset( model, val_set, metric_name=exp_dict["acc_func"], batch_size=exp_dict['batch_size']) # Train # ----- model.train() print("%d - Training model with %s..." 
% (epoch, exp_dict["loss_func"])) s_time = time.time() train_on_loader(model, train_set, train_loader, opt, loss_function, epoch, use_backpack) e_time = time.time() # Record step size and batch size score_dict["step"] = opt.state.get("step", 0) / int(n_batches_per_epoch) score_dict["step_size"] = opt.state.get("step_size", {}) score_dict["step_size_avg"] = opt.state.get("step_size_avg", {}) score_dict["n_forwards"] = opt.state.get("n_forwards", {}) score_dict["n_backwards"] = opt.state.get("n_backwards", {}) score_dict["grad_norm"] = opt.state.get("grad_norm", {}) score_dict["batch_size"] = batch_size score_dict["train_epoch_time"] = e_time - s_time score_dict.update(opt.state["gv_stats"]) # Add score_dict to score_list score_list += [score_dict] # Report and save print(pd.DataFrame(score_list).tail()) ut.save_pkl(score_list_path, score_list) ut.torch_save(model_path, model.state_dict()) ut.torch_save(opt_path, opt.state_dict()) print("Saved: %s" % savedir) return score_list
argparser.add_argument("--sigma", default=0.0, type=float) argparser.add_argument("--noise", default="Clean", type=str) argparser.add_argument("--k", default=None, type=int) argparser.add_argument("--j", default=None, type=int) argparser.add_argument("--a", default=None, type=int) argparser.add_argument("--lambd", default=None, type=float) argparser.add_argument("--adv", default=2, type=int) argparser.add_argument("--experiment-name", default="cifar", type=str) argparser.add_argument("--dataset", default="cifar", type=str) argparser.add_argument("--model", default="WideResNet", type=str) argparser.add_argument("--output-dir", type=str, default=os.getenv("PT_OUTPUT_DIR")) args = argparser.parse_args() test_dataset = get_dataset(args.dataset, "test") test_loader = DataLoader( test_dataset, shuffle=False, batch_size=args.batch_size, # todo: fix num_workers=args.num_workers) save_path = f"{args.output_dir}/{args.experiment_name}/model_ckpt.torch" model = eval(args.model)(dataset=args.dataset, device=args.device) model.load_state_dict(torch.load(save_path)) model.eval() noise = parse_noise_from_args(args, device=args.device, dim=get_dim(args.dataset))
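A short, hypothetical continuation of this evaluation script: measuring clean test accuracy of the restored model before any noise is applied. It assumes the test loader yields `(images, labels)` batches, which is not confirmed by the snippet above:

# hypothetical: clean test accuracy of the restored checkpoint
correct, total = 0, 0
with torch.no_grad():
    for images, labels in test_loader:
        images, labels = images.to(args.device), labels.to(args.device)
        preds = model(images).argmax(dim=1)
        correct += (preds == labels).sum().item()
        total += labels.size(0)
print("clean test accuracy: %.4f" % (correct / total))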
def trainval(exp_dict, savedir_base, reset=False, num_workers=0, run_ssl=False):
    # bookkeeping
    # ---------------

    # get experiment directory
    exp_id = hu.hash_dict(exp_dict)
    savedir = os.path.join(savedir_base, exp_id)

    if reset:
        # delete and backup experiment
        hc.delete_experiment(savedir, backup_flag=True)

    # create folder and save the experiment dictionary
    os.makedirs(savedir, exist_ok=True)
    hu.save_json(os.path.join(savedir, 'exp_dict.json'), exp_dict)
    pprint.pprint(exp_dict)
    print('Experiment saved in %s' % savedir)

    # load datasets
    # ==========================
    train_set = datasets.get_dataset(
        dataset_name=exp_dict["dataset_train"],
        data_root=exp_dict["dataset_train_root"],
        split="train",
        transform=exp_dict["transform_train"],
        classes=exp_dict["classes_train"],
        support_size=exp_dict["support_size_train"],
        query_size=exp_dict["query_size_train"],
        n_iters=exp_dict["train_iters"],
        unlabeled_size=exp_dict["unlabeled_size_train"])

    val_set = datasets.get_dataset(
        dataset_name=exp_dict["dataset_val"],
        data_root=exp_dict["dataset_val_root"],
        split="val",
        transform=exp_dict["transform_val"],
        classes=exp_dict["classes_val"],
        support_size=exp_dict["support_size_val"],
        query_size=exp_dict["query_size_val"],
        n_iters=exp_dict["val_iters"],
        unlabeled_size=exp_dict["unlabeled_size_val"])

    test_set = datasets.get_dataset(
        dataset_name=exp_dict["dataset_test"],
        data_root=exp_dict["dataset_test_root"],
        split="test",
        transform=exp_dict["transform_val"],
        classes=exp_dict["classes_test"],
        support_size=exp_dict["support_size_test"],
        query_size=exp_dict["query_size_test"],
        n_iters=exp_dict["test_iters"],
        unlabeled_size=exp_dict["unlabeled_size_test"])

    # get dataloaders
    # ==========================
    train_loader = torch.utils.data.DataLoader(
        train_set,
        batch_size=exp_dict["batch_size"],
        shuffle=True,
        num_workers=num_workers,
        collate_fn=ut.get_collate(exp_dict["collate_fn"]),
        drop_last=True)

    val_loader = torch.utils.data.DataLoader(
        val_set,
        batch_size=1,
        shuffle=False,
        num_workers=num_workers,
        collate_fn=lambda x: x,
        drop_last=True)

    test_loader = torch.utils.data.DataLoader(
        test_set,
        batch_size=1,
        shuffle=False,
        num_workers=num_workers,
        collate_fn=lambda x: x,
        drop_last=True)

    # create model and trainer
    # ==========================

    # Create model, opt, wrapper
    backbone = backbones.get_backbone(
        backbone_name=exp_dict['model']["backbone"],
        exp_dict=exp_dict)
    model = models.get_model(model_name=exp_dict["model"]['name'],
                             backbone=backbone,
                             n_classes=exp_dict["n_classes"],
                             exp_dict=exp_dict)

    if run_ssl:
        # runs the SSL experiments
        score_list_path = os.path.join(savedir, 'score_list.pkl')
        if not os.path.exists(score_list_path):
            test_dict = model.test_on_loader(test_loader, max_iter=None)
            hu.save_pkl(score_list_path, [test_dict])
        return

    # Checkpoint
    # -----------
    checkpoint_path = os.path.join(savedir, 'checkpoint.pth')
    score_list_path = os.path.join(savedir, 'score_list.pkl')

    if os.path.exists(score_list_path):
        # resume experiment
        model.load_state_dict(hu.torch_load(checkpoint_path))
        score_list = hu.load_pkl(score_list_path)
        s_epoch = score_list[-1]['epoch'] + 1
    else:
        # restart experiment
        score_list = []
        s_epoch = 0

    # Run training and validation
    for epoch in range(s_epoch, exp_dict["max_epoch"]):
        score_dict = {"epoch": epoch}
        score_dict.update(model.get_lr())

        # train
        score_dict.update(model.train_on_loader(train_loader))

        # validate
        score_dict.update(model.val_on_loader(val_loader))
        score_dict.update(model.test_on_loader(test_loader))

        # Add score_dict to score_list
        score_list += [score_dict]

        # Report
        score_df = pd.DataFrame(score_list)
        print(score_df.tail())

        # Save checkpoint
        hu.save_pkl(score_list_path, score_list)
        hu.torch_save(checkpoint_path, model.get_state_dict())
        print("Saved: %s" % savedir)

        if "accuracy" in exp_dict["target_loss"]:
            is_best = score_dict[exp_dict["target_loss"]] >= score_df[
                exp_dict["target_loss"]][:-1].max()
        else:
            is_best = score_dict[exp_dict["target_loss"]] <= score_df[
                exp_dict["target_loss"]][:-1].min()

        # Save best checkpoint
        if is_best:
            hu.save_pkl(os.path.join(savedir, "score_list_best.pkl"), score_list)
            hu.torch_save(os.path.join(savedir, "checkpoint_best.pth"),
                          model.get_state_dict())
            print("Saved Best: %s" % savedir)

        # Check for end of training conditions
        if model.is_end_of_training():
            break
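To make the best-checkpoint rule concrete, here is a toy illustration with made-up numbers; the column name `val_accuracy` is only a placeholder for whatever key `exp_dict["target_loss"]` holds:

import pandas as pd

score_list = [{"val_accuracy": 0.61}, {"val_accuracy": 0.68}, {"val_accuracy": 0.66}]
score_df = pd.DataFrame(score_list)
score_dict = score_list[-1]
target = "val_accuracy"

# "accuracy"-style targets must match or beat the best previous epoch
is_best = score_dict[target] >= score_df[target][:-1].max()
print(is_best)  # False: 0.66 does not beat the earlier best of 0.68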