def __call__(self, data_loader, epochs=10, checkpoint_every=10):
    """Trains the model.

    Parameters
    ----------
    data_loader : torch.utils.data.DataLoader
        Data loader yielding the training batches.
    epochs : int, optional
        Number of epochs to train the model for.
    checkpoint_every : int, optional
        Save a checkpoint of the trained model every n epochs.
    """
    start = default_timer()
    self.model.train()
    for epoch in range(epochs):
        storer = defaultdict(list)
        mean_epoch_loss = self._train_epoch(data_loader, storer, epoch)
        self.logger.info('Epoch: {} Average loss per image: {:.2f}'.format(
            epoch + 1, mean_epoch_loss))
        self.losses_logger.log(epoch, storer)

        if self.gif_visualizer is not None:
            self.gif_visualizer()

        if epoch % checkpoint_every == 0:
            save_model(self.model, self.save_dir,
                       filename="model-{}.pt".format(epoch))

    if self.gif_visualizer is not None:
        self.gif_visualizer.save_reset()

    self.model.eval()

    delta_time = (default_timer() - start) / 60
    self.logger.info('Finished training after {:.1f} min.'.format(delta_time))
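# Usage sketch (illustrative, not part of the original module): how the
# __call__ above is typically driven. The toy dataset and the Trainer
# construction arguments here are assumptions for illustration; in this
# codebase the trainer is actually built inside main() below.
#
#     import torch
#     from torch.utils.data import DataLoader, TensorDataset
#
#     toy_images = torch.rand(64, 3, 32, 32)               # stand-in data
#     loader = DataLoader(TensorDataset(toy_images), batch_size=16)
#     trainer = Trainer(model, optimizer, loss_f, device=device,
#                       logger=logger, save_dir="results/demo")
#     trainer(loader, epochs=5, checkpoint_every=2)        # runs __call__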
def main(args):
    """Main train and evaluation function.

    Parameters
    ----------
    args : argparse.Namespace
        Parsed command-line arguments.
    """
    formatter = logging.Formatter('%(asctime)s %(levelname)s - %(funcName)s: %(message)s',
                                  "%H:%M:%S")
    logger = logging.getLogger(__name__)
    logger.setLevel(args.log_level.upper())
    stream = logging.StreamHandler()
    stream.setLevel(args.log_level.upper())
    stream.setFormatter(formatter)
    logger.addHandler(stream)

    set_seed(args.seed)
    device = get_device(is_gpu=not args.no_cuda)
    exp_dir = os.path.join(RES_DIR, args.name)
    logger.info("Root directory for saving and loading experiments: {}".format(exp_dir))

    if not args.is_eval_only:
        create_safe_directory(exp_dir, logger=logger)

        if args.loss == "factor":
            logger.info("FactorVae needs 2 batches per iteration. To replicate this "
                        "behavior while being consistent, we double the batch size "
                        "and the number of epochs.")
            args.batch_size *= 2
            args.epochs *= 2

        # PREPARES DATA
        train_loader = get_dataloaders(args.dataset,
                                       batch_size=args.batch_size,
                                       logger=logger)
        logger.info("Train {} with {} samples".format(args.dataset,
                                                      len(train_loader.dataset)))

        # PREPARES MODEL
        args.img_size = get_img_size(args.dataset)  # stored for metadata
        model = init_specific_model(args.model_type, args.img_size, args.latent_dim)
        logger.info('Num parameters in model: {}'.format(get_n_param(model)))

        # TRAINS
        optimizer = optim.Adam(model.parameters(), lr=args.lr)
        model = model.to(device)  # make sure trainer and visualizer are on the same device
        gif_visualizer = GifTraversalsTraining(model, args.dataset, exp_dir)
        loss_f = get_loss_f(args.loss,
                            n_data=len(train_loader.dataset),
                            device=device,
                            **vars(args))
        trainer = Trainer(model, optimizer, loss_f,
                          device=device,
                          logger=logger,
                          save_dir=exp_dir,
                          is_progress_bar=not args.no_progress_bar,
                          gif_visualizer=gif_visualizer)
        trainer(train_loader,
                epochs=args.epochs,
                checkpoint_every=args.checkpoint_every)

        # SAVE MODEL AND EXPERIMENT INFORMATION
        save_model(trainer.model, exp_dir, metadata=vars(args))

    if args.is_metrics or not args.no_test:
        model = load_model(exp_dir, is_gpu=not args.no_cuda)
        metadata = load_metadata(exp_dir)
        # TO-DO: currently uses train dataset
        test_loader = get_dataloaders(metadata["dataset"],
                                      batch_size=args.eval_batchsize,
                                      shuffle=False,
                                      logger=logger)
        loss_f = get_loss_f(args.loss,
                            n_data=len(test_loader.dataset),
                            device=device,
                            **vars(args))

        use_wandb = False
        if use_wandb:
            wandb.init(project="atmlbetavae", config={"VAE_loss": args.loss})
            if args.loss == "betaH":
                wandb.config["Beta"] = loss_f.beta

        evaluator = Evaluator(model, loss_f,
                              device=device,
                              logger=logger,
                              save_dir=exp_dir,
                              is_progress_bar=not args.no_progress_bar,
                              use_wandb=use_wandb)
        evaluator(test_loader,
                  is_metrics=args.is_metrics,
                  is_losses=not args.no_test)
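# Invocation sketch (hypothetical): main() reads everything from an
# argparse.Namespace. The attribute names below are inferred from the
# accesses inside main() above; the values are illustrative placeholders,
# not the repo's actual parser defaults.
#
#     import argparse
#
#     args = argparse.Namespace(
#         name="demo", dataset="dsprites", model_type="Burgess",
#         latent_dim=10, loss="betaH", lr=5e-4, batch_size=64,
#         epochs=100, checkpoint_every=10, eval_batchsize=512,
#         seed=1234, log_level="info", no_cuda=True,
#         no_progress_bar=True, is_eval_only=False,
#         is_metrics=False, no_test=False,
#     )
#     main(args)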