def __call__(self, data_loader, epochs=10, checkpoint_every=10):
        """
        Run the full training loop on ``self.model``.

        The model is put in training mode, trained for ``epochs`` passes via
        ``self._train_epoch``, and finally switched back to evaluation mode.
        After every epoch the mean loss is logged, the per-epoch statistics
        are handed to ``self.losses_logger`` and, when a gif visualizer is
        attached, it is invoked once.

        Parameters
        ----------
        data_loader: torch.utils.data.DataLoader
            Loader forwarded unchanged to ``self._train_epoch``.

        epochs: int, optional
            Number of epochs to train the model for.

        checkpoint_every: int, optional
            A checkpoint named ``model-<epoch>.pt`` is written to
            ``self.save_dir`` whenever the zero-based epoch index is a
            multiple of this value (so the first epoch is always saved).
        """
        t_begin = default_timer()
        self.model.train()

        for epoch_idx in range(epochs):
            # Fresh container per epoch; filled in by `_train_epoch`.
            epoch_stats = defaultdict(list)
            avg_loss = self._train_epoch(data_loader, epoch_stats, epoch_idx)

            # Humans read one-based epoch numbers, the loggers get zero-based.
            summary = 'Epoch: {} Average loss per image: {:.2f}'.format(
                epoch_idx + 1, avg_loss)
            self.logger.info(summary)
            self.losses_logger.log(epoch_idx, epoch_stats)

            if self.gif_visualizer is not None:
                self.gif_visualizer()

            if epoch_idx % checkpoint_every != 0:
                continue

            checkpoint_name = "model-{}.pt".format(epoch_idx)
            save_model(self.model, self.save_dir, filename=checkpoint_name)

        if self.gif_visualizer is not None:
            self.gif_visualizer.save_reset()

        self.model.eval()

        # Elapsed wall-clock time, converted from seconds to minutes.
        elapsed_min = (default_timer() - t_begin) / 60
        self.logger.info(
            'Finished training after {:.1f} min.'.format(elapsed_min))
Exemple #2
0
def main(args):
    """Train a model and/or evaluate a saved one, as requested by ``args``.

    The function runs up to two phases:

    1. Training (skipped when ``args.is_eval_only`` is set): creates the
       experiment directory, builds the data loader, model, optimizer and
       loss, runs the ``Trainer`` and saves the model together with
       ``vars(args)`` as metadata.
    2. Evaluation (run when ``args.is_metrics`` is set or ``args.no_test`` is
       not set): reloads the model and metadata from the experiment
       directory and runs the ``Evaluator``.

    Parameters
    ----------
    args: argparse.Namespace
        Arguments. Note that ``batch_size``, ``epochs`` and ``img_size`` may
        be modified in place during the training phase.
    """
    # --- logging setup -----------------------------------------------------
    log_format = logging.Formatter(
        '%(asctime)s %(levelname)s - %(funcName)s: %(message)s', "%H:%M:%S")
    logger = logging.getLogger(__name__)
    level_name = args.log_level.upper()
    logger.setLevel(level_name)
    console = logging.StreamHandler()
    console.setLevel(level_name)
    console.setFormatter(log_format)
    logger.addHandler(console)

    # --- global experiment state -------------------------------------------
    set_seed(args.seed)
    device = get_device(is_gpu=not args.no_cuda)
    exp_dir = os.path.join(RES_DIR, args.name)
    logger.info("Root directory for saving and loading experiments: {}".format(
        exp_dir))

    # --- training phase ----------------------------------------------------
    if not args.is_eval_only:
        create_safe_directory(exp_dir, logger=logger)

        if args.loss == "factor":
            logger.info(
                "FactorVae needs 2 batches per iteration. To replicate this behavior while being consistent, we double the batch size and the the number of epochs."
            )
            args.batch_size *= 2
            args.epochs *= 2

        # Data
        train_loader = get_dataloaders(
            args.dataset, batch_size=args.batch_size, logger=logger)
        logger.info("Train {} with {} samples".format(
            args.dataset, len(train_loader.dataset)))

        # Model; the image size is stored on `args` so it ends up in the
        # metadata saved below.
        args.img_size = get_img_size(args.dataset)
        model = init_specific_model(
            args.model_type, args.img_size, args.latent_dim)
        logger.info('Num parameters in model: {}'.format(get_n_param(model)))

        # Optimizer, visualizer and loss
        optimizer = optim.Adam(model.parameters(), lr=args.lr)
        # Move the model first so trainer and visualizer share one device.
        model = model.to(device)
        gif_visualizer = GifTraversalsTraining(model, args.dataset, exp_dir)
        train_loss_f = get_loss_f(
            args.loss,
            n_data=len(train_loader.dataset),
            device=device,
            **vars(args))

        trainer = Trainer(
            model,
            optimizer,
            train_loss_f,
            device=device,
            logger=logger,
            save_dir=exp_dir,
            is_progress_bar=not args.no_progress_bar,
            gif_visualizer=gif_visualizer)
        trainer(
            train_loader,
            epochs=args.epochs,
            checkpoint_every=args.checkpoint_every)

        # Persist the final model alongside the experiment arguments.
        save_model(trainer.model, exp_dir, metadata=vars(args))

    # --- evaluation phase --------------------------------------------------
    if not args.is_metrics and args.no_test:
        # Neither metrics nor test losses were requested.
        return

    model = load_model(exp_dir, is_gpu=not args.no_cuda)
    metadata = load_metadata(exp_dir)

    # TODO: evaluation currently reuses the training dataset.
    test_loader = get_dataloaders(
        metadata["dataset"],
        batch_size=args.eval_batchsize,
        shuffle=False,
        logger=logger)
    eval_loss_f = get_loss_f(
        args.loss,
        n_data=len(test_loader.dataset),
        device=device,
        **vars(args))

    # Weights & Biases reporting is switched off here; flip the flag to
    # enable the block below.
    use_wandb = False
    if use_wandb:
        loss_name = args.loss
        wandb.init(project="atmlbetavae", config={"VAE_loss": args.loss})
        if loss_name == "betaH":
            wandb.config["Beta"] = eval_loss_f.beta

    evaluator = Evaluator(
        model,
        eval_loss_f,
        device=device,
        logger=logger,
        save_dir=exp_dir,
        is_progress_bar=not args.no_progress_bar,
        use_wandb=use_wandb)
    evaluator(
        test_loader,
        is_metrics=args.is_metrics,
        is_losses=not args.no_test)
                                device=device,
                                **vars(args))
            wandb.watch(model, optimizer, log="parameters", log_freq=1000)
            trainer = Trainer(model, optimizer, loss_f=loss_f,
                            loss_name=args.loss,
                            device=device,
                            logger=logger,
                            save_dir=exp_dir,
                            is_progress_bar=not args.no_progress_bar,
                            gif_visualizer=gif_visualizer)
            trainer(train_loader,
                    epochs=args.epochs,
                    checkpoint_every=args.checkpoint_every)

            # SAVE MODEL AND EXPERIMENT INFORMATION
            save_model(trainer.model, exp_dir, metadata=vars(args))

        if args.is_metrics or not args.no_test:
            model = load_model(exp_dir, is_gpu=not args.no_cuda)
            metadata = load_metadata(exp_dir)
            # TO-DO: currently uses train datatset
            test_loader = get_dataloaders(metadata["dataset"],
                                        batch_size=args.eval_batchsize,
                                        shuffle=False,
                                        logger=logger)
            loss_f = get_loss_f(args.loss,
                                n_data=len(test_loader.dataset),
                                device=device,
                                **vars(args))
            evaluator = Evaluator(model, loss_f,
                                device=device,