def test_equal_optimizer_update(device):
    """Verify that the parameters are the same after a few updates."""
    if device == "cuda" and not torch.cuda.is_available():
        print("No GPU available, skipping GPU test.")
        return
    x = torch.randn(1, 8).to(device)

    model_ref = nn.Sequential(*[nn.Linear(8, 8) for _ in range(10)])
    model_ref = model_ref.to(device)
    optimizer = torch.optim.SGD(model_ref.parameters(), lr=1e-3)

    model_c = deepcopy(model_ref)
    parameters_c = ContiguousParams(model_c.parameters())
    optimizer_c = torch.optim.SGD(parameters_c.contiguous(), lr=1e-3)

    for model, optimizer in zip([model_ref, model_c],
                                [optimizer, optimizer_c]):
        for step in range(5):
            loss = model(x).sum()
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()
    # Verify that the model/optimizer did not modify the data or grad handle.
    parameters_c.assert_buffer_is_valid()

    # Verify that both models applied the same parameter updates.
    for p1, p2 in zip(model_ref.parameters(), model_c.parameters()):
        assert torch.allclose(p1.data, p2.data, atol=1e-06)
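

# --- Hedged sketch (not taken from the examples on this page): the minimal
# workflow that the test above verifies, assuming the usual
# `from contiguous_params import ContiguousParams` import. Because the test
# requires both models to end up with identical weights, the original
# parameters must share storage with the contiguous buffer.
import torch
import torch.nn as nn
from contiguous_params import ContiguousParams

model = nn.Sequential(nn.Linear(8, 8), nn.ReLU(), nn.Linear(8, 8))
# Pack all parameters into one contiguous buffer.
parameters = ContiguousParams(model.parameters())
# The optimizer steps on the packed buffer instead of many small tensors.
optimizer = torch.optim.SGD(parameters.contiguous(), lr=1e-3)

for _ in range(5):
    loss = model(torch.randn(1, 8)).sum()
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()

# Raises if anything replaced the underlying data or grad handles.
parameters.assert_buffer_is_valid()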
class Model(pytorch_lightning.LightningModule):
    def __init__(self, use_contiguous):
        super().__init__()
        self.model = nn.Sequential(nn.Linear(10, 10), nn.ReLU(),
                                   nn.Linear(10, 5))
        self.use_contiguous = use_contiguous
        self.loss_fn = torch.nn.CrossEntropyLoss()
        self.dataset = TensorDataset(data_X, data_y)
        self.contiguous_params = None
        self.optimizer = None

    def forward(self, x):
        return self.model(x)

    def training_step(self, batch, batch_idx):
        x, target = batch
        prediction = self(x)
        loss_value = self.loss_fn(prediction, target)
        return {'loss': loss_value}

    def train_dataloader(self):
        return torch.utils.data.DataLoader(self.dataset,
                                           batch_size=2,
                                           shuffle=False)

    def configure_optimizers(self):
        if self.use_contiguous:
            self.contiguous_params = ContiguousParams(self.parameters())
            params = self.contiguous_params.contiguous()
        else:
            params = self.model.parameters()
        self.optimizer = torch.optim.SGD(params, lr=1e-3)
        return self.optimizer
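
    # --- Hedged addition (not part of the original example): to also validate
    # the contiguous buffer during Lightning training, one option is the
    # standard LightningModule `on_train_batch_end` hook. The body below is a
    # sketch, not code from this page.
    def on_train_batch_end(self, *args, **kwargs):
        if self.use_contiguous:
            self.contiguous_params.assert_buffer_is_valid()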
def test_buffer_invalidation_detection(device):
    """Verify that we recognize an invalidated buffer."""
    if device == "cuda" and not torch.cuda.is_available():
        print("No GPU available, skipping GPU test.")
        return
    model = nn.Linear(8, 8)
    parameters = ContiguousParams(model.parameters())
    assert parameters.buffer_is_valid()
    # Invalidate the buffer.
    model.weight.data = model.weight + 4
    assert not parameters.buffer_is_valid()
    with pytest.raises(ValueError):
        parameters.assert_buffer_is_valid()
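
# --- Hedged sketch (not part of the test above, reusing the same imports):
# the validity check appears to compare the parameters' data/grad handles
# against the contiguous buffer (see the "did not modify the data or grad
# handle" comment in the first test). Under that assumption, in-place updates
# keep the buffer valid, while re-assigning `.data` points the parameter at
# new storage.
model = nn.Linear(8, 8)
parameters = ContiguousParams(model.parameters())

model.weight.data.add_(4.0)            # in-place update: same storage, still valid
assert parameters.buffer_is_valid()

model.weight.data = model.weight + 4   # re-assignment: new storage, buffer invalidated
assert not parameters.buffer_is_valid()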
Example #6
    def __init__(self, args):
        super(Model, self).__init__()
        self.fusion = Decomposition()
        self.D = MultiscaleDiscriminator(input_nc=1)
        self.MSE_fun = nn.MSELoss()
        self.L1_loss = nn.L1Loss()
        self.SSIM_fun = SSIM()

        if args.contiguousparams:
            print("Using ContiguousParams")
            parametersF = ContiguousParams(self.fusion.parameters())
            parametersD = ContiguousParams(self.D.parameters())
            self.optimizer_G = optim.Adam(parametersF.contiguous(), lr=args.lr)
            self.optimizer_D = optim.Adam(parametersD.contiguous(), lr=args.lr)
        else:
            self.optimizer_G = optim.Adam(self.fusion.parameters(), lr=args.lr)
            self.optimizer_D = optim.Adam(self.D.parameters(), lr=args.lr)

        self.g1 = self.g2 = self.g3 = self.s = self.img_re = None
        self.loss = torch.zeros(1)
        self.scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
            self.optimizer_G,
            mode='min',
            factor=0.5,
            patience=2,
            verbose=False,
            threshold=0.0001,
            threshold_mode='rel',
            cooldown=0,
            min_lr=0,
            eps=1e-10)
        self.min_loss = 1000
        self.args = args
        self.downsample = downsample()
        self.criterionGAN = torch.nn.MSELoss()

        if args.multiGPU:
            self.mulgpus()
        self.load()
        self.load_D()
Example #7
class Trainer:
    def __init__(self,
                 model,
                 train_loader,
                 val_loader,
                 criterion,
                 optimizer,
                 gpu=None,
                 output_file='checkpoint.pth',
                 acc_threshold=0.05,
                 configwb=None):

        self.model = model
        self.configwb = configwb
        self.train_loader = train_loader
        self.val_loader = val_loader

        self.criterion = criterion
        self.optimizer = optimizer
        self.acc_threshold = acc_threshold

        # Output file
        self.output_file = output_file

        # Initialize
        self.example_ct = 0
        self.step = 0
        self.start_epoch = 0
        self.epoch = 0
        self.train_loss = 0
        self.train_loss_recorder = []
        self.val_loss_recorder = []
        self.min_loss = math.inf
        self.gpu = gpu
        self._params = None
        self.lr_recorder, self.loss_recorder = [], []
        self.set_device(gpu)
        self._reset_opt(self.optimizer.defaults['lr'])

    def set_device(self, gpu):
        """
        Move tensor objects to the target GPU
        Args:
            gpu (int): index of the target GPU device
        """
        if isinstance(gpu, int):
            if not torch.cuda.is_available():
                raise AssertionError(
                    "PyTorch cannot access your GPU. Please investigate!")
            if gpu >= torch.cuda.device_count():
                raise ValueError("Invalid device index")
            torch.cuda.set_device(gpu)
            self.model = self.model.cuda()
            if isinstance(self.criterion, torch.nn.Module):
                self.criterion = self.criterion.cuda()

    def save(self, output_file):
        """
        Save a trainer checkpoint
        Args:
            output_file (str): destination file path
        """

        torch.save(self.model.state_dict(), output_file)

    def load(self, state):
        """
        Resume from a trainer state
        Args:
            state (dict): checkpoint dictionary
        """
        self.start_epoch = state['epoch']
        self.epoch = self.start_epoch
        self.step = state['step']
        self.min_loss = state['min_loss']
        self.optimizer.load_state_dict(state['optimizer'])
        self.model.load_state_dict(state['model'])

    def _fit_epoch(self, freeze_until, mb):
        """
        Fit a single epoch
        Args:
            freeze_until (str): last layer to freeze
            mb (fastprogress.master_bar): primary progress bar
        """
        # self.model = freeze_bn(self.model.train())
        self.train_loss = 0
        self.model.train()
        pb = progress_bar(self.train_loader, parent=mb)
        for x, target in pb:
            x, target = self.to_cuda(x, target)
            self.example_ct += x.shape[0]
            # Forward
            batch_loss = self._get_loss(x, target)
            self.train_loss += batch_loss.item()

            # Backprop
            self._backprop_step(batch_loss)
            # Update LR
            self.scheduler.step()
            pb.comment = f"Training loss: {batch_loss.item():.4}"

            self.step += 1

        self.epoch += 1
        # print(self.train_loss,len(self.train_loader),self.train_loss/len(self.train_loader))
        self.train_loss /= len(self.train_loader)
        self.train_loss_recorder.append(self.train_loss)

    def to_cuda(self, x, target):
        """Move input and target to GPU !"""
        if isinstance(self.gpu, int):
            if self.gpu >= torch.cuda.device_count():
                raise ValueError("Invalid device index")
            return self._to_cuda(x, target)
        else:
            return x, target

    @staticmethod
    def _to_cuda(x, target):
        """Move input and target to GPU !"""
        x = x.cuda(non_blocking=True)
        target = target.cuda(non_blocking=True)
        return x, target

    def _backprop_step(self, loss):
        # Clean gradients
        self.optimizer.zero_grad()
        # Backpropagate the loss
        loss.backward()
        # Update the params
        self.optimizer.step()

    def _get_loss(self, x, target):
        # Forward
        out = self.model(x)
        # Loss computation
        return 1 - self.criterion(out, target)

    def _set_params(self):
        self._params = ContiguousParams(
            [p for p in self.model.parameters() if p.requires_grad])

    def _reset_opt(self, lr):
        """Reset the target params of the optimizer !"""
        self.optimizer.defaults['lr'] = lr
        self.optimizer.state = defaultdict(dict)
        self.optimizer.param_groups = []
        self._set_params()
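        # _set_params() rebuilt self._params as a fresh ContiguousParams, so the
        # single param group added below owns a newly packed contiguous buffer.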
        self.optimizer.add_param_group(dict(params=self._params.contiguous()))

    @torch.no_grad()
    def evaluate(self):
        raise NotImplementedError

    @staticmethod
    def _eval_metrics_str(eval_metrics):
        raise NotImplementedError

    def _reset_scheduler(self, lr, num_epochs, sched_type='onecycle'):
        if sched_type == 'onecycle':
            self.scheduler = OneCycleLR(self.optimizer, lr,
                                        num_epochs * len(self.train_loader))
        elif sched_type == 'cosine':
            self.scheduler = CosineAnnealingLR(self.optimizer,
                                               num_epochs *
                                               len(self.train_loader),
                                               eta_min=lr / 25e4)
        else:
            raise ValueError(
                f"The following scheduler type is not supported: {sched_type}")

    def fit_n_epochs(self,
                     num_epochs,
                     lr,
                     freeze_until=None,
                     sched_type='onecycle'):
        """
        Train the model for a given number of epochs
        Args:
            num_epochs (int): number of epochs to train
            lr (float): learning rate to be used by the scheduler
            freeze_until (str, optional): last layer to freeze
            sched_type (str, optional): type of scheduler to use
        """

        if self.configwb:
            wandb.watch(self.criterion, log="all", log_freq=10)

        self.epoch = 0
        self.train_loss_recorder = []
        self.val_loss_recorder = []

        self.model = freeze_model(self.model, freeze_until)
        # Update param groups & LR
        self._reset_opt(lr)
        # Scheduler
        self._reset_scheduler(lr, num_epochs, sched_type)

        mb = master_bar(range(num_epochs))
        for _ in mb:

            self._fit_epoch(freeze_until, mb)
            # Check whether ops invalidated the buffer
            self._params.assert_buffer_is_valid()
            eval_metrics = self.evaluate()

            # master bar
            mb.main_bar.comment = f"Epoch {self.start_epoch + self.epoch}/{self.start_epoch + num_epochs}"
            mb.write(
                f"Epoch {self.start_epoch + self.epoch}/{self.start_epoch + num_epochs} - "
                f"{self._eval_metrics_str(eval_metrics)}")

            self.save(self.output_file)
Example #8
class BaseTrainer:
    """
    Base class for all trainers
    """
    def __init__(self, model, loss, optimizer, resume, config):
        self.config = config
        self.logger = logging.getLogger(self.__class__.__name__)

        # set up the GPU device if available and move the model onto it
        self.device, device_ids = self._prepare_device(config['n_gpu'])
        self.model = model.to(self.device)
        self.params = ContiguousParams(self.model.parameters())
        self.optimizer = optimizer[0](self.params.contiguous(), **optimizer[1])

        if len(device_ids) > 1:
            self.model = torch.nn.DataParallel(model, device_ids=device_ids)

        self.loss = loss
        self.amp_enabled = config['trainer']['amp']
        self.scaler = torch.cuda.amp.GradScaler(enabled=self.amp_enabled)

        self.steps = config['trainer']['steps']
        self.save_freq = config['trainer']['save_freq']
        self.verbosity = config['trainer']['verbosity']
        self.start_step = 0
        self.inference = config['trainer'].get('inference', False)

        # setup directory for checkpoint saving
        start_time = datetime.datetime.now().strftime('%m%d_%H%M%S')
        self.checkpoint_dir = os.path.join(config['trainer']['save_dir'],
                                           config['name'], start_time)
        # setup visualization writer instance
        writer_dir = os.path.join(config['visualization']['log_dir'],
                                  config['name'], start_time)
        self.writer = WriterTensorboardX(
            writer_dir, self.logger, config['visualization']['tensorboardX'])

        # Save configuration file into checkpoint directory:
        ensure_dir(self.checkpoint_dir)
        config_save_path = os.path.join(self.checkpoint_dir, 'config.json')
        with open(config_save_path, 'w') as handle:
            json.dump(config, handle, indent=4, sort_keys=False)

        if resume:
            self._resume_checkpoint(resume)

    def _prepare_device(self, n_gpu_use):
        """ 
        setup GPU device if available, move model into configured device
        """
        n_gpu = torch.cuda.device_count()
        if n_gpu_use > 0 and n_gpu == 0:
            self.logger.warning(
                "Warning: There is no GPU available on this machine; "
                "training will be performed on the CPU.")
            n_gpu_use = 0
        if n_gpu_use > n_gpu:
            msg = "Warning: The number of GPU\'s configured to use is {}, but only {} are available on this machine.".format(
                n_gpu_use, n_gpu)
            self.logger.warning(msg)
            n_gpu_use = n_gpu

        if n_gpu_use:
            torch.backends.cudnn.benchmark = True

        device = torch.device('cuda:0' if n_gpu_use > 0 else 'cpu')
        list_ids = list(range(n_gpu_use))
        return device, list_ids

    def train(self):
        """
        Full training logic
        """
        raise NotImplementedError

    def _save_checkpoint(self, step):
        """
        Saving checkpoints

        :param step: current step number
        """
        arch = type(self.model).__name__
        state = {
            'arch': arch,
            'step': step,
            'state_dict': self.model.state_dict(),
            'optimizer': self.optimizer.state_dict(),
            'scaler': self.scaler.state_dict(),
            'config': self.config
        }
        filename = os.path.join(self.checkpoint_dir,
                                'checkpoint-step{}.pth'.format(step))
        torch.save(state, filename)
        self.logger.info("Saving checkpoint: {} ...".format(filename))

    def _resume_checkpoint(self, resume_path):
        """
        Resume from saved checkpoints

        :param resume_path: Checkpoint path to be resumed
        """
        self.logger.info("Loading checkpoint: {} ...".format(resume_path))
        checkpoint = torch.load(resume_path)
        self.start_step = checkpoint['step']
        # self.monitor_best = checkpoint['monitor_best']

        # load architecture params from checkpoint.
        if checkpoint['config']['arch'] != self.config['arch']:
            self.logger.warning(
                'Warning: Architecture configuration given in config file is different from that of checkpoint. ' + \
                'This may yield an exception while state_dict is being loaded.')
        self.model.load_state_dict(checkpoint['state_dict'], False)
        self.scaler.load_state_dict(checkpoint['scaler'])

        # load optimizer state from checkpoint only when optimizer type is not changed.
        if checkpoint['config']['optimizer']['type'] != self.config[
                'optimizer']['type']:
            self.logger.warning('Warning: Optimizer type given in config file is different from that of checkpoint. ' + \
                                'Optimizer parameters not being resumed.')
        else:
            self.optimizer.load_state_dict(checkpoint['optimizer'])

        self.logger.info("Checkpoint '{}' (step {}) loaded".format(
            resume_path, self.start_step))
Example #10
class Trainer:

    def __init__(self, model, train_loader, val_loader, criterion, optimizer,
                 gpu=None, output_file='./checkpoint.pth', acc_threshold=0.5, wb=None):

        self.model = model
        self.wb = wb
        self.train_loader = train_loader
        self.val_loader = val_loader
        self.criterion = criterion
        self.optimizer = optimizer
        self.acc_threshold = acc_threshold

        # Output file
        self.output_file = output_file

        # Initialize
        self.example_ct=0
        self.step = 0
        self.start_epoch = 0
        self.epoch = 0
        self.train_loss = 0
        self.train_loss_recorder = []
        self.val_loss_recorder = []
        self.min_loss = math.inf
        self.gpu = gpu
        self._params = None
        self.lr_recorder, self.loss_recorder = [], []
        self.set_device(gpu)
        self._reset_opt(self.optimizer.defaults['lr'])

    def set_device(self, gpu):
        """
        Move tensor objects to the target GPU

        Args:
            gpu (int): index of the target GPU device
        """
        if isinstance(gpu, int):
            if not torch.cuda.is_available():
                raise AssertionError("PyTorch cannot access your GPU. Please investigate!")
            if gpu >= torch.cuda.device_count():
                raise ValueError("Invalid device index")
            torch.cuda.set_device(gpu)
            self.model = self.model.cuda()
            if isinstance(self.criterion, torch.nn.Module):
                self.criterion = self.criterion.cuda()

    def save(self, output_file):
        """
        Save a trainer checkpoint

        Args:
            output_file (str): destination file path
        """
        if self.wb:
            torch.save(self.model.state_dict(), os.path.join(wandb.run.dir, output_file))
        else:
            torch.save(self.model.state_dict(), output_file)


    def _fit_epoch(self, freeze_until, mb):
        """
        Fit a single epoch

        Args:
            freeze_until (str): last layer to freeze
            mb (fastprogress.master_bar): primary progress bar
        """
        # self.model = freeze_bn(self.model.train())
        self.train_loss = 0
        self.model.train()
        pb = progress_bar(self.train_loader, parent=mb)
        for x, target in pb:
            x, target = self.to_cuda(x, target)
            self.example_ct +=  x.shape[0]
            # Forward
            batch_loss = self._get_loss(x, target)
            self.train_loss += batch_loss.item()

            # Backprop
            self._backprop_step(batch_loss)
            # Update LR
            self.scheduler.step()
            pb.comment = f"Training loss: {batch_loss.item():.4}"

            self.step += 1

            # Report metrics every 5th step
            if self.step % 5 == 0:
                # where the magic happens
                if self.wb:
                    wandb.log({"epoch": self.epoch, "train_loss": batch_loss.item()}, step=self.example_ct)
   

        self.epoch += 1
        # print(self.train_loss,len(self.train_loader),self.train_loss/len(self.train_loader))
        self.train_loss /= len(self.train_loader)
        self.train_loss_recorder.append(self.train_loss)

    def to_cuda(self, x, target):
        """Move input and target to GPU !"""
        if isinstance(self.gpu, int):
            if self.gpu >= torch.cuda.device_count():
                raise ValueError("Invalid device index")
            return self._to_cuda(x, target)
        else:
            return x, target

    @staticmethod
    def _to_cuda(x, target):
        """Move input and target to GPU !"""
        x = x.cuda(non_blocking=True)
        target = target.cuda(non_blocking=True)
        return x, target

    def _backprop_step(self, loss):
        # Clean gradients
        self.optimizer.zero_grad()
        # Backpropagate the loss
        loss.backward()
        # Update the params
        self.optimizer.step()

    def _get_loss(self, x, target):
        # Forward
        out = self.model(x)
        # Loss computation
        return self.criterion(out, target)

    def _set_params(self):
        self._params = ContiguousParams([p for p in self.model.parameters() if p.requires_grad])

    def _reset_opt(self, lr):
        """Reset the target params of the optimizer !"""
        self.optimizer.defaults['lr'] = lr
        self.optimizer.state = defaultdict(dict)
        self.optimizer.param_groups = []
        self._set_params()
        self.optimizer.add_param_group(dict(params=self._params.contiguous()))

    @torch.no_grad()
    def evaluate(self):
        raise NotImplementedError

    @staticmethod
    def _eval_metrics_str(eval_metrics):
        raise NotImplementedError

    def _reset_scheduler(self, lr, num_epochs, sched_type='onecycle'):
        if sched_type == 'onecycle':
            self.scheduler = OneCycleLR(self.optimizer, lr, num_epochs * len(self.train_loader))
        elif sched_type == 'cosine':
            self.scheduler = CosineAnnealingLR(self.optimizer, num_epochs * len(self.train_loader), eta_min=lr / 25e4)
        else:
            raise ValueError(f"The following scheduler type is not supported: {sched_type}")

    def fit_n_epochs(self, num_epochs, lr, freeze_until=None, sched_type='onecycle'):
        """
        Train the model for a given number of epochs

        Args:
            num_epochs (int): number of epochs to train
            lr (float): learning rate to be used by the scheduler
            freeze_until (str, optional): last layer to freeze
            sched_type (str, optional): type of scheduler to use
        """

        if self.wb:
          wandb.watch(self.model, self.criterion, log="all", log_freq=10)

        self.epoch = 0
        self.train_loss_recorder = []
        self.val_loss_recorder = []

        self.model = freeze_model(self.model, freeze_until)
        # Update param groups & LR
        self._reset_opt(lr)
        # Scheduler
        self._reset_scheduler(lr, num_epochs, sched_type)

        mb = master_bar(range(num_epochs))
        for _ in mb:

            self._fit_epoch(freeze_until, mb)
            # Check whether ops invalidated the buffer
            self._params.assert_buffer_is_valid()
            eval_metrics = self.evaluate()

            # master bar
            mb.main_bar.comment = f"Epoch {self.start_epoch + self.epoch}/{self.start_epoch + num_epochs}"
            mb.write(f"Epoch {self.start_epoch + self.epoch}/{self.start_epoch + num_epochs} - "
                     f"{self._eval_metrics_str(eval_metrics)}")

            if eval_metrics['val_loss'] < self.min_loss:
                print(f"Validation loss decreased {self.min_loss:.4} --> "
                      f"{eval_metrics['val_loss']:.4}: saving state...")
                self.min_loss = eval_metrics['val_loss']
                if self.wb:
                    wandb.log({"best_val_loss": self.min_loss})
                    wandb.log({"best_val_acc": eval_metrics['acc1']})
                self.save(self.output_file)

    def lr_find(self, freeze_until=None, start_lr=1e-7, end_lr=1, num_it=100):
        """
        Gridsearch the optimal learning rate for the training

        Args:
           freeze_until (str, optional): last layer to freeze
           start_lr (float, optional): initial learning rate
           end_lr (float, optional): final learning rate
           num_it (int, optional): number of iterations to perform
        """
        if len(self.train_loader) < num_it:
            print("Can't reach", num_it, "iterations, num_it is now", len(self.train_loader))
            num_it = len(self.train_loader)

        self.model = freeze_model(self.model.train(), freeze_until)
        # Update param groups & LR
        self._reset_opt(start_lr)
        gamma = (end_lr / start_lr) ** (1 / (num_it - 1))
        scheduler = MultiplicativeLR(self.optimizer, lambda step: gamma)

        self.lr_recorder = [start_lr * gamma ** idx for idx in range(num_it)]
        self.loss_recorder = []

        for batch_idx, (x, target) in enumerate(self.train_loader):
            x, target = self.to_cuda(x, target)

            # Forward
            batch_loss = self._get_loss(x, target)
            self._backprop_step(batch_loss)
            # Update LR
            scheduler.step()

            # Record
            self.loss_recorder.append(batch_loss.item())
            # Stop after the number of iterations
            if batch_idx + 1 == num_it:
                break

    def showBatch(self, nb_images=None, nrow=4, fig_size_X=15, fig_size_Y=15, normalize=True):

        x, target = next(iter(self.train_loader))

        if nb_images:
            x = x[:nb_images, :, :, :]
            target = target[:nb_images]

        images = make_grid(x, nrow=nrow)  # the default nrow is 8

        # Inverse normalize the images
        inv_normalize = transforms.Normalize(
            mean=[-0.485 / 0.229, -0.456 / 0.224, -0.406 / 0.225],
            std=[1 / 0.229, 1 / 0.224, 1 / 0.225]
        )
        if normalize:
            im_inv = inv_normalize(images)
        else:
            im_inv = images

        # Print the images
        plt.figure(figsize=(fig_size_X, fig_size_Y))
        plt.imshow(np.transpose(im_inv.numpy(), (1, 2, 0)))

    def plot_recorder(self, beta=0.95, block=True):
        """
        Display the results of the LR grid search

        Args:
            beta (float, optional): smoothing factor
            block (bool, optional): whether the plot should block execution
        """
        if len(self.lr_recorder) != len(self.loss_recorder) or len(self.lr_recorder) == 0:
            raise AssertionError("Please run the `lr_find` method first")

        # Exp moving average of loss
        smoothed_losses = []
        avg_loss = 0
        for idx, loss in enumerate(self.loss_recorder):
            avg_loss = beta * avg_loss + (1 - beta) * loss
            smoothed_losses.append(avg_loss / (1 - beta ** (idx + 1)))

        plt.plot(self.lr_recorder[10:-5], smoothed_losses[10:-5])
        plt.xscale('log')
        plt.xlabel('Learning Rate')
        plt.ylabel('Training loss')
        plt.grid(True, linestyle='--', axis='x')
        plt.show(block=block)

    def plot_losses(self):

        plt.plot(self.val_loss_recorder, label='val loss')
        plt.plot(self.train_loss_recorder, label='train loss')
        plt.xlabel('Epoch')
        plt.ylabel('Loss')
        plt.legend(loc='lower right')
        plt.show()

    def check_setup(self, freeze_until=None, lr=3e-4, num_it=100):
        """
        Check whether you can overfit one batch

        Args:
            freeze_until (str, optional): last layer to freeze
            lr (float, optional): learning rate to be used for training
            num_it (int, optional): number of iterations to perform
        """
        self.model = freeze_model(self.model.train(), freeze_until)
        # Update param groups & LR
        self._reset_opt(lr)

        prev_loss = math.inf

        x, target = next(iter(self.train_loader))
        x, target = self.to_cuda(x, target)

        for _ in range(num_it):
            # Forward
            batch_loss = self._get_loss(x, target)
            # Backprop
            self._backprop_step(batch_loss)

            # Check that loss decreases
            if batch_loss.item() > prev_loss:
                return False
            prev_loss = batch_loss.item()

        return True
Example #11
                    optimizer.zero_grad()
                if device == 'cuda':
                    torch.cuda.synchronize()
                step_times.append(time.time() - start)
            print(f"Mean step time: {sum(step_times) / 10} seconds. "
                  f"(Autograd profiler enabled: {profile_autograd})")
    prof.export_chrome_trace(f"{name}_timeline.json")


if __name__ == "__main__":
    device = "cuda" if torch.cuda.is_available() else 'cpu'
    model = nn.Sequential(*[nn.Linear(128, 128)
                          for i in range(100)]).to(device)
    print("Number of parameters: ", sum(p.numel() for p in model.parameters()))
    x = torch.randn(1, 128).to(device)

    model_copies = [deepcopy(model) for _ in range(2)]

    # Benchmark original.
    parameters = list(model_copies[0].parameters())
    optimizer = torch.optim.Adam(parameters)
    benchmark_model(model_copies[0], optimizer, parameters, "original_params")

    # Benchmark contiguous.
    parameters = ContiguousParams(model_copies[1].parameters())
    optimizer = torch.optim.Adam(parameters.contiguous())
    benchmark_model(model_copies[1], optimizer, parameters.parameters(),
                    "contiguous_params")
    # Ensure the parameter buffers are still valid.
    parameters.assert_buffer_is_valid()
Example #12
class Trainer:

    def __init__(
        self,
        model: nn.Module,
        train_loader: DataLoader,
        val_loader: DataLoader,
        criterion: nn.Module,
        optimizer: torch.optim.Optimizer,
        gpu: Optional[int] = None,
        output_file: str = './checkpoint.pth'
    ) -> None:
        self.model = model
        self.train_loader = train_loader
        self.val_loader = val_loader
        self.criterion = criterion
        self.optimizer = optimizer

        # Output file
        self.output_file = output_file

        # Initialize
        self.step = 0
        self.start_epoch = 0
        self.epoch = 0
        self.min_loss = math.inf
        self.gpu = gpu
        self._params: Optional[ContiguousParams] = None
        self.lr_recorder: List[float] = []
        self.loss_recorder: List[float] = []
        self.set_device(gpu)
        self._reset_opt(self.optimizer.defaults['lr'])

    def set_device(self, gpu: Optional[int] = None) -> None:
        """Move tensor objects to the target GPU

        Args:
            gpu: index of the target GPU device
        """
        if isinstance(gpu, int):
            if not torch.cuda.is_available():
                raise AssertionError("PyTorch cannot access your GPU. Please investigate!")
            if gpu >= torch.cuda.device_count():
                raise ValueError("Invalid device index")
            torch.cuda.set_device(gpu)
            self.model = self.model.cuda()
            if isinstance(self.criterion, torch.nn.Module):
                self.criterion = self.criterion.cuda()

    def save(self, output_file: str) -> None:
        """Save a trainer checkpoint

        Args:
            output_file: destination file path
        """
        torch.save(dict(epoch=self.epoch, step=self.step, min_loss=self.min_loss,
                        optimizer=self.optimizer.state_dict(),
                        model=self.model.state_dict()),
                   output_file,
                   _use_new_zipfile_serialization=False)

    def load(self, state: Dict[str, Any]) -> None:
        """Resume from a trainer state

        Args:
            state (dict): checkpoint dictionary
        """
        self.start_epoch = state['epoch']
        self.epoch = self.start_epoch
        self.step = state['step']
        self.min_loss = state['min_loss']
        self.optimizer.load_state_dict(state['optimizer'])
        self.model.load_state_dict(state['model'])

    def _fit_epoch(self, mb: ConsoleMasterBar) -> None:
        """Fit a single epoch

        Args:
            mb (fastprogress.master_bar): primary progress bar
        """
        self.model = freeze_bn(self.model.train())

        pb = progress_bar(self.train_loader, parent=mb)
        for x, target in pb:
            x, target = self.to_cuda(x, target)

            # Forward
            batch_loss = self._get_loss(x, target)

            # Backprop
            self._backprop_step(batch_loss)
            # Update LR
            self.scheduler.step()
            pb.comment = f"Training loss: {batch_loss.item():.4}"

            self.step += 1
        self.epoch += 1

    def to_cuda(
        self,
        x: Tensor,
        target: Union[Tensor, List[Dict[str, Tensor]]]
    ) -> Tuple[Tensor, Union[Tensor, List[Dict[str, Tensor]]]]:
        """Move input and target to GPU"""
        if isinstance(self.gpu, int):
            if self.gpu >= torch.cuda.device_count():
                raise ValueError("Invalid device index")
            return self._to_cuda(x, target)  # type: ignore[arg-type]
        else:
            return x, target

    @staticmethod
    def _to_cuda(x: Tensor, target: Tensor) -> Tuple[Tensor, Tensor]:
        """Move input and target to GPU"""
        x = x.cuda(non_blocking=True)
        target = target.cuda(non_blocking=True)
        return x, target

    def _backprop_step(self, loss: Tensor) -> None:
        # Clean gradients
        self.optimizer.zero_grad()
        # Backpropagate the loss
        loss.backward()
        # Update the params
        self.optimizer.step()

    def _get_loss(self, x: Tensor, target: Tensor) -> Tensor:
        # Forward
        out = self.model(x)
        # Loss computation
        return self.criterion(out, target)

    def _set_params(self) -> None:
        self._params = ContiguousParams([p for p in self.model.parameters() if p.requires_grad])

    def _reset_opt(self, lr: float) -> None:
        """Reset the target params of the optimizer"""
        self.optimizer.defaults['lr'] = lr
        self.optimizer.state = defaultdict(dict)
        self.optimizer.param_groups = []
        self._set_params()
        self.optimizer.add_param_group(dict(params=self._params.contiguous()))  # type: ignore[union-attr]

    @torch.no_grad()
    def evaluate(self):
        raise NotImplementedError

    @staticmethod
    def _eval_metrics_str(eval_metrics):
        raise NotImplementedError

    def _reset_scheduler(self, lr: float, num_epochs: int, sched_type: str = 'onecycle') -> None:
        if sched_type == 'onecycle':
            self.scheduler = OneCycleLR(self.optimizer, lr, num_epochs * len(self.train_loader))
        elif sched_type == 'cosine':
            self.scheduler = CosineAnnealingLR(self.optimizer, num_epochs * len(self.train_loader), eta_min=lr / 25e4)
        else:
            raise ValueError(f"The following scheduler type is not supported: {sched_type}")

    def fit_n_epochs(
        self,
        num_epochs: int,
        lr: float,
        freeze_until: Optional[str] = None,
        sched_type: str = 'onecycle'
    ) -> None:
        """Train the model for a given number of epochs

        Args:
            num_epochs (int): number of epochs to train
            lr (float): learning rate to be used by the scheduler
            freeze_until (str, optional): last layer to freeze
            sched_type (str, optional): type of scheduler to use
        """

        self.model = freeze_model(self.model.train(), freeze_until)
        # Update param groups & LR
        self._reset_opt(lr)
        # Scheduler
        self._reset_scheduler(lr, num_epochs, sched_type)

        mb = master_bar(range(num_epochs))
        for _ in mb:

            self._fit_epoch(mb)
            # Check whether ops invalidated the buffer
            self._params.assert_buffer_is_valid()  # type: ignore[union-attr]
            eval_metrics = self.evaluate()

            # master bar
            mb.main_bar.comment = f"Epoch {self.start_epoch + self.epoch}/{self.start_epoch + num_epochs}"
            mb.write(f"Epoch {self.start_epoch + self.epoch}/{self.start_epoch + num_epochs} - "
                     f"{self._eval_metrics_str(eval_metrics)}")

            if eval_metrics['val_loss'] < self.min_loss:
                print(f"Validation loss decreased {self.min_loss:.4} --> "
                      f"{eval_metrics['val_loss']:.4}: saving state...")
                self.min_loss = eval_metrics['val_loss']
                self.save(self.output_file)

    def lr_find(
        self,
        freeze_until: Optional[str] = None,
        start_lr: float = 1e-7,
        end_lr: float = 1,
        num_it: int = 100
    ) -> None:
        """Gridsearch the optimal learning rate for the training

        Args:
           freeze_until (str, optional): last layer to freeze
           start_lr (float, optional): initial learning rate
           end_lr (float, optional): final learning rate
           num_it (int, optional): number of iterations to perform
        """

        self.model = freeze_model(self.model.train(), freeze_until)
        # Update param groups & LR
        self._reset_opt(start_lr)
        gamma = (end_lr / start_lr) ** (1 / (num_it - 1))
        scheduler = MultiplicativeLR(self.optimizer, lambda step: gamma)

        self.lr_recorder = [start_lr * gamma ** idx for idx in range(num_it)]
        self.loss_recorder = []

        for batch_idx, (x, target) in enumerate(self.train_loader):
            x, target = self.to_cuda(x, target)

            # Forward
            batch_loss = self._get_loss(x, target)
            self._backprop_step(batch_loss)
            # Update LR
            scheduler.step()

            # Record
            self.loss_recorder.append(batch_loss.item())
            # Stop after the number of iterations
            if batch_idx + 1 == num_it:
                break

    def plot_recorder(self, beta: float = 0.95, block: bool = True) -> None:
        """Display the results of the LR grid search

        Args:
            beta (float, optional): smoothing factor
            block (bool, optional): whether the plot should block execution
        """

        if len(self.lr_recorder) != len(self.loss_recorder) or len(self.lr_recorder) == 0:
            raise AssertionError("Please run the `lr_find` method first")

        # Exp moving average of loss
        smoothed_losses = []
        avg_loss = 0.
        for idx, loss in enumerate(self.loss_recorder):
            avg_loss = beta * avg_loss + (1 - beta) * loss
            smoothed_losses.append(avg_loss / (1 - beta ** (idx + 1)))

        plt.plot(self.lr_recorder[10:-5], smoothed_losses[10:-5])
        plt.xscale('log')
        plt.xlabel('Learning Rate')
        plt.ylabel('Training loss')
        plt.grid(True, linestyle='--', axis='x')
        plt.show(block=block)

    def check_setup(self, freeze_until: Optional[str] = None, lr: float = 3e-4, num_it: int = 100) -> bool:
        """Check whether you can overfit one batch

        Args:
            freeze_until (str, optional): last layer to freeze
            lr (float, optional): learning rate to be used for training
            num_it (int, optional): number of iterations to perform
        """

        self.model = freeze_model(self.model.train(), freeze_until)
        # Update param groups & LR
        self._reset_opt(lr)

        prev_loss = math.inf

        x, target = next(iter(self.train_loader))
        x, target = self.to_cuda(x, target)

        for _ in range(num_it):
            # Forward
            batch_loss = self._get_loss(x, target)
            # Backprop
            self._backprop_step(batch_loss)

            # Check that loss decreases
            if batch_loss.item() > prev_loss:
                return False
            prev_loss = batch_loss.item()

        return True