def test_equal_optimizer_update(device):
    """Verify that the parameters are the same after a few updates."""
    if device == "cuda" and not torch.cuda.is_available():
        print("No GPU available, skipping GPU test.")
        return
    x = torch.randn(1, 8).to(device)
    model_ref = nn.Sequential(*[nn.Linear(8, 8) for i in range(10)])
    model_ref = model_ref.to(device)
    optimizer = torch.optim.SGD(model_ref.parameters(), lr=1e-3)
    model_c = deepcopy(model_ref)
    parameters_c = ContiguousParams(model_c.parameters())
    optimizer_c = torch.optim.SGD(parameters_c.contiguous(), lr=1e-3)
    for model, optimizer in zip([model_ref, model_c], [optimizer, optimizer_c]):
        for step in range(5):
            loss = model(x).sum()
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()
    # Verify that the model/optimizer did not modify the data or grad handle.
    parameters_c.assert_buffer_is_valid()
    # Verify that both models applied the same parameter updates.
    for p1, p2 in zip(model_ref.parameters(), model_c.parameters()):
        assert torch.allclose(p1.data, p2.data, atol=1e-06)
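The test above exercises the core usage pattern of ContiguousParams: wrap the module parameters once, hand the contiguous views to the optimizer, and validate the buffer after training. A minimal, self-contained sketch of that pattern (the import path `contiguous_params` is assumed from the package name):

# Minimal sketch of the usage pattern exercised by the test above.
# Assumes the `contiguous_params` package providing ContiguousParams.
import torch
import torch.nn as nn
from contiguous_params import ContiguousParams

model = nn.Sequential(nn.Linear(8, 8), nn.ReLU(), nn.Linear(8, 8))

# Wrap the parameters once, then pass the contiguous views to the optimizer.
parameters = ContiguousParams(model.parameters())
optimizer = torch.optim.SGD(parameters.contiguous(), lr=1e-3)

x = torch.randn(4, 8)
for _ in range(3):
    loss = model(x).sum()
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()

# Raises if any op replaced a parameter's data/grad tensor instead of updating it in place.
parameters.assert_buffer_is_valid()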
class Model(pytorch_lightning.LightningModule):

    def __init__(self, use_contiguous):
        super().__init__()
        self.model = nn.Sequential(nn.Linear(10, 10), nn.ReLU(), nn.Linear(10, 5))
        self.use_contiguous = use_contiguous
        self.loss_fn = torch.nn.CrossEntropyLoss()
        self.dataset = TensorDataset(data_X, data_y)
        self.contiguous_params = None
        self.optimizer = None

    def forward(self, x):
        return self.model(x)

    def training_step(self, batch, batch_idx):
        x, target = batch
        prediction = self(x)
        loss_value = self.loss_fn(prediction, target)
        return {'loss': loss_value}

    def train_dataloader(self):
        return torch.utils.data.DataLoader(self.dataset, batch_size=2, shuffle=False)

    def configure_optimizers(self):
        if self.use_contiguous:
            self.contiguous_params = ContiguousParams(self.parameters())
            params = self.contiguous_params.contiguous()
        else:
            params = self.model.parameters()
        self.optimizer = torch.optim.SGD(params, lr=1e-3)
        return self.optimizer
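A short sketch of how this LightningModule might be driven. The shapes of `data_X`/`data_y` are assumptions (the class only requires a 10-feature input and integer class labels for CrossEntropyLoss), and everything is assumed to live in one script so the class body can see the two globals:

# Hypothetical driver for the LightningModule above; data shapes are assumed.
import torch
import pytorch_lightning

data_X = torch.randn(8, 10)           # assumed: 8 samples, 10 features
data_y = torch.randint(0, 5, (8,))    # assumed: class indices for CrossEntropyLoss

model = Model(use_contiguous=True)
trainer = pytorch_lightning.Trainer(max_epochs=1)
trainer.fit(model)

# The contiguous buffer should still be intact after training.
model.contiguous_params.assert_buffer_is_valid()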
def test_buffer_invalidation_detection(device):
    """Verify that we recognize an invalidated buffer."""
    if device == "cuda" and not torch.cuda.is_available():
        print("No GPU available, skipping GPU test.")
        return
    model = nn.Linear(8, 8)
    parameters = ContiguousParams(model.parameters())
    assert parameters.buffer_is_valid()
    # Invalidate the buffer.
    model.weight.data = model.weight + 4
    assert not parameters.buffer_is_valid()
    with pytest.raises(ValueError):
        parameters.assert_buffer_is_valid()
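The invalidation above comes from rebinding `weight.data` to a freshly allocated tensor; an in-place write keeps the parameter inside the shared buffer. A small sketch of the distinction, under the same assumed `contiguous_params` import:

# Sketch: in-place update vs. rebinding, and how ContiguousParams reacts.
import torch.nn as nn
from contiguous_params import ContiguousParams

model = nn.Linear(8, 8)
params = ContiguousParams(model.parameters())

model.weight.data.add_(4)              # in-place: still a view into the contiguous buffer
assert params.buffer_is_valid()

model.weight.data = model.weight + 4   # rebinding: allocates a new tensor outside the buffer
assert not params.buffer_is_valid()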
def __init__(self, args):
    super(Model, self).__init__()
    self.fusion = Decomposition()
    self.D = MultiscaleDiscriminator(input_nc=1)
    self.MSE_fun = nn.MSELoss()
    self.L1_loss = nn.L1Loss()
    self.SSIM_fun = SSIM()
    if args.contiguousparams == True:
        print("ContiguousParams---")
        parametersF = ContiguousParams(self.fusion.parameters())
        parametersD = ContiguousParams(self.D.parameters())
        self.optimizer_G = optim.Adam(parametersF.contiguous(), lr=args.lr)
        self.optimizer_D = optim.Adam(parametersD.contiguous(), lr=args.lr)
    else:
        self.optimizer_G = optim.Adam(self.fusion.parameters(), lr=args.lr)
        self.optimizer_D = optim.Adam(self.D.parameters(), lr=args.lr)
    self.g1 = self.g2 = self.g3 = self.s = self.img_re = None
    self.loss = torch.zeros(1)
    self.scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
        self.optimizer_G, mode='min', factor=0.5, patience=2, verbose=False,
        threshold=0.0001, threshold_mode='rel', cooldown=0, min_lr=0, eps=1e-10)
    self.min_loss = 1000
    self.args = args
    self.downsample = downsample()
    self.criterionGAN = torch.nn.MSELoss()
    if args.multiGPU:
        self.mulgpus()
    self.load()
    self.load_D()
class Trainer:

    def __init__(self, model, train_loader, val_loader, criterion, optimizer,
                 gpu=None, output_file='checkpoint.pth', acc_threshold=0.05,
                 configwb=None):
        self.model = model
        self.configwb = configwb
        self.train_loader = train_loader
        self.val_loader = val_loader
        self.criterion = criterion
        self.optimizer = optimizer
        self.acc_threshold = acc_threshold
        # Output file
        self.output_file = output_file
        # Initialize
        self.example_ct = 0
        self.step = 0
        self.start_epoch = 0
        self.epoch = 0
        self.train_loss = 0
        self.train_loss_recorder = []
        self.val_loss_recorder = []
        self.min_loss = math.inf
        self.gpu = gpu
        self._params = None
        self.lr_recorder, self.loss_recorder = [], []
        self.set_device(gpu)
        self._reset_opt(self.optimizer.defaults['lr'])

    def set_device(self, gpu):
        """Move tensor objects to the target GPU

        Args:
            gpu (int): index of the target GPU device
        """
        if isinstance(gpu, int):
            if not torch.cuda.is_available():
                raise AssertionError(
                    "PyTorch cannot access your GPU. Please investigate!")
            if gpu >= torch.cuda.device_count():
                raise ValueError("Invalid device index")
            torch.cuda.set_device(gpu)
            self.model = self.model.cuda()
            if isinstance(self.criterion, torch.nn.Module):
                self.criterion = self.criterion.cuda()

    def save(self, output_file):
        """Save a trainer checkpoint

        Args:
            output_file (str): destination file path
        """
        torch.save(self.model.state_dict(), output_file)

    def load(self, state):
        """Resume from a trainer state

        Args:
            state (dict): checkpoint dictionary
        """
        self.start_epoch = state['epoch']
        self.epoch = self.start_epoch
        self.step = state['step']
        self.min_loss = state['min_loss']
        self.optimizer.load_state_dict(state['optimizer'])
        self.model.load_state_dict(state['model'])

    def _fit_epoch(self, freeze_until, mb):
        """Fit a single epoch

        Args:
            freeze_until (str): last layer to freeze
            mb (fastprogress.master_bar): primary progress bar
        """
        # self.model = freeze_bn(self.model.train())
        self.train_loss = 0
        self.model.train()
        pb = progress_bar(self.train_loader, parent=mb)
        for x, target in pb:
            x, target = self.to_cuda(x, target)
            self.example_ct += x.shape[0]
            # Forward
            batch_loss = self._get_loss(x, target)
            self.train_loss += batch_loss.item()
            # Backprop
            self._backprop_step(batch_loss)
            # Update LR
            self.scheduler.step()
            pb.comment = f"Training loss: {batch_loss.item():.4}"
            self.step += 1
        self.epoch += 1
        # print(self.train_loss, len(self.train_loader), self.train_loss / len(self.train_loader))
        self.train_loss /= len(self.train_loader)
        self.train_loss_recorder.append(self.train_loss)

    def to_cuda(self, x, target):
        """Move input and target to GPU."""
        if isinstance(self.gpu, int):
            if self.gpu >= torch.cuda.device_count():
                raise ValueError("Invalid device index")
            return self._to_cuda(x, target)
        else:
            return x, target

    @staticmethod
    def _to_cuda(x, target):
        """Move input and target to GPU."""
        x = x.cuda(non_blocking=True)
        target = target.cuda(non_blocking=True)
        return x, target

    def _backprop_step(self, loss):
        # Clean gradients
        self.optimizer.zero_grad()
        # Backpropagate the loss
        loss.backward()
        # Update the params
        self.optimizer.step()

    def _get_loss(self, x, target):
        # Forward
        out = self.model(x)
        # Loss computation
        return 1 - self.criterion(out, target)

    def _set_params(self):
        self._params = ContiguousParams(
            [p for p in self.model.parameters() if p.requires_grad])

    def _reset_opt(self, lr):
        """Reset the target params of the optimizer."""
        self.optimizer.defaults['lr'] = lr
        self.optimizer.state = defaultdict(dict)
        self.optimizer.param_groups = []
        self._set_params()
        self.optimizer.add_param_group(dict(params=self._params.contiguous()))

    @torch.no_grad()
    def evaluate(self):
        raise NotImplementedError

    @staticmethod
    def _eval_metrics_str(eval_metrics):
        raise NotImplementedError

    def _reset_scheduler(self, lr, num_epochs, sched_type='onecycle'):
        if sched_type == 'onecycle':
            self.scheduler = OneCycleLR(self.optimizer, lr,
                                        num_epochs * len(self.train_loader))
        elif sched_type == 'cosine':
            self.scheduler = CosineAnnealingLR(self.optimizer,
                                               num_epochs * len(self.train_loader),
                                               eta_min=lr / 25e4)
        else:
            raise ValueError(
                f"The following scheduler type is not supported: {sched_type}")

    def fit_n_epochs(self, num_epochs, lr, freeze_until=None, sched_type='onecycle'):
        """Train the model for a given number of epochs

        Args:
            num_epochs (int): number of epochs to train
            lr (float): learning rate to be used by the scheduler
            freeze_until (str, optional): last layer to freeze
            sched_type (str, optional): type of scheduler to use
        """
        if self.configwb:
            wandb.watch(self.criterion, log="all", log_freq=10)
        self.epoch = 0
        self.train_loss_recorder = []
        self.val_loss_recorder = []
        self.model = freeze_model(self.model, freeze_until)
        # Update param groups & LR
        self._reset_opt(lr)
        # Scheduler
        self._reset_scheduler(lr, num_epochs, sched_type)
        mb = master_bar(range(num_epochs))
        for _ in mb:
            self._fit_epoch(freeze_until, mb)
            # Check whether ops invalidated the buffer
            self._params.assert_buffer_is_valid()
            eval_metrics = self.evaluate()
            # master bar
            mb.main_bar.comment = f"Epoch {self.start_epoch + self.epoch}/{self.start_epoch + num_epochs}"
            mb.write(f"Epoch {self.start_epoch + self.epoch}/{self.start_epoch + num_epochs} - "
                     f"{self._eval_metrics_str(eval_metrics)}")
            self.save(self.output_file)
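The `_reset_opt` pattern above is the key interaction between layer freezing and ContiguousParams: once the set of trainable parameters changes, the old flat buffer no longer matches, so the optimizer state is cleared, a fresh wrapper is built over the remaining trainable parameters, and its contiguous views are registered as a new param group. A minimal standalone sketch of that idea; `freeze_first_layer` is a stand-in helper, not the trainer's `freeze_model`:

# Minimal sketch: rebuild the ContiguousParams wrapper after freezing layers,
# then re-register the contiguous views with the optimizer.
from collections import defaultdict

import torch
import torch.nn as nn
from contiguous_params import ContiguousParams

def freeze_first_layer(model):
    # Stand-in freeze helper for illustration only.
    for p in model[0].parameters():
        p.requires_grad_(False)
    return model

model = nn.Sequential(nn.Linear(8, 8), nn.ReLU(), nn.Linear(8, 4))
optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)

model = freeze_first_layer(model)

# Drop the stale param groups/state, then wrap only the trainable parameters.
optimizer.state = defaultdict(dict)
optimizer.param_groups = []
params = ContiguousParams([p for p in model.parameters() if p.requires_grad])
optimizer.add_param_group(dict(params=params.contiguous()))

loss = model(torch.randn(2, 8)).sum()
loss.backward()
optimizer.step()
optimizer.zero_grad()
params.assert_buffer_is_valid()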
class BaseTrainer:
    """
    Base class for all trainers
    """

    def __init__(self, model, loss, optimizer, resume, config):
        self.config = config
        self.logger = logging.getLogger(self.__class__.__name__)

        # setup GPU device if available, move model into configured device
        self.device, device_ids = self._prepare_device(config['n_gpu'])
        self.model = model.to(self.device)
        self.params = ContiguousParams(self.model.parameters())
        self.optimizer = optimizer[0](self.params.contiguous(), **optimizer[1])
        if len(device_ids) > 1:
            self.model = torch.nn.DataParallel(model, device_ids=device_ids)

        self.loss = loss
        self.amp_enabled = config['trainer']['amp']
        self.scaler = torch.cuda.amp.GradScaler(enabled=self.amp_enabled)
        self.steps = config['trainer']['steps']
        self.save_freq = config['trainer']['save_freq']
        self.verbosity = config['trainer']['verbosity']
        self.start_step = 0
        self.inference = config['trainer'].get('inference', False)

        # setup directory for checkpoint saving
        start_time = datetime.datetime.now().strftime('%m%d_%H%M%S')
        self.checkpoint_dir = os.path.join(config['trainer']['save_dir'],
                                           config['name'], start_time)

        # setup visualization writer instance
        writer_dir = os.path.join(config['visualization']['log_dir'],
                                  config['name'], start_time)
        self.writer = WriterTensorboardX(
            writer_dir, self.logger, config['visualization']['tensorboardX'])

        # Save configuration file into checkpoint directory:
        ensure_dir(self.checkpoint_dir)
        config_save_path = os.path.join(self.checkpoint_dir, 'config.json')
        with open(config_save_path, 'w') as handle:
            json.dump(config, handle, indent=4, sort_keys=False)

        if resume:
            self._resume_checkpoint(resume)

    def _prepare_device(self, n_gpu_use):
        """
        setup GPU device if available, move model into configured device
        """
        n_gpu = torch.cuda.device_count()
        if n_gpu_use > 0 and n_gpu == 0:
            self.logger.warning(
                "Warning: There's no GPU available on this machine, "
                "training will be performed on CPU.")
            n_gpu_use = 0
        if n_gpu_use > n_gpu:
            msg = ("Warning: The number of GPUs configured to use is {}, "
                   "but only {} are available on this machine.").format(n_gpu_use, n_gpu)
            self.logger.warning(msg)
            n_gpu_use = n_gpu
        if n_gpu_use:
            torch.backends.cudnn.benchmark = True
        device = torch.device('cuda:0' if n_gpu_use > 0 else 'cpu')
        list_ids = list(range(n_gpu_use))
        return device, list_ids

    def train(self):
        """
        Full training logic
        """
        raise NotImplementedError

    def _save_checkpoint(self, step):
        """
        Saving checkpoints

        :param step: current step number
        """
        arch = type(self.model).__name__
        state = {
            'arch': arch,
            'step': step,
            'state_dict': self.model.state_dict(),
            'optimizer': self.optimizer.state_dict(),
            'scaler': self.scaler.state_dict(),
            'config': self.config
        }
        filename = os.path.join(self.checkpoint_dir,
                                'checkpoint-step{}.pth'.format(step))
        torch.save(state, filename)
        self.logger.info("Saving checkpoint: {} ...".format(filename))

    def _resume_checkpoint(self, resume_path):
        """
        Resume from saved checkpoints

        :param resume_path: Checkpoint path to be resumed
        """
        self.logger.info("Loading checkpoint: {} ...".format(resume_path))
        checkpoint = torch.load(resume_path)
        self.start_step = checkpoint['step']
        # self.monitor_best = checkpoint['monitor_best']

        # load architecture params from checkpoint.
        if checkpoint['config']['arch'] != self.config['arch']:
            self.logger.warning(
                'Warning: Architecture configuration given in config file is '
                'different from that of checkpoint. This may yield an exception '
                'while state_dict is being loaded.')
        self.model.load_state_dict(checkpoint['state_dict'], False)
        self.scaler.load_state_dict(checkpoint['scaler'])

        # load optimizer state from checkpoint only when optimizer type is not changed.
        if checkpoint['config']['optimizer']['type'] != self.config['optimizer']['type']:
            self.logger.warning(
                'Warning: Optimizer type given in config file is different from '
                'that of checkpoint. Optimizer parameters not being resumed.')
        else:
            self.optimizer.load_state_dict(checkpoint['optimizer'])

        self.logger.info("Checkpoint '{}' (step {}) loaded".format(
            resume_path, self.start_step))
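Note that this trainer consumes its `optimizer` argument as `optimizer[0](self.params.contiguous(), **optimizer[1])`, i.e. a (class, kwargs) pair rather than an instantiated optimizer, so the optimizer is only built once the contiguous views exist. A short, hypothetical illustration of that spec (the model and hyperparameters are placeholders):

# Hypothetical (class, kwargs) optimizer spec as consumed by BaseTrainer.__init__.
import torch
import torch.nn as nn
from contiguous_params import ContiguousParams

model = nn.Linear(16, 4)
optimizer_spec = (torch.optim.Adam, {'lr': 3e-4, 'weight_decay': 1e-5})

params = ContiguousParams(model.parameters())
optimizer = optimizer_spec[0](params.contiguous(), **optimizer_spec[1])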
class Trainer:

    def __init__(self, model, train_loader, val_loader, criterion, optimizer,
                 gpu=None, output_file='./checkpoint.pth', acc_threshold=0.5, wb=None):
        self.model = model
        self.wb = wb
        self.train_loader = train_loader
        self.val_loader = val_loader
        self.criterion = criterion
        self.optimizer = optimizer
        self.acc_threshold = acc_threshold
        # Output file
        self.output_file = output_file
        # Initialize
        self.example_ct = 0
        self.step = 0
        self.start_epoch = 0
        self.epoch = 0
        self.train_loss = 0
        self.train_loss_recorder = []
        self.val_loss_recorder = []
        self.min_loss = math.inf
        self.gpu = gpu
        self._params = None
        self.lr_recorder, self.loss_recorder = [], []
        self.set_device(gpu)
        self._reset_opt(self.optimizer.defaults['lr'])

    def set_device(self, gpu):
        """Move tensor objects to the target GPU

        Args:
            gpu (int): index of the target GPU device
        """
        if isinstance(gpu, int):
            if not torch.cuda.is_available():
                raise AssertionError("PyTorch cannot access your GPU. Please investigate!")
            if gpu >= torch.cuda.device_count():
                raise ValueError("Invalid device index")
            torch.cuda.set_device(gpu)
            self.model = self.model.cuda()
            if isinstance(self.criterion, torch.nn.Module):
                self.criterion = self.criterion.cuda()

    def save(self, output_file):
        """Save a trainer checkpoint

        Args:
            output_file (str): destination file path
        """
        if self.wb:
            torch.save(self.model.state_dict(), os.path.join(wandb.run.dir, output_file))
        else:
            torch.save(self.model.state_dict(), output_file)

    def _fit_epoch(self, freeze_until, mb):
        """Fit a single epoch

        Args:
            freeze_until (str): last layer to freeze
            mb (fastprogress.master_bar): primary progress bar
        """
        # self.model = freeze_bn(self.model.train())
        self.train_loss = 0
        self.model.train()
        pb = progress_bar(self.train_loader, parent=mb)
        for x, target in pb:
            x, target = self.to_cuda(x, target)
            self.example_ct += x.shape[0]
            # Forward
            batch_loss = self._get_loss(x, target)
            self.train_loss += batch_loss.item()
            # Backprop
            self._backprop_step(batch_loss)
            # Update LR
            self.scheduler.step()
            pb.comment = f"Training loss: {batch_loss.item():.4}"
            self.step += 1
            # Report metrics every 5th batch
            if self.step % 5 == 0:
                # where the magic happens
                if self.wb:
                    wandb.log({"epoch": self.epoch, "train_loss": batch_loss.item()},
                              step=self.example_ct)
        self.epoch += 1
        # print(self.train_loss, len(self.train_loader), self.train_loss / len(self.train_loader))
        self.train_loss /= len(self.train_loader)
        self.train_loss_recorder.append(self.train_loss)

    def to_cuda(self, x, target):
        """Move input and target to GPU."""
        if isinstance(self.gpu, int):
            if self.gpu >= torch.cuda.device_count():
                raise ValueError("Invalid device index")
            return self._to_cuda(x, target)
        else:
            return x, target

    @staticmethod
    def _to_cuda(x, target):
        """Move input and target to GPU."""
        x = x.cuda(non_blocking=True)
        target = target.cuda(non_blocking=True)
        return x, target

    def _backprop_step(self, loss):
        # Clean gradients
        self.optimizer.zero_grad()
        # Backpropagate the loss
        loss.backward()
        # Update the params
        self.optimizer.step()

    def _get_loss(self, x, target):
        # Forward
        out = self.model(x)
        # Loss computation
        return self.criterion(out, target)

    def _set_params(self):
        self._params = ContiguousParams([p for p in self.model.parameters() if p.requires_grad])

    def _reset_opt(self, lr):
        """Reset the target params of the optimizer."""
        self.optimizer.defaults['lr'] = lr
        self.optimizer.state = defaultdict(dict)
        self.optimizer.param_groups = []
        self._set_params()
        self.optimizer.add_param_group(dict(params=self._params.contiguous()))

    @torch.no_grad()
    def evaluate(self):
        raise NotImplementedError

    @staticmethod
    def _eval_metrics_str(eval_metrics):
        raise NotImplementedError

    def _reset_scheduler(self, lr, num_epochs, sched_type='onecycle'):
        if sched_type == 'onecycle':
            self.scheduler = OneCycleLR(self.optimizer, lr, num_epochs * len(self.train_loader))
        elif sched_type == 'cosine':
            self.scheduler = CosineAnnealingLR(self.optimizer, num_epochs * len(self.train_loader),
                                               eta_min=lr / 25e4)
        else:
            raise ValueError(f"The following scheduler type is not supported: {sched_type}")

    def fit_n_epochs(self, num_epochs, lr, freeze_until=None, sched_type='onecycle'):
        """Train the model for a given number of epochs

        Args:
            num_epochs (int): number of epochs to train
            lr (float): learning rate to be used by the scheduler
            freeze_until (str, optional): last layer to freeze
            sched_type (str, optional): type of scheduler to use
        """
        if self.wb:
            wandb.watch(self.model, self.criterion, log="all", log_freq=10)
        self.epoch = 0
        self.train_loss_recorder = []
        self.val_loss_recorder = []
        self.model = freeze_model(self.model, freeze_until)
        # Update param groups & LR
        self._reset_opt(lr)
        # Scheduler
        self._reset_scheduler(lr, num_epochs, sched_type)
        mb = master_bar(range(num_epochs))
        for _ in mb:
            self._fit_epoch(freeze_until, mb)
            # Check whether ops invalidated the buffer
            self._params.assert_buffer_is_valid()
            eval_metrics = self.evaluate()
            # master bar
            mb.main_bar.comment = f"Epoch {self.start_epoch + self.epoch}/{self.start_epoch + num_epochs}"
            mb.write(f"Epoch {self.start_epoch + self.epoch}/{self.start_epoch + num_epochs} - "
                     f"{self._eval_metrics_str(eval_metrics)}")
            if eval_metrics['val_loss'] < self.min_loss:
                print(f"Validation loss decreased {self.min_loss:.4} --> "
                      f"{eval_metrics['val_loss']:.4}: saving state...")
                self.min_loss = eval_metrics['val_loss']
                if self.wb:
                    wandb.log({"best_val_loss": self.min_loss})
                    wandb.log({"best_val_acc": eval_metrics['acc1']})
                self.save(self.output_file)

    def lr_find(self, freeze_until=None, start_lr=1e-7, end_lr=1, num_it=100):
        """Gridsearch the optimal learning rate for the training

        Args:
            freeze_until (str, optional): last layer to freeze
            start_lr (float, optional): initial learning rate
            end_lr (float, optional): final learning rate
            num_it (int, optional): number of iterations to perform
        """
        if len(self.train_loader) < num_it:
            print("Can't reach", num_it, "iterations, num_it is now", len(self.train_loader))
            num_it = len(self.train_loader)
        self.model = freeze_model(self.model.train(), freeze_until)
        # Update param groups & LR
        self._reset_opt(start_lr)
        gamma = (end_lr / start_lr) ** (1 / (num_it - 1))
        scheduler = MultiplicativeLR(self.optimizer, lambda step: gamma)
        self.lr_recorder = [start_lr * gamma ** idx for idx in range(num_it)]
        self.loss_recorder = []
        for batch_idx, (x, target) in enumerate(self.train_loader):
            x, target = self.to_cuda(x, target)
            # Forward
            batch_loss = self._get_loss(x, target)
            self._backprop_step(batch_loss)
            # Update LR
            scheduler.step()
            # Record
            self.loss_recorder.append(batch_loss.item())
            # Stop after the number of iterations
            if batch_idx + 1 == num_it:
                break

    def showBatch(self, nb_images=None, nrow=4, fig_size_X=15, fig_size_Y=15, normalize=True):
        x, target = next(iter(self.train_loader))
        if nb_images:
            x = x[:nb_images, :, :, :]
            target = target[:nb_images]
        images = make_grid(x, nrow=nrow)  # the default nrow is 8
        # Inverse normalize the images
        inv_normalize = transforms.Normalize(
            mean=[-0.485 / 0.229, -0.456 / 0.224, -0.406 / 0.225],
            std=[1 / 0.229, 1 / 0.224, 1 / 0.225]
        )
        if normalize:
            im_inv = inv_normalize(images)
        else:
            im_inv = images
        # Print the images
        plt.figure(figsize=(fig_size_X, fig_size_Y))
        plt.imshow(np.transpose(im_inv.numpy(), (1, 2, 0)))

    def plot_recorder(self, beta=0.95, block=True):
        """Display the results of the LR grid search

        Args:
            beta (float, optional): smoothing factor
            block (bool, optional): whether the plot should block execution
        """
        if len(self.lr_recorder) != len(self.loss_recorder) or len(self.lr_recorder) == 0:
            raise AssertionError("Please run the `lr_find` method first")
        # Exp moving average of loss
        smoothed_losses = []
        avg_loss = 0
        for idx, loss in enumerate(self.loss_recorder):
            avg_loss = beta * avg_loss + (1 - beta) * loss
            smoothed_losses.append(avg_loss / (1 - beta ** (idx + 1)))
        plt.plot(self.lr_recorder[10:-5], smoothed_losses[10:-5])
        plt.xscale('log')
        plt.xlabel('Learning Rate')
        plt.ylabel('Training loss')
        plt.grid(True, linestyle='--', axis='x')
        plt.show(block=block)

    def plot_losses(self):
        plt.plot(self.val_loss_recorder, label='val loss')
        plt.plot(self.train_loss_recorder, label='train loss')
        plt.xlabel('Epoch')
        plt.ylabel('Loss')
        plt.legend(loc='lower right')
        plt.show()

    def check_setup(self, freeze_until=None, lr=3e-4, num_it=100):
        """Check whether you can overfit one batch

        Args:
            freeze_until (str, optional): last layer to freeze
            lr (float, optional): learning rate to be used for training
            num_it (int, optional): number of iterations to perform
        """
        self.model = freeze_model(self.model.train(), freeze_until)
        # Update param groups & LR
        self._reset_opt(lr)
        prev_loss = math.inf
        x, target = next(iter(self.train_loader))
        x, target = self.to_cuda(x, target)
        for _ in range(num_it):
            # Forward
            batch_loss = self._get_loss(x, target)
            # Backprop
            self._backprop_step(batch_loss)
            # Check that loss decreases
            if batch_loss.item() > prev_loss:
                return False
            prev_loss = batch_loss.item()
        return True
# Tail of a `benchmark_model` timing loop; the beginning of the function is not shown.
        optimizer.zero_grad()
        if device == 'cuda':
            torch.cuda.synchronize()
        step_times.append(time.time() - start)
    print(f"Mean step time: {sum(step_times) / 10} seconds. "
          f"(Autograd profiler enabled: {profile_autograd})")
    prof.export_chrome_trace(f"{name}_timeline.json")


if __name__ == "__main__":
    device = "cuda" if torch.cuda.is_available() else 'cpu'
    model = nn.Sequential(*[nn.Linear(128, 128) for i in range(100)]).to(device)
    print("Number of parameters: ", sum(p.numel() for p in model.parameters()))
    x = torch.randn(1, 128).to(device)
    model_copies = [deepcopy(model) for _ in range(2)]

    # Benchmark original.
    parameters = list(model_copies[0].parameters())
    optimizer = torch.optim.Adam(parameters)
    benchmark_model(model_copies[0], optimizer, parameters, "original_params")

    # Benchmark contiguous.
    parameters = ContiguousParams(model_copies[1].parameters())
    optimizer = torch.optim.Adam(parameters.contiguous())
    benchmark_model(model_copies[1], optimizer, parameters.parameters(), "contiguous_params")

    # Ensure the parameter buffers are still valid.
    parameters.assert_buffer_is_valid()
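The snippet above starts partway through `benchmark_model`, so the timing harness itself is not shown. Below is a hedged sketch of what such a harness could look like; this is not the original function, and the profiler context, warm-up policy, and loop length are assumptions made only to match the visible fragment:

# Hedged sketch of a timing harness compatible with the fragment above (assumptions noted).
import time

import torch
import torch.nn as nn
import torch.autograd.profiler as profiler

device = "cuda" if torch.cuda.is_available() else "cpu"

def benchmark_model(model, optimizer, parameters, name, profile_autograd=False):
    x = torch.randn(1, 128).to(device)
    step_times = []
    with profiler.profile(enabled=profile_autograd, use_cuda=(device == 'cuda')) as prof:
        for _ in range(10):
            start = time.time()
            loss = model(x).sum()
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()
            if device == 'cuda':
                torch.cuda.synchronize()
            step_times.append(time.time() - start)
    print(f"Mean step time: {sum(step_times) / 10} seconds. "
          f"(Autograd profiler enabled: {profile_autograd})")
    if profile_autograd:
        prof.export_chrome_trace(f"{name}_timeline.json")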
class Trainer:

    def __init__(
        self,
        model: nn.Module,
        train_loader: DataLoader,
        val_loader: DataLoader,
        criterion: nn.Module,
        optimizer: torch.optim.Optimizer,
        gpu: Optional[int] = None,
        output_file: str = './checkpoint.pth'
    ) -> None:
        self.model = model
        self.train_loader = train_loader
        self.val_loader = val_loader
        self.criterion = criterion
        self.optimizer = optimizer
        # Output file
        self.output_file = output_file
        # Initialize
        self.step = 0
        self.start_epoch = 0
        self.epoch = 0
        self.min_loss = math.inf
        self.gpu = gpu
        self._params: Optional[ContiguousParams] = None
        self.lr_recorder: List[float] = []
        self.loss_recorder: List[float] = []
        self.set_device(gpu)
        self._reset_opt(self.optimizer.defaults['lr'])

    def set_device(self, gpu: Optional[int] = None) -> None:
        """Move tensor objects to the target GPU

        Args:
            gpu: index of the target GPU device
        """
        if isinstance(gpu, int):
            if not torch.cuda.is_available():
                raise AssertionError("PyTorch cannot access your GPU. Please investigate!")
            if gpu >= torch.cuda.device_count():
                raise ValueError("Invalid device index")
            torch.cuda.set_device(gpu)
            self.model = self.model.cuda()
            if isinstance(self.criterion, torch.nn.Module):
                self.criterion = self.criterion.cuda()

    def save(self, output_file: str) -> None:
        """Save a trainer checkpoint

        Args:
            output_file: destination file path
        """
        torch.save(dict(epoch=self.epoch, step=self.step, min_loss=self.min_loss,
                        optimizer=self.optimizer.state_dict(),
                        model=self.model.state_dict()),
                   output_file, _use_new_zipfile_serialization=False)

    def load(self, state: Dict[str, Any]) -> None:
        """Resume from a trainer state

        Args:
            state (dict): checkpoint dictionary
        """
        self.start_epoch = state['epoch']
        self.epoch = self.start_epoch
        self.step = state['step']
        self.min_loss = state['min_loss']
        self.optimizer.load_state_dict(state['optimizer'])
        self.model.load_state_dict(state['model'])

    def _fit_epoch(self, mb: ConsoleMasterBar) -> None:
        """Fit a single epoch

        Args:
            mb (fastprogress.master_bar): primary progress bar
        """
        self.model = freeze_bn(self.model.train())
        pb = progress_bar(self.train_loader, parent=mb)
        for x, target in pb:
            x, target = self.to_cuda(x, target)
            # Forward
            batch_loss = self._get_loss(x, target)
            # Backprop
            self._backprop_step(batch_loss)
            # Update LR
            self.scheduler.step()
            pb.comment = f"Training loss: {batch_loss.item():.4}"
            self.step += 1
        self.epoch += 1

    def to_cuda(
        self,
        x: Tensor,
        target: Union[Tensor, List[Dict[str, Tensor]]]
    ) -> Tuple[Tensor, Union[Tensor, List[Dict[str, Tensor]]]]:
        """Move input and target to GPU"""
        if isinstance(self.gpu, int):
            if self.gpu >= torch.cuda.device_count():
                raise ValueError("Invalid device index")
            return self._to_cuda(x, target)  # type: ignore[arg-type]
        else:
            return x, target

    @staticmethod
    def _to_cuda(x: Tensor, target: Tensor) -> Tuple[Tensor, Tensor]:
        """Move input and target to GPU"""
        x = x.cuda(non_blocking=True)
        target = target.cuda(non_blocking=True)
        return x, target

    def _backprop_step(self, loss: Tensor) -> None:
        # Clean gradients
        self.optimizer.zero_grad()
        # Backpropagate the loss
        loss.backward()
        # Update the params
        self.optimizer.step()

    def _get_loss(self, x: Tensor, target: Tensor) -> Tensor:
        # Forward
        out = self.model(x)
        # Loss computation
        return self.criterion(out, target)

    def _set_params(self) -> None:
        self._params = ContiguousParams([p for p in self.model.parameters() if p.requires_grad])

    def _reset_opt(self, lr: float) -> None:
        """Reset the target params of the optimizer"""
        self.optimizer.defaults['lr'] = lr
        self.optimizer.state = defaultdict(dict)
        self.optimizer.param_groups = []
        self._set_params()
        self.optimizer.add_param_group(dict(params=self._params.contiguous()))  # type: ignore[union-attr]

    @torch.no_grad()
    def evaluate(self):
        raise NotImplementedError

    @staticmethod
    def _eval_metrics_str(eval_metrics):
        raise NotImplementedError

    def _reset_scheduler(self, lr: float, num_epochs: int, sched_type: str = 'onecycle') -> None:
        if sched_type == 'onecycle':
            self.scheduler = OneCycleLR(self.optimizer, lr, num_epochs * len(self.train_loader))
        elif sched_type == 'cosine':
            self.scheduler = CosineAnnealingLR(self.optimizer, num_epochs * len(self.train_loader),
                                               eta_min=lr / 25e4)
        else:
            raise ValueError(f"The following scheduler type is not supported: {sched_type}")

    def fit_n_epochs(
        self,
        num_epochs: int,
        lr: float,
        freeze_until: Optional[str] = None,
        sched_type: str = 'onecycle'
    ) -> None:
        """Train the model for a given number of epochs

        Args:
            num_epochs (int): number of epochs to train
            lr (float): learning rate to be used by the scheduler
            freeze_until (str, optional): last layer to freeze
            sched_type (str, optional): type of scheduler to use
        """
        self.model = freeze_model(self.model.train(), freeze_until)
        # Update param groups & LR
        self._reset_opt(lr)
        # Scheduler
        self._reset_scheduler(lr, num_epochs, sched_type)

        mb = master_bar(range(num_epochs))
        for _ in mb:
            self._fit_epoch(mb)
            # Check whether ops invalidated the buffer
            self._params.assert_buffer_is_valid()  # type: ignore[union-attr]
            eval_metrics = self.evaluate()

            # master bar
            mb.main_bar.comment = f"Epoch {self.start_epoch + self.epoch}/{self.start_epoch + num_epochs}"
            mb.write(f"Epoch {self.start_epoch + self.epoch}/{self.start_epoch + num_epochs} - "
                     f"{self._eval_metrics_str(eval_metrics)}")

            if eval_metrics['val_loss'] < self.min_loss:
                print(f"Validation loss decreased {self.min_loss:.4} --> "
                      f"{eval_metrics['val_loss']:.4}: saving state...")
                self.min_loss = eval_metrics['val_loss']
                self.save(self.output_file)

    def lr_find(
        self,
        freeze_until: Optional[str] = None,
        start_lr: float = 1e-7,
        end_lr: float = 1,
        num_it: int = 100
    ) -> None:
        """Gridsearch the optimal learning rate for the training

        Args:
            freeze_until (str, optional): last layer to freeze
            start_lr (float, optional): initial learning rate
            end_lr (float, optional): final learning rate
            num_it (int, optional): number of iterations to perform
        """
        self.model = freeze_model(self.model.train(), freeze_until)
        # Update param groups & LR
        self._reset_opt(start_lr)
        gamma = (end_lr / start_lr) ** (1 / (num_it - 1))
        scheduler = MultiplicativeLR(self.optimizer, lambda step: gamma)

        self.lr_recorder = [start_lr * gamma ** idx for idx in range(num_it)]
        self.loss_recorder = []

        for batch_idx, (x, target) in enumerate(self.train_loader):
            x, target = self.to_cuda(x, target)

            # Forward
            batch_loss = self._get_loss(x, target)
            self._backprop_step(batch_loss)
            # Update LR
            scheduler.step()

            # Record
            self.loss_recorder.append(batch_loss.item())
            # Stop after the number of iterations
            if batch_idx + 1 == num_it:
                break

    def plot_recorder(self, beta: float = 0.95, block: bool = True) -> None:
        """Display the results of the LR grid search

        Args:
            beta (float, optional): smoothing factor
            block (bool, optional): whether the plot should block execution
        """
        if len(self.lr_recorder) != len(self.loss_recorder) or len(self.lr_recorder) == 0:
            raise AssertionError("Please run the `lr_find` method first")

        # Exp moving average of loss
        smoothed_losses = []
        avg_loss = 0.
        for idx, loss in enumerate(self.loss_recorder):
            avg_loss = beta * avg_loss + (1 - beta) * loss
            smoothed_losses.append(avg_loss / (1 - beta ** (idx + 1)))

        plt.plot(self.lr_recorder[10:-5], smoothed_losses[10:-5])
        plt.xscale('log')
        plt.xlabel('Learning Rate')
        plt.ylabel('Training loss')
        plt.grid(True, linestyle='--', axis='x')
        plt.show(block=block)

    def check_setup(self, freeze_until: Optional[str] = None, lr: float = 3e-4, num_it: int = 100) -> bool:
        """Check whether you can overfit one batch

        Args:
            freeze_until (str, optional): last layer to freeze
            lr (float, optional): learning rate to be used for training
            num_it (int, optional): number of iterations to perform
        """
        self.model = freeze_model(self.model.train(), freeze_until)
        # Update param groups & LR
        self._reset_opt(lr)

        prev_loss = math.inf
        x, target = next(iter(self.train_loader))
        x, target = self.to_cuda(x, target)
        for _ in range(num_it):
            # Forward
            batch_loss = self._get_loss(x, target)
            # Backprop
            self._backprop_step(batch_loss)

            # Check that loss decreases
            if batch_loss.item() > prev_loss:
                return False
            prev_loss = batch_loss.item()

        return True
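This Trainer leaves `evaluate` and `_eval_metrics_str` abstract and expects `evaluate` to return a dict containing at least `val_loss`. A hedged sketch of how it might be subclassed and driven; the evaluation logic and the commented driver lines are placeholders, only the call pattern is taken from the class above:

# Hypothetical subclass and driver for the Trainer above.
import torch

class ClassificationTrainer(Trainer):

    @torch.no_grad()
    def evaluate(self):
        # Placeholder evaluation loop returning the key that fit_n_epochs expects.
        self.model.eval()
        val_loss, batches = 0., 0
        for x, target in self.val_loader:
            x, target = self.to_cuda(x, target)
            val_loss += self.criterion(self.model(x), target).item()
            batches += 1
        return {'val_loss': val_loss / max(batches, 1)}

    @staticmethod
    def _eval_metrics_str(eval_metrics):
        return f"Validation loss: {eval_metrics['val_loss']:.4}"

# trainer = ClassificationTrainer(model, train_loader, val_loader, criterion, optimizer, gpu=0)
# trainer.check_setup(lr=3e-4)                 # overfit a single batch as a sanity check
# trainer.lr_find()                            # populate lr_recorder / loss_recorder
# trainer.fit_n_epochs(num_epochs=5, lr=3e-4, sched_type='onecycle')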