def compute_importances(self, model, criterion, optimizer, dataset, device, batch_size):
    """
    Compute EWC importance matrix for each parameter
    """
    model.train()

    # list of (name, tensor) pairs, initialized to zeros
    importances = zerolike_params_dict(model)
    dataloader = DataLoader(dataset, batch_size=batch_size)
    for i, (x, y, _) in enumerate(dataloader):
        x, y = x.to(device), y.to(device)

        optimizer.zero_grad()
        out = model(x)
        loss = criterion(out, y)
        loss.backward()

        for (k1, p), (k2, imp) in zip(model.named_parameters(), importances):
            assert k1 == k2
            imp += p.grad.data.clone().pow(2)

    # average over the number of mini-batches
    for _, imp in importances:
        imp /= float(len(dataloader))

    return importances
def after_training_exp(self, strategy, *args, **kwargs):
    self.exp_importance = self.iter_importance
    self.exp_params = copy_params_dict(strategy.model)

    if self.exp_scores is None:
        self.exp_scores = self.checkpoint_scores
    else:
        exp_scores = []

        for (k1, p_score), (k2, p_cp_score) in zip(
            self.exp_scores, self.checkpoint_scores
        ):
            assert k1 == k2, "Error in RWalk score computation."
            exp_scores.append((k1, 0.5 * (p_score + p_cp_score)))

        self.exp_scores = exp_scores

    # Compute weight penalties once for all successive iterations
    # (t_k+1 variables remain constant in Eq. 8 in the paper)
    self.exp_penalties = []

    # Normalize terms in [0,1] interval, as suggested in the paper
    # (the importance is already > 0, while negative scores are relu-ed
    # out, hence we scale only the max values of both terms)
    max_score = max(map(lambda x: x[1].max(), self.exp_scores))
    max_imp = max(map(lambda x: x[1].max(), self.exp_importance))

    for (k1, imp), (k2, score) in zip(self.exp_importance, self.exp_scores):
        assert k1 == k2, "Error in RWalk penalties computation."
        self.exp_penalties.append(
            (k1, imp / max_imp + F.relu(score) / max_score)
        )

    self.checkpoint_scores = zerolike_params_dict(strategy.model)
def before_training(self, strategy: BaseSGDTemplate, **kwargs):
    # Parameters before the first task starts
    if not self.params:
        self.params = dict(copy_params_dict(strategy.model))

    # Initialize Fisher information weight importance
    if not self.importance:
        self.importance = dict(zerolike_params_dict(strategy.model))
def after_training_iteration(self, strategy, *args, **kwargs):
    self._update_loss(strategy)

    if self._is_checkpoint_iter(strategy):
        self._update_score(strategy)

        self.checkpoint_loss = zerolike_params_dict(strategy.model)
        self.checkpoint_params = copy_params_dict(strategy.model)
def _get_importance(self, strategy: BaseSGDTemplate):
    # Initialize importance matrix
    importance = dict(zerolike_params_dict(strategy.model))

    if not strategy.experience:
        raise ValueError("Current experience is not available")

    if strategy.experience.dataset is None:
        raise ValueError("Current dataset is not available")

    # Do forward and backward pass to accumulate L2-loss gradients
    strategy.model.train()
    dataloader = DataLoader(
        strategy.experience.dataset,
        batch_size=strategy.train_mb_size,
    )  # type: ignore

    # Progress bar
    if self.verbose:
        print("Computing importance")
        dataloader = tqdm(dataloader)

    for _, batch in enumerate(dataloader):
        # Get batch
        if len(batch) == 2 or len(batch) == 3:
            x, _, t = batch[0], batch[1], batch[-1]
        else:
            raise ValueError("Batch size is not valid")

        # Move batch to device
        x = x.to(strategy.device)

        # Forward pass
        strategy.model.zero_grad()
        out = avalanche_forward(strategy.model, x, t)

        # Average L2-Norm of the output
        loss = torch.norm(out, p="fro", dim=1).mean()
        loss.backward()

        # Accumulate importance
        for name, param in strategy.model.named_parameters():
            if param.requires_grad:
                # In multi-head architectures, the gradient is going
                # to be None for all the heads different from the
                # current one.
                if param.grad is not None:
                    importance[name] += param.grad.abs() * len(batch)

    # Normalize importance
    importance = {
        name: importance[name] / len(dataloader)
        for name in importance.keys()
    }

    return importance
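# Hedged sketch (not part of the original source): importance values like the
# dict returned by _get_importance are typically consumed as a quadratic
# penalty on parameter drift, in the spirit of Memory Aware Synapses, i.e.
# alpha * sum_i importance_i * (theta_i - theta_i_old) ** 2. The names
# mas_penalty, old_params and alpha below are hypothetical.
import torch


def mas_penalty(model, old_params, importance, alpha=1.0):
    # old_params holds detached copies of the parameters saved after the
    # previous experience; importance is a {name: tensor} dict as above.
    penalty = torch.tensor(0.0)
    for name, param in model.named_parameters():
        if name in importance:
            penalty = penalty + (
                importance[name] * (param - old_params[name]) ** 2
            ).sum()
    return alpha * penalty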
def compute_importances(
    self, model, criterion, optimizer, dataset, device, batch_size
):
    """
    Compute EWC importance matrix for each parameter
    """

    model.eval()

    # Set RNN-like modules on GPU to training mode to avoid CUDA error
    if device == "cuda":
        for module in model.modules():
            if isinstance(module, torch.nn.RNNBase):
                warnings.warn(
                    "RNN-like modules do not support "
                    "backward calls while in `eval` mode on CUDA "
                    "devices. Setting all `RNNBase` modules to "
                    "`train` mode. May produce inconsistent "
                    "output if such modules have `dropout` > 0."
                )
                module.train()

    # list of (name, tensor) pairs, initialized to zeros
    importances = zerolike_params_dict(model)
    dataloader = DataLoader(dataset, batch_size=batch_size)
    for i, batch in enumerate(dataloader):
        # get only input, target and task_id from the batch
        x, y, task_labels = batch[0], batch[1], batch[-1]
        x, y = x.to(device), y.to(device)

        optimizer.zero_grad()
        out = avalanche_forward(model, x, task_labels)
        loss = criterion(out, y)
        loss.backward()

        for (k1, p), (k2, imp) in zip(
            model.named_parameters(), importances
        ):
            assert k1 == k2
            if p.grad is not None:
                imp += p.grad.data.clone().pow(2)

    # average over the number of mini-batches
    for _, imp in importances:
        imp /= float(len(dataloader))

    return importances
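# Hedged usage sketch (not part of the original source): exercising
# compute_importances on a toy classification task. It assumes the method
# above belongs to Avalanche's EWCPlugin and returns the list of
# (name, tensor) pairs built above; the toy model, dataset and
# hyper-parameter values are illustrative placeholders.
import torch
from torch import nn
from torch.utils.data import TensorDataset
from avalanche.training.plugins import EWCPlugin

toy_model = nn.Linear(16, 4)
toy_criterion = nn.CrossEntropyLoss()
toy_optimizer = torch.optim.SGD(toy_model.parameters(), lr=0.01)

# (input, target, task_label) triples, mirroring the batches unpacked above
toy_dataset = TensorDataset(
    torch.randn(64, 16),
    torch.randint(0, 4, (64,)),
    torch.zeros(64, dtype=torch.long),
)

plugin = EWCPlugin(ewc_lambda=0.4)
importances = plugin.compute_importances(
    toy_model, toy_criterion, toy_optimizer, toy_dataset,
    device="cpu", batch_size=8,
)
for name, imp in importances:
    print(name, imp.shape)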
def before_training(self, strategy, *args, **kwargs):
    self.checkpoint_loss = zerolike_params_dict(strategy.model)
    self.checkpoint_scores = zerolike_params_dict(strategy.model)
    self.checkpoint_params = copy_params_dict(strategy.model)
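# Hedged wiring sketch (not part of the original source): the callbacks above
# are driven by an Avalanche strategy loop. Assuming they belong to
# Avalanche's RWalkPlugin, a minimal setup could look as follows; the toy
# model, optimizer and hyper-parameter values are illustrative placeholders,
# and the RWalkPlugin arguments should be checked against the installed
# Avalanche version.
import torch
from torch import nn
from avalanche.training import Naive
from avalanche.training.plugins import RWalkPlugin

toy_model = nn.Linear(16, 4)
toy_optimizer = torch.optim.SGD(toy_model.parameters(), lr=0.01)
toy_criterion = nn.CrossEntropyLoss()

rwalk = RWalkPlugin(ewc_lambda=0.1, ewc_alpha=0.9, delta_t=10)
strategy = Naive(
    model=toy_model,
    optimizer=toy_optimizer,
    criterion=toy_criterion,
    train_mb_size=32,
    train_epochs=1,
    plugins=[rwalk],
)
# Calling strategy.train(experience) on each experience of a benchmark's
# train_stream triggers before_training, after_training_iteration and
# after_training_exp at the corresponding points of the training loop.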