Example #1
    def compute_importances(self, model, criterion, optimizer, dataset, device,
                            batch_size):
        """
        Compute EWC importance matrix for each parameter
        """

        model.train()

        # list of (name, zero-initialized tensor) pairs, one per model parameter
        importances = zerolike_params_dict(model)
        dataloader = DataLoader(dataset, batch_size=batch_size)
        for i, (x, y, _) in enumerate(dataloader):
            x, y = x.to(device), y.to(device)

            optimizer.zero_grad()
            out = model(x)
            loss = criterion(out, y)
            loss.backward()

            for (k1, p), (k2, imp) in zip(model.named_parameters(),
                                          importances):
                assert k1 == k2
                imp += p.grad.data.clone().pow(2)

        # average over mini batch length
        for _, imp in importances:
            imp /= float(len(dataloader))

        return importances
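This snippet, like most of the ones below, relies on the Avalanche helpers `zerolike_params_dict` and `copy_params_dict`. A minimal sketch of what such helpers might look like is given here, assuming the list-of-(name, tensor)-pairs layout that the `zip`-based loops above expect; the actual Avalanche implementations may differ between versions.

    import torch


    def zerolike_params_dict(model):
        # Sketch (assumption): one (name, tensor) pair per parameter, with
        # zero-initialized tensors that importances can be accumulated into.
        return [(k, torch.zeros_like(p)) for k, p in model.named_parameters()]


    def copy_params_dict(model):
        # Sketch (assumption): detached snapshots of the current parameter values.
        return [(k, p.detach().clone()) for k, p in model.named_parameters()]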
Example #2
    def after_training_exp(self, strategy, *args, **kwargs):
        self.exp_importance = self.iter_importance
        self.exp_params = copy_params_dict(strategy.model)

        if self.exp_scores is None:
            self.exp_scores = self.checkpoint_scores
        else:
            exp_scores = []

            for (k1, p_score), (k2, p_cp_score) in zip(self.exp_scores,
                                                       self.checkpoint_scores):
                assert k1 == k2, "Error in RWalk score computation."
                exp_scores.append((k1, 0.5 * (p_score + p_cp_score)))

            self.exp_scores = exp_scores

        # Compute weight penalties once for all successive iterations
        # (t_k+1 variables remain constant in Eq. 8 in the paper)
        self.exp_penalties = []

        # Normalize terms in [0,1] interval, as suggested in the paper
        # (the importance is already > 0, while negative scores are relu-ed
        # out, hence we scale only the max-values of both terms)
        max_score = max(map(lambda x: x[1].max(), self.exp_scores))
        max_imp = max(map(lambda x: x[1].max(), self.exp_importance))

        for (k1, imp), (k2, score) in zip(self.exp_importance,
                                          self.exp_scores):
            assert k1 == k2, "Error in RWalk penalties computation."

            self.exp_penalties.append(
                (k1, imp / max_imp + F.relu(score) / max_score))

        self.checkpoint_scores = zerolike_params_dict(strategy.model)
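The penalties computed here only take effect once they are folded back into the training loss. A hedged sketch of a companion `before_backward` hook that would do so follows; the `ewc_lambda` strength hyperparameter is an assumed name, and the quadratic form lambda * sum_k penalty_k * (theta_k - theta_k_old)^2 is the standard RWalk/EWC-style anchor, not code taken from the snippet above.

    def before_backward(self, strategy, *args, **kwargs):
        # Hedged sketch: add the stored RWalk penalty to the current loss.
        # `self.ewc_lambda` is an assumed hyperparameter; `exp_penalties` and
        # `exp_params` are the attributes filled in by `after_training_exp`.
        if not self.exp_penalties:
            return

        cur_params = dict(strategy.model.named_parameters())
        old_params = dict(self.exp_params)

        penalty = torch.tensor(0.0, device=strategy.device)
        for name, p_penalty in self.exp_penalties:
            delta = cur_params[name] - old_params[name]
            penalty = penalty + (p_penalty * delta.pow(2)).sum()

        strategy.loss += self.ewc_lambda * penalty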
Example #3
    def before_training(self, strategy: BaseSGDTemplate, **kwargs):
        # Parameters before the first task starts
        if not self.params:
            self.params = dict(copy_params_dict(strategy.model))

        # Initialize Fisher information weight importance
        if not self.importance:
            self.importance = dict(zerolike_params_dict(strategy.model))
Example #4
    def after_training_iteration(self, strategy, *args, **kwargs):
        # Accumulate the (approximate) per-parameter contribution to the
        # loss change for this iteration
        self._update_loss(strategy)

        # At checkpoint iterations, fold the accumulated contributions into
        # the running scores and reset the per-checkpoint accumulators
        if self._is_checkpoint_iter(strategy):
            self._update_score(strategy)

            self.checkpoint_loss = zerolike_params_dict(strategy.model)
            self.checkpoint_params = copy_params_dict(strategy.model)
Example #5
    def _get_importance(self, strategy: BaseSGDTemplate):

        # Initialize importance matrix
        importance = dict(zerolike_params_dict(strategy.model))

        if not strategy.experience:
            raise ValueError("Current experience is not available")

        if strategy.experience.dataset is None:
            raise ValueError("Current dataset is not available")

        # Do forward and backward pass to accumulate L2-loss gradients
        strategy.model.train()
        dataloader = DataLoader(
            strategy.experience.dataset,
            batch_size=strategy.train_mb_size,
        )  # type: ignore

        # Progress bar
        if self.verbose:
            print("Computing importance")
            dataloader = tqdm(dataloader)

        for _, batch in enumerate(dataloader):
            # Get batch
            if len(batch) == 2 or len(batch) == 3:
                x, _, t = batch[0], batch[1], batch[-1]
            else:
                raise ValueError("Batch size is not valid")

            # Move batch to device
            x = x.to(strategy.device)

            # Forward pass
            strategy.model.zero_grad()
            out = avalanche_forward(strategy.model, x, t)

            # Average L2-Norm of the output
            loss = torch.norm(out, p="fro", dim=1).mean()
            loss.backward()

            # Accumulate importance
            for name, param in strategy.model.named_parameters():
                if param.requires_grad:
                    # In multi-head architectures, the gradient is going
                    # to be None for all the heads different from the
                    # current one.
                    if param.grad is not None:
                        importance[name] += param.grad.abs() * len(batch)

        # Normalize importance
        importance = {
            name: importance[name] / len(dataloader)
            for name in importance.keys()
        }

        return importance
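The importance signal in this example is unsupervised: it is the absolute gradient of the mean L2 norm of the model outputs, so no labels are needed. The toy illustration below strips away the strategy and dataloader machinery to show just that step; the linear model and random batch are made up for illustration.

    import torch
    import torch.nn as nn

    # Toy model and random batch, purely for illustration
    model = nn.Linear(10, 5)
    x = torch.randn(32, 10)

    model.zero_grad()
    out = model(x)

    # Mean L2 norm of the outputs: no targets are involved
    loss = torch.norm(out, p="fro", dim=1).mean()
    loss.backward()

    # Absolute gradients serve as per-parameter importance weights
    importance = {
        name: p.grad.abs().clone()
        for name, p in model.named_parameters()
        if p.grad is not None
    }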
Example #6
    def compute_importances(
        self, model, criterion, optimizer, dataset, device, batch_size
    ):
        """
        Compute EWC importance matrix for each parameter
        """

        model.eval()

        # Set RNN-like modules on GPU to training mode to avoid CUDA error
        if device == "cuda":
            for module in model.modules():
                if isinstance(module, torch.nn.RNNBase):
                    warnings.warn(
                        "RNN-like modules do not support "
                        "backward calls while in `eval` mode on CUDA "
                        "devices. Setting all `RNNBase` modules to "
                        "`train` mode. May produce inconsistent "
                        "output if such modules have `dropout` > 0."
                    )
                    module.train()

        # list of (name, zero-initialized tensor) pairs, one per model parameter
        importances = zerolike_params_dict(model)
        dataloader = DataLoader(dataset, batch_size=batch_size)
        for i, batch in enumerate(dataloader):
            # get only input, target and task_id from the batch
            x, y, task_labels = batch[0], batch[1], batch[-1]
            x, y = x.to(device), y.to(device)

            optimizer.zero_grad()
            out = avalanche_forward(model, x, task_labels)
            loss = criterion(out, y)
            loss.backward()

            for (k1, p), (k2, imp) in zip(
                model.named_parameters(), importances
            ):
                assert k1 == k2
                if p.grad is not None:
                    imp += p.grad.data.clone().pow(2)

        # average over mini batch length
        for _, imp in importances:
            imp /= float(len(dataloader))

        return importances
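Once computed, these importances anchor later training through the usual EWC quadratic penalty, lambda * sum_k F_k * (theta_k - theta*_k)^2, where theta* are the parameters saved after the previous experience. A hedged sketch of that term, assuming the list-of-(name, tensor)-pairs layout used above, could look like this; the returned value would simply be added to the task loss before calling backward().

    def ewc_penalty(model, saved_params, importances, ewc_lambda):
        # Sketch (assumption): `saved_params` and `importances` are lists of
        # (name, tensor) pairs, e.g. from copy_params_dict / compute_importances.
        cur_params = dict(model.named_parameters())
        penalty = 0.0
        for (k1, old_p), (k2, imp) in zip(saved_params, importances):
            assert k1 == k2
            penalty = penalty + (imp * (cur_params[k1] - old_p).pow(2)).sum()
        return ewc_lambda * penalty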
Example #7
    def before_training(self, strategy, *args, **kwargs):
        # Initialize the per-checkpoint accumulators before training starts
        self.checkpoint_loss = zerolike_params_dict(strategy.model)
        self.checkpoint_scores = zerolike_params_dict(strategy.model)
        self.checkpoint_params = copy_params_dict(strategy.model)