Example #1
    def compute_importances(self, model, criterion, optimizer, dataset, device,
                            batch_size):
        """
        Compute EWC importance matrix for each parameter
        """

        model.train()

        # list of (name, zero tensor) pairs, one entry per model parameter
        importances = zerolike_params_dict(model)
        dataloader = DataLoader(dataset, batch_size=batch_size)
        for i, (x, y, task_labels) in enumerate(dataloader):
            x, y = x.to(device), y.to(device)

            optimizer.zero_grad()
            out = avalanche_forward(model, x, task_labels)
            loss = criterion(out, y)
            loss.backward()

            for (k1, p), (k2, imp) in zip(model.named_parameters(),
                                          importances):
                assert (k1 == k2)
                if p.grad is not None:
                    imp += p.grad.data.clone().pow(2)

        # average over the number of mini-batches
        for _, imp in importances:
            imp /= float(len(dataloader))

        return importances
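The importances computed above typically feed a quadratic penalty that keeps parameters close to the values saved at the end of the previous experience. A minimal sketch, assuming a hypothetical saved_params list of (name, tensor) pairs aligned with the importances and a hypothetical ewc_lambda coefficient (not the exact EWCPlugin code):

    def ewc_penalty(model, saved_params, importances, ewc_lambda):
        # Quadratic EWC term: lambda/2 * sum_i F_i * (theta_i - theta*_i)^2
        penalty = 0.0
        for (k1, p), (k2, theta_star), (k3, imp) in zip(
            model.named_parameters(), saved_params, importances
        ):
            assert k1 == k2 == k3
            penalty = penalty + (imp * (p - theta_star).pow(2)).sum()
        return ewc_lambda / 2 * penalty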
Example #2
    def _update_grad(self, strategy):
        model = strategy.model
        batch = strategy.mbatch

        model.eval()

        # Set RNN-like modules on GPU to training mode to avoid CUDA error
        if strategy.device == "cuda":
            for module in model.modules():
                if isinstance(module, torch.nn.RNNBase):
                    warnings.warn(
                        "RNN-like modules do not support "
                        "backward calls while in `eval` mode on CUDA "
                        "devices. Setting all `RNNBase` modules to "
                        "`train` mode. May produce inconsistent "
                        "output if such modules have `dropout` > 0.")
                    module.train()

        x, y, task_labels = batch[0], batch[1], batch[-1]

        strategy.optimizer.zero_grad()
        out = avalanche_forward(model, x, task_labels)
        loss = strategy._criterion(out, y)  # noqa
        loss.backward()

        self.iter_grad = copy_params_dict(model, copy_grad=True)
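The gradient snapshot stored in self.iter_grad is the kind of quantity used by path-integral importance measures (Synaptic Intelligence / RWalk style), where each training step contributes -g · Δθ per parameter. A minimal sketch, under the assumption that iter_grad, old_params and score are plain dicts of tensors keyed by parameter name (e.g. after a dict(...) conversion); this is a hypothetical helper, not the plugin's actual update:

    def accumulate_path_importance(model, iter_grad, old_params, score):
        # score[name] accumulates -grad * (theta_new - theta_old), i.e. the
        # per-parameter contribution to the loss change along the training path
        for name, param in model.named_parameters():
            if name in iter_grad:
                delta = param.detach() - old_params[name]
                score[name] = score[name] - iter_grad[name] * delta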
Example #3
    def _get_importance(self, strategy: BaseSGDTemplate):

        # Initialize importance matrix
        importance = dict(zerolike_params_dict(strategy.model))

        if not strategy.experience:
            raise ValueError("Current experience is not available")

        if strategy.experience.dataset is None:
            raise ValueError("Current dataset is not available")

        # Do forward and backward pass to accumulate L2-loss gradients
        strategy.model.train()
        dataloader = DataLoader(
            strategy.experience.dataset,
            batch_size=strategy.train_mb_size,
        )  # type: ignore

        # Progress bar
        if self.verbose:
            print("Computing importance")
            dataloader = tqdm(dataloader)

        for _, batch in enumerate(dataloader):
            # Get batch
            if len(batch) == 2 or len(batch) == 3:
                x, _, t = batch[0], batch[1], batch[-1]
            else:
                raise ValueError("Batch size is not valid")

            # Move batch to device
            x = x.to(strategy.device)

            # Forward pass
            strategy.model.zero_grad()
            out = avalanche_forward(strategy.model, x, t)

            # Average L2-Norm of the output
            loss = torch.norm(out, p="fro", dim=1).mean()
            loss.backward()

            # Accumulate importance
            for name, param in strategy.model.named_parameters():
                if param.requires_grad:
                    # In multi-head architectures, the gradient is going
                    # to be None for all the heads different from the
                    # current one.
                    if param.grad is not None:
                        importance[name] += param.grad.abs() * len(batch)

        # Normalize importance
        importance = {
            name: importance[name] / len(dataloader)
            for name in importance.keys()
        }

        return importance
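The resulting importance dictionary (MAS-style: absolute gradients of the output norm) is normally consumed by a penalty that discourages moving important weights, analogous to the EWC term. A minimal sketch, assuming a hypothetical anchor_params dict of parameter values saved after the previous experience and a hypothetical lambda_reg coefficient:

    def mas_penalty(model, anchor_params, importance, lambda_reg):
        # Weight the squared drift of each parameter by its importance
        penalty = 0.0
        for name, param in model.named_parameters():
            if name in importance:
                drift = (param - anchor_params[name]).pow(2)
                penalty = penalty + (importance[name] * drift).sum()
        return lambda_reg * penalty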
Example #4
    def compute_importances(
        self, model, criterion, optimizer, dataset, device, batch_size
    ):
        """
        Compute EWC importance matrix for each parameter
        """

        model.eval()

        # Set RNN-like modules on GPU to training mode to avoid CUDA error
        if device == "cuda":
            for module in model.modules():
                if isinstance(module, torch.nn.RNNBase):
                    warnings.warn(
                        "RNN-like modules do not support "
                        "backward calls while in `eval` mode on CUDA "
                        "devices. Setting all `RNNBase` modules to "
                        "`train` mode. May produce inconsistent "
                        "output if such modules have `dropout` > 0."
                    )
                    module.train()

        # list of (name, zero tensor) pairs, one entry per model parameter
        importances = zerolike_params_dict(model)
        dataloader = DataLoader(dataset, batch_size=batch_size)
        for i, batch in enumerate(dataloader):
            # get only input, target and task_id from the batch
            x, y, task_labels = batch[0], batch[1], batch[-1]
            x, y = x.to(device), y.to(device)

            optimizer.zero_grad()
            out = avalanche_forward(model, x, task_labels)
            loss = criterion(out, y)
            loss.backward()

            for (k1, p), (k2, imp) in zip(
                model.named_parameters(), importances
            ):
                assert k1 == k2
                if p.grad is not None:
                    imp += p.grad.data.clone().pow(2)

        # average over the number of mini-batches
        for _, imp in importances:
            imp /= float(len(dataloader))

        return importances
Example #5
    def inner_update(self, fast_model, x, y, t):
        """Update fast weights using current samples and
        return the updated fast model.
        """
        logits = avalanche_forward(fast_model, x, t)
        loss = self._criterion(logits, y)

        # Compute gradient with respect to the current fast weights
        grads = list(
            torch.autograd.grad(
                loss,
                fast_model.fast_params,
                create_graph=self.second_order,
                retain_graph=self.second_order,
                allow_unused=True,
            )
        )

        # Clamp gradient values element-wise to [-grad_clip_norm, grad_clip_norm]
        grads = [
            torch.clamp(g, min=-self.grad_clip_norm, max=self.grad_clip_norm)
            if g is not None
            else g
            for g in grads
        ]

        # New fast parameters
        new_fast_params = [
            param - alpha * grad if grad is not None else param
            for (param, alpha, grad) in zip(
                fast_model.fast_params, self.alpha_params.parameters(), grads
            )
        ]

        # Update fast model's weights
        fast_model.update_params(new_fast_params)
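inner_update assumes that self.alpha_params yields one learnable learning-rate tensor per fast parameter, in the same order. One way such a container could be built (a hypothetical make_alpha_params helper with an assumed init_lr; not necessarily how the strategy constructs it):

    import torch
    import torch.nn as nn

    def make_alpha_params(model, init_lr=0.01):
        # One learnable per-parameter learning rate, shaped like each weight
        return nn.ParameterList(
            [nn.Parameter(init_lr * torch.ones_like(p)) for p in model.parameters()]
        )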
Example #6
    def forward(self):
        return avalanche_forward(self.model, self.mb_x, self.mb_task_id)
Example #7
    def train_batch(self):
        # Create a stateless copy of the model for inner-updates
        fast_model = higher.patch.monkeypatch(
            self.model,
            copy_initial_weights=True,
            track_higher_grads=self.second_order,
        )
        if self.clock.train_exp_counter > 0:
            batch_x = self.mb_x[: self.train_mb_size]
            batch_y = self.mb_y[: self.train_mb_size]
            batch_t = self.mb_task_id[: self.train_mb_size]
        else:
            batch_x, batch_y, batch_t = self.mb_x, self.mb_y, self.mb_task_id

        bsize_data = batch_x.shape[0]
        rough_sz = math.ceil(bsize_data / self.n_inner_updates)
        meta_losses = [0 for _ in range(self.n_inner_updates)]

        for i in range(self.n_inner_updates):
            batch_x_i = batch_x[i * rough_sz : (i + 1) * rough_sz]
            batch_y_i = batch_y[i * rough_sz : (i + 1) * rough_sz]
            batch_t_i = batch_t[i * rough_sz : (i + 1) * rough_sz]

            # We assume that samples for inner update are from the same task
            self.inner_update(fast_model, batch_x_i, batch_y_i, batch_t_i)

            # Compute meta-loss with the combination of batch and buffer samples
            logits_meta = avalanche_forward(
                fast_model, self.mb_x, self.mb_task_id
            )
            meta_loss = self._criterion(logits_meta, self.mb_y)
            meta_losses[i] = meta_loss

        # Compute meta-gradient for the main model
        meta_loss = sum(meta_losses) / len(meta_losses)
        meta_grad_model = torch.autograd.grad(
            meta_loss,
            fast_model.parameters(time=0),
            retain_graph=True,
            allow_unused=True,
        )
        self.model.zero_grad()
        self.apply_grad(self.model, meta_grad_model)

        # Clip gradients
        torch.nn.utils.clip_grad_norm_(
            self.model.parameters(), self.grad_clip_norm
        )

        if self.learn_lr:
            # Compute meta-gradient for alpha-lr parameters
            meta_grad_alpha = torch.autograd.grad(
                meta_loss, self.alpha_params.parameters(), allow_unused=True
            )
            self.alpha_params.zero_grad()
            self.apply_grad(self.alpha_params, meta_grad_alpha)

            torch.nn.utils.clip_grad_norm_(
                self.alpha_params.parameters(), self.grad_clip_norm
            )
            self.optimizer_alpha.step()

        # If sync_update: update with self.optimizer;
        # otherwise: use the learned LRs to update the model
        if self.sync_update:
            self.optimizer.step()
        else:
            for p, alpha in zip(
                self.model.parameters(), self.alpha_params.parameters()
            ):
                # Use relu on updated LRs to avoid negative values
                p.data = p.data - p.grad * F.relu(alpha)

        self.loss = meta_loss
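apply_grad is not shown in this snippet; one plausible implementation writes the externally computed meta-gradients into each parameter's .grad so that the subsequent clipping and optimizer step can use them. A minimal sketch, assuming grads is aligned with module.parameters():

    def apply_grad(self, module, grads):
        # Copy the meta-gradients into .grad, skipping unused parameters
        for param, grad in zip(module.parameters(), grads):
            if grad is None:
                continue
            if param.grad is None:
                param.grad = grad.detach().clone()
            else:
                param.grad = param.grad + grad.detach().clone()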
Example #8
    def forward(self):
        """Compute the model's output given the current mini-batch."""
        return avalanche_forward(self.model, self.mb_x, self.mb_task_id)
Example #9
    def environment_to_experience(self, env, setting):
        all_observations: List[Observations] = []
        all_rewards: List[Rewards] = []

        for batch in tqdm.tqdm(
                env, desc="Converting environment into TensorDataset"):
            observations: Observations
            rewards: Optional[Rewards]
            if isinstance(batch, Observations):
                observations = batch
                rewards = None
            else:
                assert isinstance(batch, tuple) and len(batch) == 2
                observations, rewards = batch

            if rewards is None:
                # Need to send actions to the env before we can actually get the
                # associated Reward. Here there are (at least) three options to choose
                # from:

                # Option 1: Select action at random:
                # action = env.action_space.sample()
                # if observations.batch_size != action.shape[0]:
                #     action = action[: observations.batch_size]
                # rewards: Rewards = env.send(action)

                # Option 2: Use the current model, in 'inference' mode:
                # action = self.get_actions(observations, action_space=env.action_space)
                # rewards: Rewards = env.send(action)

                # Option 3: Train an online model:
                # NOTE: You might have to change this for your strategy. For
                # instance, it currently does not take any plugins into
                # consideration.
                self.cl_strategy.optimizer.zero_grad()

                x = observations.x.to(self.cl_strategy.device)
                task_labels = observations.task_labels
                logits = avalanche_forward(self.model,
                                           x=x,
                                           task_labels=task_labels)
                y_pred = logits.argmax(-1)
                action = self.target_setting.Actions(y_pred=y_pred)

                rewards: Rewards = env.send(action)

                y = rewards.y.to(self.cl_strategy.device)
                # Train the model:
                loss = self.cl_strategy.criterion(logits, y)
                loss.backward()
                self.cl_strategy.optimizer.step()

            all_observations.append(observations)
            all_rewards.append(rewards)

        # Stack all the observations into a single `Observations` object:
        stacked_observations: Observations = Observations.concatenate(
            all_observations)
        x = stacked_observations.x
        task_labels = stacked_observations.task_labels
        stacked_rewards: Rewards = Rewards.concatenate(all_rewards)
        y = stacked_rewards.y
        return SequoiaExperience(env=env,
                                 setting=setting,
                                 x=x,
                                 y=y,
                                 task_labels=task_labels)