def check_is_multitask_env(env: Environment, has_rewards: bool):
    # dataloader-style:
    for i, (observations, rewards) in itertools.islice(enumerate(env), 10):
        assert isinstance(observations, MultiTaskSetting.Observations)
        assert len(set(observations.task_labels.cpu().tolist())) > 1
        if has_rewards:
            assert isinstance(rewards, MultiTaskSetting.Rewards)
            # Check that no relabelling is happening, by checking that there are more
            # distinct y values than the number of classes usually present in a batch.
            assert len(set(rewards.y.cpu().tolist())) > 2
        else:
            assert rewards is None

    # gym-style interaction:
    obs = env.reset()
    assert env.observation_space.contains(obs.numpy())
    done = False
    steps = 0
    while not done and steps < 10:
        action = Actions(y_pred=torch.randint(10, [env.batch_size]))
        # BUG: convert_tensors seems to be causing issues again: We shouldn't have
        # to manually convert obs to numpy before checking `obs in obs_space`.
        # TODO: Also not super clean that we can't just do `action in action_space`.
        # assert action.numpy() in env.action_space
        assert action.y_pred.numpy() in env.action_space
        obs, reward, done, info = env.step(action)
        assert obs.numpy() in env.observation_space
        assert reward.y in env.reward_space
        steps += 1
        assert done is False
    assert steps == 10
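
For context, a hedged usage sketch of the checker above; the constructor arguments and the `train_dataloader` call are assumptions made for illustration, not necessarily the real API:

# Illustrative only: dataset name and dataloader call are assumptions.
setting = MultiTaskSetting(dataset="mnist")
train_env = setting.train_dataloader(batch_size=32)
check_is_multitask_env(train_env, has_rewards=True)
train_env.close()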
Example 2
    def shared_step(
        self,
        batch: Tuple[Observations, Rewards],
        batch_idx: int,
        environment: Environment,
        loss_name: str,
        dataloader_idx: Optional[int] = None,
        optimizer_idx: Optional[int] = None,
    ) -> Dict:
        """
        This is the shared step for this 'example' LightningModule.
        Feel free to customize/change it if you want!
        """
        if dataloader_idx is not None:
            assert isinstance(dataloader_idx, int)
            loss_name += f"/{dataloader_idx}"

        # Split the batch into observations and rewards.
        # NOTE: Only in the case of the Supervised settings do we ever get the
        # Rewards at the same time as the Observations.
        # TODO: It would be nice if we could actually do the same things for
        # both sides of the tree here.
        observations, rewards = self.split_batch(batch)

        # FIXME: Remove this, debugging:
        assert isinstance(observations, Observations), observations
        assert isinstance(observations.x, Tensor), observations.shapes
        # Get the forward pass results, containing:
        # - "observation": the augmented/transformed/processed observation.
        # - "representations": the representations for the observations.
        # - "actions": The actions (predictions)
        forward_pass: ForwardPass = self(observations)

        # get the actions from the forward pass:
        actions = forward_pass.actions

        if rewards is None:
            # Get the reward from the environment (the dataloader).
            if self.config.debug and self.config.render:
                environment.render("human")
                # import matplotlib.pyplot as plt
                # plt.waitforbuttonpress(10)

            rewards = environment.send(actions)
            assert rewards is not None

        loss: Loss = self.get_loss(forward_pass, rewards, loss_name=loss_name)
        return {
            "loss": loss.loss,
            "loss_object": loss,
        }
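
As a rough illustration of how the dict returned by `shared_step` might be consumed, here is a hedged sketch of a PyTorch-Lightning-style `training_step`; `self.train_environment` is an assumed attribute name used only for this sketch:

    def training_step(self, batch, batch_idx):
        # Sketch only: delegate to `shared_step` and log the scalar loss.
        step_results = self.shared_step(
            batch,
            batch_idx=batch_idx,
            environment=self.train_environment,  # assumed attribute name
            loss_name="train",
        )
        self.log("train/loss", step_results["loss"])
        return step_results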
Example 3
    def fit(self, train_env: Environment, valid_env: Environment):
        for i, batch in enumerate(train_env):
            if isinstance(batch, Observations):
                observations, rewards = batch, None
            else:
                assert isinstance(batch, tuple) and len(batch) == 2
                observations, rewards = batch

            y_preds = train_env.action_space.sample()
            if rewards is None:
                action_space = train_env.action_space
                if action_space.shape:
                    # This is a bit complicated, but it's needed because the last batch
                    # might have a smaller batch dimension than the env's action space
                    # (this only happens on the last batch in supervised learning).
                    # TODO: Should we perhaps drop the last batch?
                    batch_size = getattr(train_env, "num_envs",
                                         getattr(train_env, "batch_size", 0))
                    env_is_batched = batch_size is not None and batch_size >= 1
                    if env_is_batched:
                        # NOTE: Need to pass an action space that actually reflects the
                        # batch size, even for the last batch!
                        obs_batch_size = (observations.x.shape[0]
                                          if observations.x.shape else None)
                        action_space_batch_size = (action_space.shape[0]
                                                   if action_space.shape else None)
                        if (obs_batch_size is not None
                                and obs_batch_size != action_space_batch_size):
                            action_space = batch_space(
                                train_env.single_action_space, obs_batch_size)

                y_preds = action_space.sample()
                rewards = train_env.send(Actions(y_pred=y_preds))
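
For reference, a minimal self-contained sketch of what `batch_space` (from `gym.vector.utils`) does when the last batch is smaller than the rest; the sizes below are made up for illustration:

from gym.spaces import Discrete
from gym.vector.utils import batch_space

single_action_space = Discrete(10)                        # one sample's action space
full_batch_space = batch_space(single_action_space, 32)   # normal batches of 32
last_batch_space = batch_space(single_action_space, 7)    # a smaller final batch
assert last_batch_space.sample().shape == (7,)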
Example 4
    def fit(self, train_env: Environment, valid_env: Environment):
        """ Example train loop.
        You can do whatever you want with train_env and valid_env here.

        NOTE: In the Settings where task boundaries are known (in this case all
        the supervised CL settings), this will be called once per task.
        """
        # configure() will have been called by the setting before we get here.
        episodes = 0
        with tqdm.tqdm(desc="training") as train_pbar:

            while not train_env.is_closed():
                for i, batch in enumerate(train_env):
                    if isinstance(batch, Observations):
                        observations, rewards = batch, None
                    else:
                        observations, rewards = batch

                    batch_size = observations.x.shape[0]

                    y_pred = train_env.action_space.sample()

                    # If we're at the last batch, it might have a different size, so we
                    # only give the required number of values.
                    if isinstance(y_pred, (np.ndarray, Tensor)):
                        if y_pred.shape[0] != batch_size:
                            y_pred = y_pred[:batch_size]

                    if rewards is None:
                        rewards = train_env.send(y_pred)

                    train_pbar.set_postfix({
                        "Episode": episodes,
                        "Step": i,
                    })
                    # train as you usually would.

                episodes += 1
                if self.max_train_episodes and episodes >= self.max_train_episodes:
                    train_env.close()
                    break
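
The docstring above notes that `fit` is called once per task when task boundaries are known. A hedged sketch of that call pattern, using placeholder names (`nb_tasks`, `make_envs_for_task`, `method`); the real orchestration lives inside the Setting:

# Placeholder sketch of the per-task training loop driven by the setting.
for task_id in range(nb_tasks):
    train_env, valid_env = make_envs_for_task(task_id)  # hypothetical helper
    method.fit(train_env, valid_env)
    train_env.close()
    valid_env.close()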
Example 5
    def shared_step(
        self, batch: Tuple[Observations, Optional[Rewards]], environment: Environment
    ) -> Tuple[Tensor, Dict]:
        """Shared step used for both training and validation.

        Parameters
        ----------
        batch : Tuple[Observations, Optional[Rewards]]
            Batch containing Observations, and optional Rewards. When the Rewards are
            None, it means that we'll need to provide the Environment with actions
            before we can get the Rewards (e.g. image labels) back.

            This happens, for example, when the Method is applied in a Setting which
            cares about sample efficiency or training performance.

        environment : Environment
            The environment we're currently interacting with. Used to provide the
            rewards when they aren't already part of the batch, for example when our
            performance is being monitored during training.

        Returns
        -------
        Tuple[Tensor, Dict]
            The Loss tensor, and a dict of metrics to be logged.
        """
        # Since we're training on a Passive environment, we will get both observations
        # and rewards, unless we're being evaluated based on our training performance,
        # in which case we will need to send actions to the environments before we can
        # get the corresponding rewards (image labels) back.
        observations: Observations = batch[0]
        rewards: Optional[Rewards] = batch[1]

        # Get the predictions:
        logits, _ = self(observations)
        y_pred = logits.argmax(-1)

        if rewards is None:
            # If the rewards in the batch were None, it means we're expected to give
            # actions before we can get rewards back from the environment.
            # This happens when the Setting is monitoring our training performance.
            rewards = environment.send(Actions(y_pred))

        assert rewards is not None
        image_labels = rewards.y

        loss = self.loss(logits, image_labels)

        accuracy = (y_pred == image_labels).sum().float() / len(image_labels)
        metrics_dict = {"accuracy": accuracy}
        return loss, metrics_dict
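
Because this `shared_step` returns a `(loss, metrics)` pair, it can be reused from both the training and validation steps. A hedged sketch, where `self.train_environment` and `self.val_environment` are assumed attribute names, not necessarily the library's:

    def training_step(self, batch, batch_idx):
        # Sketch only: reuse `shared_step` and log the metrics it returns.
        loss, metrics = self.shared_step(batch, environment=self.train_environment)
        self.log_dict({f"train/{k}": v for k, v in metrics.items()}, prog_bar=True)
        return loss

    def validation_step(self, batch, batch_idx):
        loss, metrics = self.shared_step(batch, environment=self.val_environment)
        self.log_dict({f"val/{k}": v for k, v in metrics.items()})
        return loss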
Example 6
    def fit(self, train_env: Environment, valid_env: Environment):
        for i, batch in enumerate(train_env):
            if isinstance(batch, Observations):
                observations, rewards = batch, None
            else:
                assert isinstance(batch, tuple) and len(batch) == 2
                observations, rewards = batch

            y_preds = train_env.action_space.sample()
            if rewards is None:
                action_space = train_env.action_space
                if train_env.action_space.shape:
                    obs_batch_size = observations.x.shape[0]
                    # BUG: Fix the `batch_size` attribute on `Batch` so it works
                    # even when task labels are None, by checking whether there are
                    # one or more shapes and, if there are, that their first
                    # dimensions match.
                    action_space_batch_size = action_space.shape[0]
                    if obs_batch_size != action_space_batch_size:
                        action_space = batch_space(
                            train_env.single_action_space, obs_batch_size)

                rewards = train_env.send(Actions(y_pred=action_space.sample()))
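
The BUG comment above asks for a more robust `batch_size` on `Batch`. A hypothetical helper (not part of the library) sketching that check, assuming the batch's fields can be iterated like a dict:

from typing import Optional

def infer_batch_size(batch_fields: dict) -> Optional[int]:
    # Collect the first dimension of every field that actually has a shape.
    sizes = {
        value.shape[0]
        for value in batch_fields.values()
        if value is not None and getattr(value, "shape", ())
    }
    if not sizes:
        return None  # no batched fields (e.g. task labels are None)
    assert len(sizes) == 1, f"Fields disagree on the batch size: {sizes}"
    return sizes.pop()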