Example #1
    def sample_batch(self, learner_device: str) -> StructuredSpacesRecord:
        """Sample a batch from the buffer and return it as a batched structured spaces record.

        :param learner_device: The device of the learner (cpu or cuda).
        :return: A batched structured spaces record object holding the batched rollouts.
        """
        batch = self.replay_buffer.sample_batch(n_samples=self.batch_size, learner_device=learner_device)
        if self.split_rollouts_into_transitions:
            # Stack records into one, then add an additional dimension
            stacked_records = StructuredSpacesRecord.stack_records(batch)
            return StructuredSpacesRecord.stack_records([stacked_records]).to_torch(learner_device)
        else:
            # Stack trajectories in time major, then stack into a single spaces record
            return SpacesTrajectoryRecord.stack_trajectories(batch).stack().to_torch(learner_device)
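To make the first branch concrete, here is a minimal sketch of the double stack applied to single-transition records (the trajectory branch is sketched under Example #9). The record classes are the ones used throughout these examples; the module paths in the imports and the mock values are assumptions, not part of the original code.

import numpy as np

# Module paths assumed from the maze-rl package layout.
from maze.core.env.structured_env import ActorID
from maze.core.trajectory_recording.records.spaces_record import SpacesRecord
from maze.core.trajectory_recording.records.structured_spaces_record import StructuredSpacesRecord


def _transition(value: int) -> StructuredSpacesRecord:
    """Single-sub-step record standing in for one sampled transition."""
    return StructuredSpacesRecord(substep_records=[
        SpacesRecord(actor_id=ActorID(0, 0),
                     observation=dict(observation=np.array([value])),
                     action=dict(action=np.array([value])),
                     reward=float(value),
                     done=False)
    ])


batch = [_transition(v) for v in range(4)]

# First stack: batch the sampled transitions into one record.
stacked = StructuredSpacesRecord.stack_records(batch)
# Second stack: wrap in a single-element list to prepend the extra (time-like)
# dimension, then move everything to the learner device.
batched = StructuredSpacesRecord.stack_records([stacked]).to_torch("cpu")

Example #2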
def test_record_stacking():
    r1 = _mock_structured_spaces_record(1)
    r2 = _mock_structured_spaces_record(2)
    r3 = _mock_structured_spaces_record(3, done=True)

    stacked = StructuredSpacesRecord.stack_records([r1, r2, r3])

    # Check that the observations are stacked as expected

    expected_observations = {
        0:
        dict(x=np.array([[10, 10], [20, 20], [30, 30]]),
             y=np.array([[10, 10], [20, 20], [30, 30]])),
        1:
        dict(z=np.array([[11], [21], [31]]))
    }

    for step_key in [0, 1]:
        for obs_key, exp_value in expected_observations[step_key].items():
            assert np.all(
                stacked.observations_dict[step_key][obs_key] == exp_value)

    # Check a couple of other values

    assert np.all(stacked.rewards_dict[0] == [1, 2, 3])
    assert np.all(stacked.dones_dict[1] == [False, False, True])
    assert stacked.actions_dict[0]["action"].shape == (3, 2)
Example #3
    def step(
        self, action: ActionType
    ) -> Tuple[ObservationType, Any, bool, Dict[Any, Any]]:
        """Record available step-level data."""
        assert self.episode_record is not None, "Environment must be reset before stepping."

        # If the env time changed, start a new structured step record
        if self.env.get_env_time() != self.last_env_time:
            self.episode_record.step_records.append(StructuredSpacesRecord())
            self.last_env_time = self.env.get_env_time()

        # Get the actor ID before the step, so it corresponds to the action taken
        actor_id = self.env.actor_id()
        observation, reward, done, info = self.env.step(action)

        # Record the spaces of the current (sub)step
        self.episode_record.step_records[-1].append(
            SpacesRecord(actor_id=actor_id,
                         observation=self.last_observation,
                         action=action,
                         reward=reward,
                         done=done,
                         info=info))

        self.last_observation = observation
        return observation, reward, done, info
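The wrapper groups sub-step records by env time: a new structured step record is opened whenever the env time advances, and every sub-step taken at the same env time is appended to that record. A minimal sketch of just that grouping, with a hand-written (env_time, actor_id) sequence and the imports from the sketch under Example #1:

# Fake (env_time, actor_id) sequence: two sub-steps at env time 0, one at env time 1.
sub_steps = [(0, ActorID(0, 0)), (0, ActorID(1, 0)), (1, ActorID(0, 0))]

step_records = []
last_env_time = None
for env_time, actor_id in sub_steps:
    # A new structured step record is opened whenever the env time changes.
    if env_time != last_env_time:
        step_records.append(StructuredSpacesRecord())
        last_env_time = env_time
    # Each sub-step's spaces are appended to the record of the current env time.
    step_records[-1].append(SpacesRecord(actor_id=actor_id, observation={}, action={}))

# Two sub-steps share env time 0, the third opens a new record at env time 1.
assert [len(r.substep_records) for r in step_records] == [2, 1]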
Example #4
    def convert_trajectory_with_env(trajectory: TrajectoryRecord, conversion_env: Optional[MazeEnv]) \
            -> List[StructuredSpacesRecord]:
        """Convert an episode trajectory record into an array of observations and actions using the given env.

        :param trajectory: Episode record to load
        :param conversion_env: Env to use for conversion of MazeStates and MazeActions into observations and actions.
                               Required only if state records are being loaded (i.e. conversion to raw actions and
                               observations is needed).
        :return: List of structured spaces records, one per converted step. (For non-structured
                 scenarios, each record will hold just a single sub-step entry.)
        """
        step_records = []

        for step_id, step_record in enumerate(trajectory.step_records):

            # Process and convert in case we are dealing with state records (otherwise no conversion needed)
            if isinstance(step_record, StateRecord):
                assert conversion_env is not None, "A conversion env is required when converting " \
                                                   "from Maze states."

                # Drop incomplete records (e.g. at the end of episode)
                if step_record.maze_state is None or step_record.maze_action is None:
                    continue
                # Convert to spaces
                step_record = StructuredSpacesRecord.converted_from(step_record, conversion_env=conversion_env,
                                                                    first_step_in_episode=step_id == 0)

            step_records.append(step_record)

        return step_records
Example #5
def _mock_space_record(value: int):
    substep_record = SpacesRecord(
        actor_id=ActorID(0, 0),
        observation=dict(observation=np.array(value)),
        action=dict(action=np.array(value)),
        reward=value,
        done=value > 0)

    return StructuredSpacesRecord(substep_records=[substep_record])
Example #6
def _mock_structured_spaces_record(step_no: int, done: bool = False):
    return StructuredSpacesRecord(substep_records=[
        _mock_spaces_record(actor_id=ActorID(0, 0),
                            keys=["x", "y"],
                            value=[step_no * 10, step_no * 10],
                            reward=step_no),
        _mock_spaces_record(actor_id=ActorID(1, 0),
                            keys=["z"],
                            value=[step_no * 10 + 1],
                            reward=step_no,
                            done=done),
    ])
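The helper `_mock_spaces_record` called above is not included in these snippets. The following is a hypothetical stand-in, reverse-engineered from this call site and from the values asserted in Example #2 (each observation key receives the full value array, and a single "action" entry mirrors it); it is not the original helper, and it assumes numpy and SpacesRecord are imported as in the surrounding test module.

def _mock_spaces_record(actor_id, keys, value, reward, done=False):
    # Hypothetical stand-in for the missing helper -- not the original definition.
    return SpacesRecord(
        actor_id=actor_id,
        observation={key: np.array(value) for key in keys},
        action=dict(action=np.array(value)),
        reward=reward,
        done=done)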
Example #7
def _mock_spaces_trajectory_record(step_count: int):
    """Produce an episode record with maze_states and maze_actions corresponding to the step no."""
    episode_record = SpacesTrajectoryRecord("test")

    for i in range(step_count):
        substep_record = SpacesRecord(
            actor_id=ActorID(0, 0),
            observation=dict(observation=np.array(i)),
            action=dict(action=np.array(i)),
            reward=0,
            done=i == step_count - 1)
        episode_record.step_records.append(
            StructuredSpacesRecord(substep_records=[substep_record]))

    return episode_record
Example #8
    def stack_trajectories(
        cls, trajectories: List['SpacesTrajectoryRecord']
    ) -> 'SpacesTrajectoryRecord':
        """Stack multiple trajectories, keeping the time dimension intact.

        All the trajectories should be of the same length. The resulting trajectory will have the same number of steps,
        each being a stack of the corresponding steps of the input trajectories.

        :param trajectories: Trajectories to stack.
        :return: Trajectory record of the same length, consisting of stacked structured spaces records.
        """
        assert len(set(len(t) for t in trajectories)) == 1, \
            "all trajectories must have the same length"

        stacked_trajectory = SpacesTrajectoryRecord(
            id=np.stack([trajectory.id for trajectory in trajectories]))
        step_records_in_time = list(
            zip(*[t.step_records for t in trajectories]))
        stacked_trajectory.step_records = [
            StructuredSpacesRecord.stack_records(list(recs))
            for recs in step_records_in_time
        ]
        return stacked_trajectory
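A brief usage sketch, built on the mock trajectory factory from Example #7; the SpacesTrajectoryRecord import path is an assumption.

from maze.core.trajectory_recording.records.trajectory_record import SpacesTrajectoryRecord  # path assumed

t1 = _mock_spaces_trajectory_record(step_count=3)  # factory from Example #7
t2 = _mock_spaces_trajectory_record(step_count=3)

stacked = SpacesTrajectoryRecord.stack_trajectories([t1, t2])

# The time dimension is intact: still 3 steps, each stacking the corresponding
# sub-step records of both input trajectories.
assert len(stacked.step_records) == 3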
Example #9
    def stack(self) -> StructuredSpacesRecord:
        """Stack the whole trajectory into a single structured spaces record.

        Useful for processing whole fixed-length trajectories in a single batch.
        """
        return StructuredSpacesRecord.stack_records(self.step_records)
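Chained with Example #8, this yields the trajectory branch of Example #1: stack equal-length trajectories time-major, then collapse the steps into a single record. A short sketch, reusing the factory and imports from the previous sketches:

trajectories = [_mock_spaces_trajectory_record(step_count=3) for _ in range(2)]

# One record batching all steps of all trajectories; .to_torch(device) would follow in training code.
batched = SpacesTrajectoryRecord.stack_trajectories(trajectories).stack()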
Example #10
    def _update(self) -> None:
        """Perform ppo policy update.
        """

        # collect observations
        record = self._rollout()

        # iterate ppo optimization epochs
        critic_train_stats = defaultdict(lambda: defaultdict(list))
        policy_train_stats = defaultdict(lambda: defaultdict(list))
        n_samples = self.rollout_generator.env.n_envs * self.algorithm_config.n_rollout_steps

        flat_record = copy.deepcopy(record)
        self._flatten_sub_step_items(flat_record.actions)
        self._flatten_sub_step_items(flat_record.observations)

        for k in range(self.algorithm_config.n_optimization_epochs):
            # compute action log-probabilities of actions taken (aka old action log probs)
            with torch.no_grad():
                policy_output_old, critic_output_old = self.model.compute_actor_critic_output(
                    record)
                returns = self.model.critic.compute_structured_return(
                    gamma=self.algorithm_config.gamma,
                    gae_lambda=self.algorithm_config.gae_lambda,
                    rewards=record.rewards,
                    values=critic_output_old.detached_values,
                    dones=record.dones[-1])
                action_log_probs_old = policy_output_old.log_probs_for_actions(
                    record.actions)
                # manually empty GPU cache
                torch.cuda.empty_cache()

            # flatten items for batch processing
            returns = [r.flatten() for r in returns]
            self._flatten_sub_step_items(action_log_probs_old)
            critic_output_old.reshape(returns[0].shape)

            # iterate mini-batch updates
            indices = np.random.permutation(n_samples)
            n_batches = int(
                np.ceil(float(n_samples) / self.algorithm_config.batch_size))
            for i_batch in range(n_batches):
                # manually empty GPU cache
                torch.cuda.empty_cache()

                # sample batch indices
                i0 = i_batch * self.algorithm_config.batch_size
                i1 = i0 + self.algorithm_config.batch_size
                batch_idxs = indices[i0:i1]

                # get batch data into a new spaces record
                batch_record = StructuredSpacesRecord()
                for substep_record in flat_record.substep_records:
                    batch_substep_record = SpacesRecord(
                        actor_id=substep_record.actor_id,
                        action={},
                        observation={})

                    # observations
                    for key, value in substep_record.observation.items():
                        batch_substep_record.observation[key] = value[
                            batch_idxs]

                    # actions
                    for key, value in substep_record.action.items():
                        batch_substep_record.action[key] = value[batch_idxs]

                    batch_record.append(batch_substep_record)

                # Produce policy and critic output
                policy_output, critic_output = self.model.compute_actor_critic_output(
                    batch_record)

                # Compute action log probabilities with the original actions
                action_log_probs = policy_output.log_probs_for_actions(
                    batch_record.actions)

                # compute advantages
                advantages = [
                    r[batch_idxs] - dv[batch_idxs] for r, dv in zip(
                        returns, critic_output_old.detached_values)
                ]

                # normalize advantages
                advantages = self._normalize_advantages(advantages)

                # compute value loss
                if self.model.critic.num_critics == 1:
                    value_losses = [(returns[0][batch_idxs] -
                                     critic_output.values[0]).pow(2).mean()]
                else:
                    value_losses = [
                        (ret[batch_idxs] - val).pow(2).mean()
                        for ret, val in zip(returns, critic_output.values)
                    ]

                # compute policy loss
                policy_losses = list()
                entropies = list()
                for idx, substep_record in enumerate(
                        batch_record.substep_records):

                    # compute entropies
                    entropies.append(policy_output[idx].entropy.mean())

                    # accumulate independent action losses
                    step_policy_loss = torch.tensor(0.0).to(
                        self.algorithm_config.device)
                    for key in substep_record.action.keys():

                        # get relevant log probs
                        log_probs = action_log_probs[idx][key]
                        old_log_probs = action_log_probs_old[idx][key][
                            batch_idxs]

                        # prepare advantages
                        action_advantages = advantages[idx].detach()
                        while action_advantages.ndim < action_log_probs[idx][
                                key].ndimension():
                            action_advantages = action_advantages.unsqueeze(
                                dim=-1)

                        # compute surrogate objective
                        ratio = torch.exp(log_probs - old_log_probs)
                        surr1 = ratio * action_advantages
                        surr2 = torch.clamp(
                            ratio, 1.0 - self.algorithm_config.clip_range,
                            1.0 + self.algorithm_config.clip_range
                        ) * action_advantages
                        action_loss = -torch.min(surr1, surr2).mean()
                        step_policy_loss += action_loss

                    policy_losses.append(step_policy_loss)

                # perform gradient step
                self._gradient_step(policy_losses=policy_losses,
                                    entropies=entropies,
                                    value_losses=value_losses)

                # append training stats for logging
                self._append_train_stats(policy_train_stats,
                                         critic_train_stats, record.actor_ids,
                                         policy_losses, entropies,
                                         critic_output_old.detached_values,
                                         value_losses)

        # fire logging events
        self._log_train_stats(policy_train_stats, critic_train_stats)
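The mini-batch loop above reduces to slicing a random permutation of sample indices into consecutive chunks. A stripped-down NumPy sketch of just that bookkeeping (sample count and batch size are illustrative):

import numpy as np

n_samples, batch_size = 10, 4

indices = np.random.permutation(n_samples)
n_batches = int(np.ceil(float(n_samples) / batch_size))

batches = []
for i_batch in range(n_batches):
    i0 = i_batch * batch_size
    batches.append(indices[i0:i0 + batch_size])  # the last chunk may be smaller

# Every sample index appears exactly once across the mini-batches.
assert sorted(np.concatenate(batches)) == list(range(n_samples))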
Example #11
    def rollout(self, policy: Policy, n_steps: Optional[int], trajectory_id: Optional[Any] = None) \
            -> SpacesTrajectoryRecord:
        """Perform and record a rollout with given policy, for given steps or until done.

        Note that the env is only reset on the very first rollout with this generator; subsequent rollouts
        just pick up where the previous one left off. If required, you can avoid the initial reset by assigning
        the last observation (which will be recorded with the first step) to `self.last_observation`.

        :param policy: Policy to roll out.
        :param n_steps: How many steps to perform. If None, rollouts are performed until done=True.
        :param trajectory_id: Optionally, the ID of the trajectory that we are recording.
        :return: Recorded trajectory.
        """
        # Check: Logits can be recorded with torch policy only
        if self.record_logits:
            assert isinstance(
                policy, TorchPolicy
            ), "to collect logits, the policy needs to be a Torch policy"

        # Initialize a trajectory record
        trajectory_record = SpacesTrajectoryRecord(
            trajectory_id if trajectory_id else self.rollout_counter)
        self.rollout_counter += 1

        # Reset the environment during the first rollout only
        if self.last_observation is None:
            self.last_observation = self.env.reset()

        # Step the desired number of (flat) steps
        step_count = 0
        while True:
            step_record = StructuredSpacesRecord()

            # Step through all sub-steps, i.e., step until the env time changes
            current_env_time = self.env.get_env_time()
            while np.all(current_env_time == self.env.get_env_time()):
                record = self._record_sub_step(policy=policy)
                step_record.append(record)
                # note that this also handles the special case of a done env after the first step
                if np.all(record.done):
                    break

            if self.record_step_stats:
                step_record.step_stats = self.env.get_stats(
                    LogStatsLevel.STEP).last_stats

            if self.record_episode_stats and not self.is_vectorized and step_record.is_done():
                step_record.episode_stats = self.env.get_stats(
                    LogStatsLevel.EPISODE).last_stats

            # Redistribute actor rewards, if available
            actor_rewards = self.env.get_actor_rewards()
            if actor_rewards is not None:
                assert len(actor_rewards) == len(step_record.substep_records)
                for substep_record, reward in zip(step_record.substep_records,
                                                  actor_rewards):
                    substep_record.reward = reward

            trajectory_record.append(step_record)

            # Limit maximum number of steps
            step_count += 1
            if n_steps and step_count >= n_steps:
                break

            # End prematurely on env done if desired
            if self.terminate_on_done and not self.is_vectorized and step_record.is_done():
                break

        return trajectory_record
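A hedged usage sketch: the class that owns this rollout() method and the concrete policy are not shown in the snippet, so both arguments below are assumptions; only the attributes read from the returned trajectory are taken from the examples above.

def collect_rewards(generator, policy, n_steps: int = 8) -> float:
    """Roll out with a generator exposing the rollout() method above and sum all sub-step rewards.

    Both `generator` and `policy` are hypothetical placeholders for objects not shown in the snippet.
    """
    trajectory = generator.rollout(policy, n_steps=n_steps)
    total = 0.0
    for step_record in trajectory.step_records:
        for substep in step_record.substep_records:
            total += substep.reward
    return total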