def _process_trajectory(self, trajectory: Trajectory) -> None:
    """
    Takes a trajectory and processes it, putting it into the replay buffer.
    """
    super()._process_trajectory(trajectory)
    last_step = trajectory.steps[-1]
    agent_id = trajectory.agent_id  # All the agents should have the same ID

    agent_buffer_trajectory = trajectory.to_agentbuffer()

    # Update the normalization
    if self.is_training:
        self.policy.update_normalization(agent_buffer_trajectory["vector_obs"])

    # Evaluate all reward functions for reporting purposes
    self.collected_rewards["environment"][agent_id] += np.sum(
        agent_buffer_trajectory["environment_rewards"]
    )
    for name, reward_signal in self.optimizer.reward_signals.items():
        if isinstance(reward_signal, RewardSignal):
            evaluate_result = reward_signal.evaluate_batch(
                agent_buffer_trajectory
            ).scaled_reward
        else:
            evaluate_result = (
                reward_signal.evaluate(agent_buffer_trajectory)
                * reward_signal.strength
            )
        # Report the reward signals
        self.collected_rewards[name][agent_id] += np.sum(evaluate_result)

    # Get all value estimates for reporting purposes
    value_estimates, _ = self.optimizer.get_trajectory_value_estimates(
        agent_buffer_trajectory, trajectory.next_obs, trajectory.done_reached
    )
    for name, v in value_estimates.items():
        if isinstance(self.optimizer.reward_signals[name], RewardSignal):
            self._stats_reporter.add_stat(
                self.optimizer.reward_signals[name].value_name, np.mean(v)
            )
        else:
            self._stats_reporter.add_stat(
                f"Policy/{self.optimizer.reward_signals[name].name.capitalize()} Value",
                np.mean(v),
            )

    # Bootstrap using the last step rather than the bootstrap step if max step is reached.
    # Set last element to duplicate obs and remove dones.
    if last_step.interrupted:
        vec_vis_obs = SplitObservations.from_observations(last_step.obs)
        for i, obs in enumerate(vec_vis_obs.visual_observations):
            agent_buffer_trajectory["next_visual_obs%d" % i][-1] = obs
        if vec_vis_obs.vector_observations.size > 1:
            agent_buffer_trajectory["next_vector_in"][-1] = (
                vec_vis_obs.vector_observations
            )
        agent_buffer_trajectory["done"][-1] = False

    # Append to update buffer
    agent_buffer_trajectory.resequence_and_append(
        self.update_buffer, training_length=self.policy.sequence_length
    )

    if trajectory.done_reached:
        self._update_end_episode_stats(agent_id, self.optimizer)
def _process_trajectory(self, trajectory: Trajectory) -> None:
    """
    Takes a trajectory and processes it, putting it into the replay buffer.
    """
    super()._process_trajectory(trajectory)
    last_step = trajectory.steps[-1]
    agent_id = trajectory.agent_id  # All the agents should have the same ID

    agent_buffer_trajectory = trajectory.to_agentbuffer()
    # Check if we used group rewards, warn if so.
    self._warn_if_group_reward(agent_buffer_trajectory)

    # Update the normalization
    if self.is_training:
        self.policy.update_normalization(agent_buffer_trajectory)

    # Evaluate all reward functions for reporting purposes
    self.collected_rewards["environment"][agent_id] += np.sum(
        agent_buffer_trajectory[BufferKey.ENVIRONMENT_REWARDS]
    )
    for name, reward_signal in self.optimizer.reward_signals.items():
        evaluate_result = (
            reward_signal.evaluate(agent_buffer_trajectory) * reward_signal.strength
        )
        # Report the reward signals
        self.collected_rewards[name][agent_id] += np.sum(evaluate_result)

    # Get all value estimates for reporting purposes
    (
        value_estimates,
        _,
        value_memories,
    ) = self.optimizer.get_trajectory_value_estimates(
        agent_buffer_trajectory, trajectory.next_obs, trajectory.done_reached
    )
    if value_memories is not None:
        agent_buffer_trajectory[BufferKey.CRITIC_MEMORY].set(value_memories)

    for name, v in value_estimates.items():
        self._stats_reporter.add_stat(
            f"Policy/{self.optimizer.reward_signals[name].name.capitalize()} Value",
            np.mean(v),
        )

    # Bootstrap using the last step rather than the bootstrap step if max step is reached.
    # Set last element to duplicate obs and remove dones.
    if last_step.interrupted:
        last_step_obs = last_step.obs
        for i, obs in enumerate(last_step_obs):
            agent_buffer_trajectory[ObsUtil.get_name_at_next(i)][-1] = obs
        agent_buffer_trajectory[BufferKey.DONE][-1] = False

    # Append to update buffer
    agent_buffer_trajectory.resequence_and_append(
        self.update_buffer, training_length=self.policy.sequence_length
    )

    if trajectory.done_reached:
        self._update_end_episode_stats(agent_id, self.optimizer)
def make_fake_trajectory(
    length: int,
    max_step_complete: bool = False,
    vec_obs_size: int = VEC_OBS_SIZE,
    num_vis_obs: int = 1,
    action_space: int = ACTION_SIZE,
) -> Trajectory:
    """
    Makes a fake trajectory of length length. If max_step_complete,
    the trajectory is terminated by a max step rather than a done.
    """
    steps_list = []
    for _i in range(length - 1):
        obs = []
        for _j in range(num_vis_obs):
            obs.append(np.ones((84, 84, 3), dtype=np.float32))
        obs.append(np.ones(vec_obs_size, dtype=np.float32))
        reward = 1.0
        done = False
        action = np.zeros(action_space, dtype=np.float32)
        action_probs = np.ones(action_space, dtype=np.float32)
        action_pre = np.zeros(action_space, dtype=np.float32)
        action_mask = np.ones(action_space, dtype=np.float32)
        prev_action = np.ones(action_space, dtype=np.float32)
        max_step = False
        memory = np.ones(10, dtype=np.float32)
        agent_id = "test_agent"
        behavior_id = "test_brain"
        experience = AgentExperience(
            obs=obs,
            reward=reward,
            done=done,
            action=action,
            action_probs=action_probs,
            action_pre=action_pre,
            action_mask=action_mask,
            prev_action=prev_action,
            max_step=max_step,
            memory=memory,
        )
        steps_list.append(experience)
    last_experience = AgentExperience(
        obs=obs,
        reward=reward,
        done=not max_step_complete,
        action=action,
        action_probs=action_probs,
        action_pre=action_pre,
        action_mask=action_mask,
        prev_action=prev_action,
        max_step=max_step_complete,
        memory=memory,
    )
    steps_list.append(last_experience)
    return Trajectory(
        steps=steps_list, agent_id=agent_id, behavior_id=behavior_id, next_obs=obs
    )
def make_fake_trajectory(
    length: int,
    observation_specs: List[ObservationSpec],
    action_spec: ActionSpec,
    max_step_complete: bool = False,
    memory_size: int = 10,
    num_other_agents_in_group: int = 0,
) -> Trajectory:
    """
    Makes a fake trajectory of length length. If max_step_complete,
    the trajectory is terminated by a max step rather than a done.
    """
    steps_list = []
    action_size = action_spec.discrete_size + action_spec.continuous_size
    for _i in range(length - 1):
        obs = []
        for obs_spec in observation_specs:
            obs.append(np.ones(obs_spec.shape, dtype=np.float32))
        reward = 1.0
        done = False
        action = ActionTuple(
            continuous=np.zeros(action_spec.continuous_size, dtype=np.float32),
            discrete=np.zeros(action_spec.discrete_size, dtype=np.int32),
        )
        action_probs = LogProbsTuple(
            continuous=np.ones(action_spec.continuous_size, dtype=np.float32),
            discrete=np.ones(action_spec.discrete_size, dtype=np.float32),
        )
        action_mask = (
            [
                [False for _ in range(branch)]
                for branch in action_spec.discrete_branches
            ]  # type: ignore
            if action_spec.is_discrete()
            else None
        )
        if action_spec.is_discrete():
            prev_action = np.ones(action_size, dtype=np.int32)
        else:
            prev_action = np.ones(action_size, dtype=np.float32)
        max_step = False
        memory = np.ones(memory_size, dtype=np.float32)
        agent_id = "test_agent"
        behavior_id = "test_brain"
        group_status = []
        for _ in range(num_other_agents_in_group):
            group_status.append(AgentStatus(obs, reward, action, done))
        experience = AgentExperience(
            obs=obs,
            reward=reward,
            done=done,
            action=action,
            action_probs=action_probs,
            action_mask=action_mask,
            prev_action=prev_action,
            interrupted=max_step,
            memory=memory,
            group_status=group_status,
            group_reward=0,
        )
        steps_list.append(experience)
    obs = []
    for obs_spec in observation_specs:
        obs.append(np.ones(obs_spec.shape, dtype=np.float32))
    last_experience = AgentExperience(
        obs=obs,
        reward=reward,
        done=not max_step_complete,
        action=action,
        action_probs=action_probs,
        action_mask=action_mask,
        prev_action=prev_action,
        interrupted=max_step_complete,
        memory=memory,
        group_status=group_status,
        group_reward=0,
    )
    steps_list.append(last_experience)
    return Trajectory(
        steps=steps_list,
        agent_id=agent_id,
        behavior_id=behavior_id,
        next_obs=obs,
        next_group_obs=[obs] * num_other_agents_in_group,
    )
def make_fake_trajectory(
    length: int,
    max_step_complete: bool = False,
    vec_obs_size: int = 1,
    num_vis_obs: int = 1,
    action_space: List[int] = None,
    memory_size: int = 10,
    is_discrete: bool = True,
) -> Trajectory:
    """
    Makes a fake trajectory of length length. If max_step_complete,
    the trajectory is terminated by a max step rather than a done.
    """
    if action_space is None:
        action_space = [2]
    steps_list = []
    for _i in range(length - 1):
        obs = []
        for _j in range(num_vis_obs):
            obs.append(np.ones((84, 84, 3), dtype=np.float32))
        obs.append(np.ones(vec_obs_size, dtype=np.float32))
        reward = 1.0
        done = False
        if is_discrete:
            action_size = len(action_space)
            action_probs = np.ones(np.sum(action_space), dtype=np.float32)
        else:
            action_size = action_space[0]
            action_probs = np.ones((action_size), dtype=np.float32)
        action = np.zeros(action_size, dtype=np.float32)
        action_pre = np.zeros(action_size, dtype=np.float32)
        action_mask = (
            [[False for _ in range(branch)] for branch in action_space]
            if is_discrete
            else None
        )
        prev_action = np.ones(action_size, dtype=np.float32)
        max_step = False
        memory = np.ones(memory_size, dtype=np.float32)
        agent_id = "test_agent"
        behavior_id = "test_brain"
        experience = AgentExperience(
            obs=obs,
            reward=reward,
            done=done,
            action=action,
            action_probs=action_probs,
            action_pre=action_pre,
            action_mask=action_mask,
            prev_action=prev_action,
            max_step=max_step,
            memory=memory,
        )
        steps_list.append(experience)
    last_experience = AgentExperience(
        obs=obs,
        reward=reward,
        done=not max_step_complete,
        action=action,
        action_probs=action_probs,
        action_pre=action_pre,
        action_mask=action_mask,
        prev_action=prev_action,
        max_step=max_step_complete,
        memory=memory,
    )
    steps_list.append(last_experience)
    return Trajectory(
        steps=steps_list, agent_id=agent_id, behavior_id=behavior_id, next_obs=obs
    )
def _process_trajectory(self, trajectory: Trajectory) -> None:
    """
    Takes a trajectory and processes it, putting it into the update buffer.
    Processing involves calculating value and advantage targets for model updating step.
    :param trajectory: The Trajectory tuple containing the steps to be processed.
    """
    super()._process_trajectory(trajectory)
    agent_id = trajectory.agent_id  # All the agents should have the same ID

    agent_buffer_trajectory = trajectory.to_agentbuffer()
    # Update the normalization
    if self.is_training:
        self.policy.update_normalization(agent_buffer_trajectory["vector_obs"])

    # Get all value estimates
    value_estimates, value_next = self.optimizer.get_trajectory_value_estimates(
        agent_buffer_trajectory,
        trajectory.next_obs,
        trajectory.done_reached and not trajectory.interrupted,
    )
    for name, v in value_estimates.items():
        agent_buffer_trajectory[f"{name}_value_estimates"].extend(v)
        self._stats_reporter.add_stat(
            self.optimizer.reward_signals[name].value_name, np.mean(v)
        )

    # Evaluate all reward functions
    self.collected_rewards["environment"][agent_id] += np.sum(
        agent_buffer_trajectory["environment_rewards"]
    )
    for name, reward_signal in self.optimizer.reward_signals.items():
        evaluate_result = reward_signal.evaluate_batch(
            agent_buffer_trajectory
        ).scaled_reward
        agent_buffer_trajectory[f"{name}_rewards"].extend(evaluate_result)
        # Report the reward signals
        self.collected_rewards[name][agent_id] += np.sum(evaluate_result)

    # Compute GAE and returns
    tmp_advantages = []
    tmp_returns = []
    for name in self.optimizer.reward_signals:
        bootstrap_value = value_next[name]

        local_rewards = agent_buffer_trajectory[f"{name}_rewards"].get_batch()
        local_value_estimates = agent_buffer_trajectory[
            f"{name}_value_estimates"
        ].get_batch()

        local_advantage = get_gae(
            rewards=local_rewards,
            value_estimates=local_value_estimates,
            value_next=bootstrap_value,
            gamma=self.optimizer.reward_signals[name].gamma,
            lambd=self.hyperparameters.lambd,
        )
        local_return = local_advantage + local_value_estimates
        # This is later used as a target for the different value estimates
        agent_buffer_trajectory[f"{name}_returns"].set(local_return)
        agent_buffer_trajectory[f"{name}_advantage"].set(local_advantage)
        tmp_advantages.append(local_advantage)
        tmp_returns.append(local_return)

    # Get global advantages
    global_advantages = list(
        np.mean(np.array(tmp_advantages, dtype=np.float32), axis=0)
    )
    global_returns = list(np.mean(np.array(tmp_returns, dtype=np.float32), axis=0))
    agent_buffer_trajectory["advantages"].set(global_advantages)
    agent_buffer_trajectory["discounted_returns"].set(global_returns)

    # Append to update buffer
    agent_buffer_trajectory.resequence_and_append(
        self.update_buffer, training_length=self.policy.sequence_length
    )

    # If this was a terminal trajectory, append stats and reset reward collection
    if trajectory.done_reached:
        self._update_end_episode_stats(agent_id, self.optimizer)
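# Illustration only: the PPO trainers above call get_gae() to turn a trajectory's
# rewards and value estimates into advantages. The sketch below is a minimal,
# self-contained NumPy version of the generalized advantage estimation (GAE)
# recurrence, assuming flat float32 arrays; it is not the project's
# implementation, just the math it relies on:
# delta_t = r_t + gamma * V(s_{t+1}) - V(s_t), and the advantage is the
# discounted (gamma * lambd) backward sum of these deltas.
import numpy as np


def gae_sketch(
    rewards: np.ndarray,
    value_estimates: np.ndarray,
    value_next: float,
    gamma: float = 0.99,
    lambd: float = 0.95,
) -> np.ndarray:
    # Append the bootstrap value so V(s_{t+1}) is defined for the final step.
    values = np.append(value_estimates, value_next)
    deltas = rewards + gamma * values[1:] - values[:-1]
    advantages = np.zeros_like(deltas, dtype=np.float32)
    running = 0.0
    # Accumulate discounted deltas from the end of the trajectory backwards.
    for t in reversed(range(len(deltas))):
        running = deltas[t] + gamma * lambd * running
        advantages[t] = running
    return advantages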
def add_experiences(
    self,
    batched_step_result: BatchedStepResult,
    worker_id: int,
    previous_action: ActionInfo,
) -> None:
    """
    Adds experiences to each agent's experience history.
    :param batched_step_result: current BatchedStepResult.
    :param previous_action: The outputs of the Policy's get_action method.
    """
    take_action_outputs = previous_action.outputs
    if take_action_outputs:
        for _entropy in take_action_outputs["entropy"]:
            self.stats_reporter.add_stat("Policy/Entropy", _entropy)
        self.stats_reporter.add_stat(
            "Policy/Learning Rate", take_action_outputs["learning_rate"]
        )

    # Make unique agent_ids that are global across workers
    action_global_agent_ids = [
        get_global_agent_id(worker_id, ag_id) for ag_id in previous_action.agent_ids
    ]
    for global_id in action_global_agent_ids:
        self.last_take_action_outputs[global_id] = take_action_outputs

    for _id in batched_step_result.agent_id:  # Assume agent_id is 1-D
        local_id = int(_id)  # Needed for mypy to pass since ndarray has no content type
        curr_agent_step = batched_step_result.get_agent_step_result(local_id)
        global_id = get_global_agent_id(worker_id, local_id)
        stored_step = self.last_step_result.get(global_id, None)
        stored_take_action_outputs = self.last_take_action_outputs.get(global_id, None)
        if stored_step is not None and stored_take_action_outputs is not None:
            # We know the step is from the same worker, so use the local agent id.
            stored_agent_step = stored_step.get_agent_step_result(local_id)
            idx = stored_step.agent_id_to_index[local_id]
            obs = stored_agent_step.obs
            if not stored_agent_step.done:
                if self.policy.use_recurrent:
                    memory = self.policy.retrieve_memories([global_id])[0, :]
                else:
                    memory = None

                done = curr_agent_step.done
                max_step = curr_agent_step.max_step

                # Add the outputs of the last eval
                action = stored_take_action_outputs["action"][idx]
                if self.policy.use_continuous_act:
                    action_pre = stored_take_action_outputs["pre_action"][idx]
                else:
                    action_pre = None
                action_probs = stored_take_action_outputs["log_probs"][idx]
                action_mask = stored_agent_step.action_mask
                prev_action = self.policy.retrieve_previous_action([global_id])[0, :]

                experience = AgentExperience(
                    obs=obs,
                    reward=curr_agent_step.reward,
                    done=done,
                    action=action,
                    action_probs=action_probs,
                    action_pre=action_pre,
                    action_mask=action_mask,
                    prev_action=prev_action,
                    max_step=max_step,
                    memory=memory,
                )
                # Add the value outputs if needed
                self.experience_buffers[global_id].append(experience)
                self.episode_rewards[global_id] += curr_agent_step.reward
            if (
                curr_agent_step.done
                or (
                    len(self.experience_buffers[global_id])
                    >= self.max_trajectory_length
                )
            ) and len(self.experience_buffers[global_id]) > 0:
                # Make next AgentExperience
                next_obs = curr_agent_step.obs
                trajectory = Trajectory(
                    steps=self.experience_buffers[global_id],
                    agent_id=global_id,
                    next_obs=next_obs,
                    behavior_id=self.behavior_id,
                )
                for traj_queue in self.trajectory_queues:
                    traj_queue.put(trajectory)
                self.experience_buffers[global_id] = []
                if curr_agent_step.done:
                    self.stats_reporter.add_stat(
                        "Environment/Cumulative Reward",
                        self.episode_rewards.get(global_id, 0),
                    )
                    self.stats_reporter.add_stat(
                        "Environment/Episode Length",
                        self.episode_steps.get(global_id, 0),
                    )
                    del self.episode_steps[global_id]
                    del self.episode_rewards[global_id]
            elif not curr_agent_step.done:
                self.episode_steps[global_id] += 1

        self.last_step_result[global_id] = batched_step_result

    if "action" in take_action_outputs:
        self.policy.save_previous_action(
            previous_action.agent_ids, take_action_outputs["action"]
        )
def _process_step(
    self, step: Union[TerminalStep, DecisionStep], worker_id: int, index: int
) -> None:
    terminated = isinstance(step, TerminalStep)
    global_agent_id = get_global_agent_id(worker_id, step.agent_id)
    global_group_id = get_global_group_id(worker_id, step.group_id)
    stored_decision_step, idx = self._last_step_result.get(
        global_agent_id, (None, None)
    )
    stored_take_action_outputs = self._last_take_action_outputs.get(
        global_agent_id, None
    )
    if not terminated:
        # Index is needed to grab from last_take_action_outputs
        self._last_step_result[global_agent_id] = (step, index)

    # This state is the consequence of a past action
    if stored_decision_step is not None and stored_take_action_outputs is not None:
        obs = stored_decision_step.obs
        if self.policy.use_recurrent:
            memory = self.policy.retrieve_previous_memories([global_agent_id])[0, :]
        else:
            memory = None
        done = terminated  # Since this is an ongoing step
        interrupted = step.interrupted if terminated else False
        # Add the outputs of the last eval
        stored_actions = stored_take_action_outputs["action"]
        action_tuple = ActionTuple(
            continuous=stored_actions.continuous[idx],
            discrete=stored_actions.discrete[idx],
        )
        stored_action_probs = stored_take_action_outputs["log_probs"]
        log_probs_tuple = LogProbsTuple(
            continuous=stored_action_probs.continuous[idx],
            discrete=stored_action_probs.discrete[idx],
        )
        action_mask = stored_decision_step.action_mask
        prev_action = self.policy.retrieve_previous_action([global_agent_id])[0, :]

        # Assemble teammate_obs. If none saved, then it will be an empty list.
        group_statuses = []
        for _id, _mate_status in self._group_status[global_group_id].items():
            if _id != global_agent_id:
                group_statuses.append(_mate_status)

        experience = AgentExperience(
            obs=obs,
            reward=step.reward,
            done=done,
            action=action_tuple,
            action_probs=log_probs_tuple,
            action_mask=action_mask,
            prev_action=prev_action,
            interrupted=interrupted,
            memory=memory,
            group_status=group_statuses,
            group_reward=step.group_reward,
        )
        # Add the value outputs if needed
        self._experience_buffers[global_agent_id].append(experience)
        self._episode_rewards[global_agent_id] += step.reward
        if not terminated:
            self._episode_steps[global_agent_id] += 1

        # Add a trajectory segment to the buffer if terminal or the length has reached the time horizon
        if (
            len(self._experience_buffers[global_agent_id])
            >= self._max_trajectory_length
            or terminated
        ):
            next_obs = step.obs
            next_group_obs = []
            for _id, _obs in self._current_group_obs[global_group_id].items():
                if _id != global_agent_id:
                    next_group_obs.append(_obs)

            trajectory = Trajectory(
                steps=self._experience_buffers[global_agent_id],
                agent_id=global_agent_id,
                next_obs=next_obs,
                next_group_obs=next_group_obs,
                behavior_id=self._behavior_id,
            )
            for traj_queue in self._trajectory_queues:
                traj_queue.put(trajectory)
            self._experience_buffers[global_agent_id] = []
        if terminated:
            # Record episode length.
            self._stats_reporter.add_stat(
                "Environment/Episode Length",
                self._episode_steps.get(global_agent_id, 0),
            )
            self._clean_agent_data(global_agent_id)
def add_experiences(
    self,
    curr_info: BrainInfo,
    next_info: BrainInfo,
    take_action_outputs: ActionInfoOutputs,
) -> None:
    """
    Adds experiences to each agent's experience history.
    :param curr_info: current BrainInfo.
    :param next_info: next BrainInfo.
    :param take_action_outputs: The outputs of the Policy's get_action method.
    """
    if take_action_outputs:
        self.stats_reporter.add_stat(
            "Policy/Entropy", take_action_outputs["entropy"].mean()
        )
        self.stats_reporter.add_stat(
            "Policy/Learning Rate", take_action_outputs["learning_rate"]
        )

    for agent_id in curr_info.agents:
        self.last_brain_info[agent_id] = curr_info
        self.last_take_action_outputs[agent_id] = take_action_outputs

    # Store the environment reward
    tmp_environment_reward = next_info.rewards

    for next_idx, agent_id in enumerate(next_info.agents):
        stored_info = self.last_brain_info.get(agent_id, None)
        if stored_info is not None:
            stored_take_action_outputs = self.last_take_action_outputs[agent_id]
            idx = stored_info.agents.index(agent_id)
            obs = []
            if not stored_info.local_done[idx]:
                for i, _ in enumerate(stored_info.visual_observations):
                    obs.append(stored_info.visual_observations[i][idx])
                if self.policy.use_vec_obs:
                    obs.append(stored_info.vector_observations[idx])
                if self.policy.use_recurrent:
                    memory = self.policy.retrieve_memories([agent_id])[0, :]
                else:
                    memory = None

                done = next_info.local_done[next_idx]
                max_step = next_info.max_reached[next_idx]

                # Add the outputs of the last eval
                action = stored_take_action_outputs["action"][idx]
                if self.policy.use_continuous_act:
                    action_pre = stored_take_action_outputs["pre_action"][idx]
                else:
                    action_pre = None
                action_probs = stored_take_action_outputs["log_probs"][idx]
                action_masks = stored_info.action_masks[idx]
                prev_action = self.policy.retrieve_previous_action([agent_id])[0, :]

                experience = AgentExperience(
                    obs=obs,
                    reward=tmp_environment_reward[next_idx],
                    done=done,
                    action=action,
                    action_probs=action_probs,
                    action_pre=action_pre,
                    action_mask=action_masks,
                    prev_action=prev_action,
                    max_step=max_step,
                    memory=memory,
                )
                # Add the value outputs if needed
                self.experience_buffers[agent_id].append(experience)
                self.episode_rewards[agent_id] += tmp_environment_reward[next_idx]
            if (
                next_info.local_done[next_idx]
                or (
                    len(self.experience_buffers[agent_id])
                    >= self.max_trajectory_length
                )
            ) and len(self.experience_buffers[agent_id]) > 0:
                # Make next AgentExperience
                next_obs = []
                for i, _ in enumerate(next_info.visual_observations):
                    next_obs.append(next_info.visual_observations[i][next_idx])
                if self.policy.use_vec_obs:
                    next_obs.append(next_info.vector_observations[next_idx])
                trajectory = Trajectory(
                    steps=self.experience_buffers[agent_id],
                    agent_id=agent_id,
                    next_obs=next_obs,
                    behavior_id=self.behavior_id,
                )
                # This will eventually be replaced with a queue
                self.trainer.process_trajectory(trajectory)
                self.experience_buffers[agent_id] = []
                if next_info.local_done[next_idx]:
                    self.stats_reporter.add_stat(
                        "Environment/Cumulative Reward",
                        self.episode_rewards.get(agent_id, 0),
                    )
                    self.stats_reporter.add_stat(
                        "Environment/Episode Length",
                        self.episode_steps.get(agent_id, 0),
                    )
                    del self.episode_steps[agent_id]
                    del self.episode_rewards[agent_id]
            elif not next_info.local_done[next_idx]:
                self.episode_steps[agent_id] += 1

    if "action" in take_action_outputs:
        self.policy.save_previous_action(
            curr_info.agents, take_action_outputs["action"]
        )
def make_fake_trajectory(
    length: int,
    observation_shapes: List[Tuple],
    action_spec: ActionSpec,
    max_step_complete: bool = False,
    memory_size: int = 10,
) -> Trajectory:
    """
    Makes a fake trajectory of length length. If max_step_complete,
    the trajectory is terminated by a max step rather than a done.
    """
    steps_list = []
    action_size = action_spec.discrete_size + action_spec.continuous_size
    action_probs = np.ones(
        int(np.sum(action_spec.discrete_branches) + action_spec.continuous_size),
        dtype=np.float32,
    )
    for _i in range(length - 1):
        obs = []
        for _shape in observation_shapes:
            obs.append(np.ones(_shape, dtype=np.float32))
        reward = 1.0
        done = False
        action = np.zeros(action_size, dtype=np.float32)
        action_pre = np.zeros(action_size, dtype=np.float32)
        action_mask = (
            [
                [False for _ in range(branch)]
                for branch in action_spec.discrete_branches
            ]  # type: ignore
            if action_spec.is_discrete()
            else None
        )
        prev_action = np.ones(action_size, dtype=np.float32)
        max_step = False
        memory = np.ones(memory_size, dtype=np.float32)
        agent_id = "test_agent"
        behavior_id = "test_brain"
        experience = AgentExperience(
            obs=obs,
            reward=reward,
            done=done,
            action=action,
            action_probs=action_probs,
            action_pre=action_pre,
            action_mask=action_mask,
            prev_action=prev_action,
            interrupted=max_step,
            memory=memory,
        )
        steps_list.append(experience)
    obs = []
    for _shape in observation_shapes:
        obs.append(np.ones(_shape, dtype=np.float32))
    last_experience = AgentExperience(
        obs=obs,
        reward=reward,
        done=not max_step_complete,
        action=action,
        action_probs=action_probs,
        action_pre=action_pre,
        action_mask=action_mask,
        prev_action=prev_action,
        interrupted=max_step_complete,
        memory=memory,
    )
    steps_list.append(last_experience)
    return Trajectory(
        steps=steps_list, agent_id=agent_id, behavior_id=behavior_id, next_obs=obs
    )
def _process_trajectory(self, trajectory: Trajectory) -> None:
    """
    Takes a trajectory and processes it, putting it into the update buffer.
    Processing involves calculating value and advantage targets for model updating step.
    :param trajectory: The Trajectory tuple containing the steps to be processed.
    """
    super()._process_trajectory(trajectory)
    agent_id = trajectory.agent_id  # All the agents should have the same ID

    agent_buffer_trajectory = trajectory.to_agentbuffer()
    # Update the normalization
    if self.is_training:
        self.policy.update_normalization(agent_buffer_trajectory)

    # Get all value estimates
    (
        value_estimates,
        baseline_estimates,
        value_next,
        value_memories,
        baseline_memories,
    ) = self.optimizer.get_trajectory_and_baseline_value_estimates(
        agent_buffer_trajectory,
        trajectory.next_obs,
        trajectory.next_group_obs,
        trajectory.all_group_dones_reached
        and trajectory.done_reached
        and not trajectory.interrupted,
    )

    if value_memories is not None and baseline_memories is not None:
        agent_buffer_trajectory[BufferKey.CRITIC_MEMORY].set(value_memories)
        agent_buffer_trajectory[BufferKey.BASELINE_MEMORY].set(baseline_memories)

    for name, v in value_estimates.items():
        agent_buffer_trajectory[RewardSignalUtil.value_estimates_key(name)].extend(v)
        agent_buffer_trajectory[RewardSignalUtil.baseline_estimates_key(name)].extend(
            baseline_estimates[name]
        )
        self._stats_reporter.add_stat(
            f"Policy/{self.optimizer.reward_signals[name].name.capitalize()} Baseline Estimate",
            np.mean(baseline_estimates[name]),
        )
        self._stats_reporter.add_stat(
            f"Policy/{self.optimizer.reward_signals[name].name.capitalize()} Value Estimate",
            np.mean(value_estimates[name]),
        )

    self.collected_rewards["environment"][agent_id] += np.sum(
        agent_buffer_trajectory[BufferKey.ENVIRONMENT_REWARDS]
    )
    self.collected_group_rewards[agent_id] += np.sum(
        agent_buffer_trajectory[BufferKey.GROUP_REWARD]
    )
    for name, reward_signal in self.optimizer.reward_signals.items():
        evaluate_result = (
            reward_signal.evaluate(agent_buffer_trajectory) * reward_signal.strength
        )
        agent_buffer_trajectory[RewardSignalUtil.rewards_key(name)].extend(
            evaluate_result
        )
        # Report the reward signals
        self.collected_rewards[name][agent_id] += np.sum(evaluate_result)

    # Compute lambda returns and advantage
    tmp_advantages = []
    for name in self.optimizer.reward_signals:
        local_rewards = np.array(
            agent_buffer_trajectory[RewardSignalUtil.rewards_key(name)].get_batch(),
            dtype=np.float32,
        )

        baseline_estimate = agent_buffer_trajectory[
            RewardSignalUtil.baseline_estimates_key(name)
        ].get_batch()
        v_estimates = agent_buffer_trajectory[
            RewardSignalUtil.value_estimates_key(name)
        ].get_batch()

        lambd_returns = lambda_return(
            r=local_rewards,
            value_estimates=v_estimates,
            gamma=self.optimizer.reward_signals[name].gamma,
            lambd=self.hyperparameters.lambd,
            value_next=value_next[name],
        )

        local_advantage = np.array(lambd_returns) - np.array(baseline_estimate)

        agent_buffer_trajectory[RewardSignalUtil.returns_key(name)].set(lambd_returns)
        agent_buffer_trajectory[RewardSignalUtil.advantage_key(name)].set(
            local_advantage
        )
        tmp_advantages.append(local_advantage)

    # Get global advantages
    global_advantages = list(
        np.mean(np.array(tmp_advantages, dtype=np.float32), axis=0)
    )
    agent_buffer_trajectory[BufferKey.ADVANTAGES].set(global_advantages)

    # Append to update buffer
    agent_buffer_trajectory.resequence_and_append(
        self.update_buffer, training_length=self.policy.sequence_length
    )

    # If this was a terminal trajectory, append stats and reset reward collection
    if trajectory.done_reached:
        self._update_end_episode_stats(agent_id, self.optimizer)
        # Remove dead agents from group reward recording
        if not trajectory.all_group_dones_reached:
            self.collected_group_rewards.pop(agent_id)

    # If the whole team is done, average the remaining group rewards.
    if trajectory.all_group_dones_reached and trajectory.done_reached:
        self.stats_reporter.add_stat(
            "Environment/Group Cumulative Reward",
            self.collected_group_rewards.get(agent_id, 0),
            aggregation=StatsAggregationMethod.HISTOGRAM,
        )
        self.collected_group_rewards.pop(agent_id)
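# Illustration only: the POCA trainer above uses lambda_return() to build TD(lambda)
# return targets and then subtracts the counterfactual baseline estimate to form
# advantages. Below is a minimal NumPy sketch of the backward lambda-return
# recurrence, assuming flat float32 arrays of equal length; it is not the project's
# implementation:
# G_t = r_t + gamma * ((1 - lambd) * V(s_{t+1}) + lambd * G_{t+1}),
# bootstrapped from value_next at the end of the trajectory segment.
import numpy as np


def lambda_return_sketch(
    rewards: np.ndarray,
    value_estimates: np.ndarray,
    gamma: float,
    lambd: float,
    value_next: float = 0.0,
) -> np.ndarray:
    returns = np.zeros_like(rewards, dtype=np.float32)
    # The final step bootstraps directly from the value of the next state.
    returns[-1] = rewards[-1] + gamma * value_next
    for t in reversed(range(len(rewards) - 1)):
        returns[t] = (
            gamma * lambd * returns[t + 1]
            + rewards[t]
            + (1 - lambd) * gamma * value_estimates[t + 1]
        )
    return returns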
def _process_trajectory(self, trajectory: Trajectory) -> None:
    """
    Takes a trajectory and processes it, putting it into the update buffer.
    Processing involves calculating value and advantage targets for model updating step.
    :param trajectory: The Trajectory tuple containing the steps to be processed.
    """
    super()._process_trajectory(trajectory)
    agent_id = trajectory.agent_id  # All the agents should have the same ID

    agent_buffer_trajectory = trajectory.to_agentbuffer()
    # Check if we used group rewards, warn if so.
    self._warn_if_group_reward(agent_buffer_trajectory)

    # Update the normalization
    if self.is_training:
        self.policy.update_normalization(agent_buffer_trajectory)

    # Get all value estimates
    (
        value_estimates,
        value_next,
        value_memories,
    ) = self.optimizer.get_trajectory_value_estimates(
        agent_buffer_trajectory,
        trajectory.next_obs,
        trajectory.done_reached and not trajectory.interrupted,
    )
    if value_memories is not None:
        agent_buffer_trajectory[BufferKey.CRITIC_MEMORY].set(value_memories)

    for name, v in value_estimates.items():
        agent_buffer_trajectory[RewardSignalUtil.value_estimates_key(name)].extend(v)
        self._stats_reporter.add_stat(
            f"Policy/{self.optimizer.reward_signals[name].name.capitalize()} Value Estimate",
            np.mean(v),
        )

    # Evaluate all reward functions
    self.collected_rewards["environment"][agent_id] += np.sum(
        agent_buffer_trajectory[BufferKey.ENVIRONMENT_REWARDS]
    )
    for name, reward_signal in self.optimizer.reward_signals.items():
        evaluate_result = (
            reward_signal.evaluate(agent_buffer_trajectory) * reward_signal.strength
        )
        agent_buffer_trajectory[RewardSignalUtil.rewards_key(name)].extend(
            evaluate_result
        )
        # Report the reward signals
        self.collected_rewards[name][agent_id] += np.sum(evaluate_result)

    # Compute GAE and returns
    tmp_advantages = []
    tmp_returns = []
    for name in self.optimizer.reward_signals:
        bootstrap_value = value_next[name]

        local_rewards = agent_buffer_trajectory[
            RewardSignalUtil.rewards_key(name)
        ].get_batch()
        local_value_estimates = agent_buffer_trajectory[
            RewardSignalUtil.value_estimates_key(name)
        ].get_batch()

        local_advantage = get_gae(
            rewards=local_rewards,
            value_estimates=local_value_estimates,
            value_next=bootstrap_value,
            gamma=self.optimizer.reward_signals[name].gamma,
            lambd=self.hyperparameters.lambd,
        )
        local_return = local_advantage + local_value_estimates
        # This is later used as a target for the different value estimates
        agent_buffer_trajectory[RewardSignalUtil.returns_key(name)].set(local_return)
        agent_buffer_trajectory[RewardSignalUtil.advantage_key(name)].set(
            local_advantage
        )
        tmp_advantages.append(local_advantage)
        tmp_returns.append(local_return)

    # Get global advantages
    global_advantages = list(
        np.mean(np.array(tmp_advantages, dtype=np.float32), axis=0)
    )
    global_returns = list(np.mean(np.array(tmp_returns, dtype=np.float32), axis=0))
    agent_buffer_trajectory[BufferKey.ADVANTAGES].set(global_advantages)
    agent_buffer_trajectory[BufferKey.DISCOUNTED_RETURNS].set(global_returns)

    self._append_to_update_buffer(agent_buffer_trajectory)

    # If this was a terminal trajectory, append stats and reset reward collection
    if trajectory.done_reached:
        self._update_end_episode_stats(agent_id, self.optimizer)
def _process_step(
    self, step: Union[TerminalStep, DecisionStep], global_id: str, index: int
) -> None:
    terminated = isinstance(step, TerminalStep)
    stored_decision_step, idx = self.last_step_result.get(global_id, (None, None))
    stored_take_action_outputs = self.last_take_action_outputs.get(global_id, None)
    if not terminated:
        # Index is needed to grab from last_take_action_outputs
        self.last_step_result[global_id] = (step, index)

    # This state is the consequence of a past action
    if stored_decision_step is not None and stored_take_action_outputs is not None:
        obs = stored_decision_step.obs
        if self.policy.use_recurrent:
            memory = self.policy.retrieve_memories([global_id])[0, :]
        else:
            memory = None
        done = terminated  # Since this is an ongoing step
        interrupted = step.interrupted if terminated else False
        # Add the outputs of the last eval
        action = stored_take_action_outputs["action"][idx]
        if self.policy.use_continuous_act:
            action_pre = stored_take_action_outputs["pre_action"][idx]
        else:
            action_pre = None
        action_probs = stored_take_action_outputs["log_probs"][idx]
        action_mask = stored_decision_step.action_mask
        prev_action = self.policy.retrieve_previous_action([global_id])[0, :]
        experience = AgentExperience(
            obs=obs,
            reward=step.reward,
            done=done,
            action=action,
            action_probs=action_probs,
            action_pre=action_pre,
            action_mask=action_mask,
            prev_action=prev_action,
            interrupted=interrupted,
            memory=memory,
        )
        # Add the value outputs if needed
        self.experience_buffers[global_id].append(experience)
        self.episode_rewards[global_id] += step.reward
        if not terminated:
            self.episode_steps[global_id] += 1

        # Add a trajectory segment to the buffer if terminal or the length has reached the time horizon
        if (
            len(self.experience_buffers[global_id]) >= self.max_trajectory_length
            or terminated
        ):
            # Make next AgentExperience
            next_obs = step.obs
            trajectory = Trajectory(
                steps=self.experience_buffers[global_id],
                agent_id=global_id,
                next_obs=next_obs,
                behavior_id=self.behavior_id,
            )
            for traj_queue in self.trajectory_queues:
                traj_queue.put(trajectory)
            self.experience_buffers[global_id] = []
        if terminated:
            # Record episode length.
            self.stats_reporter.add_stat(
                "Environment/Episode Length", self.episode_steps.get(global_id, 0)
            )
            self._clean_agent_data(global_id)
def add_experiences(
    self,
    batched_step_result: BatchedStepResult,
    worker_id: int,
    previous_action: ActionInfo,
) -> None:
    """
    Adds experiences to each agent's experience history.
    :param batched_step_result: current BatchedStepResult.
    :param previous_action: The outputs of the Policy's get_action method.
    """
    take_action_outputs = previous_action.outputs
    if take_action_outputs:
        for _entropy in take_action_outputs["entropy"]:
            self.stats_reporter.add_stat("Policy/Entropy", _entropy)

    # Make unique agent_ids that are global across workers
    action_global_agent_ids = [
        get_global_agent_id(worker_id, ag_id) for ag_id in previous_action.agent_ids
    ]
    for global_id in action_global_agent_ids:
        if global_id in self.last_step_result:  # Don't store if agent just reset
            self.last_take_action_outputs[global_id] = take_action_outputs

    for _id in batched_step_result.agent_id:  # Assume agent_id is 1-D
        local_id = int(_id)  # Needed for mypy to pass since ndarray has no content type
        curr_agent_step = batched_step_result.get_agent_step_result(local_id)
        global_id = get_global_agent_id(worker_id, local_id)
        stored_agent_step, idx = self.last_step_result.get(global_id, (None, None))
        stored_take_action_outputs = self.last_take_action_outputs.get(global_id, None)
        if stored_agent_step is not None and stored_take_action_outputs is not None:
            # We know the step is from the same worker, so use the local agent id.
            obs = stored_agent_step.obs
            if not stored_agent_step.done:
                if self.policy.use_recurrent:
                    memory = self.policy.retrieve_memories([global_id])[0, :]
                else:
                    memory = None

                done = curr_agent_step.done
                max_step = curr_agent_step.max_step

                # Add the outputs of the last eval
                action = stored_take_action_outputs["action"][idx]
                if self.policy.use_continuous_act:
                    action_pre = stored_take_action_outputs["pre_action"][idx]
                else:
                    action_pre = None
                action_probs = stored_take_action_outputs["log_probs"][idx]
                action_mask = stored_agent_step.action_mask
                prev_action = self.policy.retrieve_previous_action([global_id])[0, :]

                experience = AgentExperience(
                    obs=obs,
                    reward=curr_agent_step.reward,
                    done=done,
                    action=action,
                    action_probs=action_probs,
                    action_pre=action_pre,
                    action_mask=action_mask,
                    prev_action=prev_action,
                    max_step=max_step,
                    memory=memory,
                )
                # Add the value outputs if needed
                self.experience_buffers[global_id].append(experience)
                self.episode_rewards[global_id] += curr_agent_step.reward
            if (
                curr_agent_step.done
                or (
                    len(self.experience_buffers[global_id])
                    >= self.max_trajectory_length
                )
            ) and len(self.experience_buffers[global_id]) > 0:
                # Make next AgentExperience
                next_obs = curr_agent_step.obs
                trajectory = Trajectory(
                    steps=self.experience_buffers[global_id],
                    agent_id=global_id,
                    next_obs=next_obs,
                    behavior_id=self.behavior_id,
                )
                for traj_queue in self.trajectory_queues:
                    traj_queue.put(trajectory)
                self.experience_buffers[global_id] = []
                if curr_agent_step.done:
                    # Record episode length for agents which have had at least
                    # 1 step. Done after reset ignored.
                    self.stats_reporter.add_stat(
                        "Environment/Episode Length",
                        self.episode_steps.get(global_id, 0),
                    )
            elif not curr_agent_step.done:
                self.episode_steps[global_id] += 1

        # Index is needed to grab from last_take_action_outputs
        self.last_step_result[global_id] = (
            curr_agent_step,
            batched_step_result.agent_id_to_index[_id],
        )
        # Delete all done agents, regardless of if they had a 0-length episode.
        if curr_agent_step.done:
            self._clean_agent_data(global_id)

    for _gid in action_global_agent_ids:
        # If the ID doesn't have a last step result, the agent just reset,
        # don't store the action.
        if _gid in self.last_step_result:
            if "action" in take_action_outputs:
                self.policy.save_previous_action(
                    [_gid], take_action_outputs["action"]
                )