Code example #1
    def _make_batched_step(
            self, name: str, done: bool,
            reward: float) -> Tuple[DecisionSteps, TerminalSteps]:
        recurrent_obs_val = (self.goal[name]
                             if self.step_count[name] <= self.num_show_steps
                             else 0)
        m_vector_obs = self._make_obs(recurrent_obs_val)
        m_reward = np.array([reward], dtype=np.float32)
        m_agent_id = np.array([self.agent_id[name]], dtype=np.int32)
        action_mask = self._generate_mask()
        decision_step = DecisionSteps(m_vector_obs, m_reward, m_agent_id,
                                      action_mask)
        terminal_step = TerminalSteps.empty(self.behavior_spec)
        if done:
            self.final_rewards[name].append(self.rewards[name])
            self._reset_agent(name)
            recurrent_obs_val = (self.goal[name] if
                                 self.step_count[name] <= self.num_show_steps
                                 else 0)
            new_vector_obs = self._make_obs(recurrent_obs_val)
            (
                new_reward,
                new_done,
                new_agent_id,
                new_action_mask,
            ) = self._construct_reset_step(name)
            decision_step = DecisionSteps(new_vector_obs, new_reward,
                                          new_agent_id, new_action_mask)
            terminal_step = TerminalSteps(m_vector_obs, m_reward,
                                          np.array([False], dtype=np.bool),
                                          m_agent_id)
        return (decision_step, terminal_step)
Code example #2
    def _make_batched_step(
            self, name: str, done: bool,
            reward: float) -> Tuple[DecisionSteps, TerminalSteps]:
        m_vector_obs = self._make_obs(self.goal[name])
        m_reward = np.array([reward], dtype=np.float32)
        m_agent_id = np.array([self.agent_id[name]], dtype=np.int32)
        action_mask = self._generate_mask()
        decision_step = DecisionSteps(m_vector_obs, m_reward, m_agent_id,
                                      action_mask)
        terminal_step = TerminalSteps.empty(self.behavior_spec)
        if done:
            self._reset_agent(name)
            new_vector_obs = self._make_obs(self.goal[name])
            (
                new_reward,
                new_done,
                new_agent_id,
                new_action_mask,
            ) = self._construct_reset_step(name)

            decision_step = DecisionSteps(new_vector_obs, new_reward,
                                          new_agent_id, new_action_mask)
            terminal_step = TerminalSteps(m_vector_obs, m_reward,
                                          np.array([False], dtype=np.bool),
                                          m_agent_id)
        return (decision_step, terminal_step)
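
The pair returned above follows the usual ML-Agents contract: the TerminalSteps batch is empty unless done is True, in which case it carries the final observation and reward while the DecisionSteps batch already describes the freshly reset agent. A minimal caller sketch, where env and the behavior name "Agent" are assumptions for illustration:

# Hypothetical driver code, not part of the class above.
decision_step, terminal_step = env._make_batched_step("Agent", done=True, reward=1.0)
assert len(terminal_step) == 1          # the finished episode shows up exactly once
assert terminal_step.reward[0] == 1.0   # it carries the final reward passed in
# decision_step already describes the reset agent awaiting its next action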
Code example #3
    def add_experiences(
        self,
        decision_steps: DecisionSteps,
        terminal_steps: TerminalSteps,
        worker_id: int,
        previous_action: ActionInfo,
    ) -> None:
        """
        Adds experiences to each agent's experience history.
        :param decision_steps: current DecisionSteps.
        :param terminal_steps: current TerminalSteps.
        :param worker_id: the id of the environment worker that produced these steps.
        :param previous_action: The outputs of the Policy's get_action method.
        """
        take_action_outputs = previous_action.outputs
        if take_action_outputs:
            for _entropy in take_action_outputs["entropy"]:
                self._stats_reporter.add_stat("Policy/Entropy", _entropy)

        # Make unique agent_ids that are global across workers
        action_global_agent_ids = [
            get_global_agent_id(worker_id, ag_id)
            for ag_id in previous_action.agent_ids
        ]
        for global_id in action_global_agent_ids:
            if global_id in self._last_step_result:  # Don't store if agent just reset
                self._last_take_action_outputs[global_id] = take_action_outputs

        # Iterate over all the terminal steps, first gather all the group obs
        # and then create the AgentExperiences/Trajectories. _add_to_group_status
        # stores Group statuses in a common data structure self.group_status
        for terminal_step in terminal_steps.values():
            self._add_group_status_and_obs(terminal_step, worker_id)
        for terminal_step in terminal_steps.values():
            local_id = terminal_step.agent_id
            global_id = get_global_agent_id(worker_id, local_id)
            self._process_step(terminal_step, worker_id,
                               terminal_steps.agent_id_to_index[local_id])
            # Clear the last seen group obs when agents die.
            self._clear_group_status_and_obs(global_id)

        # Iterate over all the decision steps, first gather all the group obs
        # and then create the trajectories. _add_to_group_status
        # stores Group statuses in a common data structure self.group_status
        for ongoing_step in decision_steps.values():
            self._add_group_status_and_obs(ongoing_step, worker_id)
        for ongoing_step in decision_steps.values():
            local_id = ongoing_step.agent_id
            self._process_step(ongoing_step, worker_id,
                               decision_steps.agent_id_to_index[local_id])

        for _gid in action_global_agent_ids:
            # If the ID doesn't have a last step result, the agent just reset,
            # don't store the action.
            if _gid in self._last_step_result:
                if "action" in take_action_outputs:
                    self.policy.save_previous_action(
                        [_gid], take_action_outputs["action"])
Code example #4
def create_mock_steps(
    num_agents: int = 1,
    num_vector_observations: int = 0,
    num_vis_observations: int = 0,
    action_shape: List[int] = None,
    discrete: bool = False,
    done: bool = False,
) -> Tuple[DecisionSteps, TerminalSteps]:
    """
    Creates a mock Tuple[DecisionSteps, TerminalSteps] with observations.
    Imitates constant vector/visual observations, rewards, dones, and agents.

    :int num_agents: Number of "agents" to imitate.
    :int num_vector_observations: Size of the vector observation for each agent.
    :int num_vis_observations: Number of visual observations for each agent.
    :List action_shape: Shape of the action space (branch sizes if discrete).
    :bool discrete: Whether or not action space is discrete
    :bool done: Whether all the agents in the batch are done
    """
    if action_shape is None:
        action_shape = [2]

    obs_list = []
    for _ in range(num_vis_observations):
        obs_list.append(np.ones((num_agents, 84, 84, 3), dtype=np.float32))
    if num_vector_observations > 1:
        obs_list.append(
            np.array(num_agents * [num_vector_observations * [1]],
                     dtype=np.float32))
    action_mask = None
    if discrete:
        action_mask = [
            np.array(num_agents * [action_size * [False]])
            for action_size in action_shape
        ]

    reward = np.array(num_agents * [1.0], dtype=np.float32)
    interrupted = np.array(num_agents * [False], dtype=np.bool)
    agent_id = np.arange(num_agents, dtype=np.int32)
    behavior_spec = BehaviorSpec(
        [(84, 84, 3)] * num_vis_observations +
        [(num_vector_observations, 0, 0)],
        ActionType.DISCRETE if discrete else ActionType.CONTINUOUS,
        action_shape if discrete else action_shape[0],
    )
    if done:
        return (
            DecisionSteps.empty(behavior_spec),
            TerminalSteps(obs_list, reward, interrupted, agent_id),
        )
    else:
        return (
            DecisionSteps(obs_list, reward, agent_id, action_mask),
            TerminalSteps.empty(behavior_spec),
        )
Code example #5
def create_mock_steps(
    num_agents: int,
    observation_shapes: List[Tuple],
    action_shape: Union[int, Tuple[int]] = None,
    discrete: bool = False,
    done: bool = False,
) -> Tuple[DecisionSteps, TerminalSteps]:
    """
    Creates a mock Tuple[DecisionSteps, TerminalSteps] with observations.
    Imitates constant vector/visual observations, rewards, dones, and agents.

    :int num_agents: Number of "agents" to imitate.
    :List observation_shapes: A List of the observation spaces in your steps
    :int action_shape: Size of the action space (branch sizes, if discrete).
    :bool discrete: Whether or not action space is discrete
    :bool done: Whether all the agents in the batch are done
    """
    if action_shape is None:
        action_shape = 2

    obs_list = []
    for _shape in observation_shapes:
        obs_list.append(np.ones((num_agents, ) + _shape, dtype=np.float32))
    action_mask = None
    if discrete and isinstance(action_shape, Iterable):
        action_mask = [
            np.array(num_agents * [action_size * [False]])
            for action_size in action_shape  # type: ignore
        ]  # type: ignore

    reward = np.array(num_agents * [1.0], dtype=np.float32)
    interrupted = np.array(num_agents * [False], dtype=np.bool)
    agent_id = np.arange(num_agents, dtype=np.int32)
    behavior_spec = BehaviorSpec(
        observation_shapes,
        ActionType.DISCRETE if discrete else ActionType.CONTINUOUS,
        action_shape,
    )
    if done:
        return (
            DecisionSteps.empty(behavior_spec),
            TerminalSteps(obs_list, reward, interrupted, agent_id),
        )
    else:
        return (
            DecisionSteps(obs_list, reward, agent_id, action_mask),
            TerminalSteps.empty(behavior_spec),
        )
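
A brief usage sketch of the helper above (shapes and counts are arbitrary illustrations): every observation array in the returned batch gains a leading num_agents dimension, and the terminal batch stays empty unless done=True.

decision_steps, terminal_steps = create_mock_steps(
    num_agents=2, observation_shapes=[(8,), (84, 84, 3)], action_shape=2
)
assert decision_steps.obs[0].shape == (2, 8)
assert decision_steps.obs[1].shape == (2, 84, 84, 3)
assert len(terminal_steps) == 0   # no agents are done in this batch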
Code example #6
def create_mock_vector_steps(specs,
                             num_agents=1,
                             number_visual_observations=0):
    """
    Creates a mock (DecisionSteps, TerminalSteps) pair with vector observations.
    Imitates constant vector observations, rewards, dones, and agents.

    :BehaviorSpec specs: The BehaviorSpec for this mock
    :int num_agents: Number of "agents" to imitate in the step values.
    :int number_visual_observations: Number of visual observations to include.
    """
    obs = [
        np.array([num_agents * [1, 2, 3]],
                 dtype=np.float32).reshape(num_agents, 3)
    ]
    if number_visual_observations:
        obs += [np.zeros(shape=(num_agents, 8, 8, 3), dtype=np.float32)
                ] * number_visual_observations
    rewards = np.array(num_agents * [1.0])
    agents = np.array(range(0, num_agents))
    group_id = np.array(num_agents * [0])
    group_rewards = np.array(num_agents * [0.0])
    return (
        DecisionSteps(obs, rewards, agents, None, group_id, group_rewards),
        TerminalSteps.empty(specs),
    )
Code example #7
def test_empty_terminal_steps():
    specs = BehaviorSpec(observation_shapes=[(3, 2), (5, )],
                         action_spec=ActionSpec.create_continuous(3))
    ts = TerminalSteps.empty(specs)
    assert len(ts.obs) == 2
    assert ts.obs[0].shape == (0, 3, 2)
    assert ts.obs[1].shape == (0, 5)
Code example #8
File: mock_brain.py  Project: joomon/ml-agents
def create_mock_steps(
    num_agents: int,
    observation_specs: List[ObservationSpec],
    action_spec: ActionSpec,
    done: bool = False,
    grouped: bool = False,
) -> Tuple[DecisionSteps, TerminalSteps]:
    """
    Creates a mock Tuple[DecisionSteps, TerminalSteps] with observations.
    Imitates constant vector/visual observations, rewards, dones, and agents.

    :int num_agents: Number of "agents" to imitate.
    :List observation_specs: A List of the observation specs in your steps
    :ActionSpec action_spec: The ActionSpec for the agent
    :bool done: Whether all the agents in the batch are done
    :bool grouped: Whether the agents belong to a multi-agent group (nonzero group_id)
    """
    obs_list = []
    for obs_spec in observation_specs:
        obs_list.append(np.ones((num_agents,) + obs_spec.shape, dtype=np.float32))
    action_mask = None
    if action_spec.is_discrete():
        action_mask = [
            np.array(num_agents * [action_size * [False]])
            for action_size in action_spec.discrete_branches  # type: ignore
        ]  # type: ignore

    reward = np.array(num_agents * [1.0], dtype=np.float32)
    interrupted = np.array(num_agents * [False], dtype=np.bool)
    agent_id = np.arange(num_agents, dtype=np.int32)
    _gid = 1 if grouped else 0
    group_id = np.array(num_agents * [_gid], dtype=np.int32)
    group_reward = np.array(num_agents * [0.0], dtype=np.float32)
    behavior_spec = BehaviorSpec(observation_specs, action_spec)
    if done:
        return (
            DecisionSteps.empty(behavior_spec),
            TerminalSteps(
                obs_list, reward, interrupted, agent_id, group_id, group_reward
            ),
        )
    else:
        return (
            DecisionSteps(
                obs_list, reward, agent_id, action_mask, group_id, group_reward
            ),
            TerminalSteps.empty(behavior_spec),
        )
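
A brief usage sketch for this newer signature. It assumes a spec factory like the create_sensor_specs_with_shapes helper seen in example #9 (or an equivalent ObservationSpec factory) together with ActionSpec.create_discrete; the shape and branch sizes are arbitrary:

# Hypothetical test usage; the spec helper and sizes are assumptions for illustration.
decision_steps, terminal_steps = create_mock_steps(
    num_agents=2,
    observation_specs=create_sensor_specs_with_shapes([(8,)]),
    action_spec=ActionSpec.create_discrete((3, 2)),
)
assert decision_steps.obs[0].shape == (2, 8)
assert len(decision_steps.action_mask) == 2           # one mask per discrete branch
assert decision_steps.action_mask[0].shape == (2, 3)
assert len(terminal_steps) == 0                       # no agents are done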
Code example #9
File: test_steps.py  Project: zereyak13/ml-agents
def test_empty_terminal_steps():
    specs = BehaviorSpec(
        sensor_specs=create_sensor_specs_with_shapes([(3, 2), (5, )]),
        action_spec=ActionSpec.create_continuous(3),
    )
    ts = TerminalSteps.empty(specs)
    assert len(ts.obs) == 2
    assert ts.obs[0].shape == (0, 3, 2)
    assert ts.obs[1].shape == (0, 5)
Code example #10
File: test_steps.py  Project: alclimb/ml-ex01
def test_empty_terminal_steps():
    specs = BehaviorSpec(
        observation_shapes=[(3, 2), (5, )],
        action_type=ActionType.CONTINUOUS,
        action_shape=3,
    )
    ts = TerminalSteps.empty(specs)
    assert len(ts.obs) == 2
    assert ts.obs[0].shape == (0, 3, 2)
    assert ts.obs[1].shape == (0, 5)
Code example #11
    def add_experiences(
        self,
        decision_steps: DecisionSteps,
        terminal_steps: TerminalSteps,
        worker_id: int,
        previous_action: ActionInfo,
    ) -> None:
        """
        Adds experiences to each agent's experience history.
        :param decision_steps: current DecisionSteps.
        :param terminal_steps: current TerminalSteps.
        :param worker_id: the id of the environment worker that produced these steps.
        :param previous_action: The outputs of the Policy's get_action method.
        """
        take_action_outputs = previous_action.outputs
        if take_action_outputs:
            for _entropy in take_action_outputs["entropy"]:
                self.stats_reporter.add_stat("Policy/Entropy", _entropy)

        # Make unique agent_ids that are global across workers
        action_global_agent_ids = [
            get_global_agent_id(worker_id, ag_id) for ag_id in previous_action.agent_ids
        ]
        for global_id in action_global_agent_ids:
            if global_id in self.last_step_result:  # Don't store if agent just reset
                self.last_take_action_outputs[global_id] = take_action_outputs

        # Iterate over all the terminal steps
        for terminal_step in terminal_steps.values():
            local_id = terminal_step.agent_id
            global_id = get_global_agent_id(worker_id, local_id)
            self._process_step(
                terminal_step, global_id, terminal_steps.agent_id_to_index[local_id]
            )
        # Iterate over all the decision steps
        for ongoing_step in decision_steps.values():
            local_id = ongoing_step.agent_id
            global_id = get_global_agent_id(worker_id, local_id)
            self._process_step(
                ongoing_step, global_id, decision_steps.agent_id_to_index[local_id]
            )

        for _gid in action_global_agent_ids:
            # If the ID doesn't have a last step result, the agent just reset,
            # don't store the action.
            if _gid in self.last_step_result:
                if "action" in take_action_outputs:
                    self.policy.save_previous_action(
                        [_gid], take_action_outputs["action"]
                    )
Code example #12
    def _update_state(self, output: UnityRLOutputProto) -> None:
        """
        Collects experience information from all external brains in the environment at the current step.
        """
        for brain_name in self._env_specs.keys():
            if brain_name in output.agentInfos:
                agent_info_list = output.agentInfos[brain_name].value
                self._env_state[brain_name] = steps_from_proto(
                    agent_info_list, self._env_specs[brain_name]
                )
            else:
                self._env_state[brain_name] = (
                    DecisionSteps.empty(self._env_specs[brain_name]),
                    TerminalSteps.empty(self._env_specs[brain_name]),
                )
        self._side_channel_manager.process_side_channel_message(output.side_channel)
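
The (DecisionSteps, TerminalSteps) pair cached above is what user code later reads back through the public API. A minimal consumer sketch, assuming a running UnityEnvironment instance named env and a known behavior_name:

env.step()                                            # advance the simulation one step
decision_steps, terminal_steps = env.get_steps(behavior_name)
for agent_id in terminal_steps:                       # agents whose episode just ended
    print(agent_id, terminal_steps[agent_id].reward)  # final reward for each of them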
Code example #13
File: test_steps.py  Project: zereyak13/ml-agents
def test_terminal_steps():
    ts = TerminalSteps(
        obs=[np.array(range(12), dtype=np.float32).reshape(3, 4)],
        reward=np.array(range(3), dtype=np.float32),
        agent_id=np.array(range(10, 13), dtype=np.int32),
        interrupted=np.array([1, 0, 1], dtype=np.bool),
    )

    assert ts.agent_id_to_index[10] == 0
    assert ts.agent_id_to_index[11] == 1
    assert ts.agent_id_to_index[12] == 2

    assert ts[10].interrupted
    assert not ts[11].interrupted
    assert ts[12].interrupted

    with pytest.raises(KeyError):
        assert ts.agent_id_to_index[-1] == -1

    for agent_id in ts:
        assert ts.agent_id_to_index[agent_id] in range(3)
Code example #14
    def get_steps(self, behavior_name):
        # This gets the individual DecisionSteps and TerminalSteps
        # from the envs and merges them into a batch to be sent
        # to the AgentProcessor.
        dec_vec_obs = []
        dec_reward = []
        dec_group_reward = []
        dec_agent_id = []
        dec_group_id = []
        ter_vec_obs = []
        ter_reward = []
        ter_group_reward = []
        ter_agent_id = []
        ter_group_id = []
        interrupted = []

        action_mask = None
        terminal_step = TerminalSteps.empty(self.behavior_spec)
        decision_step = None
        for i in range(self.num_agents):
            name_and_num = behavior_name + str(i)
            env = self.envs[name_and_num]
            _dec, _term = env.step_result[behavior_name]
            if not self.dones[name_and_num]:
                dec_agent_id.append(i)
                dec_group_id.append(1)
                if len(dec_vec_obs) > 0:
                    for j, obs in enumerate(_dec.obs):
                        dec_vec_obs[j] = np.concatenate((dec_vec_obs[j], obs),
                                                        axis=0)
                else:
                    for obs in _dec.obs:
                        dec_vec_obs.append(obs)
                dec_reward.append(_dec.reward[0])
                dec_group_reward.append(_dec.group_reward[0])
                if _dec.action_mask is not None:
                    if action_mask is None:
                        action_mask = []
                    if len(action_mask) > 0:
                        action_mask[0] = np.concatenate(
                            (action_mask[0], _dec.action_mask[0]), axis=0)
                    else:
                        action_mask.append(_dec.action_mask[0])
            if len(_term.reward) > 0 and name_and_num in self.just_died:
                ter_agent_id.append(i)
                ter_group_id.append(1)
                if len(ter_vec_obs) > 0:
                    for j, obs in enumerate(_term.obs):
                        ter_vec_obs[j] = np.concatenate((ter_vec_obs[j], obs),
                                                        axis=0)
                else:
                    for obs in _term.obs:
                        ter_vec_obs.append(obs)
                ter_reward.append(_term.reward[0])
                ter_group_reward.append(_term.group_reward[0])
                interrupted.append(False)
                self.just_died.remove(name_and_num)
        decision_step = DecisionSteps(
            dec_vec_obs,
            dec_reward,
            dec_agent_id,
            action_mask,
            dec_group_id,
            dec_group_reward,
        )
        terminal_step = TerminalSteps(
            ter_vec_obs,
            ter_reward,
            interrupted,
            ter_agent_id,
            ter_group_id,
            ter_group_reward,
        )
        return (decision_step, terminal_step)
Code example #15
File: rpc_utils.py  Project: zereyak13/ml-agents
def steps_from_proto(
    agent_info_list: Collection[AgentInfoProto],  # pylint: disable=unsubscriptable-object
    behavior_spec: BehaviorSpec,
) -> Tuple[DecisionSteps, TerminalSteps]:
    decision_agent_info_list = [
        agent_info for agent_info in agent_info_list if not agent_info.done
    ]
    terminal_agent_info_list = [
        agent_info for agent_info in agent_info_list if agent_info.done
    ]
    decision_obs_list: List[np.ndarray] = []
    terminal_obs_list: List[np.ndarray] = []
    for obs_index, sensor_specs in enumerate(behavior_spec.sensor_specs):
        is_visual = len(sensor_specs.shape) == 3
        if is_visual:
            obs_shape = cast(Tuple[int, int, int], sensor_specs.shape)
            decision_obs_list.append(
                _process_visual_observation(obs_index, obs_shape,
                                            decision_agent_info_list))
            terminal_obs_list.append(
                _process_visual_observation(obs_index, obs_shape,
                                            terminal_agent_info_list))
        else:
            decision_obs_list.append(
                _process_vector_observation(obs_index, sensor_specs.shape,
                                            decision_agent_info_list))
            terminal_obs_list.append(
                _process_vector_observation(obs_index, sensor_specs.shape,
                                            terminal_agent_info_list))
    decision_rewards = np.array(
        [agent_info.reward for agent_info in decision_agent_info_list],
        dtype=np.float32)
    terminal_rewards = np.array(
        [agent_info.reward for agent_info in terminal_agent_info_list],
        dtype=np.float32)

    _raise_on_nan_and_inf(decision_rewards, "rewards")
    _raise_on_nan_and_inf(terminal_rewards, "rewards")

    max_step = np.array(
        [
            agent_info.max_step_reached
            for agent_info in terminal_agent_info_list
        ],
        dtype=np.bool,
    )
    decision_agent_id = np.array(
        [agent_info.id for agent_info in decision_agent_info_list],
        dtype=np.int32)
    terminal_agent_id = np.array(
        [agent_info.id for agent_info in terminal_agent_info_list],
        dtype=np.int32)
    action_mask = None
    if behavior_spec.action_spec.discrete_size > 0:
        if any(agent_info.action_mask is not None
               for agent_info in decision_agent_info_list):
            n_agents = len(decision_agent_info_list)
            a_size = np.sum(behavior_spec.action_spec.discrete_branches)
            mask_matrix = np.ones((n_agents, a_size), dtype=np.bool)
            for agent_index, agent_info in enumerate(decision_agent_info_list):
                if agent_info.action_mask is not None:
                    if len(agent_info.action_mask) == a_size:
                        mask_matrix[agent_index, :] = [
                            False if agent_info.action_mask[k] else True
                            for k in range(a_size)
                        ]
            action_mask = (1 - mask_matrix).astype(np.bool)
            indices = _generate_split_indices(
                behavior_spec.action_spec.discrete_branches)
            action_mask = np.split(action_mask, indices, axis=1)
    return (
        DecisionSteps(decision_obs_list, decision_rewards, decision_agent_id,
                      action_mask),
        TerminalSteps(terminal_obs_list, terminal_rewards, max_step,
                      terminal_agent_id),
    )
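
The last block of steps_from_proto flattens all discrete branches into one boolean matrix and then splits it back into one mask per branch. A minimal sketch of that splitting step in isolation, with toy sizes and the split indices computed inline (np.cumsum stands in for _generate_split_indices, whose exact return value is an assumption here):

import numpy as np

discrete_branches = (3, 2)                           # two branches: 3 and 2 actions
n_agents = 2
flat_mask = np.zeros((n_agents, sum(discrete_branches)), dtype=bool)
split_indices = np.cumsum(discrete_branches)[:-1]    # -> array([3])
per_branch = np.split(flat_mask, split_indices, axis=1)
assert per_branch[0].shape == (2, 3)                 # mask for the first branch
assert per_branch[1].shape == (2, 2)                 # mask for the second branch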