Example #1
    def step(self) -> None:
        assert self.action is not None

        if self.discrete:
            # Discrete branch: action 1 steps toward +1, action 0 toward -1.
            act = self.action[0][0]
            delta = 1 if act else -1
        else:
            # Continuous branch: the raw action value is the step itself.
            delta = self.action[0][0]
        delta = clamp(delta, -STEP_SIZE, STEP_SIZE)
        self.position += delta
        self.position = clamp(self.position, -1, 1)
        self.step_count += 1
        done = self.position >= 1.0 or self.position <= -1.0
        if done:
            reward = SUCCESS_REWARD * self.position * self.goal
        else:
            reward = -TIME_PENALTY

        m_vector_obs = [np.ones((1, OBS_SIZE), dtype=np.float32) * self.goal]
        m_reward = np.array([reward], dtype=np.float32)
        m_done = np.array([done], dtype=bool)
        m_agent_id = np.array([0], dtype=np.int32)

        if done:
            self._reset_agent()

        self.step_result = BatchedStepResult(m_vector_obs, m_reward, m_done,
                                             m_done, m_agent_id, None)
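
The snippet relies on a clamp helper and several module-level constants that are not shown. A minimal sketch of what they might look like, assuming conventional values (only the names come from the snippet; the numbers here are illustrative):

import numpy as np

OBS_SIZE = 1          # width of the vector observation
STEP_SIZE = 0.1       # maximum position change per step
TIME_PENALTY = 0.01   # small cost charged on every non-terminal step
SUCCESS_REWARD = 1.0  # scaled by position * goal when an edge is reached


def clamp(x, lo, hi):
    """Restrict x to the closed interval [lo, hi]."""
    return max(lo, min(x, hi))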
Example #2
    def reset(self) -> None:  # type: ignore
        self._reset_agent()

        m_vector_obs = [np.ones((1, OBS_SIZE), dtype=np.float32) * self.goal]
        m_reward = np.array([0], dtype=np.float32)
        m_done = np.array([False], dtype=bool)
        m_agent_id = np.array([0], dtype=np.int32)

        self.step_result = BatchedStepResult(m_vector_obs, m_reward, m_done,
                                             m_done, m_agent_id, None)
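
Taken together with Example #1, a driver loop could look like the sketch below. SimpleEnvironment is a hypothetical name for the class these two methods belong to; the snippets do not show the actual class:

env = SimpleEnvironment()         # hypothetical owner of step()/reset()
env.reset()                       # populates env.step_result
for _ in range(100):
    env.action = np.array([[1]])  # always push toward +1
    env.step()
    if env.step_result.done[0]:   # episode ended; step() already reset the agent
        break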
Example #3
def batched_step_result_from_proto(
    agent_info_list: Collection[
        AgentInfoProto
    ],  # pylint: disable=unsubscriptable-object
    group_spec: AgentGroupSpec,
) -> BatchedStepResult:
    obs_list: List[np.ndarray] = []
    for obs_index, obs_shape in enumerate(group_spec.observation_shapes):
        is_visual = len(obs_shape) == 3
        if is_visual:
            obs_shape = cast(Tuple[int, int, int], obs_shape)
            obs_list += [
                _process_visual_observation(obs_index, obs_shape, agent_info_list)
            ]
        else:
            obs_list += [
                _process_vector_observation(obs_index, obs_shape, agent_info_list)
            ]
    rewards = np.array(
        [agent_info.reward for agent_info in agent_info_list], dtype=np.float32
    )

    # A single dot product is a cheap way to detect a NaN or Inf anywhere in the array.
    d = np.dot(rewards, rewards)
    has_nan = np.isnan(d)
    has_inf = not np.isfinite(d)
    # If we have any NaNs or Infs, use np.nan_to_num to replace them with finite values
    if has_nan or has_inf:
        rewards = np.nan_to_num(rewards)
    if has_nan:
        logger.warning("An agent had a NaN reward in the environment")

    done = np.array([agent_info.done for agent_info in agent_info_list], dtype=bool)
    max_step = np.array(
        [agent_info.max_step_reached for agent_info in agent_info_list], dtype=bool
    )
    agent_id = np.array(
        [agent_info.id for agent_info in agent_info_list], dtype=np.int32
    )
    action_mask = None
    if group_spec.is_action_discrete():
        if any(agent_info.action_mask is not None for agent_info in agent_info_list):
            n_agents = len(agent_info_list)
            a_size = np.sum(group_spec.discrete_action_branches)
            mask_matrix = np.ones((n_agents, a_size), dtype=bool)
            for agent_index, agent_info in enumerate(agent_info_list):
                if agent_info.action_mask is not None:
                    if len(agent_info.action_mask) == a_size:
                        mask_matrix[agent_index, :] = [
                            not agent_info.action_mask[k] for k in range(a_size)
                        ]
            action_mask = (1 - mask_matrix).astype(bool)
            indices = _generate_split_indices(group_spec.discrete_action_branches)
            action_mask = np.split(action_mask, indices, axis=1)
    return BatchedStepResult(obs_list, rewards, done, max_step, agent_id, action_mask)
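
The helper _generate_split_indices is not shown. Given that its result feeds np.split on the concatenated mask, it presumably returns the cumulative branch offsets, excluding the final total. A plausible sketch under that assumption:

import numpy as np
from typing import Tuple


def _generate_split_indices(dims: Tuple[int, ...]) -> Tuple[int, ...]:
    """Offsets at which to split a concatenated mask into one array per
    discrete branch, e.g. branch sizes (2, 3, 2) -> offsets (2, 5)."""
    if len(dims) <= 1:
        return ()
    return tuple(int(i) for i in np.cumsum(dims)[:-1])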
Example #4
def create_mock_vector_step_result(num_agents=1, number_visual_observations=0):
    """
    Creates a mock BatchedStepResult with vector observations. Imitates constant
    vector observations, rewards, dones, and agents.

    :param int num_agents: Number of "agents" to imitate in the BatchedStepResult values.
    :param int number_visual_observations: Number of visual observations to include.
    """
    obs = [np.array(num_agents * [[1, 2, 3]])]  # one (num_agents, 3) vector observation
    if number_visual_observations:
        obs += [np.zeros(shape=(num_agents, 8, 8, 3), dtype=np.float32)]
    rewards = np.array(num_agents * [1.0])
    done = np.array(num_agents * [False])
    agents = np.arange(num_agents)
    return BatchedStepResult(obs, rewards, done, done, agents, None)
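
A quick usage sketch for a test; the shape assertions follow from the constants the mock hard-codes, and n_agents() is the accessor used in Example #6:

result = create_mock_vector_step_result(num_agents=2, number_visual_observations=1)
assert result.n_agents() == 2
assert result.obs[0].shape == (2, 3)         # vector observations
assert result.obs[1].shape == (2, 8, 8, 3)   # visual observations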
Example #5
    def _update_state(self, output: UnityRLOutputProto) -> None:
        """
        Collects experience information from all external brains in the environment at the current step.
        """
        for brain_name in self._env_specs.keys():
            if brain_name in output.agentInfos:
                agent_info_list = output.agentInfos[brain_name].value
                self._env_state[brain_name] = batched_step_result_from_proto(
                    agent_info_list, self._env_specs[brain_name]
                )
            else:
                self._env_state[brain_name] = BatchedStepResult.empty(
                    self._env_specs[brain_name]
                )
        self._parse_side_channel_message(self.side_channels, output.side_channel)
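
BatchedStepResult.empty is not shown either; presumably it builds a zero-agent result whose observation arrays still match the group spec. A sketch of that idea, assuming the observation_shapes field used in Example #3 and the positional constructor used throughout these snippets:

@staticmethod
def empty(spec: AgentGroupSpec) -> "BatchedStepResult":
    """A BatchedStepResult with zero agents but correctly shaped arrays."""
    obs = [np.zeros((0,) + shape, dtype=np.float32) for shape in spec.observation_shapes]
    return BatchedStepResult(
        obs,
        np.zeros(0, dtype=np.float32),  # reward
        np.zeros(0, dtype=bool),        # done
        np.zeros(0, dtype=bool),        # max_step
        np.zeros(0, dtype=np.int32),    # agent_id
        None,                           # action_mask
    )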
Example #6
def step_result_to_brain_info(
    step_result: BatchedStepResult,
    group_spec: AgentGroupSpec,
    agent_id_prefix: Optional[int] = None,
) -> BrainInfo:
    n_agents = step_result.n_agents()
    vis_obs_indices = []
    vec_obs_indices = []
    for index, observation in enumerate(step_result.obs):
        if len(observation.shape) == 2:
            vec_obs_indices.append(index)
        elif len(observation.shape) == 4:
            vis_obs_indices.append(index)
        else:
            raise UnityEnvironmentException(
                "Invalid input received from the environment: the observation should "
                "either be a vector of floats or a PNG image")
    if len(vec_obs_indices) == 0:
        vec_obs = np.zeros((n_agents, 0), dtype=np.float32)
    else:
        vec_obs = np.concatenate([step_result.obs[i] for i in vec_obs_indices],
                                 axis=1)
    vis_obs = [step_result.obs[i] for i in vis_obs_indices]
    mask = np.ones((n_agents, np.sum(group_spec.action_size)),
                   dtype=np.float32)
    if group_spec.is_action_discrete():
        mask = np.ones((n_agents, np.sum(group_spec.discrete_action_branches)),
                       dtype=np.float32)
        if step_result.action_mask is not None:
            mask = 1 - np.concatenate(step_result.action_mask, axis=1)
    if agent_id_prefix is None:
        agent_ids = [str(ag_id) for ag_id in list(step_result.agent_id)]
    else:
        agent_ids = [
            f"${agent_id_prefix}-{ag_id}" for ag_id in step_result.agent_id
        ]
    return BrainInfo(
        vis_obs,
        vec_obs,
        list(step_result.reward),
        agent_ids,
        list(step_result.done),
        list(step_result.max_step),
        mask,
    )
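
A hedged usage sketch combining this converter with the mock from Example #4. Here group_spec is assumed to be an AgentGroupSpec matching the mock's single 3-float vector observation, and the BrainInfo attribute name below is an assumption, since the BrainInfo class itself is not shown:

step_result = create_mock_vector_step_result(num_agents=2)
brain_info = step_result_to_brain_info(step_result, group_spec, agent_id_prefix=7)
# Vector observations are concatenated per agent; agent ids get the string prefix.
assert brain_info.vector_observations.shape == (2, 3)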