def step(self) -> None:
    assert self.action is not None
    if self.discrete:
        act = self.action[0][0]
        delta = 1 if act else -1
    else:
        delta = self.action[0][0]
    delta = clamp(delta, -STEP_SIZE, STEP_SIZE)
    self.position += delta
    self.position = clamp(self.position, -1, 1)
    self.step_count += 1
    # Reaching either end of the track terminates the episode. The terminal
    # reward is positive only when the final position matches the goal's sign.
    done = self.position >= 1.0 or self.position <= -1.0
    if done:
        reward = SUCCESS_REWARD * self.position * self.goal
    else:
        reward = -TIME_PENALTY

    m_vector_obs = [np.ones((1, OBS_SIZE), dtype=np.float32) * self.goal]
    m_reward = np.array([reward], dtype=np.float32)
    m_done = np.array([done], dtype=np.bool)
    m_agent_id = np.array([0], dtype=np.int32)

    if done:
        self._reset_agent()

    self.step_result = BatchedStepResult(
        m_vector_obs, m_reward, m_done, m_done, m_agent_id, None
    )
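# The `clamp` helper called by `step` is defined elsewhere in the source; a
# minimal sketch consistent with how it is used above (an assumption, not
# necessarily the toolkit's exact definition) would be:

def clamp(x: float, lo: float, hi: float) -> float:
    # Restrict x to the closed interval [lo, hi].
    return max(lo, min(x, hi))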
def reset(self) -> None:  # type: ignore
    self._reset_agent()

    m_vector_obs = [np.ones((1, OBS_SIZE), dtype=np.float32) * self.goal]
    m_reward = np.array([0], dtype=np.float32)
    m_done = np.array([False], dtype=np.bool)
    m_agent_id = np.array([0], dtype=np.int32)

    self.step_result = BatchedStepResult(
        m_vector_obs, m_reward, m_done, m_done, m_agent_id, None
    )
def batched_step_result_from_proto(
    agent_info_list: Collection[
        AgentInfoProto
    ],  # pylint: disable=unsubscriptable-object
    group_spec: AgentGroupSpec,
) -> BatchedStepResult:
    obs_list: List[np.ndarray] = []
    for obs_index, obs_shape in enumerate(group_spec.observation_shapes):
        is_visual = len(obs_shape) == 3
        if is_visual:
            obs_shape = cast(Tuple[int, int, int], obs_shape)
            obs_list += [
                _process_visual_observation(obs_index, obs_shape, agent_info_list)
            ]
        else:
            obs_list += [
                _process_vector_observation(obs_index, obs_shape, agent_info_list)
            ]
    rewards = np.array(
        [agent_info.reward for agent_info in agent_info_list], dtype=np.float32
    )

    d = np.dot(rewards, rewards)
    has_nan = np.isnan(d)
    has_inf = not np.isfinite(d)
    # If we have any NaNs or infs, use np.nan_to_num to replace them with finite values
    if has_nan or has_inf:
        rewards = np.nan_to_num(rewards)
    if has_nan:
        logger.warning("An agent had a NaN reward in the environment")

    done = np.array([agent_info.done for agent_info in agent_info_list], dtype=np.bool)
    max_step = np.array(
        [agent_info.max_step_reached for agent_info in agent_info_list], dtype=np.bool
    )
    agent_id = np.array(
        [agent_info.id for agent_info in agent_info_list], dtype=np.int32
    )
    action_mask = None
    if group_spec.is_action_discrete():
        if any(agent_info.action_mask is not None for agent_info in agent_info_list):
            n_agents = len(agent_info_list)
            a_size = np.sum(group_spec.discrete_action_branches)
            # mask_matrix holds True for actions that are still available; the
            # returned action_mask inverts this, so True marks a masked action.
            mask_matrix = np.ones((n_agents, a_size), dtype=np.bool)
            for agent_index, agent_info in enumerate(agent_info_list):
                if agent_info.action_mask is not None:
                    if len(agent_info.action_mask) == a_size:
                        mask_matrix[agent_index, :] = [
                            False if agent_info.action_mask[k] else True
                            for k in range(a_size)
                        ]
            action_mask = (1 - mask_matrix).astype(np.bool)
            indices = _generate_split_indices(group_spec.discrete_action_branches)
            action_mask = np.split(action_mask, indices, axis=1)
    return BatchedStepResult(obs_list, rewards, done, max_step, agent_id, action_mask)
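# `_generate_split_indices` is referenced above but not shown in this section.
# Given how its result is passed to np.split, it must return the cumulative
# offsets between discrete action branches. A sketch consistent with that usage
# (an assumption, not necessarily the toolkit's exact code):

def _generate_split_indices(dims: Tuple[int, ...]) -> Tuple[int, ...]:
    # For branches (3, 2, 4) the split points are (3, 5), so np.split yields
    # three sub-arrays of widths 3, 2, and 4 along the action axis.
    if len(dims) <= 1:
        return ()
    result = (dims[0],)
    for i in range(len(dims) - 2):
        result += (result[i] + dims[i + 1],)
    return result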
def create_mock_vector_step_result(num_agents=1, number_visual_observations=0):
    """
    Creates a mock BatchedStepResult with vector observations. Imitates constant
    vector observations, rewards, dones, and agents.

    :param num_agents: Number of "agents" to imitate in the BatchedStepResult values.
    :param number_visual_observations: Number of visual observations to add.
    """
    obs = [np.array([num_agents * [1, 2, 3]])]
    if number_visual_observations:
        obs += [np.zeros(shape=(num_agents, 8, 8, 3), dtype=np.float32)]
    rewards = np.array(num_agents * [1.0])
    done = np.array(num_agents * [False])
    agents = np.array(range(0, num_agents))
    return BatchedStepResult(obs, rewards, done, done, agents, None)
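# A quick, illustrative sanity check of the mock (hypothetical test; the
# `n_agents()` accessor is taken from its use in `step_result_to_brain_info`
# below, the rest follows from the code above):

def test_create_mock_vector_step_result():
    result = create_mock_vector_step_result(num_agents=2, number_visual_observations=1)
    assert result.n_agents() == 2
    assert len(result.obs) == 2  # one vector observation array plus one visual
    assert result.obs[1].shape == (2, 8, 8, 3)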
def _update_state(self, output: UnityRLOutputProto) -> None:
    """
    Collects experience information from all external brains in the environment
    at the current step.
    """
    for brain_name in self._env_specs.keys():
        if brain_name in output.agentInfos:
            agent_info_list = output.agentInfos[brain_name].value
            self._env_state[brain_name] = batched_step_result_from_proto(
                agent_info_list, self._env_specs[brain_name]
            )
        else:
            self._env_state[brain_name] = BatchedStepResult.empty(
                self._env_specs[brain_name]
            )
    self._parse_side_channel_message(self.side_channels, output.side_channel)
def step_result_to_brain_info(
    step_result: BatchedStepResult,
    group_spec: AgentGroupSpec,
    agent_id_prefix: int = None,
) -> BrainInfo:
    n_agents = step_result.n_agents()
    vis_obs_indices = []
    vec_obs_indices = []
    for index, observation in enumerate(step_result.obs):
        if len(observation.shape) == 2:
            vec_obs_indices.append(index)
        elif len(observation.shape) == 4:
            vis_obs_indices.append(index)
        else:
            raise UnityEnvironmentException(
                "Invalid input received from the environment, the observation should "
                "either be a vector of float or a PNG image"
            )
    if len(vec_obs_indices) == 0:
        vec_obs = np.zeros((n_agents, 0), dtype=np.float32)
    else:
        vec_obs = np.concatenate([step_result.obs[i] for i in vec_obs_indices], axis=1)
    vis_obs = [step_result.obs[i] for i in vis_obs_indices]
    mask = np.ones((n_agents, np.sum(group_spec.action_size)), dtype=np.float32)
    if group_spec.is_action_discrete():
        mask = np.ones(
            (n_agents, np.sum(group_spec.discrete_action_branches)), dtype=np.float32
        )
        if step_result.action_mask is not None:
            # BatchedStepResult masks mark forbidden actions; BrainInfo expects
            # 1 for available actions, so invert the concatenated mask.
            mask = 1 - np.concatenate(step_result.action_mask, axis=1)
    if agent_id_prefix is None:
        agent_ids = [str(ag_id) for ag_id in list(step_result.agent_id)]
    else:
        agent_ids = [f"${agent_id_prefix}-{ag_id}" for ag_id in step_result.agent_id]
    return BrainInfo(
        vis_obs,
        vec_obs,
        list(step_result.reward),
        agent_ids,
        list(step_result.done),
        list(step_result.max_step),
        mask,
    )
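# End-to-end sketch tying the pieces together, using the mock from above in
# place of a real environment result. The AgentGroupSpec construction is an
# assumption about its fields (observation_shapes, action_type, action_shape),
# as is the ActionType enum; adjust to the actual definitions in the toolkit.

def _example_conversion() -> BrainInfo:
    spec = AgentGroupSpec(
        observation_shapes=[(3,)], action_type=ActionType.CONTINUOUS, action_shape=2
    )
    step_result = create_mock_vector_step_result(num_agents=1)
    # Given the f-string above, the single agent id comes out as "$7-0".
    return step_result_to_brain_info(step_result, spec, agent_id_prefix=7)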