def _try_parse(self, line: str) -> Optional[SampleBatchType]:
    line = line.strip()
    if not line:
        return None
    try:
        batch = self._from_json(line)
    except Exception:
        logger.exception("Ignoring corrupt json record in {}: {}".format(
            self.cur_file, line))
        return None

    # Clip actions (from any values into env's bounds), if necessary.
    cfg = self.ioctx.config
    if cfg.get("clip_actions") and self.ioctx.worker is not None:
        if isinstance(batch, SampleBatch):
            batch[SampleBatch.ACTIONS] = clip_action(
                batch[SampleBatch.ACTIONS],
                self.default_policy.action_space_struct)
        else:
            for pid, b in batch.policy_batches.items():
                b[SampleBatch.ACTIONS] = clip_action(
                    b[SampleBatch.ACTIONS],
                    self.ioctx.worker.policy_map[pid].action_space_struct)

    # Re-normalize actions (from env's bounds to zero-centered), if
    # necessary.
    if cfg.get("actions_in_input_normalized") is False and \
            self.ioctx.worker is not None:
        # If we have a complex action space and actions were flattened
        # and we have to normalize -> Error.
        error_msg = \
            "Normalization of offline actions that are flattened is not " \
            "supported! Make sure that you record actions into offline " \
            "file with the `_disable_action_flattening=True` flag OR " \
            "as already normalized (between -1.0 and 1.0) values. " \
            "Also, when reading already normalized action values from " \
            "offline files, make sure to set " \
            "`actions_in_input_normalized=True` so that RLlib will not " \
            "perform normalization on top."

        if isinstance(batch, SampleBatch):
            pol = self.default_policy
            if isinstance(pol.action_space_struct, (tuple, dict)) and \
                    not pol.config.get("_disable_action_flattening"):
                raise ValueError(error_msg)
            batch[SampleBatch.ACTIONS] = normalize_action(
                batch[SampleBatch.ACTIONS], pol.action_space_struct)
        else:
            for pid, b in batch.policy_batches.items():
                pol = self.policy_map[pid]
                if isinstance(pol.action_space_struct, (tuple, dict)) and \
                        not pol.config.get("_disable_action_flattening"):
                    raise ValueError(error_msg)
                b[SampleBatch.ACTIONS] = normalize_action(
                    b[SampleBatch.ACTIONS], pol.action_space_struct)

    return batch
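
# Usage sketch (hypothetical, self-contained): what `clip_action` and
# `normalize_action` roughly do for a simple Box-like space with bounds
# [low, high]. The zero-centered 2x scaling is an assumption based on the
# usual squash/unsquash convention; RLlib's real helpers also handle
# arbitrarily nested (tuple/dict) action spaces.
import numpy as np

low, high = np.array([0.0, -5.0]), np.array([10.0, 5.0])

def clip_action_sketch(a):
    # Clip any raw action values into the env's bounds.
    return np.clip(a, low, high)

def normalize_action_sketch(a):
    # Map from env bounds [low, high] to zero-centered [-1, 1].
    return (np.asarray(a) - low) / (high - low) * 2.0 - 1.0

print(clip_action_sketch(np.array([12.0, 0.0])))       # -> [10.  0.]
print(normalize_action_sketch(np.array([7.5, 0.0])))   # -> [0.5  0. ]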
def __call__(self, ac_data: ActionConnectorDataType) -> ActionConnectorDataType:
    assert isinstance(
        ac_data.output, tuple
    ), "Action connector requires PolicyOutputType data."
    actions, states, fetches = ac_data.output
    return ActionConnectorDataType(
        ac_data.env_id,
        ac_data.agent_id,
        (clip_action(actions, self._action_space_struct), states, fetches),
    )
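
# Self-contained sketch of the connector's data flow. `ACData` is a
# stand-in namedtuple for RLlib's ActionConnectorDataType (hypothetical
# here); only the (actions, states, fetches) tuple unpacking and the
# clipping step mirror the connector above.
from collections import namedtuple
import numpy as np

ACData = namedtuple("ACData", ["env_id", "agent_id", "output"])

def clip_connector(ac_data, low=-1.0, high=1.0):
    # Unpack the PolicyOutputType triple, clip only the actions, and
    # re-wrap the result without touching states or fetches.
    actions, states, fetches = ac_data.output
    clipped = np.clip(actions, low, high)
    return ACData(ac_data.env_id, ac_data.agent_id, (clipped, states, fetches))

out = clip_connector(ACData(0, "agent0", (np.array([1.7, -0.3]), [], {})))
print(out.output[0])  # -> [ 1.  -0.3]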
def _try_parse(self, line: str) -> Optional[SampleBatchType]:
    line = line.strip()
    if not line:
        return None
    try:
        batch = _from_json(line)
    except Exception:
        logger.exception("Ignoring corrupt json record in {}: {}".format(
            self.cur_file, line))
        return None

    # Clip actions (from any values into env's bounds), if necessary.
    cfg = self.ioctx.config
    if cfg.get("clip_actions"):
        if isinstance(batch, SampleBatch):
            batch[SampleBatch.ACTIONS] = clip_action(
                batch[SampleBatch.ACTIONS],
                self.ioctx.worker.policy_map[
                    "default_policy"].action_space_struct)
        else:
            for pid, b in batch.policy_batches.items():
                b[SampleBatch.ACTIONS] = clip_action(
                    b[SampleBatch.ACTIONS],
                    self.ioctx.worker.policy_map[pid].action_space_struct)

    # Re-normalize actions (from env's bounds to 0.0 centered), if
    # necessary.
    if "actions_in_input_normalized" in cfg and \
            cfg["actions_in_input_normalized"] is False:
        if isinstance(batch, SampleBatch):
            batch[SampleBatch.ACTIONS] = normalize_action(
                batch[SampleBatch.ACTIONS],
                self.ioctx.worker.policy_map[
                    "default_policy"].action_space_struct)
        else:
            for pid, b in batch.policy_batches.items():
                b[SampleBatch.ACTIONS] = normalize_action(
                    b[SampleBatch.ACTIONS],
                    self.ioctx.worker.policy_map[pid].action_space_struct)

    return batch
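
# Sketch of the corrupt-record handling pattern above: parse each JSON line,
# skip (and log) anything that doesn't decode, and keep going. `json.loads`
# stands in for the reader's `_from_json`, which additionally rebuilds
# SampleBatch objects from the decoded dict.
import json
import logging

logger = logging.getLogger(__name__)

def try_parse(line):
    line = line.strip()
    if not line:
        return None
    try:
        return json.loads(line)
    except Exception:
        # Log the full traceback, but never let one bad record
        # abort the whole read.
        logger.exception("Ignoring corrupt json record: %s", line)
        return None

print(try_parse('{"actions": [0.2, 0.4]}'))  # -> {'actions': [0.2, 0.4]}
print(try_parse("not json"))                 # -> None (after logging)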
def _process_policy_eval_results(
    *,
    to_eval: Dict[PolicyID, List[PolicyEvalData]],
    eval_results: Dict[PolicyID, Tuple[TensorStructType, StateBatch, dict]],
    active_episodes: Dict[str, MultiAgentEpisode],
    active_envs: Set[int],
    off_policy_actions: MultiEnvDict,
    policies: Dict[PolicyID, Policy],
    normalize_actions: bool,
    clip_actions: bool,
) -> Dict[EnvID, Dict[AgentID, EnvActionType]]:
    """Process the output of policy neural network evaluation.

    Records policy evaluation results into the given episode objects and
    returns replies to send back to agents in the env.

    Args:
        to_eval (Dict[PolicyID, List[PolicyEvalData]]): Mapping of policy
            IDs to lists of PolicyEvalData objects.
        eval_results (Dict[PolicyID, Tuple[TensorStructType, StateBatch,
            dict]]): Mapping of policy IDs to the (actions, rnn-out states,
            extra-action-fetches dict) tuples returned by evaluation.
        active_episodes (Dict[str, MultiAgentEpisode]): Mapping from
            episode ID to currently ongoing MultiAgentEpisode object.
        active_envs (Set[int]): Set of non-terminated env ids.
        off_policy_actions (dict): Doubly keyed dict of env-ids -> agent
            ids -> off-policy-action, returned by a `BaseEnv.poll()` call.
        policies (Dict[PolicyID, Policy]): Mapping from policy ID to
            Policy.
        normalize_actions (bool): Whether to unsquash actions (which the
            policy produced in normalized, zero-centered form) back into
            the action space's bounds.
        clip_actions (bool): Whether to clip actions to the action space's
            bounds.

    Returns:
        actions_to_send: Nested dict of env id -> agent id -> actions to
            be sent to Env (np.ndarrays).
    """
    actions_to_send: Dict[EnvID, Dict[AgentID, EnvActionType]] = \
        defaultdict(dict)

    # types: int
    for env_id in active_envs:
        actions_to_send[env_id] = {}  # at minimum send empty dict

    # types: PolicyID, List[PolicyEvalData]
    for policy_id, eval_data in to_eval.items():
        actions: TensorStructType = eval_results[policy_id][0]
        actions = convert_to_numpy(actions)

        rnn_out_cols: StateBatch = eval_results[policy_id][1]
        pi_info_cols: dict = eval_results[policy_id][2]

        # In case actions is a list (representing the 0th dim of a batch of
        # primitive actions), try converting it first.
        if isinstance(actions, list):
            actions = np.array(actions)

        # Store RNN state ins/outs and extra-action fetches to episode.
        for f_i, column in enumerate(rnn_out_cols):
            pi_info_cols["state_out_{}".format(f_i)] = column

        policy: Policy = _get_or_raise(policies, policy_id)

        # Split action-component batches into single action rows.
        actions: List[EnvActionType] = unbatch(actions)

        # types: int, EnvActionType
        for i, action in enumerate(actions):
            # Unsquash, if necessary (translate the policy's normalized
            # action back into the env's action space bounds).
            if normalize_actions:
                action_to_send = unsquash_action(action,
                                                 policy.action_space_struct)
            # Clip, if necessary.
            elif clip_actions:
                action_to_send = clip_action(action,
                                             policy.action_space_struct)
            else:
                action_to_send = action

            env_id: int = eval_data[i].env_id
            agent_id: AgentID = eval_data[i].agent_id
            episode: MultiAgentEpisode = active_episodes[env_id]
            episode._set_rnn_state(agent_id, [c[i] for c in rnn_out_cols])
            episode._set_last_pi_info(
                agent_id, {k: v[i] for k, v in pi_info_cols.items()})
            if env_id in off_policy_actions and \
                    agent_id in off_policy_actions[env_id]:
                episode._set_last_action(
                    agent_id, off_policy_actions[env_id][agent_id])
            else:
                episode._set_last_action(agent_id, action)

            assert agent_id not in actions_to_send[env_id]
            actions_to_send[env_id][agent_id] = action_to_send

    return actions_to_send
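
# Sketch of what `unbatch` does in the function above: split a (possibly
# nested) batch of action components into one action per row, so each row
# can be sent to its agent. RLlib's real `unbatch` handles arbitrarily
# nested tuple/dict spaces; this sketch covers only the flat dict and
# plain-array cases.
import numpy as np

def unbatch_sketch(batched):
    if isinstance(batched, dict):
        # Dict of column arrays -> list of per-row dicts.
        keys = list(batched)
        n = len(batched[keys[0]])
        return [{k: batched[k][i] for k in keys} for i in range(n)]
    # Plain array: rows along dim 0.
    return list(batched)

batch = {"steer": np.array([0.1, -0.2]), "throttle": np.array([0.9, 0.3])}
print(unbatch_sketch(batch))
# -> [{'steer': 0.1, 'throttle': 0.9}, {'steer': -0.2, 'throttle': 0.3}]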
def compute_single_action(
        self,
        obs: TensorType,
        state: Optional[List[TensorType]] = None,
        prev_action: Optional[TensorType] = None,
        prev_reward: Optional[TensorType] = None,
        info: dict = None,
        episode: Optional["MultiAgentEpisode"] = None,
        clip_actions: bool = None,
        explore: Optional[bool] = None,
        timestep: Optional[int] = None,
        unsquash_actions: bool = None,
        **kwargs) -> \
        Tuple[TensorType, List[TensorType], Dict[str, TensorType]]:
    """Unbatched version of compute_actions.

    Args:
        obs (TensorType): Single observation.
        state (Optional[List[TensorType]]): List of RNN state inputs, if
            any.
        prev_action (Optional[TensorType]): Previous action value, if any.
        prev_reward (Optional[TensorType]): Previous reward, if any.
        info (dict): Info object, if any.
        episode (Optional[MultiAgentEpisode]): This provides access to all
            of the internal episode state, which may be useful for
            model-based or multi-agent algorithms.
        unsquash_actions (bool): Should actions be unsquashed according to
            the Policy's action space?
        clip_actions (bool): Should actions be clipped according to the
            Policy's action space?
        explore (Optional[bool]): Whether to pick an exploitation or
            exploration action
            (default: None -> use self.config["explore"]).
        timestep (Optional[int]): The current (sampling) time step.

    Keyword Args:
        kwargs: Forward compatibility.

    Returns:
        Tuple:
            - actions (TensorType): Single action.
            - state_outs (List[TensorType]): List of RNN state outputs,
              if any.
            - info (dict): Dictionary of extra features, if any.
    """
    # If policy works in normalized space, we should unsquash the action.
    # Use value of config.normalize_actions, if None.
    unsquash_actions = \
        unsquash_actions if unsquash_actions is not None \
        else self.config["normalize_actions"]
    clip_actions = clip_actions if clip_actions is not None else \
        self.config["clip_actions"]

    prev_action_batch = None
    prev_reward_batch = None
    info_batch = None
    episodes = None
    state_batch = None
    if prev_action is not None:
        prev_action_batch = [prev_action]
    if prev_reward is not None:
        prev_reward_batch = [prev_reward]
    if info is not None:
        info_batch = [info]
    if episode is not None:
        episodes = [episode]
    if state is not None:
        state_batch = [
            s.unsqueeze(0) if torch and isinstance(s, torch.Tensor) else
            np.expand_dims(s, 0) for s in state
        ]

    out = self.compute_actions(
        [obs],
        state_batch,
        prev_action_batch=prev_action_batch,
        prev_reward_batch=prev_reward_batch,
        info_batch=info_batch,
        episodes=episodes,
        explore=explore,
        timestep=timestep)

    # Some policies don't return a tuple, but always just a single action.
    # E.g. ES and ARS.
    if not isinstance(out, tuple):
        single_action = out
        state_out = []
        info = {}
    # Normal case: Policy should return (action, state, info) tuple.
    else:
        batched_action, state_out, info = out
        single_action = unbatch(batched_action)
        assert len(single_action) == 1
        single_action = single_action[0]

    # If we work in normalized action space (normalize_actions=True),
    # we re-translate here into the env's action space.
    if unsquash_actions:
        single_action = unsquash_action(single_action,
                                        self.action_space_struct)
    # Clip, according to env's action space.
    elif clip_actions:
        single_action = clip_action(single_action,
                                    self.action_space_struct)

    # Return action, internal state(s), infos.
    return single_action, [s[0] for s in state_out], \
        {k: v[0] for k, v in info.items()}
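
# Sketch of the unsquash step at the end of compute_single_action: map a
# zero-centered, normalized action back into the env's Box bounds. The
# formula is the inverse of the normalize sketch earlier and is an
# assumption about the usual convention; RLlib's real `unsquash_action`
# also handles nested tuple/dict spaces.
import numpy as np

low, high = np.array([0.0]), np.array([10.0])

def unsquash_action_sketch(a):
    # [-1, 1] -> [low, high]
    return low + (np.asarray(a) + 1.0) * 0.5 * (high - low)

print(unsquash_action_sketch([0.5]))   # -> [7.5]
print(unsquash_action_sketch([-1.0]))  # -> [0.]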