Example #1
    def _compute_actions(policy,
                         obs_batch,
                         add_noise=False,
                         update=True,
                         **kwargs):
        # Batch is given as a list of one -> unwrap the single observation.
        if isinstance(obs_batch, list) and len(obs_batch) == 1:
            obs_batch = obs_batch[0]
        observation = policy.preprocessor.transform(obs_batch)
        observation = policy.observation_filter(observation[None],
                                                update=update)

        observation = convert_to_torch_tensor(observation, policy.device)
        dist_inputs, _ = policy.model({SampleBatch.CUR_OBS: observation}, [],
                                      None)
        dist = policy.dist_class(dist_inputs, policy.model)
        action = dist.sample()

        def _add_noise(single_action, single_action_space):
            single_action = single_action.detach().cpu().numpy()
            if add_noise and isinstance(single_action_space, gym.spaces.Box):
                single_action += np.random.randn(*single_action.shape) * \
                                 policy.action_noise_std
            return single_action

        action = tree.map_structure(_add_noise, action,
                                    policy.action_space_struct)
        action = unbatch(action)
        return action, [], {}
Example #2
    def compute_actions(self,
                        observation,
                        add_noise=False,
                        update=True,
                        **kwargs):
        # Squeeze batch dimension (we always calculate actions for only a
        # single obs).
        observation = observation[0]
        observation = self.preprocessor.transform(observation)
        observation = self.observation_filter(observation[None], update=update)
        # `actions` is a list of (component) batches.
        # Eager mode.
        if not self.sess:
            dist_inputs, _ = self.model({SampleBatch.CUR_OBS: observation})
            dist = self.dist_class(dist_inputs, self.model)
            actions = dist.sample()
            actions = tree.map_structure(lambda a: a.numpy(), actions)
        # Graph mode.
        else:
            actions = self.sess.run(self.sampler,
                                    feed_dict={self.inputs: observation})

        if add_noise:
            actions = tree.map_structure(self._add_noise, actions,
                                         self.action_space_struct)
        # Convert `flat_actions` to a list of lists of action components
        # (list of single actions).
        actions = unbatch(actions)
        return actions, [], {}
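
The comments in the snippets above describe `unbatch` as splitting a structure of batched action components into a list of single actions. The following is only a rough sketch of that behavior using NumPy and dm-tree, with a hypothetical `unbatch_sketch` helper; it is not RLlib's implementation.

import numpy as np
import tree  # dm-tree

def unbatch_sketch(batched_struct):
    # Split a struct of batched action components into a list of per-row structs.
    flat = tree.flatten(batched_struct)  # one batched array per action component
    batch_size = len(flat[0])
    return [
        tree.unflatten_as(batched_struct, [component[i] for component in flat])
        for i in range(batch_size)
    ]

# A batch of two actions for a Tuple action space with two components.
batched = (np.array([[0.1, 0.2], [0.3, 0.4]]), np.array([0, 1]))
for single_action in unbatch_sketch(batched):
    print(single_action)  # (row of component 0, row of component 1)
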
Example #3
def _process_policy_eval_results(to_eval, eval_results, active_episodes,
                                 active_envs, off_policy_actions, policies,
                                 clip_actions):
    """Process the output of policy neural network evaluation.

    Records policy evaluation results into the given episode objects and
    returns replies to send back to agents in the env.

    Returns:
        actions_to_send: nested dict of env id -> agent id -> agent replies.
    """

    actions_to_send = defaultdict(dict)
    for env_id in active_envs:
        actions_to_send[env_id] = {}  # at minimum send empty dict

    for policy_id, eval_data in to_eval.items():
        rnn_in_cols = _to_column_format([t.rnn_state for t in eval_data])

        actions = eval_results[policy_id][0]
        rnn_out_cols = eval_results[policy_id][1]
        pi_info_cols = eval_results[policy_id][2]

        # In case actions is a list (representing the 0th dim of a batch of
        # primitive actions), try to convert it first.
        if isinstance(actions, list):
            actions = np.array(actions)

        if len(rnn_in_cols) != len(rnn_out_cols):
            raise ValueError("Length of RNN in did not match RNN out, got: "
                             "{} vs {}".format(rnn_in_cols, rnn_out_cols))
        # Add RNN state info
        for f_i, column in enumerate(rnn_in_cols):
            pi_info_cols["state_in_{}".format(f_i)] = column
        for f_i, column in enumerate(rnn_out_cols):
            pi_info_cols["state_out_{}".format(f_i)] = column

        policy = _get_or_raise(policies, policy_id)
        # Clip if necessary (while action components are still batched).
        if clip_actions:
            actions = clip_action(actions, policy.action_space_struct)
        # Split action-component batches into single action rows.
        actions = unbatch(actions)
        for i, action in enumerate(actions):
            env_id = eval_data[i].env_id
            agent_id = eval_data[i].agent_id
            actions_to_send[env_id][agent_id] = action
            episode = active_episodes[env_id]
            episode._set_rnn_state(agent_id, [c[i] for c in rnn_out_cols])
            episode._set_last_pi_info(
                agent_id, {k: v[i]
                           for k, v in pi_info_cols.items()})
            if env_id in off_policy_actions and \
                    agent_id in off_policy_actions[env_id]:
                episode._set_last_action(agent_id,
                                         off_policy_actions[env_id][agent_id])
            else:
                episode._set_last_action(agent_id, action)

    return actions_to_send
Example #4
    def compute_actions(self,
                        observation,
                        add_noise=False,
                        update=True,
                        **kwargs):
        # Batch is given as list of one.
        if isinstance(observation, list) and len(observation) == 1:
            observation = observation[0]
        observation = self.preprocessor.transform(observation)
        observation = self.observation_filter(observation[None], update=update)

        # `actions` is a list of (component) batches.
        # Eager mode.
        if not self.sess:
            dist_inputs, _ = self.model({SampleBatch.CUR_OBS: observation})
            dist = self.dist_class(dist_inputs, self.model)
            actions = dist.sample()
            actions = tree.map_structure(lambda a: a.numpy(), actions)
        # Graph mode.
        else:
            actions = self.sess.run(
                self.sampler, feed_dict={self.inputs: observation})

        actions = unbatch(actions)
        if add_noise and isinstance(self.action_space, gym.spaces.Box):
            actions += np.random.randn(*actions.shape) * self.action_noise_std
        return actions, [], {}
Example #5
    def compute_actions(self, observation, add_noise=False, update=True):
        observation = self.preprocessor.transform(observation)
        observation = self.observation_filter(observation[None], update=update)
        action = self.sess.run(self.sampler,
                               feed_dict={self.inputs: observation})
        action = unbatch(action)
        if add_noise and isinstance(self.action_space, gym.spaces.Box):
            action += np.random.randn(*action.shape) * self.action_noise_std
        return action
Example #6
    def compute_actions(self, observation, add_noise=False, update=True):
        observation = self.preprocessor.transform(observation)
        observation = self.observation_filter(observation[None], update=update)
        # `actions` is a list of (component) batches.
        actions = self.sess.run(self.sampler,
                                feed_dict={self.inputs: observation})
        if add_noise:
            actions = tree.map_structure(self._add_noise, actions,
                                         self.action_space_struct)
        # Convert `flat_actions` to a list of lists of action components
        # (list of single actions).
        actions = unbatch(actions)
        return actions
Example #7
    def compute_actions(self,
                        observation,
                        add_noise=False,
                        update=True,
                        **kwargs):
        # Batch is given as list of one.
        if isinstance(observation, list) and len(observation) == 1:
            observation = observation[0]
        observation = self.preprocessor.transform(observation)
        observation = self.observation_filter(observation[None], update=update)
        action = self.sess.run(self.sampler,
                               feed_dict={self.inputs: observation})
        action = unbatch(action)
        if add_noise and isinstance(self.action_space, gym.spaces.Box):
            action += np.random.randn(*action.shape) * self.action_noise_std
        return action
Example #8
def _process_policy_eval_results(
    *,
    to_eval: Dict[PolicyID, List[PolicyEvalData]],
    eval_results: Dict[PolicyID, Tuple[TensorStructType, StateBatch, dict]],
    active_episodes: Dict[str, MultiAgentEpisode],
    active_envs: Set[int],
    off_policy_actions: MultiEnvDict,
    policies: Dict[PolicyID, Policy],
    clip_actions: bool,
) -> Dict[EnvID, Dict[AgentID, EnvActionType]]:
    """Process the output of policy neural network evaluation.

    Records policy evaluation results into the given episode objects and
    returns replies to send back to agents in the env.

    Args:
        to_eval (Dict[PolicyID, List[PolicyEvalData]]): Mapping of policy IDs
            to lists of PolicyEvalData objects.
        eval_results (Dict[PolicyID, List]): Mapping of policy IDs to list of
            actions, rnn-out states, extra-action-fetches dicts.
        active_episodes (Dict[str, MultiAgentEpisode]): Mapping from
            episode ID to currently ongoing MultiAgentEpisode object.
        active_envs (Set[int]): Set of non-terminated env ids.
        off_policy_actions (dict): Doubly keyed dict of env-ids -> agent ids ->
            off-policy-action, returned by a `BaseEnv.poll()` call.
        policies (Dict[PolicyID, Policy]): Mapping from policy ID to Policy.
        clip_actions (bool): Whether to clip actions to the action space's
            bounds.

    Returns:
        actions_to_send: Nested dict of env id -> agent id -> actions to be
            sent to Env (np.ndarrays).
    """

    actions_to_send: Dict[EnvID, Dict[AgentID, EnvActionType]] = \
        defaultdict(dict)

    # type: int
    for env_id in active_envs:
        actions_to_send[env_id] = {}  # at minimum send empty dict

    # type: PolicyID, List[PolicyEvalData]
    for policy_id, eval_data in to_eval.items():
        actions: TensorStructType = eval_results[policy_id][0]
        actions = convert_to_numpy(actions)

        rnn_out_cols: StateBatch = eval_results[policy_id][1]
        pi_info_cols: dict = eval_results[policy_id][2]

        # In case actions is a list (representing the 0th dim of a batch of
        # primitive actions), try to convert it first.
        if isinstance(actions, list):
            actions = np.array(actions)

        # Store RNN state ins/outs and extra-action fetches to episode.
        for f_i, column in enumerate(rnn_out_cols):
            pi_info_cols["state_out_{}".format(f_i)] = column

        policy: Policy = _get_or_raise(policies, policy_id)
        # Split action-component batches into single action rows.
        actions: List[EnvActionType] = unbatch(actions)
        # type: int, EnvActionType
        for i, action in enumerate(actions):
            # Clip if necessary.
            if clip_actions:
                clipped_action = clip_action(action,
                                             policy.action_space_struct)
            else:
                clipped_action = action

            env_id: int = eval_data[i].env_id
            agent_id: AgentID = eval_data[i].agent_id
            episode: MultiAgentEpisode = active_episodes[env_id]
            episode._set_rnn_state(agent_id, [c[i] for c in rnn_out_cols])
            episode._set_last_pi_info(
                agent_id, {k: v[i]
                           for k, v in pi_info_cols.items()})
            if env_id in off_policy_actions and \
                    agent_id in off_policy_actions[env_id]:
                episode._set_last_action(agent_id,
                                         off_policy_actions[env_id][agent_id])
            else:
                episode._set_last_action(agent_id, action)

            assert agent_id not in actions_to_send[env_id]
            actions_to_send[env_id][agent_id] = clipped_action

    return actions_to_send
Example #9
File: policy.py Project: alipay/ray
    def compute_single_action(
        self,
        obs: Optional[TensorStructType] = None,
        state: Optional[List[TensorType]] = None,
        *,
        prev_action: Optional[TensorStructType] = None,
        prev_reward: Optional[TensorStructType] = None,
        info: dict = None,
        input_dict: Optional[SampleBatch] = None,
        episode: Optional["Episode"] = None,
        explore: Optional[bool] = None,
        timestep: Optional[int] = None,
        # Kwargs placeholder for future compatibility.
        **kwargs,
    ) -> Tuple[TensorStructType, List[TensorType], Dict[str, TensorType]]:
        """Computes and returns a single (B=1) action value.

        Takes an input dict (usually a SampleBatch) as its main data input.
        This allows for using this method in case a more complex input pattern
        (view requirements) is needed, for example when the Model requires the
        last n observations, the last m actions/rewards, or a combination
        of any of these.
        Alternatively, in case no complex inputs are required, takes a single
        `obs` value (and possibly single state values, prev-action/reward
        values, etc.).

        Args:
            obs: Single observation.
            state: List of RNN state inputs, if any.
            prev_action: Previous action value, if any.
            prev_reward: Previous reward, if any.
            info: Info object, if any.
            input_dict: A SampleBatch or input dict containing the
                single (unbatched) Tensors to compute actions. If given, it'll
                be used instead of `obs`, `state`, `prev_action|reward`, and
                `info`.
            episode: This provides access to all of the internal episode state,
                which may be useful for model-based or multi-agent algorithms.
            explore: Whether to pick an exploitation or
                exploration action
                (default: None -> use self.config["explore"]).
            timestep: The current (sampling) time step.

        Keyword Args:
            kwargs: Forward compatibility placeholder.

        Returns:
            Tuple consisting of the action, the list of RNN state outputs (if
            any), and a dictionary of extra features (if any).
        """
        # Build the input-dict used for the call to
        # `self.compute_actions_from_input_dict()`.
        if input_dict is None:
            input_dict = {SampleBatch.OBS: obs}
            if state is not None:
                for i, s in enumerate(state):
                    input_dict[f"state_in_{i}"] = s
            if prev_action is not None:
                input_dict[SampleBatch.PREV_ACTIONS] = prev_action
            if prev_reward is not None:
                input_dict[SampleBatch.PREV_REWARDS] = prev_reward
            if info is not None:
                input_dict[SampleBatch.INFOS] = info

        # Batch all data in input dict.
        input_dict = tree.map_structure_with_path(
            lambda p, s:
            (s if p == "seq_lens" else s.unsqueeze(0) if torch and isinstance(
                s, torch.Tensor) else np.expand_dims(s, 0)),
            input_dict,
        )

        episodes = None
        if episode is not None:
            episodes = [episode]

        out = self.compute_actions_from_input_dict(
            input_dict=SampleBatch(input_dict),
            episodes=episodes,
            explore=explore,
            timestep=timestep,
        )

        # Some policies don't return a tuple, but always just a single action.
        # E.g. ES and ARS.
        if not isinstance(out, tuple):
            single_action = out
            state_out = []
            info = {}
        # Normal case: Policy should return (action, state, info) tuple.
        else:
            batched_action, state_out, info = out
            single_action = unbatch(batched_action)
        assert len(single_action) == 1
        single_action = single_action[0]

        # Return action, internal state(s), infos.
        return (
            single_action,
            [s[0] for s in state_out],
            {k: v[0]
             for k, v in info.items()},
        )
Example #10
    def compute_single_action(self,
                              obs,
                              state=None,
                              prev_action=None,
                              prev_reward=None,
                              info=None,
                              episode=None,
                              clip_actions=False,
                              explore=None,
                              timestep=None,
                              **kwargs):
        """Unbatched version of compute_actions.

        Arguments:
            obs (obj): Single observation.
            state (list): List of RNN state inputs, if any.
            prev_action (obj): Previous action value, if any.
            prev_reward (float): Previous reward, if any.
            info (dict): Info object, if any.
            episode (MultiAgentEpisode): This provides access to all of the
                internal episode state, which may be useful for model-based or
                multi-agent algorithms.
            clip_actions (bool): Should actions be clipped?
            explore (bool): Whether to pick an exploitation or exploration
                action (default: None -> use self.config["explore"]).
            timestep (int): The current (sampling) time step.
            kwargs: Forward compatibility placeholder.

        Returns:
            actions (obj): Single action.
            state_outs (list): List of RNN state outputs, if any.
            info (dict): Dictionary of extra features, if any.
        """
        prev_action_batch = None
        prev_reward_batch = None
        info_batch = None
        episodes = None
        state_batch = None
        if prev_action is not None:
            prev_action_batch = [prev_action]
        if prev_reward is not None:
            prev_reward_batch = [prev_reward]
        if info is not None:
            info_batch = [info]
        if episode is not None:
            episodes = [episode]
        if state is not None:
            state_batch = [
                s.unsqueeze(0)
                if torch and isinstance(s, torch.Tensor) else [s]
                for s in state
            ]

        batched_action, state_out, info = self.compute_actions(
            [obs],
            state_batch,
            prev_action_batch=prev_action_batch,
            prev_reward_batch=prev_reward_batch,
            info_batch=info_batch,
            episodes=episodes,
            explore=explore,
            timestep=timestep)

        single_action = unbatch(batched_action)
        assert len(single_action) == 1
        single_action = single_action[0]

        if clip_actions:
            single_action = clip_action(single_action,
                                        self.action_space_struct)

        # Return action, internal state(s), infos.
        return single_action, [s[0] for s in state_out], \
            {k: v[0] for k, v in info.items()}
Example #11
    def _process_policy_eval_results(
        self,
        to_eval: Dict[PolicyID, List[_PolicyEvalData]],
        eval_results: Dict[PolicyID, PolicyOutputType],
        off_policy_actions: MultiEnvDict,
    ):
        """Process the output of policy neural network evaluation.

        Records policy evaluation results into agent connectors and
        returns replies to send back to agents in the env.

        Args:
            to_eval: Mapping of policy IDs to lists of _PolicyEvalData objects.
            eval_results: Mapping of policy IDs to list of
                actions, rnn-out states, extra-action-fetches dicts.
            off_policy_actions: Doubly keyed dict of env-ids -> agent ids ->
                off-policy-action, returned by a `BaseEnv.poll()` call.

        Returns:
            Nested dict of env id -> agent id -> actions to be sent to
            Env (np.ndarrays).
        """
        actions_to_send: Dict[EnvID, Dict[AgentID,
                                          EnvActionType]] = defaultdict(dict)
        for eval_data in to_eval.values():
            for d in eval_data:
                actions_to_send[d.env_id] = {}  # at minimum send empty dict

        # types: PolicyID, List[_PolicyEvalData]
        for policy_id, eval_data in to_eval.items():
            actions: TensorStructType = eval_results[policy_id][0]
            actions = convert_to_numpy(actions)

            rnn_out: StateBatches = eval_results[policy_id][1]
            extra_action_out: dict = eval_results[policy_id][2]

            # In case actions is a list (representing the 0th dim of a batch of
            # primitive actions), try converting it first.
            if isinstance(actions, list):
                actions = np.array(actions)
            # Split action-component batches into single action rows.
            actions: List[EnvActionType] = unbatch(actions)

            policy: Policy = _get_or_raise(self._worker.policy_map, policy_id)
            assert (policy.agent_connectors and policy.action_connectors
                    ), "EnvRunnerV2 requires action connectors to work."

            # types: int, EnvActionType
            for i, action in enumerate(actions):
                env_id: int = eval_data[i].env_id
                agent_id: AgentID = eval_data[i].agent_id

                rnn_states: List[StateBatches] = [c[i] for c in rnn_out]
                fetches: Dict = {k: v[i] for k, v in extra_action_out.items()}

                # Post-process policy output by running it through action connectors.
                ac_data = ActionConnectorDataType(
                    env_id, agent_id, (action, rnn_states, fetches))
                action_to_send, rnn_states, fetches = policy.action_connectors(
                    ac_data).output

                action_to_buffer = (
                    action_to_send if env_id not in off_policy_actions
                    or agent_id not in off_policy_actions[env_id] else
                    off_policy_actions[env_id][agent_id])

                # Notify agent connectors with this new policy output.
                # Necessary for state buffering agent connectors, for example.
                ac_data: AgentConnectorDataType = ActionConnectorDataType(
                    env_id, agent_id, (action_to_buffer, rnn_states, fetches))
                policy.agent_connectors.on_policy_output(ac_data)

                assert agent_id not in actions_to_send[env_id]
                actions_to_send[env_id][agent_id] = action_to_send

        return actions_to_send
Example #12
    def compute_single_action(
            self,
            obs: TensorType,
            state: Optional[List[TensorType]] = None,
            prev_action: Optional[TensorType] = None,
            prev_reward: Optional[TensorType] = None,
            info: dict = None,
            episode: Optional["MultiAgentEpisode"] = None,
            clip_actions: bool = False,
            explore: Optional[bool] = None,
            timestep: Optional[int] = None,
            **kwargs) -> \
            Tuple[TensorType, List[TensorType], Dict[str, TensorType]]:
        """Unbatched version of compute_actions.

        Args:
            obs (TensorType): Single observation.
            state (Optional[List[TensorType]]): List of RNN state inputs, if
                any.
            prev_action (Optional[TensorType]): Previous action value, if any.
            prev_reward (Optional[TensorType]): Previous reward, if any.
            info (dict): Info object, if any.
            episode (Optional[MultiAgentEpisode]): This provides access to all
                of the internal episode state, which may be useful for
                model-based or multi-agent algorithms.
            clip_actions (bool): Should actions be clipped?
            explore (Optional[bool]): Whether to pick an exploitation or
                exploration action
                (default: None -> use self.config["explore"]).
            timestep (Optional[int]): The current (sampling) time step.

        Keyword Args:
            kwargs: Forward compatibility.

        Returns:
            Tuple:
                - actions (TensorType): Single action.
                - state_outs (List[TensorType]): List of RNN state outputs,
                    if any.
                - info (dict): Dictionary of extra features, if any.
        """
        prev_action_batch = None
        prev_reward_batch = None
        info_batch = None
        episodes = None
        state_batch = None
        if prev_action is not None:
            prev_action_batch = [prev_action]
        if prev_reward is not None:
            prev_reward_batch = [prev_reward]
        if info is not None:
            info_batch = [info]
        if episode is not None:
            episodes = [episode]
        if state is not None:
            state_batch = [
                s.unsqueeze(0) if torch and isinstance(s, torch.Tensor) else
                np.expand_dims(s, 0) for s in state
            ]

        out = self.compute_actions([obs],
                                   state_batch,
                                   prev_action_batch=prev_action_batch,
                                   prev_reward_batch=prev_reward_batch,
                                   info_batch=info_batch,
                                   episodes=episodes,
                                   explore=explore,
                                   timestep=timestep)

        # Some policies don't return a tuple, but always just a single action.
        # E.g. ES and ARS.
        if not isinstance(out, tuple):
            single_action = out
            state_out = []
            info = {}
        # Normal case: Policy should return (action, state, info) tuple.
        else:
            batched_action, state_out, info = out
            single_action = unbatch(batched_action)
        assert len(single_action) == 1
        single_action = single_action[0]

        if clip_actions:
            single_action = clip_action(single_action,
                                        self.action_space_struct)

        # Return action, internal state(s), infos.
        return single_action, [s[0] for s in state_out], \
            {k: v[0] for k, v in info.items()}
Example #13
    def compute_single_action(
            self,
            obs: TensorStructType,
            state: Optional[List[TensorType]] = None,
            prev_action: Optional[TensorStructType] = None,
            prev_reward: Optional[TensorStructType] = None,
            info: dict = None,
            episode: Optional["MultiAgentEpisode"] = None,
            clip_actions: bool = None,
            explore: Optional[bool] = None,
            timestep: Optional[int] = None,
            unsquash_actions: bool = None,
            **kwargs) -> \
            Tuple[TensorStructType, List[TensorType], Dict[str, TensorType]]:
        """Unbatched version of compute_actions.

        Args:
            obs: Single observation.
            state: List of RNN state inputs, if any.
            prev_action: Previous action value, if any.
            prev_reward: Previous reward, if any.
            info (dict): Info object, if any.
            episode: This provides access to all
                of the internal episode state, which may be useful for
                model-based or multi-agent algorithms.
            unsquash_actions: Should actions be unsquashed according to
                the Policy's action space?
            clip_actions: Should actions be clipped according to the
                Policy's action space?
            explore: Whether to pick an exploitation or
                exploration action
                (default: None -> use self.config["explore"]).
            timestep: The current (sampling) time step.

        Keyword Args:
            kwargs: Forward compatibility.

        Returns:
            - actions: Single action.
            - state_outs: List of RNN state outputs, if any.
            - info: Dictionary of extra features, if any.
        """
        # If policy works in normalized space, we should unsquash the action.
        # Use value of config.normalize_actions, if None.
        unsquash_actions = \
            unsquash_actions if unsquash_actions is not None \
            else self.config["normalize_actions"]
        clip_actions = clip_actions if clip_actions is not None else \
            self.config["clip_actions"]

        prev_action_batch = None
        prev_reward_batch = None
        info_batch = None
        episodes = None
        state_batch = None
        if prev_action is not None:
            prev_action_batch = [prev_action]
        if prev_reward is not None:
            prev_reward_batch = [prev_reward]
        if info is not None:
            info_batch = [info]
        if episode is not None:
            episodes = [episode]
        if state is not None:
            state_batch = [
                s.unsqueeze(0) if torch and isinstance(s, torch.Tensor) else
                np.expand_dims(s, 0) for s in state
            ]

        out = self.compute_actions(tree.map_structure(lambda s: np.array([s]),
                                                      obs),
                                   state_batch,
                                   prev_action_batch=prev_action_batch,
                                   prev_reward_batch=prev_reward_batch,
                                   info_batch=info_batch,
                                   episodes=episodes,
                                   explore=explore,
                                   timestep=timestep)

        # Some policies don't return a tuple, but always just a single action.
        # E.g. ES and ARS.
        if not isinstance(out, tuple):
            single_action = out
            state_out = []
            info = {}
        # Normal case: Policy should return (action, state, info) tuple.
        else:
            batched_action, state_out, info = out
            single_action = unbatch(batched_action)
        assert len(single_action) == 1
        single_action = single_action[0]

        # If we work in normalized action space (normalize_actions=True),
        # we re-translate here into the env's action space.
        if unsquash_actions:
            single_action = unsquash_action(single_action,
                                            self.action_space_struct)
        # Clip, according to env's action space.
        elif clip_actions:
            single_action = clip_action(single_action,
                                        self.action_space_struct)

        # Return action, internal state(s), infos.
        return single_action, [s[0] for s in state_out], \
            {k: v[0] for k, v in info.items()}
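
The docstring above distinguishes unsquashing (re-translating a normalized action back into the env's action space) from clipping (limiting the action to the space's bounds). Below is a minimal illustrative sketch of the two operations for a simple Box space, assuming the usual [-1, 1] normalization; `clip_to_box` and `unsquash_to_box` are hypothetical helpers, not RLlib's `clip_action`/`unsquash_action`.

import numpy as np

def clip_to_box(action, low, high):
    # Limit each action component to the Box bounds.
    return np.clip(action, low, high)

def unsquash_to_box(action, low, high):
    # Map a normalized action in [-1, 1] back into [low, high].
    return low + (np.clip(action, -1.0, 1.0) + 1.0) * (high - low) / 2.0

action = np.array([1.7, -0.3])
print(clip_to_box(action, low=np.array([-1.0, -1.0]), high=np.array([1.0, 1.0])))
print(unsquash_to_box(action, low=np.array([0.0, 0.0]), high=np.array([2.0, 4.0])))
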
Example #14
File: policy.py Project: rlan/ray
    def compute_single_action(
            self,
            obs: Optional[TensorStructType] = None,
            state: Optional[List[TensorType]] = None,
            *,
            prev_action: Optional[TensorStructType] = None,
            prev_reward: Optional[TensorStructType] = None,
            info: dict = None,
            input_dict: Optional[SampleBatch] = None,
            episode: Optional["MultiAgentEpisode"] = None,
            explore: Optional[bool] = None,
            timestep: Optional[int] = None,
            # Kwargs placeholder for future compatibility.
            **kwargs) -> \
            Tuple[TensorStructType, List[TensorType], Dict[str, TensorType]]:
        """Unbatched version of compute_actions.

        Args:
            obs: Single observation.
            state: List of RNN state inputs, if any.
            prev_action: Previous action value, if any.
            prev_reward: Previous reward, if any.
            info: Info object, if any.
            input_dict: A SampleBatch or input dict containing the
                single (unbatched) Tensors to compute actions. If given, it'll
                be used instead of `obs`, `state`, `prev_action|reward`, and
                `info`.
            episode: This provides access to all of the internal episode state,
                which may be useful for model-based or multi-agent algorithms.
            explore: Whether to pick an exploitation or
                exploration action
                (default: None -> use self.config["explore"]).
            timestep: The current (sampling) time step.

        Keyword Args:
            kwargs: Forward compatibility.

        Returns:
            - actions: Single action.
            - state_outs: List of RNN state outputs, if any.
            - info: Dictionary of extra features, if any.
        """
        # Build the input-dict used for the call to
        # `self.compute_actions_from_input_dict()`.
        if input_dict is None:
            input_dict = {SampleBatch.OBS: obs}
            if state is not None:
                for i, s in enumerate(state):
                    input_dict[f"state_in_{i}"] = s
            if prev_action is not None:
                input_dict[SampleBatch.PREV_ACTIONS] = prev_action
            if prev_reward is not None:
                input_dict[SampleBatch.PREV_REWARDS] = prev_reward
            if info is not None:
                input_dict[SampleBatch.INFOS] = info

        # Batch all data in input dict.
        input_dict = tree.map_structure_with_path(
            lambda p, s:
            (s if p == "seq_lens" else s.unsqueeze(0) if torch and isinstance(
                s, torch.Tensor) else np.expand_dims(s, 0)), input_dict)

        episodes = None
        if episode is not None:
            episodes = [episode]

        out = self.compute_actions_from_input_dict(
            input_dict=SampleBatch(input_dict),
            episodes=episodes,
            explore=explore,
            timestep=timestep,
        )

        # Some policies don't return a tuple, but always just a single action.
        # E.g. ES and ARS.
        if not isinstance(out, tuple):
            single_action = out
            state_out = []
            info = {}
        # Normal case: Policy should return (action, state, info) tuple.
        else:
            batched_action, state_out, info = out
            single_action = unbatch(batched_action)
        assert len(single_action) == 1
        single_action = single_action[0]

        # Return action, internal state(s), infos.
        return single_action, [s[0] for s in state_out], \
            {k: v[0] for k, v in info.items()}
Example #15
        def to_config(self):
            return name, None

        @staticmethod
        def from_config(ctx: ConnectorContext, params: List[Any]):
            return LambdaActionConnector(ctx)

    LambdaActionConnector.__name__ = name
    LambdaActionConnector.__qualname__ = name

    register_connector(name, LambdaActionConnector)

    return LambdaActionConnector


# Convert actions and states into numpy arrays if necessary.
ConvertToNumpyConnector = register_lambda_action_connector(
    "ConvertToNumpyConnector",
    lambda actions, states, fetches: (
        convert_to_numpy(actions),
        convert_to_numpy(states),
        fetches,
    ),
)

# Split action-component batches into single action rows.
UnbatchActionsConnector = register_lambda_action_connector(
    "UnbatchActionsConnector",
    lambda actions, states, fetches: (unbatch(actions), states, fetches),
)
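
The two connectors registered above are thin wrappers around `convert_to_numpy` and `unbatch`. For reference, a direct call to `unbatch` on a batched Dict action could look like the sketch below; the import path is an assumption and has moved between Ray releases.

import numpy as np
# Import path is an assumption and may differ across Ray releases.
from ray.rllib.utils.spaces.space_utils import unbatch

# A Dict action space with two components, batched with batch size 3.
batched_actions = {
    "move": np.zeros((3, 2), dtype=np.float32),
    "jump": np.array([0, 1, 1]),
}
single_actions = unbatch(batched_actions)
assert len(single_actions) == 3
print(single_actions[0])  # one {"move": ..., "jump": ...} action per batch row
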
Example #16
def _process_policy_eval_results(*, to_eval, eval_results, active_episodes,
                                 active_envs, off_policy_actions, policies,
                                 clip_actions):
    """Process the output of policy neural network evaluation.

    Records policy evaluation results into the given episode objects and
    returns replies to send back to agents in the env.

    Args:
        to_eval (Dict[str,List[PolicyEvalData]]): Mapping of policy IDs to
            lists of PolicyEvalData objects.
        eval_results (Dict[str,List]): Mapping of policy IDs to list of
            actions, rnn-out states, extra-action-fetches dicts.
        active_episodes (defaultdict[str,MultiAgentEpisode]): Mapping from
            episode ID to currently ongoing MultiAgentEpisode object.
        active_envs (Set[int]): Set of non-terminated env ids.
        off_policy_actions (dict): Doubly keyed dict of env-ids -> agent ids ->
            off-policy-action, returned by a `BaseEnv.poll()` call.
        policies (Dict[str,Policy]): Mapping from policy ID to Policy obj.
        clip_actions (bool): Whether to clip actions to the action space's
            bounds.

    Returns:
        actions_to_send: Nested dict of env id -> agent id -> agent replies.
    """

    actions_to_send = defaultdict(dict)
    for env_id in active_envs:
        actions_to_send[env_id] = {}  # at minimum send empty dict

    for policy_id, eval_data in to_eval.items():
        rnn_in_cols = _to_column_format([t.rnn_state for t in eval_data])

        actions = eval_results[policy_id][0]
        rnn_out_cols = eval_results[policy_id][1]
        pi_info_cols = eval_results[policy_id][2]

        # In case actions is a list (representing the 0th dim of a batch of
        # primitive actions), try to convert it first.
        if isinstance(actions, list):
            actions = np.array(actions)

        if len(rnn_in_cols) != len(rnn_out_cols):
            raise ValueError("Length of RNN in did not match RNN out, got: "
                             "{} vs {}".format(rnn_in_cols, rnn_out_cols))
        # Add RNN state info
        for f_i, column in enumerate(rnn_in_cols):
            pi_info_cols["state_in_{}".format(f_i)] = column
        for f_i, column in enumerate(rnn_out_cols):
            pi_info_cols["state_out_{}".format(f_i)] = column

        policy = _get_or_raise(policies, policy_id)
        # Split action-component batches into single action rows.
        actions = unbatch(actions)
        for i, action in enumerate(actions):
            env_id = eval_data[i].env_id
            agent_id = eval_data[i].agent_id
            # Clip if necessary.
            if clip_actions:
                clipped_action = clip_action(action,
                                             policy.action_space_struct)
            else:
                clipped_action = action
            actions_to_send[env_id][agent_id] = clipped_action
            episode = active_episodes[env_id]
            episode._set_rnn_state(agent_id, [c[i] for c in rnn_out_cols])
            episode._set_last_pi_info(
                agent_id, {k: v[i]
                           for k, v in pi_info_cols.items()})
            if env_id in off_policy_actions and \
                    agent_id in off_policy_actions[env_id]:
                episode._set_last_action(agent_id,
                                         off_policy_actions[env_id][agent_id])
            else:
                episode._set_last_action(agent_id, action)

    return actions_to_send
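
Across all versions of `_process_policy_eval_results` shown here, the return value has the same shape: a nested dict of env id -> agent id -> action, with at least an empty dict for every active env. A small illustration of that structure with made-up values:

from collections import defaultdict

import numpy as np

# Illustrative only: the env-id -> agent-id -> action mapping that is sent
# back to the environment(s).
actions_to_send = defaultdict(dict)
actions_to_send[0] = {}                                # env 0: nothing to send this step
actions_to_send[1]["agent_0"] = np.array([0.5, -0.2])  # e.g. a Box action
actions_to_send[1]["agent_1"] = 3                      # e.g. a Discrete action
print(dict(actions_to_send))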