def _process_policy_eval_results(to_eval, eval_results, active_episodes,
                                 active_envs, off_policy_actions, policies,
                                 clip_actions):
    """Process the output of policy neural network evaluation.

    Records policy evaluation results into the given episode objects and
    returns replies to send back to agents in the env.

    Returns:
        actions_to_send: Nested dict of env id -> agent id -> agent replies.
    """
    actions_to_send = defaultdict(dict)
    for env_id in active_envs:
        actions_to_send[env_id] = {}  # at minimum send empty dict

    for policy_id, eval_data in to_eval.items():
        rnn_in_cols = _to_column_format([t.rnn_state for t in eval_data])

        actions = eval_results[policy_id][0]
        rnn_out_cols = eval_results[policy_id][1]
        pi_info_cols = eval_results[policy_id][2]

        # In case actions is a list (representing the 0th dim of a batch of
        # primitive actions), try to convert it first.
        if isinstance(actions, list):
            actions = np.array(actions)

        if len(rnn_in_cols) != len(rnn_out_cols):
            raise ValueError("Length of RNN in did not match RNN out, got: "
                             "{} vs {}".format(rnn_in_cols, rnn_out_cols))

        # Add RNN state info.
        for f_i, column in enumerate(rnn_in_cols):
            pi_info_cols["state_in_{}".format(f_i)] = column
        for f_i, column in enumerate(rnn_out_cols):
            pi_info_cols["state_out_{}".format(f_i)] = column

        policy = _get_or_raise(policies, policy_id)
        # Clip if necessary (while action components are still batched).
        if clip_actions:
            actions = clip_action(actions, policy.action_space_struct)
        # Split action-component batches into single action rows.
        actions = unbatch(actions)
        for i, action in enumerate(actions):
            env_id = eval_data[i].env_id
            agent_id = eval_data[i].agent_id
            actions_to_send[env_id][agent_id] = action
            episode = active_episodes[env_id]
            episode._set_rnn_state(agent_id, [c[i] for c in rnn_out_cols])
            episode._set_last_pi_info(
                agent_id, {k: v[i] for k, v in pi_info_cols.items()})
            if env_id in off_policy_actions and \
                    agent_id in off_policy_actions[env_id]:
                episode._set_last_action(
                    agent_id, off_policy_actions[env_id][agent_id])
            else:
                episode._set_last_action(agent_id, action)

    return actions_to_send
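# Illustrative sketch (assumption, not part of the module above): what the
# `unbatch` call is relied on to do before actions are routed into
# `actions_to_send[env_id][agent_id]`. The helper `_unbatch_sketch` and the
# sample data in the trailing comment are hypothetical.
import numpy as np
import tree  # dm-tree, also used elsewhere in this code base


def _unbatch_sketch(batched):
    # Split a (possibly nested) batch of action components into a list of
    # per-row actions that keep the same nesting.
    flat = tree.flatten(batched)
    return [
        tree.unflatten_as(batched, [c[i] for c in flat])
        for i in range(len(flat[0]))
    ]

# Two batched action components (batch size 2) become two per-agent actions:
# _unbatch_sketch({"move": np.array([[0.1, 0.2], [0.3, 0.4]]),
#                  "jump": np.array([0, 1])})
# -> [{"move": array([0.1, 0.2]), "jump": 0},
#     {"move": array([0.3, 0.4]), "jump": 1}]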
def compute_actions(self, observation, add_noise=False, update=True):
    # Preprocess and filter the (single) observation, adding a batch dim.
    observation = self.preprocessor.transform(observation)
    observation = self.observation_filter(observation[None], update=update)
    # Sample an action from the policy network.
    action = self.sess.run(
        self.sampler, feed_dict={self.inputs: observation})
    action = unbatch(action)
    # Add Gaussian exploration noise for continuous (Box) action spaces.
    if add_noise and isinstance(self.action_space, gym.spaces.Box):
        action += np.random.randn(*action.shape) * self.action_noise_std
    return action
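# Minimal sketch (assumption): the Gaussian exploration noise added above,
# shown standalone. `noise_std` mirrors `self.action_noise_std`; the helper
# name and example values are hypothetical.
import numpy as np


def add_gaussian_noise(action, noise_std=0.01):
    # Perturb a continuous action with zero-mean Gaussian noise; clipping to
    # the Box bounds (if desired) is left to the caller.
    action = np.asarray(action)
    return action + np.random.randn(*action.shape) * noise_std

# add_gaussian_noise(np.array([0.5, -0.2]), noise_std=0.1)
# -> e.g. array([0.48, -0.13])  (stochastic)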
def compute_actions(self, observation, add_noise=False, update=True):
    # Preprocess and filter the (single) observation, adding a batch dim.
    observation = self.preprocessor.transform(observation)
    observation = self.observation_filter(observation[None], update=update)
    # `actions` is a list of (component) batches.
    actions = self.sess.run(
        self.sampler, feed_dict={self.inputs: observation})
    if add_noise:
        actions = tree.map_structure(self._add_noise, actions,
                                     self.action_space_struct)
    # Convert `actions` into a list of single actions (each a list of
    # action components).
    actions = unbatch(actions)
    return actions
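# Sketch (assumption): `self._add_noise` is not shown above; a plausible
# per-component version would perturb only Box (continuous) components and
# pass other components through unchanged, applied leaf-by-leaf via
# `tree.map_structure` as in the method above. Names below are hypothetical.
import gym
import numpy as np
import tree  # dm-tree


def _add_noise_sketch(component_batch, component_space, noise_std=0.01):
    # Gaussian noise only makes sense for continuous components.
    if isinstance(component_space, gym.spaces.Box):
        return component_batch + \
            np.random.randn(*component_batch.shape) * noise_std
    return component_batch

# Usage over a structured action batch:
# noisy = tree.map_structure(_add_noise_sketch, actions, action_space_struct)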
def _compute_actions(policy, obs_batch, add_noise=False, update=True):
    # Preprocess and filter the observation, adding a batch dim.
    observation = policy.preprocessor.transform(obs_batch)
    observation = policy.observation_filter(observation[None], update=update)
    observation = convert_to_torch_tensor(observation)
    # Forward pass through the model to get action-distribution inputs,
    # then sample an action from the resulting distribution.
    dist_inputs, _ = policy.model({SampleBatch.CUR_OBS: observation}, [], None)
    dist = policy.dist_class(dist_inputs, policy.model)
    action = dist.sample().detach().numpy()
    action = unbatch(action)
    # Add Gaussian exploration noise for continuous (Box) action spaces.
    if add_noise and isinstance(policy.action_space, gym.spaces.Box):
        action += np.random.randn(*action.shape) * policy.action_noise_std
    return action
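# Sketch (assumption): the dist_inputs -> distribution -> sample pattern
# above, using plain torch.distributions as a stand-in for
# `policy.dist_class`. The logits and the Categorical choice are
# hypothetical and only illustrate the shape of the data flow.
import torch

logits = torch.tensor([[0.1, 1.2, -0.3]])        # e.g. model output for 1 obs
dist = torch.distributions.Categorical(logits=logits)
sampled = dist.sample().detach().numpy()         # -> array of shape (1,)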
def compute_single_action(self,
                          obs,
                          state=None,
                          prev_action=None,
                          prev_reward=None,
                          info=None,
                          episode=None,
                          clip_actions=False,
                          explore=None,
                          timestep=None,
                          **kwargs):
    """Unbatched version of compute_actions.

    Arguments:
        obs (obj): Single observation.
        state (list): List of RNN state inputs, if any.
        prev_action (obj): Previous action value, if any.
        prev_reward (float): Previous reward, if any.
        info (dict): Info object, if any.
        episode (MultiAgentEpisode): This provides access to all of the
            internal episode state, which may be useful for model-based or
            multi-agent algorithms.
        clip_actions (bool): Should actions be clipped?
        explore (bool): Whether to pick an exploitation or exploration
            action (default: None -> use self.config["explore"]).
        timestep (int): The current (sampling) time step.
        kwargs: Forward compatibility placeholder.

    Returns:
        actions (obj): Single action.
        state_outs (list): List of RNN state outputs, if any.
        info (dict): Dictionary of extra features, if any.
    """
    prev_action_batch = None
    prev_reward_batch = None
    info_batch = None
    episodes = None
    state_batch = None
    if prev_action is not None:
        prev_action_batch = [prev_action]
    if prev_reward is not None:
        prev_reward_batch = [prev_reward]
    if info is not None:
        info_batch = [info]
    if episode is not None:
        episodes = [episode]
    if state is not None:
        state_batch = [[s] for s in state]

    batched_action, state_out, info = self.compute_actions(
        [obs],
        state_batch,
        prev_action_batch=prev_action_batch,
        prev_reward_batch=prev_reward_batch,
        info_batch=info_batch,
        episodes=episodes,
        explore=explore,
        timestep=timestep)

    single_action = unbatch(batched_action)
    assert len(single_action) == 1
    single_action = single_action[0]

    if clip_actions:
        single_action = clip_action(single_action,
                                    self.action_space_struct)

    # Return action, internal state(s), infos.
    return single_action, [s[0] for s in state_out], \
        {k: v[0] for k, v in info.items()}
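# Usage sketch (assumption): calling `compute_single_action` on an already
# constructed policy object. `policy`, `single_obs`, and `rnn_state` are
# hypothetical placeholders; only parameters defined in the signature above
# are used.
def act_once(policy, single_obs, rnn_state=None):
    # Query one unbatched action, clip it to the action space, and request
    # exploration behavior (pass explore=None to defer to config["explore"]).
    action, state_out, extra_info = policy.compute_single_action(
        single_obs,
        state=rnn_state,
        clip_actions=True,
        explore=True)
    return action, state_out, extra_info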