def compute_actions(self,
                    obs_batch,
                    state_batches=None,
                    prev_action_batch=None,
                    prev_reward_batch=None,
                    info_batch=None,
                    episodes=None,
                    **kwargs):
    obs_batch, action_mask = self._unpack_observation(obs_batch)

    # Compute actions
    with th.no_grad():
        q_values, hiddens = _mac(
            self.model, th.from_numpy(obs_batch),
            [th.from_numpy(np.array(s)) for s in state_batches])
        avail = th.from_numpy(action_mask).float()
        masked_q_values = q_values.clone()
        masked_q_values[avail == 0.0] = -float("inf")
        # epsilon-greedy action selector
        random_numbers = th.rand_like(q_values[:, :, 0])
        pick_random = (random_numbers < self.cur_epsilon).long()
        random_actions = Categorical(avail).sample().long()
        actions = (pick_random * random_actions +
                   (1 - pick_random) * masked_q_values.max(dim=2)[1])
        actions = actions.numpy()
        hiddens = [s.numpy() for s in hiddens]

    return TupleActions(list(actions.transpose([1, 0]))), hiddens, {}
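
# The masked epsilon-greedy selection above can be exercised in isolation. The
# sketch below uses dummy tensors in place of real model outputs; the shapes and
# the epsilon value are illustrative assumptions, not values taken from the policy.
import torch as th
from torch.distributions import Categorical

B, n_agents, n_actions = 4, 3, 6
epsilon = 0.1

q_values = th.rand(B, n_agents, n_actions)
avail = (th.rand(B, n_agents, n_actions) > 0.3).float()  # 1.0 = action available
avail[..., 0] = 1.0  # keep at least one action available per agent

# Unavailable actions get -inf so the greedy argmax can never pick them.
masked_q_values = q_values.clone()
masked_q_values[avail == 0.0] = -float("inf")

# With probability epsilon, sample uniformly among the available actions;
# otherwise take the greedy (masked) argmax, exactly as in compute_actions().
pick_random = (th.rand(B, n_agents) < epsilon).long()
random_actions = Categorical(avail).sample().long()
greedy_actions = masked_q_values.max(dim=2)[1]
actions = pick_random * random_actions + (1 - pick_random) * greedy_actions
print(actions.shape)  # torch.Size([4, 3])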
def compute_actions(self,
                    obs_batch,
                    state_batches=None,
                    prev_action_batch=None,
                    prev_reward_batch=None,
                    info_batch=None,
                    episodes=None,
                    **kwargs):
    obs_batch, action_mask = self._unpack_observation(obs_batch)

    assert len(state_batches) == self.n_agents, state_batches
    state_batches = np.stack(state_batches, axis=1)

    # Compute actions
    with th.no_grad():
        q_values, hiddens = _mac(self.model, th.from_numpy(obs_batch),
                                 th.from_numpy(state_batches))
        avail = th.from_numpy(action_mask).float()
        masked_q_values = q_values.clone()
        masked_q_values[avail == 0.0] = -float("inf")
        # epsilon-greedy action selector
        random_numbers = th.rand_like(q_values[:, :, 0])
        pick_random = (random_numbers < self.cur_epsilon).long()
        random_actions = Categorical(avail).sample().long()
        actions = (pick_random * random_actions +
                   (1 - pick_random) * masked_q_values.max(dim=2)[1])
        actions = var_to_np(actions)
        hiddens = var_to_np(hiddens)

    return (TupleActions(list(actions.transpose([1, 0]))),
            hiddens.transpose([1, 0, 2]), {})
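
# _mac() is not shown in this snippet. The sketch below is a hypothetical
# stand-in illustrating what it is assumed to do for the stacked-state variant
# above: fold the agent dimension into the batch dimension, run the shared
# per-agent model, and unfold the Q-values and hidden states back out.
# AgentModel is made up for the demo; the real model is whatever self.model is.
import torch as th
import torch.nn as nn


class AgentModel(nn.Module):
    """Hypothetical per-agent Q-network with a GRU core."""

    def __init__(self, obs_dim, n_actions, hidden_dim=64):
        super().__init__()
        self.fc = nn.Linear(obs_dim, hidden_dim)
        self.rnn = nn.GRUCell(hidden_dim, hidden_dim)
        self.q = nn.Linear(hidden_dim, n_actions)

    def forward(self, obs, hidden):
        x = th.relu(self.fc(obs))
        h = self.rnn(x, hidden)
        return self.q(h), h


def mac_sketch(model, obs, hidden):
    """Run the shared model on [B, n_agents, obs_dim] inputs by flattening the
    agent dimension into the batch dimension, then restore the agent axis."""
    B, n_agents = obs.shape[0], obs.shape[1]
    q_flat, h_flat = model(obs.reshape(B * n_agents, -1),
                           hidden.reshape(B * n_agents, -1))
    return (q_flat.reshape(B, n_agents, -1),
            h_flat.reshape(B, n_agents, -1))


if __name__ == "__main__":
    model = AgentModel(obs_dim=10, n_actions=5)
    obs = th.rand(4, 3, 10)   # batch of 4, 3 agents
    h0 = th.zeros(4, 3, 64)   # initial hidden state per agent
    q_values, hiddens = mac_sketch(model, obs, h0)
    print(q_values.shape, hiddens.shape)  # [4, 3, 5], [4, 3, 64]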
def _clip_actions(actions, space):
    """Called to clip actions to the specified range of this policy.

    Arguments:
        actions: Batch of actions or TupleActions.
        space: Action space the actions should be present in.

    Returns:
        Clipped batch of actions.
    """
    if isinstance(space, gym.spaces.Box):
        return np.clip(actions, space.low, space.high)
    elif isinstance(space, gym.spaces.Tuple):
        if not isinstance(actions, TupleActions):
            raise ValueError("Expected tuple space for actions {}: {}".format(
                actions, space))
        out = []
        for a, s in zip(actions.batches, space.spaces):
            out.append(_clip_actions(a, s))
        return TupleActions(out)
    else:
        return actions
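
# A small usage sketch for _clip_actions() with a Box space; the bounds and the
# sample batch are made up for illustration.
import gym
import numpy as np

space = gym.spaces.Box(low=-1.0, high=1.0, shape=(3,), dtype=np.float32)
actions = np.array([[1.5, -2.0, 0.3]], dtype=np.float32)
print(_clip_actions(actions, space))  # [[ 1.  -1.   0.3]]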
def rollout_loop(agent,
                 env_name,
                 num_steps,
                 num_episodes,
                 no_render=True,
                 fps=1000,
                 frameskip=1):
    policy_agent_mapping = default_policy_agent_mapping

    if hasattr(agent, "workers"):
        env = agent.workers.local_worker().env
        multiagent = isinstance(env, MultiAgentEnv)
        if agent.workers.local_worker().multiagent:
            policy_agent_mapping = agent.config["multiagent"][
                "policy_mapping_fn"]

        policy_map = agent.workers.local_worker().policy_map
        state_init = {p: m.get_initial_state() for p, m in policy_map.items()}
        use_lstm = {p: len(s) > 0 for p, s in state_init.items()}
        action_init = {
            p: m.action_space.sample()
            for p, m in policy_map.items()
        }
    else:
        env = gym.make(env_name)
        multiagent = False
        use_lstm = {DEFAULT_POLICY_ID: False}

    steps = 0
    full_episodes = 0
    last_render_start = time.time()
    avg_reward = collections.deque([], maxlen=100)

    while steps < (num_steps or steps + 1) and full_episodes < num_episodes:
        mapping_cache = {}  # in case policy_agent_mapping is stochastic
        obs = env.reset()
        agent_states = DefaultMapping(
            lambda agent_id_: state_init[mapping_cache[agent_id_]])
        prev_actions = DefaultMapping(
            lambda agent_id_: action_init[mapping_cache[agent_id_]])
        prev_rewards = collections.defaultdict(lambda: 0.)
        done = False
        reward_episode = 0.0

        while not done and steps < (num_steps or steps + 1):
            multi_obs = obs if multiagent else {_DUMMY_AGENT_ID: obs}
            action_dict = {}
            for agent_id, a_obs in multi_obs.items():
                if a_obs is not None:
                    policy_id = mapping_cache.setdefault(
                        agent_id, policy_agent_mapping(agent_id))
                    p_use_lstm = use_lstm[policy_id]
                    if p_use_lstm:
                        a_action, p_state, _ = agent.compute_action(
                            a_obs,
                            state=agent_states[agent_id],
                            prev_action=prev_actions[agent_id],
                            prev_reward=prev_rewards[agent_id],
                            policy_id=policy_id)
                        agent_states[agent_id] = p_state
                    else:
                        a_action = agent.compute_action(
                            a_obs,
                            prev_action=prev_actions[agent_id],
                            prev_reward=prev_rewards[agent_id],
                            policy_id=policy_id)

                    if isinstance(env.action_space, gym.spaces.Tuple):
                        a_action = TupleActions(a_action)
                        a_action = _unbatch_tuple_actions(a_action)[0]

                    action_dict[agent_id] = a_action
                    prev_actions[agent_id] = a_action

            action = action_dict if multiagent else action_dict[_DUMMY_AGENT_ID]

            # Repeat the chosen action for `frameskip` env steps, accumulating rewards.
            rewards = None
            for frame in range(frameskip):
                next_obs, reward, done, _ = env.step(action)
                if done:
                    log.info('Done at steps %d', steps)
                    break

                if rewards is None:
                    rewards = reward
                elif multiagent:
                    for agent_id, r in reward.items():
                        rewards[agent_id] += r
                else:
                    rewards += reward

                if not no_render:
                    target_delay = 1.0 / fps if fps > 0 else 0
                    current_delay = time.time() - last_render_start
                    time_wait = target_delay - current_delay

                    # note: ASYNC_PLAYER mode actually makes this sleep redundant
                    if time_wait > 0:
                        # log.info('Wait time %.3f', time_wait)
                        time.sleep(time_wait)

                    last_render_start = time.time()
                    env.render()

                steps += 1
                obs = next_obs

            # rewards stays None if the env terminated on the very first frame
            # of the frameskip loop; skip the update in that case.
            if rewards is not None:
                if multiagent:
                    for agent_id, r in rewards.items():
                        prev_rewards[agent_id] = r
                else:
                    prev_rewards[_DUMMY_AGENT_ID] = rewards

            if multiagent:
                done = done['__all__']
                reward_episode += 0 if rewards is None else sum(
                    rewards.values())
            else:
                reward_episode += 0 if rewards is None else rewards

        full_episodes += 1
        avg_reward.append(reward_episode)
        log.info('Reward episode: %.3f, avg_reward %.3f', reward_episode,
                 np.mean(avg_reward))

    env.reset()  # this guarantees that recordings are saved to disk
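
# DefaultMapping is not defined in this snippet. A minimal sketch of the helper
# assumed here: a defaultdict whose factory receives the missing key, so the
# lambdas in rollout_loop() can resolve an agent's policy through mapping_cache
# on first access.
import collections


class DefaultMapping(collections.defaultdict):
    """defaultdict whose default_factory takes the missing key as an argument."""

    def __missing__(self, key):
        self[key] = value = self.default_factory(key)
        return value


# Usage: the first lookup for an unseen agent id builds and caches its entry.
initial_states = DefaultMapping(lambda agent_id: [])
print(initial_states["agent_0"])  # [] created and stored on first access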