Example no. 1
    def compute_actions(self,
                        obs_batch,
                        state_batches=None,
                        prev_action_batch=None,
                        prev_reward_batch=None,
                        info_batch=None,
                        episodes=None,
                        **kwargs):
        obs_batch, action_mask = self._unpack_observation(obs_batch)

        # Compute actions
        with th.no_grad():
            q_values, hiddens = _mac(
                self.model, th.from_numpy(obs_batch),
                [th.from_numpy(np.array(s)) for s in state_batches])
            avail = th.from_numpy(action_mask).float()
            masked_q_values = q_values.clone()
            masked_q_values[avail == 0.0] = -float("inf")
            # epsilon-greedy action selector
            random_numbers = th.rand_like(q_values[:, :, 0])
            pick_random = (random_numbers < self.cur_epsilon).long()
            random_actions = Categorical(avail).sample().long()
            actions = (pick_random * random_actions +
                       (1 - pick_random) * masked_q_values.max(dim=2)[1])
            actions = actions.numpy()
            hiddens = [s.numpy() for s in hiddens]

        return TupleActions(list(actions.transpose([1, 0]))), hiddens, {}
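
The key piece of this example is the masked epsilon-greedy selector inside the no_grad block. The following standalone sketch isolates that logic with dummy shapes; masked_epsilon_greedy is a hypothetical helper name, and the [batch, n_agents, n_actions] layout of q_values and action_mask is assumed to match the code above.

import torch as th
from torch.distributions import Categorical

def masked_epsilon_greedy(q_values, action_mask, epsilon):
    # q_values, action_mask: [batch, n_agents, n_actions]; mask entry 1.0 = legal action.
    avail = action_mask.float()
    masked_q = q_values.clone()
    masked_q[avail == 0.0] = -float("inf")                 # illegal actions never win the argmax
    pick_random = (th.rand_like(q_values[:, :, 0]) < epsilon).long()
    random_actions = Categorical(avail).sample().long()    # uniform over the legal actions
    greedy_actions = masked_q.max(dim=2)[1]
    return pick_random * random_actions + (1 - pick_random) * greedy_actions

# Toy call: 2 envs, 3 agents, 4 actions, with the last action masked out everywhere.
q = th.randn(2, 3, 4)
mask = th.ones(2, 3, 4)
mask[:, :, 3] = 0.0
print(masked_epsilon_greedy(q, mask, epsilon=0.1))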
Example no. 2
    def compute_actions(self,
                        obs_batch,
                        state_batches=None,
                        prev_action_batch=None,
                        prev_reward_batch=None,
                        info_batch=None,
                        episodes=None,
                        **kwargs):
        obs_batch, action_mask = self._unpack_observation(obs_batch)
        assert len(state_batches) == self.n_agents, state_batches
        state_batches = np.stack(state_batches, axis=1)

        # Compute actions
        with th.no_grad():
            q_values, hiddens = _mac(self.model, th.from_numpy(obs_batch),
                                     th.from_numpy(state_batches))
            avail = th.from_numpy(action_mask).float()
            masked_q_values = q_values.clone()
            masked_q_values[avail == 0.0] = -float("inf")
            # epsilon-greedy action selector
            random_numbers = th.rand_like(q_values[:, :, 0])
            pick_random = (random_numbers < self.cur_epsilon).long()
            random_actions = Categorical(avail).sample().long()
            actions = (pick_random * random_actions +
                       (1 - pick_random) * masked_q_values.max(dim=2)[1])
            actions = var_to_np(actions)
            hiddens = var_to_np(hiddens)

        return (TupleActions(list(actions.transpose([1, 0]))),
                hiddens.transpose([1, 0, 2]), {})
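
Relative to Example no. 1, this variant asserts one state batch per agent and stacks them along an agent axis before the forward pass, then transposes the returned hidden states back to a per-agent layout. A small shape sketch with dummy sizes (the variable names here are hypothetical):

import numpy as np

n_agents, batch, hidden = 3, 2, 5
state_batches = [np.zeros((batch, hidden)) for _ in range(n_agents)]  # one [B, H] array per agent

stacked = np.stack(state_batches, axis=1)    # -> [B, n_agents, H], the layout fed to the model
print(stacked.shape)                         # (2, 3, 5)

print(stacked.transpose([1, 0, 2]).shape)    # (3, 2, 5): the [n_agents, B, H] layout returned to the caller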
Example no. 3
def _clip_actions(actions, space):
    """Called to clip actions to the specified range of this policy.

    Arguments:
        actions: Batch of actions or TupleActions.
        space: Action space the actions should be present in.

    Returns:
        Clipped batch of actions.
    """

    if isinstance(space, gym.spaces.Box):
        return np.clip(actions, space.low, space.high)
    elif isinstance(space, gym.spaces.Tuple):
        if not isinstance(actions, TupleActions):
            raise ValueError("Expected tuple space for actions {}: {}".format(
                actions, space))
        out = []
        for a, s in zip(actions.batches, space.spaces):
            out.append(_clip_actions(a, s))
        return TupleActions(out)
    else:
        return actions
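
One possible way to exercise _clip_actions, assuming it lives in the same module as a TupleActions namedtuple with a single batches field (which is how the wrapper referenced above is shaped); the spaces and action values below are made up for illustration:

import collections
import gym
import numpy as np

TupleActions = collections.namedtuple("TupleActions", ["batches"])  # stand-in for the wrapper used above

box = gym.spaces.Box(low=-1.0, high=1.0, shape=(2,))
print(_clip_actions(np.array([[3.0, -5.0]]), box))        # -> [[ 1. -1.]]

tup = gym.spaces.Tuple([box, gym.spaces.Discrete(4)])
actions = TupleActions([np.array([[3.0, -5.0]]), np.array([2])])
clipped = _clip_actions(actions, tup)
print(clipped.batches)                                    # Box component clipped, Discrete component returned unchanged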
Example no. 4
def rollout_loop(agent,
                 env_name,
                 num_steps,
                 num_episodes,
                 no_render=True,
                 fps=1000,
                 frameskip=1):
    policy_agent_mapping = default_policy_agent_mapping

    if hasattr(agent, "workers"):
        env = agent.workers.local_worker().env
        multiagent = isinstance(env, MultiAgentEnv)
        if agent.workers.local_worker().multiagent:
            policy_agent_mapping = agent.config["multiagent"][
                "policy_mapping_fn"]

        policy_map = agent.workers.local_worker().policy_map
        state_init = {p: m.get_initial_state() for p, m in policy_map.items()}
        use_lstm = {p: len(s) > 0 for p, s in state_init.items()}
        action_init = {
            p: m.action_space.sample()
            for p, m in policy_map.items()
        }
    else:
        env = gym.make(env_name)
        multiagent = False
        use_lstm = {DEFAULT_POLICY_ID: False}

    steps = 0
    full_episodes = 0
    last_render_start = time.time()
    avg_reward = collections.deque([], maxlen=100)

    while steps < (num_steps or steps + 1) and full_episodes < num_episodes:
        mapping_cache = {}  # in case policy_agent_mapping is stochastic
        obs = env.reset()
        agent_states = DefaultMapping(
            lambda agent_id_: state_init[mapping_cache[agent_id_]])
        prev_actions = DefaultMapping(
            lambda agent_id_: action_init[mapping_cache[agent_id_]])
        prev_rewards = collections.defaultdict(lambda: 0.)
        done = False
        reward_episode = 0.0

        while not done and steps < (num_steps or steps + 1):
            multi_obs = obs if multiagent else {_DUMMY_AGENT_ID: obs}
            action_dict = {}
            for agent_id, a_obs in multi_obs.items():
                if a_obs is not None:
                    policy_id = mapping_cache.setdefault(
                        agent_id, policy_agent_mapping(agent_id))
                    p_use_lstm = use_lstm[policy_id]
                    if p_use_lstm:
                        a_action, p_state, _ = agent.compute_action(
                            a_obs,
                            state=agent_states[agent_id],
                            prev_action=prev_actions[agent_id],
                            prev_reward=prev_rewards[agent_id],
                            policy_id=policy_id)
                        agent_states[agent_id] = p_state
                    else:
                        a_action = agent.compute_action(
                            a_obs,
                            prev_action=prev_actions[agent_id],
                            prev_reward=prev_rewards[agent_id],
                            policy_id=policy_id)

                    if isinstance(env.action_space, gym.spaces.Tuple):
                        a_action = TupleActions(a_action)
                        a_action = _unbatch_tuple_actions(a_action)[0]

                    action_dict[agent_id] = a_action
                    prev_actions[agent_id] = a_action
            action = action_dict

            action = action if multiagent else action[_DUMMY_AGENT_ID]
            rewards = None

            for frame in range(frameskip):
                next_obs, reward, done, _ = env.step(action)
                if done:
                    log.info('Done at steps %d', steps)
                    break

                if rewards is None:
                    rewards = reward

                else:
                    if multiagent:
                        for agent_id, r in reward.items():
                            rewards[agent_id] += r
                    else:
                        rewards += reward

                if not no_render:
                    target_delay = 1.0 / fps if fps > 0 else 0
                    current_delay = time.time() - last_render_start
                    time_wait = target_delay - current_delay

                    # note: ASYNC_PLAYER mode actually makes this sleep redundant
                    if time_wait > 0:
                        # log.info('Wait time %.3f', time_wait)
                        time.sleep(time_wait)

                    last_render_start = time.time()
                    env.render()

                steps += 1
                obs = next_obs

            if multiagent:
                for agent_id, r in rewards.items():
                    prev_rewards[agent_id] = r
            else:
                prev_rewards[_DUMMY_AGENT_ID] = rewards

            if multiagent:
                done = done['__all__']
                reward_episode += 0 if rewards is None else sum(
                    rewards.values())
            else:
                reward_episode += 0 if rewards is None else rewards

        full_episodes += 1

        avg_reward.append(reward_episode)
        log.info('Reward episode: %.3f, avg_reward %.3f', reward_episode,
                 np.mean(avg_reward))

    env.reset()  # this guarantees that recordings are saved to disk
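
rollout_loop relies on a DefaultMapping helper for per-agent recurrent state and previous actions, but its definition is not part of the example. A minimal sketch of the kind of helper assumed here, i.e. a defaultdict whose factory receives the missing key (which is what the lambdas passed to DefaultMapping above expect); the state_init and mapping_cache values below are made up:

import collections

class DefaultMapping(collections.defaultdict):
    """defaultdict variant that calls its factory with the missing key."""
    def __missing__(self, key):
        self[key] = value = self.default_factory(key)
        return value

# Mirrors the usage in the episode loop: the factory sees the agent id and
# resolves it to the initial state of the policy it is mapped to.
state_init = {"policy_0": [0.0, 0.0]}
mapping_cache = {"agent_0": "policy_0"}
agent_states = DefaultMapping(lambda agent_id: state_init[mapping_cache[agent_id]])
print(agent_states["agent_0"])   # -> [0.0, 0.0], and "agent_0" is now cached as a key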