Example 1
    def __init__(self,
                 example,
                 size,
                 B,
                 initial_replay_buffer_dict,
                 discount=1,
                 n_step_return=1,
                 fix_ratio=0.1):
        self.T = T = math.ceil(size / B)
        self.B = B
        self.size = T * B
        self.discount = discount
        self.n_step_return = n_step_return
        self.t = 0  # Cursor (in T dimension).
        if initial_replay_buffer_dict is None:
            self.samples = buffer_from_example(example, (T, B),
                                               share_memory=self.async_)
        else:
            self.samples = initial_replay_buffer_dict['buffer']
        if n_step_return > 1:  # right now n_step_return = 1
            self.samples_return_ = buffer_from_example(
                example.reward, (T, B), share_memory=self.async_)
            self.samples_done_n = buffer_from_example(example.done, (T, B),
                                                      share_memory=self.async_)
        else:
            self.samples_return_ = self.samples.reward
            self.samples_done_n = self.samples.done
        self._buffer_full = False
        self.off_backward = n_step_return  # Current invalid samples.
        self.off_forward = 1  # i.e. current cursor, prev_action overwritten.

        # Number of time rows corresponding to fix_ratio of the buffer (presumably kept fixed / not overwritten).
        self.fix_T = math.ceil(self.size * fix_ratio / B)
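The (T, B) layout above rounds the requested size up so each of the B environment columns holds T time rows, and fix_ratio presumably reserves a leading block of rows (e.g. seeded from initial_replay_buffer_dict) that is not overwritten. A minimal sketch of that arithmetic, with illustrative values:

import math

size, B, fix_ratio = 1_000_000, 32, 0.1
T = math.ceil(size / B)                          # time rows per environment column -> 31250
actual_size = T * B                              # may exceed the requested size -> 1000000
fix_T = math.ceil(actual_size * fix_ratio / B)   # rows treated as fixed -> 3125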
Example 2
 def collect_evaluation(self, itr):
     traj_infos = [self.TrajInfoCls() for _ in range(len(self.envs))]
     completed_traj_infos = list()
     observations = list()
     for env in self.envs:
         observations.append(env.reset())
     observation = buffer_from_example(observations[0], len(self.envs))
     action = buffer_from_example(
         self.envs[0].action_space.sample(null=True), len(self.envs))
     reward = np.zeros(len(self.envs), dtype="float32")
     obs_pyt, act_pyt, rew_pyt = torchify_buffer(
         (observation, action, reward))
     self.agent.reset()
     for t in range(self.max_T):
         act_pyt, agent_info = self.agent.step(obs_pyt, act_pyt, rew_pyt)
         action = numpify_buffer(act_pyt)
         for b, env in enumerate(self.envs):
             o, r, d, env_info = env.step(action[b])
             traj_infos[b].step(observation[b], action[b], r, d,
                                agent_info[b], env_info)
             if getattr(env_info, "traj_done", d):
                 completed_traj_infos.append(traj_infos[b].terminate(o))
                 traj_infos[b] = self.TrajInfoCls()
                 o = env.reset()
             if d:
                 action[b] = 0  # Prev_action for next step.
                 r = 0
                 self.agent.reset_one(idx=b)
             observation[b] = o
             reward[b] = r
         if (self.max_trajectories is not None
                 and len(completed_traj_infos) >= self.max_trajectories):
             break
     return completed_traj_infos
Example 3
 def collect_evaluation(self, itr):
     traj_infos = [self.TrajInfoCls() for _ in range(len(self.envs))]
     observations = list()
     for env in self.envs:
         observations.append(env.reset())
     observation = buffer_from_example(observations[0], len(self.envs))
     for b, o in enumerate(observations):
         observation[b] = o
     action = buffer_from_example(self.envs[0].action_space.null_value(),
                                  len(self.envs))
     reward = np.zeros(len(self.envs), dtype="float32")
     obs_pyt, act_pyt, rew_pyt = torchify_buffer(
         (observation, action, reward))
     self.agent.reset()
     self.agent.eval_mode(itr)
     for t in range(self.max_T):
         act_pyt, agent_info = self.agent.step(obs_pyt, act_pyt, rew_pyt)
         action = numpify_buffer(act_pyt)
         for b, env in enumerate(self.envs):
             o, r, d, env_info = env.step(action[b])
             traj_infos[b].step(observation[b], action[b], r, d,
                                agent_info[b], env_info)
             if getattr(env_info, "traj_done", d):
                 self.traj_infos_queue.put(traj_infos[b].terminate(o))
                 traj_infos[b] = self.TrajInfoCls()
                 o = env.reset()
             if d:
                 action[b] = 0  # Next prev_action.
                 r = 0
                 self.agent.reset_one(idx=b)
             observation[b] = o
             reward[b] = r
         if self.sync.stop_eval.value:
             break
     self.traj_infos_queue.put(None)  # End sentinel.
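The None pushed here acts as an end sentinel for whoever drains traj_infos_queue. A hedged sketch of what the consumer side looks like (the real reader lives elsewhere in the sampler/runner code; drain_traj_infos is a hypothetical helper):

def drain_traj_infos(traj_infos_queue, n_workers=1):
    """Collect eval trajectory infos until one None sentinel per worker has been seen."""
    traj_infos, sentinels = [], 0
    while sentinels < n_workers:
        info = traj_infos_queue.get()
        if info is None:          # end sentinel from one eval worker
            sentinels += 1
        else:
            traj_infos.append(info)
    return traj_infos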
Example 4
 def __init__(self,
              example,
              sampler_B,
              optim_B,
              batch_T,
              discount=1,
              n_step_return=1,
              T_target=100):
     super().__init__()
      # Note: this allocation is superseded below, once prev_rnn_state is split out of the example.
      self.samples = buffer_from_example(example, (batch_T, optim_B),
                                         share_memory=self.async_)
     field_names = [f for f in example._fields if f != "prev_rnn_state"]
     global SamplesToBuffer
     self.SamplesToBuffer = namedarraytuple("SamplesToBuffer", field_names)
     buffer_example = self.SamplesToBuffer(*(v for k, v in example.items()
                                             if k != "prev_rnn_state"))
     self.buffer_size = optim_B * T_target
     # self.buffer_size = sampler_B * (T_target * optim_B // sampler_B)
     self.samples = buffer_from_example(buffer_example,
                                        (batch_T, self.buffer_size),
                                        share_memory=self.async_)
     self.samples_prev_rnn_state = buffer_from_example(
         example.prev_rnn_state, (self.buffer_size, ),
         share_memory=self.async_)
     self.sleep_length = 0.01
     self.T_target = T_target
     self.t = 0
     self.optim_batch_B = optim_B
Example 5
def build_step_buffer(examples, B):
    bufs = tuple(
        buffer_from_example(examples[k], B, shared_memory=True)
        for k in ["observation", "action", "reward", "done", "agent_info"])
    need_reset = buffer_from_example(examples["done"], B, shared_memory=True)
    step_buffer_np = StepBuffer(*bufs, need_reset)
    step_buffer_pyt = torchify_buffer(step_buffer_np)
    return step_buffer_pyt, step_buffer_np
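Returning both the numpy and torch views works because torchify_buffer wraps each array leaf with torch.from_numpy (or equivalent), which shares memory with its source. A standalone illustration of that linkage, using plain numpy/torch rather than the namedarraytuple machinery:

import numpy as np
import torch

reward_np = np.zeros(4, dtype="float32")
reward_pyt = torch.from_numpy(reward_np)  # shares memory with reward_np
reward_np[0] = 1.0
assert reward_pyt[0].item() == 1.0        # a write through numpy is visible to torch
reward_pyt[1] = 2.0
assert reward_np[1] == 2.0                # and vice versa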
Example 6
    def collect_evaluation(self, itr):
        assert self.max_trajectories == len(self.envs)
        traj_infos = [self.TrajInfoCls() for _ in range(len(self.envs))]
        completed_traj_infos = list()
        observations = list()
        for env in self.envs:
            observations.append(env.reset())
        observation = buffer_from_example(observations[0], len(self.envs))
        for b, o in enumerate(observations):
            observation[b] = o
        action = buffer_from_example(self.envs[0].action_space.null_value(),
                                     len(self.envs))
        reward = np.zeros(len(self.envs), dtype="float32")
        obs_pyt, act_pyt, rew_pyt = torchify_buffer(
            (observation, action, reward))
        self.agent.reset()
        self.agent.eval_mode(itr)
        live_envs = list(range(len(self.envs)))
        for t in range(self.max_T):
            act_pyt, agent_info = self.agent.step(obs_pyt, act_pyt, rew_pyt)
            action = numpify_buffer(act_pyt)

            b = 0
            while b < len(live_envs):  # Not a for loop: live_envs shrinks as trajectories finish.
                env_id = live_envs[b]
                o, r, d, env_info = self.envs[env_id].step(action[b])
                traj_infos[env_id].step(observation[b], action[b], r, d,
                                        agent_info[b], env_info)
                if getattr(env_info, "traj_done", d):
                    completed_traj_infos.append(
                        traj_infos[env_id].terminate(o))

                    observation = delete_ind_from_array(observation, b)
                    reward = delete_ind_from_array(reward, b)
                    action = delete_ind_from_array(action, b)
                    obs_pyt, act_pyt, rew_pyt = torchify_buffer(
                        (observation, action, reward))

                    del live_envs[b]
                    b -= 1  # live_envs[b] is now the next env, so go back one.
                else:
                    observation[b] = o
                    reward[b] = r

                b += 1

                if (self.max_trajectories is not None and
                        len(completed_traj_infos) >= self.max_trajectories):
                    logger.log("Evaluation reached max num trajectories "
                               f"({self.max_trajectories}).")
                    return completed_traj_infos

        if t == self.max_T - 1:
            logger.log("Evaluation reached max num time steps "
                       f"({self.max_T}).")
        return completed_traj_infos
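When an env finishes, the observation/action/reward buffers are shrunk so their leading dim keeps matching live_envs. A hedged sketch of the behavior delete_ind_from_array provides for a plain array leaf (the project's actual helper may differ in details):

import numpy as np

def delete_ind_sketch(arr, ind):
    """Return a copy of arr with leading-dim index `ind` removed."""
    return np.concatenate([arr[:ind], arr[ind + 1:]], axis=0)

obs = np.arange(4 * 3).reshape(4, 3)      # 4 live envs, 3-dim obs
obs = delete_ind_sketch(obs, 1)           # env 1 finished -> shape (3, 3)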
Example 7
def build_samples_buffer(agent, env, batch_spec, bootstrap_value=False,
        agent_shared=True, env_shared=True, subprocess=True, examples=None):
    """Recommended to step/reset agent and env in subprocess, so it doesn't
    affect settings in master before forking workers (e.g. torch num_threads
    (MKL) may be set at first forward computation.)"""
    # import ipdb; ipdb.set_trace()
    if examples is None:
        if subprocess:
            mgr = mp.Manager()
            examples = mgr.dict()  # Examples pickled back to master.
            w = mp.Process(target=get_example_outputs,
                args=(agent, env, examples, subprocess))
            w.start()
            w.join()
        else:
            examples = dict()
            get_example_outputs(agent, env, examples)

    T, B = batch_spec
    all_action = buffer_from_example(examples["action"], (T + 1, B), agent_shared)
    action = all_action[1:]
    prev_action = all_action[:-1]  # Writing to action will populate prev_action.
    # import ipdb; ipdb.set_trace()
    agent_info = buffer_from_example(examples["agent_info"], (T, B), agent_shared)
    agent_buffer = AgentSamples(
        action=action,
        prev_action=prev_action,
        agent_info=agent_info,
    )
    if bootstrap_value:        
        if agent.dual_model:
            bv = buffer_from_example(examples["agent_info"].value, (1, B), agent_shared)
            int_bv = buffer_from_example(examples["agent_info"].value, (1, B), agent_shared)
            agent_buffer = AgentSamplesBsvTwin(*agent_buffer, bootstrap_value=bv, int_bootstrap_value=int_bv)
        else:
            bv = buffer_from_example(examples["agent_info"].value, (1, B), agent_shared)
            agent_buffer = AgentSamplesBsv(*agent_buffer, bootstrap_value=bv)

    observation = buffer_from_example(examples["observation"], (T, B), env_shared) # all zero arrays (except 0th index should equal o_reset)
    next_observation = buffer_from_example(examples["observation"], (T, B), env_shared) 
    all_reward = buffer_from_example(examples["reward"], (T + 1, B), env_shared) # all zero values
    reward = all_reward[1:]
    prev_reward = all_reward[:-1]  # Writing to reward will populate prev_reward.
    done = buffer_from_example(examples["done"], (T, B), env_shared)
    env_info = buffer_from_example(examples["env_info"], (T, B), env_shared)
    env_buffer = EnvSamples(
        observation=observation,
        next_observation=next_observation,
        prev_reward=prev_reward,
        reward=reward,
        done=done,
        env_info=env_info,
    )
    samples_np = Samples(agent=agent_buffer, env=env_buffer)
    samples_pyt = torchify_buffer(samples_np) # this links the two (changes to samples_np will reflect in samples_pyt)
    return samples_pyt, samples_np, examples
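The all_action[1:] / all_action[:-1] split (and the matching reward split) relies on numpy slices being views: the two names alias the same (T + 1, B) storage offset by one step, so writing action[t] is the same memory as prev_action[t + 1]. A minimal demonstration:

import numpy as np

T, B = 5, 2
all_action = np.zeros((T + 1, B))
action, prev_action = all_action[1:], all_action[:-1]
action[0] = 7.0                          # written at time t = 0
assert (prev_action[1] == 7.0).all()     # visible as prev_action at t = 1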
Example 8
 def collect_evaluation(self, itr, include_observations=False):
     traj_infos = [
         self.TrajInfoCls(include_observations=include_observations)
         for _ in range(len(self.envs))
     ]
     completed_traj_infos = list()
     observations = list()
     for env in self.envs:
         observations.append(env.reset())
     observation = buffer_from_example(observations[0], len(self.envs))
     action = buffer_from_example(self.envs[0].action_space.null_value(),
                                  len(self.envs))
     reward = np.zeros(len(self.envs), dtype="float32")
     obs_pyt, act_pyt, rew_pyt = torchify_buffer(
         (observation, action, reward))
     self.agent.reset()
     self.agent.eval_mode(itr)
     for t in range(self.max_T):
         act_pyt, agent_info = self.agent.step(obs_pyt, act_pyt, rew_pyt)
         action = numpify_buffer(act_pyt)
         for b, env in enumerate(self.envs):
             o, r, d, env_info = env.step(action[b])
             if include_observations:
                 traj_infos[b].step(env.render(), action[b], r, d,
                                    agent_info[b], env_info)
             else:
                 traj_infos[b].step(observation[b], action[b], r, d,
                                    agent_info[b], env_info)
             if getattr(env_info, "traj_done", d):
                 completed_traj_infos.append(traj_infos[b].terminate(o))
                 traj_infos[b] = self.TrajInfoCls(
                     include_observations=include_observations)
                 o = env.reset()
             if d:
                 action[b] = 0  # Prev_action for next step.
                 r = 0
                 self.agent.reset_one(idx=b)
             observation[b] = o
             reward[b] = r
         if (self.max_trajectories is not None
                 and len(completed_traj_infos) >= self.max_trajectories):
             logger.log("Evaluation reached max num trajectories "
                        f"({self.max_trajectories}).")
             break
     if t == self.max_T - 1:
         logger.log("Evaluation reached max num time steps "
                    f"({self.max_T}).")
     return completed_traj_infos
Example 9
 def start_envs(self, max_decorrelation_steps=0):
     """Calls reset() on every env and returns agent_inputs buffer."""
     traj_infos = [self.TrajInfoCls() for _ in range(len(self.envs))]
     observations = list()
     for env in self.envs:
         observations.append(env.reset())
     observation = buffer_from_example(observations[0], len(self.envs))
     for b, obs in enumerate(observations):
         observation[b] = obs  # numpy array or namedarraytuple
     prev_action = self.envs[0].action_space.sample(len(self.envs),
                                                    null=True)
     prev_reward = np.zeros(len(self.envs), dtype="float32")
     if self.rank == 0:
         logger.log("Sampler decorrelating envs, max steps: "
                    f"{max_decorrelation_steps}")
     if max_decorrelation_steps == 0:
         return AgentInputs(observation, prev_action,
                            prev_reward), traj_infos
     for b, env in enumerate(self.envs):
         n_steps = 1 + int(np.random.rand() * max_decorrelation_steps)
         env_actions = env.action_space.sample(n_steps)
         for a in env_actions:
             o, r, d, info = env.step(a)
             traj_infos[b].step(o, a, r, d, None, info)
             if getattr(info, "traj_done", d):
                 o = env.reset()
                 traj_infos[b] = self.TrajInfoCls()
             if d:
                 a = env.action_space.sample(null=True)
                 r = 0
         observation[b] = o
         prev_action[b] = a
         prev_reward[b] = r
     return AgentInputs(observation, prev_action, prev_reward), traj_infos
Example 10
 def __init__(self,
              example,
              size,
              B,
              rnn_state_interval,
              batch_T=None,
              **kwargs):
     self.rnn_state_interval = rnn_state_interval
      self.batch_T = batch_T  # May need to be fixed, depending on the replay type.
     if rnn_state_interval <= 1:  # Store no rnn state or every rnn state.
         buffer_example = example
     else:
         # Store some of rnn states; remove from samples.
         field_names = [f for f in example._fields if f != "prev_rnn_state"]
         global SamplesToBuffer
         SamplesToBuffer = namedarraytuple("SamplesToBuffer", field_names)
         buffer_example = SamplesToBuffer(*(v for k, v in example.items()
                                            if k != "prev_rnn_state"))
         size = (
             B * rnn_state_interval *
             math.ceil(  # T as multiple of interval.
                 math.ceil(size / B) / rnn_state_interval))
         self.samples_prev_rnn_state = buffer_from_example(
             example.prev_rnn_state,
             (size // (B * rnn_state_interval), B),
             share_memory=self.async_,
         )
     super().__init__(example=buffer_example, size=size, B=B, **kwargs)
     if rnn_state_interval > 1:
         assert self.T % rnn_state_interval == 0
         self.rnn_T = self.T // rnn_state_interval
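The size rounding above forces T to a multiple of rnn_state_interval so that one rnn state can be stored every interval steps. A worked example with illustrative values:

import math

size, B, interval = 10_000, 8, 40
T = math.ceil(math.ceil(size / B) / interval) * interval  # 1250 -> 1280 (multiple of 40)
size = B * T                                              # 10240
rnn_T = T // interval                                     # 32 stored rnn states per column
# samples_prev_rnn_state then has leading dims (size // (B * interval), B) == (32, 8).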
Example 11
 def __init__(self, example, size, B, replay_T):
     self.T = T = math.ceil(size / B)
     self.B = B
     self.size = T * B
     self.t = 0  # cursor
     self.replay_T = replay_T
     self.samples = buffer_from_example(example, (T, B), share_memory=self.async_)
     self._buffer_full = False
Example 12
def build_step_buffer(examples, B):
    step_bufs = {
        k: buffer_from_example(examples[k], B, share_memory=True)
        for k in ["observation", "action", "reward", "done", "agent_info"]
    }
    step_buffer_np = StepBuffer(**step_bufs)
    step_buffer_pyt = torchify_buffer(step_buffer_np)
    return step_buffer_pyt, step_buffer_np
Example 13
 def __init__(self, *args, **kwargs):
     super().__init__(*args, **kwargs)
     if self.n_step_return > 1:
         self.samples_timeout_n = buffer_from_example(
             self.samples.timeout[0, 0], (self.T, self.B),
             share_memory=self.async_)
     else:
         self.samples_timeout_n = self.samples.timeout
Example 14
def build_samples_buffer(agent,
                         env,
                         batch_spec,
                         bootstrap_value=False,
                         agent_shared=True,
                         env_shared=True,
                         subprocess=True,
                         examples=None):
    """Recommended to step/reset agent and env in subprocess, so it doesn't
    affect settings in master before forking workers (e.g. torch num_threads
    (MKL) may be set at first forward computation.)"""
    if examples is None:
        if subprocess:
            mgr = mp.Manager()
            examples = mgr.dict()  # Examples pickled back to master.
            w = mp.Process(target=get_example_outputs,
                           args=(agent, env, examples, subprocess))
            w.start()
            w.join()
        else:
            examples = dict()
            get_example_outputs(agent, env, examples)

    T, B = batch_spec
    all_action = buffer_from_example(examples["action"], (T + 1, B),
                                     agent_shared)
    action = all_action[1:]
    prev_action = all_action[:-1]  # Writing to action will populate prev_action.
    agent_info = buffer_from_example(examples["agent_info"], (T, B),
                                     agent_shared)
    agent_buffer = AgentSamples(
        action=action,
        prev_action=prev_action,
        agent_info=agent_info,
    )
    if bootstrap_value:
        bv = buffer_from_example(examples["agent_info"].value, (1, B),
                                 agent_shared)
        agent_buffer = AgentSamplesBsv(*agent_buffer, bootstrap_value=bv)

    observation = buffer_from_example(examples["observation"], (T, B),
                                      env_shared)
    all_reward = buffer_from_example(examples["reward"], (T + 1, B),
                                     env_shared)
    reward = all_reward[1:]
    prev_reward = all_reward[:-1]  # Writing to reward will populate prev_reward.
    done = buffer_from_example(examples["done"], (T, B), env_shared)
    env_info = buffer_from_example(examples["env_info"], (T, B), env_shared)
    env_buffer = EnvSamples(
        observation=observation,
        reward=reward,
        prev_reward=prev_reward,
        done=done,
        env_info=env_info,
    )
    samples_np = Samples(agent=agent_buffer, env=env_buffer)
    samples_pyt = torchify_buffer(samples_np)
    return samples_pyt, samples_np, examples
Example 15
def simulate_policy(env, agent):
    # snapshot = torch.load(path_to_params, map_location=torch.device('cpu'))
    # agent_state_dict = snapshot['agent_state_dict']
    # env = GymEnvWrapper(gym.make(env_id, render=True))
    # env = gym.make('HopperPyBulletEnv-v0')
    # env.render(mode='human')
    # env = GymEnvWrapper(env)
    # agent_kwargs = dict(ModelCls=PiMcpVisionModel, QModelCls=QofMcpVisionModel)
    # agent = SacAgent(**agent_kwargs)
    # agent = SacAgent(model_kwargs=dict(hidden_sizes=[512,256, 256]), q_model_kwargs=dict(hidden_sizes=[512, 256, 256]))
    # agent = MujocoFfAgent(ModelCls=PPOMcpModel)
    # agent.initialize(env_spaces=env.spaces)
    # agent.load_state_dict(agent_state_dict)
    # agent.eval_mode(0)
    obs = env.reset()
    observation = buffer_from_example(obs, 1)
    loop_time = 0.04
    while True:
        observation[0] = env.reset()
        action = buffer_from_example(env.action_space.null_value(), 1)
        reward = np.zeros(1, dtype="float32")
        obs_pyt, act_pyt, rew_pyt = torchify_buffer(
            (observation, action, reward))
        done = False
        step = 0
        reward_sum = 0
        env.render()
        # time.sleep(5)
        while not done:
            loop_start = time.time()
            step += 1
            act_pyt, agent_info = agent.step(obs_pyt, act_pyt, rew_pyt)
            action = numpify_buffer(act_pyt)
            obs, reward, done, info = env.step(action[0])
            reward_sum += reward
            observation[0] = obs
            rew_pyt[0] = reward
            sleep_time = loop_time - (time.time() - loop_start)
            sleep_time = 0 if (sleep_time < 0) else sleep_time
            time.sleep(sleep_time)
            env.render(mode='human')
        print('return: ' + str(reward_sum) + '  num_steps: ' + str(step))
Example 16
 def __init__(self, example, size, B, discount=1, n_step_return=1):
     self.T = T = math.ceil(size / B)
     self.B = B
     self.size = T * B
     self.discount = discount
     self.n_step_return = n_step_return
     self.t = 0  # Cursor (in T dimension).
     self.samples = buffer_from_example(example, (T, B),
                                        share_memory=self.async_)
     if n_step_return > 1:
         self.samples_return_ = buffer_from_example(
             example.reward, (T, B), share_memory=self.async_)
         self.samples_done_n = buffer_from_example(example.done, (T, B),
                                                   share_memory=self.async_)
     else:
         self.samples_return_ = self.samples.reward
         self.samples_done_n = self.samples.done
     self._buffer_full = False
     self.off_backward = n_step_return  # Current invalid samples.
     self.off_forward = 1  # i.e. current cursor, prev_action overwritten.
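samples_return_ and samples_done_n are the extra buffers needed when n_step_return > 1; they hold n-step discounted returns and the corresponding done flags. As a reminder of the quantity involved (the standard n-step return, written naively; not this class's vectorized code):

def n_step_return(rewards, dones, t, n, discount):
    """Discounted sum of up to n rewards starting at t, truncated at the first done."""
    ret, running = 0.0, 1.0
    for k in range(n):
        ret += running * rewards[t + k]
        if dones[t + k]:
            break
        running *= discount
    return ret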
Example 17
File: mtgail.py Project: qxcv/mtil
 def __init__(self, total_n_samples, example_samples):
     self.total_n_samples = total_n_samples
     replay_samples = DiscrimReplaySamples(
         all_observation=example_samples.env.observation,
         all_action=example_samples.agent.action)
     T, B = get_leading_dims(replay_samples, n_dim=2)
     assert total_n_samples >= T * B > 0, (total_n_samples, T * B)
     self.circ_buf = buffer_from_example(replay_samples[0, 0],
                                         (total_n_samples, ))
     self.samples_in_buffer = 0
     self.ptr = 0
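total_n_samples sizes a flat circular buffer indexed by ptr; a hedged sketch of the wrap-around write such a buffer typically pairs with (append_flat is a hypothetical helper, not necessarily mtgail.py's API):

import numpy as np

def append_flat(circ_buf, ptr, samples_in_buffer, new_items):
    """Write new_items into circ_buf starting at ptr, wrapping at the end.
    Assumes len(new_items) <= len(circ_buf)."""
    total, n = len(circ_buf), len(new_items)
    first = min(n, total - ptr)
    circ_buf[ptr:ptr + first] = new_items[:first]
    circ_buf[:n - first] = new_items[first:]              # wrapped remainder (empty if it fits)
    return (ptr + n) % total, min(samples_in_buffer + n, total)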
Example 18
    def initialize(
        self,
        agent,
        affinity=None,
        seed=None,
        bootstrap_value=False,
        traj_info_kwargs=None,
        rank=0,
        world_size=1,
    ):
        assert world_size == 1  # world size used in async samplers, not relevant for this class

        T, B = self.batch_spec
        self.agent = agent
        self.env = self.EnvCls(batch_T=T, batch_B=B, **self.env_kwargs)
        env_ranks = list(range(rank * B, (rank + 1) * B))
        agent.initialize(self.env.spaces,
                         share_memory=False,
                         global_B=B,
                         env_ranks=env_ranks)
        self.samples_pyt, self.samples_np, examples = build_samples_buffer(
            agent,
            self.env,
            self.batch_spec,
            bootstrap_value,
            agent_shared=False,
            env_shared=False,
            subprocess=False,
            examples=self._get_example_outputs())

        self.samples_np.env.done[:-1, :] = False
        self.samples_np.env.done[-1, :] = True
        self.traj_info_kwargs = traj_info_kwargs

        self.agent_inputs = AgentInputs(
            buffer_from_example(examples["observation"], (B, )),
            buffer_from_example(examples["action"], (B, )),
            buffer_from_example(examples["reward"], (B, )))
        self._start_agent(B, env_ranks)
        logger.log("BatchedEpisodicSampler initialized.")
        return examples
Example 19
 def __init__(self, example, size, B, replay_T, discount, n_step_return,
              alpha, beta):
     self.T = T = math.ceil(size / B)
     self.B = B
     self.size = T * B
     self.t = 0  # cursor
     self.replay_T = replay_T
     self.discount = discount
     self.n_step_return = n_step_return
     self.alpha = alpha
     self.beta = beta
     self.samples = buffer_from_example(example, (T, B),
                                        share_memory=self.async_)
     if n_step_return > 1:
         self.samples_return_ = buffer_from_example(example.reward, (T, B))
         self.samples_done_n = buffer_from_example(example.done, (T, B))
     else:
         self.samples_return_ = self.samples.reward
         self.samples_done_n = self.samples.done
     self._buffer_full = False
     self.init_priority_tree()
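alpha and beta here are the usual prioritized-replay exponents; as a reminder of the arithmetic they control (standard PER formulas, not this class's tree implementation):

import numpy as np

priorities = np.array([1.0, 2.0, 0.5])
alpha, beta = 0.6, 0.4
probs = priorities ** alpha
probs /= probs.sum()                      # P(i) proportional to p_i^alpha
weights = (len(probs) * probs) ** -beta   # importance weights (N * P(i))^-beta
weights /= weights.max()                  # normalized for a stable update scale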
Example 20
    def start_envs(self, max_decorrelation_steps=0):
        """Calls ``reset()`` on every environment instance, then steps each
        one through a random number of random actions, and returns the
        resulting agent_inputs buffer (`observation`, `prev_action`,
        `prev_reward`)."""
        traj_infos = [self.TrajInfoCls() for _ in range(len(self.envs))]

        prev_action = np.stack(
            [env.action_space.null_value() for env in self.envs])  # noop
        prev_reward = np.zeros(
            len(self.envs),
            dtype="float32")  # total reward (extrinsic + intrinsic)
        observations = list()
        for env in self.envs:
            o = env.reset()
            observations.append(deepcopy(o))  # emulates stepping with noop
        observation = buffer_from_example(observations[0], len(self.envs))
        for b, obs in enumerate(observations):
            observation[b] = obs

        if self.rank == 0:
            logger.log("Sampler decorrelating envs, max steps: "
                       f"{max_decorrelation_steps}")
        if max_decorrelation_steps != 0:
            for b, env in enumerate(self.envs):
                n_steps = 1 + int(np.random.rand() * max_decorrelation_steps)
                for _ in range(n_steps):
                    a = env.action_space.sample()
                    if a.shape == ():  # 'a' gets stored, but if it is array(3) the env needs int(3).
                        action = int(a)
                    else:
                        action = a
                    o, r, d, info = env.step(action)

                    traj_infos[b].step(o, a, r, d, None, info)
                    if getattr(info, "traj_done", d):
                        o = env.reset()
                        traj_infos[b] = self.TrajInfoCls()
                    if d:
                        a = env.action_space.null_value()
                        r = 0
                observation[b] = o
                prev_action[b] = a
                prev_reward[b] = r

        # For action-server samplers.
        if hasattr(self, "step_buffer_np") and self.step_buffer_np is not None:
            self.step_buffer_np.prev_action[:] = prev_action
            self.step_buffer_np.prev_reward[:] = prev_reward
            self.step_buffer_np.observation[:] = observation

        return AgentInputs(observation, prev_action, prev_reward), traj_infos
Example 21
def build_samples_buffer(agent, env, batch_spec, bootstrap_value=False,
                         agent_shared=True, env_shared=True, subprocess=True, examples=None):
    """Recommended to step/reset agent and env in subprocess, so it doesn't
    affect settings in master before forking workers (e.g. torch num_threads
    (MKL) may be set at first forward computation.)

    :param agent: an Agent instance.
    :param env: an environment instance.
    :param batch_spec: a BatchSpec instance.
    """
    if examples is None:
        if subprocess:  # Spawn a subprocess.
            mgr = mp.Manager()  # The Manager module shares resources across processes.
            examples = mgr.dict()  # Examples pickled back to master; a dict the subprocess can share.
            w = mp.Process(target=get_example_outputs,
                           args=(agent, env, examples, subprocess))  # Worker process running the target function with the given args.
            w.start()
            w.join()
        else:
            examples = dict()
            get_example_outputs(agent, env, examples)  # examples is updated in place by get_example_outputs(), so nothing is returned.

    T, B = batch_spec  # Number of time steps and number of environment instances.
    all_action = buffer_from_example(examples["action"], (T + 1, B), agent_shared)
    action = all_action[1:]
    prev_action = all_action[:-1]  # Writing to action will populate prev_action.
    agent_info = buffer_from_example(examples["agent_info"], (T, B), agent_shared)
    agent_buffer = AgentSamples(
        action=action,
        prev_action=prev_action,
        agent_info=agent_info,
    )
    if bootstrap_value:
        bv = buffer_from_example(examples["agent_info"].value, (1, B), agent_shared)
        agent_buffer = AgentSamplesBsv(*agent_buffer, bootstrap_value=bv)

    observation = buffer_from_example(examples["observation"], (T, B), env_shared)
    all_reward = buffer_from_example(examples["reward"], (T + 1, B), env_shared)
    reward = all_reward[1:]
    prev_reward = all_reward[:-1]  # Writing to reward will populate prev_reward.
    done = buffer_from_example(examples["done"], (T, B), env_shared)
    env_info = buffer_from_example(examples["env_info"], (T, B), env_shared)
    env_buffer = EnvSamples(
        observation=observation,
        reward=reward,
        prev_reward=prev_reward,
        done=done,
        env_info=env_info,
    )
    samples_np = Samples(agent=agent_buffer, env=env_buffer)
    samples_pyt = torchify_buffer(samples_np)
    return samples_pyt, samples_np, examples
Example 22
def simulate_policy(env, agent, render):
    static_decoder_path = './qec/referee_decoders/nn_d5_DP_p5'
    static_decoder = load_model(static_decoder_path, compile=True)
    obs = env.reset()
    observation = buffer_from_example(obs, 1)
    loop_time = 0.01
    returns = []
    mses = []
    lifetimes = []
    while True:
        observation[0] = env.reset()
        action = buffer_from_example(env.action_space.null_value(), 1)
        reward = np.zeros(1, dtype="float32")
        obs_pyt, act_pyt, rew_pyt = torchify_buffer(
            (observation, action, reward))
        agent.reset()
        done = False
        step = 0
        reward_sum = 0
        while not done:
            loop_start = time.time()
            step += 1
            act_pyt, agent_info = agent.step(obs_pyt, act_pyt, rew_pyt)
            action = numpify_buffer(act_pyt)[0]
            obs, reward, done, info = env.step(action)
            # done = np.argmax(static_decoder(info.static_decoder_input)[0]) != info.correct_label
            reward_sum += reward
            observation[0] = obs
            rew_pyt[0] = float(reward)

        returns.append(reward_sum)
        lifetimes.append(info.lifetime)
        print('avg return: ' + str(sum(returns) / len(returns)) + ' return: ' +
              str(reward_sum) + '  num_steps: ' + str(step))
        print(
            f'average lifetime: {sum(lifetimes)/len(lifetimes)} lifetime: {info.lifetime}'
        )
Example 23
 def __init__(self, example, shared_memory=False, **kwargs):
     field_names = [f for f in example._fields if f != "observation"]
     global BufferSamples
     BufferSamples = namedarraytuple("BufferSamples", field_names)
     buffer_example = BufferSamples(*(v for k, v in example.items()
                                      if k != "observation"))
     super().__init__(example=buffer_example,
                      shared_memory=shared_memory,
                      **kwargs)
     # Equivalent to image.shape[0] if observation is image array (C,H,W):
     self.n_frames = n_frames = get_leading_dims(example.observation,
                                                 n_dim=1)[0]
     logger.log(f"Frame-based buffer using {n_frames}-frame sequences.")
     # frames: oldest stored at t; duplicate n_frames - 1 beginning & end.
     self.samples_frames = buffer_from_example(
         example.observation[0], (self.T + n_frames - 1, self.B),
         shared_memory=shared_memory)  # [T+n_frames-1,B,H,W]
     # new_frames: shifted so newest stored at t; no duplication.
      self.samples_new_frames = self.samples_frames[n_frames - 1:]  # [T,B,H,W]
     self.samples_n_blanks = buffer_from_example(
         np.zeros(1, dtype="uint8"), (self.T, self.B),
         shared_memory=shared_memory)
     self.off_forward = max(self.off_forward, n_frames - 1)
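samples_frames stores single frames with the oldest frame of each stacked observation at index t, so a full n-frame observation is rebuilt by slicing a window of n_frames consecutive rows. A small sketch of that convention (illustrative shapes; the buffer's real extraction also handles the blank-frame bookkeeping):

import numpy as np

T, B, n_frames, H, W = 6, 1, 4, 84, 84
samples_frames = np.zeros((T + n_frames - 1, B, H, W), dtype="uint8")

def stacked_obs(t, b):
    """(n_frames, H, W) observation for time step t, env b."""
    return samples_frames[t:t + n_frames, b]

assert stacked_obs(0, 0).shape == (n_frames, H, W)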
Example 24
 def start_envs(self, max_decorrelation_steps=0):
     """Calls ``reset()`` on every environment instance, then steps each
     one through a random number of random actions, and returns the
     resulting agent_inputs buffer (`observation`, `prev_action`,
     `prev_reward`)."""
     traj_infos = [self.TrajInfoCls() for _ in range(len(self.envs))]
     observations = list()
     for env in self.envs:
         observations.append(env.reset())
     observation = buffer_from_example(observations[0], len(self.envs))
     for b, obs in enumerate(observations):
         observation[b] = obs  # numpy array or namedarraytuple
     prev_action = np.stack([env.action_space.null_value()
         for env in self.envs])
     prev_reward = np.zeros(len(self.envs), dtype="float32")
     if self.rank == 0:
         logger.log("Sampler decorrelating envs, max steps: "
             f"{max_decorrelation_steps}")
     if max_decorrelation_steps != 0:
         for b, env in enumerate(self.envs):
             n_steps = 1 + int(np.random.rand() * max_decorrelation_steps)
             for _ in range(n_steps):
                 a = env.action_space.sample()
                 o, r, d, info = env.step(a)
                 traj_infos[b].step(o, a, r, d, None, info)
                 if getattr(info, "traj_done", d):
                     o = env.reset()
                     traj_infos[b] = self.TrajInfoCls()
                 if ((type(d) is np.ndarray
                     and d.any()) or (type(d) is bool and d)):
                     a = env.action_space.null_value()
                     r = 0
             observation[b] = o
             prev_action[b] = a
             prev_reward[b] = r
     # For action-server samplers.
     if hasattr(self, "step_buffer_np") and self.step_buffer_np is not None:
         self.step_buffer_np.observation[:] = observation
         self.step_buffer_np.action[:] = prev_action
         self.step_buffer_np.reward[:] = prev_reward
     return AgentInputs(observation, prev_action, prev_reward), traj_infos
Example 25
    def _decorrelate_envs(self):
        """Return agent_inputs and traj_info at the end of decorrelation using random actions (collector.start_envs)"""
        o = self.env.reset()
        prev_observation = buffer_from_example(o[0], self.batch_spec.B)
        prev_reward = np.zeros(self.batch_spec.B, dtype="float32")
        prev_action = np.zeros(self.batch_spec.B, dtype=int)
        traj_infos = self.TrajInfoCls(B=self.batch_spec.B)
        for _ in range(self.decorrelation_steps):
            prev_action[:] = self.env.action_space.sample()  # Sample a random action for each env.
            prev_observation[:], prev_reward[:], d, info = self.env.step(
                prev_action[:])  # Step the batched env.
            traj_infos.step(prev_observation,
                            prev_action,
                            prev_reward,
                            d,
                            None,
                            info,
                            reset_dones=True)  # Update traj_info

        return AgentInputs(prev_observation, prev_action,
                           prev_reward), traj_infos
Example 26
def build_intrinsic_samples_buffer(agent,
                                   env,
                                   batch_spec,
                                   bootstrap_value=False,
                                   next_obs=False,
                                   agent_shared=True,
                                   env_shared=True,
                                   subprocess=True,
                                   examples=None):
    """
    Replaces ``build_samples_buffer`` to add additional buffer space for intrinsic bonus agents.
    If bootstrap_value=True, also adds space for int_bootstrap_value from intrinsic value head.
    If next_obs=True, also adds space for next observations (NOTE: This is memory intensive with
    raw pixel states, as it doubles the space to store images. Keep this as False unless the
    algorithm needs it).
    """
    if examples is None:
        if subprocess:
            mgr = mp.Manager()
            examples = mgr.dict()  # Examples pickled back to master.
            w = mp.Process(target=get_example_outputs,
                           args=(agent, env, examples, subprocess))
            w.start()
            w.join()
        else:
            examples = dict()
            get_example_outputs(agent, env, examples)

    T, B = batch_spec
    all_action = buffer_from_example(examples["action"], (T + 1, B),
                                     agent_shared)
    action = all_action[1:]
    prev_action = all_action[:-1]  # Writing to action will populate prev_action.
    agent_info = buffer_from_example(examples["agent_info"], (T, B),
                                     agent_shared)
    agent_buffer = AgentSamples(
        action=action,
        prev_action=prev_action,
        agent_info=agent_info,
    )
    if bootstrap_value:  # Added buffer space for intrinsic bootstrap value
        bv = buffer_from_example(examples["agent_info"].ext_value, (1, B),
                                 agent_shared)
        int_bv = buffer_from_example(examples["agent_info"].int_value, (1, B),
                                     agent_shared)
        agent_buffer = IntAgentSamplesBsv(*agent_buffer,
                                          bootstrap_value=bv,
                                          int_bootstrap_value=int_bv)

    observation = buffer_from_example(examples["observation"], (T, B),
                                      env_shared)
    all_reward = buffer_from_example(examples["reward"], (T + 1, B),
                                     env_shared)
    reward = all_reward[1:]
    prev_reward = all_reward[:-1]  # Writing to reward will populate prev_reward.
    done = buffer_from_example(examples["done"], (T, B), env_shared)
    env_info = buffer_from_example(examples["env_info"], (T, B), env_shared)

    if next_obs:  # Add buffer space for next obs, if specified
        next_observation = buffer_from_example(examples["observation"], (T, B),
                                               env_shared)
        env_buffer = EnvSamplesPlus(
            observation=observation,
            next_observation=next_observation,
            reward=reward,
            prev_reward=prev_reward,
            done=done,
            env_info=env_info,
        )
    else:
        env_buffer = EnvSamples(
            observation=observation,
            reward=reward,
            prev_reward=prev_reward,
            done=done,
            env_info=env_info,
        )

    samples_np = Samples(agent=agent_buffer, env=env_buffer)
    samples_pyt = torchify_buffer(samples_np)
    return samples_pyt, samples_np, examples
Example 27
    def start_envs(self, max_decorrelation_steps=0):
        """
        Calls reset() on every env and returns agent_inputs buffer.

        This function is called from initialize() of the Sampler classes (e.g. SerialSampler), where it does work such as collecting (sampling) the first batch of data.
        :param max_decorrelation_steps: maximum number of decorrelation steps.
        :return: a namedarraytuple of three elements (observation, action, reward), each itself a list, plus the trajectory statistics (a list of TrajInfo objects).
        """
        traj_infos = [self.TrajInfoCls() for _ in range(len(self.envs))]  # One TrajInfo object per environment.
        observations = list()
        for env in self.envs:  # self.envs is a list of environments, instantiated inside the sampler class (e.g. SerialSampler).
            observations.append(env.reset())
        observation = buffer_from_example(observations[0], len(self.envs))
        for b, obs in enumerate(observations):
            observation[b] = obs  # numpy array or namedarraytuple
        prev_action = np.stack(
            [env.action_space.null_value() for env in self.envs])
        prev_reward = np.zeros(len(self.envs), dtype="float32")
        if self.rank == 0:
            logger.log("Sampler decorrelating envs, max steps: "
                       f"{max_decorrelation_steps}")
        """
        在所有environment内,依次采样一批数据。按我的理解,这里的decorrelation逻辑是这样的:首先指定一个步数(例如100),然后对每一个
        environment都走100步来采样,如果不到100步environment就走到头了也没关系,reset之后从头继续走,反正一共走够100步。所有environment
        里的数据混在一起返回,这样做确实起到了decorrelation的作用。
        """
        if max_decorrelation_steps != 0:
            for b, env in enumerate(self.envs):  # Iterate over all environments; b is the 0-based index, env the instance.
                n_steps = 1 + int(np.random.rand() *
                                  max_decorrelation_steps)  # +1 so the result cannot be 0, which would break the logic.
                for _ in range(n_steps):
                    """
                    关于env.action_space,可参考Env._action_space这个成员变量的值。这里的 env.action_space.sample(),对AtariEnv
                    就是计算IntBox.sample(),即在action space内随机选一个动作的index(并非实际动作),这里没有直接得到action,而是得到
                    一个action space内的一个index,原因是:在env.step(a)里会根据index获取一个action。另外,这里之所以随机获取action
                    space内的一个index,是因为此时是在Collector类的start_envs()函数中,也就是说此时刚开始从environment里收集数据,
                    因此第一次收集的话,是不知道应该采取什么action的(不像后面已经得到一个network的时候可以根据前面的observation算出一个
                    action),所以这里就随机选取一个index就好了。
                    """
                    a = env.action_space.sample()
                    o, r, d, info = env.step(a)  # Execute the action; get observation, reward, done flag, info (statistics).
                    traj_infos[b].step(o, a, r, d, None,
                                       info)  # Update the trajectory statistics.
                    """
                    info是一个namedtuple,取出来的traj_done属性值,是一个bool,表明是否game over了(对Atari游戏来说),如果没有game 
                    over,还要看是不是已经done了(比如游戏通关了),所以getattr()的default value设置成了done标志。
                    """
                    if getattr(info, "traj_done", d):
                        o = env.reset()  # Reset the environment to its initial state.
                        traj_infos[b] = self.TrajInfoCls()  # A fresh TrajInfo object.
                    if d:  # done (e.g. the game was completed)
                        a = env.action_space.null_value()
                        r = 0
                observation[b] = o
                prev_action[b] = a
                prev_reward[b] = r
        # For action-server samplers. rlpyt has a Parallel-GPU mode which uses an action server (see the rlpyt paper).
        if hasattr(self, "step_buffer_np") and self.step_buffer_np is not None:
            self.step_buffer_np.observation[:] = observation
            self.step_buffer_np.action[:] = prev_action
            self.step_buffer_np.reward[:] = prev_reward
        return AgentInputs(observation, prev_action, prev_reward), traj_infos
Example 28
    def collect_evaluation(self, itr):
        traj_infos = [self.TrajInfoCls() for _ in range(len(self.envs))]
        observations = list()
        for env in self.envs:
            observations.append(env.reset())
        observation = buffer_from_example(observations[0], len(self.envs))
        for b, o in enumerate(observations):
            observation[b] = o
        action = buffer_from_example(self.envs[0].action_space.null_value(),
                                     len(self.envs))
        reward = np.zeros(len(self.envs), dtype="float32")
        obs_pyt, act_pyt, rew_pyt = torchify_buffer(
            (observation, action, reward))
        self.agent.reset()
        self.agent.eval_mode(itr)

        #* Modifying the eval logic here: always return traj for each env of a worker
        # obs_pyt: num_eval_env_per x obs_dim(3); act_pyt: num_eval_env_per x act_dim(1); rew_pyt: num_eval_env_per
        envs_done_flag = np.zeros((len(self.envs)))
        for t in range(self.max_T):  # max_T=100, not eval_max_steps
            act_pyt, agent_info = self.agent.step(obs_pyt, act_pyt, rew_pyt)
            action = numpify_buffer(act_pyt)

            # Go through each env in a worker
            for b, env in enumerate(self.envs):
                o, r, d, env_info = env.step(action[b])
                traj_infos[b].step(observation[b], action[b], r, d,
                                   agent_info[b], env_info)

                # Right now this one is never activated since our custom env (pendulum) does not return any info at each step
                if getattr(env_info, "traj_done", d):
                    self.traj_infos_queue.put(traj_infos[b].terminate(o))
                    traj_infos[b] = self.TrajInfoCls()
                    o = env.reset()
                    envs_done_flag[b] = 1

                # Right now this one is never activated since our custom env (pendulum) does not say done
                if d:
                    action[b] = 0  # Next prev_action.
                    r = 0
                    self.agent.reset_one(
                        idx=b)  # this does not do anything right now
                    envs_done_flag[b] = 1

                # Save saliency
                if t == 10 and b == 0 and self.agent.saliency_dir is not None:
                    saliency(img=o,
                             model=self.agent.model,
                             save_path=self.agent.saliency_dir + str(itr) +
                             '.png')

                observation[b] = o
                reward[b] = r
            if self.sync.stop_eval.value:
                break

        # Regardless, add to queue TODO: need to tell traj_info the global index of envs (like which image was used)
        for b in range(len(self.envs)):
            if envs_done_flag[b] < 1e-4:
                self.traj_infos_queue.put(traj_infos[b].terminate(o))

        self.traj_infos_queue.put(None)  # End sentinel.
Example 29
    def start_envs(self, max_decorrelation_steps=0):
        """Calls ``reset()`` on every environment instance, then steps each
        one through a random number of random actions, and returns the
        resulting agent_inputs buffer (`observation`, `prev_action`,
        `prev_reward`)."""
        player_traj_infos = [self.TrajInfoCls() for _ in range(len(self.envs))]
        if isinstance(self.envs[0], CWTO_EnvWrapperAtari):
            observer_traj_infos = [
                self.TrajInfoCls(n_obs=env.window_size, serial=env.serial)
                for env in self.envs
            ]
        else:
            observer_traj_infos = [
                self.TrajInfoCls(n_obs=env.obs_size, serial=env.serial)
                for env in self.envs
            ]
        player_observations = list()
        observer_observations = list()
        for env in self.envs:
            observer_observations.append(env.reset())
            player_observations.append(
                env.player_observation_space.null_value())
        observer_observation = buffer_from_example(observer_observations[0],
                                                   len(self.envs))
        player_observation = buffer_from_example(player_observations[0],
                                                 len(self.envs))
        for b, obs in enumerate(observer_observations):
            observer_observation[b] = obs  # numpy array or namedarraytuple
        player_prev_action = np.stack(
            [env.player_action_space.null_value() for env in self.envs])
        observer_prev_action = np.stack(
            [env.observer_action_space.null_value() for env in self.envs])
        player_prev_reward = np.zeros(len(self.envs), dtype="float32")
        observer_prev_reward = np.zeros(len(self.envs), dtype="float32")
        player_prev_cost = np.zeros(len(self.envs), dtype="float32")
        observer_prev_cost = np.zeros(len(self.envs), dtype="float32")
        player_done = np.zeros(len(self.envs), dtype=bool)
        observer_done = np.zeros(len(self.envs), dtype=bool)
        if self.rank == 0:
            logger.log("Sampler decorrelating envs, max steps: "
                       f"{max_decorrelation_steps}")
        if max_decorrelation_steps != 0:

            for b, env in enumerate(self.envs):
                n_steps = 1 + int(np.random.rand() * max_decorrelation_steps)
                if n_steps % 2 != 0:
                    if n_steps < max_decorrelation_steps or n_steps <= 1:
                        n_steps += 1
                    else:
                        n_steps -= 1
                for cstep in range(n_steps):
                    if env.player_turn:

                        a = env.action_space().sample()
                        o, r, d, info = env.step(a)
                        player_prev_action[b] = a
                        r_obs, cost_obs = env.observer_reward_shaping(
                            r, env.last_obs_act)
                        observer_prev_reward[b] = r_obs
                        observer_prev_cost[b] = cost_obs
                        observer_done[b] = d
                        if cstep > 0:
                            observer_traj_infos[b].step(
                                observer_observation[b],
                                observer_prev_action[b],
                                observer_prev_reward[b],
                                observer_done[b],
                                None,
                                info,
                                cost=cost_obs,
                                obs_act=env.last_obs_act)
                        if d:
                            o = env.reset()
                            observer_prev_reward[b] = 0
                            observer_traj_infos[b] = self.TrajInfoCls(
                                n_obs=env.obs_size, serial=env.serial)
                            player_prev_reward[b] = 0
                            player_traj_infos[b] = self.TrajInfoCls()
                            player_done[b] = d
                        observer_observation[b] = o
                    else:
                        if env.serial:
                            while not env.player_turn:
                                a = env.action_space().sample()
                                o, r, d, info = env.step(a)
                                assert not d
                                observer_prev_action[b] = a
                                if env.player_turn:
                                    r_ply, cost_ply = env.player_reward_shaping(
                                        r, env.last_obs_act)
                                    player_prev_reward[b] = r_ply

                                    player_done[b] = d
                                    if cstep > 0:
                                        player_traj_infos[b].step(
                                            player_observation[b],
                                            player_prev_action[b],
                                            player_prev_reward[b],
                                            player_done[b], None, info,
                                            cost_ply)
                                    player_observation[b] = o
                                else:
                                    observer_prev_reward[b] = r
                                    observer_done[b] = d
                                    if cstep > 0:
                                        observer_traj_infos[b].step(
                                            observer_observation[b],
                                            observer_prev_action[b],
                                            observer_prev_reward[b],
                                            observer_done[b],
                                            None,
                                            info,
                                            cost=0)
                                    observer_observation[b] = o

                        else:
                            a = env.action_space().sample()
                            o, r, d, info = env.step(a)
                            r_ply, cost_ply = env.player_reward_shaping(
                                r, env.last_obs_act)
                            assert not d
                            observer_prev_action[b] = a
                            player_prev_reward[b] = r_ply
                            player_done[b] = d
                            if cstep > 0:
                                player_traj_infos[b].step(
                                    player_observation[b],
                                    player_prev_action[b],
                                    player_prev_reward[b], player_done[b],
                                    None, info, cost_ply)

                            player_observation[b] = o

        # For action-server samplers.
        if hasattr(self, "observer_step_buffer_np"
                   ) and self.observer_step_buffer_np is not None:
            self.observer_step_buffer_np.observation[:] = observer_observation
            self.observer_step_buffer_np.action[:] = observer_prev_action
            self.observer_step_buffer_np.reward[:] = observer_prev_reward
        if hasattr(self, "player_step_buffer_np"
                   ) and self.player_step_buffer_np is not None:
            self.player_step_buffer_np.observation[:] = player_observation
            self.player_step_buffer_np.action[:] = player_prev_action
            self.player_step_buffer_np.reward[:] = player_prev_reward

        return AgentInputs(player_observation, player_prev_action,
                           player_prev_reward), player_traj_infos, AgentInputs(
                               observer_observation, observer_prev_action,
                               observer_prev_reward), observer_traj_infos
Example 30
    def collect_evaluation(self, itr, max_episodes=1):
        assert len(self.envs) == 1, (
            'qec eval collector needs exactly 1 env, otherwise evaluation will be biased')
        traj_infos = [self.TrajInfoCls() for _ in range(len(self.envs))]
        observations = list()
        for env in self.envs:
            observations.append(env.reset())
        observation = buffer_from_example(observations[0], len(self.envs))
        for b, o in enumerate(observations):
            observation[b] = o
        action = buffer_from_example(self.envs[0].action_space.null_value(),
                                     len(self.envs))
        reward = np.zeros(len(self.envs), dtype="float32")
        obs_pyt, act_pyt, rew_pyt = torchify_buffer(
            (observation, action, reward))
        self.agent.reset()
        self.agent.eval_mode(itr)
        num_completed_episodes = 0
        for t in range(self.max_T):
            act_pyt, agent_info = self.agent.step(obs_pyt, act_pyt, rew_pyt)
            action = numpify_buffer(act_pyt)
            static_decoder_inputs = []
            correct_labels = []
            env_infos = []
            done = []
            for b, env in enumerate(self.envs):
                o, r, d, env_info = env.step(action[b])
                done.append(d)
                observation[b] = o
                reward[b] = r
                env_infos.append(env_info)
                static_decoder_inputs.append(env_info.static_decoder_input)
                correct_labels.append(env_info.correct_label)

            static_decoder_inputs = np.stack(static_decoder_inputs)
            correct_labels = np.stack(correct_labels)
            label_prediction = np.argmax(
                self.static_decoder(static_decoder_inputs),
                axis=-1).squeeze(axis=1)
            done = label_prediction != correct_labels

            for b, env in enumerate(self.envs):
                traj_infos[b].step(observation[b], action[b], reward[b],
                                   done[b], agent_info[b], env_infos[b])
                if getattr(env_infos[b], "traj_done", done[b]):
                    self.traj_infos_queue.put(traj_infos[b].terminate(
                        observation[b]))
                    traj_infos[b] = self.TrajInfoCls()
                    observation[b] = env.reset()
                if done[b]:
                    action[b] = 0  # Next prev_action.
                    reward[b] = 0
                    self.agent.reset_one(idx=b)
                    num_completed_episodes += 1
            if num_completed_episodes >= max_episodes:
                print('reached max episodes')
                break
            if self.sync.stop_eval.value:
                print('sync stop')
                break
        self.traj_infos_queue.put(None)  # End sentinel.