Code Example #1
    def __call__(self,
                 time_steps,
                 sample=True,
                 evaluation=False,
                 return_on_done=False,
                 render=False,
                 render_image=False,
                 sleep_time=0,
                 reset_first=False,
                 reset_kwargs=None,
                 action_kwargs=None,
                 random_action=False,
                 get_last_val=False):
        traj = Trajectory()
        if reset_kwargs is None:
            reset_kwargs = {}
        if action_kwargs is None:
            action_kwargs = {}
        if evaluation:
            env = self.eval_env
        else:
            env = self.train_env
        if self.obs is None or reset_first or evaluation:
            self.reset(env=env, **reset_kwargs)
        ob = self.obs
        # A deepcopy() here is critical for some environments, depending on the
        # returned ob data: it avoids adding the same ob object to the traj
        # repeatedly. Only deepcopy() when a new ob is generated, so that
        # traj[t].next_ob is still the same instance as traj[t+1].ob.
        ob = deepcopy(ob)
        if return_on_done:
            all_dones = np.zeros(env.num_envs, dtype=bool)
        else:
            all_dones = None
        for t in range(time_steps):
            if render:
                env.render()
                if sleep_time > 0:
                    time.sleep(sleep_time)
            if render_image:
                # get render images at the same time step as ob
                imgs = get_render_images(env)

            if random_action:
                action = env.random_actions()
                action_info = dict()
            else:
                action, action_info = self.agent.get_action(ob,
                                                            sample=sample,
                                                            **action_kwargs)
            next_ob, reward, done, info = env.step(action)

            if render_image:
                for img, inf in zip(imgs, info):
                    inf['render_image'] = deepcopy(img)

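            # get_true_done_next_ob (not shown here) presumably recovers the
            # true final observation and done flag: vectorized envs auto-reset
            # on done, so next_ob from env.step() can already be the first ob
            # of the new episode rather than the terminal one.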
            true_next_ob, true_done, all_dones = self.get_true_done_next_ob(
                next_ob, done, reward, info, all_dones, skip_record=evaluation)
            sd = StepData(ob=ob,
                          action=action,
                          action_info=action_info,
                          next_ob=true_next_ob,
                          reward=reward,
                          done=true_done,
                          info=info)
            ob = next_ob
            traj.add(sd)
            if return_on_done and np.all(all_dones):
                break

        if get_last_val and not evaluation:
            last_val = self.agent.get_val(traj[-1].next_ob)
            traj.add_extra('last_val', torch_to_np(last_val))
        self.obs = ob if not evaluation else None
        return traj
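A minimal usage sketch for this runner, assuming it is a callable object (`runner` below is a hypothetical instance) whose `agent`, `train_env`, and `eval_env` are already configured:

    # collect a 128-step training rollout and bootstrap the value of the last state
    traj = runner(time_steps=128, sample=True, get_last_val=True)

    # evaluation rollout: act greedily and stop once every vectorized env is done
    eval_traj = runner(time_steps=1000, evaluation=True, sample=False,
                       return_on_done=True)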
Code Example #2
    def __call__(self,
                 time_steps,
                 sample=True,
                 evaluation=False,
                 return_on_done=False,
                 render=False,
                 render_image=False,
                 sleep_time=0,
                 reset_first=False,
                 reset_kwargs=None,
                 action_kwargs=None,
                 get_last_val=False):
        traj = Trajectory()
        if reset_kwargs is None:
            reset_kwargs = {}
        if action_kwargs is None:
            action_kwargs = {}
        if evaluation:
            env = self.eval_env
        else:
            env = self.train_env
        # In RL^2, we should always reset at the beginning of a rollout.
        if self.obs is None or reset_first or evaluation:
            self.reset(**reset_kwargs)
        ob = self.obs
        hidden_state = self.hidden_states
        # A deepcopy() here is critical for some environments, depending on the
        # returned ob data: it avoids adding the same ob object to the traj
        # repeatedly. Only deepcopy() when a new ob is generated, so that
        # traj[t].next_ob is still the same instance as traj[t+1].ob.
        ob = deepcopy(ob)
        if return_on_done:
            all_dones = np.zeros(env.num_envs, dtype=bool)
        else:
            all_dones = None
        done = None
        for t in range(time_steps):
            if render:
                env.render()
                if sleep_time > 0:
                    time.sleep(sleep_time)
            if render_image:
                # get render images at the same time step as ob
                imgs = deepcopy(env.get_images())

            action, action_info, hidden_state = self.agent.get_action(
                ob, sample=sample, hidden_state=hidden_state, **action_kwargs)
            if self.hidden_state_shape is None:
                self.hidden_state_shape = hidden_state.shape
            next_ob, reward, done, info = env.step(action)

            if render_image:
                for img, inf in zip(imgs, info):
                    inf['render_image'] = deepcopy(img)

            true_next_ob, true_done, all_dones = self.get_true_done_next_ob(
                next_ob, done, reward, info, all_dones)

            sd = StepData(
                ob=ob,
                action=action,
                action_info=action_info,
                next_ob=true_next_ob,
                reward=reward,
                done=true_done,
                info=info,
                # `done` is saved in "extra" as a flag telling whether the
                # environment was reset, so that we know whether the hidden
                # state needs to be reset as well.
                extra=done,
            )
            ob = next_ob
            traj.add(sd)
            if return_on_done and np.all(all_dones):
                break

            # The order of the next few lines matters; do not reorder them.
            if get_last_val and not evaluation and t == time_steps - 1:
                last_val, _ = self.agent.get_val(traj[-1].next_ob_raw,
                                                 hidden_state=hidden_state)
                if last_val is not None:
                    traj.add_extra('last_val', torch_to_np(last_val))
                else:
                    traj.add_extra('last_val', None)
            hidden_state = self.check_hidden_state(hidden_state, done=done)
        self.obs = ob if not evaluation else None
        self.hidden_states = hidden_state.detach() if not evaluation else None
        return traj
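The recurrent example above threads a hidden state through the rollout and calls `check_hidden_state` (not shown) to reset the state of any environment that has just finished an episode. A minimal sketch of that idea, using a hypothetical `reset_done_hidden_states` helper and an assumed hidden-state layout of (num_layers, num_envs, hidden_size):

    import numpy as np

    def reset_done_hidden_states(hidden_state, done):
        # Hypothetical helper: zero the recurrent state of every env that just
        # reported done. Assumes hidden_state is a torch tensor of shape
        # (num_layers, num_envs, hidden_size) and done is a bool array of
        # length num_envs.
        if hidden_state is None or done is None:
            return hidden_state
        done_idx = np.argwhere(done).flatten().tolist()
        if len(done_idx) > 0:
            hidden_state = hidden_state.clone()
            hidden_state[:, done_idx] = 0.
        return hidden_state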
Code Example #3
    def __call__(self,
                 time_steps,
                 sample=True,
                 evaluation=False,
                 return_on_done=False,
                 render=False,
                 render_image=False,
                 sleep_time=0,
                 reset_kwargs=None,
                 action_kwargs=None):
        traj = Trajectory()
        if reset_kwargs is None:
            reset_kwargs = {}
        if action_kwargs is None:
            action_kwargs = {}
        if evaluation:
            env = self.eval_env
        else:
            env = self.train_env
        ob = env.reset(**reset_kwargs)
        # A deepcopy() here is critical for some environments, depending on the
        # returned ob data: it avoids adding the same ob object to the traj
        # repeatedly. Only deepcopy() when a new ob is generated, so that
        # traj[t].next_ob is still the same instance as traj[t+1].ob.
        ob = deepcopy(ob)
        if return_on_done:
            all_dones = np.zeros(env.num_envs, dtype=bool)
        for t in range(time_steps):
            if render:
                env.render()
                if sleep_time > 0:
                    time.sleep(sleep_time)
            if render_image:
                # get render images at the same time step as ob
                imgs = deepcopy(env.get_images())

            action, action_info = self.agent.get_action(ob,
                                                        sample=sample,
                                                        **action_kwargs)
            next_ob, reward, done, info = env.step(action)
            next_ob = deepcopy(next_ob)
            if render_image:
                for img, inf in zip(imgs, info):
                    inf['render_image'] = deepcopy(img)

            done_idx = np.argwhere(done).flatten()
            if done_idx.size > 0 and return_on_done:
                # the vec env automatically resets an environment when it is
                # done, so the returned next_ob is the first ob of the new
                # episode, not the true next observation
                all_dones[done_idx] = True
            sd = StepData(ob=ob,
                          action=deepcopy(action),
                          action_info=deepcopy(action_info),
                          next_ob=next_ob,
                          reward=deepcopy(reward),
                          done=deepcopy(done),
                          info=deepcopy(info))
            ob = next_ob
            traj.add(sd)
            if return_on_done and np.all(all_dones):
                break
        if not evaluation:
            last_val = self.agent.get_val(traj[-1].next_ob_raw)
            traj.add_extra('last_val', torch_to_np(last_val))
        return traj
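A minimal usage sketch for this simpler variant, which always resets the env at the start and, outside evaluation, always appends last_val to the trajectory (`runner` below is again a hypothetical instance):

    # 256-step training rollout; last_val is added automatically
    traj = runner(time_steps=256, sample=True)

    # evaluation rollout that stores rendered frames in each step's info dict
    eval_traj = runner(time_steps=500, evaluation=True, sample=False,
                       return_on_done=True, render_image=True)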