def add_traj_to_memory(self, traj):
    """Flatten a trajectory and append every transition to the agent memory.

    Each of ``traj.obs``, ``traj.actions``, ``traj.next_obs``,
    ``traj.rewards`` and ``traj.dones`` has its first two axes swapped and
    then merged into a single leading axis. One ``StepData`` is built per
    row of the flattened arrays and a deep copy of it is appended to
    ``self.agent.memory``. Finally ``self.cur_step`` is advanced by
    ``traj.total_steps``.

    Args:
        traj: object exposing the five array attributes listed above (each
            with at least two axes) and ``total_steps``.
            NOTE(review): presumably axis 0 is time and axis 1 is the
            environment index -- confirm against ``Trajectory``.
    """

    def _merge_leading_axes(arr):
        # Swap before reshaping so that entries sharing the same index on
        # the original axis 1 become contiguous in the flattened result.
        n_rows = arr.shape[0] * arr.shape[1]
        return arr.swapaxes(0, 1).reshape(n_rows, *arr.shape[2:])

    fields = (traj.obs, traj.actions, traj.next_obs, traj.rewards, traj.dones)
    flat_obs, flat_actions, flat_next_obs, flat_rewards, flat_dones = [
        _merge_leading_axes(field) for field in fields
    ]

    for idx in range(flat_obs.shape[0]):
        step = StepData(
            ob=flat_obs[idx],
            action=flat_actions[idx],
            next_ob=flat_next_obs[idx],
            reward=flat_rewards[idx],
            done=flat_dones[idx],
        )
        # The memory receives its own copy of the step data.
        self.agent.memory.append(deepcopy(step))

    self.cur_step += traj.total_steps
def __call__(self, time_steps, sample=True, evaluation=False,
             return_on_done=False, render=False, render_image=False,
             sleep_time=0, reset_first=False,
             reset_kwargs=None, action_kwargs=None,
             get_last_val=False):
    """Roll out the recurrent agent for at most ``time_steps`` steps.

    Args:
        time_steps: maximum number of ``env.step`` calls.
        sample: forwarded to ``self.agent.get_action``.
        evaluation: use ``self.eval_env`` instead of ``self.train_env``,
            always reset first, and clear ``self.obs`` /
            ``self.hidden_states`` on exit.
        return_on_done: stop early once ``all_dones`` (as maintained by
            ``self.get_true_done_next_ob``) is true for every environment.
        render: call ``env.render()`` before each step.
        render_image: grab ``env.get_images()`` before each step and attach
            one image per entry of ``info`` under ``'render_image'``.
        sleep_time: seconds to sleep after rendering a frame.
        reset_first: force a reset before the rollout starts.
        reset_kwargs: keyword arguments for ``self.reset``.
        action_kwargs: extra keyword arguments for ``agent.get_action``.
        get_last_val: when true (and not evaluating), store the agent's
            value estimate under the trajectory extra ``'last_val'``. This
            only happens on the iteration where ``t == time_steps - 1``,
            so an early break skips it.

    Returns:
        The ``Trajectory`` holding one ``StepData`` per executed step.
    """
    traj = Trajectory()
    reset_kwargs = {} if reset_kwargs is None else reset_kwargs
    action_kwargs = {} if action_kwargs is None else action_kwargs
    env = self.eval_env if evaluation else self.train_env

    # In RL^2, we should always reset at the beginning of a rollout.
    needs_reset = self.obs is None or reset_first or evaluation
    if needs_reset:
        self.reset(**reset_kwargs)

    cur_ob = self.obs
    rnn_state = self.hidden_states
    # Some environments reuse the returned observation object, so copy it
    # to avoid storing the same instance in the trajectory repeatedly.
    # Only a freshly generated observation is copied, which keeps
    # traj[t].next_ob the same instance as traj[t + 1].ob.
    cur_ob = deepcopy(cur_ob)

    all_dones = np.zeros(env.num_envs, dtype=bool) if return_on_done else None
    env_done = None

    for t in range(time_steps):
        if render:
            env.render()
            if sleep_time > 0:
                time.sleep(sleep_time)
        if render_image:
            # Capture the images at the same time step as the observation.
            imgs = deepcopy(env.get_images())

        action, action_info, rnn_state = self.agent.get_action(
            cur_ob,
            sample=sample,
            hidden_state=rnn_state,
            **action_kwargs
        )
        if self.hidden_state_shape is None:
            self.hidden_state_shape = rnn_state.shape

        next_ob, reward, env_done, info = env.step(action)

        if render_image:
            for img, inf in zip(imgs, info):
                inf['render_image'] = deepcopy(img)

        true_next_ob, true_done, all_dones = self.get_true_done_next_ob(
            next_ob, env_done, reward, info, all_dones)

        # "extra" carries the raw done flag returned by env.step; it tells
        # whether the environment was reset, and therefore whether the
        # hidden state has to be reset as well.
        step = StepData(
            ob=cur_ob,
            action=action,
            action_info=action_info,
            next_ob=true_next_ob,
            reward=reward,
            done=true_done,
            info=info,
            extra=env_done,
        )
        cur_ob = next_ob
        traj.add(step)

        if return_on_done and np.all(all_dones):
            break

        # The order of the next statements matters, do not exchange them:
        # the value is queried with the hidden state produced by this step,
        # before check_hidden_state() gets to modify it.
        if get_last_val and not evaluation and t == time_steps - 1:
            last_val, _ = self.agent.get_val(traj[-1].next_ob_raw,
                                             hidden_state=rnn_state)
            traj.add_extra(
                'last_val',
                None if last_val is None else torch_to_np(last_val),
            )
        rnn_state = self.check_hidden_state(rnn_state, done=env_done)

    self.obs = None if evaluation else cur_ob
    self.hidden_states = None if evaluation else rnn_state.detach()
    return traj
def __call__(self, time_steps, sample=True, evaluation=False,
             return_on_done=False, render=False, render_image=False,
             sleep_time=0, reset_first=False,
             reset_kwargs=None, action_kwargs=None,
             random_action=False, get_last_val=False):
    """Collect a trajectory of at most ``time_steps`` environment steps.

    Args:
        time_steps: maximum number of ``env.step`` calls.
        sample: forwarded to ``self.agent.get_action``.
        evaluation: use ``self.eval_env`` instead of ``self.train_env``,
            always reset first, and clear ``self.obs`` on exit.
        return_on_done: stop early once ``all_dones`` (as maintained by
            ``self.get_true_done_next_ob``) is true for every environment.
        render: call ``env.render()`` before each step.
        render_image: grab images via ``get_render_images(env)`` before
            each step and attach one per entry of ``info`` under
            ``'render_image'``.
        sleep_time: seconds to sleep after rendering a frame.
        reset_first: force a reset before the rollout starts.
        reset_kwargs: extra keyword arguments for ``self.reset``.
        action_kwargs: extra keyword arguments for ``agent.get_action``.
        random_action: take ``env.random_actions()`` instead of querying
            the agent; ``action_info`` is then an empty dict.
        get_last_val: when true (and not evaluating), store the agent's
            value for the last step's ``next_ob`` under the trajectory
            extra ``'last_val'``.

    Returns:
        The ``Trajectory`` holding one ``StepData`` per executed step.
    """
    traj = Trajectory()
    reset_kwargs = {} if reset_kwargs is None else reset_kwargs
    action_kwargs = {} if action_kwargs is None else action_kwargs
    env = self.eval_env if evaluation else self.train_env

    if self.obs is None or reset_first or evaluation:
        self.reset(env=env, **reset_kwargs)

    # Some environments reuse the returned observation object, so copy it
    # to avoid storing the same instance in the trajectory repeatedly.
    # Only a freshly generated observation is copied, which keeps
    # traj[t].next_ob the same instance as traj[t + 1].ob.
    cur_ob = deepcopy(self.obs)

    all_dones = np.zeros(env.num_envs, dtype=bool) if return_on_done else None

    for _ in range(time_steps):
        if render:
            env.render()
            if sleep_time > 0:
                time.sleep(sleep_time)
        if render_image:
            # Capture the images at the same time step as the observation.
            imgs = get_render_images(env)

        if not random_action:
            action, action_info = self.agent.get_action(cur_ob,
                                                        sample=sample,
                                                        **action_kwargs)
        else:
            action = env.random_actions()
            action_info = {}

        next_ob, reward, done, info = env.step(action)

        if render_image:
            for img, inf in zip(imgs, info):
                inf['render_image'] = deepcopy(img)

        true_next_ob, true_done, all_dones = self.get_true_done_next_ob(
            next_ob, done, reward, info, all_dones,
            skip_record=evaluation)

        step = StepData(
            ob=cur_ob,
            action=action,
            action_info=action_info,
            next_ob=true_next_ob,
            reward=reward,
            done=true_done,
            info=info,
        )
        cur_ob = next_ob
        traj.add(step)

        if return_on_done and np.all(all_dones):
            break

    if get_last_val and not evaluation:
        last_val = self.agent.get_val(traj[-1].next_ob)
        traj.add_extra('last_val', torch_to_np(last_val))

    self.obs = None if evaluation else cur_ob
    return traj
def __call__(self, time_steps, sample=True, evaluation=False,
             return_on_done=False, render=False, render_image=False,
             sleep_time=0, reset_first=False,
             env_reset_kwargs=None, agent_reset_kwargs=None,
             action_kwargs=None, random_action=False):
    """Collect a trajectory, resetting the environment whenever it is done.

    Args:
        time_steps: maximum number of ``env.step`` calls.
        sample: forwarded to ``self.agent.get_action``.
        evaluation: use ``self.eval_env`` instead of ``self.train_env``,
            always reset first, and clear ``self.obs`` on exit. The flag
            is also passed to ``agent.get_action`` as ``eval=...``.
        return_on_done: stop as soon as ``done`` is truthy.
        render: call ``env.render()`` before each step.
        render_image: grab images via ``get_render_images(env)`` before
            each step and attach one per entry of ``info`` under
            ``'render_image'``.
        sleep_time: seconds to sleep after rendering a frame.
        reset_first: force a reset before the rollout starts.
        env_reset_kwargs: forwarded to ``self.reset``.
        agent_reset_kwargs: forwarded to ``self.reset``.
        action_kwargs: extra keyword arguments for ``agent.get_action``.
            The mapping is copied, so the caller's object is never
            modified.
        random_action: sample from ``env.action_space`` instead of
            querying the agent; ``action_info`` is then an empty dict.

    Returns:
        The ``Trajectory`` holding one ``StepData`` per executed step.
    """
    traj = Trajectory()
    if env_reset_kwargs is None:
        env_reset_kwargs = {}
    if agent_reset_kwargs is None:
        agent_reset_kwargs = {}
    # Work on a private copy: writing the 'eval' key into the caller's
    # dict would leak this call's evaluation flag into later calls that
    # reuse the same dict.
    action_kwargs = {} if action_kwargs is None else dict(action_kwargs)
    action_kwargs['eval'] = evaluation

    env = self.eval_env if evaluation else self.train_env

    if self.obs is None or reset_first or evaluation:
        self.reset(env=env,
                   env_reset_kwargs=env_reset_kwargs,
                   agent_reset_kwargs=agent_reset_kwargs)
    ob = deepcopy(self.obs)

    for _ in range(time_steps):
        if render:
            env.render()
            if sleep_time > 0:
                time.sleep(sleep_time)
        if render_image:
            # Capture the images at the same time step as the observation.
            imgs = get_render_images(env)

        if random_action:
            action = env.action_space.sample()
            if len(action.shape) == 1:
                # the first dim is num_envs
                action = list_to_numpy(action, expand_dims=0)
            action_info = dict()
        else:
            action, action_info = self.agent.get_action(ob,
                                                        sample=sample,
                                                        **action_kwargs)

        next_ob, reward, done, info = env.step(action)

        if render_image:
            for img, inf in zip(imgs, info):
                inf['render_image'] = deepcopy(img)

        # An episode cut short by a time limit is not a true terminal
        # state: clear the stored done flag when the info entry reports
        # 'TimeLimit.truncated'.
        true_done = deepcopy(done)
        for iidx, inf in enumerate(info):
            true_done[iidx] = (true_done[iidx]
                               and not inf.get('TimeLimit.truncated', False))

        sd = StepData(ob=ob,
                      action=action,
                      action_info=action_info,
                      next_ob=next_ob,
                      reward=reward,
                      done=true_done,
                      info=info)
        ob = next_ob
        traj.add(sd)

        # NOTE(review): `done` is used directly as a truth value here,
        # which presumably means a single environment -- confirm.
        if return_on_done and done:
            break
        if done:
            ob = self.reset(env, env_reset_kwargs, agent_reset_kwargs)

    self.obs = None if evaluation else deepcopy(ob)
    return traj
def __call__(self, time_steps, sample=True, evaluation=False,
             return_on_done=False, render=False, render_image=False,
             sleep_time=0, reset_kwargs=None, action_kwargs=None):
    """Reset the environment and collect at most ``time_steps`` steps.

    Args:
        time_steps: maximum number of ``env.step`` calls.
        sample: forwarded to ``self.agent.get_action``.
        evaluation: use ``self.eval_env`` instead of ``self.train_env``
            and skip the final value estimate.
        return_on_done: stop early once every environment has reported
            ``done`` at least once during this rollout.
        render: call ``env.render()`` before each step.
        render_image: grab ``env.get_images()`` before each step and
            attach one image per entry of ``info`` under
            ``'render_image'``.
        sleep_time: seconds to sleep after rendering a frame.
        reset_kwargs: keyword arguments for ``env.reset``.
        action_kwargs: extra keyword arguments for ``agent.get_action``.

    Returns:
        The ``Trajectory`` holding one ``StepData`` per executed step.
        Unless ``evaluation`` is true, it also carries the agent's value
        for the last step's ``next_ob_raw`` under the extra ``'last_val'``.
    """
    traj = Trajectory()
    reset_kwargs = {} if reset_kwargs is None else reset_kwargs
    action_kwargs = {} if action_kwargs is None else action_kwargs
    env = self.eval_env if evaluation else self.train_env

    # Some environments reuse the returned observation object, so copy it
    # to avoid storing the same instance in the trajectory repeatedly.
    # Only a freshly generated observation is copied, which keeps
    # traj[t].next_ob the same instance as traj[t + 1].ob.
    cur_ob = deepcopy(env.reset(**reset_kwargs))

    if return_on_done:
        # Only needed (and only defined) when early stopping is requested.
        all_dones = np.zeros(env.num_envs, dtype=bool)

    for _ in range(time_steps):
        if render:
            env.render()
            if sleep_time > 0:
                time.sleep(sleep_time)
        if render_image:
            # Capture the images at the same time step as the observation.
            imgs = deepcopy(env.get_images())

        action, action_info = self.agent.get_action(cur_ob,
                                                    sample=sample,
                                                    **action_kwargs)
        stepped_ob, reward, done, info = env.step(action)
        next_ob = deepcopy(stepped_ob)

        if render_image:
            for img, inf in zip(imgs, info):
                inf['render_image'] = deepcopy(img)

        finished = np.argwhere(done).flatten()
        if return_on_done and finished.size > 0:
            # The vec env automatically resets an environment when it is
            # done, so the returned next_ob is not actually the next
            # observation of the finished episode.
            all_dones[finished] = True

        step = StepData(
            ob=cur_ob,
            action=deepcopy(action),
            action_info=deepcopy(action_info),
            next_ob=next_ob,
            reward=deepcopy(reward),
            done=deepcopy(done),
            info=deepcopy(info),
        )
        cur_ob = next_ob
        traj.add(step)

        if return_on_done and np.all(all_dones):
            break

    if not evaluation:
        last_val = self.agent.get_val(traj[-1].next_ob_raw)
        traj.add_extra('last_val', torch_to_np(last_val))
    return traj