def visualise_behaviour(env,
                        args,
                        policy,
                        iter_idx,
                        encoder=None,
                        reward_decoder=None,
                        image_folder=None,
                        **kwargs
                        ):
    """
    Visualises the behaviour of the policy, together with the latent state and belief.
    The environment passed to this method should be a SubprocVecEnv or DummyVecEnv, not the raw env!
    """

    num_episodes = args.max_rollouts_per_task
    unwrapped_env = env.venv.unwrapped.envs[0]

    # --- initialise things we want to keep track of ---

    episode_all_obs = [[] for _ in range(num_episodes)]
    episode_prev_obs = [[] for _ in range(num_episodes)]
    episode_next_obs = [[] for _ in range(num_episodes)]
    episode_actions = [[] for _ in range(num_episodes)]
    episode_rewards = [[] for _ in range(num_episodes)]

    episode_returns = []
    episode_lengths = []
    episode_goals = []

    if getattr(unwrapped_env, 'belief_oracle', False):
        episode_beliefs = [[] for _ in range(num_episodes)]
    else:
        episode_beliefs = None

    if encoder is not None:
        # keep track of latent spaces
        episode_latent_samples = [[] for _ in range(num_episodes)]
        episode_latent_means = [[] for _ in range(num_episodes)]
        episode_latent_logvars = [[] for _ in range(num_episodes)]
    else:
        episode_latent_samples = episode_latent_means = episode_latent_logvars = None

    curr_latent_sample = curr_latent_mean = curr_latent_logvar = None

    # --- roll out policy ---

    env.reset_task()
    (obs_raw, obs_normalised) = env.reset()
    obs_raw = obs_raw.float().reshape((1, -1)).to(device)
    obs_normalised = obs_normalised.float().reshape((1, -1)).to(device)
    start_obs_raw = obs_raw.clone()

    for episode_idx in range(num_episodes):

        curr_goal = env.get_task()
        curr_rollout_rew = []
        curr_rollout_goal = []

        if encoder is not None:
            if episode_idx == 0:
                # reset to prior
                curr_latent_sample, curr_latent_mean, curr_latent_logvar, hidden_state = encoder.prior(1)
                curr_latent_sample = curr_latent_sample[0].to(device)
                curr_latent_mean = curr_latent_mean[0].to(device)
                curr_latent_logvar = curr_latent_logvar[0].to(device)
            episode_latent_samples[episode_idx].append(curr_latent_sample[0].clone())
            episode_latent_means[episode_idx].append(curr_latent_mean[0].clone())
            episode_latent_logvars[episode_idx].append(curr_latent_logvar[0].clone())

        episode_all_obs[episode_idx].append(start_obs_raw.clone())
        if getattr(unwrapped_env, 'belief_oracle', False):
            episode_beliefs[episode_idx].append(unwrapped_env.unwrapped._belief_state.copy())

        for step_idx in range(1, env._max_episode_steps + 1):

            if step_idx == 1:
                episode_prev_obs[episode_idx].append(start_obs_raw.clone())
            else:
                episode_prev_obs[episode_idx].append(obs_raw.clone())

            # act
            _, action, _ = utl.select_action(args=args,
                                             policy=policy,
                                             obs=obs_normalised if args.norm_obs_for_policy else obs_raw,
                                             deterministic=True,
                                             latent_sample=curr_latent_sample,
                                             latent_mean=curr_latent_mean,
                                             latent_logvar=curr_latent_logvar)

            # observe reward and next obs
            (obs_raw, obs_normalised), (rew_raw, rew_normalised), done, infos = utl.env_step(env, action)
            obs_raw = obs_raw.reshape((1, -1)).to(device)
            obs_normalised = obs_normalised.reshape((1, -1)).to(device)

            if encoder is not None:
                # update task embedding
                curr_latent_sample, curr_latent_mean, curr_latent_logvar, hidden_state = encoder(
                    action.float().to(device),
                    obs_raw,
                    rew_raw.reshape((1, 1)).float().to(device),
                    hidden_state,
                    return_prior=False)

                episode_latent_samples[episode_idx].append(curr_latent_sample[0].clone())
                episode_latent_means[episode_idx].append(curr_latent_mean[0].clone())
                episode_latent_logvars[episode_idx].append(curr_latent_logvar[0].clone())

            episode_all_obs[episode_idx].append(obs_raw.clone())
            episode_next_obs[episode_idx].append(obs_raw.clone())
            episode_rewards[episode_idx].append(rew_raw.clone())
            episode_actions[episode_idx].append(action.clone())

            curr_rollout_rew.append(rew_raw.clone())
            curr_rollout_goal.append(env.get_task().copy())

            if getattr(unwrapped_env, 'belief_oracle', False):
                episode_beliefs[episode_idx].append(unwrapped_env.unwrapped._belief_state.copy())

            if infos[0]['done_mdp'] and not done:
                start_obs_raw = infos[0]['start_state']
                start_obs_raw = torch.from_numpy(start_obs_raw).float().reshape((1, -1)).to(device)
                break

        episode_returns.append(sum(curr_rollout_rew))
        episode_lengths.append(step_idx)
        episode_goals.append(curr_goal)

    # clean up
    if encoder is not None:
        episode_latent_means = [torch.stack(e) for e in episode_latent_means]
        episode_latent_logvars = [torch.stack(e) for e in episode_latent_logvars]

    episode_prev_obs = [torch.cat(e) for e in episode_prev_obs]
    episode_next_obs = [torch.cat(e) for e in episode_next_obs]
    episode_actions = [torch.cat(e) for e in episode_actions]
    episode_rewards = [torch.cat(e) for e in episode_rewards]

    # plot behaviour & visualise belief in env
    rew_pred_means, rew_pred_vars = plot_bb(env, args, episode_all_obs, episode_goals, reward_decoder,
                                            episode_latent_means, episode_latent_logvars,
                                            image_folder, iter_idx, episode_beliefs)

    if reward_decoder is not None:
        plot_rew_reconstruction(env, rew_pred_means, rew_pred_vars, image_folder, iter_idx)

    return episode_latent_means, episode_latent_logvars, \
           episode_prev_obs, episode_next_obs, episode_actions, episode_rewards, \
           episode_returns
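# --- illustrative sketch, not repository code ----------------------------------
# visualise_behaviour() above relies on encoder.prior(batch_size) returning the
# latent statistics of the prior plus an initial hidden state for the recurrent
# encoder. A minimal version of what such a prior is assumed to look like
# (latent_dim and hidden_dim are placeholder values, not the repo's settings):

import torch

def prior_sketch(batch_size, latent_dim=5, hidden_dim=64):
    # prior over the task embedding: a standard Gaussian
    latent_mean = torch.zeros((1, batch_size, latent_dim))
    latent_logvar = torch.zeros((1, batch_size, latent_dim))  # log(1) = 0
    # reparameterised sample from N(mean, exp(logvar))
    latent_sample = latent_mean + torch.randn_like(latent_mean) * (0.5 * latent_logvar).exp()
    # initial hidden state of the recurrent encoder
    hidden_state = torch.zeros((1, batch_size, hidden_dim))
    return latent_sample, latent_mean, latent_logvar, hidden_state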
def train(self):
    """ Given some stream of environments and a logger (tensorboard), (meta-)trains the policy. """

    start_time = time.time()

    # reset environments
    (prev_obs_raw, prev_obs_normalised) = self.envs.reset()
    prev_obs_raw = prev_obs_raw.to(device)
    prev_obs_normalised = prev_obs_normalised.to(device)

    # insert initial observation / embeddings to rollout storage
    self.policy_storage.prev_obs_raw[0].copy_(prev_obs_raw)
    self.policy_storage.prev_obs_normalised[0].copy_(prev_obs_normalised)
    self.policy_storage.to(device)

    vae_is_pretrained = False
    for self.iter_idx in range(self.args.num_updates):

        # First, re-compute the hidden states given the current rollouts (since the VAE might've changed)
        # compute latent embedding (will return prior if current trajectory is empty)
        with torch.no_grad():
            latent_sample, latent_mean, latent_logvar, hidden_state = self.encode_running_trajectory()

        # check if we flushed the policy storage
        assert len(self.policy_storage.latent_mean) == 0

        # add this initial hidden state to the policy storage
        self.policy_storage.hidden_states[0].copy_(hidden_state)
        self.policy_storage.latent_samples.append(latent_sample.clone())
        self.policy_storage.latent_mean.append(latent_mean.clone())
        self.policy_storage.latent_logvar.append(latent_logvar.clone())

        # rollout policies for a few steps
        for step in range(self.args.policy_num_steps):

            # sample actions from policy
            with torch.no_grad():
                value, action, action_log_prob = utl.select_action(
                    args=self.args,
                    policy=self.policy,
                    obs=prev_obs_normalised if self.args.norm_obs_for_policy else prev_obs_raw,
                    deterministic=False,
                    latent_sample=latent_sample,
                    latent_mean=latent_mean,
                    latent_logvar=latent_logvar,
                )

            # observe reward and next obs
            (next_obs_raw, next_obs_normalised), (rew_raw, rew_normalised), done, infos = utl.env_step(
                self.envs, action)
            tasks = torch.FloatTensor([info['task'] for info in infos]).to(device)
            done = torch.from_numpy(np.array(done, dtype=int)).to(device).float().view((-1, 1))

            # create mask for episode ends
            masks_done = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]).to(device)
            # bad_mask is true if episode ended because time limit was reached
            bad_masks = torch.FloatTensor(
                [[0.0] if 'bad_transition' in info.keys() else [1.0] for info in infos]).to(device)

            # compute next embedding (for next loop and/or value prediction bootstrap)
            latent_sample, latent_mean, latent_logvar, hidden_state = utl.update_encoding(
                encoder=self.vae.encoder,
                next_obs=next_obs_raw,
                action=action,
                reward=rew_raw,
                done=done,
                hidden_state=hidden_state)

            # before resetting, update the embedding and add to vae buffer
            # (last state might include useful task info)
            if not (self.args.disable_decoder and self.args.disable_stochasticity_in_latent):
                self.vae.rollout_storage.insert(prev_obs_raw.clone(),
                                                action.detach().clone(),
                                                next_obs_raw.clone(),
                                                rew_raw.clone(),
                                                done.clone(),
                                                tasks.clone())

            # add the obs before reset to the policy storage
            # (only used to recompute embeddings if the RL loss is backpropagated through the encoder)
            self.policy_storage.next_obs_raw[step] = next_obs_raw.clone()
            self.policy_storage.next_obs_normalised[step] = next_obs_normalised.clone()

            # reset environments that are done
            done_indices = np.argwhere(done.cpu().detach().flatten()).flatten()
            if len(done_indices) == self.args.num_processes:
                [next_obs_raw, next_obs_normalised] = self.envs.reset()
                if not self.args.sample_embeddings:
                    latent_sample = latent_sample  # no-op placeholder: the latent is not resampled here
            else:
                for i in done_indices:
                    [next_obs_raw[i], next_obs_normalised[i]] = self.envs.reset(index=i)
                    if not self.args.sample_embeddings:
                        latent_sample[i] = latent_sample[i]  # no-op placeholder, see above

            # add experience to policy buffer
            self.policy_storage.insert(
                obs_raw=next_obs_raw,
                obs_normalised=next_obs_normalised,
                actions=action,
                action_log_probs=action_log_prob,
                rewards_raw=rew_raw,
                rewards_normalised=rew_normalised,
                value_preds=value,
                masks=masks_done,
                bad_masks=bad_masks,
                done=done,
                hidden_states=hidden_state.squeeze(0).detach(),
                latent_sample=latent_sample.detach(),
                latent_mean=latent_mean.detach(),
                latent_logvar=latent_logvar.detach(),
            )

            prev_obs_normalised = next_obs_normalised
            prev_obs_raw = next_obs_raw

            self.frames += self.args.num_processes

        # --- UPDATE ---

        if self.args.precollect_len <= self.frames:

            # check if we are pre-training the VAE
            if self.args.pretrain_len > 0 and not vae_is_pretrained:
                for _ in range(self.args.pretrain_len):
                    self.vae.compute_vae_loss(update=True)
                vae_is_pretrained = True

            # otherwise do the normal update (policy + vae)
            else:
                train_stats = self.update(
                    obs=prev_obs_normalised if self.args.norm_obs_for_policy else prev_obs_raw,
                    latent_sample=latent_sample,
                    latent_mean=latent_mean,
                    latent_logvar=latent_logvar)

                # log
                run_stats = [action, action_log_prob, value]
                if train_stats is not None:
                    self.log(run_stats, train_stats, start_time)

        # clean up after update
        self.policy_storage.after_update()
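# --- illustrative sketch, not repository code ----------------------------------
# utl.update_encoding() is assumed to do two things: reset the recurrent hidden
# state for any process whose episode just ended, and feed the newest transition
# through the encoder to obtain the updated posterior over the task embedding.
# encoder.reset_hidden is an assumed helper name; the positional call mirrors
# how the encoder is invoked in visualise_behaviour() above:

import torch

def update_encoding_sketch(encoder, next_obs, action, reward, done, hidden_state):
    if done is not None:
        # zero the hidden state wherever done == 1 (i.e. at a task boundary)
        hidden_state = encoder.reset_hidden(hidden_state, done)
    with torch.no_grad():
        latent_sample, latent_mean, latent_logvar, hidden_state = encoder(
            action.float(),
            next_obs,
            reward.reshape((-1, 1)).float(),
            hidden_state,
            return_prior=False)
    return latent_sample, latent_mean, latent_logvar, hidden_state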
def load_and_render(self, load_iter):

    # alternative checkpoint locations used during development:
    # save_path = os.path.join('/ext/varibad_github/v2/varibad/logs/logs_HalfCheetahJoint-v0/varibad_73__15:05_17:14:07', 'models')
    # save_path = os.path.join('/ext/varibad_github/v2/varibad/logs/hfield', 'models')
    save_path = os.path.join(
        '/ext/varibad_github/v2/varibad/logs/logs_HalfCheetahBlocks-v0/varibad_73__15:05_20:20:25',
        'models')

    self.policy.actor_critic = torch.load(os.path.join(save_path, "policy{0}.pt".format(load_iter)))
    self.vae.encoder = torch.load(os.path.join(save_path, "encoder{0}.pt".format(load_iter)))

    args = self.args
    # note: shadows the module-level device within this function
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    num_processes = 1
    num_episodes = 100
    num_steps = 1999

    # initialise environments
    envs = make_vec_envs(
        env_name=args.env_name,
        seed=args.seed,
        num_processes=num_processes,
        gamma=args.policy_gamma,
        log_dir=args.agent_log_dir,
        device=device,
        allow_early_resets=False,
        episodes_per_task=self.args.max_rollouts_per_task,
        obs_rms=None,
        ret_rms=None,
    )

    # reset latent state to prior
    latent_sample, latent_mean, latent_logvar, hidden_state = self.vae.encoder.prior(num_processes)

    for episode_idx in range(num_episodes):

        (prev_obs_raw, prev_obs_normalised) = envs.reset()
        prev_obs_raw = prev_obs_raw.to(device)
        prev_obs_normalised = prev_obs_normalised.to(device)

        for step_idx in range(num_steps):

            with torch.no_grad():
                _, action, _ = utl.select_action(
                    args=self.args,
                    policy=self.policy,
                    obs=prev_obs_normalised if self.args.norm_obs_for_policy else prev_obs_raw,
                    latent_sample=latent_sample,
                    latent_mean=latent_mean,
                    latent_logvar=latent_logvar,
                    deterministic=True)

            # observe reward and next obs
            (next_obs_raw, next_obs_normalised), (rew_raw, rew_normalised), done, infos = utl.env_step(
                envs, action)

            # render (unwraps the nested vec-env / gym wrappers by hand)
            envs.venv.venv.envs[0].env.env.env.env.render()

            # update the hidden state
            latent_sample, latent_mean, latent_logvar, hidden_state = utl.update_encoding(
                encoder=self.vae.encoder,
                next_obs=next_obs_raw,
                action=action,
                reward=rew_raw,
                done=None,
                hidden_state=hidden_state)

            prev_obs_normalised = next_obs_normalised
            prev_obs_raw = next_obs_raw

            if done[0]:
                break
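# --- illustrative sketch, not repository code ----------------------------------
# The render call above hard-codes the exact number of wrapper layers
# (venv.venv.envs[0].env.env.env.env). A helper that walks the conventional
# `.venv` / `.envs` / `.env` attributes of gym/baselines-style wrappers is less
# brittle (these attribute names are the common convention, not guaranteed for
# every wrapper):

def unwrap_base_env(vec_env):
    env = vec_env
    while hasattr(env, 'venv'):   # unwrap vectorised wrappers
        env = env.venv
    if hasattr(env, 'envs'):      # take the first worker env
        env = env.envs[0]
    while hasattr(env, 'env'):    # unwrap per-env wrappers
        env = env.env
    return env

# usage: unwrap_base_env(envs).render()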
def train(self):
    """ Main training loop """
    start_time = time.time()

    # reset environments
    state, belief, task = utl.reset_env(self.envs, self.args)

    # insert initial observation / embeddings to rollout storage
    self.policy_storage.prev_state[0].copy_(state)

    # log once before training
    with torch.no_grad():
        self.log(None, None, start_time)

    for self.iter_idx in range(self.num_updates):

        # rollout policies for a few steps
        for step in range(self.args.policy_num_steps):

            # sample actions from policy
            with torch.no_grad():
                value, action, action_log_prob = utl.select_action(
                    args=self.args,
                    policy=self.policy,
                    state=state,
                    belief=belief,
                    task=task,
                    deterministic=False)

            # observe reward and next obs
            [state, belief, task], (rew_raw, rew_normalised), done, infos = utl.env_step(
                self.envs, action, self.args)

            # create mask for episode ends
            masks_done = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]).to(device)
            # bad_mask is true if episode ended because time limit was reached
            bad_masks = torch.FloatTensor(
                [[0.0] if 'bad_transition' in info.keys() else [1.0] for info in infos]).to(device)

            # reset environments that are done
            done_indices = np.argwhere(done.flatten()).flatten()
            if len(done_indices) > 0:
                state, belief, task = utl.reset_env(self.envs, self.args,
                                                    indices=done_indices, state=state)

            # add experience to policy buffer
            self.policy_storage.insert(
                state=state,
                belief=belief,
                task=task,
                actions=action,
                action_log_probs=action_log_prob,
                rewards_raw=rew_raw,
                rewards_normalised=rew_normalised,
                value_preds=value,
                masks=masks_done,
                bad_masks=bad_masks,
                done=torch.from_numpy(np.array(done, dtype=float)).unsqueeze(1),
            )

            self.frames += self.args.num_processes

        # --- UPDATE ---

        train_stats = self.update(state=state, belief=belief, task=task)

        # log
        run_stats = [action, action_log_prob, value]
        if train_stats is not None:
            with torch.no_grad():
                self.log(run_stats, train_stats, start_time)

        # clean up after update
        self.policy_storage.after_update()
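# --- illustrative sketch, not repository code ----------------------------------
# Why bad_masks: when an episode ends only because the time limit was reached,
# the return should still bootstrap from the value estimate rather than treat
# the cut-off as a true terminal state. A common recipe (after the widely used
# pytorch-a2c-ppo storage; indexing conventions simplified for the sketch):

import torch

def compute_returns_sketch(rewards, value_preds, masks, bad_masks, next_value, gamma):
    # rewards, masks, bad_masks, value_preds: [T, N, 1]; next_value: [N, 1]
    T = rewards.shape[0]
    returns = torch.zeros((T + 1,) + tuple(rewards.shape[1:]))
    returns[-1] = next_value
    for t in reversed(range(T)):
        # masks zero the bootstrap across true terminals; bad_masks replace the
        # target with the value estimate when the episode was merely truncated
        returns[t] = (returns[t + 1] * gamma * masks[t] + rewards[t]) * bad_masks[t] \
                     + (1 - bad_masks[t]) * value_preds[t]
    return returns[:-1]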
def get_test_rollout(args, env, policy, encoder=None):
    num_episodes = args.max_rollouts_per_task

    # --- initialise things we want to keep track of ---

    episode_prev_obs = [[] for _ in range(num_episodes)]
    episode_next_obs = [[] for _ in range(num_episodes)]
    episode_actions = [[] for _ in range(num_episodes)]
    episode_rewards = [[] for _ in range(num_episodes)]

    episode_returns = []
    episode_lengths = []

    if encoder is not None:
        episode_latent_samples = [[] for _ in range(num_episodes)]
        episode_latent_means = [[] for _ in range(num_episodes)]
        episode_latent_logvars = [[] for _ in range(num_episodes)]
    else:
        curr_latent_sample = curr_latent_mean = curr_latent_logvar = None
        episode_latent_samples = episode_latent_means = episode_latent_logvars = None

    # --- roll out policy ---

    # (re)set environment
    [obs_raw, obs_normalised] = env.reset()
    obs_raw = obs_raw.reshape((1, -1)).to(ptu.device)
    obs_normalised = obs_normalised.reshape((1, -1)).to(ptu.device)

    for episode_idx in range(num_episodes):

        curr_rollout_rew = []

        if encoder is not None:
            if episode_idx == 0:
                # reset to prior
                curr_latent_sample, curr_latent_mean, curr_latent_logvar, hidden_state = encoder.prior(1)
                curr_latent_sample = curr_latent_sample[0].to(ptu.device)
                curr_latent_mean = curr_latent_mean[0].to(ptu.device)
                curr_latent_logvar = curr_latent_logvar[0].to(ptu.device)
            episode_latent_samples[episode_idx].append(curr_latent_sample[0].clone())
            episode_latent_means[episode_idx].append(curr_latent_mean[0].clone())
            episode_latent_logvars[episode_idx].append(curr_latent_logvar[0].clone())

        for step_idx in range(1, env._max_episode_steps + 1):

            episode_prev_obs[episode_idx].append(obs_raw.clone())

            _, action, _ = utl.select_action(
                args=args,
                policy=policy,
                obs=obs_normalised if args.norm_obs_for_policy else obs_raw,
                deterministic=True,
                task_sample=curr_latent_sample,
                task_mean=curr_latent_mean,
                task_logvar=curr_latent_logvar)

            # observe reward and next obs
            (obs_raw, obs_normalised), (rew_raw, rew_normalised), done, infos = utl.env_step(env, action)
            obs_raw = obs_raw.reshape((1, -1)).to(ptu.device)
            obs_normalised = obs_normalised.reshape((1, -1)).to(ptu.device)

            if encoder is not None:
                # update task embedding
                curr_latent_sample, curr_latent_mean, curr_latent_logvar, hidden_state = encoder(
                    action.float().to(ptu.device),
                    obs_raw,
                    rew_raw.reshape((1, 1)).float().to(ptu.device),
                    hidden_state,
                    return_prior=False)

                episode_latent_samples[episode_idx].append(curr_latent_sample[0].clone())
                episode_latent_means[episode_idx].append(curr_latent_mean[0].clone())
                episode_latent_logvars[episode_idx].append(curr_latent_logvar[0].clone())

            episode_next_obs[episode_idx].append(obs_raw.clone())
            episode_rewards[episode_idx].append(rew_raw.clone())
            episode_actions[episode_idx].append(action.clone())

            # collect rewards for the episode return
            # (this append was missing, so episode_returns summed an empty list)
            curr_rollout_rew.append(rew_raw.clone())

            if infos[0]['done_mdp']:
                break

        episode_returns.append(sum(curr_rollout_rew))
        episode_lengths.append(step_idx)

    # clean up
    if encoder is not None:
        episode_latent_means = [torch.stack(e) for e in episode_latent_means]
        episode_latent_logvars = [torch.stack(e) for e in episode_latent_logvars]

    episode_prev_obs = [torch.cat(e) for e in episode_prev_obs]
    episode_next_obs = [torch.cat(e) for e in episode_next_obs]
    episode_actions = [torch.cat(e) for e in episode_actions]
    episode_rewards = [torch.cat(e) for e in episode_rewards]

    return episode_latent_means, episode_latent_logvars, \
           episode_prev_obs, episode_next_obs, episode_actions, episode_rewards, \
           episode_returns
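# --- illustrative sketch, not repository code ----------------------------------
# utl.select_action() is assumed to condition the policy on the latent by
# concatenating it to the observation; whether the sampled latent or the full
# posterior statistics are used would depend on a flag like sample_embeddings.
# Function and parameter names below are illustrative, not the repo's API:

import torch

def build_policy_input_sketch(obs, latent_sample, latent_mean, latent_logvar,
                              sample_embeddings):
    if sample_embeddings:
        # condition on a single sample from the posterior
        latent = latent_sample
    else:
        # condition on the full posterior statistics (mean and log-variance)
        latent = torch.cat((latent_mean, latent_logvar), dim=-1)
    return torch.cat((obs, latent), dim=-1)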
def evaluate(args,
             policy,
             ret_rms,
             iter_idx,
             tasks,
             encoder=None,
             num_episodes=None):
    env_name = args.env_name
    if hasattr(args, 'test_env_name'):
        env_name = args.test_env_name
    if num_episodes is None:
        num_episodes = args.max_rollouts_per_task
    num_processes = args.num_processes

    # --- set up the things we want to log ---

    # for each process, we log the returns during the first, second, ... episode
    # (such that we have a minimum of [num_episodes]; the last column is for
    # any overflow and will be discarded at the end, because we need to wait until
    # all processes have at least [num_episodes] many episodes)
    returns_per_episode = torch.zeros((num_processes, num_episodes + 1)).to(device)

    # --- initialise environments and latents ---

    envs = make_vec_envs(
        env_name,
        seed=args.seed * 42 + iter_idx,
        num_processes=num_processes,
        gamma=args.policy_gamma,
        device=device,
        rank_offset=num_processes + 1,  # to use different tmp folders than the main processes
        episodes_per_task=num_episodes,
        normalise_rew=args.norm_rew_for_policy,
        ret_rms=ret_rms,
        tasks=tasks,
        add_done_info=args.max_rollouts_per_task > 1,
    )
    num_steps = envs._max_episode_steps

    # reset environments
    state, belief, task = utl.reset_env(envs, args)

    # this counts how often an agent has done the same task already
    task_count = torch.zeros(num_processes).long().to(device)

    if encoder is not None:
        # reset latent state to prior
        latent_sample, latent_mean, latent_logvar, hidden_state = encoder.prior(num_processes)
    else:
        latent_sample = latent_mean = latent_logvar = hidden_state = None

    for episode_idx in range(num_episodes):

        for step_idx in range(num_steps):

            with torch.no_grad():
                _, action = utl.select_action(args=args,
                                              policy=policy,
                                              state=state,
                                              belief=belief,
                                              task=task,
                                              latent_sample=latent_sample,
                                              latent_mean=latent_mean,
                                              latent_logvar=latent_logvar,
                                              deterministic=True)

            # observe reward and next obs
            [state, belief, task], (rew_raw, rew_normalised), done, infos = utl.env_step(
                envs, action, args)
            done_mdp = [info['done_mdp'] for info in infos]

            if encoder is not None:
                # update the hidden state
                latent_sample, latent_mean, latent_logvar, hidden_state = utl.update_encoding(
                    encoder=encoder,
                    next_obs=state,
                    action=action,
                    reward=rew_raw,
                    done=None,
                    hidden_state=hidden_state)

            # add rewards
            returns_per_episode[range(num_processes), task_count] += rew_raw.view(-1)

            for i in np.argwhere(done_mdp).flatten():
                # count task up, but cap at num_episodes (zero-indexed, so no +1)
                task_count[i] = min(task_count[i] + 1, num_episodes)

            if np.sum(done) > 0:
                done_indices = np.argwhere(done.flatten()).flatten()
                state, belief, task = utl.reset_env(envs, args,
                                                    indices=done_indices, state=state)

    envs.close()

    return returns_per_episode[:, :num_episodes]
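# --- illustrative sketch, not repository code ----------------------------------
# Typical consumption of evaluate()'s output: averaging over processes yields
# the mean return in the 1st, 2nd, ... episode on the same task, i.e. the
# within-task adaptation curve usually logged for meta-RL. The helper name and
# the log-key format are illustrative:

def summarise_eval_sketch(returns_per_episode):
    # returns_per_episode: [num_processes, num_episodes]
    mean_per_episode = returns_per_episode.mean(dim=0)
    return {'return_avg_per_episode/episode_{}'.format(i): r.item()
            for i, r in enumerate(mean_per_episode)}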
def train(self):
    """ Given some stream of environments and a logger (tensorboard), trains the policy. """

    start_time = time.time()

    # reset environments
    (prev_obs_raw, prev_obs_normalised) = self.envs.reset()
    prev_obs_raw = prev_obs_raw.to(device)
    prev_obs_normalised = prev_obs_normalised.to(device)

    # insert initial observation / embeddings to rollout storage
    self.policy_storage.prev_obs_raw[0].copy_(prev_obs_raw)
    self.policy_storage.prev_obs_normalised[0].copy_(prev_obs_normalised)
    self.policy_storage.to(device)

    for self.iter_idx in range(self.args.num_updates):

        # check if we flushed the policy storage
        assert len(self.policy_storage.latent_mean) == 0

        # rollout policies for a few steps
        for step in range(self.args.policy_num_steps):

            # sample actions from policy
            with torch.no_grad():
                value, action, action_log_prob = utl.select_action(
                    policy=self.policy,
                    args=self.args,
                    obs=prev_obs_normalised if self.args.norm_obs_for_policy else prev_obs_raw,
                    deterministic=False)

            # observe reward and next obs
            (next_obs_raw, next_obs_normalised), (rew_raw, rew_normalised), done, infos = utl.env_step(
                self.envs, action)
            action = action.float()

            # create mask for episode ends
            masks_done = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]).to(device)
            # bad_mask is true if episode ended because time limit was reached
            bad_masks = torch.FloatTensor(
                [[0.0] if 'bad_transition' in info.keys() else [1.0] for info in infos]).to(device)

            # add the obs before reset to the policy storage
            self.policy_storage.next_obs_raw[step] = next_obs_raw.clone()
            self.policy_storage.next_obs_normalised[step] = next_obs_normalised.clone()

            # reset environments that are done
            # (this trainer has no encoder, so there is no latent to carry over;
            # the latent_sample bookkeeping of the meta-variant was removed here
            # since it referenced an undefined variable)
            done_indices = np.argwhere(done.flatten()).flatten()
            if len(done_indices) == self.args.num_processes:
                [next_obs_raw, next_obs_normalised] = self.envs.reset()
            else:
                for i in done_indices:
                    [next_obs_raw[i], next_obs_normalised[i]] = self.envs.reset(index=i)

            # add experience to policy buffer
            self.policy_storage.insert(
                obs_raw=next_obs_raw.clone(),
                obs_normalised=next_obs_normalised.clone(),
                actions=action.clone(),
                action_log_probs=action_log_prob.clone(),
                rewards_raw=rew_raw.clone(),
                rewards_normalised=rew_normalised.clone(),
                value_preds=value.clone(),
                masks=masks_done.clone(),
                bad_masks=bad_masks.clone(),
                done=torch.from_numpy(np.array(done, dtype=float)).unsqueeze(1).clone(),
            )

            prev_obs_normalised = next_obs_normalised
            prev_obs_raw = next_obs_raw

            self.frames += self.args.num_processes

        # --- UPDATE ---

        train_stats = self.update(
            prev_obs_normalised if self.args.norm_obs_for_policy else prev_obs_raw)

        # log
        run_stats = [action, action_log_prob, value]
        if train_stats is not None:
            self.log(run_stats, train_stats, start_time)

        # clean up after update
        self.policy_storage.after_update()
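# --- illustrative sketch, not repository code ----------------------------------
# The raw/normalised observation pair threaded through these loops is assumed
# to come from a running mean/std filter inside the vec-env wrapper, along the
# lines of the standard VecNormalize recipe (class name and clip value are
# placeholders; the update uses the usual parallel-variance formula):

import numpy as np

class RunningNormaliserSketch:
    def __init__(self, shape, eps=1e-8, clip=10.0):
        self.mean = np.zeros(shape)
        self.var = np.ones(shape)
        self.count = eps
        self.clip = clip

    def update(self, x):
        # x: [batch, *shape]; merge batch statistics into the running ones
        batch_mean, batch_var, n = x.mean(0), x.var(0), x.shape[0]
        delta = batch_mean - self.mean
        tot = self.count + n
        self.mean = self.mean + delta * n / tot
        m_a = self.var * self.count
        m_b = batch_var * n
        self.var = (m_a + m_b + delta ** 2 * self.count * n / tot) / tot
        self.count = tot

    def normalise(self, x):
        return np.clip((x - self.mean) / np.sqrt(self.var + 1e-8),
                       -self.clip, self.clip)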
def train(self):
    """ Main Meta-Training loop """
    start_time = time.time()

    # reset environments
    prev_state, belief, task = utl.reset_env(self.envs, self.args)

    # insert initial observation / embeddings to rollout storage
    self.policy_storage.prev_state[0].copy_(prev_state)

    # log once before training
    with torch.no_grad():
        self.log(None, None, start_time)

    for self.iter_idx in range(self.num_updates):

        # First, re-compute the hidden states given the current rollouts (since the VAE might've changed)
        with torch.no_grad():
            latent_sample, latent_mean, latent_logvar, hidden_state = self.encode_running_trajectory()

        # add this initial hidden state to the policy storage
        assert len(self.policy_storage.latent_mean) == 0  # make sure we emptied buffers
        self.policy_storage.hidden_states[0].copy_(hidden_state)
        self.policy_storage.latent_samples.append(latent_sample.clone())
        self.policy_storage.latent_mean.append(latent_mean.clone())
        self.policy_storage.latent_logvar.append(latent_logvar.clone())

        # rollout policies for a few steps
        for step in range(self.args.policy_num_steps):

            # sample actions from policy
            with torch.no_grad():
                value, action = utl.select_action(
                    args=self.args,
                    policy=self.policy,
                    state=prev_state,
                    belief=belief,
                    task=task,
                    deterministic=False,
                    latent_sample=latent_sample,
                    latent_mean=latent_mean,
                    latent_logvar=latent_logvar,
                )

            # take step in the environment
            [next_state, belief, task], (rew_raw, rew_normalised), done, infos = utl.env_step(
                self.envs, action, self.args)
            done = torch.from_numpy(np.array(done, dtype=int)).to(device).float().view((-1, 1))

            # create mask for episode ends
            masks_done = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]).to(device)
            # bad_mask is true if episode ended because time limit was reached
            bad_masks = torch.FloatTensor(
                [[0.0] if 'bad_transition' in info.keys() else [1.0] for info in infos]).to(device)

            with torch.no_grad():
                # compute next embedding (for next loop and/or value prediction bootstrap)
                latent_sample, latent_mean, latent_logvar, hidden_state = utl.update_encoding(
                    encoder=self.vae.encoder,
                    next_obs=next_state,
                    action=action,
                    reward=rew_raw,
                    done=done,
                    hidden_state=hidden_state)

            # before resetting, update the embedding and add to vae buffer
            # (last state might include useful task info)
            if not (self.args.disable_decoder and self.args.disable_kl_term):
                self.vae.rollout_storage.insert(
                    prev_state.clone(),
                    action.detach().clone(),
                    next_state.clone(),
                    rew_raw.clone(),
                    done.clone(),
                    task.clone() if task is not None else None)

            # add the obs before reset to the policy storage
            self.policy_storage.next_state[step] = next_state.clone()

            # reset environments that are done
            done_indices = np.argwhere(done.cpu().flatten()).flatten()
            if len(done_indices) > 0:
                next_state, belief, task = utl.reset_env(
                    self.envs, self.args, indices=done_indices, state=next_state)
                # TODO: deal with resampling for posterior sampling algorithm
                # latent_sample = latent_sample
                # latent_sample[i] = latent_sample[i]

            # add experience to policy buffer
            self.policy_storage.insert(
                state=next_state,
                belief=belief,
                task=task,
                actions=action,
                rewards_raw=rew_raw,
                rewards_normalised=rew_normalised,
                value_preds=value,
                masks=masks_done,
                bad_masks=bad_masks,
                done=done,
                hidden_states=hidden_state.squeeze(0),
                latent_sample=latent_sample,
                latent_mean=latent_mean,
                latent_logvar=latent_logvar,
            )

            prev_state = next_state

            self.frames += self.args.num_processes

        # --- UPDATE ---

        if self.args.precollect_len <= self.frames:

            # check if we are pre-training the VAE
            if self.args.pretrain_len > self.iter_idx:
                for p in range(self.args.num_vae_updates_per_pretrain):
                    self.vae.compute_vae_loss(
                        update=True,
                        pretrain_index=self.iter_idx * self.args.num_vae_updates_per_pretrain + p)

            # otherwise do the normal update (policy + vae)
            else:
                train_stats = self.update(state=prev_state,
                                          belief=belief,
                                          task=task,
                                          latent_sample=latent_sample,
                                          latent_mean=latent_mean,
                                          latent_logvar=latent_logvar)

                # log
                run_stats = [action, self.policy_storage.action_log_probs, value]
                with torch.no_grad():
                    self.log(run_stats, train_stats, start_time)

        # clean up after update
        self.policy_storage.after_update()

    self.envs.close()
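# --- illustrative note, not repository code -------------------------------------
# The pre-training schedule above runs num_vae_updates_per_pretrain VAE updates
# in each of the first pretrain_len iterations, so pretrain_index enumerates
# 0 .. pretrain_len * num_vae_updates_per_pretrain - 1 before any RL update
# happens. A small check with placeholder values:

pretrain_len, updates_per_pretrain = 5, 3  # illustrative values
pretrain_indices = [i * updates_per_pretrain + p
                    for i in range(pretrain_len)
                    for p in range(updates_per_pretrain)]
assert pretrain_indices == list(range(pretrain_len * updates_per_pretrain))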