# Update current state and action
action = next_action
log_prob = next_log_prob

# Visualizing AE Hash
ae_hash.eval()  # Set in evaluation mode
if stacked:
    code, latent = ae_hash.hash(next_state.unsqueeze(dim=0), base_ten=False)
    recon_state, _ = ae_hash(next_state.unsqueeze(dim=0))
else:
    code, latent = ae_hash.hash(next_state[-1:].unsqueeze(dim=0), base_ten=False)
    recon_state, _ = ae_hash(next_state[-1:].unsqueeze(dim=0))
sim_code = sim_hash.hash(code.squeeze())
visualize_aehash(next_state.cpu().numpy(),
                 recon_state.squeeze(dim=0).cpu().detach().numpy(),
                 code.squeeze(),
                 latent.squeeze().cpu().detach().numpy())

# Render this episode
if render and (render_each_episode or (not finished_rendering_this_epoch)):
    env.render()

if done:
    # Load and print episode stats after each episode ends
    episode_durations.append(t + 1)
    episode_rewards.append(running_reward)
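# --- Illustrative sketch, not part of the original script ---
# The hash codes produced above (`code` from the autoencoder, bucketed by `sim_hash`)
# are typically used for a count-based exploration bonus of the form
# r_int = beta / sqrt(N(hash)), as in SimHash-style count-based exploration.
# The names `hash_counts`, `beta`, and `count_bonus` below are assumptions for
# illustration only.
from collections import Counter
import math

hash_counts = Counter()  # visit counts per hash bucket
beta = 0.1               # assumed scale of the intrinsic reward

def count_bonus(state_hash):
    """Intrinsic reward that decays with the visit count of the state's hash bucket."""
    key = state_hash
    if hasattr(key, "tolist"):
        key = key.tolist()          # tensor / ndarray -> Python list or scalar
    if isinstance(key, list):
        key = tuple(key)            # make the code hashable
    hash_counts[key] += 1
    return beta / math.sqrt(hash_counts[key])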
# Estimate the value of the initial state
ex_val = value_net_ex(
    torch.tensor([current_state], dtype=torch.float32, device=device)).squeeze()  # squeeze the batch dimension
in_val = value_net_in(
    torch.tensor([np.concatenate((current_state, [i_episode]), axis=0)],
                 dtype=torch.float32, device=device)).squeeze()  # provide i_episode as an additional input

# Store the first state and value estimates in memory
memory.set_initial_state(current_state, initial_ex_val_est=ex_val, initial_in_val_est=in_val)

# Obtain current state hash code
current_state_hash = simhash.hash(current_state)

for t in count():
    # Sample an action given the current state
    action, log_prob = policy_net(
        torch.tensor([current_state], dtype=torch.float32, device=device))
    log_prob = log_prob.squeeze()

    # Interact with the environment
    next_state, reward, done, _ = env.step(action.item())
    running_reward += reward

    # Estimate the value of the next state
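# --- Illustrative sketch, not the repository's SimHash implementation ---
# `simhash.hash(current_state)` above suggests a SimHash-style discretiser: project the
# flattened state through a fixed Gaussian matrix and keep only the signs, so similar
# states tend to fall into the same hash bucket. The class name, code length, and seed
# below are assumptions.
import numpy as np

class MinimalSimHash:
    def __init__(self, state_dim, code_bits=32, seed=0):
        rng = np.random.default_rng(seed)
        self.A = rng.standard_normal((code_bits, state_dim))  # fixed random projection

    def hash(self, state):
        # Binary code: signs of the random projection, returned as a hashable tuple of 0/1 bits
        bits = (self.A @ np.asarray(state, dtype=np.float64).ravel()) > 0
        return tuple(bits.astype(int))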
# Obtain action, log probability, and value estimates for the initial state
# Move the outputs to cpu to save memory
action, log_prob, ex_val, in_val = actor_critic(current_state.unsqueeze(dim=0), i_episode=i_episode)
action = action.squeeze().cpu()
log_prob = log_prob.squeeze().cpu()
ex_val = ex_val.squeeze().cpu()
in_val = in_val.squeeze().cpu()

# Store the first state and value estimates in memory
memory.set_initial_state(current_state.clone().detach().cpu(),
                         initial_ex_val_est=ex_val,
                         initial_in_val_est=in_val)

# Obtain the current state hash code
if i_epoch > curiosity_delay:
    current_state_hash, _ = ae_hash.hash((current_state if stacked else current_state[-1:]).unsqueeze(dim=0),
                                         base_ten=False)
    current_state_hash = sim_hash.hash(current_state_hash.squeeze(), base_ten=True)  # Downsample the binary code

for t in count():
    # Interact with the environment
    next_frame, reward, done, _ = env.step(action.item())
    running_reward += reward

    # Drop the oldest frame, append the new frame, and stack the list to form the next state
    frame_list.pop(0)
    frame_list.append(transform(next_frame))
    next_state = torch.cat(frame_list, dim=0).to(device)  # Stack the images

    # Obtain action, log probability, and value estimates for the next state in a single forward pass
    # Move the outputs to cpu to save memory
    next_action, next_log_prob, ex_val, in_val = actor_critic(next_state.unsqueeze(dim=0), i_episode=i_episode)
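# --- Illustrative sketch of the assumed preprocessing, not shown in the fragment above ---
# `transform` and `frame_list` are expected to turn raw environment frames into a stack of
# processed tensors before they reach the actor-critic. A typical Atari-style pipeline
# would look like this; the 84x84 resolution and 4-frame stack are assumptions, and
# `env` and `device` are taken from the surrounding script.
import torch
import torchvision.transforms as T

transform = T.Compose([
    T.ToPILImage(),      # raw (H, W, C) uint8 frame -> PIL image
    T.Grayscale(),       # drop the colour channels
    T.Resize((84, 84)),  # downsample to a fixed size
    T.ToTensor(),        # -> float tensor of shape (1, 84, 84) in [0, 1]
])

# Initialise the stack with copies of the first observation
first_frame = env.reset()
frame_list = [transform(first_frame) for _ in range(4)]
current_state = torch.cat(frame_list, dim=0).to(device)  # shape (4, 84, 84)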