Example #1
            # Update current state and action
            action = next_action
            log_prob = next_log_prob

            # Visualize the AE hash
            ae_hash.eval()  # Switch to evaluation mode
            if stacked:
                code, latent = ae_hash.hash(next_state.unsqueeze(dim=0),
                                            base_ten=False)
                recon_state, _ = ae_hash(next_state.unsqueeze(dim=0))
            else:
                code, latent = ae_hash.hash(next_state[-1:].unsqueeze(dim=0),
                                            base_ten=False)
                recon_state, _ = ae_hash(next_state[-1:].unsqueeze(dim=0))

            sim_code = sim_hash.hash(code.squeeze())  # Downsample the AE code with SimHash

            visualize_aehash(next_state.cpu().numpy(),
                             recon_state.squeeze(dim=0).cpu().detach().numpy(),
                             code.squeeze(),
                             latent.squeeze().cpu().detach().numpy())

            # Render this episode
            if render and (render_each_episode or
                           (not finished_rendering_this_epoch)):
                env.render()

            if done:
                # Load and print episode stats after each episode ends
                episode_durations.append(t + 1)
                episode_rewards.append(running_reward)
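
For reference, here is a minimal sketch of what an `ae_hash` module of this kind could look like: a convolutional autoencoder whose sigmoid bottleneck is rounded to produce the binary code that `hash()` returns alongside the latent vector. The class name, layer sizes, code length, and the assumed 84x84 input resolution are illustrative assumptions, not the implementation behind these examples.

import torch
import torch.nn as nn

class AEHashSketch(nn.Module):
    """Autoencoder whose rounded sigmoid bottleneck doubles as a binary hash code."""

    def __init__(self, in_channels=4, code_size=16):
        super().__init__()
        # Sizes below assume 84x84 input frames
        self.encoder = nn.Sequential(
            nn.Conv2d(in_channels, 32, kernel_size=8, stride=4), nn.ReLU(),
            nn.Conv2d(32, 64, kernel_size=4, stride=2), nn.ReLU(),
            nn.Flatten(),
            nn.LazyLinear(code_size), nn.Sigmoid())          # bottleneck activations in (0, 1)
        self.decoder = nn.Sequential(
            nn.Linear(code_size, 64 * 9 * 9), nn.ReLU(),
            nn.Unflatten(1, (64, 9, 9)),
            nn.ConvTranspose2d(64, 32, kernel_size=4, stride=2), nn.ReLU(),
            nn.ConvTranspose2d(32, in_channels, kernel_size=8, stride=4))

    def forward(self, x):
        latent = self.encoder(x)
        return self.decoder(latent), latent                  # (reconstruction, latent)

    def hash(self, x, base_ten=True):
        latent = self.encoder(x)
        code = torch.round(latent)                           # binarize the bottleneck
        if base_ten:
            powers = 2 ** torch.arange(code.shape[1], device=code.device)
            return (code * powers).sum(dim=1), latent        # one integer code per state
        return code, latent                                  # raw bit vector per state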
Example #2
        ex_val = value_net_ex(  # extrinsic value estimate; net name assumed, mirroring value_net_in below
            torch.tensor([current_state], dtype=torch.float32,
                         device=device)).squeeze()  # squeeze the batch dimension
        in_val = value_net_in(
            torch.tensor(
                [np.concatenate((current_state, [i_episode]), axis=0)],
                dtype=torch.float32,
                device=device)).squeeze()  # i_episode is passed as additional input

        # Store the first state and value estimate in memory
        memory.set_initial_state(current_state,
                                 initial_ex_val_est=ex_val,
                                 initial_in_val_est=in_val)

        # Obtain current state hash code
        current_state_hash = simhash.hash(current_state)

        for t in count():

            # Sample an action given the current state
            action, log_prob = policy_net(
                torch.tensor([current_state],
                             dtype=torch.float32,
                             device=device))
            log_prob = log_prob.squeeze()

            # Interact with the environment
            next_state, reward, done, _ = env.step(action.item())
            running_reward += reward

            # Estimate the value of the next state
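
In these examples, `simhash.hash(current_state)` turns a continuous state into a discrete code whose visit counts can drive an exploration bonus. A minimal SimHash sketch in that spirit: project the state onto fixed random directions, keep the sign pattern as the code, and count how often each code is seen. The class name, code length k, and bonus coefficient below are assumptions, not taken from the snippet above.

import numpy as np

class SimHashSketch:
    """SimHash for count-based exploration: random projections + sign pattern."""

    def __init__(self, state_dim, k=16, seed=0):
        rng = np.random.default_rng(seed)
        self.A = rng.standard_normal((k, state_dim))   # fixed projection directions
        self.counts = {}                               # visit counts per hash code

    def hash(self, state):
        bits = (self.A @ np.asarray(state, dtype=np.float64) > 0).astype(int)
        return int("".join(map(str, bits)), 2)         # sign pattern as a base-ten code

    def count_bonus(self, code, beta=0.01):
        self.counts[code] = self.counts.get(code, 0) + 1
        return beta / np.sqrt(self.counts[code])       # intrinsic reward shrinks with visits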
Example #3
        # Obtain action, log probability, and value estimate for the initial state
        # Move the outputs to cpu to save memory
        action, log_prob, ex_val, in_val = actor_critic(current_state.unsqueeze(dim=0), i_episode=i_episode)
        action = action.squeeze().cpu()
        log_prob = log_prob.squeeze().cpu()
        ex_val = ex_val.squeeze().cpu()
        in_val = in_val.squeeze().cpu()

        # Store the first state and value estimate in memory
        memory.set_initial_state(current_state.clone().detach().cpu(), initial_ex_val_est=ex_val, initial_in_val_est=in_val)

        # Obtain current state hash code
        if i_epoch > curiosity_delay:
            current_state_hash, _ = ae_hash.hash(
                (current_state if stacked else current_state[-1:]).unsqueeze(dim=0),
                base_ten=False)
            current_state_hash = sim_hash.hash(current_state_hash.squeeze(), base_ten=True)  # Downsample the AE code with SimHash

        for t in count():

            # Interact with the environment
            next_frame, reward, done, _ = env.step(action.item())
            running_reward += reward

            # Drop the oldest frame, append the newly observed frame, and stack to form the next state
            frame_list.pop(0)
            frame_list.append(transform(next_frame))
            next_state = torch.cat(frame_list, dim=0).to(device)     # Stack the images

            # Obtain the action, log probability, and value estimates for the next state in a single forward pass
            # Move the outputs to cpu to save memory
            next_action, next_log_prob, ex_val, in_val = actor_critic(next_state.unsqueeze(dim=0), i_episode=i_episode)
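
The loop above assumes `frame_list`, `transform`, and the initial `current_state` were prepared before the episode starts. A minimal sketch of that setup, assuming grayscale 84x84 preprocessing and a stack of four frames (both are assumptions, not taken from the example):

import torch
import torchvision.transforms as T

# Hypothetical preprocessing: grayscale, resize to 84x84, convert to a (1, 84, 84) tensor
transform = T.Compose([T.ToPILImage(), T.Grayscale(), T.Resize((84, 84)), T.ToTensor()])

num_stacked = 4                                            # assumed stack depth
first_frame = env.reset()                                  # first raw observation (old gym API)
frame_list = [transform(first_frame)] * num_stacked        # seed the stack with copies of it
current_state = torch.cat(frame_list, dim=0).to(device)    # shape (num_stacked, 84, 84)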