if next_state_hash != current_state_hash:
    main_action = np.argmax(act_counter)
    graph.update_transition(current_state_hash, main_action, next_state_hash)
    act_counter = np.zeros((output_size,), dtype=np.int32)

# Take the action confidence with the current state hash code as the intrinsic reward
in_reward = curiosity_weight * graph.action_confidence(current_state_hash, action.item())
# in_reward = curiosity_weight * np.sqrt(in_reward)  # Take the square root of the confidence value

# Record transition in memory
memory.add_transition(action, log_prob, next_state,
                      extrinsic_reward=reward, extrinsic_value_estimate=ex_val,
                      intrinsic_reward=in_reward, intrinsic_value_estimate=in_val)
# memory.add_transition(action, log_prob, next_state,
#                       extrinsic_reward=running_reward if done else 0., extrinsic_value_estimate=ex_val,
#                       intrinsic_reward=in_reward, intrinsic_value_estimate=in_val)

# Update current state
current_state = next_state
current_state_hash = next_state_hash

# Render this episode
if render and (render_each_episode or (not finished_rendering_this_epoch)):
    env.render()
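# The excerpts in this section call into a `graph` object that is never defined here.
# The following is a minimal sketch, under assumptions, of the interface they rely on:
# a causal transition graph keyed by (state hash, action) that counts successor hashes
# and exposes a confidence score used as the curiosity signal. The Gini-impurity
# "variance" below is an illustrative stand-in, not necessarily the actual metric.
from collections import defaultdict


class TransitionGraphSketch:
    def __init__(self):
        # (state_hash, action) -> {successor hash or "terminal": observation count}
        self.successors = defaultdict(lambda: defaultdict(int))

    def update_transition(self, state_hash, action, next_state_hash):
        self.successors[(state_hash, action)][next_state_hash] += 1

    def update_termination(self, state_hash, action):
        self.successors[(state_hash, action)]["terminal"] += 1

    def action_confidence(self, state_hash, action):
        counts = self.successors[(state_hash, action)]
        total = sum(counts.values())
        if total == 0:
            return 1.0  # Unvisited causal links get the maximum exploration bonus
        # Gini impurity of the empirical successor distribution: 0 for a deterministic
        # link, approaching 1 as the link's outcome becomes maximally uncertain
        return 1.0 - sum((c / total) ** 2 for c in counts.values())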
act_counter[action.item()] += 1

# If the next state hashed to a different code than the current state, infer the
# dominating action, update the causal link, and clear the action counter
if next_state_hash != current_state_hash:
    main_action = np.argmax(act_counter)
    graph.update_transition(current_state_hash, main_action, next_state_hash)
    act_counter = np.zeros((output_size,), dtype=np.int32)

# Take the action confidence with the current state hash code as the intrinsic reward
in_reward = graph.action_confidence(current_state_hash, action.item())
in_reward = curiosity_weight * np.sqrt(in_reward)  # Take the square root of the confidence value

# Record transition in memory
memory.add_transition(action, log_prob, next_state,
                      extrinsic_reward=running_reward if done else 0., extrinsic_value_estimate=ex_val,
                      intrinsic_reward=in_reward, intrinsic_value_estimate=in_val)

# Update current state
current_state = next_state
current_state_hash = next_state_hash

# Render this episode
if render and (render_each_episode or (not finished_rendering_this_epoch)):
    env.render()

if done:
    # Load and print episode stats after each episode ends
    episode_durations.append(t + 1)
    episode_rewards.append(running_reward)
running_reward += reward

# Estimate the value of the next state
value = value_net(torch.tensor([next_state], device=device)).squeeze()  # squeeze the dimension

# Render this episode
if render and (render_each_episode or (not finished_rendering_this_epoch)):
    env.render()

# Record transition in memory
memory.add_transition(action, log_prob, next_state,
                      extrinsic_reward=reward, extrinsic_value_estimate=value)

# Update current state
current_state = next_state

if done:
    # Load and print episode stats after each episode ends
    episode_durations.append(t + 1)
    episode_rewards.append(running_reward)
    if running_reward > training_info["max reward achieved"]:
        training_info["max reward achieved"] = running_reward

    # Decide whether to render next episode
    if not render_each_episode:
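# `memory` is also assumed rather than defined in these excerpts. A minimal sketch of a
# rollout buffer matching the `add_transition` keyword signature used above; the attribute
# names are hypothetical, and the real class presumably also implements sampling/clearing.
class RolloutMemorySketch:
    def __init__(self):
        self.actions, self.log_probs, self.states = [], [], []
        self.extrinsic_rewards, self.extrinsic_values = [], []
        self.intrinsic_rewards, self.intrinsic_values = [], []

    def add_transition(self, action, log_prob, next_state,
                       extrinsic_reward=0.0, extrinsic_value_estimate=None,
                       intrinsic_reward=0.0, intrinsic_value_estimate=None):
        self.actions.append(action)
        self.log_probs.append(log_prob)
        self.states.append(next_state)
        self.extrinsic_rewards.append(extrinsic_reward)
        self.extrinsic_values.append(extrinsic_value_estimate)
        self.intrinsic_rewards.append(intrinsic_reward)
        self.intrinsic_values.append(intrinsic_value_estimate)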
frame_list.append(transform(next_frame))
next_state = torch.cat(frame_list, dim=0).to(device)  # Stack the images

# Obtain action, log probability, and value estimate for the next state in a single forward pass
# Move the outputs to the CPU to save GPU memory
next_action, next_log_prob, value = actor_critic(next_state.unsqueeze(dim=0))
next_action = next_action.squeeze().cpu()
next_log_prob = next_log_prob.squeeze().cpu()
value = value.squeeze().cpu()

# Record transition in memory
memory.add_transition(action, log_prob.cpu(), next_state.clone().detach().cpu(),
                      extrinsic_reward=reward, extrinsic_value_estimate=value)

# Update current state and action
action = next_action
log_prob = next_log_prob

# Visualizing the AE hash
ae_hash.eval()  # Set in evaluation mode
if stacked:
    code, latent = ae_hash.hash(next_state.unsqueeze(dim=0), base_ten=False)
    recon_state, _ = ae_hash(next_state.unsqueeze(dim=0))
else:
    code, latent = ae_hash.hash(next_state[-1:].unsqueeze(dim=0),
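# `ae_hash` is an autoencoder-based state hasher assumed by the excerpt above: calling the
# module returns (reconstruction, latent), and `hash(...)` binarizes the bottleneck into a
# discrete code. This sketch rounds a sigmoid bottleneck to bits and works on flattened
# inputs; the actual architecture and binarization scheme may differ.
import torch
import torch.nn as nn


class AEHashSketch(nn.Module):
    def __init__(self, in_dim, code_bits=16):
        super().__init__()
        self.encoder = nn.Sequential(nn.Flatten(), nn.Linear(in_dim, code_bits), nn.Sigmoid())
        self.decoder = nn.Linear(code_bits, in_dim)  # Reconstructs the flattened input

    def forward(self, x):
        latent = self.encoder(x)
        return self.decoder(latent), latent

    def hash(self, x, base_ten=True):
        latent = self.encoder(x)
        bits = torch.round(latent)  # Binarize the bottleneck activations to {0, 1}
        if base_ten:
            # Interpret the bit vector as a single base-ten integer code
            weights = 2 ** torch.arange(bits.shape[1], device=bits.device)
            return (bits * weights).sum(dim=1).long(), latent
        return bits, latent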
# Or, if termination was reached, update the transition to the termination state
if next_state_hash != current_state_hash or done:
    main_action = np.argmax(act_counter)
    act_counter = np.zeros((actor_layer_sizes[-1],), dtype=np.int32)
    if next_state_hash != current_state_hash:
        graph.update_transition(current_state_hash, main_action, next_state_hash)
    if done:
        graph.update_termination(current_state_hash, main_action)

    in_reward = graph.action_confidence(current_state_hash, action.item())
    in_reward = curiosity_weight * in_reward  # Scale the confidence value by the curiosity weight

    # Store transition in memory
    memory.add_transition(action, log_prob.cpu(), next_state.clone().detach().cpu(),
                          extrinsic_reward=reward, extrinsic_value_estimate=ex_val,
                          intrinsic_reward=in_reward, intrinsic_value_estimate=in_val)

    current_state_hash = next_state_hash
else:
    memory.add_transition(action, log_prob.cpu(), next_state.clone().detach().cpu(),
                          extrinsic_reward=reward, extrinsic_value_estimate=ex_val,
                          intrinsic_reward=0.0, intrinsic_value_estimate=in_val)

current_state = next_state
action = next_action
log_prob = next_log_prob
action, log_prob = policy_net(torch.tensor([current_state], device=device))
log_prob = log_prob.squeeze()

# Interact with the environment
next_state, reward, done, _ = env.step(action.item())
running_reward += reward

# Render this episode
if render and (render_each_episode or (not finished_rendering_this_epoch)):
    env.render()

# Record transition in memory
memory.add_transition(action, log_prob, next_state, extrinsic_reward=reward)

# Update current state
current_state = next_state

if done:
    # Load and print episode stats after each episode ends
    episode_durations.append(t + 1)
    episode_rewards.append(running_reward)
    if running_reward > training_info["max reward achieved"]:
        training_info["max reward achieved"] = running_reward

    # Decide whether to render next episode
    if not render_each_episode:
        finished_rendering_this_epoch = True
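# The excerpt above assumes `policy_net` returns a sampled action together with its log
# probability in a single call. A minimal sketch of such a policy head for a discrete
# action space (the hidden layer size is a hypothetical choice):
import torch
import torch.nn as nn
from torch.distributions import Categorical


class PolicyNetSketch(nn.Module):
    def __init__(self, state_dim, n_actions, hidden=128):
        super().__init__()
        self.body = nn.Sequential(nn.Linear(state_dim, hidden), nn.ReLU(),
                                  nn.Linear(hidden, n_actions))

    def forward(self, state):
        logits = self.body(state)
        dist = Categorical(logits=logits)  # Categorical distribution over discrete actions
        action = dist.sample()             # Sample an action for exploration
        return action, dist.log_prob(action)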
# If the next state hashed to a different code than the current state, infer the
# dominating action, update the causal link, and clear the action counter
if next_state_hash != current_state_hash:
    main_action = np.argmax(act_counter)
    graph.update_transition(current_state_hash, main_action, next_state_hash)
    act_counter = np.zeros((output_size,), dtype=np.int32)

# Take the action confidence with the current state hash code as the intrinsic reward
in_reward = curiosity_weight * graph.action_confidence(current_state_hash, action.item())
# in_reward = curiosity_weight * np.sqrt(in_reward)  # Take the square root of the confidence value

# Record transition in memory.
# If not done (end of episode), record only the exploration bonus; if done, add the
# exploration bonus at that step to the end-of-episode total running extrinsic reward.
memory.add_transition(action, log_prob, next_state,
                      extrinsic_reward=in_reward if not done else in_reward + running_reward,
                      extrinsic_value_estimate=ex_val)

# Update current state
current_state = next_state
current_state_hash = next_state_hash

# Render this episode
if render and (render_each_episode or (not finished_rendering_this_epoch)):
    env.render()

if done:
    # Load and print episode stats after each episode ends
    episode_durations.append(t + 1)
    episode_rewards.append(running_reward)
    if running_reward > training_info["max reward achieved"]:
if next_state_hash != current_state_hash:
    main_action = np.argmax(act_counter)
    graph.update_transition(current_state_hash, main_action, next_state_hash)

# Obtain the action's causal confidence (variance) and compute the weighted curiosity bonus
act_confidence = graph.action_confidence(current_state_hash, action.item())
curiosity = curiosity_weight * act_confidence

# Record transition in memory with curiosity;
# only the summed-up end-of-episode reward is fed to the agent
if done:
    memory.add_transition(action, log_prob, next_state,
                          extrinsic_reward=running_reward, intrinsic_reward=curiosity)
else:
    memory.add_transition(action, log_prob, next_state,
                          extrinsic_reward=0., intrinsic_reward=curiosity)

# Update current state
current_state = next_state
current_state_hash = next_state_hash

# Render this episode
if render and (render_each_episode or