# Obtain the hash code of the next state
next_state_hash = simhash.hash(next_state)

# Update the action counter
act_counter[action.item()] += 1

# If the next state hashed to a different code than the current state, infer the
# dominating action, update the causal link, and clear the action counter
if next_state_hash != current_state_hash:
    main_action = np.argmax(act_counter)
    graph.update_transition(current_state_hash, main_action, next_state_hash)
    act_counter = np.zeros((output_size,), dtype=np.int32)

# Use the action confidence at the current state's hash code as the intrinsic reward
in_reward = graph.action_confidence(current_state_hash, action.item())
in_reward = curiosity_weight * np.sqrt(in_reward)  # Take the square root of the confidence value

# Record the transition in memory
memory.add_transition(action, log_prob, next_state,
                      extrinsic_reward=reward, extrinsic_value_estimate=ex_val,
                      intrinsic_reward=in_reward, intrinsic_value_estimate=in_val)

# Update the current state and its hash code
current_state = next_state
current_state_hash = next_state_hash
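
# The `simhash` helper above is not shown in this excerpt. It is assumed to
# implement SimHash-style state discretization (as in count-based exploration
# a la Tang et al., 2017): project the state onto k random hyperplanes and
# keep only the signs, so nearby states collapse to the same hash code. This
# is a minimal illustrative sketch, not the actual implementation; the
# `granularity` parameter, the seed, and the tuple packing are assumptions.
import numpy as np

class SimHash:
    def __init__(self, state_dim, granularity=32, seed=0):
        rng = np.random.default_rng(seed)
        # Random projection matrix A with one row per hash bit
        self.projection = rng.standard_normal((granularity, state_dim))

    def hash(self, state):
        # sign(A @ s) yields a k-bit code; pack it into a hashable tuple.
        # Assumes `state` is a flat, NumPy-compatible vector.
        bits = self.projection @ np.asarray(state, dtype=np.float64).ravel() >= 0
        return tuple(bits.astype(np.int8))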
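
# Likewise, `graph` is assumed to be a causal transition graph that counts,
# for every (state_hash, action) pair, how often the pair has been exercised,
# and exposes a confidence score that the loop above scales by
# `curiosity_weight` after a square root. The exact confidence definition is
# not shown in this excerpt; the inverse-count score below is an illustrative
# assumption, chosen so that sqrt(confidence) recovers the familiar
# 1/sqrt(N) count-based exploration bonus.
from collections import defaultdict

class TransitionGraph:
    def __init__(self):
        # counts[(state_hash, action)][next_hash] -> number of observations
        self.counts = defaultdict(lambda: defaultdict(int))

    def update_transition(self, state_hash, action, next_hash):
        # Record one observation of the causal link (state, action) -> next state
        self.counts[(state_hash, int(action))][next_hash] += 1

    def action_confidence(self, state_hash, action):
        # Illustrative novelty score: 1 for a never-tried pair, decaying with
        # visit count so heavily exercised links yield little intrinsic reward
        n = sum(self.counts[(state_hash, int(action))].values())
        return 1.0 / (1.0 + n)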