Example #1
            # Obtain next state hash code
            next_state_hash = simhash.hash(next_state)

            # Update action counter
            act_counter[action.item()] += 1

            # If the next state hashes to a different code than the current state,
            #   infer the dominant action, update the causal link, and reset the
            #   action counter
            if next_state_hash != current_state_hash:
                main_action = np.argmax(act_counter)
                graph.update_transition(current_state_hash, main_action,
                                        next_state_hash)
                act_counter = np.zeros((output_size,), dtype=np.int32)

            # Use the action confidence at the current state's hash code as
            #   the intrinsic (curiosity) reward
            in_reward = curiosity_weight * graph.action_confidence(
                current_state_hash, action.item())

            # Record transition in memory
            memory.add_transition(action,
                                  log_prob,
                                  next_state,
                                  extrinsic_reward=reward,
                                  extrinsic_value_estimate=ex_val,
                                  intrinsic_reward=in_reward,
                                  intrinsic_value_estimate=in_val)

            # Update current state
            current_state = next_state
            current_state_hash = next_state_hash
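
Both examples rely on a `simhash` object to discretize continuous states into integer codes, but the hasher itself is not shown. The sketch below is a minimal stand-alone version in the spirit of SimHash-style count-based exploration (a fixed random Gaussian projection followed by taking signs); the class name and the `k` and `seed` parameters are assumptions for illustration, not the original implementation.

import numpy as np

class SimHash:
    """Minimal SimHash sketch (an assumption, not the original `simhash`).

    Projects a state through a fixed random Gaussian matrix and keeps only
    the signs, so nearby states tend to map to the same integer code.
    """

    def __init__(self, state_dim, k=16, seed=0):
        rng = np.random.default_rng(seed)
        # Fixed random projection matrix; k controls how coarse the hash is
        self.A = rng.standard_normal((k, state_dim))

    def hash(self, state):
        # Sign pattern of the projected state
        bits = (self.A @ np.asarray(state, dtype=np.float64).ravel()) >= 0
        # Pack the sign bits into a single integer hash code
        code = 0
        for b in bits:
            code = (code << 1) | int(b)
        return code

Under a scheme like this, the `next_state_hash != current_state_hash` check fires only when the agent crosses into a region with a different sign pattern, so `act_counter` accumulates votes for which action caused the change until a hash transition occurs.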
Example #2
            # Obtain next state hash code
            next_state_hash = simhash.hash(next_state)

            # Update action counter
            act_counter[action.item()] += 1

            # If the next state hashes to a different code than the current state,
            #   infer the dominant action, update the causal link, and reset the
            #   action counter
            if next_state_hash != current_state_hash:
                main_action = np.argmax(act_counter)
                graph.update_transition(current_state_hash, main_action,
                                        next_state_hash)
                act_counter = np.zeros((output_size,), dtype=np.int32)

            # Use the action confidence at the current state's hash code as
            #   the intrinsic reward, and take its square root before scaling
            in_reward = graph.action_confidence(current_state_hash,
                                                action.item())
            in_reward = curiosity_weight * np.sqrt(in_reward)

            # Record transition in memory
            memory.add_transition(action,
                                  log_prob,
                                  next_state,
                                  extrinsic_reward=reward,
                                  extrinsic_value_estimate=ex_val,
                                  intrinsic_reward=in_reward,
                                  intrinsic_value_estimate=in_val)

            # Update current state
            current_state = next_state
            current_state_hash = next_state_hash
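
Neither snippet shows the `graph` object behind `update_transition` and `action_confidence`. One plausible reading, consistent with the calls above, is a count table over observed (state hash, action, next hash) links, with confidence as the empirical frequency of the queried action at that state. The sketch below follows that guess; the method names match the examples, but the internals are assumptions.

from collections import defaultdict

class TransitionGraph:
    """Hypothetical causal-link store matching the calls in the examples."""

    def __init__(self):
        # (state_hash, action) -> {next_hash: count}
        self.links = defaultdict(lambda: defaultdict(int))
        # state_hash -> {action: count}
        self.action_counts = defaultdict(lambda: defaultdict(int))

    def update_transition(self, state_hash, action, next_hash):
        # Record that `action` was the dominant action when the hash changed
        self.links[(state_hash, int(action))][next_hash] += 1
        self.action_counts[state_hash][int(action)] += 1

    def action_confidence(self, state_hash, action):
        # Fraction of recorded hash transitions out of state_hash that used
        # this action; 0.0 for never-seen states
        counts = self.action_counts[state_hash]
        total = sum(counts.values())
        return counts[int(action)] / total if total else 0.0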
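
The `memory` object buffers one transition per step with separate extrinsic and intrinsic reward/value streams, the usual layout when the two returns are discounted and advantaged independently. A minimal buffer with that interface could look like the following; only the `add_transition` keyword signature comes from the snippets, while the class names and storage layout are assumptions.

from dataclasses import dataclass, field
from typing import Any, List

@dataclass
class Transition:
    action: Any
    log_prob: Any
    next_state: Any
    extrinsic_reward: float
    extrinsic_value_estimate: float
    intrinsic_reward: float
    intrinsic_value_estimate: float

@dataclass
class RolloutMemory:
    transitions: List[Transition] = field(default_factory=list)

    def add_transition(self, action, log_prob, next_state, *,
                       extrinsic_reward, extrinsic_value_estimate,
                       intrinsic_reward, intrinsic_value_estimate):
        # Keep the extrinsic and intrinsic streams separate so they can be
        # discounted and advantaged with different coefficients later
        self.transitions.append(Transition(
            action, log_prob, next_state,
            extrinsic_reward, extrinsic_value_estimate,
            intrinsic_reward, intrinsic_value_estimate))

    def clear(self):
        self.transitions.clear()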