def calc_expected_utility(qo_pi, C):
    """
    Given expected observations under a policy `qo_pi` and a prior over observations `C`,
    compute the expected utility of the policy.

    Parameters
    ----------
    qo_pi [numpy 1D array, array-of-arrays (where each entry is a numpy 1D array),
    Categorical (either single-factor or AoA), or list]:
        Expected observations under the given policy (predictive posterior over outcomes).
        If a list, a list of the expected observations over the time horizon of policy
        evaluation, where each entry is the expected observations at a given timestep.
    C [numpy nd-array, array-of-arrays (where each entry is a numpy nd-array)]:
        Prior beliefs over outcomes, expressed in terms of relative log probabilities
    Returns
    -------
    expected_util [scalar]:
        Utility (reward) expected under the policy in question
    """
    if isinstance(qo_pi, list):
        n_steps = len(qo_pi)
        for t in range(n_steps):
            qo_pi[t] = utils.to_numpy(qo_pi[t], flatten=True)
    else:
        n_steps = 1
        qo_pi = [utils.to_numpy(qo_pi, flatten=True)]

    C = utils.to_numpy(C, flatten=True)

    # initialise expected utility
    expected_util = 0

    # in case of multiple observation modalities, loop over time points and modalities
    if utils.is_arr_of_arr(C):
        num_modalities = len(C)
        for t in range(n_steps):
            for modality in range(num_modalities):
                lnC = np.log(softmax(C[modality][:, np.newaxis]) + 1e-16)
                expected_util += qo_pi[t][modality].dot(lnC)

    # else, just loop over time (since there's only one modality)
    else:
        lnC = np.log(softmax(C[:, np.newaxis]) + 1e-16)
        for t in range(n_steps):
            expected_util += qo_pi[t].dot(lnC)

    return expected_util
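# Minimal usage sketch of `calc_expected_utility` (illustrative, not part of the source):
# it assumes numpy and the module's own `utils`/`softmax` helpers are importable, and uses
# made-up toy values for a single observation modality evaluated over a two-step policy.
example_qo_pi = [np.array([0.9, 0.1]), np.array([0.7, 0.3])]  # expected observations per timestep
example_C = np.array([1.0, -1.0])  # relative log-preference for the first outcome
example_util = calc_expected_utility(example_qo_pi, example_C)
# policies whose expected observations concentrate on preferred outcomes score higher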
def test_state_info_gain(self):
    """
    Test the states_info_gain function.
    Demonstrates working by manipulating uncertainty in the likelihood matrices (A or B)
    in ways that alternately change the resolvability of uncertainty (via an imprecise
    expected state and a precise mapping, or high ambiguity and an imprecise mapping).
    """
    n_states = [2]
    n_control = [2]

    qs = Categorical(values=np.eye(n_states[0])[0])

    # add some uncertainty into the consequences of the second policy, which
    # leads to increased epistemic value of observations in case of pursuing
    # that policy -- in the case of a precise observation likelihood model
    B_matrix = construct_generic_B(n_states, n_control)
    B_matrix[:, :, 1] = core.softmax(B_matrix[:, :, 1])
    B = Categorical(values=B_matrix)

    # single timestep
    n_step = 1
    policies = core.construct_policies(n_states, n_control, policy_len=n_step)

    # single observation modality
    num_obs = [2]

    # create noiseless identity A matrix
    A = Categorical(values=np.eye(num_obs[0]))

    state_info_gains = np.zeros(len(policies))
    for idx, policy in enumerate(policies):
        qs_pi = core.get_expected_states(qs, B, policy)
        state_info_gains[idx] += core.calc_states_info_gain(A, qs_pi)
    self.assertGreater(state_info_gains[1], state_info_gains[0])

    # we can 'undo' the epistemic bonus of the second policy by making the A matrix
    # totally ambiguous; observations then cannot resolve uncertainty about hidden states
    # - in this case, uncertainty in the posterior beliefs doesn't matter
    A = Categorical(values=np.ones((num_obs[0], num_obs[0])))
    A.normalize()

    state_info_gains = np.zeros(len(policies))
    for idx, policy in enumerate(policies):
        qs_pi = core.get_expected_states(qs, B, policy)
        state_info_gains[idx] += core.calc_states_info_gain(A, qs_pi)
    self.assertEqual(state_info_gains[0], state_info_gains[1])
def calc_expected_utility(Qo_pi, C):
    """
    Given expected observations under a policy `Qo_pi` and a prior over observations `C`,
    compute the expected utility of the policy.

    @TODO: Needs to be amended for use with multi-step policies (where possible_policies
    is a list of np.arrays (nStep x nFactor), not just a list of tuples as it is now)

    Parameters
    ----------
    Qo_pi [numpy 1D array, array-of-arrays (where each entry is a numpy 1D array),
    or Categorical (either single-factor or AoA)]:
        Posterior predictive density over outcomes
    C [numpy nd-array, array-of-arrays (where each entry is a numpy nd-array)]:
        Prior beliefs over outcomes, expressed in terms of relative log probabilities
    Returns
    -------
    expected_util [scalar]:
        Utility (reward) expected under the policy in question
    """
    if isinstance(Qo_pi, Categorical):
        Qo_pi = Qo_pi.values

    if Qo_pi.dtype == "object":
        for g in range(len(Qo_pi)):
            Qo_pi[g] = Qo_pi[g].flatten()

    if C.dtype == "object":
        expected_util = 0
        Ng = len(C)
        for g in range(Ng):
            lnC = np.log(softmax(C[g][:, np.newaxis]) + 1e-16)
            expected_util += Qo_pi[g].flatten().dot(lnC)
    else:
        lnC = np.log(softmax(C[:, np.newaxis]) + 1e-16)
        expected_util = Qo_pi.flatten().dot(lnC)

    return expected_util
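# Minimal sketch of the single-timestep variant above (illustrative only): assumes the
# module's `softmax` helper is in scope, and exercises the multi-modality branch by
# passing object arrays with two toy outcome modalities.
example_Qo_pi = np.empty(2, dtype=object)
example_Qo_pi[0] = np.array([0.8, 0.2])
example_Qo_pi[1] = np.array([0.5, 0.5])
example_C = np.empty(2, dtype=object)
example_C[0] = np.array([2.0, -2.0])  # strong preference in the first modality
example_C[1] = np.array([0.0, 0.0])   # indifference in the second modality
example_util = calc_expected_utility(example_Qo_pi, example_C)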
def update_posterior_policies(
    qs,
    A,
    B,
    C,
    policies,
    use_utility=True,
    use_states_info_gain=True,
    use_param_info_gain=False,
    pA=None,
    pB=None,
    gamma=16.0,
    return_numpy=True,
):
    """
    Updates the posterior beliefs about policies, based on the expected free energy prior

    @TODO: Needs to be amended for use with multi-step policies (where possible_policies
    is a list of np.arrays (n_step x n_factor), not just a list of tuples as it is now)

    Parameters
    ----------
    - `qs` [1D numpy array, array-of-arrays, or Categorical (either single- or multi-factor)]:
        Current marginal beliefs about hidden state factors
    - `A` [numpy ndarray, array-of-arrays (in case of multiple modalities), or Categorical
      (both single and multi-modality)]:
        Observation likelihood model (beliefs about the likelihood mapping entertained by the agent)
    - `B` [numpy ndarray, array-of-arrays (in case of multiple hidden state factors), or Categorical
      (both single and multi-factor)]:
        Transition likelihood model (beliefs about the likelihood mapping entertained by the agent)
    - `C` [numpy 1D-array, array-of-arrays (in case of multiple modalities), or Categorical
      (both single and multi-modality)]:
        Prior beliefs about outcomes (prior preferences)
    - `policies` [list of tuples]:
        A list of all the possible policies, each expressed as a tuple of indices, where a given
        index corresponds to an action on a particular hidden state factor.
        E.g. `policies[1][2]` yields the index of the action under policy 1 that affects hidden
        state factor 2
    - `use_utility` [bool]:
        Whether to calculate the utility term, i.e. how much expected observations conform to
        prior preferences
    - `use_states_info_gain` [bool]:
        Whether to calculate the state information gain
    - `use_param_info_gain` [bool]:
        Whether to calculate the parameter information gain. @NOTE: requires `pA` or `pB`
        to be specified
    - `pA` [numpy ndarray, array-of-arrays (in case of multiple modalities), or Dirichlet
      (both single and multi-modality)]:
        Prior Dirichlet parameters for A. Defaults to None, in which case info gain w.r.t.
        the Dirichlet parameters over A is skipped.
    - `pB` [numpy ndarray, array-of-arrays (in case of multiple hidden state factors), or Dirichlet
      (both single and multi-factor)]:
        Prior Dirichlet parameters for B. Defaults to None, in which case info gain w.r.t.
        the Dirichlet parameters over B is skipped.
    - `gamma` [float, defaults to 16.0]:
        Precision over policies, used as the inverse temperature parameter of a softmax
        transformation of the expected free energies of each policy
    - `return_numpy` [bool]:
        True/False flag to determine whether the output of the function is a numpy array
        or a Categorical

    Returns
    --------
    - `q_pi` [1D numpy array or Categorical]:
        Posterior beliefs about policies, defined here as a softmax function of the expected
        free energies of policies
    - `efe` [1D numpy array or Categorical]:
        The expected free energies of policies
    """
    n_policies = len(policies)

    efe = np.zeros(n_policies)
    q_pi = np.zeros((n_policies, 1))

    for idx, policy in enumerate(policies):
        qs_pi = get_expected_states(qs, B, policy)
        qo_pi = get_expected_obs(qs_pi, A)

        if use_utility:
            efe[idx] += calc_expected_utility(qo_pi, C)

        if use_states_info_gain:
            efe[idx] += calc_states_info_gain(A, qs_pi)

        if use_param_info_gain:
            if pA is not None:
                efe[idx] += calc_pA_info_gain(pA, qo_pi, qs_pi)
            if pB is not None:
                efe[idx] += calc_pB_info_gain(pB, qs_pi, qs, policy)

    q_pi = softmax(efe * gamma)

    if return_numpy:
        q_pi = q_pi / q_pi.sum(axis=0)
    else:
        q_pi = utils.to_categorical(q_pi)
        q_pi.normalize()

    return q_pi, efe
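# Minimal usage sketch of `update_posterior_policies` (illustrative, not from the source):
# assumes a single hidden state factor and a single observation modality, that the raw
# numpy inputs documented in the docstring are accepted, and that `construct_policies`
# is available with the signature used in the test above.
num_states = [2]
num_controls = [2]
A = np.eye(2)                       # identity (noiseless) observation likelihood
B = np.zeros((2, 2, 2))
B[:, :, 0] = np.eye(2)              # action 0: stay in the current state
B[:, :, 1] = np.eye(2)[:, ::-1]     # action 1: switch states
C = np.array([1.0, 0.0])            # mild preference for the first outcome
qs = np.array([1.0, 0.0])           # certain of currently being in state 0
policies = construct_policies(num_states, num_controls, policy_len=1)
q_pi, efe = update_posterior_policies(qs, A, B, C, policies)
# q_pi is a softmax over the policies' expected free energies; here the policy that
# keeps the agent in the preferred-outcome state should receive higher probability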
""" # reset environment first_state = env.reset(init_state=s[0]) # get an observation, given the state first_obs = gp_likelihood[:, first_state] # turn observation into an index first_obs = np.where(first_obs)[0][0] print("Initial Location {}".format(first_state)) print("Initial Observation {}".format(first_obs)) # infer initial state, given first observation qs = core.softmax(A[first_obs, :].log() + D.log(), return_numpy=False) # loop over time for t in range(T): qs_past = qs.copy() s_t = env.set_state(s[t + 1]) # evoke observation, given the state obs = gp_likelihood[:, s_t] # turn observation into an index obs = np.where(obs)[0][0] # get transition likelihood