# These method excerpts assume numpy as np, scipy.special as scs, and the
# module's own ln and softmax helpers.

def select_desired_action(self, tau, t, posterior_policies, actions, *args):
    npi = posterior_policies.shape[0]
    likelihood = args[0]
    prior = args[1]
    # KL divergence between likelihood and prior, and entropies of the
    # posterior and the prior over policies
    DKL = (likelihood * ln(likelihood / prior)).sum()
    H = -(posterior_policies * ln(posterior_policies)).sum()
    H_p = -(prior * ln(prior)).sum()
    # reaction time: log-normal sample, offset by the prior entropy
    self.RT[tau, t] = np.exp(H_p + np.random.normal(H, DKL))

    # estimate action probability
    self.estimate_action_probability(tau, t, posterior_policies, actions)

    u = np.random.choice(self.na, p=self.control_probability[tau, t])

    return u
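
# A minimal standalone sketch (not part of the class above) of the reaction-time
# sampling used in select_desired_action: RT is log-normally distributed, with the
# prior entropy H_p as offset and a normal draw whose mean is the posterior entropy H
# and whose standard deviation is the KL divergence between likelihood and prior.
# The function name and the example arrays below are illustrative assumptions.
import numpy as np

def sample_reaction_time(posterior_policies, likelihood, prior, rng=np.random.default_rng()):
    DKL = (likelihood * np.log(likelihood / prior)).sum()          # divergence of likelihood from prior
    H = -(posterior_policies * np.log(posterior_policies)).sum()   # posterior entropy
    H_p = -(prior * np.log(prior)).sum()                           # prior entropy
    return np.exp(H_p + rng.normal(H, DKL))

# example with three policies
post = np.array([0.7, 0.2, 0.1])
like = np.array([0.8, 0.15, 0.05])
pri = np.array([1/3, 1/3, 1/3])
rt = sample_reaction_time(post, like, pri)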
def __get_log_likelihood(self, params):
    self.agent.set_free_parameters(params)
    self.agent.reset_beliefs(self.actions)
    self.__update_model()

    # index grids over (trial, time step); p3 holds the observed actions, so
    # fancy indexing reads out the probability of the chosen action at every
    # (trial, time step)
    p1 = np.tile(np.arange(self.trials), (self.T, 1)).T
    p2 = np.tile(np.arange(self.T), (self.trials, 1))
    p3 = self.actions.astype(int)

    return ln(self.agent.asl.control_probability[p1, p2, p3]).sum()
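
# A minimal standalone sketch (not part of the class above) of the index-grid
# trick used in __get_log_likelihood: p1 and p2 enumerate (trial, time step)
# pairs, p3 holds the observed actions, and fancy indexing picks the model's
# probability of the chosen action at every (trial, time step). Shapes and
# values are illustrative assumptions.
import numpy as np

trials, T, na = 4, 2, 3
control_probability = np.full((trials, T, na), 1.0 / na)   # assumed model output
actions = np.zeros((trials, T), dtype=int)                  # assumed observed actions

p1 = np.tile(np.arange(trials), (T, 1)).T   # trial index, shape (trials, T)
p2 = np.tile(np.arange(T), (trials, 1))     # time-step index, shape (trials, T)
p3 = actions                                # chosen action, shape (trials, T)

log_likelihood = np.log(control_probability[p1, p2, p3]).sum()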
def update_beliefs_states(self, tau, t, observation, policies, prior, prior_pi):
    if t == 0:
        self.logzs = np.tile(ln(self.zs), (self.T, 1)).T
    # log likelihood of the current observation for each hidden state
    self.logzs[:, t] = ln(self.generative_model_observations[int(observation), :])

    # estimate expected state distribution
    lforw = np.zeros((self.nh, self.T))
    lforw[:, 0] = ln(self.prior_states)
    lback = np.zeros((self.nh, self.T))
    posterior = np.zeros((self.nh, self.T, policies.shape[0]))
    neg_fe = np.zeros(policies.shape[0])
    eps = 0.01

    for pi, ppi in enumerate(prior_pi):
        if ppi > 1e-6:
            logtm = ln(self.generative_model_states[:, :, policies[pi]])
            # SARAH: check the following before publishing!
            # note: post is a view into prior, so prior[:, :, pi] is updated in place
            post = prior[:, :, pi]
            not_close = True
            while not_close:
                # forward and backward log messages computed from the current posterior
                lforw[:, 1:] = np.einsum('ijk,jk->ik', logtm, post[:, :-1])
                lback[:, :-1] = np.einsum('ijk,ik->jk', logtm, post[:, 1:])
                logpost = lforw + self.logzs
                # damped fixed-point update in log space, renormalized over states
                lp = ln(post)
                lp = (1 - eps) * lp + eps * (logpost + lback)
                new_post = softmax(lp)
                not_close = not np.allclose(post, new_post, atol=1e-3)
                post[:] = new_post
            posterior[:, :, pi] = post
            # negative free energy: expected log joint plus posterior entropy
            neg_fe[pi] = (logpost * post).sum() - np.sum(post * ln(post))
        else:
            # negligible prior policy probability: keep the prior and penalize heavily
            posterior[:, :, pi] = prior[:, :, pi]
            neg_fe[pi] = -1e10

    self.fe_pi = neg_fe

    return posterior, neg_fe
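
# A minimal standalone sketch (not part of the class above) of the damped
# fixed-point iteration used in update_beliefs_states for a single policy:
# forward and backward log messages are recomputed from the current posterior,
# mixed into the log posterior with step size eps, renormalized with a softmax
# over states, and iterated until the posterior stops changing. The transition
# matrix, observation likelihoods, and sizes are illustrative assumptions.
import numpy as np

def softmax_states(x):
    x = x - x.max(axis=0)
    e = np.exp(x)
    return e / e.sum(axis=0)

nh, T, eps = 2, 3, 0.01
tm = np.array([[0.9, 0.2],
               [0.1, 0.8]])                                  # p(s_t | s_{t-1}), columns sum to 1
logtm = np.log(np.repeat(tm[:, :, None], T - 1, axis=2))     # same transition at every step
logzs = np.log(np.array([[0.8, 0.3, 0.3],
                         [0.2, 0.7, 0.7]]))                  # p(o_t | s_t) for the observed o_t
lforw = np.zeros((nh, T))
lforw[:, 0] = np.log(np.array([0.5, 0.5]))                   # prior over initial states
lback = np.zeros((nh, T))

post = np.full((nh, T), 1.0 / nh)                            # start from a flat posterior
not_close = True
while not_close:
    lforw[:, 1:] = np.einsum('ijk,jk->ik', logtm, post[:, :-1])
    lback[:, :-1] = np.einsum('ijk,ik->jk', logtm, post[:, 1:])
    logpost = lforw + logzs
    lp = (1 - eps) * np.log(post) + eps * (logpost + lback)
    new_post = softmax_states(lp)
    not_close = not np.allclose(post, new_post, atol=1e-3)
    post = new_post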
def update_beliefs(self, tau, observation, response):
    if tau == 0:
        # first trial: flat beliefs at t=0, prior beliefs at t=1
        self.posterior_states[0] = 1. / self.ns
        self.posterior_durations[0] = 1. / self.nd
        self.posterior_states[1] = self.prior_states
        self.posterior_durations[1] = self.prior_durations
        self.posterior_observations[0] = np.exp(
            ln(self.prior_observations)
            + (self.posterior_states[1][np.newaxis, :, np.newaxis]
               * ln(self.generative_model_observations)).sum(axis=1))
        self.posterior_observations[0] /= self.posterior_observations[0].sum(axis=0)
        self.posterior_policies[0] = softmax(
            ln(self.prior_actions)
            - (self.posterior_observations[0]
               * ln(self.posterior_observations[0])).sum(axis=0)
            + (self.posterior_observations[0][:, np.newaxis, :]
               * self.posterior_states[1, np.newaxis, :, np.newaxis]
               * ln(self.generative_model_observations)).sum(axis=(0, 1)))
    else:
        old_post_s = self.posterior_states[1].copy()
        old_post_d = self.posterior_durations[1].copy()
        # condition the current-state belief on the new observation and response
        self.posterior_states[0] = softmax(
            ln(self.prior_states)
            + ln(self.generative_model_observations[observation, :, response]))
        self.posterior_durations[0] = old_post_d
        # predict the next state and duration beliefs
        self.posterior_states[1] = softmax(
            (old_post_d[np.newaxis, np.newaxis, :]
             * self.posterior_states[0][np.newaxis, :, np.newaxis]
             * ln(self.generative_model_states)).sum(axis=(1, 2)))
        self.posterior_durations[1] = softmax(
            (old_post_d[np.newaxis, :]
             * ln(self.generative_model_durations)).sum(axis=1))
        # + (self.posterior_states[0][np.newaxis, :, np.newaxis]
        #    * self.posterior_states[1][:, np.newaxis, np.newaxis]
        #    * ln(self.generative_model_states)).sum(axis=(0, 1)))
        self.posterior_observations[0] = np.exp(
            ln(self.prior_observations[:, np.newaxis])
            + (self.posterior_states[1][np.newaxis, :, np.newaxis]
               * ln(self.generative_model_observations)).sum(axis=1))
        self.posterior_observations[0] /= self.posterior_observations[0].sum(axis=0)
        self.posterior_policies[0] = softmax(
            ln(self.prior_actions)
            - (self.posterior_observations[0]
               * ln(self.posterior_observations[0])).sum(axis=0)
            + (self.posterior_observations[0][:, np.newaxis, :]
               * self.posterior_states[1, np.newaxis, :, np.newaxis]
               * ln(self.generative_model_observations)).sum(axis=(0, 1)))
def update_beliefs_context(self, tau, t, reward, posterior_states,
                           posterior_policies, prior_context, policies):
    # marginal policy posterior and expected states under it
    post_policies = (prior_context[np.newaxis, :] * posterior_policies).sum(axis=1)
    beta = self.dirichlet_rew_params.copy()
    states = (posterior_states[:, t, :]
              * post_policies[np.newaxis, :, np.newaxis]).sum(axis=1)
    beta_prime = self.dirichlet_rew_params.copy()
    beta_prime[reward] = beta[reward] + states

    # for c in range(self.nc):
    #     for state in range(self.nh):
    #         self.generative_model_rewards[:, state, c] = \
    #             np.exp(scs.digamma(beta_prime[:, state, c])
    #                    - scs.digamma(beta_prime[:, state, c].sum()))
    #         self.generative_model_rewards[:, state, c] /= \
    #             self.generative_model_rewards[:, state, c].sum()
    #     self.rew_messages[:, t+1:, c] = \
    #         self.prior_rewards.dot(self.generative_model_rewards[:, :, c])[:, np.newaxis]
    #
    # for c in range(self.nc):
    #     for pi, cs in enumerate(policies):
    #         if self.prior_policies[pi, c] > 1e-15:
    #             self.update_messages(t, pi, cs, c)
    #         else:
    #             self.fwd_messages[:, :, pi, c] = 1. / self.nh  # 0

    # update Dirichlet pseudo-counts over policies at the last time step
    alpha = self.dirichlet_pol_params.copy()
    if t == self.T - 1:
        chosen_pol = np.argmax(post_policies)
        inf_context = np.argmax(prior_context)
        alpha_prime = self.dirichlet_pol_params.copy()
        alpha_prime[chosen_pol, :] += prior_context
        # alpha_prime[chosen_pol, inf_context] = self.dirichlet_pol_params[chosen_pol, inf_context] + 1
    else:
        alpha_prime = alpha

    if self.nc == 1:
        posterior = np.ones(1)
    else:
        # TODO: recalculate
        # outcome_surprise = ((states * prior_context[np.newaxis, :]).sum(axis=1)[:, np.newaxis]
        #                     * (scs.digamma(beta_prime[reward])
        #                        - scs.digamma(beta_prime.sum(axis=0)))).sum(axis=0)
        outcome_surprise = (posterior_policies * ln(self.fwd_norms.prod(axis=0))).sum(axis=0)
        entropy = -(posterior_policies * ln(posterior_policies)).sum(axis=0)
        # policy_surprise = (post_policies[:, np.newaxis] * scs.digamma(alpha_prime)).sum(axis=0) \
        #                   - scs.digamma(alpha_prime.sum(axis=0))
        policy_surprise = (posterior_policies * scs.digamma(alpha_prime)).sum(axis=0) \
                          - scs.digamma(alpha_prime.sum(axis=0))
        posterior = outcome_surprise + policy_surprise + entropy
        # + np.nan_to_num((posterior_policies * ln(self.fwd_norms).sum(axis=0))).sum(axis=0)

    posterior = np.nan_to_num(softmax(posterior + ln(prior_context)))

    return posterior
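
# A minimal standalone sketch (not part of the class above) of the "policy
# surprise" term in update_beliefs_context: under Dirichlet pseudo-counts
# alpha_prime (policies x contexts), the expected log probability of a policy
# is digamma(alpha) - digamma(sum alpha); averaging over the per-context policy
# posterior gives one surprise value per context. Counts and posteriors below
# are illustrative assumptions.
import numpy as np
from scipy import special as scs

alpha_prime = np.array([[2.0, 1.0],
                        [1.0, 3.0],
                        [1.0, 1.0]])                    # Dirichlet counts, shape (policies, contexts)
posterior_policies = np.array([[0.6, 0.2],
                               [0.3, 0.6],
                               [0.1, 0.2]])             # p(pi | c), columns sum to 1

policy_surprise = (posterior_policies * scs.digamma(alpha_prime)).sum(axis=0) \
                  - scs.digamma(alpha_prime.sum(axis=0))   # one value per context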
def update_beliefs(self, tau, t, observation, reward, response):
    self.observations[tau, t] = observation
    self.rewards[tau, t] = reward

    # keep track of which policies are still consistent with the responses given so far
    if t == 0:
        self.possible_polcies = np.arange(0, self.npi, 1).astype(np.int32)
    else:
        possible_policies = np.where(self.policies[:, t-1] == response)[0]
        self.possible_polcies = np.intersect1d(self.possible_polcies, possible_policies)
        self.log_probability += ln(self.posterior_actions[tau, t-1, response])

    self.posterior_states[tau, t] = self.perception.update_beliefs_states(
        tau, t, observation, reward, self.policies, self.possible_polcies)

    # update beliefs about policies
    self.posterior_policies[tau, t], self.likelihood[tau, t] = \
        self.perception.update_beliefs_policies(tau, t)

    if tau == 0:
        prior_context = self.prior_context
    else:  # elif t == 0:
        prior_context = np.dot(self.perception.transition_matrix_context,
                               self.posterior_context[tau-1, -1]).reshape((self.nc))
    # else:
    #     prior_context = np.dot(self.perception.transition_matrix_context,
    #                            self.posterior_context[tau, t-1])

    if self.nc > 1 and t > 0:
        self.posterior_context[tau, t] = \
            self.perception.update_beliefs_context(tau, t,
                                                   reward,
                                                   self.posterior_states[tau, t],
                                                   self.posterior_policies[tau, t],
                                                   prior_context,
                                                   self.policies)
    elif self.nc > 1 and t == 0:
        self.posterior_context[tau, t] = prior_context
    else:
        self.posterior_context[tau, t] = 1

    # average over contexts to get the action probabilities for the next step
    if t < self.T - 1:
        post_pol = np.dot(self.posterior_policies[tau, t], self.posterior_context[tau, t])
        self.posterior_actions[tau, t] = self.estimate_action_probability(tau, t, post_pol)

    if t == self.T - 1 and self.learn_habit:
        self.posterior_dirichlet_pol[tau], self.prior_policies[tau] = \
            self.perception.update_beliefs_dirichlet_pol_params(
                tau, t,
                self.posterior_policies[tau, t],
                self.posterior_context[tau, t])

    if False:
        self.posterior_rewards[tau, t-1] = np.einsum(
            'rsc,spc,pc,c->r',
            self.perception.generative_model_rewards,
            self.posterior_states[tau, t, :, t],
            self.posterior_policies[tau, t],
            self.posterior_context[tau, t])

    # if reward > 0:
    if self.learn_rew:
        self.posterior_dirichlet_rew[tau, t] = \
            self.perception.update_beliefs_dirichlet_rew_params(
                tau, t,
                reward,
                self.posterior_states[tau, t],
                self.posterior_policies[tau, t],
                self.posterior_context[tau, t])
def update_beliefs_policies(self):
    posterior = softmax(ln(self.fwd_norms).sum(axis=0))

    return posterior
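
# A minimal standalone sketch (not part of the class above) of
# update_beliefs_policies: the log normalization constants of the forward
# messages are summed over time steps and passed through a softmax, so the
# posterior over policies is proportional to the product of the per-step
# normalizers. Shapes and values are illustrative assumptions.
import numpy as np

def softmax_policies(x):
    x = x - x.max()
    e = np.exp(x)
    return e / e.sum()

T, npi = 3, 4
fwd_norms = np.random.default_rng(0).uniform(0.1, 1.0, size=(T, npi))  # assumed message normalizers
posterior_policies = softmax_policies(np.log(fwd_norms).sum(axis=0))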