Example no. 1
    def select_desired_action(self, tau, t, posterior_policies, actions,
                              *args):

        npi = posterior_policies.shape[0]
        likelihood = args[0]
        prior = args[1]

        # KL divergence between the likelihood and the prior over policies,
        # entropy of the posterior and entropy of the prior
        DKL = (likelihood * ln(likelihood / prior)).sum()
        H = -(posterior_policies * ln(posterior_policies)).sum()
        H_p = -(prior * ln(prior)).sum()

        # log-normally distributed reaction time: the Gaussian noise has mean H
        # and standard deviation DKL, shifted by the prior entropy H_p
        self.RT[tau, t] = np.exp(H_p + np.random.normal(H, DKL))

        # estimate the action probability and sample an action from it
        self.estimate_action_probability(tau, t, posterior_policies, actions)
        u = np.random.choice(self.na, p=self.control_probability[tau, t])

        return u
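
A reading aid, not part of the original code: the stored reaction time is effectively log-normal, with the prior entropy H_p shifting the mean and the posterior entropy H and the divergence DKL entering as the mean and standard deviation of the Gaussian noise term (likelihood and prior are presumably distributions over policies),

    RT[tau, t] = exp( H_p + x ),    x ~ Normal(mean = H, sd = DKL)

    DKL = sum_pi likelihood(pi) * ln( likelihood(pi) / prior(pi) )
    H   = - sum_pi posterior(pi) * ln posterior(pi)
    H_p = - sum_pi prior(pi) * ln prior(pi)

The action u itself is then drawn from the control probabilities filled in by estimate_action_probability.
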
Example no. 2
    def __get_log_likelihood(self, params):
        self.agent.set_free_parameters(params)
        self.agent.reset_beliefs(self.actions)
        self.__update_model()

        # index arrays over trials, time steps and the recorded actions,
        # used to pick out the probability of each emitted action
        p1 = np.tile(np.arange(self.trials), (self.T, 1)).T
        p2 = np.tile(np.arange(self.T), (self.trials, 1))
        p3 = self.actions.astype(int)

        # summed log-likelihood of the observed actions under the current parameters
        return ln(self.agent.asl.control_probability[p1, p2, p3]).sum()
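
For orientation, and assuming self.actions has shape (trials, T): p1, p2 and p3 select, for every trial tau and time step t, the probability the model assigned to the action that was actually recorded, so the method returns the summed log-likelihood

    log L(params) = sum_tau sum_t ln p( a[tau, t] | params ),

with the probabilities read from self.agent.asl.control_probability after re-running the model with the candidate parameters.
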
Example no. 3
    def update_beliefs_states(self, tau, t, observation, policies, prior,
                              prior_pi):
        if t == 0:
            # initialise the log observation messages for the whole trial
            self.logzs = np.tile(ln(self.zs), (self.T, 1)).T
        # overwrite column t with the log-likelihood of the current observation
        self.logzs[:, t] = ln(
            self.generative_model_observations[int(observation), :])

        # estimate the expected state distribution under each policy
        lforw = np.zeros((self.nh, self.T))
        lforw[:, 0] = ln(self.prior_states)
        lback = np.zeros((self.nh, self.T))
        posterior = np.zeros((self.nh, self.T, policies.shape[0]))
        neg_fe = np.zeros(policies.shape[0])
        eps = 0.01
        for pi, ppi in enumerate(prior_pi):
            if ppi > 1e-6:
                logtm = ln(self.generative_model_states[:, :, policies[pi]])
                # start from the prior over states for this policy; copy so the
                # fixed-point iteration below does not overwrite the caller's prior
                post = prior[:, :, pi].copy()
                not_close = True
                while not_close:
                    # forward and backward messages under the transitions of policy pi
                    lforw[:, 1:] = np.einsum('ijk,jk->ik', logtm, post[:, :-1])
                    lback[:, :-1] = np.einsum('ijk,ik->jk', logtm, post[:, 1:])
                    logpost = lforw + self.logzs
                    # damped update of the log posterior, renormalised over states
                    lp = (1 - eps) * ln(post) + eps * (logpost + lback)
                    new_post = softmax(lp)
                    not_close = not np.allclose(post, new_post, atol=1e-3)
                    post[:] = new_post

                posterior[:, :, pi] = post
                # approximate negative free energy: expected log-joint plus posterior entropy
                neg_fe[pi] = (logpost * post).sum() - np.sum(post * ln(post))
            else:
                posterior[:, :, pi] = prior[:, :, pi]
                neg_fe[pi] = -1e10

        self.fe_pi = neg_fe

        return posterior, neg_fe
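
In summary (a paraphrase, not text from the source): for every policy pi with non-negligible prior probability, the state posterior is obtained by a damped fixed-point iteration that mixes the current estimate with forward messages, observation terms and backward messages in log space,

    ln q_new(s_t) = (1 - eps) * ln q(s_t) + eps * ( lforw(s_t) + logzs(s_t) + lback(s_t) ),
    q_new = softmax over states,

repeated until np.allclose(post, new_post, atol=1e-3) holds. The returned neg_fe stores, per policy, the expected log-joint plus the entropy of the converged posterior; policies with prior mass below 1e-6 keep their prior and are assigned -1e10.
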
Example no. 4
    def update_beliefs(self, tau, observation, response):

        if tau == 0:
            # first trial: flat beliefs about the current state and duration,
            # prior beliefs for the prediction
            self.posterior_states[0] = 1. / self.ns
            self.posterior_durations[0] = 1. / self.nd
            self.posterior_states[1] = self.prior_states
            self.posterior_durations[1] = self.prior_durations
            self.posterior_observations[0] = np.exp(
                ln(self.prior_observations) +
                (self.posterior_states[1][np.newaxis, :, np.newaxis] *
                 ln(self.generative_model_observations)).sum(axis=1))

            self.posterior_observations[0] /= self.posterior_observations[
                0].sum(axis=0)

            self.posterior_policies[0] = softmax(
                ln(self.prior_actions) -
                (self.posterior_observations[0] *
                 ln(self.posterior_observations[0])).sum(axis=0) +
                (self.posterior_observations[0][:, np.newaxis, :] *
                 self.posterior_states[1, np.newaxis, :, np.newaxis] *
                 ln(self.generative_model_observations)).sum(axis=(0, 1)))

        else:
            old_post_s = self.posterior_states[1].copy()
            old_post_d = self.posterior_durations[1].copy()

            # condition the current-state belief on the new observation and response
            self.posterior_states[0] = softmax(
                ln(self.prior_states) +
                ln(self.generative_model_observations[observation, :,
                                                      response]))
            self.posterior_durations[0] = old_post_d
            self.posterior_states[1] = softmax(
                (old_post_d[np.newaxis, np.newaxis, :] *
                 self.posterior_states[0][np.newaxis, :, np.newaxis] *
                 ln(self.generative_model_states)).sum(axis=(1, 2)))
            self.posterior_durations[1] = softmax(
                (old_post_d[np.newaxis, :] *
                 ln(self.generative_model_durations)).sum(axis=1))
            #+ (self.posterior_states[0][np.newaxis, :, np.newaxis] *
            #   self.posterior_states[1][:, np.newaxis, np.newaxis] *
            #   ln(self.generative_model_states)).sum(axis=(0, 1))

            self.posterior_observations[0] = np.exp(
                ln(self.prior_observations[:, np.newaxis]) +
                (self.posterior_states[1][np.newaxis, :, np.newaxis] *
                 ln(self.generative_model_observations)).sum(axis=1))

            self.posterior_observations[0] /= self.posterior_observations[
                0].sum(axis=0)

            self.posterior_policies[0] = softmax(
                ln(self.prior_actions) -
                (self.posterior_observations[0] *
                 ln(self.posterior_observations[0])).sum(axis=0) +
                (self.posterior_observations[0][:, np.newaxis, :] *
                 self.posterior_states[1, np.newaxis, :, np.newaxis] *
                 ln(self.generative_model_observations)).sum(axis=(0, 1)))
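
A compact reading of the policy update above, writing q_o for posterior_observations[0], q_s for posterior_states[1] and a for the action/policy index: the posterior balances the action prior, the entropy of the predicted observations and their expected log-likelihood under the generative model,

    q(a) = softmax( ln p(a)
                    - sum_o q_o(o | a) * ln q_o(o | a)
                    + sum_{o, s} q_o(o | a) * q_s(s) * ln p(o | s, a) ),

where p(o | s, a) denotes generative_model_observations. The same expression is used in both the tau == 0 branch and the general case.
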
Example no. 5
    def update_beliefs_context(self, tau, t, reward, posterior_states,
                               posterior_policies, prior_context, policies):

        # policy posterior averaged under the context prior, and the resulting
        # expected state distribution; both feed the pseudo-count update of the
        # reward Dirichlet parameters for the observed reward
        post_policies = (prior_context[np.newaxis, :] *
                         posterior_policies).sum(axis=1)
        beta = self.dirichlet_rew_params.copy()
        states = (posterior_states[:, t, :] *
                  post_policies[np.newaxis, :, np.newaxis]).sum(axis=1)
        beta_prime = self.dirichlet_rew_params.copy()
        beta_prime[reward] = beta[reward] + states

        #        for c in range(self.nc):
        #            for state in range(self.nh):
        #                self.generative_model_rewards[:,state,c] =\
        #                np.exp(scs.digamma(beta_prime[:,state,c])\
        #                       -scs.digamma(beta_prime[:,state,c].sum()))
        #                self.generative_model_rewards[:,state,c] /= self.generative_model_rewards[:,state,c].sum()
        #
        #            self.rew_messages[:,t+1:,c] = self.prior_rewards.dot(self.generative_model_rewards[:,:,c])[:,np.newaxis]
        #
        #        for c in range(self.nc):
        #            for pi, cs in enumerate(policies):
        #                if self.prior_policies[pi,c] > 1e-15:
        #                    self.update_messages(t, pi, cs, c)
        #                else:
        #                    self.fwd_messages[:,:,pi,c] = 1./self.nh #0

        alpha = self.dirichlet_pol_params.copy()
        if t == self.T - 1:
            chosen_pol = np.argmax(post_policies)
            inf_context = np.argmax(prior_context)
            alpha_prime = self.dirichlet_pol_params.copy()
            alpha_prime[chosen_pol, :] += prior_context
            #alpha_prime[chosen_pol,inf_context] = self.dirichlet_pol_params[chosen_pol,inf_context] + 1
        else:
            alpha_prime = alpha

        if self.nc == 1:
            posterior = np.ones(1)
        else:
            # todo: recalc
            #outcome_surprise = ((states * prior_context[np.newaxis,:]).sum(axis=1)[:,np.newaxis] * (scs.digamma(beta_prime[reward]) - scs.digamma(beta_prime.sum(axis=0)))).sum(axis=0)
            outcome_surprise = (posterior_policies *
                                ln(self.fwd_norms.prod(axis=0))).sum(axis=0)
            entropy = -(posterior_policies *
                        ln(posterior_policies)).sum(axis=0)
            #policy_surprise = (post_policies[:,np.newaxis] * scs.digamma(alpha_prime)).sum(axis=0) - scs.digamma(alpha_prime.sum(axis=0))
            policy_surprise = (
                posterior_policies * scs.digamma(alpha_prime)).sum(
                    axis=0) - scs.digamma(alpha_prime.sum(axis=0))
            posterior = outcome_surprise + policy_surprise + entropy

            #+ np.nan_to_num((posterior_policies * ln(self.fwd_norms).sum(axis=0))).sum(axis=0)

            posterior = np.nan_to_num(softmax(posterior + ln(prior_context)))

        return posterior
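
For readability, a summary of the multi-context branch (nc > 1), not text from the source: the context posterior is a softmax over three per-context scores plus the log prior,

    q(c) ∝ exp( outcome_surprise(c) + policy_surprise(c) + entropy(c) + ln p(c) ),

    outcome_surprise(c) = sum_pi q(pi | c) * ln( fwd_norms.prod(axis=0)[pi, c] )
    policy_surprise(c)  = sum_pi q(pi | c) * digamma(alpha'[pi, c]) - digamma( sum_pi alpha'[pi, c] )
    entropy(c)          = - sum_pi q(pi | c) * ln q(pi | c)

with alpha' the Dirichlet counts over policies (updated at the end of a trial) and fwd_norms presumably the normalisation constants of the forward messages.
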
Example no. 6
    def update_beliefs(self, tau, t, observation, reward, response):
        self.observations[tau,t] = observation
        self.rewards[tau,t] = reward

        if t == 0:
            # at the start of a trial, all policies are still possible
            self.possible_polcies = np.arange(0, self.npi, 1).astype(np.int32)
        else:
            # keep only the policies consistent with the response just given
            possible_policies = np.where(self.policies[:, t-1] == response)[0]
            self.possible_polcies = np.intersect1d(self.possible_polcies,
                                                   possible_policies)
            self.log_probability += ln(self.posterior_actions[tau, t-1, response])

        self.posterior_states[tau, t] = self.perception.update_beliefs_states(
                                         tau, t,
                                         observation,
                                         reward,
                                         self.policies,
                                         self.possible_polcies)

        # update beliefs about policies
        self.posterior_policies[tau, t], self.likelihood[tau, t] = \
            self.perception.update_beliefs_policies(tau, t)

        if tau == 0:
            prior_context = self.prior_context
        else: #elif t == 0:
            prior_context = np.dot(self.perception.transition_matrix_context, self.posterior_context[tau-1, -1]).reshape((self.nc))
#            else:
#                prior_context = np.dot(self.perception.transition_matrix_context, self.posterior_context[tau, t-1])

        if self.nc>1 and t>0:
            self.posterior_context[tau, t] = \
            self.perception.update_beliefs_context(tau, t, \
                                                   reward, \
                                                   self.posterior_states[tau, t], \
                                                   self.posterior_policies[tau, t], \
                                                   prior_context, \
                                                   self.policies)
        elif self.nc>1 and t==0:
            self.posterior_context[tau, t] = prior_context
        else:
            self.posterior_context[tau,t] = 1

        if t < self.T-1:
            post_pol = np.dot(self.posterior_policies[tau, t], self.posterior_context[tau, t])
            self.posterior_actions[tau, t] = self.estimate_action_probability(tau, t, post_pol)

        if t == self.T-1 and self.learn_habit:
            self.posterior_dirichlet_pol[tau], self.prior_policies[tau] = self.perception.update_beliefs_dirichlet_pol_params(tau, t, \
                                                            self.posterior_policies[tau,t], \
                                                            self.posterior_context[tau,t])

        if False:
            self.posterior_rewards[tau, t-1] = np.einsum('rsc,spc,pc,c->r',
                                                  self.perception.generative_model_rewards,
                                                  self.posterior_states[tau,t,:,t],
                                                  self.posterior_policies[tau,t],
                                                  self.posterior_context[tau,t])
        #if reward > 0:
        if self.learn_rew:
            self.posterior_dirichlet_rew[tau,t] = self.perception.update_beliefs_dirichlet_rew_params(tau, t, \
                                                            reward, \
                                                   self.posterior_states[tau, t], \
                                                   self.posterior_policies[tau, t], \
                                                   self.posterior_context[tau,t])
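
A note on the marginalisation used for action selection (not from the source): once the context posterior is available, the per-context policy posterior is averaged over contexts before actions are scored,

    q(pi) = sum_c q(pi | c) * q(c),

which is what np.dot(self.posterior_policies[tau, t], self.posterior_context[tau, t]) computes before estimate_action_probability is called.
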
Example no. 7
    def update_beliefs_policies(self):

        posterior = softmax(ln(self.fwd_norms).sum(axis=0))

        return posterior
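
Reading of the last method, assuming the first axis of fwd_norms runs over time steps and holds the forward-message normalisation constants: the policy posterior is simply

    q(pi) = softmax( sum_t ln fwd_norms[t, pi] ),

i.e. a softmax over the accumulated log evidence of each policy.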