def run_agent(par_list, w_old, trials=trials):

    # set parameters (this variant continues a simulation from an existing world `w_old`):
    # trans_prob: context transition stability; unpacked for a consistent interface but
    #             not used here, since the context transition matrix is copied from w_old
    # avg: True for averaged action selection, False for maximum selection
    # Rho: environment's reward generation probabilities as a function of time
    # utility (goal prior, preference p(o)) is taken over from the old agent below
    trans_prob, avg, Rho = par_list

    """
    create matrices
    """

    # dimensions and preferences are carried over from the previous world
    ns = w_old.environment.Theta.shape[0]
    nr = w_old.environment.Rho.shape[1]
    na = w_old.environment.Theta.shape[2]
    T = w_old.T
    utility = w_old.agent.perception.prior_rewards.copy()

    # generating probability of observations in each state
    A = np.eye(ns)

    # state transition generative probability (matrix)
    B = np.zeros((ns, ns, na))
    for i in range(na):
        B[i+1, :, i] += 1

    # create reward generation
    #
    #    C = np.zeros((utility.shape[0], ns))
    #
    #    vals = np.array([0., 1./5., 0.95, 1./5., 1./5., 1./5.])
    #
    #    for i in range(ns):
    #        C[:,i] = [1-vals[i], vals[i]]
    #
    #    changes = np.array([0.01, -0.01])
    #    Rho = generate_bandit_timeseries(C, nb, trials, changes)

    # agent's beliefs about reward generation, carried over from the old agent
    C_alphas = w_old.agent.perception.dirichlet_rew_params.copy()
    C_agent = w_old.agent.perception.generative_model_rewards.copy()
    # np.array([np.random.dirichlet(C_alphas[:,i]) for i in range(ns)]).T

    # context transition matrix, carried over from the old agent
    transition_matrix_context = w_old.agent.perception.transition_matrix_context.copy()
    nc = transition_matrix_context.shape[0]

    """
    create environment (grid world)
    """

    environment = env.MultiArmedBandid(A, B, Rho, trials=trials, T=T)

    """
    create policies
    """

    pol = w_old.agent.policies
    #pol = pol[-2:]
    npi = pol.shape[0]

    # prior over policies: expected habit strength learned by the old agent
    #prior_pi[170] = 1. - 1e-3
    alphas = w_old.agent.perception.dirichlet_pol_params.copy()
    #    for i in range(nb):
    #        alphas[i+1,i] = 100
    #alphas[170] = 100
    prior_pi = np.exp(scs.digamma(alphas) - scs.digamma(alphas.sum(axis=0))[np.newaxis, :])
    prior_pi /= prior_pi.sum(axis=0)

    """
    set state prior (where agent thinks it starts)
    """

    state_prior = np.zeros((ns))
    state_prior[0] = 1.

    """
    set action selection method
    """

    if avg:
        ac_sel = asl.AveragedSelector(trials=trials, T=T,
                                      number_of_actions=na)
    else:
        ac_sel = asl.MaxSelector(trials=trials, T=T,
                                 number_of_actions=na)

    #    ac_sel = asl.AveragedPolicySelector(trials=trials, T=T,
    #                                        number_of_policies=npi,
    #                                        number_of_actions=na)

    # uniform prior over contexts
    prior_context = np.zeros((nc)) + 1./nc
    # np.dot(transition_matrix_context, w_old.agent.posterior_context[-1,-1])
    #    prior_context[0] = 1.

    """
    set up agent
    """

    pol_par = alphas

    # perception
    bayes_prc = prc.HierarchicalPerception(A, B, C_agent, transition_matrix_context,
                                           state_prior, utility, prior_pi,
                                           pol_par, C_alphas, T=T)

    bayes_pln = agt.BayesianPlanner(bayes_prc, ac_sel, pol,
                                    trials=trials, T=T,
                                    prior_states=state_prior,
                                    prior_policies=prior_pi,
                                    number_of_states=ns,
                                    prior_context=prior_context,
                                    learn_habit=True,
                                    #save_everything = True,
                                    number_of_policies=npi,
                                    number_of_rewards=nr)

    """
    create world
    """

    w = world.World(environment, bayes_pln, trials=trials, T=T)

    """
    simulate experiment
    """

    w.simulate_experiment(range(trials))

    """
    plot and evaluate results
    """
    #    plt.figure()
    #    for i in range(ns):
    #        plt.plot(w.environment.Rho[:,0,i], label=str(i))
    #    plt.legend()
    #    plt.show()
    #
    #    print("won:", int(w.rewards.sum()/trials*100), "%")
    #
    #    stayed = np.array([((w.actions[i,0] - w.actions[i+1,0])==0) for i in range(trials-1)])
    #    print("stayed:", int(stayed.sum()/trials*100), "%")

    return w
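
# Usage sketch (assumption, not part of the original script): this run_agent variant
# continues learning from a previously simulated world `w_old`, e.g. one returned by
# the run_agent variant defined below. The names w_trained, Rho_new and the parameter
# values are hypothetical; Rho is assumed to be a (trials, nr, ns) array of reward
# probabilities defined at module level.
#
#    w_cont = run_agent([0.95, True, Rho_new], w_old=w_trained)
#    w_cont2 = run_agent([0.95, True, Rho_new], w_cont)   # worlds can be chained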
def run_agent(par_list, trials, T, ns, na, nr, nc, deval=False, ESS=None):

    # set parameters:
    # learn_pol: initial concentration parameter for the policy (habit) prior
    # trans_prob: probability of staying in the same context (sets the diagonal
    #             of the context transition matrix)
    # avg: True for average action selection, False for maximum selection
    # Rho: environment's reward generation probabilities as a function of time
    # utility: goal prior, preference p(o)
    # deval: if True, devalue the preferred reward halfway through the experiment
    # ESS: if not None, use Dirichlet-based action selection instead of avg/max
    learn_pol, trans_prob, avg, Rho, utility = par_list

    """
    create matrices
    """

    # generating probability of observations in each state
    A = np.eye(ns)

    # state transition generative probability (matrix)
    B = np.zeros((ns, ns, na))
    for i in range(na):
        B[i + 1, :, i] += 1

    # agent's beliefs about reward generation

    # concentration parameters
    C_alphas = np.ones((nr, ns, nc))
    # initialize state in front of levers so that agent knows it yields no reward
    C_alphas[0, 0, :] = 100
    for i in range(1, nr):
        C_alphas[i, 0, :] = 1

    # agent's initial estimate of reward generation probability
    C_agent = np.zeros((nr, ns, nc))
    for c in range(nc):
        C_agent[:, :, c] = np.array([C_alphas[:, i, c] / C_alphas[:, i, c].sum()
                                     for i in range(ns)]).T

    # context transition matrix
    p = trans_prob
    q = 1. - p
    transition_matrix_context = np.zeros((nc, nc))
    transition_matrix_context += q / (nc - 1)
    for i in range(nc):
        transition_matrix_context[i, i] = p

    """
    create environment (grid world)
    """

    environment = env.MultiArmedBandid(A, B, Rho, trials=trials, T=T)

    """
    create policies
    """

    pol = np.array(list(itertools.product(list(range(na)), repeat=T - 1)))
    npi = pol.shape[0]

    # concentration parameters of the habit (policy) prior
    alphas = np.zeros((npi, nc)) + learn_pol
    prior_pi = alphas / alphas.sum(axis=0)

    """
    set state prior (where agent thinks it starts)
    """

    state_prior = np.zeros((ns))
    state_prior[0] = 1.

    """
    set action selection method
    """

    if ESS is not None:
        ac_sel = asl.DirichletSelector(trials=trials, T=T,
                                       number_of_actions=na)
    elif avg:
        ac_sel = asl.AveragedSelector(trials=trials, T=T,
                                      number_of_actions=na)
    else:
        ac_sel = asl.MaxSelector(trials=trials, T=T,
                                 number_of_actions=na)

    """
    set context prior
    """

    # agent is fairly sure it starts in the first context
    prior_context = np.zeros((nc)) + 0.1 / (nc - 1)
    prior_context[0] = 0.9

    """
    set up agent
    """

    # perception
    bayes_prc = prc.HierarchicalPerception(A, B, C_agent, transition_matrix_context,
                                           state_prior, utility, prior_pi,
                                           alphas, C_alphas, T=T)

    # agent
    bayes_pln = agt.BayesianPlanner(
        bayes_prc, ac_sel, pol,
        trials=trials, T=T,
        prior_states=state_prior,
        prior_policies=prior_pi,
        number_of_states=ns,
        prior_context=prior_context,
        learn_habit=True,
        learn_rew=True,
        #save_everything = True,
        number_of_policies=npi,
        number_of_rewards=nr)

    """
    create world
    """

    w = world.World(environment, bayes_pln, trials=trials, T=T)

    """
    simulate experiment
    """

    if not deval:
        w.simulate_experiment(range(trials))
    else:
        # first half of the experiment with the original preferences
        w.simulate_experiment(range(trials // 2))
        # reset utility to implement devaluation: spread the mass that was on the
        # preferred outcomes uniformly over the non-preferred ones
        ut = utility[1:].sum()
        bayes_prc.prior_rewards[2:] = ut / (nr - 2)
        bayes_prc.prior_rewards[:2] = (1 - ut) / 2
        # second half of the experiment with the devalued preferences
        w.simulate_experiment(range(trials // 2, trials))

    return w
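
# Usage sketch (assumption, illustrative values only): par_list bundles the habit
# strength learn_pol, the context stability trans_prob, the selection flag avg, the
# reward schedule Rho, and the preference vector utility. Rho is assumed to be a
# (trials, nr, ns) array built elsewhere in the script; utility below assumes nr == 2.
#
#    u = 0.999
#    utility = np.array([1 - u, u])                      # prefer the rewarding outcome
#    par_list = [1000., 0.95, True, Rho, utility]        # hypothetical settings
#    w = run_agent(par_list, trials, T, ns, na, nr, nc)
#    w_deval = run_agent(par_list, trials, T, ns, na, nr, nc, deval=True)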
def run_agent(par_list, trials=trials, T=T, ns=ns, na=na):

    # set parameters:
    # learn_pol: initial concentration parameter for the policy (habit) prior
    # avg: True for averaged action selection, False for maximum selection
    # Rho: environment's reward generation probabilities as a function of time
    # learn_habit: whether the agent updates its policy prior (learns habits)
    # utility: goal prior, preference p(o)
    # note: no (number of observations), nr (number of rewards), nc (number of
    # contexts) and agent (agent type string) are expected as module-level globals
    learn_pol, avg, Rho, learn_habit, utility = par_list

    learn_rew = 1

    """
    create matrices
    """

    # generating probability of observations in each state
    A = np.eye(no)

    # state transition generative probability (matrix) for the two-step task
    B = np.zeros((ns, ns, na))
    b1 = 0.7
    nb1 = 1. - b1
    b2 = 0.7
    nb2 = 1. - b2

    B[:,:,0] = np.array([[  0,  0,  0,  0,  0,  0,  0,],
                         [ b1,  0,  0,  0,  0,  0,  0,],
                         [nb1,  0,  0,  0,  0,  0,  0,],
                         [  0,  1,  0,  1,  0,  0,  0,],
                         [  0,  0,  1,  0,  1,  0,  0,],
                         [  0,  0,  0,  0,  0,  1,  0,],
                         [  0,  0,  0,  0,  0,  0,  1,],])

    B[:,:,1] = np.array([[  0,  0,  0,  0,  0,  0,  0,],
                         [nb2,  0,  0,  0,  0,  0,  0,],
                         [ b2,  0,  0,  0,  0,  0,  0,],
                         [  0,  0,  0,  1,  0,  0,  0,],
                         [  0,  0,  0,  0,  1,  0,  0,],
                         [  0,  1,  0,  0,  0,  1,  0,],
                         [  0,  0,  1,  0,  0,  0,  1,],])

    # create reward generation
    #
    #    C = np.zeros((utility.shape[0], ns))
    #
    #    vals = np.array([0., 1./5., 0.95, 1./5., 1./5., 1./5.])
    #
    #    for i in range(ns):
    #        C[:,i] = [1-vals[i], vals[i]]
    #
    #    changes = np.array([0.01, -0.01])
    #    Rho = generate_bandit_timeseries(C, nb, trials, changes)

    # agent's beliefs about reward generation
    C_alphas = np.zeros((nr, ns, nc)) + learn_rew
    # agent knows the first three states yield no reward
    C_alphas[0,:3,:] = 100
    for i in range(1, nr):
        C_alphas[i,0,:] = 1
    #    C_alphas[0,1:,:] = 100
    #    for c in range(nb):
    #        C_alphas[1,c+1,c] = 100
    #        C_alphas[0,c+1,c] = 1
    #C_alphas[:,13] = [100, 1]

    C_agent = np.zeros((nr, ns, nc))
    for c in range(nc):
        C_agent[:,:,c] = np.array([C_alphas[:,i,c] / C_alphas[:,i,c].sum()
                                   for i in range(ns)]).T
    # np.array([np.random.dirichlet(C_alphas[:,i]) for i in range(ns)]).T

    # context transition matrix (single context)
    transition_matrix_context = np.ones(1)

    """
    create environment (grid world)
    """

    environment = env.MultiArmedBandid(A, B, Rho, trials=trials, T=T)

    """
    create policies
    """

    pol = np.array(list(itertools.product(list(range(na)), repeat=T-1)))
    #pol = pol[-2:]
    npi = pol.shape[0]

    # prior over policies
    prior_pi = np.ones(npi)/npi    # np.zeros(npi) + 1e-3/(npi-1)
    #prior_pi[170] = 1. - 1e-3
    alphas = np.zeros((npi, nc)) + learn_pol
    #    for i in range(nb):
    #        alphas[i+1,i] = 100
    #alphas[170] = 100
    prior_pi = alphas / alphas.sum(axis=0)

    """
    set state prior (where agent thinks it starts)
    """

    state_prior = np.zeros((ns))
    state_prior[0] = 1.

    """
    set action selection method
    """

    if avg:
        sel = 'avg'
        ac_sel = asl.AveragedSelector(trials=trials, T=T,
                                      number_of_actions=na)
    else:
        sel = 'max'
        ac_sel = asl.MaxSelector(trials=trials, T=T,
                                 number_of_actions=na)

    #    ac_sel = asl.AveragedPolicySelector(trials=trials, T=T,
    #                                        number_of_policies=npi,
    #                                        number_of_actions=na)

    prior_context = np.array([1.])
    #    prior_context[0] = 1.

    """
    set up agent
    """

    # Bethe agent
    if agent == 'bethe':

        agnt = 'bethe'

        pol_par = alphas

        # perception
        bayes_prc = prc.HierarchicalPerception(A, B, C_agent, transition_matrix_context,
                                               state_prior, utility, prior_pi,
                                               pol_par, C_alphas, T=T,
                                               pol_lambda=0.3, r_lambda=0.6,
                                               non_decaying=3, dec_temp=4.)

        bayes_pln = agt.BayesianPlanner(bayes_prc, ac_sel, pol,
                                        trials=trials, T=T,
                                        prior_states=state_prior,
                                        prior_policies=prior_pi,
                                        number_of_states=ns,
                                        prior_context=prior_context,
                                        learn_habit=learn_habit,
                                        learn_rew=True,
                                        #save_everything = True,
                                        number_of_policies=npi,
                                        number_of_rewards=nr)

    # MF agent
    else:

        agnt = 'mf'

        bayes_prc = prc.MFPerception(A, B, utility, state_prior, T=T)

        bayes_pln = agt.BayesianMFPlanner(bayes_prc, [], ac_sel,
                                          trials=trials, T=T,
                                          prior_states=state_prior,
                                          policies=pol,
                                          number_of_states=ns,
                                          number_of_policies=npi)

    """
    create world
    """

    w = world.World(environment, bayes_pln, trials=trials, T=T)

    """
    simulate experiment
    """

    #    w.simulate_experiment(range(trials-100))
    #    new_ut = utility.copy()
    #    new_ut[1] = utility[0]
    #    new_ut /= new_ut.sum()
    #    w.agent.perception.reset_preferences(0, new_ut, pol)
    #    w.simulate_experiment(range(trials-100, trials))

    w.simulate_experiment(range(trials))

    """
    plot and evaluate results
    """
    #    plt.figure()
    #    for i in range(3, ns):
    #        plt.plot(w.environment.Rho[:,1,i], label=str(i))
    #    plt.ylim([0,1])
    #    plt.legend()
    #    plt.show()
    #
    #    rewarded = np.where(w.rewards[:trials-1,-1] == 1)[0]
    #    unrewarded = np.where(w.rewards[:trials-1,-1] == 0)[0]
    #
    #    rare = np.append(np.where(w.environment.hidden_states[np.where(w.actions[:,0] == 0)[0]] == 2)[0],
    #                     np.where(w.environment.hidden_states[np.where(w.actions[:,0] == 1)[0]] == 1)[0])
    #
    #    common = np.append(np.where(w.environment.hidden_states[np.where(w.actions[:,0] == 0)[0]] == 1)[0],
    #                       np.where(w.environment.hidden_states[np.where(w.actions[:,0] == 1)[0]] == 2)[0])
    #
    #    names = ["rewarded common", "rewarded rare", "unrewarded common", "unrewarded rare"]
    #
    #    index_list = [np.intersect1d(rewarded, common), np.intersect1d(rewarded, rare),
    #                  np.intersect1d(unrewarded, common), np.intersect1d(unrewarded, rare)]
    #
    #    stayed_list = [((w.actions[index_list[i],0] - w.actions[index_list[i]+1,0])==0).sum()/len(index_list[i]) for i in range(4)]
    #
    ##    stayed_rew = ((w.actions[rewarded,0] - w.actions[rewarded+1,0]) == 0).sum()/len(rewarded)
    ##    stayed_unrew = ((w.actions[unrewarded,0] - w.actions[unrewarded+1,0]) == 0).sum()/len(unrewarded)
    #
    #    plt.figure()
    #    plt.bar(x=names, height=stayed_list)
    #    plt.show()

    return w
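
# Usage sketch (assumption, illustrative values only): this variant simulates the
# two-step task defined by the B matrices above and relies on module-level globals
# (trials, T, ns, na, no, nr, nc, Rho, agent). The values below are hypothetical and
# assume nr == 2.
#
#    agent = 'bethe'
#    u = 0.99
#    utility = np.array([1 - u, u])                 # prefer the rewarding outcome
#    par_list = [1., True, Rho, True, utility]      # learn_pol, avg, Rho, learn_habit, utility
#    w = run_agent(par_list)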