Example no. 1
def run_agent(par_list, w_old, trials=trials):
    
    #set parameters:
    #trans_prob: reward probability (unpacked but not used in this variant)
    #avg: True for averaged action selection, False for maximum selection
    #Rho: environment's reward generation probabilities as a function of time
    #(the agent's learned parameters below are copied from the previously simulated world w_old)
    trans_prob, avg, Rho = par_list
    
    
    """
    create matrices
    """
    ns = w_old.environment.Theta.shape[0]
    nr = w_old.environment.Rho.shape[1]
    na = w_old.environment.Theta.shape[2]
    T = w_old.T
    utility = w_old.agent.perception.prior_rewards.copy()
    
    #generating probability of observations in each state

    A = np.eye(ns)
        
    
    #state transition generative probability (matrix)
    B = np.zeros((ns, ns, na))
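    # action i deterministically moves the agent into bandit-arm state i+1,
    # irrespective of the current state (convention B[s', s, a])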
    
    for i in range(0,na):
        B[i+1,:,i] += 1
    
    # create reward generation
#            
#    C = np.zeros((utility.shape[0], ns))
#    
#    vals = np.array([0., 1./5., 0.95, 1./5., 1/5., 1./5.])
#    
#    for i in range(ns):
#        C[:,i] = [1-vals[i],vals[i]]
#    
#    changes = np.array([0.01, -0.01])
#    Rho = generate_bandit_timeseries(C, nb, trials, changes)
            
    # agent's beliefs about reward generation
    
    C_alphas = w_old.agent.perception.dirichlet_rew_params.copy()
    
    C_agent = w_old.agent.perception.generative_model_rewards.copy()
    #np.array([np.random.dirichlet(C_alphas[:,i]) for i in range(ns)]).T
    
    # context transition matrix
    
    transition_matrix_context = w_old.agent.perception.transition_matrix_context.copy()
                            
    """
    create environment (grid world)
    """
    
    environment = env.MultiArmedBandid(A, B, Rho, trials = trials, T = T)
    
    
    """
    create policies
    """
    
    pol = w_old.agent.policies
    
    #pol = pol[-2:]
    npi = pol.shape[0]
    
    # prior over policies

    #prior_pi[170] = 1. - 1e-3
    alphas = w_old.agent.perception.dirichlet_pol_params.copy()
#    for i in range(nb):
#        alphas[i+1,i] = 100
    #alphas[170] = 100
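    # habit prior from the copied Dirichlet counts: exp(E[log pi]) under
    # Dirichlet(alphas), renormalized over policies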
    prior_pi = np.exp(scs.digamma(alphas) - scs.digamma(alphas.sum(axis=0))[np.newaxis,:])
    prior_pi /= prior_pi.sum(axis=0)
    
    
    """
    set state prior (where agent thinks it starts)
    """
    
    state_prior = np.zeros((ns))
    
    state_prior[0] = 1.

    """
    set action selection method
    """

    if avg:
    
        ac_sel = asl.AveragedSelector(trials = trials, T = T, 
                                      number_of_actions = na)
    else:
        
        ac_sel = asl.MaxSelector(trials = trials, T = T, 
                                      number_of_actions = na)
    
#    ac_sel = asl.AveragedPolicySelector(trials = trials, T = T, 
#                                        number_of_policies = npi,
#                                        number_of_actions = na)
    
    nc = transition_matrix_context.shape[0]  # number of contexts, taken from the copied transition matrix
    prior_context = np.zeros((nc)) + 1./(nc)#np.dot(transition_matrix_context, w_old.agent.posterior_context[-1,-1])
        
#    prior_context[0] = 1.
    
    """
    set up agent
    """
        
    pol_par = alphas

    # perception
    bayes_prc = prc.HierarchicalPerception(A, B, C_agent, transition_matrix_context, state_prior, utility, prior_pi, pol_par, C_alphas, T=T)
    
    bayes_pln = agt.BayesianPlanner(bayes_prc, ac_sel, pol,
                      trials = trials, T = T,
                      prior_states = state_prior,
                      prior_policies = prior_pi,
                      number_of_states = ns, 
                      prior_context = prior_context,
                      learn_habit = True,
                      #save_everything = True,
                      number_of_policies = npi,
                      number_of_rewards = nr)
    

    """
    create world
    """
    
    w = world.World(environment, bayes_pln, trials = trials, T = T)
    
    """
    simulate experiment
    """
    
    w.simulate_experiment(range(trials))
    
    
    """
    plot and evaluate results
    """
#    plt.figure()
#    
#    for i in range(ns):
#        plt.plot(w.environment.Rho[:,0,i], label=str(i))
#        
#    plt.legend()
#    plt.show()
#    
#    print("won:", int(w.rewards.sum()/trials*100), "%")
#    
#    stayed = np.array([((w.actions[i,0] - w.actions[i+1,0])==0) for i in range(trials-1)])
#    
#    print("stayed:", int(stayed.sum()/trials*100), "%")
    
    return w
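
# A minimal, hypothetical call sketch for the variant above (written as comments, in
# keeping with the commented examples in this file): it assumes a previously simulated
# world object `w_old` and the module-level `trials` are in scope, and it reuses the
# old reward schedule; the numeric settings are placeholders.
#
#     Rho = w_old.environment.Rho.copy()
#     trans_prob, avg = 0.9, True
#     w_new = run_agent([trans_prob, avg, Rho], w_old)

Example no. 2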
def run_agent(par_list, trials, T, ns, na, nr, nc, deval=False, ESS=None):

    #set parameters:
    #learn_pol: initial concentration parameter for the policy prior
    #trans_prob: context self-transition probability (diagonal of the context transition matrix)
    #avg: True for average action selection, False for maximum selection
    #Rho: Environment's reward generation probabilities as a function of time
    #utility: goal prior, preference p(o)
    learn_pol, trans_prob, avg, Rho, utility = par_list
    """
    create matrices
    """

    #generating probability of observations in each state
    A = np.eye(ns)

    #state transition generative probability (matrix)
    B = np.zeros((ns, ns, na))

    for i in range(0, na):
        B[i + 1, :, i] += 1

    # agent's beliefs about reward generation

    # concentration parameters
    C_alphas = np.ones((nr, ns, nc))
    # initialize the state in front of the levers so that the agent knows it yields no reward
    C_alphas[0, 0, :] = 100
    for i in range(1, nr):
        C_alphas[i, 0, :] = 1

    # agent's initial estimate of reward generation probability
    C_agent = np.zeros((nr, ns, nc))
    for c in range(nc):
        C_agent[:, :,
                c] = np.array([(C_alphas[:, i, c]) / (C_alphas[:, i, c]).sum()
                               for i in range(ns)]).T

    # context transition matrix
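    # stay in the current context with probability p = trans_prob and switch
    # uniformly to one of the other nc-1 contexts with probability 1-p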

    p = trans_prob
    q = 1. - p
    transition_matrix_context = np.zeros((nc, nc))
    transition_matrix_context += q / (nc - 1)
    for i in range(nc):
        transition_matrix_context[i, i] = p
    """
    create environment (grid world)
    """

    environment = env.MultiArmedBandid(A, B, Rho, trials=trials, T=T)
    """
    create policies
    """

    pol = np.array(list(itertools.product(list(range(na)), repeat=T - 1)))
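    # each row of pol is one candidate policy: a sequence of T-1 actions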

    npi = pol.shape[0]

    # concentration parameters
    alphas = np.zeros((npi, nc)) + learn_pol
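    # the initial policy (habit) prior is the Dirichlet mean of these counts,
    # i.e. uniform over policies in every context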

    prior_pi = alphas / alphas.sum(axis=0)
    """
    set state prior (where agent thinks it starts)
    """

    state_prior = np.zeros((ns))

    state_prior[0] = 1.
    """
    set action selection method
    """

    if ESS is not None:

        ac_sel = asl.DirichletSelector(trials=trials,
                                       T=T,
                                       number_of_actions=na)

    elif avg:

        ac_sel = asl.AveragedSelector(trials=trials, T=T, number_of_actions=na)

    else:

        ac_sel = asl.MaxSelector(trials=trials, T=T, number_of_actions=na)
    """
    set context prior
    """

    prior_context = np.zeros((nc)) + 0.1 / (nc - 1)
    prior_context[0] = 0.9
    """
    set up agent
    """

    # perception
    bayes_prc = prc.HierarchicalPerception(A,
                                           B,
                                           C_agent,
                                           transition_matrix_context,
                                           state_prior,
                                           utility,
                                           prior_pi,
                                           alphas,
                                           C_alphas,
                                           T=T)

    # agent
    bayes_pln = agt.BayesianPlanner(
        bayes_prc,
        ac_sel,
        pol,
        trials=trials,
        T=T,
        prior_states=state_prior,
        prior_policies=prior_pi,
        number_of_states=ns,
        prior_context=prior_context,
        learn_habit=True,
        learn_rew=True,
        #save_everything = True,
        number_of_policies=npi,
        number_of_rewards=nr)
    """
    create world
    """

    w = world.World(environment, bayes_pln, trials=trials, T=T)
    """
    simulate experiment
    """
    if not deval:
        w.simulate_experiment(range(trials))

    else:
        w.simulate_experiment(range(trials // 2))
        # reset utility to implement devaluation
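        # (the preference mass ut of outcomes 1..nr-1 is spread uniformly over outcomes
        # 2..nr-1, while outcomes 0 and 1 each get (1 - ut)/2, devaluing outcome 1)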
        ut = utility[1:].sum()
        bayes_prc.prior_rewards[2:] = ut / (nr - 2)
        bayes_prc.prior_rewards[:2] = (1 - ut) / 2

        w.simulate_experiment(range(trials // 2, trials))

    return w
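
# A minimal, hypothetical call sketch for the function above (as comments): Rho is the
# environment's reward schedule in the shape expected by env.MultiArmedBandid, and the
# numeric settings are placeholders, e.g. for nr == 2 outcomes.
#
#     utility = np.array([0.01, 0.99])              # preference p(o) over the nr outcomes
#     par_list = [1000, 0.95, True, Rho, utility]   # learn_pol, trans_prob, avg, Rho, utility
#     w = run_agent(par_list, trials, T, ns, na, nr, nc)

Example no. 3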
def run_agent(par_list, trials=trials, T=T, ns=ns, na=na):

    #set parameters:
    #learn_pol: initial concentration parameter for the policy prior
    #avg: True for averaged action selection, False for maximum selection
    #Rho: environment's reward generation probabilities as a function of time
    #learn_habit: whether the agent updates its policy (habit) prior
    #utility: goal prior, preference p(o)
    #note: no, nr, nc and agent are taken from the enclosing module's scope
    learn_pol, avg, Rho, learn_habit, utility = par_list
    learn_rew = 1

    """
    create matrices
    """


    #generating probability of observations in each state
    A = np.eye(no)


    #state transition generative probability (matrix)
    B = np.zeros((ns, ns, na))
    b1 = 0.7
    nb1 = 1.-b1
    b2 = 0.7
    nb2 = 1.-b2
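    # two-step task transitions (B[s', s, a]): from the start state 0, action 0 reaches
    # state 1 with probability b1 (common) and state 2 otherwise (rare); action 1 mirrors
    # this with b2. From states 1 and 2 each action leads deterministically to one of the
    # absorbing second-stage states 3-6.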

    B[:,:,0] = np.array([[  0,  0,  0,  0,  0,  0,  0,],
                         [ b1,  0,  0,  0,  0,  0,  0,],
                         [nb1,  0,  0,  0,  0,  0,  0,],
                         [  0,  1,  0,  1,  0,  0,  0,],
                         [  0,  0,  1,  0,  1,  0,  0,],
                         [  0,  0,  0,  0,  0,  1,  0,],
                         [  0,  0,  0,  0,  0,  0,  1,],])

    B[:,:,1] = np.array([[  0,  0,  0,  0,  0,  0,  0,],
                         [nb2,  0,  0,  0,  0,  0,  0,],
                         [ b2,  0,  0,  0,  0,  0,  0,],
                         [  0,  0,  0,  1,  0,  0,  0,],
                         [  0,  0,  0,  0,  1,  0,  0,],
                         [  0,  1,  0,  0,  0,  1,  0,],
                         [  0,  0,  1,  0,  0,  0,  1,],])

    # create reward generation
#
#    C = np.zeros((utility.shape[0], ns))
#
#    vals = np.array([0., 1./5., 0.95, 1./5., 1/5., 1./5.])
#
#    for i in range(ns):
#        C[:,i] = [1-vals[i],vals[i]]
#
#    changes = np.array([0.01, -0.01])
#    Rho = generate_bandit_timeseries(C, nb, trials, changes)

    # agent's beliefs about reward generation
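    # (flat counts of learn_rew everywhere, except that the no-reward outcome is made
    # near-certain for the start and first-stage states 0-2)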

    C_alphas = np.zeros((nr, ns, nc)) + learn_rew
    C_alphas[0,:3,:] = 100
    for i in range(1,nr):
        C_alphas[i,0,:] = 1
#    C_alphas[0,1:,:] = 100
#    for c in range(nb):
#        C_alphas[1,c+1,c] = 100
#        C_alphas[0,c+1,c] = 1
    #C_alphas[:,13] = [100, 1]

    C_agent = np.zeros((nr, ns, nc))
    for c in range(nc):
        C_agent[:,:,c] = np.array([(C_alphas[:,i,c])/(C_alphas[:,i,c]).sum() for i in range(ns)]).T
    #np.array([np.random.dirichlet(C_alphas[:,i]) for i in range(ns)]).T

    # context transition matrix

    transition_matrix_context = np.ones(1)

    """
    create environment (grid world)
    """

    environment = env.MultiArmedBandid(A, B, Rho, trials = trials, T = T)


    """
    create policies
    """

    pol = np.array(list(itertools.product(list(range(na)), repeat=T-1)))

    #pol = pol[-2:]
    npi = pol.shape[0]

    # prior over policies

    prior_pi = np.ones(npi)/npi #np.zeros(npi) + 1e-3/(npi-1)
    #prior_pi[170] = 1. - 1e-3
    alphas = np.zeros((npi, nc)) + learn_pol
#    for i in range(nb):
#        alphas[i+1,i] = 100
    #alphas[170] = 100
    prior_pi = alphas / alphas.sum(axis=0)


    """
    set state prior (where agent thinks it starts)
    """

    state_prior = np.zeros((ns))

    state_prior[0] = 1.

    """
    set action selection method
    """

    if avg:

        sel = 'avg'

        ac_sel = asl.AveragedSelector(trials = trials, T = T,
                                      number_of_actions = na)
    else:

        sel = 'max'

        ac_sel = asl.MaxSelector(trials = trials, T = T,
                                      number_of_actions = na)

#    ac_sel = asl.AveragedPolicySelector(trials = trials, T = T,
#                                        number_of_policies = npi,
#                                        number_of_actions = na)

    prior_context = np.array([1.])

#    prior_context[0] = 1.

    """
    set up agent
    """
    #bethe agent
    if agent == 'bethe':

        agnt = 'bethe'

        pol_par = alphas

        # perception
        bayes_prc = prc.HierarchicalPerception(A, B, C_agent, transition_matrix_context, 
                                          state_prior, utility, prior_pi, 
                                          pol_par, C_alphas, T=T,
                                          pol_lambda=0.3, r_lambda=0.6,
                                          non_decaying=3, dec_temp=4.)

        bayes_pln = agt.BayesianPlanner(bayes_prc, ac_sel, pol,
                          trials = trials, T = T,
                          prior_states = state_prior,
                          prior_policies = prior_pi,
                          number_of_states = ns,
                          prior_context = prior_context,
                          learn_habit = learn_habit,
                          learn_rew=True,
                          #save_everything = True,
                          number_of_policies = npi,
                          number_of_rewards = nr)
    #MF agent
    else:

        agnt = 'mf'

        bayes_prc = prc.MFPerception(A, B, utility, state_prior, T = T)



        bayes_pln = agt.BayesianMFPlanner(bayes_prc, [], ac_sel,
                                  trials = trials, T = T,
                                  prior_states = state_prior,
                                  policies = pol,
                                  number_of_states = ns,
                                  number_of_policies = npi)


    """
    create world
    """

    w = world.World(environment, bayes_pln, trials = trials, T = T)

    """
    simulate experiment
    """

#    w.simulate_experiment(range(trials-100))
#    new_ut = utility.copy()
#    new_ut[1] = utility[0]
#    new_ut /= new_ut.sum()
#    w.agent.perception.reset_preferences(0,new_ut, pol)
#    w.simulate_experiment(range(trials-100, trials))

    w.simulate_experiment(range(trials))


    """
    plot and evaluate results
    """
#    plt.figure()
#
#    for i in range(3,ns):
#        plt.plot(w.environment.Rho[:,1,i], label=str(i))
#
#    plt.ylim([0,1])
#    plt.legend()
#    plt.show()
#
#
#    rewarded = np.where(w.rewards[:trials-1,-1] == 1)[0]
#    unrewarded = np.where(w.rewards[:trials-1,-1] == 0)[0]
#
#    rare = np.append(np.where(w.environment.hidden_states[np.where(w.actions[:,0] == 0)[0]] == 2)[0],
#                     np.where(w.environment.hidden_states[np.where(w.actions[:,0] == 1)[0]] == 1)[0])
#
#    common = np.append(np.where(w.environment.hidden_states[np.where(w.actions[:,0] == 0)[0]] == 1)[0],
#                     np.where(w.environment.hidden_states[np.where(w.actions[:,0] == 1)[0]] == 2)[0])
#
#    names = ["rewarded common", "rewarded rare", "unrewarded common", "unrewarded rare"]
#
#    index_list = [np.intersect1d(rewarded, common), np.intersect1d(rewarded, rare),
#                 np.intersect1d(unrewarded, common), np.intersect1d(unrewarded, rare)]
#
#    stayed_list = [((w.actions[index_list[i],0] - w.actions[index_list[i]+1,0])==0).sum()/len(index_list[i]) for i in range(4)]
#
##    stayed_rew = ((w.actions[rewarded,0] - w.actions[rewarded+1,0]) == 0).sum()/len(rewarded)
##
##    stayed_unrew = ((w.actions[unrewarded,0] - w.actions[unrewarded+1,0]) == 0).sum()/len(unrewarded)
#
#    plt.figure()
#    plt.bar(x=names,height=stayed_list)
#    plt.show()


    return w
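
# A minimal, hypothetical call sketch for the function above (as comments): it assumes
# the module-level names used inside (trials, T, ns, na, no, nr, nc, agent) are defined,
# e.g. no = ns = 7, nr = 2, nc = 1, agent = 'bethe', and that Rho holds the environment's
# reward probabilities per trial; the numeric settings are placeholders.
#
#     utility = np.array([0.01, 0.99])                 # preference p(o) over the nr outcomes
#     w = run_agent([1000, True, Rho, True, utility])  # learn_pol, avg, Rho, learn_habit, utility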