Example No. 1
import util.functions


def simulate_givenstart(model, agent, s, t_max):
    '''
    Simulate an MDP from the given start state s for t_max timesteps or
    until a terminal state is reached.  Returns a list
        [ (s_0, a_0, r_0), (s_1, a_1, r_1), ...]
    '''

    # The start state s is supplied by the caller rather than sampled
    # from an initial-state distribution.
    result = []
    t = 0
    while t < t_max and not model.is_terminal(s):
        a = agent.sample(s)
        s_p = util.functions.sample(model.T(s, a))
        r = model.R(s, a)

        result.append((s, a, r))
        s = s_p
        t += 1
    # If the loop ended at a terminal state, record one final (s, a, r) entry.
    if model.is_terminal(s):
        a = agent.sample(s)
        r = model.R(s, a)

        result.append((s, a, r))

    return result
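The trajectories returned above are plain lists of (s, a, r) tuples, so they can be post-processed without touching the model. As a minimal, hypothetical sketch (not part of the original utilities), the discounted return of one episode can be computed directly from such a list, assuming the same discount factor that the model exposes as model.gamma:

def discounted_return(trajectory, gamma):
    # Sum r_t * gamma**t over one episode's (s, a, r) tuples.
    return sum(r * gamma ** t for t, (s, a, r) in enumerate(trajectory))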
Example No. 2
    def iter(cls, model, Q):
        V = util.classes.NumMap()
        # Compute V(s) = max_{a} Q(s,a)
        for s in model.S():
            V_s = util.classes.NumMap()
            for a in model.A(s):
                V_s[a] = Q[(s, a)]
            if len(V_s) > 0:
                V[s] = V_s.max()
            else:
                V[s] = 0.0

        # QQ(s,a) = R(s,a) + gamma*sum_{s'} T(s,a,s')*V(s')
        QQ = util.classes.NumMap()
        for s in model.S():
            for a in model.A(s):
                value = model.R(s, a)
                T = model.T(s, a)
                value += model.gamma * sum(t * V[s_prime] for s_prime, t in T.items())
                QQ[(s, a)] = value

        # To obtain a log policy, find the maximum Q-value at each state and
        # build a new Q with each entry set to Q(s,a) - (that state's max).

        return QQ
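The comment in the code spells out the Bellman backup QQ(s,a) = R(s,a) + gamma*sum_{s'} T(s,a,s')*V(s'). As an illustration only, the same backup can be written with plain dictionaries in place of the project's NumMap and model classes; the dict layouts below are assumptions for the sketch, not the original API:

def bellman_backup(R, T, V, gamma):
    # R: {(s, a): reward}, T: {(s, a): {s_prime: prob}}, V: {s: value}
    return {
        (s, a): r + gamma * sum(p * V[s_p] for s_p, p in T[(s, a)].items())
        for (s, a), r in R.items()
    }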
Example No. 3
import util.functions


def sample_model(model, n_samples, distr, agent):
    '''
    Sample transitions (s, a, r, s') where each s is drawn from the
    distribution distr and a from the agent's policy.  Returns
        [(s_0,a_0,r_0,s_p_0), (s_1,a_1,r_1,s_p_1),...]
    '''
    result = []
    for _ in range(n_samples):
        s = util.functions.sample(distr)
        a = agent.sample(s)
        r = model.R(s, a)
        s_p = util.functions.sample(model.T(s, a))
        result.append((s, a, r, s_p))
    return result
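One common use for such (s, a, r, s') batches is an off-policy update. A minimal sketch, assuming a finite action set `actions` and tabular Q-values (neither of which is defined in the original code, and the step sizes are arbitrary):

from collections import defaultdict

def q_learning_update(samples, actions, gamma=0.95, alpha=0.1):
    # One Q-learning pass over a batch of (s, a, r, s') transitions.
    Q = defaultdict(float)
    for s, a, r, s_p in samples:
        target = r + gamma * max(Q[(s_p, a_p)] for a_p in actions)
        Q[(s, a)] += alpha * (target - Q[(s, a)])
    return Q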
Example No. 4
import math

import util.classes


def QValueSoftMaxSolve(model, thresh=1.0):
    '''
    Soft value iteration: repeat the softmax Bellman backup until the
    value function changes by less than thresh, then return
    logp[(s, a)] = Q(s, a) - v(s), the log of the softmax policy.
    '''
    v = util.classes.NumMap()
    for s in model.S():
        v[s] = 0.0

    diff = 100.0
    while diff >= thresh:
        vp = v

        # Q(s,a) = R(s,a) + gamma*sum_{s'} T(s,a,s')*v(s')
        Q = util.classes.NumMap()
        for s in model.S():
            for a in model.A(s):
                value = model.R(s, a)
                T = model.T(s, a)
                value += model.gamma * sum(t * v[s_prime] for s_prime, t in T.items())
                Q[(s, a)] = value

        # v(s) = log sum_a exp(Q(s,a)), computed with the per-state max
        # subtracted out (log-sum-exp trick) for numerical stability.
        v = util.classes.NumMap()
        for s in model.S():
            maxx = None
            for a in model.A(s):
                if maxx is None or Q[(s, a)] > maxx:
                    maxx = Q[(s, a)]

            e_sum = 0.0
            for a in model.A(s):
                e_sum += math.exp(Q[(s, a)] - maxx)

            v[s] = maxx + math.log(e_sum)

        diff = max(abs(value - vp[s]) for (s, value) in v.items())

    logp = util.classes.NumMap()
    for (sa, value) in Q.items():
        logp[sa] = value - v[sa[0]]
    return logp
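The value update above is the standard log-sum-exp trick: subtracting the per-state maximum before exponentiating keeps math.exp from overflowing while leaving v(s) = log sum_a exp(Q(s,a)) unchanged. A stand-alone illustration of the same computation:

import math

def log_sum_exp(values):
    # Numerically stable log(sum(exp(v) for v in values)).
    m = max(values)
    return m + math.log(sum(math.exp(v - m) for v in values))

For example, log_sum_exp([1000.0, 1000.0]) returns 1000.0 + log(2), where exponentiating 1000.0 directly would overflow.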
Example No. 5
    def iter(cls, model, Q):
        V = util.classes.NumMap()
        # Compute V(s) = max_{a} Q(s,a)
        for s in model.S():
            V_s = util.classes.NumMap()
            for a in model.A(s):
                V_s[a] = Q[(s, a)]
            if len(V_s) > 0:
                V[s] = V_s.max()
            else:
                V[s] = 0.0

        # QQ(s,a) = R(s,a) + gamma*sum_{s'} T(s,a,s')*V(s')
        QQ = util.classes.NumMap()
        for s in model.S():
            for a in model.A(s):
                value = model.R(s, a)
                T = model.T(s, a)
                value += model.gamma * sum(t * V[s_prime] for s_prime, t in T.items())
                QQ[(s, a)] = value
        return QQ
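iter performs a single sweep of the backup; repeating such sweeps until the Q-values stop changing gives plain Q-value iteration. A dict-based sketch of that loop, using the same assumed layouts as the bellman_backup illustration above rather than the project's classes:

def q_value_iteration(R, T, gamma, tol=1e-6):
    # R: {(s, a): reward}, T: {(s, a): {s_prime: prob}}
    Q = {sa: 0.0 for sa in R}
    while True:
        V = {}
        for (s, a), q in Q.items():
            V[s] = max(V.get(s, float('-inf')), q)
        Q_new = {
            (s, a): R[(s, a)] + gamma * sum(p * V[s_p] for s_p, p in T[(s, a)].items())
            for (s, a) in R
        }
        if max(abs(Q_new[sa] - Q[sa]) for sa in Q) < tol:
            return Q_new
        Q = Q_new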
Example No. 6
import patrol.model
import util.functions


def multi_simulate(model, policies, initials, t_max, interactionlength):
    # policies = [ policy1, policy2, equilibrium1, equilibrium2 ]

    result = []
    Ss = []
    for initial in initials:

        Ss.append(util.functions.sample(initial))
        result.append([])
    t = 0
    atTerminal = False

    interactionCooldown = [-1 for i in range(len(policies))]

    while t < t_max and not atTerminal:
        actions = [None for _ in initials]

        # Trigger an interaction when both equilibrium policies exist, no
        # cooldown is active, and two agents' current or previous states
        # conflict with each other.
        if (policies[2] is not None and policies[3] is not None
                and interactionCooldown[0] < 0 and interactionCooldown[1] < 0):
            for (i, s) in enumerate(Ss):
                for (j, s2) in enumerate(Ss):
                    if i != j:
                        if s2.conflicts(s) or (
                                t > 0 and
                                (result[i][t - 1][0].conflicts(s2)
                                 or s.conflicts(result[j][t - 1][0]))):
                            interactionCooldown[0] = interactionlength
                            interactionCooldown[1] = interactionlength

        for (i, a) in enumerate(actions):
            if interactionCooldown[i] <= 0:
                # No interaction pending: act according to the agent's policy.
                actions[i] = policies[i].sample(Ss[i])
            elif interactionCooldown[i] > 1:
                # Mid-interaction: stay stopped.
                actions[i] = patrol.model.PatrolActionStop()
            else:
                # Last step of the interaction: sample the equilibrium policy,
                # falling back to the agent's own policy for forward moves.
                actions[i] = util.functions.sample(policies[2 + i])
                if actions[i].__class__.__name__ == "PatrolActionMoveForward":
                    actions[i] = policies[i].sample(Ss[i])

            interactionCooldown[i] -= 1

        for (i, a) in enumerate(actions):
            # Intermediate steps record only (state, action) pairs; a final
            # (state, action, reward) tuple is appended after the loop if a
            # terminal state was reached.
            result[i].append((Ss[i], actions[i]))
            Ss[i] = util.functions.sample(model.T(Ss[i], actions[i]))
            if model.is_terminal(Ss[i]):
                atTerminal = True

        t += 1

    if atTerminal:
        for (i, s) in enumerate(Ss):
            a = policies[i].sample(s)
            r = model.R(s, a)

            result[i].append((s, a, r))

    return result
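The trajectories returned here interleave normal policy steps with forced stops whenever two agents interact. As a small hypothetical helper (not in the original code), the number of interaction stops per agent can be counted from the result, assuming actions expose their class name as above:

def count_interaction_stops(result):
    # result: one trajectory per agent, each a list of (s, a) or (s, a, r) tuples.
    return [
        sum(1 for step in traj
            if step[1].__class__.__name__ == 'PatrolActionStop')
        for traj in result
    ]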