Ejemplo n.º 1
0
def policyIter(gamma=0.5):
    """Run policy iteration and return the converged policy.

    Each round evaluates the current policy with ``solveV``, builds a
    Q-table from the transition model and picks the best action per state.
    The loop ends when a round leaves the policy unchanged.

    Args:
        gamma: Discount factor forwarded to ``solveV`` and applied to every
            Q-value.

    Returns:
        Array with one action index per row of the Q-table (``num_s`` rows).
    """
    policy = initPolicy()
    converged = False
    while not converged:
        # Policy evaluation: value of every state under the current policy.
        values = solveV(policy, gamma)

        # Only the first s_stop rows are filled; the remaining rows stay
        # all-zero, so argmax selects action 0 for them.
        q_table = np.zeros((num_s, num_a))
        for state in range(s_stop):
            for action in range(num_a):
                expected = np.dot(darts.transModel(state, action), values)
                q_table[state, action] = gamma * expected

        # Policy improvement: greedy action for each state.
        improved = np.argmax(q_table, axis=1)

        # Stop once the greedy policy equals the policy just evaluated.
        converged = np.array_equal(policy, improved)
        policy = improved
    return policy
Ejemplo n.º 2
0
def policyIter(gamma=0.5):
    """Iterate policy evaluation and greedy improvement until stable.

    Args:
        gamma: Discount factor passed to ``solveV`` and used to scale the
            expected next-state value in each Q-table entry.

    Returns:
        The first greedy policy that equals the policy it was derived from,
        as an array of action indices (one per Q-table row).
    """
    current = initPolicy()
    while True:
        # Solve the linear system for V(s) under the current policy.
        state_values = solveV(current, gamma)

        # Q(s, a) for s < s_stop; rows from s_stop on are left at zero.
        action_values = np.zeros((num_s, num_a))
        for s, a in np.ndindex(s_stop, num_a):
            action_values[s, a] = gamma * np.dot(
                darts.transModel(s, a), state_values
            )

        # Index of the maximum Q(s, a) along the action axis.
        greedy = np.argmax(action_values, axis=1)

        # Convergence: the improvement step no longer changes anything.
        if np.array_equal(current, greedy):
            return greedy
        current = greedy
Ejemplo n.º 3
0
def solveV(pi, gamma=0.5):
    """Calculate the value of each state under the policy ``pi``.

    The entries at index 101 and from 102 up to ``num_s`` are fixed to +1 and
    -1 respectively. The first ``s_stop`` entries are found by solving the
    linear system

        (T_sub - I / gamma) v_sub = sum(T[:, 102:117], axis=1) - T[:, 101]

    which is ``v_sub = gamma * (T_sub v_sub + T[:, 101] - sum(T[:, 102:117]))``
    rearranged, with ``T = darts.transModel(states, pi[:s_stop])``.

    Args:
        pi: Policy array; only ``pi[:s_stop]`` is read.
        gamma: Discount factor. ``gamma == 0`` is handled without solving,
            because the system divides by ``gamma``.

    Returns:
        Array of length ``num_s`` with the state values.

    Raises:
        numpy.linalg.LinAlgError: If the linear system is singular.
    """
    v = np.zeros(num_s)

    # These are set because they end the game.
    v[101] = 1
    v[102:num_s] = -1

    if gamma == 0:
        # I / gamma is undefined here; v_sub = gamma * (...) gives all zeros.
        v[:s_stop] = 0.0
        return v

    # Sub-arrays of the states and of the policy up to s_stop.
    sSub = np.arange(s_stop, dtype=float)
    piSub = pi[:s_stop]

    # One transition row per state in sSub under the action chosen by pi.
    trans = darts.transModel(sSub, piSub)
    a = trans[:, :s_stop] - np.identity(s_stop) / gamma

    # Contribution of the fixed-value states, moved to the right-hand side.
    # NOTE(review): the upper bound 117 is hard-coded while the assignment
    # above uses num_s -- confirm that num_s == 117.
    b = np.sum(trans[:, 102:117], 1) - trans[:, 101]

    v[:s_stop] = np.linalg.solve(a, b)

    return v
Ejemplo n.º 4
0
def solveV(pi, gamma=0.5):
    """Calculate the value of each state given the policy ``pi``.

    Index 101 is pinned to +1 and indices 102..``num_s``-1 to -1. Values for
    the first ``s_stop`` states come from solving ``a @ v_sub = b`` where

        a = T[:, :s_stop] - I / gamma
        b = sum(T[:, 102:117], axis=1) - T[:, 101]

    and ``T`` is ``darts.transModel`` evaluated on those states with the
    actions from ``pi``. This is ``v_sub = gamma * (T @ v)`` with the pinned
    values substituted and moved to the right-hand side.

    Args:
        pi: Policy array; only its first ``s_stop`` entries are used.
        gamma: Discount factor. A value of 0 short-circuits the solve, since
            the matrix ``a`` divides by ``gamma``.

    Returns:
        Array of length ``num_s`` holding the value of every state.

    Raises:
        numpy.linalg.LinAlgError: If ``a`` is singular.
    """
    v = np.zeros(num_s)

    # These are set because they end the game.
    v[101] = 1
    v[102:num_s] = -1

    if gamma == 0:
        # Avoid the division by zero below: with gamma == 0 the equation
        # v_sub = gamma * (...) has the exact solution v_sub = 0.
        v[:s_stop] = 0.0
        return v

    # Sub-arrays of s and pi up to s_stop.
    sSub = np.arange(s_stop, dtype=float)
    piSub = pi[:s_stop]

    trans = darts.transModel(sSub, piSub)

    # Coefficient matrix of the unknown values.
    a = trans[:, :s_stop] - np.identity(s_stop) / gamma

    # Right-hand side built from the pinned +1 / -1 states.
    # NOTE(review): 117 is a literal here but num_s is used above -- verify
    # that the two agree.
    b = np.sum(trans[:, 102:117], 1) - trans[:, 101]

    v[:s_stop] = np.linalg.solve(a, b)

    return v