5 | 0.25355 | 1 | 0.254
6 | 0.10478 | 0 | 0.345
7 | 0.09657 | 0 | 0.442
8 | 0.03656 | 0 | 0.478
9 | 0.02772 | 0 | 0.506
10 | 0.01111 | 0 | 0.517
11 | 0.00735 | 0 | 0.524
12 | 0.00310 | 0 | 0.527
13 | 0.00190 | 0 | 0.529
14 | 0.00083 | 0 | 0.530
15 | 0.00049 | 0 | 0.531
16 | 0.00022 | 0 | 0.531
17 | 0.00013 | 0 | 0.531
18 | 0.00006 | 0 | 0.531
19 | 0.00003 | 0 | 0.531"""

Vs_VI, pis_VI = value_iteration(mdp, gamma=GAMMA, nIt=20, grade_print=make_grader(expected_output))

# Visualize the first 10 iterations: each frame shows V reshaped onto the
# 4x4 grid in grayscale, with a magenta arrow marking the greedy action.
for (V, pi) in zip(Vs_VI[:10], pis_VI[:10]):
    plt.figure(figsize=(3, 3))
    plt.imshow(V.reshape(4, 4), cmap='gray', interpolation='none', clim=(0, 1))
    ax = plt.gca()
    ax.set_xticks(np.arange(4) - .5)
    ax.set_yticks(np.arange(4) - .5)
    ax.set_xticklabels([])
    ax.set_yticklabels([])
    Y, X = np.mgrid[0:4, 0:4]
    # Arrow direction for each action (0=left, 1=down, 2=right, 3=up).
    a2uv = {0: (-1, 0), 1: (0, -1), 2: (1, 0), 3: (0, 1)}
    Pi = pi.reshape(4, 4)
    for y in range(4):
        for x in range(4):
            a = Pi[y, x]
            u, v = a2uv[a]
            plt.arrow(x, y, u * .3, -v * .3, color='m', head_width=0.1)
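# Sanity check (a sketch, not part of the assignment): the Bellman backup is a
# gamma-contraction in the max norm, so the max|V-Vprev| column above must
# shrink by at most a factor of GAMMA each iteration. This recomputes that
# column from the iterates returned above, assuming Vs_VI is in scope.
diffs = [np.abs(v1 - v0).max() for v0, v1 in zip(Vs_VI[:-1], Vs_VI[1:])]
for it, d in enumerate(diffs, start=1):
    print("iter %2i: max|V - Vprev| = %7.5f" % (it, d))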
expected_output = """Iteration | # chg actions | V[0] ----------+---------------+--------- 0 | 1 | -0.00000 1 | 9 | 0.00000 2 | 2 | 0.39785 3 | 1 | 0.45546 4 | 0 | 0.53118 5 | 0 | 0.53118 6 | 0 | 0.53118 7 | 0 | 0.53118 8 | 0 | 0.53118 9 | 0 | 0.53118 10 | 0 | 0.53118 11 | 0 | 0.53118 12 | 0 | 0.53118 13 | 0 | 0.53118 14 | 0 | 0.53118 15 | 0 | 0.53118 16 | 0 | 0.53118 17 | 0 | 0.53118 18 | 0 | 0.53118 19 | 0 | 0.53118""" Vs_PI, pis_PI = policy_iteration(mdp, gamma=0.95, nIt=20, grade_print=make_grader(expected_output)) plt.plot(Vs_PI)
def compute_vpi(pi, mdp, gamma):
    # Evaluate a fixed policy exactly. pi[state] gives the action prescribed
    # by the policy in that state. We assemble and solve the linear system
    #   (I - gamma * P_pi) V = R_pi,
    # where P_pi and R_pi are the transition matrix and expected one-step
    # reward under pi.
    A = np.eye(mdp.nS)
    b = np.zeros(mdp.nS)
    for s in range(mdp.nS):
        for p, ns, r in mdp.P[s][pi[s]]:
            A[s, ns] -= gamma * p
            b[s] += p * r
    return np.linalg.solve(A, b)

def compute_qpi(vpi, mdp, gamma):
    # One-step lookahead: Q_pi(s, a) = sum_ns p(ns|s,a) * (r + gamma * V_pi(ns)).
    Qpi = np.zeros([mdp.nS, mdp.nA])
    for s in range(mdp.nS):
        for a in range(mdp.nA):
            for p, ns, r in mdp.P[s][a]:
                Qpi[s, a] += p * (r + gamma * vpi[ns])
    return Qpi

expected_Qpi = np.array([[  0.38 ,   3.135,   1.14 ,   0.095],
                         [  0.57 ,   3.99 ,   2.09 ,   0.95 ],
                         [  1.52 ,   4.94 ,   3.04 ,   1.9  ],
                         [  2.47 ,   5.795,   3.23 ,   2.755],
                         [  3.8  ,   6.935,   4.56 ,   0.855],
                         [  4.75 ,   4.75 ,   4.75 ,   4.75 ],
                         [  4.94 ,   8.74 ,   6.46 ,   2.66 ],
                         [  6.65 ,   6.65 ,   6.65 ,   6.65 ],
                         [  7.6  ,  10.735,   8.36 ,   4.655],
                         [  7.79 ,  11.59 ,   9.31 ,   5.51 ],
                         [  8.74 ,  12.54 ,  10.26 ,   6.46 ],
                         [ 10.45 ,  10.45 ,  10.45 ,  10.45 ],
                         [ 11.4  ,  11.4  ,  11.4  ,  11.4  ],
                         [ 11.21 ,  12.35 ,  12.73 ,   9.31 ],
                         [ 12.16 ,  13.4  ,  14.48 ,  10.36 ],
                         [ 14.25 ,  14.25 ,  14.25 ,  14.25 ]])

Qpi = compute_qpi(np.arange(mdp.nS), mdp, gamma=0.95)
if np.all(np.isclose(expected_Qpi, Qpi, atol=1e-4)):
    print("Test passed")
else:
    print("Expected: ", expected_Qpi)
    print("Actual: ", Qpi)

def policy_iteration(mdp, gamma, nIt, grade_print=print):
    Vs = []
    pis = []
    pi_prev = np.zeros(mdp.nS, dtype='int')
    pis.append(pi_prev)
    grade_print("Iteration | # chg actions | V[0]")
    grade_print("----------+---------------+---------")
    for it in range(nIt):
        # Policy evaluation: exact state values for the current policy,
        # then policy improvement: act greedily with respect to Q_pi.
        vpi = compute_vpi(pi_prev, mdp, gamma)
        qpi = compute_qpi(vpi, mdp, gamma)
        pi = qpi.argmax(axis=1)
        grade_print("%4i | %6i | %6.5f" % (it, (pi != pi_prev).sum(), vpi[0]))
        Vs.append(vpi)
        pis.append(pi)
        pi_prev = pi
    return Vs, pis

expected_output = """Iteration | # chg actions | V[0]
----------+---------------+---------
0 | 1 | -0.00000
1 | 9 | 0.00000
2 | 2 | 0.39785
3 | 1 | 0.45546
4 | 0 | 0.53118
5 | 0 | 0.53118
6 | 0 | 0.53118
7 | 0 | 0.53118
8 | 0 | 0.53118
9 | 0 | 0.53118
10 | 0 | 0.53118
11 | 0 | 0.53118
12 | 0 | 0.53118
13 | 0 | 0.53118
14 | 0 | 0.53118
15 | 0 | 0.53118
16 | 0 | 0.53118
17 | 0 | 0.53118
18 | 0 | 0.53118
19 | 0 | 0.53118"""

Vs_PI, pis_PI = policy_iteration(mdp, gamma=0.95, nIt=20, grade_print=make_grader(expected_output))
plt.plot(Vs_PI);
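# A final sanity check (a sketch, assuming pis_PI from the run above is in
# scope): at convergence the policy is greedy with respect to its own value
# function, so one more improvement step changes nothing; this is exactly why
# the "# chg actions" column reaches 0 and stays there.
vpi_final = compute_vpi(pis_PI[-1], mdp, gamma=0.95)
qpi_final = compute_qpi(vpi_final, mdp, gamma=0.95)
assert (qpi_final.argmax(axis=1) == pis_PI[-1]).all()
print("Policy is a fixed point of the improvement step.")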