def generate_policy():
    """Overwrite ``pi`` with the best-scoring action for every state.

    For each state ``(i, j)`` with ``0 <= i, j <= max_car`` every action ``k``
    in ``-max_move .. max_move`` is scored with
    ``car_and_reward(i, j, k, V)`` and the action with the highest score is
    stored in ``pi[i, j]``.  When several actions share the highest score the
    smallest action wins, because ``np.argmax`` reports the first maximum.

    Reads the module-level ``max_car``, ``max_move``, ``V`` and
    ``car_and_reward``; mutates the module-level ``pi`` in place.  Returns
    ``None``.
    """
    # The score buffer and the index -> action offset are derived from
    # max_move.  A fixed size of 11 / offset of 5 is only correct when
    # max_move == 5: a smaller max_move leaves zero padding that can win the
    # argmax, a larger one overruns the buffer.
    n_actions = 2 * max_move + 1
    for i in xrange(max_car + 1):
        for j in xrange(max_car + 1):
            rewardlist = np.zeros(n_actions)
            for index, k in enumerate(xrange(-max_move, max_move + 1)):
                rewardlist[index] = car_and_reward(i, j, k, V)
            # Slot 0 holds action -max_move, so shift the index back.
            pi[i, j] = np.argmax(rewardlist) - max_move
def policy_eval():
    """Sweep ``V`` under the current ``pi`` until it stops changing.

    Each sweep visits every state ``(row, col)`` with both indices in
    ``0 .. max_car`` in row-major order and replaces ``V[row, col]`` with
    ``car_and_reward(row, col, pi[row, col], V)``.  Updates are written in
    place, so later states in a sweep already see the new values.  Sweeping
    stops after the first sweep whose largest absolute change is below 1e-9.

    Reads the module-level ``max_car``, ``pi`` and ``car_and_reward``;
    mutates the module-level ``V``.  Returns ``None``.
    """
    converged = False
    while not converged:
        largest_change = 0
        for row in xrange(max_car + 1):
            for col in xrange(max_car + 1):
                previous = V[row, col]
                V[row, col] = car_and_reward(row, col, pi[row, col], V)
                change = abs(previous - V[row, col])
                # Track the biggest per-state change seen in this sweep.
                if change > largest_change:
                    largest_change = change
        converged = largest_change < 1e-9
def policy_eval():
    """Repeatedly re-score every state under ``pi`` until ``V`` settles.

    A sweep walks the states ``(a, b)`` for ``a, b`` in ``0 .. max_car``
    (outer loop over the first index) and assigns
    ``car_and_reward(a, b, pi[a, b], V)`` to ``V[a, b]`` in place.  The
    function returns once a full sweep changes no entry by 1e-9 or more.

    Uses the module-level ``max_car``, ``pi``, ``V`` and ``car_and_reward``;
    only ``V`` is modified.  Returns ``None``.
    """
    while True:
        residual = 0
        for a in xrange(max_car+1):
            for b in xrange(max_car+1):
                state = (a, b)
                before = V[state]
                V[state] = car_and_reward(a, b, pi[state], V)
                residual = max(residual, abs(before-V[state]))
        # Nothing follows the loop, so returning here ends the evaluation.
        if residual < 1e-9:
            return
def generate_policy():
    """Set ``pi[i, j]`` to the highest-scoring action for every state.

    States are all ``(i, j)`` with ``0 <= i, j <= max_car``; candidate
    actions are the integers ``-max_move .. max_move``.  Each candidate is
    scored by ``car_and_reward(i, j, action, V)``.  Ties are resolved in
    favour of the smallest action (``np.argmax`` returns the first maximum).

    Reads the module-level ``max_car``, ``max_move``, ``V`` and
    ``car_and_reward``; writes the module-level ``pi``.  Returns ``None``.
    """
    for i in xrange(max_car + 1):
        for j in xrange(max_car + 1):
            # One score per legal action, so the buffer length always follows
            # max_move.  A fixed 11-slot buffer is only right for
            # max_move == 5 and otherwise either pads with zeros that can win
            # the argmax or is too short.
            scores = np.array(
                [car_and_reward(i, j, k, V)
                 for k in xrange(-max_move, max_move + 1)],
                dtype=float,
            )
            # Index 0 corresponds to action -max_move.
            pi[i, j] = np.argmax(scores) - max_move
def policy_improv():
    """Make ``pi`` greedy with respect to ``V`` and report whether it changed.

    For each state ``(i, j)`` with ``0 <= i, j <= max_car`` every action ``k``
    in ``-max_move .. max_move`` is scored with
    ``car_and_reward(i, j, k, V)`` and the highest-scoring action replaces
    ``pi[i, j]``.  Ties go to the smallest action because ``np.argmax``
    reports the first maximum.

    Reads the module-level ``max_car``, ``max_move``, ``V`` and
    ``car_and_reward``; mutates the module-level ``pi`` in place.

    Returns:
        ``True`` if no entry of ``pi`` was changed by this pass, ``False``
        if at least one entry now holds a different action.
    """
    # Buffer size and index offset follow max_move.  Hard-coding 11 and 5 is
    # only correct for max_move == 5: with fewer actions the zero padding can
    # win the argmax, with more the buffer is overrun.
    n_actions = 2 * max_move + 1
    policy_stable = True
    for i in xrange(max_car + 1):
        for j in xrange(max_car + 1):
            old_action = pi[i, j]
            rewardlist = np.zeros(n_actions)
            for index, k in enumerate(xrange(-max_move, max_move + 1)):
                rewardlist[index] = car_and_reward(i, j, k, V)
            # Slot 0 holds action -max_move, so shift the index back.
            pi[i, j] = np.argmax(rewardlist) - max_move
            if old_action != pi[i, j]:
                policy_stable = False
    return policy_stable
def policy_improv():
    """Replace every ``pi[i, j]`` with the best-scoring action under ``V``.

    States are all ``(i, j)`` with ``0 <= i, j <= max_car``; candidate
    actions are the integers ``-max_move .. max_move``, each scored by
    ``car_and_reward(i, j, action, V)``.  When scores tie, the smallest
    action is chosen (``np.argmax`` returns the first maximum).

    Reads the module-level ``max_car``, ``max_move``, ``V`` and
    ``car_and_reward``; writes the module-level ``pi``.

    Returns:
        ``True`` when the pass left ``pi`` untouched, ``False`` when any
        state's action was changed.
    """
    policy_stable = True
    for i in xrange(max_car+1):
        for j in xrange(max_car+1):
            previous = pi[i, j]
            # One score per legal action: the length tracks max_move instead
            # of assuming 11 actions, which is only right for max_move == 5.
            scores = np.array(
                [car_and_reward(i, j, k, V)
                 for k in xrange(-max_move, max_move+1)],
                dtype=float,
            )
            # Index 0 corresponds to action -max_move.
            pi[i, j] = np.argmax(scores)-max_move
            if previous != pi[i, j]:
                policy_stable = False
    return policy_stable
def policy_iter():
    """Sweep ``V`` with a max-over-actions update until it converges.

    Each sweep visits every state ``(i, j)`` with ``0 <= i, j <= max_car`` in
    row-major order and sets ``V[i, j]`` to the largest value of
    ``car_and_reward(i, j, k, V)`` over the actions ``k`` in
    ``-max_move .. max_move``.  Updates are made in place, so later states in
    a sweep already see the new values.  Sweeping stops after the first sweep
    whose largest absolute change is below 1e-9.

    NOTE(review): ``pi`` is never read or written here; the update looks like
    value iteration rather than policy iteration, assuming ``car_and_reward``
    returns the expected return of an action -- confirm against its
    definition.

    Reads the module-level ``max_car``, ``max_move`` and ``car_and_reward``;
    mutates the module-level ``V``.  Returns ``None``.
    """
    # Size the buffer from max_move.  A fixed 11 slots is only right for
    # max_move == 5: with fewer actions the unused zero slots beat any
    # all-negative scores in np.amax, with more the buffer is overrun.
    n_actions = 2 * max_move + 1
    while True:
        delta = 0
        for i in xrange(max_car + 1):
            for j in xrange(max_car + 1):
                temp = V[i, j]
                rewardlist = np.zeros(n_actions)
                for index, k in enumerate(xrange(-max_move, max_move + 1)):
                    rewardlist[index] = car_and_reward(i, j, k, V)
                V[i, j] = np.amax(rewardlist)
                delta = max(delta, abs(temp - V[i, j]))
        if delta < 1e-9:
            break
def policy_iter():
    """Iterate ``V[i, j] = max_k car_and_reward(i, j, k, V)`` to convergence.

    States are all ``(i, j)`` with ``0 <= i, j <= max_car``, visited with the
    first index in the outer loop; actions ``k`` range over
    ``-max_move .. max_move``.  Values are overwritten in place during a
    sweep.  The function returns once a full sweep changes no entry of ``V``
    by 1e-9 or more.

    NOTE(review): despite the name, this never touches ``pi``; it appears to
    be a value-iteration sweep, presuming ``car_and_reward`` yields an
    action's expected return -- verify against that helper.

    Reads the module-level ``max_car``, ``max_move`` and ``car_and_reward``;
    only ``V`` is modified.  Returns ``None``.
    """
    while True:
        delta = 0
        for i in xrange(max_car + 1):
            for j in xrange(max_car + 1):
                temp = V[i, j]
                # Exactly one score per legal action.  Assuming 11 actions
                # is only right for max_move == 5; otherwise zero padding
                # can win the max or the buffer is too short.
                scores = np.array(
                    [car_and_reward(i, j, k, V)
                     for k in xrange(-max_move, max_move + 1)],
                    dtype=float,
                )
                V[i, j] = np.amax(scores)
                delta = max(delta, abs(temp - V[i, j]))
        if delta < 1e-9:
            break