        action = policy[row, col, vv, vh]
        seen.append((row, col))
        # print row, col, vv, vh
        r, row, col, vv, vh = generate_reward_and_next_state(row, col, vv, vh, action, race_track)
        if r == 0:
            seen.append((row, col))
            break
    return seen

start = time.time()
eps = 0.1
for k in xrange(100000):
    row, col, vv, vh = generate_start_state()
    while True:
        # epsilon-greedy behaviour policy
        action = generate_action(policy[row, col, vv, vh], eps)
        reward, new_row, new_col, new_vv, new_vh = generate_reward_and_next_state(row, col, vv, vh, action, race_track)
        # Q-learning update: bootstrap on the best action in the next state
        Q[row, col, vv, vh, action] += alpha * (reward + gamma * np.amax(Q[new_row, new_col, new_vv, new_vh, :]) - Q[row, col, vv, vh, action])
        # keep the greedy policy in sync with Q
        policy[row, col, vv, vh] = np.argmax(Q[row, col, vv, vh, :])
        row, col, vv, vh = new_row, new_col, new_vv, new_vh
        if game_over(row, col, race_track):
            break
print time.time() - start

for k in xrange(6, 12):
    race_track = track()
    been = run(k)
    # mark every visited cell so the trajectory shows up in the plot
    for state in been:
        race_track[state] = 5
    plt.figure()
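# The loops above lean on helpers defined earlier in the post
# (generate_start_state, generate_reward_and_next_state, game_over, track).
# As a rough sketch of what generate_action is assumed to do, namely an
# epsilon-greedy choice over the `actions` indices used elsewhere in the
# script, something like the following would fit; the body here is an
# assumption, not the original definition:
def generate_action(greedy_action, eps):
    # with probability eps explore uniformly at random,
    # otherwise follow the current greedy policy
    if np.random.rand() < eps:
        return np.random.randint(actions)
    return greedy_action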
            break
        action = policy[row, col, vv, vh]
        seen.append((row, col))
        # print row, col, vv, vh
        r, row, col, vv, vh = generate_reward_and_next_state(row, col, vv, vh, action, race_track)
        if r == 0:
            break
    return seen

start = time.time()
eps = 0.1
for k in xrange(1000):
    # fresh eligibility traces at the start of every episode
    E = np.zeros((height, width, n_vv, n_vh, actions))
    row, col, vv, vh = generate_start_state()
    action = generate_action(policy[row, col, vv, vh], eps)
    while True:
        reward, new_row, new_col, new_vv, new_vh = generate_reward_and_next_state(row, col, vv, vh, action, race_track)
        new_action = generate_action(policy[new_row, new_col, new_vv, new_vh], eps)
        # one-step SARSA TD error
        delta = reward + gamma * Q[new_row, new_col, new_vv, new_vh, new_action] - Q[row, col, vv, vh, action]
        # dutch-style trace bump for the visited state-action pair
        E[row, col, vv, vh, action] = (1 - alpha) * E[row, col, vv, vh, action] + 1
        # every traced pair shares in the TD error
        Q = Q + alpha * delta * E
        # decay all traces (textbook SARSA(lambda) would decay by gamma * lamb)
        E = E * alpha * lamb
        policy = np.argmax(Q, axis=4)
        row, col, vv, vh, action = new_row, new_col, new_vv, new_vh, new_action
        if game_over(row, col, race_track):
            break
print time.time() - start

for k in xrange(6, 12):
    race_track = track()
    been = run(k)
    for state in been:
        race_track[state] = 5
    plt.figure()
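# To see what the trace bookkeeping above does in isolation, here is a toy
# run of the same three update lines on a 3-state, single-action problem
# (toy_ names are used so nothing in the real script gets clobbered; all
# numbers are made up for illustration):
toy_alpha, toy_lamb = 0.5, 0.8
toy_Q = np.zeros(3)   # one value per state
toy_E = np.zeros(3)   # one trace per state
for s, toy_delta in [(0, 1.0), (1, 1.0), (2, -0.5)]:
    toy_E[s] = (1 - toy_alpha) * toy_E[s] + 1       # bump the visited state's trace
    toy_Q = toy_Q + toy_alpha * toy_delta * toy_E   # all traced states share the error
    toy_E = toy_E * toy_alpha * toy_lamb            # decay, exactly as in the loop above
    print toy_Q, toy_E
# After the second step toy_Q is [0.7, 0.5, 0.]: the TD error at state 1 has
# leaked back into state 0 through its still-nonzero trace.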
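# The figures themselves are not reproduced here. Both visualization loops
# assume track() returns a 2-D numpy array encoding the course, with visited
# cells set to 5 so the driven line stands out. One plausible way to render
# each figure, e.g. for the last race_track built above (the imshow and show
# calls are an assumption; the original only opens the figure):
plt.imshow(race_track, interpolation='nearest')
plt.show()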