        # Follow the learned greedy policy and record every visited cell.
        action = policy[row, col, vv, vh]
        seen.append((row, col))
        # print row, col, vv, vh
        r, row, col, vv, vh = generate_reward_and_next_state(row, col, vv, vh, action, race_track)
        if r == 0:
            # A zero reward ends the rollout; record the final cell as well.
            seen.append((row, col))
            break
    return seen
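
# run() itself is only partially visible in this excerpt (its opening lines
# are cut off above). A reconstruction based on the visible tail; treating k
# as a starting column on the start line, and starting at row 0 with zero
# velocity, are assumptions -- the original may pick the start differently.
def run(k):
    seen = []
    row, col, vv, vh = 0, k, 0, 0  # assumed start state for run k
    while True:
        action = policy[row, col, vv, vh]  # always follow the greedy policy
        seen.append((row, col))
        r, row, col, vv, vh = generate_reward_and_next_state(row, col, vv, vh, action, race_track)
        if r == 0:                         # zero reward: finish reached
            seen.append((row, col))
            break
    return seen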

start = time.time()

eps = 0.1
# Q-learning: epsilon-greedy behaviour with an off-policy greedy bootstrap.
for k in xrange(100000):
    row, col, vv, vh = generate_start_state()
    while True:
        action = generate_action(policy[row, col, vv, vh], eps)
        reward, new_row, new_col, new_vv, new_vh = generate_reward_and_next_state(row, col, vv, vh, action, race_track)
        # TD update towards reward + gamma * max_a Q(next state, a).
        Q[row, col, vv, vh, action] += alpha*(reward+gamma*np.amax(Q[new_row, new_col, new_vv, new_vh, :])-Q[row, col, vv, vh, action])
        # Keep the stored policy greedy with respect to the updated values.
        policy[row, col, vv, vh] = np.argmax(Q[row, col, vv, vh, :])
        row, col, vv, vh = new_row, new_col, new_vv, new_vh
        if game_over(row, col, race_track):
            break

print time.time()-start
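
# Actions during training are drawn by generate_action(greedy_action, eps),
# which is defined outside this excerpt. A minimal epsilon-greedy sketch of
# what it presumably does (the uniform-random fallback is an assumption):
import numpy as np

def generate_action(greedy_action, eps):
    # With probability eps explore a uniformly random action,
    # otherwise keep the greedy action read from the policy table.
    if np.random.rand() < eps:
        return np.random.randint(actions)
    return greedy_action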

# Plot the greedy trajectories: mark every visited cell on a fresh track.
for k in xrange(6, 12):
    race_track = track()
    been = run(k)
    for state in been:
        race_track[state] = 5  # 5 marks a visited cell for the plot
    plt.figure()

Example #2

            break
        action = policy[row, col, vv, vh]
        seen.append((row, col))
        # print row, col, vv, vh
        r, row, col, vv, vh = generate_reward_and_next_state(row, col, vv, vh, action, race_track)
        if r == 0:
            break
    return seen
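
# The training loops in this listing assume the value table Q, the policy
# table and the constants alpha, gamma and lamb already exist, together with
# the dimensions height, width, n_vv, n_vh and actions; all of that is set
# up before this excerpt. A minimal sketch of such a setup (the zero
# initialisation and the numeric constants are assumptions, not the
# original values):
import numpy as np

alpha = 0.1   # assumed step size
gamma = 1.0   # assumed discount factor
lamb = 0.9    # assumed trace-decay parameter (lambda)
Q = np.zeros((height, width, n_vv, n_vh, actions))  # one value per state-action pair
policy = np.argmax(Q, axis=4)                       # greedy action for every state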

start = time.time()

eps = 0.1
# SARSA(lambda): on-policy TD control with eligibility traces.
for k in xrange(1000):
    E = np.zeros((height, width, n_vv, n_vh, actions))  # eligibility traces, reset each episode
    row, col, vv, vh = generate_start_state()
    action = generate_action(policy[row, col, vv, vh], eps)
    while True:
        reward, new_row, new_col, new_vv, new_vh = generate_reward_and_next_state(row, col, vv, vh, action, race_track)
        new_action = generate_action(policy[new_row, new_col, new_vv, new_vh], eps)
        # TD error for the on-policy (SARSA) target.
        delta = reward + gamma*Q[new_row, new_col, new_vv, new_vh, new_action] - Q[row, col, vv, vh, action]
        # Dutch-style trace bump for the visited state-action pair.
        E[row, col, vv, vh, action] = (1-alpha)*E[row, col, vv, vh, action]+1
        # Credit all recently visited pairs in proportion to their traces.
        Q = Q + alpha*delta*E
        E = E*gamma*lamb  # decay every trace by gamma * lambda
        policy = np.argmax(Q, axis=4)  # keep the policy greedy w.r.t. Q
        row, col, vv, vh, action = new_row, new_col, new_vv, new_vh, new_action
        if game_over(row, col, race_track):
            break

print time.time()-start
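
# Every snippet steps the environment through
# generate_reward_and_next_state(row, col, vv, vh, action, race_track),
# defined outside this excerpt. The sketch below only illustrates the usual
# racetrack dynamics; the 3x3 action encoding, the cell codes (0 = off-track,
# 3 = finish line) and the -1-per-step reward are all assumptions, not taken
# from the original code.
def generate_reward_and_next_state(row, col, vv, vh, action, race_track):
    dvv, dvh = action // 3 - 1, action % 3 - 1   # assumed: 9 actions = -1/0/+1 increments
    vv = min(max(vv + dvv, 0), n_vv - 1)         # clip vertical velocity to the table range
    vh = min(max(vh + dvh, 0), n_vh - 1)         # clip horizontal velocity to the table range
    new_row, new_col = row - vv, col + vh        # move up and to the right
    if 0 <= new_row < height and 0 <= new_col < width and race_track[new_row, new_col] != 0:
        if race_track[new_row, new_col] == 3:    # assumed finish-line code
            return 0, new_row, new_col, vv, vh   # zero reward marks the finish
        return -1, new_row, new_col, vv, vh      # otherwise -1 per time step
    # Left the track: restart from a random start state with zero velocity.
    row, col, vv, vh = generate_start_state()
    return -1, row, col, vv, vh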

for k in xrange(6, 12):
Example #3
        seen.append((row, col))
        # print row, col, vv, vh
        r, row, col, vv, vh = generate_reward_and_next_state(
            row, col, vv, vh, action, race_track)
        if r == 0:
            break
    return seen


start = time.time()

eps = 0.1
# SARSA(lambda) again: the same loop as in Example #2 above.
for k in xrange(1000):
    E = np.zeros((height, width, n_vv, n_vh, actions))
    row, col, vv, vh = generate_start_state()
    action = generate_action(policy[row, col, vv, vh], eps)
    while True:
        reward, new_row, new_col, new_vv, new_vh = generate_reward_and_next_state(
            row, col, vv, vh, action, race_track)
        new_action = generate_action(policy[new_row, new_col, new_vv, new_vh],
                                     eps)
        delta = reward + gamma * Q[new_row, new_col, new_vv, new_vh,
                                   new_action] - Q[row, col, vv, vh, action]
        E[row, col, vv, vh,
          action] = (1 - alpha) * E[row, col, vv, vh, action] + 1
        Q = Q + alpha * delta * E
        E = E * gamma * lamb  # decay every trace by gamma * lambda
        policy = np.argmax(Q, axis=4)
        row, col, vv, vh, action = new_row, new_col, new_vv, new_vh, new_action
        if game_over(row, col, race_track):
            break
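
# The episode loops also rely on generate_start_state() and
# game_over(row, col, race_track), both defined outside this excerpt.
# Minimal sketches under the same assumed cell codes as above
# (0 = off-track, 2 = start line, 3 = finish line; all assumptions):
import numpy as np

def generate_start_state():
    # Random cell on the start line, with both velocity components at zero.
    rows, cols = np.where(race_track == 2)
    i = np.random.randint(len(rows))
    return rows[i], cols[i], 0, 0

def game_over(row, col, race_track):
    # An episode ends once the car sits on a finish-line cell.
    return race_track[row, col] == 3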
Example #4
        # print row, col, vv, vh
        r, row, col, vv, vh = generate_reward_and_next_state(
            row, col, vv, vh, action, race_track)
        if r == 0:
            seen.append((row, col))
            break
    return seen


start = time.time()

eps = 0.1
# Q-learning again: the same update as the first training loop above.
for k in xrange(100000):
    row, col, vv, vh = generate_start_state()
    while True:
        action = generate_action(policy[row, col, vv, vh], eps)
        reward, new_row, new_col, new_vv, new_vh = generate_reward_and_next_state(
            row, col, vv, vh, action, race_track)
        Q[row, col, vv, vh, action] += alpha * (
            reward + gamma * np.amax(Q[new_row, new_col, new_vv, new_vh, :]) -
            Q[row, col, vv, vh, action])
        policy[row, col, vv, vh] = np.argmax(Q[row, col, vv, vh, :])
        row, col, vv, vh = new_row, new_col, new_vv, new_vh
        if game_over(row, col, race_track):
            break

print time.time() - start

for k in xrange(6, 12):
    race_track = track()
    been = run(k)