Code example #1
        biggest_change = 0
        states_actions_returns = play_game(grid, policy)
        seen_state_action_pairs = set()
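        # first-visit Monte Carlo: only the first occurrence of each (s, a) in the episode is used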
        for s, a, G in states_actions_returns:
            sa = (s, a)
            if sa not in seen_state_action_pairs:
                old_q = Q[s][a]
                returns[sa].append(G)
                Q[s][a] = np.mean(returns[sa])
                biggest_change = max(biggest_change, np.abs(old_q - Q[s][a]))
                seen_state_action_pairs.add(sa)
        deltas.append(biggest_change)

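    # extract the final deterministic policy: act greedily with respect to Q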
    for s in policy.keys():
        a, _ = max_dict(Q[s])
        policy[s] = a

    plt.plot(deltas)
    plt.show()

    print("final policy")
    print_policy(policy, grid)

    # V(s) is the maximum of Q(s, a) over actions
    V = {}
    for s, Qs in Q.items():
        V[s] = max_dict(Qs)[1]

    print("final values")
    print_values(V, grid)
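These listings all call a max_dict helper whose definition is not included in this excerpt. Below is a minimal sketch, assuming only what the call sites imply (it returns the (key, value) pair with the largest value, e.g. a, _ = max_dict(Q[s]) and V[s] = max_dict(Q[s])[1]); the original helper may differ in details such as tie-breaking.

def max_dict(d):
    # return the (argmax key, max value) pair of a dictionary
    # minimal sketch consistent with how max_dict is called in the listings
    max_key, max_val = None, float('-inf')
    for k, v in d.items():
        if v > max_val:
            max_key, max_val = k, v
    return max_key, max_val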
Code example #2
        update_counts_sa[s] = {}
        for a in ALL_POSSIBLE_ACTIONS:
            update_counts_sa[s][a] = 1.0

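    # t grows slowly so that the exploration rate eps = 0.5/t decays over training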
    t = 1.0
    deltas = []
    for it in range(10000):
        if it % 100 == 0:
            t += 10e-3
        if it % 2000 == 0:
            print(it)

        s = (2, 0)
        grid.set_state(s)

        a = max_dict(Q[s])[0]
        biggest_change = 0

        while not grid.game_over():
            a = random_action(a, eps=0.5/t)
            r = grid.move(a)
            s2 = grid.current_state()

            alpha = ALPHA / update_counts_sa[s][a]
            update_counts_sa[s][a] += 0.005

            old_qsa = Q[s][a]

            a2, max_q_s2a2 = max_dict(Q[s2])
            # Q-learning update: bootstrap off the best action in s2,
            # whether or not that action is taken next
            Q[s][a] = Q[s][a] + alpha * (
                r + GAMMA * max_q_s2a2 - Q[s][a]
            )
            biggest_change = max(biggest_change, np.abs(old_qsa - Q[s][a]))

            # the next state and action become the current ones
            s = s2
            a = a2
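Exploration in this loop goes through a random_action helper that is not defined in the excerpt. A minimal epsilon-greedy sketch, assuming the usual behavior implied by the calls random_action(a, eps=0.5/t) and by the ALL_POSSIBLE_ACTIONS list used above:

import numpy as np

def random_action(a, eps=0.1):
    # epsilon-greedy sketch: keep the suggested action with probability 1 - eps,
    # otherwise pick uniformly from ALL_POSSIBLE_ACTIONS (assumed to be defined elsewhere)
    if np.random.random() < (1 - eps):
        return a
    return np.random.choice(ALL_POSSIBLE_ACTIONS)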
Code example #3
        update_counts_sa[s] = {}
        for a in all_possible_actions:
            update_counts_sa[s][a] = 1.0

    t = 1.0
    deltas = []
    for it in range(10000):
        if it % 100 == 0:
            t += 1e-2
        if it % 2000 == 0:
            print("it:")
            print(it)

        s = (2, 0)
        grid.set_state(s)
        a = max_dict(Q[s])[0]
        a = random_action(a, eps=0.5 / t)
        biggest_change = 0
        while not grid.game_over():
            r = grid.move(a)
            s2 = grid.current_state()

            a2 = max_dict(Q[s2])[0]
            a2 = random_action(a2, eps=0.5 / t)

            # decaying learning rate per (s, a) pair, divided from the base
            # constant ALPHA as in code example #2 (not from the decayed alpha)
            alpha = ALPHA / update_counts_sa[s][a]
            update_counts_sa[s][a] += 0.005

            old_qsa = Q[s][a]
            Q[s][a] = Q[s][a] + alpha * (r + gamma * Q[s2][a2] - Q[s][a])
            biggest_change = max(biggest_change, np.abs(old_qsa - Q[s][a]))

            # advance: next state and action become the current ones (SARSA is on-policy)
            s = s2
            a = a2
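For reference, the only difference between this SARSA update and the Q-learning update in code example #2 is the bootstrap target: Q-learning uses r + GAMMA * max_q_s2a2, the value of the greedy action in s2 even if that action is never taken, while SARSA uses r + gamma * Q[s2][a2], the value of the epsilon-greedy action a2 that is actually selected next. That is why SARSA is on-policy and Q-learning is off-policy.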
Code example #4
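    # t drives the exploration decay (eps = 0.5/t); t2 drives the learning-rate decay (alpha = ALPHA/t2)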
    t = 1.0
    t2 = 1.0
    for it in range(20000):
        if it % 100 == 0:
            t += 10e-3
            t2 += 0.01
        if it % 1000 == 0:
            print(it)
        alpha = ALPHA / t2
        
        s = (2, 0)
        grid.set_state(s)

        Qs = getQs(model, s)
        
        a = max_dict(Qs)[0]
        a = random_action(a, 0.5/t)
        biggest_change = 0

        while not grid.game_over():
            r = grid.move(a)
            s2 = grid.current_state()

            old_theta = model.theta.copy()

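            # semi-gradient update: if s2 is terminal there is no bootstrap term, so the target is just r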
            if grid.is_terminal(s2):
                model.theta += alpha * (
                    r - model.predict(s, a)
                ) * model.grad(s, a)
            else:
                Qs2 = getQs(model, s2)