Code example #1
                        new_V = v

                V[s] = new_V
                delta = max(delta, np.abs(old_V - V[s]))

        if delta < THRESHOLD:
            break

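    # Once V has converged, extract the greedy policy with a one-step lookahead at each state.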
    for s in policy.keys():

        best_a = None
        best_value = float("-inf")

        for a in POSSIBLE_ACTIONS:
            grid.set_state(s)
            r = grid.move(a)
            v = r + GAMMA * V[grid.current_state()]

            if v > best_value:
                best_value = v
                best_a = a

        policy[s] = best_a

    print("Final value function")
    print_value_func(V, grid)
    print("\n")

    print("Final policy")
    print_policy(policy, grid)
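
Note: the loop above picks, for every state, the action that maximizes r + GAMMA * V[s'] by simulating each move on the grid. The stand-alone sketch below shows the same one-step (Bellman optimality) lookahead on a toy transition table; transitions, greedy_backup, and the toy values are illustrative and not part of the original code.

GAMMA = 0.9

# Toy deterministic MDP: transitions[s][a] = (reward, next_state).
transitions = {
    "s0": {"stay": (0.0, "s0"), "go": (1.0, "s1")},
    "s1": {"stay": (0.0, "s1"), "go": (0.0, "s0")},
}
V = {"s0": 0.0, "s1": 5.0}  # a value function assumed to be already computed

def greedy_backup(s):
    # One-step lookahead: evaluate r + GAMMA * V[s'] for every action
    # and return the best action together with its value.
    best_a, best_value = None, float("-inf")
    for a, (r, s2) in transitions[s].items():
        v = r + GAMMA * V[s2]
        if v > best_value:
            best_a, best_value = a, v
    return best_a, best_value

print(greedy_backup("s0"))  # ('go', 5.5)
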
Code example #2
        # Walking the episode backwards, the first (s, r) encountered is the
        # terminal state; its return is 0 by definition, so it is not recorded.
        # Every other state is paired with the return accumulated so far.
        if first:
            first = False
        else:
            states_returns.append((s, G))

        # Accumulate the discounted return: G = r + GAMMA * G
        G = r + GAMMA * G

    states_returns.reverse()
    return states_returns


if __name__ == "__main__":
    grid = standard_grid()

    print("Rewards")
    print_value_func(grid.rewards, grid)
    print("\n")

    policy = {
        (2, 0): "U",
        (1, 0): "U",
        (0, 0): "R",
        (0, 1): "R",
        (0, 2): "R",
        (1, 2): "U",
        (2, 1): "L",
        (2, 2): "U",
        (2, 3): "L",
    }

    V = {}
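
Note: the return-computing function at the top of this example walks an episode backwards, accumulating G = r + GAMMA * G and skipping the terminal state. The stand-alone sketch below reproduces that computation on a toy episode; the episode data and variable names are illustrative and not part of the original code.

GAMMA = 0.9

# Toy episode as (state, reward received on arriving in that state);
# the last entry is the terminal state.
episode = [("A", 0), ("B", 0), ("C", 1)]

G = 0
first = True
returns = []
for s, r in reversed(episode):
    if first:
        first = False  # terminal state: its return is 0 by definition, so skip it
    else:
        returns.append((s, G))
    G = r + GAMMA * G
returns.reverse()

print(returns)  # [('A', 0.9), ('B', 1.0)]
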
Code example #3
import numpy as np
from grid_world import standard_grid, negative_grid
from dp_policy_evaluation import print_value_func, print_policy
from monte_carlo_policy_iteration_es import max_dict
from td_zero_prediction import random_action

GAMMA = 0.9
ALPHA = 0.1
POSSIBLE_ACTIONS = ("U", "D", "L", "R")

if __name__ == "__main__":
    grid = negative_grid(step_cost=-0.1)

    print("Rewards")
    print_value_func(grid.rewards, grid)
    print("\n")

    # Initialize Q(s,a)
    Q = {}
    states = grid.all_states()
    for s in states:
        Q[s] = {}
        for a in POSSIBLE_ACTIONS:
            Q[s][a] = 0

    # Keep track of how many times Q[s] has been updated, per state and per state-action pair
    update_counts = {}
    update_counts_sa = {}
    for s in states:
        update_counts_sa[s] = {}