def test_td0_method(alpha=0.1, gamma=0.9, eps=0.1):
    game = StandardGrid()
    print_values(game.rewards, game)
    policy = {
        ## Action probabilities, in the order [GridWorldMove.up, GridWorldMove.down, GridWorldMove.right, GridWorldMove.left]
        (0, 0): [0, 0, 1, 0],
        (0, 1): [1, 0, 0, 0],
        (0, 2): [1, 0, 0, 0],
        (1, 0): [0, 0, 1, 0],
        (1, 2): [0, 0, 1, 0],
        (2, 0): [0, 0, 1, 0],
        (2, 1): [0, 0, 1, 0],
        (2, 2): [0, 0, 1, 0],
        (3, 2): [1, 0, 0, 0]
    }
    print("Policy:")
    print_policy_beautifly(policy, game)
    V = {}
    states = game.allStates()
    for s in states:
        V[s] = 0
    for t in range(100000):
        s_and_rs = play_game(game, policy, False, eps)
        for i in range(len(s_and_rs) - 1):
            state, _ = s_and_rs[i]
            s_plus_one, reward = s_and_rs[i + 1]
            V[state] = V[state] + alpha * (reward + gamma * V[s_plus_one] -
                                           V[state])
    print("Policy:")
    print_policy_beautifly(policy, game)
    print("Values:")
    print_values(V, game)
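# The TD(0) loop above consumes a list of (state, reward) pairs produced by
# play_game, which is not shown in this file. Below is a minimal sketch of what
# that helper is assumed to do (one epsilon-greedy rollout over the grid
# interface used later in this file); the name play_game_sketch, its exact
# signature and the unused windy flag are hypothetical:
import numpy as np

def play_game_sketch(game, policy, windy=False, eps=0.1):
    """Hypothetical rollout helper returning [(state, reward), ...]."""
    state = game.start
    game.position = state
    # The start state carries reward 0 because no action preceded it
    states_and_rewards = [(state, 0)]
    while not game.gameOver():
        probs = policy[state]                  # e.g. [0, 0, 1, 0]
        action = int(np.argmax(probs))         # greedy action index
        if np.random.random() < eps:           # epsilon-greedy exploration
            action = int(np.random.choice(len(probs)))
        reward = game.move(action)
        state = game.position
        states_and_rewards.append((state, reward))
    return states_and_rewards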
def testMonteCarloValuesEvaluation(windy=False):
    gamma = 0.9
    eps = 0.2
    game = StandardGrid()  # NegativeGrid(-0.25)
    ## Evaluate a randomly chosen deterministic policy
    # policy = generateRandomPolicy()
    policy = generateRandomlyDeterministicPolicy(game)
    V = {}
    returns = {}
    states = game.allStates()
    for s in states:
        if s in game.actions:
            returns[s] = []
        else:
            V[s] = 0

    for t in range(10000):
        sAr = playGame(game, policy, gamma, windy, eps)
        ## First-visit Monte Carlo: only the first time a state appears in an
        ## episode contributes its return, so track visits per episode.
        seenStates = set()
        for s, g in sAr:
            if s in game.actions and s not in seenStates:
                returns[s].append(g)
                V[s] = np.mean(returns[s])
                seenStates.add(s)

    print_values(V, game)
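# playGame here is assumed to return (state, return) pairs: the discounted
# return G observed from each state to the end of the episode, not the raw
# reward. One way to derive those returns from a rollout of (state, reward)
# pairs, such as the play_game sketch above, is to walk the episode backwards;
# this helper and its name are hypothetical:
def returns_from_rewards_sketch(states_and_rewards, gamma):
    """Turn one episode's [(state, reward), ...] into [(state, G), ...],
    where G_t = r_{t+1} + gamma * G_{t+1} is the return from state s_t."""
    G = 0
    states_and_returns = []
    for state, reward in reversed(states_and_rewards):
        states_and_returns.append((state, G))
        G = reward + gamma * G
    states_and_returns.reverse()
    return states_and_returns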
Example #3
def test_td0_approximation_method(alpha=0.1,
                                  gamma=0.9,
                                  eps=0.1,
                                  iterations=10000):
    game = StandardGrid()
    print_values(game.rewards, game)
    policy = {
        ## Action probabilities, in the order [GridWorldMove.up, GridWorldMove.down, GridWorldMove.right, GridWorldMove.left]
        (0, 0): [0, 0, 1, 0],
        (0, 1): [1, 0, 0, 0],
        (0, 2): [1, 0, 0, 0],
        (1, 0): [0, 0, 1, 0],
        (1, 2): [0, 0, 1, 0],
        (2, 0): [0, 0, 1, 0],
        (2, 1): [0, 0, 1, 0],
        (2, 2): [0, 0, 1, 0],
        (3, 2): [1, 0, 0, 0]
    }
    print("Policy:")
    print_policy_beautifly(policy, game)
    states = game.allStates()
    # Create the linear value-function approximator
    model = LinearState()
    deltas = []  ## Track the biggest weight change per episode (for the plot below)
    for t in range(iterations):
        s_and_rs = play_game(game, policy, False, eps)
        biggest_change = 0
        # Slowly decay the learning rate
        alpha = alpha / 1.0001
        for i in range(len(s_and_rs) - 1):
            state, _ = s_and_rs[i]
            s_plus_one, reward = s_and_rs[i + 1]
            if game.isTerminal(s_plus_one):
                target = reward
            else:
                target = reward + gamma * model.predict(s_plus_one)
            # Debug only: keep a copy to measure how much the weights move
            old_weights = model.weights.copy()
            # Semi-gradient TD(0) step, the approximate analogue of the tabular update:
            # V[state] = V[state] + alpha * (reward + gamma * V[s_plus_one] - V[state])
            model.weights += alpha * (
                target - model.predict(state)) * model.gradient(state)
            biggest_change = max(biggest_change,
                                 np.abs(old_weights - model.weights).sum())
        deltas.append(biggest_change)
    # Reconstruct V from the learned weights
    V = {}
    for s in states:
        print(s)
        if s in game.actions:
            V[s] = model.predict(s)
        else:
            # terminal state or state we can't otherwise get to
            V[s] = 0

    print("Policy:")
    print_policy_beautifly(policy, game)
    print("Values:")
    print_values(V, game)
    plt.plot(deltas)
    plt.show()
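# LinearState is not defined in this file. The semi-gradient TD(0) update above
# only needs a model exposing weights, predict(s) and gradient(s); a minimal
# hypothetical stand-in with a small hand-crafted feature vector (the exact
# features of the original class are an assumption) could look like this:
import numpy as np

class LinearStateSketch(object):
    """Hypothetical linear value model: V(s) ~= w . x(s)."""

    def __init__(self):
        # One weight per feature, small random initialisation
        self.weights = np.random.randn(4) / 2.0

    def features(self, s):
        # s is a grid coordinate such as (2, 1); offsets keep features roughly centred
        return np.array([s[0] - 1, s[1] - 1.5, s[0] * s[1] - 3, 1])

    def predict(self, s):
        return self.weights.dot(self.features(s))

    def gradient(self, s):
        # For a linear model the gradient w.r.t. the weights is the feature vector
        return self.features(s)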
Example #4
def test_td0_sarsa_approximated(alpha=0.1,
                                gamma=0.9,
                                eps=0.1,
                                iterations=50000):
    game = StandardGrid()  #NegativeGrid(-0.1)
    print_values(game.rewards, game)
    allActions = (GridWorldMove.up, GridWorldMove.down, GridWorldMove.right,
                  GridWorldMove.left)
    states = game.allStates()
    t = 1.0
    deltas = []
    model = LinearSarsaState()
    for j in range(iterations):
        if j % 1000 == 0:
            ## t grows slowly, shrinking the exploration rate eps = 0.5 / t
            t += 1e-2
            print(".")

        max_change = 0
        state = game.start
        game.position = state
        game_over = False
        ## Pick the first action epsilon-greedily from Q(start, .)
        Qs = model.getQs(state)
        action = max_dict(Qs)[0]
        action = eps_random_action(action, eps=0.5 / t)
        ## Slowly decay the learning rate each episode
        alpha = alpha / 1.00005
        while not game_over:
            r = game.move(action)
            state2 = game.position
            game_over = game.gameOver()
            Qs2 = model.getQs(state2)
            ## Choose the next action epsilon-greedily from Q(state2, .)
            action2 = max_dict(Qs2)[0]
            action2 = eps_random_action(action2, eps=0.5 / t)
            ## We now have the full (s, a, r, s', a') tuple, so take the SARSA update step
            old_weights = model.weights.copy()
            model.weights += alpha * (
                r + gamma * model.predict(state2, action2) -
                model.predict(state, action)) * model.gradient(state, action)
            max_change = max(max_change,
                             np.abs(model.weights - old_weights).sum())
            action = action2
            state = state2
        deltas.append(max_change)

    policy = {}
    V = {}
    Q = {}
    for s in game.actions.keys():
        Qs = model.getQs(s)
        Q[s] = Qs
        a, max_q = max_dict(Qs)
        policy[s] = [0, 0, 0, 0]
        policy[s][a] = 1
        V[s] = max_q
    print("Policy:")
    print_policy_beautifly(policy, game)
    print("Values:")
    print_values(V, game)
    plt.plot(deltas)
    plt.show()
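# LinearSarsaState, max_dict and eps_random_action are also defined elsewhere.
# Below are hypothetical stand-ins consistent with how they are used above; the
# one-hot features and the constructor defaults are assumptions (with one-hot
# features the linear model reduces to a tabular Q):
import numpy as np

class LinearSarsaStateSketch(object):
    """Hypothetical Q model: Q(s, a) ~= w . x(s, a) with one-hot x(s, a)."""

    def __init__(self, states=None, n_actions=4):
        if states is None:
            # Default to the 4x3 grid coordinates used by the policies above
            states = [(i, j) for i in range(4) for j in range(3)]
        pairs = [(s, a) for s in states for a in range(n_actions)]
        self.index = {sa: i for i, sa in enumerate(pairs)}
        self.n_actions = n_actions
        self.weights = np.random.randn(len(pairs)) / np.sqrt(len(pairs))

    def features(self, s, a):
        x = np.zeros(len(self.weights))
        x[self.index[(s, a)]] = 1.0
        return x

    def predict(self, s, a):
        return self.weights.dot(self.features(s, a))

    def gradient(self, s, a):
        # Gradient of a linear model w.r.t. its weights is the feature vector
        return self.features(s, a)

    def getQs(self, s):
        # Q-values for every action in state s, keyed by action index
        return {a: self.predict(s, a) for a in range(self.n_actions)}


def max_dict_sketch(d):
    """Hypothetical max_dict: return the (key, value) pair with the largest value."""
    best_key = max(d, key=d.get)
    return best_key, d[best_key]


def eps_random_action_sketch(action, eps=0.1, n_actions=4):
    """Hypothetical eps_random_action: keep `action` with probability 1 - eps,
    otherwise return a uniformly random action index."""
    if np.random.random() < eps:
        return int(np.random.choice(n_actions))
    return action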