def test_td0_method(alpha=0.1, gamma=0.9, eps=0.1):
    game = StandardGrid()
    print_values(game.rewards, game)
    policy = {
        ## GridWorldMove.up, GridWorldMove.down, GridWorldMove.right, GridWorldMove.left
        (0, 0): [0, 0, 1, 0],
        (0, 1): [1, 0, 0, 0],
        (0, 2): [1, 0, 0, 0],
        (1, 0): [0, 0, 1, 0],
        (1, 2): [0, 0, 1, 0],
        (2, 0): [0, 0, 1, 0],
        (2, 1): [0, 0, 1, 0],
        (2, 2): [0, 0, 1, 0],
        (3, 2): [1, 0, 0, 0]
    }
    print("Policy:")
    print_policy_beautifly(policy, game)
    V = {}
    states = game.allStates()
    for s in states:
        V[s] = 0
    for t in xrange(100000):
        s_and_rs = play_game(game, policy, False, eps)
        for i in xrange(len(s_and_rs) - 1):
            state, _ = s_and_rs[i]
            s_plus_one, reward = s_and_rs[i + 1]
            V[state] = V[state] + alpha * (reward + gamma * V[s_plus_one] -
                                           V[state])
    print("Policy:")
    print_policy_beautifly(policy, game)
    print("Values:")
    print_values(V, game)
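play_game is defined elsewhere in the module and is not shown in these snippets. The sketch below is a hypothetical version for reference only: it assumes the grid exposes start, setPosition, move, position and gameOver, that policy[s] is a probability list ordered as [up, down, right, left], and that moves are encoded as the indices 0-3.

import numpy as np

def play_game_sketch(game, policy, verbose=False, eps=0.1):
    # Returns a list of (state, reward) pairs; the starting state carries reward 0.
    game.setPosition(game.start)
    state = game.start
    states_and_rewards = [(state, 0)]
    while not game.gameOver():
        probs = policy[state]
        if np.random.random() < eps:
            action = np.random.choice(len(probs))           # explore uniformly
        else:
            action = np.random.choice(len(probs), p=probs)  # follow the policy
        reward = game.move(action)
        state = game.position
        states_and_rewards.append((state, reward))
        if verbose:
            print(state, reward)
    return states_and_rewards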
Example #2
def test_td0_approximation_method(alpha=0.1,
                                  gamma=0.9,
                                  eps=0.1,
                                  itterations=10000):
    game = StandardGrid()
    print_values(game.rewards, game)
    policy = {
        ## GridWorldMove.up, GridWorldMove.down, GridWorldMove.right, GridWorldMove.left
        (0, 0): [0, 0, 1, 0],
        (0, 1): [1, 0, 0, 0],
        (0, 2): [1, 0, 0, 0],
        (1, 0): [0, 0, 1, 0],
        (1, 2): [0, 0, 1, 0],
        (2, 0): [0, 0, 1, 0],
        (2, 1): [0, 0, 1, 0],
        (2, 2): [0, 0, 1, 0],
        (3, 2): [1, 0, 0, 0]
    }
    print("Policy:")
    print_policy_beautifly(policy, game)
    states = game.allStates()
    # Let's create a linear model to approximate the state values
    model = LinearState()
    deltas = []  ## Debug only
    for t in xrange(itterations):
        s_and_rs = play_game(game, policy, False, eps)
        biggest_change = 0
        alpha = alpha / 1.0001
        for i in xrange(len(s_and_rs) - 1):
            state, _ = s_and_rs[i]
            s_plus_one, reward = s_and_rs[i + 1]
            if game.isTerminal(s_plus_one):
                target = reward
            else:
                target = reward + gamma * model.predict(s_plus_one)
            # For debugging only
            old_weights = model.weights.copy()
            # This gradient step is the approximate analogue of the tabular update:
            # V[state] = V[state] + alpha * (reward + gamma * V[s_plus_one] - V[state])
            model.weights += alpha * (
                target - model.predict(state)) * model.gradient(state)
            biggest_change = max(biggest_change,
                                 np.abs(old_weights - model.weights).sum())
        deltas.append(biggest_change)
    # Rebuild the value table from the trained model
    V = {}
    for s in states:
        print(s)  # debug output
        if s in game.actions:
            V[s] = model.predict(s)
        else:
            # terminal state, or a state we cannot otherwise reach
            V[s] = 0

    print("Policy:")
    print_policy_beautifly(policy, game)
    print("Values:")
    print_values(V, game)
    plt.plot(deltas)
    plt.show()
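LinearState is also not shown. For a linear model V(s) ~ weights . features(s), the gradient with respect to the weights is just the feature vector, which is why the update above multiplies the TD error by model.gradient(state). A hypothetical sketch, assuming a hand-crafted feature vector over the (row, column) state tuple:

import numpy as np

class LinearStateSketch(object):
    # Hypothetical stand-in for LinearState: V(s) is approximated as weights . features(s).
    def __init__(self):
        self.weights = np.random.randn(4) / 2.0

    def features(self, s):
        # Row, column, their product and a bias term (roughly centred; an assumption).
        return np.array([s[0] - 1, s[1] - 1.5, s[0] * s[1] - 3, 1.0])

    def predict(self, s):
        return self.weights.dot(self.features(s))

    def gradient(self, s):
        # d/dw (w . x) = x, so the gradient is the feature vector itself.
        return self.features(s)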
Example #3
def testForRandomPolicyEvaluationNegativeGrid():
    print("Negative Grid Random Policy Testing!")
    gamma = 0.7
    g = NegativeGrid(-0.50)
    p = RandomPolicy.createRandomPolicy(g.actions, g.width, g.height)
    print("Before itterative improvement:")
    print("========================================")
    print_policy_beautifly(p, g)
    print_policy(p, g)
    for i in range(0, 1000):
        v = evaluateValueFunction(g, p, gamma=gamma)
        p = itteratePolicy(g, v, p, gamma=gamma)
    print("After improvement:")
    print_policy_beautifly(p, g)
    print_policy(p, g)
    print_values(v, g)
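evaluateValueFunction and itteratePolicy come from the surrounding module and are not shown. As a rough guide to the first one, here is a hypothetical sketch of iterative policy evaluation; it assumes policy[s] is a probability list ordered like the action list and that the grid exposes allStates, setPosition, move and position. SMALL_ENOUGH is an assumed convergence threshold.

SMALL_ENOUGH = 1e-4  # assumed convergence threshold

def evaluate_value_function_sketch(g, policy, allActions, gamma=0.9):
    # Iterative policy evaluation: V(s) = sum_a pi(a|s) * (r + gamma * V(s')).
    V = {}
    for s in g.allStates():
        V[s] = 0
    while True:
        biggest_change = 0
        for s in g.allStates():
            if s not in policy:
                continue  # terminal or unreachable state
            old_v = V[s]
            new_v = 0
            for a, p_a in zip(allActions, policy[s]):
                if p_a == 0:
                    continue
                g.setPosition(s)
                r = g.move(a)
                new_v += p_a * (r + gamma * V[g.position])
            V[s] = new_v
            biggest_change = max(biggest_change, abs(old_v - V[s]))
        if biggest_change < SMALL_ENOUGH:
            return V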
Example #4
def testForDetermenisticPolicyEvaluation():
    print("Standart Grid Deterministic Policy Testing!")
    gamma = 0.5
    g = StandardGrid()
    p2 = RandomPolicy.createRandomDeterministicPolicy(g.actions, g.width,
                                                      g.height)
    print("Before itterative improvement:")
    print("========================================")
    print_policy_beautifly(p2, g)
    print_policy(p2, g)
    print("========================================")
    for i in range(0, 100):
        V2 = evaluateValueFunction(g, p2, gamma=gamma)
        p2 = itterateDetermenisticPolicy(g, V2, p2, gamma=gamma)
    print("After improvement:")
    print_policy_beautifly(p2, g)
    print_policy(p2, g)
    print_values(V2, g)
Example #5
def testToEvaluateValue():
    gamma = 1
    g = StandardGrid()
    p1 = generateRandomPolicy()
    V1 = evaluateValueFunction(g, p1, gamma)
    print("========================================")
    print_policy(p1, g)
    print_policy_beautifly(p1, g)
    print("========================================")
    print_values(V1, g)
    #####
    g = StandardGrid()
    p2 = generateDeterministicPolicyTwo()
    V2 = evaluateValueFunction(g, p2, gamma)
    print("========================================")
    print_policy(p2, g)
    print_policy_beautifly(p2, g)
    print("========================================")
    print_values(V2, g)
Example #6
def testForValueItteration():
    # Last algorithm of the chapter, but the most efficient one:
    # iterating the values to derive the policy automatically
    print("Negative Grid Value Iteration Testing!")
    gamma = 0.9
    bigNegative = -10000
    allActions = [
        GridWorldMove.up, GridWorldMove.down, GridWorldMove.right,
        GridWorldMove.left
    ]
    g = NegativeGrid(-0.50)
    #p = generateDeterministicPolicyRandomly()
    p = generateDeterministicPolicyRandomly(g)
    print("Before itterative improvement:")
    print("========================================")
    print_policy_beautifly(p, g)
    print_policy(p, g)
    V = {}
    states = g.allStates()
    for s in states:
        if s in g.actions:
            V[s] = np.random.random()
        else:
            V[s] = 0
    convergenceMet = False
    while not convergenceMet:
        biggest_change = 0
        for s in states:
            old_v = V[s]
            # The policy dict only holds non-terminal states where actions are possible
            if s in p:
                new_v = bigNegative
                for a in allActions:
                    g.setPosition(s)
                    r = g.move(a)
                    v = r + gamma * V[g.position]
                    if v > new_v:
                        new_v = v
                V[s] = new_v
                biggest_change = max(biggest_change, np.abs(old_v - V[s]))
        if biggest_change < SMALL_ENOUGH:
            convergenceMet = True
    # Building optimum policy
    for s in p.keys():
        best_a = None
        best_value = bigNegative
        # loop through all possible actions
        # to find the best current action for the state
        for a in allActions:
            g.setPosition(s)
            r = g.move(a)
            v = r + gamma * V[g.position]
            if v > best_value:
                best_value = v
                best_a = a
            p[s][a] = 0.0
        # Best gets all
        p[s][best_a] = 1.0
    print("After Value itteration improvement:")
    print("========================================")
    print_policy_beautifly(p, g)
    print_policy(p, g)
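The sweep above applies the Bellman optimality backup until the values stop changing, and the final loop reads the greedy policy off the converged values. With the deterministic grid dynamics, where taking a in s gives reward r(s, a) and next state s', the two steps are

    V(s) \leftarrow \max_a \left[ r(s, a) + \gamma V(s') \right],
    \qquad
    \pi(s) = \arg\max_a \left[ r(s, a) + \gamma V(s') \right]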
def test_td0_sarsa(alpha=0.1, gamma=0.9, eps=0.1):
    game = NegativeGrid()
    print_values(game.rewards, game)
    allActions = (GridWorldMove.up, GridWorldMove.down, GridWorldMove.right,
                  GridWorldMove.left)
    states = game.allStates()
    Q = {}
    Q_visists = {}
    Q_visists_debug = {}
    for s in states:
        Q[s] = {}
        Q_visists[s] = {}
        Q_visists_debug[s] = 0
        for a in allActions:
            Q[s][a] = 0
            Q_visists[s][a] = 1.0
    t = 1.0
    deltas = []

    for j in xrange(100000):
        ## Decay the exploration rate (t only feeds the epsilon below)
        if j % 100 == 0:
            t += 10e-3

        if j % 1000 == 0:
            print(".")
        max_change = 0
        state = game.start
        game.position = state
        game_over = False
        action, _ = max_dict(Q[state])  # greedy action for the start state
        action = eps_random_action(action, eps=0.5 / t)
        while not game_over:
            r = game.move(action)
            state2 = game.position
            game_over = game.gameOver()
            ## Choose the next action epsilon-greedily from Q[state2]
            action2, _ = max_dict(Q[state2])
            action2 = eps_random_action(action2, eps=0.5 / t)  # epsilon-greedy
            ## We now have the full (s, a, r, s', a') tuple, so update Q
            betta = alpha / Q_visists[state][action]
            Q_visists[state][action] += 0.001  ## 0.005
            old_qsa = Q[state][action]
            Q[state][action] = Q[state][action] + betta * (
                r + gamma * Q[state2][action2] - Q[state][action])
            max_change = max(max_change, abs(Q[state][action] - old_qsa))
            Q_visists_debug[state] = Q_visists_debug.get(state, 0) + 1
            action = action2
            state = state2
        deltas.append(max_change)
    plt.plot(deltas)
    plt.show()

    policy = {}
    V = {}
    for s in game.actions.keys():
        a, max_q = max_dict(Q[s])
        policy[s] = [0, 0, 0, 0]
        policy[s][a] = 1.0
        V[s] = max_q
    print("Policy:")
    print_policy_beautifly(policy, game)
    print("Values:")
    print_values(V, game)
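max_dict and eps_random_action are helpers from the surrounding module and are not shown here. Plausible sketches are given below, assuming max_dict returns the (key, value) pair with the largest value and eps_random_action swaps the chosen action for a uniformly random one with probability eps (moves assumed to be encoded as 0-3):

import numpy as np

def max_dict_sketch(d):
    # Return (key, value) for the entry with the largest value.
    best_key = max(d, key=d.get)
    return best_key, d[best_key]

def eps_random_action_sketch(action, eps=0.1, all_actions=(0, 1, 2, 3)):
    # Epsilon-greedy: keep the given action with probability 1 - eps,
    # otherwise pick one of the four moves uniformly at random.
    if np.random.random() < eps:
        return np.random.choice(all_actions)
    return action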
def test_td0_q_learning(itterations=1000,
                        alpha=0.1,
                        gamma=0.9,
                        eps=0.5,
                        delta_t=10e-3,
                        learning_decay=0.0001):
    game = NegativeGrid(-0.2)  #StandardGrid()
    print_values(game.rewards, game)
    allActions = (GridWorldMove.up, GridWorldMove.down, GridWorldMove.right,
                  GridWorldMove.left)
    states = game.allStates()
    Q = {}
    Q_visists = {}
    Q_visists_debug = {}
    for s in states:
        Q[s] = {}
        Q_visists[s] = {}
        Q_visists_debug[s] = 0
        for a in allActions:
            Q[s][a] = 0
            Q_visists[s][a] = 1.0
    t = 1.0
    deltas = []

    for j in xrange(itterations):
        ## Decay the exploration rate (t only feeds the epsilon below)
        if j % 100 == 0:
            t += delta_t
        if j % 1000 == 0:
            print(".")
        max_change = 0
        state = game.start
        game.position = state
        game_over = False
        action = max_dict(Q[state])[0]  # greedy action for the start state
        while not game_over:
            ## Because we act epsilon-greedily, our actions are not optimal during play.
            ## However, this is exactly what lets us learn off-policy:
            ## it is what makes Q-Learning an OFF-policy algorithm.
            action = eps_random_action(action, eps=eps / t)
            r = game.move(action)
            state2 = game.position
            game_over = game.gameOver()
            ## Find the greedy next action and its Q-value
            action2, q_max_a2 = max_dict(Q[state2])
            ## At this stage we have all the Q-Learning parameters
            betta = alpha / Q_visists[state][action]
            Q_visists[state][action] += learning_decay
            old_qsa = Q[state][action]
            ### We bootstrap from the greedy Q-value here, even though the move actually played next may not be the greedy one.
            Q[state][action] = Q[state][action] + betta * (
                r + gamma * q_max_a2 - Q[state][action])
            max_change = max(max_change, abs(Q[state][action] - old_qsa))
            Q_visists_debug[state] = Q_visists_debug.get(state, 0) + 1
            action = action2
            state = state2
        deltas.append(max_change)
    plt.plot(deltas)
    plt.show()

    policy = {}
    V = {}
    for s in game.actions.keys():
        a, max_q = max_dict(Q[s])
        policy[s] = [0, 0, 0, 0]
        policy[s][a] = 1.0
        V[s] = max_q
    print("Policy:")
    print_policy_beautifly(policy, game)
    print("Values:")
    print_values(V, game)
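The only structural difference from test_td0_sarsa above is the TD target: SARSA bootstraps from the action that will actually be taken next (on-policy), while Q-Learning bootstraps from the greedy action regardless of what gets played (off-policy). With made-up numbers for illustration:

gamma = 0.9
r = -0.2              # reward just received
q_next_chosen = 0.5   # Q(s', a') for the action the epsilon-greedy policy actually takes
q_next_best = 0.7     # max over a of Q(s', a)

sarsa_target = r + gamma * q_next_chosen       # on-policy target
q_learning_target = r + gamma * q_next_best    # off-policy target
print(sarsa_target, q_learning_target)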
Example #9
def test_td0_sarsa_approximated(alpha=0.1,
                                gamma=0.9,
                                eps=0.1,
                                itterations=50000):
    game = StandardGrid()  #NegativeGrid(-0.1)
    print_values(game.rewards, game)
    allActions = (GridWorldMove.up, GridWorldMove.down, GridWorldMove.right,
                  GridWorldMove.left)
    states = game.allStates()
    t = 1.0
    deltas = []
    model = LinearSarsaState()
    for j in xrange(itterations):
        ## Decay the exploration rate (t only feeds the epsilon below)
        if j % 1000 == 0:
            t += 10e-3

        if j % 1000 == 0:
            print(".")

        max_change = 0
        state = game.start
        game.position = state
        game_over = False
        Qs = model.getQs(state)
        action = max_dict(Qs)[0]
        action = eps_random_action(action, eps=0.5 / t)
        alpha = alpha / 1.00005
        while not game_over:
            r = game.move(action)
            state2 = game.position
            game_over = game.gameOver()
            Qs2 = model.getQs(state2)
            ## Choose the next action epsilon-greedily from Qs2
            action2 = max_dict(Qs2)[0]
            action2 = eps_random_action(action2, eps=0.5 / t)  # epsilon-greedy
            ## We now have the full (s, a, r, s', a') tuple, so update the weights
            old_weights = model.weights.copy()
            model.weights += alpha * (
                r + gamma * model.predict(state2, action2) -
                model.predict(state, action)) * model.gradient(state, action)
            max_change = max(max_change,
                             np.abs(model.weights - old_weights).sum())
            action = action2
            state = state2
        deltas.append(max_change)

    policy = {}
    V = {}
    Q = {}
    for s in game.actions.keys():
        Qs = model.getQs(s)
        Q[s] = Qs
        a, max_q = max_dict(Qs)
        policy[s] = [0, 0, 0, 0]
        policy[s][a] = 1
        V[s] = max_q
    print("Policy:")
    print_policy_beautifly(policy, game)
    print("Values:")
    print_values(V, game)
    plt.plot(deltas)
    plt.show()
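As with LinearState, LinearSarsaState is defined elsewhere. A hypothetical sketch, assuming Q(s, a) ~ weights . features(s, a) with one feature block per action (so the gradient is again just the feature vector) and moves encoded as 0-3:

import numpy as np

class LinearSarsaStateSketch(object):
    # Hypothetical stand-in for LinearSarsaState: Q(s, a) ~ weights . features(s, a).
    ALL_ACTIONS = (0, 1, 2, 3)  # up, down, right, left (assumed encoding)

    def __init__(self):
        self.weights = np.random.randn(4 * len(self.ALL_ACTIONS)) / 2.0

    def features(self, s, a):
        # One 4-feature block per action; only the chosen action's block is non-zero.
        x = np.zeros(4 * len(self.ALL_ACTIONS))
        x[4 * a:4 * a + 4] = [s[0] - 1, s[1] - 1.5, s[0] * s[1] - 3, 1.0]
        return x

    def predict(self, s, a):
        return self.weights.dot(self.features(s, a))

    def gradient(self, s, a):
        # Linear model, so the gradient w.r.t. the weights is the feature vector.
        return self.features(s, a)

    def getQs(self, s):
        # Q-values for every action in state s.
        return dict((a, self.predict(s, a)) for a in self.ALL_ACTIONS)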