def play_game(grid, policy):
    # returns a list of states and corresponding rewards (not returns!)

    # Start at the designated start state

    s = (2, 0)
    grid.set_state(s)
    states_and_rewards = [(s, 0)]
    while not grid.game_over():
        a = policy[s]
        a = random_action(a)
        r = grid.move(a)
        s = grid.current_state()
        states_and_rewards.append((s, r))

    return states_and_rewards
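For context, a minimal sketch (not part of the snippet above) of how the (state, reward) list returned by play_game() is typically turned into returns for first-visit Monte Carlo, assuming a discount factor GAMMA is defined:

def compute_states_and_returns(states_and_rewards, gamma):
    # work backwards through the episode: G_t = r_{t+1} + gamma * G_{t+1}
    # the value of the terminal state is 0 by definition, so it is skipped
    G = 0
    states_and_returns = []
    first = True
    for s, r in reversed(states_and_rewards):
        if first:
            first = False
        else:
            states_and_returns.append((s, G))
        G = r + gamma * G
    states_and_returns.reverse()  # restore chronological order
    return states_and_returns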
        # learning-rate decay; t2 is a counter maintained in the (omitted) outer loop
        alpha = ALPHA / t2

        #we play instead of generating an episode
        s = (2, 0)
        grid.set_state(s)

        #get Q(s) to choose first action
        Qs = getQs(model, s)

        # the first (s, r) tuple is the state we start in and 0
        # (since we don't get a reward) for simply starting the game
        # the last (s, r) tuple is the terminal state and the final reward
        # the value for the terminal state is by definition 0, so we don't
        # care about updating it.
        a = max_dict(Qs)[0]
        a = random_action(a, eps=0.5 / t)
        biggest_change = 0
        while not grid.game_over():
            r = grid.move(a)
            s2 = grid.current_state()

            #need next action since Q(s,a) depends on Q(s',a')
            old_theta = model.theta.copy()
            if grid.is_terminal(s2):
                model.theta += alpha * (r - model.predict(s, a)) * model.grad(
                    s, a)
            else:
                #not terminal
                Qs2 = getQs(model, s2)
                a2 = max_dict(Qs2)[0]
                a2 = random_action(a2, eps=0.5 / t)  # epsilon-greedy

                # SARSA-style semi-gradient update using Q(s', a')
                model.theta += alpha * (r + GAMMA * model.predict(s2, a2) -
                                        model.predict(s, a)) * model.grad(s, a)

                # next state becomes current state
                s = s2
                a = a2

            # track how much theta changed this step (one reasonable measure)
            biggest_change = max(biggest_change,
                                 np.abs(model.theta - old_theta).sum())
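The snippet above relies on a linear function approximator (model.theta, model.predict, model.grad) and a getQs helper that are not shown. The sketch below is only illustrative: the class name, the feature encoding in s_a_to_x, and the action set are assumptions, not the snippet's actual implementation.

import numpy as np

ALL_POSSIBLE_ACTIONS = ('U', 'D', 'L', 'R')  # assumed action set for the grid

class Model:
    def __init__(self):
        # one weight per feature produced by s_a_to_x below
        self.theta = np.random.randn(7) / np.sqrt(7)

    def s_a_to_x(self, s, a):
        # hand-crafted feature vector for a (state, action) pair;
        # the real encoding is not shown in the snippet
        return np.array([
            s[0], s[1],                    # row, column
            int(a == 'U'), int(a == 'D'),  # one-hot action indicators
            int(a == 'L'), int(a == 'R'),
            1.0,                           # bias term
        ])

    def predict(self, s, a):
        # linear approximation: Q_hat(s, a) = theta . x(s, a)
        return self.theta.dot(self.s_a_to_x(s, a))

    def grad(self, s, a):
        # gradient of a linear model w.r.t. theta is just the feature vector
        return self.s_a_to_x(s, a)

def getQs(model, s):
    # Q_hat(s, a) for every action, as a dict, so max_dict() can pick the best
    return {a: model.predict(s, a) for a in ALL_POSSIBLE_ACTIONS}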
Example #3
            print("it:", it)

        # instead of 'generating' an episode, we will PLAY
        # an episode within this loop
        s = (2, 0)  # start state
        grid.set_state(s)

        # the first (s, r) tuple is the state we start in and 0
        # (since we don't get a reward) for simply starting the game
        # the last (s, r) tuple is the terminal state and the final reward
        # the value for the terminal state is by definition 0, so we don't
        # care about updating it.
        a, _ = max_dict(Q[s])
        biggest_change = 0
        while not grid.game_over():
            a = random_action(a, eps=0.5 / t)  # epsilon-greedy
            # random action also works, but slower since you can bump into walls
            # a = np.random.choice(ALL_POSSIBLE_ACTIONS)
            r = grid.move(a)
            s2 = grid.current_state()

            # we will update Q(s,a) AS we experience the episode
            old_qsa = Q[s][a]
            # the difference between SARSA and Q-Learning is that with
            # Q-Learning we use max[a']{ Q(s',a') } in our update,
            # even if we do not end up taking that action in the next step
            a2, max_q_s2a2 = max_dict(Q[s2])
            Q[s][a] = Q[s][a] + ALPHA * (r + GAMMA * max_q_s2a2 - Q[s][a])
            biggest_change = max(biggest_change, np.abs(old_qsa - Q[s][a]))

            # we would like to know how often Q(s) has been updated too

            # next state becomes current state
            s = s2
            a = a2
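The helpers max_dict() and random_action() used throughout these examples are not shown. A minimal version consistent with how they are called (max_dict returns an (action, value) pair; random_action is epsilon-greedy over ALL_POSSIBLE_ACTIONS, which is assumed to be defined as above) would be:

import numpy as np

def max_dict(d):
    # return the (key, value) pair with the largest value
    max_key, max_val = None, float('-inf')
    for k, v in d.items():
        if v > max_val:
            max_key, max_val = k, v
    return max_key, max_val

def random_action(a, eps=0.1):
    # epsilon-greedy: keep the suggested action with probability 1 - eps,
    # otherwise pick uniformly at random from all actions
    if np.random.random() < (1 - eps):
        return a
    return np.random.choice(ALL_POSSIBLE_ACTIONS)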
    for i in range(10000):
        if i % 100 == 0:
            t += 10e-3
        if i % 2000 == 0:
            print('i', i)


        #start state
        s = (2,0)
        grid.set_state(s)


        # the first (s, r) tuple is the state we start in, with r = 0
        # the last (s, r) tuple is the terminal state; its value is 0 by
        # definition, so we don't care about updating it
        a = max_dict(Q[s])[0]
        a = random_action(a, eps = 0.5/t)
        biggest_change = 0
        while not grid.game_over():
            r = grid.move(a)
            s2 = grid.current_state()

            # we need the next action as well, since Q(s,a) depends on Q(s',a')
            # if s2 is not in the policy it is a terminal state, and all its Q values are 0
            a2 = max_dict(Q[s2])[0]
            a2 = random_action(a2, eps = 0.5/t)

            # we update Q(s,a) as we experience the episode
            alpha = ALPHA / update_counts_sa[s][a]
            update_counts_sa[s][a] += 0.005
            old_qsa = Q[s][a]
            Q[s][a] = Q[s][a] + alpha * (r + GAMMA * Q[s2][a2] - Q[s][a])
            biggest_change = max(biggest_change, np.abs(old_qsa - Q[s][a]))

            # next state becomes current state
            s = s2
            a = a2
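For a sense of scale for this adaptive learning rate: update_counts_sa[s][a] starts at 1.0 and grows by 0.005 per visit, so after 200 visits to (s, a) the count is 2.0 and the effective step size is ALPHA / 2; after 1000 visits it is ALPHA / 6. The learning rate therefore decays slowly, and separately for each state-action pair.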
Example #5
        for a in ALL_POSSIBLE_ACTIONS:
            update_counts_sa[s][a] = 1.0

    t = 1.0
    deltas = []
    for it in range(10000):
        if it % 100 == 0:
            t += 10e-3
        if it % 2000 == 0:
            print('it:', it)

        s = (2, 0)
        grid.set_state(s)

        a = max_dict(Q[s])[0]
        a = random_action(a, eps=0.5/t)
        biggest_change = 0
        while not grid.game_over():
            r = grid.move(a)
            s1 = grid.current_state()

            a1 = max_dict(Q[s1])[0]
            a1 = random_action(a1, eps=0.5/t)

            # adaptive, count-based learning rate
            alpha = ALPHA/update_counts_sa[s][a]
            update_counts_sa[s][a] += 0.005
            old_qsa = Q[s][a]

            # SARSA update: uses alpha and Q(s', a') for the action actually taken next
            Q[s][a] = Q[s][a] + alpha*(r + GAMMA*Q[s1][a1] - Q[s][a])

            biggest_change = max(biggest_change, np.abs(Q[s][a] - old_qsa))

            # next state becomes current state
            s = s1
            a = a1
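The examples above assume Q and update_counts_sa have been initialized over every state and action before the training loop. A typical initialization sketch, assuming the grid object exposes an all_states() method (not shown in these snippets):

Q = {}
update_counts_sa = {}
for s in grid.all_states():  # all_states() is assumed, not shown above
    Q[s] = {}
    update_counts_sa[s] = {}
    for a in ALL_POSSIBLE_ACTIONS:
        Q[s][a] = 0
        update_counts_sa[s][a] = 1.0  # matches the 1.0 seeded at the top of this example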
Example #6
    t = 1.0
    deltas = []
    for it in range(10000):
        if it % 100 == 0:
            t += 10e-3
        if it % 2000 == 0:
            print('it:', it)

        s = (2,0)
        grid.set_state(s)

        a = max_dict(Q[s])[0]
        
        biggest_change = 0
        while not grid.game_over():
            a = random_action(a, eps=0.5/t)
            #a = np.random.choice(ALL_POSSIBLE_ACTIONS)
            r = grid.move(a)
            s2 = grid.current_state()

            # update Q(s,a) as we experience the episode
            alpha = ALPHA/update_counts_sa[s][a]
            update_counts_sa[s][a] += 0.005

            old_qsa = Q[s][a]
            # the difference between SARSA and Q-Learning is that with
            # Q-Learning we use max[a']{ Q(s',a') } in our update,
            # even if we do not end up taking that action in the next step
            a2, max_q_s2a2 = max_dict(Q[s2])
            Q[s][a] = Q[s][a] + alpha * (r + GAMMA * max_q_s2a2 - Q[s][a])
            biggest_change = max(biggest_change, np.abs(old_qsa - Q[s][a]))

            # next state becomes current state
            s = s2
            a = a2
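For reference, the two update rules used across these examples differ only in how the next action's value enters the target (SARSA is on-policy, Q-Learning is off-policy):

    SARSA:      Q[s][a] += alpha * (r + GAMMA * Q[s2][a2] - Q[s][a])
    Q-Learning: Q[s][a] += alpha * (r + GAMMA * max_dict(Q[s2])[1] - Q[s][a])

where a2 is the action that will actually be taken from s2.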
      print "it:", it

    # instead of 'generating' an episode, we will PLAY
    # an episode within this loop
    s = (2, 0) # start state
    grid.set_state(s)

    # the first (s, r) tuple is the state we start in and 0
    # (since we don't get a reward) for simply starting the game
    # the last (s, r) tuple is the terminal state and the final reward
    # the value for the terminal state is by definition 0, so we don't
    # care about updating it.
    a, _ = max_dict(Q[s])
    biggest_change = 0
    while not grid.game_over():
      a = random_action(a, eps=0.5/t) # epsilon-greedy
      # random action also works, but slower since you can bump into walls
      # a = np.random.choice(ALL_POSSIBLE_ACTIONS)
      r = grid.move(a)
      s2 = grid.current_state()

      # adaptive learning rate
      alpha = ALPHA / update_counts_sa[s][a]
      update_counts_sa[s][a] += 0.005

      # we will update Q(s,a) AS we experience the episode
      old_qsa = Q[s][a]
      # the difference between SARSA and Q-Learning is that with Q-Learning
      # we use max[a']{ Q(s',a') } in our update,
      # even if we do not end up taking that action in the next step
      a2, max_q_s2a2 = max_dict(Q[s2])
      Q[s][a] = Q[s][a] + alpha*(r + GAMMA*max_q_s2a2 - Q[s][a])
      biggest_change = max(biggest_change, np.abs(old_qsa - Q[s][a]))

      # next state becomes current state
      s = s2
      a = a2
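After training, a greedy policy and a state-value estimate can be read off the learned Q table with the same max_dict helper (a sketch, assuming Q covers every non-terminal state):

policy = {}
V = {}
for s in Q:
    best_a, best_q = max_dict(Q[s])
    policy[s] = best_a  # greedy action at s
    V[s] = best_q       # value of s under the greedy policy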