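    # NOTE: a sketch of the interface assumed below (the real definitions live
    # elsewhere in this file and may differ). `model` is a linear approximator
    # over state-action features with weight vector `model.theta`, roughly:
    #     model.predict(s, a)  ->  model.theta.dot(x(s, a))
    #     model.grad(s, a)     ->  x(s, a)   (gradient of a linear model w.r.t. theta)
    # where x(s, a) is the feature vector for the pair (s, a).
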
    # t controls the epsilon decay, t2 controls the learning rate decay
    t = 1.0
    t2 = 1.0
    deltas = []  # per-episode record of how much theta changed
    for it in range(20000):
        
        s = (2, 0)
        grid.set_state(s)

        Qs = getQs(model, s)
        
        a = max_dict(Qs)[0]
        a = random_action(a, 0.5/t)
        biggest_change = 0

        while not grid.game_over():
            r = grid.move(a)
            s2 = grid.current_state()

            old_theta = model.theta.copy()

            if grid.is_terminal(s2):
                model.theta += alpha * (
                    r - model.predict(s, a)
                ) * model.grad(s, a)
            else:
                Qs2 = getQs(model, s2)
                a2 = max_dict(Qs2)[0]
        # t and t2 grow slowly, so epsilon (0.5/t) and alpha (ALPHA/t2)
        # decay over the course of training
        if it % 100 == 0:
            t += 0.01
            t2 += 0.01
        if it % 1000 == 0:
            print("it:", it)
        alpha = ALPHA / t2

        # instead of 'generating' an episode, we PLAY an episode within this loop
        s = (2, 0)  # start state
        grid.set_state(s)

        # get Q(s) so we can choose the first action
        Qs = getQs(model, s)

        # the first (s, r) pair is the start state with reward 0 (we get no
        # reward just for starting the game); the last (s, r) pair is the
        # terminal state and the final reward. The value of the terminal
        # state is 0 by definition, so we never need to update it.
        a = max_dict(Qs)[0]
        a = random_action(a, eps=0.5/t)  # epsilon-greedy
        biggest_change = 0
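
        # The helpers used above are assumed to look roughly like this
        # (sketch only; the actual definitions live elsewhere in this file,
        # and ALL_POSSIBLE_ACTIONS is assumed to be the grid's action set):
        #
        #   def getQs(model, s):
        #       # dict of Q_hat(s, a) for every action, so max_dict can argmax it
        #       return {a: model.predict(s, a) for a in ALL_POSSIBLE_ACTIONS}
        #
        #   def random_action(a, eps=0.1):
        #       # epsilon-greedy: keep the greedy action with prob 1 - eps,
        #       # otherwise pick an action uniformly at random
        #       if np.random.random() < (1 - eps):
        #           return a
        #       return np.random.choice(ALL_POSSIBLE_ACTIONS)
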
        # play the episode, updating Q(s,a) AS we experience it (semi-gradient SARSA)
        while not grid.game_over():
            r = grid.move(a)
            s2 = grid.current_state()

            # we need the next action as well, since Q(s,a) depends on Q(s',a')
            # if s2 is terminal, every Q(s2, a') is 0, so the target is just r
            old_theta = model.theta.copy()
            if grid.is_terminal(s2):
                model.theta += alpha * (r - model.predict(s, a)) * model.grad(s, a)
            else:
                # not terminal: pick a' epsilon-greedily from the approximate Q(s2, .)
                Qs2 = getQs(model, s2)
                a2 = max_dict(Qs2)[0]
                a2 = random_action(a2, eps=0.5/t)  # epsilon-greedy

                model.theta += alpha * (r + GAMMA * model.predict(s2, a2) - model.predict(s, a)) * model.grad(s, a)

                # next state becomes current state
                s = s2
                a = a2

            biggest_change = max(biggest_change, np.abs(model.theta - old_theta).sum())
        deltas.append(biggest_change)
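
        # The two theta updates above implement semi-gradient SARSA:
        #     target = r                              if s' is terminal
        #     target = r + GAMMA * Q_hat(s', a')      otherwise
        #     theta <- theta + alpha * (target - Q_hat(s, a)) * grad_theta Q_hat(s, a)
        # Assuming the linear model sketched earlier, grad_theta Q_hat(s, a) is
        # just the feature vector x(s, a), so each step nudges theta toward the
        # current TD target.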