  # assumes numpy as np and matplotlib.pyplot as plt, and that grid, policy,
  # play_game, and LEARNING_RATE are defined by the surrounding script
  # our model is V_hat = theta.dot(x)
  # where x = [row, col, row*col, 1] - the 1 is the bias term
  theta = np.random.randn(4) / 2
  def s2x(s):
    return np.array([s[0] - 1, s[1] - 1.5, s[0]*s[1] - 3, 1])

  # repeat until convergence
  deltas = []
  t = 1.0
  for it in range(20000):
    if it % 100 == 0:
      t += 0.01
    alpha = LEARNING_RATE/t
    # generate an episode using pi
    biggest_change = 0
    states_and_returns = play_game(grid, policy)
    seen_states = set()
    for s, G in states_and_returns:
      # check if we have already seen s
      # called "first-visit" MC policy evaluation
      if s not in seen_states:
        old_theta = theta.copy()
        x = s2x(s)
        V_hat = theta.dot(x)
        # grad(V_hat) wrt theta = x
        theta += alpha*(G - V_hat)*x
        biggest_change = max(biggest_change, np.abs(old_theta - theta).sum())
        seen_states.add(s)
    deltas.append(biggest_change)

  plt.plot(deltas)
  plt.show()
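
The feature map s2x used throughout these examples shifts each raw feature toward zero before the linear model is fit. Below is a quick standalone check of what it produces, assuming a 3x4 gridworld (rows 0-2, columns 0-3); the grid itself is not shown in these excerpts, so that size is an assumption.

    import numpy as np

    # Same feature map as above: [row, col, row*col, bias]. The offsets
    # (1, 1.5, 3) roughly center the first three features over an assumed
    # 3x4 grid of states.
    def s2x(s):
        return np.array([s[0] - 1, s[1] - 1.5, s[0] * s[1] - 3, 1])

    for s in [(0, 0), (1, 1), (2, 3)]:
        print(s, s2x(s))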
Example #2

    theta = np.random.randn(4) / 2

    def s2x(s):
        return np.array([s[0] - 1, s[1] - 1.5, s[0] * s[1] - 3, 1])

    deltas = []
    t = 1.0
    for it in range(20000):
        if it % 100 == 0:
            t += 0.01
        alpha = LEARNING_RATE / t

        biggest_change = 0
        states_and_returns = play_game(grid, policy)
        seen_states = set()

        for s, G in states_and_returns:
            # first-visit MC: only update on the first occurrence of s in the episode
            if s not in seen_states:
                old_theta = theta.copy()
                x = s2x(s)
                V_hat = theta.dot(x)
                theta += alpha * (G - V_hat) * x
                biggest_change = max(biggest_change,
                                     np.abs(old_theta - theta).sum())
                seen_states.add(s)
        deltas.append(biggest_change)

    plt.plot(deltas)
    plt.show()
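
These snippets all call a play_game(grid, policy) helper that is not included in the excerpts; it plays one episode under policy and hands back a list of (state, return) pairs. Below is a minimal sketch of the return computation such a helper would need, assuming the episode has already been collected as (state, reward) pairs and that a discount factor GAMMA is defined (both assumptions, not taken from the snippets).

    GAMMA = 0.9  # assumed discount factor

    def returns_from_episode(states_and_rewards):
        # Each reward is the one received on entering that state.
        # Walk the episode backwards, accumulating G_t = r_{t+1} + GAMMA * G_{t+1}.
        G = 0
        out = []
        for s, r in reversed(states_and_rewards):
            out.append((s, G))
            G = r + GAMMA * G
        out.reverse()
        return out  # the terminal state's pair (return 0) can be dropped if unwanted

    # toy episode: reward 1 on reaching the terminal state
    print(returns_from_episode([((2, 0), 0), ((2, 1), 0), ((2, 2), 1)]))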
Example #3

    theta = np.random.randn(4) / 2

    # V = theta.dot(x)
    def s2x(s):
        return np.array([s[0] - 1, s[1] - 1.5, s[0] * s[1] - 3, 1])

    delta = []
    t = 1.0
    for it in range(10000):
        if it % 100 == 0:
            t += 0.01
        biggest_change = 0
        alpha = LEARNING_RATE / t
        state_return = play_game(grid, policy)
        seen_states = set()
        for s, G in state_return:
            if s not in seen_states:
                old_theta = theta.copy()
                x = s2x(s)
                V_hat = theta.dot(x)
                # grad(V_hat) w.r.t. theta is just x
                theta += alpha * (G - V_hat) * x
                biggest_change = max(biggest_change,
                                     np.abs(old_theta - theta).sum())
                seen_states.add(s)
        delta.append(biggest_change)

    plt.plot(delta)
    plt.show()
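
Once training finishes, theta is all that is needed to read off the approximate value function: V(s) is approximated by theta.dot(s2x(s)). Here is a small, self-contained illustration of that post-training step; the 3x4 state list is an assumption, and the random theta below is only a stand-in so the sketch runs on its own (in practice you would use the trained weights).

    import numpy as np

    # Hypothetical post-training step: turn learned weights back into a value
    # table. s2x is the feature map from the examples above; theta stands in
    # for the weights left over after training (random here just so this runs).
    def s2x(s):
        return np.array([s[0] - 1, s[1] - 1.5, s[0] * s[1] - 3, 1])

    theta = np.random.randn(4) / 2  # replace with the trained theta

    states = [(i, j) for i in range(3) for j in range(4)]  # assumed 3x4 gridworld
    V_approx = {s: float(theta.dot(s2x(s))) for s in states}
    for s in sorted(V_approx):
        print(s, round(V_approx[s], 3))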