Example #1
def random_MDP_forward_reward(S, A, gamma=0.95, b=None):
    """
    GarnetMDP with reward depending only on (s,a) not s'
    """

    if b is None: b = S

    P = np.zeros((S, A, S))
    states = np.array(list(range(S)))

    for s in range(S):
        for a in range(A):
            # pick b states to be connected to.
            connected = np.random.choice(states, size=b, replace=False)
            P[s, a, connected] = random_dist(b)

    # TODO: specify how many states get rewards
    R = np.random.uniform(0, 1, (S, A, 1))
    R = np.tile(R, (1, 1, S))

    M = DiscountedMDP(
        s0=random_dist(S),
        R=R,
        P=P,
        gamma=gamma,
    )

    return M
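
A hypothetical quick check (not from the repository) that the construction above really removes the dependence on the next state: a reward built from an (S, A, 1) array and tiled along the last axis has identical slices for every s'.

import numpy as np

# Every s' slice of R equals the s'=0 slice, so the reward depends only on (s, a).
R = np.tile(np.random.uniform(0, 1, (5, 2, 1)), (1, 1, 5))
assert np.allclose(R, R[:, :, :1])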
Example #2
def test():
    S = 10

    γ = 0.7
    p0 = random_dist(S)
    P = random_dist(S, S)

    M = MarkovChain(s0=p0, P=P, γ=γ)

    d = M.d()

    assert np.allclose(d, M.d_by_eigen())

    lc = viz.lc['power']

    lc.baselines = {'eps_mach': np.finfo(np.float64).eps**2}

    lc.yscale = 'log'
    lc.xscale = 'log'
    for t, x in enumerate(M.d_by_power_iteration(iterations=50), start=1):
        err = linalg.norm(x - d)**2
        print(t, err)
        lc.update(t, power=err)

    print('done')
    lc.draw()
    pl.ioff()
    pl.show()
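
A minimal sketch of the kind of iteration `M.d_by_power_iteration` presumably performs (an assumption about the library; the actual method may be organized differently): repeatedly apply the (1-γ)-resetting transition operator and yield each iterate, which converges linearly to the distribution `M.d()` checked above.

import numpy as np

def d_by_power_iteration_sketch(s0, P, γ, iterations=50):
    """Yield successive power-iteration estimates of d = (1-γ)·s0ᵀ(I - γP)⁻¹."""
    d = s0.copy()
    for _ in range(iterations):
        d = (1 - γ) * s0 + γ * (d @ P)   # one step of the resetting chain
        yield d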
Example #3
def random_mrp(S, gamma=0.3):
    # Randomly generate an MRP.
    return MRP(
        s0 = random_dist(S),
        R = np.random.uniform(0,1,size=S),
        P = random_dist(S, S),
        gamma = gamma,
    )
Example #4
def random_MDP(S, A, gamma=0.95, b=None, r=None):
    """Randomly generated MDP

    Text taken from http://www.jmlr.org/papers/volume15/geist14a/geist14a.pdf

      "... we consider Garnet problems (Archibald et al., 1995), which are a
      class of randomly constructed finite MDPs. They do not correspond to any
      specific application, but are totally abstract while remaining
      representative of the kind of MDP that might be encountered in
      practice. In our experiments, a Garnet is parameterized by 3 parameters
      and is written G(S, A, b): S is the number of states, A is the number of
      actions, b is a branching factor specifying how many possible next states
      are possible for each state-action pair (b states are chosen uniformly at
      random and transition probabilities are set by sampling uniform random b −
      1 cut points between 0 and 1). The reward is state-dependent: for a given
      randomly generated Garnet problem, the reward for each state is uniformly
      sampled between 0 and 1."

      "The discount factor γ is set to 0.95 in all experiments."

    We consider two types of problems, “small” and “big”, respectively
    corresponding to instances G(30, 2, p=2, dim=8) and G(100, 4, p=3, dim=20)

    """

    if b is None: b = S
    if r is None: r = S

    P = np.zeros((S, A, S))
    states = np.array(list(range(S)))

    #rs = np.random.choice(states, size=r, replace=False)

    for s in range(S):
        for a in range(A):
            # pick b states to be connected to.
            connected = np.random.choice(states, size=b, replace=False)
            P[s, a, connected] = random_dist(b)

    # TODO: specify how many states get rewards
    R = np.zeros((S, A, S))
    rstates = np.random.choice(states, size=r, replace=False)
    R[rstates, :, :] = np.random.uniform(0, 1, r)

    M = DiscountedMDP(
        s0=random_dist(S),
        R=R,
        P=P,
        gamma=gamma,
    )

    return M
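
The quoted Garnet recipe pins down how each transition row is drawn: b−1 uniform cut points in [0, 1] whose gaps form the b probabilities. A minimal sketch of a `random_dist` consistent with that description (the helper actually used in these examples may instead normalize uniforms or draw from a Dirichlet):

import numpy as np

def random_dist_by_cutpoints(b):
    """Length-b probability vector from b-1 uniform cut points in [0, 1]."""
    cuts = np.sort(np.random.uniform(0, 1, size=b - 1))
    edges = np.concatenate(([0.0], cuts, [1.0]))
    return np.diff(edges)   # consecutive gaps are nonnegative and sum to 1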
Example #5
def main():
#    np.random.seed(0)
#    M = random_MDP(S = 2, A = 2, b = None, r = None, gamma = 0.9)
    M = load_example(3)

    # Use Pierre-Luc's sampling strategy to approximate the polytope
    n_policies = 10000
    random_policies = [random_dist(M.S, M.A) for _ in range(n_policies)]
    det_policies = all_det_policies(M.S, M.A)

    vfs = np.array([M.mrp(pi).V() for pi in random_policies])
    dvfs = np.array([M.mrp(pi).V() for pi in det_policies])

    r = M.r

    v = dvfs
    offset = v[:,0].ptp() * .2
    xs = [min(v[:,0])-offset, max(v[:,0])+offset, 100]
    ys = [min(v[:,1])-offset, max(v[:,1])+offset, 100]
    plot_cons(M.P, r, M.gamma, xs, ys)

    contour_plot(
        lambda v: (1-M.gamma) * M.s0 @ v,
        xs,
        ys,
    )

    sol = M.solve()
    V = sol['V']

    if 1:
        pl.scatter(vfs[:,0], vfs[:,1], s=1, c='k', alpha=0.1)
        pl.scatter(dvfs[:,0], dvfs[:,1], s=20, c='r', zorder=1000)
        pl.title('Value function polytope')

    pl.scatter(*V, c='r', s=200, marker='*')
    pl.xlabel(r'$V_{\pi}(s_0)$'); pl.ylabel(r'$V_{\pi}(s_1)$')

    if 0:
        # interpolate between two random policies
        a = random_dist(M.S, M.A)
        b = random_dist(M.S, M.A)
        ts = np.linspace(0, 1, 100)
        pl.plot(*np.array([M.V((1-t)*a + t*b) for t in ts]).T, c='orange', lw=2)

    # TODO: other interesting things -- trajectories of different algorithms.
    # one-step-deviation-based search will walk a bunch of corners --
    # snake-in-the-box can make it take exp time.  This is probably related to
    # the example used to trick simplex.

    pl.show()
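
`all_det_policies(S, A)` enumerates the Aˢ deterministic policies whose value functions are the candidate vertices of the polytope plotted above. A minimal sketch of such an enumerator (an assumption; the repository's helper may differ in layout):

import numpy as np
from itertools import product

def all_det_policies_sketch(S, A):
    """Yield all A**S deterministic policies as one-hot (S, A) matrices."""
    for actions in product(range(A), repeat=S):
        π = np.zeros((S, A))
        π[np.arange(S), list(actions)] = 1.0
        yield π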
Example #6
def test():
    methods = [
        swor_heap1,
        #        swor_heap2,
        swor_heap3,
    ]

    R = 50_000
    v = random_dist(4)

    S = {f.__name__: f(v, R) for f in methods}

    D = {name: counts(S[name]) for name in S}

    R = {}
    n = len(v)
    for z in permute(range(n)):
        R[z] = p_perm(v, z)
        for d in D.values():
            d[z] += 0   # ensure every permutation appears as a key, even if never sampled

    # Check that p_perm sums to one.
    np.testing.assert_allclose(sum(R.values()), 1)
    for name, d in sorted(D.items()):
        compare(R, d)  #.show(title=name);

    T = timers()
    R = 50
    for i in range(1, 15):
        n = 2**i
        #print('n=', n, 'i=', i)
        for _ in range(R):
            v = random_dist(n)
            np.random.shuffle(methods)
            for f in methods:
                name = f.__name__
                with T[name](n=n):
                    S = f(v, R=1)
                assert S.shape == (1, n)  # some sort of sanity check
    print('done')

    fig, ax = pl.subplots(ncols=2, figsize=(12, 5))
    T.plot_feature('n', ax=ax[0])
    fig.tight_layout()
    T.plot_feature('n', ax=ax[1])
    ax[1].set_yscale('log')
    ax[1].set_xscale('log')
    T.compare()

    pl.show()
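
For context, `p_perm(v, z)` is the probability of drawing the ordering `z` when sampling without replacement in proportion to `v`: at each step the chosen item's weight is divided by the mass still remaining. A minimal sketch of that reference probability (an assumption about the helper, but the standard definition that the normalization check above relies on):

import numpy as np

def p_perm_sketch(v, z):
    """P(ordering z) under successive sampling without replacement ∝ v."""
    p, remaining = 1.0, float(np.sum(v))
    for k in z:
        p *= v[k] / remaining
        remaining -= v[k]
    return p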
Example #7
def test_flat():

    n = 5
    w = random_dist(n)**2
    w /= w.sum()

    c = np.zeros(n)
    p = w
    reps = 10_000

    def run():
        Z = Sample.zero
        for k in range(n):
            Z += Sample(w[k], k)
        return Z

    if EAGER:
        def sampler():
            while True:
                yield run().value
    else:
        def sampler():
            yield from run()


#    sample = lazy_sampler()
    sample = iter(sampler())

    for r in range(1, 1+reps):
        _, z = next(sample)
        c[z] += 1
        if r % 10_000 == 0:
            print(f'err({r})=', 0.5*np.abs(p - c/r).sum())
Example #8
def test_subsets():
    from swor.cps import ConditionalPoissonSampling

    n = 5
    K = 3
    w = random_dist(n)

    cps = ConditionalPoissonSampling(w, K)
    p = {Y: cps.score(Y)/cps.Z for Y in cps.domain()}

    reps = 10_000
    c = {Y: 0 for Y in p}

    if EAGER:
        def sampler():
            while True:
                _, y = subsets(w, K, Sample).value
                yield frozenset(extract(y))

    else:
        def sampler():
            for _, y in subsets(w, K, Sample):
                yield frozenset(extract(y))


    sample = iter(sampler())

    for r in range(1, 1+reps):
        Y = next(sample)
        c[Y] += 1
        if r % 10_000 == 0:
            print(f'err({r})=', 0.5*sum(abs(p[x] - c[x]/r) for x in p))
Example #9
def test():
    print()
    print('Finite-horizon tests:', ok)

    S = 10
    A = 3
    M = FiniteHorizonMDP(
        s0=random_dist(S),
        R=np.random.uniform(0, 1, size=(S, A, S)),
        P=random_dist(S, A, S),
        T=20,
    )

    p = random_dist(M.S, M.A)
    assert abs(M.d(p).sum() - M.T) / M.T < 1e-5

    test_pd_lemma_finite_horizon(M)
Example #10
def test_stationary(M):
    print('[test stationary]')

    π = random_dist(M.S, M.A)
    [_, _, γ, r] = M = M | π
    T = 1 / (1 - γ)

    d1 = M.d()
    d2 = M.d_by_eigen()
    assert compare(d1, d2).max_relative_error < 1e-5

    J0 = M.J()
    d0 = M.d()

    def estimate(N):
        d = np.zeros(M.S)
        J = 0.0
        for t, [s, r, _] in enumerate(M.run(), start=1):
            if t >= N: break

            d += (onehot(s, M.S) - d) / t

            # Note the 'importance sampling correction' T, which accounts for
            # the (1-γ)-resetting dynamics.
            J += (r * T - J) / t

            if t % 1000 == 0:
                yield [
                    t,
                    0.5 * abs(J - J0),
                    0.5 * abs(d - d0).sum(),
                ]

    ns, J_err, d_err = np.array(list(estimate(1_000_000))).T

    dmax = 1
    Jmax = T * r.max()  # scaled by T because of the importance sampling correction.

    # Very loose bounds on total variation distance
    J_bnd = Jmax / np.sqrt(ns)
    d_bnd = M.S * dmax / np.sqrt(ns)

    if 0:
        # Error decays at a rate of 1/sqrt(N)
        pl.title('performance estimate')
        pl.loglog(ns, J_bnd, label='error bound')
        pl.loglog(ns, J_err, label='error observed')
        pl.show()

        pl.title('distribution estimate')
        pl.loglog(ns, d_bnd, label='error bound')
        pl.loglog(ns, d_err, label='error observed')
        pl.show()

    assert (J_err <= J_bnd).all()
    assert (d_err <= d_bnd).all()
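
The importance-sampling correction T = 1/(1−γ) above matches (1−γ)-resetting dynamics: with probability 1−γ at each step the chain restarts from s0, so the empirical state frequencies converge to the normalized discounted occupancy d and r·T is an unbiased estimate of J. A minimal sketch of such a simulator (an assumption about what `M.run()` does, not its actual code):

import numpy as np

def run_with_resetting(s0, P, R, γ):
    """Yield (s, r, s') from a chain that resets to s0 with probability 1-γ."""
    S = len(s0)
    s = np.random.choice(S, p=s0)
    while True:
        r = R[s]
        if np.random.uniform() < 1 - γ:
            sp = np.random.choice(S, p=s0)      # reset step
        else:
            sp = np.random.choice(S, p=P[s])    # ordinary transition
        yield s, r, sp
        s = sp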
Example #11
def test_performance_difference_lemma_discounted(M):
    """
    Evaluate the performance difference of `p` over `q` based on roll-outs from
    `q` and roll-ins from `p`.
    """

    p = random_dist(M.S, M.A)
    q = random_dist(M.S, M.A)

    dp = M.d(p)  # Roll-in with p
    Aq = M.Advantage(q)  # Roll-out with q
    # Accumulate advantages of p over q.
    z = 1 / (1 - M.γ) * sum(dp[s] * p[s, :] @ Aq[s, :] for s in range(M.S))

    assert np.allclose(M.J(p) - M.J(q), z)
    print('[pd-lemma]', ok)

    # The PD lemma is just potential-based shaping.
    #   See `test_potential_based_shaping` to read about potential-based shaping.
    #
    # Let `ϕ(s) = Vq(s)` where `Vq(s)` is the value function of some policy `q`.
    # The shaped reward is
    #
    #   R'(s,a,s') = R(s,a,s') + γ Vq(s') - Vq(s)
    #
    # Now take the expectation over s',
    #
    #   E_{s'}[ R'(s,a,s') ]
    #     = E_{s'}[ R(s,a,s') + γ Vq(s') - Vq(s) ]
    #     = E_{s'}[ R(s,a,s') + γ Vq(s')  ]  - Vq(s)
    #     = Qq(s,a) - Vq(s).
    #     = Aq(s, a)
    #
    # We see that the shaped reward function is the advantage of policy `q`.

    ϕ = M.V(q)
    M1 = M.copy()
    M1.apply_potential_based_shaping(ϕ)

    assert_equal(M1.J(p), M.J(p) - M.J(q), verbose=True)

    # Sanity check: q should have no advantage over itself.
    assert abs(M1.J(q)) < 1e-10
Example #12
def test_performance_difference_lemma_discounted(M):
    """
    Evaluate the performance difference of `p` over `q` based on roll-outs from
    `q` and roll-ins from `p`.
    """
    # Connection to performance-difference lemma.
    #
    # If we take ϕ(s) = Vq(s), the value function of an arbitrary policy q,
    #
    #   R'(s,a,s') = R(s,a,s') + γ Vq(s') - Vq(s)
    #
    # And then take the expectation over s',
    #
    #   E_{s'}[ R'(s,a,s') ]
    #     = E_{s'}[ R(s,a,s') + γ Vq(s') - Vq(s) ]
    #     = E_{s'}[ R(s,a,s') + γ Vq(s')  ]  - Vq(s)
    #     = Qq(s,a) - Vq(s).
    #
    # We see that the effective reward function is the advantage.
    #
    # TODO: Now, the question is what does the action-value function look like after
    # shaping?
    #
    # TODO: There is some discussion in the Ng and Russell papers about the idealized
    # case of value function (p's value function, not q's).
    #
    #   - I think it's quite simple. When p=q, the advantage is always
    #     zero. Therefore, variance is zero.
    #
    #   - V* is also an interesting case, which is closer to the SEARN case.

    # TODO: PD-lemma and the derivative of a policy mixture.

    p = random_dist(M.S, M.A)
    q = random_dist(M.S, M.A)

    dp = M.d(p)           # Roll-in with p
    Aq = M.Advantage(q)   # Roll-out with q
    # Accumulate advantages of p over q.
    z = 1/(1-M.gamma) * sum(dp[s] * p[s,:] @ Aq[s,:] for s in range(M.S))

    assert np.allclose(M.J(p) - M.J(q), z)
    print('[pd-lemma]', ok)
Example #13
def test_pd_lemma_finite_horizon(M):
    """
    Evaluate the performance difference of `p` over `q` based on roll-outs from
    `q` and roll-ins from `p`.
    """
    p = random_dist(M.S, M.A)
    q = random_dist(M.S, M.A)

    Jq, Vq, Qq = M.value(q)  # Roll-out with q
    dp = M.d(p)  # Roll-in with p. Note that dp sums to T, not 1.
    #assert dp.sum() == M.T

    Jp, _, _ = M.value(p)  # Value p.
    # Accumulate advantages of p over q.
    z = 0.0
    for t in range(M.T):
        for s in range(M.S):
            A = p[s, :] @ Qq[t, s, :] - Vq[t, s]
            z += dp[t, s] * A
    assert np.allclose(Jp - Jq, z)
    print('[pd-lemma]', ok)
Example #14
def test_gradients(M):

    J = lambda: M.J(π)

    π = random_dist(M.S, M.A)
    r = M.r

    # The policy gradient theorem
    fdcheck(
        J,
        π,
        1 / (1 - M.γ) * M.d(π)[:, None] *
        M.Q(π),  # Note: Q is not interchangeable with Advantage here!
    )  #.show(title='policy gradient v1.')

    print('[policy gradient theorem]', ok)

    # Jacobians of the implicit d(p) and v(p) functions.
    z = spherical(M.S)
    _d, d_grad = M.d(π, jac=True)
    fdcheck(lambda: z @ M.d(π), π, d_grad(z))  #.show(title='implicit d')
    _v, v_grad = M.V(π, jac=True)
    fdcheck(lambda: z @ M.V(π), π, v_grad(z))  #.show(title='implicit v')

    # check that the implicit functions are consistent with the other methods for computing them.
    assert np.allclose(_d, M.d(π))
    assert np.allclose(_v, M.V(π))

    # The policy gradient theorem
    #    fdcheck(J, p,
    #            1/(1-M.γ) * (
    #                np.einsum('s,sa->sa', M.d(p), M.Advantage(p))
    #                + (M.d(p) * M.V(p))[:,None]
    #            )
    #    ) .show(title='policy gradient v1.')

    # Extract the full Jacobian, flatten SA dim of policy
    Jdp = np.zeros((M.S, M.S * M.A))
    for s in range(M.S):
        Jdp[s, :] = d_grad(onehot(s, M.S)).flat

    # The check below chains the derivative of J through d(π), using the product rule on d(π)·(π·r).
    fdcheck(J, π, 1 / (1 - M.γ) * (np.einsum('sa,sa->s', r, π) @ Jdp +
                                   np.einsum('s,sa->sa', M.d(π), r).flatten())
            )  #.show(title='policy gradient v2.')

    # Extract the full Jacobian, flatten SA dim of policy
    Jvp = np.zeros((M.S, M.S * M.A))
    for s in range(M.S):
        Jvp[s, :] = v_grad(onehot(s, M.S)).flat
    fdcheck(J, π, M.s0 @ Jvp)  #.show(title='policy gradient v2a.')
    fdcheck(J, π, v_grad(M.s0))  #.show(title='policy gradient v2b.')
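
`fdcheck(f, x, g)` compares the analytic gradient `g` of the scalar thunk `f` (which reads `x` by reference) against central finite differences. The helper used here returns a comparison object with a `.show()` method; a minimal stand-in that just asserts agreement might look like this (a sketch, not the library's implementation):

import numpy as np

def fdcheck_sketch(f, x, g, eps=1e-5, atol=1e-4):
    """Central-difference check of gradient g for scalar f() that reads x in place."""
    fd = np.zeros_like(x)
    for i in np.ndindex(*x.shape):
        old = x[i]
        x[i] = old + eps; fplus = f()
        x[i] = old - eps; fminus = f()
        x[i] = old                       # restore the perturbed entry
        fd[i] = (fplus - fminus) / (2 * eps)
    assert np.allclose(fd, g, atol=atol), np.abs(fd - g).max()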
Example #15
def test_J(M):
    # Test a single-state MRP
    # Sanity check: Why is there a 1/(1-γ) here?
    # if there is 1 state {
    #   rewards    = [r]
    #   stationary = [1]
    #   value      = r + γ value
    #              = r / (1-γ)
    #   J          = r / (1-γ)
    # }
    m1 = random_mrp(1)
    assert np.allclose(m1.J(), m1.R / (1-m1.gamma))

    # Test equivalence of various methods for computing J.
    π = random_dist(M.S, M.A)
    [α, _, γ, r] = m = M | π

    T = 1 / (1-γ)

    J_by_V = α @ m.V()
    J_by_d = T * m.d() @ r
    J_by_S = α @ m.successor_representation() @ r

    J = m.J()
    assert np.allclose(J_by_d, J)
    assert np.allclose(J_by_S, J)
    assert np.allclose(J_by_V, J)

    # The reason why we have this equivalence is simply because of where we put
    # the parentheses
    #   (α @ m.successor_representation()) @ r
    #     = T dᵀ @ r
    # vs
    #   α @ (m.successor_representation() @ r)
    #     = α @ v

    assert np.allclose(α @ m.successor_representation(), T * m.d())
    assert np.allclose(m.successor_representation() @ r, m.V())

    # [2018-09-26 Wed] The following idea was tempting, but wrong! Here is where
    # my logic broke down: In the case of MDPs, we can use the performance
    # difference lemma (PD) to create a similar equation.  However, PD relates
    # the expected advantage function under a stationary distribution to the
    # difference of J's.  In the special case of a single PD of a policy versus
    # itself, we have that J'-J should be zero.  Note that the advantage of a
    # policy against itself is just the reward function.
    #
    # J_by_dV = T @ M.d() @ M.V()   # <=== INCORRECT!

    print('[test J]', ok)
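
The identities checked above follow from the closed forms for the successor representation and the occupancy measure. A minimal sketch of both for an MRP (s0, P, γ), under the conventions these tests use (d sums to one; V = successor_representation @ r):

import numpy as np

def successor_representation(P, γ):
    """(I - γP)⁻¹: discounted expected visitation counts."""
    return np.linalg.inv(np.eye(len(P)) - γ * P)

def occupancy(s0, P, γ):
    """Normalized discounted occupancy d = (1-γ)·s0ᵀ(I - γP)⁻¹."""
    return (1 - γ) * s0 @ successor_representation(P, γ)

With these, α @ successor_representation(P, γ) equals T·d and successor_representation(P, γ) @ r equals V, which is exactly the pair of facts the two assertions above verify.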
Example #16
File: mrp.py  Project: timvieira/rl
def random_MRP(S, γ=0.95, b=None, r=None):
    if b is None: b = S
    if r is None: r = S

    P = np.zeros((S, S))
    states = np.array(list(range(S)))

    #rs = np.random.choice(states, size=r, replace=False)

    for s in range(S):
        # pick b states to be connected to.
        connected = np.random.choice(states, size=b, replace=False)
        P[s, connected] = random_dist(b)

    R = np.zeros(S)
    rstates = np.random.choice(states, size=r, replace=False)
    R[rstates] = np.random.uniform(0, 1, r)

    return MRP(
        s0=random_dist(S),
        R=R,
        P=P,
        γ=γ,
    )
Example #17
def test_dual_representation(mdp):
    # Wang et al. 2008. "Dual Representations for Dynamic Programming" JMLR.
    # https://webdocs.cs.ualberta.ca/~dale/papers/dualdp.pdf

    S = range(mdp.S)
    A = range(mdp.A)
    γ = mdp.γ

    π = random_dist(mdp.S, mdp.A)
    Q = mdp.Q(π)
    V = mdp.V(π)
    R = mdp.r
    P = mdp.P

    Π = mdp.Π(π)

    # Wang08's H matrix, which I'll call W, is a Markov chain over (s,a) ->
    # (s'', a'')
    W = mdp.sasa_matrix(π, normalize=True)
    F = mdp.successor_representation(π, normalize=True)

    # Lemma 4
    assert np.all(F >= 0)
    assert np.allclose(1.0, np.einsum('ik->i', F))

    # Lemma 10 W ≥ 0 and W @ 1 = 1
    assert np.all(W >= 0)
    assert np.allclose(1.0, np.einsum('iakc->ia', W))

    # Q as a function of W
    assert np.allclose(Q * (1 - γ), np.einsum('iakb,kb->ia', W, R))

    # Check that W solves our equations
    for k in S:
        for c in A:
            for i in S:
                for a in A:
                    assert np.allclose(
                        W[i, a, k, c],
                        (1 - γ) * ((i, a) == (k, c))
                        + γ * sum(W[j, b, k, c] * π[j, b] * P[i, a, j]
                                  for j in S for b in A))

    # Lemma 13
    assert np.allclose(V, Π @ Q.flat)

    # Lemma 14
    assert np.allclose(F @ Π, Π @ W.reshape(mdp.S * mdp.A, mdp.S * mdp.A))
Example #18
def test_policy_matrix(M):
    π = random_dist(M.S, M.A)
    Π = M.Π(π)
    m = M.mrp(π)

    assert np.allclose(Π @ M.r.reshape(M.S*M.A), m.R.flatten())
    assert np.allclose(Π @ M.P.reshape((M.S*M.A, M.S)), m.P)

    # Markov chain over state-action pairs <s,a> -> <s',a'>
    X = np.zeros((M.S, M.A, M.S, M.A))
    for s in range(M.S):
        for a in range(M.A):
            for sp in range(M.S):
                for ap in range(M.A):
                    X[s,a,sp,ap] = M.P[s,a,sp] * π[sp, ap]

    X = X.reshape((M.S*M.A, M.S*M.A))
    assert np.allclose(M.P.reshape((M.S*M.A, M.S)) @ Π, X)

    print('[policy matrix]', colors.light.green % 'ok')
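
The policy matrix Π used above is the S × S·A matrix with Π[s, (s, a)] = π[s, a] and zeros elsewhere, so multiplying by Π takes expectations over actions under π. A minimal sketch of its construction (an assumption; `M.Π(π)` may be built differently but equivalently):

import numpy as np

def policy_matrix(π):
    """Π with Π[s, s*A + a] = π[s, a], matching the row-major reshape of (S, A)."""
    S, A = π.shape
    Π = np.zeros((S, S * A))
    for s in range(S):
        Π[s, s * A:(s + 1) * A] = π[s]
    return Π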
Example #19
def test_potential_based_shaping(M0):
    S = M0.S; A = M0.A; s0 = M0.s0

    opt_π = M0.solve()['policy']

    # generate a random potential function
    ϕ = np.random.uniform(-1, 1, size=S)

    M1 = M0.copy()    # use a copy!
    M1.apply_potential_based_shaping(ϕ)

    # Check that both methods found the same policy
    original = M0.solve()
    shaped = M1.solve()
    assert np.allclose(shaped['policy'], original['policy'])

#    opt_π = M0.solve()['policy']

    π = random_dist(S, A)

    v0 = M0.V(π)
    v1 = M1.V(π)

    # Corollary 2 of Ng et al. (1999).
    assert np.allclose(v0, v1 + ϕ)

    # Advantage is invariant to shaping.
    assert np.allclose(M0.Advantage(π), M1.Advantage(π))

    # The difference in J only depends on the initial state
    assert np.allclose(M0.J(π), M1.J(π) + s0 @ ϕ)
    print('[potential-based shaping] relationship between expected values and value functions', ok)

    # shaping with the optimal value function
    # TODO: are there other interesting things to say about this setting?
    vstar = original['V']
    M2 = M0.copy()  # use a copy of R!
    M2.apply_potential_based_shaping(vstar)

    assert np.allclose(0, M2.V(opt_π))  # the optimal policy has V=0 everywhere; everything else is nonpositive
    assert (M2.V(π) <= 0).all()         # suboptimal policies have negative value everywhere

    # The optimal policy in the "optimally shaped MDP" can be found with gamma=0!
    M2.gamma *= 0
    assert (M2.solve()['policy'] == opt_π).all()

    # The optimal policy in M2 requires *zero* steps of lookahead (i.e., just
    # optimize immediate reward). The proof is pretty trivial.
    #
    # Given the V*-shaped reward R':
    #    R'[s,a,s'] def= R[s,a,s'] + γ V*[s'] - V*[s]
    #
    # R'[s,a] = sum_{s'} p(s' | s, a) * R'[s,a,s']
    #         = sum_{s'} p(s' | s, a) * (R[s,a,s'] + γ V*[s'] - V*[s])
    #         = A*(s,a)
    #
    # Acting greedily according to A*(s,a) is clearly optimal. Nonetheless, we
    # have an explicit test below.
    assert np.allclose(M2.r, M0.Advantage(opt_π))
    M2_r = M2.r
    myopic_π = (M2_r == M2_r.max(axis=1)[:,None]) * 1.0
    assert np.allclose(myopic_π, opt_π)

    print('[potential-based shaping] "optimal shaping"', ok)
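
For reference, potential-based shaping only rewrites the reward tensor, R'(s,a,s') = R(s,a,s') + γ·ϕ(s') − ϕ(s). A minimal sketch of what `apply_potential_based_shaping` presumably computes for an (S, A, S) reward (an assumption, not the repository's code):

import numpy as np

def shaped_reward(R, γ, ϕ):
    """Return R'(s,a,s') = R(s,a,s') + γ·ϕ(s') - ϕ(s) for an (S, A, S) reward tensor."""
    return R + γ * ϕ[None, None, :] - ϕ[:, None, None]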