Example #1
def generate_polytope_densities():
    n_states, n_actions = 2, 2
    pis = utils.gen_grid_policies(41)

    nx = 4
    ny = 5
    plt.figure(figsize=(16, 16))

    for i in range(nx * ny):
        print(i)
        mdp = utils.build_random_mdp(n_states, n_actions, 0.5)
        Vs = utils.polytope(mdp.P, mdp.r, mdp.discount, pis)
        # assign every policy the same (unnormalised) probability
        p_pi = 0.1
        pVs = [
            density_value_functional(p_pi, mdp.P, mdp.r, pi, mdp.discount)
            for pi in pis
        ]

        plt.subplot(nx, ny, i + 1)

        sc = plt.scatter(Vs[:, 0], Vs[:, 1], c=pVs)
        # plt.colorbar()
        sc.axes.get_xaxis().set_visible(False)
        sc.axes.get_yaxis().set_visible(False)

    plt.tight_layout()
    plt.show()
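A minimal sketch of what utils.polytope is assumed to compute, following the
P[s', s, a] einsum convention used in Example #3 (the repo's actual
implementation may differ): stack V_pi = (I - discount * P_pi^T)^{-1} r_pi
for every policy in the grid.

import numpy as np

def value_functional(P, r, pi, discount):
    P_pi = np.einsum('ijk,jk->ij', P, pi)  # P_pi[s', s]: dynamics under pi
    r_pi = np.einsum('jk,jk->j', r, pi)    # expected reward in each state
    S = P.shape[0]
    return np.linalg.solve(np.eye(S) - discount * P_pi.T, r_pi)

def polytope(P, r, discount, pis):
    return np.stack([value_functional(P, r, pi, discount) for pi in pis])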
Example #2
def generate_model_iteration():
    n_states, n_actions = 2, 2
    mdp = utils.build_random_mdp(n_states, n_actions, 0.5)
    pis = utils.gen_grid_policies(7)

    init = rnd.standard_normal(
        (mdp.S * mdp.S * mdp.A + mdp.S * mdp.A)
    )  # the model needs its own init; alternatively, find an init whose value matches the other inits

    vs = utils.polytope(mdp.P, mdp.r, mdp.discount, pis)

    plt.figure(figsize=(16, 16))
    plt.scatter(vs[:, 0], vs[:, 1], c='b', s=10, alpha=0.75)

    lr = 0.01
    pi_star = utils.solve(policy_iteration(mdp),
                          utils.softmax(rnd.standard_normal(
                              (mdp.S, mdp.A))))[-1]

    # adversarial pis
    apis = utils.get_deterministic_policies(mdp.S, mdp.A)
    apis = np.stack(apis)

    update_fn = model_iteration(mdp, lr, apis)
    params = utils.solve(update_fn, init)
    params = [parse_model_params(mdp.S, mdp.A, p) for p in params]

    vs = np.vstack([
        utils.value_functional(utils.softmax(p_logits), r, pi_star,
                               mdp.discount).T for p_logits, r in params
    ])

    n = vs.shape[0]
    plt.scatter(vs[0, 0], vs[0, 1], c='g', label='PG')  # start point
    plt.scatter(vs[1:-1, 0], vs[1:-1, 1], c=range(n - 2), cmap='spring', s=10)
    plt.scatter(vs[-1, 0], vs[-1, 1], c='g', marker='x')  # final point

    p_logits, r = params[-1]
    vs = utils.polytope(utils.softmax(p_logits), r, mdp.discount, pis)
    plt.scatter(vs[:, 0], vs[:, 1], c='r', s=10, alpha=0.75)
    plt.title('Model iteration')
    plt.xlabel('Value of state 1')
    plt.ylabel('Value of state 2')

    # plt.show()
    plt.savefig('figs/model_iteration_1.png')

    learned_mdp = utils.MDP(mdp.S, mdp.A, utils.softmax(p_logits), r,
                            mdp.discount, mdp.d0)
    pi_star_approx = utils.solve(
        policy_iteration(learned_mdp),
        utils.softmax(rnd.standard_normal((mdp.S, mdp.A))))[-1]
    print(pi_star_approx, '\n', pi_star)
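parse_model_params is not shown; a hedged sketch matching the flat init of
size S*S*A + S*A built above (the first block is assumed to hold the
transition logits, the remainder the rewards):

def parse_model_params(S, A, params):
    p_logits = params[:S * S * A].reshape((S, S, A))  # transition logits
    r = params[S * S * A:].reshape((S, A))            # reward table
    return p_logits, r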
Example #3
def lmdp_field():
    """
    For each policy.
    Calculate its dynamics, P_pi.
    Estimate the value via the LMDP.
    Plot difference under linearTD operator.
    """
    n_states, n_actions = 2, 2
    pis = utils.gen_grid_policies(11)

    mdp = utils.build_random_mdp(n_states, n_actions, 0.5)

    p, q = lmdps.mdp_encoder(mdp.P, mdp.r)

    vs = []
    dvs = []
    for pi in pis:
        u = np.einsum('ijk,jk->ij', mdp.P, pi)
        v = lmdps.linear_value_functional(p, q, u, mdp.discount)
        z = np.exp(v)
        Tz = lmdps.linear_bellman_operator(p, q, z, mdp.discount)
        dv = np.log(Tz) - np.log(z)

        vs.append(v)
        dvs.append(dv)

    dvs = np.vstack(dvs)
    vs = np.vstack(vs)

    normed_dvs = utils.normalize(dvs)

    plt.figure(figsize=(16, 16))
    plt.subplot(1, 2, 1)
    plt.title('Linearised Bellman operator')
    plt.quiver(vs[:, 0], vs[:, 1], normed_dvs[:, 0], normed_dvs[:, 1],
               np.linalg.norm(dvs, axis=1))

    # plot bellman
    Vs = utils.polytope(mdp.P, mdp.r, mdp.discount, pis)
    def diff_op(V):
        V = np.expand_dims(V, 1)
        return utils.bellman_optimality_operator(mdp.P, mdp.r, V, mdp.discount) - V

    dVs = np.stack([np.max(diff_op(V), axis=1) for V in Vs])

    normed_dVs = utils.normalize(dVs)

    plt.subplot(1, 2, 2)
    plt.title('Bellman operator')
    plt.quiver(Vs[:, 0], Vs[:, 1], normed_dVs[:, 0], normed_dVs[:, 1],
               np.linalg.norm(dVs, axis=1))

    # plt.savefig('figs/LBO_BO.png')
    plt.show()
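utils.normalize is assumed to rescale each row to unit length, so the quiver
arrows show direction only while their colour (the norm passed as the last
argument) carries the magnitude. A minimal sketch:

import numpy as np

def normalize(x, eps=1e-8):
    return x / (np.linalg.norm(x, axis=1, keepdims=True) + eps)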
Example #4
def compare_mdp_lmdp():
    n_states, n_actions = 2, 2
    mdp = utils.build_random_mdp(n_states, n_actions, 0.9)
    pis = utils.gen_grid_policies(7)
    vs = utils.polytope(mdp.P, mdp.r, mdp.discount, pis)

    plt.figure(figsize=(16, 16))
    plt.scatter(vs[:, 0], vs[:, 1], s=10, alpha=0.75)

    # solve via LMDPs
    p, q = lmdps.mdp_encoder(mdp.P, mdp.r)
    u, v = lmdps.lmdp_solver(p, q, mdp.discount)
    pi_u_star = lmdps.lmdp_decoder(u, mdp.P)

    pi_p = lmdps.lmdp_decoder(p, mdp.P)

    # solve MDP
    init = np.random.standard_normal((n_states, n_actions))
    pi_star = utils.solve(search_spaces.policy_iteration(mdp), init)[-1]
    # pi_star = onehot(np.argmax(qs, axis=1), n_actions)

    # evaluate both policies.
    v_star = utils.value_functional(mdp.P, mdp.r, pi_star, mdp.discount)
    v_u_star = utils.value_functional(mdp.P, mdp.r, pi_u_star, mdp.discount)
    v_p = utils.value_functional(mdp.P, mdp.r, pi_p, mdp.discount)

    plt.scatter(v_star[0, 0],
                v_star[1, 0],
                c='m',
                alpha=0.5,
                marker='x',
                label='mdp')
    plt.scatter(v_u_star[0, 0],
                v_u_star[1, 0],
                c='g',
                alpha=0.5,
                marker='x',
                label='lmdp')
    plt.scatter(v_p[0, 0], v_p[1, 0], c='k', marker='x', alpha=0.5, label='p')
    plt.legend()
    plt.show()
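To quantify, rather than eyeball, how close the LMDP solution gets to the MDP
optimum, one might append a numeric check after the plot:

print('max |V*_mdp - V*_lmdp| =', np.max(np.abs(v_star - v_u_star)))
print('max |V*_mdp - V_p|     =', np.max(np.abs(v_star - v_p)))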
Example #5
def plot():
    n_states = 2
    n_actions = 2
    mdp = utils.build_random_mdp(n_states, n_actions, 0.5)
    value = vmap(
        lambda pi: utils.value_functional(mdp.P, mdp.r, pi, mdp.discount))

    pis = np.stack(utils.gen_grid_policies(101), axis=0)
    vs = value(pis)
    plt.scatter(vs[:, 0], vs[:, 1], s=10)

    pis = np.stack(utils.get_deterministic_policies(2, 2), axis=0)
    vs = value(pis)
    plt.scatter(vs[:, 0], vs[:, 1], s=10, c='r')

    plt.xlabel('The value of state 1')
    plt.ylabel('The value of state 2')

    plt.title('The value polytope')

    plt.show()
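For the 2-state, 2-action case a policy is fixed by the probability of the
first action in each state, so a grid over policies is a grid over the unit
square. A hedged sketch of the assumed utils.gen_grid_policies:

import numpy as np

def gen_grid_policies(n):
    ps = np.linspace(0, 1, n)
    return [np.array([[p0, 1 - p0], [p1, 1 - p1]])
            for p0 in ps for p1 in ps]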
Example #6
def emp_est_snr_map():
    n_states, n_actions = 2, 2
    mdp = utils.build_random_mdp(n_states, n_actions, 0.5)
    pis = utils.gen_grid_policies(5)
    vals = utils.polytope(mdp.P, mdp.r, mdp.discount, pis)

    variances = []
    hs = []
    for i, pi in enumerate(pis):
        print('\r{}'.format(i), end='', flush=True)

        variances.append(est_var_R(mdp, pi))
        hs.append(utils.entropy(pi))

    plt.subplot(2, 1, 1)
    plt.scatter(vals[:, 0], vals[:, 1], c=hs)
    plt.subplot(2, 1, 2)
    plt.scatter(vals[:, 0], vals[:, 1], c=variances)
    # plt.subplot(3,1,1)
    # plt.scatter(vals[:, 0], vals[:, 0], c=hs)
    plt.show()
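utils.entropy is used here only as a colour scale; a minimal sketch, assuming
it sums the entropy of the per-state action distributions:

import numpy as np

def entropy(pi, eps=1e-8):
    return -np.sum(pi * np.log(pi + eps))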
Example #7
def hyperbolic_polytope():
    # https://arxiv.org/abs/1902.06865
    n_states, n_actions = 2, 2
    N = 21
    pis = utils.gen_grid_policies(N)
    mdp = utils.build_random_mdp(n_states, n_actions, None)

    n = 10
    discounts = np.linspace(0.1, 1 - 1e-4, n)
    Vs = []
    for discount in discounts:
        # weight by (1 - discount) to normalise each value's scale
        Vs.append((1 - discount) * utils.polytope(mdp.P, mdp.r, discount, pis))

    # averaging over discounts approximates a hyperbolically discounted value
    h_V = sum(Vs) / n

    plt.subplot(2, 1, 1)
    plt.scatter(h_V[:, 0], h_V[:, 1])
    plt.subplot(2, 1, 2)
    V = (1-0.9)*utils.polytope(mdp.P, mdp.r, 0.9, pis)
    plt.scatter(V[:, 0], V[:, 1])
    plt.show()
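The averaging above leans on the identity from the linked paper
(https://arxiv.org/abs/1902.06865): integrating the exponential discount
gamma**t over gamma in [0, 1) gives the hyperbolic discount 1/(1 + t). A
quick numeric check:

import numpy as np

t = 5.0
gammas = np.linspace(0, 1 - 1e-4, 10000)
print(np.mean(gammas ** t), 1 / (1 + t))  # both approximately 0.1667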
Example #8
        trajs.append(traj)
    return trajs


class NumpyEncoder(json.JSONEncoder):
    def default(self, obj):
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        return json.JSONEncoder.default(self, obj)


if __name__ == '__main__':
    rnd.seed(42)
    n_states, n_actions = 2, 2
    mdp = utils.build_random_mdp(n_states, n_actions, 0.5)
    pis = utils.gen_grid_policies(4)

    use_momentum = False
    fname = 'test1.json'
    with open(fname, 'w') as f:
        for lr in np.logspace(-2, -1, 2):
            traj = value_iteration(mdp, pis, lr)

            data = {
                '{}-{}-{}'.format(value_iteration.__name__, lr, use_momentum):
                [np.array(t).tolist() for t in traj]
            }
            s = json.dumps(data, cls=NumpyEncoder)
            f.write(s + '\n')
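Each line of test1.json is a separate JSON object, so the log can be read
back line by line; a small usage sketch:

import json

with open('test1.json') as f:
    for line in f:
        for key, traj in json.loads(line).items():
            print(key, len(traj))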
Example #9
    plt.scatter(vs[0, 0], vs[0, 1], c=c, s=30, label='{}'.format(n-2))  # start point; label = number of steps
    plt.scatter(vs[1:-1, 0], vs[1:-1, 1], c=range(n-2), cmap='viridis', s=10)
    plt.scatter(vs[-1, 0], vs[-1, 1], c='m', marker='x')  # final point

    # short arrows show the local direction of travel along the trajectory
    for i in range(len(vs)-2):
        dv = 0.1*(vs[i+1, :] - vs[i, :])
        plt.arrow(vs[i, 0], vs[i, 1], dv[0], dv[1], color=c, alpha=0.5, width=0.005)

if __name__ == '__main__':
    # rnd.seed(42)
    print('start')
    n_states, n_actions = 2, 2
    mdp = utils.build_random_mdp(n_states, n_actions, 0.5)

    print('\nBuilding polytope')
    pis = np.stack(utils.gen_grid_policies(41))
    vs = utils.polytope(mdp.P, mdp.r, mdp.discount, pis)

    plt.figure(figsize=(16, 16))
    plt.scatter(vs[:, 0], vs[:, 1], s=10, alpha=0.75)

    colors = ['b', 'g', 'r', 'c', 'm', 'y', 'k']
    for i, c in zip(range(4), colors):
        print('\nRunning experiment {}'.format(i))
        # generate_vi(mdp, c)
        generate_pg(mdp, c)
        # generate_pi(mdp, c)
    plt.legend()
    plt.colorbar()
    plt.show()
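generate_pg, generate_vi and generate_pi are not shown. A hypothetical sketch
of a generate_vi that would feed the plotting tail at the top of this
example, reusing the bellman_optimality_operator call from Example #3 (all
names and shapes here are assumptions, not the repo's implementation):

def generate_vi(mdp, c, n_steps=50):
    trace = [rnd.standard_normal((mdp.S,))]  # random initial value estimate
    for _ in range(n_steps):
        TV = utils.bellman_optimality_operator(
            mdp.P, mdp.r, np.expand_dims(trace[-1], 1), mdp.discount)
        trace.append(np.max(TV, axis=1))  # greedy backup
    vs = np.stack(trace)
    # ...then hand vs to the shared plotting code shown above.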