# Example #1
def test():
    """Smoke test: this 3-state ground MDP with a 2-state aggregation is Markov.

    Also checks that re-wrapping the ground MDP with the phi recovered from
    the abstraction preserves the Markov property.
    """
    # Per-action transition matrices over the 3 ground states.
    trans_a1 = np.array([
        [0, 3 / 4, 1 / 4],
        [1 / 3, 1 / 2, 1 / 6],
        [1, 0, 0],
    ])
    trans_a2 = np.array([
        [0, 3 / 4, 1 / 4],
        [2 / 3, 1 / 4, 1 / 12],
        [0, 3 / 4, 1 / 4],
    ])
    # Both actions share the same reward matrix.
    rewards = np.array([
        [0, 1, 1],
        [1, 0, 0],
        [1, 0, 0],
    ])
    ground = MDP([trans_a1, trans_a2], [rewards, rewards], gamma=0.9)
    # Aggregate ground states 1 and 2 into a single abstract state.
    aggregation = np.array([
        [1, 0],
        [0, 1],
        [0, 1],
    ])
    abstract = AbstractMDP(ground, aggregation)
    # Re-wrap using the phi stored on the abstraction; must still be Markov.
    abstract = AbstractMDP(ground, abstract.phi)
    assert is_markov(abstract)
# Example #2
def generate_markov_mdp_pair(n_states,
                             n_abs_states,
                             n_actions,
                             sparsity=0,
                             gamma=0.9,
                             equal_block_rewards=True,
                             equal_block_transitions=True):
    """Generate a random ground MDP together with a Markov state abstraction.

    Keeps sampling (MDP, phi) pairs — adjusting T and R so the aggregated
    ground states behave interchangeably — until is_markov() accepts the
    resulting abstraction.

    Args:
        n_states: number of ground states.
        n_abs_states: number of abstract states; must be < n_states.
        n_actions: number of actions.
        sparsity: sparsity level forwarded to MDP.generate.
        gamma: discount factor.
        equal_block_rewards: if True, average rewards within each aggregated
            block so all aggregated states yield identical rewards.
        equal_block_transitions: if True, give aggregated states identical
            outgoing transition distributions.

    Returns:
        Tuple (mdp_gnd, mdp_abs): the ground MDP and its Markov abstraction.
    """
    # Sometimes numerical precision causes the abstract mdp to appear non-Markov
    # so we just keep generating until the problem goes away. Usually it's fine.
    while True:
        # generate an MDP and an abstraction function
        mdp_gnd = MDP.generate(n_states=n_states,
                               n_actions=n_actions,
                               sparsity=sparsity,
                               gamma=gamma)
        assert n_abs_states < n_states
        phi = random_phi(n_states, n_abs_states)

        # Boolean masks over ground states: those belonging to an abstract
        # state with more than one member, vs. those mapping one-to-one.
        agg_states = ((phi.sum(axis=0) > 1) @ phi.transpose()).astype(bool)
        other_states = ((phi.sum(axis=0) == 1) @ phi.transpose()).astype(bool)

        # One weight per aggregated ground state (n_states - n_abs_states + 1
        # of them); rows sum to 1.
        random_weights = random_transition_matrix(
            (1, n_states - n_abs_states + 1))

        # adjust T and R to achieve desired properties
        R = np.copy(mdp_gnd.R)
        T = np.copy(mdp_gnd.T)
        for a in range(mdp_gnd.n_actions):
            if equal_block_rewards:
                # Equalize rewards within the agg->agg, other->agg, and
                # agg->other blocks.
                R[a][agg_states[:, None] * agg_states] = np.mean(
                    mdp_gnd.R[a][agg_states[:, None] * agg_states])
                R[a][other_states[:, None] * agg_states] = np.mean(
                    mdp_gnd.R[a][other_states[:, None] * agg_states])
                R[a][agg_states[:, None] * other_states] = np.mean(
                    mdp_gnd.R[a][agg_states[:, None] * other_states])

            # Redistribute each row's probability mass over the aggregated
            # states according to random_weights (row sums are preserved).
            T[a][:, agg_states] = random_weights * np.sum(
                mdp_gnd.T[a][:, agg_states], axis=1, keepdims=True)
            if equal_block_transitions:
                T[a][agg_states] = np.mean(T[a][agg_states, :], axis=0)
                # BUGFIX: the original chained indexing
                #     T[a][agg_states][:, agg_states] = ...
                # assigned into a temporary copy produced by boolean fancy
                # indexing, so it silently had no effect. np.ix_ builds an
                # index that writes back into T[a] itself. Row sums of the
                # block are preserved, so T[a] remains stochastic.
                block = np.ix_(agg_states, agg_states)
                T[a][block] = random_weights * np.sum(
                    T[a][block], axis=1, keepdims=True)
            # T[a][:,other_states] = random_transition_matrix((1,mdp_gnd.n_states-2)) * np.sum(mdp_gnd.T[a][:,other_states],axis=1, keepdims=True)
            assert (is_stochastic(T[a]))
        mdp_gnd.R = R
        mdp_gnd.T = T

        p0 = random_transition_matrix((1, n_states)).squeeze()
        mdp_abs = AbstractMDP(mdp_gnd, phi, p0=p0)

        # Ensure that the abstraction is markov by checking inverse models and ratios
        if is_markov(mdp_abs):
            break
    return mdp_gnd, mdp_abs
# Example #3
def test_non_I_possibly_markov():
    """Smoke test of an abstraction whose inverse model is not the identity.

    Builds a 5-state ground MDP from a fixed mixture of two deterministic
    transition structures, aggregates it to 3 abstract states, and exercises
    matching_I, stationary distributions, and get_N without asserting values.
    """
    # First transition structure (rows: states 0-4).
    struct_a = np.array([
        [0, .5, 0, .5, 0],
        [0, 0, 1, 0, 0],
        [1, 0, 0, 0, 0],
        [0, .5, 0, .5, 0],
        [1, 0, 0, 0, 0],
    ])
    # Second transition structure.
    struct_b = np.array([
        [0, .5, 0, .5, 0],
        [0, .5, 0, .5, 0],
        [1, 0, 0, 0, 0],
        [0, 0, 0, 0, 1],
        [1, 0, 0, 0, 0],
    ])
    # Fixed 20/80 mixture used for both actions.
    blended = .2 * struct_a + .8 * struct_b
    reward = ((struct_a + struct_b) > 0).astype(float)
    ground = MDP([blended, blended], [reward, reward], gamma=0.9)
    # Aggregate states {1, 3} and {2, 4}.
    aggregation = np.array([
        [1, 0, 0],
        [0, 1, 0],
        [0, 0, 1],
        [0, 1, 0],
        [0, 0, 1],
    ])
    init_dist = np.array([1 / 3, 1 / 6, 1 / 6, 1 / 6, 1 / 6])
    abstract = AbstractMDP(ground, aggregation, p0=init_dist)
    matching_I(abstract)

    policy = ground.get_policy(0)
    # Results below are intentionally discarded; this just checks they run.
    ground.stationary_distribution(p0=init_dist, max_steps=200).round(4)
    init_dist_abs = np.array([1 / 3, 1 / 3, 1 / 3])
    abstract.stationary_distribution(p0=init_dist_abs, max_steps=100).round(3)
    ground.get_N(pi=policy)
# Example #4
def generate_non_markov_mdp_pair(n_states,
                                 n_abs_states,
                                 n_actions,
                                 sparsity=0,
                                 gamma=0.9,
                                 fixed_w=False):
    """Generate a random ground MDP and an abstraction of it that is NOT Markov.

    Repeatedly samples (MDP, phi) pairs until is_markov() rejects the
    abstraction. With fixed_w=True, the abstraction uses a uniform weighting
    scheme (UniformAbstractMDP) instead of the default.
    """
    while True:
        ground = MDP.generate(n_states=n_states,
                              n_actions=n_actions,
                              sparsity=sparsity,
                              gamma=gamma)
        assert n_abs_states < n_states
        aggregation = random_phi(n_states, n_abs_states)
        # Choose the wrapper class based on the weighting scheme.
        wrapper = UniformAbstractMDP if fixed_w else AbstractMDP
        abstract = wrapper(ground, aggregation)

        # Keep only abstractions that fail the (sufficient) Markov check
        # on inverse models and ratios.
        if not is_markov(abstract):
            return ground, abstract
# Example #5
def test_non_markov_B():
    """An abstraction that is Markov but which is_markov() conservatively rejects."""
    # Deterministic-ish 6-state chain: 0 branches to {1,2}, which branch on to
    # {3,4,5}, all of which return to 0.
    chain = np.array([
        [0, .5, .5, 0, 0, 0],
        [0, 0, 0, .5, .5, 0],
        [0, 0, 0, 0, .5, .5],
        [1, 0, 0, 0, 0, 0],
        [1, 0, 0, 0, 0, 0],
        [1, 0, 0, 0, 0, 0],
    ])
    reward = (chain > 0).astype(float)
    ground = MDP([chain, chain], [reward, reward], gamma=0.9)
    # Collapse ground states 3, 4, and 5 into one abstract state.
    aggregation = np.array([
        [1, 0, 0, 0],
        [0, 1, 0, 0],
        [0, 0, 1, 0],
        [0, 0, 0, 1],
        [0, 0, 0, 1],
        [0, 0, 0, 1],
    ])
    abstract = AbstractMDP(ground, aggregation)
    # Even though this abstract MDP is Markov, is_markov() will return False,
    # since its conditions (while sufficient) are stricter than necessary.
    assert not is_markov(abstract)
# Note that because B(x|z) depends on the action selected at s0, B is not Markov.
# Similarly, R(z',a,z) depends on the same additional history, so the abstraction
# is not Markov either.
# --- Demonstration: the belief B(x|z) and abstract reward are history-dependent ---
# Two-action, 4-state ground MDP; phi aggregates ground states 1 and 2.
T_list = np.array([
    [[0, 1, 0, 0.0], [0, 0, 0, 1], [0, 0, 0, 1], [0, 0, 0, 1]],
    [[0, 0, 1, 0], [0, 0, 0, 1], [0, 0, 0, 1], [0, 0, 0, 1]],
])
R = np.array([[0, 0.5, 0, 0], [0, 0, 0, 1], [0, 0, 0, 2], [0, 0, 0, 0]])
phi = np.array([
    [1, 0, 0],
    [0, 1, 0],
    [0, 1, 0],
    [0, 0, 1],
])
mdp1 = MDP(T_list, [R, R], gamma=0.9)
# NOTE(review): mdp2 is immediately reassigned below, discarding the p0/t=1
# variant — presumably a leftover from interactive exploration; confirm which
# construction was intended.
mdp2 = AbstractMDP(mdp1, phi, p0=np.array([1, 0, 0, 0]), t=1)
mdp2 = AbstractMDP(mdp1, phi)
is_markov(mdp2)  # notebook-style inspection; result discarded when run as a script

# Enumerate policies: piecewise-constant ground policies and their abstract
# counterparts, then compute value functions for both.
pi_g_list = mdp2.piecewise_constant_policies()
pi_a_list = mdp2.abstract_policies()

v_g_list = [vi(mdp1, pi)[0] for pi in pi_g_list]
v_a_list = [vi(mdp2, pi)[0] for pi in pi_a_list]

# Sorted orderings of the two families of value functions (rounded for display).
order_v_g = np.stack(sort_value_fns(v_g_list)).round(4)
order_v_a = np.stack(sort_value_fns(v_a_list)).round(4)

mdp2.p0  # notebook-style inspection; value discarded
# Mask selecting the abstract state(s) that aggregate more than one ground state.
agg_state = mdp2.phi.sum(axis=0) > 1
# Belief over ground states at t=1 for each ground policy, restricted to the
# aggregated abstract state — this is what differs across policies.
np.stack([mdp2.B(pi, t=1)[agg_state] for pi in pi_g_list])
    # NOTE(review): this array literal is truncated — the opening
    # "<name> = np.array([" and the first row(s) are missing from this
    # excerpt, so the code below is not syntactically complete. Judging by
    # the 6-state usage that follows, it defined the 6x6 matrix (divided by
    # 4) used as T below. Recover the full definition from the original
    # source before running.
    [0, 0, 0, 0, 2, 2],
    [2, 0, 0, 0, 0, 0],
    [3, 0, 0, 0, 0, 0],
    [4, 0, 0, 0, 0, 0]
])/4

# Ground MDP over 6 states built from the matrix T defined (truncated) above.
# NOTE(review): the most recent R visible in this excerpt is 4x4 while T is
# 6x6 — a matching 6x6 R was presumably defined in the truncated portion;
# verify against the original source.
mdp1 = MDP([T, T], [R, R], gamma=0.9)
# phi collapses ground states 3, 4, and 5 into one abstract state.
phi = np.array([
    [1, 0, 0, 0],
    [0, 1, 0, 0],
    [0, 0, 1, 0],
    [0, 0, 0, 1],
    [0, 0, 0, 1],
    [0, 0, 0, 1],
])
mdp2 = AbstractMDP(mdp1, phi)

# Optimal values and policy of the ground MDP.
v_star, q_star, pi_star = vi(mdp1)
v_star, pi_star  # notebook-style inspection; value discarded when run as a script

# Value functions of all piecewise-constant ground policies and of the
# corresponding abstract policies.
pi_g_list = mdp2.piecewise_constant_policies()
pi_a_list = mdp2.abstract_policies()
v_g_list = [vi(mdp1, pi)[0] for pi in pi_g_list]
v_a_list = [vi(mdp2, pi)[0] for pi in pi_a_list]

# Check whether every ground policy has the same value function.
np.allclose(v_g_list, v_g_list[0])  # result discarded

# The ordering of value functions should agree between ground and abstract.
order_v_g = sorted_order(v_g_list)
order_v_a = sorted_order(v_a_list)
assert np.allclose(order_v_a, order_v_g)
# --- Compare ground vs. abstract values across ALL ground policies ---
# NOTE(review): T1 and T2 here refer to module-level matrices defined in a
# portion of the original source not visible in this excerpt.
R = np.array([
    [0, 1, 1],
    [1, 0, 0],
    [1, 0, 0],
])
mdp1 = MDP([T1, T2], [R, R], gamma=0.9)
v_star, q_star, pi_star = vi(mdp1)
v_star, pi_star  # notebook-style inspection; value discarded when run as a script

# Aggregate ground states 1 and 2 into one abstract state.
phi = np.array([
    [1, 0],
    [0, 1],
    [0, 1],
])

mdp2 = AbstractMDP(mdp1, phi)
v_phi_star, q_phi_star, pi_phi_star = vi(mdp2)
v_phi_star  # inspection; value discarded

# for each ground-state policy
# (enumerate all n_actions^n_states deterministic policies, encoding each
# index i as a base-n_actions digit string via gmpy)
n_policies = mdp1.n_actions**mdp1.n_states
for i in range(n_policies):
    pi_string = gmpy.digits(i, mdp1.n_actions).zfill(mdp1.n_states)
    pi = np.asarray(list(pi_string), dtype=int)

    # compare V^pi vs V_phi^pi
    v_pi = vi(mdp1, pi)[0]
    # B(pi) maps ground values to abstract values via the belief weighting.
    belief = mdp2.B(pi)
    v_phi_pi = belief @ v_pi
    print(i, pi, v_pi, v_phi_pi)
    # NOTE(review): truncated array literal — the opening
    # "<name> = np.array([" and possibly a first row are missing from this
    # excerpt. The visible rows match the T2 matrix used elsewhere in this
    # file; recover the full definition from the original source before
    # running.
    [0, 3/4, 1/4],
    [2/3, 1/4, 1/12],
    [0, 3/4, 1/4],
])
# T_alt = np.array([
#     [1/2, 3/8, 1/8],
#     [1, 0, 0],
#     [1, 0, 0],
# ])
R = np.array([
    [0, 1, 1],
    [1, 0, 0],
    [1, 0, 0],
])
# NOTE(review): T0, T1, T2 refer to matrices defined in a portion of the
# original source not visible in this excerpt.
mdp1 = MDP([T1, T2], [R, R], gamma=0.9)
# NOTE(review): this mdp2 (built from [T0, T1]) is only probed by the two
# calls below and is overwritten further down.
mdp2 = AbstractMDP(MDP([T0, T1], [R, R], gamma=0.9), np.array([[1,0],[0,1],[0,1]]))
is_hutter_markov(mdp2)  # notebook-style inspection; result discarded
is_markov(mdp2)  # inspection; result discarded
v_star, q_star, pi_star = vi(mdp1)
v_star, pi_star  # inspection; value discarded

# Aggregate ground states 1 and 2.
phi = np.array([
    [1, 0],
    [0, 1],
    [0, 1],
])

mdp2 = AbstractMDP(mdp1, phi)
# This abstraction is Markov with block dynamics, yet fails the (stricter)
# Hutter-Markov condition.
assert is_markov(mdp2)
assert has_block_dynamics(mdp2)
assert not is_hutter_markov(mdp2)
# Example #10
# NOTE(review): the three inspections below reference v_pi_phi_star,
# v_g_list, and v_a_list from earlier notebook cells; at this point in the
# file they refer to values computed in preceding sections.
v_pi_phi_star
np.asarray(v_g_list).round(3)
np.asarray(v_a_list).round(3)

#%%
# This illustrates an example where V^{\pi_\phi^*} < max_{\pi\in \Pi_\phi} V^{\pi}
# Note the fixed weighting scheme.
# Two actions over 3 ground states, with per-action reward matrices.
T_list = np.array([[[1., 0., 0.], [1., 0., 0.], [0., 0., 1.]],
                   [[0., 1., 0.], [0., 0., 1.], [0., 1., 0.]]])
R_list = np.array([[[1., 0., 0.], [0.5, 0., 0.], [0., 0., 0.5]],
                   [[0., 1., 0.], [0., 0., 1.], [0., 0.1, 0.]]])
# phi aggregates ground states 0 and 2.
phi = np.array([[0, 1], [1, 0], [0, 1]])

mdp1 = MDP(T_list, R_list, gamma=0.9)
mdp2 = AbstractMDP(mdp1, phi)

# All piecewise-constant ground policies and their abstract counterparts.
pi_g_list = mdp2.piecewise_constant_policies()
pi_a_list = mdp2.abstract_policies()

v_g_list = [vi(mdp1, pi)[0] for pi in pi_g_list]
v_a_list = [vi(mdp2, pi)[0] for pi in pi_a_list]

order_v_g = sort_value_fns(v_g_list)
order_v_a = sort_value_fns(v_a_list)

# Ground value of the policy that is optimal for the ABSTRACT MDP — to be
# compared against the best value attainable by any ground policy.
v_phi_pi_phi_star, _, pi_phi_star = vi(mdp2)
v_pi_phi_star = vi(mdp1, mdp2.get_ground_policy(pi_phi_star))[0]

# Look for examples of v_pi_phi_star < v
for v in v_g_list: