Ejemplo n.º 1
0
def test():
    """Check that the ranked per-policy value functions agree with `vi`.

    Builds an MDP with ``MDP.generate(n_states=4, n_actions=2)``, evaluates
    every policy returned by ``mdp.all_policies()`` with ``vi(mdp, pi)``,
    and orders the resulting value functions by the ranks that
    ``sorted_order`` assigns to them. It then asserts that:

    * no value function compares as ``'<'`` against the one that follows
      it in that ordering, and
    * the first value function in the ordering compares as ``'='`` to the
      value function returned by ``vi(mdp)`` (called without a policy).

    Raises:
        AssertionError: if either check fails.
    """
    mdp = MDP.generate(n_states=4, n_actions=2)
    pi_list = mdp.all_policies()
    v_list = [vi(mdp, pi)[0] for pi in pi_list]
    v_ranks = sorted_order(v_list)

    # Sort on the rank alone. Without a key, two equal ranks would make the
    # tuple comparison fall through to comparing the value functions
    # themselves, which raises "truth value of an array is ambiguous" when
    # they are multi-element NumPy arrays (presumably the case here).
    # sorted() is stable, so ties keep their original relative order.
    ranked = sorted(zip(v_ranks, v_list), key=lambda pair: pair[0])
    sorted_v = [v for _, v in ranked]
    for v1, v2 in zip(sorted_v[:-1], sorted_v[1:]):
        assert compare_value_fns(v1, v2) != '<'
    # Debug aid: print every pairwise comparison.
    # for pi1, v1 in zip(pi_list, v_list):
    #     for pi2, v2 in zip(pi_list, v_list):
    #         print(v1.round(4))
    #         print(compare_value_fns(v1, v2), v2.round(4))
    #         print()

    # Only the value function from the policy-free call is needed here.
    v_star, _, _ = vi(mdp)
    assert compare_value_fns(v_star, sorted_v[0]) == '='
Ejemplo n.º 2
0
def main():
    """Run consistency checks on `vi` for a generated block MDP.

    Wraps ``MDP.generate(n_states=5, n_actions=6)`` in a ``BlockMDP`` with
    ``n_obs_per_block=3`` and asserts that:

    * indexing ``q`` by the returned policy reproduces ``v``,
    * evaluating the returned policy with ``vi(mdp, pi)`` reproduces ``v``,
    * the policy computed on ``mdp.base_mdp``, repeated once per
      observation in each block via ``np.kron``, matches the policy
      computed on the block MDP.

    Prints ``'All tests passed.'`` when every assertion holds.

    Raises:
        AssertionError: if any of the checks fails.
    """
    base = MDP.generate(n_states=5, n_actions=6)
    mdp = BlockMDP(base, n_obs_per_block=3)
    v, q, pi = vi(mdp)

    # Rebuild the value function by picking, for each state, the q entry
    # of the action that the policy selects there.
    v_alt = np.zeros_like(v)
    for state in range(mdp.n_states):
        action = pi[state]
        v_alt[state] = q[action][state]
    v_alt = v_alt.squeeze()
    assert np.allclose(v_alt, v)

    # Evaluating the returned policy should give back the same values.
    v_pi = vi(mdp, pi)[0]
    assert np.allclose(v_pi, v)

    # Solve the underlying base MDP and expand its policy so that each
    # entry is repeated (n_states of block MDP // n_states of base MDP)
    # times before comparing with the block-MDP policy.
    m_phi = mdp.base_mdp
    _, _, pi_phi = vi(m_phi)
    repeats = mdp.n_states // m_phi.n_states
    pi_phi_grounded = np.kron(pi_phi, np.ones((1, repeats)))
    assert np.allclose(pi_phi_grounded, pi)
    print('All tests passed.')
# Scratch script: build an MDP and an abstraction of it, then evaluate the
# policies exposed by the abstraction on both models.

# 4x4 matrix passed twice ([R, R]) as the second argument of MDP below --
# presumably one reward matrix per action; confirm against MDP's signature.
R = np.array([[0, 0.5, 0, 0], [0, 0, 0, 1], [0, 0, 0, 2], [0, 0, 0, 0]])
# 4x3 one-hot matrix: rows 1 and 2 (0-indexed) both select column 1, while
# rows 0 and 3 each have a column of their own. Presumably maps ground
# states (rows) to abstract states (columns) -- TODO confirm.
phi = np.array([
    [1, 0, 0],
    [0, 1, 0],
    [0, 1, 0],
    [0, 0, 1],
])
# T_list is not defined in the visible portion of this script.
mdp1 = MDP(T_list, [R, R], gamma=0.9)
# NOTE(review): this first AbstractMDP (explicit p0 and t=1) is rebound on
# the very next line, so only the default-argument instance is used below.
mdp2 = AbstractMDP(mdp1, phi, p0=np.array([1, 0, 0, 0]), t=1)
mdp2 = AbstractMDP(mdp1, phi)
# Bare expression: the result is discarded unless run interactively.
is_markov(mdp2)

# Two policy collections obtained from the abstract model.
pi_g_list = mdp2.piecewise_constant_policies()
pi_a_list = mdp2.abstract_policies()

# First element of vi(...) for every policy: pi_g_list is evaluated on mdp1,
# pi_a_list on mdp2.
v_g_list = [vi(mdp1, pi)[0] for pi in pi_g_list]
v_a_list = [vi(mdp2, pi)[0] for pi in pi_a_list]

# Stack the output of sort_value_fns into one array, rounded to 4 decimals.
order_v_g = np.stack(sort_value_fns(v_g_list)).round(4)
order_v_a = np.stack(sort_value_fns(v_a_list)).round(4)

# Bare expression (interactive inspection only).
mdp2.p0
# Boolean mask over phi's columns: True where the column sum exceeds 1.
agg_state = mdp2.phi.sum(axis=0) > 1
# Stack mdp2.B(pi, t=1) restricted to the masked entries for every policy in
# pi_g_list; the stacked result is not stored.
np.stack([mdp2.B(pi, t=1)[agg_state] for pi in pi_g_list])

# vi without a policy returns three values; the third is used as a policy.
v_phi_pi_phi_star, _, pi_phi_star = vi(mdp2)
# Convert that policy with get_ground_policy and evaluate it on mdp1.
v_pi_phi_star = vi(mdp1, mdp2.get_ground_policy(pi_phi_star))[0]

# Look for examples of v_pi_phi_star < v
for v in v_g_list:
    if compare_value_fns(v_pi_phi_star, v) == "<":
    [3, 0, 0, 0, 0, 0],
    [4, 0, 0, 0, 0, 0]
])/4

# Scratch script: build an MDP and an abstraction of it, then compare the
# orderings of the value functions obtained on each.

# The same T and the same R are supplied twice. Neither definition is fully
# visible in this portion of the script.
mdp1 = MDP([T, T], [R, R], gamma=0.9)
# 6x4 one-hot matrix: rows 3, 4 and 5 (0-indexed) all select column 3, while
# rows 0-2 each have a column of their own. Presumably maps ground states
# (rows) to abstract states (columns) -- TODO confirm.
phi = np.array([
    [1, 0, 0, 0],
    [0, 1, 0, 0],
    [0, 0, 1, 0],
    [0, 0, 0, 1],
    [0, 0, 0, 1],
    [0, 0, 0, 1],
])
mdp2 = AbstractMDP(mdp1, phi)

v_star, q_star, pi_star = vi(mdp1)
# Bare tuple expression: displayed only when run interactively.
v_star, pi_star

# Two policy collections obtained from the abstract model, each evaluated
# with vi (first returned element kept): pi_g_list on mdp1, pi_a_list on mdp2.
pi_g_list = mdp2.piecewise_constant_policies()
pi_a_list = mdp2.abstract_policies()
v_g_list = [vi(mdp1, pi)[0] for pi in pi_g_list]
v_a_list = [vi(mdp2, pi)[0] for pi in pi_a_list]

# NOTE(review): result is discarded; this only shows interactively whether
# every entry of v_g_list is close to the first one.
np.allclose(v_g_list, v_g_list[0])

# The orderings reported by sorted_order for the two lists must match.
order_v_g = sorted_order(v_g_list)
order_v_a = sorted_order(v_a_list)
assert np.allclose(order_v_a, order_v_g)

graph_value_fns(v_a_list)
graph_value_fns(v_g_list)
Ejemplo n.º 5
0
    [0, 3/4, 1/4],
    [2/3, 1/4, 1/12],
    [0, 3/4, 1/4],
])
# T_alt = np.array([
#     [1/2, 3/8, 1/8],
#     [1, 0, 0],
#     [1, 0, 0],
# ])
# 3x3 matrix passed twice ([R, R]) as the second argument of MDP below --
# presumably one reward matrix per action; confirm against MDP's signature.
R = np.array([
    [0, 1, 1],
    [1, 0, 0],
    [1, 0, 0],
])
# T1 and T2 are not (fully) defined in the visible portion of this script.
mdp1 = MDP([T1, T2], [R, R], gamma=0.9)
v_star, q_star, pi_star = vi(mdp1)
# Bare tuple expression: displayed only when run interactively.
v_star, pi_star

# 3x2 one-hot matrix: rows 1 and 2 (0-indexed) both select column 1.
# Presumably maps ground states (rows) to abstract states (columns) --
# TODO confirm.
phi = np.array([
    [1, 0],
    [0, 1],
    [0, 1],
])

mdp2 = AbstractMDP(mdp1, phi)
v_phi_star, q_phi_star, pi_phi_star = vi(mdp2)
# Bare expression (interactive inspection only).
v_phi_star

# for each ground-state policy
# (n_actions ** n_states: one action choice per state of mdp1)
n_policies = mdp1.n_actions**mdp1.n_states
for i in range(n_policies):
    [2/3, 1/4, 1/12],
    [0, 3/4, 1/4],
])
# T_alt = np.array([
#     [1/2, 3/8, 1/8],
#     [1, 0, 0],
#     [1, 0, 0],
# ])
# 3x3 matrix passed twice ([R, R]) as the second argument of MDP below --
# presumably one reward matrix per action; confirm against MDP's signature.
R = np.array([
    [0, 1, 1],
    [1, 0, 0],
    [1, 0, 0],
])
# T0, T1 and T2 are not (fully) defined in the visible portion of this script.
mdp1 = MDP([T1, T2], [R, R], gamma=0.9)
# NOTE(review): this abstraction is built from a *different* MDP ([T0, T1])
# than mdp1 ([T1, T2]), with the same 3x2 matrix that is later bound to phi.
# It is rebound further down, after the two bare checks that follow.
mdp2 = AbstractMDP(MDP([T0, T1], [R, R], gamma=0.9), np.array([[1,0],[0,1],[0,1]]))
# Bare expressions: results are discarded unless run interactively.
is_hutter_markov(mdp2)
is_markov(mdp2)
v_star, q_star, pi_star = vi(mdp1)
# Bare tuple expression (interactive inspection only).
v_star, pi_star

# 3x2 one-hot matrix: rows 1 and 2 (0-indexed) both select column 1.
# Presumably maps ground states (rows) to abstract states (columns) --
# TODO confirm.
phi = np.array([
    [1, 0],
    [0, 1],
    [0, 1],
])

# Abstraction of mdp1 itself; the three checks below apply to this instance.
mdp2 = AbstractMDP(mdp1, phi)
assert is_markov(mdp2)
assert has_block_dynamics(mdp2)
assert not is_hutter_markov(mdp2)
Ejemplo n.º 7
0
    # pi_abs = pi_a_list[i]
    # mdp2.B(pi_gnd)
    # phi = mdp2.phi
    # N_gnd = mdp1.get_N(pi_gnd)
    # phi.transpose() @ phi
    # Px = mdp1.stationary_distribution(pi_gnd)
    # N_abs = mdp2.get_N(pi_abs)
    # Pz = mdp2.stationary_distribution(pi_abs)
    #
    # ratio_abs = np.divide(N_abs, Pz[None,:], out=np.zeros_like(N_abs), where=Pz!=0)
    # ratio_gnd = np.divide(N_gnd, Px[None,:], out=np.zeros_like(N_gnd), where=Px!=0)
    # mdp2.B(pi_gnd) @ ratio_gnd
    # ratio_abs @ phi.transpose()
    # is_markov(mdp2)

    v_g_list = [vi(mdp1, pi)[0] for pi in pi_g_list]
    v_a_list = [vi(mdp2, pi)[0] for pi in pi_a_list]

    order_v_g = sort_value_fns(v_g_list)
    order_v_a = sort_value_fns(v_a_list)

    agg_state = mdp2.phi.sum(axis=0) > 1
    [mdp2.B(pi, t=0)[agg_state][0] for pi in pi_g_list]
    [mdp2.B(pi, t=1)[agg_state][0] for pi in pi_g_list]
    [mdp2.B(pi, t=3)[agg_state][0] for pi in pi_g_list]

    v_star, _, pi_star = vi(mdp1)
    v_phi_pi_phi_star, _, pi_phi_star = vi(mdp2)
    pi_phi_star_gnd = mdp2.get_ground_policy(pi_phi_star)
    v_pi_phi_star = vi(mdp1, pi_phi_star_gnd)[0]