def test_terminal_state():
    with dsl.new() as new_mdp:
        dsl.terminal_state()
        dsl.action()

    solver = lp.LinearProgramming(new_mdp)
    assert np.isclose(solver.compute_q_table(), [0])


def test_coverage():
    with dsl.new() as new_mdp:  # type: mdp.MDPSpec
        start = dsl.state()
        finish = dsl.terminal_state()
        action = dsl.action()

        start & action > finish | dsl.reward(1)

    assert new_mdp.num_states == 2
    assert new_mdp.num_actions == 1

    new_mdp.to_graph()

    env: mdp.MDPEnv = new_mdp.to_env()
    state = env.reset()
    assert state == 0

    state, reward, is_done, info = env.step(0)
    assert state == 1
    assert reward == 1
    assert is_done

    env.render(mode='rgb_array')
    env.render(mode='png')


def tiny_counterexample_env():
    with dsl.new() as mdp:
        s0 = dsl.state()
        s1 = dsl.state()
        s2 = dsl.state()
        s3 = dsl.terminal_state()

        a0 = dsl.action()
        a1 = dsl.action()

        # Transitions
        s0 & a0 > s1
        s0 & a1 > s2
        s1 & a0 > s3
        s1 & a1 > s3
        s2 & a0 > s3
        s2 & a1 > s3

        # Rewards
        s0 & a0 > dsl.reward(0)
        s0 & a1 > dsl.reward(0)
        s1 & a0 > dsl.reward(2)
        s1 & a1 > dsl.reward(0)
        s2 & a0 > dsl.reward(0)
        s2 & a1 > dsl.reward(1)

    return mdp.validate()
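

# Hedged sketch: checking the fixture above the same way test_terminal_state
# checks its spec. The (state, action) layout of compute_q_table() and a
# default discount of 1 are assumptions here, not guarantees of the API.
def _sketch_tiny_counterexample_solution():
    solver = lp.LinearProgramming(tiny_counterexample_env())
    q_table = np.asarray(solver.compute_q_table())
    # Best achievable return is 2: take a0 to s1, then a0 for reward 2.
    assert np.isclose(q_table.max(), 2.0)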


def _multi_round_nmdp():
    with dsl.new() as mdp:
        start = dsl.state()
        end = dsl.terminal_state()

        start & dsl.action() > dsl.reward(5) | start | end * 2
        start & dsl.action() > dsl.reward(3) | start * 2 | end

        dsl.discount(0.5)

    return mdp.validate()
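

# Hedged sketch: a structural sanity check for the stochastic fixture above,
# mirroring the num_states/num_actions assertions in test_coverage. It assumes
# validate() returns the spec itself, which is how these fixtures are consumed.
def _sketch_multi_round_nmdp_shape():
    spec = _multi_round_nmdp()
    assert spec.num_states == 2   # start plus the terminal state
    assert spec.num_actions == 2  # two anonymous dsl.action() calls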


def _one_round_dmdp():
    with dsl.new() as mdp:
        start = dsl.state()
        end = dsl.terminal_state()

        action_0 = dsl.action()
        action_1 = dsl.action()

        start & (action_0 | action_1) > end
        start & action_1 > dsl.reward(1.)

    return mdp.validate()
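

# Hedged sketch: rolling out the one-round MDP through the Gym-style env API
# used in test_coverage. The action index (action_1 == 1) is assumed to follow
# creation order, as the state indices do in test_coverage.
def _sketch_one_round_dmdp_rollout():
    env = _one_round_dmdp().to_env()
    state = env.reset()
    assert state == 0

    # action_1 is the only rewarding action; it ends the episode with reward 1.
    state, reward, is_done, info = env.step(1)
    assert reward == 1.
    assert is_done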


def long_counterexample_env(middle_steps=1):
    with dsl.new() as mdp:
        s0 = dsl.state()
        a0 = dsl.action()
        a1 = dsl.action()

        middle_l = [s0, dsl.state()]
        middle_r = [s0, dsl.state()]

        s0 & a0 > middle_l[-1]
        s0 & a1 > middle_r[-1]
        s0 & a0 > dsl.reward(0)
        s0 & a1 > dsl.reward(0)

        # Extend both chains; a0 advances along the left chain and retreats
        # on the right chain, a1 does the opposite.
        for step in range(middle_steps - 1):
            middle_l.append(dsl.state())
            middle_l[-2] & a0 > middle_l[-1]
            middle_l[-2] & a1 > middle_l[-3]
            middle_l[-2] & a0 > dsl.reward(0)
            middle_l[-2] & a1 > dsl.reward(0)

            middle_r.append(dsl.state())
            middle_r[-2] & a0 > middle_r[-3]
            middle_r[-2] & a1 > middle_r[-1]
            middle_r[-2] & a0 > dsl.reward(0)
            middle_r[-2] & a1 > dsl.reward(0)

        s3 = dsl.terminal_state()

        # Transitions into the terminal state
        middle_l[-1] & a0 > s3
        middle_l[-1] & a1 > s3
        middle_r[-1] & a0 > s3
        middle_r[-1] & a1 > s3

        # Terminal rewards
        middle_l[-1] & a0 > dsl.reward(2)
        middle_l[-1] & a1 > dsl.reward(0)
        middle_r[-1] & a0 > dsl.reward(0)
        middle_r[-1] & a1 > dsl.reward(1)

    return mdp.validate()
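

# Hedged sketch: the state count of long_counterexample_env grows linearly in
# middle_steps, since each loop iteration adds one state per chain. The closed
# form below (2 * middle_steps + 2) follows from the construction, assuming
# validate() returns the spec.
def _sketch_long_counterexample_size():
    for middle_steps in (1, 2, 5):
        spec = long_counterexample_env(middle_steps)
        assert spec.num_states == 2 * middle_steps + 2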


def _two_round_dmdp():
    with dsl.new() as mdp:
        start = dsl.state()
        better = dsl.state()
        worse = dsl.state()
        end = dsl.terminal_state()

        action_0 = dsl.action()
        action_1 = dsl.action()

        start & action_0 > better
        better & action_1 > dsl.reward(3)

        start & action_1 > worse
        worse & action_0 > dsl.reward(1)
        worse & action_1 > dsl.reward(2)

        (better | worse) & (action_0 | action_1) > end

    return mdp.validate()
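

# Hedged sketch: under the assumed default discount of 1, the best plan in
# _two_round_dmdp is action_0 to `better`, then action_1 for reward 3, so the
# largest Q-value should be 3. The q-table layout is an assumption, as above.
def _sketch_two_round_dmdp_solution():
    solver = lp.LinearProgramming(_two_round_dmdp())
    q_table = np.asarray(solver.compute_q_table())
    assert np.isclose(q_table.max(), 3.0)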


def _two_round_nmdp():
    with dsl.new() as mdp:
        start = dsl.state()
        a = dsl.state()
        b = dsl.state()
        end = dsl.terminal_state()

        action_0 = dsl.action()
        action_1 = dsl.action()

        start & action_0 > a
        a & action_0 > dsl.reward(-1) | dsl.reward(1)
        a & action_1 > dsl.reward(0) * 2 | dsl.reward(9)

        start & action_1 > b
        b & action_0 > dsl.reward(0) | dsl.reward(2)
        b & action_1 > dsl.reward(2) | dsl.reward(3)

        (a | b) & (action_0 | action_1) > end

    return mdp.validate()
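

# Hedged sketch: this assumes `|` outcomes are weighted uniformly unless
# scaled with `*`, so dsl.reward(0) * 2 | dsl.reward(9) has expectation
# (0 * 2 + 9) / 3 = 3. Under that reading and a discount of 1, the best plan
# is action_0 to `a`, then action_1, for an expected return of 3.
def _sketch_two_round_nmdp_solution():
    solver = lp.LinearProgramming(_two_round_nmdp())
    q_table = np.asarray(solver.compute_q_table())
    assert np.isclose(q_table.max(), 3.0)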


def new_counterexample_env():
    with dsl.new() as mdp:
        s1 = dsl.state()
        s2 = dsl.state()
        s3 = dsl.state()
        s4 = dsl.state()
        s5 = dsl.terminal_state()

        l = dsl.action()
        r = dsl.action()

        # Transitions
        s1 & l > s2
        s1 & r > s4
        s2 & l > s3
        s2 & r > s1
        s3 & l > s5
        s3 & r > s2
        s4 & l > s1
        s4 & r > s5

        # Every step costs -1
        s1 & l > dsl.reward(-1)
        s1 & r > dsl.reward(-1)
        s2 & l > dsl.reward(-1)
        s2 & r > dsl.reward(-1)
        s3 & l > dsl.reward(-1)
        s3 & r > dsl.reward(-1)
        s4 & l > dsl.reward(-1)
        s4 & r > dsl.reward(-1)

    return mdp.validate()
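

# Hedged sketch: with a cost of -1 per step and an assumed discount of 1, the
# shortest route out is s1 -> s4 -> s5 (r, r) for a return of -2; going left
# needs three steps for -3. So the optimal value at s1 should be -2.
def _sketch_new_counterexample_solution():
    solver = lp.LinearProgramming(new_counterexample_env())
    q_table = np.asarray(solver.compute_q_table())
    # s1 is created first, so it is assumed to be row 0 of the q-table.
    assert np.isclose(q_table[0].max(), -2.0)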