def test_policy_iteration():
    environment = EasyGridWorld()
    GAMMA = 0.9
    EPS = 1e-3
    policy, value_function = policy_iteration(environment, GAMMA, eps=EPS)
    torch.testing.assert_allclose(
        value_function.table, torch.tensor([OPTIMAL_VALUE]), atol=0.05, rtol=EPS
    )
    pred_p = policy.table.argmax(dim=0)
    assert_policy_equality(environment, GAMMA, value_function, OPTIMAL_POLICY, pred_p)

    environment = EasyGridWorld(terminal_states=[22])
    GAMMA = 0.9
    EPS = 1e-3
    policy, value_function = policy_iteration(environment, GAMMA, eps=EPS)
    torch.testing.assert_allclose(
        value_function.table,
        torch.tensor([OPTIMAL_VALUE_WITH_TERMINAL]),
        atol=0.05,
        rtol=EPS,
    )
    pred_p = policy.table.argmax(dim=0)
    assert_policy_equality(
        environment, GAMMA, value_function, OPTIMAL_POLICY_WITH_TERMINAL, pred_p
    )
def test_not_implemented():
    # GymEnvironment does not expose the tabular model these planners need,
    # so each call should fail with an AttributeError.
    environment = GymEnvironment("CartPole-v0")
    with pytest.raises(AttributeError):
        iterative_policy_evaluation(0, environment, 0.9)  # type: ignore
    with pytest.raises(AttributeError):
        value_iteration(environment, 0.9)  # type: ignore
    with pytest.raises(AttributeError):
        policy_iteration(environment, 0.9)  # type: ignore
"reward": -3 }) transitions[(1, 1)].append({ "next_state": 1, "probability": 0.5, "reward": -3 }) return transitions if __name__ == "__main__": from itertools import product import torch from rllib.algorithms.tabular_planning import policy_iteration from rllib.environment.utilities import transitions2kernelreward gamma = 0.99 env = TwoState(reward_0=1) kernel, reward = transitions2kernelreward(env.transitions, env.num_states, env.num_actions) policy, value = policy_iteration(env, gamma) print( torch.distributions.Categorical(logits=policy.table.detach()).probs, value.table.detach() * (1 - gamma), ) for state, action in product(range(env.num_states), range(env.num_actions)): print(state, action, kernel[state, action], reward[state, action])
print("Iterative Policy Evaluation:") value_function = iterative_policy_evaluation(policy, environment, GAMMA, eps=EPS) print(value_function.table) print() print("Linear System Policy Evaluation:") value_function = linear_system_policy_evaluation(policy, environment, GAMMA) print(value_function.table) print() print("Policy Iteration:") policy, value_function = policy_iteration(environment, GAMMA, eps=EPS) print(policy.table.argmax(dim=0)) print(value_function.table) print() print("Value Iteration") policy, value_function = value_iteration(environment, GAMMA, eps=EPS) print(policy.table.argmax(dim=0)) print(value_function.table) print() print("Iterative Policy Evaluation from Value Iteration:") value_function = iterative_policy_evaluation(policy, environment, GAMMA, eps=EPS,
            transitions[(state, action)].append({
                "next_state": next_state,
                "probability": correct_prob,
                "reward": 0
            })
            transitions[(state, action)].append({
                "next_state": state,
                "probability": 1 - correct_prob,
                "reward": 0,
            })
        return transitions


if __name__ == "__main__":
    from itertools import product

    from rllib.algorithms.tabular_planning import policy_iteration
    from rllib.environment.utilities import transitions2kernelreward

    env = LeftChain()
    kernel, reward = transitions2kernelreward(
        env.transitions, env.num_states, env.num_actions
    )
    print(kernel)
    print(reward)
    policy, value = policy_iteration(env, 0.99)
    print(policy.table, value.table)
    for state, action in product(range(env.num_states), range(env.num_actions)):
        print(state, action, kernel[state, action], reward[state, action])