Code example #1
def test_policy_iteration():
    environment = EasyGridWorld()
    GAMMA = 0.9
    EPS = 1e-3
    policy, value_function = policy_iteration(environment, GAMMA, eps=EPS)

    # Learned values should match the known optimum within tolerance.
    torch.testing.assert_allclose(value_function.table,
                                  torch.tensor([OPTIMAL_VALUE]),
                                  atol=0.05,
                                  rtol=EPS)
    # Greedy action per state implied by the tabular policy.
    pred_p = policy.table.argmax(dim=0)
    assert_policy_equality(environment, GAMMA, value_function, OPTIMAL_POLICY,
                           pred_p)

    # Repeat the check on a grid world that has a terminal state.
    environment = EasyGridWorld(terminal_states=[22])
    GAMMA = 0.9
    EPS = 1e-3
    policy, value_function = policy_iteration(environment, GAMMA, eps=EPS)

    torch.testing.assert_allclose(
        value_function.table,
        torch.tensor([OPTIMAL_VALUE_WITH_TERMINAL]),
        atol=0.05,
        rtol=EPS,
    )

    pred_p = policy.table.argmax(dim=0)
    assert_policy_equality(environment, GAMMA, value_function,
                           OPTIMAL_POLICY_WITH_TERMINAL, pred_p)
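
assert_policy_equality is defined in the test module and is not shown here; since it receives the environment, the discount, and the value function, a plausible reading (an assumption, not the helper's actual code) is that two policies count as equal when they achieve the same value in every state, which tolerates ties between equally good actions. A minimal sketch of evaluating a deterministic tabular policy for such a comparison, written against the kernel/reward arrays used in the later examples:

import torch


def evaluate_deterministic_policy(kernel, reward, policy_actions, gamma, eps=1e-3):
    """Iterative policy evaluation for a deterministic policy on a tabular MDP.

    kernel: [S, A, S] transition probabilities, reward: [S, A] expected rewards,
    policy_actions: [S] action index chosen in each state.
    """
    num_states = kernel.shape[0]
    states = torch.arange(num_states)
    p_pi = kernel[states, policy_actions]        # [S, S] transitions under the policy
    r_pi = reward[states, policy_actions]        # [S] rewards under the policy
    value = torch.zeros(num_states)
    while True:
        new_value = r_pi + gamma * p_pi @ value  # Bellman expectation backup
        if (new_value - value).abs().max() < eps:
            return new_value
        value = new_value

Two policies would then be accepted as equal when torch.allclose holds between their evaluated value tables.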
Code example #2
import pytest

from rllib.algorithms.tabular_planning import (iterative_policy_evaluation,
                                               policy_iteration,
                                               value_iteration)
from rllib.environment import GymEnvironment


def test_not_implemented():
    environment = GymEnvironment("CartPole-v0")
    with pytest.raises(AttributeError):
        iterative_policy_evaluation(0, environment, 0.9)  # type: ignore
    with pytest.raises(AttributeError):
        value_iteration(environment, 0.9)  # type: ignore
    with pytest.raises(AttributeError):
        policy_iteration(environment, 0.9)  # type: ignore
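
The test above checks that the tabular planners refuse a Gym environment such as CartPole-v0, presumably because they need an explicit transition model that the wrapper does not expose; CartPole-v0 only offers sampled transitions through step(). For illustration, a minimal sketch of what value_iteration computes when a model is available, written against explicit kernel/reward arrays (an illustrative version, not rllib's implementation):

import torch


def value_iteration_tabular(kernel, reward, gamma, eps=1e-3):
    """Bellman optimality backups on an explicit tabular model.

    kernel: [S, A, S] transition probabilities, reward: [S, A] expected rewards.
    """
    value = torch.zeros(reward.shape[0])
    while True:
        q = reward + gamma * kernel @ value      # [S, A] one-step lookahead values
        new_value = q.max(dim=-1).values
        if (new_value - value).abs().max() < eps:
            return q.argmax(dim=-1), new_value   # greedy policy and its value estimate
        value = new_value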
Code example #3
            "reward": -3
        })
        transitions[(1, 1)].append({
            "next_state": 1,
            "probability": 0.5,
            "reward": -3
        })

        return transitions


if __name__ == "__main__":
    from itertools import product

    import torch
    from rllib.algorithms.tabular_planning import policy_iteration
    from rllib.environment.utilities import transitions2kernelreward

    gamma = 0.99
    env = TwoState(reward_0=1)
    kernel, reward = transitions2kernelreward(env.transitions, env.num_states,
                                              env.num_actions)
    policy, value = policy_iteration(env, gamma)
    # Report the policy as action probabilities and the values rescaled by
    # (1 - gamma) to a per-step (average-reward) magnitude.
    print(
        torch.distributions.Categorical(logits=policy.table.detach()).probs,
        value.table.detach() * (1 - gamma),
    )
    # Dump the dense model: transition distribution and expected reward for
    # each (state, action) pair.
    for state, action in product(range(env.num_states),
                                 range(env.num_actions)):
        print(state, action, kernel[state, action], reward[state, action])
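
The __main__ block relies on transitions2kernelreward to pack the per-(state, action) outcome lists into dense arrays: a transition kernel indexed by [state, action, next_state] and a table of expected rewards indexed by [state, action]. A minimal sketch of that conversion under those assumptions (an illustrative reimplementation, not the library's code):

import torch


def transitions_to_kernel_reward(transitions, num_states, num_actions):
    """Build kernel [S, A, S] and expected reward [S, A] from a transitions dict
    mapping (state, action) to a list of {"next_state", "probability", "reward"}."""
    kernel = torch.zeros(num_states, num_actions, num_states)
    reward = torch.zeros(num_states, num_actions)
    for (state, action), outcomes in transitions.items():
        for outcome in outcomes:
            p = outcome["probability"]
            kernel[state, action, outcome["next_state"]] += p
            reward[state, action] += p * outcome["reward"]  # expectation over outcomes
    return kernel, reward

With this in place, kernel[state, action] is the distribution over next states and reward[state, action] the expected immediate reward, which is exactly what the printing loop above inspects.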
Code example #4
print("Iterative Policy Evaluation:")
value_function = iterative_policy_evaluation(policy,
                                             environment,
                                             GAMMA,
                                             eps=EPS)
print(value_function.table)
print()

print("Linear System Policy Evaluation:")
value_function = linear_system_policy_evaluation(policy, environment, GAMMA)
print(value_function.table)
print()

print("Policy Iteration:")
policy, value_function = policy_iteration(environment, GAMMA, eps=EPS)
print(policy.table.argmax(dim=0))
print(value_function.table)
print()

print("Value Iteration")
policy, value_function = value_iteration(environment, GAMMA, eps=EPS)
print(policy.table.argmax(dim=0))
print(value_function.table)
print()

print("Iterative Policy Evaluation from Value Iteration:")
value_function = iterative_policy_evaluation(policy,
                                             environment,
                                             GAMMA,
                                             eps=EPS)
print(value_function.table)
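
linear_system_policy_evaluation, used above, by its name solves the Bellman expectation equation V = r_pi + gamma * P_pi V in closed form, i.e. V = (I - gamma * P_pi)^(-1) r_pi, instead of sweeping until convergence. A minimal sketch of that closed-form solve for a deterministic tabular policy (an assumption about the method, not the library's code):

import torch


def linear_policy_evaluation(kernel, reward, policy_actions, gamma):
    """Closed-form policy evaluation: solve (I - gamma * P_pi) V = r_pi.

    kernel: [S, A, S], reward: [S, A], policy_actions: [S] action per state.
    """
    num_states = kernel.shape[0]
    states = torch.arange(num_states)
    p_pi = kernel[states, policy_actions]    # [S, S] transition matrix under the policy
    r_pi = reward[states, policy_actions]    # [S] reward vector under the policy
    return torch.linalg.solve(torch.eye(num_states) - gamma * p_pi, r_pi)

The solve is exact for tabular problems, whereas iterative_policy_evaluation stops once successive sweeps change by less than eps.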
Code example #5
            transitions[(state, action)].append({
                "next_state": next_state,
                "probability": correct_prob,
                "reward": 0
            })
            transitions[(state, action)].append({
                "next_state": state,
                "probability": 1 - correct_prob,
                "reward": 0,
            })

        return transitions


if __name__ == "__main__":
    from itertools import product

    from rllib.algorithms.tabular_planning import policy_iteration
    from rllib.environment.utilities import transitions2kernelreward

    env = LeftChain()
    kernel, reward = transitions2kernelreward(env.transitions, env.num_states,
                                              env.num_actions)
    print(kernel)
    print(reward)
    # Plan with policy iteration and print the tabular policy and values.
    policy, value = policy_iteration(env, 0.99)
    print(policy.table, value.table)
    for state, action in product(range(env.num_states),
                                 range(env.num_actions)):
        print(state, action, kernel[state, action], reward[state, action])
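
Finally, for comparison with the policy_iteration calls in these examples, a minimal sketch of tabular policy iteration on the same kind of kernel/reward arrays (an illustrative version, not rllib's implementation): it alternates exact policy evaluation with greedy improvement until the policy stops changing.

import torch


def policy_iteration_tabular(kernel, reward, gamma, max_iter=1000):
    """Alternate exact policy evaluation and greedy improvement on a tabular MDP."""
    num_states = reward.shape[0]
    states = torch.arange(num_states)
    policy = torch.zeros(num_states, dtype=torch.long)   # start from action 0 everywhere
    value = torch.zeros(num_states)
    for _ in range(max_iter):
        # Evaluation: solve V = r_pi + gamma * P_pi V for the current policy.
        p_pi = kernel[states, policy]
        r_pi = reward[states, policy]
        value = torch.linalg.solve(torch.eye(num_states) - gamma * p_pi, r_pi)
        # Improvement: act greedily with respect to the one-step lookahead Q-values.
        q = reward + gamma * kernel @ value              # [S, A]
        new_policy = q.argmax(dim=-1)
        if torch.equal(new_policy, policy):
            break
        policy = new_policy
    return policy, value

For a finite discounted MDP each improvement step either strictly improves the policy or leaves it unchanged, so the loop terminates after finitely many iterations at an optimal policy.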