Example 1
    def tabular_representation(self):
        """Get a tabular representation of the policy."""
        policy = TabularPolicy(self.num_states, self.num_actions)
        for state in range(self.num_states):
            state = torch.tensor(state)
            policy.set_value(state, self(state).clone())
        return policy
Example 2
def value_iteration(model, gamma, eps=1e-6, max_iter=1000, value_function=None):
    """Implement of Value Iteration algorithm.

    Parameters
    ----------
    model: MDP model with discrete states and actions.
    gamma: discount factor.
    eps: convergence threshold on the value-function update.
    max_iter: maximum number of iterations.
    value_function: initial estimate of value function, optional.

    Returns
    -------
    policy: greedy policy w.r.t. the estimated value function.
    value_function: estimate of the optimal value function.

    References
    ----------
    Sutton, R. S., & Barto, A. G. (2018). Reinforcement learning: An introduction.
    MIT press.
    Chapter 4.4

    """
    if model.num_actions is None or model.num_states is None:
        raise NotImplementedError("Actions and States must be discrete and countable.")

    if value_function is None:
        value_function = init_value_function(model.num_states, model.terminal_states)
    policy = TabularPolicy(num_states=model.num_states, num_actions=model.num_actions)

    for _ in range(max_iter):
        error = 0
        for state in range(model.num_states):
            # One-step lookahead: estimate the value of each action in this state.
            value_ = torch.zeros(model.num_actions)
            for action in range(model.num_actions):
                value_estimate = 0
                for transition in model.transitions[(state, action)]:
                    next_state = torch.tensor(transition["next_state"]).long()
                    value_estimate += transition["probability"] * (
                        transition["reward"] + gamma * value_function(next_state)
                    )
                value_[action] = value_estimate
            state = torch.tensor(state).long()
            value = value_function(state)
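            # Bellman optimality backup: keep the best action value and the greedy action.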
            value_, action = torch.max(value_, 0)

            error = max(error, torch.abs(value_ - value.item()))
            value_function.set_value(state, value_)
            policy.set_value(state, action)

        if error < eps:
            break

    return policy, value_function
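
The following is a minimal usage sketch, not part of the source: it assumes value_iteration and its helpers (init_value_function, TabularPolicy) from the examples above are importable, and builds a hypothetical two-state model exposing the num_states, num_actions, terminal_states, and transitions attributes the loop reads.

from types import SimpleNamespace

# Hypothetical toy MDP: in state 0, action 1 reaches the terminal state 1 with reward 1.
toy_model = SimpleNamespace(
    num_states=2,
    num_actions=2,
    terminal_states=[1],
    transitions={
        (0, 0): [{"next_state": 0, "probability": 1.0, "reward": 0.0}],
        (0, 1): [{"next_state": 1, "probability": 1.0, "reward": 1.0}],
        (1, 0): [{"next_state": 1, "probability": 1.0, "reward": 0.0}],
        (1, 1): [{"next_state": 1, "probability": 1.0, "reward": 0.0}],
    },
)
policy, value_function = value_iteration(toy_model, gamma=0.9)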
Example 3
def get_default_policy(environment, function_approximation):
    """Get default policy."""
    if function_approximation == "tabular":
        policy = TabularPolicy.default(environment)
    elif function_approximation == "linear":
        policy = NNPolicy.default(environment, layers=[200])
        freeze_hidden_layers(policy)
    else:
        policy = NNPolicy.default(environment)
    return policy
Example 4
    def test_set_value(self):
        policy = TabularPolicy(num_states=4, num_actions=2)
        policy.set_value(2, torch.tensor(1))
        l1 = torch.log(torch.tensor(1e-12))
        l2 = torch.log(torch.tensor(1.0 + 1e-12))
        torch.testing.assert_allclose(
            policy.table, torch.tensor([[1.0, 1.0, l1, 1], [1.0, 1.0, l2, 1]])
        )

        policy.set_value(0, torch.tensor([0.3, 0.7]))
        torch.testing.assert_allclose(
            policy.table, torch.tensor([[0.3, 1.0, l1, 1], [0.7, 1.0, l2, 1]])
        )
Example 5
    def test_init(self):
        policy = TabularPolicy(num_states=4, num_actions=2)
        torch.testing.assert_allclose(policy.table, torch.ones(2, 4))
import seaborn as sns
import torch
from torch.distributions import Categorical

from exps.plotting import set_figure_params

# TabularPolicy, average_policy_evaluation, and mdp2mrp are assumed to be importable
# from the surrounding project.

palette = sns.color_palette(n_colors=15)

eta = 0.5
num_states = 3
num_actions = 2
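# Transition kernel P[state, action, next_state] of a 3-state, 2-action MDP.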
P = torch.tensor([
    [[0.8, 0.2, 0.0], [0.05, 0.9, 0.05]],
    [[0.0, 1.0, 0.0], [0.05, 0.05, 0.9]],
    [[0.1, 0.1, 0.8], [0.05, 0.05, 0.9]],
])
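# Rewards r[state, action].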
r = torch.tensor([[-0.0, 0], [-0.1, 0], [-0.0, 1.0]])
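# Deterministic reference policy that always picks action 1; its logits are copied into a TabularPolicy.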
pi = Categorical(probs=torch.tensor([[0.0, 1.0], [0.0, 1.0], [0.0, 1.0]]))
policy = TabularPolicy(num_states=num_states, num_actions=num_actions)
for state in range(num_states):
    policy.set_value(state, pi.logits[state])

# Average reward of the policy and the Markov reward process it induces on the MDP.
rho = average_policy_evaluation(policy=policy, transitions=P, rewards=r)
P_, r_ = mdp2mrp(policy=policy, transitions=P, rewards=r)

# Average-reward setting: subtract the gain rho from the rewards.
r = r - rho
r_ = r_ - rho

# Fixed-point iteration for the differential (bias) value function, with V[0] anchored at 0.
V = torch.zeros(num_states)
for _ in range(1000):
    V[1:] = P_[1:, 0, :] @ V + r_[1:, 0]

# Q = r + (P @ V).squeeze(-1)
Q = torch.tensor([[-1.0, 0], [0.5, 1.0], [0.2, 0.5]])