def tabular_representation(self):
    """Get a tabular representation of the policy."""
    policy = TabularPolicy(self.num_states, self.num_actions)
    for state in range(self.num_states):
        state = torch.tensor(state)
        policy.set_value(state, self(state).clone())
    return policy
def value_iteration(model, gamma, eps=1e-6, max_iter=1000, value_function=None):
    """Implement the Value Iteration algorithm.

    Parameters
    ----------
    model: dynamical model with countable states and actions.
    gamma: discount factor.
    eps: desired precision of the value function estimate.
    max_iter: maximum number of iterations.
    value_function: initial estimate of the value function, optional.

    Returns
    -------
    policy: greedy policy w.r.t. the returned value function.
    value_function: estimate of the optimal value function.

    References
    ----------
    Sutton, R. S., & Barto, A. G. (2018). Reinforcement learning: An introduction.
    MIT press. Chapter 4.4
    """
    if model.num_actions is None or model.num_states is None:
        raise NotImplementedError("Actions and States must be discrete and countable.")

    if value_function is None:
        value_function = init_value_function(model.num_states, model.terminal_states)
    policy = TabularPolicy(num_states=model.num_states, num_actions=model.num_actions)

    for _ in range(max_iter):
        error = 0
        for state in range(model.num_states):
            value_ = torch.zeros(model.num_actions)
            for action in range(model.num_actions):
                value_estimate = 0
                for transition in model.transitions[(state, action)]:
                    next_state = torch.tensor(transition["next_state"]).long()
                    value_estimate += transition["probability"] * (
                        transition["reward"] + gamma * value_function(next_state)
                    )
                value_[action] = value_estimate

            state = torch.tensor(state).long()
            value = value_function(state)
            value_, action = torch.max(value_, 0)

            error = max(error, torch.abs(value_ - value.item()))
            value_function.set_value(state, value_)
            policy.set_value(state, action)

        if error < eps:
            break

    return policy, value_function
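# Hedged usage sketch (not part of the library): ToyModel below is a hypothetical
# stand-in that only exposes the attributes value_iteration actually reads
# (num_states, num_actions, terminal_states, and a transitions dict keyed by
# (state, action) whose entries carry "next_state", "probability", and "reward").
# It relies on the library's TabularPolicy and init_value_function used above.
def _value_iteration_example():
    class ToyModel:
        num_states = 2
        num_actions = 2
        terminal_states = []
        # Action 0 stays in the current state; action 1 moves to the other state.
        # Moving from state 0 to state 1 yields a reward of 1.
        transitions = {
            (0, 0): [{"next_state": 0, "probability": 1.0, "reward": 0.0}],
            (0, 1): [{"next_state": 1, "probability": 1.0, "reward": 1.0}],
            (1, 0): [{"next_state": 1, "probability": 1.0, "reward": 0.0}],
            (1, 1): [{"next_state": 0, "probability": 1.0, "reward": 0.0}],
        }

    policy, value_function = value_iteration(ToyModel(), gamma=0.9, eps=1e-6)
    return policy, value_function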
def get_default_policy(environment, function_approximation):
    """Get default policy."""
    if function_approximation == "tabular":
        policy = TabularPolicy.default(environment)
    elif function_approximation == "linear":
        policy = NNPolicy.default(environment, layers=[200])
        freeze_hidden_layers(policy)
    else:
        policy = NNPolicy.default(environment)
    return policy
def test_set_value(self):
    policy = TabularPolicy(num_states=4, num_actions=2)
    policy.set_value(2, torch.tensor(1))

    l1 = torch.log(torch.tensor(1e-12))
    l2 = torch.log(torch.tensor(1.0 + 1e-12))
    torch.testing.assert_allclose(
        policy.table, torch.tensor([[1.0, 1.0, l1, 1], [1.0, 1.0, l2, 1]])
    )

    policy.set_value(0, torch.tensor([0.3, 0.7]))
    torch.testing.assert_allclose(
        policy.table, torch.tensor([[0.3, 1.0, l1, 1], [0.7, 1.0, l2, 1]])
    )
def test_init(self):
    policy = TabularPolicy(num_states=4, num_actions=2)
    torch.testing.assert_allclose(policy.table, torch.ones(2, 4))
import seaborn as sns
import torch
from torch.distributions import Categorical

from exps.plotting import set_figure_params

# TabularPolicy, average_policy_evaluation, and mdp2mrp come from the accompanying
# library (their imports are omitted in this excerpt).

palette = sns.color_palette(n_colors=15)

eta = 0.5
num_states = 3
num_actions = 2

# Transition kernel P[state, action, next_state] and rewards r[state, action].
P = torch.tensor(
    [
        [[0.8, 0.2, 0.0], [0.05, 0.9, 0.05]],
        [[0.0, 1.0, 0.0], [0.05, 0.05, 0.9]],
        [[0.1, 0.1, 0.8], [0.05, 0.05, 0.9]],
    ]
)
r = torch.tensor([[-0.0, 0], [-0.1, 0], [-0.0, 1.0]])

# Deterministic policy that always selects action 1.
pi = Categorical(probs=torch.tensor([[0.0, 1.0], [0.0, 1.0], [0.0, 1.0]]))
policy = TabularPolicy(num_states=num_states, num_actions=num_actions)
for state in range(num_states):
    policy.set_value(state, pi.logits[state])

# Average reward of the policy and the Markov reward process it induces.
rho = average_policy_evaluation(policy=policy, transitions=P, rewards=r)
P_, r_ = mdp2mrp(policy=policy, transitions=P, rewards=r)
r = r - rho
r_ = r_ - rho

# Relative value iteration on the induced MRP, keeping state 0 as reference (V[0] = 0).
V = torch.zeros(num_states)
for i in range(1000):
    V[1:] = P_[1:, 0, :] @ V + r_[1:, 0]

# Q = r + (P @ V).squeeze(-1)
Q = torch.tensor([[-1.0, 0], [0.5, 1.0], [0.2, 0.5]])
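# Hedged sketch (an assumption, not the script's choice): in the average-reward setting
# the action values satisfy Q(s, a) = r(s, a) - rho + sum_s' P(s' | s, a) * V(s').
# Since r was already shifted by rho above, this reduces to Q = r + P @ V, i.e. the
# commented-out expression; the script instead hard-codes Q by hand.
Q_from_V = r + P @ V  # shape (num_states, num_actions)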