import numpy as np
import MDP

def get_MDP_sample(self):
    mdp = MDP.DiscreteMDP(self.n_states, self.n_actions)
    for s in range(self.n_states):
        for a in range(self.n_actions):
            ## Sample transitions from the Dirichlet posterior
            mdp.P[s, a] = np.random.dirichlet(self.alpha[s, a])
            ## Sample rewards from the Beta posterior
            mdp.R[s, a] = np.random.beta(self.reward_alpha[s, a], self.reward_beta[s, a])
    return mdp
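## A minimal sketch of the conjugate posterior update that the sampling above
## assumes: Dirichlet counts over next states and Beta counts over Bernoulli
## rewards. The method name `update_belief` and the exact attribute shapes
## are assumptions for illustration, not taken from the original source.
def update_belief(self, s, a, next_s, r):
    ## Dirichlet update: one more observed transition (s, a) -> next_s
    self.alpha[s, a, next_s] += 1
    ## Beta update for a Bernoulli reward r in {0, 1}
    self.reward_alpha[s, a] += r
    self.reward_beta[s, a] += 1 - r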
def get_mean_MDP(self):
    mdp = MDP.DiscreteMDP(self.n_states, self.n_actions)
    for s in range(self.n_states):
        for a in range(self.n_actions):
            ## Use the posterior mean transitions and rewards
            mdp.P[s, a] = self.get_marginal_transition_probabilities(s, a)
            mdp.R[s, a] = self.get_expected_reward(s, a)
    return mdp
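## A minimal sketch of the two helpers called above, under the same
## Dirichlet/Beta assumptions: the Dirichlet mean is the normalised count
## vector, and the Beta mean is alpha / (alpha + beta). These definitions
## are assumed, not taken from the original source.
def get_marginal_transition_probabilities(self, s, a):
    ## Posterior mean of the Dirichlet over next states
    return self.alpha[s, a] / np.sum(self.alpha[s, a])

def get_expected_reward(self, s, a):
    ## Posterior mean of the Beta over rewards
    return self.reward_alpha[s, a] / (self.reward_alpha[s, a] + self.reward_beta[s, a])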
def backwards_induction(mdp, T):
    ## Finite-horizon dynamic programming, working backwards from stage T - 1
    policy = np.zeros([mdp.n_states, T], dtype=int)
    V = np.zeros([mdp.n_states, T])
    Q = np.zeros([mdp.n_states, mdp.n_actions, T])
    for t in range(T - 1, -1, -1):
        for s in range(mdp.n_states):
            for a in range(mdp.n_actions):
                if t == T - 1:
                    ## At the final stage there is no future value
                    Q[s, a, t] = mdp.get_reward(s, a)
                else:
                    P_sa = mdp.get_transition_probabilities(s, a)
                    ## Expected value of the next stage under P(. | s, a)
                    U_next = np.dot(P_sa, V[:, t + 1])
                    Q[s, a, t] = mdp.get_reward(s, a) + U_next
            V[s, t] = max(Q[s, :, t])
            policy[s, t] = np.argmax(Q[s, :, t])
    return policy, V, Q

n_actions = 2
n_states = 2
T = 1000
mdp = MDP.DiscreteMDP(n_states, n_actions)
policy, V, Q = backwards_induction(mdp, T)
for s in range(mdp.n_states):
    for a in range(mdp.n_actions):
        print("S:", s, "A:", a, mdp.get_transition_probabilities(s, a))
for t in range(T):
    print(policy[:, t])
for t in range(T):
    print(V[:, t])
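## A minimal sketch combining the pieces above into one Thompson-sampling
## style planning step: draw a single MDP from the posterior belief, plan in
## it by backwards induction, and act on the first-stage policy. The function
## name and the `belief` argument are assumptions; `belief` is any object
## exposing get_MDP_sample() as defined above.
def thompson_sampling_step(belief, s, T):
    sampled_mdp = belief.get_MDP_sample()              ## one posterior draw
    policy, V, Q = backwards_induction(sampled_mdp, T)
    return policy[s, 0]                                ## greedy action at stage 0

## Swapping get_MDP_sample() for get_mean_MDP() here would give
## certainty-equivalent planning in the posterior mean MDP instead.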