Example #1
 def get_MDP_sample(self):
     mdp = MDP.DiscreteMDP(self.n_states, self.n_actions)
     for s in range(self.n_states):
         for a in range(self.n_actions):
             ## Sample transition probabilities from the Dirichlet posterior
             mdp.P[s,a] = np.random.dirichlet(self.alpha[s, a])
             ## Sample the expected reward from the Beta posterior
             mdp.R[s,a] = np.random.beta(self.reward_alpha[s,a], self.reward_beta[s,a])
     return mdp
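This sampler draws a single MDP from conjugate posteriors: a Dirichlet over next-state probabilities and a Beta over each mean reward. The counts it reads (self.alpha, self.reward_alpha, self.reward_beta) would normally be maintained by a conjugate belief update; a minimal sketch, in which the class name and the update method are illustrative rather than taken from the original code:

import numpy as np

class DirichletBetaBelief:
    ## Hypothetical container for the counts that get_MDP_sample reads
    def __init__(self, n_states, n_actions, prior_count=0.5):
        self.n_states = n_states
        self.n_actions = n_actions
        ## Dirichlet pseudo-counts over next states for each (s, a) pair
        self.alpha = np.ones((n_states, n_actions, n_states)) * prior_count
        ## Beta pseudo-counts for rewards assumed to lie in [0, 1]
        self.reward_alpha = np.ones((n_states, n_actions))
        self.reward_beta = np.ones((n_states, n_actions))

    def update(self, s, a, r, s_next):
        ## Conjugate update after observing a transition (s, a, r, s')
        self.alpha[s, a, s_next] += 1
        self.reward_alpha[s, a] += r
        self.reward_beta[s, a] += 1 - r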
Example #2
 def get_mean_MDP(self):
     mdp = MDP.DiscreteMDP(self.n_states, self.n_actions)
     for s in range(self.n_states):
         for a in range(self.n_actions):
             ## Use the posterior mean (marginal) transition probabilities
             mdp.P[s,a] = self.get_marginal_transition_probabilities(s, a)
             ## Use the posterior expected reward
             mdp.R[s,a] = self.get_expected_reward(s,a)
     return mdp
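Here the MDP is built from posterior means rather than posterior samples. Assuming the same Dirichlet/Beta posteriors as in Example #1, the two helpers reduce to simple closed forms; a plausible sketch of them as methods of the same belief class (not the original implementations):

import numpy as np

def get_marginal_transition_probabilities(self, s, a):
    ## Posterior mean of a Dirichlet: normalised pseudo-counts
    return self.alpha[s, a] / np.sum(self.alpha[s, a])

def get_expected_reward(self, s, a):
    ## Posterior mean of a Beta(alpha, beta) distribution
    return self.reward_alpha[s, a] / (self.reward_alpha[s, a] + self.reward_beta[s, a])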
Example #3
import numpy as np
import MDP


def backwards_induction(mdp, T):
    ## Finite-horizon backward induction: sweep from the last step t = T - 1 back to t = 0
    V = np.zeros([mdp.n_states, T])
    policy = np.zeros([mdp.n_states, T], dtype=int)
    Q = np.zeros([mdp.n_states, mdp.n_actions, T])
    for t in range(T - 1, -1, -1):
        for s in range(mdp.n_states):
            for a in range(mdp.n_actions):
                if (t == T - 1):
                    ## At the final step only the immediate reward counts
                    Q[s, a, t] = mdp.get_reward(s, a)
                else:
                    ## Bellman backup: immediate reward plus expected value at t + 1
                    P_sa = mdp.get_transition_probabilities(s, a)
                    U_next = sum(P_sa * V[:, t + 1])
                    Q[s, a, t] = mdp.get_reward(s, a) + U_next
            V[s, t] = max(Q[s, :, t])
            policy[s, t] = np.argmax(Q[s, :, t])
    return policy, V, Q


n_actions = 2
n_states = 2
T = 1000
mdp = MDP.DiscreteMDP(n_states, n_actions)
policy, V, Q = backwards_induction(mdp, T)

for s in range(mdp.n_states):
    for a in range(mdp.n_actions):
        print("S:", s, "A:", a, mdp.get_transition_probabilities(s, a))

for t in range(T):
    print(policy[:, t])

for t in range(T):
    print(V[:, t])
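
As a quick sanity check, the computed finite-horizon policy can be rolled out for one trajectory, using only methods already called above; the start state and the way next states are sampled are assumptions of this sketch:

## Roll out a single trajectory of length T under the computed policy
s = 0
total_reward = 0.0
for t in range(T):
    a = int(policy[s, t])
    total_reward += mdp.get_reward(s, a)
    P_sa = mdp.get_transition_probabilities(s, a)
    s = np.random.choice(mdp.n_states, p=P_sa)
print("Return of the backward-induction policy:", total_reward)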