def sample_transition(self, max_n, policy, seed=None, with_restart=True, s_start=None):
    """
    Generator that samples transitions from the MDP.

    An episode starts in ``s_start`` if given, otherwise in a state drawn
    with ``multinomial_sample(1, self.P0)``.  Within an episode the policy
    picks an action, the successor is drawn with
    ``multinomial_sample(1, self.P[s0, a])`` and the reward is read from
    ``self.r[s0, a, s1]``.  Every visit to a terminal state (as flagged by
    ``self.s_terminal``) is counted; once that count exceeds
    ``self.terminal_trans`` the episode ends.

        max_n: maximum total number of transitions to yield
            (counted across restarts)

        policy: callable mapping a state to an action

        seed: optional seed; when not None, ``np.random.seed(seed)`` is
            called before sampling starts

        with_restart: if true, a new episode is started whenever the
            current one ends; if false, the generator stops after the
            first episode

        s_start: optional fixed start state used for every episode

        yields transition tuples (X_n, A, X_n+1, R)

    NOTE(review): with a random start state (``s_start`` is None) the
    generator can still loop without yielding if every start state that
    can be drawn is terminal and ``self.terminal_trans`` is 0.
    """
    if seed is not None:
        np.random.seed(seed)

    drawn = 0
    terminal_visits = 0
    while drawn < max_n:
        if s_start is None:
            state = multinomial_sample(1, self.P0)
        else:
            state = s_start

        drawn_before_episode = drawn
        while drawn < max_n:
            if self.s_terminal[state]:
                terminal_visits += 1
                if terminal_visits > self.terminal_trans:
                    # Budget of transitions out of terminal states is used
                    # up: reset the counter and end this episode.
                    terminal_visits = 0
                    break
            action = policy(state)
            successor = multinomial_sample(1, self.P[state, action])
            reward = self.r[state, action, successor]
            yield (state, action, successor, reward)
            drawn += 1
            state = successor

        if not with_restart:
            return
        if s_start is not None and drawn == drawn_before_episode:
            # The episode ended without producing a single transition.
            # With a fixed start state every restart would replay exactly
            # the same empty episode, so stop instead of spinning forever.
            return
def synchronous_sweep(self, seed=None, policy="uniform"):
    """
    Generate samples from the MDP so that exactly one transition from
    each non-terminal state is yielded.

    Parameters
    -----------
        policy: callable mapping a state to an action, or the string
            "uniform", in which case ``self.uniform_policy()`` is used

        seed: optional seed; when not None, ``np.random.seed(seed)`` is
            called before sampling starts

    Returns
    ---------
        generator of transition tuples (X_n, A, X_n+1, R), one per
        non-terminal state, in the iteration order of ``self.states``
    """
    if seed is not None:
        np.random.seed(seed)

    # Compare by value: identity (`is`) against a string literal only works
    # by accident of interning.  Callables are never compared at all.
    if not callable(policy) and policy == "uniform":
        policy = self.uniform_policy()

    for s0 in self.states:
        if self.s_terminal[s0]:
            # Skip terminal states but keep sweeping; stopping here would
            # drop every non-terminal state that comes after this one.
            continue
        a = policy(s0)
        s1 = multinomial_sample(1, self.P[s0, a])
        r = self.r[s0, a, s1]
        yield (s0, a, s1, r)
def samples_cached_transitions(self, policy, states, seed=None):
    """
    Sample one transition for each entry of ``states``.

    For every ``states[i]`` the policy picks an action, a successor is
    drawn with ``multinomial_sample(1, self.P[s, a])`` and the reward is
    read from ``self.r[s, a, s']``.

        policy: callable mapping a state to an action

        states: numpy array of start states; ``states.shape[0]``
            transitions are sampled

        seed: optional seed; when not None, ``np.random.seed(seed)`` is
            called before sampling starts

        returns a tuple (a, r, sn): the actions with shape
            (n, self.dim_A), the rewards with shape (n,) and the
            successor states with the shape and dtype of ``states``
    """
    # Honour the seed the same way the other sampling methods do.
    if seed is not None:
        np.random.seed(seed)

    n = states.shape[0]
    sn = np.zeros_like(states)
    a = np.ones([n, self.dim_A])
    r = np.ones(n)
    for i in range(n):
        a[i] = policy(states[i])
        # int() on the action row only succeeds when that row holds a
        # single element, so this effectively requires self.dim_A == 1.
        s_idx = int(states[i])
        a_idx = int(a[i])
        sn[i] = multinomial_sample(1, self.P[s_idx, a_idx])
        r[i] = self.r[s_idx, a_idx, int(sn[i])]
    return a, r, sn
def __call__(self, s):
    """
    Return the result of ``util.multinomial_sample`` for state ``s``.

    The row ``int(s)`` of ``self.tab`` is handed to
    ``util.multinomial_sample`` together with a count of 1.
    NOTE(review): presumably that row holds the action probabilities for
    the state and the result is the sampled action -- confirm against
    ``util.multinomial_sample``.
    """
    state_row = self.tab[int(s), :]
    return util.multinomial_sample(1, state_row)