Exemple #1
0
    def sample_transition(self,
                          max_n,
                          policy,
                          seed=None,
                          with_restart=True,
                          s_start=None):
        """
        generator yielding at most max_n transitions sampled from the MDP

        A chain starts in s_start if it is given, otherwise in a state drawn
        with multinomial_sample from self.P0. Each visit of a terminal state
        increases a counter; once that counter exceeds self.terminal_trans
        the counter is reset and the current chain ends.

            max_n: maximum number of samples to draw

            policy: policy python function, called with the current state

            seed: optional seed; if not None it is passed to np.random.seed
                before any sample is drawn

            with_restart: if true, a new chain is started whenever the
                current one ends (until max_n samples were drawn); if false,
                the generator stops after the first chain

            s_start: optional fixed start state used for every chain

            yields transition tuples (X_n, A, X_n+1, R)
        """
        if seed is not None:
            np.random.seed(seed)

        drawn = 0
        terminal_visits = 0
        while drawn < max_n:
            # pick the start state of this chain
            state = multinomial_sample(1, self.P0) if s_start is None else s_start

            while drawn < max_n:
                if self.s_terminal[state]:
                    terminal_visits += 1
                    if terminal_visits > self.terminal_trans:
                        # enough transitions out of terminal states: end chain
                        terminal_visits = 0
                        break
                action = policy(state)
                successor = multinomial_sample(1, self.P[state, action])
                yield (state, action, successor,
                       self.r[state, action, successor])
                drawn += 1
                state = successor

            if not with_restart:
                return
Exemple #2
0
    def synchronous_sweep(self, seed=None, policy="uniform"):
        """
        generate samples from the MDP so that exactly one transition from each
        non-terminal-state is yielded

        Parameters
        -----------
            policy: policy python function called with a state, or the string
                "uniform" (default) to use the policy returned by
                self.uniform_policy()

            seed: optional seed for the random generator to generate
                deterministic samples; if not None it is passed to
                np.random.seed before sampling

        Returns
        ---------
            generator of transition tuples (X_n, A, X_n+1, R)
        """
        if seed is not None:
            np.random.seed(seed)
        # compare by value: identity of string literals is an interpreter
        # detail and "is" with a literal warns on Python >= 3.8
        if policy == "uniform":
            policy = self.uniform_policy()

        for s0 in self.states:
            if self.s_terminal[s0]:
                # skip terminal states but keep sweeping, so non-terminal
                # states listed after a terminal one are still sampled
                continue
            a = policy(s0)
            s1 = multinomial_sample(1, self.P[s0, a])
            r = self.r[s0, a, s1]
            yield (s0, a, s1, r)
Exemple #3
0
    def synchronous_sweep(self, seed=None, policy="uniform"):
        """
        generate samples from the MDP so that exactly one transition from each
        non-terminal-state is yielded

        Parameters
        -----------
            policy: policy python function called with a state, or the string
                "uniform" (default) to use the policy returned by
                self.uniform_policy()

            seed: optional seed for the random generator to generate
                deterministic samples; if not None it is passed to
                np.random.seed before sampling

        Returns
        ---------
            generator of transition tuples (X_n, A, X_n+1, R)
        """
        if seed is not None:
            np.random.seed(seed)
        # compare by value: identity of string literals is an interpreter
        # detail and "is" with a literal warns on Python >= 3.8
        if policy == "uniform":
            policy = self.uniform_policy()

        for s0 in self.states:
            if self.s_terminal[s0]:
                # skip terminal states but keep sweeping, so non-terminal
                # states listed after a terminal one are still sampled
                continue
            a = policy(s0)
            s1 = multinomial_sample(1, self.P[s0, a])
            r = self.r[s0, a, s1]
            yield (s0, a, s1, r)
Exemple #4
0
    def sample_transition(self, max_n, policy, seed=None,
                          with_restart=True, s_start=None):
        """
        generator yielding at most max_n transitions sampled from the MDP

        A chain starts in s_start if it is given, otherwise in a state drawn
        with multinomial_sample from self.P0. Each visit of a terminal state
        increases a counter; once that counter exceeds self.terminal_trans
        the counter is reset and the current chain ends.

            max_n: maximum number of samples to draw

            policy: policy python function, called with the current state

            seed: optional seed; if not None it is passed to np.random.seed
                before any sample is drawn

            with_restart: if true, a new chain is started whenever the
                current one ends (until max_n samples were drawn); if false,
                the generator stops after the first chain

            s_start: optional fixed start state used for every chain

            yields transition tuples (X_n, A, X_n+1, R)
        """
        if seed is not None:
            np.random.seed(seed)

        drawn = 0
        terminal_visits = 0
        while drawn < max_n:
            # pick the start state of this chain
            state = multinomial_sample(1, self.P0) if s_start is None else s_start

            while drawn < max_n:
                if self.s_terminal[state]:
                    terminal_visits += 1
                    if terminal_visits > self.terminal_trans:
                        # enough transitions out of terminal states: end chain
                        terminal_visits = 0
                        break
                action = policy(state)
                successor = multinomial_sample(1, self.P[state, action])
                yield (state, action, successor,
                       self.r[state, action, successor])
                drawn += 1
                state = successor

            if not with_restart:
                return
Exemple #5
0
 def samples_cached_transitions(self, policy, states, seed=None):
     """
     sample one transition for every entry of states

     For each states[i] an action is obtained from policy, a successor is
     drawn with multinomial_sample from self.P[state, action] and the reward
     is read from self.r[state, action, successor].

         policy: policy python function, called once per entry of states

         states: numpy array of states, iterated along its first axis
             (NOTE(review): every entry and every action is converted with
             int(), which presumably requires scalar states and
             dim_A == 1 -- confirm against callers)

         seed: optional seed for the random generator to generate
             deterministic samples; if not None it is passed to
             np.random.seed before sampling

         returns (a, r, sn): actions with shape [n, self.dim_A], rewards
             with shape [n] and successor states shaped like states
     """
     # honour the seed argument the same way the other samplers do
     if seed is not None:
         np.random.seed(seed)
     n = states.shape[0]
     sn = np.zeros_like(states)
     a = np.ones([n, self.dim_A])
     r = np.ones(n)
     # range instead of xrange: works on Python 2 and Python 3
     for i in range(n):
         a[i] = policy(states[i])
         s0, a0 = int(states[i]), int(a[i])
         sn[i] = multinomial_sample(1, self.P[s0, a0])
         r[i] = self.r[s0, a0, int(sn[i])]
     return a, r, sn
Exemple #6
0
 def samples_cached_transitions(self, policy, states, seed=None):
     """
     sample one transition for every entry of states

     For each states[i] an action is obtained from policy, a successor is
     drawn with multinomial_sample from self.P[state, action] and the reward
     is read from self.r[state, action, successor].

         policy: policy python function, called once per entry of states

         states: numpy array of states, iterated along its first axis
             (NOTE(review): every entry and every action is converted with
             int(), which presumably requires scalar states and
             dim_A == 1 -- confirm against callers)

         seed: optional seed for the random generator to generate
             deterministic samples; if not None it is passed to
             np.random.seed before sampling

         returns (a, r, sn): actions with shape [n, self.dim_A], rewards
             with shape [n] and successor states shaped like states
     """
     # honour the seed argument the same way the other samplers do
     if seed is not None:
         np.random.seed(seed)
     n = states.shape[0]
     sn = np.zeros_like(states)
     a = np.ones([n, self.dim_A])
     r = np.ones(n)
     # range instead of xrange: works on Python 2 and Python 3
     for i in range(n):
         a[i] = policy(states[i])
         s0, a0 = int(states[i]), int(a[i])
         sn[i] = multinomial_sample(1, self.P[s0, a0])
         r[i] = self.r[s0, a0, int(sn[i])]
     return a, r, sn
Exemple #7
0
 def __call__(self, s):
     """Draw one sample with util.multinomial_sample from row int(s) of self.tab."""
     row = self.tab[int(s), :]
     return util.multinomial_sample(1, row)
Exemple #8
0
 def __call__(self, s):
     """Draw one sample with util.multinomial_sample from row int(s) of self.tab."""
     row = self.tab[int(s), :]
     return util.multinomial_sample(1, row)