Example #1
    def run(self, π_list):
        (S, A, P, R, C, H) = (self.S, self.A, self.P, self.R, self.C, self.H)

        p_traj = np.zeros((S, A, S))
        r_traj = np.zeros((S, A))
        c_traj = np.zeros((S, A))
        visitation = np.zeros((S, A))

        # Collapse 3-D reward/cost tensors R[s,a,s'] into expected per-(s,a)
        # values under the transition kernel P.
        if np.ndim(R) == 3:
            R = np.einsum("sap,sap->sa", P, R)
        if np.ndim(C) == 3:
            C = np.einsum("sap,sap->sa", P, C)

        # Roll out one H-step trajectory, accumulating empirical transition
        # counts, rewards, costs, and state-action visitation.
        s = sample(self.s0)
        for h in range(H):
            if callable(π_list):
                a = π_list(s)
            else:
                a = π_list[h, s, :].argmax()

            s_next = sample(P[s, a, :])
            p_traj[s, a, s_next] += 1.0
            r_traj[s, a] += R[s, a]
            c_traj[s, a] += C[s, a]
            visitation[s, a] += 1.0
            s = s_next
        self.stats['training_consumption'] += c_traj.sum()
        self.stats['num_trajs'] += 1
        return p_traj, r_traj, c_traj, visitation, self.Si.lookup(s)
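
A usage sketch for `run` (hypothetical: `env` stands for an instance of the class above, exposing the `H`, `S`, `A` attributes the method unpacks; `run` acts greedily on each row of `π_list`):

    import numpy as np

    # Time-indexed tabular policy: one row of action scores per (step, state).
    π_list = np.random.rand(env.H, env.S, env.A)
    p_traj, r_traj, c_traj, visits, last_state = env.run(π_list)

    # Empirical transition frequencies on visited (s, a) pairs (NaN elsewhere).
    with np.errstate(invalid='ignore'):
        P_hat = p_traj / visits[:, :, None]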
Example #2
    def simulate(self, π, s=None, a=None, mode='direct'):
        """Simulation is under various `modes` -- i.e., interpretations of γ:

        - "direct": episodes never terminate and might not mix (i.e., get stuck
          in a subset of the state space if the underlying Markov chain is not
          ergodic conditioned on π).

        - "terminate": with probability (1-γ) the episode terminates.

        - "reset": with probability (1-γ) we reset the trajectory to an initial
          state drawn from `s0`.

        """
        # Coerce arrays into functions
        if isinstance(π, np.ndarray): π = lambda s, π=π: sample(π[s, :])
        assert mode in ('direct', 'terminate', 'reset'), mode
        if s is None: s = sample(self.s0)
        if a is None: a = π(s)
        while True:
            sp = sample(self.P[s, a, :])
            r = self.R[s, a, sp]  # consistent with r[s,a] = E[R[s,a,s']]
            yield (s, a, r)
            if mode != 'direct':
                if np.random.uniform(0, 1) <= (1 - self.γ):
                    if mode == 'terminate':
                        return
                    else:
                        sp = sample(self.s0)
            s = sp
            a = π(s)
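
A minimal sketch of driving this generator (`estimate_return` is an illustrative helper, not part of the source; `mdp` stands for an instance of the class above). Under mode='terminate', each reward r_t is observed only if the episode survives t termination coins, i.e. with probability γ^t, so the plain per-episode reward sum is an unbiased estimate of the discounted return from `s0`:

    def estimate_return(mdp, π, n=1000):
        # Average of undiscounted reward sums over terminating episodes
        # estimates Σ_t γ^t E[r_t].
        total = 0.0
        for _ in range(n):
            total += sum(r for (s, a, r) in mdp.simulate(π, mode='terminate'))
        return total / n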
Example #3
 def run(self, learner):
     s = sample(self.s0)
     while True:
         a = learner(s)
         # With probability (1 - γ), reset to an initial state with zero
         # reward -- the "reset" interpretation of discounting.
         if np.random.uniform() <= (1 - self.gamma):
             sp = sample(self.s0)
             r = 0
         else:
             sp = sample(self.P[s, a, :])
             r = self.R[s, a, sp]
         # The learner's update() return value decides whether to continue.
         if not learner.update(s, a, r, sp):
             break
         s = sp
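
The `learner` is assumed to be callable (state → action) and to expose `update(s, a, r, sp)`, whose return value keeps the loop alive. A minimal sketch of such an agent -- illustrative tabular ε-greedy Q-learning, where every name and parameter is an assumption rather than part of the source:

    import numpy as np

    class QLearner:
        def __init__(self, S, A, gamma, alpha=0.1, eps=0.1, max_steps=10000):
            self.Q = np.zeros((S, A))
            self.gamma, self.alpha, self.eps = gamma, alpha, eps
            self.steps_left = max_steps

        def __call__(self, s):
            # ε-greedy action selection.
            if np.random.uniform() < self.eps:
                return np.random.randint(self.Q.shape[1])
            return int(self.Q[s].argmax())

        def update(self, s, a, r, sp):
            # One-step Q-learning backup; returning False stops run().
            td = r + self.gamma * self.Q[sp].max() - self.Q[s, a]
            self.Q[s, a] += self.alpha * td
            self.steps_left -= 1
            return self.steps_left > 0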
Example #4
 def sample(self, v=None):
     "Sample from parse forest."
     if v is None: v = self.root
     edges = self.incoming[v]
     # base case (leaf), nothing to sample
     if not edges: return v
     # sample incoming edge, p(e|head) \propto edge.weight * (\prod_{b in e.body} beta[b])
     ps = [float(e.weight) for e in edges]
     # sample one of the incoming edges
     i = sample(ps)
     e = edges[i]
     return Tree(v, [self.sample(y) for y in e.body])
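
Every example on this page leans on a free function `sample` that is not itself shown: given a vector of (possibly unnormalized) non-negative weights, it draws an index in proportion to them. A minimal sketch of the assumed behavior:

    import numpy as np

    def sample(ws):
        # Draw index i with probability ws[i] / sum(ws).
        # Assumed behavior only; the actual helper is not shown above.
        ws = np.asarray(ws, dtype=float)
        return np.random.choice(len(ws), p=ws / ws.sum())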
Example #5
    def __call__(self, P=None, R=None, theta=None, fic=False):
        if fic:
            # Fictitious model: collapse R[s,a,s'] to expected R[s,a] if needed.
            R = np.einsum('sap,sap->sa', P, R) if R.ndim == 3 else R
            num_episodes = self.num_fic_episodes
        else:
            R = self.R if R is None else R
            P = self.P if P is None else P
            num_episodes = self.num_episodes

        (C, H, Si, S, A) = (self.C, self.H, self.Si, self.S, self.A)

        s = None
        for t in range(num_episodes):
            s = sample(self.s0)
            c = 0
            for h in range(H):
                rep = self.G.get_representation(s=s, Si=Si)
                a = self.select_action(rep)
                sp = sample(P[s, a])
                r = R[s, a, sp] if R.ndim == 3 else R[s, a]
                c += C[s, a]
                self.policy.rewards.append(r)
                s = sp
            self.finish_episode()
            self.stats['training_consumption'] += c
        self.stats['num_trajs'] += num_episodes
        π_list = np.repeat(self.to_table()[np.newaxis, :, :], H, axis=0)
        results = {
            'V': None,
            'Q': None,
            'pi_list': π_list,
            'last_state': Si.lookup(s)
        }
        return results
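
The `select_action` / `policy.rewards` / `finish_episode` trio matches the familiar PyTorch REINFORCE pattern, but the class itself is not shown. A speculative sketch of what `finish_episode` could look like under that pattern (the `optimizer` and the `saved_log_probs` buffer are assumptions, not part of the source):

    def finish_episode(self):
        # Returns-to-go for the recorded finite-horizon episode, then one
        # policy-gradient step on -log π(a|s) * G. Assumes select_action
        # stored log-prob tensors in self.policy.saved_log_probs.
        G, returns = 0.0, []
        for r in reversed(self.policy.rewards):
            G = r + G
            returns.insert(0, G)
        loss = sum(-lp * G for lp, G in
                   zip(self.policy.saved_log_probs, returns))
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        del self.policy.rewards[:]
        del self.policy.saved_log_probs[:]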
Example #6
    def sample(self, v):
        "Sample an edge (dotted rule)."
        assert isinstance(v, tuple)

        edges = self.g.incoming[v]
        # base case (leaf), nothing to sample
        if not edges: return
        # sample incoming edge, p(e|head) \propto edge.weight * (\prod_{b in e.body} beta[b])
        ws = [e.weight for e in edges]
        ps = [float(w) for w in ws]
        # sample one of the incoming edges
        i = sample(ps)
        e = edges[i]
        self.total *= ws[i]

        return DottedRule(e)
Example #7
 def start(self):
     return sample(self.s0)
Example #8
 def step_exact(self, s, a):
     sp = sample(self.P[s, a, :])
     r = self.R[s, a, sp]
     return r, sp
Example #9
 def step(self, s, a):
     sp = sample(self.P[s, a, :])
     r = self.R[s, a, sp]
     # with probability (1 - γ), restart from the initial state distribution
     if uniform(0, 1) <= 1 - self.gamma:
         sp = self.start()
     return r, sp
Example #10
 def step(self, s):
     # with probability (1 - γ), restart the chain; otherwise transition
     if np.random.uniform(0, 1) <= 1 - self.gamma:
         return self.start()
     return sample(self.P[s, :])
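
The (1 - γ) reset trick in the last two examples makes the chain's stationary distribution coincide with the normalized γ-discounted state-occupancy measure. A minimal Monte Carlo sketch (`chain` is hypothetical, standing for an instance exposing `step` and `start` as above, with `S` states):

    import numpy as np

    def occupancy_estimate(chain, S, n_steps=100000):
        # Long-run visit frequencies of the reset chain approximate the
        # normalized occupancy measure (1-γ) Σ_t γ^t Pr(s_t = s).
        counts = np.zeros(S)
        s = chain.start()
        for _ in range(n_steps):
            counts[s] += 1
            s = chain.step(s)
        return counts / counts.sum()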