def run(self, π_list):
    (S, A, P, R, C, H) = (self.S, self.A, self.P, self.R, self.C, self.H)
    p_traj = np.zeros((S, A, S))
    r_traj = np.zeros((S, A))
    c_traj = np.zeros((S, A))
    visitation = np.zeros((S, A))
    # Collapse next-state-dependent rewards/costs to expectations under P.
    if len(np.shape(R)) == 3:
        R = np.einsum('sap,sap->sa', P, R)
    if len(np.shape(C)) == 3:
        C = np.einsum('sap,sap->sa', P, C)
    s = sample(self.s0)
    for h in range(H):
        if isinstance(π_list, np.ndarray):
            # Greedy action of the step-h policy table; a stochastic draw
            # (sample(π_list[h, s, :])) would also work here.
            a = π_list[h, s, :].argmax()
        else:
            # π_list was given as a policy function rather than a table.
            a = π_list(s)
        s_next = sample(P[s, a, :])
        p_traj[s, a, s_next] += 1.0
        r_traj[s, a] += R[s, a]
        c_traj[s, a] += C[s, a]
        visitation[s, a] += 1.0
        s = s_next
    self.stats['training_consumption'] += c_traj.sum()
    self.stats['num_trajs'] += 1
    return p_traj, r_traj, c_traj, visitation, self.Si.lookup(s)
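# Usage sketch (hypothetical, not part of the original module): the counts
# returned by `run` suffice to build an empirical model, e.g. a smoothed MLE
# of the transition kernel. `env` is assumed to expose `run` as above.
def empirical_transition_model(env, π_list, n_trajs):
    agg_p = np.zeros((env.S, env.A, env.S))
    agg_visit = np.zeros((env.S, env.A))
    for _ in range(n_trajs):
        p_traj, _, _, visitation, _ = env.run(π_list)
        agg_p += p_traj
        agg_visit += visitation
    # Laplace smoothing keeps P_hat well-defined at unvisited (s,a) pairs.
    return (agg_p + 1.0) / (agg_visit[:, :, None] + env.S)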
def simulate(self, π, s=None, a=None, mode='direct'):
    """Simulate under one of several `modes` -- i.e., interpretations of γ:

    - "direct": episodes never terminate and might not mix (i.e., they can
      get stuck in a subset of the state space if the underlying Markov
      chain is not ergodic conditioned on π).

    - "terminate": with probability (1-γ) the episode terminates.

    - "reset": with probability (1-γ) we reset the trajectory to an initial
      state drawn from `s0`.

    """
    # Coerce a policy given as an array into a function.
    if isinstance(π, np.ndarray):
        π = lambda s, π=π: sample(π[s, :])
    assert mode in ('direct', 'terminate', 'reset'), mode
    if s is None: s = sample(self.s0)
    if a is None: a = π(s)
    while True:
        sp = sample(self.P[s, a, :])
        r = self.R[s, a, sp]   # consistent with r[s,a] = E[R[s,a,s']]
        yield (s, a, r)
        if mode != 'direct':
            if np.random.uniform(0, 1) <= (1 - self.γ):
                if mode == 'terminate':
                    return
                else:
                    sp = sample(self.s0)   # reset to an initial state
        s = sp
        a = π(s)
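# Usage sketch (hypothetical): in 'terminate' mode the probability of
# surviving to step t is γ^t, so the *undiscounted* return of an episode is
# an unbiased estimate of the γ-discounted value of the start state.
def estimate_value(m, π, n=1000):
    return np.mean([
        sum(r for (s, a, r) in m.simulate(π, mode='terminate'))
        for _ in range(n)
    ])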
def run(self, learner):
    s = sample(self.s0)
    while True:
        a = learner(s)
        if np.random.uniform() <= (1 - self.gamma):
            # Reset: with probability (1-γ), jump back to an initial state
            # with zero reward.
            sp = sample(self.s0)
            r = 0
        else:
            sp = sample(self.P[s, a, :])
            r = self.R[s, a, sp]
        if not learner.update(s, a, r, sp):
            break
        s = sp
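# The loop above assumes a learner protocol: `learner(s)` returns an action
# and `learner.update(s, a, r, sp)` returns False to stop the run. A minimal
# sketch of a conforming ε-greedy tabular Q-learner (hypothetical, purely to
# illustrate the interface):
class TabularQ:
    def __init__(self, S, A, gamma, alpha=0.1, eps=0.1, budget=10_000):
        self.Q = np.zeros((S, A))
        self.gamma, self.alpha, self.eps, self.budget = gamma, alpha, eps, budget
    def __call__(self, s):
        if np.random.uniform() < self.eps:
            return np.random.randint(self.Q.shape[1])   # explore
        return int(self.Q[s].argmax())                  # exploit
    def update(self, s, a, r, sp):
        target = r + self.gamma * self.Q[sp].max()
        self.Q[s, a] += self.alpha * (target - self.Q[s, a])
        self.budget -= 1
        return self.budget > 0   # returning False ends the run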
def sample(self, v=None):
    "Sample from parse forest."
    if v is None:
        v = self.root
    edges = self.incoming[v]
    # base case (leaf): nothing to sample
    if not edges:
        return v
    # sample incoming edge, p(e|head) \propto edge.weight * (\prod_{b in e.body} beta[b]);
    # the edge weights are presumably already scaled by the inside scores beta.
    ps = [float(e.weight) for e in edges]
    i = sample(ps)   # sample one of the incoming edges
    e = edges[i]
    return Tree(v, [self.sample(y) for y in e.body])
def __call__(self, P=None, R=None, theta=None, fic=False):
    if fic:
        # Fictitious episodes: roll out in the supplied model (P, R).
        R = np.einsum('sap,sap->sa', P, R) if len(R.shape) == 3 else R
        num_episodes = self.num_fic_episodes
    else:
        R = self.R if R is None else R
        P = self.P if P is None else P
        num_episodes = self.num_episodes
    (C, H, Si, S, A) = (self.C, self.H, self.Si, self.S, self.A)
    s = None
    for t in range(num_episodes):
        s = sample(self.s0)
        c = 0
        for h in range(H):
            rep = self.G.get_representation(s=s, Si=self.Si)
            a = self.select_action(rep)
            sp = sample(P[s, a])
            # R may be indexed by (s,a,s') or already collapsed to (s,a).
            r = R[s, a, sp] if np.ndim(R) == 3 else R[s, a]
            c += C[s, a]
            self.policy.rewards.append(r)
            s = sp
        self.finish_episode()
        self.stats['training_consumption'] += c
    self.stats['num_trajs'] += self.num_episodes
    π_list = np.repeat(self.to_table()[np.newaxis, :, :], H, axis=0)
    return {
        'V': None,
        'Q': None,
        'pi_list': π_list,
        'last_state': self.Si.lookup(s),
    }
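# Hypothetical usage note: 'pi_list' has shape (H, S, A) -- the learned
# stationary policy table repeated across the horizon -- so it can be fed
# directly to `run` above, which indexes it as π_list[h, s, :], e.g.:
#
#   results = agent()                                   # learn on the true model
#   p, r, c, visits, last = env.run(results['pi_list'])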
def sample(self, v):
    "Sample an edge (dotted rule)."
    assert isinstance(v, tuple)
    edges = self.g.incoming[v]
    # base case (leaf): nothing to sample
    if not edges:
        return
    # sample incoming edge, p(e|head) \propto edge.weight * (\prod_{b in e.body} beta[b])
    ws = [e.weight for e in edges]
    ps = [float(e.weight) for e in edges]
    i = sample(ps)   # sample one of the incoming edges
    e = edges[i]
    self.total *= ws[i]
    return DottedRule(e)
def start(self):
    return sample(self.s0)
def step_exact(self, s, a):
    sp = sample(self.P[s, a, :])
    r = self.R[s, a, sp]
    return r, sp
def step(self, s, a):
    sp = sample(self.P[s, a, :])
    r = self.R[s, a, sp]
    if uniform(0, 1) <= 1 - self.gamma:
        sp = self.start()   # reset with probability (1-γ)
    return r, sp
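# Sketch (hypothetical) of why the reset trick works: the stationary
# distribution of the reset chain is the normalized discounted occupancy
# (1-γ) Σ_t γ^t Pr[s_t = s], which a long run of `step` estimates directly.
def discounted_occupancy(m, π, T=100_000):
    counts = np.zeros(m.S)
    s = m.start()
    for _ in range(T):
        counts[s] += 1
        _, s = m.step(s, π(s))
    return counts / T   # ≈ (1-γ) Σ_t γ^t Pr[s_t = s]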
def step(self, s):
    if np.random.uniform(0, 1) <= 1 - self.gamma:
        return self.start()
    return sample(self.P[s, :])
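# For reference, a minimal sketch of the categorical `sample` helper assumed
# throughout this file (the module's actual implementation may differ): draw
# an index with probability proportional to nonnegative weights.
def sample_sketch(ws):
    ws = np.asarray(ws, dtype=float)
    cs = ws.cumsum()
    return int((cs < np.random.uniform(0, cs[-1])).sum())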