def __call__(self, percept):
    s1, r1 = percept
    mdp = self.mdp
    R, P, terminals, pi = mdp.reward, mdp.P, mdp.terminals, self.pi
    s, a, Nsa, Ns1_sa, U = self.s, self.a, self.Nsa, self.Ns1_sa, self.U
    if s1 not in self.visited:  # Reward is only known for visited states.
        U[s1] = R[s1] = r1
        self.visited.add(s1)
    if s is not None:
        Nsa[(s, a)] += 1
        Ns1_sa[(s1, s, a)] += 1
        # For each t such that Ns'|sa[t, s, a] is nonzero,
        # re-estimate the transition probability P(t | s, a).
        for t in [res for (res, state, act), freq in Ns1_sa.items()
                  if (state, act) == (s, a) and freq != 0]:
            P[(s, a)][t] = Ns1_sa[(t, s, a)] / Nsa[(s, a)]
    self.U = policy_evaluation(pi, U, mdp)
    self.Nsa, self.Ns1_sa = Nsa, Ns1_sa
    if s1 in terminals:
        self.s = self.a = None  # Terminal state: end the trial.
    else:
        self.s, self.a = s1, self.pi[s1]
    return self.a
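
# Sketch (an illustrative assumption, in the style of aima-python's
# PassiveADPAgent, not taken from the source) of the bookkeeping the
# __call__ above relies on: Nsa counts visits to (s, a), Ns1_sa counts
# observed transitions (s1, s, a), U holds the utility estimates, and
# visited records states whose reward is already known.
from collections import defaultdict

class PassiveADPAgentSketch:
    def __init__(self, pi, mdp):
        self.pi = pi                    # fixed policy to evaluate
        self.mdp = mdp                  # learned model: reward, P, terminals
        self.U = {}                     # utility estimate per state
        self.Nsa = defaultdict(int)     # Nsa[(s, a)]: visits to (s, a)
        self.Ns1_sa = defaultdict(int)  # Ns1_sa[(s1, s, a)]: outcome counts
        self.visited = set()            # states whose reward is known
        self.s = self.a = None          # previous state and action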
def __call__(self, percept): """What you need to do: 1. update the transistion mdp.P by current <s,a,r>, here a is in the MDP, <s,r> is in the percept 2. update the value function self.U and policy self.pi by policy_evaluation--Implemented in mdp.py 3. through the policy get the current action self.a (self.pi is pre-defined) Eventually, you only need to get the value function. It is a non-learning agent. Tips: How to deal with the terminal states ?""" s1, r1 = percept mdp = self.mdp R, P, terminals, pi = mdp.reward, mdp.P, mdp.terminals, self.pi s, a, Nsa, Ns1_sa, U = self.s, self.a, self.Nsa, self.Ns1_sa, self.U if s1 not in self.visited: # Reward is only known for visited state. U[s1] = R[s1] = r1 self.visited.add(s1) if s is not None: Nsa[(s, a)] += 1 Ns1_sa[(s1, s, a)] += 1 # for each t such that Ns′|sa [t, s, a] is nonzero for t in [ res for (res, state, act), freq in Ns1_sa.items() if (state, act) == (s, a) and freq != 0 ]: P[(s, a)][t] = Ns1_sa[(t, s, a)] / Nsa[(s, a)] self.U = policy_evaluation(pi, U, mdp) ## ## self.Nsa, self.Ns1_sa = Nsa, Ns1_sa if s1 in terminals: self.s = self.a = None else: self.s, self.a = s1, self.pi[s1] return self.a
def program(self, percept):
    s1, r1 = percept
    mdp, U, s, a, Nsa, Ns_sa = self.mdp, self.U, self.s, self.a, self.Nsa, self.Ns_sa
    if s1 not in mdp.reward:  # mdp.reward also tracks the visited states.
        U[s1] = mdp.reward[s1] = r1
    if s is not None:
        Nsa[s][a] += 1
        Ns_sa[s][a][s1] += 1
        # Re-estimate P(t | s, a) for every outcome t observed from (s, a).
        for t in Ns_sa[s][a]:
            if Ns_sa[s][a][t] > 0:
                self.mdp.T_set((s, a, t), Ns_sa[s][a][t] / Nsa[s][a])
    self.U = policy_evaluation(self.pi, U, mdp)
    if s1 in mdp.terminals:
        self.s = self.a = None  # Terminal state: end the trial.
    else:
        self.s, self.a = s1, self.pi[s1]
    return self.a
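
# The program variant above indexes its counters as Nsa[s][a] and
# Ns_sa[s][a][s1] rather than by flat tuples; a sketch of the nested
# defaultdict setup it presumes (illustrative, not from the source):
from collections import defaultdict

Nsa = defaultdict(lambda: defaultdict(int))                         # Nsa[s][a]
Ns_sa = defaultdict(lambda: defaultdict(lambda: defaultdict(int)))  # Ns_sa[s][a][s1]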
def __call__(self, percept):
    s1, r1 = percept
    self.mdp.states.add(s1)  # Model keeps track of visited states.
    R, P, mdp, pi = self.mdp.reward, self.mdp.P, self.mdp, self.pi
    s, a, Nsa, Ns1_sa, U = self.s, self.a, self.Nsa, self.Ns1_sa, self.U
    if s1 not in R:  # Reward is only available for visited states.
        U[s1] = R[s1] = r1
    if s is not None:
        Nsa[(s, a)] += 1
        Ns1_sa[(s1, s, a)] += 1
        # For each t such that Ns'|sa[t, s, a] is nonzero,
        # re-estimate the transition probability P(t | s, a).
        for t in [res for (res, state, act), freq in Ns1_sa.items()
                  if (state, act) == (s, a) and freq != 0]:
            P[(s, a)][t] = Ns1_sa[(t, s, a)] / Nsa[(s, a)]
    self.U = policy_evaluation(pi, U, mdp)
    if s1 in mdp.terminals:
        self.s = self.a = None  # Terminal state: end the trial.
    else:
        self.s, self.a = s1, self.pi[s1]
    return self.a
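
# Usage sketch, assuming the aima-python codebase: evaluate a fixed policy
# on the 4x3 grid world. sequential_decision_environment comes from mdp.py
# and PassiveADPAgent / run_single_trial from rl.py; the hand-written
# policy below is one reasonable choice, shown only for illustration.
from mdp import sequential_decision_environment
from rl import PassiveADPAgent, run_single_trial

north, south, west, east = (0, 1), (0, -1), (-1, 0), (1, 0)
policy = {(0, 2): east, (1, 2): east, (2, 2): east, (3, 2): None,
          (0, 1): north, (2, 1): north, (3, 1): None,
          (0, 0): north, (1, 0): west, (2, 0): west, (3, 0): west}

agent = PassiveADPAgent(policy, sequential_decision_environment)
for _ in range(100):
    run_single_trial(agent, sequential_decision_environment)
print(agent.U)  # learned utility estimates for the visited states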