def value_iteration(self, epsilon=0.001, policy=None, k=100000, _Uvec=None, _stationary=True):
    """Solving a VVMdp by value iteration. The weight vector Lambda is known and
    used to compute scalar value functions, so that is the standard VI algorithm
    [Fig. 17.4]. Stops when the improvement is less than epsilon."""
    n, na, d, Lambda = self.nstates, self.nactions, self.d, self.Lambda
    gamma, R, expected_scalar_utility = self.gamma, self.rewards, self.expected_scalar_utility

    Udot = np.zeros(n, dtype=ftype)
    uvec = np.zeros(d, dtype=ftype)
    Uvec = np.zeros((n, d), dtype=ftype)

    # for test
    lastp = np.zeros(n, dtype=np.int16)
    newp = np.zeros(n, dtype=np.int16)

    if _Uvec is not None:
        Uvec[:] = _Uvec

    Q = np.zeros(na, dtype=ftype)

    for t in range(k):  # bounds the number of iterations if the break condition is too weak
        delta = 0.0
        for s in range(n):
            # Choose the action
            if policy is not None:
                if _stationary:
                    act = random.choice(policy[s])
                else:
                    act = policy[s]
            else:
                Q[:] = [expected_scalar_utility(s, a, Udot) for a in range(na)]
                act = np.argmax(Q)
            newp[s] = act

            # Compute the update
            uvec[:] = R[s] + gamma * self.expected_vec_utility(s, act, Uvec)  # vectorial utility of the best action
            udot = Lambda.dot(uvec)  # its scalar utility

            if policy is not None:
                delta = max(delta, l1distance(uvec, Uvec[s]))
            else:
                delta = max(delta, abs(udot - Udot[s]))

            Uvec[s] = uvec
            Udot[s] = udot

        if (newp - lastp).any():
            # print t, ":", newp, Udot
            lastp[:] = newp

        if delta < epsilon * (1 - gamma) / gamma:  # total expected improvement for adding delta
            # print t, ":", newp, Udot
            return Uvec

    return Uvec
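
# A minimal usage sketch, not part of the original module: it assumes an already
# constructed VVMdp-like object `mdp` exposing the same attributes and helpers used
# in value_iteration above (nstates, nactions, Lambda, expected_scalar_utility), and
# it reuses the module-level `np` import. It shows how the (n x d) table returned by
# value_iteration can be scalarized with Lambda and turned into a greedy policy.
def greedy_policy_from_uvec(mdp, Uvec):
    """Hypothetical helper: extract a greedy policy from a converged (n x d) utility table."""
    Udot = Uvec.dot(mdp.Lambda)  # scalarize: one scalar value per state
    policy = np.zeros(mdp.nstates, dtype=np.int16)
    for s in range(mdp.nstates):
        # same greedy step as inside value_iteration: argmax of expected scalar utility
        policy[s] = np.argmax([mdp.expected_scalar_utility(s, a, Udot)
                               for a in range(mdp.nactions)])
    return policy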
def value_iteration(self, epsilon=0.001, policy=None, k=100000, _Uvec=None, _stationary=True):
    "Solving an MDP by value iteration. [Fig. 17.4]"
    n, na, d, Lambda = self.nstates, self.nactions, self.d, self.Lambda
    gamma, R, expected_scalar_utility = self.gamma, self.rewards, self.expected_scalar_utility

    Udot = np.zeros(n, dtype=ftype)
    _uvec = np.zeros(d, dtype=ftype)
    # Rdot = np.array([R[s].dot(Lambda) for s in range(n)], dtype=ftype)
    Uvec = np.zeros((n, d), dtype=ftype)

    if _Uvec is not None:
        Uvec[:] = _Uvec

    Q = np.zeros(na, dtype=ftype)

    for t in range(k):
        delta = 0.0
        for s in range(n):
            # Choose the action
            if policy is not None:
                if _stationary:
                    act = random.choice(policy[s])
                else:
                    act = policy[s]
            else:
                Q[:] = [expected_scalar_utility(s, a, Udot) for a in range(na)]
                act = np.argmax(Q)

            # Compute the update
            _uvec[:] = R[s] + gamma * self.expected_vec_utility(s, act, Uvec)
            _udot = Lambda.dot(_uvec)

            if policy is not None:
                delta = max(delta, l1distance(_uvec, Uvec[s]))
            else:
                # print "old delta=", delta, " , new delta=", max(delta, abs(_udot - Udot[s]))
                # print "_udot=", _udot, " , Udot[s]=", Udot[s]
                # print "_uvec=", _uvec, " , Uvec[s]=", Uvec[s]
                delta = max(delta, abs(_udot - Udot[s]))

            Uvec[s] = _uvec
            Udot[s] = _udot

        if delta < epsilon * (1 - gamma) / gamma:
            return Uvec

    return Uvec
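
# A minimal sketch of how the value_iteration method above can be driven; `mdp` and `pi`
# are placeholder names, not defined in the original code. With policy=None it runs greedy
# backups; with a fixed policy it evaluates that policy (pi[s] must be a sequence of
# candidate actions, since random.choice is applied when _stationary=True). The _Uvec
# argument lets a previous solution warm-start the next call.
def optimal_and_on_policy_values(mdp, pi, epsilon=0.001):
    U_star = mdp.value_iteration(epsilon=epsilon)                          # greedy / optimal values
    U_pi = mdp.value_iteration(epsilon=epsilon, policy=pi, _Uvec=U_star)   # evaluate pi, warm-started
    return U_star, U_pi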
def prioritized_sweeping_policy_evaluation(self, pi, U1, k=maxint, epsilon=0.001):
    """Return an updated utility mapping U from each state in the MDP to its
    utility, using an approximation (modified policy iteration). States are
    processed through a priority heap ordered by (minus) their last change."""
    R, gamma, expect_vec_u = self.rewards, self.gamma, self.expected_vec_utility

    # Initialize the heap: every state starts with a high priority
    # (-rmax minus a small random tie-breaker).
    h = []
    for s in range(self.nstates):
        heappush(h, (-self.rmax - random.uniform(0, 1), s))
    # print 'after push'
    # print h

    for i in count(0):
        U = U1.copy()
        (priority, s) = heappop(h)
        # print 'after pop'
        # print h

        # One Bellman backup for state s under the fixed policy pi
        U1[s] = R[s] + gamma * expect_vec_u(s, pi[s], U)
        delta = l1distance(U1[s], U[s])

        # Re-insert s with its new priority so the heap is never exhausted; a full
        # prioritized-sweeping variant would also re-queue the predecessors of s.
        heappush(h, (-delta, s))

        if i > k or delta < epsilon * (1 - gamma) / gamma:
            return U1
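
# A small usage sketch with placeholder names (`mdp`, `pi`): the method above expects
# pi[s] to be a single action index (it is passed directly to expected_vec_utility)
# and U1 to be an (n x d) array that is refined in place; `ftype` is assumed to be the
# module-level float type already used above.
def evaluate_policy_with_sweeping(mdp, pi, epsilon=0.001):
    U1 = np.zeros((mdp.nstates, mdp.d), dtype=ftype)
    return mdp.prioritized_sweeping_policy_evaluation(pi, U1, epsilon=epsilon)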
def policy_evaluation(self, epsilon, policy, k, Uvec):
    """Evaluate a (possibly non-deterministic) policy by iterated vector-valued
    Bellman backups. Uvec is of dimension n x d and is updated in place."""
    n, d = self.nstates, self.d
    gamma, R = self.gamma, self.rewards

    _uvec = np.zeros((n, d), dtype=ftype)

    for t in range(k):
        delta = 0.0
        for s in range(n):
            # Choose the action
            act = random.choice(policy[s])
            # Compute the update
            _uvec[s] = R[s] + gamma * self.expected_vec_utility(s, act, Uvec)
            delta = max(delta, l1distance(_uvec[s], Uvec[s]))

        for s in range(n):
            Uvec[s] = _uvec[s]

        if delta < epsilon * (1 - gamma) / gamma:
            return Uvec

    return Uvec
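
# A minimal policy-iteration sketch built around policy_evaluation above; `mdp` is a
# placeholder for a constructed VVMdp-like object, and the loop structure is an assumed
# usage, not the original author's driver. Each round runs a bounded number of evaluation
# sweeps, scalarizes the result with Lambda, and improves the policy greedily with
# expected_scalar_utility, stopping when no state changes its action.
def policy_iteration_sketch(mdp, eval_sweeps=20, epsilon=0.001):
    n, na = mdp.nstates, mdp.nactions
    # policy[s] is a list of actions, as expected by random.choice in policy_evaluation
    policy = [[0] for _ in range(n)]
    Uvec = np.zeros((n, mdp.d), dtype=ftype)
    while True:
        Uvec = mdp.policy_evaluation(epsilon, policy, eval_sweeps, Uvec)
        Udot = Uvec.dot(mdp.Lambda)  # scalar value of each state under the current policy
        changed = False
        for s in range(n):
            best = int(np.argmax([mdp.expected_scalar_utility(s, a, Udot)
                                  for a in range(na)]))
            if policy[s] != [best]:
                policy[s] = [best]
                changed = True
        if not changed:
            return policy, Uvec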