Example No. 1
    def value_iteration(self, epsilon=0.001, policy=None, k=100000, _Uvec=None, _stationary=True):
        """Solving a VVMdp by value iteration. The weight vector Lambda is known and used to compute scalar
        value functions, so that is the standard VI algorithm [Fig. 17.4]. Stops when the improvement is
        less than epsilon"""
        n, na, d, Lambda = self.nstates, self.nactions, self.d, self.Lambda
        gamma, R, expected_scalar_utility = self.gamma, self.rewards, self.expected_scalar_utility

        Udot = np.zeros(n, dtype=ftype)
        uvec = np.zeros(d, dtype=ftype)
        Uvec = np.zeros((n, d), dtype=ftype)
        # Track greedy-policy changes between sweeps (debugging aid)
        lastp = np.zeros(n, dtype=np.int16)
        newp = np.zeros(n, dtype=np.int16)

        if _Uvec is not None:
            Uvec[:] = _Uvec

        Q = np.zeros(na, dtype=ftype)

        for t in range(k):  # bounds the number of iterations if the break condition is too weak

            delta = 0.0
            for s in range(n):

                # Choose the action
                if policy is not None:
                    if _stationary:
                        act = random.choice(policy[s])
                    else:
                        act = policy[s]

                else:
                    Q[:] = [expected_scalar_utility(s, a, Udot) for a in range(na)]
                    act = np.argmax(Q)
                    newp[s] = act

                # Compute the update
                uvec[:] = R[s] + gamma * self.expected_vec_utility(s, act, Uvec)  # vectorial utility of the chosen action
                udot = Lambda.dot(uvec)  # its scalar utility

                if policy is not None:
                    delta = max(delta, l1distance(uvec, Uvec[s]))
                else:
                    delta = max(delta, abs(udot - Udot[s]))

                Uvec[s] = uvec
                Udot[s] = udot

            if (newp - lastp).any():
                #print t, ":", newp, Udot
                lastp[:] = newp
            if delta < epsilon * (1 - gamma) / gamma:  # standard VI bound: remaining error is below epsilon
                #print t, ":", newp, Udot
                return Uvec
        return Uvec
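
The helpers expected_scalar_utility and expected_vec_utility are not shown in these examples. A minimal sketch of what they presumably compute, assuming a transition model stored as self.transitions with shape (nstates, nactions, nstates) (a hypothetical attribute name, not taken from the original class), could look like this:

    import numpy as np

    def expected_scalar_utility(self, s, a, Udot):
        # E[Udot(s') | s, a] = sum over s' of P(s'|s,a) * Udot[s']
        # self.transitions is an assumed (nstates, nactions, nstates) array.
        return self.transitions[s, a].dot(Udot)

    def expected_vec_utility(self, s, a, Uvec):
        # Same expectation over the d-dimensional vector utilities; returns a length-d vector.
        return self.transitions[s, a].dot(Uvec)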
Example No. 2
    def value_iteration(self, epsilon=0.001, policy=None, k=100000, _Uvec=None, _stationary=True):
        "Solving an MDP by value iteration. [Fig. 17.4]"
        n, na, d, Lambda = self.nstates, self.nactions, self.d, self.Lambda
        gamma, R, expected_scalar_utility = self.gamma, self.rewards, self.expected_scalar_utility

        Udot = np.zeros(n, dtype=ftype)
        _uvec = np.zeros(d, dtype=ftype)
        # Rdot = np.array([R[s].dot(Lambda) for s in range(n)], dtype=ftype)
        Uvec = np.zeros((n, d), dtype=ftype)
        if _Uvec is not None:
            Uvec[:] = _Uvec

        Q = np.zeros(na, dtype=ftype)

        for t in range(k):

            delta = 0.0
            for s in range(n):

                # Choose the action
                if policy is not None:
                    if _stationary:
                        act = random.choice(policy[s])
                    else:
                        act = policy[s]

                else:
                    Q[:] = [expected_scalar_utility(s, a, Udot) for a in range(na)]
                    act = np.argmax(Q)

                # Compute the update
                _uvec[:] = R[s] + gamma * self.expected_vec_utility(s, act, Uvec)
                _udot = Lambda.dot(_uvec)

                if policy is not None:
                    delta = max(delta, l1distance(_uvec, Uvec[s]))
                else:
                    delta = max(delta, abs(_udot - Udot[s]))

                Uvec[s] = _uvec
                Udot[s] = _udot

            if delta < epsilon * (1 - gamma) / gamma:
                return Uvec
        return Uvec
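
Both versions stop with the same test: the threshold epsilon * (1 - gamma) / gamma is the standard value-iteration error bound, i.e. if the largest change in one full sweep drops below it, the returned utilities are within epsilon of the fixed point. A small helper (hypothetical, for illustration only) makes the relation explicit:

    def convergence_threshold(epsilon, gamma):
        # If the largest per-sweep change falls below this value, the current
        # utilities are within epsilon of the true fixed point.
        return epsilon * (1 - gamma) / gamma

    print(convergence_threshold(0.001, 0.9))   # ~0.000111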
Example No. 3
    def prioritized_sweeping_policy_evaluation(self, pi, U1, k=maxint, epsilon=0.001):
        """Return an updated utility mapping U from each state in the MDP to its
        utility, using an approximation (modified policy iteration)."""
        R, gamma, expect_vec_u = self.rewards, self.gamma, self.expected_vec_utility
        h = []

        # heapq is a min-heap, so priorities are negated (largest pops first);
        # the random term only breaks ties between states.
        for s in range(self.nstates):
            heappush(h, (-self.rmax - random.uniform(0, 1), s))
            print('after push')
            print(h)

        for i in count(0):
            U = U1.copy()

            (priority, s) = heappop(h)
            print('after pop')
            print(h)

            # Bellman backup for state s under the fixed policy pi
            U1[s] = R[s] + gamma * expect_vec_u(s, pi[s], U)

            delta = l1distance(U1[s], U[s])

            if i > k or delta < epsilon * (1 - gamma) / gamma:
                return U
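
Note that the example above pops each state once and never re-queues it, so the heap empties after nstates pops. A fuller prioritized-sweeping loop would push states back with a priority based on their latest Bellman error. A minimal sketch of that re-queue step, with a hypothetical update_state callback that is not part of the original class, might be:

    from heapq import heappush, heappop

    def sweep_once(h, update_state):
        # One prioritized update: pop the state with the largest (negated)
        # priority, back it up, and re-queue it with its new Bellman error.
        # update_state(s) is a hypothetical callback returning that error.
        priority, s = heappop(h)
        delta = update_state(s)
        heappush(h, (-delta, s))      # negate so larger errors pop first
        return delta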
Example No. 4
    def policy_evaluation(self, epsilon, policy, k, Uvec):
        # Uvec has dimension n x d
        n, d = self.nstates, self.d
        gamma, R, expected_scalar_utility = self.gamma, self.rewards, self.expected_scalar_utility

        _uvec = np.zeros((n, d), dtype=ftype)

        for t in range(k):

            delta = 0.0
            for s in range(n):
                # Choose the action
                act = random.choice(policy[s])
                # Compute the update
                _uvec[s] = R[s] + gamma * self.expected_vec_utility(s, act, Uvec)

                delta = max(delta, l1distance(_uvec[s], Uvec[s]))

            for s in range(n):
                Uvec[s] = _uvec[s]

            if delta < epsilon * (1 - gamma) / gamma:
                return Uvec
        return Uvec
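
policy_evaluation is the evaluation half of (modified) policy iteration. A hedged sketch of a driver that alternates it with a greedy improvement step, assuming an mdp object exposing the same attributes the examples above use (nstates, nactions, d, Lambda, expected_scalar_utility, policy_evaluation) and substituting np.float32 for the module-level ftype, could look like:

    import numpy as np

    def policy_iteration_sketch(mdp, epsilon=0.001, k=20):
        # Hypothetical driver, not part of the original class: alternate k-step
        # policy evaluation with a greedy improvement step.
        policy = [[0] for _ in range(mdp.nstates)]         # one action per state
        Uvec = np.zeros((mdp.nstates, mdp.d), dtype=np.float32)
        while True:
            Uvec = mdp.policy_evaluation(epsilon, policy, k, Uvec)
            Udot = Uvec.dot(mdp.Lambda)                    # scalarize with Lambda
            changed = False
            for s in range(mdp.nstates):
                best = int(np.argmax([mdp.expected_scalar_utility(s, a, Udot)
                                      for a in range(mdp.nactions)]))
                if best != policy[s][0]:
                    policy[s] = [best]
                    changed = True
            if not changed:
                return policy, Uvec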