Example no. 1
    def epsilon_greedy(self, sensation, applicable_actions):
        """
        Given self.epsilon() and self.Q(), return a distribution over
        applicable_actions as an array where each element contains the
        probability mass for the corresponding action: the actions with
        the highest Q share probability 1 - self.epsilon(), and the
        remaining actions share self.epsilon() uniformly.
        """
        # Q-value for each applicable action in the current sensation.
        Q = array([self.Q(sensation, action) for action in applicable_actions])

        # Simple epsilon-greedy policy:
        # mask has a 1 where Q is maximal, zero elsewhere.
        mask = (Q == mmax(Q))

        num_maxes = sum(mask)
        num_others = len(mask) - num_maxes

        # Every action is greedy: choose uniformly among them.
        if num_others == 0:
            return mask / float(num_maxes)

        e0 = (1.0 - self.epsilon()) / num_maxes   # mass per greedy action
        e1 = self.epsilon() / num_others          # mass per non-greedy action

        result = zeros(len(mask)) + 0.0
        putmask(result, mask, e0)        # greedy actions share 1 - epsilon
        putmask(result, mask == 0, e1)   # the others share epsilon
        return result
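For comparison, here is a minimal standalone sketch of the same epsilon-greedy distribution, assuming NumPy; the names epsilon_greedy_distribution, q_values and rng are illustrative and not part of the library.

import numpy as np

def epsilon_greedy_distribution(q_values, epsilon):
    """Epsilon-greedy action probabilities for a vector of Q-values."""
    q = np.asarray(q_values, dtype=float)
    greedy = (q == q.max())                 # 1 where Q is maximal, 0 elsewhere
    n_greedy = int(greedy.sum())
    n_other = q.size - n_greedy
    if n_other == 0:                        # every action ties: uniform choice
        return greedy / float(n_greedy)
    return np.where(greedy,
                    (1.0 - epsilon) / n_greedy,  # greedy actions share 1 - epsilon
                    epsilon / n_other)           # the others share epsilon

# Sampling an action index from the resulting distribution:
rng = np.random.default_rng(0)
p = epsilon_greedy_distribution([0.2, 0.7, 0.7, 0.1], epsilon=0.1)
action = rng.choice(len(p), p=p)            # here p is [0.05, 0.45, 0.45, 0.05]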
Example no. 3
    def update_Q(self, sensation, action, delta, on_policy=True):
        """
        Do a linear update of the weights using eligibility traces:
        decay the traces, add the current sensation to the trace for the
        selected action, and move the weights along the traces in
        proportion to the learning rate and the TD error delta.
        """
        if self.lambda_ and on_policy:
            # Decay all eligibility traces, dropping negligibly small ones.
            self.e *= self.lambda_
            if self.prune_eligibility > 0.0:
                self.e *= (self.e > self.prune_eligibility)
        else:
            # Off-policy step (or lambda == 0): clear the traces.
            self.e *= 0.0

        # Accumulate the current features onto the chosen action's trace.
        self.e[action] += sensation

        if self.replacing_traces:
            # Replacing traces: cap each trace at 1.
            putmask(self.e, self.e > 1, 1)

        # Weight update scaled by the (normalized) learning rate and TD error.
        self.w += self.e * (self.alpha / sum(sensation)) * delta
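The same trace-and-weight update can be sketched outside the class. Below is a minimal standalone version, assuming NumPy, binary features, and the decay-by-lambda convention used above (any discount factor is assumed to be folded in elsewhere); linear_td_lambda_update and phi are illustrative names, not part of the library.

import numpy as np

def linear_td_lambda_update(w, e, sensation, action, delta,
                            alpha=0.1, lam=0.9, replacing=True):
    """One linear TD(lambda) step: decay traces, add features, move weights."""
    e = e * lam                                    # decay all eligibility traces
    e[action] = e[action] + sensation              # accumulate the active features
    if replacing:
        e = np.minimum(e, 1.0)                     # replacing traces: cap at 1
    w = w + (alpha / sensation.sum()) * delta * e  # step along the traces
    return w, e

# Example with 3 actions and 4 binary features:
w = np.zeros((3, 4))
e = np.zeros((3, 4))
phi = np.array([1.0, 0.0, 1.0, 0.0])
w, e = linear_td_lambda_update(w, e, phi, action=1, delta=0.5)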