def epsilon_greedy(self, sensation, applicable_actions):
    """
    Return an epsilon-greedy distribution over applicable_actions.

    The result is an array with one probability mass per action, in the
    same order as applicable_actions.  The action(s) with the highest
    self.Q(sensation, action) share a total mass of 1 - self.epsilon()
    equally, and the remaining actions share self.epsilon() equally, so
    the masses sum to 1.  When every action is tied for the maximum the
    distribution is uniform.
    """
    Q = array([self.Q(sensation, action) for action in applicable_actions])

    # A vector with a 1 where each max element is, zero elsewhere.
    mask = (Q == mmax(Q))

    # Count the ties by walking the mask rather than len(nonzero(mask)):
    # the shape of nonzero()'s result differs between array libraries
    # (NumPy returns a tuple of index arrays, so its len is the rank).
    num_maxes = len([flag for flag in mask if flag])
    num_others = len(mask) - num_maxes

    if num_others == 0:
        # Every action is greedy: spread the whole mass uniformly instead
        # of returning the raw mask, which would sum to len(mask).
        return zeros(len(mask)) + 1.0 / num_maxes

    # Read epsilon once so both shares are derived from the same value,
    # and force float arithmetic in case it is an integer.
    epsilon = float(self.epsilon())
    greedy_mass = (1.0 - epsilon) / num_maxes
    other_mass = epsilon / num_others

    # Adding 0.0 yields a float array even if zeros() defaults to ints.
    result = zeros(len(mask)) + 0.0
    putmask(result, mask, greedy_mass)
    putmask(result, mask == 0, other_mass)
    return result
def epsilon_greedy(self, sensation, applicable_actions):
    """
    Return an epsilon-greedy distribution over applicable_actions.

    The result is an array with one probability mass per action, in the
    same order as applicable_actions.  The action(s) with the highest
    self.Q(sensation, action) share a total mass of 1 - self.epsilon()
    equally, and the remaining actions share self.epsilon() equally, so
    the masses sum to 1.  When every action is tied for the maximum the
    distribution is uniform.
    """
    Q = array([self.Q(sensation, action) for action in applicable_actions])

    # A vector with a 1 where each max element is, zero elsewhere.
    mask = (Q == mmax(Q))

    # Count the ties by walking the mask rather than len(nonzero(mask)):
    # the shape of nonzero()'s result differs between array libraries
    # (NumPy returns a tuple of index arrays, so its len is the rank).
    num_maxes = len([flag for flag in mask if flag])
    num_others = len(mask) - num_maxes

    if num_others == 0:
        # Every action is greedy: spread the whole mass uniformly instead
        # of returning the raw mask, which would sum to len(mask).
        return zeros(len(mask)) + 1.0 / num_maxes

    # Read epsilon once so both shares are derived from the same value,
    # and force float arithmetic in case it is an integer.
    epsilon = float(self.epsilon())
    greedy_mass = (1.0 - epsilon) / num_maxes
    other_mass = epsilon / num_others

    # Adding 0.0 yields a float array even if zeros() defaults to ints.
    result = zeros(len(mask)) + 0.0
    putmask(result, mask, greedy_mass)
    putmask(result, mask == 0, other_mass)
    return result
def softmax(ar, temp):
    """
    Return the Boltzmann (softmax) distribution over an array.

    For an array X and temperature T, the result is a new array holding
    exp(Xi/T) / sum_j(exp(Xj/T)) for every Xi.

    If temp == 0 or any value in the array is inf, the limit as T -> 0
    is returned instead: the mass is split evenly among the maximal
    elements and every other element gets zero.
    """
    if temp == 0 or inf in ar:
        # Limit case: indicator of the maxima, normalised to sum to 1.
        v = (ar == mmax(ar))
        return v / float(sum(v))

    # Shift the exponents so the largest one is 0 before exponentiating.
    # The common factor cancels between numerator and denominator, so the
    # result is mathematically unchanged, but exp() is never handed a
    # large positive argument that would overflow.
    scaled = ar / float(temp)
    numer = Numeric.exp(scaled - mmax(scaled))
    denom = Numeric.sum(numer)
    return numer / denom
def softmax(ar, temp):
    """
    Return the Boltzmann (softmax) distribution over an array.

    For an array X and temperature T, the result is a new array holding
    exp(Xi/T) / sum_j(exp(Xj/T)) for every Xi.

    If temp == 0 or any value in the array is inf, the limit as T -> 0
    is returned instead: the mass is split evenly among the maximal
    elements and every other element gets zero.
    """
    if temp == 0 or inf in ar:
        # Limit case: indicator of the maxima, normalised to sum to 1.
        v = (ar == mmax(ar))
        return v / float(sum(v))

    # Shift the exponents so the largest one is 0 before exponentiating.
    # The common factor cancels between numerator and denominator, so the
    # result is mathematically unchanged, but exp() is never handed a
    # large positive argument that would overflow.
    scaled = ar / float(temp)
    numer = Numeric.exp(scaled - mmax(scaled))
    denom = Numeric.sum(numer)
    return numer / denom