def _generateSample(self):
    """ generate a new sample from the current distribution. """
    if self.useCauchy:
        # Cauchy distribution
        chosenOne = drawIndex(self.alphas, True)
        return multivariateCauchy(self.mus[chosenOne], self.sigmas[chosenOne])
    else:
        # Normal distribution
        chosenOne = drawIndex(self.alphas, True)
        return multivariate_normal(self.mus[chosenOne], self.sigmas[chosenOne])

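# Note: every snippet in this collection calls pybrain.utilities.drawIndex(probs, tolerant=...)
# to sample an index from a discrete probability vector. The sketch below is illustrative
# only and is NOT the library's implementation; it assumes that tolerant=True simply
# renormalises a vector whose entries do not sum exactly to one.
from numpy import asarray, cumsum
from numpy.random import random


def draw_index_sketch(probs, tolerant=False):
    """ Hypothetical stand-in: return index i with probability probs[i]. """
    p = asarray(probs, dtype=float)
    if tolerant:
        p = p / p.sum()  # renormalise (assumed tolerant behaviour)
    # roulette-wheel selection: first cumulative bin that exceeds a uniform draw
    return int((cumsum(p) > random()).argmax())
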
def _generateOneOffspring(self, pop):
    """ produce one single offspring, given the population and the linkage matrix """
    # TODO: optimize?
    n = self.xdim
    # one gene is chosen directly
    initindex = choice(range(n))
    chosen = [(choice(range(len(pop))), initindex)]
    # the indices of the rest are shuffled
    indices = list(range(n))
    shuffle(indices)
    indices.remove(initindex)
    for index in indices:
        probs = zeros(len(pop))
        for parent in range(len(pop)):
            # determine the probability of drawing the i'th gene from parent p
            p1 = self._computeProbChosenGivenAq(len(pop), index, parent, chosen)
            p2 = self._computeProbChosenGivenAq(len(pop), index, parent, chosen, invertAq=True)
            probs[parent] = p1 / (p1 + (len(pop) - 1) * p2)
        # draw according to the probabilities
        chosen.append((drawIndex(probs, tolerant=True), index))
    child = zeros(self.xdim)
    crossovervector = zeros(self.xdim)
    for parent, index in chosen:
        child[index] = pop[parent][index]
        crossovervector[index] = parent
    return child, crossovervector

def _produceNewSample(self):
    """ return a new sample and its fitness """
    chosenOne = drawIndex(self.alphas, True)
    mu = self.mus[chosenOne]
    if self.useAnticipatedMeanShift:
        if len(self.allsamples) % 2 == 1 and len(self.allsamples) > 1:
            if not (self.elitism and chosenOne == self.bestChosenCenter):
                mu += self.meanShifts[chosenOne]
    if self.diagonalOnly:
        sample = normal(mu, self.sigmas[chosenOne])
    else:
        sample = multivariate_normal(mu, self.sigmas[chosenOne])
    if self.sampleElitism and len(self.allsamples) > self.windowSize \
            and len(self.allsamples) % self.windowSize == 0:
        sample = self.bestEvaluable.copy()
    fit = self._oneEvaluation(sample)
    if ((not self.minimize and fit >= self.bestEvaluation)
            or (self.minimize and fit <= self.bestEvaluation)
            or len(self.allsamples) == 0):
        # used to determine which center produced the current best
        self.bestChosenCenter = chosenOne
        self.bestSigma = self.sigmas[chosenOne].copy()
    if self.minimize:
        fit = -fit
    self.allfitnesses.append(fit)
    self.allsamples.append(sample)
    return sample, fit

def getAction(self):
    self.lastaction = drawIndex(self._actionProbs(self.lastobs), True)
    if self.learning and not self.learner.batchMode and self._oaro is not None:
        self.learner._updateWeights(*(self._oaro + [self.lastaction]))
        self._oaro = None
    # print "Agent " + str(self.indexOfAgent) + ": " + str(self.lastaction)
    return array([self.lastaction])

def performAction(self, action):
    """ POMDP tasks, as they have discrete actions, can be used by providing
    either an index, or an array with a 1-in-n coding (which can be stochastic). """
    if type(action) == ndarray:
        action = drawIndex(action, tolerant=True)
    self.steps += 1
    EpisodicTask.performAction(self, action)

def computeChi(self, evals=100):
    """ compute an estimate of the distance from the centers to the generated points """
    # CHECKME: correct handling of multiple centers?
    s = 0
    for dummy in range(evals):
        m = drawIndex(self.alpha, tolerant=True)
        z = mat(multivariate_normal(array(self.x[m]).flatten(), self.sigma[m])).T
        s += norm(self.x[m] - z)
    return s / evals

def oneSample(self, k):
    """ produce one new sample and update phi correspondingly """
    thesum = 0.0
    for i in range(self.mu):
        thesum += exp(self.basealpha[i])
    for i in range(self.mu):
        self.alpha[i] = exp(self.basealpha[i]) / thesum
    choosem = drawIndex(self.alpha, tolerant=True)
    self.chosenCenter[k] = choosem
    z = mat(multivariate_normal(array(self.x[choosem]).flatten(), self.sigma[choosem])).T
    self.zs[k] = z
    self.R[k] = self.evaluateAt(z)
    # TODO: make for all mu
    if self.importanceSampling:
        self.rellhood[k] = multivariateNormalPdf(z, self.x[0], self.sigma[0])
    logderivbasealpha = zeros((self.mu, 1))
    logderivx = zeros((self.mu, self.xdim))
    logderivfactorsigma = zeros((self.mu, self.xdim, self.xdim))
    for m in range(self.mu):
        self.sigma[m] = dot(self.factorSigma[m].T, self.factorSigma[m])
        if self.mu > 1:
            relresponsibility = (self.alpha[m]
                                 * multivariateNormalPdf(ravel(z), ravel(self.x[m]), self.sigma[m])
                                 / sum(map(lambda mm: self.alpha[mm]
                                           * multivariateNormalPdf(ravel(z), ravel(self.x[mm]), self.sigma[mm]),
                                           range(self.mu))))
        else:
            relresponsibility = 1.0
        if self.mu > 1:
            logderivbasealpha[m] = relresponsibility * (1.0 - self.alpha[m])
        else:
            logderivbasealpha[m] = 0.0
        logderivx[m] = relresponsibility * (self.sigma[m].I * (z - self.x[m])).flatten()
        A = (0.5 * self.sigma[m].I * (z - self.x[m]) * (z - self.x[m]).T * self.sigma[m].I
             - 0.5 * self.sigma[m].I)
        logderivsigma_m = self.blackmagic * relresponsibility * A  # 0.5 * (A + diag(diag(A)))  # * 2.0
        logderivfactorsigma[m] = self.factorSigma[m] * (logderivsigma_m + logderivsigma_m.T)
    # print 'logalpha', logderivbasealpha.flatten(), self.alpha, sum(logderivbasealpha)
    tmp = self.combineParams(logderivbasealpha, logderivx, logderivfactorsigma)
    self.phi[k] = tmp

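# For reference, with the mixture density p(z) = sum_m alpha_m N(z; x_m, Sigma_m) and the
# responsibility
#     r_m = alpha_m N(z; x_m, Sigma_m) / sum_k alpha_k N(z; x_k, Sigma_k),
# the loop in oneSample fills in, per component m:
#     logderivbasealpha[m]   = r_m (1 - alpha_m)
#     logderivx[m]           = r_m Sigma_m^{-1} (z - x_m)
#     logderivsigma_m        = blackmagic * r_m (1/2 Sigma_m^{-1} (z - x_m)(z - x_m)^T Sigma_m^{-1} - 1/2 Sigma_m^{-1})
#     logderivfactorsigma[m] = B_m (D_m + D_m^T),  where Sigma_m = B_m^T B_m and D_m = logderivsigma_m.
# The mean and covariance terms match the usual log-likelihood derivatives of a Gaussian
# mixture, and the factor-matrix term is the chain rule through Sigma_m = B_m^T B_m.
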
def _performAction(self, action, onlyavatar=False):
    """ Action is an index for the actionset. """
    if action is None:
        return action
    # if actions are given as a vector, pick the argmax
    import numpy
    from scipy import argmax
    from pybrain.utilities import drawIndex
    if isinstance(action, numpy.ndarray):
        if abs(sum(action) - 1) < 1e-5:  # vector represents probabilities
            action = drawIndex(action)
        else:
            action = argmax(action)

    # take action and compute consequences
    # replace the method that reads multiple action keys with a fn that just
    # returns the currently desired action
    self._avatar._readMultiActions = lambda *x: [self._actionset[action]]
    if self.visualize:
        self._game._clearAll(self.visualize)

    # update sprites
    if onlyavatar:
        self._avatar.update(self._game)
    else:
        for s in self._game:
            s.update(self._game)

    # handle collision effects
    self._game._eventHandling()
    if self.visualize:
        self._game._clearAll(self.visualize)

    # update screen
    if self.visualize:
        self._game._drawAll()
        pygame.display.update(VGDLSprite.dirtyrects)
        VGDLSprite.dirtyrects = []
        pygame.time.wait(self.actionDelay)

    if self.recordingEnabled:
        self._previous_state = self._last_state
        self._last_state = self.getState()
        self._allEvents.append((self._previous_state, action, self._last_state))

def performAction(self, action, onlyavatar=False):
    """ Action is an index for the actionset. """
    if action is None:
        return
    # if actions are given as a vector, pick the argmax
    import numpy
    from scipy import argmax
    from pybrain.utilities import drawIndex
    if isinstance(action, numpy.ndarray):
        if abs(sum(action) - 1) < 1e-5:  # vector represents probabilities
            action = drawIndex(action)
        else:
            action = argmax(action)

    # take action and compute consequences
    self._avatar._readMultiActions = lambda *x: [self._actionset[action]]
    self._game._clearAll(self.visualize)

    # update sprites
    if onlyavatar:
        self._avatar.update(self._game)
    else:
        for s in self._game:
            s.update(self._game)

    # handle collision effects
    self._game._updateCollisionDict()
    self._game._eventHandling()
    self._game._clearAll(self.visualize)

    # update screen
    if self.visualize:
        self._game._drawAll()
        pygame.display.update(VGDLSprite.dirtyrects)
        VGDLSprite.dirtyrects = []
        pygame.time.wait(self.actionDelay)

    if self.recordingEnabled:
        self._previous_state = self._last_state
        self._last_state = self.getState()
        self._allEvents.append((self._previous_state, action, self._last_state))

def getAction(self):
    return drawIndex(self.policy[self.stateIndexFun()])

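# Illustrative usage of the tabular stochastic-policy pattern above (hypothetical names;
# assumes self.policy is a numStates x numActions array whose rows are probability
# distributions and stateIndexFun() returns the current row index):
from numpy import array
from pybrain.utilities import drawIndex

policy = array([[0.1, 0.9],   # state 0: action 1 with probability 0.9
                [0.5, 0.5]])  # state 1: uniform over both actions
state = 0
action = drawIndex(policy[state], tolerant=True)  # index drawn according to the row
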
def learnOneBatch(self):
    # collect a batch of runs as experience
    r0s = []
    lens = []
    avgReward = 0.
    for dummy in range(self.batchSize):
        self.rawDs.newSequence()
        self.valueDs.newSequence()
        self.task.reset()
        self.net.reset()
        acts, obss, rewards = [], [], []
        while not self.task.isFinished():
            obs = self.task.getObservation()
            act = self.net.activate(obs)
            chosen = drawIndex(act)
            self.task.performAction(chosen)
            reward = self.task.getReward()
            obss.append(obs)
            y = zeros(len(act))
            y[chosen] = 1
            acts.append(y)
            rewards.append(reward)
        avgReward += sum(rewards) / float(len(rewards))

        # compute the returns from the list of rewards
        current = 0
        returns = []
        for r in reversed(rewards):
            current *= self.task.discount
            current += r
            returns.append(current)
        returns.reverse()
        for i in range(len(obss)):
            self.rawDs.addSample(obss[i], acts[i], returns[i])
            self.valueDs.addSample(obss[i], returns[i])
        r0s.append(returns[0])
        lens.append(len(returns))

    r0s = array(r0s)
    self.totalSteps += sum(lens)
    avgLen = sum(lens) / float(self.batchSize)
    avgR0 = mean(r0s)
    avgReward /= self.batchSize
    if self.verbose:
        print('***', round(avgLen, 3), '***',
              '(avg init exp. return:', round(avgR0, 5), ')')
        print('avg reward', round(avgReward, 5), '(tau:', round(self.tau, 3), ')')
        print(lens)

    # storage:
    self.rewardAvg.append(avgReward)
    self.lengthAvg.append(avgLen)
    self.initr0Avg.append(avgR0)

    # if self.vnet == None:
    #     # case 1: no value estimator:

    # prepare the dataset for training the acting network
    shaped = self.shapingFunction(r0s)
    self.updateTau(r0s, shaped)
    shaped /= max(shaped)
    for i, seq in enumerate(self.rawDs):
        self.weightedDs.newSequence()
        for sample in seq:
            obs, act, dummy = sample
            self.weightedDs.addSample(obs, act, shaped[i])

    # else:
    #     # case 2: value estimator:
    #
    #     # train the value estimating network
    #     if self.verbose: print('Old value error: ', self.vbp.testOnData())
    #     self.vbp.trainEpochs(self.valueTrainEpochs)
    #     if self.verbose: print('New value error: ', self.vbp.testOnData())
    #
    #     # produce the values and analyze
    #     rminusvs = []
    #     sizes = []
    #     for i, seq in enumerate(self.valueDs):
    #         self.vnet.reset()
    #         seq = list(seq)
    #         for sample in seq:
    #             obs, ret = sample
    #             val = self.vnet.activate(obs)
    #             rminusvs.append(ret - val)
    #         sizes.append(len(seq))
    #
    #     rminusvs = array(rminusvs)
    #     shapedRminusv = self.shapingFunction(rminusvs)
    #     # CHECKME: here?
    #     self.updateTau(rminusvs, shapedRminusv)
    #     shapedRminusv /= array(sizes)
    #     shapedRminusv /= max(shapedRminusv)
    #
    #     # prepare the dataset for training the acting network
    #     rvindex = 0
    #     for i, seq in enumerate(self.rawDs):
    #         self.weightedDs.newSequence()
    #         self.vnet.reset()
    #         for sample in seq:
    #             obs, act, ret = sample
    #             self.weightedDs.addSample(obs, act, shapedRminusv[rvindex])
    #             rvindex += 1

    # train the acting network
    tmp1, tmp2 = self.bp.trainUntilConvergence(
        maxEpochs=self.maxEpochs,
        validationProportion=self.validationProportion,
        continueEpochs=self.continueEpochs,
        verbose=self.verbose)
    if self.supervisedPlotting:
        from pylab import plot, legend, figure, clf, draw
        figure(1)
        clf()
        plot(tmp1, label='train')
        plot(tmp2, label='valid')
        legend()
        draw()

    return avgLen, avgR0

def getAction(self):
    self.lastaction = drawIndex(self._actionProbs(self.lastobs), True)
    return array([self.lastaction])