def learn(self, savePrefix, numSampleIt, continueLearning=True, numLearnIt=1,
          startEpsilon=0.0, epsilonFactor=1.0):
    """Run the sample / learn loop: collect episodes, fit a Q-function with
    LSTD, reweight samples with AC-REPS and retrain the sparse GP policy.

    Parameters
    ----------
    savePrefix : str
        Directory under which results are stored; '' disables all saving.
    numSampleIt : int
        Number of outer sampling iterations.
    continueLearning : bool
        If True, keep accumulated samples and the current policy; otherwise
        reset samples, policy and iteration counter.
    numLearnIt : int
        Number of LSTD/REPS/GP learning iterations per sampling iteration.
    startEpsilon : float
        Initial exploration epsilon.
    epsilonFactor : float
        Multiplicative epsilon decay applied after each sampling iteration.
    """
    # --- sampling configuration ---
    self.objectShape = 'quad'
    self.numKilobots = 4
    self.numEpisodes = 25
    self.numStepsPerEpisode = 100
    self.stepsPerSec = 4096
    # state = non-kilobot dimensions + (x, y) per kilobot
    self.sDim = self.NUM_NON_KB_DIM + 2 * self.numKilobots

    if continueLearning:
        # don't reset samples or policy; allocate empty buffers on first run
        if self.S is None:
            self.S, self.A, self.R, self.S_ = empty((0, self.sDim)),\
                empty((0, 2)), empty((0, 1)), empty((0, self.sDim))
    else:
        # reset samples, policy and number of iterations
        self.S, self.A, self.R, self.S_ = empty((0, self.sDim)),\
            empty((0, 2)), empty((0, 1)), empty((0, self.sDim))
        self.policy = SparseGPPolicy(KilobotKernel(self.NUM_NON_KB_DIM),
                                     self.aRange)
        self.it = 0

    # --- LSTD parameters ---
    self.lstd.discountFactor = 0.99

    factor = 1.0
    factorKb = 1.0
    weightNonKb = 0.5

    self.bwFactorNonKbSA = factor
    self.bwFactorKbSA = factorKb
    self.weightNonKbSA = weightNonKb

    self.bwFactorNonKbS = factor
    self.bwFactorKbS = factorKb
    self.weightNonKbS = weightNonKb

    self.numFeatures = 200

    # --- REPS parameters ---
    self.reps.epsilonAction = 0.5

    # --- GP parameters ---
    self.policy.GPMinVariance = 0.0
    self.policy.GPRegularizer = 0.05

    self.numSamplesSubsetGP = 200
    self.bwFactorNonKbGP = factor
    self.bwFactorKbGP = factorKb
    self.weightNonKbGP = weightNonKb

    self.numLearnIt = numLearnIt
    self.startEpsilon = startEpsilon
    self.epsilon = startEpsilon
    self.epsilonFactor = epsilonFactor

    # make data dir and save params ('' disables saving entirely)
    if savePrefix == '':
        savePath = ''
    else:
        savePath = os.path.join(savePrefix, Helper.getSaveName())
        os.makedirs(savePath)
        self.saveParams(os.path.join(savePath, 'params'))

    rewards = []

    for i in range(numSampleIt):
        t = time.time()

        # get new samples
        St, At, Rt, S_t = self._getSamples()
        print('sum reward for last samples: {}'.format(Rt.sum()))
        rewards += [Rt.sum()]

        # add samples
        self.S = r_[self.S, St]
        self.A = r_[self.A, At]
        self.R = r_[self.R, Rt]
        self.S_ = r_[self.S_, S_t]

        # only keep 10000 samples (random subset of the pooled SARS rows)
        SARS = c_[self.S, self.A, self.R, self.S_]
        SARS = Helper.getRandomSubset(SARS, 10000)
        self.S, self.A, self.R, self.S_ = self._unpackSARS(SARS)

        self._updateKernelParameters(self.S, self.A, random=True,
                                     normalize=True)

        self.PHI_S = self.kernelS.getGramMatrix(self.S, self.MuS)
        SA = self._getStateActionMatrix(self.S, self.A)
        self.PHI_SA = self.kernelSA.getGramMatrix(SA, self.MuSA)

        for j in range(numLearnIt):
            # LSTD to estimate Q function / Q(s,a) = phi(s, a).T * theta
            self.PHI_SA_ = self._getFeatureExpectation(self.S_, 5, self.MuSA)
            self.theta = self.lstd.learnLSTD(self.PHI_SA, self.PHI_SA_,
                                             self.R)

            # AC-REPS: compute sample weights from the estimated Q values
            self.Q = self.PHI_SA * self.theta
            self.w = self.reps.computeWeighting(self.Q, self.PHI_S)

            # GP: retrain the policy on the weighted samples
            Ssub = self._getSubsetForGP(self.S, random=True, normalize=True)
            self._updateBandwidthsGP(Ssub)
            self.policy.train(self.S, self.A, self.w, Ssub)

            self.it += 1
            print('finished learning iteration {}'.format(self.it))

            # save results
            if savePath != '':
                figV = self.getValueFunctionFigure(50, 25, 4)
                figP = self.getPolicyFigure(50, 25)
                figV.savefig(os.path.join(savePath,
                        'V_{}.svg'.format(self.it)))
                figP.savefig(os.path.join(savePath,
                        'P_{}.svg'.format(self.it)))
                self.savePolicyAndSamples(os.path.join(savePath,
                        'policy_samples_{}'.format(self.it)))
                plt.close(figV)
                plt.close(figP)

        self.epsilon *= self.epsilonFactor

        print('sampling iteration took: {}s'.format(time.time() - t))
        gc.collect()

    # BUGFIX: guard the rewards dump like every other save above — the
    # original wrote a stray 'rewards' file into the CWD when saving was
    # disabled (savePrefix == '').
    if savePath != '':
        with open(os.path.join(savePath, 'rewards'), 'w') as f:
            for r in rewards:
                f.write('{}\n'.format(r))