Code Example #1
    def _updateKernelParameters(self, S, A, random=True, normalize=True):
        """Pick feature centers from S and SA and set the kernel bandwidths.

        The first NUM_NON_KB_DIM columns of a state are non-kilobot
        dimensions; the remaining 2 * numKilobots columns hold the kilobot
        positions. In state-action rows the 2-D action adds two columns.
        """
        SA = self._getStateActionMatrix(S, A)

        if random:
            self.MuS = Helper.getRandomSubset(S, self.numFeatures)
            self.MuSA = Helper.getRandomSubset(SA, self.numFeatures)
        else:
            self.MuS = Helper.getRepresentativeRows(S, self.numFeatures, normalize)
            self.MuSA = Helper.getRepresentativeRows(SA, self.numFeatures, normalize)

        NUM_SAMPLES_FOR_BW_ESTIMATE = 500

        # bandwidth for PHI_S
        bwNonKbS = Helper.getBandwidth(self.MuS[:, 0:self.NUM_NON_KB_DIM],
                NUM_SAMPLES_FOR_BW_ESTIMATE, self.bwFactorNonKbS)

        kbPosS = self._reshapeKbPositions(self.MuS[:, self.NUM_NON_KB_DIM:])
        bwKbS = Helper.getBandwidth(kbPosS, NUM_SAMPLES_FOR_BW_ESTIMATE,
                self.bwFactorKbS)

        self.kernelS.setBandwidth(bwNonKbS, bwKbS)
        self.kernelS.setWeighting(self.weightNonKbS)

        # bandwidth for PHI_SA
        bwNonKbSA = Helper.getBandwidth(self.MuSA[:, 0:(self.NUM_NON_KB_DIM + 2)],
                NUM_SAMPLES_FOR_BW_ESTIMATE, self.bwFactorNonKbSA)

        kbPosSA = self._reshapeKbPositions(self.MuSA[:, (self.NUM_NON_KB_DIM + 2):])
        bwKbSA = Helper.getBandwidth(kbPosSA, NUM_SAMPLES_FOR_BW_ESTIMATE,
                self.bwFactorKbSA)

        self.kernelSA.setBandwidth(bwNonKbSA, bwKbSA)
        self.kernelSA.setWeighting(self.weightNonKbSA)
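The implementation of Helper.getBandwidth is not part of this excerpt; the bwFactor* arguments suggest a scaled pairwise-distance rule. Below is a minimal sketch of one common such rule, the median heuristic, estimated on a random subsample. The function median_bandwidth is a hypothetical stand-in, not the actual Helper implementation.

import numpy as np

def median_bandwidth(X, num_samples, factor=1.0):
    # Median heuristic: bandwidth = factor * median pairwise distance,
    # computed on at most num_samples random rows to bound the O(n^2) cost.
    idx = np.random.permutation(X.shape[0])[:num_samples]
    Xs = X[idx]
    # squared pairwise distances via |a - b|^2 = |a|^2 + |b|^2 - 2 a.b
    sq = np.sum(Xs ** 2, axis=1)
    d2 = np.maximum(sq[:, None] + sq[None, :] - 2.0 * Xs.dot(Xs.T), 0.0)
    return factor * np.sqrt(np.median(d2[d2 > 0]))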
Code Example #2
    def _getSubsetForGP(self, S, random=True, normalize=True):
        Nsubset = min(self.numSamplesSubsetGP, S.shape[0])

        if random:
            return Helper.getRandomSubset(S, Nsubset)
        else:
            return Helper.getRepresentativeRows(S, Nsubset, normalize)
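Helper.getRandomSubset and Helper.getRepresentativeRows are likewise not shown in the excerpt. A plausible reading, sketched below, is a uniform random row subset and k-means cluster centers, respectively; both function names are hypothetical stand-ins under that assumption.

import numpy as np
from sklearn.cluster import KMeans

def random_subset(X, n):
    # Hypothetical stand-in for Helper.getRandomSubset: n rows drawn
    # uniformly without replacement (all rows if X has fewer than n).
    return X[np.random.permutation(X.shape[0])[:min(n, X.shape[0])]]

def representative_rows(X, k, normalize=True):
    # Hypothetical stand-in for Helper.getRepresentativeRows: use k-means
    # cluster centers as k representative points of X, optionally scaling
    # each column to unit variance so no dimension dominates the distance.
    scale = X.std(axis=0) if normalize else np.ones(X.shape[1])
    scale[scale == 0] = 1.0  # guard against constant columns
    centers = KMeans(n_clusters=k, n_init=10).fit(X / scale).cluster_centers_
    return centers * scale   # undo the scaling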
Code Example #3
    # requires module-level imports: os, time, gc, matplotlib.pyplot as plt,
    # numpy's empty, r_ and c_, and the project's Helper, SparseGPPolicy
    # and KilobotKernel
    def learn(self, savePrefix, numSampleIt, continueLearning=True,
            numLearnIt=1, startEpsilon=0.0, epsilonFactor=1.0):

        """ sampling """
        self.objectShape = 'quad'
        self.numKilobots = 4
        self.numEpisodes = 25
        self.numStepsPerEpisode = 100

        self.stepsPerSec = 4096

        self.sDim = self.NUM_NON_KB_DIM + 2 * self.numKilobots

        if continueLearning:
            # don't reset samples or the policy
            if self.S is None:
                self.S, self.A, self.R, self.S_ = empty((0, self.sDim)),\
                    empty((0, 2)), empty((0, 1)), empty((0, self.sDim))
        else:
            # reset samples, policy and number of iterations
            self.S, self.A, self.R, self.S_ = empty((0, self.sDim)),\
                    empty((0, 2)), empty((0, 1)), empty((0, self.sDim))
            self.policy = SparseGPPolicy(KilobotKernel(self.NUM_NON_KB_DIM),
                    self.aRange)
            self.it = 0

        """ LSTD """
        self.lstd.discountFactor = 0.99

        factor = 1.0
        factorKb = 1.0
        weightNonKb = 0.5

        self.bwFactorNonKbSA = factor
        self.bwFactorKbSA = factorKb
        self.weightNonKbSA = weightNonKb

        self.bwFactorNonKbS = factor
        self.bwFactorKbS = factorKb
        self.weightNonKbS = weightNonKb

        self.numFeatures = 200

        """ REPS """
        self.reps.epsilonAction = 0.5

        """ GP """
        self.policy.GPMinVariance = 0.0
        self.policy.GPRegularizer = 0.05

        self.numSamplesSubsetGP = 200

        self.bwFactorNonKbGP = factor
        self.bwFactorKbGP = factorKb
        self.weightNonKbGP = weightNonKb

        self.numLearnIt = numLearnIt

        self.startEpsilon = startEpsilon
        self.epsilon = startEpsilon
        self.epsilonFactor = epsilonFactor

        # make data dir and save params
        if savePrefix == '':
            savePath = ''
        else:
            savePath = os.path.join(savePrefix, Helper.getSaveName())
            os.makedirs(savePath)

            self.saveParams(os.path.join(savePath, 'params'))

        rewards = []

        for i in range(numSampleIt):
            t = time.time()

            # get new samples
            St, At, Rt, S_t = self._getSamples()
            print('sum reward for last samples: {}'.format(Rt.sum()))
            rewards += [Rt.sum()]

            # add samples
            self.S = r_[self.S, St]
            self.A = r_[self.A, At]
            self.R = r_[self.R, Rt]
            self.S_ = r_[self.S_, S_t]

            # keep at most 10000 samples (random subset)
            SARS = c_[self.S, self.A, self.R, self.S_]
            SARS = Helper.getRandomSubset(SARS, 10000)

            self.S, self.A, self.R, self.S_ = self._unpackSARS(SARS)

            self._updateKernelParameters(self.S, self.A, random=True,
                    normalize=True)

            self.PHI_S = self.kernelS.getGramMatrix(self.S, self.MuS)

            SA = self._getStateActionMatrix(self.S, self.A)
            self.PHI_SA = self.kernelSA.getGramMatrix(SA, self.MuSA)

            for j in range(numLearnIt):
                # LSTD to estimate Q function / Q(s,a) = phi(s, a).T * theta
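                # Standard LSTD closed form: theta solves
                #   PHI_SA^T (PHI_SA - gamma * PHI_SA_) theta = PHI_SA^T R,
                # with gamma = self.lstd.discountFactor. The second argument
                # of _getFeatureExpectation (5) is presumably the number of
                # actions sampled per next state to approximate PHI_SA_.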
                self.PHI_SA_ = self._getFeatureExpectation(self.S_, 5, self.MuSA)
                self.theta = self.lstd.learnLSTD(self.PHI_SA, self.PHI_SA_, self.R)

                # AC-REPS
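                # REPS reweights samples as w_i proportional to
                # exp((Q_i - V(s_i)) / eta), with the baseline V presumably
                # fit on the state features PHI_S and the temperature eta
                # chosen so the KL divergence between the weighted and the
                # empirical sample distribution stays below epsilonAction.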
                self.Q = self.PHI_SA * self.theta
                self.w = self.reps.computeWeighting(self.Q, self.PHI_S)

                # GP
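                # Presumably a weighted sparse GP regression from states to
                # actions: the REPS weights w act as per-sample weights and
                # Ssub serves as the inducing-point subset.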
                Ssub = self._getSubsetForGP(self.S, random=True, normalize=True)
                self._updateBandwidthsGP(Ssub)
                self.policy.train(self.S, self.A, self.w, Ssub)

                self.it += 1
                print('finished learning iteration {}'.format(self.it))

                # save results
                if savePath != '':
                    figV = self.getValueFunctionFigure(50, 25, 4)
                    figP = self.getPolicyFigure(50, 25)

                    figV.savefig(os.path.join(savePath, 'V_{}.svg'.format(self.it)))
                    figP.savefig(os.path.join(savePath, 'P_{}.svg'.format(self.it)))

                    self.savePolicyAndSamples(os.path.join(savePath,
                        'policy_samples_{}'.format(self.it)))

                    plt.close(figV)
                    plt.close(figP)

            self.epsilon *= self.epsilonFactor

            print('sampling iteration took: {}s'.format(time.time() - t))

            gc.collect()

        # write per-iteration rewards (only if a save path was given,
        # mirroring the guard used for figures and policy snapshots above)
        if savePath != '':
            with open(os.path.join(savePath, 'rewards'), 'w') as f:
                for r in rewards:
                    f.write('{}\n'.format(r))
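For reference, a hypothetical driver for Code Example #3 might look like the following. The enclosing class, here called KilobotLearner, is not part of the excerpt, so its name and constructor are assumptions.

# Hypothetical usage of learn(); KilobotLearner and its constructor are
# assumed names, as the class definition is not part of this excerpt.
learner = KilobotLearner()

# 10 sampling iterations, 2 LSTD/REPS/GP updates per sample batch,
# exploration epsilon decaying from 0.1 by a factor of 0.9 per iteration.
learner.learn(savePrefix='results', numSampleIt=10, numLearnIt=2,
        startEpsilon=0.1, epsilonFactor=0.9)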