def performAction(self, action):
    """ POMDP tasks, as they have discrete actions, can be used by providing either an index,
    or an array with a 1-in-n coding (which can be stochastic). """
    if isinstance(action, ndarray):
        action = drawIndex(action, tolerant=True)
    self.steps += 1
    EpisodicTask.performAction(self, action)
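
All of these examples revolve around pybrain's drawIndex. A minimal sketch of what it presumably does, inferred from its usage here rather than from the pybrain source: draw index i with probability given by the i-th entry, renormalizing first when tolerant is set.

from numpy import asarray, cumsum
from numpy.random import random

def draw_index_sketch(probs, tolerant=False):
    # Hypothetical stand-in for pybrain's drawIndex: inverse-CDF sampling
    # over a discrete distribution given as a vector of probabilities.
    probs = asarray(probs, dtype=float)
    if tolerant:
        probs = probs / probs.sum()  # tolerate an unnormalized vector
    return int((cumsum(probs) > random()).argmax())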
Example 2
    def _produceNewSample(self):
        """ Draws a new sample from the mixture and returns it together
        with its fitness. """
        # pick a mixture component according to the weights alphas
        chosenOne = drawIndex(self.alphas, tolerant=True)
        # copy, so the in-place shift below cannot mutate the stored center
        mu = self.mus[chosenOne].copy()

        if self.useAnticipatedMeanShift:
            if len(self.allsamples) % 2 == 1 and len(self.allsamples) > 1:
                if not (self.elitism and chosenOne == self.bestChosenCenter):
                    mu += self.meanShifts[chosenOne]

        if self.diagonalOnly:
            # diagonal covariance: independent draw per dimension
            sample = normal(mu, self.sigmas[chosenOne])
        else:
            # full covariance: joint multivariate draw
            sample = multivariate_normal(mu, self.sigmas[chosenOne])
        if (self.sampleElitism
                and len(self.allsamples) > self.windowSize
                and len(self.allsamples) % self.windowSize == 0):
            sample = self.bestEvaluable.copy()
        fit = self._oneEvaluation(sample)

        if ((not self.minimize and fit >= self.bestEvaluation)
                or (self.minimize and fit <= self.bestEvaluation)
                or len(self.allsamples) == 0):
            # used to determine which center produced the current best
            self.bestChosenCenter = chosenOne
            self.bestSigma = self.sigmas[chosenOne].copy()
        if self.minimize:
            # the algorithm maximizes internally, so flip the sign
            fit = -fit
        self.allfitnesses.append(fit)
        self.allsamples.append(sample)
        return sample, fit
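
Component selection followed by a Gaussian draw, as above, is plain ancestral sampling from a mixture. A self-contained sketch of the full-covariance case, with hypothetical names:

import numpy as np

def sample_gaussian_mixture(alphas, mus, sigmas, rng=None):
    # Mirror of the two steps in _produceNewSample: pick component k with
    # probability alphas[k], then draw from the Gaussian N(mus[k], sigmas[k]).
    rng = rng or np.random.default_rng()
    k = rng.choice(len(alphas), p=alphas)
    return rng.multivariate_normal(mus[k], sigmas[k])

sample = sample_gaussian_mixture([0.3, 0.7], [np.zeros(2), np.ones(2)], [np.eye(2)] * 2)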
Example 3
    def performAction(self, action, onlyavatar=False):
        """ Action is an index for the actionset. """
        if action is None:
            return
        # if actions are given as a vector, pick an index from it
        import numpy
        from pybrain3.utilities import drawIndex
        if isinstance(action, numpy.ndarray):
            if abs(sum(action) - 1) < 1e-5:
                # entries sum to one: treat the vector as probabilities and sample
                action = drawIndex(action)
            else:
                # otherwise treat the entries as scores and pick the argmax
                action = numpy.argmax(action)

        # take action and compute consequences:
        # route the chosen action to the avatar by overriding its action reader
        if self._avatar:
            self._avatar._readMultiActions = lambda *x: [
                self._actionset[action]
            ]

        self._game._clearAll(self.visualize)

        # update sprites
        if onlyavatar:
            self._avatar.update(self._game)
        else:
            for s in self._game:
                s.update(self._game)

        # handle collision effects
        self._game._updateCollisionDict()
        self._game._eventHandling()
        self._game._clearAll(self.visualize)

        # update screen
        if self.visualize:
            self._game._drawAll()
            pygame.display.update(VGDLSprite.dirtyrects)
            VGDLSprite.dirtyrects = []
            pygame.time.wait(self.actionDelay)

        if self.recordingEnabled:
            self._previous_state = self._last_state
            self._last_state = self.getState()
            self._allEvents.append(
                (self._previous_state, action, self._last_state))
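
The index/probability/score dispatch above can be restated as a small standalone helper; decode_action is a hypothetical name, and rng.choice stands in for pybrain's drawIndex:

import numpy as np

def decode_action(action, rng=None):
    # Sketch of the dispatch in performAction: probability vectors are
    # sampled from, other vectors are taken greedily, plain indices pass through.
    rng = rng or np.random.default_rng()
    if isinstance(action, np.ndarray):
        if abs(action.sum() - 1) < 1e-5:
            return int(rng.choice(len(action), p=action))
        return int(np.argmax(action))
    return action

print(decode_action(np.array([0.0, 5.0, 1.0])))  # -> 1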
Example 4
    def learnOneBatch(self):
        # collect a batch of runs as experience
        r0s = []   # initial discounted return of each episode
        lens = []  # episode lengths
        avgReward = 0.
        for dummy in range(self.batchSize):
            self.rawDs.newSequence()
            self.valueDs.newSequence()
            self.task.reset()
            self.net.reset()
            acts, obss, rewards = [], [], []
            while not self.task.isFinished():
                obs = self.task.getObservation()
                act = self.net.activate(obs)
                # sample an action index from the network's output
                chosen = drawIndex(act)
                self.task.performAction(chosen)
                reward = self.task.getReward()
                obss.append(obs)
                y = zeros(len(act))
                y[chosen] = 1  # store the executed action in 1-in-n coding
                acts.append(y)
                rewards.append(reward)
            avgReward += sum(rewards) / float(len(rewards))

            # compute the returns from the list of rewards
            current = 0
            returns = []
            for r in reversed(rewards):
                current *= self.task.discount
                current += r
                returns.append(current)
            returns.reverse()
            for i in range(len(obss)):
                self.rawDs.addSample(obss[i], acts[i], returns[i])
                self.valueDs.addSample(obss[i], returns[i])
            r0s.append(returns[0])
            lens.append(len(returns))

        r0s = array(r0s)
        self.totalSteps += sum(lens)
        avgLen = sum(lens) / float(self.batchSize)
        avgR0 = mean(r0s)
        avgReward /= self.batchSize
        if self.verbose:
            print('***',
                  round(avgLen, 3),
                  '***',
                  '(avg init exp. return:',
                  round(avgR0, 5),
                  ')',
                  end=' ')
            print('avg reward', round(avgReward, 5), '(tau:',
                  round(self.tau, 3), ')')
            print(lens)
        # storage:
        self.rewardAvg.append(avgReward)
        self.lengthAvg.append(avgLen)
        self.initr0Avg.append(avgR0)

        # if self.vnet == None:
        #     # case 1: no value estimator:

        # prepare the dataset for training the acting network
        shaped = self.shapingFunction(r0s)
        self.updateTau(r0s, shaped)
        shaped /= max(shaped)
        for i, seq in enumerate(self.rawDs):
            self.weightedDs.newSequence()
            for sample in seq:
                obs, act, dummy = sample
                self.weightedDs.addSample(obs, act, shaped[i])

        # else:
        #     # case 2: value estimator:
        #
        #     # train the value estimating network
        #     if self.verbose: print('Old value error:  ', self.vbp.testOnData())
        #     self.vbp.trainEpochs(self.valueTrainEpochs)
        #     if self.verbose: print('New value error:  ', self.vbp.testOnData())
        #
        #     # produce the values and analyze
        #     rminusvs = []
        #     sizes = []
        #     for i, seq in enumerate(self.valueDs):
        #         self.vnet.reset()
        #         seq = list(seq)
        #         for sample in seq:
        #             obs, ret = sample
        #             val = self.vnet.activate(obs)
        #             rminusvs.append(ret - val)
        #             sizes.append(len(seq))
        #
        #     rminusvs = array(rminusvs)
        #     shapedRminusv = self.shapingFunction(rminusvs)
        #     # CHECKME: here?
        #     self.updateTau(rminusvs, shapedRminusv)
        #     shapedRminusv /= array(sizes)
        #     shapedRminusv /= max(shapedRminusv)
        #
        #     # prepare the dataset for training the acting network
        #     rvindex = 0
        #     for i, seq in enumerate(self.rawDs):
        #         self.weightedDs.newSequence()
        #         self.vnet.reset()
        #         for sample in seq:
        #             obs, act, ret = sample
        #             self.weightedDs.addSample(obs, act, shapedRminusv[rvindex])
        #             rvindex += 1

        # train the acting network
        tmp1, tmp2 = self.bp.trainUntilConvergence(
            maxEpochs=self.maxEpochs,
            validationProportion=self.validationProportion,
            continueEpochs=self.continueEpochs,
            verbose=self.verbose)
        if self.supervisedPlotting:
            from pylab import plot, legend, figure, clf, draw
            figure(1)
            clf()
            plot(tmp1, label='train')
            plot(tmp2, label='valid')
            legend()
            draw()

        return avgLen, avgR0
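
The rollout loop above computes returns with the standard backward recurrence G_t = r_t + discount * G_{t+1}. Extracted as a standalone, testable helper (hypothetical name):

def discounted_returns(rewards, discount):
    # Same backward pass as in learnOneBatch: accumulate from the end,
    # with the return after the final step taken as zero.
    g, returns = 0.0, []
    for r in reversed(rewards):
        g = r + discount * g
        returns.append(g)
    returns.reverse()
    return returns

assert discounted_returns([1, 0, 2], 0.5) == [1.5, 1.0, 2.0]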