Example no. 1
class RWR(DirectSearchLearner):
    """ Reward-weighted regression.

    The algorithm is currently limited to discrete-action episodic tasks, subclasses of POMDPTasks.
    """

    # parameters
    batchSize = 20

    # feedback settings
    verbose = True
    greedyRuns = 20
    supervisedPlotting = False

    # settings for the supervised training
    learningRate = 0.005
    momentum = 0.9
    maxEpochs = 20
    validationProportion = 0.33
    continueEpochs = 2

    # parameters for the variation that uses a value function
    # TODO: split into 2 classes.
    valueLearningRate = None
    valueMomentum = None
    #valueTrainEpochs = 5
    resetAllWeights = False
    netweights = 0.01

    def __init__(self, net, task, valueNetwork=None, **args):
        self.net = net
        self.task = task
        self.setArgs(**args)
        if self.valueLearningRate is None:
            self.valueLearningRate = self.learningRate
        if self.valueMomentum is None:
            self.valueMomentum = self.momentum
        if self.supervisedPlotting:
            from pylab import ion
            ion()

        # adaptive temperature:
        self.tau = 1.

        # prepare the datasets to be used
        self.weightedDs = ImportanceDataSet(self.task.outdim, self.task.indim)
        self.rawDs = ReinforcementDataSet(self.task.outdim, self.task.indim)
        self.valueDs = SequentialDataSet(self.task.outdim, 1)

        # prepare the supervised trainers
        self.bp = BackpropTrainer(self.net,
                                  self.weightedDs,
                                  self.learningRate,
                                  self.momentum,
                                  verbose=False,
                                  batchlearning=True)

        # CHECKME: outsource
        self.vnet = valueNetwork
        if valueNetwork is not None:
            self.vbp = BackpropTrainer(self.vnet,
                                       self.valueDs,
                                       self.valueLearningRate,
                                       self.valueMomentum,
                                       verbose=self.verbose)

        # keep information:
        self.totalSteps = 0
        self.totalEpisodes = 0

    def shapingFunction(self, R):
        """ Exponentially transform the returns R into non-negative weights. """
        return exp(self.tau * R)

    def updateTau(self, R, U):
        """ Adapt the temperature tau from the returns R and their weights U. """
        self.tau = sum(U) / dot((R - self.task.minReward), U)

    def reset(self):
        self.weightedDs.clear()
        self.valueDs.clear()
        self.rawDs.clear()
        self.bp.momentumvector *= 0.0
        if self.vnet is not None:
            self.vbp.momentumvector *= 0.0
            if self.resetAllWeights:
                self.vnet.params[:] = randn(len(
                    self.vnet.params)) * self.netweights

    def greedyEpisode(self):
        """ run one episode with greedy decisions, return the list of rewards recieved."""
        rewards = []
        self.task.reset()
        self.net.reset()
        while not self.task.isFinished():
            obs = self.task.getObservation()
            act = self.net.activate(obs)
            chosen = argmax(act)
            self.task.performAction(chosen)
            reward = self.task.getReward()
            rewards.append(reward)
        return rewards

    def learn(self, batches):
        """ Run the given number of batches, evaluating the greedy policy after each one. """
        self.greedyAvg = []
        self.rewardAvg = []
        self.lengthAvg = []
        self.initr0Avg = []
        for b in range(batches):
            if self.verbose:
                print()
                print('Batch', b + 1)
            self.reset()
            self.learnOneBatch()
            self.totalEpisodes += self.batchSize

            # greedy measure (avg over some greedy runs)
            rws = 0.
            for dummy in range(self.greedyRuns):
                tmp = self.greedyEpisode()
                rws += (sum(tmp) / float(len(tmp)))
            self.greedyAvg.append(rws / self.greedyRuns)
            if self.verbose:
                print('::', round(rws / self.greedyRuns, 5), '::')

    def learnOneBatch(self):
        # collect a batch of runs as experience
        r0s = []
        lens = []
        avgReward = 0.
        for dummy in range(self.batchSize):
            self.rawDs.newSequence()
            self.valueDs.newSequence()
            self.task.reset()
            self.net.reset()
            acts, obss, rewards = [], [], []
            while not self.task.isFinished():
                obs = self.task.getObservation()
                act = self.net.activate(obs)
                chosen = drawIndex(act)
                self.task.performAction(chosen)
                reward = self.task.getReward()
                obss.append(obs)
                y = zeros(len(act))
                y[chosen] = 1
                acts.append(y)
                rewards.append(reward)
            avgReward += sum(rewards) / float(len(rewards))

            # compute the returns from the list of rewards
            current = 0
            returns = []
            for r in reversed(rewards):
                current *= self.task.discount
                current += r
                returns.append(current)
            returns.reverse()
            for i in range(len(obss)):
                self.rawDs.addSample(obss[i], acts[i], returns[i])
                self.valueDs.addSample(obss[i], returns[i])
            r0s.append(returns[0])
            lens.append(len(returns))

        r0s = array(r0s)
        self.totalSteps += sum(lens)
        avgLen = sum(lens) / float(self.batchSize)
        avgR0 = mean(r0s)
        avgReward /= self.batchSize
        if self.verbose:
            print('***', round(avgLen, 3), '***',
                  '(avg init exp. return:', round(avgR0, 5), ')')
            print('avg reward', round(avgReward, 5),
                  '(tau:', round(self.tau, 3), ')')
            print(lens)
        # storage:
        self.rewardAvg.append(avgReward)
        self.lengthAvg.append(avgLen)
        self.initr0Avg.append(avgR0)

        #        if self.vnet == None:
        #            # case 1: no value estimator:

        # prepare the dataset for training the acting network
        shaped = self.shapingFunction(r0s)
        self.updateTau(r0s, shaped)
        shaped /= max(shaped)
        for i, seq in enumerate(self.rawDs):
            self.weightedDs.newSequence()
            for sample in seq:
                obs, act, dummy = sample
                self.weightedDs.addSample(obs, act, shaped[i])

#        else:
#            # case 2: value estimator:
#
#
#            # train the value estimating network
#            if self.verbose: print('Old value error:  ', self.vbp.testOnData())
#            self.vbp.trainEpochs(self.valueTrainEpochs)
#            if self.verbose: print('New value error:  ', self.vbp.testOnData())
#
#            # produce the values and analyze
#            rminusvs = []
#            sizes = []
#            for i, seq in enumerate(self.valueDs):
#                self.vnet.reset()
#                seq = list(seq)
#                for sample in seq:
#                    obs, ret = sample
#                    val = self.vnet.activate(obs)
#                    rminusvs.append(ret-val)
#                    sizes.append(len(seq))
#
#            rminusvs = array(rminusvs)
#            shapedRminusv = self.shapingFunction(rminusvs)
#            # CHECKME: here?
#            self.updateTau(rminusvs, shapedRminusv)
#            shapedRminusv /= array(sizes)
#            shapedRminusv /= max(shapedRminusv)
#
#            # prepare the dataset for training the acting network
#            rvindex = 0
#            for i, seq in enumerate(self.rawDs):
#                self.weightedDs.newSequence()
#                self.vnet.reset()
#                for sample in seq:
#                    obs, act, ret = sample
#                    self.weightedDs.addSample(obs, act, shapedRminusv[rvindex])
#                    rvindex += 1

        # train the acting network
        tmp1, tmp2 = self.bp.trainUntilConvergence(
            maxEpochs=self.maxEpochs,
            validationProportion=self.validationProportion,
            continueEpochs=self.continueEpochs,
            verbose=self.verbose)
        if self.supervisedPlotting:
            from pylab import plot, legend, figure, clf, draw
            figure(1)
            clf()
            plot(tmp1, label='train')
            plot(tmp2, label='valid')
            legend()
            draw()

        return avgLen, avgR0
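
The listing above is an excerpt and omits its module-level imports (numpy's exp, zeros, dot, argmax, mean, array, randn, plus PyBrain's drawIndex, the dataset classes, and BackpropTrainer). The sketch below shows one plausible way to drive the learner; it is an assumption-laden illustration rather than part of the original source: the import paths follow PyBrain's usual layout, and SomeEpisodicTask is a hypothetical placeholder for any discrete-action episodic task (a POMDPTask subclass, as the docstring requires).

# Minimal usage sketch (assumes the RWR class above is in scope together with
# its module-level imports; SomeEpisodicTask is a hypothetical placeholder).
from pybrain.tools.shortcuts import buildNetwork
from pybrain.structure import SoftmaxLayer

task = SomeEpisodicTask()

# Policy network: one input per observation component, one softmax output per
# discrete action, so net.activate(obs) yields the probabilities that
# learnOneBatch() samples from via drawIndex(act).
net = buildNetwork(task.outdim, 10, task.indim, outclass=SoftmaxLayer)

learner = RWR(net, task)     # class-level parameters (batchSize, verbose, ...) can be overridden on the instance
learner.learn(50)            # 50 batches of batchSize episodes each
print(learner.greedyAvg)     # per-batch average per-step reward of the greedy policy

The softmax output layer is the design choice that matters here: both greedyEpisode() (via argmax) and learnOneBatch() (via drawIndex) treat the network's activation as a distribution over the task's discrete actions, and learnOneBatch() then reweights the collected samples by the exponentially shaped returns before the supervised retraining step.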
Example no. 2
class ModelExperiment(EpisodicExperiment):
    """ An experiment that learns a model of its (action, state) pair
        with a Gaussian Process for each dimension of the state.
    """
    def __init__(self, task, agent):
        EpisodicExperiment.__init__(self, task, agent)

        # create model and training set (action dimension + 1 for time)
        self.modelds = SequentialDataSet(self.task.indim + 1, 1)
        self.model = [
            GaussianProcess(indim=self.modelds.getDimension('input'),
                            start=(-10, -10, 0),
                            stop=(10, 10, 300),
                            step=(5, 5, 100)) for _ in range(self.task.outdim)
        ]

        # change hyper parameters for all gps
        for m in self.model:
            m.hyper = (20, 2.0, 0.01)
            # m.autonoise = True

    def doEpisodes(self, number=1):
        """ returns the rewards of each step as a list and learns
            the model for each rollout. 
        """

        all_rewards = []

        for dummy in range(number):
            self.stepid = 0
            rewards = []
            # the agent is informed of the start of the episode
            self.agent.newEpisode()
            self.task.reset()
            while not self.task.isFinished():
                r = self._oneInteraction()
                rewards.append(r)
            all_rewards.append(rewards)

        # clear model dataset (to retrain it)
        self.modelds.clear()
        print "retrain gp"
        [m.trainOnDataset(self.modelds) for m in self.model]

        for i in range(self.agent.history.getNumSequences()):
            seq = self.agent.history.getSequence(i)
            state, action, dummy, dummy = seq

            l = len(action)
            index = [int(floor(x)) for x in mgrid[0:l - 1:5j]]
            action = action[index, :]
            inp = c_[action, array([index]).T]
            self.modelds.setField('input', inp)

            # add training data to all gaussian processes
            for dim, m in enumerate(self.model):
                tar = state[index, dim]
                self.modelds.setField('target', array([tar]).T)
                m.addDataset(self.modelds)

        # print "updating GPs..."
        # [m._calculate() for m in self.model]
        # print "done."

        return all_rewards

    def _oneInteraction(self):
        self.stepid += 1
        obs = self.task.getObservation()
        self.agent.integrateObservation(obs)
        action = self.agent.getAction()
        self.task.performAction(action)

        # predict with model
        #modelobs = array([0, 0, 0])

        # time dimension
        # if self.stepid < self.model[0].stop:
        #     steps = self.model[0].step
        #
        #     # linear interpolation between two adjacent gp states
        #     try:
        #         modelobs = [ (1.0-float(self.stepid%steps)/steps) * self.model[i].pred_mean[int(floor(float(self.stepid)/steps))] +
        #                      (float(self.stepid%steps)/steps) * self.model[i].pred_mean[int(ceil(float(self.stepid)/steps))]
        #                      for i in range(self.task.outdim) ]
        #     except IndexError:

        action = r_[action, array([self.stepid])]
        action = reshape(action, (1, 3))
        modelobs = [
            self.model[i].testOnArray(action) for i in range(self.task.outdim)
        ]

        # tell environment about model obs
        self.task.env.model = [modelobs]

        reward = self.task.getReward()
        self.agent.giveReward(reward)
        return reward
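
As with the first example, the listing is an excerpt: it relies on module-level imports (numpy's floor, mgrid, c_, r_, reshape, array and PyBrain's EpisodicExperiment, SequentialDataSet, GaussianProcess) that are not shown. The loop below is a hedged sketch of how such an experiment is typically driven; SomeContinuousTask and make_agent are hypothetical placeholders chosen only to illustrate the call pattern, and the hard-coded reshape to (1, 3) in _oneInteraction() implies a two-dimensional action space plus the time index.

# Hedged driving loop (assumes the ModelExperiment class above is in scope with
# its imports; the task and agent below are hypothetical placeholders).
task = SomeContinuousTask()      # must expose indim, outdim, env, and the episodic task interface
agent = make_agent(task)         # e.g. a learning agent whose history records (state, action, ...) sequences

experiment = ModelExperiment(task, agent)
for epoch in range(10):
    rewards = experiment.doEpisodes(number=5)   # 5 rollouts, then the GPs are refit
    print(epoch, sum(sum(r) for r in rewards))  # total reward collected this epoch
    agent.learn()
    agent.reset()

Note that doEpisodes() refits the per-dimension Gaussian processes on five subsampled points of each rollout only after the episodes have finished, so the predictions written to task.env.model during an episode come from the previous refit.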