Example #1
class HistoryAgent(Agent):
    """ This agent stores actions, states, and rewards encountered during interaction with an environment
        in a ReinforcementDataSet (which is a variation of SequentialDataSet). The stored history can 
        be used for learning and is erased by resetting the agent. It also makes sure that integrateObservation,
        getAction and giveReward are called in exactly that order. """
        
    def __init__(self, indim, outdim):        
        # store input and output dimension
        self.indim = indim
        self.outdim = outdim
                
        # create history dataset
        self.remember = True
        self.history = ReinforcementDataSet(indim, outdim)

        # initialize temporary variables
        self.lastobs = None
        self.lastaction = None
        
    def integrateObservation(self, obs):
        """ 1. stores the observation received in a temporary variable until action is called and
            reward is given. """
        assert self.lastobs is None
        assert self.lastaction is None
        
        self.lastobs = obs
        
    def getAction(self):
        """ 2. stores the action in a temporary variable until reward is given. """
        assert self.lastobs is not None
        assert self.lastaction is None
        # implement getAction in subclass and set self.lastaction
        
    def enableHistory(self):
        self.remember = True
        
    def disableHistory(self):
        self.remember = False
   
    def giveReward(self, r):
        """ 3. stores observation, action and reward in the history dataset. """
        # step 3: assume that state and action have been set
        assert self.lastobs is not None
        assert self.lastaction is not None

        # store state, action and reward in dataset
        if self.remember:
            self.history.addSample(self.lastobs, self.lastaction, r)

        self.lastobs = None
        self.lastaction = None
            
    def reset(self):
        """ clears the history of the agent. """
        self.history.clear()
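
A minimal usage sketch (not part of the example above) may help to see the enforced call order. It assumes HistoryAgent and the ReinforcementDataSet it builds on are importable as defined here; the subclass name RandomActionAgent and the dummy observations are illustrative only.

import random

class RandomActionAgent(HistoryAgent):
    """ Hypothetical subclass: getAction must set self.lastaction, as the base class requires. """
    def getAction(self):
        HistoryAgent.getAction(self)  # run the order-checking asserts
        self.lastaction = [random.random() for _ in range(self.outdim)]
        return self.lastaction

agent = RandomActionAgent(indim=2, outdim=1)
for step in range(3):
    agent.integrateObservation([0.0, float(step)])  # step 1
    action = agent.getAction()                      # step 2
    agent.giveReward(1.0)                           # step 3: sample is appended to agent.history
# calling getAction() before integrateObservation() would trip the asserts above
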
Example #2
 def __init__(self, net, task, valueNetwork=None, **args):
     self.net = net
     self.task = task
     self.setArgs(**args)
     if self.valueLearningRate is None:
         self.valueLearningRate = self.learningRate
     if self.valueMomentum is None:
         self.valueMomentum = self.momentum
     if self.supervisedPlotting:
         from pylab import ion
         ion() 
     
     # adaptive temperature:
     self.tau = 1.
     
     # prepare the datasets to be used
     self.weightedDs = ImportanceDataSet(self.task.outdim, self.task.indim)
     self.rawDs = ReinforcementDataSet(self.task.outdim, self.task.indim)
     self.valueDs = SequentialDataSet(self.task.outdim, 1)
     
     # prepare the supervised trainers
     self.bp = BackpropTrainer(self.net, self.weightedDs, self.learningRate,
                               self.momentum, verbose=False,
                               batchlearning=True)            
     
     # CHECKME: outsource
     self.vnet = valueNetwork
     if valueNetwork is not None:
         self.vbp = BackpropTrainer(self.vnet, self.valueDs, self.valueLearningRate,
                                    self.valueMomentum, verbose=self.verbose)
         
     # keep information:
     self.totalSteps = 0
     self.totalEpisodes = 0
Example #3
 def __init__(self, indim, outdim):
     # store input and output dimension
     self.indim = indim
     self.outdim = outdim
             
     # create the history dataset
     self.history = ReinforcementDataSet(indim, outdim)
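
For reference, a minimal sketch of how a history dataset like the one created here is filled and read back; the import path is an assumption, and the per-sequence loop mirrors the iteration used in Examples #6 and #8 below.

from pybrain.datasets import ReinforcementDataSet  # import path assumed

history = ReinforcementDataSet(2, 1)     # 2-dim states, 1-dim actions
history.addSample([0.0, 1.0], [0], 0.5)  # (state, action, reward)
history.addSample([1.0, 0.0], [1], 1.0)
history.newSequence()                    # start the next episode's sequence
history.addSample([0.5, 0.5], [0], -1.0)

for seq in history:                      # one sequence per episode
    for state, action, reward in seq:    # (s, a, r) triples in order
        print(state, action, reward)

history.clear()                          # forget everything, as in reset()
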
Example #4
    def setUp(self):
        self.theta = [0.4, 1.1]
        self.policy = BoltzmanPolicy(4, 2, self.theta)
        self.module = PolicyFeatureModule(self.policy, 'policywrapper')

        self.dataset = ReinforcementDataSet(8, 1)
        feature1 = scipy.array([
            (0.6, 0.2),
            (0.3, 0.6),
            (0.4, 0.01),
            (0.5, -0.2)
        ])
        #  feature2 = scipy.array([
        #      (0.3, 0.6),
        #      (0.6, 0.2),
        #      (50, -20),
        #      (0.4, 0.01),
        #  ])
        #  feature3 = scipy.array([
        #      (0.1, 0.1),
        #      (0.2, 0.2),
        #      (0.3, -0.3),
        #      (0.4, 0.4),
        #  ])

        self.dataset.addSample(feature1.reshape(-1), 0, 0)
        self.dataset.addSample(feature1.reshape(-1), 1, 1)
        self.dataset.addSample(feature1.reshape(-1), 2, 1.5)
        self.dataset.addSample(feature1.reshape(-1), 3, 0.5)
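
The reshape(-1) calls above flatten the 4x2 feature matrix into the 8-dim state vector that ReinforcementDataSet(8, 1) expects; a tiny check of that layout (using numpy, equivalent to the scipy.array above):

import numpy as np

feature1 = np.array([(0.6, 0.2), (0.3, 0.6), (0.4, 0.01), (0.5, -0.2)])
flat = feature1.reshape(-1)   # row-major flattening: 0.6, 0.2, 0.3, 0.6, ...
assert flat.shape == (8,)     # matches the 8-dim state of ReinforcementDataSet(8, 1)
# each addSample then stores (flattened features, action index, reward)
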
Example #5
    def __init__(self, indim, outdim):        
        # store input and output dimension
        self.indim = indim
        self.outdim = outdim
                
        # create history dataset
        self.remember = True
        self.history = ReinforcementDataSet(indim, outdim)

        # initialize temporary variables
        self.lastobs = None
        self.lastaction = None
Example #6
class GPSLearner(Q):
    def __init__(self):
        Q.__init__(self, const.ALPHA, const.GAMMA)
        self.explorer = FeasibleEpsilonGreedyExplorer(const.EPSILON, const.DECAY)
        self.dataset2 = ReinforcementDataSet(1, 1)
        
    
    def learn(self):
        """
            Performs Q learning based on observations but also
            performs learning on states that are adjacent time periods,
            albeit with a slower learning rate.
            
            For example, traffic on an edge at 4:30PM will be somewhat similar to
            traffic at 5:00PM, so an observation at 4:30PM can also update the 5:00PM estimate.
        """
        self.alpha = const.ALPHA
        Q.learn(self) #do normal learning
        for seq in self.dataset:
            self.dataset2.newSequence()
            for state, action, reward in seq: #add states of adjacent time periods
                #print(state, action, reward)
                period = state % const.PERIODS
                node = int(state / const.PERIODS)
                self.dataset2.addSample(node * const.PERIODS + (period + 1) % const.PERIODS, action, reward)
                self.dataset2.addSample(node * const.PERIODS + (period - 1) % const.PERIODS, action, reward)
        temp = self.dataset 
        self.dataset = self.dataset2       
        self.alpha = const.ALPHA_ADJ_PERIOD
        Q.learn(self)
        self.dataset = temp
        self.dataset2.clear()
#GPSLearner().learn()
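
The adjacent-period update above relies on the encoding state = node * const.PERIODS + period. A small sketch of that encoding and its wrap-around neighbours; PERIODS = 48 is an assumed stand-in for const.PERIODS and the helper names are illustrative.

PERIODS = 48  # assumed stand-in for const.PERIODS (e.g. 30-minute slots per day)

def encode(node, period):
    return node * PERIODS + period

def decode(state):
    return int(state / PERIODS), state % PERIODS   # (node, period), as in learn()

def adjacent_states(state):
    node, period = decode(state)
    return (encode(node, (period + 1) % PERIODS),  # next time period
            encode(node, (period - 1) % PERIODS))  # previous one, wrapping around

# a node's last period of the day also propagates to its first period:
assert adjacent_states(encode(3, PERIODS - 1)) == (encode(3, 0), encode(3, PERIODS - 2))
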
Example #7
class LoggingAgent(Agent):
    """ This agent stores actions, states, and rewards encountered during
        interaction with an environment in a ReinforcementDataSet (which is
        a variation of SequentialDataSet).
        The stored history can be used for learning and is erased by resetting
        the agent. It also makes sure that integrateObservation, getAction and
        giveReward are called in exactly that order.
    """

    logging = True

    lastobs = None
    lastaction = None
    lastreward = None


    def __init__(self, indim, outdim, **kwargs):
        self.setArgs(**kwargs)
        
        # store input and output dimension
        self.indim = indim
        self.outdim = outdim

        # create the history dataset
        self.history = ReinforcementDataSet(indim, outdim)


    def integrateObservation(self, obs):
        """Step 1: store the observation received in a temporary variable until action is called and
        reward is given. """
        self.lastobs = obs
        self.lastaction = None
        self.lastreward = None


    def getAction(self):
        """Step 2: store the action in a temporary variable until reward is given. """
        assert self.lastobs is not None
        assert self.lastaction is None
        assert self.lastreward is None

        # implement getAction in subclass and set self.lastaction


    def giveReward(self, r):
        """Step 3: store observation, action and reward in the history dataset. """
        # step 3: assume that state and action have been set
        assert self.lastobs is not None
        assert self.lastaction is not None
        assert self.lastreward is None

        self.lastreward = r

        # store state, action and reward in dataset if logging is enabled
        if self.logging:
            self.history.addSample(self.lastobs, self.lastaction, self.lastreward)


    def newEpisode(self):
        """ Indicate the beginning of a new episode in the training cycle. """
        if self.logging:
            self.history.newSequence()


    def reset(self):
        """ Clear the history of the agent. """
        self.lastobs = None
        self.lastaction = None
        self.lastreward = None

        self.history.clear()
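
Because newEpisode() starts a new sequence, per-episode statistics can be read straight off the history. A sketch of such post-hoc processing, assuming agent is a LoggingAgent subclass whose history has already been filled; the return-summing is illustrative only.

# one sequence in agent.history per logged episode
episode_returns = []
for seq in agent.history:
    episode_returns.append(sum(r[0] for _, _, r in seq))  # undiscounted return of the episode

print('episodes logged:', agent.history.getNumSequences())
print('return per episode:', episode_returns)
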
Example #8
class RWR(DirectSearchLearner):
    """ Reward-weighted regression.
    
    The algorithm is currently limited to discrete-action episodic tasks, subclasses of POMDPTasks.
    """
    
    # parameters
    batchSize = 20
    
    # feedback settings
    verbose = True
    greedyRuns = 20
    supervisedPlotting = False
    
    # settings for the supervised training
    learningRate = 0.005
    momentum = 0.9
    maxEpochs = 20
    validationProportion = 0.33
    continueEpochs = 2
    
    # parameters for the variation that uses a value function
    # TODO: split into 2 classes.
    valueLearningRate = None
    valueMomentum = None
    #valueTrainEpochs = 5
    resetAllWeights = False
    netweights = 0.01
    
    def __init__(self, net, task, valueNetwork=None, **args):
        self.net = net
        self.task = task
        self.setArgs(**args)
        if self.valueLearningRate is None:
            self.valueLearningRate = self.learningRate
        if self.valueMomentum is None:
            self.valueMomentum = self.momentum
        if self.supervisedPlotting:
            from pylab import ion
            ion() 
        
        # adaptive temperature:
        self.tau = 1.
        
        # prepare the datasets to be used
        self.weightedDs = ImportanceDataSet(self.task.outdim, self.task.indim)
        self.rawDs = ReinforcementDataSet(self.task.outdim, self.task.indim)
        self.valueDs = SequentialDataSet(self.task.outdim, 1)
        
        # prepare the supervised trainers
        self.bp = BackpropTrainer(self.net, self.weightedDs, self.learningRate,
                                  self.momentum, verbose=False,
                                  batchlearning=True)            
        
        # CHECKME: outsource
        self.vnet = valueNetwork
        if valueNetwork is not None:
            self.vbp = BackpropTrainer(self.vnet, self.valueDs, self.valueLearningRate,
                                       self.valueMomentum, verbose=self.verbose)
            
        # keep information:
        self.totalSteps = 0
        self.totalEpisodes = 0
            
    def shapingFunction(self, R):
        return exp(self.tau * R)        
    
    def updateTau(self, R, U):
        self.tau = sum(U) / dot((R - self.task.minReward), U)
        
    def reset(self):
        self.weightedDs.clear()
        self.valueDs.clear()
        self.rawDs.clear()
        self.bp.momentumvector *= 0.0
        if self.vnet is not None:
            self.vbp.momentumvector *= 0.0
            if self.resetAllWeights:
                self.vnet.params[:] = randn(len(self.vnet.params)) * self.netweights            
            
    def greedyEpisode(self):
        """ run one episode with greedy decisions, return the list of rewards recieved."""
        rewards = []
        self.task.reset()
        self.net.reset()
        while not self.task.isFinished():
            obs = self.task.getObservation()
            act = self.net.activate(obs)
            chosen = argmax(act)
            self.task.performAction(chosen)
            reward = self.task.getReward()
            rewards.append(reward)
        return rewards
            
    def learn(self, batches):
        self.greedyAvg = []
        self.rewardAvg = []
        self.lengthAvg = []
        self.initr0Avg = []
        for b in range(batches):
            if self.verbose:
                print
                print 'Batch', b + 1
            self.reset()
            self.learnOneBatch()
            self.totalEpisodes += self.batchSize
            
            # greedy measure (avg over some greedy runs)
            rws = 0.
            for dummy in range(self.greedyRuns):
                tmp = self.greedyEpisode()
                rws += (sum(tmp) / float(len(tmp)))
            self.greedyAvg.append(rws / self.greedyRuns)
            if self.verbose:
                print '::', round(rws / self.greedyRuns, 5), '::'
            
    def learnOneBatch(self):
        # collect a batch of runs as experience
        r0s = []
        lens = []
        avgReward = 0.
        for dummy in range(self.batchSize):
            self.rawDs.newSequence()
            self.valueDs.newSequence()
            self.task.reset()
            self.net.reset()
            acts, obss, rewards = [], [], []
            while not self.task.isFinished():
                obs = self.task.getObservation()
                act = self.net.activate(obs)
                chosen = drawIndex(act)
                self.task.performAction(chosen)
                reward = self.task.getReward()
                obss.append(obs)
                y = zeros(len(act))
                y[chosen] = 1
                acts.append(y)
                rewards.append(reward)
            avgReward += sum(rewards) / float(len(rewards))
            
            # compute the returns from the list of rewards
            current = 0        
            returns = []
            for r in reversed(rewards):
                current *= self.task.discount
                current += r
                returns.append(current)
            returns.reverse()
            for i in range(len(obss)):
                self.rawDs.addSample(obss[i], acts[i], returns[i])
                self.valueDs.addSample(obss[i], returns[i])
            r0s.append(returns[0])
            lens.append(len(returns))
            
        r0s = array(r0s)  
        self.totalSteps += sum(lens)
        avgLen = sum(lens) / float(self.batchSize)
        avgR0 = mean(r0s)
        avgReward /= self.batchSize
        if self.verbose:
            print '***', round(avgLen, 3), '***', '(avg init exp. return:', round(avgR0, 5), ')',
            print 'avg reward', round(avgReward, 5), '(tau:', round(self.tau, 3), ')'
            print lens        
        # storage:
        self.rewardAvg.append(avgReward)
        self.lengthAvg.append(avgLen)
        self.initr0Avg.append(avgR0)
        
        
#        if self.vnet == None:
#            # case 1: no value estimator:
            
        # prepare the dataset for training the acting network  
        shaped = self.shapingFunction(r0s)
        self.updateTau(r0s, shaped)
        shaped /= max(shaped)
        for i, seq in enumerate(self.rawDs):
            self.weightedDs.newSequence()
            for sample in seq:
                obs, act, dummy = sample
                self.weightedDs.addSample(obs, act, shaped[i])
                    
#        else:
#            # case 2: value estimator:
#            
#            
#            # train the value estimating network
#            if self.verbose: print 'Old value error:  ', self.vbp.testOnData()
#            self.vbp.trainEpochs(self.valueTrainEpochs)
#            if self.verbose: print 'New value error:  ', self.vbp.testOnData()
#            
#            # produce the values and analyze
#            rminusvs = []
#            sizes = []
#            for i, seq in enumerate(self.valueDs):
#                self.vnet.reset()
#                seq = list(seq)
#                for sample in seq:
#                    obs, ret = sample
#                    val = self.vnet.activate(obs)
#                    rminusvs.append(ret-val)
#                    sizes.append(len(seq))
#                    
#            rminusvs = array(rminusvs)
#            shapedRminusv = self.shapingFunction(rminusvs)
#            # CHECKME: here?
#            self.updateTau(rminusvs, shapedRminusv)
#            shapedRminusv /= array(sizes)
#            shapedRminusv /= max(shapedRminusv)
#            
#            # prepare the dataset for training the acting network    
#            rvindex = 0
#            for i, seq in enumerate(self.rawDs):
#                self.weightedDs.newSequence()
#                self.vnet.reset()
#                for sample in seq:
#                    obs, act, ret = sample
#                    self.weightedDs.addSample(obs, act, shapedRminusv[rvindex])
#                    rvindex += 1
                    
        # train the acting network                
        tmp1, tmp2 = self.bp.trainUntilConvergence(maxEpochs=self.maxEpochs,
                                                   validationProportion=self.validationProportion,
                                                   continueEpochs=self.continueEpochs,
                                                   verbose=self.verbose)
        if self.supervisedPlotting:
            from pylab import plot, legend, figure, clf, draw
            figure(1)
            clf()
            plot(tmp1, label='train')
            plot(tmp2, label='valid')
            legend()
            draw()  
            
        return avgLen, avgR0                        
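
The reversed loop in learnOneBatch computes ordinary discounted returns, G_t = r_t + discount * G_{t+1}. A standalone sketch with a tiny worked example (the function name and the sample numbers are illustrative):

def discounted_returns(rewards, discount):
    """Same backward recursion as the loop in learnOneBatch."""
    current, returns = 0.0, []
    for r in reversed(rewards):
        current = r + discount * current
        returns.append(current)
    returns.reverse()
    return returns

# rewards [1, 0, 2] with discount 0.5:
#   G2 = 2,  G1 = 0 + 0.5*2 = 1,  G0 = 1 + 0.5*1 = 1.5
assert discounted_returns([1, 0, 2], 0.5) == [1.5, 1.0, 2.0]
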
Example #9
class LoggingAgent(Agent):
    """ This agent stores actions, states, and rewards encountered during
        interaction with an environment in a ReinforcementDataSet (which is
        a variation of SequentialDataSet).
        The stored history can be used for learning and is erased by resetting
        the agent. It also makes sure that integrateObservation, getAction and
        giveReward are called in exactly that order.
    """

    logging = True

    lastobs = None
    lastaction = None
    lastreward = None


    def __init__(self, indim, outdim, **kwargs):
        self.setArgs(**kwargs)
        
        # store input and output dimension
        self.indim = indim
        self.outdim = outdim

        # create the history dataset
        self.history = ReinforcementDataSet(indim, outdim)


    def integrateObservation(self, obs):
        """Step 1: store the observation received in a temporary variable until action is called and
        reward is given. """
        self.lastobs = obs
        self.lastaction = None
        self.lastreward = None


    def getAction(self):
        """Step 2: store the action in a temporary variable until reward is given. """
        assert self.lastobs is not None
        assert self.lastaction is None
        assert self.lastreward is None

        # implement getAction in subclass and set self.lastaction


    def giveReward(self, r):
        """Step 3: store observation, action and reward in the history dataset. """
        # step 3: assume that state and action have been set
        assert self.lastobs is not None
        assert self.lastaction is not None
        assert self.lastreward is None

        self.lastreward = r
        
        # store state, action and reward in dataset if logging is enabled
        if self.logging:
            self.history.addSample(self.lastobs, self.lastaction, self.lastreward)


    def newEpisode(self):
        """ Indicate the beginning of a new episode in the training cycle. """
        if self.logging:
            self.history.newSequence()


    def reset(self):
        """ Clear the history of the agent. """
        self.lastobs = None
        self.lastaction = None
        self.lastreward = None

        self.history.clear()
        
    def _reset(self):
        self.lastobs = None
        self.lastaction = None
        self.lastreward = None
Example #10
class RWR(DirectSearchLearner):
    """ Reward-weighted regression.

    The algorithm is currently limited to discrete-action episodic tasks, subclasses of POMDPTasks.
    """

    # parameters
    batchSize = 20

    # feedback settings
    verbose = True
    greedyRuns = 20
    supervisedPlotting = False

    # settings for the supervised training
    learningRate = 0.005
    momentum = 0.9
    maxEpochs = 20
    validationProportion = 0.33
    continueEpochs = 2

    # parameters for the variation that uses a value function
    # TODO: split into 2 classes.
    valueLearningRate = None
    valueMomentum = None
    #valueTrainEpochs = 5
    resetAllWeights = False
    netweights = 0.01

    def __init__(self, net, task, valueNetwork=None, **args):
        self.net = net
        self.task = task
        self.setArgs(**args)
        if self.valueLearningRate is None:
            self.valueLearningRate = self.learningRate
        if self.valueMomentum is None:
            self.valueMomentum = self.momentum
        if self.supervisedPlotting:
            from pylab import ion
            ion()

        # adaptive temperature:
        self.tau = 1.

        # prepare the datasets to be used
        self.weightedDs = ImportanceDataSet(self.task.outdim, self.task.indim)
        self.rawDs = ReinforcementDataSet(self.task.outdim, self.task.indim)
        self.valueDs = SequentialDataSet(self.task.outdim, 1)

        # prepare the supervised trainers
        self.bp = BackpropTrainer(self.net,
                                  self.weightedDs,
                                  self.learningRate,
                                  self.momentum,
                                  verbose=False,
                                  batchlearning=True)

        # CHECKME: outsource
        self.vnet = valueNetwork
        if valueNetwork is not None:
            self.vbp = BackpropTrainer(self.vnet,
                                       self.valueDs,
                                       self.valueLearningRate,
                                       self.valueMomentum,
                                       verbose=self.verbose)

        # keep information:
        self.totalSteps = 0
        self.totalEpisodes = 0

    def shapingFunction(self, R):
        return exp(self.tau * R)

    def updateTau(self, R, U):
        self.tau = sum(U) / dot((R - self.task.minReward), U)

    def reset(self):
        self.weightedDs.clear()
        self.valueDs.clear()
        self.rawDs.clear()
        self.bp.momentumvector *= 0.0
        if self.vnet is not None:
            self.vbp.momentumvector *= 0.0
            if self.resetAllWeights:
                self.vnet.params[:] = randn(len(
                    self.vnet.params)) * self.netweights

    def greedyEpisode(self):
        """ run one episode with greedy decisions, return the list of rewards recieved."""
        rewards = []
        self.task.reset()
        self.net.reset()
        while not self.task.isFinished():
            obs = self.task.getObservation()
            act = self.net.activate(obs)
            chosen = argmax(act)
            self.task.performAction(chosen)
            reward = self.task.getReward()
            rewards.append(reward)
        return rewards

    def learn(self, batches):
        self.greedyAvg = []
        self.rewardAvg = []
        self.lengthAvg = []
        self.initr0Avg = []
        for b in range(batches):
            if self.verbose:
                print()
                print('Batch', b + 1)
            self.reset()
            self.learnOneBatch()
            self.totalEpisodes += self.batchSize

            # greedy measure (avg over some greedy runs)
            rws = 0.
            for dummy in range(self.greedyRuns):
                tmp = self.greedyEpisode()
                rws += (sum(tmp) / float(len(tmp)))
            self.greedyAvg.append(rws / self.greedyRuns)
            if self.verbose:
                print('::', round(rws / self.greedyRuns, 5), '::')

    def learnOneBatch(self):
        # collect a batch of runs as experience
        r0s = []
        lens = []
        avgReward = 0.
        for dummy in range(self.batchSize):
            self.rawDs.newSequence()
            self.valueDs.newSequence()
            self.task.reset()
            self.net.reset()
            acts, obss, rewards = [], [], []
            while not self.task.isFinished():
                obs = self.task.getObservation()
                act = self.net.activate(obs)
                chosen = drawIndex(act)
                self.task.performAction(chosen)
                reward = self.task.getReward()
                obss.append(obs)
                y = zeros(len(act))
                y[chosen] = 1
                acts.append(y)
                rewards.append(reward)
            avgReward += sum(rewards) / float(len(rewards))

            # compute the returns from the list of rewards
            current = 0
            returns = []
            for r in reversed(rewards):
                current *= self.task.discount
                current += r
                returns.append(current)
            returns.reverse()
            for i in range(len(obss)):
                self.rawDs.addSample(obss[i], acts[i], returns[i])
                self.valueDs.addSample(obss[i], returns[i])
            r0s.append(returns[0])
            lens.append(len(returns))

        r0s = array(r0s)
        self.totalSteps += sum(lens)
        avgLen = sum(lens) / float(self.batchSize)
        avgR0 = mean(r0s)
        avgReward /= self.batchSize
        if self.verbose:
            print(
                '***',
                round(avgLen, 3),
                '***',
                '(avg init exp. return:',
                round(avgR0, 5),
                ')',
            )
            print('avg reward', round(avgReward, 5), '(tau:',
                  round(self.tau, 3), ')')
            print(lens)
        # storage:
        self.rewardAvg.append(avgReward)
        self.lengthAvg.append(avgLen)
        self.initr0Avg.append(avgR0)

        #        if self.vnet == None:
        #            # case 1: no value estimator:

        # prepare the dataset for training the acting network
        shaped = self.shapingFunction(r0s)
        self.updateTau(r0s, shaped)
        shaped /= max(shaped)
        for i, seq in enumerate(self.rawDs):
            self.weightedDs.newSequence()
            for sample in seq:
                obs, act, dummy = sample
                self.weightedDs.addSample(obs, act, shaped[i])

#        else:
#            # case 2: value estimator:
#
#
#            # train the value estimating network
#            if self.verbose: print('Old value error:  ', self.vbp.testOnData())
#            self.vbp.trainEpochs(self.valueTrainEpochs)
#            if self.verbose: print('New value error:  ', self.vbp.testOnData())
#
#            # produce the values and analyze
#            rminusvs = []
#            sizes = []
#            for i, seq in enumerate(self.valueDs):
#                self.vnet.reset()
#                seq = list(seq)
#                for sample in seq:
#                    obs, ret = sample
#                    val = self.vnet.activate(obs)
#                    rminusvs.append(ret-val)
#                    sizes.append(len(seq))
#
#            rminusvs = array(rminusvs)
#            shapedRminusv = self.shapingFunction(rminusvs)
#            # CHECKME: here?
#            self.updateTau(rminusvs, shapedRminusv)
#            shapedRminusv /= array(sizes)
#            shapedRminusv /= max(shapedRminusv)
#
#            # prepare the dataset for training the acting network
#            rvindex = 0
#            for i, seq in enumerate(self.rawDs):
#                self.weightedDs.newSequence()
#                self.vnet.reset()
#                for sample in seq:
#                    obs, act, ret = sample
#                    self.weightedDs.addSample(obs, act, shapedRminusv[rvindex])
#                    rvindex += 1

        # train the acting network
        tmp1, tmp2 = self.bp.trainUntilConvergence(
            maxEpochs=self.maxEpochs,
            validationProportion=self.validationProportion,
            continueEpochs=self.continueEpochs,
            verbose=self.verbose)
        if self.supervisedPlotting:
            from pylab import plot, legend, figure, clf, draw
            figure(1)
            clf()
            plot(tmp1, label='train')
            plot(tmp2, label='valid')
            legend()
            draw()

        return avgLen, avgR0
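
The reward shaping used by RWR above can be followed with a few numbers: batch returns are exponentiated with the adaptive temperature, the temperature is re-estimated from the shaped values, and the weights are normalised before being attached to every sample of the corresponding episode. A numeric sketch; min_reward = 0 and the returns are made-up values.

from numpy import array, dot, exp

tau = 1.0                      # initial temperature, as in __init__
min_reward = 0.0               # stands in for self.task.minReward
r0s = array([0.5, 1.0, 2.0])   # illustrative initial returns of one batch

shaped = exp(tau * r0s)                              # shapingFunction(r0s)
tau = shaped.sum() / dot(r0s - min_reward, shaped)   # updateTau(r0s, shaped)
weights = shaped / shaped.max()                      # in (0, 1]; best episode gets weight 1
# every (obs, act) pair of episode i is then added to weightedDs with importance weights[i]
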
Example #11
class TDLearnerTestCase(unittest.TestCase):
    def setUp(self):
        self.theta = [0.4, 1.1]
        self.policy = BoltzmanPolicy(4, 2, self.theta)
        self.module = PolicyFeatureModule(self.policy, 'policywrapper')

        self.dataset = ReinforcementDataSet(8, 1)
        feature1 = scipy.array([
            (0.6, 0.2),
            (0.3, 0.6),
            (0.4, 0.01),
            (0.5, -0.2)
        ])
        #  feature2 = scipy.array([
        #      (0.3, 0.6),
        #      (0.6, 0.2),
        #      (50, -20),
        #      (0.4, 0.01),
        #  ])
        #  feature3 = scipy.array([
        #      (0.1, 0.1),
        #      (0.2, 0.2),
        #      (0.3, -0.3),
        #      (0.4, 0.4),
        #  ])

        self.dataset.addSample(feature1.reshape(-1), 0, 0)
        self.dataset.addSample(feature1.reshape(-1), 1, 1)
        self.dataset.addSample(feature1.reshape(-1), 2, 1.5)
        self.dataset.addSample(feature1.reshape(-1), 3, 0.5)

    # See https://goo.gl/7VMeDS for the spreadsheet that checks the math.
    # Note that actor is disabled in this test.
    def testLearnOnDataSet(self):
        learner = MockTDLearnerForTest(module=self.module,
                                       cssinitial=1,
                                       cssdecay=1, # css means critic step size
                                       assinitial=1,
                                       assdecay=1, # ass means actor step size
                                       rdecay=1, # reward decay weight
                                       maxcriticnorm=100, # maximum critic norm
                                       tracestepsize=0.9, # trace stepsize
                                       parambound = None # bound for the parameters
                                       )

        learner.learnOnDataSet(self.dataset)
        assert_array_almost_equal([0.7610084396], learner.alpha)
        assert_array_almost_equal([0.7346593816], learner.d)
        assert_array_almost_equal([-0.04681298685,
                                   0.05935480268,
                                   -0.006860205142,
                                   0.01013414464,
                                   -0.04480498121], learner.r)
        assert_array_almost_equal([-0.0952710795,
                                   -0.2401405293,
                                   -0.0374024173,
                                   0.0552522117,
                                   -0.2442805382], learner.z)
    def testActor(self):
        learner = TDLearner(module=self.module,
                            cssinitial=1,
                            cssdecay=1, # css means critic step size
                            assinitial=1,
                            assdecay=1, # ass means actor step size
                            rdecay=1, # reward decay weight
                            maxcriticnorm=100, # maximum critic norm
                            tracestepsize=0.9, # trace stepsize
                            parambound = None # bound for the parameters
                            )
        learner.r = scipy.array([1, 1, 1, 1, 1], dtype=float)
        lastfeature = scipy.array([1, 2, 3, 4, 5])
        learner.actor([], [], lastfeature)
        # the stateActionValue = 1*1 + 1*2 + 1*3 + 1*4 + ... + 1*5 = 15.
        # the initial theta is [0.4, 1.1], the update is [1, 2] * 15.
        assert_array_almost_equal([15.4, 31.1], learner.module.theta)
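
The expected theta in testActor can be reproduced by hand. A short sketch of the arithmetic described in the comments above; the feature gradient [1, 2] and the unit step size are taken from the comment, not from TDLearner's internals.

import scipy

r = scipy.array([1.0, 1.0, 1.0, 1.0, 1.0])      # critic weights set in the test
lastfeature = scipy.array([1, 2, 3, 4, 5])
state_action_value = scipy.dot(r, lastfeature)  # 1*1 + 1*2 + 1*3 + 1*4 + 1*5 = 15

theta = scipy.array([0.4, 1.1])                 # initial policy parameters
theta = theta + scipy.array([1.0, 2.0]) * state_action_value
# -> [15.4, 31.1], matching the assert_array_almost_equal in testActor
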
Example #12
 def __init__(self):
     Q.__init__(self, const.ALPHA, const.GAMMA)
     self.explorer = FeasibleEpsilonGreedyExplorer(const.EPSILON, const.DECAY)
     self.dataset2 = ReinforcementDataSet(1, 1)