# Assumed imports (PyBrain + NumPy); adjust to the surrounding package layout.
from numpy import exp, zeros, dot, argmax, mean, array
from numpy.random import randn

from pybrain.datasets import ImportanceDataSet, SequentialDataSet, ReinforcementDataSet
from pybrain.supervised.trainers import BackpropTrainer
from pybrain.utilities import drawIndex
from pybrain.rl.learners.directsearch.directsearch import DirectSearchLearner


class RWR(DirectSearchLearner):
    """ Reward-weighted regression.

    The algorithm is currently limited to discrete-action episodic tasks,
    subclasses of POMDPTasks.
    """

    # parameters
    batchSize = 20

    # feedback settings
    verbose = True
    greedyRuns = 20
    supervisedPlotting = False

    # settings for the supervised training
    learningRate = 0.005
    momentum = 0.9
    maxEpochs = 20
    validationProportion = 0.33
    continueEpochs = 2

    # parameters for the variation that uses a value function
    # TODO: split into 2 classes.
    valueLearningRate = None
    valueMomentum = None
    #valueTrainEpochs = 5
    resetAllWeights = False
    netweights = 0.01

    def __init__(self, net, task, valueNetwork=None, **args):
        self.net = net
        self.task = task
        self.setArgs(**args)
        if self.valueLearningRate is None:
            self.valueLearningRate = self.learningRate
        if self.valueMomentum is None:
            self.valueMomentum = self.momentum
        if self.supervisedPlotting:
            from pylab import ion
            ion()

        # adaptive temperature:
        self.tau = 1.

        # prepare the datasets to be used
        self.weightedDs = ImportanceDataSet(self.task.outdim, self.task.indim)
        self.rawDs = ReinforcementDataSet(self.task.outdim, self.task.indim)
        self.valueDs = SequentialDataSet(self.task.outdim, 1)

        # prepare the supervised trainers
        self.bp = BackpropTrainer(self.net, self.weightedDs, self.learningRate,
                                  self.momentum, verbose=False,
                                  batchlearning=True)

        # CHECKME: outsource
        self.vnet = valueNetwork
        if valueNetwork is not None:
            self.vbp = BackpropTrainer(self.vnet, self.valueDs,
                                       self.valueLearningRate,
                                       self.valueMomentum,
                                       verbose=self.verbose)

        # keep information:
        self.totalSteps = 0
        self.totalEpisodes = 0

    def shapingFunction(self, R):
        return exp(self.tau * R)

    def updateTau(self, R, U):
        self.tau = sum(U) / dot((R - self.task.minReward), U)

    def reset(self):
        self.weightedDs.clear()
        self.valueDs.clear()
        self.rawDs.clear()
        self.bp.momentumvector *= 0.0
        if self.vnet is not None:
            self.vbp.momentumvector *= 0.0
            if self.resetAllWeights:
                self.vnet.params[:] = randn(len(self.vnet.params)) * self.netweights

    def greedyEpisode(self):
        """ Run one episode with greedy decisions, return the list of
        rewards received. """
        rewards = []
        self.task.reset()
        self.net.reset()
        while not self.task.isFinished():
            obs = self.task.getObservation()
            act = self.net.activate(obs)
            chosen = argmax(act)
            self.task.performAction(chosen)
            reward = self.task.getReward()
            rewards.append(reward)
        return rewards

    def learn(self, batches):
        self.greedyAvg = []
        self.rewardAvg = []
        self.lengthAvg = []
        self.initr0Avg = []
        for b in range(batches):
            if self.verbose:
                print()
                print('Batch', b + 1)
            self.reset()
            self.learnOneBatch()
            self.totalEpisodes += self.batchSize

            # greedy measure (avg over some greedy runs)
            rws = 0.
            for dummy in range(self.greedyRuns):
                tmp = self.greedyEpisode()
                rws += (sum(tmp) / float(len(tmp)))
            self.greedyAvg.append(rws / self.greedyRuns)
            if self.verbose:
                print('::', round(rws / self.greedyRuns, 5), '::')

    def learnOneBatch(self):
        # collect a batch of runs as experience
        r0s = []
        lens = []
        avgReward = 0.
        for dummy in range(self.batchSize):
            self.rawDs.newSequence()
            self.valueDs.newSequence()
            self.task.reset()
            self.net.reset()
            acts, obss, rewards = [], [], []
            while not self.task.isFinished():
                obs = self.task.getObservation()
                act = self.net.activate(obs)
                chosen = drawIndex(act)
                self.task.performAction(chosen)
                reward = self.task.getReward()
                obss.append(obs)
                y = zeros(len(act))
                y[chosen] = 1
                acts.append(y)
                rewards.append(reward)
            avgReward += sum(rewards) / float(len(rewards))

            # compute the returns from the list of rewards
            current = 0
            returns = []
            for r in reversed(rewards):
                current *= self.task.discount
                current += r
                returns.append(current)
            returns.reverse()
            for i in range(len(obss)):
                self.rawDs.addSample(obss[i], acts[i], returns[i])
                self.valueDs.addSample(obss[i], returns[i])
            r0s.append(returns[0])
            lens.append(len(returns))

        r0s = array(r0s)
        self.totalSteps += sum(lens)
        avgLen = sum(lens) / float(self.batchSize)
        avgR0 = mean(r0s)
        avgReward /= self.batchSize
        if self.verbose:
            print('***', round(avgLen, 3), '***',
                  '(avg init exp. return:', round(avgR0, 5), ')', end=' ')
            print('avg reward', round(avgReward, 5),
                  '(tau:', round(self.tau, 3), ')')
            print(lens)

        # storage:
        self.rewardAvg.append(avgReward)
        self.lengthAvg.append(avgLen)
        self.initr0Avg.append(avgR0)

        # if self.vnet == None:
        #     # case 1: no value estimator:

        # prepare the dataset for training the acting network
        shaped = self.shapingFunction(r0s)
        self.updateTau(r0s, shaped)
        shaped /= max(shaped)
        for i, seq in enumerate(self.rawDs):
            self.weightedDs.newSequence()
            for sample in seq:
                obs, act, dummy = sample
                self.weightedDs.addSample(obs, act, shaped[i])

        # else:
        #     # case 2: value estimator:
        #
        #     # train the value estimating network
        #     if self.verbose: print('Old value error: ', self.vbp.testOnData())
        #     self.vbp.trainEpochs(self.valueTrainEpochs)
        #     if self.verbose: print('New value error: ', self.vbp.testOnData())
        #
        #     # produce the values and analyze
        #     rminusvs = []
        #     sizes = []
        #     for i, seq in enumerate(self.valueDs):
        #         self.vnet.reset()
        #         seq = list(seq)
        #         for sample in seq:
        #             obs, ret = sample
        #             val = self.vnet.activate(obs)
        #             rminusvs.append(ret - val)
        #         sizes.append(len(seq))
        #
        #     rminusvs = array(rminusvs)
        #     shapedRminusv = self.shapingFunction(rminusvs)
        #     # CHECKME: here?
        #     self.updateTau(rminusvs, shapedRminusv)
        #     shapedRminusv /= array(sizes)
        #     shapedRminusv /= max(shapedRminusv)
        #
        #     # prepare the dataset for training the acting network
        #     rvindex = 0
        #     for i, seq in enumerate(self.rawDs):
        #         self.weightedDs.newSequence()
        #         self.vnet.reset()
        #         for sample in seq:
        #             obs, act, ret = sample
        #             self.weightedDs.addSample(obs, act, shapedRminusv[rvindex])
        #             rvindex += 1

        # train the acting network
        tmp1, tmp2 = self.bp.trainUntilConvergence(
            maxEpochs=self.maxEpochs,
            validationProportion=self.validationProportion,
            continueEpochs=self.continueEpochs,
            verbose=self.verbose)
        if self.supervisedPlotting:
            from pylab import plot, legend, figure, clf, draw
            figure(1)
            clf()
            plot(tmp1, label='train')
            plot(tmp2, label='valid')
            legend()
            draw()

        return avgLen, avgR0
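
# Hedged usage sketch (not part of the original module): assumes a PyBrain
# softmax policy network and a discrete-action episodic task instance; the
# `task` argument is a placeholder for any POMDPTask subclass the caller
# supplies, and the hidden-layer size is arbitrary.
def _rwr_demo(task, hidden=10, batches=10):
    from pybrain.tools.shortcuts import buildNetwork
    from pybrain.structure.modules import SoftmaxLayer
    # the policy net maps observations (task.outdim) to action probabilities (task.indim)
    net = buildNetwork(task.outdim, hidden, task.indim, outclass=SoftmaxLayer)
    learner = RWR(net, task)
    learner.learn(batches)
    # average greedy return per batch, as collected by learn()
    return learner.greedyAvg
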
# Assumed imports (PyBrain + NumPy); adjust to the surrounding package layout.
from numpy import floor, array, reshape, mgrid, c_, r_

from pybrain.rl.experiments import EpisodicExperiment
from pybrain.datasets import SequentialDataSet
from pybrain.auxiliary import GaussianProcess


class ModelExperiment(EpisodicExperiment):
    """ An experiment that learns a model of its (action, state) pair
    with a Gaussian Process for each dimension of the state.
    """

    def __init__(self, task, agent):
        EpisodicExperiment.__init__(self, task, agent)

        # create model and training set (action dimension + 1 for time)
        self.modelds = SequentialDataSet(self.task.indim + 1, 1)
        self.model = [GaussianProcess(indim=self.modelds.getDimension('input'),
                                      start=(-10, -10, 0), stop=(10, 10, 300),
                                      step=(5, 5, 100))
                      for _ in range(self.task.outdim)]

        # change hyper parameters for all gps
        for m in self.model:
            m.hyper = (20, 2.0, 0.01)
            # m.autonoise = True

    def doEpisodes(self, number=1):
        """ Returns the rewards of each step as a list and learns the model
        for each rollout. """
        all_rewards = []

        for dummy in range(number):
            self.stepid = 0
            rewards = []
            # the agent is informed of the start of the episode
            self.agent.newEpisode()
            self.task.reset()
            while not self.task.isFinished():
                r = self._oneInteraction()
                rewards.append(r)
            all_rewards.append(rewards)

            # clear model dataset (to retrain it)
            self.modelds.clear()
            print("retrain gp")
            [m.trainOnDataset(self.modelds) for m in self.model]

            for i in range(self.agent.history.getNumSequences()):
                seq = self.agent.history.getSequence(i)
                state, action, dummy, dummy = seq

                l = len(action)
                index = [int(floor(x)) for x in mgrid[0:l - 1:5j]]
                action = action[index, :]
                inp = c_[action, array([index]).T]
                self.modelds.setField('input', inp)

                # add training data to all gaussian processes
                for i, m in enumerate(self.model):
                    tar = state[index, i]
                    self.modelds.setField('target', array([tar]).T)
                    m.addDataset(self.modelds)

            # print("updating GPs...")
            # [m._calculate() for m in self.model]
            # print("done.")

        return all_rewards

    def _oneInteraction(self):
        self.stepid += 1
        obs = self.task.getObservation()
        self.agent.integrateObservation(obs)
        action = self.agent.getAction()
        self.task.performAction(action)

        # predict with model
        #modelobs = array([0, 0, 0])

        # time dimension
        # if self.stepid < self.model[0].stop:
        #     steps = self.model[0].step
        #
        #     # linear interpolation between two adjacent gp states
        #     try:
        #         modelobs = [(1.0 - float(self.stepid % steps) / steps) * self.model[i].pred_mean[int(floor(float(self.stepid) / steps))] +
        #                     (float(self.stepid % steps) / steps) * self.model[i].pred_mean[int(ceil(float(self.stepid) / steps))]
        #                     for i in range(self.task.outdim)]
        #     except IndexError:
        action = r_[action, array([self.stepid])]
        action = reshape(action, (1, 3))
        modelobs = [self.model[i].testOnArray(action) for i in range(self.task.outdim)]

        # tell environment about model obs
        self.task.env.model = [modelobs]

        reward = self.task.getReward()
        self.agent.giveReward(reward)
        return reward
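
# Hedged usage sketch (not part of the original module): assumes an episodic
# task whose action is 2-dimensional (the GP input is action + time, matching
# the (-10, -10, 0)..(10, 10, 300) grid above) and a compatible learning
# agent; both arguments are placeholders supplied by the caller.
def _model_experiment_demo(task, agent, episodes=5):
    experiment = ModelExperiment(task, agent)
    # list of per-step reward lists, one entry per episode
    all_rewards = experiment.doEpisodes(episodes)
    # the per-state-dimension Gaussian Processes are kept in experiment.model
    return all_rewards, experiment.model
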