def getObservation(self):
    (theta, thetad, omega, omegad, omegadd,
     xf, yf, xb, yb, psi) = self.env.getSensors()
    # TODO not calling superclass to do normalization, etc.
    top_half = one_to_n(self.getBin(theta, thetad, omega, omegad, omegadd),
                        self.outdim - 20)
    bot_half = one_to_n(np.digitize([psi], self.psi_bounds)[0] - 1, 20)
    return np.concatenate((top_half, bot_half))
def performAction(self, action):
    """Incoming action is an int between 0 and 8. The action we provide to
    the environment consists of a torque T in {-2 N, 0, 2 N}, and a
    displacement d in {-0.02 m, 0, 0.02 m}.
    """
    self.t += 1
    self.action_history += one_to_n(action[0], self.nactions)
    # Map the action integer to a torque and displacement.
    assert round(action[0]) == action[0]
    if self.only_steer:
        T = 2 * (action[0] / 4.0 - 1.0)
        d = 0.
    else:
        # -1 for action in {0, 1, 2}, 0 for action in {3, 4, 5}, 1 for
        # action in {6, 7, 8}
        torque_selector = np.floor(action[0] / 3.0) - 1.0
        T = 2 * torque_selector
        # Random number in [-1, 1]:
        p = 2.0 * np.random.rand() - 1.0
        # -1 for action in {0, 3, 6}, 0 for action in {1, 4, 7}, 1 for
        # action in {2, 5, 8}
        disp_selector = action[0] % 3 - 1.0
        d = 0.02 * disp_selector + self._butt_disturbance_amplitude * p
    super(BalanceTask, self).performAction([T, d])
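# A minimal sketch (not part of the original task code) of the 9-action grid
# assumed by performAction above: an integer a in 0..8 factors into a torque
# index floor(a / 3) and a displacement index a % 3. The helper name is
# hypothetical and the random disturbance is omitted.
import numpy as np

def action_to_torque_displacement(a):
    torque_selector = np.floor(a / 3.0) - 1.0   # -1, 0, or 1
    disp_selector = a % 3 - 1.0                 # -1, 0, or 1
    return 2.0 * torque_selector, 0.02 * disp_selector

for a in range(9):
    print(a, action_to_torque_displacement(a))
# 0 -> (-2.0, -0.02), 4 -> (0.0, 0.0), 8 -> (2.0, 0.02), etc.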
def performAction(self, action):
    """Adds the option to use an action space of size 5: either apply a
    torque (+/- 2.0 N), displace the butt (+/- 0.02 m), or do nothing
    (a la Lagoudakis).
    """
    if self.five_actions:
        p = 2.0 * np.random.rand() - 1.0
        T = 0
        d = self._butt_disturbance_amplitude * p
        self.t += 1
        self.action_history += one_to_n(action[0], self.nactions)
        # Map the action integer to a torque and displacement.
        assert round(action[0]) == action[0]
        if action[0] == 0:
            T = -2
        elif action[0] == 1:
            T = 2
        elif action[0] == 2:
            d -= 0.02
        elif action[0] == 3:
            d += 0.02
        super(BalanceTask, self).performAction([T, d])
    else:
        BalanceTask.performAction(self, action)
def learn(self):
    # convert reinforcement dataset to NFQ supervised dataset
    supervised = SupervisedDataSet(self.module.network.indim, 1)
    for seq in self.dataset:
        lastexperience = None
        for state, action, reward in seq:
            if not lastexperience:
                # delay each experience in sequence by one
                lastexperience = (state, action, reward)
                continue
            # use experience from last timestep to do Q update
            (state_, action_, reward_) = lastexperience
            inp = r_[state_, one_to_n(action_[0], self.module.numActions)]
            tgt = reward_ + self.gamma * max(self.module.getActionValues(state))
            supervised.addSample(inp, tgt)
            # update last experience with current one
            lastexperience = (state, action, reward)
    # train module with backprop/rprop on dataset
    trainer = RPropMinusTrainer(self.module.network, dataset=supervised,
                                batchlearning=True, verbose=False)
    # alternative: backprop, which was not as stable as rprop
    # trainer = BackpropTrainer(self.module.network, dataset=supervised,
    #                           learningrate=0.01, batchlearning=True,
    #                           verbose=True)
    trainer.trainEpochs(1)
def toClassificationDataset(codedSampleSet):
    classifiedSampleSet = []
    # Calculate the unique classes
    classes = []
    for sample in codedSampleSet:
        classifier = getClassifier(sample)
        if classifier not in classes:
            classes.append(classifier)
    classes.sort()
    # Now that we have all the classes, we process the outputs
    for sample in codedSampleSet:
        classifier = getClassifier(sample)
        classifiedSample = one_to_n(classes.index(classifier), len(classes))
        classifiedSampleSet.append(classifiedSample)
    # Build the dataset
    sampleSize = len(codedSampleSet[0])
    classifiedSampleSize = len(classifiedSampleSet[0])
    dataset = ClassificationDataSet(sampleSize, classifiedSampleSize)
    for i in range(len(classifiedSampleSet)):
        dataset.addSample(codedSampleSet[i], classifiedSampleSet[i])
    return dataset, classes
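# A hypothetical usage sketch of toClassificationDataset above, assuming it
# and a stand-in getClassifier live in the same module. The stand-in reads
# the class code off the end of each coded sample; the real getClassifier in
# the source project may differ. The pybrain imports are the ones the
# function above relies on.
from pybrain.datasets import ClassificationDataSet
from pybrain.utilities import one_to_n

def getClassifier(sample):
    # Assumed behaviour: the last element of a coded sample is its class.
    return sample[-1]

codedSampleSet = [
    [0.1, 0.9, 0.0],
    [0.8, 0.2, 1.0],
    [0.4, 0.5, 0.0],
]
dataset, classes = toClassificationDataset(codedSampleSet)
print(classes)        # [0.0, 1.0]
print(len(dataset))   # 3 samples, each target one_to_n encoded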
def learn(self):
    # convert reinforcement dataset to NFQ supervised dataset
    supervised = SupervisedDataSet(self.module.network.indim, 1)
    for seq in self.dataset:
        lastexperience = None
        for state, action, reward in seq:
            if not lastexperience:
                # delay each experience in sequence by one
                lastexperience = (state, action, reward)
                continue
            # use experience from last timestep to do Q update
            (state_, action_, reward_) = lastexperience
            Q = self.module.getValue(state_, action_[0])
            inp = r_[state_, one_to_n(action_[0], self.module.numActions)]
            tgt = Q + 0.5 * (reward_ + self.gamma * max(
                self.module.getActionValues(state)) - Q)
            supervised.addSample(inp, tgt)
            # update last experience with current one
            lastexperience = (state, action, reward)
    # train module with backprop/rprop on dataset
    trainer = RPropMinusTrainer(self.module.network, dataset=supervised,
                                batchlearning=True, verbose=False)
    trainer.trainUntilConvergence(maxEpochs=self.maxEpochs)
def makeGreedy1D(valNet, polNet, policyEvalStates, numAct, stepSize):
    from pybrain.datasets import SupervisedDataSet
    from pybrain.utilities import one_to_n
    supervised = SupervisedDataSet(polNet.indim, numAct)  # numInput, numOutputs
    # Try all the actions and see which has the best value
    for state in policyEvalStates:
        vBest = -100000
        for action in range(numAct):
            nextState = [ep.updateDist(state, stepSize, numAct, action)]
            vNext = valNet.activate(nextState)
            if vNext > vBest:
                actBest = action
                vBest = vNext
        supervised.addSample(state, one_to_n(actBest, numAct))
    # Print supervised training set
    # print(supervised)
    # input()
    # Train neural network
    from pybrain.supervised.trainers.rprop import RPropMinusTrainer
    trainer = RPropMinusTrainer(polNet, dataset=supervised, verbose=False)
    trainer.trainUntilConvergence(maxEpochs=50)
    # I'm OK with some interpolation here. It's the values we need to be
    # exact on.
    return polNet
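# makeGreedy1D above is a greedy policy-extraction step: for each evaluation
# state s it picks a* = argmax_a V(f(s, a)), where f is the one-step
# transition given by ep.updateDist, and then trains the policy network to
# output the one_to_n encoding of a* for s.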
def learn(self):
    # convert reinforcement dataset to NFQ supervised dataset
    supervised = SupervisedDataSet(self.module.network.indim, 1)
    for seq in self.dataset:
        lastexperience = None
        for state, action, reward in seq:
            if not lastexperience:
                # delay each experience in sequence by one
                lastexperience = (state, action, reward)
                continue
            # use experience from last timestep to do Q update
            (state_, action_, reward_) = lastexperience
            Q = self.module.getValue(state_, int(action_[0]))
            inp = r_[state_, one_to_n(int(action_[0]), self.module.numActions)]
            # input = r_[state_, action_]
            tgt = Q + self.alpha * (reward_ + self.gamma * max(
                self.module.getActionValues(state)) - Q)
            supervised.addSample(inp, tgt)
            # update last experience with current one
            lastexperience = (state, action, reward)
    # train module with backprop/rprop on dataset
    trainer = RPropMinusTrainer(self.module.network, dataset=supervised,
                                batchlearning=True, verbose=True)
    trainer.trainUntilConvergence(maxEpochs=self.maxEpochs)
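# The learn() variants above differ only in the regression target handed to
# the trainer (Q is the current estimate Q(s, a), r the reward, gamma the
# discount, s' the next state):
#   pure NFQ target:        tgt = r + gamma * max_a Q(s', a)
#   fixed-step soft target: tgt = Q + 0.5   * (r + gamma * max_a Q(s', a) - Q)
#   alpha-step soft target: tgt = Q + alpha * (r + gamma * max_a Q(s', a) - Q)
# With a step size of 1 the soft targets reduce to the pure NFQ target.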
def _qValues(self, state):
    """ Return vector of q-values for all actions, given the state(-features). """
    values = np.array([
        self.linQ.activate(r_[state, one_to_n(i, self.num_actions)])
        for i in range(self.num_actions)])
    return values.flatten()
def getObservation(self):
    (theta, thetad, omega, omegad, omegadd,
     xf, yf, xb, yb, psi) = self.env.getSensors()
    print(self.getBin(theta, thetad, omega, omegad, omegadd), self.outdim)
    state = one_to_n(self.getBin(theta, thetad, omega, omegad, omegadd),
                     self.outdim)
    print(state[self.getBin(theta, thetad, omega, omegad, omegadd)])
    return state
def getObservation(self):
    (theta, thetad, omega, omegad, omegadd,
     xf, yf, xb, yb, psi, psig) = self.env.getSensors()
    # TODO not calling superclass to do normalization, etc.
    state = one_to_n(self.getBin(theta, thetad, omega, omegad, omegadd),
                     self.outdim)
    self.bin_count += state
    return state
def getActionValues(self, state):
    """ Run forward activation for each of the actions and return all values. """
    values = array([
        self.network.activate(r_[state, one_to_n(i, self.numActions)])
        for i in range(self.numActions)])
    return values
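# A minimal sketch of the input layout assumed by getActionValues and
# getValue in these snippets: the network input is the state features
# concatenated with a one_to_n (one-hot) encoding of the action. The numbers
# below are illustrative only.
from numpy import r_
from pybrain.utilities import one_to_n

state = [0.2, -0.1, 0.05]   # assume 3 state features
numActions = 4
action = 2
net_input = r_[state, one_to_n(action, numActions)]
print(net_input)            # [ 0.2  -0.1   0.05  0.    0.    1.    0.  ]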
def _updateEtraces(self, state, action, responsibility=1.):
    self._etraces *= self.rewardDiscount * self._lambda * responsibility
    # This assumes that state is an identity vector (like, from one_to_n).
    self._etraces[action] = clip(self._etraces[action] + state, -np.inf, 1.)
    # Set the trace for all other actions in this state to 0:
    action_bit = one_to_n(action, self.num_actions)
    for argstate in argwhere(state == 1):
        self._etraces[argwhere(action_bit != 1), argstate] = 0.
def _updateWeights(self, state, action, reward, next_state):
    """ state and next_state are vectors, action is an integer. """
    # update Q-value function approximator
    target = reward + self.rewardDiscount * max(self._qValues(next_state))
    inp = r_[asarray(state), one_to_n(action, self.num_actions)]
    self.trainer4LinQ = BackpropTrainer(self.linQ,
                                        weightdecay=self.weightdecay)
    ds = SupervisedDataSet(self.num_features + self.num_actions, 1)
    ds.addSample(inp, target)
    self.trainer4LinQ.trainOnDataset(ds)
    # update estimate of average policy
    self.averagePolicy.append(copy.deepcopy(self.linPolicy))
    if len(self.averagePolicy) > self.maxNumberofAverage:
        self.averagePolicy.pop(np.random.randint(len(self.averagePolicy)))
    # update policy function approximator
    delta = None
    # accumulate (rather than overwrite) the expected value of each policy
    cumRewardOfCurrentPolicy = 0.0
    values = self._qValues(state)
    pi = self._pi(state)
    for elem_action in range(self.num_actions):
        cumRewardOfCurrentPolicy += pi[elem_action] * values[elem_action]
    cumRewardOfAveragePolicy = 0.0
    api = self._piAvr(state)
    for elem_action in range(self.num_actions):
        cumRewardOfAveragePolicy += api[elem_action] * values[elem_action]
    if cumRewardOfCurrentPolicy > cumRewardOfAveragePolicy:
        delta = self.deltaW
    else:
        delta = self.deltaL
    # Update policy
    bestAction = r_argmax(self._qValues(state))
    target = one_to_n(bestAction, self.num_actions)
    inp = r_[asarray(state)]
    ds = SupervisedDataSet(self.num_features, self.num_actions)
    ds.addSample(inp, target)
    self.trainer4LinPolicy = BackpropTrainer(self.linPolicy,
                                             learningrate=delta,
                                             weightdecay=self.weightdecay)
    self.trainer4LinPolicy.setData(ds)
    self.trainer4LinPolicy.trainEpochs(epochs=self.trainingEpochPerUpdateWight)
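# The deltaW/deltaL branch above looks like the WoLF ("win or learn fast")
# rule: when the current policy's expected value under Q exceeds that of the
# averaged policy, the smaller "winning" step size deltaW is used; otherwise
# the larger "losing" step size deltaL is used.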
def getActionValues(self, state):
    # valid_moves = get_moves(state)
    # valid_moves = range(self.numActions)
    valid_moves = self.get_valid_moves()
    values = array([
        self.network.activate(r_[state, one_to_n(i, self.numActions)])
        if i in valid_moves else np.NINF
        for i in range(self.numActions)])
    return values
def _EncodeStateAndJointActionIntoInputVector(self, state, jointAct):
    index = int(np.dot(self.w4ActIndexing, jointAct))
    if index in self.actionVecDic:
        return np.r_[state, self.actionVecDic[index]]
    else:
        iVector = np.array([])
        for iAgent in range(len(jointAct)):
            iVector = np.r_[iVector,
                            one_to_n(jointAct[iAgent],
                                     self.num_actions[iAgent])]
        self.actionVecDic[index] = iVector
        return np.r_[state, self.actionVecDic[index]]
def nfq_action_value(network_fname, state=[0, 0, 0, 0, 0]):
    # TODO generalize away from 9 action values. Ask the network how many
    # discrete action values there are.
    n_actions = 9
    network = NetworkReader.readFrom(network_fname)
    actionvalues = np.empty(n_actions)
    for i_action in range(n_actions):
        network_input = r_[state, one_to_n(i_action, n_actions)]
        actionvalues[i_action] = network.activate(network_input)
    return actionvalues
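# A hypothetical invocation of nfq_action_value above; the network file name
# is illustrative, and the all-zero state is the upright, centered bicycle.
vals = nfq_action_value('nfq_balance_network.xml', state=[0, 0, 0, 0, 0])
print(vals.argmax())   # index of the greedy action among the 9 discrete actions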
def _updateEtraces(self, state, action, responsibility=1.):
    self._etraces *= self.rewardDiscount * self._lambda * responsibility
    # TODO I think this assumes that state is an identity vector (like,
    # from one_to_n).
    self._etraces[action] = clip(self._etraces[action] + state, -np.inf, 1.)
    # Set the trace for all other actions in this state to 0:
    action_bit = one_to_n(action, self.num_actions)
    # Changed this line to allow for multiple (state == 1) occurrences.
    for argstate in argwhere(state == 1):
        self._etraces[argwhere(action_bit != 1), argstate] = 0.
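# In the update above the trace is clipped at 1 rather than accumulated, and
# the traces of all other actions in the visited state are zeroed, i.e. a
# replacing-traces scheme (in the spirit of Singh & Sutton's replacing
# eligibility traces) adapted to one_to_n state features.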
def _updateWeights(self, state, action, reward, next_state):
    """ state and next_state are vectors, action is an integer. """
    # update Q-value function approximator
    target = reward + self.rewardDiscount * max(self._qValues(next_state))
    inp = r_[asarray(state), one_to_n(action, self.num_actions)]
    self.trainer4LinQ = BackpropTrainer(self.linQ,
                                        weightdecay=self.weightdecay)
    ds = SupervisedDataSet(self.num_features + self.num_actions, 1)
    ds.addSample(inp, target)
    self.trainer4LinQ.trainOnDataset(ds)
    # Update policy
    bestAction = r_argmax(self._qValues(state))
    target = one_to_n(bestAction, self.num_actions)
    inp = r_[asarray(state)]
    ds = SupervisedDataSet(self.num_features, self.num_actions)
    ds.addSample(inp, target)
    self.trainer4LinPolicy = BackpropTrainer(self.linPolicy,
                                             learningrate=self.delta,
                                             weightdecay=self.weightdecay)
    self.trainer4LinPolicy.setData(ds)
    self.trainer4LinPolicy.trainEpochs(epochs=self.trainingEpochPerUpdateWight)
def evalSpeakerOnClassificationDataset(speaker, problems):
    """Evaluate a speaker agent on the forced choice dataset.

    Here problems is a ClassificationDataset, where problem[0] is the
    feature vector and problem[1] is the target utterance.
    """
    nCorrect = 0
    allActivations = []
    for features, goldUtterance in problems:
        activations = speaker.activate(features)
        allActivations.append(activations)
        activationMax = activations.argmax()
        utterance = one_to_n(activationMax, 3)
        if (utterance == goldUtterance).all():
            nCorrect += 1
    return nCorrect, allActivations
def loadCSV(filename, multiclass=True, outputs=1, separator=','):
    # read in all the lines
    f = open(filename).readlines()
    # start our datasets
    in_data = []
    out_data = []
    # process the file
    for line in f:
        # remove whitespace and split according to separator character
        samples = line.strip(' \r\n').split(separator)
        # save input data
        in_data.append([float(i) for i in samples[:-outputs]])
        # save output data
        if multiclass:
            out_data.append(samples[-1])
        else:
            out_data.append([float(i) for i in samples[-outputs:]])
    processed_out_data = out_data
    # process multiclass encoding
    if multiclass:
        processed_out_data = []
        # get all the unique values for classes
        keys = []
        for d in out_data:
            if d not in keys:
                keys.append(d)
        keys.sort()
        # encode all data
        for d in out_data:
            processed_out_data.append(one_to_n(keys.index(d), len(keys)))
    # create the dataset
    dataset = SupervisedDataSet(len(in_data[0]), len(processed_out_data[0]))
    for i in xrange(len(out_data)):
        dataset.addSample(in_data[i], processed_out_data[i])
    # return the keys if we have a multiclass classifier
    if multiclass:
        return dataset, keys
    else:
        return dataset
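# A hypothetical call to loadCSV above for a multiclass CSV whose last column
# holds the class label (the file name is illustrative).
dataset, keys = loadCSV('iris.csv', multiclass=True, outputs=1, separator=',')
print(len(dataset), keys)   # number of samples and the sorted class labels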
def learn(self):
    # convert reinforcement dataset to NFQ supervised dataset
    supervised = SupervisedDataSet(self.module.network.indim, 1)
    for seq in self.dataset[self.indexOfAgent]:
        lastexperience = None
        for state, action, reward in seq:
            if not lastexperience:
                # delay each experience in sequence by one
                lastexperience = (state, action, reward)
                continue
            # use experience from last timestep to do Q update
            (state_, action_, reward_) = lastexperience
            Q = self.module.getValue(state_, action_[0])
            inp = r_[state_, one_to_n(action_[0], self.module.numActions)]
            if self.isFirstLerning:
                tgt = reward_
            else:
                tgt = Q + 0.5 * (reward_ + self.gamma * max(
                    self.module.getActionValues(state)) - Q)
            supervised.addSample(inp, tgt)  # for reward normalization
            # update last experience with current one
            lastexperience = (state, action, reward)
    # Re-building networks is required in multiprocessing environments.
    params = self.module.network.params
    self.module.network = buildNetwork(
        self.module.indim + self.module.numActions,
        self.module.indim + self.module.numActions,
        1)
    self.module.network._setParameters(params)
    # train module with backprop/rprop on dataset
    trainer = RPropMinusTrainer(self.module.network, dataset=supervised,
                                batchlearning=True,
                                verbose=False)  # , weightdecay=0.01
    trainer.trainUntilConvergence(maxEpochs=self.maxEpochs)
    if self.isFirstLerning:
        self.isFirstLerning = False
def evalListenerOnClassificationDataset(listener, fcProblems):
    """Evaluate a listener agent on the forced choice dataset.

    Here fcProblems is a ClassificationDataset, where problem[0] is the
    concatenated features and utterance, and problem[1] is the target.

    Returns the number correct, the raw model activations, and an array
    indicating whether each problem was answered correctly (1/0).
    """
    nCorrect = 0
    allActivations = []
    correct = []
    for problem in fcProblems:
        activations = listener.activate(problem[0])
        allActivations.append(activations)
        activationMax = activations.argmax()
        target = one_to_n(activationMax, 3)
        if (target == problem[1]).all():
            nCorrect += 1
            correct.append(1)
        else:
            correct.append(0)
    return (nCorrect, allActivations, correct)
def setInitEst(optVal, optLocs, agent, maxEpochs):
    # Create training data
    from pybrain.datasets import SupervisedDataSet
    from scipy import r_
    from pybrain.utilities import one_to_n
    module = agent.module
    # Currently we have one input (location) and two outputs (corresponding
    # to travelling towards or away)
    supervised = SupervisedDataSet(module.network.indim, 1)
    # Go through all locations we are going to be optimistic about
    for loc in optLocs:
        for currAction in range(module.numActions):
            inp = r_[loc, one_to_n(currAction, module.numActions)]
            tgt = optVal
            supervised.addSample(inp, tgt)
    print(supervised)
    # Train
    from pybrain.supervised.trainers import BackpropTrainer
    trainer = BackpropTrainer(module.network, dataset=supervised,
                              learningrate=0.005, batchlearning=True,
                              verbose=False)
    trainer.trainUntilConvergence(maxEpochs=maxEpochs)
    return agent
# Imports assumed for the names used below (np, pl, one_to_n, LinearFA_Agent);
# the original project may import LinearFA_Agent from elsewhere.
import numpy as np
import pylab as pl

from pybrain.rl.agents.linearfa import LinearFA_Agent
from pybrain.utilities import one_to_n

from environment import Environment
from tasks import LinearFATileCoding3456BalanceTask
from training import LinearFATraining
from learners import SARSALambda_LinFA_ReplacingTraces

task = LinearFATileCoding3456BalanceTask()
learner = SARSALambda_LinFA_ReplacingTraces(task.nactions, task.outdim)
learner._lambda = 0.95
task.discount = learner.rewardDiscount

agent = LinearFA_Agent(learner)
# The state has a huge number of dimensions, and the logging causes me to run
# out of memory. We needn't log, since learning is done online.
agent.logging = False
# TODO PyBrain says that the learning rate needs to decay, but I don't see
# that described in Randlov's paper.
# A higher number here means the learning rate decays slower.
learner.learningRateDecay = 100000
# NOTE increasing this number above from the default of 100 is what got the
# learning to actually happen, and fixed the bug/issue where the agent's
# performance stopped improving.

for i in np.arange(2000, 3800, 50):
    theta = np.loadtxt('/home/fitze/Documents/agent-bicycle/data/balance_sarsalambda_linfa_replacetrace_anneal_112217H56M04S/theta_%i.dat' % i)
    learner._theta = theta
    Q = learner._qValues(one_to_n(task.getBin(0, 0, 0, 0, 0), task.outdim))
    pl.plot(Q, label='%s' % i)
# pl.legend()
pl.show()
print learner._greedyAction(one_to_n(task.getBin(0, 0, 0, 0, 0), task.outdim))
def getObservation(self):
    (theta, thetad, omega, omegad, omegadd,
     xf, yf, xb, yb, psi) = self.env.getSensors()
    state = one_to_n(self.getBin(theta, thetad, omega, omegad, omegadd),
                     self.outdim)
    return state
def getValue(self, state, action):
    return self.network.activate(r_[state, one_to_n(action, self.numActions)])
def _updateWeights(self, state, action, reward, next_state):
    """ state and next_state are vectors, action is an integer. """
    # update Q-value function approximator (estimate Q-value instead of V)
    BellmanErrors = np.zeros(self.num_agents)
    for iAgent in range(self.num_agents):
        vValC = self._qValues(state, iAgent)
        vValN = self._qValues(next_state, iAgent)
        vArgMaxValC = r_argmax(vValC)
        vArgMaxValN = r_argmax(vValN)
        BellmanError = (reward[iAgent]
                        + self.rewardDiscount * vValN[vArgMaxValN]) - vValC[vArgMaxValC]
        target = vValC[action[iAgent]] + self.cn * (
            (reward[iAgent] + self.rewardDiscount * vValN[vArgMaxValN])
            - vValC[action[iAgent]])
        BellmanErrors[iAgent] = BellmanError
        inp = r_[state, one_to_n(action[iAgent], self.num_actions[iAgent])]
        ds = SupervisedDataSet(self.num_features + self.num_actions[iAgent], 1)
        ds.addSample(inp, target)
        BackpropTrainer(self.linQ[iAgent], learningrate=1.0,
                        weightdecay=self.weightdecay).trainOnDataset(ds)
    # Estimate gradient
    grad = self.linGradient.activate(
        np.r_[asarray(state),
              one_to_n(action[self.indexOfAgent],
                       self.num_actions[self.indexOfAgent])])[0]
    target = grad + self.cn * (np.sum(BellmanErrors, axis=0) - grad)
    inp = np.r_[asarray(state),
                one_to_n(action[self.indexOfAgent],
                         self.num_actions[self.indexOfAgent])]
    ds = SupervisedDataSet(
        self.num_features + self.num_actions[self.indexOfAgent], 1)
    ds.addSample(inp, target)
    BackpropTrainer(self.linGradient, learningrate=1.0,
                    weightdecay=self.weightdecay).trainOnDataset(ds)
    # print str(self.indexOfAgent) + "-th agents optimization info.:"
    # print "All Bellman errors: " + str(np.sum(BellmanErrors, axis=0))
    # print "Self Bellman error: " + str(np.absolute(BellmanErrors[self.indexOfAgent]))
    # print "Self Q-value: " + str(self._qValues(state, self.indexOfAgent))
    # Update policy
    c_pi = self._pi(state)
    # print "Policy: " + str(c_pi)
    firstTerm = c_pi[action[self.indexOfAgent]]
    secondTerm = (np.sqrt(firstTerm)
                  * np.absolute(BellmanErrors[self.indexOfAgent])
                  * self._sgn(-1.0 * self.linGradient.activate(
                      np.r_[asarray(state),
                            one_to_n(action[self.indexOfAgent],
                                     self.num_actions[self.indexOfAgent])])[0]))
    target = c_pi
    target[action[self.indexOfAgent]] = self._gamma(
        firstTerm - self.bn * secondTerm)
    inp = r_[asarray(state)]
    ds = SupervisedDataSet(self.num_features,
                           self.num_actions[self.indexOfAgent])
    ds.addSample(inp, target)
    BackpropTrainer(self.linPolicy, learningrate=1.0,
                    weightdecay=self.weightdecay).trainOnDataset(ds)
    # update bn, cn
    self.bn = self.bn * self.decayBn
    self.cn = self.cn * self.decayCn