def learn(self):
    """One NFQ iteration: convert the collected RL episodes into a
    supervised dataset of (state+one-hot-action -> Q target) samples,
    then train the Q-network on it with batch Rprop-.
    """
    # convert reinforcement dataset to NFQ supervised dataset
    supervised = SupervisedDataSet(self.module.network.indim, 1)
    for seq in self.dataset:
        lastexperience = None
        for state, action, reward in seq:
            if not lastexperience:
                # delay each experience in sequence by one
                lastexperience = (state, action, reward)
                continue
            # use experience from last timestep to do Q update
            (state_, action_, reward_) = lastexperience
            # Cast the action index to int: PyBrain stores actions as float
            # arrays, but one_to_n needs an integer index (matches the
            # sibling learn() implementation which already casts).
            inp = r_[state_, one_to_n(int(action_[0]), self.module.numActions)]
            # Hard Q target: immediate reward plus discounted best value
            # of the successor state.
            tgt = reward_ + self.gamma * max(self.module.getActionValues(state))
            supervised.addSample(inp, tgt)
            # update last experience with current one
            lastexperience = (state, action, reward)
    # train module with rprop on dataset
    trainer = RPropMinusTrainer(self.module.network, dataset=supervised,
                                batchlearning=True, verbose=False)
    # alternative: backprop, was not as stable as rprop
    # trainer = BackpropTrainer(self.module.network, dataset=supervised,
    #                           learningrate=0.01, batchlearning=True, verbose=True)
    trainer.trainEpochs(1)
def train(self, transitionSamples):
    """Fitted Q-iteration training loop for the cart-pole Q-network.

    transitionSamples: iterable of (s, a, s', cost) tuples, where states
    expose .angulo, .velocidadAngular and .posicion.
    Each epoch rebuilds the supervised targets from the current network
    (NFQ-style) and then runs one Rprop- training pass.
    NOTE(review): assumes costs/Q-values are scaled into [0, 1] — the
    range checks below only warn, they do not clamp.
    """
    print "Entrenando..."
    k = 0
    trainer = RPropMinusTrainer(self.Q, batchlearning=True)
    #trainer = BackpropTrainer(self.Q, batchlearning=False)
    TS = SupervisedDataSet(4, 1)
    while (k < self._epochs):
        if k % 10 == 0:
            print "\t ", k
        # Build the training set from the transition samples.
        # Input: 4-dimensional vector (angle, angular velocity, position, action)
        # Target: Q-value
        TS.clear()
        for s, a, s_1, costo in transitionSamples:
            # Evaluate Q at s' for all possible actions
            # (vector with the value of s' for each of the 3 possible actions)
            # Q_s1 = [ self.Q.activate([s_1.angulo, s_1.velocidadAngular, s_1.posicion, b]) for b in range(Accion.maxValor + 1) ]
            valDerecha = self.Q.activate([ s_1.angulo, s_1.velocidadAngular, s_1.posicion, Accion.DERECHA ])
            valIzquierda = self.Q.activate([ s_1.angulo, s_1.velocidadAngular, s_1.posicion, Accion.IZQUIERDA ])
            # Sanity check: warn when the network output leaves (0, 1).
            if valDerecha >= 1 or valDerecha <= 0:
                print "Q incorrecta: ", valDerecha
            if valIzquierda >= 1 or valIzquierda <= 0:
                print "Q incorrecta: ", valIzquierda
            # Input and target for the neural network
            inputVal = (s.angulo, s.velocidadAngular, s.posicion, a)
            if costo == 0:
                # Zero cost marks a goal/terminal transition: no bootstrap.
                targetVal = costo
            else:
                # Cost-minimizing Bellman backup: take the cheaper action at s'.
                targetVal = costo + self._gamma * min( valDerecha, valIzquierda)
            if targetVal > 1 or targetVal < 0:
                print "Target incorrecto: ", targetVal
            TS.addSample(inputVal, targetVal)
        # Train the neural network
        trainer.setData(TS)
        trainer.train() # 1 epoch
        #trainer.trainEpochs(self._epochsNN)
        k = k + 1
def learn(self):
    """Build an NFQ supervised dataset from the stored episodes and fit
    the Q-network on it, using a soft (alpha-weighted) Q-learning target.
    """
    # One sample per delayed (state, action) pair: input -> scalar target.
    training_set = SupervisedDataSet(self.module.network.indim, 1)
    for episode in self.dataset:
        previous = None
        for state, action, reward in episode:
            if previous is not None:
                prev_state, prev_action, prev_reward = previous
                act_idx = int(prev_action[0])
                # Current Q estimate for the delayed pair.
                q_old = self.module.getValue(prev_state, act_idx)
                net_input = r_[prev_state, one_to_n(act_idx, self.module.numActions)]
                # Soft target: move q_old a fraction alpha toward the
                # bootstrapped one-step return.
                best_next = max(self.module.getActionValues(state))
                target = q_old + self.alpha * (prev_reward + self.gamma * best_next - q_old)
                training_set.addSample(net_input, target)
            # Delay each experience in the sequence by one step.
            previous = (state, action, reward)
    # Batch Rprop- training on the assembled dataset.
    trainer = RPropMinusTrainer(self.module.network, dataset=training_set,
                                batchlearning=True, verbose=True)
    trainer.trainUntilConvergence(maxEpochs=self.maxEpochs)
def learn(self):
    """NFQ update for this agent's Q-network.

    On the very first call (self.isFirstLerning — attribute name kept as
    spelled elsewhere in the project) the target is the raw reward; on
    later calls a soft Q-learning backup with a fixed 0.5 step size is used.
    The network object is rebuilt before training so it can be sent across
    process boundaries.
    """
    # convert reinforcement dataset to NFQ supervised dataset
    supervised = SupervisedDataSet(self.module.network.indim, 1)
    for seq in self.dataset[self.indexOfAgent]:
        lastexperience = None
        for state, action, reward in seq:
            if not lastexperience:
                # delay each experience in sequence by one
                lastexperience = (state, action, reward)
                continue
            # use experience from last timestep to do Q update
            (state_, action_, reward_) = lastexperience
            Q = self.module.getValue(state_, action_[0])
            # Input: state concatenated with a one-hot encoding of the action.
            inp = r_[state_, one_to_n(action_[0], self.module.numActions)]
            if self.isFirstLerning:
                # Bootstrap values are meaningless before the first fit:
                # regress on the raw reward only.
                tgt = reward_
            else:
                # Soft backup with hard-coded 0.5 learning rate.
                tgt = Q + 0.5 * (reward_ + self.gamma * max(
                    self.module.getActionValues(state)) - Q)
            supervised.addSample(inp, tgt)
            #for reward normalization
            # update last experience with current one
            lastexperience = (state, action, reward)
    # Re-building networks is required in multiprocessing environments:
    # a fresh network with the same weights is picklable/shippable.
    params = self.module.network.params
    self.module.network = buildNetwork(
        self.module.indim + self.module.numActions,
        self.module.indim + self.module.numActions, 1)
    self.module.network._setParameters(params)
    # train module with backprop/rprop on dataset
    trainer = RPropMinusTrainer(self.module.network, dataset=supervised,
                                batchlearning=True, verbose=False) #, weightdecay=0.01)
    trainer.trainUntilConvergence(maxEpochs=self.maxEpochs)
    if self.isFirstLerning:
        self.isFirstLerning = False
from utils import updateDataset, buildDataset, buildRecurrentNetwork, loadRecurrentNetwork from pybrain.supervised.trainers.rprop import RPropMinusTrainer from pybrain.tools.xml.networkwriter import NetworkWriter from pybrain.tools.xml.networkreader import NetworkReader #nn=buildRecurrentNetwork() nn = loadRecurrentNetwork('recurrentNetwork.xml') dataset = buildDataset() trainer = RPropMinusTrainer(nn) trainer.setData(dataset) print 'dataset set for trainer' trainer.trainUntilConvergence() print 'trained to convergence' NetworkWriter.writeToFile(nn, 'recurrentNetwork.xml')
def test_multilayer_perceptron():
    """Train a feed-forward network on labelled 2-D points and plot the
    learned decision boundary over the training data.

    Side effect: opens a matplotlib window via plt.show().
    """

    def plot(fig, data):
        # Draw the estimated decision boundary as a connected line.
        ax = fig.add_subplot(111)
        ax.plot([x[0] for x in data], [x[1] for x in data])

    def scat(fig, liner_data, marker='o', color='g'):
        # Scatter-plot one class of training points.
        ax = fig.add_subplot(111)
        ax.scatter([x[0] for x in liner_data],
                   [x[1] for x in liner_data],
                   marker=marker, color=color, s=10)

    def get_predict_list(x_range, y_range, nn, split=10):
        """For each x on a split-by-split grid, record the first y whose
        network activation reaches 0.5 — approximating the boundary."""
        data = []
        xspan = float(x_range[1] - x_range[0]) / split
        yspan = float(y_range[1] - y_range[0]) / split
        for x_value in [float(i) * xspan + x_range[0] for i in range(split)]:
            # (removed an unused `predict_list` local that was rebuilt
            # every iteration and never read)
            for y_value in [float(j) * yspan + y_range[0] for j in range(split)]:
                if nn.activate([x_value, y_value])[0] >= 0.5:
                    data.append((x_value, y_value))
                    break  # first crossing along this column is enough
        return data

    import matplotlib.pyplot as plt

    # --- fetch training data ---
    x_range = [0, 1]
    y_range = [0, 1]
    #liner_data = liner_training_data(x_range, y_range)
    liner_data = quadratic_function_data(x_range, y_range, split=20)
    #liner_data = sin_function_data(x_range, y_range, 20)
    train_data_input, train_data_output = change_format(liner_data)

    fig = plt.figure()
    scat(fig, [key for key, value in liner_data.items() if value == 0], color='g')
    scat(fig, [key for key, value in liner_data.items() if value == 1], color='b')

    # --- build the network ---
    network = build_network()

    # --- training ---
    supervised = get_supervised(network, train_data_input, train_data_output)
    trainer = RPropMinusTrainer(network, dataset=supervised,
                                batchlearning=True, verbose=True)
    trainer.trainUntilConvergence(maxEpochs=100)

    # Compute the boundary points and overlay them on the scatter plot.
    data = get_predict_list(x_range, y_range, network, split=20)
    plot(fig, data)

    # --- display ---
    plt.show()
def learn(self):
    """Correlated-Q NFQ update for all agents.

    Rebuilds per-agent supervised datasets from the shared episode data
    (joint actions / joint rewards), computes targets from the correlated
    equilibrium of the successor state, then trains each agent's Q-network
    — either sequentially or fanned out over multiprocessing workers.
    """
    # convert reinforcement dataset to NFQ supervised dataset
    supervised = []
    dats=[]#[seq index][turn]=[state,jointAct,jointReward]
    for i in range(self.num_agents):
        # One dataset per agent: input is state + encoded joint action.
        supervised.append(SupervisedDataSet(self.num_features+self.actionDiminInput, 1))
    # Re-assemble each sequence so that every turn carries the full joint
    # action and joint reward across all agents (states are taken from
    # this agent's own dataset).
    for i in range(self.dataset[self.indexOfAgent].getNumSequences()):
        seq=[]
        for j in range(len(self.dataset[self.indexOfAgent].getSequence(i)[0])):
            state=self.dataset[self.indexOfAgent].getSequence(i)[0][j]
            jointAct=[]
            jointReward=[]
            for k in range(self.num_agents):
                jointAct.append(self.dataset[k].getSequence(i)[1][j][0])
                jointReward.append(self.dataset[k].getSequence(i)[2][j][0])
            seq.append([state, jointAct, jointReward])
        dats.append(seq)
    #prepare data set
    for i in range(self.num_agents):
        for seq in dats:
            lastexperience = None
            for sarPair in seq:
                state = sarPair[0]
                action = sarPair[1]
                reward = sarPair[2]
                if not lastexperience:
                    # delay each experience in sequence by one
                    lastexperience = (state, action, reward)
                    continue
                # use experience from last timestep to do Q update
                (state_, action_, reward_) = lastexperience
                #update Q-value function approximator
                # Equilibrium of the successor state drives the bootstrap.
                qValuesNext=self._qValuesForAllPossibleJointAction(state)
                eqNext=findCorrelatedEquilibrium(self.num_agents, self.num_actions, qValuesNext, self.possibleJointAction,self.w4ActIndexing)
                #Learn
                inp=self._EncodeStateAndJointActionIntoInputVector(state_, action_)
                if self.isFirstLerning:
                    # No meaningful bootstrap before the first fit:
                    # regress on the raw reward only.
                    target=reward_[i]
                else:
                    target=reward_[i] + self.rewardDiscount * max(self._qValuesForEachActionOfAgent(state, eqNext, i))
                target=np.array([target])
                supervised[i].addSample(inp, target)
                # update last experience with current one
                lastexperience = (state, action, reward)
    if self.isFirstLerning:
        self.isFirstLerning=False
    # Train every agent's network; optionally in parallel processes with
    # results collected through a queue.
    procTrainers=[]
    qResult=Queue()
    for i in range(self.num_agents):
        trainer=RPropMinusTrainer(self.linQ[i],dataset=supervised[i], batchlearning=True, verbose=False, )
        if not self.validateMultiProc:
            trainer.trainUntilConvergence(maxEpochs=self.max_epochs,verbose=False)
        else:
            procTrainers.append(Process(target=self._learningQfunction, kwargs={"trainer":trainer,"i":i,"q":qResult}))
    if self.validateMultiProc:
        for proc in procTrainers:
            proc.start()
        # Each worker returns (agent index, trained network); results may
        # arrive in any order, so store by the returned index.
        for i in range(self.num_agents):
            res=qResult.get()
            self.linQ[res[0]]=res[1]