Example #1
    def learn(self):
        # convert reinforcement dataset to NFQ supervised dataset
        supervised = SupervisedDataSet(self.module.network.indim, 1)
        
        for seq in self.dataset:
            lastexperience = None
            for state, action, reward in seq:
                if not lastexperience:
                    # delay each experience in sequence by one
                    lastexperience = (state, action, reward)
                    continue
                
                # use experience from last timestep to do Q update
                (state_, action_, reward_) = lastexperience
                inp = r_[state_, one_to_n(action_[0], self.module.numActions)]
                tgt = reward_ + self.gamma * max(self.module.getActionValues(state))
                supervised.addSample(inp, tgt)
                
                # update last experience with current one
                lastexperience = (state, action, reward)

        # train module with backprop/rprop on dataset
        trainer = RPropMinusTrainer(self.module.network, dataset=supervised, batchlearning=True, verbose=False)
        
        # alternative: backprop, was not as stable as rprop
        # trainer = BackpropTrainer(self.module.network, dataset=supervised, learningrate=0.01, batchlearning=True, verbose=True)

        trainer.trainEpochs(1)
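
The learn() above only turns the collected transitions into a supervised regression problem; it is driven from the outside by the usual PyBrain RL loop. A minimal sketch of that wiring (not part of the snippet; the cart-pole task and the layer sizes follow PyBrain's own NFQ example):

from pybrain.rl.environments.cartpole import CartPoleEnvironment, DiscreteBalanceTask
from pybrain.rl.agents import LearningAgent
from pybrain.rl.learners.valuebased import NFQ, ActionValueNetwork
from pybrain.rl.experiments import EpisodicExperiment

env = CartPoleEnvironment()
task = DiscreteBalanceTask(env, 100)          # episodic balancing task
module = ActionValueNetwork(4, 3)             # 4 state features, 3 discrete actions
learner = NFQ()                               # supplies a learn() like the one above
agent = LearningAgent(module, learner)

experiment = EpisodicExperiment(task, agent)
for _ in range(50):
    experiment.doEpisodes(1)   # collect one episode of (state, action, reward) triples
    agent.learn()              # batch-fit the Q network on the collected transitions
    agent.reset()              # clear the dataset before gathering the next batch
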
Example #2

    def train(self, transitionSamples):

        print "Entrenando..."

        k = 0
        trainer = RPropMinusTrainer(self.Q, batchlearning=True)
        #trainer = BackpropTrainer(self.Q, batchlearning=False)
        TS = SupervisedDataSet(4, 1)

        while (k < self._epochs):

            if k % 10 == 0:
                print "\t ", k

            # Build the training set from the samples
            # Input: 4-dimensional vector (angle, angular velocity, position, action)
            # Target: value

            TS.clear()

            for s, a, s_1, costo in transitionSamples:

                # Take Q for s', for all possible actions
                # (vector with the value for s', for each of the 3 possible actions)
                # Q_s1 = [ self.Q.activate([s_1.angulo, s_1.velocidadAngular, s_1.posicion, b]) for b in range(Accion.maxValor + 1) ]
                valDerecha = self.Q.activate([
                    s_1.angulo, s_1.velocidadAngular, s_1.posicion,
                    Accion.DERECHA
                ])
                valIzquierda = self.Q.activate([
                    s_1.angulo, s_1.velocidadAngular, s_1.posicion,
                    Accion.IZQUIERDA
                ])

                if valDerecha >= 1 or valDerecha <= 0:
                    print "Q incorrecta: ", valDerecha

                if valIzquierda >= 1 or valIzquierda <= 0:
                    print "Q incorrecta: ", valIzquierda

                # Input and target for the neural network
                inputVal = (s.angulo, s.velocidadAngular, s.posicion, a)

                if costo == 0:
                    targetVal = costo
                else:
                    targetVal = costo + self._gamma * min(
                        valDerecha, valIzquierda)

                if targetVal > 1 or targetVal < 0:
                    print "Target incorrecto: ", targetVal

                TS.addSample(inputVal, targetVal)

            # Train the neural network
            trainer.setData(TS)
            trainer.train()  # 1 epoch
            #trainer.trainEpochs(self._epochsNN)

            k = k + 1
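
The range checks on valDerecha, valIzquierda and targetVal only make sense if self.Q squashes its output into [0, 1], i.e. ends in a sigmoid unit. The example does not show how self.Q is built; a plausible construction (the hidden-layer size is a placeholder) would be:

from pybrain.tools.shortcuts import buildNetwork
from pybrain.structure import SigmoidLayer

# 4 inputs (angle, angular velocity, position, action), 1 sigmoid output in (0, 1)
Q = buildNetwork(4, 8, 1, hiddenclass=SigmoidLayer, outclass=SigmoidLayer, bias=True)
print Q.activate([0.0, 0.0, 0.0, 1.0])
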
Example #3
    def learn(self):
        # convert reinforcement dataset to NFQ supervised dataset
        supervised = SupervisedDataSet(self.module.network.indim, 1)
        
        for seq in self.dataset:
            lastexperience = None
            for state, action, reward in seq:
                if not lastexperience:
                    # delay each experience in sequence by one
                    lastexperience = (state, action, reward)
                    continue

                # use experience from last timestep to do Q update
                (state_, action_, reward_) = lastexperience
                Q = self.module.getValue(state_, int(action_[0]))
                

                inp = r_[state_, one_to_n(int(action_[0]), self.module.numActions)]
                #input = r_[state_, action_]
                tgt = Q + self.alpha*(reward_ + self.gamma * max(self.module.getActionValues(state)) - Q)
                supervised.addSample(inp, tgt)

                # update last experience with current one
                lastexperience = (state, action, reward)

        # train module with backprop/rprop on dataset
        trainer = RPropMinusTrainer(self.module.network, dataset=supervised, batchlearning=True, verbose=True)
        trainer.trainUntilConvergence(maxEpochs=self.maxEpochs)
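
For comparison, these are the two target rules used so far, written as plain functions (illustration only, not taken from the snippets): Example #1 regresses directly on the bootstrapped return, while Example #3 only moves the old estimate part of the way toward it with step size alpha.

def nfq_target(reward, gamma, max_q_next):
    # Example #1: tgt = r + gamma * max_a Q(s', a)
    return reward + gamma * max_q_next

def soft_q_target(q_old, reward, gamma, max_q_next, alpha):
    # Example #3: tgt = Q + alpha * (r + gamma * max_a Q(s', a) - Q)
    return q_old + alpha * (nfq_target(reward, gamma, max_q_next) - q_old)

print nfq_target(1.0, 0.9, 0.5)                # 1.45
print soft_q_target(0.2, 1.0, 0.9, 0.5, 0.5)   # 0.825
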
Example #4
    def learn(self):
        # convert reinforcement dataset to NFQ supervised dataset
        supervised = SupervisedDataSet(self.module.network.indim, 1)
        for seq in self.dataset[self.indexOfAgent]:
            lastexperience = None
            for state, action, reward in seq:
                if not lastexperience:
                    # delay each experience in sequence by one
                    lastexperience = (state, action, reward)
                    continue

                # use experience from last timestep to do Q update
                (state_, action_, reward_) = lastexperience

                Q = self.module.getValue(state_, action_[0])

                inp = r_[state_, one_to_n(action_[0], self.module.numActions)]
                if self.isFirstLerning:
                    tgt = reward_
                else:
                    tgt = Q + 0.5 * (reward_ + self.gamma * max(
                        self.module.getActionValues(state)) - Q)
                supervised.addSample(inp, tgt)

                #for reward normalization

                # update last experience with current one
                lastexperience = (state, action, reward)

        # Re-building networks is required in multiprocessing environments.
        params = self.module.network.params
        self.module.network = buildNetwork(
            self.module.indim + self.module.numActions,
            self.module.indim + self.module.numActions, 1)
        self.module.network._setParameters(params)

        # train module with backprop/rprop on dataset
        trainer = RPropMinusTrainer(self.module.network,
                                    dataset=supervised,
                                    batchlearning=True,
                                    verbose=False)  #, weightdecay=0.01)
        trainer.trainUntilConvergence(maxEpochs=self.maxEpochs)
        if self.isFirstLerning:
            self.isFirstLerning = False
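
The rebuild step above works because a PyBrain network keeps all of its weights in one flat parameter vector, which can be copied into a freshly constructed network of the same topology. A stand-alone illustration (layer sizes are arbitrary):

from pybrain.tools.shortcuts import buildNetwork

net = buildNetwork(5, 5, 1)
saved = net.params.copy()           # all weights as one flat array

rebuilt = buildNetwork(5, 5, 1)     # same topology, fresh random weights
rebuilt._setParameters(saved)       # restore the saved weights
print (rebuilt.params == saved).all()   # True
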
Example #5
from utils import updateDataset, buildDataset, buildRecurrentNetwork, loadRecurrentNetwork
from pybrain.supervised.trainers.rprop import RPropMinusTrainer
from pybrain.tools.xml.networkwriter import NetworkWriter
from pybrain.tools.xml.networkreader import NetworkReader

#nn=buildRecurrentNetwork()
nn = loadRecurrentNetwork('recurrentNetwork.xml')
dataset = buildDataset()

trainer = RPropMinusTrainer(nn)
trainer.setData(dataset)
print 'dataset set for trainer'
trainer.trainUntilConvergence()
print 'trained to convergence'

NetworkWriter.writeToFile(nn, 'recurrentNetwork.xml')
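
The helpers imported from utils are project-specific and not shown here. Hypothetical versions of the two network helpers could look like the following (layer sizes are invented; buildDataset and updateDataset are omitted because they depend on the data at hand):

from pybrain.tools.shortcuts import buildNetwork
from pybrain.tools.xml.networkreader import NetworkReader

def buildRecurrentNetwork(indim=3, hidden=10, outdim=1):
    # recurrent=True adds the recurrent connections between time steps
    return buildNetwork(indim, hidden, outdim, recurrent=True)

def loadRecurrentNetwork(path):
    # restores a network previously saved with NetworkWriter.writeToFile
    return NetworkReader.readFrom(path)
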
Example #6

def test_multilayer_perceptron():
    def plot(fig, data):
        ax = fig.add_subplot(111)
        ax.plot([x[0] for x in data], [x[1] for x in data])

    def scat(fig, liner_data, marker='o', color='g'):
        ax = fig.add_subplot(111)
        ax.scatter([x[0] for x in liner_data], [x[1] for x in liner_data],
                   marker=marker,
                   color=color,
                   s=10)

    def get_predict_list(x_range, y_range, nn, split=10):
        data = []
        xspan = float(x_range[1] - x_range[0]) / split
        yspan = float(y_range[1] - y_range[0]) / split

        for x_value in [float(i) * xspan + x_range[0] for i in range(split)]:
            predict_list = []
            for y_value in [
                    float(j) * yspan + y_range[0] for j in range(split)
            ]:
                #if nn.predict([x_value,y_value])[0] >= 0.5:
                if nn.activate([x_value, y_value])[0] >= 0.5:
                    data.append((x_value, y_value))
                    break
        return data

    import matplotlib.pyplot as plt
    """ トレーニングデータ取得
    """
    x_range = [0, 1]
    y_range = [0, 1]
    #liner_data = liner_training_data(x_range, y_range)
    liner_data = quadratic_function_data(x_range, y_range, split=20)
    #liner_data = sin_function_data(x_range, y_range, 20)
    train_data_input, train_data_output = change_format(liner_data)

    fig = plt.figure()
    scat(fig, [key for key, value in liner_data.items() if value == 0],
         color='g')
    scat(fig, [key for key, value in liner_data.items() if value == 1],
         color='b')
    """ NN構築
    """
    network = build_network()

    # mlnn = MultiLayerNeuralNetwork( [2, 5, 1],
    #                                 threshold=0.1,
    #                                 start_learning_coef=0.2,
    #                                 sigmoid_alpha=10,
    #                                 mini_batch=100,
    #                                 layer_type=[LinearLayer, SigmoidLayer, SigmoidLayer],
    #                                 rprop=True
    #                                 )
    """ 学習
    """
    #error_hist = mlnn.train_multi(train_data_input, train_data_output)
    supervised = get_supervised(network, train_data_input, train_data_output)
    trainer = RPropMinusTrainer(network,
                                dataset=supervised,
                                batchlearning=True,
                                verbose=True)
    trainer.trainUntilConvergence(maxEpochs=100)

    # Compute the y corresponding to each x and draw the separating line learned after training
    data = get_predict_list(x_range, y_range, network, split=20)
    plot(fig, data)

    # # Show the errors
    # fig2 = plt.figure()
    # plot(fig2, error_hist)

    # Show the plot
    plt.show()
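
build_network() and get_supervised() are not defined in this snippet. A hypothetical pair that would fit the calls above, loosely following the commented-out [2, 5, 1] sigmoid configuration:

from pybrain.tools.shortcuts import buildNetwork
from pybrain.structure import SigmoidLayer
from pybrain.datasets import SupervisedDataSet

def build_network():
    # 2 inputs (x, y), one sigmoid output for the 0/1 class label
    return buildNetwork(2, 5, 1, hiddenclass=SigmoidLayer, outclass=SigmoidLayer)

def get_supervised(network, inputs, outputs):
    ds = SupervisedDataSet(network.indim, network.outdim)
    for inp, out in zip(inputs, outputs):
        ds.addSample(inp, out)
    return ds
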
Example #7

    def learn(self):
        # convert reinforcement dataset to NFQ supervised dataset
        supervised = []
        dats = []  # [seq index][turn] = [state, jointAct, jointReward]
        for i in range(self.num_agents):
            supervised.append(SupervisedDataSet(self.num_features + self.actionDiminInput, 1))
        for i in range(self.dataset[self.indexOfAgent].getNumSequences()):
            seq = []
            for j in range(len(self.dataset[self.indexOfAgent].getSequence(i)[0])):
                state = self.dataset[self.indexOfAgent].getSequence(i)[0][j]
                jointAct = []
                jointReward = []
                for k in range(self.num_agents):
                    jointAct.append(self.dataset[k].getSequence(i)[1][j][0])
                    jointReward.append(self.dataset[k].getSequence(i)[2][j][0])
                seq.append([state, jointAct, jointReward])
            dats.append(seq)
        # prepare data set
        for i in range(self.num_agents):
            for seq in dats:
                lastexperience = None
                for sarPair in seq:
                    state = sarPair[0]
                    action = sarPair[1]
                    reward = sarPair[2]
                    if not lastexperience:
                        # delay each experience in sequence by one
                        lastexperience = (state, action, reward)
                        continue
                    # use experience from last timestep to do Q update
                    (state_, action_, reward_) = lastexperience

                    # update Q-value function approximator
                    qValuesNext = self._qValuesForAllPossibleJointAction(state)
                    eqNext = findCorrelatedEquilibrium(self.num_agents, self.num_actions, qValuesNext, self.possibleJointAction, self.w4ActIndexing)
                    # Learn
                    inp = self._EncodeStateAndJointActionIntoInputVector(state_, action_)
                    if self.isFirstLerning:
                        target = reward_[i]
                    else:
                        target = reward_[i] + self.rewardDiscount * max(self._qValuesForEachActionOfAgent(state, eqNext, i))
                    target = np.array([target])
                    supervised[i].addSample(inp, target)
                    # update last experience with current one
                    lastexperience = (state, action, reward)
        if self.isFirstLerning:
            self.isFirstLerning = False

        procTrainers = []
        qResult = Queue()
        for i in range(self.num_agents):
            trainer = RPropMinusTrainer(self.linQ[i], dataset=supervised[i],
                                        batchlearning=True,
                                        verbose=False)
            if not self.validateMultiProc:
                trainer.trainUntilConvergence(maxEpochs=self.max_epochs, verbose=False)
            else:
                procTrainers.append(Process(target=self._learningQfunction, kwargs={"trainer": trainer, "i": i, "q": qResult}))
        if self.validateMultiProc:
            for proc in procTrainers:
                proc.start()
            for i in range(self.num_agents):
                res = qResult.get()
                self.linQ[res[0]] = res[1]
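
The _learningQfunction method passed to each Process is not part of the snippet. Given how the results are read back from the queue (res[0] is the agent index, res[1] the trained network), a hypothetical implementation could be:

    def _learningQfunction(self, trainer, i, q):
        # fit agent i's Q network inside the child process ...
        trainer.trainUntilConvergence(maxEpochs=self.max_epochs, verbose=False)
        # ... and hand (agent index, trained network) back to the parent
        q.put([i, trainer.module])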