Example #1
    def train(self):
        if len(self.pybdataset) == 0:
            return
        # train module with backprop/rprop on dataset
        trainer = RPropMinusTrainer(self.network, dataset=self.pybdataset, batchlearning=True, verbose=False)
        # trainer = BackpropTrainer(self.network, dataset=self.pybdataset, batchlearning=True, verbose=True)
        trainer.trainEpochs(100)
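For reference, here is a minimal self-contained sketch of the PyBrain pieces these snippets rely on: a network, a SupervisedDataSet, and an RPropMinusTrainer run in batch mode. It is illustrative only; the XOR-style toy dataset and the network sizes are assumptions, not taken from any of the projects above.

# Minimal usage sketch (illustrative only) of the PyBrain API used throughout these examples.
from pybrain.tools.shortcuts import buildNetwork
from pybrain.datasets import SupervisedDataSet
from pybrain.supervised.trainers.rprop import RPropMinusTrainer

net = buildNetwork(2, 4, 1)           # 2 inputs, 4 hidden units, 1 output (illustrative sizes)
ds = SupervisedDataSet(2, 1)          # (input dimension, target dimension)
ds.addSample((0, 0), (0,))            # toy XOR-style samples
ds.addSample((0, 1), (1,))
ds.addSample((1, 0), (1,))
ds.addSample((1, 1), (0,))

trainer = RPropMinusTrainer(net, dataset=ds, batchlearning=True, verbose=False)
trainer.trainEpochs(100)              # or trainer.trainUntilConvergence(maxEpochs=100)
print(net.activate((0, 1)))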
Example #2
    def learn(self):
        # convert reinforcement dataset to NFQ supervised dataset
        supervised = SupervisedDataSet(self.module.network.indim, 1)
        
        for seq in self.dataset:
            lastexperience = None
            for state, action, reward in seq:
                if not lastexperience:
                    # delay each experience in sequence by one
                    lastexperience = (state, action, reward)
                    continue
                
                # use experience from last timestep to do Q update
                (state_, action_, reward_) = lastexperience
                inp = r_[state_, one_to_n(action_[0], self.module.numActions)]
                tgt = reward_ + self.gamma * max(self.module.getActionValues(state))
                supervised.addSample(inp, tgt)
                
                # update last experience with current one
                lastexperience = (state, action, reward)

        # train module with backprop/rprop on dataset
        trainer = RPropMinusTrainer(self.module.network, dataset=supervised, batchlearning=True, verbose=False)
        
        # alternative: backprop, was not as stable as rprop
        # trainer = BackpropTrainer(self.module.network, dataset=supervised, learningrate=0.01, batchlearning=True, verbose=True)

        trainer.trainEpochs(1)
Example #3
    def learn(self):
        # convert reinforcement dataset to NFQ supervised dataset
        supervised = SupervisedDataSet(self.module.network.indim, 1)

        for seq in self.dataset:
            lastexperience = None
            for state, action, reward in seq:
                if not lastexperience:
                    # delay each experience in sequence by one
                    lastexperience = (state, action, reward)
                    continue

                # use experience from last timestep to do Q update
                (state_, action_, reward_) = lastexperience

                Q = self.module.getValue(state_, action_[0])

                inp = r_[state_, one_to_n(action_[0], self.module.numActions)]
                tgt = Q + 0.5*(reward_ + self.gamma * max(self.module.getActionValues(state)) - Q)
                supervised.addSample(inp, tgt)

                # update last experience with current one
                lastexperience = (state, action, reward)

        # train module with backprop/rprop on dataset
        trainer = RPropMinusTrainer(self.module.network, dataset=supervised, batchlearning=True, verbose=False)
        trainer.trainUntilConvergence(maxEpochs=self.maxEpochs)
Example #4
    def learn(self):
        # convert reinforcement dataset to NFQ supervised dataset
        supervised = SupervisedDataSet(self.module.network.indim, 1)
        
        for seq in self.dataset:
            lastexperience = None
            for state, action, reward in seq:
                if not lastexperience:
                    # delay each experience in sequence by one
                    lastexperience = (state, action, reward)
                    continue

                # use experience from last timestep to do Q update
                (state_, action_, reward_) = lastexperience
                Q = self.module.getValue(state_, int(action_[0]))
                

                inp = r_[state_, one_to_n(int(action_[0]), self.module.numActions)]
                #input = r_[state_, action_]
                tgt = Q + self.alpha*(reward_ + self.gamma * max(self.module.getActionValues(state)) - Q)
                supervised.addSample(inp, tgt)

                # update last experience with current one
                lastexperience = (state, action, reward)

        # train module with backprop/rprop on dataset
        trainer = RPropMinusTrainer(self.module.network, dataset=supervised, batchlearning=True, verbose=True)
        trainer.trainUntilConvergence(maxEpochs=self.maxEpochs)
Example #5
def makeGreedy1D(valNet, polNet, policyEvalStates, numAct, stepSize):

    from pybrain.datasets import SupervisedDataSet
    from pybrain.utilities import one_to_n

    supervised = SupervisedDataSet(polNet.indim, numAct) # numInputs, numOutputs

    # Try all the actions and see which has the best value
    for state in policyEvalStates:
        vBest = -100000
        for action in range(numAct):
            nextState = [ep.updateDist(state, stepSize, numAct, action)]
            vNext = valNet.activate(nextState)
            if (vNext > vBest):
                actBest = action
                vBest = vNext
        supervised.addSample(state, one_to_n(actBest, numAct))
    
    # Print supervised training set 
    # print(supervised)
    # input()
    
    # Train neural network
    from pybrain.supervised.trainers.rprop import RPropMinusTrainer                
    trainer = RPropMinusTrainer(polNet, dataset=supervised, verbose=False)  
    trainer.trainUntilConvergence(maxEpochs=50) # I'm OK with some interpolation here. It's the values we need to be exact on.
    return polNet
Example #6
    def learn(self):
        # convert reinforcement dataset to NFQ supervised dataset
        supervised = SupervisedDataSet(self.module.network.indim, 1)
        for seq in self.dataset[self.indexOfAgent]:
            lastexperience = None
            for state, action, reward in seq:
                if not lastexperience:
                    # delay each experience in sequence by one
                    lastexperience = (state, action, reward)
                    continue

                # use experience from last timestep to do Q update
                (state_, action_, reward_) = lastexperience

                Q = self.module.getValue(state_, action_[0])

                inp = r_[state_, one_to_n(action_[0], self.module.numActions)]
                if self.isFirstLerning:
                    tgt = reward_
                else:
                    tgt = Q + 0.5 * (reward_ + self.gamma * max(
                        self.module.getActionValues(state)) - Q)
                supervised.addSample(inp, tgt)

                #for reward normalization

                # update last experience with current one
                lastexperience = (state, action, reward)

        # Re-building networks is required in multiprocessing environments.
        params = self.module.network.params
        self.module.network = buildNetwork(
            self.module.indim + self.module.numActions,
            self.module.indim + self.module.numActions, 1)
        self.module.network._setParameters(params)

        # train module with backprop/rprop on dataset
        trainer = RPropMinusTrainer(self.module.network,
                                    dataset=supervised,
                                    batchlearning=True,
                                    verbose=False)  #, weightdecay=0.01)
        trainer.trainUntilConvergence(maxEpochs=self.maxEpochs)
        if self.isFirstLerning:
            self.isFirstLerning = False
Example #7
    def train(self, transitionSamples):

        print "Entrenando..."

        k = 0
        trainer = RPropMinusTrainer(self.Q, batchlearning=True)
        #trainer = BackpropTrainer(self.Q, batchlearning=False)
        TS = SupervisedDataSet(4, 1)

        while (k < self._epochs):

            if k % 10 == 0:
                print "\t ", k

            # Build the training set from the samples
            # Input: 4-dimensional vector (angle, angular velocity, position, action)
            # Target: value

            TS.clear()

            for s, a, s_1, costo in transitionSamples:

                # Get Q for s', for all possible actions
                # (a vector with the value of s' for each of the 3 possible actions)
                # Q_s1 = [ self.Q.activate([s_1.angulo, s_1.velocidadAngular, s_1.posicion, b]) for b in range(Accion.maxValor + 1) ]
                valDerecha = self.Q.activate([
                    s_1.angulo, s_1.velocidadAngular, s_1.posicion,
                    Accion.DERECHA
                ])
                valIzquierda = self.Q.activate([
                    s_1.angulo, s_1.velocidadAngular, s_1.posicion,
                    Accion.IZQUIERDA
                ])

                if valDerecha >= 1 or valDerecha <= 0:
                    print "Q incorrecta: ", valDerecha

                if valIzquierda >= 1 or valIzquierda <= 0:
                    print "Q incorrecta: ", valIzquierda

                # Input and target for the neural network
                inputVal = (s.angulo, s.velocidadAngular, s.posicion, a)

                if costo == 0:
                    targetVal = costo
                else:
                    targetVal = costo + self._gamma * min(
                        valDerecha, valIzquierda)

                if targetVal > 1 or targetVal < 0:
                    print "Target incorrecto: ", targetVal

                TS.addSample(inputVal, targetVal)

            # Train the neural network
            trainer.setData(TS)
            trainer.train()  # 1 epoch
            #trainer.trainEpochs(self._epochsNN)

            k = k + 1
Example #8
    def learn(self):
        # convert reinforcement dataset to NFQ supervised dataset
        supervised = SupervisedDataSet(self.module.network.indim, 1)
        for seq in self.dataset[self.indexOfAgent]:
            lastexperience = None
            for state, action, reward in seq:
                if not lastexperience:
                    # delay each experience in sequence by one
                    lastexperience = (state, action, reward)
                    continue

                # use experience from last timestep to do Q update
                (state_, action_, reward_) = lastexperience

                Q = self.module.getValue(state_, action_[0])

                inp = r_[state_, one_to_n(action_[0], self.module.numActions)]
                if self.isFirstLerning:
                    tgt = reward_
                else:
                    tgt = Q + 0.5 * (reward_ + self.gamma * max(self.module.getActionValues(state)) - Q)
                supervised.addSample(inp, tgt)

                # for reward normalization

                # update last experience with current one
                lastexperience = (state, action, reward)

        # Re-building networks is required in multiprocessing environments.
        params = self.module.network.params
        self.module.network = buildNetwork(self.module.indim + self.module.numActions,
                                           self.module.indim + self.module.numActions,
                                           1)
        self.module.network._setParameters(params)

        # train module with backprop/rprop on dataset
        trainer = RPropMinusTrainer(self.module.network, dataset=supervised, batchlearning=True, verbose=False)  # , weightdecay=0.01)
        trainer.trainUntilConvergence(maxEpochs=self.maxEpochs)
        if self.isFirstLerning:
            self.isFirstLerning = False
Example #9
from utils import updateDataset,buildDataset,buildRecurrentNetwork,loadRecurrentNetwork
from pybrain.supervised.trainers.rprop import RPropMinusTrainer
from pybrain.tools.xml.networkwriter import NetworkWriter
from pybrain.tools.xml.networkreader import NetworkReader

#nn=buildRecurrentNetwork()
nn=loadRecurrentNetwork('recurrentNetwork.xml')
dataset=buildDataset()

trainer=RPropMinusTrainer(nn)
trainer.setData(dataset)
print 'dataset set for trainer'
trainer.trainUntilConvergence()
print 'trained to convergence'


NetworkWriter.writeToFile(nn,'recurrentNetwork.xml')
Example #10
from utils import updateDataset, buildDataset, buildRecurrentNetwork, loadRecurrentNetwork
from pybrain.supervised.trainers.rprop import RPropMinusTrainer
from pybrain.tools.xml.networkwriter import NetworkWriter
from pybrain.tools.xml.networkreader import NetworkReader

#nn=buildRecurrentNetwork()
nn = loadRecurrentNetwork('recurrentNetwork.xml')
dataset = buildDataset()

trainer = RPropMinusTrainer(nn)
trainer.setData(dataset)
print 'dataset set for trainer'
trainer.trainUntilConvergence()
print 'trained to convergence'

NetworkWriter.writeToFile(nn, 'recurrentNetwork.xml')
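NetworkReader is imported in the two examples above but not used directly (presumably loadRecurrentNetwork wraps it). As a hedged aside, reloading the saved network with PyBrain's XML tools would look like this:

# Sketch only: reload the network that NetworkWriter saved above.
from pybrain.tools.xml.networkreader import NetworkReader
nn = NetworkReader.readFrom('recurrentNetwork.xml')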
Example #11
def test_multilayer_perceptron():
    def plot(fig, data):
        ax = fig.add_subplot(111)
        ax.plot([x[0] for x in data], [x[1] for x in data])

    def scat(fig, liner_data, marker='o', color='g'):
        ax = fig.add_subplot(111)
        ax.scatter([x[0] for x in liner_data], [x[1] for x in liner_data],
                   marker=marker,
                   color=color,
                   s=10)

    def get_predict_list(x_range, y_range, nn, split=10):
        data = []
        xspan = float(x_range[1] - x_range[0]) / split
        yspan = float(y_range[1] - y_range[0]) / split

        for x_value in [float(i) * xspan + x_range[0] for i in range(split)]:
            predict_list = []
            for y_value in [
                    float(j) * yspan + y_range[0] for j in range(split)
            ]:
                #if nn.predict([x_value,y_value])[0] >= 0.5:
                if nn.activate([x_value, y_value])[0] >= 0.5:
                    data.append((x_value, y_value))
                    break
        return data

    import matplotlib.pyplot as plt
    """ トレーニングデータ取得
    """
    x_range = [0, 1]
    y_range = [0, 1]
    #liner_data = liner_training_data(x_range, y_range)
    liner_data = quadratic_function_data(x_range, y_range, split=20)
    #liner_data = sin_function_data(x_range, y_range, 20)
    train_data_input, train_data_output = change_format(liner_data)

    fig = plt.figure()
    scat(fig, [key for key, value in liner_data.items() if value == 0],
         color='g')
    scat(fig, [key for key, value in liner_data.items() if value == 1],
         color='b')
    """ NN構築
    """
    network = build_network()

    # mlnn = MultiLayerNeuralNetwork( [2, 5, 1],
    #                                 threshold=0.1,
    #                                 start_learning_coef=0.2,
    #                                 sigmoid_alpha=10,
    #                                 mini_batch=100,
    #                                 layer_type=[LinearLayer, SigmoidLayer, SigmoidLayer],
    #                                 rprop=True
    #                                 )
    """ 学習
    """
    #error_hist = mlnn.train_multi(train_data_input, train_data_output)
    supervised = get_supervised(network, train_data_input, train_data_output)
    trainer = RPropMinusTrainer(network,
                                dataset=supervised,
                                batchlearning=True,
                                verbose=True)
    trainer.trainUntilConvergence(maxEpochs=100)

    # compute y for each x and draw the separation line after training
    data = get_predict_list(x_range, y_range, network, split=20)
    plot(fig, data)

    # # plot the error history
    # fig2 = plt.figure()
    # plot(fig2, error_hist)

    # show the plots
    plt.show()
Example #12
    def learn(self):
        # convert reinforcement dataset to NFQ supervised dataset
        supervised = []
        dats = []  # [seq index][turn] = [state, jointAct, jointReward]
        for i in range(self.num_agents):
            supervised.append(SupervisedDataSet(self.num_features + self.actionDiminInput, 1))
        for i in range(self.dataset[self.indexOfAgent].getNumSequences()):
            seq = []
            for j in range(len(self.dataset[self.indexOfAgent].getSequence(i)[0])):
                state = self.dataset[self.indexOfAgent].getSequence(i)[0][j]
                jointAct = []
                jointReward = []
                for k in range(self.num_agents):
                    jointAct.append(self.dataset[k].getSequence(i)[1][j][0])
                    jointReward.append(self.dataset[k].getSequence(i)[2][j][0])
                seq.append([state, jointAct, jointReward])
            dats.append(seq)
        # prepare data set
        for i in range(self.num_agents):
            for seq in dats:
                lastexperience = None
                for sarPair in seq:
                    state = sarPair[0]
                    action = sarPair[1]
                    reward = sarPair[2]
                    if not lastexperience:
                        # delay each experience in sequence by one
                        lastexperience = (state, action, reward)
                        continue
                    # use experience from last timestep to do Q update
                    (state_, action_, reward_) = lastexperience

                    # update Q-value function approximator
                    qValuesNext = self._qValuesForAllPossibleJointAction(state)
                    eqNext = findCorrelatedEquilibrium(self.num_agents, self.num_actions, qValuesNext, self.possibleJointAction, self.w4ActIndexing)
                    # learn
                    inp = self._EncodeStateAndJointActionIntoInputVector(state_, action_)
                    if self.isFirstLerning:
                        target = reward_[i]
                    else:
                        target = reward_[i] + self.rewardDiscount * max(self._qValuesForEachActionOfAgent(state, eqNext, i))
                    target = np.array([target])
                    supervised[i].addSample(inp, target)
                    # update last experience with current one
                    lastexperience = (state, action, reward)
        if self.isFirstLerning:
            self.isFirstLerning = False

        # train each agent's Q network with rprop, optionally in separate processes
        procTrainers = []
        qResult = Queue()
        for i in range(self.num_agents):
            trainer = RPropMinusTrainer(self.linQ[i], dataset=supervised[i],
                                        batchlearning=True,
                                        verbose=False)
            if not self.validateMultiProc:
                trainer.trainUntilConvergence(maxEpochs=self.max_epochs, verbose=False)
            else:
                procTrainers.append(Process(target=self._learningQfunction, kwargs={"trainer": trainer, "i": i, "q": qResult}))
        if self.validateMultiProc:
            for proc in procTrainers:
                proc.start()
            for i in range(self.num_agents):
                res = qResult.get()
                self.linQ[res[0]] = res[1]
Example #13
def test_multilayer_perceptron():

    def plot(fig, data):
        ax  = fig.add_subplot(111)
        ax.plot([x[0] for x in data], [x[1] for x in data])

    def scat(fig, liner_data, marker='o', color='g'):
        ax  = fig.add_subplot(111)
        ax.scatter([x[0] for x in liner_data], [x[1] for x in liner_data], marker=marker, color=color, s=10)

    def get_predict_list(x_range, y_range, nn, split=10):
        data = []
        xspan = float(x_range[1] - x_range[0]) / split
        yspan = float(y_range[1] - y_range[0]) / split

        for x_value in [ float(i)*xspan+x_range[0] for i in range(split)]:
            predict_list = []
            for y_value in [ float(j) * yspan + y_range[0]  for j in range(split)]:
                #if nn.predict([x_value,y_value])[0] >= 0.5:
                if nn.activate([x_value,y_value])[0] >= 0.5:
                    data.append((x_value, y_value))
                    break
        return data

    import matplotlib.pyplot as plt

    """ トレーニングデータ取得
    """
    x_range = [0,1]
    y_range = [0,1]
    #liner_data = liner_training_data(x_range, y_range)
    liner_data = quadratic_function_data(x_range, y_range, split=20)
    #liner_data = sin_function_data(x_range, y_range, 20)
    train_data_input, train_data_output = change_format(liner_data)

    fig = plt.figure()
    scat(fig, [key for key, value in liner_data.items() if value == 0], color='g' )
    scat(fig, [key for key, value in liner_data.items() if value == 1], color='b' )



    """ NN構築
    """
    network = build_network()

    # mlnn = MultiLayerNeuralNetwork( [2, 5, 1],
    #                                 threshold=0.1,
    #                                 start_learning_coef=0.2,
    #                                 sigmoid_alpha=10,
    #                                 mini_batch=100,
    #                                 layer_type=[LinearLayer, SigmoidLayer, SigmoidLayer],
    #                                 rprop=True
    #                                 )

    """ 学習
    """
    #error_hist = mlnn.train_multi(train_data_input, train_data_output)
    supervised = get_supervised(network, train_data_input, train_data_output)
    trainer = RPropMinusTrainer(network, dataset=supervised, batchlearning=True, verbose=True)
    trainer.trainUntilConvergence(maxEpochs=100)


    # compute y for each x and draw the separation line after training
    data = get_predict_list(x_range,y_range, network, split=20)
    plot(fig, data)

    # # plot the error history
    # fig2 = plt.figure()
    # plot(fig2, error_hist)

    # show the plots
    plt.show()
Example #14
def evalPolicy1D(valNet, polNet, policyEvalStates, vMaxAll, stepSize, thermRadius):
    import numpy as np
    from pybrain.datasets import SupervisedDataSet
    from pybrain.supervised.trainers.rprop import RPropMinusTrainer

    vDiffStart = 10000
    vDiff = vDiffStart    
    while(vDiff > vMaxAll):
        vDiff = vDiffStart
        
        for state in policyEvalStates: # Go through the states in question
                # Stores next state according to the current policy
                nextState = []

                # Determine what the chosen action is, from the policy network
                actionPref = polNet.activate([state])               
                chosenAction = np.argmax(actionPref) # Choose the one with highest output
                
                
                # Determine the next state (from contThermalEnvironment)
                numAng = len(actionPref)
                oldDist = state
                nextState = [ep.updateDist(oldDist, stepSize, numAng, chosenAction)]   
                
                # Calculate reward given for transition                
                
                # Calculate new value of states under the current policy, based on reward given
                # Discount rate is how farsighted we are (between 0 and 1, with 1 being very far sighted, and 0 being not far sighted)
                discRate = 0.7
                scale = 10 # Size of reward
                reward = getReward1D(state, thermRadius,scale)
                
                # Calculate new estimate for value 
                VstateNew = reward + discRate * valNet.activate(nextState)
                
                # Determine how much the value changed
                # Keep track of maximum change seen so far
                VstateOld = valNet.activate([state])
                vChange = abs(VstateOld - VstateNew)
                if (vDiff == vDiffStart):
                    vDiff = vChange
                elif (vChange > vDiff):
                    vDiff = vChange
                
                # Update value network with new estimate, keeping everything else the same   

                # First, get training examples
                supervised = SupervisedDataSet(valNet.indim, 1) # numInputs, numOutputs
                supervised.addSample(state, VstateNew)
                for loc in policyEvalStates: # Go through all discretized states 
                    if (loc != state):
                        inp = loc
                        tgt = valNet.activate([loc])
                        supervised.addSample(inp,tgt)
                        
                # Next, train on these training examples
                trainer = RPropMinusTrainer(valNet, dataset=supervised, verbose=False)
                
                # Train manually, to avoid using validation data
                # trainer.trainUntilConvergence(maxEpochs=maxEpochsVal, validationProportion = 0)   # Requires validation data 
                # I don't mind overfitting this - just so long as generalization is OK (so far, seems OK)
                numTrainIter = 30
                for i in range(numTrainIter):
                    trainer.train()                

                # Print training status
                # print('Old dist:', oldDist)
                # print('Preferences:', actionPref)
                # print('Choice:', chosenAction)
                # print('New dist:', nextState)
                # print('Reward:', reward)
                # print('New Value:', VstateNew)
                # print('Value change:', vChange)
                # print('Max change:', vDiff)
                # print('Supervised data set:', supervised)
                
                # print('Actual network outputs:')
                # for loc in policyEvalStates:
                    # print(valNet.activate([loc]))
                
                # input()
                
                # Return updated value function
        print('Max value change: ', vDiff)
        import sys; sys.stdout.flush()
        
    return valNet