def train(self, transitionSamples):

        print "Entrenando..."

        k = 0
        trainer = RPropMinusTrainer(self.Q, batchlearning=True)
        #trainer = BackpropTrainer(self.Q, batchlearning=False)
        TS = SupervisedDataSet(4, 1)

        while k < self._epochs:

            if k % 10 == 0:
                print "\t ", k

            # Build the training set from the transition samples
            # Input: 4-dimensional vector (angle, angular velocity, position, action)
            # Target: value
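            # For example (purely illustrative numbers), one sample might pair the
            # input (0.05, -0.10, 0.30, Accion.DERECHA) with the target value 0.42.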

            TS.clear()

            for s, a, s_1, costo in transitionSamples:

                # Evaluate Q at s' for every possible action
                # (vector with the value of s' for each of the 3 possible actions)
                # Q_s1 = [ self.Q.activate([s_1.angulo, s_1.velocidadAngular, s_1.posicion, b]) for b in range(Accion.maxValor + 1) ]
                valDerecha = self.Q.activate([
                    s_1.angulo, s_1.velocidadAngular, s_1.posicion,
                    Accion.DERECHA
                ])
                valIzquierda = self.Q.activate([
                    s_1.angulo, s_1.velocidadAngular, s_1.posicion,
                    Accion.IZQUIERDA
                ])

                if valDerecha >= 1 or valDerecha <= 0:
                    print("Invalid Q value:", valDerecha)

                if valIzquierda >= 1 or valIzquierda <= 0:
                    print("Invalid Q value:", valIzquierda)

                # Input and target for the neural network
                inputVal = (s.angulo, s.velocidadAngular, s.posicion, a)

                if costo == 0:
                    targetVal = costo
                else:
                    targetVal = costo + self._gamma * min(
                        valDerecha, valIzquierda)
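                # The target above is the Bellman backup for a cost-minimising
                # Q-function: target = cost + gamma * min over actions of Q(s', b).
                # Zero-cost transitions are treated as terminal, presumably so the
                # targets stay inside [0, 1] (checked just below).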

                if targetVal > 1 or targetVal < 0:
                    print "Target incorrecto: ", targetVal

                TS.addSample(inputVal, targetVal)

            # Train the neural network
            trainer.setData(TS)
            trainer.train()  # 1 epoch
            #trainer.trainEpochs(self._epochsNN)

            k += 1
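

# --- Hedged sketch, not part of the original code ---------------------------------
# Once train() has fitted the Q-network, a greedy (cost-minimising) controller can
# be read off by taking the action with the smallest predicted Q-value, mirroring
# the min used in the targets above. The helper below is only an illustration: its
# name is made up, while Accion and the state attributes are assumed to be the same
# ones used in train().
def greedyActionSketch(qNet, s):
    # Evaluate Q(s, a) for both actions and pick the cheaper one
    qValues = {a: qNet.activate([s.angulo, s.velocidadAngular, s.posicion, a])[0]
               for a in (Accion.IZQUIERDA, Accion.DERECHA)}
    return min(qValues, key=qValues.get)
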
def evalPolicy1D(valNet, polNet, policyEvalStates, vMaxAll, stepSize, thermRadius):
    """Approximate policy evaluation: sweep the discretized states and update the
    value network valNet under the policy encoded by polNet, until the largest
    value change in a sweep no longer exceeds vMaxAll."""

    import sys
    from pybrain.datasets import SupervisedDataSet
    from pybrain.supervised.trainers.rprop import RPropMinusTrainer

    vDiffStart = 10000  # sentinel: larger than any expected value change
    vDiff = vDiffStart
    while vDiff > vMaxAll:
        vDiff = vDiffStart
        
        for state in policyEvalStates: # Go through the states in question

                # Stores the next state according to the current policy
                nextState = []

                # Determine what the chosen action is, from the policy network
                actionPref = polNet.activate([state])               
                chosenAction = np.argmax(actionPref) # Choose the one with highest output
                
                
                # Determine the next state (from contThermalEnvironment)
                numAng = len(actionPref)
                oldDist = state
                nextState = [ep.updateDist(oldDist, stepSize, numAng, chosenAction)]   
                
                # Calculate reward given for transition                
                
                # Calculate new value of states under the current policy, based on reward given
                # Discount rate is how farsighted we are (between 0 and 1, with 1 being very far sighted, and 0 being not far sighted)
                discRate = 0.7
                scale = 10 # Size of reward
                reward = getReward1D(state, thermRadius,scale)
                
                # Calculate new estimate for value 
                VstateNew = reward + discRate * valNet.activate(nextState)
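                # This is the Bellman expectation backup for the (deterministic)
                # current policy: V(s) <- r(s) + discRate * V(s')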
                
                # Determine how much the value changed
                # Keep track of maximum change seen so far
                VstateOld = valNet.activate([state])
                vChange = abs(VstateOld - VstateNew)
                if vDiff == vDiffStart or vChange > vDiff:
                    vDiff = vChange
                
                # Update value network with new estimate, keeping everything else the same   

                # First, get training examples
                supervised = SupervisedDataSet(valNet.indim, 1) # numInputs, numOutputs
                supervised.addSample(state, VstateNew)
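                # The loop below re-adds every other evaluation state with its
                # current network output as the target, anchoring those values so
                # that retraining mainly moves the estimate for `state`.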
                for loc in policyEvalStates: # Go through all discretized states 
                    if loc != state:
                        inp = loc
                        tgt = valNet.activate([loc])
                        supervised.addSample(inp, tgt)
                        
                # Next, train on these training examples
                trainer = RPropMinusTrainer(valNet, dataset=supervised, verbose=False)
                
                # Train manually, to avoid using validation data
                # trainer.trainUntilConvergence(maxEpochs=maxEpochsVal, validationProportion = 0)   # Requires validation data 
                # I don't mind overfitting this - just so long as generalization is OK (so far, seems OK)
                numTrainIter = 30
                for i in range(numTrainIter):
                    trainer.train()                

                # Print training status
                # print('Old dist:', oldDist)
                # print('Preferences:', actionPref)
                # print('Choice:', chosenAction)
                # print('New dist:', nextState)
                # print('Reward:', reward)
                # print('New Value:', VstateNew)
                # print('Value change:', vChange)
                # print('Max change:', vDiff)
                # print('Supervised data set:', supervised)
                
                # print('Actual network outputs:')
                # for loc in policyEvalStates:
                    # print(valNet.activate([loc]))
                
                # input()
                
        print('Max value change: ', vDiff)
        sys.stdout.flush()

    # Return the updated value function
    return valNet
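

# --- Hedged sketch, not part of the original code ---------------------------------
# evalPolicy1D is the policy-evaluation half of a policy-iteration scheme. One
# plausible improvement step (an assumption, not the author's method) is sketched
# below: for every discretized state, pick the action whose successor state has the
# highest value under valNet, then retrain polNet toward a one-hot preference for
# that action. The function name and the one-hot retraining scheme are assumptions;
# ep.updateDist, np and the PyBrain classes are the same ones used above.
def improvePolicy1DSketch(valNet, polNet, policyEvalStates, stepSize,
                          numTrainIter=30):
    from pybrain.datasets import SupervisedDataSet
    from pybrain.supervised.trainers.rprop import RPropMinusTrainer

    numAng = polNet.outdim
    supervised = SupervisedDataSet(polNet.indim, polNet.outdim)
    for state in policyEvalStates:
        # Value of the state reached by each candidate action
        succValues = [valNet.activate([ep.updateDist(state, stepSize, numAng, a)])[0]
                      for a in range(numAng)]
        # One-hot target favouring the greedy action
        target = [0.0] * numAng
        target[int(np.argmax(succValues))] = 1.0
        supervised.addSample([state], target)

    trainer = RPropMinusTrainer(polNet, dataset=supervised, verbose=False)
    for _ in range(numTrainIter):
        trainer.train()
    return polNet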