    def getObservation(self):
        (theta, thetad, omega, omegad, omegadd, xf, yf, xb, yb,
         psi) = self.env.getSensors()
        # TODO not calling superclass to do normalization, etc.
        top_half = one_to_n(self.getBin(theta, thetad, omega, omegad, omegadd),
                            self.outdim - 20)

        bot_half = one_to_n(np.digitize([psi], self.psi_bounds)[0] - 1, 20)

        return np.concatenate((top_half, bot_half))
    def performAction(self, action):
        """Incoming action is an int between 0 and 8. The action we provide to
        the environment consists of a torque T in {-2 N, 0, 2 N}, and a
        displacement d in {-.02 m, 0, 0.02 m}.

        """
        self.t += 1
        self.action_history += one_to_n(action[0], self.nactions)
        # Map the action integer to a torque and displacement.
        assert round(action[0]) == action[0]

        if self.only_steer:
            T = 2 * (action[0] / 4.0 - 1.0)
            d = 0.
        else:
            # -1 for action in {0, 1, 2}, 0 for action in {3, 4, 5}, 1 for
            # action in {6, 7, 8}
            torque_selector = np.floor(action[0] / 3.0) - 1.0
            T = 2 * torque_selector
            # Random number in [-1, 1]:
            p = 2.0 * np.random.rand() - 1.0
            # -1 for action in {0, 3, 6}, 0 for action in {1, 4, 7}, 1 for
            # action in {2, 5, 8}
            disp_selector = action[0] % 3 - 1.0
            d = 0.02 * disp_selector + self._butt_disturbance_amplitude * p
        super(BalanceTask, self).performAction([T, d])
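A minimal sketch of the two building blocks this snippet relies on: the one-hot vector that one_to_n produces, and the integer-to-(torque, displacement) decoding described in the docstring. The decode_action helper below is illustrative only and not part of the task class; it assumes NumPy and PyBrain are installed.

import numpy as np
from pybrain.utilities import one_to_n

print(one_to_n(2, 5))  # one-hot vector of length 5 with a 1 at index 2

def decode_action(a):
    """Illustrative decoding of an action integer in 0..8 into (T, d)."""
    T = 2 * (np.floor(a / 3.0) - 1.0)  # torque: -2, 0, or +2
    d = 0.02 * (a % 3 - 1.0)           # displacement, before the random butt disturbance
    return T, d

print(decode_action(0))  # torque -2, displacement -0.02
print(decode_action(4))  # torque 0, displacement 0 (the "do nothing" action)
print(decode_action(8))  # torque +2, displacement +0.02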
Example #4
 def performAction(self, action):
     """ adding the option to use an action space of size 5, where 
     we either apply a torque (+/- 2.0), displace the butt (+/- 0.02),
     or do nothing (ala, Lagoudakis).
     """
     if self.five_actions:
         p = 2.0 * np.random.rand() - 1.0
         T = 0
         d = self._butt_disturbance_amplitude * p
         
         self.t += 1
         self.action_history += one_to_n(action[0], self.nactions)
         
         # Map the action integer to a torque and displacement.
         assert round(action[0]) == action[0]
         if action[0] == 0:
             T = -2
         elif action[0] == 1:
             T = 2
         elif action[0] == 2:
             d -= 0.02
         elif action[0] == 3:
             d += 0.02  
             
         super(BalanceTask, self).performAction([T, d])
         
     else:
         BalanceTask.performAction(self, action)
Example #5
    def learn(self):
        # convert reinforcement dataset to NFQ supervised dataset
        supervised = SupervisedDataSet(self.module.network.indim, 1)
        
        for seq in self.dataset:
            lastexperience = None
            for state, action, reward in seq:
                if not lastexperience:
                    # delay each experience in sequence by one
                    lastexperience = (state, action, reward)
                    continue
                
                # use experience from last timestep to do Q update
                (state_, action_, reward_) = lastexperience
                inp = r_[state_, one_to_n(action_[0], self.module.numActions)]
                tgt = reward_ + self.gamma * max(self.module.getActionValues(state))
                supervised.addSample(inp, tgt)
                
                # update last experience with current one
                lastexperience = (state, action, reward)

        # train module with backprop/rprop on dataset
        trainer = RPropMinusTrainer(self.module.network, dataset=supervised, batchlearning=True, verbose=False)
        
        # alternative: backprop, was not as stable as rprop
        # trainer = BackpropTrainer(self.module.network, dataset=supervised, learningrate=0.01, batchlearning=True, verbose=True)

        trainer.trainEpochs(1)
Example #6
def toClassificationDataset(codedSampleSet):
   
    classifiedSampleSet = []
    
    # Calculate the unique classes
    classes = []
    for sample in codedSampleSet:
    
        classifier = getClassifier(sample)
        if classifier not in classes:
            classes.append(classifier)
    classes.sort()
    
    # Now that we have all the classes, we process the outputs
    for sample in codedSampleSet:
        classifier = getClassifier(sample)
        classifiedSample = one_to_n(classes.index(classifier), len(classes))
        classifiedSampleSet.append(classifiedSample)

    # Build the dataset
    sampleSize = len(codedSampleSet[0])
    classifiedSampleSize = len(classifiedSampleSet[0])
    dataset = ClassificationDataSet(sampleSize, classifiedSampleSize)
    
    for i in range(len(classifiedSampleSet)):
        dataset.addSample(codedSampleSet[i], classifiedSampleSet[i])

    return dataset, classes
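A hedged usage sketch for toClassificationDataset. The getClassifier helper is not shown in this listing, so the stand-in below simply treats the last element of each coded sample as the class label; the real helper may do something different.

from pybrain.datasets import ClassificationDataSet  # used inside toClassificationDataset
from pybrain.utilities import one_to_n              # likewise

def getClassifier(sample):
    # Stand-in for the real helper: use the last element as the class label.
    return sample[-1]

codedSampleSet = [[0.1, 0.2, 0.0],
                  [0.3, 0.4, 1.0],
                  [0.5, 0.6, 0.0]]
dataset, classes = toClassificationDataset(codedSampleSet)
print(classes)       # [0.0, 1.0]
print(len(dataset))  # 3 samples, each with a 2-element one-hot target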
Example #7
    def learn(self):
        # convert reinforcement dataset to NFQ supervised dataset
        supervised = SupervisedDataSet(self.module.network.indim, 1)

        for seq in self.dataset:
            lastexperience = None
            for state, action, reward in seq:
                if not lastexperience:
                    # delay each experience in sequence by one
                    lastexperience = (state, action, reward)
                    continue

                # use experience from last timestep to do Q update
                (state_, action_, reward_) = lastexperience

                Q = self.module.getValue(state_, action_[0])

                inp = r_[state_, one_to_n(action_[0], self.module.numActions)]
                tgt = Q + 0.5*(reward_ + self.gamma * max(self.module.getActionValues(state)) - Q)
                supervised.addSample(inp, tgt)

                # update last experience with current one
                lastexperience = (state, action, reward)

        # train module with backprop/rprop on dataset
        trainer = RPropMinusTrainer(self.module.network, dataset=supervised, batchlearning=True, verbose=False)
        trainer.trainUntilConvergence(maxEpochs=self.maxEpochs)
Example #8
def toClassificationDataset(codedSampleSet):
   
    classifiedSampleSet = []
    
    # Calculate the unique classes
    classes = []
    for sample in codedSampleSet:
    
        classifier = getClassifier(sample)
        if classifier not in classes:
            classes.append(classifier)
    classes.sort()
    
    # Now that we have all the classes, we process the outputs
    for sample in codedSampleSet:
        classifier = getClassifier(sample)
        classifiedSample = one_to_n(classes.index(classifier), len(classes))
        classifiedSampleSet.append(classifiedSample)

    # Build the dataset
    sampleSize = len(codedSampleSet[0])
    classifiedSampleSize = len(classifiedSampleSet[0])
    dataset = ClassificationDataSet(sampleSize, classifiedSampleSize)
    
    for i in range(len(classifiedSampleSet)):
        dataset.addSample(codedSampleSet[i], classifiedSampleSet[i])

    return dataset, classes
def makeGreedy1D(valNet, polNet, policyEvalStates, numAct, stepSize):

    from pybrain.datasets import SupervisedDataSet
    from pybrain.utilities import one_to_n

    supervised = SupervisedDataSet(polNet.indim, numAct)  # numInput, numOutputs
    
    # Try all the actions and see which has the best value    
    for state in policyEvalStates:
        vBest = -100000
        for action in range(numAct):            
            nextState = [ep.updateDist(state, stepSize, numAct, action)]
            vNext = valNet.activate(nextState)
            if (vNext > vBest):
                actBest = action
                vBest = vNext
        supervised.addSample(state, one_to_n(actBest, numAct))
    
    # Print supervised training set 
    # print(supervised)
    # input()
    
    # Train neural network
    from pybrain.supervised.trainers.rprop import RPropMinusTrainer                
    trainer = RPropMinusTrainer(polNet, dataset=supervised, verbose=False)  
    trainer.trainUntilConvergence(maxEpochs=50) # I'm OK with some interpolation here. It's the values we need to be exact on.
    return polNet
Example #10
    def learn(self):
        # convert reinforcement dataset to NFQ supervised dataset
        supervised = SupervisedDataSet(self.module.network.indim, 1)
        
        for seq in self.dataset:
            lastexperience = None
            for state, action, reward in seq:
                if not lastexperience:
                    # delay each experience in sequence by one
                    lastexperience = (state, action, reward)
                    continue

                # use experience from last timestep to do Q update
                (state_, action_, reward_) = lastexperience
                Q = self.module.getValue(state_, int(action_[0]))
                

                inp = r_[state_, one_to_n(int(action_[0]), self.module.numActions)]
                #input = r_[state_, action_]
                tgt = Q + self.alpha*(reward_ + self.gamma * max(self.module.getActionValues(state)) - Q)
                supervised.addSample(inp, tgt)

                # update last experience with current one
                lastexperience = (state, action, reward)

        # train module with backprop/rprop on dataset
        trainer = RPropMinusTrainer(self.module.network, dataset=supervised, batchlearning=True, verbose=True)
        trainer.trainUntilConvergence(maxEpochs=self.maxEpochs)
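The regression target above is the usual Q-learning update written as a supervised target, tgt = Q + alpha * (r + gamma * max_a' Q(s', a') - Q). A worked example with made-up numbers (alpha, gamma, the reward, and the Q-values are assumptions for illustration only):

alpha, gamma = 0.5, 0.9
Q_sa = 1.0                             # current estimate Q(s, a)
reward = 0.2
next_action_values = [0.5, 1.5, 0.8]   # Q(s', a') for each possible action
tgt = Q_sa + alpha * (reward + gamma * max(next_action_values) - Q_sa)
print(tgt)                             # 1.0 + 0.5 * (0.2 + 1.35 - 1.0) = 1.275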
Example #11
    def performAction(self, action):
        """ adding the option to use an action space of size 5, where 
        we either apply a torque (+/- 2.0), displace the butt (+/- 0.02),
        or do nothing (ala, Lagoudakis).
        """
        if self.five_actions:
            p = 2.0 * np.random.rand() - 1.0
            T = 0
            d = self._butt_disturbance_amplitude * p

            self.t += 1
            self.action_history += one_to_n(action[0], self.nactions)

            # Map the action integer to a torque and displacement.
            assert round(action[0]) == action[0]
            if action[0] == 0:
                T = -2
            elif action[0] == 1:
                T = 2
            elif action[0] == 2:
                d -= 0.02
            elif action[0] == 3:
                d += 0.02

            super(BalanceTask, self).performAction([T, d])

        else:
            BalanceTask.performAction(self, action)
    def performAction(self, action):
        """Incoming action is an int between 0 and 8. The action we provide to
        the environment consists of a torque T in {-2 N, 0, 2 N}, and a
        displacement d in {-.02 m, 0, 0.02 m}.

        """
        self.t += 1
        self.action_history += one_to_n(action[0], self.nactions)
        # Map the action integer to a torque and displacement.
        assert round(action[0]) == action[0]

        if self.only_steer:
            T = 2 * (action[0] / 4.0 - 1.0)
            d = 0.
        else:
            # -1 for action in {0, 1, 2}, 0 for action in {3, 4, 5}, 1 for
            # action in {6, 7, 8}
            torque_selector = np.floor(action[0] / 3.0) - 1.0
            T = 2 * torque_selector
            # Random number in [-1, 1]:
            p = 2.0 * np.random.rand() - 1.0
            # -1 for action in {0, 3, 6}, 0 for action in {1, 4, 7}, 1 for
            # action in {2, 5, 8}
            disp_selector = action[0] % 3 - 1.0
            d = 0.02 * disp_selector + self._butt_disturbance_amplitude * p
        super(BalanceTask, self).performAction([T, d])
 def _qValues(self, state):
     """ Return vector of q-values for all actions, 
     given the state(-features). """
     values = np.array([
         self.linQ.activate(r_[state, one_to_n(i, self.num_actions)])
         for i in range(self.num_actions)
     ])
     return values.flatten()
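A shape note on the snippet above: the Q-network's input dimension has to be num_features + num_actions, because the one-hot action block is appended to the state features. A hedged sketch of a compatible approximator (the layer sizes and the buildNetwork choice are illustrative, not the agent's actual construction):

from numpy import r_
from pybrain.tools.shortcuts import buildNetwork
from pybrain.utilities import one_to_n

num_features, num_actions = 4, 3
linQ = buildNetwork(num_features + num_actions, 1, bias=True)  # linear input-output net

state = [0.1, 0.2, 0.3, 0.4]
q_value = linQ.activate(r_[state, one_to_n(2, num_actions)])   # Q(state, action=2)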
Example #14
 def getObservation(self):
     (theta, thetad, omega, omegad, omegadd,
             xf, yf, xb, yb, psi) = self.env.getSensors()
     print(self.getBin(theta, thetad, omega, omegad, omegadd),self.outdim)
     state = one_to_n(self.getBin(theta, thetad, omega, omegad, omegadd),
             self.outdim)
     print(state[self.getBin(theta, thetad, omega, omegad, omegadd)])
     return state
Example #15
 def getObservation(self):
     (theta, thetad, omega, omegad, omegadd, xf, yf, xb, yb, psi,
      psig) = self.env.getSensors()
     # TODO not calling superclass to do normalization, etc.
     state = one_to_n(self.getBin(theta, thetad, omega, omegad, omegadd),
                      self.outdim)
     self.bin_count += state
     return state
 def getActionValues(self, state):
     """ Run forward activation for each of the actions and returns all values. """
     values = array([
         self.network.activate(r_[state,
                                  one_to_n(i, self.numActions)])
         for i in range(self.numActions)
     ])
     return values
Example #18
    def _updateEtraces(self, state, action, responsibility=1.):
        self._etraces *= self.rewardDiscount * self._lambda * responsibility
        # This assumes that state is an identity vector (like, from one_to_n).
        self._etraces[action] = clip(self._etraces[action] + state, -np.inf, 1.)
        # Set the trace for all other actions in this state to 0:
        action_bit = one_to_n(action, self.num_actions)

        for argstate in argwhere(state == 1):
            self._etraces[argwhere(action_bit != 1), argstate] = 0.
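A small numeric sketch of the replacing-traces bookkeeping above, with made-up sizes (3 actions, 4 binary state features) and made-up discount and lambda values; it only illustrates the decay, the clip at 1, and the zeroing of the other actions' traces for the active state feature.

import numpy as np
from numpy import clip, argwhere
from pybrain.utilities import one_to_n

num_actions, num_features = 3, 4
etraces = np.zeros((num_actions, num_features))
state = one_to_n(1, num_features)   # a single active state feature, as the comment above assumes
action = 2

etraces *= 0.99 * 0.95              # rewardDiscount * lambda (made-up values)
etraces[action] = clip(etraces[action] + state, -np.inf, 1.0)
action_bit = one_to_n(action, num_actions)
for argstate in argwhere(state == 1):
    etraces[argwhere(action_bit != 1), argstate] = 0.0
print(etraces)                      # only row 2 (the taken action) keeps a 1 in column 1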
Example #19
    def _updateEtraces(self, state, action, responsibility=1.0):
        self._etraces *= self.rewardDiscount * self._lambda * responsibility
        # This assumes that state is an identity vector (like, from one_to_n).
        self._etraces[action] = clip(self._etraces[action] + state, -np.inf, 1.0)
        # Set the trace for all other actions in this state to 0:
        action_bit = one_to_n(action, self.num_actions)

        for argstate in argwhere(state == 1):
            self._etraces[argwhere(action_bit != 1), argstate] = 0.0
    def _updateWeights(self, state, action, reward, next_state):
        """ state and next_state are vectors, action is an integer. """
        #update Q-value function approximator
        target = reward + self.rewardDiscount * max(self._qValues(next_state))
        inp = r_[asarray(state), one_to_n(action, self.num_actions)]
        self.trainer4LinQ = BackpropTrainer(self.linQ,
                                            weightdecay=self.weightdecay)
        ds = SupervisedDataSet(self.num_features + self.num_actions, 1)
        ds.addSample(inp, target)
        self.trainer4LinQ.trainOnDataset(ds)

        #update estimate of average policy
        self.averagePolicy.append(copy.deepcopy(self.linPolicy))
        if len(self.averagePolicy) > self.maxNumberofAverage:
            self.averagePolicy.pop(np.random.randint(len(self.averagePolicy)))

        #update policy function approximator
        delta = None
        cumRewardOfCurrentPolicy = 0.0
        values = self._qValues(state)
        pi = self._pi(state)
        for elem_action in range(self.num_actions):
            cumRewardOfCurrentPolicy += pi[elem_action] * values[elem_action]
        cumRewardOfAveragePolicy = 0.0
        api = self._piAvr(state)
        for elem_action in range(self.num_actions):
            cumRewardOfAveragePolicy += api[elem_action] * values[elem_action]
        if cumRewardOfCurrentPolicy > cumRewardOfAveragePolicy:
            delta = self.deltaW
        else:
            delta = self.deltaL

        #Update policy
        bestAction = r_argmax(self._qValues(state))
        target = one_to_n(bestAction, self.num_actions)
        inp = r_[asarray(state)]
        ds = SupervisedDataSet(self.num_features, self.num_actions)
        ds.addSample(inp, target)
        self.trainer4LinPolicy = BackpropTrainer(self.linPolicy,
                                                 learningrate=(delta),
                                                 weightdecay=self.weightdecay)
        self.trainer4LinPolicy.setData(ds)
        self.trainer4LinPolicy.trainEpochs(
            epochs=self.trainingEpochPerUpdateWight)
Example #22
 def getActionValues(self, state):
     #valid_moves = get_moves(state)
     #valid_moves = range(self.numActions)
     valid_moves = self.get_valid_moves()
     values = array([
         self.network.activate(r_[state,
                                  one_to_n(i, self.numActions)])
         if i in valid_moves else np.NINF for i in range(self.numActions)
     ])
     return values
 def _EncodeStateAndJointActionIntoInputVector(self, state, jointAct):
     index=int(np.dot(self.w4ActIndexing, jointAct))
     if index in self.actionVecDic:
         return np.r_[state, self.actionVecDic[index]]
     else:
         iVector=np.array([])
         for iAgent in range(len(jointAct)):
             iVector=np.r_[iVector, one_to_n(jointAct[iAgent], self.num_actions[iAgent])]
         self.actionVecDic[index]=iVector
         return np.r_[state, self.actionVecDic[index]]
Example #24
def nfq_action_value(network_fname, state=[0, 0, 0, 0, 0]):
    # TODO generalize away from 9 action values. Ask the network how many
    # discrete action values there are.
    n_actions = 9
    network = NetworkReader.readFrom(network_fname)
    actionvalues = np.empty(n_actions)
    for i_action in range(n_actions):
        network_input = r_[state, one_to_n(i_action, n_actions)]
        actionvalues[i_action] = network.activate(network_input)
    return actionvalues
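A hedged usage sketch for the helper above. The filename is a placeholder for a network saved earlier with PyBrain's NetworkWriter; it is not a file that ships with this code.

values = nfq_action_value('nfq_balance_network.xml')  # placeholder path
greedy_action = values.argmax()
print(values.shape, greedy_action)                     # (9,) and the index of the best action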
 def _updateWeights(self, state, action, reward, next_state):
     """ state and next_state are vectors, action is an integer. """
     #update Q-value function approximator
     target=reward + self.rewardDiscount * max(self._qValues(next_state))
     inp=r_[asarray(state), one_to_n(action, self.num_actions)]
     self.trainer4LinQ=BackpropTrainer(self.linQ,weightdecay=self.weightdecay)
     ds = SupervisedDataSet(self.num_features+self.num_actions,1)
     ds.addSample(inp, target)
     self.trainer4LinQ.trainOnDataset(ds)
     #Update policy
     bestAction=r_argmax(self._qValues(state))
     target= one_to_n(bestAction, self.num_actions)
     inp=r_[asarray(state)]
     ds = SupervisedDataSet(self.num_features,self.num_actions)
     ds.addSample(inp, target)
     self.trainer4LinPolicy=BackpropTrainer(self.linPolicy,
                                            learningrate=self.delta,
                                            weightdecay=self.weightdecay)
     self.trainer4LinPolicy.setData(ds)
     self.trainer4LinPolicy.trainEpochs(epochs=self.trainingEpochPerUpdateWight)
Example #26
 def _updateEtraces(self, state, action, responsibility=1.):
     self._etraces *= self.rewardDiscount * self._lambda * responsibility
     # TODO I think this assumes that state is an identity vector (like,
     # from one_to_n).
     self._etraces[action] = clip(self._etraces[action] + state, -np.inf, 1.)
     # Set the trace for all other actions in this state to 0:
     action_bit = one_to_n(action, self.num_actions)
     
     # Changed this line to allow for multiple (state == 1) occurrences.
     for argstate in argwhere(state == 1):
         self._etraces[argwhere(action_bit != 1), argstate] = 0.
 def _updateWeights(self, state, action, reward, next_state):
     """ state and next_state are vectors, action is an integer. """
     #update Q-value function approximator
     target = reward + self.rewardDiscount * max(self._qValues(next_state))
     inp = r_[asarray(state), one_to_n(action, self.num_actions)]
     self.trainer4LinQ = BackpropTrainer(self.linQ,
                                         weightdecay=self.weightdecay)
     ds = SupervisedDataSet(self.num_features + self.num_actions, 1)
     ds.addSample(inp, target)
     self.trainer4LinQ.trainOnDataset(ds)
     #Update policy
     bestAction = r_argmax(self._qValues(state))
     target = one_to_n(bestAction, self.num_actions)
     inp = r_[asarray(state)]
     ds = SupervisedDataSet(self.num_features, self.num_actions)
     ds.addSample(inp, target)
     self.trainer4LinPolicy = BackpropTrainer(self.linPolicy,
                                              learningrate=self.delta,
                                              weightdecay=self.weightdecay)
     self.trainer4LinPolicy.setData(ds)
     self.trainer4LinPolicy.trainEpochs(
         epochs=self.trainingEpochPerUpdateWight)
def evalSpeakerOnClassificationDataset(speaker, problems):
  """Evaluate a listener agent on the forced choice dataset.
     Here fcProblems is a ClassificationDataset, where problem[0] is concatenated features, utterance, 
     and problem[1] is target.
  """
  nCorrect = 0
  allActivations = []
  for features, goldUtterance in problems:
    activations = speaker.activate(features)
    allActivations.append(activations)
    activationMax = activations.argmax()
    utterance = one_to_n(activationMax, 3)
    if (utterance == goldUtterance).all():
      nCorrect +=1
  return nCorrect, allActivations 
Example #29
def loadCSV(filename,multiclass=True,outputs=1,separator=','):
    #read in all the lines
    f = open(filename).readlines()
    
    #start our datasets
    in_data = []
    out_data =[]
    
    #process the file
    for line in f:
        #remove whitespace and split according to separator character
        samples = line.strip(' \r\n').split(separator)
        
        #save input data
        in_data.append([float(i) for i in samples[:-outputs]])
        
        #save output data
        if multiclass:
            out_data.append(samples[-1])
        else:
            out_data.append([float(i) for i in samples[-outputs:]])
        
    
    processed_out_data = out_data
    
    #process multiclass encoding
    if multiclass:
        processed_out_data = []
        #get all the unique values for classes
        keys = []
        for d in out_data:
            if d not in keys:
                keys.append(d)
        keys.sort()
        #encode all data
        for d in out_data:
            processed_out_data.append(one_to_n(keys.index(d),len(keys)))
    
    #create the dataset
    dataset = SupervisedDataSet(len(in_data[0]),len(processed_out_data[0]))
    for i in xrange(len(out_data)):
        dataset.addSample(in_data[i],processed_out_data[i])
    
    #return the keys if we have
    if multiclass:
        return dataset,keys # a multiclass classifier
    else:
        return dataset
Example #30
def loadCSV(filename, multiclass=True, outputs=1, separator=','):
    #read in all the lines
    f = open(filename).readlines()

    #start our datasets
    in_data = []
    out_data = []

    #process the file
    for line in f:
        #remove whitespace and split according to separator character
        samples = line.strip(' \r\n').split(separator)

        #save input data
        in_data.append([float(i) for i in samples[:-outputs]])

        #save output data
        if multiclass:
            out_data.append(samples[-1])
        else:
            out_data.append([float(i) for i in samples[-outputs:]])

    processed_out_data = out_data

    #process multiclass encoding
    if multiclass:
        processed_out_data = []
        #get all the unique values for classes
        keys = []
        for d in out_data:
            if d not in keys:
                keys.append(d)
        keys.sort()
        #encode all data
        for d in out_data:
            processed_out_data.append(one_to_n(keys.index(d), len(keys)))

    #create the dataset
    dataset = SupervisedDataSet(len(in_data[0]), len(processed_out_data[0]))
    for i in xrange(len(out_data)):
        dataset.addSample(in_data[i], processed_out_data[i])

    #return the keys if we have
    if multiclass:
        return dataset, keys  # a multiclass classifier
    else:
        return dataset
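A hedged usage sketch for loadCSV. The filenames are placeholders, and the assumed layout is numeric feature columns with the class label in the final column (or numeric targets in the last outputs columns when multiclass=False).

# Placeholder filenames; the loader expects numeric features, with the class
# label in the last column (multiclass) or numeric targets in the last columns.
dataset, keys = loadCSV('flowers.csv', multiclass=True)
print(len(dataset), keys)   # number of samples and the sorted class labels

regression_ds = loadCSV('measurements.csv', multiclass=False, outputs=2)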
Example #31
    def learn(self):
        # convert reinforcement dataset to NFQ supervised dataset
        supervised = SupervisedDataSet(self.module.network.indim, 1)
        for seq in self.dataset[self.indexOfAgent]:
            lastexperience = None
            for state, action, reward in seq:
                if not lastexperience:
                    # delay each experience in sequence by one
                    lastexperience = (state, action, reward)
                    continue

                # use experience from last timestep to do Q update
                (state_, action_, reward_) = lastexperience

                Q = self.module.getValue(state_, action_[0])

                inp = r_[state_, one_to_n(action_[0], self.module.numActions)]
                if self.isFirstLerning:
                    tgt = reward_
                else:
                    tgt = Q + 0.5 * (reward_ + self.gamma * max(
                        self.module.getActionValues(state)) - Q)
                supervised.addSample(inp, tgt)

                #for reward normalization

                # update last experience with current one
                lastexperience = (state, action, reward)

        # Rebuilding networks is required in multiprocessing environments.
        params = self.module.network.params
        self.module.network = buildNetwork(
            self.module.indim + self.module.numActions,
            self.module.indim + self.module.numActions, 1)
        self.module.network._setParameters(params)

        # train module with backprop/rprop on dataset
        trainer = RPropMinusTrainer(self.module.network,
                                    dataset=supervised,
                                    batchlearning=True,
                                    verbose=False)  #, weightdecay=0.01)
        trainer.trainUntilConvergence(maxEpochs=self.maxEpochs)
        if self.isFirstLerning:
            self.isFirstLerning = False
def evalListenerOnClassificationDataset(listener, fcProblems):
  """Evaluate a listener agent on the forced choice dataset.
     Here fcProblems is a ClassificationDataSet, where problem[0] is the
     concatenated features and utterance, and problem[1] is the target.
     Returns the number correct, the raw model activations, and an array
     indicating whether each problem was answered correctly (1/0).
  """
  nCorrect = 0
  allActivations = []
  correct = []
  for problem in fcProblems:
    activations = listener.activate(problem[0])
    allActivations.append(activations)
    activationMax = activations.argmax()
    target = one_to_n(activationMax, 3)
    if (target == problem[1]).all():
      nCorrect += 1
      correct.append(1)
    else:
      correct.append(0) 
  return (nCorrect, allActivations, correct)
 def learn(self):
     # convert reinforcement dataset to NFQ supervised dataset
     supervised = SupervisedDataSet(self.module.network.indim, 1)
     for seq in self.dataset[self.indexOfAgent]:
         lastexperience = None
         for state, action, reward in seq:
             if not lastexperience:
                 # delay each experience in sequence by one
                 lastexperience = (state, action, reward)
                 continue
             
             # use experience from last timestep to do Q update
             (state_, action_, reward_) = lastexperience
             
             Q = self.module.getValue(state_, action_[0])
             
             inp = r_[state_, one_to_n(action_[0], self.module.numActions)]
             if self.isFirstLerning:
                 tgt = reward_
             else:
                 tgt = Q + 0.5*(reward_ + self.gamma * max(self.module.getActionValues(state)) - Q)
             supervised.addSample(inp, tgt)
             
             #for reward normalization
             
             # update last experience with current one
             lastexperience = (state, action, reward)
             
     # Rebuilding networks is required in multiprocessing environments.
     params=self.module.network.params
     self.module.network=buildNetwork(self.module.indim+self.module.numActions, 
                                      self.module.indim+self.module.numActions, 
                                      1)
     self.module.network._setParameters(params)
     
     # train module with backprop/rprop on dataset
     trainer = RPropMinusTrainer(self.module.network, dataset=supervised, batchlearning=True, verbose=False)#, weightdecay=0.01)
     trainer.trainUntilConvergence(maxEpochs=self.maxEpochs)
     if self.isFirstLerning:
         self.isFirstLerning=False
def setInitEst(optVal, optLocs, agent,maxEpochs):   
    
    # Create training data
    from pybrain.datasets import SupervisedDataSet
    from scipy import r_
    from pybrain.utilities import one_to_n    
    
    module = agent.module
    
    # Currently we have one input (location) and two outputs (corresponding to travelling towards or away)
    supervised = SupervisedDataSet(module.network.indim, 1)
    for loc in optLocs: # Go through all locations we are going to be optimistic about               
        for currAction in range(module.numActions):      
            inp = r_[loc, one_to_n(currAction, module.numActions)]
            tgt = optVal 
            supervised.addSample(inp,tgt)
    print(supervised)
    # Train
    from pybrain.supervised.trainers import BackpropTrainer
    trainer = BackpropTrainer(module.network, dataset=supervised, learningrate=0.005, batchlearning=True, verbose=False)
    trainer.trainUntilConvergence(maxEpochs = maxEpochs)

    return agent
import numpy as np
import pylab as pl

# NOTE: LinearFA_Agent is assumed here to come from PyBrain's linear-FA agent
# module; in the original project it may be provided by a local module instead.
from pybrain.rl.agents.linearfa import LinearFA_Agent
from pybrain.utilities import one_to_n

from environment import Environment
from tasks import LinearFATileCoding3456BalanceTask
from training import LinearFATraining
from learners import SARSALambda_LinFA_ReplacingTraces

task = LinearFATileCoding3456BalanceTask()
learner = SARSALambda_LinFA_ReplacingTraces(task.nactions, task.outdim)
learner._lambda = 0.95
task.discount = learner.rewardDiscount
agent = LinearFA_Agent(learner)
# The state has a huge number of dimensions, and the logging causes me to run
# out of memory. We needn't log, since learning is done online.
agent.logging = False

# TODO PyBrain says that the learning rate needs to decay, but I don't see that
# described in Randlov's paper.
# A higher number here means the learning rate decays slower.
learner.learningRateDecay = 100000
# NOTE increasing this number above from the default of 100 is what got the
# learning to actually happen, and fixed the bug/issue where the agent's
# performance stopped improving.

for i in np.arange(2000, 3800, 50):
    theta = np.loadtxt('/home/fitze/Documents/agent-bicycle/data/balance_sarsalambda_linfa_replacetrace_anneal_112217H56M04S/theta_%i.dat' % i)
    learner._theta = theta
    Q = learner._qValues(one_to_n(task.getBin(0, 0, 0, 0, 0), task.outdim))
    pl.plot(Q, label='%s' % i)
#pl.legend()
pl.show()
print(learner._greedyAction(one_to_n(task.getBin(0, 0, 0, 0, 0), task.outdim)))
Example #36
 def getActionValues(self, state):
     #valid_moves = get_moves(state)
     #valid_moves = range(self.numActions)
     valid_moves = self.get_valid_moves()
     values = array([self.network.activate(r_[state, one_to_n(i, self.numActions)]) if i in valid_moves else np.NINF for i in range(self.numActions)])
     return values
 def _qValues(self, state):
     """ Return vector of q-values for all actions, 
     given the state(-features). """
     values = np.array([self.linQ.activate(r_[state, one_to_n(i, self.num_actions)]) for i in range(self.num_actions)])
     return values.flatten()
Example #38
 def getActionValues(self, state):
     """ Run forward activation for each of the actions and returns all values. """
     values = array([self.network.activate(r_[state, one_to_n(i, self.numActions)]) for i in range(self.numActions)])
     return values
Example #39
 def getObservation(self):
     (theta, thetad, omega, omegad, omegadd, xf, yf, xb, yb, psi) = self.env.getSensors()
     state = one_to_n(self.getBin(theta, thetad, omega, omegad, omegadd), self.outdim)
     return state
Example #40
 def getValue(self, state, action):
     return self.network.activate(r_[state, one_to_n(action, self.numActions)])
Example #41
import numpy as np
import pylab as pl

# NOTE: LinearFA_Agent is assumed here to come from PyBrain's linear-FA agent
# module; in the original project it may be provided by a local module instead.
from pybrain.rl.agents.linearfa import LinearFA_Agent
from pybrain.utilities import one_to_n

from tasks import LinearFATileCoding3456BalanceTask
from training import LinearFATraining
from learners import SARSALambda_LinFA_ReplacingTraces

task = LinearFATileCoding3456BalanceTask()
learner = SARSALambda_LinFA_ReplacingTraces(task.nactions, task.outdim)
learner._lambda = 0.95
task.discount = learner.rewardDiscount
agent = LinearFA_Agent(learner)
# The state has a huge number of dimensions, and the logging causes me to run
# out of memory. We needn't log, since learning is done online.
agent.logging = False

# TODO PyBrain says that the learning rate needs to decay, but I don't see that
# described in Randlov's paper.
# A higher number here means the learning rate decays slower.
learner.learningRateDecay = 100000
# NOTE increasing this number above from the default of 100 is what got the
# learning to actually happen, and fixed the bug/issue where the agent's
# performance stopped improving.

for i in np.arange(2000, 3800, 50):
    theta = np.loadtxt(
        '/home/fitze/Documents/agent-bicycle/data/balance_sarsalambda_linfa_replacetrace_anneal_112217H56M04S/theta_%i.dat'
        % i)
    learner._theta = theta
    Q = learner._qValues(one_to_n(task.getBin(0, 0, 0, 0, 0), task.outdim))
    pl.plot(Q, label='%s' % i)
#pl.legend()
pl.show()
print(learner._greedyAction(one_to_n(task.getBin(0, 0, 0, 0, 0), task.outdim)))
Example #42
 def getValue(self, state, action):
     return self.network.activate(r_[state, one_to_n(action, self.numActions)])
    def _updateWeights(self, state, action, reward, next_state):
        """ state and next_state are vectors, action is an integer. """
        #update Q-value function approximator (estimate Q-value instead of V)
        BellmanErrors = np.zeros(self.num_agents)
        for iAgent in range(self.num_agents):
            vValC = self._qValues(state, iAgent)
            vValN = self._qValues(next_state, iAgent)
            vArgMaxValC = r_argmax(vValC)
            vArgMaxValN = r_argmax(vValN)
            BellmanError = (reward[iAgent] + self.rewardDiscount *
                            vValN[vArgMaxValN]) - vValC[vArgMaxValC]
            target = vValC[action[iAgent]] + self.cn * (
                (reward[iAgent] + self.rewardDiscount * vValN[vArgMaxValN]) -
                vValC[action[iAgent]])
            BellmanErrors[iAgent] = BellmanError
            inp = r_[state, one_to_n(action[iAgent], self.num_actions[iAgent])]
            ds = SupervisedDataSet(
                self.num_features + self.num_actions[iAgent], 1)
            ds.addSample(inp, target)
            BackpropTrainer(self.linQ[iAgent],
                            learningrate=1.0,
                            weightdecay=self.weightdecay).trainOnDataset(ds)

        #Estimate gradient
        grad = self.linGradient.activate(
            np.r_[asarray(state),
                  one_to_n(action[self.indexOfAgent], self.
                           num_actions[self.indexOfAgent])])[0]
        target = grad + self.cn * (np.sum(BellmanErrors, axis=0) - grad)
        inp = np.r_[asarray(state),
                    one_to_n(action[self.indexOfAgent], self.
                             num_actions[self.indexOfAgent])]
        ds = SupervisedDataSet(
            self.num_features + self.num_actions[self.indexOfAgent], 1)
        ds.addSample(inp, target)
        BackpropTrainer(self.linGradient,
                        learningrate=1.0,
                        weightdecay=self.weightdecay).trainOnDataset(ds)
        #         print str(self.indexOfAgent) + "-th agents optimization info.:"
        #         print "All Bellman errors: "+str(np.sum(BellmanErrors, axis=0))
        #         print "Self Bellman error: " + str(np.absolute(BellmanErrors[self.indexOfAgent]))
        #         print "Self Q-value: " + str(self._qValues(state,self.indexOfAgent))
        #Update policy
        c_pi = self._pi(state)
        #         print "Policy: " + str(c_pi)
        firstTerm = c_pi[action[self.indexOfAgent]]
        secondTerm = (np.sqrt(firstTerm) *
                      np.absolute(BellmanErrors[self.indexOfAgent]) *
                      self._sgn(-1.0 * self.linGradient.activate(
                          np.r_[asarray(state),
                                one_to_n(action[self.indexOfAgent], self.
                                         num_actions[self.indexOfAgent])])[0]))
        target = c_pi
        target[action[self.indexOfAgent]] = self._gamma(firstTerm -
                                                        self.bn * secondTerm)
        inp = r_[asarray(state)]
        ds = SupervisedDataSet(self.num_features,
                               self.num_actions[self.indexOfAgent])
        ds.addSample(inp, target)
        BackpropTrainer(self.linPolicy,
                        learningrate=1.0,
                        weightdecay=self.weightdecay).trainOnDataset(ds)

        #update bn, cn
        self.bn = self.bn * self.decayBn
        self.cn = self.cn * self.decayCn