Example No. 1
 def _boltzmannProbs(qvalues, temperature=1.):
     if temperature == 0:
         tmp = zeros(len(qvalues))
         tmp[r_argmax(qvalues)] = 1.
     else:
         tmp = qvalues / temperature
         tmp -= max(tmp)
         tmp = exp(clip(tmp, -20, 0))
     return tmp / sum(tmp)
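A minimal standalone sketch of the same Boltzmann (softmax) action selection, assuming plain NumPy; np.argmax stands in for PyBrain's tie-breaking r_argmax, and the example q-values are made up:

import numpy as np

def boltzmann_probs(qvalues, temperature=1.0):
    # At temperature 0 the distribution collapses onto a greedy choice.
    q = np.asarray(qvalues, dtype=float)
    if temperature == 0:
        probs = np.zeros(len(q))
        probs[np.argmax(q)] = 1.0
        return probs
    t = q / temperature
    t -= t.max()                           # shift for numerical stability
    probs = np.exp(np.clip(t, -20, 0))     # clip to avoid underflow
    return probs / probs.sum()

# Draw an action from the resulting distribution.
qvalues = [0.1, 0.5, 0.2]
action = np.random.choice(len(qvalues), p=boltzmann_probs(qvalues, 0.5))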
Example No. 2
    def _updateWeights(self, state, action, reward, next_state):
        """ state and next_state are vectors, action is an integer. """
        #update Q-value function approximator
        target = reward + self.rewardDiscount * max(self._qValues(next_state))
        inp = r_[asarray(state), one_to_n(action, self.num_actions)]
        self.trainer4LinQ = BackpropTrainer(self.linQ,
                                            weightdecay=self.weightdecay)
        ds = SupervisedDataSet(self.num_features + self.num_actions, 1)
        ds.addSample(inp, target)
        self.trainer4LinQ.trainOnDataset(ds)

        #update estimate of average policy
        self.averagePolicy.append(copy.deepcopy(self.linPolicy))
        if len(self.averagePolicy) > self.maxNumberofAverage:
            self.averagePolicy.pop(np.random.randint(len(self.averagePolicy)))

        #update policy function approximator
        delta = None
        cumRewardOfCurrentPolicy = 0.0
        values = self._qValues(state)
        pi = self._pi(state)
        for elem_action in range(self.num_actions):
            # accumulate the expected Q-value under the current policy
            cumRewardOfCurrentPolicy += pi[elem_action] * values[elem_action]
        cumRewardOfAveragePolicy = 0.0
        api = self._piAvr(state)
        for elem_action in range(self.num_actions):
            # accumulate the expected Q-value under the average policy
            cumRewardOfAveragePolicy += api[elem_action] * values[elem_action]
        if cumRewardOfCurrentPolicy > cumRewardOfAveragePolicy:
            delta = self.deltaW
        else:
            delta = self.deltaL

        #Update policy
        bestAction = r_argmax(self._qValues(state))
        target = one_to_n(bestAction, self.num_actions)
        inp = r_[asarray(state)]
        ds = SupervisedDataSet(self.num_features, self.num_actions)
        ds.addSample(inp, target)
        self.trainer4LinPolicy = BackpropTrainer(self.linPolicy,
                                                 learningrate=(delta),
                                                 weightdecay=self.weightdecay)
        self.trainer4LinPolicy.setData(ds)
        self.trainer4LinPolicy.trainEpochs(
            epochs=self.trainingEpochPerUpdateWight)
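The block above selects between two learning rates in the WoLF ("win or learn fast") style: a small step when the current policy outperforms the running average policy, a larger one otherwise. A minimal sketch of just that rule, assuming plain NumPy arrays and made-up step sizes:

import numpy as np

def wolf_learning_rate(q_values, current_policy, average_policy,
                       delta_win=0.05, delta_lose=0.2):
    # Expected Q-value of each policy at this state.
    value_current = np.dot(current_policy, q_values)
    value_average = np.dot(average_policy, q_values)
    # Winning (beating the average policy) -> cautious step;
    # losing -> larger step so the policy adapts faster.
    return delta_win if value_current > value_average else delta_lose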
 def _updateWeights(self, state, action, reward, next_state):
     """ state and next_state are vectors, action is an integer. """
     #update Q-value function approximator
     target = reward + self.rewardDiscount * max(self._qValues(next_state))
     inp = r_[asarray(state), one_to_n(action, self.num_actions)]
     self.trainer4LinQ = BackpropTrainer(self.linQ,
                                         weightdecay=self.weightdecay)
     ds = SupervisedDataSet(self.num_features + self.num_actions, 1)
     ds.addSample(inp, target)
     self.trainer4LinQ.trainOnDataset(ds)
     #Update policy
     bestAction = r_argmax(self._qValues(state))
     target = one_to_n(bestAction, self.num_actions)
     inp = r_[asarray(state)]
     ds = SupervisedDataSet(self.num_features, self.num_actions)
     ds.addSample(inp, target)
     self.trainer4LinPolicy = BackpropTrainer(self.linPolicy,
                                              learningrate=self.delta,
                                              weightdecay=self.weightdecay)
     self.trainer4LinPolicy.setData(ds)
     self.trainer4LinPolicy.trainEpochs(
         epochs=self.trainingEpochPerUpdateWight)
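The methods above rely on instance attributes (self.linQ, self.linPolicy, self.num_features, ...) defined elsewhere in the class. A minimal, self-contained sketch of the same PyBrain single-sample update pattern, with illustrative sizes and a made-up target:

import numpy as np
from pybrain.datasets import SupervisedDataSet
from pybrain.supervised.trainers import BackpropTrainer
from pybrain.tools.shortcuts import buildNetwork

num_features, num_actions = 4, 3
linQ = buildNetwork(num_features + num_actions, 1)    # simple Q approximator

state = np.random.rand(num_features)
action = 1
one_hot = np.eye(num_actions)[action]                 # stands in for one_to_n
target = 0.5                                          # e.g. r + gamma * max_a Q(s', a)

ds = SupervisedDataSet(num_features + num_actions, 1)
ds.addSample(np.r_[state, one_hot], target)
BackpropTrainer(linQ, learningrate=0.1,
                weightdecay=0.0).trainOnDataset(ds)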
Example No. 7
 def _greedyAction(self, state):
     return r_argmax(self._qValues(state))
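A minimal epsilon-greedy wrapper around this greedy choice, assuming a 1-D NumPy array of Q-values; unlike PyBrain's r_argmax, plain np.argmax does not break ties at random:

import numpy as np

def epsilon_greedy_action(q_values, epsilon=0.1):
    if np.random.rand() < epsilon:
        return np.random.randint(len(q_values))    # explore
    return int(np.argmax(q_values))                # exploit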
Example No. 8
    def _updateWeights(self, state, action, reward, next_state):
        """ state and next_state are vectors, action is an integer. """
        #update Q-value function approximator (estimate Q-value instead of V)
        BellmanErrors = np.zeros(self.num_agents)
        for iAgent in range(self.num_agents):
            vValC = self._qValues(state, iAgent)
            vValN = self._qValues(next_state, iAgent)
            vArgMaxValC = r_argmax(vValC)
            vArgMaxValN = r_argmax(vValN)
            BellmanError = (reward[iAgent] + self.rewardDiscount *
                            vValN[vArgMaxValN]) - vValC[vArgMaxValC]
            target = vValC[action[iAgent]] + self.cn * (
                (reward[iAgent] + self.rewardDiscount * vValN[vArgMaxValN]) -
                vValC[action[iAgent]])
            BellmanErrors[iAgent] = BellmanError
            inp = r_[state, one_to_n(action[iAgent], self.num_actions[iAgent])]
            ds = SupervisedDataSet(
                self.num_features + self.num_actions[iAgent], 1)
            ds.addSample(inp, target)
            BackpropTrainer(self.linQ[iAgent],
                            learningrate=1.0,
                            weightdecay=self.weightdecay).trainOnDataset(ds)

        #Estimate gradient
        grad = self.linGradient.activate(
            np.r_[asarray(state),
                  one_to_n(action[self.indexOfAgent],
                           self.num_actions[self.indexOfAgent])])[0]
        target = grad + self.cn * (np.sum(BellmanErrors, axis=0) - grad)
        inp = np.r_[asarray(state),
                    one_to_n(action[self.indexOfAgent],
                             self.num_actions[self.indexOfAgent])]
        ds = SupervisedDataSet(
            self.num_features + self.num_actions[self.indexOfAgent], 1)
        ds.addSample(inp, target)
        BackpropTrainer(self.linGradient,
                        learningrate=1.0,
                        weightdecay=self.weightdecay).trainOnDataset(ds)
        #         print str(self.indexOfAgent) + "-th agents optimization info.:"
        #         print "All Bellman errors: "+str(np.sum(BellmanErrors, axis=0))
        #         print "Self Bellman error: " + str(np.absolute(BellmanErrors[self.indexOfAgent]))
        #         print "Self Q-value: " + str(self._qValues(state,self.indexOfAgent))
        #Update policy
        c_pi = self._pi(state)
        #         print "Policy: " + str(c_pi)
        firstTerm = c_pi[action[self.indexOfAgent]]
        secondTerm = (np.sqrt(firstTerm) *
                      np.absolute(BellmanErrors[self.indexOfAgent]) *
                      self._sgn(-1.0 * self.linGradient.activate(
                          np.r_[asarray(state),
                                one_to_n(action[self.indexOfAgent],
                                         self.num_actions[self.indexOfAgent])])[0]))
        target = c_pi
        target[action[self.indexOfAgent]] = self._gamma(firstTerm -
                                                        self.bn * secondTerm)
        inp = r_[asarray(state)]
        ds = SupervisedDataSet(self.num_features,
                               self.num_actions[self.indexOfAgent])
        ds.addSample(inp, target)
        BackpropTrainer(self.linPolicy,
                        learningrate=1.0,
                        weightdecay=self.weightdecay).trainOnDataset(ds)

        #update bn, cn
        self.bn = self.bn * self.decayBn
        self.cn = self.cn * self.decayCn
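The gradient and Q-value targets above follow a stochastic-approximation pattern: move the running estimate a fraction cn toward the latest sample, then shrink cn (and bn) after each update. A minimal sketch with scalar estimates and made-up numbers:

def soft_update(estimate, sample, step_size):
    # Move the running estimate part of the way toward the new sample.
    return estimate + step_size * (sample - estimate)

grad_estimate = 0.0
cn, decay_cn = 0.5, 0.99
for bellman_error_sum in [1.2, 0.8, 1.0]:     # illustrative samples
    grad_estimate = soft_update(grad_estimate, bellman_error_sum, cn)
    cn *= decay_cn                            # anneal the step size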