def _boltzmannProbs(qvalues, temperature=1.):
    """Turn a vector of Q-values into a Boltzmann (softmax) action distribution."""
    if temperature == 0:
        # Zero temperature: put all probability mass on the greedy action.
        tmp = zeros(len(qvalues))
        tmp[r_argmax(qvalues)] = 1.
    else:
        tmp = qvalues / temperature
        tmp -= max(tmp)                      # shift so the largest exponent is 0
        tmp = exp(clip(tmp, -20, 0))         # clip to avoid numerical underflow
    return tmp / sum(tmp)
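# Minimal usage sketch (not from the original source). It assumes numpy is imported as
# np and that _boltzmannProbs and its dependencies (zeros, exp, clip, r_argmax) are in
# scope; the Q-values below are invented for the example.
q_example = np.array([1.0, 2.0, 0.5])                         # made-up Q-values for three actions
probs_example = _boltzmannProbs(q_example, temperature=0.5)   # softmax action distribution
sampled_action = np.random.choice(len(probs_example), p=probs_example)  # exploratory draw
greedy_probs = _boltzmannProbs(q_example, temperature=0)      # all mass on the argmax when T == 0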
def _updateWeights(self, state, action, reward, next_state):
    """state and next_state are vectors, action is an integer."""
    # Update the Q-value function approximator.
    target = reward + self.rewardDiscount * max(self._qValues(next_state))
    inp = r_[asarray(state), one_to_n(action, self.num_actions)]
    self.trainer4LinQ = BackpropTrainer(self.linQ,
                                        weightdecay=self.weightdecay)
    ds = SupervisedDataSet(self.num_features + self.num_actions, 1)
    ds.addSample(inp, target)
    self.trainer4LinQ.trainOnDataset(ds)

    # Update the estimate of the average policy.
    self.averagePolicy.append(copy.deepcopy(self.linPolicy))
    if len(self.averagePolicy) > self.maxNumberofAverage:
        self.averagePolicy.pop(np.random.randint(len(self.averagePolicy)))

    # Choose the policy learning rate with the WoLF test: compare the expected
    # value of the current policy with that of the average policy.
    cumRewardOfCurrentPolicy = 0.0
    values = self._qValues(state)
    pi = self._pi(state)
    for elem_action in range(self.num_actions):
        cumRewardOfCurrentPolicy += pi[elem_action] * values[elem_action]
    cumRewardOfAveragePolicy = 0.0
    api = self._piAvr(state)
    for elem_action in range(self.num_actions):
        cumRewardOfAveragePolicy += api[elem_action] * values[elem_action]
    if cumRewardOfCurrentPolicy > cumRewardOfAveragePolicy:
        delta = self.deltaW    # "winning": learn cautiously
    else:
        delta = self.deltaL    # "losing": learn fast

    # Update the policy function approximator towards the greedy action.
    bestAction = r_argmax(self._qValues(state))
    target = one_to_n(bestAction, self.num_actions)
    inp = r_[asarray(state)]
    ds = SupervisedDataSet(self.num_features, self.num_actions)
    ds.addSample(inp, target)
    self.trainer4LinPolicy = BackpropTrainer(self.linPolicy,
                                             learningrate=delta,
                                             weightdecay=self.weightdecay)
    self.trainer4LinPolicy.setData(ds)
    self.trainer4LinPolicy.trainEpochs(
        epochs=self.trainingEpochPerUpdateWight)
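# Toy illustration (not from the original source) of the WoLF win/lose test used above:
# the agent is "winning" when its current policy earns more, in expectation under the
# current Q-values, than its average policy, and then it learns with the smaller rate.
# All numbers below are invented.
q_s = np.array([1.0, 0.2, 0.4])          # Q(s, a) for each action
pi_s = np.array([0.6, 0.3, 0.1])         # current policy at s
pi_avg_s = np.array([0.34, 0.33, 0.33])  # average policy at s
value_current = float(np.dot(pi_s, q_s))      # 0.70
value_average = float(np.dot(pi_avg_s, q_s))  # ~0.54
delta_w, delta_l = 0.05, 0.2                  # deltaW < deltaL
delta_example = delta_w if value_current > value_average else delta_l  # -> deltaW ("winning")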
def _updateWeights(self, state, action, reward, next_state):
    """state and next_state are vectors, action is an integer."""
    # Update the Q-value function approximator.
    target = reward + self.rewardDiscount * max(self._qValues(next_state))
    inp = r_[asarray(state), one_to_n(action, self.num_actions)]
    self.trainer4LinQ = BackpropTrainer(self.linQ,
                                        weightdecay=self.weightdecay)
    ds = SupervisedDataSet(self.num_features + self.num_actions, 1)
    ds.addSample(inp, target)
    self.trainer4LinQ.trainOnDataset(ds)

    # Update the policy function approximator towards the greedy action.
    bestAction = r_argmax(self._qValues(state))
    target = one_to_n(bestAction, self.num_actions)
    inp = r_[asarray(state)]
    ds = SupervisedDataSet(self.num_features, self.num_actions)
    ds.addSample(inp, target)
    self.trainer4LinPolicy = BackpropTrainer(self.linPolicy,
                                             learningrate=self.delta,
                                             weightdecay=self.weightdecay)
    self.trainer4LinPolicy.setData(ds)
    self.trainer4LinPolicy.trainEpochs(
        epochs=self.trainingEpochPerUpdateWight)
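# Worked numeric example (not from the original source) of the Q-learning target used
# above: target = reward + rewardDiscount * max_a' Q(next_state, a'). Numbers invented.
reward_ex = 1.0
reward_discount_ex = 0.9
q_next_ex = np.array([0.5, 1.5, 0.0])                          # stand-in for _qValues(next_state)
target_ex = reward_ex + reward_discount_ex * q_next_ex.max()   # 1.0 + 0.9 * 1.5 = 2.35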
def _greedyAction(self, state):
    """Return the index of the action with the highest estimated Q-value."""
    return r_argmax(self._qValues(state))
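# The methods above assume a _qValues(state) helper that is not shown in this excerpt.
# The following is only a plausible sketch (an assumption, not the original code) for
# the single-agent case, where linQ maps the concatenation [state, one-hot(action)] to
# a scalar Q-value, matching the training inputs built in _updateWeights.
def _qValues_sketch(self, state):
    state = np.asarray(state)
    return np.array([
        self.linQ.activate(np.r_[state, one_to_n(a, self.num_actions)])[0]
        for a in range(self.num_actions)
    ])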
def _updateWeights(self, state, action, reward, next_state):
    """state and next_state are vectors, action is an integer."""
    # Update the Q-value function approximators (Q-values are estimated instead of V).
    BellmanErrors = np.zeros(self.num_agents)
    for iAgent in range(self.num_agents):
        vValC = self._qValues(state, iAgent)
        vValN = self._qValues(next_state, iAgent)
        vArgMaxValC = r_argmax(vValC)
        vArgMaxValN = r_argmax(vValN)
        BellmanError = ((reward[iAgent] + self.rewardDiscount * vValN[vArgMaxValN])
                        - vValC[vArgMaxValC])
        target = vValC[action[iAgent]] + self.cn * (
            (reward[iAgent] + self.rewardDiscount * vValN[vArgMaxValN])
            - vValC[action[iAgent]])
        BellmanErrors[iAgent] = BellmanError
        inp = r_[state, one_to_n(action[iAgent], self.num_actions[iAgent])]
        ds = SupervisedDataSet(self.num_features + self.num_actions[iAgent], 1)
        ds.addSample(inp, target)
        BackpropTrainer(self.linQ[iAgent],
                        learningrate=1.0,
                        weightdecay=self.weightdecay).trainOnDataset(ds)

    # Estimate the gradient.
    grad = self.linGradient.activate(
        np.r_[asarray(state),
              one_to_n(action[self.indexOfAgent],
                       self.num_actions[self.indexOfAgent])])[0]
    target = grad + self.cn * (np.sum(BellmanErrors, axis=0) - grad)
    inp = np.r_[asarray(state),
                one_to_n(action[self.indexOfAgent],
                         self.num_actions[self.indexOfAgent])]
    ds = SupervisedDataSet(self.num_features + self.num_actions[self.indexOfAgent], 1)
    ds.addSample(inp, target)
    BackpropTrainer(self.linGradient,
                    learningrate=1.0,
                    weightdecay=self.weightdecay).trainOnDataset(ds)
    # print str(self.indexOfAgent) + "-th agents optimization info.:"
    # print "All Bellman errors: " + str(np.sum(BellmanErrors, axis=0))
    # print "Self Bellman error: " + str(np.absolute(BellmanErrors[self.indexOfAgent]))
    # print "Self Q-value: " + str(self._qValues(state, self.indexOfAgent))

    # Update the policy.
    c_pi = self._pi(state)
    # print "Policy: " + str(c_pi)
    firstTerm = c_pi[action[self.indexOfAgent]]
    secondTerm = (np.sqrt(firstTerm)
                  * np.absolute(BellmanErrors[self.indexOfAgent])
                  * self._sgn(-1.0 * self.linGradient.activate(
                      np.r_[asarray(state),
                            one_to_n(action[self.indexOfAgent],
                                     self.num_actions[self.indexOfAgent])])[0]))
    target = c_pi
    target[action[self.indexOfAgent]] = self._gamma(firstTerm - self.bn * secondTerm)
    inp = r_[asarray(state)]
    ds = SupervisedDataSet(self.num_features, self.num_actions[self.indexOfAgent])
    ds.addSample(inp, target)
    BackpropTrainer(self.linPolicy,
                    learningrate=1.0,
                    weightdecay=self.weightdecay).trainOnDataset(ds)

    # Decay the step sizes bn and cn.
    self.bn = self.bn * self.decayBn
    self.cn = self.cn * self.decayCn
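# The policy update above relies on _sgn and _gamma helpers that are not shown in this
# excerpt. The sketches below are assumptions about their roles (a sign function and a
# projection of the updated probability back into a valid range), not the original code.
def _sgn_sketch(x):
    # Sign of x: -1.0, 0.0, or +1.0.
    return float(np.sign(x))

def _gamma_sketch(p, margin=0.0):
    # Clip an updated action probability back into [margin, 1 - margin].
    return float(np.clip(p, margin, 1.0 - margin))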