Example #1
def update_policy(self, s, a, game):
    # Policy hill-climbing step: shift probability mass toward the
    # action that is greedy w.r.t. Q[s], taking it evenly from the
    # other action_num - 1 actions, then renormalize and log the policy.
    delta = self.delta
    if a == np.argmax(self.Q[s]):
        self.pi[s][a] += delta
    else:
        self.pi[s][a] -= delta / (self.action_num - 1)
    StationaryAgent.normalize(self.pi[s])
    self.pi_history.append(deepcopy(self.pi))
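This is the policy hill-climbing (PHC) rule: nudge the taken action's probability up when it is greedy with respect to Q[s], down otherwise, then renormalize. A minimal standalone sketch of the same step, assuming a clip-and-renormalize in place of StationaryAgent.normalize (phc_step and the toy numbers are illustrative, not part of the original class):

import numpy as np

def phc_step(pi_s, q_s, a, delta):
    # Hypothetical standalone helper mirroring the update above.
    pi_s = pi_s.copy()
    if a == int(np.argmax(q_s)):
        pi_s[a] += delta                        # move mass toward the greedy action
    else:
        pi_s[a] -= delta / (len(pi_s) - 1)      # take mass from a non-greedy action
    pi_s = np.clip(pi_s, 0.0, None)             # stand-in for StationaryAgent.normalize
    return pi_s / pi_s.sum()

pi = np.array([0.25, 0.25, 0.25, 0.25])
q = np.array([0.1, 0.9, 0.3, 0.2])
print(phc_step(pi, q, a=1, delta=0.1))  # probability of action 1 increases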
Example #2
def update_policy(self, s, a, env):
    # Solve for the policy at state s via linear programming, trying
    # each configured solver in turn and keeping the first that works.
    self.initialize_solvers()
    for solver, lib in self.solvers:
        try:
            self.pi[s] = self.lp_solve(self.Q[s], solver, lib)
            StationaryAgent.normalize(self.pi[s])
            self.pi_history.append(deepcopy(self.pi))
        except Exception as e:
            print('optimization using {} failed: {}'.format(solver, e))
            continue  # fall through to the next solver
        else:
            break  # success: stop trying further solvers
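The loop relies on Python's for/try/except/else: continue moves on to the next solver after a failure, while the else clause runs only when no exception was raised, so break keeps the first successful solution. A condensed sketch of the same fallback pattern (first_successful and the toy solvers are hypothetical, not from the original code):

def first_successful(solvers, problem):
    # Return the result of the first solver that does not raise.
    for name, solve in solvers:
        try:
            result = solve(problem)
        except Exception as e:
            print('optimization using {} failed: {}'.format(name, e))
            continue
        else:
            return result
    raise RuntimeError('all solvers failed')

solvers = [
    ('broken', lambda p: 1 / 0),
    ('working', lambda p: sum(p)),
]
print(first_successful(solvers, [1, 2, 3]))  # prints the failure, then 6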
Example #3
def update_policy(self, s, a, game):
    # Mix the current policy with a target vector vi: the vertex of the
    # taken action (rate delta1) when that action is greedy w.r.t. Q[s],
    # otherwise the zero vector (rate delta2). With vi = 0 the uniform
    # shrinkage is undone by the normalization below.
    if a == np.argmax(self.Q[s]):
        delta = self.delta1
        vi = np.zeros(self.action_num)
        vi[a] = 1.
    else:
        delta = self.delta2
        vi = np.zeros(self.action_num)
        vi[a] = 0.  # explicit but redundant: vi is already all zeros

    self.pi[s] = (1 - delta) * self.pi[s] + delta * vi
    StationaryAgent.normalize(self.pi[s])
    self.pi_history.append(deepcopy(self.pi))
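Both branches reduce to the same convex-combination step, pi[s] = (1 - delta) * pi[s] + delta * vi; note that with vi = 0 the result renormalizes back to the original policy. A standalone sketch of the step (mix_toward is a hypothetical name):

import numpy as np

def mix_toward(pi_s, target, delta):
    # Convex combination of the current policy and a target vector,
    # followed by renormalization.
    new_pi = (1 - delta) * pi_s + delta * target
    return new_pi / new_pi.sum()

pi = np.array([0.5, 0.3, 0.2])
vertex = np.array([1.0, 0.0, 0.0])
print(mix_toward(pi, vertex, 0.2))       # mass shifts toward action 0
print(mix_toward(pi, np.zeros(3), 0.2))  # unchanged after renormalization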
Example #4
def update_policy(self, s, a, game):
    # Gradient ascent on the policy: compute each action's advantage
    # delta_hat_A over the state value V, damp it by
    # gamma * |delta_hat_A| * pi[s] (a PGA-APP-style correction), then
    # step by eta and renormalize.
    V = np.dot(self.pi[s], self.Q[s])
    delta_hat_A = np.zeros(self.action_num)
    delta_A = np.zeros(self.action_num)
    for ai in range(self.action_num):
        if self.pi[s][ai] == 1:
            delta_hat_A[ai] = self.Q[s][ai] - V
        else:
            delta_hat_A[ai] = (self.Q[s][ai] - V) / (1 - self.pi[s][ai])
        delta_A[ai] = delta_hat_A[ai] - self.gamma * abs(delta_hat_A[ai]) * self.pi[s][ai]
    self.pi[s] += self.eta * delta_A
    StationaryAgent.normalize(self.pi[s])
    self.pi_history.append(deepcopy(self.pi))
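The per-action loop can be written without explicit iteration. A vectorized sketch of the same step, with gradient_step and the clip-and-renormalize standing in for StationaryAgent.normalize as assumptions:

import numpy as np

def gradient_step(pi_s, q_s, eta, gamma):
    # Hypothetical vectorized version of the loop above.
    v = pi_s @ q_s
    denom = np.where(pi_s == 1.0, 1.0, 1.0 - pi_s)  # avoid division by zero
    adv = (q_s - v) / denom                         # delta_hat_A
    grad = adv - gamma * np.abs(adv) * pi_s         # delta_A
    new_pi = np.clip(pi_s + eta * grad, 0.0, None)
    return new_pi / new_pi.sum()

pi = np.array([0.25, 0.25, 0.25, 0.25])
q = np.array([0.1, 0.9, 0.3, 0.2])
print(gradient_step(pi, q, eta=0.05, gamma=0.9))  # mass shifts toward action 1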
Example #5
def act(self, s, exploration, game):
    # Epsilon-greedy action selection: explore uniformly at random with
    # probability self.episilon (the attribute is spelled this way in
    # the codebase), otherwise sample from the learned policy pi[s].
    if exploration and random.random() < self.episilon:
        return random.randint(0, self.action_num - 1)
    else:
        if self.verbose:
            # Iterate with a distinct variable so the state argument s
            # is not shadowed before the sampling call below.
            for state in self.Q.keys():
                print('{}--------------'.format(self.id_))
                print('Q of agent {}: state {}: {}'.format(
                    self.id_, state, str(self.Q[state])))
                print('pi of agent {}: state {}: {}'.format(
                    self.id_, state, self.pi[state]))
                print('{}--------------'.format(self.id_))
        return StationaryAgent.sample(self.pi[s])
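StationaryAgent.sample is not shown in these snippets; assuming it draws an action index from the distribution pi[s], the selection logic reduces to standard epsilon-greedy sampling. A minimal sketch (epsilon_greedy and the numpy-based draw are assumptions, not the original API):

import random
import numpy as np

def epsilon_greedy(pi_s, epsilon, exploration=True):
    # Uniform random action with probability epsilon, otherwise a draw
    # from the policy distribution pi_s.
    if exploration and random.random() < epsilon:
        return random.randint(0, len(pi_s) - 1)
    return int(np.random.choice(len(pi_s), p=pi_s))

pi = np.array([0.1, 0.7, 0.2])
print(epsilon_greedy(pi, epsilon=0.1))  # usually 1, the most probable action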