Example #1
0
 def update_policy(self, s, a, game):
     """Update this agent's policy at state ``s``.

     The expected payoff of each of our actions is obtained by weighting
     Q[s] with the opponent's estimated best-response policy at ``s``.
     If ``self.a_policy == 'softmax'`` the new policy is a softmax over
     those payoffs; otherwise it is greedy (uniform over the argmax
     actions).  ``a`` and ``game`` are unused here but kept for
     interface compatibility with callers.
     """
     # Expected payoff per action.  NOTE(review): assumes Q[s] is 2-D
     # with axis 1 indexing opponent actions -- confirm against callers.
     expected = np.sum(np.multiply(self.Q[s], self.opponent_best_pi[s]), 1)
     if self.a_policy == 'softmax':
         self.pi[s] = utils.softmax(expected)
     else:
         # Greedy selection.  The original code left ties unnormalized
         # (probabilities summing > 1); divide by the number of
         # maximizers so pi[s] is a valid distribution.
         greedy = (expected == np.max(expected)).astype(np.double)
         self.pi[s] = greedy / greedy.sum()
     # Deep-copy snapshots so later in-place updates don't rewrite history.
     self.pi_history.append(deepcopy(self.pi))
     self.opponent_best_pi_history.append(deepcopy(self.opponent_best_pi))
     print('opponent pi of {}: {}'.format(self.id_, self.opponent_best_pi))
Example #2
0
    def update_policy(self, s, a, game):
        """Recompute this agent's policy for state ``s``.

        The new policy is a softmax over the expected payoff of each of
        our actions, obtained by projecting Q[s] onto the opponent's
        current policy estimate at ``s``.  ``a`` and ``game`` are
        accepted for interface compatibility but not used.
        """
        # Expected payoff per action under the opponent's policy model.
        payoff = np.dot(self.Q[s], self.opponent_pi[s])
        self.pi[s] = utils.softmax(payoff)

        # Snapshot (deep-copied) both policies for later analysis.
        self.pi_history.append(deepcopy(self.pi))
        self.opponent_pi_history.append(deepcopy(self.opponent_pi))
        print('opponent pi of {}: {}'.format(self.id_, self.opponent_pi[s]))