Example #1
    def build_policy(self, action, flip):
        """Build a one-hot policy vector for the given move label.

        If `flip` is set, the vector is mirrored so the policy is always
        expressed from Red's point of view.
        """
        labels_n = len(ActionLabelsRed)
        move_lookup = {move: i for i, move in enumerate(ActionLabelsRed)}
        policy = np.zeros(labels_n)

        # one-hot encoding of the chosen move
        policy[move_lookup[action]] = 1

        if flip:
            policy = flip_policy(policy)
        return policy
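Both methods rely on flip_policy, which is defined elsewhere in the project. A minimal sketch of the usual technique, assuming a hypothetical flip_move helper that mirrors a single move label (mirror_index and flip_move are illustrative names, not the project's actual API):

# Sketch only: flip_move(label) is assumed to return the mirrored move label.
mirror_index = [ActionLabelsRed.index(flip_move(m)) for m in ActionLabelsRed]

def flip_policy(pol):
    # entry i of the result is the probability assigned to move i's mirror image
    return np.asarray([pol[i] for i in mirror_index])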
    def action(self, env: CChessEnv) -> str:
        value = self.search_moves(env)  # run MCTS from the current position
        policy = self.calc_policy(env)  # policy is not flipped in `calc_policy`
        # the network always sees the board from Red's side, so mirror the
        # policy when it is Black's turn to move
        if not env.red_to_move:
            pol = flip_policy(policy)
        else:
            pol = policy
        # sample a move from the temperature-adjusted policy
        my_action = int(np.random.choice(
            range(self.labels_n),
            p=self.apply_temperature(pol, env.num_halfmoves)))
        # my_action = np.argmax(self.apply_temperature(pol, env.num_halfmoves))
        # no resign
        self.moves.append([env.observation, list(policy)])  # no flip needed for training
        return self.labels[my_action]
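apply_temperature is likewise defined outside this snippet. A minimal sketch following the common AlphaZero convention, written as a free function for illustration: raise each probability to 1/tau and renormalize, letting tau decay toward zero (greedy play) as env.num_halfmoves grows. The decay rate and cutoff below are assumed values, not the project's actual settings.

def apply_temperature(policy, turn, tau_decay_rate=0.99):
    """Sharpen the move distribution as the game progresses (sketch; constants assumed)."""
    tau = tau_decay_rate ** (turn + 1)
    if tau < 0.1:
        # near-zero temperature: put all probability mass on the best move
        ret = np.zeros(len(policy))
        ret[np.argmax(policy)] = 1.0
        return ret
    ret = np.power(policy, 1.0 / tau)
    return ret / np.sum(ret)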