import numpy as np

def build_policy(self, action, flip):
    labels_n = len(ActionLabelsRed)
    move_lookup = {move: i for i, move in enumerate(ActionLabelsRed)}
    # One-hot policy vector: all mass on the chosen action.
    policy = np.zeros(labels_n)
    policy[move_lookup[action]] = 1
    # Mirror the vector when the move was made from black's perspective.
    if flip:
        policy = flip_policy(policy)
    return policy
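For context, `flip_policy` permutes the probability vector so that it stays aligned with the red-perspective label list after a side swap. The following is a minimal sketch, assuming `flip_move()` is a hypothetical helper that mirrors a single move string to the other side of the board; it is not the repo's actual implementation.

# --- Sketch of flip_policy (assumed helper; flip_move is hypothetical) ---
_move_to_index = {m: i for i, m in enumerate(ActionLabelsRed)}
_flipped_index = [_move_to_index[flip_move(m)] for m in ActionLabelsRed]

def flip_policy(policy):
    # Entry i takes the probability of the mirrored move, keeping the
    # vector indexed by ActionLabelsRed after the side swap.
    return np.asarray([policy[i] for i in _flipped_index])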
def action(self, env: CChessEnv) -> str:
    # Run the MCTS search; `value` is unused here since resignation is disabled.
    value = self.search_moves(env)
    policy = self.calc_policy(env)  # `calc_policy` does not flip the policy
    # Flip to the mover's perspective when it is black's turn.
    if not env.red_to_move:
        pol = flip_policy(policy)
    else:
        pol = policy
    # Sample a move from the temperature-adjusted distribution (no resign).
    my_action = int(np.random.choice(range(self.labels_n),
                                     p=self.apply_temperature(pol, env.num_halfmoves)))
    # my_action = np.argmax(self.apply_temperature(pol, env.num_halfmoves))
    # Store the unflipped policy; no further flipping is needed at training time.
    self.moves.append([env.observation, list(policy)])
    return self.labels[my_action]
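`apply_temperature` presumably follows the usual AlphaZero move-selection schedule: the temperature tau decays with the half-move count, and once it is near zero the choice becomes greedy. A minimal sketch, assuming a `tau_decay_rate` field on `self.play_config` (an assumed config name):

def apply_temperature(self, policy, turn):
    # Anneal tau toward 0 as the game progresses
    # (tau_decay_rate is an assumed knob, e.g. 0.99).
    tau = np.power(self.play_config.tau_decay_rate, turn + 1)
    if tau < 0.1:
        # Effectively zero temperature: deterministic argmax play.
        ret = np.zeros(self.labels_n)
        ret[np.argmax(policy)] = 1.0
        return ret
    # Sharpen the distribution by exponent 1/tau and renormalize so it
    # stays a valid probability vector for np.random.choice.
    ret = np.power(policy, 1.0 / tau)
    return ret / np.sum(ret)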