def eval(agent:Agent, env: Checkers, color:str, n_games=100):
    """Play ``n_games`` games and return the fraction won by ``color``.

    ``agent`` plays the side named by ``color``; the other side is played by a
    freshly constructed ``Agent`` with ``epsilon=1`` and ``lr=0`` (presumably a
    purely exploratory, i.e. random, player -- confirm against ``Agent``).

    Every game starts from the state ``env`` is in when this function is
    called. The environment is left in whatever state the last game ended in.

    NOTE: the name shadows the builtin ``eval``; it is kept because callers
    elsewhere use it.

    Args:
        agent: The agent under evaluation. Its network is switched to eval
            mode for the duration of the call and back to train mode after.
        env: The checkers environment to play in.
        color: ``'black'`` or ``'white'`` -- the side ``agent`` plays.
        n_games: Number of games to play.

    Returns:
        ``score[color] / n_games``, the win rate of ``color``.
    """

    def _observe(turn):
        # Flattened board followed by a one-element turn flag (1 = black).
        board_tensor = torch.from_numpy(env.flat_board()).view(-1).float()
        encoded_turn = torch.tensor([1. if turn == 'black' else 0.])
        return torch.cat([board_tensor, encoded_turn])

    agent.net.eval()
    opponent = Agent(
        gamma=agent.gamma,
        epsilon=1,
        lr=0,
        input_dims=[8*8 + 1],
        batch_size=agent.batch_size,
        action_space=agent.action_space,
        eps_dec=0,
        max_mem_size=0
    )
    opponent.net.eval()

    initial_state = env.save_state()
    score = {'black': 0, 'white': 0}
    try:
        for _ in tqdm(range(n_games)):
            env.restore_state(initial_state)
            winner = None
            moves = torch.tensor(env.legal_moves())
            _, turn, _ = env.save_state()
            while not winner:
                # Pick the player and the observation from the *current* turn.
                # The turn returned by env.move() must be carried forward,
                # otherwise one player would make every move of the game.
                brain = agent if turn == color else opponent
                observation = _observe(turn)
                action = brain.choose_action(observation)
                # Resample until the chosen action is one of the legal moves.
                while not action_is_legal(action, moves):
                    action = brain.choose_action(observation)
                _, turn, _, moves, winner = env.move(*action.tolist())
                moves = torch.tensor(moves)
            score[winner] += 1
    finally:
        # Always hand the network back in training mode, even on error.
        agent.net.train()
    return score[color] / n_games
batch_size=args.batch_size, action_space=action_space, input_dims=[8 * 8 + 1], lr=args.lr, eps_dec=args.epsilon_decay), 'white': Agent(gamma=args.gamma, epsilon=args.epsilon, batch_size=args.batch_size, action_space=action_space, input_dims=[8 * 8 + 1], lr=args.lr, eps_dec=args.epsilon_decay) } env = Checkers() initial_state = env.save_state() eps_history = [] score = {'black': 0, 'white': 0} os.makedirs(args.checkpoints_dir, exist_ok=True) for i in range(args.games): print( f"episode={i}, score={score}, black_eps:{players['black'].epsilon}, white_eps:{players['white'].epsilon}" ) score = {'black': 0, 'white': 0} env.restore_state(initial_state) winner = None moves = torch.tensor(env.legal_moves()) board, turn, last_moved_piece = env.save_state() brain = players[turn]
class CheckersGame(Game):
    """Checkers game with a fixed, indexed action space and ego-centric views.

    Actions are ``(from_sq, to_sq)`` pairs: every simple move first, then
    every jump, addressed by their index in ``self.actions``. "Ego-centric"
    views are expressed from the point of view of the player to move: for
    black (the first player) squares and actions are reflected through the
    centre of the board, for white they equal the absolute view.
    """

    def __init__(self, history=None):
        """Build the action tables and replay ``history`` from the start.

        Args:
            history: Optional iterable of action indices to apply in order.
                Defaults to no actions. (A ``None`` sentinel is used instead
                of a mutable ``[]`` default.)
        """
        # Rollout statistics
        self.child_visits = []
        # Terminal values for the first player
        #  1 for win
        #  0 for draw
        # -1 for loss
        # None for incomplete
        self.game_value = None

        # XXX Conventions:
        # - Black player moves first
        # - Ego-centric views assume the king row are at the top, i.e. starts
        #   at the bottom (Second player has the same view as absolute)
        self.ch = Checkers()

        # Action space
        self.actions = []
        # Simple moves
        for from_sq in range(self.ch.n_positions):
            for to_sq in self.ch.neighbors[from_sq]:
                if to_sq is not None:
                    self.actions.append((from_sq, to_sq))
        assert 98 == len(self.actions), 'There should be 98 simple moves.'
        # Jumps
        for from_sq in range(self.ch.n_positions):
            row, col = self.ch.sq2pos(from_sq)
            # For each direction
            for drow, dcol in Checkers.dir2del:
                next_row, next_col = row + 2 * drow, col + 2 * dcol
                if 0 <= next_row < self.ch.size and 0 <= next_col < self.ch.size:
                    # Within bound
                    to_sq = self.ch.pos2sq(next_row, next_col)
                    self.actions.append((from_sq, to_sq))
        self.num_actions = len(self.actions)
        assert 98 + 72 == self.num_actions, 'There should be 98 simple moves and 72 jumps.'
        # Inverse dictionary
        self.action2ind = {
            action: ind for ind, action in enumerate(self.actions)
        }

        # Square mapping from absolute to first player's ego-centric
        # (reflect through the center)
        self.abs2ego_sq = {}
        for sq in range(self.ch.n_positions):
            row, col = self.ch.sq2pos(sq)
            re_row, re_col = self.ch.size - 1 - row, self.ch.size - 1 - col
            self.abs2ego_sq[sq] = self.ch.pos2sq(re_row, re_col)
        # Inverse
        self.ego2abs_sq = {re_sq: sq for sq, re_sq in self.abs2ego_sq.items()}

        # Move mapping from absolute to first player's ego-centric
        self.abs2ego_ac = {}
        for ac, (from_sq, to_sq) in enumerate(self.actions):
            ego_move = (self.abs2ego_sq[from_sq], self.abs2ego_sq[to_sq])
            self.abs2ego_ac[ac] = self.action2ind[ego_move]
        # Inverse
        self.ego2abs_ac = {
            ego_ac: ac for ac, ego_ac in self.abs2ego_ac.items()
        }

        # Fast forward to the last state by taking actions from history
        self.history = []
        if history is not None:
            for action in history:
                self.apply(action)

    def clone(self):
        """Return a new game whose board state equals this one's.

        NOTE(review): only the ``Checkers`` state is transferred. The clone
        starts with an empty ``history`` and ``child_visits`` and with
        ``game_value`` set to ``None`` -- confirm that callers rely on this.
        """
        game = CheckersGame()
        game.ch.restore_state(self.ch.save_state())
        return game

    def apply(self, action_index):
        """Play the action at ``action_index`` and record it in ``history``.

        Sets ``game_value`` to 1 when black wins and -1 when white wins; any
        other ``winner`` value leaves it untouched.
        """
        from_sq, to_sq = self.actions[action_index]
        _board, _turn, _last_moved, _next_moves, winner = self.ch.move(
            from_sq, to_sq)
        # Terminate when one player wins
        if winner == 'black':
            self.game_value = 1
        elif winner == 'white':
            self.game_value = -1
        self.history.append(action_index)

    def legal_actions(self):
        """Return the set of action indices that are legal right now."""
        return {self.action2ind[move] for move in self.ch.legal_moves()}

    def is_first_player_turn(self):
        """Return True when it is black's (the first player's) turn."""
        return self.ch.turn == 'black'

    def ego_board_representation(self):
        """Return a ``(size, size, 5)`` array seen by the player to move.

        Channels:
            0 my men
            1 my kings
            2 opponent's men
            3 opponent's kings
            4 my last moved piece

        On white's turn squares are used as-is (same as the absolute view);
        otherwise every square is mapped through ``abs2ego_sq`` first.
        """
        # QUESTION: try indicating the king row and skipping ego transform?
        rep = np.zeros((self.ch.size, self.ch.size, 5))
        if self.ch.turn == 'white':
            # Same as the absolute view
            me, other = 'white', 'black'

            def to_ego(sq):
                return sq
        else:
            # Need to invert the board
            me, other = 'black', 'white'

            def to_ego(sq):
                return self.abs2ego_sq[sq]

        planes = ((me, 'men'), (me, 'kings'), (other, 'men'), (other, 'kings'))
        for channel, (player, kind) in enumerate(planes):
            for sq in self.ch.board[player][kind]:
                row, col = self.ch.sq2pos(to_ego(sq))
                rep[row, col, channel] = 1
        last_moved = self.ch._last_moved_piece
        if last_moved is not None:
            row, col = self.ch.sq2pos(to_ego(last_moved))
            rep[row, col, 4] = 1
        return rep

    def ego_sample(self, state_index: int):
        """Build one ego-centric training sample for position ``state_index``.

        The position is rebuilt by replaying the first ``state_index`` actions
        of ``history``. ``game_value`` is expected to be set: when it is still
        ``None`` and the second player is to move, the negation raises
        ``TypeError``.

        Returns:
            ``(rep, ego_val, visits)``: the board representation, the final
            game value from the point of view of the player to move, and the
            recorded child visits indexed by ego-centric action.
        """
        # Fast forward
        game = CheckersGame(list(self.history[:state_index]))
        # Ego-centric views of the current player
        rep = game.ego_board_representation()
        first_player = game.is_first_player_turn()
        # Zero-sum game
        ego_val = self.game_value if first_player else (0 - self.game_value)
        # Ego-centric actions
        recorded = self.child_visits[state_index]
        if first_player:
            # Invert actions for the first player
            visits = np.zeros(self.num_actions)
            for ac in range(self.num_actions):
                visits[self.abs2ego_ac[ac]] = recorded[ac]
        else:
            visits = np.asarray(recorded)
        return rep, ego_val, visits

    def ego2abs_policy(self, is_first_player, ego_policy):
        """Convert a policy over ego-centric actions to absolute actions.

        For the second player ``ego_policy`` is returned unchanged (the same
        object); for the first player a new array is filled by mapping each
        index through ``ego2abs_ac``.
        """
        if not is_first_player:
            return ego_policy
        policy = np.zeros(self.num_actions)
        for ego_ac, pr in enumerate(ego_policy):
            policy[self.ego2abs_ac[ego_ac]] = pr
        return policy