def get_state_legal_actions(board: Board, side: Side, north_moved: bool) -> List[Move]:
    # If this is the first move of NORTH, then NORTH can use the pie rule action
    legal_moves = [] if north_moved or side == Side.SOUTH else [Move(side, 0)]
    for i in range(1, board.holes + 1):
        if board.board[side.get_index(side)][i] > 0:
            legal_moves.append(Move(side, i))
    return legal_moves
def _run_game(mcts, state):
    while True:
        msg = protocol.read_msg()
        try:
            msg_type = protocol.get_msg_type(msg)
            if msg_type == MsgType.START:
                first = protocol.interpret_start_msg(msg)
                if first:
                    move = mcts.search(state)
                    protocol.send_msg(protocol.create_move_msg(move.index))
                else:
                    state.our_side = Side.NORTH
            elif msg_type == MsgType.STATE:
                move_turn = protocol.interpret_state_msg(msg)
                state.perform_move(Move(state.side_to_move, move_turn.move))
                if not move_turn.end:
                    if move_turn.again:
                        move = mcts.search(state)
                        # pie rule; optimal move is to swap
                        if move.index == 0:
                            protocol.send_msg(protocol.create_swap_msg())
                        else:
                            protocol.send_msg(protocol.create_move_msg(move.index))
                logging.info("Next side: " + str(state.side_to_move))
                logging.info("The board:\n" + str(state.board))
            elif msg_type == MsgType.END:
                break
            else:
                logging.warning("Not sure what I got " + str(msg_type))
        except InvalidMessageException as e:
            logging.error(str(e))
def expand(self, node: AlphaNode):
    # Tactical workaround for the pie move
    if Move(node.state.side_to_move, 0) in node.unexplored_moves:
        node.unexplored_moves.remove(Move(node.state.side_to_move, 0))
    dist, value = self.network.evaluate_state(node.state)
    for index, prior in enumerate(dist):
        expansion_move = Move(node.state.side_to_move, index + 1)
        if node.state.is_legal(expansion_move):
            child_state = MancalaEnv.clone(node.state)
            child_state.perform_move(expansion_move)
            child_node = AlphaNode(state=child_state, prior=prior, move=expansion_move, parent=node)
            node.put_child(child_node)
    # go down the tree
    return node_utils.select_child_with_maximum_action_value(node)
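# A minimal, self-contained sketch of the PUCT-style selection that
# node_utils.select_child_with_maximum_action_value presumably implements. The exact
# formula and exploration constant live in node_utils and are not part of this listing,
# so the standard AlphaGo-style rule below is an assumption for illustration only.
import math
from dataclasses import dataclass
from typing import List

@dataclass
class _SketchChild:
    prior: float              # P(s, a) from the policy network
    visits: int = 0           # N(s, a)
    total_value: float = 0.0  # W(s, a)

def _puct_select(children: List[_SketchChild], c_puct: float = 1.4) -> _SketchChild:
    parent_visits = sum(child.visits for child in children)

    def score(child: _SketchChild) -> float:
        q = child.total_value / child.visits if child.visits > 0 else 0.0
        u = c_puct * child.prior * math.sqrt(parent_visits + 1) / (1 + child.visits)
        return q + u

    return max(children, key=score)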
def _run(self) -> Rollout:
    # Choose the side to play at random
    self.trainer_side = Side.SOUTH if random.randint(0, 1) == 0 else Side.NORTH
    # Reset the environment so everything is in a clean state.
    self.env.reset()
    rollout = Rollout()
    while not self.env.is_game_over():
        # There is no choice if only one action is left. Taking that action automatically must be
        # seen as a characteristic behaviour of the environment. This helped the learning of the
        # agent to be more numerically stable (this is an empirical observation).
        if len(self.env.get_legal_moves()) == 1:
            action_left_to_perform = self.env.get_legal_moves()[0]
            self.env.perform_move(action_left_to_perform)
            continue
        if self.env.side_to_move == self.trainer_side:
            # If the agent is playing as NORTH, its input would be a flipped board
            flip_board = self.env.side_to_move == Side.NORTH
            state = self.env.board.get_board_image(flipped=flip_board)
            mask = self.env.get_action_mask_with_no_pie()
            action, value = self.ac_net.sample(state, mask)
            # Because the pie move with index 0 is ignored, the action indexes must be shifted by one
            reward = self.env.perform_move(Move(self.trainer_side, action + 1))
            rollout.add(state, action, reward, value, mask)
        else:
            assert self.env.side_to_move == Side.opposite(self.trainer_side)
            action = self.opp_agent.produce_action(self.env.board.get_board_image(),
                                                   self.env.get_action_mask_with_no_pie(),
                                                   self.env.side_to_move)
            self.env.perform_move(Move(self.env.side_to_move, action + 1))
    # We replace the partial reward of the last move with the final reward of the game
    final_reward = self.env.compute_final_reward(self.trainer_side)
    rollout.update_last_reward(final_reward)
    if self.env.get_winner() == self.trainer_side:
        rollout.add_win()
    return rollout
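# A minimal sketch of turning a finished rollout's per-step rewards into discounted
# returns for an actor-critic style update. The real Rollout class and training step
# are not shown in this listing; the gamma value and the plain-list interface below
# are assumptions, used only to illustrate the idea of replacing the last partial
# reward with the final game reward before discounting backwards.
from typing import List

def _discounted_returns(rewards: List[float], final_reward: float, gamma: float = 0.99) -> List[float]:
    rewards = list(rewards)
    if rewards:
        rewards[-1] = final_reward  # mirrors rollout.update_last_reward(final_reward)
    returns, running = [], 0.0
    for r in reversed(rewards):
        running = r + gamma * running
        returns.append(running)
    return list(reversed(returns))

# Example: three shaped rewards, game ends with final reward +1.0
# _discounted_returns([0.1, 0.0, 0.2], final_reward=1.0)  ->  [~1.08, 0.99, 1.0]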
def test_move_has_required_effects(self):
    self.game.perform_move(Move(Side.SOUTH, 5))
    self.assertEqual(self.game.board.get_seeds(Side.SOUTH, 5), 0)
    self.assertEqual(self.game.board.get_seeds(Side.SOUTH, 6), 8)
    self.assertEqual(self.game.board.get_seeds(Side.SOUTH, 7), 8)
    self.assertEqual(self.game.board.get_seeds_in_store(Side.SOUTH), 1)
    self.assertEqual(self.game.board.get_seeds(Side.NORTH, 1), 8)
    self.assertEqual(self.game.board.get_seeds(Side.NORTH, 2), 8)
    self.assertEqual(self.game.board.get_seeds(Side.NORTH, 3), 8)
    self.assertEqual(self.game.board.get_seeds(Side.NORTH, 4), 8)
    self.game.perform_move(Move(Side.NORTH, 4))
    self.assertEqual(self.game.board.get_seeds(Side.NORTH, 4), 0)
    self.assertEqual(self.game.board.get_seeds(Side.NORTH, 5), 8)
    self.assertEqual(self.game.board.get_seeds(Side.NORTH, 6), 8)
    self.assertEqual(self.game.board.get_seeds(Side.NORTH, 7), 8)
    self.assertEqual(self.game.board.get_seeds_in_store(Side.NORTH), 1)
    self.assertEqual(self.game.board.get_seeds(Side.SOUTH, 1), 8)
    self.assertEqual(self.game.board.get_seeds(Side.SOUTH, 2), 8)
    self.assertEqual(self.game.board.get_seeds(Side.SOUTH, 3), 8)
def simulate(self, root: AlphaNode) -> MancalaEnv:
    """
    Runs a simulation (rollout) from the root to the end of the game.

    :param root: the starting node for the simulation
    :return: the terminal state reached by the rollout; the backup reward is derived from it
             by combining the value network's estimate with the game's winner
    """
    node: AlphaNode = AlphaNode.clone(root)
    while not node.is_terminal():
        move_index, _ = self.network.sample_state(node.state)
        move = Move(node.state.side_to_move, move_index + 1)
        node.state.perform_move(move)
    return node.state
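# A minimal sketch of the AlphaGo-style leaf evaluation the docstring above alludes to:
# the backup reward mixes the value network's estimate with the rollout outcome.
# The mixing weight (lam) and the exact reward convention used by the project are not
# shown in this listing, so both are assumptions for illustration only.
def _mixed_backup_value(network_value: float, rollout_winner_is_us: bool, lam: float = 0.5) -> float:
    outcome = 1.0 if rollout_winner_is_us else -1.0  # z in AlphaGo's notation
    return (1.0 - lam) * network_value + lam * outcome

# Example: the value head says +0.2 but the rollout was lost -> (0.5 * 0.2) + (0.5 * -1.0) = -0.4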
def pg_train_policy(self):
    flip_board = self.env.side_to_move == Side.NORTH
    state = self.env.board.get_board_image(flipped=flip_board)
    valid_actions_mask = self.env.get_actions_mask()
    action = self.agent.sample_action(state, valid_actions_mask)
    seeds_in_store_before = self.env.board.get_seeds_in_store(self.agent_side)
    self.env.perform_move(Move(self.agent_side, action))
    seeds_in_store_after = self.env.board.get_seeds_in_store(self.agent_side)
    reward = (seeds_in_store_after - seeds_in_store_before) / 10.0
    self.agent.store_rollout(state, action, reward, valid_actions_mask)
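# A self-contained sketch of masked action sampling, i.e. what agent.sample_action
# plausibly does with valid_actions_mask: illegal actions are excluded before the
# softmax and the action is drawn from the remaining distribution. The agent's real
# network is not part of this listing, so the sketch operates on a raw logits vector.
import numpy as np

def _sample_masked_action(logits: np.ndarray, mask: np.ndarray) -> int:
    masked = np.where(mask > 0, logits, -np.inf)   # illegal actions get zero probability
    shifted = masked - np.max(masked)              # numerically stable softmax
    exp = np.exp(shifted)
    probs = exp / np.sum(exp)
    return int(np.random.choice(len(probs), p=probs))

# Example: only holes 2 and 5 are legal
# _sample_masked_action(np.array([0.1, 1.2, 0.3, 0.0, 2.0]), np.array([0, 1, 0, 0, 1]))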
def simulate(self, root: Node) -> MancalaEnv:
    node = Node.clone(root)
    while not node.is_terminal():
        legal_moves = node.state.get_legal_moves()
        # Illegal moves keep a huge negative score so the softmax assigns them ~zero probability
        moves = [-1e80 for _ in range(node.state.board.holes + 1)]
        for move in legal_moves:
            moves[move.index] = evaluation.get_score(state=self._make_temp_child(node, move),
                                                     parent_side=node.state.side_to_move)
        moves_dist = np.asarray(moves, dtype=np.float64).flatten()
        exp = np.exp(moves_dist - np.max(moves_dist))
        dist = exp / np.sum(exp)
        move_to_make = int(np.random.choice(range(len(moves)), p=dist))
        node.state.perform_move(Move(side=node.state.side_to_move, index=move_to_make))
    return node.state
def _run_game(player: Player, state: MancalaEnv):
    our_agent_states = []
    their_agent_states = []
    both_agent_states = []
    our_side = Side.SOUTH
    while True:
        msg = protocol.read_msg()
        try:
            msg_type = protocol.get_msg_type(msg)
            if msg_type == MsgType.START:
                first = protocol.interpret_start_msg(msg)
                if first:
                    move = player.get_play(state)
                    protocol.send_msg(protocol.create_move_msg(move.index))
                else:
                    our_side = Side.NORTH
            elif msg_type == MsgType.STATE:
                move_turn = protocol.interpret_state_msg(msg)
                if move_turn.move == 0:
                    our_side = Side.opposite(our_side)
                move_to_perform = Move(state.side_to_move, move_turn.move)
                observed_state = ObservedState(state=state, action_taken=move_to_perform)
                both_agent_states.append(observed_state)
                if state.side_to_move == our_side:
                    our_agent_states.append(observed_state)
                else:
                    their_agent_states.append(observed_state)
                state.perform_move(move_to_perform)
                if not move_turn.end:
                    if move_turn.again:
                        move = player.get_play(state)
                        # pie rule; optimal move is to swap
                        if move.index == 0:
                            protocol.send_msg(protocol.create_swap_msg())
                        else:
                            protocol.send_msg(protocol.create_move_msg(move.index))
            elif msg_type == MsgType.END:
                args = parser.parse_args()
                run_id = '%06d' % args.run_number
                run_category = args.category
                _our_agent_file_path = _checkpoint_file_path + "/our-agent/" + run_category + run_id
                _their_agent_file_path = _checkpoint_file_path + "/their-agent/" + run_category + run_id
                _both_agent_file_path = _checkpoint_file_path + "/both-agent/" + run_category + run_id
                np.save(file=_our_agent_file_path, arr=np.array(our_agent_states))
                np.save(file=_their_agent_file_path, arr=np.array(their_agent_states))
                np.save(file=_both_agent_file_path, arr=np.array(both_agent_states))
                break
            else:
                print("Not sure what I got " + str(msg_type))
        except InvalidMessageException as _e:
            print(str(_e))
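# A minimal sketch of loading one of the saved observation arrays back for offline
# training or inspection. The concrete path depends on _checkpoint_file_path and the
# run_number/category CLI arguments, so the filename below is a placeholder; numpy
# appends ".npy" on save, and allow_pickle=True is required because the array stores
# ObservedState objects rather than plain numbers.
import numpy as np

def _load_observed_states(path: str):
    observed = np.load(path, allow_pickle=True)
    for obs in observed:
        # each element carries the cloned environment and the action taken in it
        yield obs.state, obs.action_taken

# Example (placeholder path):
# for state, action in _load_observed_states("checkpoints/our-agent/mcts000001.npy"):
#     ...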
def __init__(self, state: MancalaEnv, action_taken: Move):
    self.state = MancalaEnv.clone(state)
    self.action_taken = Move.clone(action_taken)
def test_side_to_move_doesnt_change(self):
    self.game.perform_move(Move(Side.SOUTH, 1))
    self.assertEqual(self.game.side_to_move, Side.NORTH)
def test_legal_moves_contains_all_moves(self):
    self.assertEqual(len(set(self.game.get_legal_moves())), 7)
    self.game.perform_move(Move(Side.SOUTH, 3))
    self.assertEqual(len(set(self.game.get_legal_moves())), 8)
def test_is_legal_move_returns_true_for_the_pie_rule2(self):
    env = MancalaEnv()
    env.perform_move(Move(Side.SOUTH, 5))
    self.assertTrue(env.is_legal(Move(Side.NORTH, 0)))
def test_is_legal_move_returns_true_for_the_pie_rule(self):
    board = self.game.board
    MancalaEnv.make_move(board, Move(Side.SOUTH, 6), False)
    self.assertTrue(MancalaEnv.is_legal_action(board, Move(Side.NORTH, 0), False))
def test_cloning_immutability(self):
    clone = MancalaEnv.clone(self.game)
    self.game.perform_move(Move(Side.SOUTH, 3))
    self.assertEqual(clone.board.get_seeds(Side.SOUTH, 3), 7)
    self.assertEqual(clone.side_to_move, Side.SOUTH)