def update_env_after_move(board: Board, move: Move, north_moved: bool) -> Side:
    if not KalahEnvironment.is_permitted(board, move, north_moved):
        raise Exception('Move not permitted')
    # Index 0 encodes the pie move: swap the sides and pass the turn.
    if move.index == 0:
        KalahEnvironment.swap_sides(board)
        return Side.opposite(move.side)
    # Pick up all seeds from the chosen hole.
    seeds_to_sow = board.get_seeds(move.side, move.index)
    board.set_seeds(move.side, move.index, 0)
    holes = board.holes
    # One full lap visits every hole on both sides plus the mover's own
    # store, but skips the opponent's store.
    receiving_holes = 2 * holes + 1
    rounds = seeds_to_sow // receiving_holes
    remaining_seeds = seeds_to_sow % receiving_holes
    # Apply the full laps in bulk ...
    if rounds != 0:
        for hole in range(1, holes + 1):
            board.add_seeds(Side.NORTH, hole, rounds)
            board.add_seeds(Side.SOUTH, hole, rounds)
        board.add_seeds_to_store(move.side, rounds)
    # ... then sow the remainder one seed at a time.
    sow_side = move.side
    sow_hole = move.index
    for _ in range(remaining_seeds):
        sow_hole += 1
        if sow_hole == 1:
            sow_side = Side.opposite(sow_side)
        if sow_hole > holes:
            if sow_side == move.side:
                # Drop a seed into the mover's own store; sow_hole == 0
                # marks the store as the last landing spot.
                sow_hole = 0
                board.add_seeds_to_store(sow_side, 1)
                continue
            else:
                sow_side = Side.opposite(sow_side)
                sow_hole = 1
        board.add_seeds(sow_side, sow_hole, 1)
    # Capture: the last seed landed in a previously empty hole on the
    # mover's side and the opposite hole is non-empty.
    if sow_side == move.side and sow_hole > 0 and board.get_seeds(sow_side, sow_hole) == 1 \
            and board.get_seeds_op(sow_side, sow_hole) > 0:
        board.add_seeds_to_store(move.side, 1 + board.get_seeds_op(sow_side, sow_hole))
        board.set_seeds(move.side, sow_hole, 0)
        board.set_seeds_op(move.side, sow_hole, 0)
    # If one side has run out of seeds, the other side sweeps its own
    # remaining seeds into its store.
    if KalahEnvironment.game_finished(board):
        finished_side = Side.NORTH if KalahEnvironment.side_has_no_seeds(board, Side.NORTH) else Side.SOUTH
        collecting_side = Side.opposite(finished_side)
        seeds = 0
        for hole in range(1, board.holes + 1):
            seeds += board.get_seeds(collecting_side, hole)
            board.set_seeds(collecting_side, hole, 0)
        board.add_seeds_to_store(collecting_side, seeds)
    # Landing in the mover's own store grants an extra turn, except when
    # SOUTH moves before NORTH has moved (so the pie move stays available).
    if sow_hole == 0 and (move.side == Side.NORTH or north_moved):
        return move.side
    return Side.opposite(move.side)
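# Worked example of the sowing decomposition used above (a standalone sketch,
# not part of the original module): with `holes` pits per side, a full lap
# passes through 2 * holes + 1 receiving holes, so whole laps can be applied
# in bulk and only the remainder sown seed by seed.
def demo_sowing_decomposition(holes: int = 7, seeds_to_sow: int = 17) -> None:
    receiving_holes = 2 * holes + 1                   # 15 on a 7-hole board
    rounds = seeds_to_sow // receiving_holes          # full laps, applied in bulk
    remaining_seeds = seeds_to_sow % receiving_holes  # sown one seed at a time
    print(f'{seeds_to_sow} seeds = {rounds} full lap(s) + {remaining_seeds} single seeds')


demo_sowing_decomposition()  # prints: 17 seeds = 1 full lap(s) + 2 single seeds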
def h1(state: KalahEnvironment, side: Side) -> float:
    my_mancala = state.board.get_seeds_in_store(side)
    opponent_mancala = state.board.get_seeds_in_store(Side.opposite(side))
    return my_mancala - opponent_mancala
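# A minimal sketch of how a store-difference heuristic such as h1 can drive a
# one-ply greedy player. It assumes a KalahEnvironment can be deep-copied and
# that get_valid_moves()/do_move() behave as in env_runner below; the helper
# name greedy_move is hypothetical and not part of the original code.
import copy


def greedy_move(env: KalahEnvironment, side: Side) -> Move:
    best_move, best_score = None, float('-inf')
    for move in env.get_valid_moves():
        lookahead = copy.deepcopy(env)  # simulate without mutating the real game
        lookahead.do_move(move)
        score = h1(lookahead, side)     # store difference from `side`'s point of view
        if score > best_score:
            best_move, best_score = move, score
    return best_move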
def set_seeds_op(self, side: Side, hole: int, seeds: int):
    # Writes to the hole directly opposite `hole`, i.e. hole
    # `holes + 1 - hole` on the other side of the board.
    if hole < 1 or hole > self.holes:
        raise ValueError('Hole number must be between 1 and number of holes')
    if seeds < 0:
        raise ValueError('Seed count must be non-negative')
    self.board[Side.get_index(Side.opposite(side))][self.holes + 1 - hole] = seeds
def get_winner(self) -> Optional[Side]:  # requires: from typing import Optional
    if not self.has_game_ended():
        raise Exception('Game has not ended')
    # The finished side is the one whose holes are empty; the other side
    # may still hold seeds in its holes as well as in its store.
    finished_side = Side.NORTH if KalahEnvironment.side_has_no_seeds(self.board, Side.NORTH) else Side.SOUTH
    other_side = Side.opposite(finished_side)
    other_side_seeds = self.board.get_seeds_in_store(other_side)
    for hole in range(1, self.board.holes + 1):
        other_side_seeds += self.board.get_seeds(other_side, hole)
    finished_side_seeds = self.board.get_seeds_in_store(finished_side)
    if finished_side_seeds > other_side_seeds:
        return finished_side
    elif finished_side_seeds < other_side_seeds:
        return other_side
    else:
        return None  # a draw
def env_runner(env, trainer_side, ac_net, opp_agent):
    """
    The logic of the thread runner. It plays one full game in `env`,
    sampling the actor-critic policy whenever it is the trainer's turn,
    and returns the collected Rollout once the game has ended.
    """
    rollout = Rollout()
    while not env.has_game_ended():
        # There is no choice if only one action is left, so taking that action
        # automatically is treated as a characteristic of the environment.
        # Empirically, this made the agent's learning more numerically stable.
        valid_moves = env.get_valid_moves()
        if len(valid_moves) == 1:
            env.do_move(valid_moves[0])
            continue
        if env.side_to_play == trainer_side:
            # If the agent is playing as NORTH, its input is a flipped board.
            flip_board = env.side_to_play == Side.NORTH
            state = env.board.get_board_image(flipped=flip_board)
            mask = env.get_mask()
            action, value = ac_net.sample(state, mask)
            # Because the pie move with index 0 is ignored, the action indices
            # must be shifted by one.
            reward = env.do_move(Move(trainer_side, action + 1))
            rollout.add(state, action, reward, value, mask)
        else:
            assert env.side_to_play == Side.opposite(trainer_side)
            action = opp_agent.produce_action(env.board.get_board_image(), env.get_mask(), env.side_to_play)
            env.do_move(Move(env.side_to_play, action + 1))
    # Replace the partial reward of the last move with the final reward of the game.
    final_reward = env.calculate_score_diff(trainer_side)
    rollout.update_last_reward(final_reward)
    if env.get_winner() == trainer_side:
        rollout.add_win()
    return rollout
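# A hedged sketch of how env_runner might be driven to collect training data.
# The no-argument KalahEnvironment() constructor and the ac_net/opp_agent
# objects are assumptions for illustration; the real training loop may wire
# these up differently (e.g. across several threads).
def collect_rollouts(num_games: int, ac_net, opp_agent):
    rollouts = []
    for _ in range(num_games):
        env = KalahEnvironment()  # fresh game per episode (constructor assumed)
        rollouts.append(env_runner(env, Side.SOUTH, ac_net, opp_agent))
    return rollouts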
def calculate_score_diff(self, side: Side):
    return self.board.get_seeds_in_store(side) - self.board.get_seeds_in_store(Side.opposite(side))
def do_move(self, move: Move):
    # Index 0 is the pie move: this player takes over the opponent's side.
    if move.index == 0:
        self.my_side = Side.opposite(self.my_side)
    # north_has_moved must reflect the state *before* this move, so it is
    # passed along first and only updated afterwards.
    self.side_to_play = KalahEnvironment.update_env_after_move(self.board, move, self.north_has_moved)
    if move.side == Side.NORTH:
        self.north_has_moved = True
    # env_runner expects do_move to return a per-step reward; returning the
    # post-move score difference from the mover's perspective is an assumption
    # consistent with the final reward computed there.
    return self.calculate_score_diff(move.side)
def get_seeds_op(self, side: Side, hole: int):
    if hole < 1 or hole > self.holes:
        raise ValueError('Hole number must be between 1 and number of holes')
    return self.board[Side.get_index(Side.opposite(side))][self.holes + 1 - hole]
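# Both *_op helpers address the hole directly opposite a given hole: on a
# board with `holes` pits per side, hole h faces hole holes + 1 - h on the
# other side. A quick standalone check of the mirroring (not part of the
# original module):
for hole in range(1, 8):
    print(f'hole {hole} faces hole {7 + 1 - hole}')  # 1<->7, 2<->6, ..., 7<->1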