def _scoring_store_diff(state: MancalaEnv, parent_side: Side) -> int:
    """Return how far ``parent_side`` leads in store seeds.

    The value is the number of seeds in ``parent_side``'s store minus the
    number in the opposite side's store, so it is negative when
    ``parent_side`` is behind.
    """
    own_store = state.board.get_seeds_in_store(parent_side)
    rival_store = state.board.get_seeds_in_store(Side.opposite(parent_side))
    return own_store - rival_store
def set_seeds_op(self, side: Side, hole: int, seeds: int):
    """Set the seed count of the hole mirrored from ``hole`` on the other side.

    The target is position ``self.holes + 1 - hole`` in the row belonging
    to the side opposite ``side``.

    :param side: the side from whose point of view ``hole`` is numbered.
    :param hole: hole number, from 1 to ``self.holes`` inclusive.
    :param seeds: the new, non-negative seed count.
    :raises ValueError: if ``hole`` is out of range or ``seeds`` is negative.
    """
    if not 1 <= hole <= self.holes:
        raise ValueError(
            'Hole number must be between 1 and number of holes')
    if seeds < 0:
        raise ValueError('There has to be a non-negative number of seeds')

    opposite_row = Side.get_index(Side.opposite(side))
    mirrored_hole = self.holes + 1 - hole
    self.board[opposite_row][mirrored_hole] = seeds
def perform_move(self, move: Move) -> float:
    """Apply ``move`` to the environment and return its partial reward.

    The reward is the change in the number of seeds in the store of
    ``move.side`` caused by the move, divided by 100, so it is a float.

    :param move: the move to apply; index 0 denotes the pie (swap) move.
    :return: ``(store_after - store_before) / 100.0`` for ``move.side``.
    :raises ValueError: propagated from ``MancalaEnv.make_move`` when the
        move is illegal. In that case ``our_side``, ``side_to_move`` and
        ``north_moved`` are left untouched.
    """
    seeds_in_store_before = self.board.get_seeds_in_store(move.side)

    # Apply the move before updating any bookkeeping: make_move rejects an
    # illegal move by raising, and the flags below must only change once the
    # move has actually been played.
    self.side_to_move = MancalaEnv.make_move(self.board, move,
                                             self.north_moved)
    if move.index == 0:  # pie move
        self.our_side = Side.opposite(self.our_side)
    if move.side == Side.NORTH:
        self.north_moved = True

    # NOTE(review): for a pie move make_move calls switch_sides on the board,
    # so this reading is taken on the switched board - confirm the resulting
    # reward is what the learner should see.
    seeds_in_store_after = self.board.get_seeds_in_store(move.side)
    # Return a partial reward proportional to the number of captured seeds.
    return (seeds_in_store_after - seeds_in_store_before) / 100.0
def _run(self) -> Rollout:
    """Play one game from reset to game over and record the trainer's moves.

    The trainer is assigned SOUTH or NORTH at random. Turns with a single
    legal move are played automatically and are not recorded. On the
    trainer's turns the actor-critic network samples an action and the
    transition is added to the rollout; on the other turns the opponent
    agent chooses the action.

    :return: the rollout of the trainer's transitions, with the last reward
        replaced by the final game reward and a win recorded when the
        trainer's side won.
    """
    # Choose randomly the side to play
    self.trainer_side = (Side.SOUTH if random.randint(0, 1) == 0
                         else Side.NORTH)
    # Reset the environment so everything is in a clean state.
    self.env.reset()
    rollout = Rollout()

    while not self.env.is_game_over():
        # There is no choice if only one action is left. Taking that action
        # automatically must be seen as a characteristic behaviour of the
        # environment. This helped the learning of the agent to be more
        # numerically stable (this is an empirical observation).
        # The legal moves are queried once per turn and reused below.
        legal_moves = self.env.get_legal_moves()
        if len(legal_moves) == 1:
            self.env.perform_move(legal_moves[0])
            continue

        if self.env.side_to_move == self.trainer_side:
            # If the agent is playing as NORTH, its input is a flipped board
            flip_board = self.env.side_to_move == Side.NORTH
            state = self.env.board.get_board_image(flipped=flip_board)
            mask = self.env.get_action_mask_with_no_pie()
            action, value = self.ac_net.sample(state, mask)
            # Because the pie move with index 0 is ignored, the action
            # indexes must be shifted by one
            reward = self.env.perform_move(
                Move(self.trainer_side, action + 1))
            rollout.add(state, action, reward, value, mask)
        else:
            assert self.env.side_to_move == Side.opposite(
                self.trainer_side)
            action = self.opp_agent.produce_action(
                self.env.board.get_board_image(),
                self.env.get_action_mask_with_no_pie(),
                self.env.side_to_move)
            self.env.perform_move(Move(self.env.side_to_move, action + 1))

    # We replace the partial reward of the last move with the final reward
    # of the game
    final_reward = self.env.compute_final_reward(self.trainer_side)
    rollout.update_last_reward(final_reward)
    if self.env.get_winner() == self.trainer_side:
        rollout.add_win()
    return rollout
def get_winner(self) -> 'Side | None':
    """Determine the winner of a finished game.

    The side whose holes are empty is scored by its store alone; the other
    side is scored by its store plus every seed still in its holes.

    :return: The winning Side of the game or None if there is a tie.
    :raises ValueError: if the game is not over yet.
    """
    # The annotation is a string on purpose: the expression `Side or None`
    # evaluates to just `Side` and so never documented the None result.
    if not self.is_game_over():
        raise ValueError(
            'This method should be called only when the game is over')

    board = self.board
    finished_side = Side.NORTH if MancalaEnv.holes_empty(
        board, Side.NORTH) else Side.SOUTH
    not_finished_side = Side.opposite(finished_side)

    # Seeds left in the holes of the side that can still move count for it.
    not_finished_side_seeds = board.get_seeds_in_store(not_finished_side)
    not_finished_side_seeds += sum(
        board.get_seeds(not_finished_side, hole)
        for hole in range(1, board.holes + 1))
    finished_side_seeds = board.get_seeds_in_store(finished_side)

    if finished_side_seeds > not_finished_side_seeds:
        return finished_side
    if finished_side_seeds < not_finished_side_seeds:
        return not_finished_side
    return None
def get_seeds_op(self, side: Side, hole: int):
    """Return the seed count of the hole mirrored from ``hole`` on the other side.

    The value is read from position ``self.holes + 1 - hole`` in the row
    belonging to the side opposite ``side``.

    :param side: the side from whose point of view ``hole`` is numbered.
    :param hole: hole number, from 1 to ``self.holes`` inclusive.
    :raises ValueError: if ``hole`` is out of range.
    """
    if not 1 <= hole <= self.holes:
        raise ValueError(
            'Hole number must be between 1 and number of holes')

    facing_row = self.board[Side.get_index(Side.opposite(side))]
    return facing_row[self.holes + 1 - hole]
def test_side_opposite_is_correct(self):
    """``Side.opposite`` must map NORTH to SOUTH and SOUTH to NORTH."""
    expected_opposites = (
        (Side.NORTH, Side.SOUTH),
        (Side.SOUTH, Side.NORTH),
    )
    for side, expected in expected_opposites:
        self.assertEqual(Side.opposite(side), expected)
def _run_game(player: Player, state: MancalaEnv):
    """Play one game over the message protocol and save the observed moves.

    Messages are read in a loop until an END message arrives:

    * START: if we move first, ask ``player`` for a move and send it;
      otherwise we are playing as NORTH.
    * STATE: record the reported move (in the list for all moves and in
      the list for our or their moves), apply it to ``state``, and reply
      with our next move when the game goes on and it is our turn again.
    * END: save the three recorded lists with ``np.save`` under the
      checkpoint path and stop.

    Invalid messages are printed and the loop keeps going.
    """
    our_side = Side.SOUTH
    recorded_ours, recorded_theirs, recorded_all = [], [], []

    while True:
        msg = protocol.read_msg()
        try:
            msg_type = protocol.get_msg_type(msg)

            if msg_type == MsgType.START:
                if protocol.interpret_start_msg(msg):
                    opening = player.get_play(state)
                    protocol.send_msg(protocol.create_move_msg(opening.index))
                else:
                    our_side = Side.NORTH

            elif msg_type == MsgType.STATE:
                move_turn = protocol.interpret_state_msg(msg)
                # A reported move of 0 is the swap, so our side changes.
                if move_turn.move == 0:
                    our_side = Side.opposite(our_side)

                applied = Move(state.side_to_move, move_turn.move)
                # NOTE(review): every observation receives the same `state`
                # object, which perform_move mutates below - confirm that
                # ObservedState takes its own copy.
                observation = ObservedState(state=state, action_taken=applied)
                recorded_all.append(observation)
                if state.side_to_move == our_side:
                    recorded_ours.append(observation)
                else:
                    recorded_theirs.append(observation)
                state.perform_move(applied)

                if not move_turn.end and move_turn.again:
                    reply = player.get_play(state)
                    # pie rule; optimal move is to swap
                    if reply.index == 0:
                        outgoing = protocol.create_swap_msg()
                    else:
                        outgoing = protocol.create_move_msg(reply.index)
                    protocol.send_msg(outgoing)

            elif msg_type == MsgType.END:
                args = parser.parse_args()
                run_id = '%06d' % args.run_number
                run_category = args.category
                # Build all three target paths first, then write the files.
                dumps = (
                    ("/our-agent/", recorded_ours),
                    ("/their-agent/", recorded_theirs),
                    ("/both-agent/", recorded_all),
                )
                targets = [
                    (_checkpoint_file_path + folder + run_category + run_id,
                     states)
                    for folder, states in dumps
                ]
                for target, states in targets:
                    np.save(file=target, arr=np.array(states))
                break

            else:
                print("Not sure what I got " + str(msg_type))
        except InvalidMessageException as err:
            print(str(err))
def compute_final_reward(self, side: Side):
    """Return the store-seed lead of ``side`` in the current state.

    The reward is the number of seeds in ``side``'s store minus the number
    in the opposite side's store.
    """
    own_store = self.board.get_seeds_in_store(side)
    rival_store = self.board.get_seeds_in_store(Side.opposite(side))
    return own_store - rival_store
def make_move(board: Board, move: Move, north_moved: bool) -> Side:
    """Apply ``move`` to ``board`` in place and return the side to move next.

    A move with index 0 is the pie move: the board sides are switched and
    the turn passes to the other side. Any other move sows the seeds of
    the chosen hole, captures when the last seed lands in an empty hole of
    the mover that faces a non-empty hole, and, if the game is over,
    sweeps the remaining seeds into the store of the side that still had
    seeds in its holes.

    :param board: the board to modify.
    :param move: the move to apply (its ``side`` and ``index`` are used).
    :param north_moved: whether NORTH has already made a move; passed to
        the legality check and used to decide whether an extra turn is
        granted.
    :return: ``move.side`` if the mover gets another turn, otherwise the
        opposite side.
    :raises ValueError: if ``MancalaEnv.is_legal_action`` rejects the move.
    """
    if not MancalaEnv.is_legal_action(board, move, north_moved):
        raise ValueError(
            'Move is illegal: Board: \n {} \n Move:\n {}/{} \n {}'.format(
                board, move.index, move.side, north_moved))
    # This is a pie move
    if move.index == 0:
        MancalaEnv.switch_sides(board)
        return Side.opposite(move.side)
    # Pick up every seed from the chosen hole.
    seeds_to_sow = board.get_seeds(move.side, move.index)
    board.set_seeds(move.side, move.index, 0)
    holes = board.holes
    # Place seeds in all holes excepting the opponent's store
    receiving_holes = 2 * holes + 1
    # Number of complete laps around the board the seeds are enough for
    rounds = seeds_to_sow // receiving_holes
    # Seeds remaining after all the rounds
    remaining_seeds = seeds_to_sow % receiving_holes
    # Sow the seeds for the full rounds: each complete lap puts one seed in
    # every hole of both sides and one in the mover's store.
    if rounds != 0:
        for hole in range(1, holes + 1):
            board.add_seeds(Side.NORTH, hole, rounds)
            board.add_seeds(Side.SOUTH, hole, rounds)
        board.add_seeds_to_store(move.side, rounds)
    # Sow the remaining seeds one at a time, starting after the chosen hole.
    # sow_hole == 0 is used as a marker meaning "the last seed went into the
    # mover's store".
    sow_side = move.side
    sow_hole = move.index
    for _ in range(remaining_seeds):
        sow_hole += 1
        # Stepping from the store marker (0) to hole 1 crosses to the other
        # side of the board.
        if sow_hole == 1:
            sow_side = Side.opposite(sow_side)
        if sow_hole > holes:
            if sow_side == move.side:
                # Past the mover's last hole: the seed goes into their store.
                sow_hole = 0
                board.add_seeds_to_store(sow_side, 1)
                continue
            else:
                # Past the opponent's last hole: skip the opponent's store
                # and carry on from hole 1 of the other side.
                sow_side = Side.opposite(sow_side)
                sow_hole = 1
        board.add_seeds(sow_side, sow_hole, 1)
    # Capture the opponent's seeds from the opposite hole if the last seed
    # is placed in an empty hole and there are seeds in the opposite hole
    if sow_side == move.side and sow_hole > 0 \
            and board.get_seeds(sow_side, sow_hole) == 1 \
            and board.get_seeds_op(sow_side, sow_hole) > 0:
        # The capturing seed itself (the "1 +") goes to the store as well.
        board.add_seeds_to_store(
            move.side, 1 + board.get_seeds_op(sow_side, sow_hole))
        board.set_seeds(move.side, sow_hole, 0)
        board.set_seeds_op(move.side, sow_hole, 0)
    # If the game is over, collect the seeds not in the store and put them there
    game_over = MancalaEnv.game_over(board)
    if game_over:
        # NORTH is checked first; SOUTH is assumed to be the finished side
        # whenever NORTH's holes are not empty.
        finished_side = Side.NORTH if MancalaEnv.holes_empty(
            board, Side.NORTH) else Side.SOUTH
        seeds = 0
        collecting_side = Side.opposite(finished_side)
        for hole in range(1, board.holes + 1):
            seeds += board.get_seeds(collecting_side, hole)
            board.set_seeds(collecting_side, hole, 0)
        board.add_seeds_to_store(collecting_side, seeds)
    # Return the side which is next to move. The extra turn is granted only
    # when the mover is NORTH or NORTH has already moved.
    if sow_hole == 0 and (move.side == Side.NORTH or north_moved):
        return move.side  # Last seed was placed in the store, so side moves again
    return Side.opposite(move.side)