def test_spawning_random_boards():
    board0 = Board.init_random((100, 100), 5)
    assert np.argwhere(board0.as_array()).shape == (5, 2)

    board1 = Board.init_random((4, 4), 2)
    assert np.argwhere(board1.as_array()).shape == (2, 2)

    board2 = Board.init_random((5, 5), 0)
    assert np.argwhere(board2.as_array()).shape == (0, 2)

def test_game_over_checking_works_as_expected():
    not_game_over_strings = ["1 1 1 1", "0 0 0 0", "0 1 0 0", "3 2 1 3 4 5 6 7 8"]
    for s in not_game_over_strings:
        board = board_from_string(s)
        assert not Board.is_game_over(board)

    game_over_strings = ["1 2 3 4", "1 2 3 4 5 6 7 8 9", "4 3 7 2"]
    for s in game_over_strings:
        board = board_from_string(s)
        assert Board.is_game_over(board)

def test_get_empty_positions(self):
    board = Board()
    board.move(PLAYER1, 2, 1)
    board.move(PLAYER2, 1, 0)
    emptyPositions = board.get_empty_positions()
    assert [1, 0] not in emptyPositions
    assert [2, 1] not in emptyPositions

def test_get_neighbors():
    b = Board()
    assert set(b.get_neighbors(1, 1)) == {
        (0, 0), (0, 1), (0, 2),
        (1, 0),         (1, 2),
        (2, 0), (2, 1), (2, 2),
    }

def test_next_status_die_overpopulation():
    b = Board()
    b.cells[0][1] = True
    b.cells[1][2] = True
    b.cells[0][0] = True
    b.cells[1][0] = True
    b.cells[1][1] = True
    # Four live neighbors around the live cell (1, 1): it dies of overpopulation.
    assert b.get_next_status(1, 1) is False

def run():
    n = 4
    width, height = 6, 6
    model_file = os.path.join(results_dir, "zero_17_4_15:36", "policy_1450.model")
    try:
        board = Board(width=width, height=height, n_in_row=n)
        game = Game(board)

        # ############### human VS AI ###################
        # Load the trained policy_value_net (Theano/Lasagne, PyTorch, or TensorFlow).
        best_policy = PolicyValueNet(width, height, model_file=model_file)
        mcts_player = MCTSPlayer(best_policy.policy_value_fn, c_puct=5, n_playout=400)

        # Uncomment the following line to play against pure MCTS
        # (it is much weaker, even with a larger n_playout):
        # mcts_player = MCTS_Pure(c_puct=5, n_playout=1000)

        # Human player; input your move in the format: 2 3
        human = Human()

        # Set start_player=0 to let the human move first.
        game.start_play(human, mcts_player, start_player=1, is_shown=1)
    except KeyboardInterrupt:
        print('\n\rquit')

def test_next_status_relive():
    b = Board()
    b.cells[0][1] = True
    b.cells[1][2] = True
    b.cells[0][0] = True
    b.cells[1][1] = False
    # Three live neighbors around the dead cell (1, 1): it comes back to life.
    assert b.get_next_status(1, 1) is True

def test_next_status_stay_alive_3():
    b = Board()
    b.cells[0][1] = True
    b.cells[1][2] = True
    b.cells[0][0] = True
    b.cells[1][1] = True
    # Three live neighbors around the live cell (1, 1): it stays alive.
    assert b.get_next_status(1, 1) is True

def _test_swipe(self, before, left, right, up, down):
    """Helper function. All arguments are strings."""
    bfs = board_from_string
    actions = [Action.LEFT, Action.RIGHT, Action.UP, Action.DOWN]
    for s, a in zip([left, right, up, down], actions):
        B0 = bfs(before)
        actual = Board.apply_action_on_board(B0, a)
        expected = bfs(s)
        err_msg = f"Action {a}: Got {actual.render()}, expected {expected.render()}"
        assert actual == expected, err_msg

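# A hypothetical test built on the helper above (a sketch, not from the repo).
# It assumes tiles are stored as exponents, so merging two equal tiles yields
# exponent + 1 (two 1-tiles become a single 2-tile).
def test_swipe_simple_merge(self):
    self._test_swipe(
        before="1 1  0 0",
        left="2 0  0 0",
        right="0 2  0 0",
        up="1 1  0 0",    # nothing can move up here, so the board is unchanged
        down="0 0  1 1",
    )
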
def step(self, action: int):
    """A single step in the game.

    Rewards: the natural logarithm of the difference in the 2048 score.
    We also add a penalty in case of a no-op.
    """
    self._step_counter += 1
    done = False
    if self._step_counter > 100 and self._step_counter / (self.t + 1) > 5:
        # Add 1 to avoid DivisionByZero for `self.t`. Yes, it happened.
        done = True

    action = Action(1 + action)  # discrete -> enum (which is 1-indexed)
    modified_board = Board.apply_action_on_board(self.board, action)
    info = {"step": self.t}
    if Board.is_game_over(modified_board):
        done = True
        reward = 0
    else:
        # An action is invalid if it doesn't change the board.
        valid_action = modified_board != self.board
        if not valid_action:
            # We penalize the agent for doing no-op moves!!! >:(
            penalty = -0.1
            info["no-op"] = True
        else:
            modified_board = Board.spawn_random_tile(modified_board)
            penalty = 0
            self.t += 1
            info["no-op"] = False
        diff = modified_board.score - self.board.score
        reward = np.log(1 + diff) + penalty
        reward = np.clip(reward, -11, 10)  # TODO: move to a wrapper.

    self.board = modified_board
    return self.board, reward, done, info

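# A worked example of the reward above, assuming standard 2048 scoring
# (merging two 64-tiles adds 128 to the score):
import numpy as np

diff, penalty = 128, 0
assert round(np.log(1 + diff) + penalty, 2) == 4.86
# A no-op leaves the score unchanged, so only the fixed penalty remains:
diff, penalty = 0, -0.1
assert np.log(1 + diff) + penalty == -0.1
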
def board_from_string(s, shape=None, dtype=np.int32) -> Board:
    """Utility function to create boards that are visually easy to see + validate.

    Args:
        s: the string we want to make a board from
        shape: if None, we try to make a square array of `s`
    """
    sep = " "
    arr = np.fromstring(re.sub(r"\s+", sep, s.strip("\n")), sep=sep)
    shape = repeat(int(np.sqrt(arr.size)), 2) if shape is None else shape
    arr = arr.astype(dtype).reshape(*shape)
    return Board.from_array(arr)

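# A usage sketch for `board_from_string` (hypothetical tile values). All
# whitespace is collapsed, so these two calls build the same square 2x2 board;
# the equality check assumes `Board` compares tile values, which the `step()`
# method above already relies on.
board = board_from_string("1 2 3 4")
same_board = board_from_string("""
    1 2
    3 4""")
assert board == same_board
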
def observation(self, board):
    """Convert the observation to a numpy array with one channel per tile value.

    A `Board` cannot be used as an observation: RLlib expects arrays as
    observations and will complain and crash otherwise. Therefore, we convert
    the `Board` to a numpy array where the first channel has value 1 wherever
    the board is empty. The second channel corresponds to tiles with value 2,
    the third to tiles with value 4, and so on. The number of channels in the
    observation is `1 + log2(max_tile_value)`. For example,
    `max_tile_value == 256` --> we have 9 tile values.

    Note: We assume all tiles are powers of 2!

    Returns:
        A dict with the following keys and values:
        - 'valid_action_mask': np.ndarray(4, float)
            The available actions.
        - 'board': np.ndarray((n_rows, n_cols, n_channels), float)
            The board (in one-hot format).
    """
    channel_indices = np.log2(np.where(board.values == 0, 1, board.values))
    frac_values, _ = np.modf(channel_indices)
    if not frac_values.max() == 0:
        raise ValueError(
            "Unexpected input: got a tile that was not a power of 2. Can't "
            "safely convert observation.")

    channel_indices = channel_indices.astype(int)
    # `indexing="ij"` keeps the (row, col) pairs aligned with the row-major
    # `ravel()` calls below; the default "xy" indexing would transpose the board.
    yy, xx = np.meshgrid(*[range(dim) for dim in channel_indices.shape],
                         indexing="ij")
    one_hot_board = np.zeros(self.env.observation_space["board"].shape)
    if K.image_data_format() == "channels_first":
        one_hot_board[channel_indices.ravel(), yy.ravel(), xx.ravel()] = 1.0
    else:
        one_hot_board[yy.ravel(), xx.ravel(), channel_indices.ravel()] = 1.0

    valid_action_mask = np.zeros(4)
    for action in Board.get_available_actions(board):
        index = action.value - 1  # enums are 1-indexed, so we subtract 1
        valid_action_mask[index] = 1.0

    processed_obs = {
        "valid_action_mask": valid_action_mask,
        "board": one_hot_board,
    }
    return processed_obs

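# A quick check of the channel mapping described above (illustrative values):
# empty -> channel 0, tile 2 -> channel 1, tile 4 -> channel 2, tile 256 -> channel 8.
import numpy as np

values = np.array([[0, 2], [4, 256]])
channels = np.log2(np.where(values == 0, 1, values)).astype(int)
assert (channels == [[0, 1], [2, 8]]).all()
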
def test_blinker():
    b = Board()
    b.cells[2][1] = True
    b.cells[2][2] = True
    b.cells[2][3] = True
    b.next_step()
    # The horizontal blinker flips to vertical after one step.
    assert all((
        b.cells[3][2],
        b.cells[2][2],
        b.cells[1][2],
    ))

def test_available_actions_work_as_expected():
    s = """1 2 3
           1 4 5
           6 7 8"""
    board = board_from_string(s)
    available_actions = Board.get_available_actions(board)
    assert available_actions == set([Action.DOWN, Action.UP])

    s = """ 1  2  3  4
            5  6  7  8
            9 10 11 12
           13 14 15 16"""
    board = board_from_string(s)
    available_actions = Board.get_available_actions(board)
    assert available_actions == set([])

    s = """ 1  2  2  4
            5  2  7  8
            9 10 11 12
           13 14 15 16"""
    board = board_from_string(s)
    available_actions = Board.get_available_actions(board)
    assert available_actions == set([Action.LEFT, Action.RIGHT, Action.UP, Action.DOWN])

def reset(self):
    self.t = 0
    self._step_counter = 0
    self.board = Board.init_random(shape=self.env_config["board_shape"], n_tiles=2)
    return self.board

def test_move(self):
    board = Board()
    board.move(PLAYER1, 2, 1)
    assert board.board_state[2][1] == PLAYER1

def test_alive_neighbors():
    b = Board()
    b.cells[0][1] = True
    assert b.get_alive_neighbors(1, 1) == [(0, 1)]

def train_nn(data_filename, board_height=6, board_width=6, n_in_row=4,
             batch_size=32, epochs=15, learning_rate=5e-3, check_freq=200):
    train = True
    dataset = Dataset(file_name=data_filename, default_bs=batch_size,
                      n_samples=2000, augument=True)
    if train:
        teacher = PolicyValueNet(board_width=board_width, board_height=board_height,
                                 model_file=os.path.join(results_dir, 'zero_17_4_15:36',
                                                         'policy_1500.model'))
        test_x = get_test_x()
        teacher_probs, teacher_value = teacher.policy_value(test_x)
        # teacher_move = np.random.choice(np.arange(36), p=teacher_probs)
        tf.reset_default_graph()
        student = PolicyValueNet(board_width=board_width, board_height=board_height)
        save_dir = os.path.join(results_dir, "student")
        if not os.path.exists(save_dir):
            os.makedirs(save_dir)
    else:
        student = PolicyValueNet(
            board_width=board_width, board_height=board_height,
            # model_file=os.path.join(results_dir, 'zero_17_4_15:36', 'policy_1500.model'))
            model_file=os.path.join(results_dir, "student", 'current_policy.model'))

    board = Board(width=board_width, height=board_height, n_in_row=n_in_row)
    game = Game(board)

    pure_mcts_playout_num = 1000
    c_puct = 5
    n_playouts = 400
    n_games = 10
    log_freq = 50
    counter = 0
    correct_counter = 0
    start_time = time()
    if train:
        while dataset.current_epoch < epochs:
            states, probs, winners = dataset.next_batch()
            # Check the later milestone first; with the original order
            # (`>= 10` before `>= 30`) the `>= 30` branch was unreachable.
            if dataset.current_epoch >= 30:
                learning_rate = 5e-4
            elif dataset.current_epoch >= 10:
                learning_rate = 1e-3
            loss, entropy = student.train_step(states, probs, winners, lr=learning_rate)
            if (counter + 1) % log_freq == 0:
                logger.info("{0:} time: {1:4.3f}, loss: {2:.4f}, entropy: {3:.4f}".format(
                    dataset.get_progress(), time() - start_time, loss, entropy))
            counter += 1
            if (counter + 1) % check_freq == 0:
                student_probs, student_value = student.policy_value(test_x)
                # student_move = np.random.choice(np.arange(36), p=student_probs)
                kl = d_kl(student_probs, teacher_probs)
                mse = np.mean(np.square(student_value - teacher_value))
                prob_mse = np.mean(np.square(student_probs - teacher_probs))
                max_abs = np.max(np.abs(student_probs - teacher_probs))
                logger.info("evaluation: Dkl: {:.4}, MSE: {:.4}, prob_mse: {:.5}, max: {:.3}".format(
                    kl, mse, prob_mse, max_abs))
                student.save_model(os.path.join(save_dir, 'current_policy.model'))

    winners = []
    current_mcts_player = MCTSPlayer(student.policy_value_fn, c_puct=c_puct,
                                     n_playout=n_playouts, is_selfplay=False)
    pure_mcts_player = MCTS_Pure(c_puct=5, n_playout=pure_mcts_playout_num)
    with trange(n_games) as t:
        for i in t:
            winner = game.start_play(pure_mcts_player, current_mcts_player,
                                     start_player=1, is_shown=0)
            winners.append(winner)
            t.set_postfix(wins=sum(np.array(winners) == 2))
    logger.info("Evaluation: n_playouts: {}, wins: {}, ties: {}".format(
        pure_mcts_playout_num,
        sum(np.array(winners) == 2),
        sum(np.array(winners) == -1)))

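# `d_kl` is used above but not defined in this section. A minimal sketch,
# assuming both arguments are probability arrays over moves; the direction
# of the divergence and the epsilon guard against log(0) are assumptions.
import numpy as np

def d_kl(p, q, eps=1e-10):
    p, q = np.asarray(p) + eps, np.asarray(q) + eps
    # Sum over the move distribution, average over the batch (if any).
    return np.mean(np.sum(p * (np.log(p) - np.log(q)), axis=-1))
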
                    default=1, help='An initial temperature')
args = parser.parse_args()

if args.init_model == 'best_policy.model':
    files = os.listdir(results_dir)
    files.sort()
    most_recent = files[-1]
    args.init_model = os.path.join(results_dir, most_recent, 'best_policy.model')
else:
    args.init_model = os.path.join(results_dir, args.init_model)
logger.info(args)

board = Board(width=args.board_width, height=args.board_height, n_in_row=args.n_in_row)
game = Game(board)
policy_value_net = PolicyValueNet(args.board_width, args.board_height,
                                  model_file=args.init_model)
mcts_player = MCTSPlayer(policy_value_net.policy_value_fn, c_puct=args.c_puct,
                         n_playout=args.n_playout, is_selfplay=1)

states_buf, probs_buf, winners_buf = [], [], []
counter = 0
while len(states_buf) < args.n_examples:
    _, (states, probs, winners) = game.start_self_play(mcts_player,
                                                       temp=args.temperature,

class NineMenMorrisGame(Widget):

    def callback(instance):
        if instance.text == "Person":
            print('p')
            NineMenMorrisGame.against = "person"
        elif instance.text == "AI":
            print('a')
            NineMenMorrisGame.against = "ai"
        NineMenMorrisGame.phase = 0
        NineMenMorrisGame.popup.dismiss()

    # This block runs at class-definition time: it builds the opponent
    # selection popup and opens it immediately.
    box = BoxLayout(orientation='vertical', padding=(10))
    box.add_widget(Label(text="Play against a person or AI?", font_size=13))
    popup = Popup(title='Select Opponent', title_size=(30), title_align='center',
                  content=box, size_hint=(None, None), size=(200, 200),
                  auto_dismiss=False)
    box.add_widget(Button(text="Person", on_press=callback))
    box.add_widget(Button(text="AI", on_press=callback))
    popup.open()

    phase = 2
    against = "none"
    turn = 1
    validTurn = False
    lastPhase = 0

    white1 = ObjectProperty(None)
    white2 = ObjectProperty(None)
    white3 = ObjectProperty(None)
    white4 = ObjectProperty(None)
    white5 = ObjectProperty(None)
    white6 = ObjectProperty(None)
    white7 = ObjectProperty(None)
    white8 = ObjectProperty(None)
    white9 = ObjectProperty(None)
    black1 = ObjectProperty(None)
    black2 = ObjectProperty(None)
    black3 = ObjectProperty(None)
    black4 = ObjectProperty(None)
    black5 = ObjectProperty(None)
    black6 = ObjectProperty(None)
    black7 = ObjectProperty(None)
    black8 = ObjectProperty(None)
    black9 = ObjectProperty(None)

    def setup(self, app_root):
        self.board = Board(app_root)

    def on_touch_down(self, touch):
        # Black turn
        if not self.turn % 2:
            print('phase {}'.format(self.phase))
            self.board.prevBlackMills = self.board.blackMills()

            # Human player for black pieces
            # Placement phase for person
            if self.against == "person" and self.phase == 0:
                pieceName = 'black' + str(int((self.turn + 1) / 2))
                piece = getattr(self, pieceName)
                self.validTurn = self.board.place(piece, touch)
            # Moving phase for person
            if self.against == "person" and self.phase == 1:
                if self.board.selected:
                    self.validTurn = self.board.move(touch, 'black')
                else:
                    self.board.select(touch, 'black')
            # Removing phase for person
            if self.against == "person" and self.phase == 2:
                self.validTurn = self.board.remove(touch, 'white')
                if self.validTurn:
                    self.phase = self.lastPhase

            # AI player for black pieces
            # Placement phase for AI
            if self.against == "ai" and self.phase == 0:
                pieceName = 'black' + str(int((self.turn + 1) / 2))
                piece = getattr(self, pieceName)
                # self.validTurn = self.board.place(piece, touch)
                self.validTurn = self.board.placeAI(piece)
            # Moving phase for AI
            if self.against == "ai" and self.phase == 1:
                if self.board.selected:
                    self.validTurn = self.board.moveAI('black')
                else:
                    self.board.selectAI('black')
            # Removing phase for AI
            if self.against == "ai" and self.phase == 2:
                # self.validTurn = self.board.remove(touch, 'white')
                self.validTurn = self.board.removeAI('white')
                if self.validTurn:
                    self.phase = self.lastPhase

            # Check for new mills
            if self.board.blackMills() > self.board.prevBlackMills:
                print('black made a new mill')
                self.validTurn = False       # still your turn
                self.lastPhase = self.phase  # remember the phase to return to
                self.phase = 2               # next click will be a removal
        else:
            print('phase {}'.format(self.phase))
            self.board.prevWhiteMills = self.board.whiteMills()

            # Placement phase
            if self.phase == 0:
                pieceName = 'white' + str(int((self.turn + 1) / 2))
                piece = getattr(self, pieceName)
                self.validTurn = self.board.place(piece, touch)
            # Moving phase
            if self.phase == 1:
                if self.board.selected:
                    self.validTurn = self.board.move(touch, 'white')
                else:
                    self.board.select(touch, 'white')
            # Removing phase
            if self.phase == 2:
                self.validTurn = self.board.remove(touch, 'black')
                if self.validTurn:
                    self.phase = self.lastPhase

            # Check for new mills
            if self.board.whiteMills() > self.board.prevWhiteMills:
                print('white made a new mill')
                self.validTurn = False       # still your turn
                self.lastPhase = self.phase  # remember the phase to return to
                self.phase = 2               # next click will be a removal

        if self.validTurn:
            self.turn += 1
            self.validTurn = False
        if self.turn % 2:
            print('black - phase {} - mills {}'.format(self.phase, self.board.blackMills()))
        else:
            print('white - phase {} - mills {}'.format(self.phase, self.board.whiteMills()))
        # After 18 turns both players have placed all nine pieces,
        # so the game moves on to the moving phase.
        if self.turn > 18 and self.phase != 2:
            self.phase = 1
        # A player who has lost seven pieces has fewer than three left.
        if self.board.trashedBlack >= 7 or self.board.trashedWhite >= 7:
            print('game over')

def test_checkStatus_in_progress(self):
    board = Board()
    board.move(PLAYER1, 0, 0)
    board.move(PLAYER1, 0, 1)
    board.move(PLAYER2, 0, 2)
    board.move(PLAYER2, 1, 0)
    board.move(PLAYER1, 1, 1)
    board.move(PLAYER1, 1, 2)
    board.move(PLAYER1, 2, 0)
    board.move(PLAYER2, 2, 1)
    emptyPositions = board.get_empty_positions()
    assert len(emptyPositions) == 1
    assert board.checkStatus() == -1

def test_create_board():
    b = Board()
    assert len(b.cells) == Config.board_size[0]
    assert len(b.cells[0]) == Config.board_size[1]
    assert not any(any(c) for c in b.cells)

def test_checkStatus_player_one_win(self):
    board = Board()
    board.move(PLAYER1, 0, 0)
    board.move(PLAYER1, 0, 1)
    board.move(PLAYER2, 0, 2)
    board.move(PLAYER2, 1, 0)
    board.move(PLAYER1, 1, 1)
    board.move(PLAYER1, 1, 2)
    board.move(PLAYER2, 2, 0)
    board.move(PLAYER1, 2, 1)
    # Eight of the nine cells are filled; only (2, 2) remains empty.
    # (The original assertion expected 0, which contradicts the eight moves above.)
    emptyPositions = board.get_empty_positions()
    assert len(emptyPositions) == 1
    assert board.checkStatus() == 1

def __init__(self, init_model=None, board_width=6, board_height=6, n_in_row=4,
             learning_rate=2e-3, n_playouts=400, batch_size=512, train_steps=5,
             check_freq=100, n_iters=1500, save_dir=None, debug=False):
    # params of the board and the game
    self.board_width = board_width
    self.board_height = board_height
    self.n_in_row = n_in_row
    self.board = Board(width=self.board_width, height=self.board_height,
                       n_in_row=self.n_in_row)
    self.game = Game(self.board)

    # training params
    self.learning_rate = learning_rate
    self.lr_multiplier = 1.0  # adaptively adjust the learning rate based on KL
    self.initial_temp = 1.0  # the temperature param
    self.n_playouts = n_playouts  # num of simulations for each move
    self.c_puct = 5
    self.buffer_size = 10000
    self.batch_size = batch_size  # mini-batch size for training
    self.states_buffer = deque(maxlen=self.buffer_size)
    self.probs_buffer = deque(maxlen=self.buffer_size)
    self.winners_buffer = deque(maxlen=self.buffer_size)
    self.play_batch_size = 1
    self.train_steps = train_steps  # num of train_steps for each update
    self.kl_targ = 0.02
    self.check_freq = check_freq
    self.game_batch_num = n_iters
    self.best_win_ratio = 0.0
    # num of simulations used for the pure MCTS, which is used as
    # the opponent to evaluate the trained policy
    self.pure_mcts_playout_num = 1000
    self.save_dir = save_dir
    self.debug = debug
    self.save_freq = 100

    if init_model:
        # start training from an initial policy-value net
        self.policy_value_net = PolicyValueNet(self.board_width, self.board_height,
                                               model_file=init_model)
    else:
        # start training from a new policy-value net
        self.policy_value_net = PolicyValueNet(self.board_width, self.board_height)

    self.mcts_player = MCTSPlayer(self.policy_value_net.policy_value_fn,
                                  c_puct=self.c_puct, n_playout=self.n_playouts,
                                  is_selfplay=True)

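# `lr_multiplier` and `kl_targ` above suggest the usual AlphaZero-style
# adaptive learning rate. A sketch of how such an update rule commonly looks;
# the thresholds and factors here are assumptions, not this repo's code.
def adjust_lr_multiplier(lr_multiplier, kl, kl_targ=0.02):
    if kl > kl_targ * 2 and lr_multiplier > 0.1:
        lr_multiplier /= 1.5  # updates overshoot the target KL -> slow down
    elif kl < kl_targ / 2 and lr_multiplier < 10:
        lr_multiplier *= 1.5  # updates are too conservative -> speed up
    return lr_multiplier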