def test_mcts_from_root_with_equal_priors(self):

    class MockModel:
        def predict(self, board):
            # starting board is:
            # [0, 0, 0, 0]
            return np.array([0.26, 0.24, 0.24, 0.26]), 0.0001

    game = Connect2Game()
    args = {'num_simulations': 50}

    model = MockModel()
    mcts = MCTS(game, model, args)
    canonical_board = [0, 0, 0, 0]
    print("starting")
    root = mcts.run(model, canonical_board, to_play=1, add_exploration_noise=False)

    # the best move is to play at index 1 or 2
    best_outer_move = max(root.children[0].visit_count, root.children[3].visit_count)
    best_center_move = max(root.children[1].visit_count, root.children[2].visit_count)
    self.assertGreater(best_center_move, best_outer_move)
def __init__(self, policy_value_fn, c_puct=5, n_playout=2000, is_selfplay=0):
    self.mcts = MCTS(policy_value_fn, c_puct, n_playout)
    self.is_selfplay = is_selfplay
def play_game():
    tree = MCTS()
    board = new_domineering_board()
    board.to_pretty_string()
    while True:
        row_col = input("enter row,col: ")
        row, col = map(int, row_col.split(","))
        stdout.write('You choose ({}, {})'.format(row, col))
        index = conf.BOARD_Y_SIZE * (row - 1) + (col - 1)
        if (board.tup[index] is not None) and (
                board.is_valid_move(index + conf.BOARD_Y_SIZE)):
            raise RuntimeError("Invalid move")
        board = board.make_move(index)
        board.to_pretty_string()
        if board.terminal:
            stdout.write("\nWinner is {}".format(
                conf.PLAYERS_NAME[board.winner]))
            break
        # You can train as you go, or only at the beginning.
        # Here, we train as we go, doing conf.TRAINING_EPOCHS rollouts each turn.
        for _ in range(conf.TRAINING_EPOCHS):
            tree.do_rollout(board)
        board = tree.choose(board)
        board.to_pretty_string()
        if board.terminal:
            stdout.write("\nWinner is {}".format(
                conf.PLAYERS_NAME[board.winner]))
            break
def execute_episode(self):
    train_examples = []
    current_player = 1
    state = self.game.get_init_board()

    while True:
        canonical_board = self.game.get_canonical_board(state, current_player)

        self.mcts = MCTS(self.game, self.model, self.args)
        root = self.mcts.run(self.model, canonical_board, to_play=1)

        action_probs = [0 for _ in range(self.game.get_action_size())]
        for k, v in root.children.items():
            action_probs[k] = v.visit_count
        action_probs = action_probs / np.sum(action_probs)

        train_examples.append((canonical_board, current_player, action_probs))

        action = root.select_action(temperature=0)
        state, current_player = self.game.get_next_state(state, current_player, action)
        reward = self.game.get_reward_for_player(state, current_player)

        if reward is not None:
            ret = []
            for hist_state, hist_current_player, hist_action_probs in train_examples:
                # [Board, currentPlayer, actionProbabilities, Reward]
                ret.append((hist_state, hist_action_probs,
                            reward * ((-1) ** (hist_current_player != current_player))))
            return ret
def play_game():
    tree = MCTS()
    board = new_tic_tac_toe_board()
    print(board.to_pretty_string())
    while True:
        row_col = input("enter row,col: ")
        row, col = map(int, row_col.split(","))
        index = 3 * (row - 1) + (col - 1)
        if board.tup[index] is not None:
            raise RuntimeError("Invalid move")
        board = board.make_move(index)
        print(board.to_pretty_string())
        if board.terminal:
            break
        # You can train as you go, or only at the beginning.
        # Here, we train as we go, doing two rollouts each turn.
        for _ in range(2):
            tree.do_rollout(board)
        print(tree.children)
        print(len(tree.children[board]))
        for b in tree.children[board]:
            print(colored(b.to_pretty_string(), 'green'))
        board = tree.choose(board)
        print(board.to_pretty_string())
        if board.terminal:
            break
def play_game_ocba(budget=1000, optimum=0, n0=5, sigma_0=1):
    mcts = MCTS(policy='ocba', budget=budget, optimum=optimum, n0=n0, sigma_0=sigma_0)
    tree = new_tree()
    for _ in range(budget):
        mcts.do_rollout(tree)
    next_tree = mcts.choose(tree)
    return (mcts, tree, next_tree)
class myPlayer(AdvancePlayer):
    def __init__(self, _id):
        super().__init__(_id)
        self.opponent_id = None
        self.tree = MCTS()

    def SelectMove(self, moves, game_state):
        self.opponent_id = next(
            filter(lambda ps: ps.id != self.id, game_state.players)).id
        self.board = new_azul_board(game_state, self.id, self.opponent_id)
        for _ in range(30):
            self.tree.do_rollout(self.board)
        best_board = self.tree.choose(self.board)
        return best_board.game_state.lastmove
def test_mcts_finds_best_move_with_equal_priors(self):

    class MockModel:
        def predict(self, board):
            return np.array([0.51, 0.49, 0, 0]), 0.0001

    game = Connect2Game()
    args = {'num_simulations': 25}

    model = MockModel()
    mcts = MCTS(game, model, args)
    canonical_board = [0, 0, -1, 1]
    root = mcts.run(model, canonical_board, to_play=1)

    # the better move is to play at index 1
    self.assertLess(root.children[0].visit_count, root.children[1].visit_count)
def play_game_uct(budget=1000, exploration_weight=1, optimum=0, n0=2, sigma_0=1):
    mcts = MCTS(policy='uct', exploration_weight=exploration_weight,
                budget=budget, n0=n0, sigma_0=sigma_0)
    tree = new_tree()
    for _ in range(budget):
        mcts.do_rollout(tree)
    next_tree = mcts.choose(tree)
    return (mcts, tree, next_tree)
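A minimal driver sketch for the two functions above, assuming `MCTS` and `new_tree` come from the same module; the budget values below are illustrative assumptions, not values taken from the original code.

# Illustrative comparison run; the budgets are assumed, not from the source.
mcts_uct, root_uct, next_uct = play_game_uct(budget=500, exploration_weight=1)
mcts_ocba, root_ocba, next_ocba = play_game_ocba(budget=500)

# Each helper returns the search object, the root node, and the child it would move to.
print(next_uct, next_ocba)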
def execute_episode(self):
    train_examples = []
    current_player = 1
    episode_step = 0
    state = self.game.get_init_board()

    while True:
        episode_step += 1
        canonical_board = self.game.get_canonical_board(state, current_player)

        temp = int(episode_step < self.args['tempThreshold'])
        add_exploration_noise = temp > 0

        self.mcts = MCTS(self.game, self.model, self.args)
        root = self.mcts.run(self.model, canonical_board, to_play=1,
                             add_exploration_noise=add_exploration_noise)

        action_probs = [0 for _ in range(self.game.get_action_size())]
        for k, v in root.children.items():
            action_probs[k] = v.visit_count
        action_probs = action_probs / np.sum(action_probs)

        train_examples.append((canonical_board, current_player, action_probs))

        action = root.select_action(temp)
        state, current_player = self.game.get_next_state(state, current_player, action)
        reward = self.game.get_game_ended(state, current_player)

        if reward is not None:
            ret = []
            for hist_state, hist_current_player, hist_action_probs in train_examples:
                # [Board, currentPlayer, actionProbabilities, Reward]
                ret.append((hist_state, hist_action_probs,
                            reward * ((-1) ** (hist_current_player != current_player))))
            return ret
def test_mcts_finds_best_move_with_really_bad_priors(self):

    class MockModel:
        def predict(self, board):
            # starting board is:
            # [0, 0, 1, -1]
            return np.array([0.3, 0.7, 0, 0]), 0.0001

    game = Connect2Game()
    args = {'num_simulations': 25}

    model = MockModel()
    mcts = MCTS(game, model, args)
    canonical_board = [0, 0, 1, -1]
    print("starting")
    root = mcts.run(model, canonical_board, to_play=1)

    # the best move is to play at index 1
    self.assertGreater(root.children[1].visit_count, root.children[0].visit_count)
def execute_episode(self):
    train_examples = []
    current_player = 1
    state = gogame.init_state(self.args['boardSize'])

    while True:
        canonical_board = gogame.canonical_form(state)

        self.mcts = MCTS(self.game, self.model, self.args)
        root = self.mcts.run(self.model, canonical_board, to_play=1)

        action_probs = [
            0 for _ in range((self.args['boardSize'] * self.args['boardSize']) + 1)
        ]
        for k, v in root.children.items():
            action_probs[k] = v.visit_count
        action_probs = action_probs / np.sum(action_probs)

        train_examples.append((canonical_board, current_player, action_probs))

        action = root.select_action(temperature=1)
        state = gogame.next_state(state, action, canonical=False)
        current_player = -current_player
        reward = gogame.winning(state) * current_player if gogame.game_ended(state) else None

        if reward is not None:
            ret = []
            for hist_state, hist_current_player, hist_action_probs in train_examples:
                # [Board, currentPlayer, actionProbabilities, Reward]
                tfBoard = np.array(
                    [hist_state[0], hist_state[1], hist_state[3]]).transpose().tolist()
                ret.append((tfBoard, hist_action_probs,
                            reward * ((-1) ** (hist_current_player != current_player))))
            return ret
def mcts_playout(depth, num_iter, num_rollout, exploration_weight):
    root, leaf_nodes_dict = make_binary_tree(depth=depth)
    leaf_nodes_dict_sorted = sorted(leaf_nodes_dict.items(),
                                    key=lambda x: x[1],
                                    reverse=True)
    print("Expected (max) leaf node: {}, value: {}".format(
        leaf_nodes_dict_sorted[0][0], leaf_nodes_dict_sorted[0][1]))
    print("Expected (min) leaf node: {}, value: {}".format(
        leaf_nodes_dict_sorted[-1][0], leaf_nodes_dict_sorted[-1][1]))

    mcts = MCTS(exploration_weight=exploration_weight)
    while True:
        # run the MCTS simulation num_iter times
        for _ in range(num_iter):
            mcts.run(root, num_rollout=num_rollout)
        # choose the best greedy action based on the simulation results
        root = mcts.choose(root)
        # repeat until the current root is terminal
        if root.is_terminal():
            print("Found optimal (max) leaf node: {}, value: {}".format(
                root, root.value))
            return root.value
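A short invocation sketch for `mcts_playout`, assuming `make_binary_tree` and `MCTS` live in the same module; the depth and iteration counts are illustrative assumptions rather than values from the original code.

# Assumed hyperparameters for illustration: a depth-6 binary tree searched with
# 50 MCTS iterations of 10 rollouts each before every greedy move.
best_value = mcts_playout(depth=6, num_iter=50, num_rollout=10, exploration_weight=1.0)
print("value of the leaf reached:", best_value)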
def test_mcts_finds_best_move_with_really_really_bad_priors(self):

    class MockModel:
        def predict(self, board):
            # starting board is:
            # [-1, 0, 0, 0]
            return np.array([0, 0.3, 0.3, 0.3]), 0.0001

    game = Connect2Game()
    args = {'num_simulations': 100}

    model = MockModel()
    mcts = MCTS(game, model, args)
    canonical_board = [-1, 0, 0, 0]
    root = mcts.run(model, canonical_board, to_play=1, add_exploration_noise=False)

    # the best move is to play at index 1
    self.assertGreater(root.children[1].visit_count, root.children[2].visit_count)
    self.assertGreater(root.children[1].visit_count, root.children[3].visit_count)
class MCTSAI(AI):
    tree = MCTS()

    def __init__(self, name: str, nRollout: int = 5):
        self.nRollout: int = nRollout
        super().__init__(name)

    def play(self, state: ReversiState):
        for _ in range(self.nRollout):
            self.tree.do_rollout(state)
        return self.tree.choose(state)


to_char = lambda v: ("⚫" if v is True else ("⚪" if v is False else " "))
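A brief usage sketch for `MCTSAI`, assuming an initial `ReversiState` is available; how that starting position is constructed depends on the surrounding game code, so the constructor call below is a placeholder.

# Hypothetical turn; ReversiState() stands in for however the game builds its start position.
ai = MCTSAI(name="mcts-bot", nRollout=50)
state = ReversiState()   # placeholder: actual construction depends on the game code
state = ai.play(state)   # 50 rollouts on the shared tree, then the preferred successor state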
class MCTSPlayer:
    """AI player based on MCTS"""

    def __init__(self, policy_value_fn, c_puct=5, n_playout=2000, is_selfplay=0):
        self.mcts = MCTS(policy_value_fn, c_puct, n_playout)
        self.is_selfplay = is_selfplay

    def reset_player(self):
        self.mcts.update_with_move(-1)

    def get_action(self, board, temp=1e-3, return_prob=0):
        sensible_moves = board.possible_moves()
        # the pi vector returned by MCTS, as in the AlphaGo Zero paper
        move_probs = np.zeros(217)
        if sensible_moves:
            acts, probs = self.mcts.get_move_probs(board, temp)
            for counter, act in enumerate(acts):
                index = action_space[act]
                move_probs[index] = probs[counter]
            if self.is_selfplay:
                # add Dirichlet noise for exploration (needed for
                # self-play training)
                move = np.random.choice(
                    acts,
                    p=0.75 * probs + 0.25 * np.random.dirichlet(0.3 * np.ones(len(probs))))
                # update the root node and reuse the search tree
                self.mcts.update_with_move(move)
            else:
                # with the default temp=1e-3, this is almost equivalent
                # to choosing the move with the highest probability
                move = np.random.choice(acts, p=probs)
                # reset the root node
                self.mcts.update_with_move(-1)

            if return_prob:
                return move, move_probs
            else:
                return move
        else:
            print("WARNING! Game is over.")
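A usage sketch for `MCTSPlayer`, assuming a `policy_value_fn` callable and a `board` object exposing `possible_moves()` as in the class above; both names are placeholders, and the playout count is an illustrative assumption.

# Hypothetical self-play step; policy_value_fn and board are placeholders.
player = MCTSPlayer(policy_value_fn, c_puct=5, n_playout=400, is_selfplay=1)
move, move_probs = player.get_action(board, temp=1.0, return_prob=1)
# `move` is applied to the game; `move_probs` becomes the policy training target.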
def __init__(self, game, model, args):
    self.game = game
    self.model = model
    self.args = args
    self.mcts = MCTS(self.game, self.model, self.args)
class Trainer:

    def __init__(self, game, model, args):
        self.game = game
        self.model = model
        self.args = args
        self.mcts = MCTS(self.game, self.model, self.args)

    def execute_episode(self):
        train_examples = []
        current_player = 1
        episode_step = 0
        state = self.game.get_init_board()

        while True:
            episode_step += 1
            canonical_board = self.game.get_canonical_board(state, current_player)

            self.mcts = MCTS(self.game, self.model, self.args)
            root = self.mcts.run(self.model, canonical_board, to_play=1)

            action_probs = [0 for _ in range(self.game.get_action_size())]
            for k, v in root.children.items():
                action_probs[k] = v.visit_count
            action_probs = action_probs / np.sum(action_probs)

            train_examples.append((canonical_board, current_player, action_probs))

            action = root.select_action(temperature=0)
            state, current_player = self.game.get_next_state(state, current_player, action)
            reward = self.game.get_reward_for_player(state, current_player)

            if reward is not None:
                ret = []
                for hist_state, hist_current_player, hist_action_probs in train_examples:
                    # [Board, currentPlayer, actionProbabilities, Reward]
                    ret.append((hist_state, hist_action_probs,
                                reward * ((-1) ** (hist_current_player != current_player))))
                return ret

    def learn(self):
        for i in range(1, self.args['numIters'] + 1):
            print("{}/{}".format(i, self.args['numIters']))

            train_examples = []
            for eps in range(self.args['numEps']):
                iteration_train_examples = self.execute_episode()
                train_examples.extend(iteration_train_examples)

            shuffle(train_examples)
            self.train(train_examples)
            filename = self.args['checkpoint_path']
            self.save_checkpoint(folder=".", filename=filename)

    def train(self, examples):
        optimizer = optim.Adam(self.model.parameters(), lr=5e-4)
        pi_losses = []
        v_losses = []

        for epoch in range(self.args['epochs']):
            self.model.train()

            batch_idx = 0
            while batch_idx < int(len(examples) / self.args['batch_size']):
                sample_ids = np.random.randint(len(examples), size=self.args['batch_size'])
                boards, pis, vs = list(zip(*[examples[i] for i in sample_ids]))
                boards = torch.FloatTensor(np.array(boards).astype(np.float64))
                target_pis = torch.FloatTensor(np.array(pis))
                target_vs = torch.FloatTensor(np.array(vs).astype(np.float64))

                # predict
                boards = boards.contiguous().cuda()
                target_pis = target_pis.contiguous().cuda()
                target_vs = target_vs.contiguous().cuda()

                # compute output
                out_pi, out_v = self.model(boards)
                l_pi = self.loss_pi(target_pis, out_pi)
                l_v = self.loss_v(target_vs, out_v)
                total_loss = l_pi + l_v

                pi_losses.append(float(l_pi))
                v_losses.append(float(l_v))

                optimizer.zero_grad()
                total_loss.backward()
                optimizer.step()

                batch_idx += 1

            print()
            print("Policy Loss", np.mean(pi_losses))
            print("Value Loss", np.mean(v_losses))
            print("Examples:")
            print(out_pi[0].detach())
            print(target_pis[0])

    def loss_pi(self, targets, outputs):
        loss = -(targets * torch.log(outputs)).sum(dim=1)
        return loss.mean()

    def loss_v(self, targets, outputs):
        loss = torch.sum((targets - outputs.view(-1)) ** 2) / targets.size()[0]
        return loss

    def save_checkpoint(self, folder, filename):
        if not os.path.exists(folder):
            os.mkdir(folder)
        filepath = os.path.join(folder, filename)
        torch.save({
            'state_dict': self.model.state_dict(),
        }, filepath)
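The Trainer above pulls all of its hyperparameters out of `self.args`. The sketch below lists the keys the code actually reads, with illustrative values only; the `model` object is a placeholder for a policy/value network compatible with this codebase.

# Keys mirror what Trainer and MCTS read; the values are assumptions for illustration.
args = {
    'numIters': 10,            # outer training iterations
    'numEps': 100,             # self-play episodes per iteration
    'epochs': 2,               # passes over the collected examples
    'batch_size': 64,          # minibatch size for the torch update
    'num_simulations': 50,     # MCTS simulations per move (read inside MCTS)
    'checkpoint_path': 'latest.pth',
}
trainer = Trainer(game=Connect2Game(), model=model, args=args)  # `model`: placeholder network
trainer.learn()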
class Trainer:

    def __init__(self, game, model, args):
        self.game = game
        self.model = model
        self.args = args
        self.mcts = MCTS(self.game, self.model, self.args)

    def execute_episode(self):
        train_examples = []
        current_player = 1
        state = gogame.init_state(self.args['boardSize'])

        while True:
            canonical_board = gogame.canonical_form(state)

            self.mcts = MCTS(self.game, self.model, self.args)
            root = self.mcts.run(self.model, canonical_board, to_play=1)

            action_probs = [
                0 for _ in range((self.args['boardSize'] * self.args['boardSize']) + 1)
            ]
            for k, v in root.children.items():
                action_probs[k] = v.visit_count
            action_probs = action_probs / np.sum(action_probs)

            train_examples.append((canonical_board, current_player, action_probs))

            action = root.select_action(temperature=1)
            state = gogame.next_state(state, action, canonical=False)
            current_player = -current_player
            reward = gogame.winning(state) * current_player if gogame.game_ended(state) else None

            if reward is not None:
                ret = []
                for hist_state, hist_current_player, hist_action_probs in train_examples:
                    # [Board, currentPlayer, actionProbabilities, Reward]
                    tfBoard = np.array(
                        [hist_state[0], hist_state[1], hist_state[3]]).transpose().tolist()
                    ret.append((tfBoard, hist_action_probs,
                                reward * ((-1) ** (hist_current_player != current_player))))
                return ret

    def learn(self):
        for i in range(1, self.args['numIters'] + 1):
            print("numIters: {}/{}".format(i, self.args['numIters']))

            train_examples = []
            for eps in range(self.args['numEps']):
                print("numEps: {}/{}".format(eps, self.args['numEps']))
                iteration_train_examples = self.execute_episode()
                train_examples.extend(iteration_train_examples)

            shuffle(train_examples)
            self.train(train_examples)

    def train(self, trainD):
        # define the checkpoint callback
        checkpoint = keras.callbacks.ModelCheckpoint(self.args['checkpointPath'],
                                                     monitor="val_loss",
                                                     mode="min",
                                                     save_best_only=True,
                                                     verbose=0)
        # train the network
        print("Training network...")
        x = np.array([i[0] for i in trainD])
        y1 = np.array([i[1] for i in trainD])
        y2 = np.array([i[2] for i in trainD])
        history = self.model.model.fit(x,
                                       y={
                                           "action_output": y1,
                                           "Value_output": y2
                                       },
                                       validation_split=0.2,
                                       batch_size=self.args['batchSize'],
                                       epochs=self.args['epochs'],
                                       verbose=1,
                                       callbacks=[checkpoint])
        # reload the weights of the best epoch
        self.model.model.load_weights(self.args['checkpointPath'])
def __init__(self, _id):
    super().__init__(_id)
    self.opponent_id = None
    self.tree = MCTS()