def simulate_batch(self, g, p, m, n, k, rollout_batch_size=1, verbose=False):
    # Tracking score in batch
    score = np.zeros(len(self.player_names), dtype=int)
    print("\nSimulating games:\n")
    for i in range(1, g + 1):
        print("Game {0}/{1}".format(i, g))
        # The shared brain between both bots
        brain = MCTS(m, rollout_batch_size)
        starting_player_idx = random.choice([
            idx for idx in range(len(self.player_names))
        ]) if p == "Mix" else p
        # Creating the main game
        game = Game([
            Player(player_name, brain) for player_name in self.player_names
        ], starting_player_idx, n, k, verbose)
        # And a state manager to help the brain understand the game
        state_manager = StateManager(game)
        # While the game is still active, update the brain to the current
        # game.state and perform the next action
        while not game.over:
            brain.update(game.state)
            game.select_stones(game.current_player().take_turn(
                game, state_manager))
        # Increment the score of the winning player
        score[game.winning_player_idx] += 1
    return [(self.player_names[i], score[i]) for i in range(len(score))]
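# A minimal driver sketch for simulate_batch above. The "Tournament" class and
# its player_names attribute are hypothetical stand-ins for whatever class the
# method actually lives on; the argument values are illustrative only.
class Tournament:
    def __init__(self, player_names):
        self.player_names = player_names

Tournament.simulate_batch = simulate_batch  # attach the method defined above

if __name__ == "__main__":
    # 50 games, mixed starting player, 500 MCTS simulations per move,
    # 15 stones total, at most 3 stones per selection.
    results = Tournament(["AI-1", "AI-2"]).simulate_batch(
        g=50, p="Mix", m=500, n=15, k=3, rollout_batch_size=10)
    for name, wins in results:
        print("{0}: {1} wins".format(name, wins))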
class BasicPlayer(Player):
    def __init__(self, game, num_playouts: int, c_puct: float = 5):
        self.num_playouts = num_playouts
        self._mcts = MCTS(partial(self.policy_value_fn, to_dict=True), c_puct)
        self._self_play = False
        self._c_puct = c_puct
        super(BasicPlayer, self).__init__(game)

    def get_action(self, last_move, return_probs=False, temperature=0.1):
        if last_move:
            self._mcts.update_with_move(last_move)
        action_probs = self._mcts.get_action_probs(self._game,
                                                   num_playouts=self.num_playouts,
                                                   temperature=temperature)
        actions, probs = zip(*action_probs.items())
        i = np.random.choice(np.arange(len(actions)), p=np.asarray(probs))
        print(actions, probs, i)  # debug output
        action = actions[i]
        self._mcts.update_with_move(action)
        if return_probs:
            full_probs = np.zeros(config.BOARD_SIZE * config.BOARD_SIZE)
            rows, cols = zip(*actions)
            full_probs[np.array(rows) * config.BOARD_SIZE + np.array(cols)] = np.array(probs)
            return action, full_probs.ravel()
        else:
            return action

    def policy_value_fn(self, board, to_dict=False):
        # Baseline policy: uniform prior over the available moves, value 0.
        value = 0
        moves = available_moves(board)
        p = 1 / len(moves)
        prior_probs = {move: p for move in moves}
        return prior_probs, value

    def reset(self):
        self._mcts = MCTS(partial(self.policy_value_fn, to_dict=True), self._c_puct)
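# A short self-play sketch for BasicPlayer with return_probs=True, e.g. to
# collect (state, pi) pairs. The Game methods used on `game` below (is_over,
# make_move, board) are assumptions about the surrounding project, not taken
# from the source.
game = Game()
player = BasicPlayer(game, num_playouts=200)  # 200 playouts is an arbitrary example
examples, last_move = [], None
while not game.is_over():
    move, pi = player.get_action(last_move, return_probs=True, temperature=1.0)
    examples.append((np.copy(game.board), pi))  # pi has BOARD_SIZE * BOARD_SIZE entries
    game.make_move(move)
    last_move = move
player.reset()  # discard the search tree before the next game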
def play_game():
    """ Play a sample game between two UCT players where each player gets a
        different number of UCT iterations. """
    board = chess.Board()
    board.reset()
    print(chess.svg.board(board))
    state = ChessState(board=board, side_to_move=board.turn)
    while state.get_moves():
        print(str(state))
        if state.player_just_moved == chess.BLACK:
            m = MCTS.search(root_state=state, max_iteration=1000, verbose=False)  # White
        else:
            m = MCTS.search(root_state=state, max_iteration=1, verbose=False)  # Black
        print("Best Move: " + str(m) + "\n")
        state.do_move(m)
    if state.get_result(state.player_just_moved) == 1.0:
        print("Player " + players[int(state.player_just_moved)] + " wins!")
    elif state.get_result(state.player_just_moved) == 0.0:
        print("Player " + players[int(not state.player_just_moved)] + " wins!")
    else:
        print("Nobody wins!")
def __init__(self, game, network: nn.Module, c_puct: float = 5):
    self._network = network
    network.eval()
    self._c_puct = c_puct
    self._mcts = MCTS(partial(self.policy_value_fn, to_dict=True), c_puct)
    self._self_play = False
    super(AlphaPlayer, self).__init__(game)
def test_predict(self):
    game_root = Game()
    root = Node(game_root)
    model = MagicMock()
    prediction = [
        np.array([[0.25]]),
        np.reshape(np.arange(0.001, 0.897, step=0.001), newshape=(1, 896))
    ]
    model.predict.return_value = prediction
    action_encoder = ActionEncoder(DirectionResolver())
    mcts = MCTS(root,
                config={
                    'ALPHA': 0.8,
                    'CPUCT': 1,
                    'EPSILON': 0.2
                },
                model=model,
                state_encoder=StateEncoder(),
                action_encoder=action_encoder)

    value, probs, allowed_actions = mcts.predict_state_value(game_root)

    self.assertEqual(value, 0.25)
    self.assertCountEqual(
        allowed_actions,
        action_encoder.convert_moves_to_action_ids(
            game_root.get_possible_moves_from_current_player_perspective()))
    for idx, prob in enumerate(probs):
        if idx in allowed_actions:
            self.assertTrue(prob > 0.01)
        else:
            self.assertTrue(prob < np.exp(-40))
def test_evaluate_leaf(self):
    game_root = Game()
    root = Node(game_root)
    model = MagicMock()
    prediction = [
        np.array([[0.25]]),
        np.reshape(np.arange(0.001, 0.897, step=0.001), newshape=(1, 896))
    ]
    model.predict.return_value = prediction
    action_encoder = ActionEncoder(DirectionResolver())
    mcts = MCTS(root,
                config={
                    'ALPHA': 0.8,
                    'CPUCT': 1,
                    'EPSILON': 0.2
                },
                model=model,
                state_encoder=StateEncoder(),
                action_encoder=action_encoder)
    _, probs, _ = mcts.predict_state_value(game_root)

    value = mcts.evaluate_leaf(root)

    self.assertEqual(value, 0.25)
    self.assertEqual(len(root.edges), 7)
    self.assertEqual(root.edges[0].action, 8)
    self.assertEqual(root.edges[0].stats['P'], probs[8])
    self.assertEqual(root.edges[1].action, 104)
    self.assertEqual(root.edges[1].stats['P'], probs[104])
def __init__(self, neural_net, **kwargs):
    self.name = kwargs.get('name', 'student')
    self.learning = kwargs.get('learning', True)
    self.think_time = kwargs.get('think_time', DEFAULT_TRAIN_THINK_TIME)
    self.nn = neural_net
    self.mcts = MCTS(neural_net, learning=self.learning, think_time=self.think_time)
    self.last_run = {}
class StudentAgent(Agent):
    ''' This agent plays moves driven by a probability distribution '''

    def __init__(self, neural_net, **kwargs):
        self.name = kwargs.get('name', 'student')
        self.learning = kwargs.get('learning', True)
        self.think_time = kwargs.get('think_time', DEFAULT_TRAIN_THINK_TIME)
        self.nn = neural_net
        self.mcts = MCTS(neural_net, learning=self.learning, think_time=self.think_time)
        self.last_run = {}

    def move(self, state, **kwargs):
        temp = kwargs.get('temp', 0)
        pre_known_node = kwargs.get('root', None)
        root = self.mcts.search(root=pre_known_node, state=state)
        move, probabilities = self.mcts.get_playing_move(temp)
        self.last_run['stats'] = self.mcts.stats()
        self.last_run['probabilities'] = probabilities
        self.last_run['chosen_child'] = root.children[move]
        self.last_run['confidence'] = root.children[move].N / root.N
        self.last_run['predicted_outcome'] = root.Q
        self.last_run['last_move'] = move
        return move

    def evaluate(self, state, **kwargs):
        if not state.is_over():
            print('Valid moves:' + str(state.valid_moves()))
            print(self.str_stats())

    def calculate_real_distribution(self, visit_count_distribution, temp):
        distribution = visit_count_distribution**temp
        distribution = distribution / distribution.sum()
        return distribution

    def str_stats(self):
        s = self.last_run['stats']
        move = self.last_run['last_move']
        out = '-' * 80 + '\n'
        out += '| Simulations: %13d | Time (s): %13.2f | Node/s: %13.2f |\n' % (
            s['n'], s['time (s)'], s['node/s'])
        out += '-' * 80 + '\n'
        out += '| children_p: %-65s|\n' % s['children_p'].round(2).tolist()
        out += '-' * 80 + '\n'
        out += '| Visits: %-69s|\n' % s['ranks']
        out += '-' * 80 + '\n'
        out += '| NN value: %16.2f | Win chance: %10.2f%% | Max depth: %10d |\n' % (
            s['nn_value'], s['win_chance'] * 100, s['max_depth'])
        out += '=' * 80 + '\n'
        out += '| Preferred move: %-20d | Final move: %-26d|\n' % (
            s['children_p'].argmax(), move)
        out += '-' * 80 + '\n'
        return out
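# Usage sketch for StudentAgent: play a single move and inspect the search
# statistics. The State class and the neural_net object are placeholders for
# whatever the surrounding project provides; only the MCTS interface exercised
# above is assumed.
agent = StudentAgent(neural_net, name='student-1', think_time=1.0)
state = State()                   # hypothetical game-state object
move = agent.move(state, temp=0)  # temp=0 -> play the most-visited move
print(agent.str_stats())
print('confidence: %.2f, predicted outcome: %.2f' % (
    agent.last_run['confidence'], agent.last_run['predicted_outcome']))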
def build_mcts(self, state):
    self.root = Node(state)
    self.mcts = MCTS(self.root,
                     self.model,
                     self.state_encoder,
                     self.action_encoder,
                     config=self.config)
class AlphaPlayer(Player):
    def __init__(self, game, network: nn.Module, c_puct: float = 5):
        self._network = network
        network.eval()
        self._c_puct = c_puct
        self._mcts = MCTS(partial(self.policy_value_fn, to_dict=True), c_puct)
        self._self_play = False
        super(AlphaPlayer, self).__init__(game)

    def get_action(self, last_move, return_probs=False, temperature=0.1):
        if last_move:
            self._mcts.update_with_move(last_move)
        action_probs = self._mcts.get_action_probs(self._game,
                                                   num_playouts=config.NUM_PLAYOUTS,
                                                   temperature=temperature)
        actions, probs = zip(*action_probs.items())
        if self._self_play:
            # During self-play, mix Dirichlet noise into the search
            # probabilities to encourage exploration (AlphaZero-style).
            i = np.random.choice(
                np.arange(len(actions)),
                p=np.asarray(probs) * 0.75 +
                np.random.dirichlet(0.3 * np.ones(len(actions))) * 0.25)
        else:
            i = np.random.choice(np.arange(len(actions)), p=np.asarray(probs))
        action = actions[i]
        self._mcts.update_with_move(action)
        if return_probs:
            full_probs = np.zeros(config.BOARD_SIZE * config.BOARD_SIZE)
            rows, cols = zip(*actions)
            full_probs[np.array(rows) * config.BOARD_SIZE + np.array(cols)] = np.array(probs)
            return action, full_probs.ravel()
        else:
            return action

    def set_self_play(self, value: bool):
        self._self_play = value

    def policy_value_fn(self, board, to_dict=False):
        x = board_to_state(board)
        x = th.tensor(x).float().to(self._network.device).unsqueeze(0)
        with th.no_grad():
            prior_probs, value = self._network(x)
        if to_dict:
            prior_probs = prior_probs.cpu().view(*board.shape).numpy()
            moves = available_moves(board)
            rows, cols = zip(*moves)
            prior_probs = dict(zip(moves, prior_probs[np.array(rows), np.array(cols)]))
        value = value[0, 0].item()
        return prior_probs, value

    def reset(self):
        self._mcts = MCTS(partial(self.policy_value_fn, to_dict=True), self._c_puct)
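# Self-play sketch for AlphaPlayer: enable Dirichlet-noise exploration and
# record (move, pi) pairs for later training. PolicyValueNet and the Game
# methods used below (is_over, make_move) are assumptions, not from the source.
net = PolicyValueNet()            # hypothetical nn.Module exposing a .device attribute
game = Game()
player = AlphaPlayer(game, net)
player.set_self_play(True)
history, last_move = [], None
while not game.is_over():
    move, pi = player.get_action(last_move, return_probs=True, temperature=1.0)
    history.append((move, pi))
    game.make_move(move)
    last_move = move
player.reset()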
def test_move_to_leaf(self):
    game = Game()
    root = Node(game)
    action_encoder = ActionEncoder(DirectionResolver())
    mcts = MCTS(root,
                config={
                    'ALPHA': 0.8,
                    'CPUCT': 1,
                    'EPSILON': 0.2
                },
                model=None,
                state_encoder=None,
                action_encoder=action_encoder)
    puct = MagicMock()
    mcts.puct = puct
    child1 = Node(game.move(game.get_possible_moves()[0]))
    child2 = Node(game.move(game.get_possible_moves()[1]))
    child3 = Node(game.move(game.get_possible_moves()[2]))
    edge1 = Edge(root, child1, 0.33,
                 action_encoder.convert_move_to_action_id(game.get_possible_moves()[0]))
    edge2 = Edge(root, child2, 0.34,
                 action_encoder.convert_move_to_action_id(game.get_possible_moves()[1]))
    edge3 = Edge(root, child3, 0.33,
                 action_encoder.convert_move_to_action_id(game.get_possible_moves()[2]))
    root.edges.append(edge1)
    root.edges.append(edge2)
    root.edges.append(edge3)
    puct.puct.return_value = edge2

    leaf, value, done, breadcrumbs = mcts.move_to_leaf()

    self.assertEqual(leaf, child2)
    self.assertEqual(value, 0)
    self.assertEqual(done, 0)
    self.assertEqual(False, 0)
    self.assertEqual(True, 1)
def test_backfill(self):
    game_root = Game()
    root = Node(game_root)
    action_encoder = ActionEncoder(DirectionResolver())
    position1 = game_root.move(game_root.get_possible_moves()[0])
    child1 = Node(position1)
    edge1 = Edge(root, child1, 0.3,
                 action_encoder.convert_move_to_action_id(
                     game_root.get_possible_moves()[0]))
    position2 = position1.move(position1.get_possible_moves()[0])
    child2 = Node(position2)
    edge2 = Edge(child1, child2, 0.2,
                 action_encoder.convert_move_to_action_id(
                     game_root.get_possible_moves()[0]))
    edge2.stats['N'] = 4
    edge2.stats['W'] = 1
    mcts = MCTS(root,
                config={
                    'ALPHA': 0.8,
                    'CPUCT': 1,
                    'EPSILON': 0.2
                },
                model=None,
                state_encoder=None,
                action_encoder=action_encoder)

    mcts.backfill(child2, -1, [edge2, edge1])

    self.assertEqual(edge2.stats['N'], 5)
    self.assertEqual(edge2.stats['W'], 2)
    self.assertEqual(edge2.stats['Q'], 2 / 5)
    self.assertEqual(edge1.stats['N'], 1)
    self.assertEqual(edge1.stats['W'], -1)
    self.assertEqual(edge1.stats['Q'], -1)
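# For reference, a simplified sketch of the update rule these assertions encode:
# every edge on the path gains one visit, the leaf value is added with a sign
# that flips for the opponent's edges, and Q is the running mean W / N. The
# numbers match the test above (edge2: N 4->5, W 1->2, Q 2/5; edge1: N 1, W -1,
# Q -1), but this is not the project's actual backfill implementation and the
# player_turn attribute is an assumption.
def backfill_sketch(leaf_player, value, breadcrumbs):
    for edge in breadcrumbs:
        direction = 1 if edge.player_turn == leaf_player else -1
        edge.stats['N'] += 1
        edge.stats['W'] += value * direction
        edge.stats['Q'] = edge.stats['W'] / edge.stats['N']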
def test_integration(self):
    # Six identical convolutional blocks
    HIDDEN_CNN_LAYERS = [{
        'filters': 75,
        'kernel_size': (4, 4)
    } for _ in range(6)]
    model = Residual_CNN(0.0001, 0.1, (2, 4, 8), 32 * 4, HIDDEN_CNN_LAYERS, momentum=0.9)
    game_root = Game()
    root = Node(game_root)
    mcts = MCTS(root,
                config={
                    'ALPHA': 0.8,
                    'CPUCT': 1,
                    'EPSILON': 0.2
                },
                model=model,
                state_encoder=StateEncoder(),
                action_encoder=ActionEncoder(DirectionResolver()))
    mcts.predict_state_value(game_root)
    mcts.evaluate_leaf(root)
writer = csv.writer(f)
writer.writerow(run_statistics_cols)
run_statistics_rows = []
for i in PROBLEMS_INDEXES_LIST:
    H0, H_final = create_grover_hamiltonians(N_QUBITS)
    for final_t in T_LIST:
        start_time = time.time()
        QuantumAnnealerEnv.init_env_from_params(H0,
                                                H_final,
                                                final_t,
                                                num_x_components=M,
                                                l=l,
                                                delta=DELTA,
                                                max_t_points=MAX_t_POINTS)
        n_actions = QuantumAnnealerEnv.n_actions
        mcts = MCTS(QuantumAnnealerEnv)
        mcts.initialize_search()
        best_merit = -float('inf')
        best_node_path: MCTSNode = None
        for j in range(NUM_EPISODES_PER_PROBLEM):
            curr_best_node, best_merit = execute_regular_mcts_episode(
                mcts,
                num_expansion=N_EXP,
                num_simulations=N_SIM,
                best_merit=best_merit)
            best_node_path = best_node_path if curr_best_node is None else curr_best_node
            if (j % 10 == 0 and j != 0 and j != NUM_EPISODES_PER_PROBLEM - 1
                    and best_node_path is not None):
                print(f'after {j} episodes of mcts the best fidelity is:')
                print(best_node_path.get_fidelity_of_node())
def reset(self):
    self._mcts = MCTS(partial(self.policy_value_fn, to_dict=True), self._c_puct)
def __init__(self, game, num_playouts: int, c_puct: float = 5):
    self.num_playouts = num_playouts
    self._mcts = MCTS(partial(self.policy_value_fn, to_dict=True), c_puct)
    self._self_play = False
    self._c_puct = c_puct
    super(BasicPlayer, self).__init__(game)
def find_best_move(self, max_iteration=1000):
    root_state = ChessState(self.board, self.board.turn)
    best_move = MCTS.search(root_state, max_iteration).uci()
    return best_move
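# Usage sketch: a thin wrapper exposing find_best_move on a python-chess board.
# The class name MCTSChessAgent is hypothetical; only chess.Board plus the
# ChessState/MCTS interfaces used above are assumed.
import chess

class MCTSChessAgent:
    def __init__(self, fen=chess.STARTING_FEN):
        self.board = chess.Board(fen)

MCTSChessAgent.find_best_move = find_best_move  # attach the method defined above

agent = MCTSChessAgent()
uci = agent.find_best_move(max_iteration=200)  # fewer iterations for a quick demo
agent.board.push_uci(uci)
print(agent.board)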
def agent(hps):
    ''' Agent function '''
    tf.reset_default_graph()

    # storage
    result = {}
    env_steps, ep_return = [], []  # will indicate the timestep for the learning curve
    losses, gn = [], []
    best_R = -np.Inf

    Env = make_game(hps.game)
    D = Database(max_size=max(hps.data_size, hps.n_mcts * hps.steps_per_ep),
                 batch_size=hps.batch_size)
    model = Model(Env,
                  lr=hps.lr,
                  n_mix=hps.n_mix,
                  clip_gradient_norm=hps.clip_gradient_norm,
                  loss_type=hps.loss_type,
                  bound=hps.bound,
                  temp=hps.temp,
                  entropy_l=hps.entropy_l)

    #with tf.Session() as sess,sess.as_default():
    with tf.Session() as sess:
        if hps.tfdb:
            sess = tf_debug.LocalCLIDebugWrapperSession(sess)
        model.sess = sess
        sess.run(tf.global_variables_initializer())
        global_t_mcts = 0
        global_t = 0

        for ep in range(hps.n_eps):
            start = time.time()
            root_index = Env.reset()
            root = None
            R = 0.0  # episode reward
            t = 0  # episode steps
            seed = np.random.randint(1e7)
            Env.seed(seed)
            a_store = []

            while True:
                # run an episode
                if hps.timeit:
                    now = time.time()
                root = MCTS(root_index,
                            root,
                            Env,
                            N=hps.n_mcts,
                            model=model,
                            c=hps.c,
                            bootstrap_V=hps.bootstrap_V,
                            block_loop=hps.block_loop,
                            sigma_tree=hps.sigma_tree,
                            backup_Q=hps.backup_Q,
                            backup_sigma_tree=hps.backup_sigma_tree,
                            seed=seed,
                            a_his=a_store,
                            alpha=hps.alpha,
                            C_widening=hps.C_widening,
                            use_prior=hps.use_prior,
                            timeit=hps.timeit,
                            random_action_frac=hps.random_action_frac)
                if hps.timeit:
                    print('One MCTS search takes {} seconds'.format(time.time() - now))
                if hps.verbose_mcts:
                    display_info(root, '{}'.format(t), hps.c)
                probs, a_list, V, a, a_argmax = root.return_results(
                    decision_type=hps.decision_type,
                    loss_type=hps.loss_type,
                    temperature=hps.temp,
                    V_decision=hps.V_decision)
                for k, prob in enumerate(probs):
                    D.store((root.index, V, a_list[k], np.array([prob])))
                #if count == 0:
                #    print('Warning',[child_action.n for child_action in root.child_actions],display_info(root,'{}'.format(t),hps.c))

                # Make the step
                a_store.append(a)
                s1, r, terminal, _ = Env.step(a)
                R += r
                t += 1
                global_t += 1
                global_t_mcts += hps.n_mcts
                #if hps.verbose:
                #    if (t % 50) == 0:
                #        print('Overall step {}, root currently returns V {}, and considers a {} with counts {}'.format(global_t,V,a_list,probs))

                if terminal or (t > hps.steps_per_ep):
                    if hps.verbose:
                        print('Episode terminal, total reward {}, steps {}'.format(R, t))
                    ep_return.append(R)
                    env_steps.append(global_t_mcts)
                    break  # break out, start new episode
                else:
                    root = root.forward(a_argmax, s1, r, terminal, model)

            # saving
            result.update({'steps': env_steps, 'return': ep_return})
            if hps.verbose:
                result.update({'gn': gn, 'loss': losses})
            #if R > best_R:
            #    result.update({'seed':seed,'actions':a_store,'R':best_R})
            #    best_R = R
            store_safely(hps.result_dir, 'result', result)

            # Train
            if (global_t_mcts > hps.n_t) or (ep > hps.n_eps):
                break  # end learning
            else:
                n_epochs = hps.n_epochs * (np.ceil(hps.n_mcts / 20)).astype(int)
                #print(n_epochs)
                loss = model.train(D, n_epochs, hps.lr)
                losses.append(loss['total_loss'])
                gn.append(loss['gn'])
                if hps.verbose:
                    print('Time {}, Episode {}, Return {}, V {}, gn {}, Vloss {}, piloss {}'
                          .format(global_t_mcts, ep, R, loss['V'], loss['gn'],
                                  loss['V_loss'], loss['pi_loss']))
                    print('Actions {}, probs {}'.format(np.array(a_list), probs))
                    print('One full episode loop + training in {} seconds'.format(
                        time.time() - start))

    return result
from games.nim.game import Game
from games.nim.statemanager import StateManager
from games.player import Player
from src.mcts import MCTS

play = True
while play:
    # Create the brain of the MCTS AI(s), the list of Players, set up the Game
    # and initialize the StateManager
    brain = MCTS(m=1000, rollout_batch_size=100)
    players = [Player("Tobias"), Player("AI-bert", brain=brain)]
    game = Game(players, starting_player_idx=0, total_nr_of_stones=15, max_selection=3)
    state_manager = StateManager(game)

    # Play until the game ends
    while not game.over:
        # Update the internal state of the AI(s) brain
        brain.update(game.state)
        # Request action from current player and execute it
        game.select_stones(game.current_player().take_turn(game, state_manager))

    # Ask user if a new game should be started
    input_given = False
    while not input_given:
        inpt = input("Do you want to play another game of NIM? (y/n): ")
        if inpt == "y":
            input_given = True
        elif inpt == "n":
            # Stop the outer loop so the program can exit
            input_given = True
            play = False