Example #1
 def simulate_batch(self,
                    g,  # number of games to simulate
                    p,  # starting player index, or "Mix" to pick a random starter each game
                    m,  # MCTS simulation budget handed to the shared brain
                    n,  # total number of stones in each Nim game
                    k,  # maximum number of stones a player may take per turn
                    rollout_batch_size=1,
                    verbose=False):
     # Tracking score in batch
     score = np.zeros(len(self.player_names), dtype=int)
     print("\nSimulating games:\n")
     for i in range(1, g + 1):
         print("Game {0}/{1}".format(i, g))
         # The shared brain between both bots
         brain = MCTS(m, rollout_batch_size)
         starting_player_idx = (random.choice(range(len(self.player_names)))
                                if p == "Mix" else p)
         # Creating the main game
         game = Game([
             Player(player_name, brain) for player_name in self.player_names
         ], starting_player_idx, n, k, verbose)
         # And a state manager to help the brain understand the game
         state_manager = StateManager(game)
         # While the game is still active, update the brain to the current game.state and perform the next action
         while not game.over:
             brain.update(game.state)
             game.select_stones(game.current_player().take_turn(
                 game, state_manager))
         # Increment the score of the winning player
         score[game.winning_player_idx] += 1
     return [(self.player_names[i], score[i]) for i in range(len(score))]
Example #2
class BasicPlayer(Player):
    def __init__(self, game, num_playouts: int, c_puct: float=5):
        self.num_playouts = num_playouts
        self._mcts = MCTS(partial(self.policy_value_fn, to_dict=True), c_puct)
        self._self_play = False
        self._c_puct = c_puct
        super(BasicPlayer, self).__init__(game)

    def get_action(self, last_move, return_probs=False, temperature=0.1):
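        # Advance the shared tree past the opponent's last move, re-run the
        # playouts from the current position, sample a move from the returned
        # probabilities, and advance the tree with the chosen move.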
        if last_move:
            self._mcts.update_with_move(last_move)
        action_probs = self._mcts.get_action_probs(self._game, num_playouts=self.num_playouts, temperature=temperature)
        actions, probs = zip(*action_probs.items())
        i = np.random.choice(np.arange(len(actions)), p=np.asarray(probs))
        action = actions[i]
        self._mcts.update_with_move(action)
        if return_probs:
            full_probs = np.zeros(config.BOARD_SIZE*config.BOARD_SIZE)
            rows, cols = zip(*actions)
            full_probs[np.array(rows) * config.BOARD_SIZE + np.array(cols)] = np.array(probs)
            return action, full_probs.ravel()
        else:
            return action

    def policy_value_fn(self, board, to_dict=False):
        # Uniform prior over all available moves and a neutral value estimate
        # (the basic player does not use a network).
        value = 0
        moves = available_moves(board)
        p = 1 / len(moves)
        prior_probs = {move: p for move in moves}
        return prior_probs, value

    def reset(self):
        self._mcts = MCTS(partial(self.policy_value_fn, to_dict=True), self._c_puct)
Example #3
def play_game():
    """
    Play a sample game between two UCT players where each player gets a different number of UCT iterations.
    """
    board = chess.Board()
    board.reset()
    print(chess.svg.board(board))
    state = ChessState(board=board, side_to_move=board.turn)

    while state.get_moves():
        print(str(state))
        if state.player_just_moved == chess.BLACK:
            m = MCTS.search(root_state=state, max_iteration=1000, verbose=False)  # White
        else:
            m = MCTS.search(root_state=state, max_iteration=1, verbose=False)  # Black

        print("Best Move: " + str(m) + "\n")
        state.do_move(m)

    if state.get_result(state.player_just_moved) == 1.0:
        print("Player " + players[int(state.player_just_moved)] + " wins!")
    elif state.get_result(state.player_just_moved) == 0.0:
        print("Player " + players[int(not state.player_just_moved)] + " wins!")
    else:
        print("Nobody wins!")
Example #4
 def __init__(self, game, network: nn.Module, c_puct: float=5):
     self._network = network
     network.eval()
     self._c_puct = c_puct
     self._mcts = MCTS(partial(self.policy_value_fn, to_dict=True), c_puct)
     self._self_play = False
     super(AlphaPlayer, self).__init__(game)
Example #5
    def test_predict(self):
        game_root = Game()
        root = Node(game_root)
        model = MagicMock()

        # Stubbed model output: a scalar value head (0.25) and a policy head
        # with 896 entries, one per encoded action.
        prediction = [
            np.array([[0.25]]),
            np.reshape(np.arange(0.001, 0.897, step=0.001), newshape=(1, 896))
        ]
        model.predict.return_value = prediction

        action_encoder = ActionEncoder(DirectionResolver())
        mcts = MCTS(root,
                    config={
                        'ALPHA': 0.8,
                        'CPUCT': 1,
                        'EPSILON': 0.2
                    },
                    model=model,
                    state_encoder=StateEncoder(),
                    action_encoder=action_encoder)

        value, probs, allowed_actions = mcts.predict_state_value(game_root)

        self.assertEqual(value, 0.25)
        self.assertCountEqual(
            allowed_actions,
            action_encoder.convert_moves_to_action_ids(
                game_root.get_possible_moves_from_current_player_perspective())
        )
        for idx, prob in enumerate(probs):
            if idx in allowed_actions:
                self.assertTrue(prob > 0.01)
            else:
                self.assertTrue(prob < np.exp(-40))
Example #6
    def test_evaluate_leaf(self):
        game_root = Game()
        root = Node(game_root)
        model = MagicMock()

        prediction = [
            np.array([[0.25]]),
            np.reshape(np.arange(0.001, 0.897, step=0.001), newshape=(1, 896))
        ]
        model.predict.return_value = prediction

        action_encoder = ActionEncoder(DirectionResolver())
        mcts = MCTS(root,
                    config={
                        'ALPHA': 0.8,
                        'CPUCT': 1,
                        'EPSILON': 0.2
                    },
                    model=model,
                    state_encoder=StateEncoder(),
                    action_encoder=action_encoder)
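        # evaluate_leaf is expected to expand the root with one edge per legal
        # action, each initialised with its prior P taken from the stubbed
        # model prediction.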
        _, probs, _ = mcts.predict_state_value(game_root)
        value = mcts.evaluate_leaf(root)
        self.assertEqual(value, 0.25)
        self.assertEqual(len(root.edges), 7)
        self.assertEqual(root.edges[0].action, 8)
        self.assertEqual(root.edges[0].stats['P'], probs[8])

        self.assertEqual(root.edges[1].action, 104)
        self.assertEqual(root.edges[1].stats['P'], probs[104])
Example #7
 def __init__(self, neural_net, **kwargs):
     self.name = kwargs.get('name', 'student')
     self.learning = kwargs.get('learning', True)
     self.think_time = kwargs.get('think_time', DEFAULT_TRAIN_THINK_TIME)
     self.nn = neural_net
     self.mcts = MCTS(neural_net,
                      learning=self.learning,
                      think_time=self.think_time)
     self.last_run = {}
Example #8
class StudentAgent(Agent):
    '''
        This agent plays moves driven by a probability distribution
    '''
    def __init__(self, neural_net, **kwargs):
        self.name = kwargs.get('name', 'student')
        self.learning = kwargs.get('learning', True)
        self.think_time = kwargs.get('think_time', DEFAULT_TRAIN_THINK_TIME)
        self.nn = neural_net
        self.mcts = MCTS(neural_net,
                         learning=self.learning,
                         think_time=self.think_time)
        self.last_run = {}

    def move(self, state, **kwargs):
        temp = kwargs.get('temp', 0)
        pre_known_node = kwargs.get('root', None)
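        # Search from the (optionally reused) root node, then let the tree pick
        # the playing move for the given temperature.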
        root = self.mcts.search(root=pre_known_node, state=state)
        move, probabilities = self.mcts.get_playing_move(temp)
        self.last_run['stats'] = self.mcts.stats()
        self.last_run['probabilities'] = probabilities
        self.last_run['chosen_child'] = root.children[move]
        self.last_run['confidence'] = root.children[move].N / root.N
        self.last_run['predicted_outcome'] = root.Q
        self.last_run['last_move'] = move
        return move

    def evaluate(self, state, **kwargs):
        if not state.is_over():
            print('Valid moves: ' + str(state.valid_moves()))
            print(self.str_stats())

    def calculate_real_distribution(self, visit_count_distribution, temp):
        distribution = visit_count_distribution**temp
        distribution = distribution / distribution.sum()
        return distribution

    def str_stats(self):
        s = self.last_run['stats']
        move = self.last_run['last_move']

        out = '-' * 80 + '\n'
        out += '| Simulations: %13d | Time (s): %13.2f | Node/s: %13.2f |\n' % (
            s['n'], s['time (s)'], s['node/s'])
        out += '-' * 80 + '\n'
        out += '| children_p: %-65s|\n' % s['children_p'].round(2).tolist()
        out += '-' * 80 + '\n'
        out += '| Visits: %-69s|\n' % s['ranks']
        out += '-' * 80 + '\n'
        out += '| NN value: %16.2f | Win chance: %10.2f%% | Max depth: %10d |\n' % (
            s['nn_value'], s['win_chance'] * 100, s['max_depth'])
        out += '=' * 80 + '\n'
        out += '| Preferred move: %-20d | Final move: %-26d|\n' % (
            s['children_p'].argmax(), move)
        out += '-' * 80 + '\n'
        return out
Example #9
 def build_mcts(self, state):
     self.root = Node(state)
     self.mcts = MCTS(self.root,
                      self.model,
                      self.state_encoder,
                      self.action_encoder,
                      config=self.config)
Example #10
class AlphaPlayer(Player):
    def __init__(self, game, network: nn.Module, c_puct: float=5):
        self._network = network
        network.eval()
        self._c_puct = c_puct
        self._mcts = MCTS(partial(self.policy_value_fn, to_dict=True), c_puct)
        self._self_play = False
        super(AlphaPlayer, self).__init__(game)

    def get_action(self, last_move, return_probs=False, temperature=0.1):
        if last_move:
            self._mcts.update_with_move(last_move)
        action_probs = self._mcts.get_action_probs(self._game, num_playouts=config.NUM_PLAYOUTS, temperature=temperature)
        actions, probs = zip(*action_probs.items())
        if self._self_play:
            i = np.random.choice(np.arange(len(actions)), p=np.asarray(probs)*0.75+np.random.dirichlet(0.3*np.ones(len(actions)))*0.25)
        else:
            i = np.random.choice(np.arange(len(actions)), p=np.asarray(probs))
        action = actions[i]
        self._mcts.update_with_move(action)
        if return_probs:
            full_probs = np.zeros(config.BOARD_SIZE*config.BOARD_SIZE)
            rows, cols = zip(*actions)
            full_probs[np.array(rows) * config.BOARD_SIZE + np.array(cols)] = np.array(probs)
            return action, full_probs.ravel()
        else:
            return action

    def set_self_play(self, value: bool):
        self._self_play = value

    def policy_value_fn(self, board, to_dict=False):
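        # Encode the board, query the policy/value network without tracking
        # gradients, and optionally restrict the prior to the legal moves as a
        # {move: probability} dict.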
        x = board_to_state(board)
        x = th.tensor(x).float().to(self._network.device).unsqueeze(0)
        with th.no_grad():
            prior_probs, value = self._network(x)
        if to_dict:
            prior_probs = prior_probs.cpu().view(*board.shape).numpy()
            moves = available_moves(board)
            rows, cols = zip(*moves)
            prior_probs = dict(zip(moves, prior_probs[np.array(rows), np.array(cols)]))
            value = value[0, 0].item()
        return prior_probs, value

    def reset(self):
        self._mcts = MCTS(partial(self.policy_value_fn, to_dict=True), self._c_puct)
Example #11
    def test_move_to_leaf(self):
        game = Game()
        root = Node(game)
        action_encoder = ActionEncoder(DirectionResolver())
        mcts = MCTS(root,
                    config={
                        'ALPHA': 0.8,
                        'CPUCT': 1,
                        'EPSILON': 0.2
                    },
                    model=None,
                    state_encoder=None,
                    action_encoder=action_encoder)

        puct = MagicMock()
        mcts.puct = puct

        child1 = Node(game.move(game.get_possible_moves()[0]))
        child2 = Node(game.move(game.get_possible_moves()[1]))
        child3 = Node(game.move(game.get_possible_moves()[2]))
        edge1 = Edge(
            root, child1, 0.33,
            action_encoder.convert_move_to_action_id(
                game.get_possible_moves()[0]))
        edge2 = Edge(
            root, child2, 0.34,
            action_encoder.convert_move_to_action_id(
                game.get_possible_moves()[1]))
        edge3 = Edge(
            root, child3, 0.33,
            action_encoder.convert_move_to_action_id(
                game.get_possible_moves()[2]))
        root.edges.append(edge1)
        root.edges.append(edge2)
        root.edges.append(edge3)
        puct.puct.return_value = edge2

        leaf, value, done, breadcrumbs = mcts.move_to_leaf()

        self.assertEqual(leaf, child2)
        self.assertEqual(value, 0)
        self.assertFalse(done)
        # The traversal should have followed the single edge chosen by the mocked PUCT.
        self.assertEqual(breadcrumbs, [edge2])
Example #12
    def test_backfill(self):
        game_root = Game()
        root = Node(game_root)
        action_encoder = ActionEncoder(DirectionResolver())
        position1 = game_root.move(game_root.get_possible_moves()[0])
        child1 = Node(position1)
        edge1 = Edge(
            root, child1, 0.3,
            action_encoder.convert_move_to_action_id(
                game_root.get_possible_moves()[0]))

        position2 = position1.move(position1.get_possible_moves()[0])
        child2 = Node(position2)
        edge2 = Edge(
            child1, child2, 0.2,
            action_encoder.convert_move_to_action_id(
                game_root.get_possible_moves()[0]))
        edge2.stats['N'] = 4
        edge2.stats['W'] = 1

        mcts = MCTS(root,
                    config={
                        'ALPHA': 0.8,
                        'CPUCT': 1,
                        'EPSILON': 0.2
                    },
                    model=None,
                    state_encoder=None,
                    action_encoder=action_encoder)

        mcts.backfill(child2, -1, [edge2, edge1])

        self.assertEqual(edge2.stats['N'], 5)
        self.assertEqual(edge2.stats['W'], 2)
        self.assertEqual(edge2.stats['Q'], 2 / 5)

        self.assertEqual(edge1.stats['N'], 1)
        self.assertEqual(edge1.stats['W'], -1)
        self.assertEqual(edge1.stats['Q'], -1)
Example #13
    def test_integration(self):
        HIDDEN_CNN_LAYERS = [{
            'filters': 75,
            'kernel_size': (4, 4)
        } for _ in range(6)]
        model = Residual_CNN(0.0001,
                             0.1, (2, 4, 8),
                             32 * 4,
                             HIDDEN_CNN_LAYERS,
                             momentum=0.9)
        game_root = Game()
        root = Node(game_root)
        mcts = MCTS(root,
                    config={
                        'ALPHA': 0.8,
                        'CPUCT': 1,
                        'EPSILON': 0.2
                    },
                    model=model,
                    state_encoder=StateEncoder(),
                    action_encoder=ActionEncoder(DirectionResolver()))

        mcts.predict_state_value(game_root)
        mcts.evaluate_leaf(root)
Example #14
            writer = csv.writer(f)
            writer.writerow(run_statistics_cols)
    run_statistics_rows = []
    for i in PROBLEMS_INDEXES_LIST:
        H0, H_final = create_grover_hamiltonians(N_QUBITS)
        for final_t in T_LIST:
            start_time = time.time()
            QuantumAnnealerEnv.init_env_from_params(H0,
                                                    H_final,
                                                    final_t,
                                                    num_x_components=M,
                                                    l=l,
                                                    delta=DELTA,
                                                    max_t_points=MAX_t_POINTS)
            n_actions = QuantumAnnealerEnv.n_actions
            mcts = MCTS(QuantumAnnealerEnv)
            mcts.initialize_search()
            best_merit = -float('inf')
            best_node_path: MCTSNode = None
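            # Run a fixed number of MCTS episodes for this annealing time,
            # keeping track of the best-merit node found so far.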
            for j in range(NUM_EPISODES_PER_PROBLEM):

                curr_best_node, best_merit = execute_regular_mcts_episode(
                    mcts,
                    num_expansion=N_EXP,
                    num_simulations=N_SIM,
                    best_merit=best_merit)
                best_node_path = best_node_path if curr_best_node is None else curr_best_node

                if j % 10 == 0 and j != 0 and j != NUM_EPISODES_PER_PROBLEM - 1 and best_node_path is not None:
                    print(f'after {j} episodes of mcts the best fidelity is:')
                    print(best_node_path.get_fidelity_of_node())
Example #15
 def reset(self):
     self._mcts = MCTS(partial(self.policy_value_fn, to_dict=True), self._c_puct)
Example #16
 def __init__(self, game, num_playouts: int, c_puct: float=5):
     self.num_playouts = num_playouts
     self._mcts = MCTS(partial(self.policy_value_fn, to_dict=True), c_puct)
     self._self_play = False
     self._c_puct = c_puct
     super(BasicPlayer, self).__init__(game)
Example #17
 def find_best_move(self, max_iteration=1000):
     root_state = ChessState(self.board, self.board.turn)
     best_move = chess.Move.uci(MCTS.search(root_state, max_iteration))
     return best_move
Example #18
def agent(hps):
    ''' Agent function '''
    tf.reset_default_graph()

    # storage
    result = {}
    env_steps, ep_return = [], []  # will indicate the timestep for the learning curve
    losses, gn = [], []
    best_R = -np.Inf

    Env = make_game(hps.game)
    D = Database(max_size=max(hps.data_size, hps.n_mcts * hps.steps_per_ep),
                 batch_size=hps.batch_size)
    model = Model(Env,
                  lr=hps.lr,
                  n_mix=hps.n_mix,
                  clip_gradient_norm=hps.clip_gradient_norm,
                  loss_type=hps.loss_type,
                  bound=hps.bound,
                  temp=hps.temp,
                  entropy_l=hps.entropy_l)

    #with tf.Session() as sess,sess.as_default():
    with tf.Session() as sess:
        if hps.tfdb:
            sess = tf_debug.LocalCLIDebugWrapperSession(sess)
        model.sess = sess
        sess.run(tf.global_variables_initializer())
        global_t_mcts = 0
        global_t = 0

        for ep in range(hps.n_eps):
            start = time.time()
            root_index = Env.reset()
            root = None
            R = 0.0  # episode reward
            t = 0  # episode steps
            seed = np.random.randint(1e7)
            Env.seed(seed)
            a_store = []

            while True:
                # run an episode
                if hps.timeit: now = time.time()
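                # Run n_mcts simulations from the current state; the returned
                # root node carries the visit counts and value estimates used
                # below.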
                root = MCTS(root_index,
                            root,
                            Env,
                            N=hps.n_mcts,
                            model=model,
                            c=hps.c,
                            bootstrap_V=hps.bootstrap_V,
                            block_loop=hps.block_loop,
                            sigma_tree=hps.sigma_tree,
                            backup_Q=hps.backup_Q,
                            backup_sigma_tree=hps.backup_sigma_tree,
                            seed=seed,
                            a_his=a_store,
                            alpha=hps.alpha,
                            C_widening=hps.C_widening,
                            use_prior=hps.use_prior,
                            timeit=hps.timeit,
                            random_action_frac=hps.random_action_frac)
                if hps.timeit:
                    print(
                        'One MCTS search takes {} seconds'.format(time.time() -
                                                                  now))
                if hps.verbose_mcts: display_info(root, '{}'.format(t), hps.c)

                probs, a_list, V, a, a_argmax = root.return_results(
                    decision_type=hps.decision_type,
                    loss_type=hps.loss_type,
                    temperature=hps.temp,
                    V_decision=hps.V_decision)
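                # Store one (state, value target, action, probability) tuple
                # per candidate action as training data for the model.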
                for k, prob in enumerate(probs):
                    D.store((root.index, V, a_list[k], np.array([prob])))
                    #if count == 0:
                    #    print('Warning',[child_action.n for child_action in root.child_actions],display_info(root,'{}'.format(t),hps.c))

                # Make the step
                a_store.append(a)
                s1, r, terminal, _ = Env.step(a)
                R += r
                t += 1
                global_t += 1
                global_t_mcts += hps.n_mcts

                #if hps.verbose:
                #    if (t % 50) == 0:
                #        print('Overall step {}, root currently returns V {}, and considers a {} with counts {}'.format(global_t,V,a_list,probs))

                if terminal or (t > hps.steps_per_ep):
                    if hps.verbose:
                        print('Episode terminal, total reward {}, steps {}'.
                              format(R, t))
                    ep_return.append(R)
                    env_steps.append(global_t_mcts)
                    break  # break out, start new episode
                else:
                    root = root.forward(a_argmax, s1, r, terminal, model)

            # saving
            result.update({'steps': env_steps, 'return': ep_return})
            if hps.verbose:
                result.update({'gn': gn, 'loss': losses})
            #if R > best_R:
            #    result.update({'seed':seed,'actions':a_store,'R':best_R})
            #    best_R = R
            store_safely(hps.result_dir, 'result', result)

            # Train
            if (global_t_mcts > hps.n_t) or (ep > hps.n_eps):
                break  # end learning
            else:
                n_epochs = hps.n_epochs * (np.ceil(
                    hps.n_mcts / 20)).astype(int)
                #print(n_epochs)
                loss = model.train(D, n_epochs, hps.lr)
                losses.append(loss['total_loss'])
                gn.append(loss['gn'])

            if hps.verbose:
                print(
                    'Time {}, Episode {}, Return {}, V {}, gn {}, Vloss {}, piloss {}'
                    .format(global_t_mcts, ep, R, loss['V'], loss['gn'],
                            loss['V_loss'], loss['pi_loss']))
                print('Actions {}, probs {}'.format(np.array(a_list), probs))
                print('One full episode loop + training in {} seconds'.format(
                    time.time() - start))

    return result
Example #19
from games.nim.game import Game
from games.nim.statemanager import StateManager
from games.player import Player
from src.mcts import MCTS

play = True
while play:

    # Create the brain of the MCTS AI(s), the list of Players, set up the Game and initialize the StateManager
    brain = MCTS(m=1000, rollout_batch_size=100)
    players = [Player("Tobias"), Player("AI-bert", brain=brain)]
    game = Game(players,
                starting_player_idx=0,
                total_nr_of_stones=15,
                max_selection=3)
    state_manager = StateManager(game)

    # Play until the game ends
    while not game.over:
        # Update the internal state of the AI(s) brain
        brain.update(game.state)
        # Request action from current player and execute it
        game.select_stones(game.current_player().take_turn(
            game, state_manager))

    # Ask user if a new game should be started
    input_given = False
    while not input_given:
        inpt = input("Do you want to play another game of NIM? (y/n): ")
        if inpt == "y":
            input_given = True
        elif inpt == "n":
            input_given = True
            play = False