Example No. 1
def test_mcts_can_self_play_noughts_and_crosses():
    nac = NoughtsAndCrosses()
    estimator = create_trivial_estimator(nac)
    player1 = MCTSPlayer(nac, estimator, 100, 0.5)
    player2 = MCTSPlayer(nac, estimator, 100, 0.5)
    players = {1: player1, 2: player2}

    actions, game_states, utility = play(nac, players)

    assert len(actions) == len(game_states) - 1
    assert game_states[0] == nac.initial_state
    assert nac.is_terminal(game_states[-1])
Example No. 2
def test_evaluator_on_noughts_and_crosses():
    np.random.seed(0)

    nac = NoughtsAndCrosses()
    estimator = create_trivial_estimator(nac)
    player1 = MCTSPlayer(nac, estimator, 100, 0.5)
    player2 = MCTSPlayer(nac, estimator, 100, 0.5)
    players = {1: player1, 2: player2}

    # Check the players aren't equal.
    assert player1 is not player2

    player1_results = evaluate(nac, players, 20)
Example No. 3
def test_evaluator_can_compare_two_mcts_players_with_trivial_estimator():
    np.random.seed(0)
    mock_game = MockGame()

    estimator = create_trivial_estimator(mock_game)
    player1 = MCTSPlayer(mock_game, estimator, 100, 0.5)
    player2 = MCTSPlayer(mock_game, estimator, 100, 0.5)
    players = {1: player1, 2: player2}

    # Check the players aren't equal.
    assert player1 is not player2

    player1_results, _ = evaluate(mock_game, players, 100)

    assert player1_results == {1: 100, -1: 0, 0: 0}
Example No. 4
def test_mcts_noughts_and_crosses_player_gives_optimal_moves(
        state, optimal_actions):
    # Seed the random number generator.
    np.random.seed(0)

    nac = NoughtsAndCrosses()
    estimator = create_trivial_estimator(nac)
    player = MCTSPlayer(game=nac,
                        estimator=estimator,
                        mcts_iters=100,
                        c_puct=0.5,
                        tau=1)
    action, action_probs = player.choose_action(state,
                                                return_probabilities=True)
    print(action_probs)

    assert max(action_probs, key=action_probs.get) in optimal_actions
Example No. 5
def train_network(solved_states, evaluate_every):
    print("Converting solved states to training data.")
    training_data = solved_states_to_training_data(solved_states)
    np.random.shuffle(training_data)
    dev_fraction = 0.02
    num_dev = int(dev_fraction * len(training_data))
    dev_data = training_data[:num_dev]
    training_data = training_data[num_dev:]

    # Comparison players for evaluation
    mcts_iters = 10
    game = ConnectFour()
    trivial_estimator = create_trivial_estimator(game)
    rollout_estimator = create_rollout_estimator(game, 50)
    random_player = RandomPlayer(game)
    c_puct = 0.5
    trivial_mcts_player = MCTSPlayer(game, trivial_estimator, mcts_iters,
                                     c_puct, 0.01)
    rollout_mcts_player = MCTSPlayer(game, rollout_estimator, mcts_iters,
                                     c_puct, 0.01)
    # fixed_comparison_players = {1: random_player,
    #                             2: trivial_mcts_player,
    #                             3: rollout_mcts_player}

    fixed_comparison_players = {1: random_player}

    supervised_player_no = len(fixed_comparison_players) + 1
    supervised_players_queue = deque(maxlen=2)

    # Hyperparameters
    learning_rate = 1e-4
    batch_size = 32
    l2_weight = 1e-1
    value_weight = 1e-2
    num_train = len(training_data)

    checkpoint_every = evaluate_every
    num_steps = 1000

    # Build the hyperparameter string
    hyp_string = ("lr={},batch_size={},value_weight={},l2_weight={},"
                  "num_train={}").format(learning_rate, batch_size,
                                         value_weight, l2_weight, num_train)

    game_name = 'connect_four-sl'

    current_time_format = time.strftime('%Y-%m-%d_%H:%M:%S')
    path = "experiments/{}-{}-{}/".format(game_name, hyp_string,
                                          current_time_format)
    checkpoint_path = path + 'checkpoints/'
    game_results_file_name = path + "game_results.pickle"

    estimator = ConnectFourNet(learning_rate=learning_rate,
                               l2_weight=l2_weight,
                               value_weight=value_weight,
                               action_indices=game.action_indices)

    summary_path = path + 'logs/'
    scalar_names = [
        'dev_loss', 'dev_loss_value', 'dev_loss_probs', 'dev_accuracy'
    ]
    summary_scalars = SummaryScalars(scalar_names)

    verbose = True
    training_iters = -1

    writer = tf.summary.FileWriter(summary_path)

    dev_optimal_actions = [(state, optimal_actions)
                           for state, optimal_actions, value in dev_data]

    for step in range(num_steps):
        print("Step: {}".format(step))
        optimise_estimator(estimator,
                           training_data,
                           batch_size,
                           training_iters,
                           mode='supervised',
                           writer=writer,
                           verbose=verbose)

        # Now compute dev loss
        dev_loss, dev_loss_value, dev_loss_probs = estimator.loss(
            dev_data, batch_size)
        dev_accuracy = compute_accuracy(estimator, dev_optimal_actions)
        print("Dev loss: {}, dev loss value: {}, dev loss probs: {}, "
              "dev accuracy: {}".format(dev_loss, dev_loss_value,
                                        dev_loss_probs, dev_accuracy))

        summary_scalars.run(
            {
                'dev_loss': dev_loss,
                'dev_loss_value': dev_loss_value,
                'dev_loss_probs': dev_loss_probs,
                'dev_accuracy': dev_accuracy
            }, estimator.global_step, writer)

        if step % checkpoint_every == 0 and step > 0:
            checkpoint_name = compute_checkpoint_name(step, checkpoint_path)
            estimator.save(checkpoint_name)

            new_estimator = ConnectFourNet(learning_rate=learning_rate,
                                           l2_weight=l2_weight,
                                           value_weight=value_weight,
                                           action_indices=game.action_indices)
            new_estimator.restore(checkpoint_name)

            new_player = MCTSPlayer(game, new_estimator, mcts_iters, c_puct)

            supervised_players = {
                j: player
                for j, player in supervised_players_queue
            }
            comparison_players = {
                **fixed_comparison_players,
                **supervised_players
            }

            game_results = run_gauntlet(game,
                                        (supervised_player_no, new_player),
                                        comparison_players, 1)

            update_results(game_results, game_results_file_name)

            # TODO: write the Elo ratings to the summary writer.

            supervised_players_queue.appendleft(
                (supervised_player_no, new_player))
            supervised_player_no += 1
Example No. 6
from alphago.games import NoughtsAndCrosses, ConnectFour
from alphago.evaluator import run_tournament, compare_against_players
from alphago.player import RandomPlayer, MCTSPlayer
from alphago.estimator import create_trivial_estimator, create_rollout_estimator
from alphago.elo import elo

import matplotlib
matplotlib.use('agg')
import matplotlib.pyplot as plt
import numpy as np
import tqdm
tqdm.tqdm.monitor_interval = 0

game = ConnectFour()

trivial_estimator = create_trivial_estimator(game)
rollout_estimator_10 = create_rollout_estimator(game, 10)
rollout_estimator_100 = create_rollout_estimator(game, 100)
rollout_estimator_200 = create_rollout_estimator(game, 200)

mcts_args = 10, 0.5, 0.01
random_player = RandomPlayer(game)
trivial_mcts_player = MCTSPlayer(game, trivial_estimator, *mcts_args)
rollout_mcts_player_10 = MCTSPlayer(game, rollout_estimator_10, *mcts_args)
rollout_mcts_player_100 = MCTSPlayer(game, rollout_estimator_100, *mcts_args)
rollout_mcts_player_200 = MCTSPlayer(game, rollout_estimator_200, *mcts_args)

players = {
    2: random_player,
    3: trivial_mcts_player,
    4: rollout_mcts_player_10,
Example No. 7
from alphago.player import MCTSPlayer, RandomPlayer, OptimalPlayer
from alphago.games import NoughtsAndCrosses, connect_four
from alphago.utilities import memoize_instance
from alphago.estimator import create_trivial_estimator
from alphago.evaluator import evaluate, play

nac = NoughtsAndCrosses(3, 6)
# memoize_instance(nac)

trivial_estimator = create_trivial_estimator(nac)
player2 = MCTSPlayer(nac, trivial_estimator, 30, 0.5, 0.01)
player1 = MCTSPlayer(nac, trivial_estimator, 30, 0.5, 0.01)

evaluate(nac, {2: player2, 1: player1}, 1000)
Example No. 8
def test_trivial_estimator():
    mock_game = MockGame()
    trivial_estimator = create_trivial_estimator(mock_game)

    assert trivial_estimator(5) == ({0: 1 / 3, 1: 1 / 3, 2: 1 / 3}, 0)
Example No. 9
                        'second.')
    parser.add_argument('--checkpoint',
                        help='The checkpoint path to use for the estimator. '
                        'If not given, then use a trivial estimator.')
    parser.add_argument('--mcts_iters',
                        help='If 0, then just use the raw '
                        'network.')
    parser.add_argument('--tau',
                        help='Defaults to 1. Set closer to 0 for '
                        'more exploitation.')
    parser.add_argument('--c_puct', help='Defaults to 0.5')
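
    # A sample invocation under these flags (the script name is hypothetical;
    # leaving out --checkpoint falls back to the trivial estimator created below):
    #   python play_connect_four.py --player 1 --mcts_iters 500 --tau 0.1 --c_puct 0.5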

    args = parser.parse_args()

    mcts_iters = int(args.mcts_iters) if args.mcts_iters is not None else 1000
    tau = float(args.tau) if args.tau is not None else 1
    c_puct = float(args.c_puct) if args.c_puct is not None else 0.5

    if args.player is not None:
        human = int(args.player)
    else:
        human = np.random.choice([1, 2])

    if args.checkpoint:
        estimator = load_net(args.checkpoint)
    else:
        cf = ConnectFour()
        estimator = create_trivial_estimator(cf)

    play_game(human, estimator, mcts_iters, c_puct, tau)
Example No. 10
"""This program plays noughts and crosses using Monte Carlo Tree Search and a
trivial evaluator. For nonterminal states, the evaluator returns the uniform
probability distribution over available actions and a value of 0. In a terminal
state, we back up the utility returned by the game.
"""
import numpy as np

from alphago.games.noughts_and_crosses import NoughtsAndCrosses
from alphago.estimator import create_trivial_estimator
from alphago.player import MCTSPlayer
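

# The docstring above describes the trivial evaluator: a uniform prior over the
# available actions and a value of 0. A minimal sketch of such an evaluator is
# given below for illustration only; make_uniform_evaluator is hypothetical and
# assumes game.legal_actions(state) returns the actions available in `state`
# (the script itself uses the library's create_trivial_estimator).
def make_uniform_evaluator(game):
    def evaluate(state):
        actions = list(game.legal_actions(state))
        # Uniform probability over the available actions, value estimate of 0.
        return {action: 1 / len(actions) for action in actions}, 0
    return evaluate

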
if __name__ == "__main__":

    nac = NoughtsAndCrosses()
    evaluator = create_trivial_estimator(nac)

    state = nac.INITIAL_STATE
    computer_player_no = np.random.choice([1, 2])
    computer_player = MCTSPlayer(nac,
                                 evaluator,
                                 mcts_iters=2000,
                                 c_puct=0.5,
                                 tau=0.01)
    human_player_no = 1 if computer_player_no == 2 else 2
    print("You are player: {}".format(human_player_no))
    while not nac.is_terminal(state):
        player_no = nac.current_player(state)
        next_states = nac.legal_actions(state)
        if player_no == computer_player_no:
            action = computer_player.choose_action(state)
            computer_player.update(action)
            print("Taking action: {}".format(action))
Example No. 11
import alphago.games.noughts_and_crosses as nac
from alphago.estimator import create_trivial_estimator
from alphago.evaluator import evaluate
from alphago.player import MCTSPlayer, OptimalPlayer
from alphago.alphago import train


def compare_against_optimal(game, player, num_games):

    optimal_player_no = 1 if player.player_no == 2 else 2
    optimal_player = OptimalPlayer(optimal_player_no, game)

    players = {
        player.player_no: player,
        optimal_player.player_no: optimal_player
    }

    return evaluate(game, players, num_games)


if __name__ == "__main__":

    num_games = 200
    trivial_estimator = create_trivial_estimator(nac.compute_next_states)

    train(nac, )

    player = MCTSPlayer(1, nac, trivial_estimator, 10, 1)
    compare_against_optimal(nac, player, num_games)