def test_mcts_can_self_play_noughts_and_crosses():
    nac = NoughtsAndCrosses()
    estimator = create_trivial_estimator(nac)
    player1 = MCTSPlayer(nac, estimator, 100, 0.5)
    player2 = MCTSPlayer(nac, estimator, 100, 0.5)
    players = {1: player1, 2: player2}

    actions, game_states, utility = play(nac, players)

    assert len(actions) == len(game_states) - 1
    assert game_states[0] == nac.initial_state
    assert nac.is_terminal(game_states[-1])
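# A minimal sketch of a play helper consistent with how the test above calls
# it; the next_state and utility methods are assumptions, not necessarily the
# repo's actual interface. It alternates the players' moves from the initial
# state until the game is terminal, returning the actions taken, the states
# visited, and the final utility.
def play_sketch(game, players):
    state = game.initial_state
    game_states = [state]
    actions = []
    while not game.is_terminal(state):
        player_no = game.current_player(state)
        action = players[player_no].choose_action(state)
        actions.append(action)
        state = game.next_state(state, action)
        game_states.append(state)
    return actions, game_states, game.utility(state)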
def test_evaluator_on_noughts_and_crosses():
    np.random.seed(0)
    nac = NoughtsAndCrosses()
    estimator = create_trivial_estimator(nac)
    player1 = MCTSPlayer(nac, estimator, 100, 0.5)
    player2 = MCTSPlayer(nac, estimator, 100, 0.5)
    players = {1: player1, 2: player2}

    # Check the players aren't equal.
    assert player1 is not player2

    player1_results, _ = evaluate(nac, players, 20)
def test_evaluator_can_compare_two_mcts_players_with_trivial_estimator():
    np.random.seed(0)
    mock_game = MockGame()
    estimator = create_trivial_estimator(mock_game)
    player1 = MCTSPlayer(mock_game, estimator, 100, 0.5)
    player2 = MCTSPlayer(mock_game, estimator, 100, 0.5)
    players = {1: player1, 2: player2}

    # Check the players aren't equal.
    assert player1 is not player2

    player1_results, _ = evaluate(mock_game, players, 100)

    assert player1_results == {1: 100, -1: 0, 0: 0}
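# A hedged sketch of an evaluate helper matching how the tests above unpack
# its return value (assumed, not the repo's actual implementation): play
# num_games games and tally wins, losses and draws from player 1's
# perspective, returning the tally and the per-game outcomes.
def evaluate_sketch(game, players, num_games):
    results = {1: 0, -1: 0, 0: 0}
    outcomes = []
    for _ in range(num_games):
        # Assumes play returns the final utility as a dict mapping player
        # numbers to payoffs.
        *_, utility = play(game, players)
        outcome = (utility[1] > 0) - (utility[1] < 0)  # sign of player 1's payoff
        results[outcome] += 1
        outcomes.append(outcome)
    return results, outcomes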
def test_mcts_noughts_and_crosses_player_gives_optimal_moves(
        state, optimal_actions):
    # Seed the random number generator.
    np.random.seed(0)
    nac = NoughtsAndCrosses()
    estimator = create_trivial_estimator(nac)
    player = MCTSPlayer(game=nac, estimator=estimator, mcts_iters=100,
                        c_puct=0.5, tau=1)

    action, action_probs = player.choose_action(state,
                                                return_probabilities=True)

    assert max(action_probs, key=action_probs.get) in optimal_actions
def train_network(solved_states, evaluate_every):
    print("Converting solved states to training data.")
    training_data = solved_states_to_training_data(solved_states)
    np.random.shuffle(training_data)

    dev_fraction = 0.02
    num_dev = int(dev_fraction * len(training_data))
    dev_data = training_data[:num_dev]
    training_data = training_data[num_dev:]

    # Comparison players for evaluation.
    mcts_iters = 10
    game = ConnectFour()
    trivial_estimator = create_trivial_estimator(game)
    rollout_estimator = create_rollout_estimator(game, 50)
    random_player = RandomPlayer(game)
    c_puct = 0.5
    trivial_mcts_player = MCTSPlayer(game, trivial_estimator, mcts_iters,
                                     c_puct, 0.01)
    rollout_mcts_player = MCTSPlayer(game, rollout_estimator, mcts_iters,
                                     c_puct, 0.01)
    # fixed_comparison_players = {1: random_player,
    #                             2: trivial_mcts_player,
    #                             3: rollout_mcts_player}
    fixed_comparison_players = {1: random_player}
    supervised_player_no = len(fixed_comparison_players) + 1
    supervised_players_queue = deque(maxlen=2)

    # Hyperparameters
    learning_rate = 1e-4
    batch_size = 32
    l2_weight = 1e-1
    value_weight = 1e-2
    num_train = len(training_data)
    checkpoint_every = evaluate_every
    num_steps = 1000

    # Build the hyperparameter string.
    hyp_string = ("lr={},batch_size={},value_weight={},l2_weight={},"
                  "num_train={}").format(learning_rate, batch_size,
                                         value_weight, l2_weight, num_train)

    game_name = 'connect_four-sl'
    current_time_format = time.strftime('%Y-%m-%d_%H:%M:%S')
    path = "experiments/{}-{}-{}/".format(game_name, hyp_string,
                                          current_time_format)
    checkpoint_path = path + 'checkpoints/'
    game_results_file_name = path + "game_results.pickle"

    estimator = ConnectFourNet(learning_rate=learning_rate,
                               l2_weight=l2_weight,
                               value_weight=value_weight,
                               action_indices=game.action_indices)

    summary_path = path + 'logs/'
    scalar_names = ['dev_loss', 'dev_loss_value', 'dev_loss_probs',
                    'dev_accuracy']
    summary_scalars = SummaryScalars(scalar_names)

    verbose = True
    training_iters = -1
    writer = tf.summary.FileWriter(summary_path)

    dev_optimal_actions = [(state, optimal_actions)
                           for state, optimal_actions, value in dev_data]

    for step in range(num_steps):
        print("Step: {}".format(step))
        optimise_estimator(estimator, training_data, batch_size,
                           training_iters, mode='supervised', writer=writer,
                           verbose=verbose)

        # Now compute the dev loss.
        dev_loss, dev_loss_value, dev_loss_probs = estimator.loss(
            dev_data, batch_size)
        dev_accuracy = compute_accuracy(estimator, dev_optimal_actions)
        print("Dev loss: {}, dev loss value: {}, dev loss probs: {}, "
              "dev accuracy: {}".format(dev_loss, dev_loss_value,
                                        dev_loss_probs, dev_accuracy))

        summary_scalars.run({'dev_loss': dev_loss,
                             'dev_loss_value': dev_loss_value,
                             'dev_loss_probs': dev_loss_probs,
                             'dev_accuracy': dev_accuracy},
                            estimator.global_step, writer)

        if step % checkpoint_every == 0 and step > 0:
            checkpoint_name = compute_checkpoint_name(step, checkpoint_path)
            estimator.save(checkpoint_name)

            new_estimator = ConnectFourNet(learning_rate=learning_rate,
                                           l2_weight=l2_weight,
                                           value_weight=value_weight,
                                           action_indices=game.action_indices)
            new_estimator.restore(checkpoint_name)
            new_player = MCTSPlayer(game, new_estimator, mcts_iters, c_puct)

            supervised_players = {j: player for j, player
                                  in supervised_players_queue}
            comparison_players = {**fixed_comparison_players,
                                  **supervised_players}
            game_results = run_gauntlet(game,
                                        (supervised_player_no, new_player),
                                        comparison_players, 1)
            update_results(game_results, game_results_file_name)
            # TODO: write elo ratings to the summary writer.

            supervised_players_queue.appendleft(
                (supervised_player_no, new_player))
            supervised_player_no += 1
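# A hedged sketch of the compute_accuracy helper used in train_network above
# (assumption: it measures the fraction of dev states for which the
# estimator's most probable action is one of the known optimal actions).
def compute_accuracy_sketch(estimator, state_optimal_action_pairs):
    num_correct = 0
    for state, optimal_actions in state_optimal_action_pairs:
        probs, _ = estimator(state)
        if max(probs, key=probs.get) in optimal_actions:
            num_correct += 1
    return num_correct / len(state_optimal_action_pairs)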
from alphago.games import NoughtsAndCrosses, ConnectFour
from alphago.evaluator import run_tournament, compare_against_players
from alphago.player import RandomPlayer, MCTSPlayer
from alphago.estimator import create_trivial_estimator, create_rollout_estimator
from alphago.elo import elo

import matplotlib
matplotlib.use('agg')
import matplotlib.pyplot as plt
import numpy as np
import tqdm

tqdm.tqdm.monitor_interval = 0

game = ConnectFour()
trivial_estimator = create_trivial_estimator(game)
rollout_estimator_10 = create_rollout_estimator(game, 10)
rollout_estimator_100 = create_rollout_estimator(game, 100)
rollout_estimator_200 = create_rollout_estimator(game, 200)

mcts_args = 10, 0.5, 0.01

random_player = RandomPlayer(game)
trivial_mcts_player = MCTSPlayer(game, trivial_estimator, *mcts_args)
rollout_mcts_player_10 = MCTSPlayer(game, rollout_estimator_10, *mcts_args)
rollout_mcts_player_100 = MCTSPlayer(game, rollout_estimator_100, *mcts_args)
rollout_mcts_player_200 = MCTSPlayer(game, rollout_estimator_200, *mcts_args)

players = {2: random_player,
           3: trivial_mcts_player,
           4: rollout_mcts_player_10,
           5: rollout_mcts_player_100,
           6: rollout_mcts_player_200}
from alphago.player import MCTSPlayer, RandomPlayer, OptimalPlayer
from alphago.games import NoughtsAndCrosses, connect_four
from alphago.utilities import memoize_instance
from alphago.estimator import create_trivial_estimator
from alphago.evaluator import evaluate, play

nac = NoughtsAndCrosses(3, 6)
# memoize_instance(nac)
trivial_estimator = create_trivial_estimator(nac)

player2 = MCTSPlayer(nac, trivial_estimator, 30, 0.5, 0.01)
player1 = MCTSPlayer(nac, trivial_estimator, 30, 0.5, 0.01)

evaluate(nac, {2: player2, 1: player1}, 1000)
def test_trivial_estimator():
    mock_game = MockGame()
    trivial_estimator = create_trivial_estimator(mock_game)

    assert trivial_estimator(5) == ({0: 1 / 3, 1: 1 / 3, 2: 1 / 3}, 0)
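# A minimal sketch of what create_trivial_estimator is expected to produce,
# consistent with the assertion above (a sketch, not the repo's exact code):
# a uniform distribution over the legal actions and a value of 0.
def create_trivial_estimator_sketch(game):
    def estimator(state):
        actions = game.legal_actions(state)
        return {action: 1 / len(actions) for action in actions}, 0
    return estimator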
parser.add_argument('--player', help='1 to play first, 2 to play '
                                     'second.')
parser.add_argument('--checkpoint',
                    help='The checkpoint path to use for the estimator. '
                         'If not given, then use a trivial estimator.')
parser.add_argument('--mcts_iters', help='If 0, then just use the raw '
                                         'network.')
parser.add_argument('--tau', help='Defaults to 1. Set closer to 0 for '
                                  'more exploitation.')
parser.add_argument('--c_puct', help='Defaults to 0.5.')

args = parser.parse_args()

mcts_iters = int(args.mcts_iters) if args.mcts_iters is not None else 1000
tau = float(args.tau) if args.tau is not None else 1
c_puct = float(args.c_puct) if args.c_puct is not None else 0.5

if args.player is not None:
    human = int(args.player)
else:
    human = np.random.choice([1, 2])

if args.checkpoint:
    estimator = load_net(args.checkpoint)
else:
    cf = ConnectFour()
    estimator = create_trivial_estimator(cf)

play_game(human, estimator, mcts_iters, c_puct, tau)
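# Example invocation (the script name and checkpoint path are hypothetical):
#   python play_connect_four.py --checkpoint checkpoints/step_1000 \
#       --player 1 --mcts_iters 500 --tau 0.1 --c_puct 0.5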
"""This program plays noughts and crosses using Monte Carlo Tree Search and a trivial evaluator. For nonterminal states, the evaluator returns the uniform probability distribution over available actions and a value of 0. In a terminal state, we back up the utility returned by the game. """ import numpy as np from alphago.games.noughts_and_crosses import NoughtsAndCrosses from alphago.estimator import create_trivial_estimator from alphago.player import MCTSPlayer if __name__ == "__main__": nac = NoughtsAndCrosses() evaluator = create_trivial_estimator(nac.legal_actions) state = nac.INITIAL_STATE computer_player_no = np.random.choice([1, 2]) computer_player = MCTSPlayer(nac, evaluator, mcts_iters=2000, c_puct=0.5, tau=0.01) human_player_no = 1 if computer_player_no == 2 else 2 print("You are player: {}".format(human_player_no)) while not nac.is_terminal(state): player_no = nac.current_player(state) next_states = nac.legal_actions(state) if player_no == computer_player_no: action = computer_player.choose_action(state) computer_player.update(action) print("Taking action: {}".format(action))
import alphago.games.noughts_and_crosses as nac
from alphago.estimator import create_trivial_estimator
from alphago.evaluator import evaluate
from alphago.player import MCTSPlayer, OptimalPlayer
from alphago.alphago import train


def compare_against_optimal(game, player, num_games):
    optimal_player_no = 1 if player.player_no == 2 else 2
    optimal_player = OptimalPlayer(optimal_player_no, game)
    players = {player.player_no: player,
               optimal_player.player_no: optimal_player}
    return evaluate(game, players, num_games)


if __name__ == "__main__":
    num_games = 200
    trivial_estimator = create_trivial_estimator(nac.compute_next_states)
    train(nac, )
    player = MCTSPlayer(1, nac, trivial_estimator, 10, 1)
    compare_against_optimal(nac, player, num_games)