def _init_bot(bot_type, game, player_id):
    """Construct the bot identified by `bot_type`.

    A NumPy `RandomState` seeded from `FLAGS.seed` is created on every call
    and handed to the bots/evaluators that accept one.

    Args:
        bot_type: One of "mcts", "az", "random", "human" or "gtp".
        game: The game object passed to the bot/evaluator constructors.
        player_id: Player index; only used by the "random" bot.

    Returns:
        The newly constructed bot.

    Raises:
        ValueError: If `bot_type` is not one of the supported names.
    """
    random_state = np.random.RandomState(FLAGS.seed)

    if bot_type == "mcts":
        # Plain MCTS: leaf values come from random rollouts.
        rollout_evaluator = mcts.RandomRolloutEvaluator(
            FLAGS.rollout_count, random_state)
        return mcts.MCTSBot(
            game,
            FLAGS.uct_c,
            FLAGS.max_simulations,
            rollout_evaluator,
            random_state=random_state,
            solve=FLAGS.solve,
            verbose=FLAGS.verbose)
    elif bot_type == "az":
        # AlphaZero: MCTS guided by a model restored from FLAGS.az_path,
        # selecting children with the PUCT value.
        net = az_model.Model.from_checkpoint(FLAGS.az_path)
        net_evaluator = az_evaluator.AlphaZeroEvaluator(game, net)
        return mcts.MCTSBot(
            game,
            FLAGS.uct_c,
            FLAGS.max_simulations,
            net_evaluator,
            random_state=random_state,
            child_selection_fn=mcts.SearchNode.puct_value,
            solve=FLAGS.solve,
            verbose=FLAGS.verbose)
    elif bot_type == "random":
        return uniform_random.UniformRandomBot(player_id, random_state)
    elif bot_type == "human":
        return human.HumanBot()
    elif bot_type == "gtp":
        # Start the external GTP engine, then forward every configured command.
        gtp_bot = gtp.GTPBot(game, FLAGS.gtp_path)
        for command in FLAGS.gtp_cmd:
            gtp_bot.gtp_cmd(command)
        return gtp_bot
    else:
        raise ValueError("Invalid bot type: %s" % bot_type)
def test_works_with_mcts(self):
    # Smoke test: an MCTSBot driven by the AlphaZero evaluator should report
    # an explore count at the root equal to the simulation budget it was given.
    simulations = 20

    game = pyspiel.load_game("tic_tac_toe")
    az_eval = evaluator_lib.AlphaZeroEvaluator(game, build_model(game))
    search_bot = mcts.MCTSBot(
        game,
        1.,
        simulations,
        az_eval,
        solve=False,
        dirichlet_noise=(0.25, 1.))

    initial_state = game.new_initial_state()
    search_root = search_bot.mcts_search(initial_state)

    self.assertEqual(search_root.explore_count, simulations)
def main(_):
    """Train an AlphaZero bot on tic-tac-toe and score it against minimax.

    Builds a model from the `nn_*` flags, wraps it in an AlphaZero evaluator
    and an MCTS bot, then alternates self-play and training for
    `FLAGS.num_rounds` rounds. Every `FLAGS.evaluation_frequency` rounds the
    current bot plays a fixed number of games against a minimax bot and the
    resulting losses/draws are logged. The evaluator cache is logged and
    cleared after every training step.

    Args:
        _: Unused (presumably the leftover argv from the app runner).
    """
    game = pyspiel.load_game("tic_tac_toe")

    # 1. Define a model
    model = model_lib.Model(
        FLAGS.nn_model,
        game.observation_tensor_shape(),
        game.num_distinct_actions(),
        nn_width=FLAGS.nn_width,
        nn_depth=FLAGS.nn_depth,
        weight_decay=1e-4,
        learning_rate=0.01,
        path=None)
    print("Model type: {}({}, {}), size: {} variables".format(
        FLAGS.nn_model, FLAGS.nn_width, FLAGS.nn_depth,
        model.num_trainable_variables))

    # 2. Create an MCTS bot using the model
    evaluator = evaluator_lib.AlphaZeroEvaluator(game, model)
    bot = mcts.MCTSBot(game, 1., 20, evaluator,
                       solve=False, dirichlet_noise=(0.25, 1.))

    # 3. Build an AlphaZero instance
    a0 = alpha_zero.AlphaZero(
        game,
        bot,
        model,
        replay_buffer_capacity=FLAGS.replay_buffer_capacity,
        action_selection_transition=4)

    # 4. Create a bot using min-max search. It can never lose tic-tac-toe, so
    # a success condition for our AlphaZero bot is to draw all games with it.
    minimax_bot = MinimaxBot(game)

    # Single source of truth for the evaluation size, so the logged count and
    # the number of games actually played cannot drift apart.
    num_evaluations = 50

    # 5. Run training loop
    for num_round in range(FLAGS.num_rounds):
        logging.info("------------- Starting round %s out of %s -------------",
                     num_round, FLAGS.num_rounds)

        if num_round % FLAGS.evaluation_frequency == 0:
            logging.info("Playing %s games against the minimax player.",
                         num_evaluations)
            (_, losses, draws) = bot_evaluation(
                game, [minimax_bot, a0.bot], num_evaluations=num_evaluations)
            logging.info(
                "Result against Minimax player: %s losses and %s draws.",
                losses, draws)

        logging.info("Running %s games of self play",
                     FLAGS.num_self_play_games)
        a0.self_play(num_self_play_games=FLAGS.num_self_play_games)

        logging.info("Training the net for %s epochs.",
                     FLAGS.num_training_epochs)
        a0.update(FLAGS.num_training_epochs,
                  batch_size=FLAGS.batch_size,
                  verbose=True)

        # The model just changed, so cached evaluations are dropped.
        logging.info("Cache: %s", evaluator.cache_info())
        evaluator.clear_cache()
def actor(*, config, game, logger, queue):
    """Actor process loop: play games and push each trajectory onto `queue`.

    Two bots sharing a single AlphaZero evaluator are created once and reused
    for every game. Before each game `update_checkpoint` is consulted; as
    soon as it returns a falsy value the function returns.

    Args:
        config: Run configuration; `temperature` and `temperature_drop` are
            forwarded to `_play_game`.
        game: The game to play.
        logger: Object with a `print` method used for progress messages.
        queue: Passed to `update_checkpoint` and used to emit the result of
            each `_play_game` call.
    """
    logger.print("Initializing model")
    model = _init_model_from_config(config)

    logger.print("Initializing bots")
    az_evaluator = evaluator_lib.AlphaZeroEvaluator(game, model)
    # NOTE(review): the trailing False is presumably an "evaluation" switch
    # on _init_bot -- confirm against its definition.
    bots = [_init_bot(config, game, az_evaluator, False) for _ in range(2)]

    game_num = 0
    while update_checkpoint(logger, queue, model, az_evaluator):
        trajectory = _play_game(logger, game_num, game, bots,
                                config.temperature, config.temperature_drop)
        queue.put(trajectory)
        game_num += 1
def evaluator(*, game, config, logger, checkpoint, queue):
    """Evaluation process: pit the AlphaZero bot against rollout-based MCTS.

    The AlphaZero side alternates seats every game. The opponent's difficulty
    level advances every two games and cycles through `config.eval_levels`
    levels; its simulation budget is
    `config.max_simulations * 10**(difficulty / 2)`, truncated to an int.
    After each game `(difficulty, az_return)` is put on `queue` and a summary
    line is printed. The loop ends when `update_checkpoint` returns a falsy
    value.

    Args:
        game: The game to play.
        config: Run configuration (reads `evaluation_window`, `path`,
            `eval_levels`, `max_simulations`, `uct_c`).
        logger: Object with a `print` method used for progress messages.
        checkpoint: Checkpoint to restore the model from, or None to build a
            fresh model from `config`.
        queue: Passed to `update_checkpoint` and used to emit results.
    """
    results = Buffer(config.evaluation_window)

    logger.print("Initializing model")
    # A fresh model when no checkpoint is supplied, the stored one otherwise.
    model = (
        _init_model_from_config(config)
        if checkpoint is None
        else _init_model_from_checkpoint(checkpoint, config.path)
    )

    logger.print("Initializing bots")
    az_evaluator = evaluator_lib.AlphaZeroEvaluator(game, model)
    random_evaluator = mcts.RandomRolloutEvaluator()

    game_num = 0
    while update_checkpoint(logger, queue, model, az_evaluator):
        az_player = game_num % 2
        difficulty = (game_num // 2) % config.eval_levels
        max_simulations = int(config.max_simulations * (10**(difficulty / 2)))

        az_bot = _init_bot(config, game, az_evaluator, True)
        mcts_bot = mcts.MCTSBot(
            game,
            config.uct_c,
            max_simulations,
            random_evaluator,
            solve=True,
            verbose=False)
        # Seat the AlphaZero bot at index `az_player`.
        bots = [mcts_bot, az_bot] if az_player == 1 else [az_bot, mcts_bot]

        trajectory = _play_game(logger, game_num, game, bots,
                                temperature=1, temperature_drop=0)
        az_return = trajectory.returns[az_player]
        results.append(az_return)
        queue.put((difficulty, az_return))

        logger.print("AZ: {}, MCTS: {}, AZ avg/{}: {:.3f}".format(
            az_return,
            trajectory.returns[1 - az_player],
            len(results),
            np.mean(results.data)))

        game_num += 1
def evaluator(*, game, config, logger, num, queue):
    """Evaluation process: AlphaZero bot versus a fixed-strength MCTS bot.

    The MCTS opponent is given `config.max_simulations * 3**num` simulations.
    Both bots are built once; their seats are swapped after every game, with
    the AlphaZero bot starting as player 0. A summary line is printed after
    each game. The loop ends when `update_checkpoint` returns a falsy value.

    Args:
        game: The game to play.
        config: Run configuration (reads `max_simulations`,
            `evaluation_window`, `uct_c`).
        logger: Object with a `print` method used for progress messages.
        num: Exponent that scales the opponent's simulation budget.
        queue: Passed to `update_checkpoint`.
    """
    max_simulations = config.max_simulations * (3**num)
    logger.print("Running MCTS with", max_simulations, "simulations")

    results = Buffer(config.evaluation_window)

    logger.print("Initializing model")
    model = _init_model_from_config(config)

    logger.print("Initializing bots")
    az_evaluator = evaluator_lib.AlphaZeroEvaluator(game, model)
    random_evaluator = mcts.RandomRolloutEvaluator()
    az_bot = _init_bot(config, game, az_evaluator, True)
    mcts_bot = mcts.MCTSBot(
        game,
        config.uct_c,
        max_simulations,
        random_evaluator,
        solve=True,
        verbose=False)

    game_num = 0
    while update_checkpoint(logger, queue, model, az_evaluator):
        # Seats alternate every game: AlphaZero is player 0 on even games and
        # player 1 on odd games.
        az_player = game_num % 2
        bots = [az_bot, mcts_bot] if az_player == 0 else [mcts_bot, az_bot]

        trajectory = _play_game(logger, game_num, game, bots,
                                temperature=1, temperature_drop=0)
        az_return = trajectory.returns[az_player]
        results.append(az_return)

        logger.print("AZ: {}, MCTS: {}, AZ avg/{}: {:.3f}".format(
            az_return,
            trajectory.returns[1 - az_player],
            len(results),
            np.mean(results.data)))

        game_num += 1
def test_evaluator_caching(self):
    # Verifies that evaluate()/prior() results come from the evaluator's cache
    # until clear_cache() is called, by tracking the hit/miss counters exposed
    # through cache_info(). The order of evaluator calls below matters: every
    # call moves one of the counters.

    def expect_cache_counts(misses, hits):
        stats = evaluator.cache_info()
        self.assertEqual(stats.misses, misses)
        self.assertEqual(stats.hits, hits)

    game = pyspiel.load_game("tic_tac_toe")
    model = build_model(game)
    evaluator = evaluator_lib.AlphaZeroEvaluator(game, model)

    # One training example built from the initial state: a one-hot policy on
    # the first legal action with target value 1.
    state = game.new_initial_state()
    observation = state.observation_tensor()
    legal_mask = state.legal_actions_mask()
    first_action = state.legal_actions()[0]
    target_policy = np.zeros(len(legal_mask), dtype=float)
    target_policy[first_action] = 1
    train_inputs = [
        model_lib.TrainInput(observation, legal_mask, target_policy, value=1)
    ]

    # First evaluation is a miss; the two returned values are negations of
    # each other.
    returns = evaluator.evaluate(state)
    self.assertEqual(returns[0], -returns[1])
    initial_value = returns[0]

    # Re-evaluating and asking for the prior (twice) are all cache hits.
    repeated_value = evaluator.evaluate(state)[0]
    self.assertEqual(initial_value, repeated_value)

    first_prior = evaluator.prior(state)
    second_prior = evaluator.prior(state)
    np.testing.assert_array_equal(first_prior, second_prior)

    expect_cache_counts(misses=1, hits=3)

    # Train the model so that its outputs change.
    for _ in range(20):
        model.update(train_inputs)

    # Still equal due to not clearing the cache
    stale_value = evaluator.evaluate(state)[0]
    self.assertEqual(initial_value, stale_value)
    expect_cache_counts(misses=1, hits=4)

    evaluator.clear_cache()
    expect_cache_counts(misses=0, hits=0)

    # Now they differ from before
    fresh_value = evaluator.evaluate(state)[0]
    fresh_value_again = evaluator.evaluate(state)[0]
    self.assertNotEqual(initial_value, fresh_value)
    self.assertEqual(fresh_value, fresh_value_again)
    expect_cache_counts(misses=1, hits=1)

    # A separately created initial state hits the same cache entry.
    other_state_value = evaluator.evaluate(game.new_initial_state())[0]
    self.assertEqual(fresh_value, other_state_value)
    expect_cache_counts(misses=1, hits=2)
def main(_):
    """Train an AlphaZero bot (Keras net) on tic-tac-toe and score it.

    Builds either a ResNet or an MLP according to `FLAGS.net_type`, wraps it
    in a model, an AlphaZero evaluator and an MCTS bot, then alternates
    self-play and training for `FLAGS.num_rounds` rounds. Every
    `FLAGS.evaluation_frequency` rounds the current bot plays a fixed number
    of games against a minimax bot and the resulting losses/draws are logged.
    The evaluator cache is logged and cleared after every training step.

    Args:
        _: Unused (presumably the leftover argv from the app runner).

    Raises:
        ValueError: If `FLAGS.net_type` is neither "resnet" nor "mlp".
    """
    game = pyspiel.load_game("tic_tac_toe")
    num_actions = game.num_distinct_actions()
    observation_shape = game.observation_tensor_shape()

    # 1. Define a keras net
    if FLAGS.net_type == "resnet":
        net = model_lib.keras_resnet(
            observation_shape,
            num_actions,
            num_residual_blocks=1,
            num_filters=256,
            data_format="channels_first")
    elif FLAGS.net_type == "mlp":
        net = model_lib.keras_mlp(
            observation_shape, num_actions, num_layers=2, num_hidden=64)
    else:
        raise ValueError(
            ("Invalid value for 'net_type'. Must be either 'mlp' or "
             "'resnet', but was %s") % FLAGS.net_type)

    model = model_lib.Model(
        net, l2_regularization=1e-4, learning_rate=0.01, device=FLAGS.device)

    # 2. Create an MCTS bot using the previous keras net
    evaluator = evaluator_lib.AlphaZeroEvaluator(game, model)
    bot = mcts.MCTSBot(game, 1., 20, evaluator,
                       solve=False, dirichlet_noise=(0.25, 1.))

    # 3. Build an AlphaZero instance
    a0 = alpha_zero.AlphaZero(
        game,
        bot,
        model,
        replay_buffer_capacity=FLAGS.replay_buffer_capacity,
        action_selection_transition=4)

    # 4. Create a bot using min-max search. It can never lose tic-tac-toe, so
    # a success condition for our AlphaZero bot is to draw all games with it.
    minimax_bot = MinimaxBot(game)

    # Single source of truth for the evaluation size, so the logged count and
    # the number of games actually played cannot drift apart.
    num_evaluations = 50

    # 5. Run training loop
    for num_round in range(FLAGS.num_rounds):
        logging.info("------------- Starting round %s out of %s -------------",
                     num_round, FLAGS.num_rounds)

        if num_round % FLAGS.evaluation_frequency == 0:
            logging.info("Playing %s games against the minimax player.",
                         num_evaluations)
            (_, losses, draws) = bot_evaluation(
                game, [minimax_bot, a0.bot], num_evaluations=num_evaluations)
            logging.info(
                "Result against Minimax player: %s losses and %s draws.",
                losses, draws)

        logging.info("Running %s games of self play",
                     FLAGS.num_self_play_games)
        a0.self_play(num_self_play_games=FLAGS.num_self_play_games)

        logging.info("Training the net for %s epochs.",
                     FLAGS.num_training_epochs)
        a0.update(FLAGS.num_training_epochs,
                  batch_size=FLAGS.batch_size,
                  verbose=True)

        # The model just changed, so cached evaluations are dropped.
        logging.info("Cache: %s", evaluator.cache_info())
        evaluator.clear_cache()