Example #1
def test_learner_against_rando(n_games=10000):
    # see how learner performs against a player making random moves
    metrics = run_simulator(
        p1=Player(strategy="basic_q", learning=False, load_Q=True),
        p2=Player(strategy="random"),
        n_games=n_games,
    )
    visualize_win_ratio(metrics, "Performance Over Time (testing)")
Example #2
def __init__(self, environment, No=100, discount_factor=1):
    Player.__init__(self)
    self.env = environment
    self.No = No
    self.disc_factor = discount_factor
    # state-value table indexed by (dealer value, agent value)
    self.V = np.zeros(
        [self.env.dealer_max_value + 1, self.env.agent_max_value + 1])
    self.wins = 0.0
    self.iterations = 0.0
Example #3
def train_learner_against_rando(n_games=20000):
    # loads the existing Q strategy and trains it further against the random strategy
    metrics = run_simulator(
        p1=Player(strategy="basic_q", learning=True, load_Q=True),
        p2=Player(strategy="random"),
        save_Q=True,
        n_games=n_games,
    )
    visualize_win_ratio(metrics, "Performance Over Time (training)")
Example #4
def test_learner_against_self():
    # see how the learner performs against itself; results should be even
    metrics = run_simulator(
        p1=Player(
            strategy="basic_q",
            learning=False,
            load_Q=True,
        ),
        p2=Player(strategy="basic_q", learning=False, load_Q=True),
    )
    visualize_win_ratio(metrics, "Performance Over Time (testing)")
Example #5
def run_simulator(
        n_games=1000,
        p1=Player(strategy="basic_q", learning=True),
        p2=Player(strategy="random", learning=False),
        save_Q=False,
):
    """Runs a number of tic tac toe games
	
	Arguments:
		n_games: number of tic tac toe games to be played
		p1: instance of the Player() class, should be used to specify the AI
			during training or testing
		p2: instance of the Player() class, should be used to specify the
			opponent
		save_Q: boolean, determines if p1's updated Q should be saved in a
			pickled file after training is complete

	Returns:
		metrics: dict of P1's wins, losses, and ties

	Use this function to run many games in a row. Depending on the 
	parameters for P1 and P2, this could be used for training or testing.
	This function also specifies the epsilon (exploration factor) decay
	over time as the learner moves from high exploration to low.
	"""
    games = [Game(p1=p1, p2=p2) for i in range(n_games)]
    metrics = []
    index = 0
    starting_epsilon = p1._epsilon

    for game in games:
        index += 1

        # run game and get results + P1's states and actions
        outcome, x_decisions, o_decisions = game.play_game()

        # log game results
        metrics.append(outcome)

        # update q learner after each game
        if p1._learning:
            p1.update_q(outcome, x_decisions)

            # reduce exploration factor after each game
            p1._epsilon = starting_epsilon - starting_epsilon * (1. * index /
                                                                 n_games)**2

    print(Counter(metrics))

    # save pickled Q learning file
    if save_Q:
        pickle.dump(p1._Q, open(p1.q_file, "wb"))
    return metrics
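
Note: the exploration schedule hard-coded above is quadratic in the game index, starting at the player's initial epsilon and reaching zero by the final game. A minimal sketch of the curve it produces (the starting_epsilon of 0.9 and n_games of 1000 are illustrative values, not the Player defaults):

starting_epsilon = 0.9  # assumed initial exploration factor
n_games = 1000          # assumed run length

for index in (1, 250, 500, 750, 1000):
    epsilon = starting_epsilon - starting_epsilon * (1. * index / n_games) ** 2
    print("game %4d: epsilon = %.3f" % (index, epsilon))

# game    1: epsilon = 0.900
# game  250: epsilon = 0.844
# game  500: epsilon = 0.675
# game  750: epsilon = 0.394
# game 1000: epsilon = 0.000

Exploration therefore decays slowly at first and falls off sharply toward the end of the run.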
Example #6
def train_learner_against_self(n_sessions=5, games_per_session=10000):
    """Train a learner against itself
	
	Loads Q file for both P1 and P2. Learner (P1) starts with a high
	exploration factor that decays over time, while P2 uses no exploration
	factor. P1 updates its Q file over the course of *games_per_session*.

	After *games_per_session* have been played, P2 then updates its Q
	strategy to match P1's again, and the process continues iteratively for
	*n_sessions*
	"""
    for n in range(n_sessions):
        run_simulator(
            n_games=games_per_session,
            p1=Player(strategy="basic_q", learning=True, load_Q=True),
            p2=Player(strategy="basic_q", learning=False, load_Q=True),
            save_Q=True,
        )
        print "%d training sessions completed out of %d" % (n + 1, n_sessions)
Example #7
def SaveGame(self, request, context):
    # reconstruct a GameHistory from the incoming request and store it in the replay buffer
    game_history = GameHistory()
    game_history.observations = [tf.make_ndarray(observation) for observation in request.observations]
    game_history.actions = [Action(index) for index in request.actions]
    game_history.rewards = request.rewards
    game_history.to_plays = [Player(player_id) for player_id in request.to_plays]
    game_history.root_values = request.root_values
    game_history.policies = [policy.probabilities for policy in request.policies]
    self.replay_buffer.save_history(game_history)
    print('Number of games in buffer: {}'.format(len(self.replay_buffer.buffer)))
    return replay_buffer_pb2.SaveGameResponse(success=True)
Example #8
    def run_mcts(self, root, num_moves):
        min_max_stats = MinMaxStats(self.config.known_bounds)

        for _ in range(self.config.num_simulations):
            # root.print()
            # descend the tree to a leaf and work out whose turn it is there
            action, leaf, cur_moves = self.select_leaf(root, num_moves, min_max_stats)
            to_play = Player(cur_moves % self.config.game_config.num_players)

            # expand the leaf using the network's recurrent inference on the parent's hidden state
            batch_hidden_state = tf.expand_dims(leaf.parent.hidden_state, axis=0)
            network_output = self.network.recurrent_inference(batch_hidden_state, [action]).split_batch()[0]
            self.expand_node(node=leaf, to_play=to_play, actions=self.config.game_config.action_space,
                             network_output=network_output)

            # propagate the predicted value back up the search path
            self.backpropagate(leaf, network_output.value, to_play, min_max_stats)
Example #9
def human_vs_human():
    Game(p1=Player(strategy="human"),
         p2=Player(strategy="human"),
         verbose=True).play_game()
Example #10
def adversarial_training(
        p1=Player(strategy="basic_q", learning=False, load_Q=True),
        p2=Player(strategy="random", learning=False),
        p3=Player(strategy="basic_q", learning=True, load_Q=True),
        p4=Player(strategy="adversarial", learning=False),
):
    """Finds strategies that beat the AI for targeted training

	Arguments:
		p1: trained Q-learner in "test mode" (no learning)
		p2: cpu player choosing random moves
		p3: trained Q-leaner that will continue to train
		p4: p3's opponent, uses saved strategy uncovered by p2 that beat p1

	Plays a trained AI against a random player until (if) the random player
	wins. If the random player does win, it will save that strategy in its
	own Q file. It will then play this adversarial scenario many times so
	that the AI can learn a better strategy.

	Training is very specific and deep (many repetitions), so this is not
	meant to be a general training strategy and is best used after the AI
	is already sufficiently robust. Each time this function runs will only
	cover one adversarial example, so it may need to be run many times.
	"""

    # phase 1: AI vs. random opponent
    max_games = 50000
    games = [Game(p1=p1, p2=p2) for i in range(max_games)]

    for game in games:

        # run game and get results + P1's states and actions
        outcome, x_decisions, o_decisions = game.play_game()

        # build a Q network for player O only where X lost
        if outcome == "lost":

            # phase 2: adversarial training
            print "AI lost. Playing adversarial games..."
            Q_adversarial = {board: action for board, action in o_decisions}
            num_games = 10000
            a_games = [Game(
                p1=p3,
                p2=p4,
            ) for i in range(num_games)]

            starting_epsilon = p3._epsilon

            a_index = 0
            a_metrics = []
            for game in a_games:
                p4._Q = Q_adversarial
                a_index += 1
                outcome, x_decisions, o_decisions = game.play_game()
                a_metrics.append(outcome)
                p3.update_q(outcome, x_decisions)
                p3._epsilon = starting_epsilon - starting_epsilon * (
                    1. * a_index / num_games)**2

            print "Adversarial game outcomes:\n"
            print Counter(a_metrics)

            print "Building better, stronger Q..."
            pickle.dump(p3._Q, open(p3.q_file, "wb"))

            # end training
            return
    print "played %d games without losing!" % max_games
Example #11
def test_learner_against_human():
    # tests Q-learner against human player ("O")
    p1 = Player(strategy="basic_q", load_Q=True)
    p2 = Player(strategy="human")
    game = Game(p1=p1, p2=p2, verbose=True)
    game.play_game()