Example #1
    def generate_supervised_training_data(cls, games, labeling_strategy):
        """
        Generates training data by applying random moves to a board and labeling each sample with the move that :param labeling_strategy would have taken given the board.

        :param games: The number of games to be simulated
        :param labeling_strategy: The strategy used to label each sample. The label equals labeling_strategy.get_move(board)
        :return: a list of tuples(board_sample, move_label)
        """

        labeling_strategy.color = cls.config.BLACK

        generator = RandomPlayer()
        color_iterator = TicTacToeBaseExperiment.AlternatingColorIterator()

        start = datetime.now()
        training_set = []
        for game in range(games):
            board = TicTacToeBoard()
            for i in range(9):
                # generate training pair
                expert_move = labeling_strategy.get_move(board)
                training_set.append((board.copy(), expert_move))

                # prepare for next sample
                move = generator.get_move(board)
                board.apply_move(move, next(color_iterator))

        print("Generated %s training pairs form %s games in %s" % (len(training_set), games, datetime.now() - start))
        return training_set
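
For context, a minimal usage sketch of this helper follows. It assumes the method is exposed as a classmethod on TicTacToeBaseExperiment (the class referenced inside the snippet) and borrows ExperiencedPlayer from Example #9 as the labeling strategy; import paths depend on the project layout and are therefore omitted.

# Usage sketch only -- the classmethod assumption and class names are taken from
# the snippet above; imports are project-specific and omitted here.
labeler = ExperiencedPlayer(deterministic=True, block_mid=True)
training_set = TicTacToeBaseExperiment.generate_supervised_training_data(
    games=100, labeling_strategy=labeler)

for board_sample, move_label in training_set:
    # each pair can be fed to a supervised learner as (input, target)
    pass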
Example #2
    def test_DummyUpdate(self):
        board = TicTacToeBoard()
        value_function = PGStrategy(lr=0.001, weight_decay=0.003)
        value_function.evaluate(board.board, board.get_legal_moves_map(config.BLACK))

        move = RandomPlayer().get_move(board)
        board.apply_move(move, config.BLACK)
        value_function.evaluate(board.board, board.get_legal_moves_map(config.BLACK))

        move = RandomPlayer().get_move(board)
        board.apply_move(move, config.WHITE)
        value_function.evaluate(board.board, board.get_legal_moves_map(config.BLACK))
Example #3
def evaluate_against_base_players(player,
                                  evaluation_players=[
                                      RandomPlayer(),
                                      NovicePlayer(),
                                      ExperiencedPlayer(),
                                      ExpertPlayer()
                                  ],
                                  silent=True):
    """
    Standardized evaluation against base players.

    :param player: The player to be evaluated
    :param evaluation_players: A list of players against which the player should be evaluated
    :param silent: Flag controlling if output is written to console
    :return: a tuple (score, results) where score is the average score over all evaluation games (scalar (-1, 1)) and results is a list of
             tuples (name, score) where 'score' is the score (-1, 1) achieved when evaluated against the player named 'name'.
    """

    # Store original training values
    if issubclass(player.__class__, LearningPlayer):
        training_values = player.strategy.train, player.strategy.model.training
        player.strategy.train, player.strategy.model.training = False, False

    results = []
    for e_player in evaluation_players:
        simulation = TicTacToe([player, e_player])
        rewards, losses = simulation.run_simulations(config.EVALUATION_GAMES)
        results.append([e_player.__str__(), rewards])

        if not silent:
            print_results(player, e_player, rewards)

    # Restore original training values
    if issubclass(player.__class__, LearningPlayer):
        player.strategy.train, player.strategy.model.training = training_values

    avg_results = [(result[0], np.mean(result[1])) for result in results]
    avg_results.insert(
        0, ("Total Score", np.mean([
            res[1] for res in avg_results
        ])))  # Insert average overall score as first element of results

    results_overview = deepcopy(results)
    total = Counter()
    for entry in results_overview:
        entry[1] = Counter(entry[1])
        total += entry[1]
    results_overview.insert(
        0, ("[Total Score]",
            total))  # Insert combined outcome counts as first element of results_overview

    if not silent:
        print("Overall score: %s" % avg_results[0][1])

    return avg_results[0][1], avg_results, results_overview
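
The returned triple is typically unpacked as in Example #4 below. A minimal sketch of such a call, with the player construction purely illustrative (FCReinforcePlayer and its learning rate are reused from the other examples on this page):

# Illustrative call only -- the default evaluation_players list is kept.
score, avg_results, results_overview = evaluate_against_base_players(
    FCReinforcePlayer(lr=1e-4), silent=False)

print("Average score over all base players: %s" % score)
for name, avg_score in avg_results[1:]:  # skip the leading "Total Score" entry
    print("%s: %s" % (name, avg_score))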
Example #4
    def run(self, lr, silent=False):

        self.player1 = self.pretrained_player if self.pretrained_player else FCBaseLinePlayer(
            lr=lr)

        if self.opponent is not None:
            self.player2 = self.opponent
            self.simulation = TicTacToe([self.player1, self.player2])

        games_per_evaluation = self.games // self.evaluations
        start_time = datetime.now()
        for episode in range(1, self.evaluations + 1):

            if self.opponent is None:
                self.player2 = choice(
                    (RandomPlayer(), NovicePlayer(), ExperiencedPlayer()
                     ))  # choice((RandomPlayer(), ExpertPlayer()))
                self.simulation = TicTacToe([self.player1, self.player2])

            # train
            self.player1.strategy.train, self.player1.strategy.model.training = True, True  # training mode

            results, losses = self.simulation.run_simulations(
                games_per_evaluation)
            self.add_loss(np.mean(losses))
            self.add_results(("Training Results", np.mean(results)))

            # evaluate
            self.player1.strategy.train, self.player1.strategy.model.training = False, False  # eval mode
            if self.opponent is None:
                score, results, overview = evaluate_against_base_players(
                    self.player1)
            else:
                score, results, overview = evaluate_against_base_players(
                    self.player1, evaluation_players=[self.opponent])

            self.add_results(results)

            if not silent:
                if Printer.print_episode(episode * games_per_evaluation,
                                         self.games,
                                         datetime.now() - start_time):
                    overview = format_overview(overview)
                    self.plot_and_save(
                        "%s vs TRADITIONAL OPPONENT" % (self.player1),
                        "Train %s vs %s\nGames: %s Evaluations: %s\nTime: %s" %
                        (self.player1, self.opponent,
                         episode * games_per_evaluation, self.evaluations,
                         config.time_diff(start_time)))

        self.final_score, self.final_results, self.results_overview = evaluate_against_base_players(
            self.player1, silent=False)
        return self
Example #5
    def run(self, lr, termination_criterion, silent=False):
        self.player = FCReinforcePlayer(lr=lr)
        self.player.color = config.BLACK

        generator = RandomPlayer()
        print("Pretraining %s on legal moves" % self.player.__str__())

        losses, rewards = [], []
        start = datetime.now()
        for game in range(1, self.max_games + 1):
            loss, reward = self.__run_episode__(generator)
            losses.append(loss)
            rewards.append(reward)

            if not silent:
                if Printer.print_episode(game, self.max_games,
                                         datetime.now() - start):
                    plot_name = "Pretraining %s using %s layers on legal moves\nlr: %s" % (
                        self.player.__class__.__name__, LAYERS, lr)
                    plot_info = "%sGames - Final reward: %s \nTime: %s" % (
                        game, reward, config.time_diff(start))
                    self.plot_and_save(plot_name, plot_name + "\n" + plot_info)
                    if (100 * game / self.max_games) % 10 == 0:
                        self.save_player(
                            self.player,
                            "using %s layers pretrained on legal moves for %s games lr: %s"
                            % (LAYERS, self.max_games, lr))

            if (game > termination_criterion and
                    sum(rewards[-termination_criterion:]) / termination_criterion == 1):
                print(
                    "Reached training goal: %s games with only legal moves played -> terminating training."
                    % termination_criterion)
                self.save_player(
                    self.player,
                    "using %s layers pretrained on legal moves for %s games lr: %s"
                    % (LAYERS, self.max_games, lr))
                return losses, rewards

        print("Reached max training_games (%s) -> terminating training" %
              self.max_games)
        self.save_player(
            self.player,
            "using %s layers pretrained on legal moves for %s games lr: %s" %
            (LAYERS, self.max_games, lr))
        return losses, rewards
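
The stopping rule above fires once the last termination_criterion episodes all ended with reward 1, i.e. only legal moves were produced for that many games in a row. A stand-alone illustration of the window check:

# Stand-alone illustration of the termination check used in the loop above.
rewards = [0, 1, 1, 1, 1]
termination_criterion = 3

done = (len(rewards) > termination_criterion
        and sum(rewards[-termination_criterion:]) / termination_criterion == 1)
print(done)  # True: the last 3 rewards are all 1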
Example #6
    def test_neverLose(self):
        GAMES = 10000

        player1 = SearchPlayer()
        player2 = SearchPlayer()
        random_player = RandomPlayer()

        simulation = TicTacToe([player1, player2])
        results, losses = simulation.run_simulations(GAMES // 100)
        self.assertEqual(
            len(results), results.count(0),
            "Perfect player mirror match resulted in a result other than draw")
        print("\nFirst 20 results: %s against self" % results[:20])

        simulation = TicTacToe([player1, random_player])
        results, losses = simulation.run_simulations(GAMES)
        self.assertEqual(0, results.count(-1),
                         "Perfect player lost against random")
        print("First 20 results: %s against random player" % results[:20])
        print("Win rate: %s vs random player" % (sum(results) / len(results)))
Example #7
    def test_ConvReinforcePlayer(self):
        fc_player = FCReinforcePlayer(lr=1e-4)
        random_player = RandomPlayer()

        simulation = TicTacToe([fc_player, random_player])
        simulation.run_simulations(100)
Example #8
    def test_DummyTrainReinforcePlayer(self):
        player1 = FCReinforcePlayer(lr=0.001)
        player2 = RandomPlayer()

        simulation = TicTacToe([player1, player2])
        simulation.run_simulations(10)
Example #9
    def run(self, lr, silent=False):

        EVALUATION_GAMES = 10

        player = FCReinforcePlayer(lr=lr)
        player.color = config.BLACK

        expert = ExperiencedPlayer(deterministic=True, block_mid=True)
        expert.color = config.BLACK

        generator = RandomPlayer()
        color_iterator = self.AlternatingColorIterator()

        validation_set = self.generate_supervised_training_data(
            EVALUATION_GAMES,
            ExperiencedPlayer(deterministic=True, block_mid=True))

        print("Training ReinforcedPlayer supervised continuously with LR: %s" %
              lr)
        start = datetime.now()
        for game in range(self.games):
            rewards = []
            board = TicTacToeBoard()

            for i in range(9):
                expert_move = expert.get_move(board)
                player_move = player.get_move(board)

                reward = config.LABEL_WIN if expert_move == player_move else config.LABEL_LOSS
                rewards.append(reward)

                # prepare for next sample
                move = generator.get_move(board)
                board.apply_move(move, next(color_iterator))

            average_reward = sum(rewards) / len(rewards)
            player.strategy.rewards = rewards
            loss = player.strategy.update()

            del rewards[:]
            self.add_results([("Losses", loss), ("Reward", average_reward)])

            if game % self.evaluation_period == 0:
                test_rewards = []
                for board, expert_move in validation_set:
                    # Evaluation mode
                    player.strategy.train, player.strategy.model.training = False, False
                    strategy_move = player.get_move(board)
                    player.strategy.train, player.strategy.model.training = True, True

                    # the color constants double as match (BLACK) / mismatch (WHITE) labels
                    test_reward = config.BLACK if expert_move == strategy_move else config.WHITE
                    test_rewards.append(test_reward)

                average_test_reward = sum(test_rewards) / len(test_rewards)
                del test_rewards[:]
                self.add_results(("Test reward", average_test_reward))

            if not silent:
                if Printer.print_episode(game + 1, self.games,
                                         datetime.now() - start):
                    plot_name = "Supervised Continuous training of %s" % (
                        player)
                    plot_info = "%s Games - Final reward: %s \nTime: %s" % (
                        game + 1, average_reward, config.time_diff(start))
                    self.plot_and_save(plot_name, plot_name + "\n" + plot_info)

        return average_reward