Example #1
    def run(self, lr, milestones=False, silent=False):
        self.player1 = self.pretrained_player if self.pretrained_player else FCBaseLinePlayer(
            lr=lr)

        # Player 2 has the same start conditions as Player 1 but does not train
        self.player2 = self.player1.copy(shared_weights=True)
        self.player2.strategy.train = False

        games_per_evaluation = self.games // self.evaluations
        start_time = datetime.now()
        for episode in range(1, self.evaluations + 1):
            # If milestones exist, train against a random milestone with probability 0.2
            if self.milestones and random() < 0.2:
                self.player2 = choice(self.milestones)
            else:
                self.player2 = self.player1.copy(shared_weights=True)

            # train
            self.player1.strategy.train, self.player1.strategy.model.training = True, True  # training mode
            self.player2.strategy.train = False

            # Rebuild the simulation so it pits player1 against the opponent chosen for this episode
            self.simulation = TicTacToe([self.player1, self.player2])
            results, losses = self.simulation.run_simulations(
                games_per_evaluation)
            self.add_loss(np.mean(losses))
            self.add_results(("Self", np.mean(results)))

            # evaluate
            self.player1.strategy.train, self.player1.strategy.model.training = False, False  # eval mode
            score, results, overview = evaluate_against_base_players(
                self.player1)
            self.add_results(results)

            if not silent and Printer.print_episode(
                    episode * games_per_evaluation, self.games,
                    datetime.now() - start_time):
                self.plot_and_save(
                    "%s vs SELF" % (self.player1.__str__() +
                                    (" milestones" if milestones else "")),
                    "Train %s vs Self version of self\nGames: %s Evaluations: %s\nTime: %s"
                    % (self.player1, episode * games_per_evaluation,
                       self.evaluations, config.time_diff(start_time)))

            # Save a milestone each time another fifth of the training is completed
            if milestones and episode % max(1, self.evaluations // 5) == 0:
                self.milestones.append(self.player1.copy(shared_weights=False))
                self.milestones[-1].strategy.train = False

        self.final_score, self.final_results, self.results_overview = evaluate_against_base_players(
            self.player1, silent=False)
        return self
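
The milestone mechanism above can be hard to follow inside the full training loop. The stand-alone sketch below isolates it: with probability 0.2 the opponent for an episode is a randomly chosen frozen snapshot, otherwise a copy of the current player, and snapshots are appended on a fixed schedule. Plain strings stand in for the actual player objects, so the snippet runs on its own.

from random import random, choice

milestones = []                    # frozen snapshots of earlier training stages
current = "player@episode0"        # stand-in for the learning player

for episode in range(1, 11):
    if milestones and random() < 0.2:
        opponent = choice(milestones)      # replay against an earlier version
    else:
        opponent = current                 # mirror match against the current self
    # ... run games_per_evaluation self-play games against `opponent` here ...
    current = "player@episode%d" % episode
    if episode % 2 == 0:                   # stand-in for the "every fifth of training" schedule
        milestones.append(current)

print("milestones:", milestones)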
Example #2
    def run(self, lr, silent=False):

        self.player1 = self.pretrained_player if self.pretrained_player else FCBaseLinePlayer(
            lr=lr)

        if self.opponent is not None:
            self.player2 = self.opponent
            self.simulation = TicTacToe([self.player1, self.player2])

        games_per_evaluation = self.games // self.evaluations
        start_time = datetime.now()
        for episode in range(1, self.evaluations + 1):

            if self.opponent is None:
                self.player2 = choice(
                    (RandomPlayer(), NovicePlayer(), ExperiencedPlayer()
                     ))  # choice((RandomPlayer(), ExpertPlayer()))
                self.simulation = TicTacToe([self.player1, self.player2])

            # train
            self.player1.strategy.train, self.player1.strategy.model.training = True, True  # training mode

            results, losses = self.simulation.run_simulations(
                games_per_evaluation)
            self.add_loss(np.mean(losses))
            self.add_results(("Training Results", np.mean(results)))

            # evaluate
            self.player1.strategy.train, self.player1.strategy.model.training = False, False  # eval mode
            if self.opponent is None:
                score, results, overview = evaluate_against_base_players(
                    self.player1)
            else:
                score, results, overview = evaluate_against_base_players(
                    self.player1, evaluation_players=[self.opponent])

            self.add_results(results)

            if not silent:
                if Printer.print_episode(episode * games_per_evaluation,
                                         self.games,
                                         datetime.now() - start_time):
                    overview = format_overview(overview)
                    self.plot_and_save(
                        "%s vs TRADITIONAL OPPONENT" % (self.player1),
                        "Train %s vs %s\nGames: %s Evaluations: %s\nTime: %s" %
                        (self.player1, self.opponent,
                         episode * games_per_evaluation, self.evaluations,
                         config.time_diff(start_time)))

        self.final_score, self.final_results, self.results_overview = evaluate_against_base_players(
            self.player1, silent=False)
        return self
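
The only structural difference from the self-play experiment is how the opponent is chosen: if a fixed opponent was configured it is used for every episode, otherwise a new baseline opponent is sampled per evaluation episode. A minimal, self-contained sketch of that rotation, with strings standing in for the RandomPlayer / NovicePlayer / ExperiencedPlayer instances:

from random import choice

fixed_opponent = None              # set to a concrete player to train against it exclusively

for episode in range(1, 6):
    if fixed_opponent is None:
        opponent = choice(("RandomPlayer", "NovicePlayer", "ExperiencedPlayer"))
    else:
        opponent = fixed_opponent
    print("episode %d trains against %s" % (episode, opponent))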
Example #3
    def run(self, lr, termination_criterion, silent=False):
        self.player = FCReinforcePlayer(lr=lr)
        self.player.color = config.BLACK

        generator = RandomPlayer()
        print("Pretraining %s on legal moves" % self.player.__str__())

        losses, rewards = [], []
        start = datetime.now()
        for game in range(1, self.max_games + 1):
            loss, reward = self.__run_episode__(generator)
            losses.append(loss)
            rewards.append(reward)

            if not silent:
                if Printer.print_episode(game, self.max_games,
                                         datetime.now() - start):
                    plot_name = "Pretraining %s using %s layers on legal moves\nlr: %s" % (
                        self.player.__class__.__name__, LAYERS, lr)
                    plot_info = "%sGames - Final reward: %s \nTime: %s" % (
                        game, reward, config.time_diff(start))
                    self.plot_and_save(plot_name, plot_name + "\n" + plot_info)
                    if (100 * game / self.max_games) % 10 == 0:
                        self.save_player(
                            self.player,
                            "using %s layers pretrained on legal moves for %s games lr: %s"
                            % (LAYERS, self.max_games, lr))

            if game > termination_criterion and sum(rewards[
                    -termination_criterion:]) / termination_criterion == 1:
                print(
                    "Reached training goal: %s games with only legal moves played -> terminating training."
                    % termination_criterion)
                self.save_player(
                    self.player,
                    "using %s layers pretrained on legal moves for %s games lr: %s"
                    % (LAYERS, self.max_games, lr))
                return losses, rewards

        print("Reached max training_games (%s) -> terminating training" %
              self.max_games)
        self.save_player(
            self.player,
            "using %s layers pretrained on legal moves for %s games lr: %s" %
            (LAYERS, self.max_games, lr))
        return losses, rewards
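
The termination criterion above stops pretraining once the last `termination_criterion` episodes all yielded a reward of 1, i.e. only legal moves were played for that many games in a row. A self-contained sketch of the check:

def should_terminate(rewards, termination_criterion):
    """True once more than `termination_criterion` episodes were played and
    the most recent `termination_criterion` rewards are all 1."""
    return (len(rewards) > termination_criterion
            and sum(rewards[-termination_criterion:]) / termination_criterion == 1)

print(should_terminate([0, 1, 1, 1], 3))   # True: the last three rewards are all 1
print(should_terminate([1, 1, 0, 1], 3))   # False: one illegal-move episode among the last three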
Example #4
    def run(self, lr, silent=False):
        self.player1 = self.pretrained_player if self.pretrained_player else FCReinforcePlayer(
            lr=lr)

        # Player 2 has the same start conditions as Player 1 but does not train
        self.player2 = self.player1.copy(shared_weights=False)
        self.player2.strategy.train = False

        games_per_evaluation = self.games // self.evaluations
        self.replacements = []
        start_time = datetime.now()
        for episode in range(1, self.evaluations + 1):
            # train
            self.player1.strategy.train, self.player1.strategy.model.training = True, True  # training mode

            self.simulation = TicTacToe([self.player1, self.player2])
            results, losses = self.simulation.run_simulations(
                games_per_evaluation)
            self.add_results(("Losses", np.mean(losses)))

            # evaluate
            self.player1.strategy.train, self.player1.strategy.model.training = False, False  # eval mode
            score, results, overview = evaluate_against_base_players(
                self.player1)
            self.add_loss(np.mean(losses))
            self.add_results(results)

            if not silent and Printer.print_episode(
                    episode * games_per_evaluation, self.games,
                    datetime.now() - start_time):
                self.plot_and_save(
                    "%s vs BEST" % (self.player1),
                    "Train %s vs Best version of self\nGames: %s Evaluations: %s\nTime: %s"
                    % (self.player1, episode * games_per_evaluation,
                       self.evaluations, config.time_diff(start_time)))

            if evaluate_against_each_other(self.player1, self.player2):
                self.player2 = self.player1.copy(shared_weights=False)
                self.player2.strategy.train, self.player2.strategy.model.training = False, False
                self.replacements.append(episode)

        print("Best player replaced after episodes: %s" % self.replacements)
        self.final_score, self.final_results, self.results_overview = evaluate_against_base_players(
            self.player1, silent=False)
        return self
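
The core idea of this experiment is the replacement rule: the frozen "best" opponent is only overwritten by a copy of the learner once the learner actually beats it (via evaluate_against_each_other). The sketch below reproduces that rule with numeric skill values standing in for real players, so it runs on its own.

from random import random

learner_skill, best_skill = 0.5, 0.5
replacements = []

for episode in range(1, 21):
    learner_skill += 0.05 * random()                   # stand-in for one round of training
    learner_wins = learner_skill - best_skill > 0.1    # stand-in for evaluate_against_each_other
    if learner_wins:
        best_skill = learner_skill                     # "copy" the learner into the frozen best player
        replacements.append(episode)

print("Best player replaced after episodes: %s" % replacements)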
Example #5
    def run(self, lr, silent=False):

        EVALUATION_GAMES = 10

        player = FCReinforcePlayer(lr=lr)
        player.color = config.BLACK

        expert = ExperiencedPlayer(deterministic=True, block_mid=True)
        expert.color = config.BLACK

        generator = RandomPlayer()
        color_iterator = self.AlternatingColorIterator()

        validation_set = self.generate_supervised_training_data(
            EVALUATION_GAMES,
            ExperiencedPlayer(deterministic=True, block_mid=True))

        print("Training ReinforcedPlayer supervised continuously with LR: %s" %
              lr)
        start = datetime.now()
        for game in range(self.games):
            rewards = []
            board = TicTacToeBoard()

            for i in range(9):
                expert_move = expert.get_move(board)
                player_move = player.get_move(board)

                reward = config.LABEL_WIN if expert_move == player_move else config.LABEL_LOSS
                rewards.append(reward)

                # prepare for next sample
                move = generator.get_move(board)
                board.apply_move(move, next(color_iterator))

            average_reward = sum(rewards) / len(rewards)
            player.strategy.rewards = rewards
            loss = player.strategy.update()

            del rewards[:]
            self.add_results([("Losses", loss), ("Reward", average_reward)])

            if game % self.evaluation_period == 0:
                test_rewards = []
                for board, expert_move in validation_set:
                    # Evaluation mode
                    player.strategy.train, player.strategy.model.training = False, False
                    strategy_move = player.get_move(board)
                    player.strategy.train, player.strategy.model.training = True, True

                    test_reward = config.BLACK if expert_move == strategy_move else config.WHITE
                    test_rewards.append(test_reward)

                average_test_reward = sum(test_rewards) / len(test_rewards)
                del test_rewards[:]
                self.add_results(("Test reward", average_test_reward))

            if not silent:
                if Printer.print_episode(game + 1, self.games,
                                         datetime.now() - start):
                    plot_name = "Supervised Continuous training of %s" % (
                        player)
                    plot_info = "%s Games - Final reward: %s \nTime: %s" % (
                        game + 1, average_reward, config.time_diff(start))
                    self.plot_and_save(plot_name, plot_name + "\n" + plot_info)

        return average_reward
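
The snippet uses self.AlternatingColorIterator() to fill the board with random moves of alternating colors; its implementation is not shown here. A minimal stand-in consistent with that usage (the "BLACK"/"WHITE" placeholder values are assumptions, the real code presumably uses config.BLACK and config.WHITE):

from itertools import cycle

class AlternatingColorIterator:
    """Yields BLACK, WHITE, BLACK, WHITE, ... indefinitely."""

    def __init__(self, black="BLACK", white="WHITE"):
        self._colors = cycle((black, white))

    def __iter__(self):
        return self

    def __next__(self):
        return next(self._colors)

colors = AlternatingColorIterator()
print([next(colors) for _ in range(4)])   # ['BLACK', 'WHITE', 'BLACK', 'WHITE']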
Example #6
from datetime import datetime
from random import random

import TicTacToe.config as conf  # assumed module path for the shared config object
from TicTacToe.experiments.ticTacToeBaseExperiment import TicTacToeBaseExperiment
from TicTacToe.environment.evaluation import evaluate_against_each_other
from TicTacToe.players.basePlayers import RandomPlayer, ExperiencedPlayer, ExpertPlayer


class EvaluatePlayer(TicTacToeBaseExperiment):
    config = conf

    def __init__(self):
        super(EvaluatePlayer, self).__init__()

    def reset(self):
        super().__init__()

    def run(self, player1, player2):
        evaluate_against_each_other(player1, player2, silent=False)


if __name__ == '__main__':

    START_TIME = datetime.now()

    LR = 1e-5 + random() * 1e-9
    PLAYER1 = ExperiencedPlayer()
    PLAYER2 = ExpertPlayer()  # RandomPlayer()

    experiment = EvaluatePlayer()
    experiment.run(player1=PLAYER1, player2=PLAYER2)

    print("\n| Evaluation completed, took %s |" % conf.time_diff(START_TIME))
Example #7
    def run(self, lr, silent=False):

        print(
            "Training PGStrategy supervised on %s games for %s Episodes - LR: %s"
            % (self.games, self.episodes, lr))
        TEST_GAMES = 1

        player = FCReinforcePlayer(lr=lr)
        player.color = config.BLACK

        expert = ExperiencedPlayer(deterministic=True, block_mid=True)
        expert.color = config.BLACK

        training_set = self.generate_supervised_training_data(
            self.games, expert)
        test_set = self.generate_supervised_training_data(TEST_GAMES, expert)

        start = datetime.now()
        for episode in range(self.episodes):
            rewards = []
            test_rewards = []

            for board, expert_move in training_set:
                # Training mode
                player.strategy.train, player.strategy.model.training = True, True

                strategy_move = player.get_move(board)
                reward = config.LABEL_WIN if expert_move == strategy_move else config.LABEL_LOSS
                rewards.append(reward)

            average_reward = sum(rewards) / len(rewards)
            player.strategy.rewards = rewards
            loss = player.strategy.update()

            for board, expert_move in test_set:
                # Evaluation mode
                player.strategy.train, player.strategy.model.training = False, False

                strategy_move = player.get_move(board)
                test_reward = config.BLACK if expert_move == strategy_move else config.WHITE
                test_rewards.append(test_reward)

            average_test_reward = sum(test_rewards) / len(test_rewards)

            self.add_results([("Losses", loss),
                              ("Average reward", average_reward),
                              ("Average test reward", average_test_reward)])

            if not silent:
                if Printer.print_episode(episode + 1, self.episodes,
                                         datetime.now() - start):
                    plot_name = "Supervised on %s games lr: %s" % (self.games,
                                                                   lr)
                    plot_info = "Lr: %s - %s Games - %s Episodes\nFinal Scores: %s / %s \nTime: %s" % (
                        lr, self.games, episode + 1,
                        '{:.2f}'.format(average_reward),
                        '{:.2f}'.format(average_test_reward),
                        config.time_diff(start))
                    self.plot_and_save(plot_name, plot_name + "\n" + plot_info)

        return average_reward, average_test_reward
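
Both supervised experiments (#5 and #7) use the same labelling scheme: a position counts as a win when the strategy reproduces the expert move and as a loss otherwise, and the episode score is the mean over all positions. A self-contained sketch of that scheme (the LABEL_WIN/LABEL_LOSS values of 1 and -1 are assumptions; the real values live in config):

LABEL_WIN, LABEL_LOSS = 1, -1       # assumed values of config.LABEL_WIN / config.LABEL_LOSS

def episode_rewards(pairs):
    """pairs: iterable of (expert_move, strategy_move) tuples."""
    return [LABEL_WIN if expert == strategy else LABEL_LOSS
            for expert, strategy in pairs]

rewards = episode_rewards([((0, 0), (0, 0)), ((1, 1), (2, 0)), ((2, 2), (2, 2))])
print(rewards, sum(rewards) / len(rewards))   # [1, -1, 1] 0.333...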