def run(self, lr, termination_criterion, silent=False):
    """
    Pretrain a freshly created FCReinforcePlayer on playing legal moves.

    One episode is played per game via self.__run_episode__. Training stops early as soon as
    more than `termination_criterion` games were played and the mean of the last
    `termination_criterion` episode rewards equals 1; otherwise it stops after self.max_games games.
    The player is saved whenever the progress in percent is a multiple of 10 and once more
    right before returning.

    :param lr: learning rate passed to the FCReinforcePlayer constructor
    :param termination_criterion: size of the window of most recent games checked for early termination
    :param silent: if True, no progress is printed and no plots are saved
    :return: tuple (losses, rewards), each holding one entry per played game
    """
    self.player = FCReinforcePlayer(lr=lr)
    self.player.color = config.BLACK
    move_generator = RandomPlayer()

    # Every checkpoint written by this run carries the same description.
    checkpoint_description = "using %s layers pretrained on legal moves for %s games lr: %s" % (
        LAYERS, self.max_games, lr)

    print("Pretraining %s on legal moves" % str(self.player))

    losses = []
    rewards = []
    started_at = datetime.now()

    for game in range(1, self.max_games + 1):
        episode_loss, episode_reward = self.__run_episode__(move_generator)
        losses.append(episode_loss)
        rewards.append(episode_reward)

        if not silent and Printer.print_episode(game, self.max_games, datetime.now() - started_at):
            title = "Pretraining %s using %s layers on legal moves\nlr: %s" % (
                self.player.__class__.__name__, LAYERS, lr)
            details = "%sGames - Final reward: %s \nTime: %s" % (
                game, episode_reward, config.time_diff(started_at))
            self.plot_and_save(title, title + "\n" + details)

        # Checkpoint at every 10% of the planned games.
        if (100 * game / self.max_games) % 10 == 0:
            self.save_player(self.player, checkpoint_description)

        # The window mean is only computed once enough games were played.
        if game > termination_criterion \
                and sum(rewards[-termination_criterion:]) / termination_criterion == 1:
            print(
                "Reached training goal: %s games with only legal moves played -> terminating training."
                % termination_criterion)
            self.save_player(self.player, checkpoint_description)
            return losses, rewards

    print("Reached max training_games (%s) -> terminating training" % self.max_games)
    self.save_player(self.player, checkpoint_description)
    return losses, rewards
def run(self, lr, silent=False):
    """
    Train player1 against a traditional opponent, evaluating it after every batch of games.

    self.games is split into self.evaluations equally sized batches (integer division).
    When self.opponent is None a new opponent is drawn at random from RandomPlayer,
    NovicePlayer and ExperiencedPlayer before every batch; otherwise self.opponent is used
    throughout and is also the only evaluation player.

    :param lr: learning rate for a new FCReinforcePlayer; only used when self.pretrained_player is falsy
    :param silent: if True, progress printing and plotting are skipped
    :return: this experiment instance
    """

    def set_training_mode(enabled):
        # Switches both the strategy flag and the model flag, in that order.
        self.player1.strategy.train = enabled
        self.player1.strategy.model.training = enabled

    self.player1 = self.pretrained_player or FCReinforcePlayer(lr=lr)

    if self.opponent is not None:
        self.player2 = self.opponent
        self.simulation = TicTacToe([self.player1, self.player2])

    batch_size = self.games // self.evaluations
    began = datetime.now()

    for episode in range(1, self.evaluations + 1):
        if self.opponent is None:
            # No fixed opponent configured: draw a fresh one for this batch.
            self.player2 = choice((RandomPlayer(), NovicePlayer(), ExperiencedPlayer()))
            self.simulation = TicTacToe([self.player1, self.player2])

        # Training phase
        set_training_mode(True)
        results, losses = self.simulation.run_simulations(batch_size)
        self.add_loss(np.mean(losses))
        self.add_results(("Training Results", np.mean(results)))

        # Evaluation phase
        set_training_mode(False)
        if self.opponent is not None:
            _score, results, _overview = evaluate_against_base_players(
                self.player1, evaluation_players=[self.opponent])
        else:
            _score, results, _overview = evaluate_against_base_players(self.player1)
        self.add_results(results)

        if silent:
            continue

        games_played = episode * batch_size
        if Printer.print_episode(games_played, self.games, datetime.now() - began):
            self.plot_and_save(
                "%s vs TRADITIONAL OPPONENT" % (self.player1),
                "Train %s vs %s\nGames: %s Evaluations: %s\nTime: %s"
                % (self.player1, self.opponent, games_played, self.evaluations, config.time_diff(began)))

    final_evaluation = evaluate_against_base_players(self.player1, silent=False)
    self.final_score, self.final_results, self.results_overview = final_evaluation
    return self
def run(self, lr, silent=False):
    """
    Train player1 against a frozen copy of itself, the current "best" player.

    self.games is split into self.evaluations equally sized batches (integer division).
    After each batch player1 is evaluated against the base players. Whenever
    evaluate_against_each_other(player1, player2) returns a truthy value, player2 is replaced
    by a new frozen copy of player1 and the episode number is appended to self.replacements.

    The initial opponent is frozen in exactly the same way as every later replacement
    (strategy.train and strategy.model.training both False). Previously only strategy.train
    was cleared for the initial copy, which left its model flagged as training during the
    first episodes.

    :param lr: learning rate for a new FCReinforcePlayer; only used when self.pretrained_player is falsy
    :param silent: if True, progress printing and plotting are skipped
    :return: this experiment instance
    """

    def frozen_copy_of_player1():
        # Independent weights, and neither the strategy nor its model flagged as training.
        best = self.player1.copy(shared_weights=False)
        best.strategy.train, best.strategy.model.training = False, False
        return best

    def set_training_mode(enabled):
        self.player1.strategy.train, self.player1.strategy.model.training = enabled, enabled

    self.player1 = self.pretrained_player if self.pretrained_player else FCReinforcePlayer(lr=lr)

    # Player 2 has the same start conditions as Player 1 but does not train
    self.player2 = frozen_copy_of_player1()

    games_per_evaluation = self.games // self.evaluations
    self.replacements = []
    start_time = datetime.now()

    for episode in range(1, self.evaluations + 1):
        # train
        set_training_mode(True)
        self.simulation = TicTacToe([self.player1, self.player2])
        results, losses = self.simulation.run_simulations(games_per_evaluation)
        # NOTE(review): the mean loss is recorded twice per episode, here as a "Losses" result
        # and below via add_loss. Kept as is; confirm whether both are intended.
        self.add_results(("Losses", np.mean(losses)))

        # evaluate
        set_training_mode(False)
        _score, results, _overview = evaluate_against_base_players(self.player1)
        self.add_loss(np.mean(losses))
        self.add_results(results)

        games_played = episode * games_per_evaluation
        if not silent and Printer.print_episode(games_played, self.games, datetime.now() - start_time):
            self.plot_and_save(
                "%s vs BEST" % (self.player1),
                "Train %s vs Best version of self\nGames: %s Evaluations: %s\nTime: %s"
                % (self.player1, games_played, self.evaluations, config.time_diff(start_time)))

        # Promote the learner to "best" once it beats the current best player.
        if evaluate_against_each_other(self.player1, self.player2):
            self.player2 = frozen_copy_of_player1()
            self.replacements.append(episode)

    print("Best player replaced after episodes: %s" % self.replacements)
    self.final_score, self.final_results, self.results_overview = evaluate_against_base_players(
        self.player1, silent=False)
    return self
def run(self):
    """
    Launch every experiment that is enabled by the module-level switches.

    VS_TRADITIONAL, VS_BEST and VS_SELF select the kind of opponent; AC, BASELINE and REINFORCE
    select the player type. Each enabled combination builds a player with learning rate LR,
    wraps it in the matching experiment (GAMES games, EVALUATIONS evaluations), announces it,
    runs it and resets it. The BASELINE experiments against the best player and against self
    are run a second time with milestones=True after the reset.

    Actor-critic and REINFORCE self-play experiments are not implemented yet.
    """

    def launch(experiment_class, player_class, rerun_with_milestones=False, **experiment_kwargs):
        # Build player and experiment, announce, run and reset: the sequence shared by all experiments.
        player = player_class(LR)
        experiment = experiment_class(
            games=GAMES, evaluations=EVALUATIONS, pretrained_player=player, **experiment_kwargs)
        print("\n|| ----- Running %s with %s ----- ||" % (experiment, player))
        experiment.run(lr=LR)
        experiment.reset()
        if rerun_with_milestones:
            experiment.run(lr=LR, milestones=True)

    if VS_TRADITIONAL:
        # ACTOR CRITIC
        if AC:
            launch(TrainACPlayerVsTraditionalOpponent, FCACPlayer, opponent=None)
        # BASELINE
        if BASELINE:
            launch(TrainBaselinePlayerVsTraditionalOpponent, FCBaseLinePlayer, opponent=None)
        # REINFORCE
        if REINFORCE:
            launch(TrainReinforcePlayerVsTraditionalOpponent, FCReinforcePlayer, opponent=None)

    if VS_BEST:
        # ACTOR CRITIC
        if AC:
            launch(TrainACPlayerVsBest, FCACPlayer)
        # BASELINE
        if BASELINE:
            launch(TrainBaselinePlayerVsBest, FCBaseLinePlayer, rerun_with_milestones=True)
        # REINFORCE
        if REINFORCE:
            launch(TrainReinforcePlayerVsBest, FCReinforcePlayer)

    if VS_SELF:
        # ACTOR CRITIC: not yet implemented
        # if AC:
        #     launch(TrainACPlayerVsSelf, FCACPlayer)

        # BASELINE
        if BASELINE:
            launch(TrainBaselinePlayerVsSelf, FCBaseLinePlayer, rerun_with_milestones=True)

        # REINFORCE: not yet implemented
def test_ConvReinforcePlayer(self):
    """
    Smoke test: play 100 TicTacToe simulations of an FCReinforcePlayer (lr=1e-4) against a RandomPlayer.

    NOTE(review): despite the test name, the player built here is the fully connected
    FCReinforcePlayer; confirm whether a convolutional player was intended.
    """
    contenders = [FCReinforcePlayer(lr=1e-4), RandomPlayer()]
    TicTacToe(contenders).run_simulations(100)
def test_DummyTrainReinforcePlayer(self):
    """Smoke test: run 10 TicTacToe simulations of an FCReinforcePlayer (lr=0.001) against a RandomPlayer."""
    learner = FCReinforcePlayer(lr=0.001)
    sparring_partner = RandomPlayer()

    match = TicTacToe([learner, sparring_partner])
    match.run_simulations(10)
def test_CreateReinforcementPlayer(self):
    """Constructing an FCReinforcePlayer with lr=0.001 must complete without raising."""
    FCReinforcePlayer(lr=0.001)
def test_evaluation(self):
    """
    Smoke test: evaluate a RandomPlayer and then an FCReinforcePlayer (lr=1e-5) against the base players.

    Both evaluations run with silent=False.
    """
    # Factories keep the original order: each player is only built right before its own evaluation.
    player_factories = (
        lambda: ttt_players.RandomPlayer(),
        lambda: FCReinforcePlayer(lr=1e-5),
    )
    for build_player in player_factories:
        evaluate_against_base_players(build_player(), silent=False)
def run(self, lr, silent=False):
    """
    Train an FCReinforcePlayer to imitate a deterministic ExperiencedPlayer on continuously generated boards.

    Every game starts from an empty board that a RandomPlayer fills with nine moves. Before each
    of these moves both the expert and the learner are asked for a move on the current board;
    the learner earns config.LABEL_WIN when its move equals the expert's and config.LABEL_LOSS
    otherwise. After the nine positions the strategy is updated with the collected rewards.

    Every self.evaluation_period games the learner is also scored on a fixed validation set.
    The validation reward uses the same LABEL_WIN / LABEL_LOSS labels as the training reward.
    Previously it used the colour constants config.BLACK / config.WHITE, which are board colours
    rather than reward labels.

    :param lr: learning rate passed to the FCReinforcePlayer constructor
    :param silent: if True, progress printing and plotting are skipped
    :return: average training reward of the last game played
    """
    EVALUATION_GAMES = 10
    POSITIONS_PER_GAME = 9

    player = FCReinforcePlayer(lr=lr)
    player.color = config.BLACK

    expert = ExperiencedPlayer(deterministic=True, block_mid=True)
    expert.color = config.BLACK

    generator = RandomPlayer()
    # Created once, so the colour sequence continues across games instead of restarting per board.
    color_iterator = self.AlternatingColorIterator()

    validation_set = self.generate_supervised_training_data(
        EVALUATION_GAMES, ExperiencedPlayer(deterministic=True, block_mid=True))

    print("Training ReinforcedPlayer supervised continuously with LR: %s" % lr)
    start = datetime.now()
    for game in range(self.games):
        rewards = []
        board = TicTacToeBoard()

        for _ in range(POSITIONS_PER_GAME):
            expert_move = expert.get_move(board)
            player_move = player.get_move(board)
            rewards.append(config.LABEL_WIN if expert_move == player_move else config.LABEL_LOSS)

            # prepare for next sample
            move = generator.get_move(board)
            board.apply_move(move, next(color_iterator))

        average_reward = sum(rewards) / len(rewards)
        player.strategy.rewards = rewards
        loss = player.strategy.update()

        # `rewards` is the very list held by the strategy, so this also empties strategy.rewards.
        del rewards[:]
        self.add_results([("Losses", loss), ("Reward", average_reward)])

        if game % self.evaluation_period == 0:
            # Evaluation mode for the whole validation pass; training mode is restored afterwards.
            player.strategy.train, player.strategy.model.training = False, False
            try:
                test_rewards = [
                    config.LABEL_WIN if expert_move == player.get_move(board) else config.LABEL_LOSS
                    for board, expert_move in validation_set
                ]
            finally:
                player.strategy.train, player.strategy.model.training = True, True

            average_test_reward = sum(test_rewards) / len(test_rewards)
            self.add_results(("Test reward", average_test_reward))

        if not silent and Printer.print_episode(game + 1, self.games, datetime.now() - start):
            plot_name = "Supervised Continuous training of %s" % (player)
            plot_info = "%s Games - Final reward: %s \nTime: %s" % (
                game + 1, average_reward, config.time_diff(start))
            self.plot_and_save(plot_name, plot_name + "\n" + plot_info)

    return average_reward
class PretrainLegalMoves(TicTacToeBaseExperiment):
    """
    Trains a player on a continuously generated random data set to play only legal moves.

    The data set is generated by a random player. Training terminates either after `max_games`
    games were played or once the player has not made an illegal move in the last
    `termination_criterion` games (see `run`).
    """

    def __init__(self, max_games):
        """
        :param max_games: upper bound on the number of training games played by `run`
        """
        super(PretrainLegalMoves, self).__init__()
        self.max_games = max_games

    def reset(self):
        """
        Re-initialise this experiment with the same `max_games`.

        :return: this experiment instance
        """
        self.__init__(self.max_games)
        return self

    def run(self, lr, termination_criterion, silent=False):
        """
        Pretrain a freshly created FCReinforcePlayer on playing legal moves.

        Training stops early as soon as more than `termination_criterion` games were played and
        the mean of the last `termination_criterion` episode rewards equals 1; otherwise it stops
        after self.max_games games. The player is saved whenever the progress in percent is a
        multiple of 10 and once more right before returning.

        :param lr: learning rate passed to the FCReinforcePlayer constructor
        :param termination_criterion: size of the window of most recent games checked for early termination
        :param silent: if True, no progress is printed and no plots are saved
        :return: tuple (losses, rewards), each holding one entry per played game
        """
        self.player = FCReinforcePlayer(lr=lr)
        self.player.color = config.BLACK

        generator = RandomPlayer()

        # Every checkpoint written by this run carries the same description.
        save_description = "using %s layers pretrained on legal moves for %s games lr: %s" % (
            LAYERS, self.max_games, lr)

        print("Pretraining %s on legal moves" % str(self.player))

        losses, rewards = [], []
        start = datetime.now()
        for game in range(1, self.max_games + 1):
            loss, reward = self.__run_episode__(generator)
            losses.append(loss)
            rewards.append(reward)

            if not silent and Printer.print_episode(game, self.max_games, datetime.now() - start):
                plot_name = "Pretraining %s using %s layers on legal moves\nlr: %s" % (
                    self.player.__class__.__name__, LAYERS, lr)
                plot_info = "%sGames - Final reward: %s \nTime: %s" % (
                    game, reward, config.time_diff(start))
                self.plot_and_save(plot_name, plot_name + "\n" + plot_info)

            # Checkpoint at every 10% of the planned games.
            if (100 * game / self.max_games) % 10 == 0:
                self.save_player(self.player, save_description)

            # The window mean is only computed once enough games were played.
            if game > termination_criterion \
                    and sum(rewards[-termination_criterion:]) / termination_criterion == 1:
                print(
                    "Reached training goal: %s games with only legal moves played -> terminating training."
                    % termination_criterion)
                self.save_player(self.player, save_description)
                return losses, rewards

        print("Reached max training_games (%s) -> terminating training" % self.max_games)
        self.save_player(self.player, save_description)
        return losses, rewards

    def __run_episode__(self, generator):
        """
        Play one board from empty to full and update the player on the legality of its moves.

        On each of the nine positions the player proposes a move, which is rewarded with
        config.LABEL_WIN if it is among the board's valid moves for the player's colour and with
        config.LABEL_LOSS otherwise. The board itself is advanced with moves of `generator`.

        The collected rewards are assigned to the strategy before `update()` is called, as the
        other supervised experiments do. Previously they were only averaged locally and never
        reached the strategy.

        :param generator: player whose moves are applied to the board between samples
        :return: tuple (loss, average_reward) of this episode
        """
        player = self.player
        rewards = []
        color_iterator = self.AlternatingColorIterator()

        board = TicTacToeBoard()
        for _ in range(9):
            player_move = player.get_move(board)

            # Win if predicted move is legal, loss otherwise
            is_legal = player_move in board.get_valid_moves(player.color)
            rewards.append(config.LABEL_WIN if is_legal else config.LABEL_LOSS)

            # prepare for next sample
            board.apply_move(generator.get_move(board), next(color_iterator))

        player.strategy.rewards = rewards
        loss = player.strategy.update()
        # Rebind instead of clearing in place, so the local list stays intact for the average below.
        player.strategy.rewards = []

        average_reward = np.mean(rewards)
        self.add_results([("Losses", loss), ("Score", average_reward)])

        return loss, average_reward
def run(self, lr, silent=False):
    """
    Train an FCReinforcePlayer supervised on a fixed set of expert moves for self.episodes episodes.

    A training set of self.games games and a test set of TEST_GAMES games are generated once from
    a deterministic ExperiencedPlayer. In every episode the learner proposes a move for each
    training board, is rewarded with config.LABEL_WIN when it matches the expert move and
    config.LABEL_LOSS otherwise, and its strategy is updated once with all rewards. It is then
    scored on the test set in evaluation mode.

    The test reward uses the same LABEL_WIN / LABEL_LOSS labels as the training reward.
    Previously it used the colour constants config.BLACK / config.WHITE, which are board colours
    rather than reward labels.

    :param lr: learning rate passed to the FCReinforcePlayer constructor
    :param silent: if True, progress printing and plotting are skipped
    :return: tuple (average_reward, average_test_reward) of the last episode
    """
    print("Training PGStrategy supervised on %s games for %s Episodes - LR: %s"
          % (self.games, self.episodes, lr))
    TEST_GAMES = 1

    player = FCReinforcePlayer(lr=lr)
    player.color = config.BLACK

    expert = ExperiencedPlayer(deterministic=True, block_mid=True)
    expert.color = config.BLACK

    training_set = self.generate_supervised_training_data(self.games, expert)
    test_set = self.generate_supervised_training_data(TEST_GAMES, expert)

    def score_moves(samples):
        # One reward per (board, expert_move) sample: did the learner pick the expert's move?
        return [
            config.LABEL_WIN if expert_move == player.get_move(board) else config.LABEL_LOSS
            for board, expert_move in samples
        ]

    start = datetime.now()
    for episode in range(self.episodes):
        # Training mode
        player.strategy.train, player.strategy.model.training = True, True
        rewards = score_moves(training_set)
        average_reward = sum(rewards) / len(rewards)
        player.strategy.rewards = rewards
        loss = player.strategy.update()

        # Evaluation mode
        player.strategy.train, player.strategy.model.training = False, False
        test_rewards = score_moves(test_set)
        average_test_reward = sum(test_rewards) / len(test_rewards)

        self.add_results([("Losses", loss),
                          ("Average reward", average_reward),
                          ("Average test reward", average_test_reward)])

        if not silent and Printer.print_episode(episode + 1, self.episodes, datetime.now() - start):
            plot_name = "Supervised on %s games lr: %s" % (self.games, lr)
            plot_info = "Lr: %s - %s Games - %s Episodes\nFinal Scores: %s / %s \nTime: %s" % (
                lr, self.games, episode + 1,
                '{:.2f}'.format(average_reward), '{:.2f}'.format(average_test_reward),
                config.time_diff(start))
            self.plot_and_save(plot_name, plot_name + "\n" + plot_info)

    return average_reward, average_test_reward