def run(self, lr, silent=False):
    """Train an FCReinforcePlayer continuously against an expert's choices.

    Each game advances a board through 9 plies of random play; at every
    position the player is rewarded iff its move matches the expert's move
    for that same position.  Every ``self.evaluation_period`` games the
    player is scored on a fixed supervised validation set.

    :param lr: learning rate for the FCReinforcePlayer being trained
    :param silent: when True, suppress progress printing and plotting
    :return: average training reward of the final game
    """
    EVALUATION_GAMES = 10

    player = FCReinforcePlayer(lr=lr)
    player.color = config.BLACK

    expert = ExperiencedPlayer(deterministic=True, block_mid=True)
    expert.color = config.BLACK

    generator = RandomPlayer()
    color_iterator = self.AlternatingColorIterator()

    validation_set = self.generate_supervised_training_data(
        EVALUATION_GAMES, ExperiencedPlayer(deterministic=True, block_mid=True))

    print("Training ReinforcedPlayer supervised continuously with LR: %s" % lr)
    start = datetime.now()
    for game in range(self.games):
        rewards = []
        board = TicTacToeBoard()

        # One full game = 9 plies; the loop index itself is unused.
        for _ in range(9):
            expert_move = expert.get_move(board)
            player_move = player.get_move(board)

            reward = config.LABEL_WIN if expert_move == player_move else config.LABEL_LOSS
            rewards.append(reward)

            # prepare for next sample: advance the board with a random move,
            # alternating the moving color each ply
            move = generator.get_move(board)
            # idiom fix: use the next() builtin rather than calling __next__()
            board.apply_move(move, next(color_iterator))

        average_reward = sum(rewards) / len(rewards)

        player.strategy.rewards = rewards
        loss = player.strategy.update()
        # NOTE(review): this clears the very list object just assigned to
        # player.strategy.rewards (aliasing) — presumably intentional to
        # reset strategy state after update(); confirm against the strategy
        # implementation before changing.
        del rewards[:]

        self.add_results([("Losses", loss), ("Reward", average_reward)])

        if game % self.evaluation_period == 0:
            test_rewards = []
            for board, expert_move in validation_set:
                # Evaluation mode: disable training flags only around the
                # validation forward pass, then restore them.
                player.strategy.train, player.strategy.model.training = False, False
                strategy_move = player.get_move(board)
                player.strategy.train, player.strategy.model.training = True, True

                test_reward = config.BLACK if expert_move == strategy_move else config.WHITE
                test_rewards.append(test_reward)
            average_test_reward = sum(test_rewards) / len(test_rewards)
            del test_rewards[:]
            self.add_results(("Test reward", average_test_reward))

        if not silent:
            if Printer.print_episode(game + 1, self.games, datetime.now() - start):
                plot_name = "Supervised Continuous training of %s" % (player)
                plot_info = "%s Games - Final reward: %s \nTime: %s" % (
                    game + 1, average_reward, config.time_diff(start))
                self.plot_and_save(plot_name, plot_name + "\n" + plot_info)
    return average_reward
def run(self, lr, silent=False):
    """Supervised-train a PGStrategy on a fixed collection of expert games.

    A training set and a small test set are generated once from an
    experienced player; the policy is then fitted on the training positions
    for ``self.episodes`` passes, measuring agreement with the expert on
    both sets after each pass.

    :param lr: learning rate of the trained player
    :param silent: when True, skip progress output and plotting
    :return: tuple of (average_reward, average_test_reward) from the last episode
    """
    print(
        "Training PGStrategy supervised on %s games for %s Episodes - LR: %s"
        % (self.games, self.episodes, lr))

    TEST_GAMES = 1

    player = FCReinforcePlayer(lr=lr)
    player.color = config.BLACK
    expert = ExperiencedPlayer(deterministic=True, block_mid=True)
    expert.color = config.BLACK

    training_set = self.generate_supervised_training_data(self.games, expert)
    test_set = self.generate_supervised_training_data(TEST_GAMES, expert)

    start = datetime.now()
    for episode in range(self.episodes):
        train_scores = []
        eval_scores = []

        for position, teacher_move in training_set:
            # Training mode
            player.strategy.train, player.strategy.model.training = True, True
            chosen = player.get_move(position)
            train_scores.append(
                config.LABEL_WIN if teacher_move == chosen else config.LABEL_LOSS)

        average_reward = sum(train_scores) / len(train_scores)
        player.strategy.rewards = train_scores
        loss = player.strategy.update()

        for position, teacher_move in test_set:
            # Evaluation mode
            player.strategy.train, player.strategy.model.training = False, False
            chosen = player.get_move(position)
            eval_scores.append(
                config.BLACK if teacher_move == chosen else config.WHITE)

        average_test_reward = sum(eval_scores) / len(eval_scores)

        self.add_results([("Losses", loss),
                          ("Average reward", average_reward),
                          ("Average test reward", average_test_reward)])

        if not silent and Printer.print_episode(episode + 1, self.episodes,
                                                datetime.now() - start):
            plot_name = "Supervised on %s games lr: %s" % (self.games, lr)
            plot_info = "Lr: %s - %s Games - %s Episodes\nFinal Scores: %s / %s \nTime: %s" % (
                lr, self.games, episode + 1,
                '{:.2f}'.format(average_reward),
                '{:.2f}'.format(average_test_reward),
                config.time_diff(start))
            self.plot_and_save(plot_name, plot_name + "\n" + plot_info)

    return average_reward, average_test_reward