def __run_episode__(self, generator):
    player = self.player
    rewards = []
    color_iterator = self.AlternatingColorIterator()

    board = TicTacToeBoard()
    for i in range(9):
        player_move = player.get_move(board)
        # Win if predicted move is legal, loss otherwise
        reward = config.LABEL_WIN if player_move in board.get_valid_moves(player.color) else config.LABEL_LOSS
        rewards.append(reward)

        # prepare for next sample
        board.apply_move(generator.get_move(board), color_iterator.__next__())

    loss = player.strategy.update()
    player.strategy.rewards = []

    average_reward = np.mean(rewards)
    del rewards[:]

    self.add_results([("Losses", loss), ("Score", average_reward)])
    return loss, average_reward
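# Usage sketch (assumption: __run_episode__ belongs to an experiment class that
# provides self.player, self.AlternatingColorIterator and add_results; the loop
# below is illustrative only, and `num_episodes` is a placeholder). The reward
# scheme above labels every legal prediction LABEL_WIN and every illegal one
# LABEL_LOSS, so the reported score is just the mean of these labels over the
# nine predictions of an episode.
#
#     generator = RandomPlayer()
#     for episode in range(num_episodes):
#         loss, score = self.__run_episode__(generator)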
def generate_supervised_training_data(cls, games, labeling_strategy):
    """
    Generates training data by applying random moves to a board and labeling each sample with the move
    that :param labeling_strategy would have taken given the board.

    :param games: The number of games to be simulated
    :param labeling_strategy: The strategy used to label each sample. The label equals labeling_strategy.get_move(board)
    :return: a list of tuples(board_sample, move_label)
    """
    labeling_strategy.color = cls.config.BLACK

    generator = RandomPlayer()
    color_iterator = TicTacToeBaseExperiment.AlternatingColorIterator()

    start = datetime.now()
    training_set = []
    for game in range(games):
        board = TicTacToeBoard()
        for i in range(9):
            # generate training pair
            expert_move = labeling_strategy.get_move(board)
            training_set.append((board.copy(), expert_move))

            # prepare for next sample
            move = generator.get_move(board)
            board.apply_move(move, color_iterator.__next__())

    print("Generated %s training pairs from %s games in %s" % (len(training_set), games, datetime.now() - start))
    return training_set
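# Usage sketch (assumption: generate_supervised_training_data is exposed as a
# classmethod on TicTacToeBaseExperiment, as its body suggests; import paths are
# not shown in this section and would need to match the actual package layout).
#
#     expert = ExperiencedPlayer(deterministic=True, block_mid=True)
#     training_set = TicTacToeBaseExperiment.generate_supervised_training_data(
#         games=100, labeling_strategy=expert)
#     # each element is a (board_copy, expert_move) pair
#     sample_board, sample_label = training_set[0]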
def test_Board_ApplyValidMoves(self):
    board = TicTacToeBoard()
    if config.BOARD_SIZE == 3:
        self.assertEqual(set(board.get_valid_moves(config.BLACK)),
                         set([(0, 0), (0, 1), (0, 2), (1, 0), (1, 1), (1, 2), (2, 0), (2, 1), (2, 2)]),
                         msg="Valid moves incorrect")
        self.assertEqual(set(board.get_valid_moves(config.WHITE)),
                         set([(0, 0), (0, 1), (0, 2), (1, 0), (1, 1), (1, 2), (2, 0), (2, 1), (2, 2)]),
                         msg="Valid moves incorrect")
    else:
        self.assertEqual(config.BOARD_SIZE**2, len(board.get_valid_moves(config.BLACK)),
                         msg="Incorrect Number of valid moves")

    board.apply_move((1, 1), config.BLACK)
    if config.BOARD_SIZE == 3:
        self.assertEqual(set(board.get_valid_moves(config.BLACK)),
                         set([(0, 0), (0, 1), (0, 2), (1, 0), (1, 2), (2, 0), (2, 1), (2, 2)]),
                         msg="Valid moves incorrect")
        self.assertEqual(set(board.get_valid_moves(config.WHITE)),
                         set([(0, 0), (0, 1), (0, 2), (1, 0), (1, 2), (2, 0), (2, 1), (2, 2)]),
                         msg="Valid moves incorrect")
def test_Board_CountStones(self):
    board = TicTacToeBoard()
    board.apply_move((0, 0), config.BLACK)
    board.apply_move((1, 1), config.WHITE)
    board.apply_move((2, 2), config.BLACK)
    board.apply_move((1, 2), config.WHITE)
    board.apply_move((1, 0), config.BLACK)
    self.assertEqual((3, 2), board.count_stones())
def test_DummyUpdate(self):
    board = TicTacToeBoard()
    value_function = PGStrategy(lr=0.001, weight_decay=0.003)
    value_function.evaluate(board.board, board.get_legal_moves_map(config.BLACK))

    move = RandomPlayer.get_move(board)
    board.apply_move(move, config.BLACK)
    value_function.evaluate(board.board, board.get_legal_moves_map(config.BLACK))

    move = RandomPlayer.get_move(board)
    board.apply_move(move, config.WHITE)
    value_function.evaluate(board.board, board.get_legal_moves_map(config.BLACK))
def test_Board_ApplyIllegalMove(self):
    board = TicTacToeBoard()
    board.apply_move((1, 1), config.BLACK)
    self.assertEqual(board.illegal_move, None)

    board.apply_move((1, 1), config.BLACK)
    self.assertEqual(board.illegal_move, config.BLACK)
class TicTacToe(TwoPlayerGame):

    def __init__(self, players):
        super(TicTacToe, self).__init__(players=players, config=config)
        self.player1.color = config.BLACK
        self.player2.color = config.WHITE
        for player in players:
            player.original_color = player.color

    def __run__(self, player1, player2):
        """
        Runs an episode of the game.

        :param player1:
        :param player2:
        :return: The original color of the winning player
        """
        self.board = TicTacToeBoard()
        players = player1, player2
        while True:
            move = players[0].get_move(self.board.copy())
            self.board.apply_move(move, players[0].color)

            winner = self.board.game_won()
            if winner is not None:
                return config.get_label_from_winner_color(player1, player2, winner)

            players = list(reversed(players))

    def run_simulations(self, episodes, switch_colors=True, switch_players=True):
        """
        Runs a number of games using the given players and returns statistics over all games run.
        If both :param switch_colors and :param switch_players are set, all four possible starting
        positions will be iterated through.

        :param episodes: The number of games to run
        :param switch_colors: Flag specifying whether to alternate the players' colors during play
        :param switch_players: Flag specifying whether to alternate the starting player
        :return: The results and average losses per episode where results is a list of the original
                 colors of the winning player ([original_winning_color])
        """
        simulation_players = [self.player1, self.player2]
        results = []
        losses = []
        for episode in range(episodes):
            if switch_colors and episode != 0 and episode % 2 == 0:
                simulation_players[0].color, simulation_players[1].color = \
                    simulation_players[1].color, simulation_players[0].color
            # Switch the starting player on odd episodes. The original condition
            # `episode + 1 % 2` binds as `episode + (1 % 2)` and is therefore always
            # truthy for episode != 0; the parentheses below restore the alternation
            # the docstring describes (all four starting positions are covered).
            if switch_players and episode != 0 and (episode + 1) % 2 == 0:
                simulation_players = list(reversed(simulation_players))

            winner = self.__run__(simulation_players[0], simulation_players[1])

            player_losses = []
            for player in simulation_players:
                loss = player.register_winner(winner)
                if loss is not None:
                    player_losses.append(loss)

            losses += player_losses
            results.append(winner)

        for player in simulation_players:
            player.color = player.original_color

        return results, losses
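# Usage sketch (assumption: ExperiencedPlayer and RandomPlayer, as used elsewhere
# in this section, are valid player implementations for TicTacToe; the winner
# labels are whatever config.get_label_from_winner_color returns).
#
#     game = TicTacToe([ExperiencedPlayer(deterministic=True, block_mid=True), RandomPlayer()])
#     results, losses = game.run_simulations(episodes=100)
#     # With switch_colors and switch_players left at True, the four possible
#     # starting configurations are cycled through every four episodes.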
def test_DummyForwardPass(self):
    board = TicTacToeBoard()
    value_function = PGStrategy(lr=0.001, weight_decay=0.003)
    value_function.evaluate(board.board, board.get_legal_moves_map(config.BLACK))
def test_Board_Representation(self):
    random_player = ttt_players.RandomPlayer()
    boards = []
    inverses = []
    for i in range(100):
        board = TicTacToeBoard()
        inverse_board = TicTacToeBoard()
        for j in range(9):
            move = random_player.get_move(board)
            color = random.choice((config.BLACK, config.WHITE))

            board.apply_move(move, color)
            boards.append(board.copy())

            inverse_board.apply_move(move, board.other_color(color))
            inverses.append(inverse_board.copy())

    for i in range(len(boards)):
        rep = boards[i].get_representation(config.WHITE)
        self.assertTrue((rep == inverses[i].board).all(), msg="Inverting board failed")
def test_Board_GameWon(self):
    board = TicTacToeBoard()
    self.assertFalse(board.game_won(), msg="Empty Board")

    board.apply_move((0, 0), config.BLACK)
    board.apply_move((1, 1), config.WHITE)
    board.apply_move((1, 0), config.BLACK)
    board.apply_move((2, 2), config.WHITE)
    self.assertFalse(board.game_won(), msg="No Winner yet")

    board.apply_move((2, 0), config.BLACK)
    self.assertEqual(board.game_won(), config.BLACK, msg="Black Won")
def test_getAfterstates(self):
    board = TicTacToeBoard()
    self.assertEqual([(0, 0), (0, 1), (0, 2), (1, 0), (1, 1), (1, 2), (2, 0), (2, 1), (2, 2)],
                     [a[1] for a in board.get_afterstates(config.BLACK)])
    self.assertEqual([(0, 0), (0, 1), (0, 2), (1, 0), (1, 1), (1, 2), (2, 0), (2, 1), (2, 2)],
                     [a[1] for a in board.get_afterstates(config.WHITE)])

    board.apply_move((2, 2), config.BLACK)
    self.assertEqual([(0, 0), (0, 1), (0, 2), (1, 0), (1, 1), (1, 2), (2, 0), (2, 1)],
                     [a[1] for a in board.get_afterstates(config.BLACK)])
    self.assertEqual([(0, 0), (0, 1), (0, 2), (1, 0), (1, 1), (1, 2), (2, 0), (2, 1)],
                     [a[1] for a in board.get_afterstates(config.WHITE)])

    board.apply_move((2, 1), config.WHITE)
    self.assertEqual([(0, 0), (0, 1), (0, 2), (1, 0), (1, 1), (1, 2), (2, 0)],
                     [a[1] for a in board.get_afterstates(config.BLACK)])
    self.assertEqual([(0, 0), (0, 1), (0, 2), (1, 0), (1, 1), (1, 2), (2, 0)],
                     [a[1] for a in board.get_afterstates(config.WHITE)])

    board.apply_move((1, 1), config.BLACK)
    self.assertEqual([(0, 0), (0, 1), (0, 2), (1, 0), (1, 2), (2, 0)],
                     [a[1] for a in board.get_afterstates(config.BLACK)])
    self.assertEqual([(0, 0), (0, 1), (0, 2), (1, 0), (1, 2), (2, 0)],
                     [a[1] for a in board.get_afterstates(config.WHITE)])
def run(self, lr, silent=False):
    EVALUATION_GAMES = 10

    player = FCReinforcePlayer(lr=lr)
    player.color = config.BLACK

    expert = ExperiencedPlayer(deterministic=True, block_mid=True)
    expert.color = config.BLACK

    generator = RandomPlayer()
    color_iterator = self.AlternatingColorIterator()

    validation_set = self.generate_supervised_training_data(
        EVALUATION_GAMES, ExperiencedPlayer(deterministic=True, block_mid=True))

    print("Training ReinforcedPlayer supervised continuously with LR: %s" % lr)
    start = datetime.now()
    for game in range(self.games):
        rewards = []
        board = TicTacToeBoard()
        for i in range(9):
            expert_move = expert.get_move(board)
            player_move = player.get_move(board)

            reward = config.LABEL_WIN if expert_move == player_move else config.LABEL_LOSS
            rewards.append(reward)

            # prepare for next sample
            move = generator.get_move(board)
            board.apply_move(move, color_iterator.__next__())

        average_reward = sum(rewards) / len(rewards)
        player.strategy.rewards = rewards
        loss = player.strategy.update()
        del rewards[:]

        self.add_results([("Losses", loss), ("Reward", average_reward)])

        if game % self.evaluation_period == 0:
            test_rewards = []
            for board, expert_move in validation_set:
                # Evaluation mode
                player.strategy.train, player.strategy.model.training = False, False
                strategy_move = player.get_move(board)
                player.strategy.train, player.strategy.model.training = True, True

                test_reward = config.BLACK if expert_move == strategy_move else config.WHITE
                test_rewards.append(test_reward)

            average_test_reward = sum(test_rewards) / len(test_rewards)
            del test_rewards[:]
            self.add_results(("Test reward", average_test_reward))

        if not silent:
            if Printer.print_episode(game + 1, self.games, datetime.now() - start):
                plot_name = "Supervised Continuous training of %s" % player
                plot_info = "%s Games - Final reward: %s \nTime: %s" % (
                    game + 1, average_reward, config.time_diff(start))
                self.plot_and_save(plot_name, plot_name + "\n" + plot_info)

    return average_reward
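# Usage sketch (assumption: run() belongs to an experiment class, named
# SupervisedContinuousExperiment here purely as a placeholder, that supplies
# self.games, self.evaluation_period, add_results and plot_and_save; the
# constructor arguments below are illustrative, not the actual signature).
#
#     experiment = SupervisedContinuousExperiment(games=10000, evaluation_period=100)
#     final_reward = experiment.run(lr=1e-4)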