def play_round(self, num_reads: int) -> Tuple[Optional[str], List[np.ndarray]]:
    """
    Play one evaluation game between the current and the best network.

    Which network moves as white is drawn at random. Before every move the
    encoded position is recorded; once the game is over the final encoded
    position and a short textual result are appended to the record.
    @param num_reads: passed to UCT_search for every move, see args
    @return: tuple of the winner's label ("current" or "best", None when
             nobody wins) and the recorded game data
    """
    print("Starting game round...")

    # a coin flip decides which network gets the white pieces
    if np.random.uniform(0, 1) <= 0.5:
        white, black = self.current, self.best
        w, b = "current", "best"
    else:
        white, black = self.best, self.current
        w, b = "best", "current"

    board = Board()
    history = []
    outcome = 0  # 1 -> white won, -1 -> black won, 0 -> no winner
    temperature = 0.1  # exploration vs exploitation factor (smaller -> more exploitation)
    columns = np.array([0, 1, 2, 3, 4, 5, 6])

    while board.is_playable():
        history.append(copy.deepcopy(board.encode()))

        # pick the network (and its log label) belonging to the side to move
        side_to_move = board.player
        if side_to_move == PLAYER_1:
            network, label = white, "white = %s" % (str(w))
        elif side_to_move == PLAYER_2:
            network, label = black, "black = %s" % (str(b))
        else:
            raise AssertionError("Invalid player.")

        root = UCT_search(board, num_reads, network)
        policy = get_policy(root, temperature)
        print("Policy: ", policy, label)

        # sample a column from the policy and play it
        col_choice = np.random.choice(columns, p=policy)
        board.drop_piece(col_choice)
        print(board)

        if board.check_winner():
            # the side now on move is the one that did NOT play the winning move
            if board.player == PLAYER_1:  # black wins
                outcome = -1
            elif board.player == PLAYER_2:  # white wins
                outcome = 1
            break

    # record the final position as well
    history.append(board.encode())

    if outcome == -1:
        history.append(f"{b} as black wins")
        return b, history
    if outcome == 1:
        history.append(f"{w} as white wins")
        return w, history
    history.append("Nobody wins")
    return None, history
def self_play(net: Connect4Network, start_index: int, cpu_index: int, num_games: int,
              args: AlphaZeroArgs, iteration: int) -> None:
    """
    Self Play of AlphaZero, generating and saving Datasets for the training of the Neural Network

    Each game is played by `net` against itself. For every move the encoded
    position and the policy it was sampled from are recorded; once the game
    ends each sample is labelled with the game value (1 = white won,
    -1 = black won, 0 = no winner; the first sample is always labelled 0)
    and the labelled game is handed to util.pickle_save.
    @param net: network passed to UCT_search for every move
    @param start_index: Start index of Self Play games
    @param cpu_index: index of the worker, used in log output and file names
    @param num_games: number of games to play
    @param args: provides temperature_mcts and num_reads_mcts
    @param iteration: current Iteration
    """
    # number of more random moves, before lowering temp
    n_max_moves = 11
    print(f"CPU={cpu_index}: Starting MCTS")

    # exist_ok avoids the check-then-create race when several workers start together
    iteration_dir = f"./datasets/iter_{iteration}"
    os.makedirs(iteration_dir, exist_ok=True)

    columns = np.array([0, 1, 2, 3, 4, 5, 6])

    # Play self play games
    for idx in range(start_index, num_games + start_index):
        print(f"Game {idx}")
        current_board = Board()
        game_won = False  # indicates that a game is won
        dataset = []
        value = 0
        move_count = 0

        while not game_won and current_board.is_playable():
            # less random further into the game
            t = args.temperature_mcts if move_count < n_max_moves else 0.1

            # save the encoded board state before the move is played
            board_state = current_board.encode().copy()

            root = UCT_search(current_board, args.num_reads_mcts, net)
            policy = get_policy(root, t)
            print(f"Game {idx} policy: {policy}")

            col_choice = np.random.choice(columns, p=policy)
            current_board.drop_piece(col_choice)  # move piece
            dataset.append([board_state, policy])
            print(f"[Iteration: {iteration}]: Game {idx} CURRENT BOARD:\n", current_board)
            move_count += 1

            if current_board.check_winner():  # if somebody won
                if current_board.player == PLAYER_1:  # black wins
                    print("Black wins")
                    value = -1
                elif current_board.player == PLAYER_2:  # white wins
                    print("White wins")
                    value = 1
                game_won = True

        # Label every sample with the game value; the first sample always gets 0.
        # A separate index name is used so the game index `idx` is not overwritten
        # before it is used in the file name below.
        dataset_p = [
            [s, p, 0 if move_idx == 0 else value]
            for move_idx, (s, p) in enumerate(dataset)
        ]

        # Save the dataset
        time_string = datetime.datetime.today().strftime("%Y-%m-%d")
        pickle_file = f"iter_{iteration}/dataset_iter{iteration}_cpu{cpu_index}_{idx}_{time_string}"
        util.pickle_save(pickle_file, dataset_p)