import copy

import numpy as np
import torch
import torch_geometric

# boardToData, agent, apprentice and board are provided by the surrounding
# project code.


def play_episode(root, max_depth, apprentice, move_type="all", verbose=False):
    episode = []
    state = copy.deepcopy(root)
    edge_index = boardToData(root).edge_index

    # ******************* PLAY EPISODE ***************************
    for i in range(max_depth):
        # print_message_over(f"Playing episode: {i}/{max_depth}")

        # Check if episode is over
        if state.gameOver:
            break

        # Check if the current player is alive or not
        if not state.activePlayer.is_alive:
            # print("\npassing, dead player")
            state.endTurn()
            continue

        # Get possible moves, and apprentice policy
        mask, actions = agent.maskAndMoves(state, state.gamePhase, edge_index)
        try:
            policy, value = apprentice.getPolicy(state)
        except Exception as e:
            state.report()
            print(state.activePlayer.is_alive)
            print(state.activePlayer.num_countries)
            raise e

        # Convert a tensor mask to NumPy so it broadcasts with the policy
        if isinstance(mask, torch.Tensor):
            mask = mask.detach().numpy()

        # Mask out illegal actions and renormalize into a distribution
        probs = policy * mask
        probs = probs.flatten()
        probs = probs / probs.sum()

        # Random selection? e-greedy?
        ind = np.random.choice(range(len(actions)), p=probs)
        move = agent.buildMove(state, actions[ind])

        # Only store states matching the requested phase ("all" keeps every one)
        saved = (move_type == "all" or move_type == state.gamePhase)
        if verbose:
            print(f"\t\tPlay episode: turn {i}, move = {move}, saved = {saved}")

        if saved:
            episode.append(copy.deepcopy(state))

        # Play the move to continue
        state.playMove(move)

    return episode
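#%%%
# Hedged sketch, not part of the original script: the mask-and-renormalize
# step above divides by probs.sum(), which yields NaNs if every legal action
# received zero policy mass. A defensive variant could fall back to a uniform
# distribution over the legal moves; `masked_probs` is a name introduced here
# for illustration only, assuming NumPy inputs and at least one legal action.
def masked_probs(policy, mask):
    p = (policy * mask).flatten()
    s = p.sum()
    if s <= 0:
        # No mass on legal actions: fall back to uniform over the legal ones
        p = mask.flatten().astype(float)
        s = p.sum()
    return p / s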
# Simpler variant of play_episode: it stores every visited state and queries
# the apprentice through apprentice.play instead of apprentice.getPolicy.
def play_episode(root, max_depth, apprentice):
    episode = []
    state = copy.deepcopy(root)
    edge_index = boardToData(root).edge_index

    # ******************* PLAY EPISODE ***************************
    for i in range(max_depth):
        # print_message_over(f"Playing episode: {i}/{max_depth}")

        # Check if episode is over
        if state.gameOver:
            break

        # Check if the current player is alive or not
        if not state.activePlayer.is_alive:
            # print("\npassing, dead player")
            state.endTurn()
            continue

        # Get possible moves, and apprentice policy
        mask, actions = maskAndMoves(state, state.gamePhase, edge_index)
        try:
            policy, value = apprentice.play(state)
        except Exception as e:
            state.report()
            print(state.activePlayer.is_alive)
            print(state.activePlayer.num_countries)
            raise e

        # Mask out illegal actions and renormalize into a distribution
        policy = policy * mask
        probs = policy.squeeze().detach().numpy()
        probs = probs / probs.sum()

        # Random selection? e-greedy?
        ind = np.random.choice(range(len(actions)), p=probs)
        move = buildMove(state, actions[ind])

        episode.append(copy.deepcopy(state))

        # Play the move to continue
        state.playMove(move)

    return episode
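#%%%
# Hedged usage sketch, not part of the original script: one way either
# play_episode variant might be driven. It assumes `board` and `apprentice`
# are already constructed, as in the cell below; the depth of 50 is an
# illustrative value, not one taken from this repo.
episode = play_episode(board, 50, apprentice)
print(f"Collected {len(episode)} states from a single self-play episode")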
#%%%
board.play()
while not board.gameOver and board.gamePhase != "attack":
    board.play()
board.report()
print(board.countriesPandas())
print("\n")

# Get policy for board
canon, _ = board.toCanonical(board.activePlayer.code)
batch = torch_geometric.data.Batch.from_data_list([boardToData(canon)])
mask, moves = agent.maskAndMoves(canon, canon.gamePhase, batch.edge_index)
policy, value = apprentice.getPolicy(canon)
pop = policy.squeeze()

# Temperature-scaled softmax over the raw policy
T = 1
exp = np.exp(np.log(np.maximum(pop, 0.000001)) / T)
soft = exp / exp.sum()

# Print every legal move with its raw and temperature-scaled probability
co = board.countries()
for m, a, p, s in zip(mask.squeeze(), moves, pop, soft):
    if m.item():
        if len(a) > 2:
            print(f"{a[0]}: {co[a[1]]['id']} -> {co[a[2]]['id']} - {p:.3f} - {s:.3f}")
        else:
            print(f"{a[0]}: {co[a[1]]['id']} - {p:.3f} - {s:.3f}")
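#%%%
# Hedged sketch, not part of the original script: the temperature scaling
# above, factored into a helper for reuse. `temperature_softmax` is a name
# introduced here for illustration; it assumes a NumPy array input (call
# .detach().numpy() on a tensor first). It computes p**(1/T) renormalized:
# T = 1 reproduces `soft`, T < 1 sharpens the distribution toward the argmax,
# and T > 1 flattens it toward uniform.
def temperature_softmax(p, T=1.0, eps=1e-6):
    logits = np.log(np.maximum(p, eps)) / T
    e = np.exp(logits - logits.max())  # subtract the max for numerical stability
    return e / e.sum()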