def play(self, state):
    canon, map_to_orig = state.toCanonical(state.activePlayer.code)
    batch = torch_geometric.data.Batch.from_data_list([boardToData(canon)])
    mask, moves = maskAndMoves(canon, canon.gamePhase, batch.edge_index)
    if self.apprentice is not None:
        _, _, _, players, misc = canon.toDicts()
        global_x = buildGlobalFeature(players, misc).unsqueeze(0)
        pick, place, attack, fortify, value = self.apprentice.forward(batch, global_x)
        # Pick the policy head that corresponds to the current game phase
        if canon.gamePhase == 'initialPick':
            policy = pick
        elif canon.gamePhase in ['initialFortify', 'startTurn']:
            policy = place
        elif canon.gamePhase == 'attack':
            policy = attack
        elif canon.gamePhase == 'fortify':
            policy = fortify
    else:
        # No apprentice: uniform policy and a zero value (value would otherwise be undefined below)
        policy = torch.ones_like(mask) / max(mask.shape)
        value = torch.zeros((1, 6))
    policy = policy * mask
    value = value.squeeze()
    # Map the value vector back to the original player order
    cor_value = torch.FloatTensor([
        value[map_to_orig.get(i)] if map_to_orig.get(i) is not None else 0.0
        for i in range(6)
    ])
    return policy, cor_value
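# Illustrative sketch (not part of the project): how play() re-indexes the net's value
# vector through map_to_orig. The mapping and values below are made up for the example.
import torch

map_to_orig = {0: 2, 1: 0, 2: 1}                 # hypothetical index mapping
value = [0.5, 0.3, 0.2, 0.0, 0.0, 0.0]           # hypothetical net output, one entry per seat
cor_value = torch.FloatTensor([
    value[map_to_orig[i]] if i in map_to_orig else 0.0
    for i in range(6)
])
# cor_value -> tensor([0.2000, 0.5000, 0.3000, 0.0000, 0.0000, 0.0000])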
def play_episode(root, max_depth, apprentice, move_type="all", verbose=False):
    episode = []
    state = copy.deepcopy(root)
    edge_index = boardToData(root).edge_index

    # ******************* PLAY EPISODE ***************************
    for i in range(max_depth):
        # Stop if the game is over
        if state.gameOver:
            break

        # Skip dead players by ending their turn
        if not state.activePlayer.is_alive:
            state.endTurn()
            continue

        # Get possible moves and the apprentice policy
        mask, actions = agent.maskAndMoves(state, state.gamePhase, edge_index)
        try:
            policy, value = apprentice.getPolicy(state)
        except Exception as e:
            state.report()
            print(state.activePlayer.is_alive)
            print(state.activePlayer.num_countries)
            raise e

        if isinstance(mask, torch.Tensor):
            mask = mask.detach().numpy()
        probs = (policy * mask).flatten()
        probs = probs / probs.sum()

        # Sample a move proportionally to the masked policy
        # (random selection for now; an e-greedy criterion could be used instead)
        ind = np.random.choice(range(len(actions)), p=probs)
        move = agent.buildMove(state, actions[ind])

        saved = (move_type == "all" or move_type == state.gamePhase)
        if verbose:
            print(f"\t\tPlay episode: turn {i}, move = {move}, saved = {saved}")
        if saved:
            episode.append(copy.deepcopy(state))

        # Play the move to continue
        state.playMove(move)

    return episode
def create_self_play_data(move_type, path, root, apprentice, max_depth=100,
                          saved_states_per_episode=1, verbose=False):
    """ Create episodes from self-play.

        Visited states are saved and later re-visited with the expert to label the data.
        To do this in parallel, the multiprocessing library is used: the selected states
        are fed into a queue, and worker processes tag each state with the expert move.
    """
    # ******************* PLAY EPISODE ***************************
    episode = play_episode(root, max_depth, apprentice)

    # ******************* SELECT STATES ***************************
    # Take some states from the episode, preferring the requested game phase
    options = [s for s in episode if s.gamePhase == move_type]
    if not options:
        # TODO: What to do in this case? For now just take some random states
        # to avoid wasting the episode
        options = episode
    states_to_save = np.random.choice(options, min(saved_states_per_episode, len(options)))
    return states_to_save
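# Minimal sketch of the parallel labelling idea mentioned in the docstring above, assuming
# an `expert` object exposing getActionProb() (as used in create_self_play_data below) and
# that states and the expert are picklable. Names and worker count are illustrative, not
# the project's actual parallel implementation.
import multiprocessing as mp

def _label_worker(state_queue, result_queue, expert):
    # Pull states off the queue and tag each one with the expert policy and value
    while True:
        state = state_queue.get()
        if state is None:          # sentinel: no more work
            break
        policy_exp, value_exp, _ = expert.getActionProb(state, temp=1, num_sims=None, use_val=False)
        result_queue.put((state, policy_exp, value_exp))

def label_states_parallel(states, expert, num_workers=4):
    state_queue, result_queue = mp.Queue(), mp.Queue()
    workers = [mp.Process(target=_label_worker, args=(state_queue, result_queue, expert))
               for _ in range(num_workers)]
    for w in workers:
        w.start()
    for s in states:
        state_queue.put(s)
    for _ in workers:
        state_queue.put(None)      # one sentinel per worker
    labelled = [result_queue.get() for _ in states]
    for w in workers:
        w.join()
    return labelled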
def play_episode(root, max_depth, apprentice):
    episode = []
    state = copy.deepcopy(root)
    edge_index = boardToData(root).edge_index

    # ******************* PLAY EPISODE ***************************
    for i in range(max_depth):
        # Stop if the game is over
        if state.gameOver:
            break

        # Skip dead players by ending their turn
        if not state.activePlayer.is_alive:
            state.endTurn()
            continue

        # Get possible moves and the apprentice policy
        mask, actions = maskAndMoves(state, state.gamePhase, edge_index)
        try:
            policy, value = apprentice.play(state)
        except Exception as e:
            state.report()
            print(state.activePlayer.is_alive)
            print(state.activePlayer.num_countries)
            raise e

        policy = policy * mask
        probs = policy.squeeze().detach().numpy()
        probs = probs / probs.sum()

        # Sample a move proportionally to the masked policy
        ind = np.random.choice(range(len(actions)), p=probs)
        move = buildMove(state, actions[ind])
        episode.append(copy.deepcopy(state))

        # Play the move to continue
        state.playMove(move)

    return episode
def playMove(self, board, temp=1, num_sims=None, use_val=False):
    """ This function will be used in every type of move.
        Call the MCTS, get the action probabilities, take the argmax
        or use any other criterion.
    """
    edge_index = boardToData(board).edge_index
    mask, actions = maskAndMoves(board, board.gamePhase, edge_index)

    # Do the MCTS
    policy, value, _ = self.MCTS.getActionProb(board, temp=temp, num_sims=num_sims, use_val=use_val)
    policy = policy * mask.squeeze().detach().numpy()
    probs = policy / policy.sum()

    # Use some criterion to choose the move
    z = np.random.uniform()
    if self.move_selection == "argmax" or (self.move_selection == "e_greedy" and z < self.eps_greedy):
        ind = np.argmax(probs)
    elif self.move_selection == "random_proportional" or (self.move_selection == "e_greedy" and z >= self.eps_greedy):
        ind = np.random.choice(range(len(actions)), p=probs)

    # Return the selected move
    return buildMove(board, actions[ind])
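# Standalone sketch of the move-selection rule used in playMove() above, applied to a toy
# probability vector; self.move_selection and self.eps_greedy become plain arguments so
# the snippet runs on its own. Illustrative only.
import numpy as np

def select_index(probs, move_selection="e_greedy", eps_greedy=0.5, rng=np.random):
    z = rng.uniform()
    if move_selection == "argmax" or (move_selection == "e_greedy" and z < eps_greedy):
        return int(np.argmax(probs))               # greedy choice
    return int(rng.choice(len(probs), p=probs))    # proportional sampling

# Example: select_index(np.array([0.1, 0.6, 0.3]))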
def search(self, state, depth, use_val=False):
    # Terminal state or maximum depth reached: return the score vector per player
    if isTerminal(state) or depth > self.max_depth:
        return score_players(state), score_players(state)

    # If the active player is dead, end turns until a living player comes up
    while not state.activePlayer.is_alive:
        state.endTurn()
        if state.gameOver:
            return score_players(state), score_players(state)

    s = hash(state)

    # Leaf node: expand it
    if s not in self.Ps:
        canon, map_to_orig = state.toCanonical(state.activePlayer.code)
        batch = torch_geometric.data.Batch.from_data_list([boardToData(canon)])
        mask, moves = maskAndMoves(canon, canon.gamePhase, batch.edge_index)
        if self.apprentice is not None:
            policy, value = self.apprentice.play(canon)
        else:
            # No bias, just uniform sampling for the moment
            policy, value = torch.ones_like(mask) / max(mask.shape), torch.zeros((1, 6))
        policy = policy * mask
        self.Vs[s], self.As[s] = mask.squeeze(), moves
        self.Ps[s] = policy.squeeze()
        self.Ns[s] = 1

        # Return an evaluation from random rollouts
        v = np.zeros(6)
        for _ in range(self.sims_per_eval):
            sim = copy.deepcopy(state)
            sim.simulate(agent.RandomAgent())
            v += score_players(sim)
        v /= self.sims_per_eval

        # The apprentice already returns the value in the original player order,
        # so no re-indexing through map_to_orig is needed here
        value = value.squeeze()
        cor_value = value
        return v, cor_value

    # Not a leaf: keep going down, using the values of the current player
    p = state.activePlayer.code
    action = -1
    bestScore = -float('inf')
    for i, act in enumerate(self.As[s]):
        a = hash(act)
        if self.Vs[s][i] > 0.0:
            if (s, a) in self.Rsa:
                # PUCT-style score: rollout return + exploration + prior bonus + optional net value
                uct = self.Rsa[(s, a)][p] + self.cb * np.sqrt(np.log(self.Ns[s]) / max(self.Nsa[(s, a)], self.eps))
                val = self.wb * self.Qsa[(s, a)] * use_val
                pol = self.wa * self.Ps[s][i] / (self.Nsa[(s, a)] + 1)
                sc = uct + pol + val[p]
            else:
                # Unseen action, take it
                action = act
                break
            if sc > bestScore:
                bestScore = sc
                action = act

    if isinstance(action, int) and action == -1:
        print("**** No move?? *****")
        state.report()
        print(self.As[s])
        print(self.Vs[s])

    a = hash(action)
    # Translate the chosen action into an actual move
    move = buildMove(state, action)

    # Play the action and continue the search
    # TODO: For now, armies are placed on one country only to simplify the game
    state.playMove(move)
    v, net_v = self.search(state, depth + 1, use_val)

    if isinstance(net_v, torch.Tensor):
        net_v = net_v.detach().numpy()
    if isinstance(v, torch.Tensor):
        v = v.detach().numpy()

    # Update the (state, action) statistics with running averages
    if (s, a) in self.Rsa:
        rsa, qsa, nsa = self.Rsa[(s, a)], self.Qsa[(s, a)], self.Nsa[(s, a)]
        self.Rsa[(s, a)] = (nsa * rsa + v) / (nsa + 1)
        self.Qsa[(s, a)] = (nsa * qsa + net_v) / (nsa + 1)
        self.Nsa[(s, a)] += 1
    else:
        self.Rsa[(s, a)] = v
        self.Qsa[(s, a)] = net_v
        self.Nsa[(s, a)] = 1

    self.Ns[s] += 1
    return v, net_v
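# Standalone sketch of the action score computed inside search() above, with the
# per-(state, action) statistics passed in explicitly so the formula is easier to read.
# The constants mirror the cb, wa, wb and eps attributes used in search(); the example
# values are illustrative only.
import numpy as np

def action_score(rsa_p, qsa_p, nsa, ns, prior, cb=1.0, wa=1.0, wb=1.0, eps=1e-8, use_val=False):
    # rsa_p: rollout return for the current player, qsa_p: net value for the current player
    uct = rsa_p + cb * np.sqrt(np.log(ns) / max(nsa, eps))   # exploration term on rollout returns
    pol = wa * prior / (nsa + 1)                             # prior-weighted policy bonus
    val = wb * qsa_p * use_val                               # optional network-value term
    return uct + pol + val

# Example: action_score(rsa_p=0.2, qsa_p=0.1, nsa=3, ns=10, prior=0.25, use_val=True)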
def create_self_play_data(path, root, num_samples, start_sample, apprentice, expert,
                          max_depth=100, saved_states_per_episode=1, verbose=False):
    """ Create episodes from self-play.
        Visited states are saved and then re-visited with the expert to label the data.
    """
    samples = 0
    samples_type = {'initialPick': 0, 'initialFortify': 0, 'startTurn': 0,
                    'attack': 0, 'fortify': 0}

    # Get information about existing files, to continue enlarging the dataset
    for k, v in samples_type.items():
        path_aux = os.path.join(path, k, 'raw')
        val = max(list(map(int, filter(isint,
                  [n[(n.find("_") + 1):n.find(".")] for n in os.listdir(path_aux) if 'board' in n]))) + [0])
        samples_type[k] = val

    move_to_save = itertools.cycle(list(samples_type.keys()))
    edge_index = boardToData(root).edge_index

    while samples < num_samples:
        # ******************* PLAY EPISODE ***************************
        episode = []
        state = copy.deepcopy(root)
        for i in range(max_depth):
            print_message_over(f"Playing episode: {i}/{max_depth}")

            # Stop if the game is over
            if state.gameOver:
                break

            # Skip dead players by ending their turn
            if not state.activePlayer.is_alive:
                state.endTurn()
                continue

            # Get possible moves and the apprentice policy
            mask, actions = maskAndMoves(state, state.gamePhase, edge_index)
            try:
                policy, value = apprentice.play(state)
            except Exception as e:
                state.report()
                print(state.activePlayer.is_alive)
                print(state.activePlayer.num_countries)
                raise e

            policy = policy * mask
            probs = policy.squeeze().detach().numpy()
            probs = probs / probs.sum()
            ind = np.random.choice(range(len(actions)), p=probs)
            move = buildMove(state, actions[ind])
            episode.append(copy.deepcopy(state))

            # Play the move to continue
            state.playMove(move)

        # ******************* SAVE STATES ***************************
        # Take some states from the episode.
        # Choose which kind of move we are going to save, cycling through the phases
        to_save = next(move_to_save)
        options = [s for s in episode if s.gamePhase == to_save]
        init_to_save = to_save
        while not options:
            to_save = next(move_to_save)
            if to_save == init_to_save:
                raise Exception("Episode is empty? No dataset could be created for any game phase")
            options = [s for s in episode if s.gamePhase == to_save]
        states_to_save = np.random.choice(options, min(saved_states_per_episode, len(options)))

        # Get the expert move for the chosen states
        for i, state in enumerate(states_to_save):
            print_message_over(f"Saving states: Saved {i}/{len(states_to_save)}... Total: {samples}/{num_samples}")
            policy_exp, value_exp, _ = expert.getActionProb(state, temp=1, num_sims=None, use_val=False)

            # Save the board, value and target
            board, _ = state.toCanonical(state.activePlayer.code)
            phase = board.gamePhase
            if isinstance(policy_exp, torch.Tensor):
                policy_exp = policy_exp.detach().numpy()
            if isinstance(value_exp, torch.Tensor):
                value_exp = value_exp.detach().numpy()
            saveBoardObs(path + '/' + phase + '/raw',
                         'board_{}.json'.format(samples_type[phase]),
                         board, board.gamePhase, policy_exp.tolist(), value_exp.tolist())
            samples += 1
            samples_type[phase] += 1
            print_message_over(f"Saving states: Saved {i+1}/{len(states_to_save)}... Total: {samples}/{num_samples}")

    print_message_over("Done!")
    print()
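# Usage sketch (illustrative only): generating labelled samples with the function above.
# `root_board`, `net_apprentice` and `mcts_expert` are hypothetical objects from the rest
# of the project; `data_path` must already contain the <phase>/raw subdirectories that the
# samples_type scan expects.
#
# create_self_play_data(data_path, root_board, num_samples=1000, start_sample=0,
#                       apprentice=net_apprentice, expert=mcts_expert,
#                       max_depth=100, saved_states_per_episode=2)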
#%%% board.play()
while not board.gameOver and board.gamePhase != "attack":
    board.play()

board.report()
print(board.countriesPandas())
print("\n")

# Get the apprentice policy for the current board
canon, _ = board.toCanonical(board.activePlayer.code)
batch = torch_geometric.data.Batch.from_data_list([boardToData(canon)])
mask, moves = agent.maskAndMoves(canon, canon.gamePhase, batch.edge_index)
policy, value = apprentice.getPolicy(canon)

# Temperature-scaled softmax over the (clipped) policy
pop = policy.squeeze()
T = 1
exp = np.exp(np.log(np.maximum(pop, 0.000001)) / T)
soft = exp / exp.sum()

# Print every legal move with its raw and temperature-scaled probability
co = board.countries()
for m, a, p, s in zip(mask.squeeze(), moves, pop, soft):
    if m.item():
        if len(a) > 2:
            print(f"{a[0]}: {co[a[1]]['id']} -> {co[a[2]]['id']} - {p:.3f} - {s:.3f}")
        else:
            # Single-country action; print format assumed by analogy with the two-country case
            print(f"{a[0]}: {co[a[1]]['id']} - {p:.3f} - {s:.3f}")