    def expand(self, node, available, network, env, device="cpu"):
        """Expand the tree with a random new action, which should be
        valued by a rollout.
        """
        if len(node.children) > 0:
            raise ValueError("expand() called on a node that already has children")

        # Create input
        all_moves = network.all_moves
        board = create_board(env.x, env.y, env.m, env.n)
        priors, value = network(
            torch.tensor(board, device=device).unsqueeze(0).float())

        # Find new candidate actions, and create a Node for each.
        # This leaves them available for select later.
        for a in available:
            new = Node(name=a,
                       initial_count=1,
                       initial_value=0,
                       prior=float(priors[0, all_moves.index(a)].item()))
            node.add(new)  # inplace update

        # Pick a move to rollout, and add it to the path.
        move = self.default_policy(available)
        self.path.append(node.children[available.index(move)])

        return move, node, value
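# A minimal, standalone sketch of the rollout policy that `expand` relies
# on. The real `default_policy` is a method on the search class and is not
# shown here; this hypothetical stand-in assumes the usual MCTS default of
# a uniform-random draw over the available moves.
def uniform_default_policy(available):
    """Pick a rollout move uniformly at random (illustrative sketch)."""
    i = np.random.randint(0, len(available))
    return available[i]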
def expected_value(m, n, model, default_value=0.0):
    """Estimate the max value of each board position."""
    values = np.zeros((m, n))
    for i in range(m):
        for j in range(n):
            board = tuple(flatten_board(create_board(i, j, m, n)))
            try:
                values[i, j] = model[board].max()
            except KeyError:
                values[i, j] = default_value
    return values
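# Usage sketch for `expected_value`. A tabular model maps flattened-board
# keys to arrays of action values; positions missing from the table fall
# back to `default_value`. The Q-values below are made up purely for
# illustration.
def _expected_value_example(m=3, n=3):
    """Score every position of a toy one-entry model (illustrative sketch)."""
    board = tuple(flatten_board(create_board(0, 0, m, n)))
    model = {board: np.array([0.25, -0.5, 1.0])}  # made-up action values
    values = expected_value(m, n, model, default_value=0.0)
    # values[0, 0] is 1.0 (the max of the stored values); every other
    # position takes the default, 0.0.
    return values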
def wythoff_dqn1(epsilon=0.1,
                 gamma=0.8,
                 learning_rate=0.1,
                 num_episodes=10,
                 batch_size=100,
                 memory_capacity=10000,
                 game='Wythoff10x10',
                 network='DQN',
                 anneal=False,
                 tensorboard=None,
                 update_every=5,
                 self_play=False,
                 save=False,
                 save_model=False,
                 monitor=None,
                 return_none=False,
                 debug=False,
                 progress=False,
                 seed=None):
    """Learn to play Wythoff's game, with e-greedy random exploration.

    Note: Learning is based on a player-opponent joint-action formalism
    and deep Q-learning (DQN).
    """
    # ------------------------------------------------------------------------
    # Init
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Logs...
    if tensorboard is not None:
        try:
            os.makedirs(tensorboard)
        except OSError as exception:
            if exception.errno != errno.EEXIST:
                raise
        writer = SummaryWriter(log_dir=tensorboard)

    if monitor is not None:
        monitored = create_monitored(monitor)

    # Env...
    if tensorboard is not None:
        env = create_env(game, monitor=True)
    else:
        env = create_env(game, monitor=False)
    env.seed(seed)
    np.random.seed(seed)

    # ------------------------------------------------------------------------
    # Scores
    score = 0
    total_reward = 0

    # Agents, etc
    m, n, board, available = peek(env)
    all_possible_moves = create_all_possible_moves(m, n)
    if network == 'DQN':
        player = DQN(m, n, num_actions=len(all_possible_moves))
        opponent = DQN(m, n, num_actions=len(all_possible_moves))
    elif network == 'DQN_mlp':
        player = DQN_mlp(m, n, num_actions=len(all_possible_moves))
        opponent = DQN_mlp(m, n, num_actions=len(all_possible_moves))
    else:
        raise ValueError("network must be 'DQN' or 'DQN_mlp'")

    if debug:
        print(f"---------------------------------------")
        print("Setting up....")
        print(f">>> Network is {player}")
        print(f">>> Memory capacity {memory_capacity} ({batch_size})")

    player_memory = ReplayMemory(memory_capacity)
    opponent_memory = ReplayMemory(memory_capacity)
    if self_play:
        player_memory = opponent_memory

    player_optimizer = optim.Adam(player.parameters(), learning_rate)
    opponent_optimizer = optim.Adam(opponent.parameters(), learning_rate)
    moves = MoveCount(m, n)

    # ------------------------------------------------------------------------
    for episode in range(1, num_episodes + 1):
        # Re-init
        steps = 1
        done = False
        mover = 'opponent'  # This will shift to player on the first move.
        transitions = []

        # Worlds
        state = env.reset()
        x, y, board, available = state
        board = tuple(flatten_board(board))
        moves.update((x, y))
        if debug:
            print(f"---------------------------------------")
            print(f">>> NEW GAME ({episode}).")
            print(f">>> Initial position ({x}, {y})")
            print(f">>> Initial moves {available}")
            print(f">>> Cold available {locate_cold_moves(x, y, available)}")
            print(f">>> All cold {locate_all_cold_moves(x, y)}")

        # Anneal epsilon?
        if anneal:
            epsilon_e = epsilon * (1.0 / np.log((episode + np.e)))
        else:
            epsilon_e = epsilon

        # ---------------------------------------------------------------------
        # Play a game
        while not done:
            # Choose a mover
            mover = shift_mover(mover)
            memory = shift_memory(mover, player_memory, opponent_memory)
            model = shift_model(mover, player, opponent)

            # Convert board to a model(state)
            state_hat = torch.from_numpy(np.array(board).reshape(m, n))
            state_hat = state_hat.unsqueeze(0).unsqueeze(1).float()

            # Get and filter Qs
            Qs = model(state_hat).float().detach()  # torch
            Qs = Qs.numpy().squeeze()
            mask = build_mask(available, m, n).flatten()
            Qs *= mask

            # Choose a move
            index = np.nonzero(mask)[0].tolist()
            move_i = e_greedy(Qs, epsilon=epsilon_e, index=index, mode='numpy')

            # Re-index move_i to match the 'available' index
            move_a = index.index(move_i)
            move = available[move_a]

            # Analyze it...
            if move in locate_cold_moves(x, y, available):
                score += (1 - score) / episode

            # Play it
            state_next, reward, done, _ = env.step(move)
            (x_next, y_next, board_next, available_next) = state_next
            total_reward += reward

            # Save transitions, as tensors to be used at training time
            moves.update(move)
            state_hat_next = torch.from_numpy(
                np.array(board_next).reshape(m, n))
            state_hat_next = state_hat_next.unsqueeze(0).unsqueeze(1).float()
            transitions.append([
                state_hat.float(),
                torch.from_numpy(mask),
                torch.tensor(move_i),
                state_hat_next.float(),
                torch.tensor([reward]).unsqueeze(0).float()
            ])

            # Shift states
            state = deepcopy(state_next)
            board = deepcopy(board_next)
            available = deepcopy(available_next)
            x = deepcopy(x_next)
            y = deepcopy(y_next)
            steps += 1

            if debug:
                print(f">>> {mover}: {move}")
                print(f">>> new position: ({x_next}, {y_next})")

        # ---------------------------------------------------------------------
        # Learn from the game
        #
        # Find the loser's transition and flip the sign of its reward
        if steps > 2:
            transitions[-2][4] = transitions[-1][4] * -1

        # Update the memories using the transitions from this game
        # (even transitions belong to the player, odd to the opponent)
        for i in range(0, len(transitions), 2):
            s, mask_t, a, sn, r = transitions[i]
            player_memory.push(s.to(device), mask_t.to(device), a.to(device),
                               sn.to(device), r.to(device))
        for i in range(1, len(transitions), 2):
            s, mask_t, a, sn, r = transitions[i]
            opponent_memory.push(s.to(device), mask_t.to(device), a.to(device),
                                 sn.to(device), r.to(device))

        # Bypass if we don't have enough in memory to learn
        if episode < batch_size:
            continue

        # Learn, sampling batches of transitions from memory
        player, player_loss = train_dqn(batch_size,
                                        player,
                                        player_memory,
                                        player_optimizer,
                                        device,
                                        gamma=gamma)
        opponent, opponent_loss = train_dqn(batch_size,
                                            opponent,
                                            opponent_memory,
                                            opponent_optimizer,
                                            device,
                                            gamma=gamma)

        # ---------------------------------------------------------------------
        # Logs...
if progress: print(f"---") if progress or debug: print(f">>> episode: {episode}") print(f">>> winner: {mover}") if debug or progress: print(f">>> Q: {Qs}") print(f">>> max(Q): {Qs.max()}") print(f">>> min(Q): {Qs.min()}") print(f">>> stdev(Q): {Qs.std()}") print( f">>> loss (player: {player_loss}, opponent: {opponent_loss})") print(f">>> player score: {score}") print(f">>> epsilon: {epsilon_e}") if tensorboard and (int(episode) % update_every) == 0: writer.add_scalar('reward', reward, episode) writer.add_scalar('epsilon_e', epsilon_e, episode) writer.add_scalar('player_loss', player_loss, episode) writer.add_scalar('opponent_loss', opponent_loss, episode) writer.add_scalar('steps', steps, episode) writer.add_scalar('score', score, episode) # Cold ref: cold = create_cold_board(m, n) plot_wythoff_board(cold, vmin=0, vmax=1, path=tensorboard, name='cold_board.png') writer.add_image('cold_positions', torch.from_numpy( skimage.io.imread( os.path.join(tensorboard, 'cold_board.png'))), 0, dataformats='HWC') # Extract all value boards, and find extrema values = torch.zeros((len(all_possible_moves), m, n)) for i, a in enumerate(all_possible_moves): example = create_board(a[0], a[1], m, n) values[i, :, :] = player(state_hat).detach().float().reshape( m, n) mean_values = torch.mean(values, 0) # max_values, _ = torch.max(values, 0) # min_values, _ = torch.min(values, 0) # Log writer.add_scalar('Q_mean', torch.mean(mean_values), episode) # Plot mean plot_wythoff_board(mean_values.numpy(), vmin=mean_values.numpy().min(), vmax=mean_values.numpy().max(), path=tensorboard, name='player_mean_values.png') writer.add_image('mean player', torch.from_numpy( skimage.io.imread( os.path.join(tensorboard, 'player_mean_values.png'))), 0, dataformats='HWC') # Plot move count plot_wythoff_board(moves.count, vmax=moves.count.max() / 10, vmin=0, path=tensorboard, name='moves.png') writer.add_image('moves', torch.from_numpy( skimage.io.imread( os.path.join(tensorboard, 'moves.png'))), 0, dataformats='HWC') if monitor and (int(episode) % update_every) == 0: all_variables = locals() for k in monitor: monitored[k].append(float(all_variables[k])) # -------------------------------------------------------------------- if save_model: state = { 'stumbler_player_dict': player, 'stumbler_opponent_dict': opponent } torch.save(state, save + ".pytorch") if monitor: save_monitored(save, monitored) if tensorboard: writer.close() result = (player, opponent), (score / episode, total_reward) if return_none: result = None return result
def wythoff_dqn2(epsilon=0.1,
                 gamma=0.5,
                 learning_rate=1e-6,
                 num_episodes=100,
                 batch_size=20,
                 memory_capacity=100,
                 game='Wythoff10x10',
                 network='DQN_xy1',
                 anneal=False,
                 tensorboard=None,
                 update_every=5,
                 double=False,
                 double_update=10,
                 save=False,
                 save_model=False,
                 monitor=None,
                 return_none=False,
                 debug=False,
                 device='cpu',
                 clip_grad=False,
                 progress=False,
                 zero=False,
                 seed=None):
    """Learn Wythoff's game, with a DQN."""
    # ------------------------------------------------------------------------
    # Init
    num_episodes = int(num_episodes)
    batch_size = int(batch_size)
    memory_capacity = int(memory_capacity)
    update_every = int(update_every)

    # Logs...
    if tensorboard is not None:
        try:
            os.makedirs(tensorboard)
        except OSError as exception:
            if exception.errno != errno.EEXIST:
                raise
        writer = SummaryWriter(log_dir=tensorboard)

    if monitor is not None:
        monitored = create_monitored(monitor)

    # Env...
    if tensorboard is not None:
        env = create_env(game, monitor=True)
    else:
        env = create_env(game, monitor=False)
    env.seed(seed)
    np.random.seed(seed)

    # ------------------------------------------------------------------------
    # Scores
    score = 0
    total_reward = 0

    # Agents, etc
    m, n, board, available = peek(env)
    all_possible_moves = create_all_possible_moves(m, n)

    # Is network an nn.Module, or the name of an azad model?
    if hasattr(network, "forward"):
        Model = network
    else:
        Model = getattr(azad.models, network)
    player = Model().to(device)
    if double:
        target = Model().to(device)
        target.load_state_dict(player.state_dict())
        target.eval()
    else:
        target = None

    if debug:
        print(f"---------------------------------------")
        print("Setting up....")
        print(f">>> Device: {device}")
        print(f">>> Network is {player}")
        print(f">>> Memory capacity {memory_capacity} ({batch_size})")

    memory = ReplayMemory(memory_capacity)
    # optimizer = optim.Adam(player.parameters(), learning_rate)
    optimizer = optim.SGD(player.parameters(), learning_rate)
    moves = MoveCount(m, n)
    opts = OptimalCount(0)

    # ------------------------------------------------------------------------
    for episode in range(1, num_episodes + 1):
        # Re-init
        transitions = []
        state = env.reset()
        x, y, board, available = state
        moves.update((x, y))
        if debug:
            print(f"---------------------------------------")
            print(f">>> NEW GAME ({episode}).")
            print(f">>> Initial position ({x}, {y})")
            print(f">>> Initial moves {available}")
            print(f">>> Cold available {locate_cold_moves(x, y, available)}")
            print(f">>> All cold {locate_all_cold_moves(x, y)}")

        # Anneal epsilon?
        if anneal:
            epsilon_e = epsilon * (1.0 / np.log((episode + np.e)))
        else:
            epsilon_e = epsilon

        # ---------------------------------------------------------------------
        # Play a game
        steps = 1
        done = False
        while not done:
            # Choose a move
            Qs = build_Qs(player,
                          state,
                          available,
                          device=device,
                          mode="numpy")
            move_i = e_greedy(Qs, epsilon=epsilon_e, mode='numpy')
            move = available[move_i]
            moves.update(move)

            # Analyze it...
            best = 0.0
            if cold_move_available(x, y, available):
                if move in locate_cold_moves(x, y, available):
                    best = 1.0
                score += (best - score) / (episode + 1)

            # Play it
            state_next, reward, done, _ = env.step(move)
            (x_next, y_next, board_next, available_next) = state_next

            # Track value statistics
            total_reward += reward
            Q = Qs[move_i]
            prediction_error = Qs.max() - Q
            advantage = Q - Qs[np.nonzero(Qs)].mean()

            # Save transitions, as tensors to be used at training time
            # (onto GPU)
            transitions.append([
                # S
                torch.tensor((x, y)).unsqueeze(0).unsqueeze(1).float(),
                # A
                torch.tensor(move).unsqueeze(0),
                # S'
                torch.tensor(
                    (x_next, y_next)).unsqueeze(0).unsqueeze(1).float(),
                # R
                torch.tensor([reward]).unsqueeze(0).float(),
            ])

            if debug:
                print(f">>> position: {(x, y)}")
                print(f">>> num available: {len(available)}")
                print(f">>> available: {available}")
                print(f">>> Qs (filtered): {Qs}")
                print(f">>> new position: ({x_next}, {y_next})")

            # Shift states
            state = deepcopy(state_next)
            board = deepcopy(board_next)
            available = deepcopy(available_next)
            x = deepcopy(x_next)
            y = deepcopy(y_next)
            steps += 1

        # ---------------------------------------------------------------------
        # Learn from the game
        #
        # Find the loser's transition and flip the sign of its reward
        if steps > 2:
            transitions[-2][3] = transitions[-1][3] * -1

        # Update the memory using the transitions from this game
        for i in range(0, len(transitions)):
            memory.push(*transitions[i])
        if debug:
            print(f">>> final transitions: {transitions[-2:]}")

        # Bypass if we don't have enough in memory to learn
        if episode < batch_size:
            continue

        # Learn, sampling a batch of transitions from memory
        player, loss = train_dqn(batch_size,
                                 player,
                                 memory,
                                 optimizer,
                                 device,
                                 target=target,
                                 gamma=gamma,
                                 clip_grad=clip_grad)

        # Update the target net, if in double mode and the time is right.
        if double and (episode % double_update == 0):
            target.load_state_dict(player.state_dict())

        # ---------------------------------------------------------------------
        # Logs...
if progress: print(f"---") if progress or debug: print(f">>> episode: {episode}") if debug or progress: print(f">>> loss {loss}") print(f">>> Q(last,a): {Q}") print(f">>> epsilon: {epsilon_e}") print(f">>> score: {score}") if tensorboard and (int(episode) % update_every) == 0: writer.add_scalar('reward', reward, episode) writer.add_scalar('epsilon_e', epsilon_e, episode) writer.add_scalar('loss', loss, episode) writer.add_scalar('steps', steps, episode) writer.add_scalar('score', score, episode) # Cold ref: cold = create_cold_board(m, n) plot_wythoff_board(cold, vmin=0, vmax=1, path=tensorboard, name='cold_board.png') writer.add_image('cold_positions', torch.from_numpy( skimage.io.imread( os.path.join(tensorboard, 'cold_board.png'))), 0, dataformats='HWC') # Extract all value boards, and find extrema values = torch.zeros((len(all_possible_moves), m, n)) for i, a in enumerate(all_possible_moves): sample_hat = np.asarray(create_board(a[0], a[1], m, n)) sample_hat = torch.from_numpy(sample_hat) sample_hat = sample_hat.unsqueeze(0).unsqueeze(1).float() values[i, :, :] = player(sample_hat).detach().float().reshape( m, n) mean_values = torch.mean(values, 0) max_values, _ = torch.max(values, 0) min_values, _ = torch.min(values, 0) # Log writer.add_scalar('Q_mean', torch.mean(mean_values), episode) writer.add_scalar('Q_min', torch.mean(min_values), episode) writer.add_scalar('Q_max', torch.mean(max_values), episode) # Plot mean plot_wythoff_board(mean_values.numpy(), vmin=mean_values.numpy().min(), vmax=mean_values.numpy().max(), path=tensorboard, name='player_mean_values.png') writer.add_image('mean player', torch.from_numpy( skimage.io.imread( os.path.join(tensorboard, 'player_mean_values.png'))), 0, dataformats='HWC') # Plot max plot_wythoff_board(max_values.numpy(), vmin=max_values.numpy().min(), vmax=max_values.numpy().max(), path=tensorboard, name='player_max_values.png') writer.add_image('max player', torch.from_numpy( skimage.io.imread( os.path.join(tensorboard, 'player_max_values.png'))), 0, dataformats='HWC') # Plot min plot_wythoff_board(min_values.numpy(), vmin=min_values.numpy().min(), vmax=min_values.numpy().max(), path=tensorboard, name='player_min_values.png') writer.add_image('min player', torch.from_numpy( skimage.io.imread( os.path.join(tensorboard, 'player_min_values.png'))), 0, dataformats='HWC') # Plot move count plot_wythoff_board(moves.count, vmax=moves.count.max() / 10, vmin=0, path=tensorboard, name='moves.png') writer.add_image('moves', torch.from_numpy( skimage.io.imread( os.path.join(tensorboard, 'moves.png'))), 0, dataformats='HWC') if monitor and (int(episode) % update_every) == 0: all_variables = locals() for k in monitor: monitored[k].append(float(all_variables[k])) # -------------------------------------------------------------------- if monitor and save: save_monitored(save, monitored) if tensorboard: writer.close() result = {"player": player.state_dict(), "score": score} if target is not None: result['target'] = target.state_dict() if save: torch.save(result, save + ".pytorch") if monitor and not save: result["monitored"] = monitored if return_none: result = None return result
def evaluate_wythoff(stumbler=None,
                     strategist=None,
                     stumbler_game='Wythoff10x10',
                     strategist_game='Wythoff50x50',
                     random_stumbler=False,
                     load_model=None,
                     save=None,
                     return_none=False,
                     num_episodes=100,
                     debug=False):
    """Compare stumblers to strategists.

    Returns
    -------
    wins : float
        The fraction of games won by the strategist.
    stumbler_score : float
        The stumbler's running optimal-move score.
    strategist_score : float
        The strategist's running optimal-move score.
    """
    # ------------------------------------------------------------------------
    if load_model is not None:
        stumbler, _, strategist = load_for_eval(load_model)

    # Init boards, etc.
    # Strategist
    env = create_env(strategist_game, monitor=False)
    m, n, board, _ = peek(env)
    if strategist is not None:
        hot_cold_table = create_bias_board(m, n, strategist)
    else:
        hot_cold_table = np.zeros_like(board)

    # Stumbler
    o, p, _, _ = peek(create_env(stumbler_game, monitor=False))

    # ------------------------------------------------------------------------
    # A stumbler and a strategist take turns playing an (m, n) game of
    # Wythoff's.
    wins = 0.0
    strategist_score = 0.0
    stumbler_score = 0.0
    for episode in range(num_episodes):
        # Re-init
        steps = 0

        # Start the game, and process the result
        x, y, board, available = env.reset()
        board = tuple(flatten_board(board))
        if debug:
            print("---------------------------------------")
            print(">>> NEW MODEL EVALUATION ({}).".format(episode))
            print(">>> Initial position ({}, {})".format(x, y))

        done = False
        while not done:
            # ----------------------------------------------------------------
            # STUMBLER
            if (x < o) and (y < p):
                s_board = tuple(flatten_board(create_board(x, y, o, p)))
                s_available = create_moves(x, y)
                try:
                    values = stumbler[s_board]
                    move_i = epsilon_greedy(values, epsilon=0.0, mode='numpy')
                    move = s_available[move_i]
                except KeyError:
                    move_i = np.random.randint(0, len(s_available))
                    move = s_available[move_i]
            else:
                s_available = available
                move_i = np.random.randint(0, len(s_available))
                move = s_available[move_i]

            # ----------------------------------------------------------------
            # RANDOM PLAYER
            if random_stumbler:
                move_i = np.random.randint(0, len(available))
                move = available[move_i]

            # Analyze the choice
            best = 0.0
            if cold_move_available(x, y, s_available):
                if move in locate_cold_moves(x, y, s_available):
                    best = 1.0
                stumbler_score += (best - stumbler_score) / (episode + 1)

            # Move
            (x, y, board, available), reward, done, _ = env.step(move)
            board = tuple(flatten_board(board))
            if debug:
                print(">>> STUMBLER move {}".format(move))
            if done:
                break

            # ----------------------------------------------------------------
            # STRATEGIST
            # Choose.
            hot_cold_move_values = [hot_cold_table[i, j] for i, j in available]
            move_i = epsilon_greedy(np.asarray(hot_cold_move_values),
                                    epsilon=0.0,
                                    mode='numpy')
            move = available[move_i]
            if debug:
                print(">>> STRATEGIST move {}".format(move))

            # Analyze the choice
            best = 0.0
            if cold_move_available(x, y, available):
                if move in locate_cold_moves(x, y, available):
                    best = 1.0
                strategist_score += (best - strategist_score) / (episode + 1)

            # Make a move
            (x, y, board, available), reward, done, _ = env.step(move)
            board = tuple(flatten_board(board))
            if done:
                wins += 1.0
                break

    if debug:
        print("Wins {}, Scores ({}, {})".format(wins, stumbler_score,
                                                strategist_score))

    if save is not None:
        np.savetxt(save,
                   np.asarray([wins, stumbler_score,
                               strategist_score]).reshape(1, 3),
                   fmt='%.1f,%.4f,%.4f',
                   comments="",
                   header="wins,stumbler_score,strategist_score")

    result = (wins / num_episodes), stumbler_score, strategist_score
    if return_none:
        result = None

    return result
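# Usage sketch for `evaluate_wythoff`. Agents can be passed in directly
# (a tabular `stumbler` dict plus a `strategist` network) or loaded from
# disk via `load_model`; the checkpoint path below is hypothetical.
def _evaluate_example():
    """Pit a saved stumbler against its strategist (illustrative sketch)."""
    wins, stumbler_score, strategist_score = evaluate_wythoff(
        load_model="checkpoints/stumbler_strategist",  # hypothetical path
        stumbler_game='Wythoff10x10',
        strategist_game='Wythoff50x50',
        num_episodes=100)
    # `wins` is the fraction of games won by the strategist.
    return wins, stumbler_score, strategist_score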