Esempio n. 1
0
    def expand(self, node, available, network, env, device="cpu"):
        """Add one child per available move to a leaf and pick one to roll out.

        The network is queried once for the current position (built from
        ``env.x``, ``env.y``, ``env.m``, ``env.n``); its first output supplies
        the prior of every new child and its second output is returned
        unchanged to the caller.

        Params
        ------
        node : the leaf to expand; it must not have any children yet.
        available : the moves that can be played from this position.
        network : a callable returning ``(priors, value)``; it must also
            expose ``all_moves``, which maps a move to its column in
            ``priors``.
        env : the environment, read only for its x, y, m and n attributes.
        device : the torch device used for the network input.

        Returns
        -------
        (move, node, value) : the move chosen by ``self.default_policy``,
            the (now expanded) node, and the network's value output.

        Raises
        ------
        ValueError : if ``node`` already has children.
        """
        num_children = len(node.children)
        if num_children > 0:
            raise ValueError("expand called wrongly")

        # Build the network input for the current position.
        move_table = network.all_moves
        position = create_board(env.x, env.y, env.m, env.n)
        net_input = torch.tensor(position, device=device)
        net_input = net_input.unsqueeze(0).float()

        priors, value = network(net_input)

        # Every legal move becomes a child, so select can find it later.
        for action in available:
            column = move_table.index(action)
            child = Node(name=action,
                         initial_count=1,
                         initial_value=0,
                         prior=float(priors[0, column].item()))
            node.add(child)  # mutates node in place

        # Children were added in the order of `available`, so the position
        # of the chosen move in `available` is also its child index.
        move = self.default_policy(available)
        chosen_child = node.children[available.index(move)]
        self.path.append(chosen_child)

        return move, node, value
Esempio n. 2
0
def expected_value(m, n, model, default_value=0.0):
    """Build an (m, n) array holding the largest model value per position.

    For every board position (row, col) the flattened board is used as a key
    into ``model``; the maximum of the stored values is written to the
    output. Positions whose key is missing from ``model`` receive
    ``default_value`` instead.
    """
    values = np.zeros((m, n))

    for row in range(m):
        for col in range(n):
            key = tuple(flatten_board(create_board(row, col, m, n)))
            try:
                best = model[key].max()
            except KeyError:
                # This position has no entry in the model.
                best = default_value
            values[row, col] = best

    return values
Esempio n. 3
0
def wythoff_dqn1(epsilon=0.1,
                 gamma=0.8,
                 learning_rate=0.1,
                 num_episodes=10,
                 batch_size=100,
                 memory_capacity=10000,
                 game='Wythoff10x10',
                 network='DQN',
                 anneal=False,
                 tensorboard=None,
                 update_every=5,
                 self_play=False,
                 save=False,
                 save_model=False,
                 monitor=None,
                 return_none=False,
                 debug=False,
                 progress=False,
                 seed=None):
    """Learn to play Wythoff's w/ e-greedy random exploration.

    Note: Learning is based on a player-opponent joint action formalism
    and deep Q-networks (one network and one replay memory per side).

    Params
    ------
    epsilon : exploration rate handed to ``e_greedy``.
    gamma : discount passed to ``train_dqn``.
    learning_rate : learning rate of both Adam optimizers.
    num_episodes : number of games to play.
    batch_size : batch size passed to ``train_dqn``; training (and all
        logging) is skipped while ``episode < batch_size``.
    memory_capacity : capacity of each ``ReplayMemory``.
    game : environment name passed to ``create_env``.
    network : 'DQN' or 'DQN_mlp'.
    anneal : if true, epsilon is scaled by ``1 / log(episode + e)``.
    tensorboard : log directory (created if missing), or None to disable.
    update_every : tensorboard/monitor logging period, in episodes.
    self_play : if true, both sides push into the same replay memory.
    save : file name prefix used by ``save_model`` and ``save_monitored``.
    save_model : if true, save both networks to ``save + ".pytorch"``.
    monitor : names of local variables to record every ``update_every``
        episodes (looked up through ``locals()``), or None.
    return_none : if true, return None instead of the result.
    debug, progress : enable verbose / per-episode printing.
    seed : seed given to the environment and to ``np.random``.

    Returns
    -------
    ((player, opponent), (score / episode, total_reward)), or None when
    ``return_none`` is set.

    Raises
    ------
    ValueError : if ``network`` is not a known name.
    """

    # ------------------------------------------------------------------------
    # Init
    # NOTE(review): only replay transitions are moved to `device` below; the
    # networks are never moved here. Confirm train_dqn handles placement.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Logs...
    if tensorboard is not None:
        try:
            os.makedirs(tensorboard)
        except OSError as exception:
            # An already existing log directory is fine.
            if exception.errno != errno.EEXIST:
                raise
        writer = SummaryWriter(log_dir=tensorboard)

    if monitor is not None:
        monitored = create_monitored(monitor)

    # Env...
    if tensorboard is not None:
        env = create_env(game, monitor=True)
    else:
        env = create_env(game, monitor=False)
    env.seed(seed)
    np.random.seed(seed)

    # ------------------------------------------------------------------------
    # Init
    #
    # Scores
    score = 0
    total_reward = 0

    # Agents, etc
    m, n, board, available = peek(env)
    all_possible_moves = create_all_possible_moves(m, n)
    if network == 'DQN':
        player = DQN(m, n, num_actions=len(all_possible_moves))
        opponent = DQN(m, n, num_actions=len(all_possible_moves))
    elif network == 'DQN_mlp':
        player = DQN_mlp(m, n, num_actions=len(all_possible_moves))
        opponent = DQN_mlp(m, n, num_actions=len(all_possible_moves))
    else:
        raise ValueError("network must DQN or DQN_mlp")
    if debug:
        print(f"---------------------------------------")
        print("Setting up....")
        print(f">>> Network is {player}")
        print(f">>> Memory capacity {memory_capacity} ({batch_size})")

    player_memory = ReplayMemory(memory_capacity)
    opponent_memory = ReplayMemory(memory_capacity)
    if self_play:
        # Both sides learn from one shared pool of transitions.
        player_memory = opponent_memory

    player_optimizer = optim.Adam(player.parameters(), learning_rate)
    opponent_optimizer = optim.Adam(opponent.parameters(), learning_rate)

    moves = MoveCount(m, n)

    # ------------------------------------------------------------------------
    for episode in range(1, num_episodes + 1):
        # Re-init
        #
        # Scores
        steps = 1
        done = False
        mover = 'opponent'  # This will shift to player on the first move.
        transitions = []

        # Worlds
        state = env.reset()
        x, y, board, available = state
        board = tuple(flatten_board(board))
        moves.update((x, y))
        if debug:
            print(f"---------------------------------------")
            print(f">>> NEW GAME ({episode}).")
            print(f">>> Initial position ({x}, {y})")
            print(f">>> Initial moves {available}")
            print(f">>> Cold available {locate_cold_moves(x, y, available)}")
            print(f">>> All cold {locate_all_cold_moves(x, y)}")

        # Anneal epsilon?
        if anneal:
            epsilon_e = epsilon * (1.0 / np.log((episode + np.e)))
        else:
            epsilon_e = epsilon

        # -------------------------------------------------------------------
        # Play a game
        while not done:
            # Choose a mover
            mover = shift_mover(mover)
            memory = shift_memory(mover, player_memory, opponent_memory)
            model = shift_model(mover, player, opponent)

            # Convert board to a model(state)
            state_hat = torch.from_numpy(np.array(board).reshape(m, n))
            state_hat = state_hat.unsqueeze(0).unsqueeze(1).float()

            # Get and filter Qs
            Qs = model(state_hat).float().detach()  # torch
            Qs = Qs.numpy().squeeze()

            # Zero the Q values of moves that are not currently legal.
            mask = build_mask(available, m, n).flatten()
            Qs *= mask

            # Choose a move
            index = np.nonzero(mask)[0].tolist()
            move_i = e_greedy(Qs, epsilon=epsilon_e, index=index, mode='numpy')

            # Re-index move_i to match 'available' index
            move_a = index.index(move_i)
            move = available[move_a]

            # Analyze it...
            if move in locate_cold_moves(x, y, available):
                score += (1 - score) / episode

            # Play it
            state_next, reward, done, _ = env.step(move)
            (x_next, y_next, board_next, available_next) = state_next
            total_reward += reward

            # Save transitions, as tensors to be used at training time
            moves.update(move)

            state_hat_next = torch.from_numpy(
                np.array(board_next).reshape(m, n))
            state_hat_next = state_hat_next.unsqueeze(0).unsqueeze(1).float()

            transitions.append([
                state_hat.float(),
                torch.from_numpy(mask),
                torch.tensor(move_i),
                state_hat_next.float(),
                torch.tensor([reward]).unsqueeze(0).float()
            ])

            # Shift states
            state = deepcopy(state_next)
            board = deepcopy(board_next)
            available = deepcopy(available_next)
            x = deepcopy(x_next)
            y = deepcopy(y_next)
            steps += 1

            # -
            if debug:
                print(f">>> {mover}: {move}")
                print(f">>> new position: ({x_next}, {y_next})")

        # ----------------------------------------------------------------
        # Learn from the game
        #
        # Find the losers transition and update its reward w/ -reward
        if steps > 2:
            transitions[-2][4] = transitions[-1][4] * -1

        # Update the memories using the transitions from this game.
        # Even-indexed transitions belong to the player, odd-indexed ones to
        # the opponent. NOTE: `x` below is the action mask and shadows the
        # board coordinate; env.reset() re-binds it at the next episode.
        for i in range(0, len(transitions), 2):
            s, x, a, sn, r = transitions[i]
            player_memory.push(s.to(device), x.to(device), a.to(device),
                               sn.to(device), r.to(device))
        for i in range(1, len(transitions), 2):
            s, x, a, sn, r = transitions[i]
            opponent_memory.push(s.to(device), x.to(device), a.to(device),
                                 sn.to(device), r.to(device))

        # Bypass if we don't have enough in memory to learn
        if episode < batch_size:
            continue

        # Learn, sampling batches of transitions from memory
        player, player_loss = train_dqn(batch_size,
                                        player,
                                        player_memory,
                                        player_optimizer,
                                        device,
                                        gamma=gamma)
        opponent, opponent_loss = train_dqn(batch_size,
                                            opponent,
                                            opponent_memory,
                                            opponent_optimizer,
                                            device,
                                            gamma=gamma)

        # ----------------------------------------------------------------
        # Logs...
        if progress:
            print(f"---")
        if progress or debug:
            print(f">>> episode: {episode}")
            print(f">>> winner: {mover}")
        if debug or progress:
            print(f">>> Q: {Qs}")
            print(f">>> max(Q): {Qs.max()}")
            print(f">>> min(Q): {Qs.min()}")
            print(f">>> stdev(Q): {Qs.std()}")
            print(
                f">>> loss (player: {player_loss}, opponent: {opponent_loss})")
            print(f">>> player score: {score}")
            print(f">>> epsilon: {epsilon_e}")

        if tensorboard and (int(episode) % update_every) == 0:
            writer.add_scalar('reward', reward, episode)
            writer.add_scalar('epsilon_e', epsilon_e, episode)
            writer.add_scalar('player_loss', player_loss, episode)
            writer.add_scalar('opponent_loss', opponent_loss, episode)
            writer.add_scalar('steps', steps, episode)
            writer.add_scalar('score', score, episode)

            # Cold ref:
            cold = create_cold_board(m, n)
            plot_wythoff_board(cold,
                               vmin=0,
                               vmax=1,
                               path=tensorboard,
                               name='cold_board.png')
            writer.add_image('cold_positions',
                             torch.from_numpy(
                                 skimage.io.imread(
                                     os.path.join(tensorboard,
                                                  'cold_board.png'))),
                             0,
                             dataformats='HWC')

            # Extract all value boards, and find extrema.
            # Each example board is fed to the network; previously the last
            # game state (`state_hat`) was evaluated on every iteration, so
            # every value board was the same.
            values = torch.zeros((len(all_possible_moves), m, n))
            for i, a in enumerate(all_possible_moves):
                example = create_board(a[0], a[1], m, n)
                example_hat = torch.from_numpy(
                    np.asarray(example).reshape(m, n))
                example_hat = example_hat.unsqueeze(0).unsqueeze(1).float()
                values[i, :, :] = player(
                    example_hat).detach().float().reshape(m, n)
            mean_values = torch.mean(values, 0)
            # max_values, _ = torch.max(values, 0)
            # min_values, _ = torch.min(values, 0)

            # Log
            writer.add_scalar('Q_mean', torch.mean(mean_values), episode)

            # Plot mean
            plot_wythoff_board(mean_values.numpy(),
                               vmin=mean_values.numpy().min(),
                               vmax=mean_values.numpy().max(),
                               path=tensorboard,
                               name='player_mean_values.png')
            writer.add_image('mean player',
                             torch.from_numpy(
                                 skimage.io.imread(
                                     os.path.join(tensorboard,
                                                  'player_mean_values.png'))),
                             0,
                             dataformats='HWC')

            # Plot move count
            plot_wythoff_board(moves.count,
                               vmax=moves.count.max() / 10,
                               vmin=0,
                               path=tensorboard,
                               name='moves.png')
            writer.add_image('moves',
                             torch.from_numpy(
                                 skimage.io.imread(
                                     os.path.join(tensorboard, 'moves.png'))),
                             0,
                             dataformats='HWC')

        if monitor and (int(episode) % update_every) == 0:
            # Monitored names are resolved against this function's locals.
            all_variables = locals()
            for k in monitor:
                monitored[k].append(float(all_variables[k]))

    # --------------------------------------------------------------------
    if save_model:
        # NOTE(review): the whole modules (not their state_dicts) are stored
        # under these keys, and `save` must be a string prefix here.
        state = {
            'stumbler_player_dict': player,
            'stumbler_opponent_dict': opponent
        }
        torch.save(state, save + ".pytorch")
    if monitor:
        save_monitored(save, monitored)
    if tensorboard:
        writer.close()

    result = (player, opponent), (score / episode, total_reward)
    if return_none:
        result = None

    return result
Esempio n. 4
0
def wythoff_dqn2(epsilon=0.1,
                 gamma=0.5,
                 learning_rate=1e-6,
                 num_episodes=100,
                 batch_size=20,
                 memory_capacity=100,
                 game='Wythoff10x10',
                 network='DQN_xy1',
                 anneal=False,
                 tensorboard=None,
                 update_every=5,
                 double=False,
                 double_update=10,
                 save=False,
                 save_model=False,
                 monitor=None,
                 return_none=False,
                 debug=False,
                 device='cpu',
                 clip_grad=False,
                 progress=False,
                 zero=False,
                 seed=None):
    """Learning Wythoff's, with a DQN.

    A single network plays both sides of every game with e-greedy
    exploration; all transitions go into one replay memory that is sampled
    by ``train_dqn`` after each episode.

    Params
    ------
    epsilon : exploration rate handed to ``e_greedy``.
    gamma : discount passed to ``train_dqn``.
    learning_rate : learning rate of the SGD optimizer.
    num_episodes, batch_size, memory_capacity, update_every : cast to int
        on entry. Training (and logging) is skipped while
        ``episode < batch_size``.
    game : environment name passed to ``create_env``.
    network : a model class (anything with a ``forward`` attribute) or the
        name of a model in ``azad.models``; it is instantiated without
        arguments.
    anneal : if true, epsilon is scaled by ``1 / log(episode + e)``.
    tensorboard : log directory (created if missing), or None to disable.
    double : if true, keep a target network that is re-synced from the
        player every ``double_update`` episodes and passed to ``train_dqn``.
    save : file name prefix; when set, the result is saved to
        ``save + ".pytorch"`` and monitored values are written with
        ``save_monitored``.
    save_model, zero : accepted but not used by this function.
    monitor : names of local variables to record every ``update_every``
        episodes (looked up through ``locals()``), or None.
    return_none : if true, return None instead of the result.
    debug, progress : enable verbose / per-episode printing.
    device : torch device for the networks.
    clip_grad : passed to ``train_dqn``.
    seed : seed given to the environment and to ``np.random``.

    Returns
    -------
    dict with keys "player" (state dict) and "score", plus "target" when a
    target network exists and "monitored" when monitoring without saving;
    or None when ``return_none`` is set.
    """

    # ------------------------------------------------------------------------
    # Init
    num_episodes = int(num_episodes)
    batch_size = int(batch_size)
    memory_capacity = int(memory_capacity)
    update_every = int(update_every)

    # Logs...
    if tensorboard is not None:
        try:
            os.makedirs(tensorboard)
        except OSError as exception:
            # An already existing log directory is fine.
            if exception.errno != errno.EEXIST:
                raise
        writer = SummaryWriter(log_dir=tensorboard)

    if monitor is not None:
        monitored = create_monitored(monitor)

    # Env...
    if tensorboard is not None:
        env = create_env(game, monitor=True)
    else:
        env = create_env(game, monitor=False)
    env.seed(seed)
    np.random.seed(seed)

    # ------------------------------------------------------------------------
    # Init
    #
    # Scores
    score = 0
    total_reward = 0

    # Agents, etc
    m, n, board, available = peek(env)
    all_possible_moves = create_all_possible_moves(m, n)

    # Is network a nn.Module?
    if hasattr(network, "forward"):
        Model = network
    # Is it the name of a azad model?
    else:
        Model = getattr(azad.models, network)

    player = Model().to(device)
    target = Model().to(device)

    if double:
        # The target starts as a frozen copy of the player.
        target.load_state_dict(player.state_dict())
        target.eval()
    else:
        target = None

    if debug:
        print(f"---------------------------------------")
        print("Setting up....")
        print(f">>> Device: {device}")
        print(f">>> Network is {player}")
        print(f">>> Memory capacity {memory_capacity} ({batch_size})")

    memory = ReplayMemory(memory_capacity)
    # optimizer = optim.Adam(player.parameters(), learning_rate)
    optimizer = optim.SGD(player.parameters(), learning_rate)
    moves = MoveCount(m, n)
    opts = OptimalCount(0)

    # ------------------------------------------------------------------------
    for episode in range(1, num_episodes + 1):
        # Re-init
        transitions = []
        state = env.reset()
        x, y, board, available = state
        moves.update((x, y))
        if debug:
            print(f"---------------------------------------")
            print(f">>> NEW GAME ({episode}).")
            print(f">>> Initial position ({x}, {y})")
            print(f">>> Initial moves {available}")
            print(f">>> Cold available {locate_cold_moves(x, y, available)}")
            print(f">>> All cold {locate_all_cold_moves(x, y)}")

        # Anneal epsilon?
        if anneal:
            epsilon_e = epsilon * (1.0 / np.log((episode + np.e)))
        else:
            epsilon_e = epsilon

        # -------------------------------------------------------------------
        # Play a game
        steps = 1
        done = False
        while not done:
            # Choose a move
            Qs = build_Qs(player,
                          state,
                          available,
                          device=device,
                          mode="numpy")
            move_i = e_greedy(Qs, epsilon=epsilon_e, mode='numpy')
            move = available[move_i]
            moves.update(move)

            # Analyze it...
            # The score is only updated when a cold move could have been
            # played from this position.
            best = 0.0
            if cold_move_available(x, y, available):
                if move in locate_cold_moves(x, y, available):
                    best = 1.0
                score += (best - score) / (episode + 1)

            # Play it
            state_next, reward, done, _ = env.step(move)
            (x_next, y_next, board_next, available_next) = state_next

            # Track value statistics
            total_reward += reward
            Q = Qs[move_i]
            prediction_error = Qs.max() - Q
            advantage = Q - Qs[np.nonzero(Qs)].mean()

            # Save transitions, as tensors to be used at training time
            transitions.append([
                # S
                torch.tensor((x, y)).unsqueeze(0).unsqueeze(1).float(),
                # A
                torch.tensor(move).unsqueeze(0),
                # S'
                torch.tensor(
                    (x_next, y_next)).unsqueeze(0).unsqueeze(1).float(),
                # R
                torch.tensor([reward]).unsqueeze(0).float(),
            ])

            # -
            if debug:
                print(f">>> position: {(x, y)}")
                print(f">>> num available: {len(available)}")
                print(f">>> available: {available}")
                print(f">>> Qs (filtered): {Qs}")
                print(f">>> new position: ({x_next}, {y_next})")

            # Shift states
            state = deepcopy(state_next)
            board = deepcopy(board_next)
            available = deepcopy(available_next)
            x = deepcopy(x_next)
            y = deepcopy(y_next)

            steps += 1

        # ----------------------------------------------------------------
        # Learn from the game
        #
        # Find the losers transition and update its reward w/ -reward
        if steps > 2:
            transitions[-2][3] = transitions[-1][3] * -1

        # Update the memories using the transitions from this game
        for i in range(0, len(transitions)):
            memory.push(*transitions[i])

        if debug:
            print(f">>> final transitions: {transitions[-2:]}")

        # Bypass if we don't have enough in memory to learn
        if episode < batch_size:
            continue

        # Learn, sampling a batch of transitions from memory
        player, loss = train_dqn(batch_size,
                                 player,
                                 memory,
                                 optimizer,
                                 device,
                                 target=target,
                                 gamma=gamma,
                                 clip_grad=clip_grad)

        # Update target net, if in double mode and time is right.
        if double and (episode % double_update == 0):
            target.load_state_dict(player.state_dict())

        # ----------------------------------------------------------------
        # Logs...
        if progress:
            print(f"---")
        if progress or debug:
            print(f">>> episode: {episode}")
        if debug or progress:
            print(f">>> loss {loss}")
            print(f">>> Q(last,a): {Q}")
            print(f">>> epsilon: {epsilon_e}")
            print(f">>> score: {score}")

        if tensorboard and (int(episode) % update_every) == 0:
            writer.add_scalar('reward', reward, episode)
            writer.add_scalar('epsilon_e', epsilon_e, episode)
            writer.add_scalar('loss', loss, episode)
            writer.add_scalar('steps', steps, episode)
            writer.add_scalar('score', score, episode)

            # Cold ref:
            cold = create_cold_board(m, n)
            plot_wythoff_board(cold,
                               vmin=0,
                               vmax=1,
                               path=tensorboard,
                               name='cold_board.png')
            writer.add_image('cold_positions',
                             torch.from_numpy(
                                 skimage.io.imread(
                                     os.path.join(tensorboard,
                                                  'cold_board.png'))),
                             0,
                             dataformats='HWC')

            # Extract all value boards, and find extrema
            values = torch.zeros((len(all_possible_moves), m, n))
            for i, a in enumerate(all_possible_moves):
                sample_hat = np.asarray(create_board(a[0], a[1], m, n))

                sample_hat = torch.from_numpy(sample_hat)
                sample_hat = sample_hat.unsqueeze(0).unsqueeze(1).float()
                # The player lives on `device`; its input must too, or the
                # forward pass fails whenever device is not the CPU.
                sample_hat = sample_hat.to(device)

                # Bring the output back to the CPU, where `values` is kept
                # (it is converted with .numpy() for plotting below).
                values[i, :, :] = player(
                    sample_hat).detach().float().cpu().reshape(m, n)

            mean_values = torch.mean(values, 0)
            max_values, _ = torch.max(values, 0)
            min_values, _ = torch.min(values, 0)

            # Log
            writer.add_scalar('Q_mean', torch.mean(mean_values), episode)
            writer.add_scalar('Q_min', torch.mean(min_values), episode)
            writer.add_scalar('Q_max', torch.mean(max_values), episode)

            # Plot mean
            plot_wythoff_board(mean_values.numpy(),
                               vmin=mean_values.numpy().min(),
                               vmax=mean_values.numpy().max(),
                               path=tensorboard,
                               name='player_mean_values.png')
            writer.add_image('mean player',
                             torch.from_numpy(
                                 skimage.io.imread(
                                     os.path.join(tensorboard,
                                                  'player_mean_values.png'))),
                             0,
                             dataformats='HWC')
            # Plot max
            plot_wythoff_board(max_values.numpy(),
                               vmin=max_values.numpy().min(),
                               vmax=max_values.numpy().max(),
                               path=tensorboard,
                               name='player_max_values.png')
            writer.add_image('max player',
                             torch.from_numpy(
                                 skimage.io.imread(
                                     os.path.join(tensorboard,
                                                  'player_max_values.png'))),
                             0,
                             dataformats='HWC')
            # Plot min
            plot_wythoff_board(min_values.numpy(),
                               vmin=min_values.numpy().min(),
                               vmax=min_values.numpy().max(),
                               path=tensorboard,
                               name='player_min_values.png')
            writer.add_image('min player',
                             torch.from_numpy(
                                 skimage.io.imread(
                                     os.path.join(tensorboard,
                                                  'player_min_values.png'))),
                             0,
                             dataformats='HWC')

            # Plot move count
            plot_wythoff_board(moves.count,
                               vmax=moves.count.max() / 10,
                               vmin=0,
                               path=tensorboard,
                               name='moves.png')
            writer.add_image('moves',
                             torch.from_numpy(
                                 skimage.io.imread(
                                     os.path.join(tensorboard, 'moves.png'))),
                             0,
                             dataformats='HWC')

        if monitor and (int(episode) % update_every) == 0:
            # Monitored names are resolved against this function's locals.
            all_variables = locals()
            for k in monitor:
                monitored[k].append(float(all_variables[k]))

    # --------------------------------------------------------------------
    if monitor and save:
        save_monitored(save, monitored)
    if tensorboard:
        writer.close()

    result = {"player": player.state_dict(), "score": score}
    if target is not None:
        result['target'] = target.state_dict()
    if save:
        torch.save(result, save + ".pytorch")

    # Without a save prefix the monitored values are returned instead.
    if monitor and not save:
        result["monitored"] = monitored

    if return_none:
        result = None

    return result
Esempio n. 5
0
def evaluate_wythoff(stumbler=None,
                     strategist=None,
                     stumbler_game='Wythoff10x10',
                     strategist_game='Wythoff50x50',
                     random_stumbler=False,
                     load_model=None,
                     save=None,
                     return_none=False,
                     num_episodes=100,
                     debug=False):
    """Compare stumblers to strategists.

    The stumbler always moves first. Games are played on the strategist's
    board; the stumbler's learned values are only consulted while the
    position lies inside the stumbler's own (smaller) board.

    Params
    ------
    stumbler : a mapping from flattened board tuples to move values, or
        None. A missing stumbler (or a missing board entry) makes the
        stumbler move at random.
    strategist : passed to ``create_bias_board`` to build the strategist's
        hot/cold table; if None an all-zero table is used.
    stumbler_game, strategist_game : environment names for ``create_env``.
    random_stumbler : if true, the stumbler's move is always replaced by a
        uniformly random available move.
    load_model : if given, both agents are loaded with ``load_for_eval``
        and replace the ``stumbler`` / ``strategist`` arguments.
    save : file name for a one-row CSV summary, or None.
    return_none : if true, return None instead of the result.
    num_episodes : number of games to play.
    debug : enable verbose printing.

    Returns
    -------
    (wins, stumbler_score, strategist_score) : tuple of floats
        ``wins`` is the fraction of games won by the strategist; the scores
        are running estimates of how often each agent chose a cold move
        when one was available. None is returned when ``return_none`` is
        set.
    """
    # ------------------------------------------------------------------------
    if load_model is not None:
        stumbler, _, strategist = load_for_eval(load_model)

    # Init boards, etc
    # Strategist
    env = create_env(strategist_game, monitor=False)
    m, n, board, _ = peek(env)
    if strategist is not None:
        hot_cold_table = create_bias_board(m, n, strategist)
    else:
        hot_cold_table = np.zeros_like(board)

    # Stumbler
    o, p, _, _ = peek(create_env(stumbler_game, monitor=False))

    # ------------------------------------------------------------------------
    # A stumbler and a strategist take turns playing a (m,n) game of wythoffs
    wins = 0.0
    strategist_score = 0.0
    stumbler_score = 0.0
    for episode in range(num_episodes):
        # Start the game, and process the result
        x, y, board, available = env.reset()
        board = tuple(flatten_board(board))

        if debug:
            print("---------------------------------------")
            print(">>> NEW MODEL EVALUATION ({}).".format(episode))
            print(">>> Initial position ({}, {})".format(x, y))

        done = False
        while not done:
            # ----------------------------------------------------------------
            # STUMBLER
            if (x < o) and (y < p):
                s_board = tuple(flatten_board(create_board(x, y, o, p)))
                s_available = create_moves(x, y)

                # Consult the learned values when there are any. A stumbler
                # of None used to raise TypeError here (only KeyError was
                # handled); it is now treated like an unknown board.
                move = None
                if stumbler is not None:
                    try:
                        values = stumbler[s_board]
                        move_i = epsilon_greedy(values,
                                                epsilon=0.0,
                                                mode='numpy')
                        move = s_available[move_i]
                    except KeyError:
                        move = None

                if move is None:
                    move_i = np.random.randint(0, len(s_available))
                    move = s_available[move_i]
            else:
                # Outside the stumbler's board: nothing learned, so guess.
                s_available = available
                move_i = np.random.randint(0, len(s_available))
                move = s_available[move_i]

            # ----------------------------------------------------------------
            # RANDOM PLAYER
            if random_stumbler:
                move_i = np.random.randint(0, len(available))
                move = available[move_i]

            # Analyze the choice
            best = 0.0
            if cold_move_available(x, y, s_available):
                if move in locate_cold_moves(x, y, s_available):
                    best = 1.0
                stumbler_score += (best - stumbler_score) / (episode + 1)

            # Move
            (x, y, board, available), reward, done, _ = env.step(move)
            board = tuple(flatten_board(board))
            if debug:
                print(">>> STUMBLER move {}".format(move))

            # The stumbler ended the game; no win for the strategist.
            if done:
                break

            # ----------------------------------------------------------------
            # STRATEGIST
            # Choose.
            hot_cold_move_values = [hot_cold_table[i, j] for i, j in available]
            move_i = epsilon_greedy(
                np.asarray(hot_cold_move_values), epsilon=0.0, mode='numpy')
            move = available[move_i]

            if debug:
                print(">>> STRATEGIST move {}".format(move))

            # Analyze the choice
            best = 0.0
            if cold_move_available(x, y, available):
                if move in locate_cold_moves(x, y, available):
                    best = 1.0
                strategist_score += (best - strategist_score) / (episode + 1)

            # Make a move
            (x, y, board, available), reward, done, _ = env.step(move)
            board = tuple(flatten_board(board))
            if done:
                # The strategist made the final move.
                wins += 1.0
                break

        if debug:
            print("Wins {}, Scores ({}, {})".format(wins, stumbler_score,
                                                    strategist_score))

    if save is not None:
        # Note: the saved `wins` is the raw count, not the fraction.
        np.savetxt(
            save,
            np.asarray([wins, stumbler_score, strategist_score]).reshape(1, 3),
            fmt='%.1f,%.4f,%.4f',
            comments="",
            header="wins,stumbler_score,strategist_score")

    result = (wins / num_episodes), stumbler_score, strategist_score
    if return_none:
        result = None

    return result