Beispiel #1
0
    def __init__(self, m, n, hot_value=-1, cold_value=1):
        """Store the board dimensions and values, then build the board.

        Args:
            m: First board dimension (coerced to int).
            n: Second board dimension (coerced to int).
            hot_value: Value stored as `self.hot_value` (as a float) and
                passed to `create_cold_board` as its `default`.
            cold_value: Value stored as `self.cold_value` (as a float) and
                passed to `create_cold_board` as its `cold_value`.
        """
        self.m = int(m)
        # Fix: this was `int(m)`, which silently ignored `n` and made every
        # board square.
        self.n = int(n)
        self.hot_value = float(hot_value)
        self.cold_value = float(cold_value)

        # The un-coerced values are handed to the board builder, exactly as
        # the caller supplied them.
        self.board = create_cold_board(
            self.m, self.n, cold_value=cold_value, default=hot_value)
Beispiel #2
0
def wythoff_dqn1(epsilon=0.1,
                 gamma=0.8,
                 learning_rate=0.1,
                 num_episodes=10,
                 batch_size=100,
                 memory_capacity=10000,
                 game='Wythoff10x10',
                 network='DQN',
                 anneal=False,
                 tensorboard=None,
                 update_every=5,
                 self_play=False,
                 save=False,
                 save_model=False,
                 monitor=None,
                 return_none=False,
                 debug=False,
                 progress=False,
                 seed=None):
    """Learn to play Wythoff's w/ two DQNs and e-greedy random exploration.

    A `player` and an `opponent` network alternate moves within each game.
    The transitions of each mover are pushed to that mover's replay memory
    (a single shared memory when `self_play` is set), and both networks are
    trained with `train_dqn` at the end of every episode for which
    `episode >= batch_size`.

    Args:
        epsilon: Exploration parameter handed to `e_greedy`.
        gamma: Passed through to `train_dqn`.
        learning_rate: Learning rate of both Adam optimizers.
        num_episodes: Number of games to play.
        batch_size: Passed to `train_dqn`; training (and all logging) is
            also skipped while `episode < batch_size`.
        memory_capacity: Capacity of each `ReplayMemory`.
        game: Environment name handed to `create_env`.
        network: 'DQN' or 'DQN_mlp'.
        anneal: If true, epsilon is scaled by 1 / log(episode + e).
        tensorboard: Log directory, or None to disable tensorboard logging.
        update_every: Log/monitor every this many episodes.
        self_play: If true, player and opponent share one replay memory.
        save: Path prefix; used as `save + ".pytorch"` when `save_model`
            is set, and handed to `save_monitored` when `monitor` is set.
        save_model: If true, save both networks with `torch.save`.
        monitor: Names of local variables to record (read via `locals()`),
            or None.
        return_none: If true, return None instead of the result.
        debug: Print verbose per-move information.
        progress: Print per-episode summaries.
        seed: Passed to `env.seed` and `np.random.seed`.

    Returns:
        `((player, opponent), (score / episode, total_reward))`, or None
        when `return_none` is set.

    Raises:
        ValueError: If `network` is not 'DQN' or 'DQN_mlp'.
    """

    # ------------------------------------------------------------------------
    # Init
    # NOTE(review): the networks below are never moved to `device`; only the
    # stored transitions are. Presumably `train_dqn` deals with placement --
    # confirm before running on CUDA.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Logs...
    if tensorboard is not None:
        os.makedirs(tensorboard, exist_ok=True)
        writer = SummaryWriter(log_dir=tensorboard)

    if monitor is not None:
        monitored = create_monitored(monitor)

    # Env... (the env's own monitor is enabled only when tensorboard is on)
    env = create_env(game, monitor=tensorboard is not None)
    env.seed(seed)
    np.random.seed(seed)

    # ------------------------------------------------------------------------
    # Init
    #
    # Scores
    score = 0
    total_reward = 0

    # Agents, etc
    m, n, board, available = peek(env)
    all_possible_moves = create_all_possible_moves(m, n)
    if network == 'DQN':
        player = DQN(m, n, num_actions=len(all_possible_moves))
        opponent = DQN(m, n, num_actions=len(all_possible_moves))
    elif network == 'DQN_mlp':
        player = DQN_mlp(m, n, num_actions=len(all_possible_moves))
        opponent = DQN_mlp(m, n, num_actions=len(all_possible_moves))
    else:
        raise ValueError("network must be DQN or DQN_mlp")
    if debug:
        print(f"---------------------------------------")
        print("Setting up....")
        print(f">>> Network is {player}")
        print(f">>> Memory capacity {memory_capacity} ({batch_size})")

    player_memory = ReplayMemory(memory_capacity)
    opponent_memory = ReplayMemory(memory_capacity)
    if self_play:
        # Both movers read and write the very same memory object.
        player_memory = opponent_memory

    player_optimizer = optim.Adam(player.parameters(), learning_rate)
    opponent_optimizer = optim.Adam(opponent.parameters(), learning_rate)

    moves = MoveCount(m, n)

    # ------------------------------------------------------------------------
    for episode in range(1, num_episodes + 1):
        # Re-init
        #
        # Scores
        steps = 1
        done = False
        mover = 'opponent'  # This will shift to player on the first move.
        transitions = []

        # Worlds
        state = env.reset()
        x, y, board, available = state
        board = tuple(flatten_board(board))
        moves.update((x, y))
        if debug:
            print(f"---------------------------------------")
            print(f">>> NEW GAME ({episode}).")
            print(f">>> Initial position ({x}, {y})")
            print(f">>> Initial moves {available}")
            print(f">>> Cold available {locate_cold_moves(x, y, available)}")
            print(f">>> All cold {locate_all_cold_moves(x, y)}")

        # Anneal epsilon?
        if anneal:
            epsilon_e = epsilon * (1.0 / np.log((episode + np.e)))
        else:
            epsilon_e = epsilon

        # -------------------------------------------------------------------
        # Play a game
        while not done:
            # Choose a mover
            mover = shift_mover(mover)
            memory = shift_memory(mover, player_memory, opponent_memory)
            model = shift_model(mover, player, opponent)

            # Convert board to a model(state): shape (1, 1, m, n), float
            state_hat = torch.from_numpy(np.array(board).reshape(m, n))
            state_hat = state_hat.unsqueeze(0).unsqueeze(1).float()

            # Get and filter Qs
            Qs = model(state_hat).float().detach()  # torch
            Qs = Qs.numpy().squeeze()

            # Zero the Q values of every move that is not available.
            mask = build_mask(available, m, n).flatten()
            Qs *= mask

            # Choose a move among the unmasked entries
            index = np.nonzero(mask)[0].tolist()
            move_i = e_greedy(Qs, epsilon=epsilon_e, index=index, mode='numpy')

            # Re-index move_i to match 'available' index
            move_a = index.index(move_i)
            move = available[move_a]

            # Analyze it...
            if move in locate_cold_moves(x, y, available):
                score += (1 - score) / episode

            # Play it
            state_next, reward, done, _ = env.step(move)
            (x_next, y_next, board_next, available_next) = state_next
            total_reward += reward

            # Save transitions, as tensors to be used at training time
            moves.update(move)

            state_hat_next = torch.from_numpy(
                np.array(board_next).reshape(m, n))
            state_hat_next = state_hat_next.unsqueeze(0).unsqueeze(1).float()

            # Layout: [state, mask, action index, next state, reward]
            transitions.append([
                state_hat.float(),
                torch.from_numpy(mask),
                torch.tensor(move_i),
                state_hat_next.float(),
                torch.tensor([reward]).unsqueeze(0).float()
            ])

            # Shift states
            state = deepcopy(state_next)
            board = deepcopy(board_next)
            available = deepcopy(available_next)
            x = deepcopy(x_next)
            y = deepcopy(y_next)
            steps += 1

            # -
            if debug:
                print(f">>> {mover}: {move}")
                print(f">>> new position: ({x_next}, {y_next})")

        # ----------------------------------------------------------------
        # Learn from the game
        #
        # Find the losers transition and update its reward w/ -reward
        if steps > 2:
            transitions[-2][4] = transitions[-1][4] * -1

        # Update the memories using the transitions from this game. Even
        # indices were played by the player, odd ones by the opponent.
        # Fix: the mask is unpacked into `mask_t` rather than `x`, so the
        # board coordinate `x` is no longer clobbered by a tensor (it is
        # visible to `monitor` through `locals()`).
        for s, mask_t, a, sn, r in transitions[0::2]:
            player_memory.push(s.to(device), mask_t.to(device), a.to(device),
                               sn.to(device), r.to(device))
        for s, mask_t, a, sn, r in transitions[1::2]:
            opponent_memory.push(s.to(device), mask_t.to(device),
                                 a.to(device), sn.to(device), r.to(device))

        # Bypass if we don't have enough in memory to learn.
        # NOTE(review): this gates on the episode count, not on the number
        # of stored transitions, and it also skips all logging below.
        if episode < batch_size:
            continue

        # Learn, sampling batches of transitions from memory
        player, player_loss = train_dqn(batch_size,
                                        player,
                                        player_memory,
                                        player_optimizer,
                                        device,
                                        gamma=gamma)
        opponent, opponent_loss = train_dqn(batch_size,
                                            opponent,
                                            opponent_memory,
                                            opponent_optimizer,
                                            device,
                                            gamma=gamma)

        # ----------------------------------------------------------------
        # Logs...
        if progress:
            print(f"---")
        if progress or debug:
            print(f">>> episode: {episode}")
            print(f">>> winner: {mover}")
            print(f">>> Q: {Qs}")
            print(f">>> max(Q): {Qs.max()}")
            print(f">>> min(Q): {Qs.min()}")
            print(f">>> stdev(Q): {Qs.std()}")
            print(
                f">>> loss (player: {player_loss}, opponent: {opponent_loss})")
            print(f">>> player score: {score}")
            print(f">>> epsilon: {epsilon_e}")

        if tensorboard and (int(episode) % update_every) == 0:
            writer.add_scalar('reward', reward, episode)
            writer.add_scalar('epsilon_e', epsilon_e, episode)
            writer.add_scalar('player_loss', player_loss, episode)
            writer.add_scalar('opponent_loss', opponent_loss, episode)
            writer.add_scalar('steps', steps, episode)
            writer.add_scalar('score', score, episode)

            # Cold ref:
            cold = create_cold_board(m, n)
            plot_wythoff_board(cold,
                               vmin=0,
                               vmax=1,
                               path=tensorboard,
                               name='cold_board.png')
            writer.add_image('cold_positions',
                             torch.from_numpy(
                                 skimage.io.imread(
                                     os.path.join(tensorboard,
                                                  'cold_board.png'))),
                             0,
                             dataformats='HWC')

            # Extract all value boards, and find extrema.
            # Fix: each position's own board is now fed to the network.
            # Previously `example` was built but the last game state
            # (`state_hat`) was evaluated on every iteration, so all
            # slices of `values` were identical.
            values = torch.zeros((len(all_possible_moves), m, n))
            for i, a in enumerate(all_possible_moves):
                example = np.asarray(create_board(a[0], a[1], m, n))
                example_hat = torch.from_numpy(example.reshape(m, n))
                example_hat = example_hat.unsqueeze(0).unsqueeze(1).float()
                values[i, :, :] = player(
                    example_hat).detach().float().reshape(m, n)
            mean_values = torch.mean(values, 0)

            # Log
            writer.add_scalar('Q_mean', torch.mean(mean_values), episode)

            # Plot mean
            plot_wythoff_board(mean_values.numpy(),
                               vmin=mean_values.numpy().min(),
                               vmax=mean_values.numpy().max(),
                               path=tensorboard,
                               name='player_mean_values.png')
            writer.add_image('mean player',
                             torch.from_numpy(
                                 skimage.io.imread(
                                     os.path.join(tensorboard,
                                                  'player_mean_values.png'))),
                             0,
                             dataformats='HWC')

            # Plot move count
            plot_wythoff_board(moves.count,
                               vmax=moves.count.max() / 10,
                               vmin=0,
                               path=tensorboard,
                               name='moves.png')
            writer.add_image('moves',
                             torch.from_numpy(
                                 skimage.io.imread(
                                     os.path.join(tensorboard, 'moves.png'))),
                             0,
                             dataformats='HWC')

        if monitor and (int(episode) % update_every) == 0:
            # Monitored names are looked up among this function's locals.
            all_variables = locals()
            for k in monitor:
                monitored[k].append(float(all_variables[k]))

    # --------------------------------------------------------------------
    if save_model:
        # NOTE(review): the network objects themselves (not their
        # state_dicts) are saved, and `save` must be a string here.
        state = {
            'stumbler_player_dict': player,
            'stumbler_opponent_dict': opponent
        }
        torch.save(state, save + ".pytorch")
    if monitor:
        save_monitored(save, monitored)
    if tensorboard:
        writer.close()

    result = (player, opponent), (score / episode, total_reward)
    if return_none:
        result = None

    return result
Beispiel #3
0
def wythoff_dqn2(epsilon=0.1,
                 gamma=0.5,
                 learning_rate=1e-6,
                 num_episodes=100,
                 batch_size=20,
                 memory_capacity=100,
                 game='Wythoff10x10',
                 network='DQN_xy1',
                 anneal=False,
                 tensorboard=None,
                 update_every=5,
                 double=False,
                 double_update=10,
                 save=False,
                 save_model=False,
                 monitor=None,
                 return_none=False,
                 debug=False,
                 device='cpu',
                 clip_grad=False,
                 progress=False,
                 zero=False,
                 seed=None):
    """Learn Wythoff's game with a single DQN.

    One network (`player`) chooses every move of every game. All
    transitions of a game go to one replay memory, and the network is
    trained with `train_dqn` at the end of each episode for which
    `episode >= batch_size`.

    Args:
        epsilon: Exploration parameter handed to `e_greedy`.
        gamma: Passed through to `train_dqn`.
        learning_rate: Learning rate of the SGD optimizer.
        num_episodes: Number of games to play (coerced to int).
        batch_size: Passed to `train_dqn` (coerced to int); training and
            logging are also skipped while `episode < batch_size`.
        memory_capacity: Capacity of the `ReplayMemory` (coerced to int).
        game: Environment name handed to `create_env`.
        network: Either an object with a `forward` attribute, which is then
            called with no arguments to build each model, or the name of
            an attribute of `azad.models`.
        anneal: If true, epsilon is scaled by 1 / log(episode + e).
        tensorboard: Log directory, or None to disable tensorboard logging.
        update_every: Log/monitor every this many episodes (coerced to int).
        double: If true, keep a target network that is handed to
            `train_dqn` and re-synced from `player` periodically.
        double_update: Sync the target network every this many episodes.
        save: Path prefix; when truthy the result dict is written to
            `save + ".pytorch"` and monitored values go to `save_monitored`.
        save_model: Not used in this function body.
        monitor: Names of local variables to record (read via `locals()`),
            or None.
        return_none: If true, return None instead of the result.
        debug: Print verbose per-move information.
        device: Device the networks are moved to; also given to
            `build_Qs` and `train_dqn`.
        clip_grad: Passed through to `train_dqn`.
        progress: Print per-episode summaries.
        zero: Not used in this function body.
        seed: Passed to `env.seed` and `np.random.seed`.

    Returns:
        A dict with keys "player" (state dict) and "score", plus "target"
        when a target network exists and "monitored" when `monitor` is
        truthy and `save` is falsy. None when `return_none` is set.
    """

    # ------------------------------------------------------------------------
    # Init
    # Coerce count-like arguments to int.
    num_episodes = int(num_episodes)
    batch_size = int(batch_size)
    memory_capacity = int(memory_capacity)
    update_every = int(update_every)

    # Logs...
    if tensorboard is not None:
        # Create the log directory; an already-existing path is not an error.
        try:
            os.makedirs(tensorboard)
        except OSError as exception:
            if exception.errno != errno.EEXIST:
                raise
        writer = SummaryWriter(log_dir=tensorboard)

    if monitor is not None:
        monitored = create_monitored(monitor)

    # Env... (the env's own monitor is enabled only when tensorboard is on)
    if tensorboard is not None:
        env = create_env(game, monitor=True)
    else:
        env = create_env(game, monitor=False)
    env.seed(seed)
    np.random.seed(seed)

    # ------------------------------------------------------------------------
    # Init
    #
    # Scores
    score = 0
    total_reward = 0

    # Agents, etc
    m, n, board, available = peek(env)
    all_possible_moves = create_all_possible_moves(m, n)

    # Is network a nn.Module?
    if hasattr(network, "forward"):
        Model = network
    # Is it the name of a azad model?
    else:
        Model = getattr(azad.models, network)

    player = Model().to(device)
    target = Model().to(device)

    if double:
        # Start the target network as an exact copy of the player.
        target.load_state_dict(player.state_dict())
        target.eval()
    else:
        # No target network: `train_dqn` receives target=None.
        target = None

    if debug:
        print(f"---------------------------------------")
        print("Setting up....")
        print(f">>> Device: {device}")
        print(f">>> Network is {player}")
        print(f">>> Memory capacity {memory_capacity} ({batch_size})")

    memory = ReplayMemory(memory_capacity)
    # optimizer = optim.Adam(player.parameters(), learning_rate)
    optimizer = optim.SGD(player.parameters(), learning_rate)
    moves = MoveCount(m, n)
    # `opts` is not referenced again below; it stays reachable only through
    # `locals()` (see the monitor block).
    opts = OptimalCount(0)

    # ------------------------------------------------------------------------
    for episode in range(1, num_episodes + 1):
        # Re-init
        transitions = []
        state = env.reset()
        x, y, board, available = state
        moves.update((x, y))
        if debug:
            print(f"---------------------------------------")
            print(f">>> NEW GAME ({episode}).")
            print(f">>> Initial position ({x}, {y})")
            print(f">>> Initial moves {available}")
            print(f">>> Cold available {locate_cold_moves(x, y, available)}")
            print(f">>> All cold {locate_all_cold_moves(x, y)}")

        # Anneal epsilon?
        if anneal:
            epsilon_e = epsilon * (1.0 / np.log((episode + np.e)))
        else:
            epsilon_e = epsilon

        # -------------------------------------------------------------------
        # Play a game
        steps = 1
        done = False
        while not done:
            # Choose a move; here `move_i` indexes `available` directly.
            Qs = build_Qs(player,
                          state,
                          available,
                          device=device,
                          mode="numpy")
            move_i = e_greedy(Qs, epsilon=epsilon_e, mode='numpy')
            move = available[move_i]
            moves.update(move)

            # Analyze it... The score is only updated on turns where a cold
            # move was on offer; it moves toward 1.0 if one was taken and
            # toward 0.0 otherwise.
            best = 0.0
            if cold_move_available(x, y, available):
                if move in locate_cold_moves(x, y, available):
                    best = 1.0
                score += (best - score) / (episode + 1)

            # Play it
            state_next, reward, done, _ = env.step(move)
            (x_next, y_next, board_next, available_next) = state_next

            # Track value statistics.
            # `prediction_error` and `advantage` are not read again in this
            # function; they are reachable only by name through `monitor`.
            total_reward += reward
            Q = Qs[move_i]
            prediction_error = Qs.max() - Q
            advantage = Q - Qs[np.nonzero(Qs)].mean()

            # Save transitions, as tensors to be used at training time.
            # Layout: [S, A, S', R], with states encoded as (x, y) pairs.
            # NOTE(review): no `.to(device)` is applied here; presumably
            # `train_dqn` moves batches to the device -- confirm.
            transitions.append([
                # S
                torch.tensor((x, y)).unsqueeze(0).unsqueeze(1).float(),
                # A
                torch.tensor(move).unsqueeze(0),
                # S'
                torch.tensor(
                    (x_next, y_next)).unsqueeze(0).unsqueeze(1).float(),
                # R
                torch.tensor([reward]).unsqueeze(0).float(),
            ])

            # -
            if debug:
                print(f">>> position: {(x, y)}")
                print(f">>> num available: {len(available)}")
                print(f">>> available: {available}")
                print(f">>> Qs (filtered): {Qs}")
                print(f">>> new position: ({x_next}, {y_next})")

            # Shift states
            state = deepcopy(state_next)
            board = deepcopy(board_next)
            available = deepcopy(available_next)
            x = deepcopy(x_next)
            y = deepcopy(y_next)

            steps += 1

        # ----------------------------------------------------------------
        # Learn from the game
        #
        # Find the loser's transition (the one before the final move) and
        # set its reward to the negated final reward.
        if steps > 2:
            transitions[-2][3] = transitions[-1][3] * -1

        # Update the memories using the transitions from this game
        for i in range(0, len(transitions)):
            memory.push(*transitions[i])

        if debug:
            print(f">>> final transitions: {transitions[-2:]}")

        # Bypass if we don't have enough in memory to learn.
        # NOTE(review): this gates on the episode count, not on the number
        # of stored transitions, and it also skips all logging below.
        if episode < batch_size:
            continue

        # Learn, sampling a batch of transitions from memory
        player, loss = train_dqn(batch_size,
                                 player,
                                 memory,
                                 optimizer,
                                 device,
                                 target=target,
                                 gamma=gamma,
                                 clip_grad=clip_grad)

        # Update target net, if in double mode and time is right.
        if double and (episode % double_update == 0):
            target.load_state_dict(player.state_dict())

        # ----------------------------------------------------------------
        # Logs...
        if progress:
            print(f"---")
        if progress or debug:
            print(f">>> episode: {episode}")
        if debug or progress:
            print(f">>> loss {loss}")
            print(f">>> Q(last,a): {Q}")
            print(f">>> epsilon: {epsilon_e}")
            print(f">>> score: {score}")

        if tensorboard and (int(episode) % update_every) == 0:
            # `reward` here is the reward of the final move of this game.
            writer.add_scalar('reward', reward, episode)
            writer.add_scalar('epsilon_e', epsilon_e, episode)
            writer.add_scalar('loss', loss, episode)
            writer.add_scalar('steps', steps, episode)
            writer.add_scalar('score', score, episode)

            # Cold ref:
            cold = create_cold_board(m, n)
            plot_wythoff_board(cold,
                               vmin=0,
                               vmax=1,
                               path=tensorboard,
                               name='cold_board.png')
            writer.add_image('cold_positions',
                             torch.from_numpy(
                                 skimage.io.imread(
                                     os.path.join(tensorboard,
                                                  'cold_board.png'))),
                             0,
                             dataformats='HWC')

            # Extract all value boards, and find extrema.
            # NOTE(review): the network is fed a (1, 1, m, n) board here,
            # whereas the transitions above encode states as (x, y) pairs,
            # and `sample_hat` is not moved to `device` -- confirm this
            # matches the configured `network` and device.
            values = torch.zeros((len(all_possible_moves), m, n))
            for i, a in enumerate(all_possible_moves):
                sample_hat = np.asarray(create_board(a[0], a[1], m, n))

                sample_hat = torch.from_numpy(sample_hat)
                sample_hat = sample_hat.unsqueeze(0).unsqueeze(1).float()

                values[i, :, :] = player(sample_hat).detach().float().reshape(
                    m, n)

            # Reduce over the sampled-position axis.
            mean_values = torch.mean(values, 0)
            max_values, _ = torch.max(values, 0)
            min_values, _ = torch.min(values, 0)

            # Log
            writer.add_scalar('Q_mean', torch.mean(mean_values), episode)
            writer.add_scalar('Q_min', torch.mean(min_values), episode)
            writer.add_scalar('Q_max', torch.mean(max_values), episode)

            # Plot mean
            plot_wythoff_board(mean_values.numpy(),
                               vmin=mean_values.numpy().min(),
                               vmax=mean_values.numpy().max(),
                               path=tensorboard,
                               name='player_mean_values.png')
            writer.add_image('mean player',
                             torch.from_numpy(
                                 skimage.io.imread(
                                     os.path.join(tensorboard,
                                                  'player_mean_values.png'))),
                             0,
                             dataformats='HWC')
            # Plot max
            plot_wythoff_board(max_values.numpy(),
                               vmin=max_values.numpy().min(),
                               vmax=max_values.numpy().max(),
                               path=tensorboard,
                               name='player_max_values.png')
            writer.add_image('max player',
                             torch.from_numpy(
                                 skimage.io.imread(
                                     os.path.join(tensorboard,
                                                  'player_max_values.png'))),
                             0,
                             dataformats='HWC')
            # Plot min
            plot_wythoff_board(min_values.numpy(),
                               vmin=min_values.numpy().min(),
                               vmax=min_values.numpy().max(),
                               path=tensorboard,
                               name='player_min_values.png')
            writer.add_image('min player',
                             torch.from_numpy(
                                 skimage.io.imread(
                                     os.path.join(tensorboard,
                                                  'player_min_values.png'))),
                             0,
                             dataformats='HWC')

            # Plot move count
            plot_wythoff_board(moves.count,
                               vmax=moves.count.max() / 10,
                               vmin=0,
                               path=tensorboard,
                               name='moves.png')
            writer.add_image('moves',
                             torch.from_numpy(
                                 skimage.io.imread(
                                     os.path.join(tensorboard, 'moves.png'))),
                             0,
                             dataformats='HWC')

        if monitor and (int(episode) % update_every) == 0:
            # Monitored names are looked up among this function's locals,
            # so renaming any local variable changes what can be monitored.
            all_variables = locals()
            for k in monitor:
                monitored[k].append(float(all_variables[k]))

    # --------------------------------------------------------------------
    # Monitored values go to disk when a save path is given; otherwise they
    # are attached to the returned dict further down.
    if monitor and save:
        save_monitored(save, monitored)
    if tensorboard:
        writer.close()

    result = {"player": player.state_dict(), "score": score}
    if target is not None:
        result['target'] = target.state_dict()
    if save:
        torch.save(result, save + ".pytorch")

    if monitor and not save:
        result["monitored"] = monitored

    if return_none:
        result = None

    return result
Beispiel #4
0
def wythoff_oracular_strategy(num_episodes=1000,
                              learning_rate=0.025,
                              num_hidden1=100,
                              num_hidden2=25,
                              stumbler_game='Wythoff10x10',
                              strategist_game='Wythoff50x50',
                              tensorboard=None,
                              update_every=50,
                              save=None,
                              return_none=False,
                              debug=False,
                              seed=None):
    """Train a strategist layer on perfect data.

    The training target is the cold board of the `stumbler_game` size
    (from `create_cold_board`). A `HotCold3` model is fit to
    (coordinate, value) samples drawn from a replay memory, and after each
    episode it is used to build a bias board of the `strategist_game` size.

    Args:
        num_episodes: Number of training episodes.
        learning_rate: Learning rate of the Adam optimizer.
        num_hidden1: Passed to `HotCold3` as `num_hidden1`.
        num_hidden2: Passed to `HotCold3` as `num_hidden2`.
        stumbler_game: Environment whose board size is used for training.
        strategist_game: Environment whose board size is used for the
            bias board.
        tensorboard: Log directory, or None to disable tensorboard logging.
        update_every: Log every this many episodes.
        save: Not used in this function body.
        return_none: If true, return None instead of the result.
        debug: Print verbose training information.
        seed: Passed to `np.random.seed`.

    Returns:
        `(model, loss)`, where `loss` is the loss of the last episode (0.0
        if no training step ran), or None when `return_none` is set.
    """

    # ------------------------------------------------------------------------
    # Setup
    if tensorboard is not None:
        os.makedirs(tensorboard, exist_ok=True)
        writer = SummaryWriter(log_dir=tensorboard)

    # Boards, etc
    m, n, _, _ = peek(create_env(strategist_game))
    o, p, _, _ = peek(create_env(stumbler_game))
    if debug:
        print(">>> TRANING AN OPTIMAL STRATEGIST.")
        # Fix: both dimensions are printed now; the old single-placeholder
        # format string silently dropped the second one.
        print(">>> Train board ({}, {})".format(o, p))
        print(">>> Test board ({}, {})".format(m, n))

    # Seeding...
    np.random.seed(seed)

    # Train params
    strategic_default_value = 0.0
    batch_size = 64

    # ------------------------------------------------------------------------
    # Build a Strategist, its memory, and its optimizer

    # Create a model, of the right size.
    model = HotCold3(2, num_hidden1=num_hidden1, num_hidden2=num_hidden2)

    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    memory = ReplayMemory(10000)

    # Fix: keep `loss` bound even when `num_episodes` is 0, so the return
    # below cannot raise NameError.
    loss = 0.0

    # Run learning episodes. The 'stumbler' is just the opt
    # cold board
    for episode in range(num_episodes):
        # The cold spots are '1' everything else is '0'
        strategic_value = create_cold_board(o, p)

        # ...Into tuples
        s_data = convert_ijv(strategic_value)
        s_data = balance_ijv(s_data, strategic_default_value)

        for d in s_data:
            memory.push(*d)

        loss = 0.0
        if len(memory) > batch_size:
            # Sample data....
            coords = []
            values = []
            samples = memory.sample(batch_size)

            for c, v in samples:
                coords.append(c)
                values.append(v)

            coords = torch.tensor(
                np.vstack(coords), requires_grad=True, dtype=torch.float)
            values = torch.tensor(
                values, requires_grad=False, dtype=torch.float)

            # Making some predictions,
            predicted_values = model(coords).squeeze()

            # and find their loss.
            loss = F.mse_loss(predicted_values, values)

            # Walk down the hill of righteousness!
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            if debug:
                print(">>> Coords: {}".format(coords))
                print(">>> Values: {}".format(values))
                # Fix: this line used to print the targets a second time.
                print(">>> Predicted values: {}".format(predicted_values))
                print(">>> Loss {}".format(loss))

        # Use the trained strategist to generate a bias_board,
        bias_board = create_bias_board(m, n, model)

        if tensorboard and (int(episode) % update_every) == 0:
            writer.add_scalar(
                os.path.join(tensorboard, 'error'), loss, episode)

            # Fix: images read back from disk are height x width x channel,
            # so say so explicitly, as the other training functions in this
            # file already do.
            plot_wythoff_board(
                strategic_value,
                vmin=0,
                vmax=1,
                path=tensorboard,
                name='strategy_board_{}.png'.format(episode))
            writer.add_image(
                'Training board',
                skimage.io.imread(
                    os.path.join(tensorboard,
                                 'strategy_board_{}.png'.format(episode))),
                dataformats='HWC')

            plot_wythoff_board(
                bias_board,
                vmin=0,
                vmax=1,
                path=tensorboard,
                name='bias_board_{}.png'.format(episode))
            writer.add_image(
                'Testing board',
                skimage.io.imread(
                    os.path.join(tensorboard,
                                 'bias_board_{}.png'.format(episode))),
                dataformats='HWC')

    # The end
    if tensorboard:
        writer.close()

    # Suppress return for parallel runs?
    result = (model), (loss)
    if return_none:
        result = None

    return result
Beispiel #5
0
def wythoff_strategist(stumbler_model,
                       stumbler_game,
                       num_episodes=1000,
                       cold_threshold=0.0,
                       hot_threshold=0.5,
                       hot_value=1,
                       cold_value=-1,
                       learning_rate=0.01,
                       game='Wythoff50x50',
                       model=None,
                       num_hidden1=100,
                       num_hidden2=25,
                       initial=0,
                       score=0.0,
                       tensorboard=None,
                       stumbler_mode='numpy',
                       balance_cold=False,
                       reflect_cold=True,
                       update_every=50,
                       save=None,
                       load_model=None,
                       save_model=False,
                       monitor=None,
                       return_none=False,
                       debug=False,
                       heuristic=True,
                       seed=None):
    """Learn a generalizable strategy for Wythoffs game.

    A board of target values is extracted from ``stumbler_model`` and
    converted to (coordinate, value) samples. A strategist network is then
    regressed onto those samples with an MSE loss and the Adam optimizer.

    Params
    ------
    stumbler_model : the trained stumbler the targets are extracted from.
    stumbler_game : name of the stumbler's game; its board size (o, p) is
        read with ``peek(create_env(stumbler_game, ...))``.
    num_episodes : number of optimization steps to run.
    cold_threshold, hot_threshold : thresholds passed to the heuristic
        estimators. If ``hot_threshold`` is None only ``estimate_cold`` is
        used; if ``cold_threshold`` is None only ``estimate_hot`` is used;
        otherwise ``estimate_hot_cold`` is used.
    hot_value, cold_value : target values passed to the estimators.
        ``hot_value`` is also the default of the reference cold board used
        for scoring, and ``cold_value`` is passed to ``balance_ijv``.
    learning_rate : learning rate given to Adam.
    game : name of the strategist's game; defines the (m, n) board the
        strategist is scored on.
    model : an existing strategist. If None, one is created with
        ``init_strategist(num_hidden1, num_hidden2)``.
    num_hidden1, num_hidden2 : layer sizes for a newly created strategist;
        also stored alongside the weights when ``save_model`` is set.
    initial : index of the first episode (episodes run over
        ``range(initial, initial + num_episodes)``).
    score, stumbler_mode : accepted but not used in this function body.
    tensorboard : log directory. If not None the directory is created and
        scalars/images are written every ``update_every`` episodes.
    balance_cold : if True, pass the samples through ``balance_ijv``.
    reflect_cold : forwarded to the cold estimators.
    update_every : logging/monitoring period, in episodes.
    save : path prefix. Weights go to ``save + ".pytorch"`` and ``save`` is
        passed to ``save_monitored``.
    load_model : if not None, weights are loaded with ``load_strategist``.
    save_model : if True, save the strategist state when done.
    monitor : names of local variables to record every ``update_every``
        episodes. They are looked up by name in ``locals()``, so the local
        variable names in this function are part of its interface.
    return_none : if True, return None instead of the result tuple.
    debug : if True, print per-episode diagnostics.
    heuristic : if True use the hot/cold estimators, otherwise use
        ``expected_value`` of the stumbler as the target.
    seed : seed given to the env and to ``np.random``.

    Returns
    -------
    ``(model, mae)``, where ``mae`` is the median absolute difference
    between the strategist's bias board and the reference cold board.
    Returns ``(model, None)`` when no training samples could be extracted,
    and ``None`` when ``return_none`` is set.
    """

    # ------------------------------------------------------------------------
    # Setup
    if tensorboard is not None:
        try:
            os.makedirs(tensorboard)
        except OSError as exception:
            # An already existing log dir is fine; anything else is not.
            if exception.errno != errno.EEXIST:
                raise

        writer = SummaryWriter(log_dir=tensorboard)

    # Create env and find all moves in it

    # Create env
    if tensorboard is not None:
        env = create_env(game, monitor=True)
    else:
        env = create_env(game, monitor=False)
    env.seed(seed)
    np.random.seed(seed)

    # (o, p) is the stumbler's board size; (m, n) the strategist's.
    o, p, _, _ = peek(create_env(stumbler_game, monitor=False))

    m, n, board, _ = peek(env)
    # NOTE(review): the result is not used below - confirm it can be dropped.
    all_possible_moves = create_all_possible_moves(m, n)

    # Watch vars?
    if monitor:
        monitored = create_monitored(monitor)

    # Init strategist
    if model is None:
        model = init_strategist(num_hidden1, num_hidden2)

    # Add old weights from file?
    if load_model is not None:
        if debug:
            print(">>> Loading model from {}".format(load_model))
        model = load_strategist(model, load_model)

    # Init SGD.
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    # ------------------------------------------------------------------------
    # Extract strategic data from the stumbler
    #
    # NOTE(review): estimate_cold/estimate_hot are called with the strategist
    # board size (m, n) while estimate_hot_cold and expected_value get the
    # stumbler board size (o, p) - confirm this difference is intended.
    strategic_default_value = 0.0
    if heuristic:
        if hot_threshold is None:
            strategic_value = estimate_cold(
                m,
                n,
                stumbler_model,
                threshold=cold_threshold,
                value=cold_value,
                reflect=reflect_cold,
                default_value=strategic_default_value)
        elif cold_threshold is None:
            strategic_value = estimate_hot(
                m,
                n,
                stumbler_model,
                threshold=hot_threshold,
                value=hot_value,
                default_value=strategic_default_value)
        else:
            strategic_value = estimate_hot_cold(
                o,
                p,
                stumbler_model,
                hot_threshold=hot_threshold,
                cold_threshold=cold_threshold,
                hot_value=hot_value,
                cold_value=cold_value,
                reflect_cold=reflect_cold,
                default_value=strategic_default_value)
    else:
        strategic_value = expected_value(
            o, p, stumbler_model, default_value=strategic_default_value)

    # Convert format.
    s_data = convert_ijv(strategic_value)
    if balance_cold:
        s_data = balance_ijv(s_data, cold_value)

    # Sanity? Nothing to learn from; hand the model back untouched.
    if s_data is None:
        if tensorboard is not None:
            # Don't leak the event-file handle on the early exit.
            writer.close()
        return model, None

    # Define a memory to sample. Its capacity and the batch size are both
    # len(s_data), i.e. each step requests as many samples as were stored.
    memory = ReplayMemory(len(s_data))
    batch_size = len(s_data)
    for d in s_data:
        memory.push(*d)

    # ------------------------------------------------------------------------
    # Sample the memory to teach the strategist
    bias_board = None
    for episode in range(initial, initial + num_episodes):
        if debug:
            print("---------------------------------------")
            print(">>> STRATEGIST ({}).".format(episode))

        coords = []
        values = []
        for c, v in memory.sample(batch_size):
            coords.append(c)
            values.append(v)
        coords = torch.tensor(
            np.vstack(coords), requires_grad=True, dtype=torch.float)
        values = torch.tensor(values, requires_grad=False, dtype=torch.float)

        # Making some predictions, ...
        predicted_values = model(coords).squeeze()

        # and learn from them
        loss = F.mse_loss(predicted_values, values)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # --------------------------------------------------------------------
        if debug:
            print(">>> Coords: {}".format(coords))
            print(">>> Values: {}".format(values))
            # Show the network's output, not the targets a second time.
            print(">>> Predicted values: {}".format(predicted_values))
            print(">>> Loss {}".format(loss))

        if tensorboard and (int(episode) % update_every) == 0:
            # Timecourse
            writer.add_scalar('stategist_error', loss, episode)

            bias_board = create_bias_board(m, n, model)
            plot_wythoff_board(
                bias_board,
                vmin=-1.5,
                vmax=1.5,
                path=tensorboard,
                height=10,
                width=15,
                name='bias_board.png')
            writer.add_image(
                'strategist',
                skimage.io.imread(os.path.join(tensorboard, 'bias_board.png')))

        if monitor and (int(episode) % update_every) == 0:
            # Score the model:
            with th.no_grad():
                pred = create_bias_board(m, n, model, default=0.0).numpy()
                cold = create_cold_board(m, n, default=hot_value)
                mae = np.median(np.abs(pred - cold))

            # Monitored names are resolved against this function's locals.
            all_variables = locals()
            for k in monitor:
                monitored[k].append(float(all_variables[k]))

    # Final score for the model:
    with th.no_grad():
        pred = create_bias_board(m, n, model, default=0.0).numpy()
        cold = create_cold_board(m, n, default=hot_value)
        mae = np.median(np.abs(pred - cold))

    # Save?
    if save_model:
        state = {
            'strategist_model_dict': model.state_dict(),
            "num_hidden1": num_hidden1,
            "num_hidden2": num_hidden2
        }
        th.save(state, save + ".pytorch")

    if monitor:
        save_monitored(save, monitored)

    # The end; flush and release the tensorboard log.
    if tensorboard is not None:
        writer.close()

    # Suppress return for parallel runs?
    result = (model), (mae)
    if return_none:
        result = None

    return result
Beispiel #6
0
def wythoff_stumbler(num_episodes=10,
                     epsilon=0.1,
                     gamma=0.8,
                     learning_rate=0.1,
                     game='Wythoff10x10',
                     model=None,
                     opponent=None,
                     anneal=False,
                     bias_board=None,
                     influence=0.0,
                     score=0.0,
                     total_reward=0.0,
                     tensorboard=None,
                     update_every=5,
                     initial=0,
                     self_play=False,
                     save=False,
                     load_model=None,
                     save_model=False,
                     monitor=None,
                     return_none=False,
                     debug=False,
                     seed=None):
    """Learn to play Wythoff's w/ e-greedy random exploration.

    Note: Learning is based on a player-opponent joint action formalism
    and tabular Q-learning.

    Each episode is one complete game in which the player (``model``) and
    the ``opponent`` alternate moves. The game is recorded, and afterwards
    both Q tables are updated by unrolling the recorded trajectory.

    Params
    ------
    num_episodes : number of games to play.
    epsilon : exploration rate handed to ``epsilon_greedy``. With
        ``anneal=True`` it is scaled by ``1 / log(episode + e)``.
    gamma : discount applied to the max Q value of the next state.
    learning_rate : step size of the tabular Q update.
    game : name of the environment, passed to ``create_env``.
    model, opponent : dicts mapping a flattened board (tuple) to an array
        of Q values, one per available move. New dicts are used if None.
    anneal : if True, decay epsilon with the episode index.
    bias_board, influence : forwarded to ``add_bias_board`` when choosing
        a move.
    score : running estimate of how often the player picked a cold move
        when one was available; updated in place and returned.
    total_reward : running sum of the player's rewards; returned.
    tensorboard : log directory. If not None the directory is created and
        scalars/images are written every ``update_every`` episodes.
    update_every : logging/monitoring period, in episodes.
    initial : index of the first episode (episodes run over
        ``range(initial, initial + num_episodes)``).
    self_play : accepted but not used in this function body.
    save : path prefix. Tables go to ``save + ".pytorch"`` and ``save`` is
        passed to ``save_monitored``.
    load_model : if not None, both tables are loaded with
        ``load_stumbler``.
    save_model : if True, save both Q tables when done.
    monitor : names of local variables to record every ``update_every``
        episodes. They are looked up by name in ``locals()``, so the local
        variable names in this function are part of its interface.
    return_none : if True, return None instead of the result tuple.
    debug : if True, print per-game diagnostics.
    seed : seed given to the env and to ``np.random``.

    Returns
    -------
    ``((model, opponent), (score, total_reward))``, or ``None`` when
    ``return_none`` is set.
    """

    # ------------------------------------------------------------------------
    # Init env
    if tensorboard is not None:
        try:
            os.makedirs(tensorboard)
        except OSError as exception:
            # An already existing log dir is fine; anything else is not.
            if exception.errno != errno.EEXIST:
                raise
        writer = SummaryWriter(log_dir=tensorboard)

    # Create env
    if tensorboard is not None:
        env = create_env(game, monitor=True)
    else:
        env = create_env(game, monitor=False)
    env.seed(seed)
    np.random.seed(seed)

    if monitor is not None:
        monitored = create_monitored(monitor)

    # ------------------------------------------------------------------------
    # Init Agents
    default_Q = 0.0
    m, n, board, available = peek(env)
    if model is None:
        model = {}
    if opponent is None:
        opponent = {}

    # Override from file?
    if load_model is not None:
        if debug:
            print(">>> Loadiing model/opponent from {}".format(load_model))

        model, opponent = load_stumbler(model, opponent, load_model)

    # ------------------------------------------------------------------------
    for episode in range(initial, initial + num_episodes):
        # Re-init. `steps` starts at 1, so (steps - 1) is the number of
        # moves played in this game.
        steps = 1

        x, y, board, available = env.reset()
        board = tuple(flatten_board(board))
        if debug:
            print("---------------------------------------")
            print(">>> NEW GAME ({}).".format(episode))
            print(">>> Initial position ({}, {})".format(x, y))
            print(">>> Initial moves {}".format(available))
            print("---------------------------------------")

        # Trajectory of this game. t_state/t_available hold one more entry
        # than the move lists because they include the initial position.
        t_state = [
            board,
        ]
        t_available = [available]
        t_move = []
        t_move_i = []
        t_reward = []

        # -------------------------------------------------------------------
        # Anneal epsilon?
        if anneal:
            epsilon_e = epsilon * (1.0 / np.log((episode + np.e)))
        else:
            # Use the requested exploration rate, not the episode index.
            epsilon_e = epsilon

        # -------------------------------------------------------------------
        # Play!
        done = False
        player_win = False
        while not done:
            # PLAYER CHOOSES A MOVE
            try:
                Qs_episode = add_bias_board(model[board], available,
                                            bias_board, influence)
                move_i = epsilon_greedy(
                    Qs_episode, epsilon=epsilon_e, mode='numpy')
            except KeyError:
                # First visit to this board: init its Qs and move randomly.
                model[board] = np.ones(len(available)) * default_Q
                move_i = np.random.randint(0, len(available))

            move = available[move_i]

            # Analyze it...
            best = 0.0
            if cold_move_available(x, y, available):
                if move in locate_cold_moves(x, y, available):
                    best = 1.0
                score += (best - score) / (episode + 1)

            # PLAY THE MOVE
            (x, y, board, available), reward, done, _ = env.step(move)
            board = tuple(flatten_board(board))
            steps += 1

            # Log....
            if debug:
                print(">>> PLAYER move {}".format(move))

            t_state.append(board)
            t_move.append(move)
            t_available.append(available)
            t_move_i.append(move_i)
            t_reward.append(reward)

            if done:
                player_win = True
                # Repeat the final step so the `i + 2` and `i + 1` lookups
                # in the learning loops below stay in range.
                t_state.append(board)
                t_move.append(move)
                t_available.append(available)
                t_move_i.append(move_i)
                t_reward.append(reward)

            # ----------------------------------------------------------------
            if not done:
                # OPPONENT CHOOSES A MOVE
                try:
                    Qs_episode = add_bias_board(opponent[board], available,
                                                bias_board, influence)
                    move_i = epsilon_greedy(
                        Qs_episode, epsilon=epsilon_e, mode='numpy')
                except KeyError:
                    # First visit to this board: init its Qs, move randomly.
                    opponent[board] = np.ones(len(available)) * default_Q
                    move_i = np.random.randint(0, len(available))

                move = available[move_i]

                # PLAY THE MOVE
                (x, y, board, available), reward, done, _ = env.step(move)
                board = tuple(flatten_board(board))
                steps += 1

                # Log....
                if debug:
                    print(">>> OPPONENT move {}".format(move))

                t_state.append(board)
                t_move.append(move)
                t_available.append(available)
                t_move_i.append(move_i)
                t_reward.append(reward)

                if done:
                    # Same final-step repeat as for the player above.
                    t_state.append(board)
                    t_move.append(move)
                    t_available.append(available)
                    t_move_i.append(move_i)
                    t_reward.append(reward)

        # ----------------------------------------------------------------
        # Learn by unrolling the last game...

        # PLAYER (model): the player moved at the even steps.
        s_idx = np.arange(0, steps - 1, 2)
        for i in s_idx:
            # States and actions. The next state is two steps on because
            # the opponent's reply sits in between.
            s = t_state[i]
            next_s = t_state[i + 2]
            m_i = t_move_i[i]

            # Value and reward
            Q = model[s][m_i]

            try:
                max_Q = model[next_s].max()
            except KeyError:
                # The player never chose a move from next_s; give it
                # default Qs, sized by the moves available at step i.
                model[next_s] = np.ones(len(t_available[i])) * default_Q
                max_Q = model[next_s].max()

            if player_win:
                r = t_reward[i]
            else:
                # The opponent's reward for its reply counts against us.
                r = -1 * t_reward[i + 1]

            # Update running reward total for player
            total_reward += r

            # Loss and learn
            next_Q = r + (gamma * max_Q)
            loss = next_Q - Q
            model[s][m_i] = Q + (learning_rate * loss)

        # OPPONENT: the opponent moved at the odd steps.
        s_idx = np.arange(1, steps - 1, 2)
        for i in s_idx:
            # States and actions
            s = t_state[i]
            next_s = t_state[i + 2]
            m_i = t_move_i[i]

            # Value and reward
            Q = opponent[s][m_i]

            try:
                max_Q = opponent[next_s].max()
            except KeyError:
                # Same default-Q fill as for the player table above.
                opponent[next_s] = np.ones(len(t_available[i])) * default_Q
                max_Q = opponent[next_s].max()

            if not player_win:
                r = t_reward[i]
            else:
                r = -1 * t_reward[i + 1]

            # Loss and learn
            next_Q = r + (gamma * max_Q)
            loss = next_Q - Q
            opponent[s][m_i] = Q + (learning_rate * loss)

        # ----------------------------------------------------------------
        # Update the log. r, Q, next_Q and loss hold whatever the last
        # update loop that ran left in them.
        if debug:
            print(">>> Reward {}; Loss(Q {}, next_Q {}) -> {}".format(
                r, Q, next_Q, loss))

            if done and (r > 0):
                print("*** WIN ***")
            if done and (r < 0):
                print("*** OPPONENT WIN ***")

        if tensorboard and (int(episode) % update_every) == 0:
            writer.add_scalar('reward', r, episode)
            writer.add_scalar('Q', Q, episode)
            writer.add_scalar('epsilon_e', epsilon_e, episode)
            writer.add_scalar('stumber_error', loss, episode)
            writer.add_scalar('stumber_steps', steps, episode)
            writer.add_scalar('stumbler_score', score, episode)

            # Cold ref:
            cold = create_cold_board(m, n)
            plot_wythoff_board(
                cold, vmin=0, vmax=1, path=tensorboard, name='cold_board.png')
            writer.add_image(
                'cold_positions',
                skimage.io.imread(os.path.join(tensorboard, 'cold_board.png')))

            # Agent max(Q) boards
            values = expected_value(m, n, model)
            plot_wythoff_board(
                values, path=tensorboard, name='player_max_values.png')
            writer.add_image(
                'player',
                skimage.io.imread(
                    os.path.join(tensorboard, 'player_max_values.png')))

            values = expected_value(m, n, opponent)
            plot_wythoff_board(
                values, path=tensorboard, name='opponent_max_values.png')
            writer.add_image(
                'opponent',
                skimage.io.imread(
                    os.path.join(tensorboard, 'opponent_max_values.png')))

        if monitor and (int(episode) % update_every) == 0:
            # Monitored names are resolved against this function's locals.
            all_variables = locals()
            for k in monitor:
                monitored[k].append(float(all_variables[k]))

    # --------------------------------------------------------------------
    if save_model:
        state = {
            'stumbler_player_dict': model,
            'stumbler_opponent_dict': opponent
        }
        torch.save(state, save + ".pytorch")

    if monitor:
        save_monitored(save, monitored)

    if tensorboard:
        writer.close()

    # Suppress return for parallel runs?
    result = (model, opponent), (score, total_reward)
    if return_none:
        result = None

    return result