Ejemplo n.º 1
0
def bandit_stumbler(path,
                    num_trials=10,
                    epsilon=0.1,
                    gamma=0.8,
                    learning_rate=0.1,
                    log_path=None,
                    bandit_name='BanditTwoArmedDeterministicFixed'):
    """Train a Q-agent to play n-bandit, using SGD.

    Note: bandits are drawn from azad.local_gym. See that module for
    more information on the bandits.

    Parameters
    ----------
    path : str
        Directory to create (if it does not already exist). Also used as
        the log directory when `log_path` is None.
    num_trials : int
        Number of single-step trials; one SGD update is made per trial.
    epsilon : float
        Exploration parameter handed to `epsilon_greedy`.
    gamma : float
        Discount applied to the max Q value of the next state.
    learning_rate : float
        Learning rate of the SGD optimizer.
    log_path : str or None
        Directory for the SummaryWriter; defaults to `path`.
    bandit_name : str
        Name of the gym environment, without the '-v0' suffix.

    Returns
    -------
    model, env
        The trained model and the (monitored) environment.
    """
    # Create path; an already-existing path is not an error.
    try:
        os.makedirs(path)
    except OSError as exception:
        if exception.errno != errno.EEXIST:
            raise

    # -------------------------------------------
    # setup

    # Logging
    if log_path is None:
        log_path = path
    writer = SummaryWriter(log_dir=log_path)

    # The world is a slot machine!
    env = gym.make('{}-v0'.format(bandit_name))
    env = wrappers.Monitor(env,
                           './tmp/{}-v0-1'.format(bandit_name),
                           force=True)

    # Init the 'agent'
    model = LinQN1(1, 2)
    optimizer = optim.SGD(model.parameters(), lr=learning_rate)

    # -------------------------------------------
    # Run some trials

    # Loop over trials, not batches, doing one SGD step on each outcome.
    for trial in range(num_trials):
        state = Tensor([env.reset()])

        if trial == 0:
            writer.add_graph(model, state)

        # Look at the world and approximate its value then act.
        Qs = model(state)
        action = epsilon_greedy(Qs, epsilon)

        Q = Qs[int(action)]

        next_state, reward, _, _ = env.step(int(action))
        next_state = Tensor([next_state])

        # One-step Q-learning target. The bootstrap value is detached so
        # gradients flow only through the prediction Q.
        max_Q = model(next_state).detach().max()
        next_Q = reward + (gamma * max_Q)
        loss = F.smooth_l1_loss(Q, next_Q)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Log. The loop variable is deliberately not called `path`, so the
        # function argument of that name is not clobbered.
        for param_name, param in model.named_parameters():
            writer.add_histogram(param_name,
                                 param.clone().cpu().data.numpy(), trial)

        # `loss` is a 0-dim tensor; `.item()` extracts the Python float
        # (indexing it with [0] fails on current PyTorch).
        writer.add_scalar(os.path.join(log_path, 'error'), loss.item(), trial)
        writer.add_scalar(os.path.join(log_path, 'Q'), Q, trial)
        writer.add_scalar(os.path.join(log_path, 'reward'), reward, trial)
        writer.add_scalar(os.path.join(log_path, 'state'), state, trial)

    # Cleanup and end
    writer.close()

    return model, env
Ejemplo n.º 2
0
def evaluate_wythoff(stumbler=None,
                     strategist=None,
                     stumbler_game='Wythoff10x10',
                     strategist_game='Wythoff50x50',
                     random_stumbler=False,
                     load_model=None,
                     save=None,
                     return_none=False,
                     num_episodes=100,
                     debug=False):
    """Compare stumblers to strategists.

    The stumbler always moves first; the strategist answers. Games are
    played on the `strategist_game` board.

    Parameters
    ----------
    stumbler : dict-like or None
        Table indexed by a flattened board tuple, giving move values.
        None is treated as an empty table, so the stumbler plays randomly.
    strategist : object or None
        Passed to `create_bias_board` to build the strategist's value
        table. If None, an all-zero table is used.
    stumbler_game : str
        Name of the game whose board size bounds the positions for which
        the stumbler's table is consulted.
    strategist_game : str
        Name of the game actually played.
    random_stumbler : bool
        If True, the stumbler's move is replaced by a uniformly random one.
    load_model : str or None
        If given, `stumbler` and `strategist` are loaded with
        `load_for_eval(load_model)`, overriding the arguments.
    save : str or None
        If given, a one-row CSV is written there containing the raw win
        count (not the fraction) and both scores.
    return_none : bool
        If True, return None instead of the results.
    num_episodes : int
        Number of games to play.
    debug : bool
        Print a trace of every game.

    Returns
    -------
    result : tuple or None
        `(wins / num_episodes, stumbler_score, strategist_score)`, where
        `wins` counts the games that ended on the strategist's move.
        None when `return_none` is True.
    """
    # ------------------------------------------------------------------------
    if load_model is not None:
        stumbler, _, strategist = load_for_eval(load_model)

    # Without a stumbler table every lookup must fall through to the
    # random-move branch below; an empty table gives exactly that, whereas
    # subscripting None raises a TypeError that is not caught there.
    if stumbler is None:
        stumbler = {}

    # Init boards, etc
    # Strategist
    env = create_env(strategist_game, monitor=False)
    m, n, board, _ = peek(env)
    if strategist is not None:
        hot_cold_table = create_bias_board(m, n, strategist)
    else:
        hot_cold_table = np.zeros_like(board)

    # Stumbler: (o, p) is the size of the board the stumbler's table is
    # indexed against.
    o, p, _, _ = peek(create_env(stumbler_game, monitor=False))

    # ------------------------------------------------------------------------
    # A stumbler and a strategist take turns playing a (m,n) game of wythoffs
    wins = 0.0
    strategist_score = 0.0
    stumbler_score = 0.0
    for episode in range(num_episodes):
        # Start the game, and process the result
        x, y, board, available = env.reset()
        board = tuple(flatten_board(board))

        if debug:
            print("---------------------------------------")
            print(">>> NEW MODEL EVALUATION ({}).".format(episode))
            print(">>> Initial position ({}, {})".format(x, y))

        done = False
        while not done:
            # ----------------------------------------------------------------
            # STUMBLER
            if (x < o) and (y < p):
                # The position fits on the stumbler's board: re-express it
                # there and use the learned values if this board is known.
                s_board = tuple(flatten_board(create_board(x, y, o, p)))
                s_available = create_moves(x, y)
                try:
                    values = stumbler[s_board]
                    move_i = epsilon_greedy(values, epsilon=0.0, mode='numpy')
                    move = s_available[move_i]
                except KeyError:
                    move_i = np.random.randint(0, len(s_available))
                    move = s_available[move_i]
            else:
                # Outside the stumbler's board: play randomly.
                s_available = available
                move_i = np.random.randint(0, len(s_available))
                move = s_available[move_i]

            # ----------------------------------------------------------------
            # RANDOM PLAYER
            # Overrides whatever was selected above.
            if random_stumbler:
                move_i = np.random.randint(0, len(available))
                move = available[move_i]

            # Analyze the choice
            best = 0.0
            if cold_move_available(x, y, s_available):
                if move in locate_cold_moves(x, y, s_available):
                    best = 1.0
                # Incremental update; the step size is set by the episode
                # index, not by the number of moves analyzed.
                stumbler_score += (best - stumbler_score) / (episode + 1)

            # Move
            (x, y, board, available), reward, done, _ = env.step(move)
            board = tuple(flatten_board(board))
            if debug:
                print(">>> STUMBLER move {}".format(move))

            if done:
                break

            # ----------------------------------------------------------------
            # STRATEGIST
            # Choose.
            hot_cold_move_values = [hot_cold_table[i, j] for i, j in available]
            move_i = epsilon_greedy(
                np.asarray(hot_cold_move_values), epsilon=0.0, mode='numpy')
            move = available[move_i]

            if debug:
                print(">>> STRATEGIST move {}".format(move))

            # Analyze the choice
            best = 0.0
            if cold_move_available(x, y, available):
                if move in locate_cold_moves(x, y, available):
                    best = 1.0
                strategist_score += (best - strategist_score) / (episode + 1)

            # Make a move
            (x, y, board, available), reward, done, _ = env.step(move)
            board = tuple(flatten_board(board))
            if done:
                # The game ended on the strategist's move.
                wins += 1.0
                break

        if debug:
            print("Wins {}, Scores ({}, {})".format(wins, stumbler_score,
                                                    strategist_score))

    if save is not None:
        np.savetxt(
            save,
            np.asarray([wins, stumbler_score, strategist_score]).reshape(1, 3),
            fmt='%.1f,%.4f,%.4f',
            comments="",
            header="wins,stumbler_score,strategist_score")

    result = (wins / num_episodes), stumbler_score, strategist_score
    if return_none:
        result = None

    return result
Ejemplo n.º 3
0
def wythoff_stumbler(num_episodes=10,
                     epsilon=0.1,
                     gamma=0.8,
                     learning_rate=0.1,
                     game='Wythoff10x10',
                     model=None,
                     opponent=None,
                     anneal=False,
                     bias_board=None,
                     influence=0.0,
                     score=0.0,
                     total_reward=0.0,
                     tensorboard=None,
                     update_every=5,
                     initial=0,
                     self_play=False,
                     save=False,
                     load_model=None,
                     save_model=False,
                     monitor=None,
                     return_none=False,
                     debug=False,
                     seed=None):
    """Learn to play Wythoff's w/ e-greedy random exploration.

    Note: Learning is based on a player-opponent joint action formalism
    and tabular Q-learning.

    Parameters
    ----------
    num_episodes : int
        Number of games to play.
    epsilon : float
        Exploration rate handed to `epsilon_greedy`. Used as-is, or as the
        starting value of the schedule when `anneal` is True.
    gamma : float
        Discount applied to the max Q value of the next state.
    learning_rate : float
        Step size of the tabular Q update.
    game : str
        Name of the environment given to `create_env`.
    model, opponent : dict or None
        Q tables for the player and the opponent, keyed by the flattened
        board tuple, with one value per available move. New, empty tables
        are created when None. Tables passed in are updated in place.
    anneal : bool
        If True, epsilon decays as `epsilon / log(episode + e)`.
    bias_board, influence
        Forwarded to `add_bias_board` when selecting moves.
    score : float
        Starting value of the running optimal-move score.
    total_reward : float
        Starting value of the player's accumulated reward.
    tensorboard : str or None
        Log directory. When given, the env is created with monitor=True
        and scalars/images are written every `update_every` episodes.
    update_every : int
        Logging / monitoring period, in episodes.
    initial : int
        Index of the first episode (episodes run from `initial` to
        `initial + num_episodes - 1`).
    self_play : bool
        Not used by this function.
    save : str
        Path prefix used by `save_model` and by `save_monitored`.
    load_model : str or None
        If given, `model` and `opponent` are loaded with `load_stumbler`.
    save_model : bool
        If True, both tables are saved to `save + ".pytorch"`.
    monitor : list of str or None
        Names of local variables of this function to record every
        `update_every` episodes.
    return_none : bool
        If True, return None instead of the results.
    debug : bool
        Print a trace of every game.
    seed : int or None
        Seed for the env and for numpy's global RNG.

    Returns
    -------
    result : tuple or None
        `((model, opponent), (score, total_reward))`, or None when
        `return_none` is True.
    """

    # ------------------------------------------------------------------------
    # Init env
    if tensorboard is not None:
        try:
            os.makedirs(tensorboard)
        except OSError as exception:
            if exception.errno != errno.EEXIST:
                raise
        writer = SummaryWriter(log_dir=tensorboard)

    # Create env
    if tensorboard is not None:
        env = create_env(game, monitor=True)
    else:
        env = create_env(game, monitor=False)
    env.seed(seed)
    np.random.seed(seed)

    if monitor is not None:
        monitored = create_monitored(monitor)

    # ------------------------------------------------------------------------
    # Init Agents
    default_Q = 0.0
    m, n, board, available = peek(env)
    if model is None:
        model = {}
    if opponent is None:
        opponent = {}

    # Override from file?
    if load_model is not None:
        if debug:
            print(">>> Loadiing model/opponent from {}".format(load_model))

        model, opponent = load_stumbler(model, opponent, load_model)

    # NOTE: local variable names in the loop below are part of the
    # `monitor` interface (they are looked up by name via locals()).
    # Do not rename them.

    # ------------------------------------------------------------------------
    for episode in range(initial, initial + num_episodes):
        # Re-init
        steps = 1

        x, y, board, available = env.reset()
        board = tuple(flatten_board(board))
        if debug:
            print("---------------------------------------")
            print(">>> NEW GAME ({}).".format(episode))
            print(">>> Initial position ({}, {})".format(x, y))
            print(">>> Initial moves {}".format(available))
            print("---------------------------------------")

        # Trajectory of this game. t_state/t_available start with the
        # initial position, so they run one entry ahead of the others.
        t_state = [
            board,
        ]
        t_available = [available]
        t_move = []
        t_move_i = []
        t_reward = []

        # -------------------------------------------------------------------
        # Anneal epsilon?
        if anneal:
            epsilon_e = epsilon * (1.0 / np.log((episode + np.e)))
        else:
            # Fixed exploration rate. (This was previously set to the
            # episode index, which ignored `epsilon` altogether.)
            epsilon_e = epsilon

        # -------------------------------------------------------------------
        # Play!
        done = False
        player_win = False
        while not done:
            # PLAYER CHOOSES A MOVE
            try:
                Qs_episode = add_bias_board(model[board], available,
                                            bias_board, influence)
                move_i = epsilon_greedy(
                    Qs_episode, epsilon=epsilon_e, mode='numpy')
            except KeyError:
                # First visit to this board: init its values, move randomly.
                model[board] = np.ones(len(available)) * default_Q
                move_i = np.random.randint(0, len(available))

            move = available[move_i]

            # Analyze it...
            best = 0.0
            if cold_move_available(x, y, available):
                if move in locate_cold_moves(x, y, available):
                    best = 1.0
                score += (best - score) / (episode + 1)

            # PLAY THE MOVE
            (x, y, board, available), reward, done, _ = env.step(move)
            board = tuple(flatten_board(board))
            steps += 1

            # Log....
            if debug:
                print(">>> PLAYER move {}".format(move))

            t_state.append(board)
            t_move.append(move)
            t_available.append(available)
            t_move_i.append(move_i)
            t_reward.append(reward)

            if done:
                player_win = True
                # Duplicate the final entry so that the i + 1 / i + 2
                # look-aheads in the learning loops stay in range.
                t_state.append(board)
                t_move.append(move)
                t_available.append(available)
                t_move_i.append(move_i)
                t_reward.append(reward)

            # ----------------------------------------------------------------
            if not done:
                # OPPONENT CHOOSES A MOVE
                try:
                    Qs_episode = add_bias_board(opponent[board], available,
                                                bias_board, influence)
                    move_i = epsilon_greedy(
                        Qs_episode, epsilon=epsilon_e, mode='numpy')
                except KeyError:
                    opponent[board] = np.ones(len(available)) * default_Q
                    move_i = np.random.randint(0, len(available))

                move = available[move_i]

                # PLAY THE MOVE
                (x, y, board, available), reward, done, _ = env.step(move)
                board = tuple(flatten_board(board))
                steps += 1

                # Log....
                if debug:
                    print(">>> OPPONENT move {}".format(move))

                t_state.append(board)
                t_move.append(move)
                t_available.append(available)
                t_move_i.append(move_i)
                t_reward.append(reward)

                if done:
                    # Same padding as for a player win.
                    t_state.append(board)
                    t_move.append(move)
                    t_available.append(available)
                    t_move_i.append(move_i)
                    t_reward.append(reward)

        # ----------------------------------------------------------------
        # Learn by unrolling the last game...

        # PLAYER (model): the player moved at the even trajectory indices.
        s_idx = np.arange(0, steps - 1, 2)
        for i in s_idx:
            # States and actions. The next state, from the player's view,
            # is the board after the opponent's reply (two plies ahead).
            s = t_state[i]
            next_s = t_state[i + 2]
            m_i = t_move_i[i]

            # Value and reward
            Q = model[s][m_i]

            try:
                max_Q = model[next_s].max()
            except KeyError:
                # NOTE(review): the new entry is sized from t_available[i],
                # not from the moves available in next_s - confirm intent.
                model[next_s] = np.ones(len(t_available[i])) * default_Q
                max_Q = model[next_s].max()

            if player_win:
                r = t_reward[i]
            else:
                # The reward earned by the opponent's reply counts against
                # the player.
                r = -1 * t_reward[i + 1]

            # Update running reward total for player
            total_reward += r

            # Loss and learn
            next_Q = r + (gamma * max_Q)
            loss = next_Q - Q
            model[s][m_i] = Q + (learning_rate * loss)

        # OPPONENT: moved at the odd trajectory indices.
        s_idx = np.arange(1, steps - 1, 2)
        for i in s_idx:
            # States and actions
            s = t_state[i]
            next_s = t_state[i + 2]
            m_i = t_move_i[i]

            # Value and reward
            Q = opponent[s][m_i]

            try:
                max_Q = opponent[next_s].max()
            except KeyError:
                opponent[next_s] = np.ones(len(t_available[i])) * default_Q
                max_Q = opponent[next_s].max()

            if not player_win:
                r = t_reward[i]
            else:
                r = -1 * t_reward[i + 1]

            # Loss and learn
            next_Q = r + (gamma * max_Q)
            loss = next_Q - Q
            opponent[s][m_i] = Q + (learning_rate * loss)

        # ----------------------------------------------------------------
        # Update the log
        # r, Q, next_Q and loss hold the values of the last update made
        # above (the opponent's, whenever the opponent moved this game).
        if debug:
            print(">>> Reward {}; Loss(Q {}, next_Q {}) -> {}".format(
                r, Q, next_Q, loss))

            if done and (r > 0):
                print("*** WIN ***")
            if done and (r < 0):
                print("*** OPPONENT WIN ***")

        if tensorboard and (int(episode) % update_every) == 0:
            writer.add_scalar('reward', r, episode)
            writer.add_scalar('Q', Q, episode)
            writer.add_scalar('epsilon_e', epsilon_e, episode)
            writer.add_scalar('stumber_error', loss, episode)
            writer.add_scalar('stumber_steps', steps, episode)
            writer.add_scalar('stumbler_score', score, episode)

            # Cold ref:
            cold = create_cold_board(m, n)
            plot_wythoff_board(
                cold, vmin=0, vmax=1, path=tensorboard, name='cold_board.png')
            writer.add_image(
                'cold_positions',
                skimage.io.imread(os.path.join(tensorboard, 'cold_board.png')))

            # Agent max(Q) boards
            values = expected_value(m, n, model)
            plot_wythoff_board(
                values, path=tensorboard, name='player_max_values.png')
            writer.add_image(
                'player',
                skimage.io.imread(
                    os.path.join(tensorboard, 'player_max_values.png')))

            values = expected_value(m, n, opponent)
            plot_wythoff_board(
                values, path=tensorboard, name='opponent_max_values.png')
            writer.add_image(
                'opponent',
                skimage.io.imread(
                    os.path.join(tensorboard, 'opponent_max_values.png')))

        if monitor and (int(episode) % update_every) == 0:
            # Monitored quantities are looked up by local variable name.
            all_variables = locals()
            for k in monitor:
                monitored[k].append(float(all_variables[k]))

    # --------------------------------------------------------------------
    if save_model:
        state = {
            'stumbler_player_dict': model,
            'stumbler_opponent_dict': opponent
        }
        torch.save(state, save + ".pytorch")

    if monitor:
        save_monitored(save, monitored)

    if tensorboard:
        writer.close()

    result = (model, opponent), (score, total_reward)
    if return_none:
        result = None

    return result
Ejemplo n.º 4
0
def cart_stumbler(path,
                  num_episodes=500,
                  epsilon=0.1,
                  epsilon_min=0.01,
                  epsilon_tau=500,
                  gamma=1,
                  learning_rate=0.001,
                  num_hidden=200,
                  log_path=None,
                  batch_size=64):
    """Train a Q-network to balance the pole in CartPole-v0.

    Parameters
    ----------
    path : str
        Directory to create (if it does not already exist). Also used as
        the log directory when `log_path` is None, and as the stem of the
        figure saved at the end ("<path>.png").
    num_episodes : int
        Number of episodes to run.
    epsilon : float
        Exploration rate at the start of each episode.
    epsilon_min : float
        Value the exploration rate decays towards within an episode.
    epsilon_tau : float
        Time constant (in steps) of the exponential epsilon decay.
    gamma : float
        Discount applied to the max Q value of the next state.
    learning_rate : float
        Learning rate of the Adam optimizer.
    num_hidden : int
        Hidden-layer size passed to the network constructor.
    log_path : str or None
        Directory for the SummaryWriter; defaults to `path`.
    batch_size : int
        Number of transitions sampled from replay memory per update. No
        learning happens until the memory holds at least this many.

    Returns
    -------
    episode_durations : list of int
        Value of the step counter at the end of each episode.
    """
    # Create path; an already-existing path is not an error.
    try:
        os.makedirs(path)
    except OSError as exception:
        if exception.errno != errno.EEXIST:
            raise

    # -------------------------------------------
    # Tensorboard setup
    if log_path is None:
        log_path = path
    writer = SummaryWriter(log_dir=log_path)

    # -------------------------------------------
    # The world is a cart....
    env = gym.make('CartPole-v0')
    env = wrappers.Monitor(env, './tmp/cartpole-v0-1', force=True)

    # -------------------------------------------
    # Init the DQN, its memory, and its optim
    model = ReLu2(4, 2, num_hidden=num_hidden)
    memory = ReplayMemory(10000)
    optimizer = optim.Adam(model.parameters(), learning_rate)

    # -------------------------------------------
    # Run some episodes
    episode_durations = []

    for episode in range(num_episodes):
        state = Tensor(env.reset())

        if episode == 0:
            writer.add_graph(model, state)

        steps = 0
        while True:
            env.render()

            # -------------------------------------------
            # Look at the world and approximate its value.
            Q = model(state)

            # Make a decision. Epsilon decays exponentially with the
            # number of steps taken in this episode.
            epsilon_step = epsilon_min + (epsilon - epsilon_min) * exp(
                -1.0 * steps / epsilon_tau)
            action = torch.tensor(epsilon_greedy(Q, epsilon_step),
                                  dtype=torch.float)
            next_state, reward, done, _ = env.step(int(action))

            # Punishment, at the end of the world.
            if done:
                reward = -1

            next_state = Tensor(next_state)
            reward = Tensor([reward])

            # Log this episode
            writer.add_scalar(os.path.join(log_path, 'Q'), Q[int(action)],
                              episode)
            writer.add_scalar(os.path.join(log_path, 'reward'), reward,
                              episode)

            # Always remember the past
            # (you are still doomed to repeat it).
            memory.push(state.unsqueeze(0), action.unsqueeze(0),
                        next_state.unsqueeze(0), reward.unsqueeze(0))

            # -------------------------------------------
            # End of the episode?
            if done:
                print(">>> {2} Episode {0} finished after {1} steps".format(
                    episode, steps,
                    '\033[92m' if steps >= 195 else '\033[99m'))

                episode_durations.append(steps)
                writer.add_scalar(os.path.join(log_path, 'durations'), steps,
                                  episode)

                break

            # Advance the world BEFORE deciding whether to learn. Doing it
            # after the early `continue` below left the agent acting on
            # (and storing) a stale state while the memory was filling up.
            state = next_state
            steps += 1

            # -------------------------------------------
            # Learn from the last result.

            # If there is not enough in memory,
            # don't try and learn anything.
            if len(memory) < batch_size:
                continue

            # Grab some examples from memory
            # and repackage them.
            transitions = memory.sample(batch_size)
            t_states, t_actions, t_next_states, t_rewards = zip(*transitions)

            # Conversions....
            t_states = Variable(torch.cat(t_states))
            t_actions = Variable(torch.cat(t_actions))
            t_rewards = Variable(torch.cat(t_rewards)).squeeze()
            t_next_states = Variable(torch.cat(t_next_states))

            # Predicted Q of the action actually taken in each transition.
            Qs = model(t_states).gather(
                1,
                t_actions.unsqueeze(1).type(torch.LongTensor)).squeeze()

            # In Q learning we use the max Q of the next state,
            # and the reward, to estimate future Qs value.
            # NOTE(review): terminal transitions are not masked here (the
            # memory stores no `done` flag), so they bootstrap as well.
            max_Qs = model(t_next_states).detach().max(1)[0]
            future_Qs = t_rewards + (gamma * max_Qs)

            # Want to min the loss between predicted Qs
            # and the observed
            loss = F.smooth_l1_loss(Qs, future_Qs)

            # `loss` is a 0-dim tensor; `.item()` extracts the Python
            # float (indexing it with [0] fails on current PyTorch).
            writer.add_scalar(os.path.join(log_path, 'error'),
                              loss.item(), episode)

            # Grad. descent!
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

    # -------------------------------------------
    # Clean up
    writer.close()

    env.env.close()
    plt.ioff()
    plt.savefig("{}.png".format(path))
    plt.close()

    return episode_durations