Example #1
def test_agent(env, W, Q_w, Q_E, gamma, training_steps, max_ep_steps, device,
               n_test_runs, test_log_file):
    """
        This function tests the overall performance of the agent, given that
        it has already learnt the keyboard.

        env: Environment
        W: Weight vector to be learnt over cumulants
        Q_w: Q-function over weight vectors
        Q_E: Q-functions over all cumulants and options
        gamma: Discount factor
        training_steps: Number of steps for which agent is to be trained
        max_ep_steps: Maximum number of steps in an episode
        device: cpu or gpu
        n_test_runs: Number of episodes for which performance is tested
        test_log_file: Path to log file for test results
    """
    ep_returns = []

    for _ in range(n_test_runs):
        s = env.reset()
        s = torch.from_numpy(s).float().to(device)
        n_steps = 0
        ep_return = 0
        done = False

        while n_steps < max_ep_steps and not done:
            with torch.no_grad():
                q = Q_w(s)
            w = W[torch.argmax(q)]
            (s_next, done, r_prime, gamma_prime, n_steps,
             info) = option_keyboard(env, s, w, Q_E, gamma, n_steps,
                                     max_ep_steps, device)

            s_next = torch.from_numpy(s_next).float().to(device)
            s = (s_next if not done else torch.from_numpy(
                env.reset()).float().to(device))

            ep_return += sum(info['rewards'])

        ep_returns.append(ep_return)

    print('Steps:', training_steps, 'Avg. return:',
          sum(ep_returns) / n_test_runs, 'Episodic return:', ep_returns)

    with open(test_log_file, 'a+b') as logfile:
        pickle.dump({'steps': training_steps, 'returns': ep_returns}, logfile)

    return ep_returns
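
Because test_agent opens the log in append mode ('a+b') and pickles one dict per call, reading the results back requires repeated calls to pickle.load until the end of the file. A minimal reader sketch; the log path below is illustrative, not a path from the repository:

import pickle

records = []
with open('logs/test_returns.pkl', 'rb') as f:  # hypothetical log path
    while True:
        try:
            records.append(pickle.load(f))  # one dict per test_agent call
        except EOFError:
            break

for record in records:
    print(record['steps'], sum(record['returns']) / len(record['returns']))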
Example #2
def main():
    args = parser.parse_args()
    env = gym.make(args.env_name)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    set_global_seed(args.seed)

    d = env.num_resources()

    hyperparams_file = open(
        os.path.join(
            args.saved_models.split('saved_models')[0], 'hyperparams'), 'rb')

    # Loading saved models and constant values
    returns = []
    if args.save_path:
        fp = open(args.save_path, 'a+b')

    W = [x for x in product([-1, 0, 1], repeat=2) if sum(x) >= 0]
    W.remove((0, 0))
    W = np.array(W)
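    # With repeat=2 (two cumulants), W now holds the five candidate weight
    # vectors (-1, 1), (0, 1), (1, -1), (1, 0) and (1, 1); the all-zero vector
    # is dropped because it expresses no preference over the cumulants.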

    hyperparams = pickle.load(hyperparams_file)
    hyperparams_file.close()
    gamma = hyperparams.gamma_ok
    max_ep_steps = hyperparams.max_steps_agent

    value_fns = [
        ValueFunction(input_dim=env.observation_space.shape[0] + d,
                      action_dim=(env.action_space.n + 1),
                      n_options=d,
                      hidden=[64, 128],
                      batch_size=hyperparams.ok_batch_size,
                      gamma=gamma,
                      alpha=hyperparams.alpha_ok) for _ in range(d)
    ]

    Q_w = MlpDiscrete(input_dim=env.observation_space.shape[0],
                      output_dim=W.shape[0],
                      hidden=[64, 128])

    for i in range(d):
        if not torch.cuda.is_available():
            checkpoint = torch.load(os.path.join(args.saved_models,
                                                 'value_fn_%d.pt' % (i + 1)),
                                    map_location=torch.device('cpu'))
        else:
            checkpoint = torch.load(
                os.path.join(args.saved_models, 'value_fn_%d.pt' % (i + 1)))

        value_fns[i].q_net.load_state_dict(checkpoint['Q'])
        value_fns[i].q_net.to(device)

    if not torch.cuda.is_available():
        checkpoint = torch.load(os.path.join(args.saved_models, 'agent.pt'),
                                map_location=torch.device('cpu'))
    else:
        checkpoint = torch.load(os.path.join(args.saved_models, 'agent.pt'))

    Q_w.load_state_dict(checkpoint['Q'])
    Q_w.to(device)
    # ########

    for _ in range(args.n_test_episodes):
        s = env.reset()
        done = False
        s = torch.from_numpy(s).float().to(device)
        n_steps = 0
        ret = 0

        while not done:
            w = W[torch.argmax(Q_w(s))]
            (s_next, done, _, _, n_steps,
             info) = option_keyboard(env, s, w, value_fns, gamma, n_steps,
                                     max_ep_steps, device, args.visualize)

            ret += sum(info['rewards'])
            s = torch.from_numpy(s_next).float().to(device)

        print('Episode return:', ret)
        returns.append(ret)

    returns = np.array(returns)
    print('Mean: %f, Std. dev: %f' % (returns.mean(), returns.std()))
    if args.save_path:
        pickle.dump({'Seed': args.seed, 'Returns': returns}, fp)
        fp.close()
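
All four examples delegate option execution to option_keyboard, which is defined elsewhere in the repository. The stub below only records the call contract as it can be inferred from the call sites above (argument order, the optional visualize flag, and the six-tuple return value); the body is deliberately left unimplemented, and the description of r_prime and gamma_prime is an assumption:

def option_keyboard(env, s, w, Q_E, gamma, n_steps, max_ep_steps, device,
                    visualize=False):
    """
        Inferred interface only, not the repository's implementation: executes
        the option induced by the weight vector w using the per-cumulant
        Q-functions Q_E, starting from state s.

        Returns (s_next, done, r_prime, gamma_prime, n_steps, info), where
        r_prime and gamma_prime are presumably the reward and discount
        accumulated while the option ran, n_steps is the updated step count,
        and info carries per-step data such as info['rewards'] and
        info['env_info'].
    """
    raise NotImplementedError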
Example #3
def test_learning_options(env, Q_E, index, w, gamma, training_steps,
                          max_ep_steps, device, n_test_runs, log_file):
    """
        This function tests the performance of the agent for different weight
        vectors w. This is typically used to see how the options being learnt
        perform for w = (1, 1), which would optimize for all types of food
        items. We also record performance for w = (1, -1) and w = (-1, 1) since
        the keyboard learnt should perform reasonably well for all
        configurations of w that we consider.

        env: Environment
        Q_E: Q-functions over all cumulants and options
        index: Index of cumulant for which performance is measured
        w: Weight vector (kept constant to measure keyboard performance)
        gamma: Discount factor
        training_steps: Number of steps for which agent is to be trained
        max_ep_steps: Maximum number of steps in an episode
        device: cpu or gpu
        n_test_runs: Number of episodes for which performance is tested
        log_file: Path to log file for test results
    """
    ep_returns = []
    cumulant_returns = []

    env.set_learning_options(w, True)

    for _ in range(n_test_runs):
        s = env.reset()
        s = torch.from_numpy(s).float().to(device)
        n_steps = 0
        ep_return = 0
        cumulant_return = 0
        done = False

        while n_steps < max_ep_steps and not done:
            (s_next, done, r_prime, gamma_prime, n_steps,
             info) = option_keyboard(env, s, w, Q_E, gamma, n_steps,
                                     max_ep_steps, device)

            s_next = torch.from_numpy(s_next).float().to(device)
            s = (s_next if not done else torch.from_numpy(
                env.reset()).float().to(device))

            ep_return += sum(info['rewards'])

            cumulant_return += sum([
                info['env_info'][i]['rewards'][index]
                for i in range(len(info['env_info']))
            ])

        ep_returns.append(ep_return)
        cumulant_returns.append(cumulant_return)

    print('w:', w, 'Steps:', training_steps, 'Avg. return:',
          sum(ep_returns) / n_test_runs, 'Episodic return:', ep_returns,
          'Cumulant return:', cumulant_returns)

    with open(log_file, 'a+b') as logfile:
        pickle.dump(
            {
                'steps': training_steps,
                'returns': ep_returns,
                'cumulant_returns': cumulant_returns
            }, logfile)

    # Reset the environment's weight vector to all ones after testing.
    env.set_learning_options(np.ones(len(w)), True)

    return ep_returns, cumulant_returns
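
A minimal sketch of how the evaluation described in the docstring might be driven, looping over the fixed weight vectors (1, 1), (1, -1) and (-1, 1). It assumes env, Q_E, gamma, max_ep_steps and device are already set up as in the surrounding examples; the step count, run count and log paths are illustrative only:

import numpy as np

for i, w in enumerate([(1, 1), (1, -1), (-1, 1)]):
    # index=0 tracks the first cumulant; use index=1 to track the second.
    test_learning_options(env, Q_E, index=0, w=np.array(w), gamma=gamma,
                          training_steps=50000, max_ep_steps=max_ep_steps,
                          device=device, n_test_runs=5,
                          log_file='logs/keyboard_w%d.pkl' % i)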
Example #4
def keyboard_player(env, W, Q, alpha, eps, gamma, training_steps, batch_size,
                    pretrained_agent, max_ep_steps, device, test_interval,
                    n_test_runs, log_file, log_dir):
    """
        env: Environment
        W: Set of candidate weight vectors over the cumulants
        Q: Q-functions over all cumulants and options
        alpha: learning rate
        eps: Exploration parameter over weight vector w
        gamma: Discount factor
        training_steps: Number of steps for which agent is to be trained
        batch_size: Batch size for updating Q-values
        pretrained_agent: Path to pretrained agent model
        max_ep_steps: Maximum number of steps in an episode
        device: cpu or gpu
        test_interval: Number of steps after which agent is tested
        n_test_runs: Number of episodes for which performance is tested
        log_file: File to store episode return logs
        log_dir: Directory where logs and intermediate models are saved
    """

    n = W.shape[0]
    Q_w = MlpDiscrete(input_dim=env.observation_space.shape[0],
                      output_dim=n,
                      hidden=[64, 128])
    Q_w.to(device)
    optimizer = Adam(Q_w.parameters(), lr=alpha)

    s = env.reset()
    s = torch.from_numpy(s).float().to(device)
    n_steps = 0
    best_avg_return = -float('inf')  # so the first evaluation becomes the best
    q_loss = 0
    n_items_batch = 0
    done = False

    writer = {}
    writer['writer'] = SummaryWriter(os.path.join(log_dir, 'runs'))

    # Load pretrained agent, if available
    if pretrained_agent:
        checkpoint = torch.load(os.path.join(pretrained_agent, 'agent.pt'))
        n_steps = checkpoint['steps']
        Q_w.load_state_dict(checkpoint['Q'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        best_avg_return = checkpoint['best_avg_return']

    # Start learning
    while n_steps < training_steps:
        if done:
            s = torch.from_numpy(env.reset()).float().to(device)
            done = False

        q = Q_w(s)

        # Epsilon-greedy exploration
        if np.random.binomial(1, eps):
            w_index = torch.tensor(np.random.randint(n)).to(device)
        else:
            w_index = torch.argmax(q)

        w = W[w_index]

        (s_next, done, r_prime, gamma_prime, n_steps,
         _) = option_keyboard(env, s, w, Q, gamma, n_steps, max_ep_steps,
                              device)

        s_next = torch.from_numpy(s_next).float().to(device)

        q_next = Q_w(s_next).detach()

        # SMDP-style TD target: r_prime and gamma_prime come from option_keyboard
        # and presumably hold the reward and discount accumulated while the chosen
        # option ran; q_next is detached, so this is a semi-gradient update on Q_w.
        td_error = r_prime + gamma_prime * q_next.max() - q[w_index]
        q_loss += 0.5 * (td_error**2)
        n_items_batch += 1

        # Update the networks
        if n_items_batch == batch_size:
            optimizer.zero_grad()
            writer['writer'].add_scalar('agent/Q', q_loss.item(), n_steps + 1)
            q_loss.backward()
            optimizer.step()
            q_loss = 0
            n_items_batch = 0

        s = s_next

        # Test the agent at intermediate time steps and save current and best
        # models
        if n_steps % test_interval == 0:
            ep_returns = test_agent(env, W, Q_w, Q, gamma, n_steps,
                                    max_ep_steps, device, n_test_runs,
                                    log_file)
            writer['writer'].add_scalar('episode_returns/Agent',
                                        sum(ep_returns) / n_test_runs, n_steps)

            torch.save(
                {
                    'steps': n_steps,
                    'Q': Q_w.state_dict(),
                    'optimizer': optimizer.state_dict(),
                    'best_avg_return': best_avg_return
                }, os.path.join(log_dir, 'saved_models', 'agent.pt'))

            if sum(ep_returns) / n_test_runs > best_avg_return:
                best_avg_return = sum(ep_returns) / n_test_runs
                torch.save(
                    {
                        'steps': n_steps,
                        'Q': Q_w.state_dict(),
                        'optimizer': optimizer.state_dict(),
                        'best_avg_return': best_avg_return
                    }, os.path.join(log_dir, 'saved_models', 'best',
                                    'agent.pt'))

    return Q_w
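
A minimal sketch of how keyboard_player might be launched once the per-cumulant Q-functions have been trained. Q is assumed to be the list of trained ValueFunction objects built as in Example #2, the environment id is hypothetical, and every hyperparameter value and path is an assumption rather than a setting taken from the repository:

import os
from itertools import product

import gym
import numpy as np
import torch

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
env = gym.make('Foraging-v0')  # hypothetical environment id
W = np.array([x for x in product([-1, 0, 1], repeat=2)
              if sum(x) >= 0 and x != (0, 0)])
Q = value_fns  # assumed: trained per-cumulant ValueFunction objects

log_dir = 'logs/run1'
os.makedirs(os.path.join(log_dir, 'saved_models', 'best'), exist_ok=True)

Q_w = keyboard_player(env, W, Q, alpha=1e-4, eps=0.1, gamma=0.99,
                      training_steps=500000, batch_size=32,
                      pretrained_agent=None, max_ep_steps=300,
                      device=device, test_interval=10000, n_test_runs=10,
                      log_file=os.path.join(log_dir, 'agent_test.pkl'),
                      log_dir=log_dir)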