def test_init_replay_memory(env):
    _, env = env
    replay_memory_size = 100
    replay_start_size = 100
    history_length = 4
    first_index = 3
    replay_memory = init_replay_memory(env,
                                       history_length=history_length,
                                       replay_memory_size=replay_memory_size,
                                       replay_start_size=replay_start_size,
                                       input_as_images=True,
                                       preprocess_fn=preprocess,
                                       print_info=False)
    assert len(replay_memory) == replay_start_size
    assert replay_memory[first_index][0].shape == torch.Size(
        [1, 84, 84, history_length])
    assert replay_memory[first_index][1].shape == torch.Size([1])
    assert replay_memory[first_index][2].shape == torch.Size([1])
    assert replay_memory[first_index][4].shape == torch.Size([1])
    len_vector = replay_start_size // 2
    assert replay_memory[first_index:len_vector][0].shape == torch.Size(
        [len_vector - first_index, 84, 84, history_length])
    assert replay_memory[first_index:len_vector][1].shape == torch.Size(
        [len_vector - first_index])
    assert replay_memory[first_index:len_vector][2].shape == torch.Size(
        [len_vector - first_index])
    assert replay_memory[first_index:len_vector][4].shape == torch.Size(
        [len_vector - first_index])
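For reference, a minimal sketch of what the env fixture unpacked on the first line (_, env = env) might look like: the tuple suggests it yields a (name, environment) pair. The environment id below is a placeholder, not the project's actual conftest.

import gym
import pytest


@pytest.fixture
def env():
    #hypothetical fixture: yields (env_name, environment); the id is a placeholder
    env_name = 'PongNoFrameskip-v4'
    environment = gym.make(env_name)
    yield env_name, environment
    environment.close()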
Example 2
def test_init_replay_memory(env):
    _, env = env
    replay_memory_size = 100
    replay_start_size = 10
    replay_memory = init_replay_memory(env,
                                       replay_memory_size=replay_memory_size,
                                       replay_start_size=replay_start_size,
                                       print_info=False)
    assert len(replay_memory) == replay_start_size
Example 3
def replay_memory():
    '''
    Generate a filled replay_memory
    '''
    nb_timesteps = pytest.agent_history_length
    nb_actions = pytest.nb_actions
    env = gym.make(pytest.env_name)
    env = KFrames(env, history_length=nb_timesteps)
    replay_memory = init_replay_memory(env, replay_memory_size=100, replay_start_size=100, preprocess_fn=preprocess, print_info=False)
    return nb_actions, nb_timesteps, replay_memory
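The fixture above reads pytest.env_name, pytest.nb_actions, and pytest.agent_history_length from the global pytest namespace. A minimal conftest.py sketch that would provide them is shown below; the concrete values are placeholders, not the project's actual configuration.

#conftest.py (sketch): attach shared constants to the pytest namespace
#so fixtures such as replay_memory() can read them; values are placeholders
import pytest


def pytest_configure(config):
    pytest.env_name = 'PongNoFrameskip-v4'
    pytest.nb_actions = 6
    pytest.agent_history_length = 4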
Example 4
def replay_memory():
    '''
    Generate a filled replay_memory
    '''
    nb_timesteps = pytest.agent_history_length
    env = gym.make(pytest.env_name)
    nb_actions = env.action_space.n
    env = SkipFrames(env, skip_frames=nb_timesteps - 1)
    replay_memory = init_replay_memory(env,
                                       replay_memory_size=100,
                                       replay_start_size=100,
                                       input_as_images=True,
                                       preprocess_fn=preprocess,
                                       print_info=False)
    return nb_actions, nb_timesteps, replay_memory
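For completeness, a hedged sketch of how a test might consume this fixture; the sample(batch_size) call and the five-element transition layout are assumptions borrowed from Example 6, not a documented API.

def test_replay_memory_sample(replay_memory):
    #hypothetical test: unpack the fixture and draw a batch
    nb_actions, nb_timesteps, memory = replay_memory
    assert len(memory) == 100  #replay_start_size used in the fixture
    batch = memory.sample(32)  #assumed Memory.sample(batch_size) API
    assert len(batch) == 5  #(phi_t, a_t, r_t, phi_t_1, done)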
Example 5
def train_deepq(name,
                env,
                nb_actions,
                Q_network,
                preprocess_fn=None,
                batch_size=32,
                replay_start_size=50000,
                replay_memory_size=50000,
                agent_history_length=4,
                target_network_update_frequency=10000,
                discount_factor=0.99,
                learning_rate=1e-5,
                update_frequency=4,
                initial_exploration=1,
                final_exploration=0.1,
                final_exploration_step=int(1e6),
                nb_timesteps=int(1e7),
                tensorboard_freq=50,
                demo_tensorboard=False):

    #SAVE/LOAD MODEL
    DIRECTORY_MODELS = './models/'
    if not os.path.exists(DIRECTORY_MODELS):
        os.makedirs(DIRECTORY_MODELS)
    PATH_SAVE = DIRECTORY_MODELS + name + '_' + time.strftime('%Y%m%d-%H%M')

    #GPU/CPU
    device = torch.device(
        'cuda') if torch.cuda.is_available() else torch.device('cpu')
    print('RUNNING ON', device)

    #TENSORBOARDX
    writer = SummaryWriter(comment=name)

    replay_memory = init_replay_memory(env, replay_memory_size,
                                       replay_start_size, preprocess_fn)

    print('#### TRAINING ####')
    print('see more details on tensorboard')

    done = True  #reset environment
    eps_schedule = ScheduleExploration(initial_exploration, final_exploration,
                                       final_exploration_step)
    Q_network = Q_network.to(device)
    Q_hat = copy.deepcopy(Q_network).to(device)
    loss = SmoothL1Loss()
    optimizer = RMSprop(Q_network.parameters(),
                        lr=learning_rate,
                        alpha=0.95,
                        eps=0.01,
                        centered=True)

    episode = 1
    rewards_episode, total_reward_per_episode = list(), list()
    for timestep in tqdm(range(nb_timesteps)):  #tqdm
        #if an episode has ended
        if done:
            total_reward_per_episode.append(np.sum(rewards_episode))
            rewards_episode = list()
            phi_t = env.reset()
            if preprocess_fn:
                phi_t = preprocess_fn(phi_t)

            if (episode % tensorboard_freq == 0):
                assert len(total_reward_per_episode) == tensorboard_freq
                #tensorboard
                writer.add_scalar('rewards/train_reward',
                                  np.mean(total_reward_per_episode), episode)
                total_reward_per_episode = list()
                writer.add_scalar('other/replay_memory_size',
                                  len(replay_memory), episode)
                writer.add_scalar('other/eps_exploration',
                                  eps_schedule.get_eps(), episode)
                if demo_tensorboard:
                    demos, demo_rewards = play(env,
                                               Q_network,
                                               preprocess_fn,
                                               nb_episodes=1,
                                               eps=eps_schedule.get_eps())
                    writer.add_scalar('rewards/demo_reward',
                                      np.mean(demo_rewards), episode)
                    for demo in demos:
                        demo = demo.permute([3, 0, 1, 2]).unsqueeze(0)
                        writer.add_video(name, demo, episode, fps=25)

                #save model
                torch.save(Q_network.state_dict(), PATH_SAVE)

            episode += 1

        a_t = get_action(phi_t, env, Q_network, eps_schedule)

        phi_t_1, r_t, done, info = env.step(a_t)
        rewards_episode.append(r_t)
        if preprocess_fn:
            phi_t_1 = preprocess_fn(phi_t_1)
        replay_memory.push([phi_t, a_t, r_t, phi_t_1, done])
        phi_t = phi_t_1

        #training
        if timestep % update_frequency == 0:
            #get training data
            phi_t_training, actions_training, y = get_training_data(
                Q_hat, replay_memory, batch_size, discount_factor)

            #forward
            phi_t_training = phi_t_training.to(device)
            Q_values = Q_network(phi_t_training)
            mask = torch.zeros([batch_size, nb_actions]).to(device)
            for j in range(len(actions_training)):
                mask[j, actions_training[j]] = 1
            Q_values = Q_values * mask
            Q_values = torch.sum(Q_values, dim=1)
            output = loss(Q_values, y)

            #backward and gradient descent
            optimizer.zero_grad()
            output.backward()
            optimizer.step()

        if timestep % target_network_update_frequency == 0:
            Q_hat = copy.deepcopy(Q_network).to(device)
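A hedged sketch of how this version of train_deepq might be invoked; the environment id, the stand-in network, and the small sizes are placeholders for a quick smoke run, not the project's actual training setup.

import gym
import torch.nn as nn

env = gym.make('PongNoFrameskip-v4')
nb_actions = env.action_space.n
Q_network = nn.Sequential(  #toy stand-in for the project's DQN architecture
    nn.Flatten(), nn.Linear(84 * 84 * 4, 512), nn.ReLU(),
    nn.Linear(512, nb_actions))
train_deepq('pong', env, nb_actions, Q_network,
            preprocess_fn=preprocess,  #project's preprocessing, assumed imported
            replay_start_size=1000,
            replay_memory_size=10000,
            nb_timesteps=int(1e5))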
Example 6
def train_deepq(env_name,
                env,
                Q_network,
                input_as_images,
                double_Q=True,
                batch_size=64,
                replay_start_size=50000,
                replay_memory_size=int(1e6),
                agent_history_length=4,
                target_network_update_frequency=10000,
                discount_factor=0.99,
                update_frequency=4,
                eps_training=LinearScheduler(
                    steps=[(0, 1), (int(1e6), 0.1), (2 * int(1e6), 0.01)]),
                nb_timesteps=5 * int(1e7),
                tensorboard_freq=50000,
                first_demo=50000):

    nb_actions = env.action_space.n
    print('NB ACTIONS:', nb_actions)

    #SAVE/LOAD MODEL
    DIRECTORY_MODELS = './models/'
    if not os.path.exists(DIRECTORY_MODELS):
        os.makedirs(DIRECTORY_MODELS)
    PATH_SAVE = DIRECTORY_MODELS + env_name + '_' + time.strftime(
        '%Y%m%d-%H%M')

    #GPU/CPU
    device = torch.device(
        'cuda') if torch.cuda.is_available() else torch.device('cpu')
    print('RUNNING ON', device)
    print('Number of trainable parameters:',
          torch.nn.utils.parameters_to_vector(Q_network.parameters()).shape[0])

    #TENSORBOARDX
    writer = SummaryWriter(comment=env_name)

    replay_memory = init_replay_memory(env, agent_history_length,
                                       replay_memory_size, replay_start_size,
                                       input_as_images)

    print('#### TRAINING ####')
    print('see more details on tensorboard')

    done = True  #reset environment
    Q_network = Q_network.to(device)
    Q_hat = copy.deepcopy(Q_network).to(device)
    optimizer = RMSprop(Q_network.parameters(),
                        lr=0.00025,
                        momentum=0,
                        alpha=0.95,
                        eps=0.01,
                        centered=True)

    rewards_episode, total_reward_per_episode = list(), list()
    total_gradient_norm, total_bs = list(), list()

    for timestep in tqdm(range(nb_timesteps)):

        #if an episode has ended
        if done:
            total_reward_per_episode.append(np.sum(rewards_episode))
            rewards_episode = list()
            #reset the environment
            phi_t = env.reset()
            last_episodes = Memory(agent_history_length)
            while len(last_episodes.replay_memory) < agent_history_length:
                last_episodes.push(phi_t)

        #choose action
        observations = torch.stack(
            last_episodes[0:agent_history_length]).to(device)
        if input_as_images:
            observations = observations.unsqueeze(0).permute(0, 2, 3, 1)
        a_t = eps_greedy_action(observations, env, Q_network, eps_training)

        #interact with the environment
        phi_t_1, r_t, done, info = env.step(a_t)

        #for tensorboard
        rewards_episode.append(r_t)

        #store in memory
        replay_memory.push([phi_t, a_t, r_t, done])
        phi_t = phi_t_1
        last_episodes.push(phi_t)

        #training
        if ((timestep + 1) % update_frequency) == 0:
            #get training data
            phi_t_training, actions_training, r, phi_t_1_training, episode_terminates = replay_memory.sample(
                batch_size)
            phi_t_training = phi_t_training.to(device)
            actions_training = actions_training.to(device)
            r = r.to(device)
            phi_t_1_training = phi_t_1_training.to(device)
            episode_terminates = episode_terminates.to(device)

            # clip reward
            r = r.clamp(-1, 1)

            #error
            if double_Q:
                temp = torch.max(Q_network(phi_t_1_training), dim=1)[1]
                Q_hat_values = Q_hat(phi_t_1_training)[
                    torch.arange(temp.shape[0]), temp]
            else:
                Q_hat_values = torch.max(Q_hat(phi_t_1_training), dim=1)[0]

            delta = r + (1 - episode_terminates) * discount_factor * Q_hat_values
            #we don't want to compute gradients on target variables
            delta = delta.detach()

            Q_values = Q_network(phi_t_training)

            mask = torch.zeros([*episode_terminates.shape,
                                nb_actions]).to(device)
            mask.scatter_(1, actions_training.unsqueeze(1), 1.0)
            Q_values = Q_values * mask
            delta = torch.sum(Q_values, dim=1) - delta
            clipped_delta = delta.clamp(-1, 1)
            targets = torch.zeros([*episode_terminates.shape,
                                   nb_actions]).to(device)
            targets.masked_scatter_(mask.byte(), clipped_delta)

            #backward and gradient descent
            optimizer.zero_grad()
            Q_values.backward(targets.data)
            optimizer.step()

            #tensorboard
            gradient_norm = 0
            for p in Q_network.parameters():
                gradient_norm += torch.sum(p.grad.data**2)
            gradient_norm = np.sqrt(gradient_norm.cpu())
            total_gradient_norm.append(gradient_norm)
            total_bs.append(phi_t_training.shape[0])

        if timestep % target_network_update_frequency == 0:
            Q_hat = copy.deepcopy(Q_network).to(device)

        #tensorboard and save model
        if timestep % tensorboard_freq == 0:
            scalars = {
                '2_other/replay_memory_size': len(replay_memory),
                '2_other/eps_exploration': eps_training.get_eps(),
            }
            if timestep > 0:
                scalars['0_rewards/mean_train_reward'] = np.mean(
                    total_reward_per_episode)
                scalars['1_gradient/mean_gradient_norm'] = np.mean(
                    total_gradient_norm)
                scalars['2_other/min_bs'] = np.min(total_bs)
                scalars['2_other/mean_bs'] = np.mean(total_bs)
            if input_as_images and (timestep >= first_demo):
                demo, demo_rewards = play_atari(env_name,
                                                agent_history_length,
                                                Q_network,
                                                nb_episodes=100,
                                                eps=float(
                                                    eps_training.get_eps()))
                scalars['0_rewards/demo_reward'] = np.mean(demo_rewards)
            else:
                demo = None
            write_to_tensorboard(env_name, writer, timestep, scalars,
                                 Q_network, demo)
            total_reward_per_episode = list()
            total_gradient_norm, total_bs = list(), list()

            #save model
            torch.save(Q_network.state_dict(), PATH_SAVE)
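The update in this last example clips the TD error and injects it directly as the output gradient via Q_values.backward(targets.data). For readers more used to an explicit loss, a roughly equivalent formulation with SmoothL1Loss (Huber loss, whose gradient is likewise clipped to [-1, 1]) is sketched below; tensor and module names mirror the snippet above and are otherwise assumptions.

#sketch: explicit-loss version of the clipped-delta update above
#assumes phi_t_training, actions_training, r, phi_t_1_training,
#episode_terminates, Q_network, Q_hat, optimizer and discount_factor
#already exist as in the training loop
import torch
from torch.nn import SmoothL1Loss

huber = SmoothL1Loss()

with torch.no_grad():
    q_next = torch.max(Q_hat(phi_t_1_training), dim=1)[0]  #vanilla DQN target
    target = r + (1 - episode_terminates) * discount_factor * q_next

q_taken = Q_network(phi_t_training).gather(
    1, actions_training.unsqueeze(1)).squeeze(1)  #Q(s, a) for the taken actions

output = huber(q_taken, target)  #Huber loss clips the TD-error gradient to [-1, 1]
optimizer.zero_grad()
output.backward()
optimizer.step()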