def test_init_replay_memory(env):
    _, env = env
    replay_memory_size = 100
    replay_start_size = 100
    history_length = 4
    first_index = 3
    replay_memory = init_replay_memory(env,
                                       history_length=history_length,
                                       replay_memory_size=replay_memory_size,
                                       replay_start_size=replay_start_size,
                                       input_as_images=True,
                                       preprocess_fn=preprocess,
                                       print_info=False)
    assert len(replay_memory) == replay_start_size

    # single transition: (state, action, reward, next_state, done)
    assert replay_memory[first_index][0].shape == torch.Size(
        [1, 84, 84, history_length])
    assert replay_memory[first_index][1].shape == torch.Size([1])
    assert replay_memory[first_index][2].shape == torch.Size([1])
    assert replay_memory[first_index][4].shape == torch.Size([1])

    # slice of transitions
    len_vector = replay_start_size // 2
    assert replay_memory[first_index:len_vector][0].shape == torch.Size(
        [len_vector - first_index, 84, 84, history_length])
    assert replay_memory[first_index:len_vector][1].shape == torch.Size(
        [len_vector - first_index])
    assert replay_memory[first_index:len_vector][2].shape == torch.Size(
        [len_vector - first_index])
    assert replay_memory[first_index:len_vector][4].shape == torch.Size(
        [len_vector - first_index])

def test_init_replay_memory(env):
    _, env = env
    replay_memory_size = 100
    replay_start_size = 10
    replay_memory = init_replay_memory(env,
                                       replay_memory_size=replay_memory_size,
                                       replay_start_size=replay_start_size,
                                       print_info=False)
    assert len(replay_memory) == replay_start_size

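# Hedged sketch (an assumption, not part of the original test suite): the two
# tests above unpack their `env` argument as `_, env = env`, so they appear to
# depend on a pytest fixture returning a 2-tuple whose second element is the
# wrapped environment. A minimal fixture of that shape could look like this;
# `pytest.env_name`, `pytest.agent_history_length` and SkipFrames are taken
# from the surrounding code, everything else is hypothetical.
@pytest.fixture
def env():
    raw_env = gym.make(pytest.env_name)
    wrapped_env = SkipFrames(raw_env,
                             skip_frames=pytest.agent_history_length - 1)
    return raw_env, wrapped_env
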
def replay_memory():
    '''
    Generate a filled replay_memory
    '''
    nb_timesteps = pytest.agent_history_length
    nb_actions = pytest.nb_actions
    env = gym.make(pytest.env_name)
    env = KFrames(env, history_length=nb_timesteps)
    replay_memory = init_replay_memory(env,
                                       replay_memory_size=100,
                                       replay_start_size=100,
                                       preprocess_fn=preprocess,
                                       print_info=False)
    return nb_actions, nb_timesteps, replay_memory

def replay_memory():
    '''
    Generate a filled replay_memory
    '''
    nb_timesteps = pytest.agent_history_length
    env = gym.make(pytest.env_name)
    nb_actions = env.action_space.n
    env = SkipFrames(env, skip_frames=nb_timesteps - 1)
    replay_memory = init_replay_memory(env,
                                       replay_memory_size=100,
                                       replay_start_size=100,
                                       input_as_images=True,
                                       preprocess_fn=preprocess,
                                       print_info=False)
    return nb_actions, nb_timesteps, replay_memory

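# Hedged illustration (an assumption, not original code): how a test might
# consume the replay_memory fixture above. The 5-tuple returned by
# replay_memory.sample() is taken from the training loop below; the batch-first
# shape check is an assumption.
def test_replay_memory_sample(replay_memory):
    nb_actions, nb_timesteps, memory = replay_memory
    phi_t, a_t, r_t, phi_t_1, done = memory.sample(32)
    assert phi_t.shape[0] == 32
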
def train_deepq(name,
                env,
                nb_actions,
                Q_network,
                preprocess_fn=None,
                batch_size=32,
                replay_start_size=50000,
                replay_memory_size=50000,
                agent_history_length=4,
                target_network_update_frequency=10000,
                discount_factor=0.99,
                learning_rate=1e-5,
                update_frequency=4,
                initial_exploration=1,
                final_exploration=0.1,
                final_exploration_step=int(1e6),
                nb_timesteps=int(1e7),
                tensorboard_freq=50,
                demo_tensorboard=False):
    #SAVE/LOAD MODEL
    DIRECTORY_MODELS = './models/'
    if not os.path.exists(DIRECTORY_MODELS):
        os.makedirs(DIRECTORY_MODELS)
    PATH_SAVE = DIRECTORY_MODELS + name + '_' + time.strftime('%Y%m%d-%H%M')

    #GPU/CPU
    device = torch.device(
        'cuda') if torch.cuda.is_available() else torch.device('cpu')
    print('RUNNING ON', device)

    #TENSORBOARDX
    writer = SummaryWriter(comment=name)

    replay_memory = init_replay_memory(env, replay_memory_size,
                                       replay_start_size, preprocess_fn)

    print('#### TRAINING ####')
    print('see more details on tensorboard')

    done = True  #reset environment
    eps_schedule = ScheduleExploration(initial_exploration, final_exploration,
                                       final_exploration_step)

    Q_network = Q_network.to(device)
    Q_hat = copy.deepcopy(Q_network).to(device)
    loss = SmoothL1Loss()
    optimizer = RMSprop(Q_network.parameters(),
                        lr=learning_rate,
                        alpha=0.95,
                        eps=0.01,
                        centered=True)

    episode = 1
    rewards_episode, total_reward_per_episode = list(), list()
    for timestep in tqdm(range(nb_timesteps)):
        #if an episode is ended
        if done:
            total_reward_per_episode.append(np.sum(rewards_episode))
            rewards_episode = list()

            phi_t = env.reset()
            if preprocess_fn:
                phi_t = preprocess_fn(phi_t)

            if (episode % tensorboard_freq == 0):
                assert len(total_reward_per_episode) == tensorboard_freq
                #tensorboard
                writer.add_scalar('rewards/train_reward',
                                  np.mean(total_reward_per_episode), episode)
                total_reward_per_episode = list()
                writer.add_scalar('other/replay_memory_size',
                                  len(replay_memory), episode)
                writer.add_scalar('other/eps_exploration',
                                  eps_schedule.get_eps(), episode)
                if demo_tensorboard:
                    demos, demo_rewards = play(env,
                                               Q_network,
                                               preprocess_fn,
                                               nb_episodes=1,
                                               eps=eps_schedule.get_eps())
                    writer.add_scalar('rewards/demo_reward',
                                      np.mean(demo_rewards), episode)
                    for demo in demos:
                        demo = demo.permute([3, 0, 1, 2]).unsqueeze(0)
                        writer.add_video(name, demo, episode, fps=25)

                #save model
                torch.save(Q_network.state_dict(), PATH_SAVE)

            episode += 1

        a_t = get_action(phi_t, env, Q_network, eps_schedule)
        phi_t_1, r_t, done, info = env.step(a_t)
        rewards_episode.append(r_t)
        if preprocess_fn:
            phi_t_1 = preprocess_fn(phi_t_1)
        replay_memory.push([phi_t, a_t, r_t, phi_t_1, done])
        phi_t = phi_t_1

        #training
        if timestep % update_frequency == 0:
            #get training data
            phi_t_training, actions_training, y = get_training_data(
                Q_hat, replay_memory, batch_size, discount_factor)

            #forward
            phi_t_training = phi_t_training.to(device)
            Q_values = Q_network(phi_t_training)
            mask = torch.zeros([batch_size, nb_actions]).to(device)
            for j in range(len(actions_training)):
                mask[j, actions_training[j]] = 1
            Q_values = Q_values * mask
            Q_values = torch.sum(Q_values, dim=1)
            output = loss(Q_values, y)

            #backward and gradient descent
            optimizer.zero_grad()
            output.backward()
            optimizer.step()

        if timestep % target_network_update_frequency == 0:
            Q_hat = copy.deepcopy(Q_network).to(device)

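# Hedged usage sketch for train_deepq above (an assumption, not original code):
# only the call signature is taken from the definition; the environment id and
# `MyQNetwork` are hypothetical placeholders, `preprocess` is the preprocessing
# function used in the tests.
if __name__ == '__main__':
    env = gym.make('BreakoutDeterministic-v4')
    nb_actions = env.action_space.n
    Q_network = MyQNetwork(nb_actions)  # hypothetical torch.nn.Module
    train_deepq('breakout',
                env,
                nb_actions,
                Q_network,
                preprocess_fn=preprocess,
                batch_size=32,
                nb_timesteps=int(1e7))
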
def train_deepq(env_name,
                env,
                Q_network,
                input_as_images,
                double_Q=True,
                batch_size=64,
                replay_start_size=50000,
                replay_memory_size=int(1e6),
                agent_history_length=4,
                target_network_update_frequency=10000,
                discount_factor=0.99,
                update_frequency=4,
                eps_training=LinearScheduler(
                    steps=[(0, 1), (int(1e6), 0.1), (2 * int(1e6), 0.01)]),
                nb_timesteps=5 * int(1e7),
                tensorboard_freq=50000,
                first_demo=50000):
    nb_actions = env.action_space.n
    print('NB ACTIONS:', nb_actions)

    #SAVE/LOAD MODEL
    DIRECTORY_MODELS = './models/'
    if not os.path.exists(DIRECTORY_MODELS):
        os.makedirs(DIRECTORY_MODELS)
    PATH_SAVE = DIRECTORY_MODELS + env_name + '_' + time.strftime(
        '%Y%m%d-%H%M')

    #GPU/CPU
    device = torch.device(
        'cuda') if torch.cuda.is_available() else torch.device('cpu')
    print('RUNNING ON', device)
    print('Number of trainable parameters:',
          torch.nn.utils.parameters_to_vector(Q_network.parameters()).shape[0])

    #TENSORBOARDX
    writer = SummaryWriter(comment=env_name)

    replay_memory = init_replay_memory(env, agent_history_length,
                                       replay_memory_size, replay_start_size,
                                       input_as_images)

    print('#### TRAINING ####')
    print('see more details on tensorboard')

    done = True  #reset environment
    Q_network = Q_network.to(device)
    Q_hat = copy.deepcopy(Q_network).to(device)
    optimizer = RMSprop(Q_network.parameters(),
                        lr=0.00025,
                        momentum=0,
                        alpha=0.95,
                        eps=0.01,
                        centered=True)

    rewards_episode, total_reward_per_episode = list(), list()
    total_gradient_norm, total_bs = list(), list()
    for timestep in tqdm(range(nb_timesteps)):
        #if an episode is ended
        if done:
            total_reward_per_episode.append(np.sum(rewards_episode))
            rewards_episode = list()

            #reset the environment
            phi_t = env.reset()
            last_episodes = Memory(agent_history_length)
            while len(last_episodes.replay_memory) < agent_history_length:
                last_episodes.push(phi_t)

        #choose action
        observations = torch.stack(
            last_episodes[0:agent_history_length]).to(device)
        if input_as_images:
            observations = observations.unsqueeze(0).permute(0, 2, 3, 1)
        a_t = eps_greedy_action(observations, env, Q_network, eps_training)

        #interact with the environment
        phi_t_1, r_t, done, info = env.step(a_t)

        #for tensorboard
        rewards_episode.append(r_t)

        #store in memory
        replay_memory.push([phi_t, a_t, r_t, done])
        phi_t = phi_t_1
        last_episodes.push(phi_t)

        #training
        if ((timestep + 1) % update_frequency) == 0:
            #get training data
            phi_t_training, actions_training, r, phi_t_1_training, \
                episode_terminates = replay_memory.sample(batch_size)
            phi_t_training = phi_t_training.to(device)
            actions_training = actions_training.to(device)
            r = r.to(device)
            phi_t_1_training = phi_t_1_training.to(device)
            episode_terminates = episode_terminates.to(device)

            # clip reward
            r = r.clamp(-1, 1)

            #error
            if double_Q:
                # Double Q-learning: select the argmax action with Q_network,
                # evaluate it with the target network Q_hat
                temp = torch.max(Q_network(phi_t_1_training), dim=1)[1]
                Q_hat_values = Q_hat(phi_t_1_training)[
                    torch.arange(temp.shape[0]), temp]
            else:
                Q_hat_values = torch.max(Q_hat(phi_t_1_training), dim=1)[0]
            delta = r + (1 -
                         episode_terminates) * discount_factor * Q_hat_values
            delta = delta.detach(
            )  #we don't want to compute gradients on target variables

            Q_values = Q_network(phi_t_training)
            mask = torch.zeros([*episode_terminates.shape,
                                nb_actions]).to(device)
            mask.scatter_(1, actions_training.unsqueeze(1), 1.0)
            Q_values = Q_values * mask

            delta = torch.sum(Q_values, dim=1) - delta
            clipped_delta = delta.clamp(-1, 1)
            targets = torch.zeros([*episode_terminates.shape,
                                   nb_actions]).to(device)
            targets.masked_scatter_(mask.byte(), clipped_delta)

            #backward and gradient descent
            optimizer.zero_grad()
            # backpropagate the clipped TD error directly as the gradient of
            # Q_values (error clipping as in the DQN paper)
            Q_values.backward(targets.data)
            optimizer.step()

            #tensorboard
            gradient_norm = 0
            for p in Q_network.parameters():
                gradient_norm += torch.sum(p.grad.data**2)
            gradient_norm = np.sqrt(gradient_norm.cpu())
            total_gradient_norm.append(gradient_norm)
            total_bs.append(phi_t_training.shape[0])

        if timestep % target_network_update_frequency == 0:
            Q_hat = copy.deepcopy(Q_network).to(device)

        #tensorboard and save model
        if timestep % tensorboard_freq == 0:
            scalars = {
                '2_other/replay_memory_size': len(replay_memory),
                '2_other/eps_exploration': eps_training.get_eps(),
            }
            if timestep > 0:
                scalars['0_rewards/mean_train_reward'] = np.mean(
                    total_reward_per_episode)
                scalars['1_gradient/mean_gradient_norm'] = np.mean(
                    total_gradient_norm)
                scalars['2_other/min_bs'] = np.min(total_bs)
                scalars['2_other/mean_bs'] = np.mean(total_bs)
            if input_as_images and (timestep >= first_demo):
                demo, demo_rewards = play_atari(env_name,
                                                agent_history_length,
                                                Q_network,
                                                nb_episodes=100,
                                                eps=float(
                                                    eps_training.get_eps()))
                scalars['0_rewards/demo_reward'] = np.mean(demo_rewards)
            else:
                demo = None
            write_to_tensorboard(env_name, writer, timestep, scalars,
                                 Q_network, demo)
            total_reward_per_episode, total_gradient_norm, total_bs = \
                list(), list(), list()

            #save model
            torch.save(Q_network.state_dict(), PATH_SAVE)

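# Hedged usage sketch for the second train_deepq variant (an assumption, not
# original code): SkipFrames and the keyword arguments come from this file and
# the test fixtures; the environment id and `AtariCNN` Q-network class are
# hypothetical placeholders.
if __name__ == '__main__':
    env_name = 'BreakoutDeterministic-v4'
    agent_history_length = 4
    env = gym.make(env_name)
    env = SkipFrames(env, skip_frames=agent_history_length - 1)
    Q_network = AtariCNN(env.action_space.n)  # hypothetical torch.nn.Module
    train_deepq(env_name,
                env,
                Q_network,
                input_as_images=True,
                double_Q=True,
                agent_history_length=agent_history_length)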