def duel_rank_train(env, exploreScheduler, betaScheduler, optimizer_constructor, model_type,
                    batch_size, rp_start, rp_size, exp_frame, exp_initial, exp_final, prob_alpha,
                    gamma, target_update_steps, frames_per_epoch, frames_per_state,
                    output_directory, last_checkpoint, max_frames, envo):
    """
    Implementation of the training algorithm for the Dueling Network Architecture using
    rank-based prioritization. Details of the algorithm can be found in the paper
    "Dueling Network Architectures for Deep Reinforcement Learning" by Ziyu Wang et al.
    Refer to section 4.2 in the paper for more implementation info.
    """
    gym.undo_logger_setup()
    logging.basicConfig(filename=envo + '_duel_rank_training.log', level=logging.INFO)
    num_actions = env.action_space.n
    env.reset()

    print('No. of actions: ', num_actions)
    print(env.unwrapped.get_action_meanings())

    # initialize action-value and target networks with the same weights
    model = DUEL(num_actions)
    target = DUEL(num_actions)

    if use_cuda:
        model.cuda()
        target.cuda()

    frames_count = 1

    if last_checkpoint:
        model.load_state_dict(torch.load(last_checkpoint))
        print(last_checkpoint)
        print('weights loaded...')

        exp_replay = util.initialize_rank_replay_resume(env, rp_start, rp_size, frames_per_state,
                                                        model, target, gamma, batch_size)
        frames_count = get_index_from_checkpoint_path(last_checkpoint)
    else:
        exp_replay = util.initialize_rank_replay(env, rp_start, rp_size, frames_per_state,
                                                 model, target, gamma, prob_alpha)

    target.load_state_dict(model.state_dict())

    optimizer = optimizer_constructor.type(model.parameters(),
                                           lr=optimizer_constructor.kwargs['lr'],
                                           alpha=optimizer_constructor.kwargs['alpha'],
                                           eps=optimizer_constructor.kwargs['eps'])

    episodes_count = 1
    frames_per_episode = 1
    episodes_durations = []
    rewards_per_episode = 0
    rewards_duration = []
    loss_per_epoch = []
    wLoss_func = Weighted_Loss()

    current_state, _, _, _ = util.play_game(env, frames_per_state)

    print('Starting training...')

    # resumes from the checkpointed frame count when one was loaded (frames_count is 1 otherwise)
    for frames_count in range(frames_count, max_frames):

        epsilon = exploreScheduler.anneal_linear(frames_count)
        beta = betaScheduler.anneal_linear(frames_count)
        choice = random.uniform(0, 1)

        # epsilon-greedy action selection
        if choice <= epsilon:
            action = LongTensor([[random.randrange(num_actions)]])
        else:
            action = util.get_greedy_action(model, current_state)

        curr_obs, reward, done, _ = util.play_game(env, frames_per_state, action[0][0])

        rewards_per_episode += reward
        reward = Tensor([[reward]])

        current_state_ex = Variable(current_state, volatile=True)
        curr_obs_ex = Variable(curr_obs, volatile=True)
        action_ex = Variable(action, volatile=True)
        reward_ex = Variable(reward, volatile=True)

        # compute the td-error for the new transition and store it with priority (|delta| + eps)^alpha
        td_error = duel_compute_td_error(batch_size=1, state_batch=current_state_ex,
                                         reward_batch=reward_ex, action_batch=action_ex,
                                         next_state_batch=curr_obs_ex, model=model,
                                         target=target, gamma=gamma)
        td_error = torch.pow(torch.abs(td_error) + 1e-6, prob_alpha)
        exp_replay.push(current_state, action, reward, curr_obs, td_error)
        current_state = curr_obs

        # compute y
        if len(exp_replay) >= batch_size:
            # get batch samples
            obs_samples, obs_ranks, obs_priorityVals = exp_replay.sample(batch_size)
            num_samples_per_batch = len(obs_samples)

            # importance-sampling weights w_i = (N * P(i))^(-beta), normalised by the maximal weight
            obs_priorityTensor = torch.from_numpy(np.array(obs_priorityVals))
            p_batch = 1 / obs_priorityTensor
            w_batch = (1 / len(exp_replay) * p_batch) ** beta
            max_weight = exp_replay.get_max_weight(beta)
            w_batch /= max_weight
            w_batch = w_batch.type(Tensor)

            batch = Experience(*zip(*obs_samples))

            loss, new_weights = duel_compute_y(batch, num_samples_per_batch, model, target,
                                               gamma, w_batch, wLoss_func)
            loss_abs = torch.abs(new_weights)
            exp_replay.update(obs_ranks, loss_abs)

            currentLOSS = loss.data.cpu().numpy()[0]

            optimizer.zero_grad()
            loss.backward()

            grad_index = 0
            for param in model.parameters():
                # rescale the combined gradient entering the last convolutional layer by 1/sqrt(2)
                if grad_index == 4:
                    param.grad.data.mul_(1 / math.sqrt(2))

                # clip gradients to have their norm less than or equal to 10
                grad_norm = torch.norm(param.grad.data)
                if grad_norm > 10:
                    param.grad.data.div_(grad_norm).mul_(10)

                grad_index += 1

            optimizer.step()
            loss_per_epoch.append(loss.data.cpu().numpy()[0])

        frames_per_episode += frames_per_state

        if done:
            rewards_duration.append(rewards_per_episode)
            rewards_per_episode = 0
            frames_per_episode = 1
            episodes_count += 1
            env.reset()
            current_state, _, _, _ = util.play_game(env, frames_per_state)

            if episodes_count % 100 == 0:
                avg_episode_reward = sum(rewards_duration) / 100.0
                avg_reward_content = ('Episode from', episodes_count - 99, ' to ', episodes_count,
                                      ' has an average of ', avg_episode_reward,
                                      ' reward and loss of ', sum(loss_per_epoch))
                print(avg_reward_content)
                logging.info(avg_reward_content)
                rewards_duration = []
                loss_per_epoch = []

        # update weights of target network every TARGET_UPDATE_FREQ steps
        if frames_count % target_update_steps == 0:
            target.load_state_dict(model.state_dict())
            # print('weights updated at frame no. ', frames_count)

        # sort the replay memory every half of its capacity iterations
        if frames_count % int(rp_size / 2) == 0:
            exp_replay.sort()

        # save weights every 250k frames
        if frames_count % 250000 == 0:
            util.make_sure_path_exists(output_directory + '/' + envo + '/')
            torch.save(model.state_dict(),
                       output_directory + envo + '/rank_duel_' + str(frames_count) + '.pth')

        # print frame count for every 1000000 (one million) frames
        if frames_count % 1000000 == 0:
            training_update = 'frame count: ', frames_count, 'episode count: ', episodes_count, 'epsilon: ', epsilon
            print(training_update)
            logging.info(training_update)
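

# Illustrative sketch, not the DUEL module imported above: the dueling head cited in the
# docstring of duel_rank_train combines a state-value stream V(s) with an advantage stream
# A(s, a) as Q(s, a) = V(s) + (A(s, a) - mean_a A(s, a)) (Wang et al., section 3). The tensor
# shapes and the helper name are assumptions made for this sketch only.
def _dueling_aggregation_sketch(value, advantage):
    """Combine V(s) of shape (batch, 1) and A(s, a) of shape (batch, num_actions) into Q(s, a)."""
    # subtracting the mean advantage keeps the value/advantage decomposition identifiable
    return value + (advantage - advantage.mean(dim=1, keepdim=True))
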
def dqn_train(env, scheduler, optimizer_constructor, model_type, batch_size, rp_start, rp_size,
              exp_frame, exp_initial, exp_final, gamma, target_update_steps, frames_per_epoch,
              frames_per_state, output_directory, last_checkpoint, envo):

    gym.undo_logger_setup()
    logging.basicConfig(filename=envo + '_' + model_type + '_training.log', level=logging.INFO)
    num_actions = env.action_space.n

    print('No. of actions: ', num_actions)
    print(env.unwrapped.get_action_meanings())

    # initialize action-value and target networks with the same weights
    model = DQN(num_actions, use_bn=False)
    target = DQN(num_actions, use_bn=False)

    if use_cuda:
        model.cuda()
        target.cuda()

    exp_replay = None
    episodes_count = 1

    if last_checkpoint != '':
        model.load_state_dict(torch.load(last_checkpoint))
        exp_replay = util.initialize_replay_resume(env, rp_start, rp_size, frames_per_state, model)
        episodes_count = get_index_from_checkpoint_path(last_checkpoint)
    else:
        exp_replay = util.initialize_replay(env, rp_start, rp_size, frames_per_state)

    target.load_state_dict(model.state_dict())
    print('weights loaded...')

    optimizer = optimizer_constructor.type(model.parameters(),
                                           lr=optimizer_constructor.kwargs['lr'],
                                           alpha=optimizer_constructor.kwargs['alpha'],
                                           eps=optimizer_constructor.kwargs['eps'])

    frames_count = 1
    frames_per_episode = 1
    episodes_durations = []
    rewards_per_episode = 0
    rewards_duration = []
    loss_per_epoch = []

    env.reset()
    current_state, _, _, _ = util.play_game(env, frames_per_state)

    print('Starting training...')

    count = 0

    while True:
        epsilon = scheduler.anneal_linear(frames_count)
        choice = random.uniform(0, 1)

        # epsilon-greedy action selection
        if choice <= epsilon:
            action = LongTensor([[random.randrange(num_actions)]])
        else:
            action = util.get_greedy_action(model, current_state)

        curr_obs, reward, done, _ = util.play_game(env, frames_per_state, action[0][0])

        rewards_per_episode += reward
        reward = Tensor([reward])

        exp_replay.push(current_state, action, reward, curr_obs)
        current_state = curr_obs

        # sample a random mini-batch and unpack it into states, actions, rewards and next_states
        obs_sample = exp_replay.sample(batch_size)
        batch = Experience(*zip(*obs_sample))

        # compute y
        if len(exp_replay) >= batch_size:
            loss = dqn_compute_y(batch, batch_size, model, target, gamma)
            optimizer.zero_grad()
            loss.backward()

            # clip every gradient component to [-1, 1]
            for param in model.parameters():
                param.grad.data.clamp_(-1, 1)

            optimizer.step()
            loss_per_epoch.append(loss.data.cpu().numpy()[0])

        frames_count += 1
        frames_per_episode += frames_per_state

        if done:
            rewards_duration.append(rewards_per_episode)
            rewards_per_episode = 0
            frames_per_episode = 1
            episodes_count += 1
            env.reset()
            current_state, _, _, _ = util.play_game(env, frames_per_state)

            if episodes_count % 100 == 0:
                avg_episode_reward = sum(rewards_duration) / 100.0
                avg_reward_content = ('Episode from', episodes_count - 99, ' to ', episodes_count,
                                      ' has an average of ', avg_episode_reward,
                                      ' reward and loss of ', sum(loss_per_epoch))
                print(avg_reward_content)
                logging.info(avg_reward_content)
                rewards_duration = []
                loss_per_epoch = []

        # update weights of target network every TARGET_UPDATE_FREQ steps
        if frames_count % target_update_steps == 0:
            target.load_state_dict(model.state_dict())
            # print('weights updated at frame no. ', frames_count)

        # save weights every 250k frames
        if frames_count % 250000 == 0:
            util.make_sure_path_exists(output_directory + envo + '/' + model_type + '/')
            torch.save(model.state_dict(),
                       output_directory + envo + '/' + model_type + '/weights_' + str(frames_count) + '.pth')

        # print frame count for every 1000000 (one million) frames
        if frames_count % 1000000 == 0:
            training_update = 'frame count: ', frames_count, 'episode count: ', episodes_count, 'epsilon: ', epsilon
            print(training_update)
            logging.info(training_update)
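

# Illustrative sketch only: dqn_compute_y is defined elsewhere in this repo. The standard
# one-step DQN target it is expected to produce is y = r + gamma * max_a Q_target(s', a),
# with no bootstrap on terminal transitions. The argument names (including non_terminal_mask,
# a float mask that is 0 for terminal next states) are assumptions made for this sketch.
def _dqn_target_sketch(reward_batch, next_state_batch, non_terminal_mask, target, gamma):
    """Return one-step Bellman targets for a mini-batch using the frozen target network."""
    next_q = target(next_state_batch).detach().max(1)[0]   # max_a Q_target(s', a), no gradient through target
    return reward_batch + gamma * next_q * non_terminal_mask
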
def ddqn_rank_train(env, exploreScheduler, betaScheduler, optimizer_constructor, model_type,
                    batch_size, rp_start, rp_size, exp_frame, exp_initial, exp_final, prob_alpha,
                    gamma, target_update_steps, frames_per_epoch, frames_per_state,
                    output_directory, last_checkpoint, max_frames, envo):
    """
    Implementation of the training algorithm for DDQN using rank-based prioritization.
    Details of the algorithm can be found in the paper "Prioritized Experience Replay"
    by Tom Schaul, John Quan, Ioannis Antonoglou and David Silver.
    Refer to section 3.3 in the paper for more info.
    """
    gym.undo_logger_setup()
    logging.basicConfig(filename=envo + '_ddqn_rank_weighted_training.log', level=logging.INFO)
    num_actions = env.action_space.n
    env.reset()

    print('No. of actions: ', num_actions)
    print(env.unwrapped.get_action_meanings())

    # initialize action-value and target networks with the same weights
    model = DQN(num_actions)
    target = DQN(num_actions)

    if use_cuda:
        model.cuda()
        target.cuda()

    frames_count = 1

    if last_checkpoint:
        model.load_state_dict(torch.load(last_checkpoint))
        print(last_checkpoint)
        print('weights loaded...')

        # TODO: implementation of resume
        # exp_replay = util.initialize_rank_replay_resume(env, rp_start, rp_size, frames_per_state,
        #                                                 model, target, gamma, batch_size)
        # frames_count = get_index_from_checkpoint_path(last_checkpoint)
    else:
        exp_replay = util.initialize_rank_replay(env, rp_start, rp_size, frames_per_state,
                                                 model, target, gamma, prob_alpha)

    target.load_state_dict(model.state_dict())

    optimizer = optimizer_constructor.type(model.parameters(),
                                           lr=optimizer_constructor.kwargs['lr'],
                                           alpha=optimizer_constructor.kwargs['alpha'],
                                           eps=optimizer_constructor.kwargs['eps'])

    episodes_count = 1
    episodes_durations = []
    rewards_per_episode = 0
    rewards_duration = []
    loss_per_epoch = []

    current_state, _, _, _ = util.play_game(env, frames_per_state)
    wLoss_func = Weighted_Loss()

    print('Starting training...')

    for frames_count in range(1, max_frames):

        epsilon = exploreScheduler.anneal_linear(frames_count)
        beta = betaScheduler.anneal_linear(frames_count)
        choice = random.uniform(0, 1)

        # epsilon-greedy action selection
        if choice <= epsilon:
            action = LongTensor([[random.randrange(num_actions)]])
        else:
            action = util.get_greedy_action(model, current_state)

        curr_obs, reward, done, _ = util.play_game(env, frames_per_state, action[0][0])

        rewards_per_episode += reward
        reward = Tensor([[reward]])

        # the newest transition enters the batch/replay with a placeholder priority
        td_error = 1
        temp_exp = Experience(current_state, action, reward, curr_obs, td_error)
        current_state = curr_obs

        # compute y
        if len(exp_replay) >= batch_size:
            # get batch samples (sampling with sort enabled once every rp_size frames)
            # start = time.time()
            if frames_count % rp_size == 0:
                obs_samples, obs_priorityVals = exp_replay.sample(batch_size - 1, prob_alpha, sort=True)
            else:
                obs_samples, obs_priorityVals = exp_replay.sample(batch_size - 1, prob_alpha, sort=False)

            obs_samples.append(temp_exp)
            obs_priorityVals.append(td_error)

            obs_pVals_tensor = torch.from_numpy(np.array(obs_priorityVals))
            # print("P(i): ", obs_pVals_tensor)

            # importance-sampling weights w_i = (N * P(i))^(-beta), normalised by the maximal weight
            IS_weights = torch.pow(obs_pVals_tensor * rp_size, -beta)
            max_weight = torch.max(IS_weights)
            IS_weights_norm = torch.div(IS_weights, max_weight).type(Tensor)
            # the newest transition is given the maximal normalised weight
            IS_weights_norm[-1] = torch.max(IS_weights_norm)
            # print("Norm W(i): ", IS_weights_norm)

            batch = Experience(*zip(*obs_samples))

            loss, new_weights = ddqn_compute_y(batch, batch_size, model, target, gamma,
                                               IS_weights_norm, wLoss_func)
            new_weights = torch.pow(new_weights, prob_alpha)
            new_exp = Experience(temp_exp.state, temp_exp.action, temp_exp.reward,
                                 temp_exp.next_state, new_weights[batch_size - 1])
            exp_replay.update(obs_samples, new_weights, new_exp)

            optimizer.zero_grad()
            loss.backward()
            # print("loss: ", loss.data)
            optimizer.step()
            loss_per_epoch.append(loss.data.cpu().numpy()[0])
        else:
            # replay not yet large enough to train: store the new transition with its placeholder priority
            exp_replay.push(temp_exp.state, temp_exp.action, temp_exp.reward, temp_exp.next_state, td_error)

        # end = time.time()
        # duration = end - start
        # print('duration : ', duration)

        if done:
            # print('Game: ', rewards_per_episode)
            rewards_duration.append(rewards_per_episode)
            rewards_per_episode = 0
            episodes_count += 1
            env.reset()
            current_state, _, _, _ = util.play_game(env, frames_per_state)

            if episodes_count % 100 == 0:
                avg_episode_reward = sum(rewards_duration) / 100.0
                avg_reward_content = ('Episode from', episodes_count - 99, ' to ', episodes_count,
                                      ' has an average of ', avg_episode_reward,
                                      ' reward and loss of ', sum(loss_per_epoch))
                print(avg_reward_content)
                logging.info(avg_reward_content)
                rewards_duration = []
                loss_per_epoch = []

        # update weights of target network every TARGET_UPDATE_FREQ steps
        if frames_count % target_update_steps == 0:
            target.load_state_dict(model.state_dict())

        # save weights every 250k frames
        if frames_count % 250000 == 0:
            util.make_sure_path_exists(output_directory + '/' + envo + '/')
            torch.save(model.state_dict(),
                       output_directory + '/' + envo + '/rank_uniform' + str(frames_count) + '.pth')

        # print frame count for every 1000000 (one million) frames
        if frames_count % 1000000 == 0:
            training_update = 'frame count: ', frames_count, 'episode count: ', episodes_count, 'epsilon: ', epsilon
            print(training_update)
            logging.info(training_update)
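

# Illustrative sketch only: ddqn_compute_y is defined elsewhere in this repo. The Double DQN
# target it is expected to implement selects the greedy next action with the online network and
# evaluates it with the target network: y = r + gamma * Q_target(s', argmax_a Q_model(s', a)).
# The argument names (including non_terminal_mask) are assumptions made for this sketch.
def _ddqn_target_sketch(reward_batch, next_state_batch, non_terminal_mask, model, target, gamma):
    """Return Double DQN targets for a mini-batch."""
    greedy_actions = model(next_state_batch).detach().max(1)[1].unsqueeze(1)        # argmax_a Q_model(s', a)
    next_q = target(next_state_batch).detach().gather(1, greedy_actions).squeeze(1)  # Q_target(s', a*)
    return reward_batch + gamma * next_q * non_terminal_mask
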
def ddqn_rankBatch_train(env, scheduler, optimizer_constructor, model_type, batch_size, rp_start,
                         rp_size, exp_frame, exp_initial, exp_final, inital_beta, gamma,
                         target_update_steps, frames_per_epoch, frames_per_state,
                         output_directory, last_checkpoint):
    """
    Implementation of the training algorithm for DDQN using rank-based prioritization.
    Details of the algorithm can be found in the paper "Prioritized Experience Replay"
    by Tom Schaul, John Quan, Ioannis Antonoglou and David Silver.
    Refer to section 3.3 in the paper for more info.
    """
    gym.undo_logger_setup()
    logging.basicConfig(filename='ddqn_rank_training.log', level=logging.INFO)
    num_actions = env.action_space.n
    env.reset()

    print('No. of actions: ', num_actions)
    print(env.unwrapped.get_action_meanings())

    # initialize action-value and target networks with the same weights
    model = DQN(num_actions, use_bn=False)
    target = DQN(num_actions, use_bn=False)

    if use_cuda:
        model.cuda()
        target.cuda()

    frames_count = 1

    if last_checkpoint:
        model.load_state_dict(torch.load(last_checkpoint))
        print(last_checkpoint)
        print('weights loaded...')

        exp_replay = util.initialize_rank_replay_resume(env, rp_start, rp_size, frames_per_state,
                                                        model, target, gamma, batch_size)
        frames_count = get_index_from_checkpoint_path(last_checkpoint)
    else:
        exp_replay = util.initialize_rank_replay(env, rp_start, rp_size, frames_per_state,
                                                 model, target, gamma)

    target.load_state_dict(model.state_dict())

    optimizer = optimizer_constructor.type(model.parameters(),
                                           lr=optimizer_constructor.kwargs['lr'],
                                           alpha=optimizer_constructor.kwargs['alpha'],
                                           eps=optimizer_constructor.kwargs['eps'])

    episodes_count = 1
    frames_per_episode = 1
    episodes_durations = []
    rewards_per_episode = 0
    rewards_duration = []
    loss_per_epoch = []

    current_state, _, _, _ = util.play_game(env, frames_per_state)

    print('Starting training...')

    count = 0

    while True:

        epsilon = scheduler.anneal_linear(frames_count)
        choice = random.uniform(0, 1)

        # epsilon-greedy action selection
        if choice <= epsilon:
            action = LongTensor([[random.randrange(num_actions)]])
        else:
            action = util.get_greedy_action(model, current_state)

        curr_obs, reward, done, _ = util.play_game(env, frames_per_state, action[0][0])

        rewards_per_episode += reward
        reward = Tensor([[reward]])

        current_state_ex = Variable(current_state, volatile=True)
        curr_obs_ex = Variable(curr_obs, volatile=True)
        action_ex = Variable(action, volatile=True)
        reward_ex = Variable(reward, volatile=True)

        # compute the td-error for one sample and store the transition with priority |delta|
        td_error = ddqn_compute_td_error(batch_size=1, state_batch=current_state_ex,
                                         reward_batch=reward_ex, action_batch=action_ex,
                                         next_state_batch=curr_obs_ex, model=model,
                                         target=target, gamma=gamma)
        td_error = torch.abs(td_error)
        exp_replay.push(current_state_ex, action_ex, reward_ex, curr_obs_ex, td_error)
        current_state = curr_obs

        # compute y
        if len(exp_replay) >= batch_size:
            # get batch samples
            obs_samples, obs_ranks, obs_priorityVals = exp_replay.sample(batch_size)

            # importance-sampling weights w_i = (1 / (N * P(i)))^beta, normalised by max_weight below
            obs_priorityTensor = torch.from_numpy(np.array(obs_priorityVals))
            p_batch = 1 / obs_priorityTensor
            w_batch = (1 / len(exp_replay) * p_batch) ** inital_beta
            max_weight = exp_replay.get_max_weight(inital_beta)
            params_grad = []

            # accumulate the per-sample weight changes, scaled by the normalised IS weights
            for i in range(len(obs_samples)):
                sample = obs_samples[i]
                sample.state.volatile = False
                sample.next_state.volatile = False
                sample.reward.volatile = False
                sample.action.volatile = False

                loss = ddqn_compute_y(batch_size=1, state_batch=sample.state, reward_batch=sample.reward,
                                      action_batch=sample.action, next_state_batch=sample.next_state,
                                      model=model, target=target, gamma=gamma)

                loss_abs = torch.abs(loss)
                exp_replay.update(obs_ranks[i], loss_abs)

                for param in model.parameters():
                    if param.grad is not None:
                        param.grad.data.zero_()

                loss.backward()

                # accumulate weight change
                if i == 0:
                    for param in model.parameters():
                        tmp = ((w_batch[i] / max_weight) * loss.data[0]) * param.grad.data
                        params_grad.append(tmp)
                else:
                    paramIndex = 0
                    for param in model.parameters():
                        tmp = ((w_batch[i] / max_weight) * loss.data[0]) * param.grad.data
                        params_grad[paramIndex] = tmp + params_grad[paramIndex]
                        paramIndex += 1

            # apply the accumulated weight change, scaled by the learning rate
            paramIndex = 0
            for param in model.parameters():
                param.data += params_grad[paramIndex].mul(optimizer_constructor.kwargs['lr']).type(Tensor)
                paramIndex += 1

        frames_count += 1
        frames_per_episode += frames_per_state

        if done:
            rewards_duration.append(rewards_per_episode)
            rewards_per_episode = 0
            frames_per_episode = 1
            episodes_count += 1
            env.reset()
            current_state, _, _, _ = util.play_game(env, frames_per_state)

            if episodes_count % 100 == 0:
                avg_episode_reward = sum(rewards_duration) / 100.0
                avg_reward_content = ('Episode from', episodes_count - 99, ' to ', episodes_count,
                                      ' has an average of ', avg_episode_reward,
                                      ' reward and loss of ', sum(loss_per_epoch))
                print(avg_reward_content)
                logging.info(avg_reward_content)
                rewards_duration = []
                loss_per_epoch = []

        # update weights of target network every TARGET_UPDATE_FREQ steps
        if frames_count % target_update_steps == 0:
            target.load_state_dict(model.state_dict())
            # print('weights updated at frame no. ', frames_count)

        # save weights every 250k frames
        if frames_count % 250000 == 0:
            util.make_sure_path_exists(output_directory + model_type + '/')
            torch.save(model.state_dict(), 'rank_weights_' + str(frames_count) + '.pth')

        # print frame count and sort the experience replay every 1000000 (one million) frames
        if frames_count % 1000000 == 0:
            training_update = 'frame count: ', frames_count, 'episode count: ', episodes_count, 'epsilon: ', epsilon
            print(training_update)
            logging.info(training_update)
            exp_replay.sort()
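

# Illustrative sketch only: how the importance-sampling correction used in the rank-based
# trainers above can be formed in one step. Given sampling probabilities P(i) and the replay
# size N, each weight is w_i = (N * P(i))^(-beta), normalised by max_j w_j so that updates are
# only ever scaled down (Schaul et al., section 3.4). `probs` and `replay_size` are assumed
# inputs for this sketch, not names used in the functions above.
def _is_weights_sketch(probs, replay_size, beta):
    """Return normalised importance-sampling weights for a batch of sampling probabilities."""
    weights = torch.pow(probs * replay_size, -beta)   # (N * P(i))^(-beta)
    return weights / torch.max(weights)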