def duel_rank_train(env, exploreScheduler, betaScheduler, optimizer_constructor, model_type, batch_size, rp_start, rp_size,
                    exp_frame, exp_initial, exp_final, prob_alpha, gamma, target_update_steps, frames_per_epoch,
                    frames_per_state, output_directory, last_checkpoint, max_frames, envo):
    """
    Train a Dueling Network Architecture agent using rank-based prioritization.

    Information with regards to the algorithm can be found in the paper,
    "Dueling Network Architectures for Deep Reinforcement Learning" by Ziyu Wang et al.
    Refer to section 4.2 in the paper for more implementation info.

    Args:
        env: gym environment; must expose ``action_space.n``, ``reset()`` and
            ``unwrapped.get_action_meanings()``.
        exploreScheduler: object whose ``anneal_linear(frame)`` yields epsilon.
        betaScheduler: object whose ``anneal_linear(frame)`` yields the
            importance-sampling exponent beta.
        optimizer_constructor: object with a ``type`` callable and a ``kwargs``
            dict holding ``lr``, ``alpha`` and ``eps``.
        model_type, exp_frame, exp_initial, exp_final, frames_per_epoch:
            not used by this function; kept for interface compatibility.
        batch_size: number of experiences sampled per optimisation step.
        rp_start, rp_size: forwarded to the replay-memory initialiser;
            ``rp_size`` also sets the re-sort period (every ``rp_size / 2`` frames).
        prob_alpha: exponent applied to the absolute TD error before it is
            stored as a priority.
        gamma: discount factor forwarded to the TD-error / loss helpers.
        target_update_steps: the target network is synced every this many frames.
        frames_per_state: forwarded to ``util.play_game``.
        output_directory: root directory for saved weights.
        last_checkpoint: path of weights to resume from; falsy for a fresh run.
        max_frames: exclusive upper bound of the frame counter.
        envo: environment name, used in the log file name and checkpoint path.

    Returns:
        None. Weights are written to ``<output_directory>/<envo>/`` every
        250000 frames and progress is logged.
    """

    gym.undo_logger_setup()
    logging.basicConfig(filename=envo+'_'+'duel_rank_training.log',level=logging.INFO)
    num_actions = env.action_space.n
    env.reset()

    print('No. of actions: ', num_actions)
    print(env.unwrapped.get_action_meanings())

    # initialize action value and target network with the same weights
    model = DUEL(num_actions)
    target = DUEL(num_actions)

    if use_cuda:
        model.cuda()
        target.cuda()

    # First value of the frame counter; moved forward when resuming so the
    # epsilon/beta schedules and the periodic checks continue where they stopped.
    start_frame = 1

    if last_checkpoint:
        model.load_state_dict(torch.load(last_checkpoint))
        print(last_checkpoint)
        print('weights loaded...')

        exp_replay = util.initialize_rank_replay_resume(env, rp_start, rp_size, frames_per_state,
                                                        model, target, gamma, batch_size)
        # Continue on the frame after the checkpointed one so the checkpoint
        # that was just loaded is not immediately overwritten.
        start_frame = int(get_index_from_checkpoint_path(last_checkpoint)) + 1

    else:
        exp_replay = util.initialize_rank_replay(env, rp_start, rp_size, frames_per_state,
                                                 model, target, gamma, prob_alpha)

    target.load_state_dict(model.state_dict())

    optimizer = optimizer_constructor.type(model.parameters(), lr=optimizer_constructor.kwargs['lr'],
                                           alpha=optimizer_constructor.kwargs['alpha'],
                                           eps=optimizer_constructor.kwargs['eps'])

    episodes_count = 1
    rewards_per_episode = 0
    rewards_duration = []
    loss_per_epoch = []
    wLoss_func = Weighted_Loss()

    # Directory the periodic checkpoints are written to.
    checkpoint_dir = output_directory+'/'+envo+'/'
    # How often (in frames) the replay memory is re-sorted.
    sort_period = int(rp_size/2)

    current_state, _, _, _ = util.play_game(env, frames_per_state)
    print('Starting training...')

    for frames_count in range(start_frame, max_frames):

        epsilon = exploreScheduler.anneal_linear(frames_count)
        beta = betaScheduler.anneal_linear(frames_count)
        choice = random.uniform(0, 1)

        # epsilon greedy algorithm
        if choice <= epsilon:
            action = LongTensor([[random.randrange(num_actions)]])
        else:
            action = util.get_greedy_action(model, current_state)

        curr_obs, reward, done, _ = util.play_game(env, frames_per_state, action[0][0])

        rewards_per_episode += reward
        reward = Tensor([[reward]])
        current_state_ex = Variable(current_state, volatile=True)
        curr_obs_ex = Variable(curr_obs, volatile=True)
        action_ex = Variable(action, volatile=True)
        reward_ex = Variable(reward, volatile=True)

        # compute td-error for the single new transition
        td_error = duel_compute_td_error(batch_size=1, state_batch=current_state_ex, reward_batch=reward_ex,
                                         action_batch=action_ex, next_state_batch=curr_obs_ex,
                                         model=model, target=target, gamma=gamma)

        # priority = (|td| + small constant) ** alpha, so no transition gets priority 0
        td_error = torch.pow(torch.abs(td_error)+1e-6, prob_alpha)
        exp_replay.push(current_state, action, reward, curr_obs, td_error)
        current_state = curr_obs

        # compute y
        if len(exp_replay) >= batch_size:
            # Get batch samples
            obs_samples, obs_ranks, obs_priorityVals = exp_replay.sample(batch_size)
            num_samples_per_batch = len(obs_samples)
            obs_priorityTensor = torch.from_numpy(np.array(obs_priorityVals))

            # Importance-sampling weights w = (1/N * 1/P)^beta, normalised by the
            # maximum weight. Float literals keep the division exact on Python 2 too.
            p_batch = 1.0 / obs_priorityTensor
            w_batch = (1.0 / len(exp_replay) * p_batch)**beta
            max_weight = exp_replay.get_max_weight(beta)
            w_batch /= max_weight
            w_batch = w_batch.type(Tensor)

            batch = Experience(*zip(*obs_samples))

            loss, new_weights = duel_compute_y(batch, num_samples_per_batch, model, target, gamma, w_batch, wLoss_func)
            loss_abs = torch.abs(new_weights)
            exp_replay.update(obs_ranks, loss_abs)

            optimizer.zero_grad()
            loss.backward()

            for grad_index, param in enumerate(model.parameters()):

                # Rescale the gradient of parameter #4 by 1/sqrt(2).
                # NOTE(review): presumably parameter #4 belongs to the last conv
                # layer of DUEL (as in the paper) - confirm against the model.
                if grad_index == 4:
                    param.grad.data.mul_(1/math.sqrt(2))

                # Clip each parameter's gradient to have norm <= 10
                # (per-parameter norm, not the global norm).
                grad_norm = torch.norm(param.grad.data)
                if grad_norm > 10:
                    param.grad.data.div_(grad_norm).mul_(10)

            optimizer.step()
            loss_per_epoch.append(loss.data.cpu().numpy()[0])

        if done:
            rewards_duration.append(rewards_per_episode)
            rewards_per_episode = 0
            episodes_count += 1
            env.reset()
            current_state, _, _, _ = util.play_game(env, frames_per_state)

            if episodes_count % 100 == 0:
                avg_episode_reward = sum(rewards_duration)/100.0
                avg_reward_content = 'Episode from', episodes_count-99, ' to ', episodes_count, ' has an average of ', avg_episode_reward, ' reward and loss of ', sum(loss_per_epoch)
                print(avg_reward_content)
                logging.info(avg_reward_content)
                rewards_duration = []
                loss_per_epoch = []

        # update weights of target network for every TARGET_UPDATE_FREQ steps
        if frames_count % target_update_steps == 0:
            target.load_state_dict(model.state_dict())

        # sort memory replay every half of its capacity iterations
        if frames_count % sort_period == 0:
            exp_replay.sort()

        # Save weights every 250k frames, into the directory that was just ensured
        if frames_count % 250000 == 0:
            util.make_sure_path_exists(checkpoint_dir)
            torch.save(model.state_dict(), checkpoint_dir+'rank_duel_'+ str(frames_count)+'.pth')

        # Print frame count for every 1000000 (one million) frames:
        if frames_count % 1000000 == 0:
            training_update = 'frame count: ', frames_count, 'episode count: ', episodes_count, 'epsilon: ', epsilon
            print(training_update)
            logging.info(training_update)
# Example #2
# 0
def dqn_train(env, scheduler, optimizer_constructor, model_type, batch_size,
              rp_start, rp_size, exp_frame, exp_initial, exp_final, gamma,
              target_update_steps, frames_per_epoch, frames_per_state,
              output_directory, last_checkpoint, envo):
    """
    Train a DQN agent with a uniformly sampled experience replay.

    The loop runs until the process is interrupted; there is no frame limit.

    Args:
        env: gym environment; must expose ``action_space.n``, ``reset()`` and
            ``unwrapped.get_action_meanings()``.
        scheduler: object whose ``anneal_linear(frame)`` yields epsilon.
        optimizer_constructor: object with a ``type`` callable and a ``kwargs``
            dict holding ``lr``, ``alpha`` and ``eps``.
        model_type: used in the log file name and in the checkpoint path.
        batch_size: number of experiences sampled per optimisation step.
        rp_start, rp_size: forwarded to the replay-memory initialiser.
        exp_frame, exp_initial, exp_final, frames_per_epoch: not used by this
            function; kept for interface compatibility.
        gamma: discount factor forwarded to ``dqn_compute_y``.
        target_update_steps: the target network is synced every this many frames.
        frames_per_state: forwarded to ``util.play_game``.
        output_directory: prefix of the checkpoint path (joined without an
            extra separator, so it is expected to end with one).
        last_checkpoint: path of weights to resume from; ``''`` or ``None``
            for a fresh run.
        envo: environment name, used in the log file name and checkpoint path.

    Returns:
        None (the function does not return under normal operation).
    """

    gym.undo_logger_setup()
    logging.basicConfig(filename=envo + '_' + model_type + '_training.log',
                        level=logging.INFO)
    num_actions = env.action_space.n

    print('No. of actions: ', num_actions)
    print(env.unwrapped.get_action_meanings())

    # initialize action value and target network with the same weights
    model = DQN(num_actions, use_bn=False)
    target = DQN(num_actions, use_bn=False)

    if use_cuda:
        model.cuda()
        target.cuda()

    episodes_count = 1
    frames_count = 1

    # Truthiness check so that both '' and None mean "no checkpoint".
    if last_checkpoint:
        model.load_state_dict(torch.load(last_checkpoint))
        exp_replay = util.initialize_replay_resume(env, rp_start, rp_size,
                                                   frames_per_state, model)
        # Checkpoints are named after the frame counter (see the save below),
        # so the recovered index restores the frame count, not the episode count.
        frames_count = get_index_from_checkpoint_path(last_checkpoint)

    else:
        exp_replay = util.initialize_replay(env, rp_start, rp_size,
                                            frames_per_state)

    target.load_state_dict(model.state_dict())
    print('weights loaded...')

    optimizer = optimizer_constructor.type(
        model.parameters(),
        lr=optimizer_constructor.kwargs['lr'],
        alpha=optimizer_constructor.kwargs['alpha'],
        eps=optimizer_constructor.kwargs['eps'])

    rewards_per_episode = 0
    rewards_duration = []
    loss_per_epoch = []

    # Directory the periodic checkpoints are written to.
    checkpoint_dir = output_directory + envo + '/' + model_type + '/'

    env.reset()

    current_state, _, _, _ = util.play_game(env, frames_per_state)

    print('Starting training...')

    while True:

        epsilon = scheduler.anneal_linear(frames_count)
        choice = random.uniform(0, 1)

        # epsilon greedy algorithm
        if choice <= epsilon:
            action = LongTensor([[random.randrange(num_actions)]])

        else:
            action = util.get_greedy_action(model, current_state)

        curr_obs, reward, done, _ = util.play_game(env, frames_per_state,
                                                   action[0][0])

        rewards_per_episode += reward
        reward = Tensor([reward])

        exp_replay.push(current_state, action, reward, curr_obs)

        current_state = curr_obs

        # Only sample once the memory can actually provide a full batch.
        if len(exp_replay) >= batch_size:

            # sample random mini-batch and unpack it into states, actions,
            # rewards and next_states
            obs_sample = exp_replay.sample(batch_size)
            batch = Experience(*zip(*obs_sample))

            # compute y
            loss = dqn_compute_y(batch, batch_size, model, target, gamma)
            optimizer.zero_grad()
            loss.backward()

            # clamp every gradient element to [-1, 1]
            for param in model.parameters():
                param.grad.data.clamp_(-1, 1)

            optimizer.step()

            loss_per_epoch.append(loss.data.cpu().numpy()[0])

        frames_count += 1

        if done:
            rewards_duration.append(rewards_per_episode)
            rewards_per_episode = 0
            episodes_count += 1
            env.reset()
            current_state, _, _, _ = util.play_game(env, frames_per_state)

            if episodes_count % 100 == 0:
                avg_episode_reward = sum(rewards_duration) / 100.0
                avg_reward_content = 'Episode from', episodes_count - 99, ' to ', episodes_count, ' has an average of ', avg_episode_reward, ' reward and loss of ', sum(
                    loss_per_epoch)
                print(avg_reward_content)
                logging.info(avg_reward_content)
                rewards_duration = []
                loss_per_epoch = []

        # update weights of target network for every TARGET_UPDATE_FREQ steps
        if frames_count % target_update_steps == 0:
            target.load_state_dict(model.state_dict())

        # Save weights every 250k frames
        if frames_count % 250000 == 0:
            util.make_sure_path_exists(checkpoint_dir)
            torch.save(model.state_dict(),
                       checkpoint_dir + 'weights_' + str(frames_count) + '.pth')

        # Print frame count for every 1000000 (one million) frames:
        if frames_count % 1000000 == 0:
            training_update = 'frame count: ', frames_count, 'episode count: ', episodes_count, 'epsilon: ', epsilon
            print(training_update)
            logging.info(training_update)
# Example #3
# 0
def ddqn_rank_train(env, exploreScheduler, betaScheduler,
                    optimizer_constructor, model_type, batch_size, rp_start,
                    rp_size, exp_frame, exp_initial, exp_final, prob_alpha,
                    gamma, target_update_steps, frames_per_epoch,
                    frames_per_state, output_directory, last_checkpoint,
                    max_frames, envo):
    """
    Train a DDQN agent using rank-based prioritization.

    Information with regards to the algorithm can be found in the paper,
    "Prioritized Experience Replay" by Tom Schaul, John Quan, Ioannis Antonoglou and
    David Silver. Refer to section 3.3 in the paper for more info.

    Each step samples ``batch_size - 1`` stored experiences and appends the
    newest transition, so the newest transition is always trained on once.

    Args:
        env: gym environment; must expose ``action_space.n``, ``reset()`` and
            ``unwrapped.get_action_meanings()``.
        exploreScheduler: object whose ``anneal_linear(frame)`` yields epsilon.
        betaScheduler: object whose ``anneal_linear(frame)`` yields the
            importance-sampling exponent beta.
        optimizer_constructor: object with a ``type`` callable and a ``kwargs``
            dict holding ``lr``, ``alpha`` and ``eps``.
        model_type, exp_frame, exp_initial, exp_final, frames_per_epoch:
            not used by this function; kept for interface compatibility.
        batch_size: number of experiences per optimisation step.
        rp_start, rp_size: forwarded to the replay-memory initialiser;
            ``rp_size`` also scales the importance-sampling weights and sets
            how often sampling requests a sort.
        prob_alpha: prioritisation exponent, forwarded to the replay memory and
            applied to the new priorities.
        gamma: discount factor.
        target_update_steps: the target network is synced every this many frames.
        frames_per_state: forwarded to ``util.play_game``.
        output_directory: root directory for saved weights.
        last_checkpoint: path of weights to load; falsy for a fresh run.
            Only the weights are restored - the replay memory and the frame
            counter always start from scratch.
        max_frames: exclusive upper bound of the frame counter.
        envo: environment name, used in the log file name and checkpoint path.

    Returns:
        None.
    """

    gym.undo_logger_setup()
    logging.basicConfig(filename=envo + '_' +
                        'ddqn_rank_weighted_training.log',
                        level=logging.INFO)
    num_actions = env.action_space.n
    env.reset()

    print('No. of actions: ', num_actions)
    print(env.unwrapped.get_action_meanings())

    # initialize action value and target network with the same weights
    model = DQN(num_actions)
    target = DQN(num_actions)

    if use_cuda:
        model.cuda()
        target.cuda()

    if last_checkpoint:
        model.load_state_dict(torch.load(last_checkpoint))
        print(last_checkpoint)
        print('weights loaded...')
        # TODO: restore the replay memory and frame counter on resume, e.g. via
        # util.initialize_rank_replay_resume / get_index_from_checkpoint_path.

    # Resuming the replay memory is not implemented yet, so it is always built
    # from scratch. Previously the checkpoint path left `exp_replay` undefined
    # and the training loop failed with a NameError.
    exp_replay = util.initialize_rank_replay(env, rp_start, rp_size,
                                             frames_per_state, model,
                                             target, gamma, prob_alpha)

    target.load_state_dict(model.state_dict())

    optimizer = optimizer_constructor.type(
        model.parameters(),
        lr=optimizer_constructor.kwargs['lr'],
        alpha=optimizer_constructor.kwargs['alpha'],
        eps=optimizer_constructor.kwargs['eps'])

    episodes_count = 1
    rewards_per_episode = 0
    rewards_duration = []
    loss_per_epoch = []
    current_state, _, _, _ = util.play_game(env, frames_per_state)
    wLoss_func = Weighted_Loss()

    # Directory the periodic checkpoints are written to.
    checkpoint_dir = output_directory + '/' + envo + '/'

    print('Starting training...')

    for frames_count in range(1, max_frames):

        epsilon = exploreScheduler.anneal_linear(frames_count)
        beta = betaScheduler.anneal_linear(frames_count)
        choice = random.uniform(0, 1)

        # epsilon greedy algorithm
        if choice <= epsilon:
            action = LongTensor([[random.randrange(num_actions)]])

        else:
            action = util.get_greedy_action(model, current_state)

        curr_obs, reward, done, _ = util.play_game(env, frames_per_state,
                                                   action[0][0])

        rewards_per_episode += reward
        reward = Tensor([[reward]])
        # Placeholder priority for the new transition; replaced below once the
        # transition has been through a training step.
        td_error = 1

        temp_exp = Experience(current_state, action, reward, curr_obs,
                              td_error)
        current_state = curr_obs

        # compute y
        if len(exp_replay) >= batch_size:
            # Sample one fewer than the batch size; the newest transition fills
            # the last slot. A sort is requested every `rp_size` frames.
            obs_samples, obs_priorityVals = exp_replay.sample(
                batch_size - 1, prob_alpha,
                sort=(frames_count % rp_size == 0))

            obs_samples.append(temp_exp)
            obs_priorityVals.append(td_error)

            # Importance-sampling weights (N * P(i))^-beta, normalised by the max.
            obs_pVals_tensor = torch.from_numpy(np.array(obs_priorityVals))
            IS_weights = torch.pow((obs_pVals_tensor * rp_size), -beta)
            max_weight = torch.max(IS_weights)

            IS_weights_norm = torch.div(IS_weights, max_weight).type(Tensor)
            # The new transition has no real priority yet: give it the max weight.
            IS_weights_norm[-1] = torch.max(IS_weights_norm)

            batch = Experience(*zip(*obs_samples))
            loss, new_weights = ddqn_compute_y(batch, batch_size, model,
                                               target, gamma, IS_weights_norm,
                                               wLoss_func)
            new_weights = torch.pow(new_weights, prob_alpha)
            new_exp = Experience(temp_exp.state, temp_exp.action,
                                 temp_exp.reward, temp_exp.next_state,
                                 new_weights[batch_size - 1])
            # NOTE(review): presumably `update` also inserts `new_exp` into the
            # memory, since it is not pushed anywhere else - confirm.
            exp_replay.update(obs_samples, new_weights, new_exp)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            loss_per_epoch.append(loss.data.cpu().numpy()[0])

        else:
            # Memory still too small to train on: just store the transition.
            # (`new_exp` only exists after a training step, so the freshly
            # built `temp_exp` is the one to push here.)
            exp_replay.push(temp_exp.state, temp_exp.action, temp_exp.reward,
                            temp_exp.next_state, td_error)

        if done:
            rewards_duration.append(rewards_per_episode)
            rewards_per_episode = 0
            episodes_count += 1
            env.reset()
            current_state, _, _, _ = util.play_game(env, frames_per_state)

            if episodes_count % 100 == 0:
                avg_episode_reward = sum(rewards_duration) / 100.0
                avg_reward_content = 'Episode from', episodes_count - 99, ' to ', episodes_count, ' has an average of ', avg_episode_reward, ' reward and loss of ', sum(
                    loss_per_epoch)
                print(avg_reward_content)
                logging.info(avg_reward_content)
                rewards_duration = []
                loss_per_epoch = []

        # update weights of target network for every TARGET_UPDATE_FREQ steps
        if frames_count % target_update_steps == 0:
            target.load_state_dict(model.state_dict())

        # Save weights every 250k frames
        if frames_count % 250000 == 0:
            util.make_sure_path_exists(checkpoint_dir)
            torch.save(
                model.state_dict(),
                checkpoint_dir + 'rank_uniform' + str(frames_count) + '.pth')

        # Print frame count for every 1000000 (one million) frames:
        if frames_count % 1000000 == 0:
            training_update = 'frame count: ', frames_count, 'episode count: ', episodes_count, 'epsilon: ', epsilon
            print(training_update)
            logging.info(training_update)
def ddqn_rankBatch_train(env, scheduler, optimizer_constructor, model_type,
                         batch_size, rp_start, rp_size, exp_frame, exp_initial,
                         exp_final, inital_beta, gamma, target_update_steps,
                         frames_per_epoch, frames_per_state, output_directory,
                         last_checkpoint):
    """
    Train a DDQN agent using rank-based prioritization, one sample at a time.

    Information with regards to the algorithm can be found in the paper,
    "Prioritized Experience Replay" by Tom Schaul, John Quan, Ioannis Antonoglou and
    David Silver. Refer to section 3.3 in the paper for more info.

    Unlike the batched variants, every sampled experience is back-propagated
    individually; the weighted gradients are summed and applied to the
    parameters by hand (the constructed optimizer is not stepped). The loop
    runs until the process is interrupted.

    Args:
        env: gym environment; must expose ``action_space.n``, ``reset()`` and
            ``unwrapped.get_action_meanings()``.
        scheduler: object whose ``anneal_linear(frame)`` yields epsilon.
        optimizer_constructor: object with a ``type`` callable and a ``kwargs``
            dict holding ``lr``, ``alpha`` and ``eps``; ``lr`` is also the
            step size of the manual update.
        model_type: sub-directory (under ``output_directory``) for checkpoints.
        batch_size: number of experiences sampled per update.
        rp_start, rp_size: forwarded to the replay-memory initialiser.
        exp_frame, exp_initial, exp_final, frames_per_epoch: not used by this
            function; kept for interface compatibility.
        inital_beta: fixed importance-sampling exponent (not annealed).
        gamma: discount factor.
        target_update_steps: the target network is synced every this many frames.
        frames_per_state: forwarded to ``util.play_game``.
        output_directory: prefix of the checkpoint directory (joined without
            an extra separator, so it is expected to end with one).
        last_checkpoint: path of weights to resume from; falsy for a fresh run.

    Returns:
        None (the function does not return under normal operation).
    """

    gym.undo_logger_setup()
    logging.basicConfig(filename='ddqn_rank_training.log', level=logging.INFO)
    num_actions = env.action_space.n
    env.reset()

    print('No. of actions: ', num_actions)
    print(env.unwrapped.get_action_meanings())

    # initialize action value and target network with the same weights
    model = DQN(num_actions, use_bn=False)
    target = DQN(num_actions, use_bn=False)

    if use_cuda:
        model.cuda()
        target.cuda()

    frames_count = 1

    if last_checkpoint:
        model.load_state_dict(torch.load(last_checkpoint))
        print(last_checkpoint)
        print('weights loaded...')

        exp_replay = util.initialize_rank_replay_resume(
            env, rp_start, rp_size, frames_per_state, model, target, gamma,
            batch_size)
        frames_count = get_index_from_checkpoint_path(last_checkpoint)

    else:
        # NOTE(review): sibling trainers pass an extra `prob_alpha` argument to
        # this initialiser; this function has no such parameter - confirm the
        # helper accepts the shorter call.
        exp_replay = util.initialize_rank_replay(env, rp_start, rp_size,
                                                 frames_per_state, model,
                                                 target, gamma)

    target.load_state_dict(model.state_dict())

    # Constructed for parity with the other trainers; the parameter update
    # below is applied manually and never calls optimizer.step().
    optimizer = optimizer_constructor.type(
        model.parameters(),
        lr=optimizer_constructor.kwargs['lr'],
        alpha=optimizer_constructor.kwargs['alpha'],
        eps=optimizer_constructor.kwargs['eps'])

    learning_rate = optimizer_constructor.kwargs['lr']
    # Directory the periodic checkpoints are written to.
    checkpoint_dir = output_directory + model_type + '/'

    episodes_count = 1
    rewards_per_episode = 0
    rewards_duration = []
    loss_per_epoch = []

    current_state, _, _, _ = util.play_game(env, frames_per_state)
    print('Starting training...')

    while True:

        epsilon = scheduler.anneal_linear(frames_count)
        choice = random.uniform(0, 1)

        # epsilon greedy algorithm
        if choice <= epsilon:
            action = LongTensor([[random.randrange(num_actions)]])

        else:
            action = util.get_greedy_action(model, current_state)

        curr_obs, reward, done, _ = util.play_game(env, frames_per_state,
                                                   action[0][0])

        rewards_per_episode += reward
        reward = Tensor([[reward]])
        current_state_ex = Variable(current_state, volatile=True)
        curr_obs_ex = Variable(curr_obs, volatile=True)
        action_ex = Variable(action, volatile=True)
        reward_ex = Variable(reward, volatile=True)

        # compute td-error for the single new transition
        td_error = ddqn_compute_td_error(batch_size=1,
                                         state_batch=current_state_ex,
                                         reward_batch=reward_ex,
                                         action_batch=action_ex,
                                         next_state_batch=curr_obs_ex,
                                         model=model,
                                         target=target,
                                         gamma=gamma)

        td_error = torch.abs(td_error)
        exp_replay.push(current_state_ex, action_ex, reward_ex, curr_obs_ex,
                        td_error)
        current_state = curr_obs

        # compute y
        if len(exp_replay) >= batch_size:
            # Get batch samples
            obs_samples, obs_ranks, obs_priorityVals = exp_replay.sample(
                batch_size)
            obs_priorityTensor = torch.from_numpy(np.array(obs_priorityVals))

            # Importance-sampling weights w = (1/N * 1/P)^beta. Float literals
            # keep the division exact on Python 2 too.
            p_batch = 1.0 / obs_priorityTensor
            w_batch = (1.0 / len(exp_replay) * p_batch)**inital_beta
            max_weight = exp_replay.get_max_weight(inital_beta)

            # Running sum of the weighted per-sample gradients, one tensor per
            # model parameter (None until the first sample is processed).
            params_grad = None

            for i, sample in enumerate(obs_samples):
                # Stored Variables were created volatile; re-enable autograd.
                sample.state.volatile = False
                sample.next_state.volatile = False
                sample.reward.volatile = False
                sample.action.volatile = False
                loss = ddqn_compute_y(batch_size=1,
                                      state_batch=sample.state,
                                      reward_batch=sample.reward,
                                      action_batch=sample.action,
                                      next_state_batch=sample.next_state,
                                      model=model,
                                      target=target,
                                      gamma=gamma)
                loss_abs = torch.abs(loss)
                exp_replay.update(obs_ranks[i], loss_abs)
                # Record the loss so the periodic report has something to sum.
                loss_per_epoch.append(loss_abs.data.cpu().numpy()[0])

                # Clear gradients left over from the previous sample.
                for param in model.parameters():
                    if param.grad is not None:
                        param.grad.data.zero_()

                loss.backward()

                # accumulate weight change: w_i / max_w * loss_i * grad_i
                scale = (w_batch[i] / max_weight) * loss.data[0]
                weighted = [scale * param.grad.data
                            for param in model.parameters()]
                if params_grad is None:
                    params_grad = weighted
                else:
                    params_grad = [new + acc
                                   for new, acc in zip(weighted, params_grad)]

            # update weights
            # NOTE(review): the accumulated change is *added* to the parameters;
            # whether that is descent or ascent depends on what
            # ddqn_compute_y returns - confirm the sign.
            if params_grad is not None:
                for param, delta in zip(model.parameters(), params_grad):
                    param.data += delta.mul(learning_rate).type(Tensor)

        frames_count += 1

        if done:
            rewards_duration.append(rewards_per_episode)
            rewards_per_episode = 0
            episodes_count += 1
            env.reset()
            current_state, _, _, _ = util.play_game(env, frames_per_state)

            if episodes_count % 100 == 0:
                avg_episode_reward = sum(rewards_duration) / 100.0
                avg_reward_content = 'Episode from', episodes_count - 99, ' to ', episodes_count, ' has an average of ', avg_episode_reward, ' reward and loss of ', sum(
                    loss_per_epoch)
                print(avg_reward_content)
                logging.info(avg_reward_content)
                rewards_duration = []
                loss_per_epoch = []

        # update weights of target network for every TARGET_UPDATE_FREQ steps
        if frames_count % target_update_steps == 0:
            target.load_state_dict(model.state_dict())

        # Save weights every 250k frames, into the directory that was just ensured
        if frames_count % 250000 == 0:
            util.make_sure_path_exists(checkpoint_dir)
            torch.save(model.state_dict(),
                       checkpoint_dir + 'rank_weights_' + str(frames_count) + '.pth')

        # Print frame count and sort experience replay for every 1000000 (one million) frames:
        if frames_count % 1000000 == 0:
            training_update = 'frame count: ', frames_count, 'episode count: ', episodes_count, 'epsilon: ', epsilon
            print(training_update)
            logging.info(training_update)
            exp_replay.sort()