Example 1
def bcq_learn(env_set="Hopper-v2", seed=0, buffer_type="FinalSigma0.5", buffer_seed=0, buffer_size='1000K',
			  cut_buffer_size='1000K', eval_freq=float(1e3), max_timesteps=float(1e6), lr=1e-3,
			  logger_kwargs=dict()):

	device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
	print("running on device:", device)

	"""set up logger"""
	global logger
	logger = EpochLogger(**logger_kwargs)
	logger.save_config(locals())

	file_name = "BCQ_%s_%s" % (env_set, seed)
	buffer_name = "%s_%s_%s" % (buffer_type, env_set, buffer_seed)
	print("---------------------------------------")
	print("Task: " + file_name)
	print("---------------------------------------")

	if not os.path.exists("./results"):
		os.makedirs("./results")

	env = gym.make(env_set)
	test_env = gym.make(env_set)

	# Set seeds
	env.seed(seed)
	test_env.seed(seed)
	env.action_space.np_random.seed(seed)
	test_env.action_space.np_random.seed(seed)
	torch.manual_seed(seed)
	np.random.seed(seed)

	state_dim = env.observation_space.shape[0]
	action_dim = env.action_space.shape[0]
	max_action = float(env.action_space.high[0])

	# Initialize policy
	policy = BCQ_bl.BCQ(state_dim, action_dim, max_action, lr=lr)

	# Load buffer
	replay_buffer = utils.ReplayBuffer()
	replay_buffer.load(buffer_name + '_' + buffer_size)
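	# if a smaller batch than the saved one is requested, trim it to
	# cut_buffer_size transitions (e.g. '100K' -> 100 * 1e3 = 100,000)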
	if buffer_size != cut_buffer_size:
		replay_buffer.cut_final(int(cut_buffer_size[:-1]) * 1e3)
	print(replay_buffer.get_length())

	print('buffer setting:', buffer_name + '_' + cut_buffer_size)


	episode_num = 0
	done = True

	training_iters, epoch = 0, 0
	while training_iters < max_timesteps:
		epoch += 1
		pol_vals = policy.train(replay_buffer, iterations=int(eval_freq), logger=logger)

		avgtest_reward = evaluate_policy(policy, test_env)
		training_iters += eval_freq


		logger.log_tabular('Epoch', epoch)
		logger.log_tabular('AverageTestEpRet', avgtest_reward)
		logger.log_tabular('TotalSteps', training_iters)
		logger.log_tabular('QLoss', average_only=True)
		logger.log_tabular('Q1Vals', with_min_and_max=True)
		logger.log_tabular('Q2Vals', with_min_and_max=True)
		logger.log_tabular('ActLoss', with_min_and_max=True)
		logger.dump_tabular()
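
# A minimal invocation sketch (an assumption, not part of the original file):
# it presumes the batch "FinalSigma0.5_Hopper-v2_0_1000K" has already been
# collected and saved, so that replay_buffer.load() can find it.
if __name__ == "__main__":
	bcq_learn(env_set="Hopper-v2", seed=0, buffer_type="FinalSigma0.5")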
Example 2
        os.makedirs("./pytorch_models")

    env = gym.make(args.env_name)

    # Set seeds
    env.seed(args.seed)
    torch.manual_seed(args.seed)
    np.random.seed(args.seed)

    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.shape[0]
    max_action = float(env.action_space.high[0])

    # Initialize policy and buffer
    policy = DDPG.DDPG(state_dim, action_dim, max_action)
    replay_buffer = utils.ReplayBuffer()

    total_timesteps = 0
    episode_num = 0
    done = True

    while total_timesteps < args.max_timesteps:

        if done:

            if total_timesteps != 0:
                print(total_timesteps, episode_num, episode_timesteps,
                      episode_reward)
                print("Total T: %d Episode Num: %d Episode T: %d Reward: %f" %
                      (total_timesteps, episode_num, episode_timesteps,
                       episode_reward))
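Example 3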
def ue_train(env_set="Hopper-v2",
             seed=1,
             buffer_type="FinalSigma0.5",
             buffer_seed=0,
             buffer_size='1000K',
             cut_buffer_size='1000K',
             gamma=0.99,
             rollout=1000,
             loss_k=10000,
             max_ue_trainsteps=int(1e6),
             logger_kwargs=dict()):

    print('testing MClength:', rollout)
    print('Training loss ratio k:', loss_k)
    print('Discount value', gamma)

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    print("running on device:", device)

    global logger
    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    buffer_name = "%s_%s_%s" % (buffer_type, env_set, buffer_seed)
    setting_name = "%s_%s_r%s_g%s" % (buffer_name, cut_buffer_size, rollout,
                                      gamma)
    print("---------------------------------------")
    print("Settings: " + setting_name)
    print("---------------------------------------")

    if not os.path.exists("./results"):
        os.makedirs("./results")

    env = gym.make(env_set)

    env.seed(seed)
    torch.manual_seed(seed)
    np.random.seed(seed)

    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.shape[0]
    max_action = float(env.action_space.high[0])

    # Load buffer
    replay_buffer = utils.ReplayBuffer()
    replay_buffer.load(buffer_name + '_' + buffer_size)
    if buffer_size != cut_buffer_size:
        replay_buffer.cut_final(int(cut_buffer_size[:-1]) * 1e3)
    print(replay_buffer.get_length())

    print('buffer setting:', buffer_name + '_' + cut_buffer_size)

    if not os.path.exists('./results/ueMC_%s_Gt.npy' % setting_name):
        save_s = not os.path.exists("./results/ueMC_%s_S.npy" %
                                    (buffer_name + '_' + cut_buffer_size))
        # extract (s,a,r) pairs from replay buffer
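        # for each transition, calculate_mc_ret accumulates the discounted
        # Monte Carlo return over up to `rollout` steps with discount `gamma`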
        length = replay_buffer.get_length()
        print(length)
        states, actions, gts = [], [], []
        for ind in range(length):
            state, _, action, _, dint = replay_buffer.index(ind)
            gt = calculate_mc_ret(replay_buffer,
                                  ind,
                                  rollout=rollout,
                                  discount=gamma)
            gts.append(gt)
            states.append(state)
            actions.append(action)

        if save_s:
            np.save(
                './results/ueMC_%s_S' % (buffer_name + '_' + cut_buffer_size),
                states)
            np.save(
                './results/ueMC_%s_A' % (buffer_name + '_' + cut_buffer_size),
                actions)

        np.save('./results/ueMC_%s_Gt' % setting_name, gts)

    print('ue train starts ==')

    states = np.load('./results/ueMC_%s_S.npy' %
                     (buffer_name + '_' + cut_buffer_size),
                     allow_pickle=True)
    actions = np.load('./results/ueMC_%s_A.npy' %
                      (buffer_name + '_' + cut_buffer_size),
                      allow_pickle=True)
    gts = np.load('./results/ueMC_%s_Gt.npy' % setting_name, allow_pickle=True)

    upper_envelope, ue_lossval = train_upper_envelope(states, actions, gts, state_dim, device, seed, \
                  max_step_num=max_ue_trainsteps, k=loss_k, logger=logger)
    torch.save(upper_envelope.state_dict(), '%s/%s_UE.pth' % ("./pytorch_models", setting_name + \
                    '_s%s_lok%s'%(seed, loss_k)))
    print('ue train finished --')

    print('plotting ue --')

    upper_envelope = Value(state_dim, activation='relu')
    upper_envelope.load_state_dict(torch.load('%s/%s_UE.pth' % ("./pytorch_models", setting_name + \
                   '_s%s_lok%s'%(seed, loss_k))))

    plot_envelope(upper_envelope, states, actions, gts, \
         setting_name+'[k=%s_MClen=%s_gamma=%s'%(loss_k, rollout, gamma)+'loss%.2f'%ue_lossval, seed)
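
# A minimal invocation sketch (an assumption, not part of the original file):
# it presumes the batch "FinalSigma0.5_Hopper-v2_0_1000K" is available, and
# trains the upper envelope with the default rollout length and loss ratio k.
if __name__ == "__main__":
    ue_train(env_set="Hopper-v2", seed=1, rollout=1000, loss_k=10000)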
Example 4
def ddpg_genbuf(env_set="Hopper-v2", seed=0, max_timesteps=float(1e6), start_timesteps=int(1e3),
				expl_noise=0.5,
			    eval_freq='episode_timesteps',
			    logger_kwargs=dict()):


	device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
	print("running on device:", device)

	"""set up logger"""
	global logger
	logger = EpochLogger(**logger_kwargs)
	logger.save_config(locals())


	file_name = "DDPG_%s_%s" % (env_set, str(seed))
	buffer_name = "FinalSigma%s_%s_%s_%sK" % (str(expl_noise), env_set, str(seed),
										   str(int(max_timesteps/1e3)))
	exp_name = "ddpg_collection_%s_steps%s_sigma%s_%s" \
			   % (env_set, str(max_timesteps), str(expl_noise), str(seed))
	print("---------------------------------------")
	print("Settings: " + file_name)
	print("Save Buffer as: " + buffer_name)
	print("---------------------------------------")

	if not os.path.exists("./pytorch_models"):
		os.makedirs("./pytorch_models")


	env = gym.make(env_set)
	test_env = gym.make(env_set)

	# Set seeds
	'''for algorithms that interact with the environment we also have to seed env.action_space'''
	env.seed(seed)
	test_env.seed(seed)
	env.action_space.np_random.seed(seed)
	test_env.action_space.np_random.seed(seed)
	torch.manual_seed(seed)
	np.random.seed(seed)
	
	state_dim = env.observation_space.shape[0]
	action_dim = env.action_space.shape[0] 
	max_action = float(env.action_space.high[0])
	print('max episode length', env._max_episode_steps)

	# Initialize policy and buffer
	policy = DDPG_col.DDPG(state_dim, action_dim, max_action)
	replay_buffer = utils.ReplayBuffer()
	
	total_timesteps = 0
	episode_num = 0
	done = True 

	while total_timesteps < max_timesteps:
		
		if done: 

			if total_timesteps != 0:
				policy.train(replay_buffer, episode_timesteps)

				avgtest_reward = evaluate_policy(policy, test_env, eval_episodes=10)


				logger.log_tabular('Episode', episode_num)
				logger.log_tabular('AverageTestEpRet', avgtest_reward)
				logger.log_tabular('TotalSteps', total_timesteps)
				logger.log_tabular('EpRet', episode_reward)
				logger.log_tabular('EpLen', episode_timesteps)
				logger.dump_tabular()


			# Reset environment
			obs = env.reset()
			done = False
			episode_reward = 0
			episode_timesteps = 0
			episode_num += 1 
		
		# Select action randomly or according to policy
		if total_timesteps < start_timesteps:
			action = env.action_space.sample()
		else:
			action = policy.select_action(np.array(obs))
			if expl_noise != 0:
				action = (action + np.random.normal(0, expl_noise, size=env.action_space.shape[0]))\
							  .clip(env.action_space.low, env.action_space.high)

		# Perform the selected action in the environment
		new_obs, reward, done, _ = env.step(action)
		episode_reward += reward
		episode_timesteps += 1
		total_timesteps += 1

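		# a timeout at the environment's step limit is not treated as a true terminal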
		done_bool = 0 if episode_timesteps == env._max_episode_steps else float(done)

		# Store data in replay buffer

		replay_buffer.add((obs, new_obs, action, reward, done_bool))
		obs = new_obs

		
	# Save final policy
	policy.save("%s" % (file_name), directory="./pytorch_models")
	# Save final buffer
	replay_buffer.save(buffer_name)
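
# A minimal invocation sketch (an assumption, not part of the original file):
# with the defaults above it collects a 1000K-step Hopper-v2 buffer saved as
# "FinalSigma0.5_Hopper-v2_0_1000K", which the offline learners then load.
if __name__ == "__main__":
	ddpg_genbuf(env_set="Hopper-v2", seed=0, expl_noise=0.5)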
def bc_ue_learn(env_set="Hopper-v2",
                seed=0,
                buffer_type="FinalSigma0.5",
                buffer_seed=0,
                buffer_size='1000K',
                cut_buffer_size='1000K',
                ue_seed_list=[1],
                gamma=0.99,
                ue_rollout=1000,
                ue_loss_k=10000,
                clip_ue=None,
                detect_interval=10000,
                eval_freq=float(500),
                max_timesteps=float(1e5),
                lr=1e-3,
                wd=0,
                border=0.9,
                logger_kwargs=dict()):

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    print("running on device:", device)
    """set up logger"""
    global logger
    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    rollout_list = [None, 1000, 200, 100, 10]
    k_list = [10000, 1000, 100, 100000, 50000, 5000]

    file_name = "BCueclip_%s_%s" % (env_set, seed)
    buffer_name = "%s_%s_%s" % (buffer_type, env_set, buffer_seed)
    setting_name = "%s_%s_r%s_g%s" % (buffer_name, cut_buffer_size, ue_rollout,
                                      gamma)

    print("---------------------------------------")
    print("Settings: " + file_name)
    print("---------------------------------------")

    env = gym.make(env_set)
    test_env = gym.make(env_set)

    # Set seeds
    env.seed(seed)
    test_env.seed(seed)
    env.action_space.np_random.seed(seed)
    test_env.action_space.np_random.seed(seed)
    torch.manual_seed(seed)
    np.random.seed(seed)

    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.shape[0]
    max_action = float(env.action_space.high[0])

    # Load buffer
    replay_buffer = utils.ReplayBuffer()
    replay_buffer.load(buffer_name + '_' + buffer_size)
    if buffer_size != cut_buffer_size:
        replay_buffer.cut_final(int(cut_buffer_size[:-1]) * 1e3)
    print(replay_buffer.get_length())

    print('buffer setting:', buffer_name + '_' + cut_buffer_size)

    print('clip and selection type:', clip_ue)
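    # pre-selected best upper-envelope seed for each environment, indexed by buffer_seed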
    env_bs_dic = {
        'Hopper-v2': [4, 4],
        'Walker2d-v2': [3, 5],
        'HalfCheetah-v2': [1, 1]
    }
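    # clip_ue=None: use the pre-selected envelope without clipping;
    # "s-auto": compute a clipping value for the selected envelope only;
    # "f-auto": compute clipping info for every seed in ue_seed_list and keep
    # the envelope with the smallest clipping loss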
    if clip_ue is None:
        best_ue_seed = env_bs_dic[env_set][buffer_seed]
        C = None
    elif clip_ue == "s-auto":
        best_ue_seed = env_bs_dic[env_set][buffer_seed]
        print('-- Do clipping on the selected envelope --')
        C, _ = get_ue_clipping_info(best_ue_seed, ue_loss_k, detect_interval, setting_name, state_dim,\
         buffer_info=buffer_name + '_' + cut_buffer_size, ue_setting='[k=%s_MClen=%s_gamma=%s'%(ue_loss_k, ue_rollout, gamma))
    elif clip_ue == "f-auto":
        print('-- Do clipping on each envelope --')
        ues_info = dict()
        for ue_seed in ue_seed_list:
            ues_info[ue_seed] = get_ue_clipping_info(ue_seed, ue_loss_k, detect_interval, setting_name, state_dim,\
            buffer_info=buffer_name + '_' + cut_buffer_size, ue_setting='[k=%s_MClen=%s_gamma=%s'%(ue_loss_k, ue_rollout, gamma))
        print('Auto clipping info:', ues_info)
        clipping_val_list, clipping_loss_list = tuple(
            map(list, zip(*ues_info.values())))
        sele_idx = int(np.argmin(np.array(clipping_loss_list)))
        best_ue_seed = ue_seed_list[sele_idx]
        C = clipping_val_list[sele_idx]

    print("Best UE", best_ue_seed, "Clipping value: ", C)

    print('-- Policy train starts --')
    gts = np.load('./results/ueMC_%s_Gt.npy' % setting_name, allow_pickle=True)
    print('Load best envelope from', './results/ueMC_%s_Gt.npy' % setting_name)
    upper_envelope = Value(state_dim, activation='relu')
    upper_envelope.load_state_dict(
        torch.load('%s/%s_UE.pth' %
                   ("./pytorch_models", setting_name + '_s%s_lok%s' %
                    (best_ue_seed, ue_loss_k))))
    print(
        'Load best envelope from',
        '%s/%s_UE.pth' % ("./pytorch_models", setting_name + '_s%s_lok%s' %
                          (best_ue_seed, ue_loss_k)))
    print('with testing MClength:', ue_rollout, 'training loss ratio k:',
          ue_loss_k)

    #plot_envelope(upper_envelope, states, actions, gts, buffer_name, seed)

    # Initialize policy
    policy = BC_ue_border_clip.BC_ue(state_dim,
                                     action_dim,
                                     max_action,
                                     lr=lr,
                                     wd=wd,
                                     ue_valfunc=upper_envelope,
                                     mc_rets=gts)

    episode_num = 0
    done = True

    training_iters, epoch = 0, 0
    while training_iters < max_timesteps:
        epoch += 1
        pol_vals = policy.train(replay_buffer,
                                iterations=int(eval_freq),
                                border=border,
                                logger=logger,
                                C=C)

        avgtest_reward = evaluate_policy(policy, test_env)
        training_iters += eval_freq

        logger.log_tabular('Epoch', epoch)
        logger.log_tabular('AverageTestEpRet', avgtest_reward)
        logger.log_tabular('TotalSteps', training_iters)
        logger.log_tabular('Loss', average_only=True)
        logger.log_tabular('SVal', with_min_and_max=True)
        logger.log_tabular('UpSize', with_min_and_max=True)
        logger.dump_tabular()
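
# A minimal invocation sketch (an assumption, not part of the original file):
# it presumes the saved buffer, the Monte Carlo returns under ./results, and
# the trained upper-envelope checkpoints under ./pytorch_models already exist.
if __name__ == "__main__":
    bc_ue_learn(env_set="Hopper-v2", seed=0, clip_ue=None)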