def main(exp_name=None, fusion=False, visible_gpus='0', discount=0.99):
    """Train a state-only AIRL discriminator on CustomAnt-v0.

    Expert demonstrations are pulled from previously run forward-RL
    experiments; results are logged under data/ant_state_irl/<exp_name>.
    """
    ant_env = TfEnv(CustomGymEnv('CustomAnt-v0', record_video=False,
                                 record_log=False))

    # Single-threaded TF session restricted to the requested GPUs.
    gpu_opts = tf.GPUOptions(allow_growth=True,
                             visible_device_list=visible_gpus)
    session_config = tf.ConfigProto(inter_op_parallelism_threads=1,
                                    intra_op_parallelism_threads=1,
                                    gpu_options=gpu_opts)

    # load ~2 iterations worth of data from each forward RL experiment as demos
    demos = load_latest_experts_multiple_runs('data/ant_data_collect', n=2,
                                              visible_gpus=visible_gpus)

    discriminator = AIRL(discount=discount, env=ant_env, expert_trajs=demos,
                         state_only=True, fusion=fusion, max_itrs=10)
    mlp_policy = GaussianMLPPolicy(name='policy', env_spec=ant_env.spec,
                                   hidden_sizes=(32, 32))
    trainer = IRLTRPO(
        env=ant_env,
        policy=mlp_policy,
        irl_model=discriminator,
        n_itr=1000,
        batch_size=10000,
        max_path_length=500,
        discount=discount,
        store_paths=True,
        irl_model_wt=1.0,
        entropy_weight=0.1,
        zero_environment_reward=True,
        baseline=LinearFeatureBaseline(env_spec=ant_env.spec),
    )
    log_dir = 'data/ant_state_irl/%s' % exp_name
    with rllab_logdir(algo=trainer, dirname=log_dir):
        with tf.Session(config=session_config) as sess:
            trainer.train(sess)
def main():
    """Train GAIL on PointMazeLeft-v0 from expert demos, then evaluate.

    Bug fix: the original opened ``tf.Session()`` without binding it and then
    referenced an undefined name ``sess`` in ``test_pointmaze(sess.run(policy))``,
    which raised ``NameError`` as soon as training finished. The session is now
    bound with ``as sess`` so the evaluation call can use it.
    """
    env = TfEnv(CustomGymEnv('PointMazeLeft-v0'))
    experts = load_latest_experts('data/point', n=50)

    irl_model = GAIL(env_spec=env.spec, expert_trajs=experts)
    policy = GaussianMLPPolicy(name='policy', env_spec=env.spec,
                               hidden_sizes=(32, 32))
    algo = IRLTRPO(
        env=env,
        policy=policy,
        irl_model=irl_model,
        n_itr=2000,
        batch_size=10000,
        max_path_length=100,
        discount=0.99,
        store_paths=True,
        discrim_train_itrs=50,
        irl_model_wt=1.0,
        entropy_weight=0.0,  # GAIL should not use entropy unless for exploration
        zero_environment_reward=True,
        baseline=LinearFeatureBaseline(env_spec=env.spec)
    )
    with rllab_logdir(algo=algo, dirname='data/point_gail'):
        # Bind the session so it is available for the evaluation call below.
        with tf.Session() as sess:
            algo.train()
            # NOTE(review): ``sess.run(policy)`` is kept as in the original;
            # verify this is the intended way to materialize the policy for
            # test_pointmaze (it expects an array-like policy table).
            test_pointmaze(sess.run(policy))
def main(exp_name, params_folder=None):
    """Transfer a pre-trained AIRL reward to DisabledAnt-v0 and retrain TRPO.

    Loads discriminator parameters from a prior IRL run and keeps the reward
    fixed (train_irl=False) while optimizing a fresh policy against it.
    """
    transfer_env = TfEnv(CustomGymEnv('DisabledAnt-v0', record_video=False,
                                      record_log=False))

    # earlier IRL iterations overfit less; 100 seems to work well.
    irl_itr = 100
    checkpoint = os.path.join(DATA_DIR,
                              '%s/itr_%d.pkl' % (params_folder, irl_itr))
    prior_params = load_prior_params(checkpoint)

    reward_model = AIRL(env=transfer_env, expert_trajs=None, state_only=True)
    mlp_policy = GaussianMLPPolicy(name='policy', env_spec=transfer_env.spec,
                                   hidden_sizes=(32, 32))
    trainer = IRLTRPO(
        init_irl_params=prior_params,
        env=transfer_env,
        policy=mlp_policy,
        irl_model=reward_model,
        n_itr=1000,
        batch_size=10000,
        max_path_length=500,
        discount=0.99,
        store_paths=False,
        train_irl=False,
        irl_model_wt=1.0,
        entropy_weight=0.1,
        zero_environment_reward=True,
        baseline=LinearFeatureBaseline(env_spec=transfer_env.spec),
        log_params_folder=params_folder,
        log_experiment_name=exp_name,
    )
    with rllab_logdir(algo=trainer, dirname='data/ant_transfer/%s' % exp_name):
        with tf.Session():
            trainer.train()
def main(exp_name=None, fusion=False):
    """Train state-only AIRL on CustomAnt-v0 from collected expert demos."""
    ant_env = TfEnv(
        CustomGymEnv('CustomAnt-v0', record_video=False, record_log=False))

    # load ~2 iterations worth of data from each forward RL experiment as demos
    demos = load_latest_experts_multiple_runs('data/ant_data_collect', n=2)

    discriminator = AIRL(env=ant_env, expert_trajs=demos, state_only=True,
                         fusion=fusion, max_itrs=10)
    mlp_policy = GaussianMLPPolicy(name='policy', env_spec=ant_env.spec,
                                   hidden_sizes=(32, 32))
    trainer = IRLTRPO(
        env=ant_env,
        policy=mlp_policy,
        irl_model=discriminator,
        n_itr=1000,
        batch_size=10000,
        max_path_length=500,
        discount=0.99,
        store_paths=True,
        irl_model_wt=1.0,
        entropy_weight=0.1,
        zero_environment_reward=True,
        baseline=LinearFeatureBaseline(env_spec=ant_env.spec),
    )
    log_dir = 'data/ant_state_irl/%s' % exp_name
    with rllab_logdir(algo=trainer, dirname=log_dir):
        with tf.Session():
            trainer.train()
def main(exp_name=None, fusion=True):
    """Train Empowerment-based AIRL (EAIRL) on CustomAnt-v0.

    Builds four coupled models — an inverse-dynamics model q(a|s,s'), the
    EAIRL discriminator, and a pair of empowerment networks (online + target)
    — and optimizes them jointly with IRLTRPO against expert demonstrations.
    """
    # env = TfEnv(CustomGymEnv('CustomAnt-v0', record_video=False, record_log=False))
    env = TfEnv(
        CustomGymEnv('CustomAnt-v0', record_video=False, record_log=True))

    # load ~2 iterations worth of data from each forward RL experiment as demos
    experts = load_latest_experts_multiple_runs('data/ant_data_collect', n=2)
    #experts = load_latest_experts('data/ant_data_collect', n=5)

    #qvar: inverse model q(a|s,s')
    qvar = GaussianMLPInversePolicy(name='qvar_model', env_spec=env.spec,
                                    hidden_sizes=(32, 32))
    qvar_model = Qvar(env=env, qvar=qvar, expert_trajs=experts, fusion=True,
                      max_itrs=10)

    #Empowerment-based Adversarial Inverse Reinforcement Learning, set score_discrim=True
    irl_model = EAIRL(env=env, expert_trajs=experts, state_only=False,
                      fusion=fusion, max_itrs=10, score_discrim=True)

    #Empowerment-based potential functions gamma* Phi(s')-Phi(s)
    # Two networks: the online empowerment model and a slower-updating target
    # copy (scope 't_efn'), updated every `target_empw_update` iterations.
    empw_model = Empowerment(env=env, fusion=True, max_itrs=4)
    t_empw_model = Empowerment(env=env, scope='t_efn', fusion=True,
                               max_itrs=2, name='empowerment2')

    policy = GaussianMLPPolicy(name='policy', env_spec=env.spec,
                               hidden_sizes=(32, 32))
    algo = IRLTRPO(
        env=env,
        policy=policy,
        empw=empw_model,
        tempw=t_empw_model,
        qvar_model=qvar_model,
        irl_model=irl_model,
        n_itr=3000,  #130,
        batch_size=20000,
        max_path_length=500,
        discount=0.99,
        store_paths=True,
        target_empw_update=5,
        irl_model_wt=1.0,
        entropy_weight=0.1,
        lambda_i=1.0,
        zero_environment_reward=True,
        baseline=LinearFeatureBaseline(env_spec=env.spec),
        plot=False)
    with rllab_logdir(algo=algo, dirname='data/ant_state_irl'):
        #with rllab_logdir(algo=algo, dirname='data/ant_state_irl/%s' % exp_name):
        # if you use multiple runs, use this line instead of above
        with tf.Session():
            algo.train()
def main(exp_name, ent_wt=1.0, discrete=True):
    """Collect PointMaze demonstrations with TRPO (with optional wandb logging).

    Chooses the discrete or continuous left-maze variant, trains TRPO with an
    entropy bonus of ``ent_wt``, and logs paths to a variant-specific directory.

    NOTE(review): this function reads a module-level ``args`` namespace
    (``args.turn_on_wandb`` etc.) — presumably an argparse result defined
    elsewhere in the file; confirm it exists before calling.
    """
    tf.reset_default_graph()
    if discrete:
        env = TfEnv(
            CustomGymEnv('PointMazeLeft-v0', record_video=False,
                         record_log=False))
    else:
        env = TfEnv(
            CustomGymEnv('PointMazeLeftCont-v0', record_video=False,
                         record_log=False))
    policy = GaussianMLPPolicy(name='policy', env_spec=env.spec,
                               hidden_sizes=(32, 32))
    with tf.Session(config=get_session_config()) as sess:
        algo = TRPO(
            env=env,
            sess=sess,
            policy=policy,
            n_itr=2000,
            batch_size=20000,
            max_path_length=500,
            discount=0.99,
            store_paths=True,
            entropy_weight=ent_wt,
            baseline=LinearFeatureBaseline(env_spec=env.spec),
            exp_name=exp_name,
            turn_on_wandb=args.turn_on_wandb,
            render_env=True,
            gif_dir='logs/maze_wall_meta_irl',
            gif_header='',
            wandb_entity=args.wandb_entity,
            wandb_project=args.wandb_project,
            wandb_run_name=args.wandb_run_name,
            wandb_monitor_gym=args.wandb_monitor_gym,
        )
        # Output directory depends on the environment variant used above.
        if discrete:
            output = 'data/maze_left_data_collect_discrete-15/%s' % exp_name
        else:
            output = 'data/maze_left_data_collect/%s' % exp_name
        with rllab_logdir(algo=algo, dirname=output):
            algo.train()
def main(exp_name=None, fusion=False, visible_gpus='0', discount=0.99, debug=False, n_val=1, n_rew=1,
         max_nstep=1, exp_folder=None, state_only=False, score_discrim=True, score_method=None):
    """Train AIRL-Bootstrap (ensemble of reward/value heads) on PointMazeRight-v0.

    Bug fix: the original called ``sess.__enter__()`` manually and then
    ``sess.close()`` with no matching ``__exit__``, which installs the session
    as default but never pops the default-session context stack. Using
    ``with sess:`` installs and removes the default session correctly and
    closes it on exit.
    """
    env = TfEnv(
        CustomGymEnv('PointMazeRight-v0', record_video=False, record_log=False))

    # Single-threaded TF session restricted to the requested GPUs.
    gpu_options = tf.GPUOptions(allow_growth=True,
                                visible_device_list=visible_gpus)
    tf_config = tf.ConfigProto(inter_op_parallelism_threads=1,
                               intra_op_parallelism_threads=1,
                               gpu_options=gpu_options)

    # load ~2 iterations worth of data from each forward RL experiment as demos
    experts = load_latest_experts_multiple_runs('data/maze_right_data_collect',
                                                n=2, visible_gpus=visible_gpus)

    sess = tf.Session(config=tf_config)
    # sess = tf_debug.LocalCLIDebugWrapperSession(sess)

    max_path_length = 500
    irl_model = AIRL_Bootstrap(discount=discount, env=env, expert_trajs=experts,
                               state_only=state_only, fusion=fusion, max_itrs=10,
                               score_discrim=score_discrim, debug=debug,
                               max_nstep=max_nstep, n_value_funct=n_val,
                               n_rew_funct=n_rew, score_method=score_method)
    policy = GaussianMLPPolicy(name='policy', env_spec=env.spec,
                               hidden_sizes=(32, 32))
    algo = IRLTRPO(
        env=env,
        policy=policy,
        irl_model=irl_model,
        n_itr=1000,
        batch_size=10000,
        max_path_length=max_path_length,
        discount=discount,
        store_paths=True,
        irl_model_wt=1.0,
        entropy_weight=0.1,
        zero_environment_reward=True,
        baseline=LinearFeatureBaseline(env_spec=env.spec),
    )
    # temp_folder = '/media/data/temp_exp_nstep/maze_right_state_bootstrap_%d_irl/%s'
    dirname = 'data/maze_right_state_bootstrap_%d_irl/%s/%s' % (
        max_nstep, exp_folder, exp_name
    ) if exp_folder is not None else 'data/maze_right_state_bootstrap_%d_irl/%s' % (
        max_nstep, exp_name)

    with rllab_logdir(algo=algo, dirname=dirname):
        # Context manager replaces the unbalanced sess.__enter__()/sess.close().
        with sess:
            algo.train(sess)
def test_pointmaze(policy):
    """Roll out ``policy`` for 5 episodes on PointMazeRight-v0 and print the
    mean per-step reward of each episode.

    ``policy`` is assumed to be an array-like table of shape
    (n_states, n_actions) whose row ``policy[s]`` is an action-probability
    distribution — TODO confirm against the caller.

    Bug fixes relative to the original:
    - ``np.random.choice(policy.shape[1], policy[s])`` passed the probability
      row as the *size* argument; it must be the ``p=`` keyword.
    - gym's ``step()`` returns ``(obs, reward, done, info)``; the original
      unpacked it as ``s_, r, _, done`` so ``done`` received the info dict.
    - the state ``s`` was never advanced, so actions were always sampled from
      the initial state's distribution.
    """
    test_env = TfEnv(CustomGymEnv('PointMazeRight-v0'))
    for i in range(5):
        done = False
        s = test_env.reset()
        reward = 0
        steps = 0
        while not done:
            # Sample an action according to the policy's distribution at s.
            a = np.random.choice(policy.shape[1], p=policy[s])
            s_, r, done, _ = test_env.step(a)
            steps += 1
            reward += r
            s = s_  # advance to the next state
        print('Average episode reward is {}'.format(reward / steps))
def main():
    """Collect PointMazeRight-v0 demonstrations with vanilla TRPO."""
    maze_env = TfEnv(CustomGymEnv('PointMazeRight-v0'))
    mlp_policy = GaussianMLPPolicy(name='policy', env_spec=maze_env.spec,
                                   hidden_sizes=(32, 32))
    trainer = TRPO(
        env=maze_env,
        policy=mlp_policy,
        n_itr=2000,
        batch_size=10000,
        max_path_length=100,
        discount=0.99,
        store_paths=True,
        baseline=LinearFeatureBaseline(env_spec=maze_env.spec)
    )
    with rllab_logdir(algo=trainer, dirname='data/point_trpo'):
        trainer.train()
def main(exp_name, params_folder=None, visible_gpus='0', discount=0.99):
    """Transfer a pre-trained AIRL reward to PointMazeLeft-v0.

    Loads discriminator parameters from a prior IRL run, freezes the reward
    (train_irl=False), and trains a fresh TRPO policy against it.
    """
    maze_env = TfEnv(
        CustomGymEnv('PointMazeLeft-v0', record_video=False, record_log=False))

    # Single-threaded TF session restricted to the requested GPUs.
    gpu_opts = tf.GPUOptions(allow_growth=True,
                             visible_device_list=visible_gpus)
    session_config = tf.ConfigProto(inter_op_parallelism_threads=1,
                                    intra_op_parallelism_threads=1,
                                    gpu_options=gpu_opts)

    # earlier IRL iterations overfit less; 100 seems to work well.
    irl_itr = 100
    checkpoint = os.path.join(DATA_DIR,
                              '%s/itr_%d.pkl' % (params_folder, irl_itr))
    prior_params = load_prior_params(checkpoint, session_config)

    reward_model = AIRL(discount=discount, env=maze_env, expert_trajs=None,
                        state_only=True)
    mlp_policy = GaussianMLPPolicy(name='policy', env_spec=maze_env.spec,
                                   hidden_sizes=(32, 32))
    trainer = IRLTRPO(
        init_irl_params=prior_params,
        env=maze_env,
        policy=mlp_policy,
        irl_model=reward_model,
        n_itr=1000,
        batch_size=10000,
        max_path_length=500,
        discount=discount,
        store_paths=False,
        train_irl=False,
        irl_model_wt=1.0,
        entropy_weight=0.1,
        zero_environment_reward=True,
        baseline=LinearFeatureBaseline(env_spec=maze_env.spec),
        log_params_folder=params_folder,
        log_experiment_name=exp_name,
    )
    log_dir = 'data/maze_left_transfer/%s' % exp_name
    with rllab_logdir(algo=trainer, dirname=log_dir):
        with tf.Session(config=session_config) as sess:
            trainer.train(sess)
def main(exp_name, ent_wt=1.0):
    """Collect CustomAnt-v0 expert demonstrations with entropy-regularized TRPO."""
    tf.reset_default_graph()
    ant_env = TfEnv(CustomGymEnv('CustomAnt-v0', record_video=False,
                                 record_log=False))
    mlp_policy = GaussianMLPPolicy(name='policy', env_spec=ant_env.spec,
                                   hidden_sizes=(32, 32))
    with tf.Session(config=get_session_config()) as sess:
        trainer = TRPO(
            env=ant_env,
            sess=sess,
            policy=mlp_policy,
            n_itr=1500,
            batch_size=20000,
            max_path_length=500,
            discount=0.99,
            store_paths=True,
            entropy_weight=ent_wt,
            baseline=LinearFeatureBaseline(env_spec=ant_env.spec),
            exp_name=exp_name,
        )
        log_dir = 'data/ant_data_collect/%s' % exp_name
        with rllab_logdir(algo=trainer, dirname=log_dir):
            trainer.train(sess)
def main(exp_name, ent_wt=1.0, visible_gpus='0', discount=0.99):
    """Collect PointMazeRight-v0 demonstrations with entropy-regularized TRPO."""
    tf.reset_default_graph()
    maze_env = TfEnv(CustomGymEnv('PointMazeRight-v0', record_video=False,
                                  record_log=False))
    mlp_policy = GaussianMLPPolicy(name='policy', env_spec=maze_env.spec,
                                   hidden_sizes=(32, 32))

    # Single-threaded TF session restricted to the requested GPUs.
    gpu_opts = tf.GPUOptions(allow_growth=True,
                             visible_device_list=visible_gpus)
    session_config = tf.ConfigProto(inter_op_parallelism_threads=1,
                                    intra_op_parallelism_threads=1,
                                    gpu_options=gpu_opts)

    with tf.Session(config=session_config) as sess:
        trainer = TRPO(
            env=maze_env,
            sess=sess,
            policy=mlp_policy,
            n_itr=1500,
            batch_size=20000,
            max_path_length=500,
            discount=discount,
            store_paths=True,
            entropy_weight=ent_wt,
            baseline=LinearFeatureBaseline(env_spec=maze_env.spec),
            exp_name=exp_name,
        )
        log_dir = 'data/maze_right_data_collect/%s' % exp_name
        with rllab_logdir(algo=trainer, dirname=log_dir):
            trainer.train(sess)
def main(exp_name=None, fusion=False, latent_dim=3):
    """Train Meta-InfoAIRL on PointMazeLeft-v0 with a learned context encoder.

    A contextual policy pi(a|s,m) and an approximate posterior q(m|tau) over
    the latent context m are trained jointly with the InfoAIRL discriminator
    via MetaIRLTRPO. The context encoder is optionally pre-trained on the
    expert data before adversarial training begins.
    """
    # Experiment hyper-parameters.
    max_path_length = 100
    info_coeff = 0.1        # weight of the mutual-information term
    imitation_coeff = 0.01  # weight of the imitation term
    batch_size = 16
    meta_batch_size = 50
    max_itrs = 20
    pre_epoch = 1000        # context-encoder pretraining epochs
    entropy_weight = 1.0

    # Reward-network architecture; relu_net takes explicit layer sizes.
    reward_arch = relu_net
    if reward_arch == relu_net:
        layers = 2
        d_hidden = 32
        reward_arch_args = {
            'layers': layers,
            'd_hidden': d_hidden,
        }
    else:
        layers, d_hidden = 0, 0
        reward_arch_args = None

    tf.reset_default_graph()
    env = TfEnv(
        CustomGymEnv('PointMazeLeft-v0', record_video=False, record_log=False))

    # load ~2 iterations worth of data from each forward RL experiment as demos
    experts = load_latest_experts_multiple_runs(
        'data/maze_left_data_collect_discrete-15', n=4, latent_dim=latent_dim)

    # contexual policy pi(a|s,m)
    policy = GaussianMLPPolicy(name='policy', env_spec=env.spec,
                               hidden_sizes=(32, 32))

    # approximate posterior q(m|tau)
    # The encoder observes a whole trajectory (obs without the appended latent
    # dims, concatenated with actions, tiled over max_path_length) and outputs
    # a latent context in [0, 1]^latent_dim.
    context_encoder_spec = EnvSpec(
        observation_space=Box(
            np.tile(
                np.concatenate((env.observation_space.low[:-latent_dim],
                                env.action_space.low)), max_path_length),
            np.tile(
                np.concatenate((env.observation_space.high[:-latent_dim],
                                env.action_space.high)), max_path_length)),
        action_space=Box(np.zeros(latent_dim), np.ones(latent_dim)),
    )
    context_encoder = GaussianMLPPolicy(name='context_encoder',
                                        env_spec=context_encoder_spec,
                                        hidden_sizes=(128, 128))

    pretrain_model = Pretrain(experts, policy, context_encoder, env,
                              latent_dim, batch_size=400, kl_weight=0.1,
                              epoch=pre_epoch)
    # pretrain_model = None
    if pretrain_model is None:
        pre_epoch = 0  # no pretraining happened; keep the log name honest

    irl_model = InfoAIRL(env=env, policy=policy,
                         context_encoder=context_encoder,
                         reward_arch=reward_arch,
                         reward_arch_args=reward_arch_args,
                         expert_trajs=experts, state_only=True,
                         max_path_length=max_path_length, fusion=fusion,
                         max_itrs=max_itrs, meta_batch_size=meta_batch_size,
                         imitation_coeff=imitation_coeff,
                         info_coeff=info_coeff, latent_dim=latent_dim)
    algo = MetaIRLTRPO(
        env=env,
        policy=policy,
        irl_model=irl_model,
        randomize_policy=True,
        pretrain_model=pretrain_model,
        n_itr=3000,
        meta_batch_size=meta_batch_size,
        batch_size=batch_size,
        max_path_length=max_path_length,
        discount=0.99,
        store_paths=True,
        train_irl=True,
        irl_model_wt=1.0,
        entropy_weight=entropy_weight,
        zero_environment_reward=True,
        baseline=LinearFeatureBaseline(env_spec=env.spec),
    )
    # Log directory encodes every hyper-parameter above.
    if fusion:
        dirname = 'data_fusion_discrete_new/maze_wall_meta_irl_imitcoeff-%s_infocoeff-%s_mbs-%s_bs-%s_itr-%s_preepoch-%s_entropy-%s_RandomPol_Rew-%s-%s/%s' % (
            imitation_coeff, info_coeff, meta_batch_size, batch_size, max_itrs,
            pre_epoch, entropy_weight, layers, d_hidden, exp_name)
    else:
        dirname = 'data_discrete_new/maze_wall_meta_irl_imitcoeff-%s_infocoeff-%s_mbs-%s_bs-%s_itr-%s_preepoch-%s_entropy-%s_RandomPol_Rew-%s-%s/%s' % (
            imitation_coeff, info_coeff, meta_batch_size, batch_size, max_itrs,
            pre_epoch, entropy_weight, layers, d_hidden, exp_name)
    with rllab_logdir(algo=algo, dirname=dirname):
        with tf.Session():
            algo.train()
def main(exp_name=None, latent_dim=3, params_folder=None):
    """Visualize a trained InfoAIRL reward on PointMazeRight-v0 as heatmaps.

    Restores discriminator / context-encoder / policy parameters from a saved
    iteration, evaluates the learned reward on a dense (x, y) grid for 100
    sampled expert contexts, and saves one seaborn heatmap per context
    (goal marked with a star, start region with a circle, the wall as a line).
    """
    max_path_length = 100
    batch_size = 16
    meta_batch_size = 1

    # Reward-network architecture; must match the one used at training time.
    reward_arch = relu_net
    if reward_arch == relu_net:
        layers = 2
        d_hidden = 32
        reward_arch_args = {
            'layers': layers,
            'd_hidden': d_hidden,
        }
    else:
        layers, d_hidden = 0, 0
        reward_arch_args = None

    # tf.reset_default_graph()
    env = TfEnv(
        CustomGymEnv('PointMazeRight-v0', record_video=False,
                     record_log=False))
    # Maze wall geometry (in env coordinates) used only for drawing.
    barrier_range = [0.2, 0.6]
    barrier_y = 0.3

    # load ~2 iterations worth of data from each forward RL experiment as demos
    experts = load_latest_experts_multiple_runs(
        '/atlas/u/lantaoyu/projects/InfoAIRL/data/maze_left_data_collect',
        n=4, latent_dim=latent_dim)

    irl_itr_list = [2800]  # checkpoint iterations to visualize
    for irl_itr in irl_itr_list:
        # params_file = os.path.join(DATA_DIR, '%s/itr_%d.pkl' % (params_folder, irl_itr))
        params_file = os.path.join(DATA_DIR, 'itr_%d.pkl' % irl_itr)
        prior_params = load_prior_params(params_file)
        init_context_encoder_params = load_prior_params(
            params_file, 'context_params')
        # params_file = os.path.join(DATA_DIR, 'itr_%d.pkl' % (irl_itr-800))
        policy_prior_params = load_prior_params(params_file, 'policy_params')
        # policy_prior_params = None

        # contexual policy pi(a|s,m)
        policy = GaussianMLPPolicy(name='policy', env_spec=env.spec,
                                   hidden_sizes=(32, 32))

        # approximate posterior q(m|tau); observes a whole flattened
        # trajectory and outputs a latent context in [0, 1]^latent_dim.
        context_encoder_spec = EnvSpec(
            observation_space=Box(
                np.tile(
                    np.concatenate((env.observation_space.low[:-latent_dim],
                                    env.action_space.low)), max_path_length),
                np.tile(
                    np.concatenate((env.observation_space.high[:-latent_dim],
                                    env.action_space.high)),
                    max_path_length)),
            action_space=Box(np.zeros(latent_dim), np.ones(latent_dim)),
        )
        context_encoder = GaussianMLPPolicy(name='context_encoder',
                                            env_spec=context_encoder_spec,
                                            hidden_sizes=(128, 128))

        irl_model = InfoAIRL(env=env, expert_trajs=experts,
                             reward_arch=reward_arch,
                             reward_arch_args=reward_arch_args,
                             context_encoder=context_encoder,
                             state_only=True,
                             max_path_length=max_path_length,
                             meta_batch_size=meta_batch_size,
                             latent_dim=latent_dim)

        savedir = 'data_fusion_discrete/visualize_reward_right-%s' % irl_itr
        if not os.path.isdir(savedir):
            os.mkdir(savedir)

        with tf.Session() as sess:
            sess.run(tf.global_variables_initializer())
            # Restore all trained parameters into the freshly built graph.
            irl_model.context_encoder.set_param_values(
                init_context_encoder_params)
            policy.set_param_values(policy_prior_params)
            irl_model.set_params(prior_params)

            # Plotting window in env coordinates.
            boundary_low = -0.1
            boundary_high = 0.6

            expert_obs, expert_acts, expert_contexts = irl_model.extract_paths(
                irl_model.expert_trajs,
                keys=('observations', 'actions', 'contexts'),
                T=max_path_length)
            expert_trajs = np.concatenate(
                (expert_obs, expert_acts),
                axis=-1)  # num_experts x T x (state_dim + act_dim)

            grid_size = 0.005
            rescale = 1. / grid_size  # env coords -> heatmap pixel coords

            for itr in range(100):
                # One expert trajectory + its latent context per heatmap.
                expert_traj_batch, m_batch = irl_model.sample_batch(
                    expert_trajs,
                    expert_contexts,
                    batch_size=1,
                    warm_up=False,
                    warm_up_idx=False)
                # Build a dense grid of query states (x, y, 0).
                obs_batch = []
                num_y = 0
                for pos_y in np.arange(boundary_low, boundary_high, grid_size):
                    num_y += 1
                    num_x = 0
                    for pos_x in np.arange(boundary_low, boundary_high,
                                           grid_size):
                        num_x += 1
                        obs_batch.append([pos_x, pos_y, 0.])
                obs_batch = np.array(obs_batch).reshape(
                    [1, -1, max_path_length, 3])
                # Tile the expert trajectory so each grid point is paired
                # with the same conditioning trajectory.
                expert_traj_batch = np.tile(
                    np.reshape(expert_traj_batch,
                               [1, 1, max_path_length, -1]),
                    [1, obs_batch.shape[1], 1, 1])
                reward = tf.get_default_session().run(
                    irl_model.reward,
                    feed_dict={
                        irl_model.expert_traj_var: expert_traj_batch,
                        irl_model.obs_t: obs_batch
                    })
                score = reward[:, 0]
                ax = sns.heatmap(score.reshape([num_x, num_y]),
                                 cmap="YlGnBu_r")
                # Star: the sampled context's goal position.
                ax.scatter((m_batch[0][0][0] - boundary_low) * rescale,
                           (m_batch[0][0][1] - boundary_low) * rescale,
                           marker='*',
                           s=150,
                           c='r',
                           edgecolors='k',
                           linewidths=0.5)
                # Circle: jittered start region around (0.3, 0.0).
                ax.scatter((0.3 - boundary_low +
                            np.random.uniform(low=-0.05, high=0.05)) * rescale,
                           (0. - boundary_low +
                            np.random.uniform(low=-0.05, high=0.05)) * rescale,
                           marker='o',
                           s=120,
                           c='white',
                           linewidths=0.5,
                           edgecolors='k')
                # Thick line: the maze wall.
                ax.plot([(barrier_range[0] - boundary_low) * rescale,
                         (barrier_range[1] - boundary_low) * rescale],
                        [(barrier_y - boundary_low) * rescale,
                         (barrier_y - boundary_low) * rescale],
                        color='k',
                        linewidth=10)
                ax.invert_yaxis()
                plt.axis('off')
                plt.savefig(savedir + '/%s.png' % itr)
                print('Save Itr', itr)
                plt.close()
from inverse_rl.envs.env_utils import CustomGymEnv
from inverse_rl.utils.log_utils import rllab_logdir
from inverse_rl.utils.hyper_sweep import run_sweep_parallel, run_sweep_serial

#Loads a policy from the given pickle-file and records a video
if __name__ == "__main__":
    # Previously used checkpoints, kept for reference:
    #filename='data/ant_data_collect/2018_05_25_13_42_59_0/itr_1499.pkl'
    #filename='data/ant_data_collect/2018_05_23_15_21_40_0/itr_1499.pkl'
    #filename='data/ant_data_collect/2018_05_19_07_56_37_1/itr_1499.pkl'
    #filename='data/ant_data_collect/2018_05_19_07_56_37_0/itr_1485.pkl'
    #filename='data/ant_state_irl/2018_05_26_08_51_16_0/itr_999.pkl'
    #filename='data/ant_state_irl/2018_05_26_08_51_16_1/itr_999.pkl'
    #filename='data/ant_state_irl/2018_05_26_08_51_16_2/itr_999.pkl'
    filename = 'data/ant_transfer/2018_05_26_16_06_05_4/itr_999.pkl'
    import gym
    import joblib
    import rllab.misc.logger as rllablogger
    # NOTE(review): tf, TfEnv and get_session_config are presumably imported
    # elsewhere in this file — confirm before running this section alone.
    tf.reset_default_graph()
    with tf.Session(config=get_session_config()) as sess:
        rllablogger.set_snapshot_dir("data/video")
        saved = joblib.load(filename)
        env = TfEnv(
            CustomGymEnv('CustomAnt-v0', record_video=True, record_log=True)
        )  #'DisabledAnt-v0' #Switch for the DisabledAnt for the transfer task
        policy = saved['policy']
        observation = env.reset()
        # Roll the loaded policy out for 1000 steps while rendering;
        # the env wrapper records the video.
        for _ in range(1000):
            env.render()
            action, rest = policy.get_action(observation)
            observation, reward, done, info = env.step(action)
def main(exp_name=None, latent_dim=3, params_folder=None):
    """Fine-tune a pre-trained Meta-InfoAIRL model on a PointMaze variant.

    Restores IRL and context-encoder parameters from a saved iteration,
    optionally filters the expert set to contexts within ``good_range``,
    and continues training with MetaIRLTRPO (context encoder + policy).
    Collects (irl_itr, best policy return) pairs in ``results``.
    """
    max_path_length = 100
    batch_size = 32
    meta_batch_size = 50
    entropy_weight = 0.1
    left = 'right'      # which maze variant to fine-tune on
    if_filtered = True  # restrict experts to contexts within good_range

    # tf.reset_default_graph()
    if left == 'left':
        env = TfEnv(
            CustomGymEnv('PointMazeLeft-v0', record_video=False,
                         record_log=False))
    else:
        env = TfEnv(
            CustomGymEnv('PointMazeRight-v0', record_video=False,
                         record_log=False))

    # load ~2 iterations worth of data from each forward RL experiment as demos
    experts = load_latest_experts_multiple_runs(
        '/atlas/u/lantaoyu/projects/InfoAIRL/data/maze_left_data_collect',
        n=4, latent_dim=latent_dim)

    if if_filtered:
        # Keep only experts whose first context coordinate lies in good_range,
        # then truncate so the count is a multiple of meta_batch_size.
        experts_filtered = []
        good_range = [0.1, 0.4]  #[0.3, 0.5]
        for expert in experts:
            if expert['contexts'][0, 0] >= good_range[0] and expert['contexts'][
                    0, 0] <= good_range[1]:
                experts_filtered.append(expert)
        assert len(experts_filtered) >= meta_batch_size
        experts_filtered = experts_filtered[:-(len(experts_filtered) %
                                               meta_batch_size)]
        experts = experts_filtered

    irl_itr_list = [2800]  # checkpoint iterations to fine-tune from
    results = []
    for irl_itr in irl_itr_list:
        params_file = os.path.join(DATA_DIR, 'itr_%d.pkl' % irl_itr)
        prior_params = load_prior_params(params_file)
        init_context_encoder_params = load_prior_params(
            params_file, 'context_params')
        policy_prior_params = None  # start the policy from scratch

        # contexual policy pi(a|s,m)
        policy = GaussianMLPPolicy(name='policy', env_spec=env.spec,
                                   hidden_sizes=(32, 32))

        # approximate posterior q(m|tau); observes a whole flattened
        # trajectory and outputs a latent context in [0, 1]^latent_dim.
        context_encoder_spec = EnvSpec(
            observation_space=Box(
                np.tile(
                    np.concatenate((env.observation_space.low[:-latent_dim],
                                    env.action_space.low)), max_path_length),
                np.tile(
                    np.concatenate((env.observation_space.high[:-latent_dim],
                                    env.action_space.high)),
                    max_path_length)),
            action_space=Box(np.zeros(latent_dim), np.ones(latent_dim)),
        )
        context_encoder = GaussianMLPPolicy(name='context_encoder',
                                            env_spec=context_encoder_spec,
                                            hidden_sizes=(128, 128))

        irl_model = InfoAIRL(env=env, expert_trajs=experts,
                             context_encoder=context_encoder,
                             state_only=True,
                             max_path_length=max_path_length,
                             meta_batch_size=meta_batch_size,
                             latent_dim=latent_dim)
        algo = MetaIRLTRPO(
            init_irl_params=prior_params,
            init_pol_params=policy_prior_params,  #policy_prior_params,
            init_context_encoder_params=init_context_encoder_params,
            env=env,
            policy=policy,
            irl_model=irl_model,
            n_itr=150,
            meta_batch_size=meta_batch_size,
            batch_size=batch_size,
            max_path_length=max_path_length,
            discount=0.99,
            store_paths=True,
            train_irl=True,  # True
            train_context_only=True,
            train_policy=True,
            irl_model_wt=1.0,
            entropy_weight=entropy_weight,
            zero_environment_reward=True,
            baseline=LinearFeatureBaseline(env_spec=env.spec),
            log_params_folder=params_folder,
            log_experiment_name=exp_name,
        )
        with rllab_logdir(
                algo=algo,
                dirname=
                'data_finetune/maze_finetune_discrete-entropy-%s-irl_itr-%s-%s-%s-generalize/%s'
                % (entropy_weight, irl_itr, left,
                   'filter' if if_filtered else '', exp_name)):
            with tf.Session():
                algo.train()
        # Record the best return achieved for this checkpoint, then reset
        # the graph so the next iteration builds fresh variables.
        results.append((irl_itr, np.max(algo.pol_ret)))
        tf.reset_default_graph()
    print(results)
def main(exp_name=None, params_folder='data/ant_state_irl'):
    """Transfer a pre-trained EAIRL model to DisabledAnt-v0 and retrain.

    Restores the IRL discriminator and policy parameters from a saved
    iteration, rebuilds the inverse model and empowerment networks, and
    continues joint training with IRLTRPO on the disabled-ant task.
    """
    # env = TfEnv(CustomGymEnv('PointMazeLeft-v0', record_video=True, record_log=True,force_reset=True))
    env = TfEnv(
        CustomGymEnv('DisabledAnt-v0', record_video=False, record_log=False,
                     force_reset=False))

    irl_itr = 90  # earlier IRL iterations overfit less; either 80 or 90 seems to work well. But I usually search through 60,65,70,75, .. uptil 100
    #params_file = os.path.join(DATA_DIR, '%s/itr_%d.pkl' % (params_folder, irl_itr))
    params_file = os.path.join(DATA_DIR, 'itr_%d.pkl' % (irl_itr))
    prior_params = load_prior_params(params_file)
    '''q_itr = 400 # earlier IRL iterations overfit less; 100 seems to work well.
    #params_file = os.path.join(DATA_DIR, '%s/itr_%d.pkl' % (params_folder, irl_itr))
    params_file = os.path.join(DATA_DIR, 'itr_%d.pkl' % (q_itr))
    prior_params_q = load_prior_params(params_file)'''
    experts = load_latest_experts_multiple_runs('data/ant_data_collect', n=2)

    # Inverse model q(a|s,s') and its training wrapper.
    qvar = GaussianMLPInversePolicy(name='qvar_model', env_spec=env.spec,
                                    hidden_sizes=(32, 32))
    qvar_model = Qvar(env=env, qvar=qvar, expert_trajs=None, max_itrs=10)
    irl_model = EAIRL(env=env, expert_trajs=experts, state_only=False,
                      score_discrim=False)

    # Online empowerment network and a slower-updating target copy.
    empw_model = Empowerment(env=env, max_itrs=1)
    t_empw_model = Empowerment(env=env, scope='t_efn', max_itrs=2,
                               name='empowerment2')

    policy = GaussianMLPPolicy(name='policy', env_spec=env.spec,
                               hidden_sizes=(32, 32))
    algo = IRLTRPO(
        # Only the IRL and policy parameters are warm-started; empowerment
        # and the inverse model are trained from scratch.
        init_irl_params=prior_params['irl_params'],
        init_empw_params=None,  #prior_params['empw_params'],
        init_qvar_params=None,  #prior_params['qvar_params'],
        init_policy_params=prior_params['policy_params'],  #None
        env=env,
        policy=policy,
        empw=empw_model,
        tempw=t_empw_model,
        qvar_model=qvar_model,
        irl_model=irl_model,
        n_itr=2000,
        batch_size=20000,
        max_path_length=500,
        discount=0.99,
        store_paths=False,
        train_irl=True,
        train_empw=True,
        train_qvar=True,
        irl_model_wt=1.0,
        entropy_weight=0.1,
        zero_environment_reward=True,
        baseline=LinearFeatureBaseline(env_spec=env.spec),
        log_params_folder=params_folder,
        log_experiment_name=exp_name,
        # plot=True,
    )
    with rllab_logdir(algo=algo, dirname='data/ant_transfer'):  #%s'%exp_name):
        #with rllab_logdir(algo=algo, dirname='data/ant_transfer%s'%exp_name):
        with tf.Session():
            algo.train()
from inverse_rl.models.tf_util import get_session_config, load_prior_params
from sandbox.rocky.tf.policies.gaussian_mlp_policy import GaussianMLPPolicy
from sandbox.rocky.tf.envs.base import TfEnv
from rllab.baselines.linear_feature_baseline import LinearFeatureBaseline
from inverse_rl.envs.env_utils import CustomGymEnv
from inverse_rl.utils.log_utils import rllab_logdir
from inverse_rl.utils.hyper_sweep import run_sweep_parallel, run_sweep_serial
import pdb
import numpy as np
from sandbox.rocky.tf.samplers.batch_sampler import BatchSampler
from sandbox.rocky.tf.samplers.vectorized_sampler import VectorizedSampler
from rllab.sampler.utils import rollout

# Script: restore a saved policy's parameters into a freshly built
# GaussianMLPPolicy for DisabledAnt-v0.
# NOTE(review): tf is presumably imported elsewhere in this file — confirm
# before running this section alone.
# env = TfEnv(CustomGymEnv('CustomAnt-v0', record_video=False, record_log=False, force_reset=False))
env = TfEnv(
    CustomGymEnv('DisabledAnt-v0', record_video=False, record_log=False,
                 force_reset=False))
# logdir = '/home/usaywook/ext256/inverse_rl/data/ant_state_irl/itr_2999.pkl'
logdir = '/home/usaywook/ext256/inverse_rl/data/ant_transfer/itr_1500.pkl'
params = load_prior_params(logdir)
loaded_params = params['policy_params']
policy = GaussianMLPPolicy(name='policy', env_spec=env.spec,
                           hidden_sizes=(32, 32))
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    if loaded_params is not None:
        # x = list(params['policy']._cached_params.values())[0]
        # y = list(params['policy']._cached_param_dtypes.values())[0]
        # Overwrite the randomly initialized weights with the saved ones.
        policy.set_param_values(loaded_params)
        # pdb.set_trace()