def main():
    env = TfEnv(CustomGymEnv('PointMazeLeft-v0'))
    experts = load_latest_experts('data/point', n=50)

    irl_model = GCLDiscrimTrajectory(env_spec=env.spec, expert_trajs=experts)
    policy = GaussianMLPPolicy(name='policy', env_spec=env.spec, hidden_sizes=(32, 32))
    algo = IRLTRPO(
        env=env,
        policy=policy,
        irl_model=irl_model,
        n_itr=2000,
        batch_size=10000,
        max_path_length=100,
        discount=0.99,
        store_paths=True,
        discrim_train_itrs=50,
        irl_model_wt=1.0,
        entropy_weight=0.1,  # this should be 1.0 but 0.1 seems to work better
        zero_environment_reward=True,
        baseline=LinearFeatureBaseline(env_spec=env.spec)
    )

    with rllab_logdir(algo=algo, dirname='data/point_traj'):
        with tf.Session() as sess:  # bind the session so it can be used after training
            algo.train()
            test_pointmaze(sess.run(policy))
def main():
    env = TfEnv(GymEnv('Ant-v1', record_video=False, record_log=False))
    experts = load_latest_experts('data/ant', n=50)

    irl_model = GCLDiscrim(
        env_spec=env.spec,
        expert_trajs=experts,
        discrim_arch=disentangled_net)
    policy = GaussianMLPPolicy(name='policy', env_spec=env.spec, hidden_sizes=(32, 32))
    algo = IRLTRPO(
        env=env,
        policy=policy,
        irl_model=irl_model,
        n_itr=2000,
        batch_size=10000,
        max_path_length=1000,
        discount=0.995,
        store_paths=True,
        discrim_train_itrs=50,
        irl_model_wt=1.0,
        entropy_weight=0.1,
        zero_environment_reward=True,
        baseline=LinearFeatureBaseline(env_spec=env.spec)
    )

    with rllab_logdir(algo=algo, dirname='data/ant_airl'):
        with tf.Session():
            algo.train()
def main(exp_name, params_folder=None):
    env = TfEnv(CustomGymEnv('DisabledAnt-v0', record_video=False, record_log=False))

    irl_itr = 100  # earlier IRL iterations overfit less; 100 seems to work well.
    params_file = os.path.join(DATA_DIR, '%s/itr_%d.pkl' % (params_folder, irl_itr))
    prior_params = load_prior_params(params_file)

    irl_model = AIRL(env=env, expert_trajs=None, state_only=True)
    policy = GaussianMLPPolicy(name='policy', env_spec=env.spec, hidden_sizes=(32, 32))
    algo = IRLTRPO(
        init_irl_params=prior_params,
        env=env,
        policy=policy,
        irl_model=irl_model,
        n_itr=1000,
        batch_size=10000,
        max_path_length=500,
        discount=0.99,
        store_paths=False,
        train_irl=False,
        irl_model_wt=1.0,
        entropy_weight=0.1,
        zero_environment_reward=True,
        baseline=LinearFeatureBaseline(env_spec=env.spec),
        log_params_folder=params_folder,
        log_experiment_name=exp_name,
    )

    with rllab_logdir(algo=algo, dirname='data/ant_transfer/%s' % exp_name):
        with tf.Session():
            algo.train()
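# The transfer scripts here call a load_prior_params() helper that is not shown.
# The following is only a minimal sketch of what such a helper could look like,
# assuming the itr_N.pkl snapshots were written by rllab via joblib. The exact
# return value (the full snapshot dict vs. just an 'irl_params' entry) and
# whether a tf.ConfigProto is accepted differ between the scripts in this
# section, so treat both as assumptions.
def load_prior_params(pkl_fname, tf_config=None):
    import joblib
    import tensorflow as tf

    # rllab snapshots contain TF-backed objects, so unpickle them inside a session
    with tf.Session(config=tf_config):
        snapshot = joblib.load(pkl_fname)
    tf.reset_default_graph()  # start from a clean graph before building the new model
    return snapshot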
def main(num_examples=50, discount=0.99):
    env = TfEnv(GymEnv('Pendulum-v0', record_video=False, record_log=False))
    experts = load_latest_experts('data/pendulum', n=num_examples)

    irl_model = GCLDiscrimTrajectory(env_spec=env.spec, expert_trajs=experts)
    policy = GaussianMLPPolicy(name='policy', env_spec=env.spec, hidden_sizes=(32, 32))
    algo = IRLTRPO(
        env=env,
        policy=policy,
        irl_model=irl_model,
        n_itr=200,
        batch_size=2000,
        max_path_length=100,
        discount=discount,
        store_paths=True,
        discrim_train_itrs=50,
        irl_model_wt=1.0,
        entropy_weight=0.1,  # this should be 1.0 but 0.1 seems to work better
        zero_environment_reward=True,
        baseline=LinearFeatureBaseline(env_spec=env.spec))

    with rllab_logdir(algo=algo, dirname='data/pendulum_traj'):
        with tf.Session():
            algo.train()
def main(exp_name=None, fusion=False, visible_gpus='0', discount=0.99):
    env = TfEnv(CustomGymEnv('CustomAnt-v0', record_video=False, record_log=False))

    gpu_options = tf.GPUOptions(allow_growth=True, visible_device_list=visible_gpus)
    tf_config = tf.ConfigProto(inter_op_parallelism_threads=1,
                               intra_op_parallelism_threads=1,
                               gpu_options=gpu_options)

    # load ~2 iterations worth of data from each forward RL experiment as demos
    experts = load_latest_experts_multiple_runs('data/ant_data_collect', n=2,
                                                visible_gpus=visible_gpus)

    irl_model = AIRL(discount=discount, env=env, expert_trajs=experts,
                     state_only=True, fusion=fusion, max_itrs=10)
    policy = GaussianMLPPolicy(name='policy', env_spec=env.spec, hidden_sizes=(32, 32))
    algo = IRLTRPO(
        env=env,
        policy=policy,
        irl_model=irl_model,
        n_itr=1000,
        batch_size=10000,
        max_path_length=500,
        discount=discount,
        store_paths=True,
        irl_model_wt=1.0,
        entropy_weight=0.1,
        zero_environment_reward=True,
        baseline=LinearFeatureBaseline(env_spec=env.spec),
    )

    with rllab_logdir(algo=algo, dirname='data/ant_state_irl/%s' % exp_name):
        with tf.Session(config=tf_config) as sess:
            algo.train(sess)
def main():
    env = TfEnv(
        GymEnv('HRI_AirSim_Landing-v0', record_video=False, record_log=False))

    ### VGG 11/29/18: Added support for CSV files.
    ## This loads expert data saved as a pickle file:
    # experts = load_latest_experts('data/airsim_final', n=1)
    # This one uses CSV:
    experts = load_experts('data/airsim_human_data/log.csv', pickle_format=False)

    irl_model = GAIL(env_spec=env.spec, expert_trajs=experts)
    policy = GaussianMLPPolicy(name='policy', env_spec=env.spec, hidden_sizes=(32, 32))
    algo = IRLTRPO(
        env=env,
        policy=policy,
        irl_model=irl_model,
        n_itr=5000,
        batch_size=60,
        max_path_length=60,
        discount=0.99,
        store_paths=True,
        discrim_train_itrs=100,
        irl_model_wt=1.0,
        entropy_weight=0.0,  # GAIL should not use entropy unless for exploration
        zero_environment_reward=True,
        baseline=LinearFeatureBaseline(env_spec=env.spec),
        n_parallel=0)

    with rllab_logdir(algo=algo, dirname='data/airsim_gail'):
        with tf.Session():
            algo.train()
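# The CSV branch above relies on a load_experts() helper that is not shown here.
# Below is only a rough sketch of what a CSV-based loader could look like,
# assuming the log stores flat observation/action columns and that the IRL
# models accept a list of {'observations', 'actions'} arrays; the column
# prefixes "obs_" / "act_" and the function name are hypothetical.
import numpy as np
import pandas as pd

def load_experts_from_csv(csv_path):
    df = pd.read_csv(csv_path)
    obs_cols = [c for c in df.columns if c.startswith('obs_')]
    act_cols = [c for c in df.columns if c.startswith('act_')]
    traj = {
        'observations': df[obs_cols].values.astype(np.float32),
        'actions': df[act_cols].values.astype(np.float32),
    }
    return [traj]  # a single demonstration trajectory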
def main(exp_name=None, fusion=False, visible_gpus='0', discount=0.99):
    env = TfEnv(GymEnv('Swimmer-v3', record_video=False, record_log=False))

    gpu_options = tf.GPUOptions(allow_growth=True, visible_device_list=visible_gpus)
    tf_config = tf.ConfigProto(inter_op_parallelism_threads=1,
                               intra_op_parallelism_threads=1,
                               gpu_options=gpu_options)

    experts = load_latest_experts('data/swimmer', n=5, visible_gpus=visible_gpus)

    irl_model = AIRL(discount=discount, env=env, expert_trajs=experts,
                     state_only=False, fusion=fusion, max_itrs=10)
    policy = GaussianMLPPolicy(name='policy', env_spec=env.spec, hidden_sizes=(32, 32))
    algo = IRLTRPO(
        env=env,
        policy=policy,
        irl_model=irl_model,
        n_itr=1000,
        batch_size=10000,
        max_path_length=1000,
        discount=discount,
        store_paths=True,
        discrim_train_itrs=50,
        irl_model_wt=1.0,
        entropy_weight=0.1,  # this should be 1.0 but 0.1 seems to work better
        zero_environment_reward=True,
        baseline=LinearFeatureBaseline(env_spec=env.spec)
    )

    with rllab_logdir(algo=algo, dirname='data/swimmer_airl_state_action'):
        with tf.Session(config=tf_config) as sess:
            algo.train(sess)
def main():
    env = TfEnv(GymEnv('Pendulum-v0', record_video=False, record_log=False))
    experts = load_latest_experts('data/pendulum', n=5)

    irl_model = GAIL(env_spec=env.spec, expert_trajs=experts)
    policy = GaussianMLPPolicy(name='policy', env_spec=env.spec, hidden_sizes=(32, 32))
    algo = IRLTRPO(
        env=env,
        policy=policy,
        irl_model=irl_model,
        n_itr=200,
        batch_size=1000,
        max_path_length=100,
        discount=0.99,
        store_paths=True,
        discrim_train_itrs=50,
        irl_model_wt=1.0,
        entropy_weight=0.0,  # GAIL should not use entropy unless for exploration
        zero_environment_reward=True,
        baseline=LinearFeatureBaseline(env_spec=env.spec)
    )

    with rllab_logdir(algo=algo, dirname='data/pendulum_gail'):
        with tf.Session():
            algo.train()
def main(exp_name=None, fusion=False):
    env = TfEnv(
        CustomGymEnv('CustomAnt-v0', record_video=False, record_log=False))

    # load ~2 iterations worth of data from each forward RL experiment as demos
    experts = load_latest_experts_multiple_runs('data/ant_data_collect', n=2)

    irl_model = AIRL(env=env, expert_trajs=experts, state_only=True,
                     fusion=fusion, max_itrs=10)
    policy = GaussianMLPPolicy(name='policy', env_spec=env.spec, hidden_sizes=(32, 32))
    algo = IRLTRPO(
        env=env,
        policy=policy,
        irl_model=irl_model,
        n_itr=1000,
        batch_size=10000,
        max_path_length=500,
        discount=0.99,
        store_paths=True,
        irl_model_wt=1.0,
        entropy_weight=0.1,
        zero_environment_reward=True,
        baseline=LinearFeatureBaseline(env_spec=env.spec),
    )

    with rllab_logdir(algo=algo, dirname='data/ant_state_irl/%s' % exp_name):
        with tf.Session():
            algo.train()
def main():
    env = TfEnv(
        GymEnv('HRI_AirSim_Landing-v0', record_video=False, record_log=False))
    experts = load_latest_experts('data/airsim', n=5)

    irl_model = AIRLStateAction(env_spec=env.spec, expert_trajs=experts)
    policy = GaussianMLPPolicy(name='policy', env_spec=env.spec, hidden_sizes=(32, 32))
    algo = IRLTRPO(
        env=env,
        policy=policy,
        irl_model=irl_model,
        n_itr=10,
        batch_size=100,
        max_path_length=100,
        discount=0.99,
        store_paths=True,
        discrim_train_itrs=50,
        irl_model_wt=1.0,
        entropy_weight=0.1,  # this should be 1.0 but 0.1 seems to work better
        zero_environment_reward=True,
        baseline=LinearFeatureBaseline(env_spec=env.spec))

    with rllab_logdir(algo=algo, dirname='data/airsim_gcl'):
        with tf.Session():
            algo.train()
def main(eval_reward=False):
    env = TfEnv(GymEnv('Pendulum-v0', record_video=False, record_log=False))
    n_experts = 10
    experts = load_latest_experts('plotting/pendulum_final', n=n_experts)
    dirname = 'data/pendulum'  # dir to save logs and images

    irl_model = AIRLStateAction(env_spec=env.spec, expert_trajs=experts)
    policy = GaussianMLPPolicy(name='policy', env_spec=env.spec, hidden_sizes=(32, 32))
    algo = IRLTRPO(
        env=env,
        policy=policy,
        irl_model=irl_model,
        n_itr=1000,
        batch_size=1000,
        max_path_length=100,
        discount=0.99,
        store_paths=True,
        discrim_train_itrs=50,
        irl_model_wt=1.0,
        entropy_weight=0.1,  # this should be 1.0 but 0.1 seems to work better
        zero_environment_reward=True,
        baseline=LinearFeatureBaseline(env_spec=env.spec),
        eval_reward=eval_reward,
        fig_dir=dirname
    )

    # with rllab_logdir(algo=algo, dirname='data/pendulum_gcl{}'.format(n_experts)):
    with rllab_logdir(algo=algo, dirname=dirname):
        with tf.Session():
            algo.fig_dirname = dirname
            algo.train()
def main(exp_name=None, fusion=True):
    # env = TfEnv(CustomGymEnv('CustomAnt-v0', record_video=False, record_log=False))
    env = TfEnv(
        CustomGymEnv('CustomAnt-v0', record_video=False, record_log=True))

    # load ~2 iterations worth of data from each forward RL experiment as demos
    experts = load_latest_experts_multiple_runs('data/ant_data_collect', n=2)
    # experts = load_latest_experts('data/ant_data_collect', n=5)

    # qvar: inverse model q(a|s,s')
    qvar = GaussianMLPInversePolicy(name='qvar_model', env_spec=env.spec, hidden_sizes=(32, 32))
    qvar_model = Qvar(env=env, qvar=qvar, expert_trajs=experts, fusion=True, max_itrs=10)

    # Empowerment-based Adversarial Inverse Reinforcement Learning; set score_discrim=True
    irl_model = EAIRL(env=env, expert_trajs=experts, state_only=False,
                      fusion=fusion, max_itrs=10, score_discrim=True)

    # Empowerment-based potential functions: gamma * Phi(s') - Phi(s)
    empw_model = Empowerment(env=env, fusion=True, max_itrs=4)
    t_empw_model = Empowerment(env=env, scope='t_efn', fusion=True, max_itrs=2, name='empowerment2')

    policy = GaussianMLPPolicy(name='policy', env_spec=env.spec, hidden_sizes=(32, 32))
    algo = IRLTRPO(
        env=env,
        policy=policy,
        empw=empw_model,
        tempw=t_empw_model,
        qvar_model=qvar_model,
        irl_model=irl_model,
        n_itr=3000,  # 130
        batch_size=20000,
        max_path_length=500,
        discount=0.99,
        store_paths=True,
        target_empw_update=5,
        irl_model_wt=1.0,
        entropy_weight=0.1,
        lambda_i=1.0,
        zero_environment_reward=True,
        baseline=LinearFeatureBaseline(env_spec=env.spec),
        plot=False)

    with rllab_logdir(algo=algo, dirname='data/ant_state_irl'):
        # with rllab_logdir(algo=algo, dirname='data/ant_state_irl/%s' % exp_name):
        # if you use multiple runs, use the commented line above instead
        with tf.Session():
            algo.train()
def main(exp_name=None, fusion=False, visible_gpus='0', discount=0.99, debug=False,
         n_val=1, n_rew=1, max_nstep=1, exp_folder=None, state_only=False,
         score_discrim=True, score_method=None):
    env = TfEnv(
        CustomGymEnv('PointMazeRight-v0', record_video=False, record_log=False))

    gpu_options = tf.GPUOptions(allow_growth=True, visible_device_list=visible_gpus)
    tf_config = tf.ConfigProto(inter_op_parallelism_threads=1,
                               intra_op_parallelism_threads=1,
                               gpu_options=gpu_options)

    # load ~2 iterations worth of data from each forward RL experiment as demos
    experts = load_latest_experts_multiple_runs('data/maze_right_data_collect', n=2,
                                                visible_gpus=visible_gpus)

    sess = tf.Session(config=tf_config)
    # sess = tf_debug.LocalCLIDebugWrapperSession(sess)

    max_path_length = 500
    irl_model = AIRL_Bootstrap(discount=discount, env=env, expert_trajs=experts,
                               state_only=state_only, fusion=fusion, max_itrs=10,
                               score_discrim=score_discrim, debug=debug,
                               max_nstep=max_nstep, n_value_funct=n_val,
                               n_rew_funct=n_rew, score_method=score_method)
    policy = GaussianMLPPolicy(name='policy', env_spec=env.spec, hidden_sizes=(32, 32))
    algo = IRLTRPO(
        env=env,
        policy=policy,
        irl_model=irl_model,
        n_itr=1000,
        batch_size=10000,
        max_path_length=max_path_length,
        discount=discount,
        store_paths=True,
        irl_model_wt=1.0,
        entropy_weight=0.1,
        zero_environment_reward=True,
        baseline=LinearFeatureBaseline(env_spec=env.spec),
    )

    # temp_folder = '/media/data/temp_exp_nstep/maze_right_state_bootstrap_%d_irl/%s'
    if exp_folder is not None:
        dirname = 'data/maze_right_state_bootstrap_%d_irl/%s/%s' % (max_nstep, exp_folder, exp_name)
    else:
        dirname = 'data/maze_right_state_bootstrap_%d_irl/%s' % (max_nstep, exp_name)

    with rllab_logdir(algo=algo, dirname=dirname):
        with sess:  # enters the session as default and closes it on exit
            algo.train(sess)
def __init__(self, *args, ablation='none', skip_policy_update=False,
             skip_discriminator=False, optimizer=None, optimizer_args=None,
             buffer_batch_size=16, policy_update_freq=1, **kwargs):
    if optimizer is None:
        # avoid a shared mutable default for optimizer_args
        optimizer = optimizers.PPOOptimizer(**(optimizer_args or {}))
    IRLTRPO.__init__(self, *args, optimizer=optimizer, **kwargs)
    self.ablation = ablation
    self.skip_policy_update = skip_policy_update
    self.skip_discriminator = skip_discriminator
    self.buffer_batch_size = buffer_batch_size
    self.policy_update_freq = policy_update_freq
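# Hedged usage sketch: the constructor above presumably belongs to an IRLTRPO
# subclass; "IRLTRPOWithPPO" is a hypothetical name for it, and env / policy /
# irl_model are assumed to be built as in the other scripts in this section.
algo = IRLTRPOWithPPO(
    env=env,
    policy=policy,
    irl_model=irl_model,
    baseline=LinearFeatureBaseline(env_spec=env.spec),
    n_itr=1000,
    batch_size=10000,
    entropy_weight=0.1,
    zero_environment_reward=True,
    # knobs introduced by the subclass constructor:
    ablation='none',
    skip_policy_update=False,
    skip_discriminator=False,
    buffer_batch_size=16,
    policy_update_freq=1,
    optimizer_args={},  # forwarded to optimizers.PPOOptimizer
)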
def main(exp_name, params_folder=None, visible_gpus='0', discount=0.99):
    env = TfEnv(
        CustomGymEnv('PointMazeLeft-v0', record_video=False, record_log=False))

    gpu_options = tf.GPUOptions(allow_growth=True, visible_device_list=visible_gpus)
    tf_config = tf.ConfigProto(inter_op_parallelism_threads=1,
                               intra_op_parallelism_threads=1,
                               gpu_options=gpu_options)

    irl_itr = 100  # earlier IRL iterations overfit less; 100 seems to work well.
    params_file = os.path.join(DATA_DIR, '%s/itr_%d.pkl' % (params_folder, irl_itr))
    prior_params = load_prior_params(params_file, tf_config)

    irl_model = AIRL(discount=discount, env=env, expert_trajs=None, state_only=True)
    policy = GaussianMLPPolicy(name='policy', env_spec=env.spec, hidden_sizes=(32, 32))
    algo = IRLTRPO(
        init_irl_params=prior_params,
        env=env,
        policy=policy,
        irl_model=irl_model,
        n_itr=1000,
        batch_size=10000,
        max_path_length=500,
        discount=discount,
        store_paths=False,
        train_irl=False,
        irl_model_wt=1.0,
        entropy_weight=0.1,
        zero_environment_reward=True,
        baseline=LinearFeatureBaseline(env_spec=env.spec),
        log_params_folder=params_folder,
        log_experiment_name=exp_name,
    )

    with rllab_logdir(algo=algo, dirname='data/maze_left_transfer/%s' % exp_name):
        with tf.Session(config=tf_config) as sess:
            algo.train(sess)
def main(exp_name=None, params_folder='data/ant_state_irl'):
    # env = TfEnv(CustomGymEnv('PointMazeLeft-v0', record_video=True, record_log=True, force_reset=True))
    env = TfEnv(
        CustomGymEnv('DisabledAnt-v0', record_video=False, record_log=False, force_reset=False))

    # Earlier IRL iterations overfit less; either 80 or 90 seems to work well,
    # but I usually search through 60, 65, 70, 75, ... up to 100.
    irl_itr = 90
    # params_file = os.path.join(DATA_DIR, '%s/itr_%d.pkl' % (params_folder, irl_itr))
    params_file = os.path.join(DATA_DIR, 'itr_%d.pkl' % irl_itr)
    prior_params = load_prior_params(params_file)

    '''q_itr = 400  # earlier IRL iterations overfit less; 100 seems to work well.
    # params_file = os.path.join(DATA_DIR, '%s/itr_%d.pkl' % (params_folder, irl_itr))
    params_file = os.path.join(DATA_DIR, 'itr_%d.pkl' % q_itr)
    prior_params_q = load_prior_params(params_file)'''

    experts = load_latest_experts_multiple_runs('data/ant_data_collect', n=2)

    qvar = GaussianMLPInversePolicy(name='qvar_model', env_spec=env.spec, hidden_sizes=(32, 32))
    qvar_model = Qvar(env=env, qvar=qvar, expert_trajs=None, max_itrs=10)
    irl_model = EAIRL(env=env, expert_trajs=experts, state_only=False, score_discrim=False)
    empw_model = Empowerment(env=env, max_itrs=1)
    t_empw_model = Empowerment(env=env, scope='t_efn', max_itrs=2, name='empowerment2')

    policy = GaussianMLPPolicy(name='policy', env_spec=env.spec, hidden_sizes=(32, 32))
    algo = IRLTRPO(
        init_irl_params=prior_params['irl_params'],
        init_empw_params=None,  # prior_params['empw_params']
        init_qvar_params=None,  # prior_params['qvar_params']
        init_policy_params=prior_params['policy_params'],  # None
        env=env,
        policy=policy,
        empw=empw_model,
        tempw=t_empw_model,
        qvar_model=qvar_model,
        irl_model=irl_model,
        n_itr=2000,
        batch_size=20000,
        max_path_length=500,
        discount=0.99,
        store_paths=False,
        train_irl=True,
        train_empw=True,
        train_qvar=True,
        irl_model_wt=1.0,
        entropy_weight=0.1,
        zero_environment_reward=True,
        baseline=LinearFeatureBaseline(env_spec=env.spec),
        log_params_folder=params_folder,
        log_experiment_name=exp_name,
        # plot=True,
    )

    with rllab_logdir(algo=algo, dirname='data/ant_transfer'):  # 'data/ant_transfer%s' % exp_name
        # with rllab_logdir(algo=algo, dirname='data/ant_transfer%s' % exp_name):
        with tf.Session():
            algo.train()
def main(exp_name, rundir, ent_wt=1.0, env_name='Shaped_PM_MazeRoom_Small-v0',
         method="airl", beta=1e-2, disc_step=1e-3, disc_iters=100,
         disc_batch_size=32, disc_gp=None, trpo_step=1e-2, init_pol_std=1.0,
         adaptive_beta=False, target_kl=0.5, beta_step=1e-6, hid_size=None,
         hid_layers=None, max_traj=None):
    os.makedirs(rundir, exist_ok=True)

    if hid_size is None or hid_layers is None:
        assert hid_size is None and hid_layers is None, \
            "must specify both size & layers, not one or the other"
        hid_layers, hid_size, init_pol_std \
            = min_layers_hidsize_polstd_for(env_name)
    env_trpo_params = irltrpo_params_for(env_name, 'irl')

    env = TfEnv(CustomGymEnv(env_name, record_video=False, record_log=False))
    expert_dir = os.path.join(rundir, 'env_%s/' % env_name.lower())
    experts = load_latest_experts_walky(expert_dir, n=5, max_traj=max_traj)

    disc_net_kwargs = {
        'layers': hid_layers,
        'd_hidden': hid_size,
    }
    if method in {'airl', 'vairl'}:
        is_vairl = method == 'vairl'
        irl_model = shaped_airl.AIRL(
            env=env,
            expert_trajs=experts,
            state_only=True,
            fusion=True,
            discrim_arch_args=disc_net_kwargs,
            fitted_value_fn_arch_args=disc_net_kwargs,
            gp_coeff=disc_gp,
            # vairl flag
            vairl=is_vairl,
            # vairl fixed beta settings
            vairl_beta=beta,
            # vairl adaptive beta settings
            vairl_adaptive_beta=adaptive_beta,
            vairl_beta_step_size=beta_step,
            vairl_kl_target=target_kl)
    elif method in {'gail', 'vail'}:
        is_vail = method == 'vail'
        assert disc_gp is None, "no GAIL/VAIL support for GP coeff"
        irl_model = GAIL(
            env,
            expert_trajs=experts,
            discrim_arch_args=disc_net_kwargs,
            name=method,
            # VAIL uses only an adaptive beta (no fixed beta as in VAIRL)
            vail=is_vail,
            # initial beta
            vail_init_beta=beta,
            vail_beta_step_size=beta_step,
            vail_kl_target=target_kl)
    else:
        raise NotImplementedError("don't know how to handle method '%s'" % (method, ))

    pol_hid_sizes = (hid_size, ) * hid_layers
    policy = GaussianMLPPolicy(name='policy', env_spec=env.spec,
                               hidden_sizes=pol_hid_sizes, init_std=init_pol_std)
    irltrpo_kwargs = dict(
        env=env,
        policy=policy,
        irl_model=irl_model,
        discount=0.99,
        store_paths=True,
        discrim_train_itrs=disc_iters,
        irl_model_wt=1.0,
        irl_lr=disc_step,
        irl_batch_size=disc_batch_size,
        step_size=trpo_step,
        # entropy_weight should be 1.0 but 0.1 seems to work better
        entropy_weight=ent_wt,
        force_batch_sampler=True,
        zero_environment_reward=True,
        baseline=LinearFeatureBaseline(env_spec=env.spec),
    )
    irltrpo_kwargs.update(env_trpo_params)
    n_itr = irltrpo_kwargs['n_itr']
    print('irltrpo_kwargs:', irltrpo_kwargs)
    algo = IRLTRPO(**irltrpo_kwargs)

    run_name = 'env_{env_name}_{method}'.format(env_name=env_name.lower(), method=method)
    exp_folder = os.path.join(rundir, '%s/%s' % (run_name, exp_name))
    with rllab_logdir(algo=algo, dirname=exp_folder):
        with tf.Session():
            algo.train()

    # After IRL training, launch the retraining script against the latest snapshot.
    this_dir = os.path.dirname(__file__)
    maze_retrain_path = os.path.join(this_dir, 'env_retrain.py')
    latest_irl_snap = '%s/itr_%d.pkl' % (exp_folder, n_itr - 1)
    subproc_cmd = [
        # script
        'python', maze_retrain_path,
        # experiment info
        latest_irl_snap,
        '--rundir', rundir,
        # TRPO params
        '--trpo-step', '%f' % trpo_step,
        '--trpo-ent', '%f' % ent_wt,
        # network params
        '--hid-layers', '%d' % hid_layers,
        '--hid-size', '%d' % hid_size,
        # we don't care about the precise method-specific args because we're
        # just reloading a frozen model
        '--method', method,
    ]
    subprocess.run(subproc_cmd, check=True)
trajectories.append(df)
experts.append(data_dict)

log.info("trajs : {}".format(len(trajectories)))

tf.reset_default_graph()
# if not osp.exists(gail_iter_path):
#     os.makedirs(gail_iter_path)
# rllog.set_snapshot_dir(gail_iter_path)

with tf.Session():
    env = TfEnv(env)
    irl_model = GAIL(env_spec=env.spec, expert_trajs=experts)
    policy = GaussianMLPPolicy(name='policy', env_spec=env.spec, hidden_sizes=(64, 64))
    # policy._mean_network = iter_data['policy']._mean_network
    algo = IRLTRPO(
        env=env,
        policy=policy,
        irl_model=irl_model,
        n_itr=t_iter,
        batch_size=batch_size,
        max_path_length=max_path_length,
        discount=0.99,
        store_paths=True,
        discrim_train_itrs=75,
        irl_model_wt=1.0,
        entropy_weight=0.0,  # GAIL should not use entropy unless for exploration
        zero_environment_reward=True,
        baseline=GaussianMLPBaseline(env_spec=env.spec)
    )
    algo.train()
# rllog.set_snapshot_dir(None)
def main(
        exp_name,
        rundir='data',
        irl_pkl='',
        ent_wt=1.0,
        trpo_anneal_steps=None,
        trpo_anneal_init_ent=None,
        trpo_step=0.01,
        init_pol_std=1.0,
        method=None,
        hid_size=None,
        hid_layers=None,
        switch_env=None,
):
    orig_env_name = get_name(irl_pkl)
    if switch_env is not None:
        this_env_name = switch_env
    else:
        this_env_name = orig_env_name
    print("Running on environment '%s'" % this_env_name)
    env = TfEnv(
        CustomGymEnv(this_env_name, record_video=False, record_log=False))

    if hid_size is None or hid_layers is None:
        assert hid_size is None and hid_layers is None, \
            "must specify both size & layers, not one or the other"
        hid_layers, hid_size, init_pol_std \
            = min_layers_hidsize_polstd_for(orig_env_name)
    env_trpo_params = irltrpo_params_for(orig_env_name, 'retrain')

    folder = os.path.dirname(irl_pkl)
    prior_params = load_prior_params(irl_pkl)
    expert_dir = os.path.join(rundir, 'env_%s/' % orig_env_name.lower())
    experts = load_latest_experts_walky(expert_dir, n=5)

    # For some reason IRLTRPO is responsible for setting weights in this code.
    # It would equally be possible to run global_variables_initializer()
    # ourselves and then do irl_model.set_params(prior_params) if we just
    # wanted to query energy, reward, etc. from the trained AIRL model without
    # using IRLTRPO.
    disc_net_kwargs = {
        'layers': hid_layers,
        'd_hidden': hid_size,
    }
    if method in {'airl', 'vairl'}:
        irl_model = AIRL(env=env,
                         expert_trajs=experts,
                         state_only=True,
                         freeze=True,
                         vairl=method == 'vairl',
                         vairl_beta=1e-4,
                         discrim_arch_args=disc_net_kwargs,
                         fitted_value_fn_arch_args=disc_net_kwargs)
    elif method in {'gail', 'vail'}:
        irl_model = GAIL(env,
                         expert_trajs=experts,
                         discrim_arch_args=disc_net_kwargs,
                         name=method,
                         freeze=True,
                         vail=method == 'vail')
    else:
        raise NotImplementedError("Don't know how to handle method '%s'" % method)

    pol_hid_sizes = (hid_size, ) * hid_layers
    policy = GaussianMLPPolicy(name='policy', env_spec=env.spec,
                               hidden_sizes=pol_hid_sizes, init_std=init_pol_std)
    irltrpo_kwargs = dict(
        env=env,
        policy=policy,
        irl_model=irl_model,
        discount=0.99,
        store_paths=True,
        discrim_train_itrs=50,
        irl_model_wt=1.0,
        entropy_weight=ent_wt,  # should be 1.0 but 0.1 seems to work better
        step_size=trpo_step,
        zero_environment_reward=True,
        baseline=LinearFeatureBaseline(env_spec=env.spec),
        init_irl_params=prior_params,
        force_batch_sampler=True,
        entropy_anneal_init_weight=trpo_anneal_init_ent,
        entropy_anneal_steps=trpo_anneal_steps,
        retraining=True)
    irltrpo_kwargs.update(env_trpo_params)
    algo = IRLTRPO(**irltrpo_kwargs)

    folder_suffix = ''
    if switch_env is not None:
        # append lower-case environment name to the retrain folder path
        folder_suffix = '_%s' % switch_env.lower()
    with rllab_logdir(algo=algo,
                      dirname='%s/retrain%s' % (folder, folder_suffix)):
        with tf.Session():
            algo.train()
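# Hedged sketch of a command-line entry point matching the flags that the
# training script further above passes to env_retrain.py via subprocess.
# The original launcher is not shown here, so the argument defaults and the
# way exp_name is supplied are assumptions.
if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument('irl_pkl', help='path to the itr_N.pkl IRL snapshot')
    parser.add_argument('--rundir', default='data')
    parser.add_argument('--trpo-step', type=float, default=0.01)
    parser.add_argument('--trpo-ent', type=float, default=1.0)
    parser.add_argument('--hid-layers', type=int, default=None)
    parser.add_argument('--hid-size', type=int, default=None)
    parser.add_argument('--method', default=None)
    parser.add_argument('--exp-name', default='retrain')  # assumed; not in the subprocess call
    args = parser.parse_args()

    main(exp_name=args.exp_name,
         rundir=args.rundir,
         irl_pkl=args.irl_pkl,
         ent_wt=args.trpo_ent,
         trpo_step=args.trpo_step,
         method=args.method,
         hid_size=args.hid_size,
         hid_layers=args.hid_layers)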