import tensorflow as tf

from inverse_rl.algos.irl_trpo import IRLTRPO
from inverse_rl.models.airl_state import AIRL
from inverse_rl.models.imitation_learning import GAIL, AIRLStateAction
from inverse_rl.utils.log_utils import rllab_logdir, load_latest_experts
from rllab.baselines.linear_feature_baseline import LinearFeatureBaseline
from rllab.envs.gym_env import GymEnv
from sandbox.rocky.tf.envs.base import TfEnv
from sandbox.rocky.tf.policies.gaussian_mlp_policy import GaussianMLPPolicy
# Shared imports for the example scripts below. The remaining names used later
# (GCLDiscrim, GCLDiscrimTrajectory, AIRL_Bootstrap, CustomGymEnv,
# disentangled_net, test_pointmaze) come from fork-specific modules whose
# import paths are not shown here.


# GAIL on Pendulum-v0.
def main():
    env = TfEnv(GymEnv('Pendulum-v0', record_video=False, record_log=False))
    experts = load_latest_experts('data/pendulum', n=5)

    irl_model = GAIL(env_spec=env.spec, expert_trajs=experts)
    policy = GaussianMLPPolicy(name='policy', env_spec=env.spec, hidden_sizes=(32, 32))
    algo = IRLTRPO(
        env=env,
        policy=policy,
        irl_model=irl_model,
        n_itr=200,
        batch_size=1000,
        max_path_length=100,
        discount=0.99,
        store_paths=True,
        discrim_train_itrs=50,
        irl_model_wt=1.0,
        entropy_weight=0.0,  # GAIL should not use entropy unless for exploration
        zero_environment_reward=True,
        baseline=LinearFeatureBaseline(env_spec=env.spec)
    )

    with rllab_logdir(algo=algo, dirname='data/pendulum_gail'):
        with tf.Session():
            algo.train()
# Trajectory-based GCL discriminator on Pendulum-v0.
def main(num_examples=50, discount=0.99):
    env = TfEnv(GymEnv('Pendulum-v0', record_video=False, record_log=False))
    experts = load_latest_experts('data/pendulum', n=num_examples)

    irl_model = GCLDiscrimTrajectory(env_spec=env.spec, expert_trajs=experts)
    policy = GaussianMLPPolicy(name='policy', env_spec=env.spec, hidden_sizes=(32, 32))
    algo = IRLTRPO(
        env=env,
        policy=policy,
        irl_model=irl_model,
        n_itr=200,
        batch_size=2000,
        max_path_length=100,
        discount=discount,
        store_paths=True,
        discrim_train_itrs=50,
        irl_model_wt=1.0,
        entropy_weight=0.1,  # this should be 1.0 but 0.1 seems to work better
        zero_environment_reward=True,
        baseline=LinearFeatureBaseline(env_spec=env.spec)
    )

    with rllab_logdir(algo=algo, dirname='data/pendulum_traj'):
        with tf.Session():
            algo.train()
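# Hypothetical entry point (not part of the original script): the argument
# values below are placeholders showing how the demo count and discount can
# be overridden when the script is run directly.
if __name__ == '__main__':
    main(num_examples=20, discount=0.95)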
# Trajectory-based GCL discriminator on the PointMazeLeft task.
def main():
    env = TfEnv(CustomGymEnv('PointMazeLeft-v0'))
    experts = load_latest_experts('data/point', n=50)

    irl_model = GCLDiscrimTrajectory(env_spec=env.spec, expert_trajs=experts)
    policy = GaussianMLPPolicy(name='policy', env_spec=env.spec, hidden_sizes=(32, 32))
    algo = IRLTRPO(
        env=env,
        policy=policy,
        irl_model=irl_model,
        n_itr=2000,
        batch_size=10000,
        max_path_length=100,
        discount=0.99,
        store_paths=True,
        discrim_train_itrs=50,
        irl_model_wt=1.0,
        entropy_weight=0.1,  # this should be 1.0 but 0.1 seems to work better
        zero_environment_reward=True,
        baseline=LinearFeatureBaseline(env_spec=env.spec)
    )

    with rllab_logdir(algo=algo, dirname='data/point_traj'):
        with tf.Session():  # evaluation must run while the session is still open
            algo.train()
            # was test_pointmaze(sess.run(policy)): sess was unbound, and a
            # policy object is not a fetchable tensor; pass the policy itself
            test_pointmaze(policy)
# AIRL (state-action discriminator) on Pendulum-v0, with reward evaluation plots.
def main(eval_reward=False):
    env = TfEnv(GymEnv('Pendulum-v0', record_video=False, record_log=False))
    n_experts = 10
    experts = load_latest_experts('plotting/pendulum_final', n=n_experts)
    dirname = 'data/pendulum'  # directory for logs and figures

    irl_model = AIRLStateAction(env_spec=env.spec, expert_trajs=experts)
    policy = GaussianMLPPolicy(name='policy', env_spec=env.spec, hidden_sizes=(32, 32))
    algo = IRLTRPO(
        env=env,
        policy=policy,
        irl_model=irl_model,
        n_itr=1000,
        batch_size=1000,
        max_path_length=100,
        discount=0.99,
        store_paths=True,
        discrim_train_itrs=50,
        irl_model_wt=1.0,
        entropy_weight=0.1,  # this should be 1.0 but 0.1 seems to work better
        zero_environment_reward=True,
        baseline=LinearFeatureBaseline(env_spec=env.spec),
        eval_reward=eval_reward,  # was hardcoded to True, shadowing the argument
        fig_dir=dirname
    )

    with rllab_logdir(algo=algo, dirname=dirname):
        with tf.Session():
            algo.fig_dirname = dirname
            algo.train()
# AIRL on Swimmer-v3.
def main(exp_name=None, fusion=False, visible_gpus='0', discount=0.99):
    env = TfEnv(GymEnv('Swimmer-v3', record_video=False, record_log=False))

    gpu_options = tf.GPUOptions(allow_growth=True, visible_device_list=visible_gpus)
    tf_config = tf.ConfigProto(inter_op_parallelism_threads=1,
                               intra_op_parallelism_threads=1,
                               gpu_options=gpu_options)

    experts = load_latest_experts('data/swimmer', n=5, visible_gpus=visible_gpus)

    # was args.fusion / args.visible_gpus: args is never defined in this scope,
    # so use the function's keyword arguments directly
    irl_model = AIRL(discount=discount, env=env, expert_trajs=experts,
                     state_only=False, fusion=fusion, max_itrs=10)
    policy = GaussianMLPPolicy(name='policy', env_spec=env.spec, hidden_sizes=(32, 32))
    algo = IRLTRPO(
        env=env,
        policy=policy,
        irl_model=irl_model,
        n_itr=1000,
        batch_size=10000,
        max_path_length=1000,
        discount=discount,
        store_paths=True,
        discrim_train_itrs=50,
        irl_model_wt=1.0,
        entropy_weight=0.1,  # this should be 1.0 but 0.1 seems to work better
        zero_environment_reward=True,
        baseline=LinearFeatureBaseline(env_spec=env.spec)
    )

    with rllab_logdir(algo=algo, dirname='data/swimmer_airl_state_action'):
        with tf.Session(config=tf_config) as sess:
            algo.train(sess)
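# Hypothetical CLI entry point (an assumption, not part of the original
# script): expose main()'s keyword arguments as flags so runs can be
# configured from the shell, e.g.
#   python swimmer_airl.py --fusion --visible_gpus 1 --discount 0.995
if __name__ == '__main__':
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('--exp_name', type=str, default=None)
    parser.add_argument('--fusion', action='store_true')
    parser.add_argument('--visible_gpus', type=str, default='0')
    parser.add_argument('--discount', type=float, default=0.99)
    main(**vars(parser.parse_args()))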
# AIRL (state-action discriminator) on the HRI_AirSim_Landing task.
def main():
    env = TfEnv(GymEnv('HRI_AirSim_Landing-v0', record_video=False, record_log=False))
    experts = load_latest_experts('data/airsim', n=5)

    irl_model = AIRLStateAction(env_spec=env.spec, expert_trajs=experts)
    policy = GaussianMLPPolicy(name='policy', env_spec=env.spec, hidden_sizes=(32, 32))
    algo = IRLTRPO(
        env=env,
        policy=policy,
        irl_model=irl_model,
        n_itr=10,
        batch_size=100,
        max_path_length=100,
        discount=0.99,
        store_paths=True,
        discrim_train_itrs=50,
        irl_model_wt=1.0,
        entropy_weight=0.1,  # this should be 1.0 but 0.1 seems to work better
        zero_environment_reward=True,
        baseline=LinearFeatureBaseline(env_spec=env.spec)
    )

    with rllab_logdir(algo=algo, dirname='data/airsim_gcl'):
        with tf.Session():
            algo.train()
# GCL discriminator with a disentangled architecture on Ant-v1.
def main():
    env = TfEnv(GymEnv('Ant-v1', record_video=False, record_log=False))
    experts = load_latest_experts('data/ant', n=50)

    irl_model = GCLDiscrim(
        env_spec=env.spec,
        expert_trajs=experts,
        discrim_arch=disentangled_net)
    policy = GaussianMLPPolicy(name='policy', env_spec=env.spec, hidden_sizes=(32, 32))
    algo = IRLTRPO(
        env=env,
        policy=policy,
        irl_model=irl_model,
        n_itr=2000,
        batch_size=10000,
        max_path_length=1000,
        discount=0.995,
        store_paths=True,
        discrim_train_itrs=50,
        irl_model_wt=1.0,
        entropy_weight=0.1,
        zero_environment_reward=True,
        baseline=LinearFeatureBaseline(env_spec=env.spec)
    )

    with rllab_logdir(algo=algo, dirname='data/ant_airl'):
        with tf.Session():
            algo.train()
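# A minimal sketch of the kind of callable GCLDiscrim's discrim_arch expects,
# assuming the inverse_rl convention of architecture functions that map an
# input tensor to unnormalized scores. The real disentangled_net is defined
# elsewhere; the function name and layer sizes here are placeholders.
def mlp_discrim_arch(x, d_hidden=32, dout=1):
    out = tf.layers.dense(x, d_hidden, activation=tf.nn.relu, name='h1')
    out = tf.layers.dense(out, d_hidden, activation=tf.nn.relu, name='h2')
    return tf.layers.dense(out, dout, name='out')  # unnormalized logits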
# Bootstrapped AIRL on HalfCheetah-v3.
def main(exp_name=None, fusion=False, visible_gpus='0', discount=0.99, debug=False,
         n_val=1, n_rew=1, max_nstep=1, exp_folder=None, state_only=False,
         score_discrim=True, score_method=None):
    env = TfEnv(GymEnv('HalfCheetah-v3', record_video=False, record_log=False))

    gpu_options = tf.GPUOptions(allow_growth=True, visible_device_list=visible_gpus)
    tf_config = tf.ConfigProto(inter_op_parallelism_threads=1,
                               intra_op_parallelism_threads=1,
                               gpu_options=gpu_options)

    # load ~5 iterations worth of data from each forward RL experiment as demos
    experts = load_latest_experts('data/half_cheetah', n=5, visible_gpus=visible_gpus)

    sess = tf.Session(config=tf_config)
    # sess = tf_debug.LocalCLIDebugWrapperSession(sess)  # uncomment to step through with tfdbg

    max_path_length = 500
    irl_model = AIRL_Bootstrap(discount=discount, env=env, expert_trajs=experts,
                               state_only=state_only, fusion=fusion, max_itrs=10,
                               score_discrim=score_discrim, debug=debug,
                               max_nstep=max_nstep, n_value_funct=n_val,
                               n_rew_funct=n_rew, score_method=score_method)
    policy = GaussianMLPPolicy(name='policy', env_spec=env.spec, hidden_sizes=(32, 32))
    algo = IRLTRPO(
        env=env,
        policy=policy,
        irl_model=irl_model,
        n_itr=1000,
        batch_size=10000,
        max_path_length=max_path_length,
        discount=discount,
        store_paths=True,
        discrim_train_itrs=50,
        irl_model_wt=1.0,
        entropy_weight=0.1,
        zero_environment_reward=True,
        baseline=LinearFeatureBaseline(env_spec=env.spec)
    )

    if exp_folder is not None:
        dirname = 'data/half_cheetah_bootstrap_%d_irl/%s/%s' % (max_nstep, exp_folder, exp_name)
    else:
        dirname = 'data/half_cheetah_bootstrap_%d_irl/%s' % (max_nstep, exp_name)

    with rllab_logdir(algo=algo, dirname=dirname):
        # was sess.__enter__() ... sess.close(): the with-block makes sess the
        # default session and guarantees it is closed even on an exception
        with sess:
            algo.train(sess)
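# Hypothetical invocation (an assumption, not part of the original script);
# the parameter meanings below are inferred from the names passed through to
# AIRL_Bootstrap.
if __name__ == '__main__':
    main(exp_name='cheetah_run0',
         max_nstep=3,      # n-step horizon used by the bootstrapped discriminator
         n_val=5,          # number of value functions in the ensemble
         n_rew=5,          # number of reward functions in the ensemble
         state_only=True)  # reward depends on state only, as in state-only AIRL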