Beispiel #1
0
def main():
    env = TfEnv(GymEnv('Pendulum-v0', record_video=False, record_log=False))

    experts = load_latest_experts('data/pendulum', n=5)

    irl_model = GAIL(env_spec=env.spec, expert_trajs=experts)
    policy = GaussianMLPPolicy(name='policy',
                               env_spec=env.spec,
                               hidden_sizes=(32, 32))
    algo = IRLTRPO(
        env=env,
        policy=policy,
        irl_model=irl_model,
        n_itr=200,
        batch_size=1000,
        max_path_length=100,
        discount=0.99,
        store_paths=True,
        discrim_train_itrs=50,
        irl_model_wt=1.0,
        entropy_weight=0.0,  # GAIL should not use entropy unless for exploration
        zero_environment_reward=True,
        baseline=LinearFeatureBaseline(env_spec=env.spec))

    with rllab_logdir(algo=algo, dirname='data/pendulum_gail'):
        with tf.Session():
            algo.train()
Beispiel #2
0
def main(num_examples=50, discount=0.99):
    env = TfEnv(GymEnv('Pendulum-v0', record_video=False, record_log=False))

    experts = load_latest_experts('data/pendulum', n=num_examples)

    irl_model = GCLDiscrimTrajectory(env_spec=env.spec, expert_trajs=experts)
    policy = GaussianMLPPolicy(name='policy',
                               env_spec=env.spec,
                               hidden_sizes=(32, 32))
    algo = IRLTRPO(
        env=env,
        policy=policy,
        irl_model=irl_model,
        n_itr=200,
        batch_size=2000,
        max_path_length=100,
        discount=discount,
        store_paths=True,
        discrim_train_itrs=50,
        irl_model_wt=1.0,
        entropy_weight=0.1,  # this should be 1.0 but 0.1 seems to work better
        zero_environment_reward=True,
        baseline=LinearFeatureBaseline(env_spec=env.spec))

    with rllab_logdir(algo=algo, dirname='data/pendulum_traj'):
        with tf.Session():
            algo.train()
Beispiel #3
0
def main():
    env = TfEnv(CustomGymEnv('PointMazeLeft-v0'))
    
    experts = load_latest_experts('data/point', n=50)

    irl_model = GCLDiscrimTrajectory(env_spec=env.spec, expert_trajs=experts)
    policy = GaussianMLPPolicy(name='policy', env_spec=env.spec, hidden_sizes=(32, 32))
    algo = IRLTRPO(
        env=env,
        policy=policy,
        irl_model=irl_model,
        n_itr=2000,
        batch_size=10000,
        max_path_length=100,
        discount=0.99,
        store_paths=True,
        discrim_train_itrs=50,
        irl_model_wt=1.0,
        entropy_weight=0.1, # this should be 1.0 but 0.1 seems to work better
        zero_environment_reward=True,
        baseline=LinearFeatureBaseline(env_spec=env.spec)
    )

    with rllab_logdir(algo=algo, dirname='data/point_traj'):
        with tf.Session():
            algo.train()
            test_pointmaze(sess.run(policy))
Beispiel #4
0
def main(eval_reward = False):
    env = TfEnv(GymEnv('Pendulum-v0', record_video=False, record_log=False))
    
    n_experts = 10
    experts = load_latest_experts('plotting/pendulum_final', n=n_experts)
    dirname='data/pendulum' # dir to save logs and images

    irl_model = AIRLStateAction(env_spec=env.spec, expert_trajs=experts)
    policy = GaussianMLPPolicy(name='policy', env_spec=env.spec, hidden_sizes=(32, 32))
    algo = IRLTRPO(
        env=env,
        policy=policy,
        irl_model=irl_model,
        n_itr=1000,
        batch_size=1000,
        max_path_length=100,
        discount=0.99,
        store_paths=True,
        discrim_train_itrs=50,
        irl_model_wt=1.0,
        entropy_weight=0.1, # this should be 1.0 but 0.1 seems to work better
        zero_environment_reward=True,
        baseline=LinearFeatureBaseline(env_spec=env.spec),
        eval_reward=True,
        fig_dir = dirname
    )

    # with rllab_logdir(algo=algo, dirname='data/pendulum_gcl{}'.format(n_experts)):
    with rllab_logdir(algo=algo, dirname=dirname):
        with tf.Session():
            algo.fig_dirname = dirname
            algo.train()
def main(exp_name=None, fusion=False, visible_gpus='0', discount=0.99):
    env = TfEnv(GymEnv('Swimmer-v3', record_video=False, record_log=False))

    gpu_options = tf.GPUOptions(allow_growth=True,visible_device_list=args.visible_gpus)
    tf_config = tf.ConfigProto(inter_op_parallelism_threads=1, intra_op_parallelism_threads=1, gpu_options=gpu_options)

    experts = load_latest_experts('data/swimmer', n=5, visible_gpus=visible_gpus)

    irl_model = AIRL(discount=discount, env=env, expert_trajs=experts, state_only=False, fusion=args.fusion, max_itrs=10)
    policy = GaussianMLPPolicy(name='policy', env_spec=env.spec, hidden_sizes=(32, 32))
    algo = IRLTRPO(
        env=env,
        policy=policy,
        irl_model=irl_model,
        n_itr=1000,
        batch_size=10000,
        max_path_length=1000,
        discount=discount,
        store_paths=True,
        discrim_train_itrs=50,
        irl_model_wt=1.0,
        entropy_weight=0.1, # this should be 1.0 but 0.1 seems to work better
        zero_environment_reward=True,
        baseline=LinearFeatureBaseline(env_spec=env.spec)
    )

    with rllab_logdir(algo=algo, dirname='data/swimmer_airl_state_action'):
        with tf.Session(config=tf_config) as sess:
            algo.train(sess)
def main():
    env = TfEnv(GymEnv('Pendulum-v0', record_video=False, record_log=False))
    
    experts = load_latest_experts('data/pendulum', n=5)

    irl_model = GAIL(env_spec=env.spec, expert_trajs=experts)
    policy = GaussianMLPPolicy(name='policy', env_spec=env.spec, hidden_sizes=(32, 32))
    algo = IRLTRPO(
        env=env,
        policy=policy,
        irl_model=irl_model,
        n_itr=200,
        batch_size=1000,
        max_path_length=100,
        discount=0.99,
        store_paths=True,
        discrim_train_itrs=50,
        irl_model_wt=1.0,
        entropy_weight=0.0, # GAIL should not use entropy unless for exploration
        zero_environment_reward=True,
        baseline=LinearFeatureBaseline(env_spec=env.spec)
    )

    with rllab_logdir(algo=algo, dirname='data/pendulum_gail'):
        with tf.Session():
            algo.train()
Beispiel #7
0
def main():
    env = TfEnv(
        GymEnv('HRI_AirSim_Landing-v0', record_video=False, record_log=False))

    experts = load_latest_experts('data/airsim', n=5)

    irl_model = AIRLStateAction(env_spec=env.spec, expert_trajs=experts)
    policy = GaussianMLPPolicy(name='policy',
                               env_spec=env.spec,
                               hidden_sizes=(32, 32))
    algo = IRLTRPO(
        env=env,
        policy=policy,
        irl_model=irl_model,
        n_itr=10,
        batch_size=100,
        max_path_length=100,
        discount=0.99,
        store_paths=True,
        discrim_train_itrs=50,
        irl_model_wt=1.0,
        entropy_weight=0.1,  # this should be 1.0 but 0.1 seems to work better
        zero_environment_reward=True,
        baseline=LinearFeatureBaseline(env_spec=env.spec))

    with rllab_logdir(algo=algo, dirname='data/airsim_gcl'):
        with tf.Session():
            algo.train()
Beispiel #8
0
def main():
    env = TfEnv(GymEnv('Ant-v1', record_video=False, record_log=False))
    
    experts = load_latest_experts('data/ant', n=50)

    irl_model = GCLDiscrim(
        env_spec=env.spec,
        expert_trajs=experts,
        discrim_arch=disentangled_net)
    policy = GaussianMLPPolicy(name='policy', env_spec=env.spec, hidden_sizes=(32, 32))
    algo = IRLTRPO(
        env=env,
        policy=policy,
        irl_model=irl_model,
        n_itr=2000,
        batch_size=10000,
        max_path_length=1000,
        discount=0.995,
        store_paths=True,
        discrim_train_itrs=50,
        irl_model_wt=1.0,
        entropy_weight=0.1,
        zero_environment_reward=True,
        baseline=LinearFeatureBaseline(env_spec=env.spec)
    )

    with rllab_logdir(algo=algo, dirname='data/ant_airl'):
        with tf.Session():
            algo.train()
def main(exp_name=None, fusion=False, visible_gpus='0', discount=0.99, debug=False, n_val=1, n_rew=1, \
         max_nstep=1, exp_folder=None, state_only=False, score_discrim=True, score_method=None):
    env = TfEnv(GymEnv('HalfCheetah-v3', record_video=False, record_log=False))

    gpu_options = tf.GPUOptions(allow_growth=True,
                                visible_device_list=visible_gpus)
    tf_config = tf.ConfigProto(inter_op_parallelism_threads=1,
                               intra_op_parallelism_threads=1,
                               gpu_options=gpu_options)

    # load ~5 iterations worth of data from each forward RL experiment as demos
    experts = load_latest_experts('data/half_cheetah',
                                  n=5,
                                  visible_gpus=visible_gpus)

    sess = tf.Session(config=tf_config)

    # sess = tf_debug.LocalCLIDebugWrapperSession(sess)

    max_path_length = 500
    irl_model = AIRL_Bootstrap(discount=discount, env=env, expert_trajs=experts, \
                               state_only=state_only, fusion=fusion, max_itrs=10, \
                               score_discrim=score_discrim, debug = debug, \
                               max_nstep = max_nstep, n_value_funct = n_val, \
                               n_rew_funct = n_rew, score_method=score_method)

    policy = GaussianMLPPolicy(name='policy',
                               env_spec=env.spec,
                               hidden_sizes=(32, 32))
    algo = IRLTRPO(env=env,
                   policy=policy,
                   irl_model=irl_model,
                   n_itr=1000,
                   batch_size=10000,
                   max_path_length=max_path_length,
                   discount=discount,
                   store_paths=True,
                   irl_model_wt=1.0,
                   entropy_weight=0.1,
                   zero_environment_reward=True,
                   baseline=LinearFeatureBaseline(env_spec=env.spec),
                   discrim_train_itrs=50)

    # temp_folder = '/media/data/temp_exp_nstep/maze_right_state_bootstrap_%d_irl/%s'
    dirname = 'data/half_cheetah_bootstrap_%d_irl/%s/%s' % (
        max_nstep, exp_folder, exp_name
    ) if exp_folder is not None else 'data/half_cheetah_bootstrap_%d_irl/%s' % (
        max_nstep, exp_name)
    with rllab_logdir(algo=algo, dirname=dirname):
        sess.__enter__()
        algo.train(sess)
        sess.close()