Example #1
def main():
    env = TfEnv(CustomGymEnv('PointMazeLeft-v0'))
    
    experts = load_latest_experts('data/point', n=50)

    irl_model = GCLDiscrimTrajectory(env_spec=env.spec, expert_trajs=experts)
    policy = GaussianMLPPolicy(name='policy', env_spec=env.spec, hidden_sizes=(32, 32))
    algo = IRLTRPO(
        env=env,
        policy=policy,
        irl_model=irl_model,
        n_itr=2000,
        batch_size=10000,
        max_path_length=100,
        discount=0.99,
        store_paths=True,
        discrim_train_itrs=50,
        irl_model_wt=1.0,
        entropy_weight=0.1, # this should be 1.0 but 0.1 seems to work better
        zero_environment_reward=True,
        baseline=LinearFeatureBaseline(env_spec=env.spec)
    )

    with rllab_logdir(algo=algo, dirname='data/point_traj'):
        with tf.Session() as sess:
            algo.train()
            test_pointmaze(sess.run(policy))
Example #2
def main():
    env = TfEnv(GymEnv('Ant-v1', record_video=False, record_log=False))
    
    experts = load_latest_experts('data/ant', n=50)

    irl_model = GCLDiscrim(
        env_spec=env.spec,
        expert_trajs=experts,
        discrim_arch=disentangled_net)
    policy = GaussianMLPPolicy(name='policy', env_spec=env.spec, hidden_sizes=(32, 32))
    algo = IRLTRPO(
        env=env,
        policy=policy,
        irl_model=irl_model,
        n_itr=2000,
        batch_size=10000,
        max_path_length=1000,
        discount=0.995,
        store_paths=True,
        discrim_train_itrs=50,
        irl_model_wt=1.0,
        entropy_weight=0.1,
        zero_environment_reward=True,
        baseline=LinearFeatureBaseline(env_spec=env.spec)
    )

    with rllab_logdir(algo=algo, dirname='data/ant_airl'):
        with tf.Session():
            algo.train()
Example #3
def main(num_examples=50, discount=0.99):
    env = TfEnv(GymEnv('Pendulum-v0', record_video=False, record_log=False))

    experts = load_latest_experts('data/pendulum', n=num_examples)

    irl_model = GCLDiscrimTrajectory(env_spec=env.spec, expert_trajs=experts)
    policy = GaussianMLPPolicy(name='policy',
                               env_spec=env.spec,
                               hidden_sizes=(32, 32))
    algo = IRLTRPO(
        env=env,
        policy=policy,
        irl_model=irl_model,
        n_itr=200,
        batch_size=2000,
        max_path_length=100,
        discount=discount,
        store_paths=True,
        discrim_train_itrs=50,
        irl_model_wt=1.0,
        entropy_weight=0.1,  # this should be 1.0 but 0.1 seems to work better
        zero_environment_reward=True,
        baseline=LinearFeatureBaseline(env_spec=env.spec))

    with rllab_logdir(algo=algo, dirname='data/pendulum_traj'):
        with tf.Session():
            algo.train()
Example #4
def main(exp_name=None, fusion=False, visible_gpus='0', discount=0.99):
    env = TfEnv(CustomGymEnv('CustomAnt-v0', record_video=False, record_log=False))

    gpu_options = tf.GPUOptions(allow_growth=True, visible_device_list=visible_gpus)
    tf_config = tf.ConfigProto(inter_op_parallelism_threads=1, intra_op_parallelism_threads=1, gpu_options=gpu_options)

    # load ~2 iterations worth of data from each forward RL experiment as demos
    experts = load_latest_experts_multiple_runs('data/ant_data_collect', n=2, visible_gpus=visible_gpus)

    irl_model = AIRL(discount=discount, env=env, expert_trajs=experts, state_only=True, fusion=fusion, max_itrs=10)

    policy = GaussianMLPPolicy(name='policy', env_spec=env.spec, hidden_sizes=(32, 32))
    algo = IRLTRPO(
        env=env,
        policy=policy,
        irl_model=irl_model,
        n_itr=1000,
        batch_size=10000,
        max_path_length=500,
        discount=discount,
        store_paths=True,
        irl_model_wt=1.0,
        entropy_weight=0.1,
        zero_environment_reward=True,
        baseline=LinearFeatureBaseline(env_spec=env.spec),
    )
    with rllab_logdir(algo=algo, dirname='data/ant_state_irl/%s' % exp_name):
        with tf.Session(config=tf_config) as sess:
            algo.train(sess)
Example #5
def main():
    env = TfEnv(
        GymEnv('HRI_AirSim_Landing-v0', record_video=False, record_log=False))

    experts = load_latest_experts('data/airsim', n=5)

    irl_model = AIRLStateAction(env_spec=env.spec, expert_trajs=experts)
    policy = GaussianMLPPolicy(name='policy',
                               env_spec=env.spec,
                               hidden_sizes=(32, 32))
    algo = IRLTRPO(
        env=env,
        policy=policy,
        irl_model=irl_model,
        n_itr=10,
        batch_size=100,
        max_path_length=100,
        discount=0.99,
        store_paths=True,
        discrim_train_itrs=50,
        irl_model_wt=1.0,
        entropy_weight=0.1,  # this should be 1.0 but 0.1 seems to work better
        zero_environment_reward=True,
        baseline=LinearFeatureBaseline(env_spec=env.spec))

    with rllab_logdir(algo=algo, dirname='data/airsim_gcl'):
        with tf.Session():
            algo.train()
Example #6
def main(exp_name=None, fusion=False, visible_gpus='0', discount=0.99):
    env = TfEnv(GymEnv('Swimmer-v3', record_video=False, record_log=False))

    gpu_options = tf.GPUOptions(allow_growth=True, visible_device_list=visible_gpus)
    tf_config = tf.ConfigProto(inter_op_parallelism_threads=1, intra_op_parallelism_threads=1, gpu_options=gpu_options)

    experts = load_latest_experts('data/swimmer', n=5, visible_gpus=visible_gpus)

    irl_model = AIRL(discount=discount, env=env, expert_trajs=experts, state_only=False, fusion=fusion, max_itrs=10)
    policy = GaussianMLPPolicy(name='policy', env_spec=env.spec, hidden_sizes=(32, 32))
    algo = IRLTRPO(
        env=env,
        policy=policy,
        irl_model=irl_model,
        n_itr=1000,
        batch_size=10000,
        max_path_length=1000,
        discount=discount,
        store_paths=True,
        discrim_train_itrs=50,
        irl_model_wt=1.0,
        entropy_weight=0.1, # this should be 1.0 but 0.1 seems to work better
        zero_environment_reward=True,
        baseline=LinearFeatureBaseline(env_spec=env.spec)
    )

    with rllab_logdir(algo=algo, dirname='data/swimmer_airl_state_action'):
        with tf.Session(config=tf_config) as sess:
            algo.train(sess)
Example #7
def main(exp_name, params_folder=None):
    env = TfEnv(CustomGymEnv('DisabledAnt-v0', record_video=False, record_log=False))

    irl_itr = 100  # earlier IRL iterations overfit less; 100 seems to work well.
    params_file = os.path.join(DATA_DIR, '%s/itr_%d.pkl' % (params_folder, irl_itr))
    prior_params = load_prior_params(params_file)

    irl_model = AIRL(env=env, expert_trajs=None, state_only=True)
    policy = GaussianMLPPolicy(name='policy', env_spec=env.spec, hidden_sizes=(32, 32))
    algo = IRLTRPO(
        init_irl_params=prior_params,
        env=env,
        policy=policy,
        irl_model=irl_model,
        n_itr=1000,
        batch_size=10000,
        max_path_length=500,
        discount=0.99,
        store_paths=False,
        train_irl=False,
        irl_model_wt=1.0,
        entropy_weight=0.1,
        zero_environment_reward=True,
        baseline=LinearFeatureBaseline(env_spec=env.spec),
        log_params_folder=params_folder,
        log_experiment_name=exp_name,
    )
    with rllab_logdir(algo=algo, dirname='data/ant_transfer/%s'%exp_name):
        with tf.Session():
            algo.train()
Example #8
def main():
    env = TfEnv(
        GymEnv('HRI_AirSim_Landing-v0', record_video=False, record_log=False))

    ### VGG 11/29/18: Added support to CSV files
    ## this method loads expert data saved as pickle file
    # experts = load_latest_experts('data/airsim_final', n=1)
    # this one uses csv:
    experts = load_experts('data/airsim_human_data/log.csv',
                           pickle_format=False)

    irl_model = GAIL(env_spec=env.spec, expert_trajs=experts)
    policy = GaussianMLPPolicy(name='policy',
                               env_spec=env.spec,
                               hidden_sizes=(32, 32))
    algo = IRLTRPO(
        env=env,
        policy=policy,
        irl_model=irl_model,
        n_itr=5000,
        batch_size=60,
        max_path_length=60,
        discount=0.99,
        store_paths=True,
        discrim_train_itrs=100,
        irl_model_wt=1.0,
        entropy_weight=0.0,  # GAIL should not use entropy unless for exploration
        zero_environment_reward=True,
        baseline=LinearFeatureBaseline(env_spec=env.spec),
        n_parallel=0)

    with rllab_logdir(algo=algo, dirname='data/airsim_gail'):
        with tf.Session():
            algo.train()
Example #9
def main():
    env = TfEnv(GymEnv('Pendulum-v0', record_video=False, record_log=False))
    
    experts = load_latest_experts('data/pendulum', n=5)

    irl_model = GAIL(env_spec=env.spec, expert_trajs=experts)
    policy = GaussianMLPPolicy(name='policy', env_spec=env.spec, hidden_sizes=(32, 32))
    algo = IRLTRPO(
        env=env,
        policy=policy,
        irl_model=irl_model,
        n_itr=200,
        batch_size=1000,
        max_path_length=100,
        discount=0.99,
        store_paths=True,
        discrim_train_itrs=50,
        irl_model_wt=1.0,
        entropy_weight=0.0, # GAIL should not use entropy unless for exploration
        zero_environment_reward=True,
        baseline=LinearFeatureBaseline(env_spec=env.spec)
    )

    with rllab_logdir(algo=algo, dirname='data/pendulum_gail'):
        with tf.Session():
            algo.train()
Example #10
def main(exp_name=None, fusion=False):
    env = TfEnv(
        CustomGymEnv('CustomAnt-v0', record_video=False, record_log=False))

    # load ~2 iterations worth of data from each forward RL experiment as demos
    experts = load_latest_experts_multiple_runs('data/ant_data_collect', n=2)

    irl_model = AIRL(env=env,
                     expert_trajs=experts,
                     state_only=True,
                     fusion=fusion,
                     max_itrs=10)

    policy = GaussianMLPPolicy(name='policy',
                               env_spec=env.spec,
                               hidden_sizes=(32, 32))
    algo = IRLTRPO(
        env=env,
        policy=policy,
        irl_model=irl_model,
        n_itr=1000,
        batch_size=10000,
        max_path_length=500,
        discount=0.99,
        store_paths=True,
        irl_model_wt=1.0,
        entropy_weight=0.1,
        zero_environment_reward=True,
        baseline=LinearFeatureBaseline(env_spec=env.spec),
    )
    with rllab_logdir(algo=algo, dirname='data/ant_state_irl/%s' % exp_name):
        with tf.Session():
            algo.train()
Example #11
def main():
    env = TfEnv(GymEnv('Pendulum-v0', record_video=False, record_log=False))

    experts = load_latest_experts('data/pendulum', n=5)

    irl_model = GAIL(env_spec=env.spec, expert_trajs=experts)
    policy = GaussianMLPPolicy(name='policy',
                               env_spec=env.spec,
                               hidden_sizes=(32, 32))
    algo = IRLTRPO(
        env=env,
        policy=policy,
        irl_model=irl_model,
        n_itr=200,
        batch_size=1000,
        max_path_length=100,
        discount=0.99,
        store_paths=True,
        discrim_train_itrs=50,
        irl_model_wt=1.0,
        entropy_weight=0.0,  # GAIL should not use entropy unless for exploration
        zero_environment_reward=True,
        baseline=LinearFeatureBaseline(env_spec=env.spec))

    with rllab_logdir(algo=algo, dirname='data/pendulum_gail'):
        with tf.Session():
            algo.train()
Example #12
def main(eval_reward=False):
    env = TfEnv(GymEnv('Pendulum-v0', record_video=False, record_log=False))
    
    n_experts = 10
    experts = load_latest_experts('plotting/pendulum_final', n=n_experts)
    dirname='data/pendulum' # dir to save logs and images

    irl_model = AIRLStateAction(env_spec=env.spec, expert_trajs=experts)
    policy = GaussianMLPPolicy(name='policy', env_spec=env.spec, hidden_sizes=(32, 32))
    algo = IRLTRPO(
        env=env,
        policy=policy,
        irl_model=irl_model,
        n_itr=1000,
        batch_size=1000,
        max_path_length=100,
        discount=0.99,
        store_paths=True,
        discrim_train_itrs=50,
        irl_model_wt=1.0,
        entropy_weight=0.1, # this should be 1.0 but 0.1 seems to work better
        zero_environment_reward=True,
        baseline=LinearFeatureBaseline(env_spec=env.spec),
        eval_reward=eval_reward,
        fig_dir=dirname
    )

    # with rllab_logdir(algo=algo, dirname='data/pendulum_gcl{}'.format(n_experts)):
    with rllab_logdir(algo=algo, dirname=dirname):
        with tf.Session():
            algo.fig_dirname = dirname
            algo.train()
Example #13
def main(exp_name=None, fusion=True):
    # env = TfEnv(CustomGymEnv('CustomAnt-v0', record_video=False, record_log=False))
    env = TfEnv(
        CustomGymEnv('CustomAnt-v0', record_video=False, record_log=True))
    # load ~2 iterations worth of data from each forward RL experiment as demos
    experts = load_latest_experts_multiple_runs('data/ant_data_collect', n=2)
    #experts = load_latest_experts('data/ant_data_collect', n=5)

    #qvar: inverse model q(a|s,s')
    qvar = GaussianMLPInversePolicy(name='qvar_model',
                                    env_spec=env.spec,
                                    hidden_sizes=(32, 32))
    qvar_model = Qvar(env=env,
                      qvar=qvar,
                      expert_trajs=experts,
                      fusion=True,
                      max_itrs=10)
    #Empowerment-based Adversarial Inverse Reinforcement Learning, set score_discrim=True
    irl_model = EAIRL(env=env,
                      expert_trajs=experts,
                      state_only=False,
                      fusion=fusion,
                      max_itrs=10,
                      score_discrim=True)

    # Empowerment-based potential function: gamma * Phi(s') - Phi(s)
    empw_model = Empowerment(env=env, fusion=True, max_itrs=4)
    t_empw_model = Empowerment(env=env,
                               scope='t_efn',
                               fusion=True,
                               max_itrs=2,
                               name='empowerment2')

    policy = GaussianMLPPolicy(name='policy',
                               env_spec=env.spec,
                               hidden_sizes=(32, 32))
    algo = IRLTRPO(
        env=env,
        policy=policy,
        empw=empw_model,
        tempw=t_empw_model,
        qvar_model=qvar_model,
        irl_model=irl_model,
        n_itr=3000,  #130,
        batch_size=20000,
        max_path_length=500,
        discount=0.99,
        store_paths=True,
        target_empw_update=5,
        irl_model_wt=1.0,
        entropy_weight=0.1,
        lambda_i=1.0,
        zero_environment_reward=True,
        baseline=LinearFeatureBaseline(env_spec=env.spec),
        plot=False)
    with rllab_logdir(algo=algo, dirname='data/ant_state_irl'):
        #with rllab_logdir(algo=algo, dirname='data/ant_state_irl/%s' % exp_name): # if you use multiple runs, use this line instead of above
        with tf.Session():
            algo.train()
Example #14
def main(exp_name=None, fusion=False, visible_gpus='0', discount=0.99, debug=False, n_val=1, n_rew=1, \
         max_nstep=1, exp_folder=None, state_only=False, score_discrim=True, score_method=None):
    env = TfEnv(
        CustomGymEnv('PointMazeRight-v0', record_video=False,
                     record_log=False))

    gpu_options = tf.GPUOptions(allow_growth=True,
                                visible_device_list=visible_gpus)
    tf_config = tf.ConfigProto(inter_op_parallelism_threads=1,
                               intra_op_parallelism_threads=1,
                               gpu_options=gpu_options)

    # load ~2 iterations worth of data from each forward RL experiment as demos
    experts = load_latest_experts_multiple_runs('data/maze_right_data_collect',
                                                n=2,
                                                visible_gpus=visible_gpus)

    sess = tf.Session(config=tf_config)

    # sess = tf_debug.LocalCLIDebugWrapperSession(sess)

    max_path_length = 500
    irl_model = AIRL_Bootstrap(discount=discount, env=env, expert_trajs=experts,
                               state_only=state_only, fusion=fusion, max_itrs=10,
                               score_discrim=score_discrim, debug=debug,
                               max_nstep=max_nstep, n_value_funct=n_val,
                               n_rew_funct=n_rew, score_method=score_method)

    policy = GaussianMLPPolicy(name='policy',
                               env_spec=env.spec,
                               hidden_sizes=(32, 32))
    algo = IRLTRPO(
        env=env,
        policy=policy,
        irl_model=irl_model,
        n_itr=1000,
        batch_size=10000,
        max_path_length=max_path_length,
        discount=discount,
        store_paths=True,
        irl_model_wt=1.0,
        entropy_weight=0.1,
        zero_environment_reward=True,
        baseline=LinearFeatureBaseline(env_spec=env.spec),
    )

    # temp_folder = '/media/data/temp_exp_nstep/maze_right_state_bootstrap_%d_irl/%s'
    dirname = 'data/maze_right_state_bootstrap_%d_irl/%s/%s' % (
        max_nstep, exp_folder, exp_name
    ) if exp_folder is not None else 'data/maze_right_state_bootstrap_%d_irl/%s' % (
        max_nstep, exp_name)
    with rllab_logdir(algo=algo, dirname=dirname):
        with sess:
            algo.train(sess)
Example #15
def main(exp_name, params_folder=None, visible_gpus='0', discount=0.99):
    env = TfEnv(
        CustomGymEnv('PointMazeLeft-v0', record_video=False, record_log=False))

    gpu_options = tf.GPUOptions(allow_growth=True,
                                visible_device_list=visible_gpus)
    tf_config = tf.ConfigProto(inter_op_parallelism_threads=1,
                               intra_op_parallelism_threads=1,
                               gpu_options=gpu_options)

    irl_itr = 100  # earlier IRL iterations overfit less; 100 seems to work well.
    params_file = os.path.join(DATA_DIR,
                               '%s/itr_%d.pkl' % (params_folder, irl_itr))
    prior_params = load_prior_params(params_file, tf_config)

    irl_model = AIRL(discount=discount,
                     env=env,
                     expert_trajs=None,
                     state_only=True)
    policy = GaussianMLPPolicy(name='policy',
                               env_spec=env.spec,
                               hidden_sizes=(32, 32))
    algo = IRLTRPO(
        init_irl_params=prior_params,
        env=env,
        policy=policy,
        irl_model=irl_model,
        n_itr=1000,
        batch_size=10000,
        max_path_length=500,
        discount=discount,
        store_paths=False,
        train_irl=False,
        irl_model_wt=1.0,
        entropy_weight=0.1,
        zero_environment_reward=True,
        baseline=LinearFeatureBaseline(env_spec=env.spec),
        log_params_folder=params_folder,
        log_experiment_name=exp_name,
    )
    with rllab_logdir(algo=algo,
                      dirname='data/maze_left_transfer/%s' % exp_name):
        with tf.Session(config=tf_config) as sess:
            algo.train(sess)
Example #16
def main(exp_name=None, params_folder='data/ant_state_irl'):
    # env = TfEnv(CustomGymEnv('PointMazeLeft-v0', record_video=True, record_log=True,force_reset=True))
    env = TfEnv(
        CustomGymEnv('DisabledAnt-v0',
                     record_video=False,
                     record_log=False,
                     force_reset=False))

    irl_itr = 90  # earlier IRL iterations overfit less; 80 or 90 seems to work well, but I usually search over 60, 65, 70, 75, ... up to 100
    #params_file = os.path.join(DATA_DIR, '%s/itr_%d.pkl' % (params_folder, irl_itr))
    params_file = os.path.join(DATA_DIR, 'itr_%d.pkl' % (irl_itr))
    prior_params = load_prior_params(params_file)
    '''q_itr = 400  # earlier IRL iterations overfit less; 100 seems to work well.
    #params_file = os.path.join(DATA_DIR, '%s/itr_%d.pkl' % (params_folder, irl_itr))
    params_file = os.path.join(DATA_DIR, 'itr_%d.pkl' % (q_itr))
    prior_params_q = load_prior_params(params_file)'''

    experts = load_latest_experts_multiple_runs('data/ant_data_collect', n=2)

    qvar = GaussianMLPInversePolicy(name='qvar_model',
                                    env_spec=env.spec,
                                    hidden_sizes=(32, 32))
    qvar_model = Qvar(env=env, qvar=qvar, expert_trajs=None, max_itrs=10)
    irl_model = EAIRL(env=env,
                      expert_trajs=experts,
                      state_only=False,
                      score_discrim=False)
    empw_model = Empowerment(env=env, max_itrs=1)
    t_empw_model = Empowerment(env=env,
                               scope='t_efn',
                               max_itrs=2,
                               name='empowerment2')

    policy = GaussianMLPPolicy(name='policy',
                               env_spec=env.spec,
                               hidden_sizes=(32, 32))

    algo = IRLTRPO(
        init_irl_params=prior_params['irl_params'],
        init_empw_params=None,  #prior_params['empw_params'],
        init_qvar_params=None,  #prior_params['qvar_params'],
        init_policy_params=prior_params['policy_params'],  #None
        env=env,
        policy=policy,
        empw=empw_model,
        tempw=t_empw_model,
        qvar_model=qvar_model,
        irl_model=irl_model,
        n_itr=2000,
        batch_size=20000,
        max_path_length=500,
        discount=0.99,
        store_paths=False,
        train_irl=True,
        train_empw=True,
        train_qvar=True,
        irl_model_wt=1.0,
        entropy_weight=0.1,
        zero_environment_reward=True,
        baseline=LinearFeatureBaseline(env_spec=env.spec),
        log_params_folder=params_folder,
        log_experiment_name=exp_name,
        # plot=True,
    )

    with rllab_logdir(algo=algo, dirname='data/ant_transfer'):  #%s'%exp_name):
        #with rllab_logdir(algo=algo, dirname='data/ant_transfer%s'%exp_name):
        with tf.Session():
            algo.train()
Example #17
def main(exp_name,
         rundir,
         ent_wt=1.0,
         env_name='Shaped_PM_MazeRoom_Small-v0',
         method="airl",
         beta=1e-2,
         disc_step=1e-3,
         disc_iters=100,
         disc_batch_size=32,
         disc_gp=None,
         trpo_step=1e-2,
         init_pol_std=1.0,
         adaptive_beta=False,
         target_kl=0.5,
         beta_step=1e-6,
         hid_size=None,
         hid_layers=None,
         max_traj=None):
    os.makedirs(rundir, exist_ok=True)

    if hid_size is None or hid_layers is None:
        assert hid_size is None and hid_layers is None, \
            "must specify both size & layers, not one or the other"
        hid_layers, hid_size, init_pol_std \
            = min_layers_hidsize_polstd_for(env_name)
    env_trpo_params = irltrpo_params_for(env_name, 'irl')

    env = TfEnv(CustomGymEnv(env_name, record_video=False, record_log=False))

    expert_dir = os.path.join(rundir, 'env_%s/' % env_name.lower())
    experts = load_latest_experts_walky(expert_dir, n=5, max_traj=max_traj)

    disc_net_kwargs = {
        'layers': hid_layers,
        'd_hidden': hid_size,
    }
    if method in {'airl', 'vairl'}:
        is_vairl = method == 'vairl'
        irl_model = shaped_airl.AIRL(
            env=env,
            expert_trajs=experts,
            state_only=True,
            fusion=True,
            discrim_arch_args=disc_net_kwargs,
            fitted_value_fn_arch_args=disc_net_kwargs,
            gp_coeff=disc_gp,
            # vairl flag
            vairl=is_vairl,
            # vairl fixed beta settings
            vairl_beta=beta,
            # vairl adaptive beta settings
            vairl_adaptive_beta=adaptive_beta,
            vairl_beta_step_size=beta_step,
            vairl_kl_target=target_kl)
    elif method in {'gail', 'vail'}:
        is_vail = method == 'vail'
        assert disc_gp is None, "no GAIL/VAIL support for GP coeff"
        irl_model = GAIL(
            env,
            expert_trajs=experts,
            discrim_arch_args=disc_net_kwargs,
            name=method,
            # vail stuff (only adaptive beta for VAIL, no fixed beta like
            # VAIRL)
            vail=is_vail,
            # initial beta
            vail_init_beta=beta,
            vail_beta_step_size=beta_step,
            vail_kl_target=target_kl)
    else:
        raise NotImplementedError("don't know how to handle method '%s'" %
                                  (method, ))

    pol_hid_sizes = (hid_size, ) * hid_layers
    policy = GaussianMLPPolicy(name='policy',
                               env_spec=env.spec,
                               hidden_sizes=pol_hid_sizes,
                               init_std=init_pol_std)
    irltrpo_kwargs = dict(
        env=env,
        policy=policy,
        irl_model=irl_model,
        discount=0.99,
        store_paths=True,
        discrim_train_itrs=disc_iters,
        irl_model_wt=1.0,
        irl_lr=disc_step,
        irl_batch_size=disc_batch_size,
        step_size=trpo_step,
        # entropy_weight should be 1.0 but 0.1 seems to work better
        entropy_weight=ent_wt,
        force_batch_sampler=True,
        zero_environment_reward=True,
        baseline=LinearFeatureBaseline(env_spec=env.spec),
    )
    irltrpo_kwargs.update(env_trpo_params)
    n_itr = irltrpo_kwargs['n_itr']
    print('irltrpo_kwargs:', irltrpo_kwargs)
    algo = IRLTRPO(**irltrpo_kwargs)

    run_name = 'env_{env_name}_{method}'.format(env_name=env_name.lower(),
                                                method=method)
    exp_folder = os.path.join(rundir, '%s/%s' % (run_name, exp_name))
    with rllab_logdir(algo=algo, dirname=exp_folder):
        with tf.Session():
            algo.train()
    this_dir = os.path.dirname(__file__)
    maze_retrain_path = os.path.join(this_dir, 'env_retrain.py')
    latest_irl_snap = '%s/itr_%d.pkl' % (exp_folder, n_itr - 1)
    subproc_cmd = [
        # script
        'python',
        maze_retrain_path,
        # experiment info
        latest_irl_snap,
        '--rundir',
        rundir,
        # TRPO params
        '--trpo-step',
        '%f' % trpo_step,
        '--trpo-ent',
        '%f' % ent_wt,
        # network params
        '--hid-layers',
        '%d' % hid_layers,
        '--hid-size',
        '%d' % hid_size,
        # we don't care about precise args relevant to given method because
        # we're just reloading a frozen model
        '--method',
        method,
    ]
    subprocess.run(subproc_cmd, check=True)
Example #18
    trajectories.append(df)
    experts.append(data_dict)
log.info("trajs : {}".format(len(trajectories)))

tf.reset_default_graph()

# if not osp.exists(gail_iter_path):
#         os.makedirs(gail_iter_path)
# rllog.set_snapshot_dir(gail_iter_path)
with tf.Session():
    env = TfEnv(env)
    irl_model = GAIL(env_spec=env.spec, expert_trajs=experts)
    policy = GaussianMLPPolicy(name='policy', env_spec=env.spec, hidden_sizes=(64, 64))
    # policy._mean_network = iter_data['policy']._mean_network
    algo = IRLTRPO(
        env=env,
        policy=policy,
        irl_model=irl_model,
        n_itr=t_iter,
        batch_size=batch_size,
        max_path_length=max_path_length,
        discount=0.99,
        store_paths=True,
        discrim_train_itrs=75,
        irl_model_wt=1.0,
        entropy_weight=0.0,  # GAIL should not use entropy unless for exploration
        zero_environment_reward=True,
        baseline=GaussianMLPBaseline(env_spec=env.spec)
    )
    algo.train()
    # rllog.set_snapshot_dir(None)
Example #19
def main(
    exp_name,
    rundir='data',
    irl_pkl='',
    ent_wt=1.0,
    trpo_anneal_steps=None,
    trpo_anneal_init_ent=None,
    trpo_step=0.01,
    init_pol_std=1.0,
    method=None,
    hid_size=None,
    hid_layers=None,
    switch_env=None,
):
    orig_env_name = get_name(irl_pkl)
    if switch_env is not None:
        this_env_name = switch_env
    else:
        this_env_name = orig_env_name
    print("Running on environment '%s'" % this_env_name)
    env = TfEnv(
        CustomGymEnv(this_env_name, record_video=False, record_log=False))

    if hid_size is None or hid_layers is None:
        assert hid_size is None and hid_layers is None, \
            "must specify both size & layers, not one or the other"
        hid_layers, hid_size, init_pol_std \
            = min_layers_hidsize_polstd_for(orig_env_name)
    env_trpo_params = irltrpo_params_for(orig_env_name, 'retrain')

    folder = os.path.dirname(irl_pkl)

    prior_params = load_prior_params(irl_pkl)
    expert_dir = os.path.join(rundir, 'env_%s/' % orig_env_name.lower())
    experts = load_latest_experts_walky(expert_dir, n=5)

    # For some reason IRLTRPO is responsible for setting weights in this code.
    # It would equally be possible to run global_variables_initializer()
    # ourselves and then do irl_model.set_params(prior_params) if we just
    # wanted to query energy, reward, etc. from the trained AIRL model without
    # using IRLTRPO.
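    # (A minimal sketch of that alternative follows this example.)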
    disc_net_kwargs = {
        'layers': hid_layers,
        'd_hidden': hid_size,
    }
    if method in {'airl', 'vairl'}:
        irl_model = AIRL(env=env,
                         expert_trajs=experts,
                         state_only=True,
                         freeze=True,
                         vairl=method == 'vairl',
                         vairl_beta=1e-4,
                         discrim_arch_args=disc_net_kwargs,
                         fitted_value_fn_arch_args=disc_net_kwargs)
    elif method in {'gail', 'vail'}:
        irl_model = GAIL(env,
                         expert_trajs=experts,
                         discrim_arch_args=disc_net_kwargs,
                         name=method,
                         freeze=True,
                         vail=method == 'vail')
    else:
        raise NotImplementedError("Don't know how to handle method '%s'" %
                                  method)

    pol_hid_sizes = (hid_size, ) * hid_layers
    policy = GaussianMLPPolicy(name='policy',
                               env_spec=env.spec,
                               hidden_sizes=pol_hid_sizes,
                               init_std=init_pol_std)
    irltrpo_kwargs = dict(
        env=env,
        policy=policy,
        irl_model=irl_model,
        discount=0.99,
        store_paths=True,
        discrim_train_itrs=50,
        irl_model_wt=1.0,
        entropy_weight=ent_wt,  # should be 1.0 but 0.1 seems to work better
        step_size=trpo_step,
        zero_environment_reward=True,
        baseline=LinearFeatureBaseline(env_spec=env.spec),
        init_irl_params=prior_params,
        force_batch_sampler=True,
        entropy_anneal_init_weight=trpo_anneal_init_ent,
        entropy_anneal_steps=trpo_anneal_steps,
        retraining=True)
    irltrpo_kwargs.update(env_trpo_params)
    algo = IRLTRPO(**irltrpo_kwargs)
    folder_suffix = ''
    if switch_env is not None:
        # append lower case environment name to retrain folder path
        folder_suffix = '_%s' % switch_env.lower()
    with rllab_logdir(algo=algo,
                      dirname='%s/retrain%s' % (folder, folder_suffix)):
        with tf.Session():
            algo.train()
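The comment in Example #19 notes that, instead of letting IRLTRPO restore the discriminator weights, one could initialize the TF variables directly and push the loaded snapshot into the model with irl_model.set_params. Below is a minimal sketch of that idea, reusing the names from Example #19 (env, policy, irl_model, prior_params); collect_paths is a hypothetical rollout helper returning rllab-style paths, and the sketch assumes the AIRL model exposes set_params() (as referenced in the comment) and an eval(paths) method for scoring trajectories.

# Sketch: query the trained AIRL reward without running IRLTRPO.
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())  # create and initialize all TF variables
    irl_model.set_params(prior_params)           # overwrite them with the trained AIRL weights
    paths = collect_paths(env, policy, n=10)     # hypothetical helper: gather evaluation rollouts
    learned_rewards = irl_model.eval(paths)      # assumed eval() returns the learned reward per path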