Example 1
def main(exp_name, params_folder=None):
    env = TfEnv(CustomGymEnv('DisabledAnt-v0', record_video=False, record_log=False))

    irl_itr = 100  # earlier IRL iterations overfit less; 100 seems to work well.
    params_file = os.path.join(DATA_DIR, '%s/itr_%d.pkl' % (params_folder, irl_itr))
    prior_params = load_prior_params(params_file)

    irl_model = AIRL(env=env, expert_trajs=None, state_only=True)
    policy = GaussianMLPPolicy(name='policy', env_spec=env.spec, hidden_sizes=(32, 32))
    algo = IRLTRPO(
        init_irl_params=prior_params,
        env=env,
        policy=policy,
        irl_model=irl_model,
        n_itr=1000,
        batch_size=10000,
        max_path_length=500,
        discount=0.99,
        store_paths=False,
        train_irl=False,
        irl_model_wt=1.0,
        entropy_weight=0.1,
        zero_environment_reward=True,
        baseline=LinearFeatureBaseline(env_spec=env.spec),
        log_params_folder=params_folder,
        log_experiment_name=exp_name,
    )
    with rllab_logdir(algo=algo, dirname='data/ant_transfer/%s' % exp_name):
        with tf.Session():
            algo.train()
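Both `DATA_DIR` and `load_prior_params` come from the surrounding repository and are not shown in these snippets. As a hedged sketch only (the real helper's signature evidently varies between the examples below, and the joblib format and the 'irl_params' key are assumptions), one way such a loader could look:

# Sketch of a load_prior_params helper; joblib format and key name are assumptions.
import joblib
import tensorflow as tf

def load_prior_params(pkl_fname, key='irl_params'):
    # some pickled rllab/TF objects need an active default session to deserialize
    with tf.Session():
        snapshot = joblib.load(pkl_fname)
    tf.reset_default_graph()  # drop any graph the unpickling created
    return snapshot[key]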
Example 2
def main(exp_name, params_folder=None, visible_gpus='0', discount=0.99):
    env = TfEnv(
        CustomGymEnv('PointMazeLeft-v0', record_video=False, record_log=False))

    gpu_options = tf.GPUOptions(allow_growth=True,
                                visible_device_list=visible_gpus)
    tf_config = tf.ConfigProto(inter_op_parallelism_threads=1,
                               intra_op_parallelism_threads=1,
                               gpu_options=gpu_options)

    irl_itr = 100  # earlier IRL iterations overfit less; 100 seems to work well.
    params_file = os.path.join(DATA_DIR,
                               '%s/itr_%d.pkl' % (params_folder, irl_itr))
    prior_params = load_prior_params(params_file, tf_config)

    irl_model = AIRL(discount=discount,
                     env=env,
                     expert_trajs=None,
                     state_only=True)
    policy = GaussianMLPPolicy(name='policy',
                               env_spec=env.spec,
                               hidden_sizes=(32, 32))
    algo = IRLTRPO(
        init_irl_params=prior_params,
        env=env,
        policy=policy,
        irl_model=irl_model,
        n_itr=1000,
        batch_size=10000,
        max_path_length=500,
        discount=discount,
        store_paths=False,
        train_irl=False,
        irl_model_wt=1.0,
        entropy_weight=0.1,
        zero_environment_reward=True,
        baseline=LinearFeatureBaseline(env_spec=env.spec),
        log_params_folder=params_folder,
        log_experiment_name=exp_name,
    )
    with rllab_logdir(algo=algo,
                      dirname='data/maze_left_transfer/%s' % exp_name):
        with tf.Session(config=tf_config) as sess:
            algo.train(sess)
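Entry points like this `main` are usually launched by sweeping over the available IRL run folders. A minimal launcher sketch, assuming `run_sweep_parallel` from `inverse_rl.utils.hyper_sweep` (imported in Example 6) takes the target function, a dict mapping argument names to lists of values, and a repeat count, and that `DATA_DIR` contains one subfolder per IRL run; the repeat count and GPU list are illustrative.

# Hypothetical launcher: sweep main() over every IRL run folder under DATA_DIR.
import os
from inverse_rl.utils.hyper_sweep import run_sweep_parallel

if __name__ == '__main__':
    params_dict = {
        'params_folder': os.listdir(DATA_DIR),  # one entry per IRL run
        'visible_gpus': ['0'],
    }
    run_sweep_parallel(main, params_dict, repeat=3)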
Example 3
def main(exp_name=None, params_folder='data/ant_state_irl'):
    # env = TfEnv(CustomGymEnv('PointMazeLeft-v0', record_video=True, record_log=True,force_reset=True))
    env = TfEnv(
        CustomGymEnv('DisabledAnt-v0',
                     record_video=False,
                     record_log=False,
                     force_reset=False))

    irl_itr = 90  # earlier IRL iterations overfit less; 80 or 90 seems to work well, but I usually search over 60, 65, 70, 75, ... up to 100
    #params_file = os.path.join(DATA_DIR, '%s/itr_%d.pkl' % (params_folder, irl_itr))
    params_file = os.path.join(DATA_DIR, 'itr_%d.pkl' % (irl_itr))
    prior_params = load_prior_params(params_file)
    '''q_itr = 400  # earlier IRL iterations overfit less; 100 seems to work well.
    #params_file = os.path.join(DATA_DIR, '%s/itr_%d.pkl' % (params_folder, irl_itr))
    params_file = os.path.join(DATA_DIR, 'itr_%d.pkl' % (q_itr))
    prior_params_q = load_prior_params(params_file)'''

    experts = load_latest_experts_multiple_runs('data/ant_data_collect', n=2)

    qvar = GaussianMLPInversePolicy(name='qvar_model',
                                    env_spec=env.spec,
                                    hidden_sizes=(32, 32))
    qvar_model = Qvar(env=env, qvar=qvar, expert_trajs=None, max_itrs=10)
    irl_model = EAIRL(env=env,
                      expert_trajs=experts,
                      state_only=False,
                      score_discrim=False)
    empw_model = Empowerment(env=env, max_itrs=1)
    t_empw_model = Empowerment(env=env,
                               scope='t_efn',
                               max_itrs=2,
                               name='empowerment2')

    policy = GaussianMLPPolicy(name='policy',
                               env_spec=env.spec,
                               hidden_sizes=(32, 32))

    algo = IRLTRPO(
        init_irl_params=prior_params['irl_params'],
        init_empw_params=None,  #prior_params['empw_params'],
        init_qvar_params=None,  #prior_params['qvar_params'],
        init_policy_params=prior_params['policy_params'],  #None
        env=env,
        policy=policy,
        empw=empw_model,
        tempw=t_empw_model,
        qvar_model=qvar_model,
        irl_model=irl_model,
        n_itr=2000,
        batch_size=20000,
        max_path_length=500,
        discount=0.99,
        store_paths=False,
        train_irl=True,
        train_empw=True,
        train_qvar=True,
        irl_model_wt=1.0,
        entropy_weight=0.1,
        zero_environment_reward=True,
        baseline=LinearFeatureBaseline(env_spec=env.spec),
        log_params_folder=params_folder,
        log_experiment_name=exp_name,
        # plot=True,
    )

    with rllab_logdir(algo=algo, dirname='data/ant_transfer'):
        # with rllab_logdir(algo=algo, dirname='data/ant_transfer%s' % exp_name):
        with tf.Session():
            algo.train()
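Here the loaded snapshot is treated as a dict with 'irl_params' and 'policy_params' entries (and, per the commented-out arguments, possibly 'empw_params' and 'qvar_params'). A small sketch, assuming joblib pickles, for checking which parameter blocks a snapshot actually carries before deciding what to warm-start; the helper name is hypothetical.

# Hypothetical helper: list the parameter blocks stored in a snapshot pickle.
import joblib

def list_snapshot_keys(pkl_fname):
    snapshot = joblib.load(pkl_fname)
    # keys expected here: 'irl_params', 'policy_params', 'empw_params', 'qvar_params'
    return sorted(snapshot.keys())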
Example 4
def main(
    exp_name,
    rundir='data',
    irl_pkl='',
    ent_wt=1.0,
    trpo_anneal_steps=None,
    trpo_anneal_init_ent=None,
    trpo_step=0.01,
    init_pol_std=1.0,
    method=None,
    hid_size=None,
    hid_layers=None,
    switch_env=None,
):
    orig_env_name = get_name(irl_pkl)
    if switch_env is not None:
        this_env_name = switch_env
    else:
        this_env_name = orig_env_name
    print("Running on environment '%s'" % this_env_name)
    env = TfEnv(
        CustomGymEnv(this_env_name, record_video=False, record_log=False))

    if hid_size is None or hid_layers is None:
        assert hid_size is None and hid_layers is None, \
            "must specify both size & layers, not one or the other"
        hid_layers, hid_size, init_pol_std \
            = min_layers_hidsize_polstd_for(orig_env_name)
    env_trpo_params = irltrpo_params_for(orig_env_name, 'retrain')

    folder = os.path.dirname(irl_pkl)

    prior_params = load_prior_params(irl_pkl)
    expert_dir = os.path.join(rundir, 'env_%s/' % orig_env_name.lower())
    experts = load_latest_experts_walky(expert_dir, n=5)

    # For some reason IRLTRPO is responsible for setting weights in this code.
    # It would equally be possible to run global_variables_initializer()
    # ourselves and then do irl_model.set_params(prior_params) if we just
    # wanted to query energy, reward, etc. from the trained AIRL model without
    # using IRLTRPO.
    disc_net_kwargs = {
        'layers': hid_layers,
        'd_hidden': hid_size,
    }
    if method in {'airl', 'vairl'}:
        irl_model = AIRL(env=env,
                         expert_trajs=experts,
                         state_only=True,
                         freeze=True,
                         vairl=method == 'vairl',
                         vairl_beta=1e-4,
                         discrim_arch_args=disc_net_kwargs,
                         fitted_value_fn_arch_args=disc_net_kwargs)
    elif method in {'gail', 'vail'}:
        irl_model = GAIL(env,
                         expert_trajs=experts,
                         discrim_arch_args=disc_net_kwargs,
                         name=method,
                         freeze=True,
                         vail=method == 'vail')
    else:
        raise NotImplementedError("Don't know how to handle method '%s'" %
                                  method)

    pol_hid_sizes = (hid_size, ) * hid_layers
    policy = GaussianMLPPolicy(name='policy',
                               env_spec=env.spec,
                               hidden_sizes=pol_hid_sizes,
                               init_std=init_pol_std)
    irltrpo_kwargs = dict(
        env=env,
        policy=policy,
        irl_model=irl_model,
        discount=0.99,
        store_paths=True,
        discrim_train_itrs=50,
        irl_model_wt=1.0,
        entropy_weight=ent_wt,  # should be 1.0 but 0.1 seems to work better
        step_size=trpo_step,
        zero_environment_reward=True,
        baseline=LinearFeatureBaseline(env_spec=env.spec),
        init_irl_params=prior_params,
        force_batch_sampler=True,
        entropy_anneal_init_weight=trpo_anneal_init_ent,
        entropy_anneal_steps=trpo_anneal_steps,
        retraining=True)
    irltrpo_kwargs.update(env_trpo_params)
    algo = IRLTRPO(**irltrpo_kwargs)
    folder_suffix = ''
    if switch_env is not None:
        # append lower case environment name to retrain folder path
        folder_suffix = '_%s' % switch_env.lower()
    with rllab_logdir(algo=algo,
                      dirname='%s/retrain%s' % (folder, folder_suffix)):
        with tf.Session():
            algo.train()
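`min_layers_hidsize_polstd_for` and `irltrpo_params_for` are per-environment lookup helpers from the surrounding codebase. A sketch of the shape such a lookup could take; the table values below are illustrative assumptions, not the project's actual settings.

# Illustrative per-environment policy defaults; the numbers are assumptions.
_POLICY_DEFAULTS = {
    # env name: (hid_layers, hid_size, init_pol_std)
    'CustomAnt-v0': (2, 32, 1.0),
    'DisabledAnt-v0': (2, 32, 1.0),
}

def min_layers_hidsize_polstd_for(env_name):
    try:
        return _POLICY_DEFAULTS[env_name]
    except KeyError:
        raise NotImplementedError("no policy defaults for '%s'" % env_name)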
Example 5
def main(exp_name=None, latent_dim=3, params_folder=None):
    max_path_length = 100
    batch_size = 16
    meta_batch_size = 1
    reward_arch = relu_net
    if reward_arch == relu_net:
        layers = 2
        d_hidden = 32
        reward_arch_args = {
            'layers': layers,
            'd_hidden': d_hidden,
        }
    else:
        layers, d_hidden = 0, 0
        reward_arch_args = None

    # tf.reset_default_graph()
    env = TfEnv(
        CustomGymEnv('PointMazeRight-v0', record_video=False,
                     record_log=False))
    barrier_range = [0.2, 0.6]
    barrier_y = 0.3

    # load ~2 iterations' worth of data from each forward RL experiment as demos
    experts = load_latest_experts_multiple_runs(
        '/atlas/u/lantaoyu/projects/InfoAIRL/data/maze_left_data_collect',
        n=4,
        latent_dim=latent_dim)

    irl_itr_list = [2800]

    for irl_itr in irl_itr_list:
        # params_file = os.path.join(DATA_DIR, '%s/itr_%d.pkl' % (params_folder, irl_itr))
        params_file = os.path.join(DATA_DIR, 'itr_%d.pkl' % irl_itr)
        prior_params = load_prior_params(params_file)
        init_context_encoder_params = load_prior_params(
            params_file, 'context_params')

        # params_file = os.path.join(DATA_DIR, 'itr_%d.pkl' % (irl_itr-800))
        policy_prior_params = load_prior_params(params_file, 'policy_params')
        # policy_prior_params = None

        # contextual policy pi(a|s,m)
        policy = GaussianMLPPolicy(name='policy',
                                   env_spec=env.spec,
                                   hidden_sizes=(32, 32))

        # approximate posterior q(m|tau)
        context_encoder_spec = EnvSpec(
            observation_space=Box(
                np.tile(
                    np.concatenate((env.observation_space.low[:-latent_dim],
                                    env.action_space.low)), max_path_length),
                np.tile(
                    np.concatenate((env.observation_space.high[:-latent_dim],
                                    env.action_space.high)), max_path_length)),
            action_space=Box(np.zeros(latent_dim), np.ones(latent_dim)),
        )
        context_encoder = GaussianMLPPolicy(name='context_encoder',
                                            env_spec=context_encoder_spec,
                                            hidden_sizes=(128, 128))

        irl_model = InfoAIRL(env=env,
                             expert_trajs=experts,
                             reward_arch=reward_arch,
                             reward_arch_args=reward_arch_args,
                             context_encoder=context_encoder,
                             state_only=True,
                             max_path_length=max_path_length,
                             meta_batch_size=meta_batch_size,
                             latent_dim=latent_dim)

        savedir = 'data_fusion_discrete/visualize_reward_right-%s' % irl_itr
        if not os.path.isdir(savedir):
            os.makedirs(savedir)  # also create missing parent directories

        with tf.Session() as sess:
            sess.run(tf.global_variables_initializer())
            irl_model.context_encoder.set_param_values(
                init_context_encoder_params)
            policy.set_param_values(policy_prior_params)
            irl_model.set_params(prior_params)
            boundary_low = -0.1
            boundary_high = 0.6

            expert_obs, expert_acts, expert_contexts = irl_model.extract_paths(
                irl_model.expert_trajs,
                keys=('observations', 'actions', 'contexts'),
                T=max_path_length)
            expert_trajs = np.concatenate(
                (expert_obs, expert_acts),
                axis=-1)  # num_experts x T x (state_dim + act_dim)

            grid_size = 0.005
            rescale = 1. / grid_size

            for itr in range(100):
                expert_traj_batch, m_batch = irl_model.sample_batch(
                    expert_trajs,
                    expert_contexts,
                    batch_size=1,
                    warm_up=False,
                    warm_up_idx=False)
                obs_batch = []
                num_y = 0
                for pos_y in np.arange(boundary_low, boundary_high, grid_size):
                    num_y += 1
                    num_x = 0
                    for pos_x in np.arange(boundary_low, boundary_high,
                                           grid_size):
                        num_x += 1
                        obs_batch.append([pos_x, pos_y, 0.])
                obs_batch = np.array(obs_batch).reshape(
                    [1, -1, max_path_length, 3])
                expert_traj_batch = np.tile(
                    np.reshape(expert_traj_batch, [1, 1, max_path_length, -1]),
                    [1, obs_batch.shape[1], 1, 1])
                reward = tf.get_default_session().run(
                    irl_model.reward,
                    feed_dict={
                        irl_model.expert_traj_var: expert_traj_batch,
                        irl_model.obs_t: obs_batch
                    })
                score = reward[:, 0]
                ax = sns.heatmap(score.reshape([num_x, num_y]),
                                 cmap="YlGnBu_r")
                ax.scatter((m_batch[0][0][0] - boundary_low) * rescale,
                           (m_batch[0][0][1] - boundary_low) * rescale,
                           marker='*',
                           s=150,
                           c='r',
                           edgecolors='k',
                           linewidths=0.5)
                ax.scatter((0.3 - boundary_low +
                            np.random.uniform(low=-0.05, high=0.05)) * rescale,
                           (0. - boundary_low +
                            np.random.uniform(low=-0.05, high=0.05)) * rescale,
                           marker='o',
                           s=120,
                           c='white',
                           linewidths=0.5,
                           edgecolors='k')
                ax.plot([(barrier_range[0] - boundary_low) * rescale,
                         (barrier_range[1] - boundary_low) * rescale],
                        [(barrier_y - boundary_low) * rescale,
                         (barrier_y - boundary_low) * rescale],
                        color='k',
                        linewidth=10)
                ax.invert_yaxis()
                plt.axis('off')
                plt.savefig(savedir + '/%s.png' % itr)
                print('Save Itr', itr)
                plt.close()
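The reward heatmap above queries a grid covering [-0.1, 0.6) at a 0.005 resolution. A quick standalone check, using the same boundary and grid-size values as the loop above, of how many cells that produces and that the flattened query batch folds back into a square image.

# Standalone check of the reward-grid geometry used for the heatmaps above.
import numpy as np

boundary_low, boundary_high, grid_size = -0.1, 0.6, 0.005
xs = np.arange(boundary_low, boundary_high, grid_size)
num_cells = len(xs)                                  # 140 points per axis here
grid = np.array([[x, y, 0.0] for y in xs for x in xs])
assert grid.shape == (num_cells ** 2, 3)             # one (x, y, 0) query per cell
heatmap = grid[:, 0].reshape(num_cells, num_cells)   # folds back into a square image
print('grid:', grid.shape, 'heatmap:', heatmap.shape)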
Example 6
from inverse_rl.envs.env_utils import CustomGymEnv
from inverse_rl.utils.log_utils import rllab_logdir
from inverse_rl.utils.hyper_sweep import run_sweep_parallel, run_sweep_serial
import pdb
import numpy as np
import tensorflow as tf
from sandbox.rocky.tf.envs.base import TfEnv
from sandbox.rocky.tf.policies.gaussian_mlp_policy import GaussianMLPPolicy
from sandbox.rocky.tf.samplers.batch_sampler import BatchSampler
from sandbox.rocky.tf.samplers.vectorized_sampler import VectorizedSampler
from rllab.sampler.utils import rollout
# load_prior_params and get_session_config are used below but are defined or
# imported elsewhere in the original script.

# env = TfEnv(CustomGymEnv('CustomAnt-v0', record_video=False, record_log=False, force_reset=False))
env = TfEnv(CustomGymEnv('DisabledAnt-v0', record_video=False, record_log=False, force_reset=False))

# logdir = '/home/usaywook/ext256/inverse_rl/data/ant_state_irl/itr_2999.pkl'
logdir = '/home/usaywook/ext256/inverse_rl/data/ant_transfer/itr_1500.pkl'
params = load_prior_params(logdir)
loaded_params = params['policy_params']

policy = GaussianMLPPolicy(name='policy', env_spec=env.spec, hidden_sizes=(32, 32))

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    if loaded_params is not None:
        # x = list(params['policy']._cached_params.values())[0]
        # y = list(params['policy']._cached_param_dtypes.values())[0]
        policy.set_param_values(loaded_params)
# pdb.set_trace()

# NOTE: variable values assigned in the session above do not carry over to a
# new session; the policy must be re-initialized or re-loaded inside this one.
with tf.Session(config=get_session_config()) as sess:
    # algo = TRPO(
    #     env=env,
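The snippet breaks off inside the second session. As a hedged sketch that is not part of the original script, here is one way the restored policy could be evaluated with the `rollout` helper already imported above; the episode length and episode count are arbitrary choices.

# Evaluation sketch: roll out the restored policy and report the mean return.
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    policy.set_param_values(loaded_params)
    returns = []
    for _ in range(10):
        path = rollout(env, policy, max_path_length=500)
        returns.append(path['rewards'].sum())
    print('mean return over 10 rollouts:', np.mean(returns))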
Example 7
def main(exp_name=None, latent_dim=3, params_folder=None):
    max_path_length = 100
    batch_size = 32
    meta_batch_size = 50
    entropy_weight = 0.1
    left = 'right'  # which maze variant to use: 'left' or 'right'
    if_filtered = True

    # tf.reset_default_graph()
    if left == 'left':
        env = TfEnv(
            CustomGymEnv('PointMazeLeft-v0',
                         record_video=False,
                         record_log=False))
    else:
        env = TfEnv(
            CustomGymEnv('PointMazeRight-v0',
                         record_video=False,
                         record_log=False))

    # load ~2 iterations' worth of data from each forward RL experiment as demos
    experts = load_latest_experts_multiple_runs(
        '/atlas/u/lantaoyu/projects/InfoAIRL/data/maze_left_data_collect',
        n=4,
        latent_dim=latent_dim)
    if if_filtered:
        experts_filtered = []
        good_range = [0.1, 0.4]  #[0.3, 0.5]
        # keep experts whose initial context falls inside good_range
        for expert in experts:
            context = expert['contexts'][0, 0]
            if good_range[0] <= context <= good_range[1]:
                experts_filtered.append(expert)
        assert len(experts_filtered) >= meta_batch_size
        # trim so the expert count is a multiple of meta_batch_size
        remainder = len(experts_filtered) % meta_batch_size
        if remainder:
            experts_filtered = experts_filtered[:-remainder]
        experts = experts_filtered

    irl_itr_list = [2800]

    results = []
    for irl_itr in irl_itr_list:
        params_file = os.path.join(DATA_DIR, 'itr_%d.pkl' % irl_itr)
        prior_params = load_prior_params(params_file)
        init_context_encoder_params = load_prior_params(
            params_file, 'context_params')

        policy_prior_params = None

        # contextual policy pi(a|s,m)
        policy = GaussianMLPPolicy(name='policy',
                                   env_spec=env.spec,
                                   hidden_sizes=(32, 32))

        # approximate posterior q(m|tau)
        context_encoder_spec = EnvSpec(
            observation_space=Box(
                np.tile(
                    np.concatenate((env.observation_space.low[:-latent_dim],
                                    env.action_space.low)), max_path_length),
                np.tile(
                    np.concatenate((env.observation_space.high[:-latent_dim],
                                    env.action_space.high)), max_path_length)),
            action_space=Box(np.zeros(latent_dim), np.ones(latent_dim)),
        )
        context_encoder = GaussianMLPPolicy(name='context_encoder',
                                            env_spec=context_encoder_spec,
                                            hidden_sizes=(128, 128))

        irl_model = InfoAIRL(env=env,
                             expert_trajs=experts,
                             context_encoder=context_encoder,
                             state_only=True,
                             max_path_length=max_path_length,
                             meta_batch_size=meta_batch_size,
                             latent_dim=latent_dim)

        algo = MetaIRLTRPO(
            init_irl_params=prior_params,
            init_pol_params=policy_prior_params,  #policy_prior_params,
            init_context_encoder_params=init_context_encoder_params,
            env=env,
            policy=policy,
            irl_model=irl_model,
            n_itr=150,
            meta_batch_size=meta_batch_size,
            batch_size=batch_size,
            max_path_length=max_path_length,
            discount=0.99,
            store_paths=True,
            train_irl=True,  # True
            train_context_only=True,
            train_policy=True,
            irl_model_wt=1.0,
            entropy_weight=entropy_weight,
            zero_environment_reward=True,
            baseline=LinearFeatureBaseline(env_spec=env.spec),
            log_params_folder=params_folder,
            log_experiment_name=exp_name,
        )
        logdir = ('data_finetune/maze_finetune_discrete-entropy-%s-irl_itr-%s-%s-%s-generalize/%s'
                  % (entropy_weight, irl_itr, left,
                     'filter' if if_filtered else '', exp_name))
        with rllab_logdir(algo=algo, dirname=logdir):
            with tf.Session():
                algo.train()
        results.append((irl_itr, np.max(algo.pol_ret)))
        tf.reset_default_graph()
    print(results)
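`results` holds one `(irl_itr, max_return)` tuple per fine-tuned IRL snapshot. A small follow-up sketch, not in the original script, that picks the snapshot with the best policy return instead of only printing the raw list:

# Illustrative follow-up: report the IRL iteration with the best fine-tuned return.
best_itr, best_ret = max(results, key=lambda r: r[1])
print('best irl_itr: %d (max policy return: %.2f)' % (best_itr, best_ret))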