Example #1
def get_name(irl_pkl):
    # load the snapshot inside a temporary session just to read the inner
    # environment's name, then reset the graph so nothing leaks to the caller
    with tf.Session(config=get_session_config()):
        irl_pkl_data = joblib.load(irl_pkl)
        env_name = get_inner_env(irl_pkl_data['env']).env_name
        del irl_pkl_data
    tf.reset_default_graph()
    return env_name
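The helper above only reads the environment name stored in an IRL snapshot and then clears the default graph. A minimal usage sketch (the itr_*.pkl path below is hypothetical) could look like:

# hypothetical snapshot path produced by one of the training scripts below
env_name = get_name('data/ant_state_irl/itr_999.pkl')
print('Snapshot was trained on', env_name)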
Example #2
    def train(self, sess=None):
        created_session = sess is None
        if sess is None:
            sess = tf.Session(config=get_session_config())
            sess.__enter__()

        sess.run(tf.global_variables_initializer())
        self.start_worker()
        start_time = time.time()
        for itr in range(self.start_itr, self.n_itr):
            itr_start_time = time.time()
            with logger.prefix('itr #%d | ' % itr):
                logger.log("Obtaining samples...")
                paths = self.obtain_samples(itr)
                logger.log("Processing samples...")
                samples_data = self.process_samples(itr, paths)
                logger.log("Logging diagnostics...")
                self.log_diagnostics(paths)
                logger.log("Optimizing policy...")
                self.optimize_policy(itr, samples_data)
                logger.log("Saving snapshot...")
                params = self.get_itr_snapshot(itr, samples_data)
                if self.store_paths:
                    params["paths"] = samples_data["paths"]
                save_itr_params_pickle(itr, params)
                prune_old_snapshots(itr,
                                    keep_every=self.snap_keep_every,
                                    keep_latest=self.snap_keep_latest)
                logger.log("Saved")
                logger.record_tabular('Time', time.time() - start_time)
                logger.record_tabular('ItrTime', time.time() - itr_start_time)
                logger.dump_tabular(with_prefix=False)
                if self.plot:
                    rollout(self.env,
                            self.policy,
                            animated=True,
                            max_path_length=self.max_path_length)
                    if self.pause_for_plot:
                        input("Plotting evaluation run: Press Enter to "
                              "continue...")
        self.shutdown_worker()
        if created_session:
            sess.close()
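Every example builds its session with tf.Session(config=get_session_config()). The helper's definition is not shown on this page; a minimal sketch, assuming it only turns on GPU memory growth, would be:

import tensorflow as tf

def get_session_config():
    # assumed implementation: allocate GPU memory on demand instead of
    # grabbing the whole device when the session starts
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    return config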
Example #3
def main(exp_name, ent_wt=1.0, discrete=True):
    tf.reset_default_graph()
    if discrete:
        env = TfEnv(
            CustomGymEnv('PointMazeLeft-v0',
                         record_video=False,
                         record_log=False))
    else:
        env = TfEnv(
            CustomGymEnv('PointMazeLeftCont-v0',
                         record_video=False,
                         record_log=False))

    policy = GaussianMLPPolicy(name='policy',
                               env_spec=env.spec,
                               hidden_sizes=(32, 32))
    with tf.Session(config=get_session_config()) as sess:
        algo = TRPO(
            env=env,
            sess=sess,
            policy=policy,
            n_itr=2000,
            batch_size=20000,
            max_path_length=500,
            discount=0.99,
            store_paths=True,
            entropy_weight=ent_wt,
            baseline=LinearFeatureBaseline(env_spec=env.spec),
            exp_name=exp_name,
            turn_on_wandb=args.turn_on_wandb,
            render_env=True,
            gif_dir='logs/maze_wall_meta_irl',
            gif_header='',
            wandb_entity=args.wandb_entity,
            wandb_project=args.wandb_project,
            wandb_run_name=args.wandb_run_name,
            wandb_monitor_gym=args.wandb_monitor_gym,
        )
        if discrete:
            output = 'data/maze_left_data_collect_discrete-15/%s' % exp_name
        else:
            output = 'data/maze_left_data_collect/%s' % exp_name
        with rllab_logdir(algo=algo, dirname=output):
            algo.train(sess)
Example #4
def main(exp_name, ent_wt=1.0):
    tf.reset_default_graph()
    env = TfEnv(CustomGymEnv('CustomAnt-v0', record_video=False, record_log=False))
    policy = GaussianMLPPolicy(name='policy', env_spec=env.spec, hidden_sizes=(32, 32))
    with tf.Session(config=get_session_config()) as sess:
        algo = TRPO(
            env=env,
            sess=sess,
            policy=policy,
            n_itr=1500,
            batch_size=20000,
            max_path_length=500,
            discount=0.99,
            store_paths=True,
            entropy_weight=ent_wt,
            baseline=LinearFeatureBaseline(env_spec=env.spec),
            exp_name=exp_name,
        )
        with rllab_logdir(algo=algo, dirname='data/ant_data_collect/%s'%exp_name):
            algo.train(sess)
Example #5
def main(exp_name=None, fusion=False, latent_dim=3):
    max_path_length = 100
    info_coeff = 0.1
    imitation_coeff = 0.01
    batch_size = 16
    meta_batch_size = 50
    max_itrs = 20
    pre_epoch = args.pre_epoch
    entropy_weight = 1.0
    reward_arch = relu_net
    if reward_arch == relu_net:
        layers = 2
        d_hidden = 32
        reward_arch_args = {
            'layers': layers,
            'd_hidden': d_hidden,
        }
    else:
        layers, d_hidden = 0, 0
        reward_arch_args = None

    tf.reset_default_graph()
    env = TfEnv(
        CustomGymEnv('PointMazeLeft-v0', record_video=False, record_log=False))

    # load ~2 iterations worth of data from each forward RL experiment as demos
    experts = load_latest_experts_multiple_runs(
        'data/maze_left_data_collect_discrete-15', n=4, latent_dim=latent_dim)

    # contextual policy pi(a|s,m)
    policy = GaussianMLPPolicy(name='policy',
                               env_spec=env.spec,
                               hidden_sizes=(32, 32))

    # approximate posterior q(m|tau)
    context_encoder_spec = EnvSpec(
        observation_space=Box(
            np.tile(
                np.concatenate((env.observation_space.low[:-latent_dim],
                                env.action_space.low)), max_path_length),
            np.tile(
                np.concatenate((env.observation_space.high[:-latent_dim],
                                env.action_space.high)), max_path_length)),
        action_space=Box(np.zeros(latent_dim), np.ones(latent_dim)),
    )
    context_encoder = GaussianMLPPolicy(name='context_encoder',
                                        env_spec=context_encoder_spec,
                                        hidden_sizes=(128, 128))

    pretrain_model = Pretrain(experts,
                              policy,
                              context_encoder,
                              env,
                              latent_dim,
                              batch_size=400,
                              kl_weight=0.1,
                              epoch=pre_epoch)
    # pretrain_model = None
    if pretrain_model is None:
        pre_epoch = 0

    irl_model = InfoAIRL(env=env,
                         policy=policy,
                         context_encoder=context_encoder,
                         reward_arch=reward_arch,
                         reward_arch_args=reward_arch_args,
                         expert_trajs=experts,
                         state_only=True,
                         max_path_length=max_path_length,
                         fusion=fusion,
                         max_itrs=max_itrs,
                         meta_batch_size=meta_batch_size,
                         imitation_coeff=imitation_coeff,
                         info_coeff=info_coeff,
                         latent_dim=latent_dim)

    algo = MetaIRLTRPO(
        env=env,
        policy=policy,
        irl_model=irl_model,
        randomize_policy=True,
        pretrain_model=pretrain_model,
        n_itr=args.n_itr,
        meta_batch_size=meta_batch_size,
        batch_size=batch_size,
        max_path_length=max_path_length,
        discount=0.99,
        store_paths=True,
        train_irl=True,
        irl_model_wt=1.0,
        entropy_weight=entropy_weight,
        zero_environment_reward=True,
        baseline=LinearFeatureBaseline(env_spec=env.spec),
        turn_on_wandb=args.turn_on_wandb,
        render_env=True,
        gif_dir='logs/maze_wall_meta_irl',
        gif_header='',
        wandb_entity=args.wandb_entity,
        wandb_project=args.wandb_project,
        wandb_run_name=args.wandb_run_name,
        wandb_monitor_gym=args.wandb_monitor_gym,
    )
    if fusion:
        dirname = 'data_fusion_discrete_new/maze_wall_meta_irl_imitcoeff-%s_infocoeff-%s_mbs-%s_bs-%s_itr-%s_preepoch-%s_entropy-%s_RandomPol_Rew-%s-%s/%s' % (
            imitation_coeff, info_coeff, meta_batch_size, batch_size, max_itrs,
            pre_epoch, entropy_weight, layers, d_hidden, exp_name)
    else:
        dirname = 'data_discrete_new/maze_wall_meta_irl_imitcoeff-%s_infocoeff-%s_mbs-%s_bs-%s_itr-%s_preepoch-%s_entropy-%s_RandomPol_Rew-%s-%s/%s' % (
            imitation_coeff, info_coeff, meta_batch_size, batch_size, max_itrs,
            pre_epoch, entropy_weight, layers, d_hidden, exp_name)

    config = get_session_config()
    with rllab_logdir(algo=algo, dirname=dirname):
        with tf.Session(config=config):
            algo.train()
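Examples #3 and #5 read settings from a module-level args object that is not defined in the snippets. A plausible argparse sketch covering only the attributes actually referenced above (the defaults are illustrative) is:

import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--pre_epoch', type=int, default=20)        # epochs for the Pretrain step
parser.add_argument('--n_itr', type=int, default=2000)          # MetaIRLTRPO iterations
parser.add_argument('--turn_on_wandb', action='store_true')     # enable Weights & Biases logging
parser.add_argument('--wandb_entity', default=None)
parser.add_argument('--wandb_project', default=None)
parser.add_argument('--wandb_run_name', default=None)
parser.add_argument('--wandb_monitor_gym', action='store_true')
args = parser.parse_args()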
Example #6
# logdir = '/home/usaywook/ext256/inverse_rl/data/ant_state_irl/itr_2999.pkl'
logdir = '/home/usaywook/ext256/inverse_rl/data/ant_transfer/itr_1500.pkl'
params = load_prior_params(logdir)
loaded_params = params['policy_params']

policy = GaussianMLPPolicy(name='policy', env_spec=env.spec, hidden_sizes=(32, 32))

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    if loaded_params is not None:
        # x = list(params['policy']._cached_params.values())[0]
        # y = list(params['policy']._cached_param_dtypes.values())[0]
        policy.set_param_values(loaded_params)

# with tf.Session(config=get_session_config()) as sess:
    # algo = TRPO(
    #     env=env,
    #     sess=sess,
    #     policy=policy,
    #     n_itr=1,
    #     batch_size=20000,
    #     max_path_length=500,
    #     discount=0.99,
    #     store_paths=True,
    #     entropy_weight=0.1,
    #     baseline=LinearFeatureBaseline(env_spec=env.spec),
    #     exp_name=None,
    #     plot=True
    # )
    # algo.train()
def main(
    rundir='data',
    irl_pkl='',
    pol_pkl=None,
    method=None,
    hid_size=None,
    hid_layers=None,
    switch_env=None,
):
    print('irl_pkl =', irl_pkl, 'and pol_pkl =', pol_pkl)
    orig_env_name = get_name(irl_pkl)
    if switch_env is not None:
        this_env_name = switch_env
    else:
        this_env_name = orig_env_name
    print("Running on environment '%s'" % this_env_name)
    env = TfEnv(
        CustomGymEnv(this_env_name, record_video=False, record_log=False))

    if hid_size is None or hid_layers is None:
        # we want hidden size & layer count for the *original* environment,
        # since that's what the IRL model that we're trying to reconstruct was
        # trained on
        assert hid_size is None and hid_layers is None, \
            "must specify both size & layers, not one or the other"
        hid_layers, hid_size = min_layers_hidsize_for(orig_env_name)
    # we want trajectory length for the new environment rather than the
    # original environment, though
    traj_length = irltrpo_params_for(this_env_name,
                                     'retrain')['max_path_length']
    print('Horizon is', traj_length)

    expert_dir = os.path.join(rundir, 'env_%s/' % orig_env_name.lower())
    experts = load_latest_experts_walky(expert_dir, n=1)

    with tf.Session(config=get_session_config(), graph=tf.Graph()):
        irl_pkl_data = joblib.load(irl_pkl)

        disc_net_kwargs = {
            'layers': hid_layers,
            'd_hidden': hid_size,
        }
        if method in {'airl', 'vairl'}:
            irl_model = AIRL(env=env,
                             expert_trajs=experts,
                             state_only=True,
                             freeze=True,
                             vairl=method == 'vairl',
                             vairl_beta=1e-4,
                             discrim_arch_args=disc_net_kwargs,
                             fitted_value_fn_arch_args=disc_net_kwargs)
        elif method in {'gail', 'vail'}:
            irl_model = GAIL(env,
                             expert_trajs=experts,
                             discrim_arch_args=disc_net_kwargs,
                             name=method,
                             freeze=True,
                             vail=method == 'vail')
        else:
            raise NotImplementedError("Don't know how to handle method '%s'" %
                                      method)
        irl_model.set_params(irl_pkl_data['irl_params'])

        if pol_pkl is not None:
            with tf.variable_scope('please-work'):
                pol_pkl_data = joblib.load(pol_pkl)
                policy = pol_pkl_data['policy']
                print('Using policy loaded from %s' % pol_pkl)
        else:
            print('Using original IRL policy')
            policy = irl_pkl_data['policy']

        # do a few rollouts with given policy on given reward
        # report both the IRL reward AND the mean reward for the policy
        n_rollouts = 30
        irl_rets = np.zeros((n_rollouts, ))
        env_rets = np.zeros((n_rollouts, ))
        for i in tqdm.trange(n_rollouts):
            # how do I get final return? Hmm
            path = rollout(env, policy, max_path_length=traj_length)
            env_rets[i] = np.sum(path['rewards'])
            irl_rew = irl_model.eval([path])
            irl_rets[i] = np.sum(irl_rew)

        print('Env mean %.2f (std %.2f)' %
              (np.mean(env_rets), np.std(env_rets)))
        print('IRL mean %.2f (std %.2f)' %
              (np.mean(irl_rets), np.std(irl_rets)))
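The evaluation entry point above can also be called directly once an IRL snapshot exists; a hypothetical invocation (the pickle path is a placeholder) would be:

# hypothetical path; point irl_pkl at an itr_*.pkl written by an AIRL run
main(rundir='data',
     irl_pkl='data/env_customant/irl_itr_999.pkl',
     method='airl')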