Example No. 1
def main(exp_name=None, fusion=False):
    env = TfEnv(
        CustomGymEnv('airl/CustomAnt-v0', record_video=False,
                     record_log=False))

    # load ~2 iterations' worth of data from each forward RL experiment as demos
    experts = load_latest_experts_multiple_runs('data/ant_data_collect', n=2)

    irl_model = AIRL(env=env,
                     expert_trajs=experts,
                     state_only=True,
                     fusion=fusion,
                     max_itrs=10)

    policy = GaussianMLPPolicy(name='policy',
                               env_spec=env.spec,
                               hidden_sizes=(32, 32))
    algo = IRLTRPO(
        env=env,
        policy=policy,
        irl_model=irl_model,
        n_itr=1000,
        batch_size=10000,
        max_path_length=500,
        discount=0.99,
        store_paths=True,
        irl_model_wt=1.0,
        entropy_weight=0.1,
        zero_environment_reward=True,
        baseline=LinearFeatureBaseline(env_spec=env.spec),
    )
    with rllab_logdir(algo=algo, dirname='data/ant_state_irl/%s' % exp_name):
        with tf.Session():
            algo.train()
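These snippets are excerpted from scripts built on rllab and the inverse_rl package, so their imports are omitted. A sketch of what Example No. 1 relies on follows; the exact module paths are assumptions based on the usual rllab/inverse_rl layout and may differ between forks, and the experiment name passed to main() is hypothetical.

# Assumed imports for Example No. 1 (module paths may differ between forks).
import tensorflow as tf

from rllab.baselines.linear_feature_baseline import LinearFeatureBaseline
from sandbox.rocky.tf.envs.base import TfEnv
from sandbox.rocky.tf.policies.gaussian_mlp_policy import GaussianMLPPolicy

from inverse_rl.algos.irl_trpo import IRLTRPO
from inverse_rl.models.airl_state import AIRL
from inverse_rl.envs.env_utils import CustomGymEnv
from inverse_rl.utils.log_utils import (rllab_logdir,
                                        load_latest_experts_multiple_runs)

main(exp_name='ant_state_irl_run0')  # hypothetical experiment name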
Example No. 2
def run_expt(config):
    env_name = config['environment']
    env = get_env(env_name)
    experts = get_demos(env_name)
    irl_model = algo_string_to_model[config['algo']](env_spec=env.spec,
                                                     expert_trajs=experts)
    policy = GaussianMLPPolicy(name='policy',
                               env_spec=env.spec,
                               hidden_sizes=(32, 32))
    # use per-environment hyperparameters
    algo = IRLTRPO(env=env,
                   policy=policy,
                   irl_model=irl_model,
                   n_itr=200,
                   batch_size=2000 if env_name == 'pendulum' else 10000,
                   max_path_length=100,
                   discount=0.99,
                   store_paths=True,
                   discrim_train_itrs=50,
                   irl_model_wt=1.0,
                   entropy_weight=1.0 if env_name == 'pointmass' else 0.1,
                   zero_environment_reward=True,
                   baseline=LinearFeatureBaseline(env_spec=env.spec))
    dirname = DATA_DIR + "/" + "___".join(
        [str(k) + "=" + str(v) for k, v in config.items()])
    with rllab_logdir(algo=algo, dirname=dirname):
        with tf.Session():
            algo.train()
    # A little clumsy, but it's the easiest way: the rllab logger doesn't keep
    # data around after it has been written to disk.
    train_results = pd.read_csv(dirname + '/progress.csv')
    # return OriginalTaskAverageReturn for the last iteration
    output = config.copy()
    output['return'] = train_results.iloc[-1]['OriginalTaskAverageReturn']
    return output
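run_expt() leans on helpers defined elsewhere in the script (get_env, get_demos, DATA_DIR and the algo_string_to_model registry). Below is a hypothetical sketch of the config and registry it expects; the registry keys, environment name and import path are illustrative assumptions, while the model classes themselves are the ones used in Examples No. 3 and No. 5 (both accept env_spec and expert_trajs).

# Hypothetical sketch of the inputs run_expt() expects; keys and values are
# illustrative assumptions, not the script's real configuration.
from inverse_rl.models.imitation_learning import AIRLStateAction, GAIL  # assumed path

algo_string_to_model = {
    'airl_state_action': AIRLStateAction,  # state-action reward, as in Example No. 3
    'gail': GAIL,                          # as in Example No. 5
}

config = {
    'environment': 'pendulum',
    'algo': 'airl_state_action',
}

result = run_expt(config)
print(result['return'])  # OriginalTaskAverageReturn of the final iteration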
Example No. 3
def main():
    env = TfEnv(GymEnv('Pendulum-v0', record_video=False, record_log=False))

    experts = load_latest_experts('data/pendulum', n=5)

    irl_model = AIRLStateAction(env_spec=env.spec, expert_trajs=experts)
    policy = GaussianMLPPolicy(name='policy',
                               env_spec=env.spec,
                               hidden_sizes=(32, 32))
    algo = IRLTRPO(
        env=env,
        policy=policy,
        irl_model=irl_model,
        n_itr=200,
        batch_size=1000,
        max_path_length=100,
        discount=0.99,
        store_paths=True,
        discrim_train_itrs=50,
        irl_model_wt=1.0,
        entropy_weight=0.1,  # this should be 1.0 but 0.1 seems to work better
        zero_environment_reward=True,
        baseline=LinearFeatureBaseline(env_spec=env.spec))

    with rllab_logdir(algo=algo, dirname='data/pendulum_gcl'):
        with tf.Session():
            algo.train()
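As in Example No. 2, rllab's logger writes a progress.csv into the log directory, so the learning curve can be inspected after training. A small sketch, assuming the same OriginalTaskAverageReturn column name used in that example:

import pandas as pd

# Read the per-iteration metrics written by the rllab logger.
progress = pd.read_csv('data/pendulum_gcl/progress.csv')
print(progress['OriginalTaskAverageReturn'].iloc[-1])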
Example No. 4
def main(exp_name, params_folder=None):
    env = TfEnv(
        CustomGymEnv('airl/DisabledAnt-v0',
                     record_video=False,
                     record_log=False))

    irl_itr = 100  # earlier IRL iterations overfit less; 100 seems to work well.
    params_file = os.path.join(DATA_DIR,
                               '%s/itr_%d.pkl' % (params_folder, irl_itr))
    prior_params = load_prior_params(params_file)

    irl_model = AIRL(env=env, expert_trajs=None, state_only=True)
    policy = GaussianMLPPolicy(name='policy',
                               env_spec=env.spec,
                               hidden_sizes=(32, 32))
    algo = IRLTRPO(
        init_irl_params=prior_params,
        env=env,
        policy=policy,
        irl_model=irl_model,
        n_itr=1000,
        batch_size=10000,
        max_path_length=500,
        discount=0.99,
        store_paths=False,
        train_irl=False,
        irl_model_wt=1.0,
        entropy_weight=0.1,
        zero_environment_reward=True,
        baseline=LinearFeatureBaseline(env_spec=env.spec),
        log_params_folder=params_folder,
        log_experiment_name=exp_name,
    )
    with rllab_logdir(algo=algo, dirname='data/ant_transfer/%s' % exp_name):
        with tf.Session():
            algo.train()
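load_prior_params() is defined elsewhere in the repository. A minimal sketch of what it needs to do, assuming the itr_<n>.pkl snapshots are joblib pickles produced during the IRL run and that the reward parameters are stored under an 'irl_params' key (both are assumptions):

import joblib
import tensorflow as tf

def load_prior_params(pkl_file):
    # Sketch: unpickle an rllab snapshot and return the saved reward (IRL)
    # parameters so they can be passed to IRLTRPO as init_irl_params.
    with tf.Session():
        snapshot = joblib.load(pkl_file)
    return snapshot['irl_params']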
Example No. 5
def fu_irl(
    venv,
    is_airl,
    expert=None,
    expert_venv=None,
    expert_trajectories=None,
    total_timesteps=10000,
    gen_batch_size=200,
    policy_lr=1e-3,
    callback=None,
    **kwargs,
):
    # Disable the algorithm's internal prints by redirecting stdout to /dev/null
    old_stdout = sys.stdout
    devnull = open(os.devnull, 'w')
    sys.stdout = devnull

    raw_env = get_raw_env(venv)
    tf_env = TfEnv(GymEnv(env=raw_env, record_video=False, record_log=False))

    if expert_trajectories is None:
        expert_trajectories = sample_trajectories(
            expert_venv, expert, n_episodes=total_timesteps
        )
    expert_trajectories = to_rllab_trajectories(expert_trajectories, venv)

    if is_airl:
        irl_model = AIRLStateAction(
            env_spec=tf_env.spec, expert_trajs=expert_trajectories
        )
        entropy_weight = 1.0
    else:
        irl_model = GAIL(env_spec=tf_env.spec, expert_trajs=expert_trajectories)
        entropy_weight = 0.0

    if isinstance(venv.action_space, Discrete):
        policy = CategoricalMLPPolicy(
            name="policy", env_spec=tf_env.spec, hidden_sizes=(32, 32)
        )
    else:
        policy = GaussianMLPPolicy(
            name="policy", env_spec=tf_env.spec, hidden_sizes=(32, 32)
        )

    num_epochs = int(total_timesteps // gen_batch_size)

    algo = IRLTRPO(
        env=tf_env,
        policy=policy,
        irl_model=irl_model,
        n_itr=num_epochs,
        batch_size=gen_batch_size,
        max_path_length=100,
        discount=0.99,
        discrim_train_itrs=50,
        irl_model_wt=1.0,
        entropy_weight=entropy_weight,
        zero_environment_reward=True,
        baseline=LinearFeatureBaseline(env_spec=tf_env.spec),
    )
    algo.train()

    sys.stdout = old_stdout
    devnull.close()

    def predict_fn(ob, state=None, deterministic=False):
        act, _ = algo.policy.get_action(ob)
        return act, state

    results = {}
    results["policy"] = LightweightRLModel(predict_fn=predict_fn, env=venv)

    return results
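The returned dictionary wraps the trained generator in a LightweightRLModel. Assuming that wrapper exposes a predict() method that forwards to the predict_fn above (an assumption about its interface), usage could look roughly like this:

# Hypothetical usage; venv, expert and expert_venv come from the caller.
results = fu_irl(venv, is_airl=True, expert=expert, expert_venv=expert_venv,
                 total_timesteps=10000, gen_batch_size=200)
policy = results["policy"]

obs = venv.reset()
action, _ = policy.predict(obs[0])  # assumed to forward to algo.policy.get_action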
Example No. 6
def finetune(metainit,
             venv,
             trajectories,
             discount,
             seed,
             log_dir,
             *,
             tf_cfg,
             pol_itr=100,
             irl_itr=100,
             model_cfg=None,
             policy_cfg=None,
             training_cfg={}):
    envs = VecGymEnv(venv)
    envs = TfEnv(envs)
    experts = _convert_trajectories(trajectories)

    train_graph = tf.Graph()
    with train_graph.as_default():
        tf.set_random_seed(seed)

        if model_cfg is None:
            model_cfg = {
                'model': AIRLStateOnly,
                'state_only': True,
                'max_itrs': 10
            }
        model_kwargs = dict(model_cfg)
        model_cls = model_kwargs.pop('model')
        irl_model = model_cls(env_spec=envs.spec,
                              expert_trajs=experts,
                              **model_kwargs)

        if policy_cfg is None:
            policy_cfg = {
                'policy': GaussianMLPPolicy,
                'hidden_sizes': (32, 32)
            }
        else:
            policy_cfg = dict(policy_cfg)
        policy_fn = policy_cfg.pop('policy')
        policy = policy_fn(name='policy', env_spec=envs.spec, **policy_cfg)

        training_kwargs = {
            'batch_size': 10000,
            'max_path_length': 500,
            'irl_model_wt': 1.0,
            'entropy_weight': 0.1,
            # paths substantially increase storage requirements
            'store_paths': False,
        }
        training_kwargs.update(training_cfg)
        _kwargs, reward_params = metainit
        algo = IRLTRPO(env=envs,
                       policy=policy,
                       irl_model=irl_model,
                       discount=discount,
                       sampler_args=dict(n_envs=venv.num_envs),
                       zero_environment_reward=True,
                       baseline=LinearFeatureBaseline(env_spec=envs.spec),
                       init_irl_params=reward_params,
                       train_irl=False,
                       n_itr=pol_itr,
                       **training_kwargs)

        with tf.Session(config=tf_cfg):
            # First round: just optimize the policy, do not update IRL model
            with rllab_logdir(algo=algo, dirname=osp.join(log_dir, 'pol')):
                with rl_logger.prefix('finetune policy |'):
                    algo.train()
                    pol_params = policy.get_param_values()

            # Second round: we have a good policy (generator), update IRL
            with rllab_logdir(algo=algo, dirname=osp.join(log_dir, 'all')):
                with rl_logger.prefix('finetune all |'):
                    algo.train_irl = True
                    algo.init_pol_params = pol_params
                    algo.n_itr = irl_itr
                    algo.train()

            reward_params = irl_model.get_params()

            # Side-effect: forces policy to cache all parameters.
            # This ensures they are saved/restored during pickling.
            policy.get_params()
            # Must pickle policy rather than returning it directly,
            # since parameters in policy will not survive across tf sessions.
            policy_pkl = pickle.dumps(policy)

    reward = model_cfg, reward_params
    return reward, policy_pkl
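A minimal sketch of consuming finetune()'s outputs: as the comments above note, the policy is returned pickled and must be unpickled inside a fresh graph with an active session so rllab can rebuild and restore its TF parameters. The metainit, venv, trajectories, tf_cfg and obs names below are placeholders supplied by the caller.

import pickle
import tensorflow as tf

reward, policy_pkl = finetune(metainit, venv, trajectories, discount=0.99,
                              seed=0, log_dir='logs/finetune', tf_cfg=tf_cfg)

with tf.Graph().as_default():
    with tf.Session():
        policy = pickle.loads(policy_pkl)  # parameter values restored on unpickle
        action, _agent_info = policy.get_action(obs)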
Example No. 7
def metalearn(venvs,
              trajectories,
              discount,
              seed,
              log_dir,
              *,
              tf_cfg,
              outer_itr=1000,
              lr=1e-2,
              model_cfg=None,
              policy_cfg=None,
              training_cfg={},
              policy_per_task=False):
    envs = {k: TfEnv(VecGymEnv(v)) for k, v in venvs.items()}
    env_spec = list(envs.values())[0].spec
    num_envs = list(venvs.values())[0].num_envs
    tasks = list(envs.keys())

    experts = {k: _convert_trajectories(v) for k, v in trajectories.items()}

    train_graph = tf.Graph()
    with train_graph.as_default():
        tf.set_random_seed(seed)

        if model_cfg is None:
            model_cfg = {
                'model': AIRLStateOnly,
                'state_only': True,
                'max_itrs': 10
            }
        model_kwargs = dict(model_cfg)
        model_cls = model_kwargs.pop('model')
        irl_model = model_cls(env_spec=env_spec, **model_kwargs)

        if policy_cfg is None:
            policy_cfg = {
                'policy': GaussianMLPPolicy,
                'hidden_sizes': (32, 32)
            }
        else:
            policy_cfg = dict(policy_cfg)
        policy_fn = policy_cfg.pop('policy')
        policy = policy_fn(name='policy', env_spec=env_spec, **policy_cfg)
        pol_params = {}

        training_kwargs = {
            'n_itr': 10,
            'batch_size': 10000,
            'max_path_length': 500,
            'irl_model_wt': 1.0,
            'entropy_weight': 0.1,
            # paths substantially increase storage requirements
            'store_paths': False,
        }
        training_kwargs.update(training_cfg)
        algos = {
            k: IRLTRPO(env=env,
                       policy=policy,
                       irl_model=irl_model,
                       discount=discount,
                       sampler_args=dict(n_envs=num_envs),
                       zero_environment_reward=True,
                       baseline=LinearFeatureBaseline(env_spec=env_spec),
                       **training_kwargs)
            for k, env in envs.items()
        }

        with tf.Session(config=tf_cfg) as sess:
            sess.run(tf.global_variables_initializer())
            meta_reward_params = irl_model.get_params()
            for i in range(outer_itr):
                task = random.choice(tasks)
                pol_task = task if policy_per_task else None
                itr_logdir = osp.join(
                    log_dir, '{}_{}'.format(i, sanitize_env_name(task)))
                with rllab_logdir(algo=algos[task], dirname=itr_logdir):
                    with rl_logger.prefix('outer itr {} | task {}'.format(
                            i, task)):
                        irl_model.set_demos(experts[task])
                        # TODO: rather than specifying these as initializers,
                        # it might be more efficient to have AIRL not overwrite
                        # these variables on each call to train()?
                        algos[task].init_irl_params = meta_reward_params
                        algos[task].init_pol_params = pol_params.get(pol_task)
                        algos[task].train()

                        # Meta-update reward
                        # {meta,task}_reward_params are lists of NumPy arrays
                        task_reward_params = irl_model.get_params()
                        assert len(task_reward_params) == len(
                            meta_reward_params)
                        for j in range(len(task_reward_params)):
                            meta = meta_reward_params[j]
                            task_params = task_reward_params[j]
                            # Reptile update: meta <- meta + lr * (task - meta)
                            # TODO: use an Adam optimizer?
                            meta_reward_params[j] = ((1 - lr) * meta +
                                                     lr * task_params)

                        # Store policy update (joint if not policy_per_task)
                        pol_params[pol_task] = policy.get_param_values()

    reward = model_kwargs, meta_reward_params

    return reward
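The inner loop above is the Reptile meta-update flagged in the code comment: each meta-parameter moves a fraction lr of the way toward its task-adapted value. In isolation, with small NumPy arrays standing in for the reward network's parameter list:

import numpy as np

lr = 1e-2
meta_reward_params = [np.zeros((4, 4)), np.zeros(4)]  # stand-ins for the reward net
task_reward_params = [np.ones((4, 4)), np.ones(4)]    # parameters after task adaptation

# Reptile step, applied elementwise to each parameter array:
# meta <- meta + lr * (task - meta) == (1 - lr) * meta + lr * task
meta_reward_params = [
    (1 - lr) * meta + lr * task
    for meta, task in zip(meta_reward_params, task_reward_params)
]
# Every entry has moved 1% of the way from 0 toward 1 (value 0.01).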