Example #1
def _run_single_irl_train(irl, parallel, discount, seed,
                          env_name, log_dir, trajectories):
    logger.debug('[IRL] algo = %s [discount=%f, seed=%s, parallel=%d], ' 
                 'env = %s, n = %d',
                 irl, discount, seed, parallel, env_name, len(trajectories))
    # Setup
    utils.set_cuda_visible_devices()
    mon_dir = osp.join(log_dir, 'mon')
    os.makedirs(mon_dir)

    irl_algo = config.SINGLE_IRL_ALGORITHMS[irl]
    irl_seed = create_seed(seed + 'irl')
    with make_envs(env_name, irl_algo.vectorized, parallel, irl_seed,
                   log_prefix=osp.join(mon_dir, 'train')) as envs:
        reward, policy = irl_algo.train(envs, trajectories, discount=discount,
                                        seed=irl_seed, log_dir=log_dir)

    # Save learnt reward & policy for debugging purposes
    joblib.dump(reward, osp.join(log_dir, 'reward.pkl'))
    joblib.dump(policy, osp.join(log_dir, 'policy.pkl'))

    eval_seed = create_seed(seed + 'eval')
    with make_envs(env_name, irl_algo.vectorized, parallel, eval_seed,
                   log_prefix=osp.join(mon_dir, 'eval')) as envs:
        # Evaluate with discount=1.00, i.e. report the undiscounted return.
        value = irl_algo.value(envs, policy, discount=1.00, seed=eval_seed)

    return reward, value
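
Throughout these listings, phase-specific seeds are derived by appending a tag to the experiment seed string, e.g. create_seed(seed + 'irl') for training versus create_seed(seed + 'eval') for evaluation, so the two phases do not share randomness. create_seed itself is not shown; the sketch below is only one plausible stand-in, assuming it hashes the tagged string down to an integer seed.

import hashlib

def create_seed(tagged_seed):
    # Hypothetical stand-in for the create_seed helper used above: hash the
    # tagged string deterministically and keep the result in the 32-bit range
    # accepted by most RNG seeders.
    digest = hashlib.sha256(tagged_seed.encode('utf-8')).digest()
    return int.from_bytes(digest[:4], 'big')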
Example #2
def _run_population_irl_finetune(irl, parallel, discount, seed,
                                 env, trajs, metainit, log_dir):
    # Setup
    utils.set_cuda_visible_devices()
    logger.debug('[IRL] finetune: algo = %s [discount=%f, seed=%s, parallel=%d]' 
                 ', m = %d, env = %s',
                 irl, discount, seed, parallel, len(trajs), env)
    finetune_mon_prefix = osp.join(log_dir, 'mon')

    # Get algorithm from config
    irl_algo = config.POPULATION_IRL_ALGORITHMS[irl]
    # Seeding
    finetune_seed = create_seed(seed + 'irlfinetune')

    # Finetune IRL algorithm (i.e. run it) from meta-initialization
    with make_envs(env, irl_algo.vectorized, parallel,
                   finetune_seed,
                   log_prefix=finetune_mon_prefix) as envs:
        r, p = irl_algo.finetune(metainit, envs, trajs, discount=discount,
                                 seed=finetune_seed, log_dir=log_dir)
        joblib.dump(p, osp.join(log_dir, 'policy.pkl'))

    # Compute value of finetuned policy
    with make_envs(env, irl_algo.vectorized, parallel,
                   finetune_seed,
                   log_prefix=finetune_mon_prefix) as envs:
        eval_seed = create_seed(seed + 'eval')
        v = irl_algo.value(envs, p, discount=1.0, seed=eval_seed)

    return r, v
Example #3
def _compute_value(rl, discount, parallel, seed, env_name, log_dir, policy):
    utils.set_cuda_visible_devices()
    # Note discount is not used, but is needed as a caching key.
    logger.debug('[VALUE] %s [discount=%f, seed=%s, parallel=%d] on %s',
                 rl, discount, seed, parallel, env_name)
    # Each RL algorithm specifies a method to compute the value of its policy
    rl_algo = config.RL_ALGORITHMS[rl]

    # Create and set up logging directory
    mon_dir = osp.join(log_dir, 'mon')
    os.makedirs(mon_dir, exist_ok=True)

    # Compute value of policy
    eval_seed = create_seed(seed + 'eval')
    with make_envs(env_name, rl_algo.vectorized, parallel, eval_seed,
                   log_prefix=osp.join(mon_dir, 'eval')) as envs:
        value = rl_algo.value(envs, policy, discount=1.00, seed=eval_seed)

    return value
Example #4
def synthetic_data(rl, discount, parallel, seed, env_name, num_trajectories,
                   log_dir, policy):
    '''Precondition: policy produced by RL algorithm rl.'''
    # Setup
    utils.set_cuda_visible_devices()
    logger.debug('[SAMPLE] %s [discount=%f, seed=%s, parallel=%d] '
                 'for %d trajectories on %s',
                 rl, discount, seed, parallel, num_trajectories, env_name)

    # Create and set up logging directory
    mon_dir = osp.join(log_dir, 'mon')
    os.makedirs(mon_dir, exist_ok=True)

    # Sample from policy
    data_seed = create_seed(seed + 'data')
    rl_algo = config.RL_ALGORITHMS[rl]
    with make_envs(env_name, rl_algo.vectorized, parallel, data_seed,
                   log_prefix=osp.join(mon_dir, 'synthetic')) as envs:
        samples = rl_algo.sample(envs, policy, num_trajectories, data_seed)
    return [(obs, acts) for (obs, acts, rews) in samples]
Example #5
def _train_policy(rl, discount, parallel, seed, env_name, log_dir):
    # Setup
    utils.set_cuda_visible_devices()
    logger.debug('[TRAIN] %s [discount=%f, seed=%s, parallel=%d] on %s',
                 rl, discount, seed, parallel, env_name)
    mon_dir = osp.join(log_dir, 'mon')
    os.makedirs(mon_dir, exist_ok=True)
    train_seed = create_seed(seed + 'train')

    # Generate the policy
    rl_algo = config.RL_ALGORITHMS[rl]
    with make_envs(env_name, rl_algo.vectorized, parallel, train_seed,
                   log_prefix=osp.join(mon_dir, 'train')) as envs:
        # This nested parallelism is unfortunate. We're mostly doing this
        # as algorithms differ in their resource reservation.
        policy = rl_algo.train(envs, discount=discount, seed=train_seed,
                               log_dir=log_dir)

    joblib.dump(policy, osp.join(log_dir, 'policy.pkl'))  # save for debugging

    return policy
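
Taken together, Examples #1 and #3-#5 make up the single-task part of the pipeline: train an expert policy, sample demonstrations from it, run IRL on those demonstrations, and evaluate the results. The wiring below is only an illustrative sketch; the algorithm keys, environment id, seed string and directory layout are assumptions, not values taken from the examples.

# Hypothetical keys into config.RL_ALGORITHMS / config.SINGLE_IRL_ALGORITHMS.
rl, irl = 'ppo', 'max_ent'
env_name, seed, parallel, discount = 'CartPole-v1', 'experiment0', 4, 0.99

policy = _train_policy(rl, discount, parallel, seed, env_name,
                       log_dir='out/expert')
trajectories = synthetic_data(rl, discount, parallel, seed, env_name,
                              num_trajectories=50, log_dir='out/data',
                              policy=policy)
reward, irl_value = _run_single_irl_train(irl, parallel, discount, seed,
                                          env_name, 'out/irl', trajectories)
expert_value = _compute_value(rl, discount, parallel, seed, env_name,
                              'out/value', policy)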
Example #6
def _value_helper(irl, n, m, rl, parallel, discount, seed, env_name, 
                  reward, log_dir):
    if reward is None:
        # reward will be None if the algorithm is a non-IRL imitation learner.
        # In this case, do not attempt to reoptimize.
        return None

    # Setup
    utils.set_cuda_visible_devices()
    logger.debug('[EVAL] %s [meta=%d, finetune=%d] ' 
                 'by %s [discount=%f, seed=%s, parallel=%d] '
                 'on %s (writing to %s)',
                 irl, n, m,
                 rl, discount, seed, parallel,
                 env_name, log_dir)
    mon_dir = osp.join(log_dir, 'mon')
    os.makedirs(mon_dir)

    if irl in config.SINGLE_IRL_ALGORITHMS:
        reward_wrapper = config.SINGLE_IRL_ALGORITHMS[irl].reward_wrapper
    else:
        reward_wrapper = config.POPULATION_IRL_ALGORITHMS[irl].reward_wrapper
    rw = functools.partial(reward_wrapper, new_reward=reward)
    rl_algo = config.RL_ALGORITHMS[rl]

    train_seed = create_seed(seed + 'eval_train')
    with make_envs(env_name, rl_algo.vectorized, parallel,
                   train_seed, post_wrapper=rw,
                   log_prefix=osp.join(mon_dir, 'train')) as envs:
        p = rl_algo.train(envs, discount=discount,
                          seed=train_seed, log_dir=log_dir)

    joblib.dump(p, osp.join(log_dir, 'policy.pkl'))

    eval_seed = create_seed(seed + 'eval_eval')
    with make_envs(env_name, rl_algo.vectorized, parallel,
                   eval_seed, log_prefix=osp.join(mon_dir, 'eval')) as envs:
        v = rl_algo.value(envs, p, discount=1.00, seed=eval_seed)

    return v
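
Example #6 re-optimises a policy against the learnt reward by handing post_wrapper=rw to make_envs, where rw is the per-algorithm reward_wrapper partially applied with new_reward=reward. The wrapper itself lives in config and is not shown; the sketch below is one plausible shape for it, assuming a Gym-style environment and a reward model that can be called on (observation, action).

import gym

class HypotheticalLearntReward(gym.Wrapper):
    """Replace the environment's reward with the learnt reward model."""

    def __init__(self, env, new_reward):
        super().__init__(env)
        self.new_reward = new_reward

    def step(self, action):
        obs, _, done, info = self.env.step(action)
        # Score the transition with the IRL reward instead of the true reward.
        return obs, self.new_reward(obs, action), done, info

def reward_wrapper(env, new_reward):
    # Matches the call pattern functools.partial(reward_wrapper, new_reward=reward).
    return HypotheticalLearntReward(env, new_reward)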
Example #7
def _run_population_irl_meta(irl, parallel, discount, seed, trajs, log_dir):
    # Setup
    utils.set_cuda_visible_devices()
    n = len(list(trajs.values())[0])
    logger.debug('[IRL] meta: algo = %s [discount=%f, seed=%s, parallel=%d], ' 
                 'n = %d, envs = %s',
                 irl, discount, seed, parallel, n, trajs.keys())
    meta_log_dir = osp.join(log_dir, 'meta:{}'.format(n))
    mon_dir = osp.join(meta_log_dir, 'mon')
    os.makedirs(mon_dir)

    # Get algorithm from config
    irl_algo = config.POPULATION_IRL_ALGORITHMS[irl]
    # Customize seeds
    irl_seed = create_seed(seed + 'irlmeta')

    # Set up environments for meta-learning
    ctxs = {}
    for env in trajs.keys():
        log_prefix = osp.join(mon_dir, sanitize_env_name(env) + '-')
        ctxs[env] = make_envs(env, irl_algo.vectorized, parallel,
                              irl_seed, log_prefix=log_prefix)
    meta_envs = {k: v.__enter__() for k, v in ctxs.items()}

    # Run metalearning
    subset = {k: v[:n] for k, v in trajs.items()}
    metainit = irl_algo.metalearn(meta_envs, subset, discount=discount,
                                  seed=irl_seed, log_dir=meta_log_dir)

    # Make sure to exit out of all the environments
    for env in ctxs.values():
        env.__exit__(None, None, None)

    # Save metalearning initialization for debugging
    joblib.dump(metainit, osp.join(log_dir, 'metainit.pkl'))

    return metainit
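
Example #7 enters the per-environment contexts by calling __enter__ by hand and __exit__ in a loop afterwards, so the environments would not be closed if metalearn raised. The sketch below shows the same meta-learning block rewritten with the standard-library contextlib.ExitStack, which keeps the behaviour while guaranteeing cleanup; all names are those already used in the example.

import contextlib

with contextlib.ExitStack() as stack:
    # Every make_envs context is closed on exit, even if metalearn raises.
    meta_envs = {
        env: stack.enter_context(
            make_envs(env, irl_algo.vectorized, parallel, irl_seed,
                      log_prefix=osp.join(mon_dir, sanitize_env_name(env) + '-')))
        for env in trajs.keys()
    }
    subset = {k: v[:n] for k, v in trajs.items()}
    metainit = irl_algo.metalearn(meta_envs, subset, discount=discount,
                                  seed=irl_seed, log_dir=meta_log_dir)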