def _run_single_irl_train(irl, parallel, discount, seed, env_name, log_dir,
                          trajectories):
    logger.debug('[IRL] algo = %s [discount=%f, seed=%s, parallel=%d], '
                 'env = %s, n = %d',
                 irl, discount, seed, parallel, env_name, len(trajectories))

    # Setup
    utils.set_cuda_visible_devices()
    mon_dir = osp.join(log_dir, 'mon')
    os.makedirs(mon_dir)

    irl_algo = config.SINGLE_IRL_ALGORITHMS[irl]
    irl_seed = create_seed(seed + 'irl')
    with make_envs(env_name, irl_algo.vectorized, parallel, irl_seed,
                   log_prefix=osp.join(mon_dir, 'train')) as envs:
        reward, policy = irl_algo.train(envs, trajectories, discount=discount,
                                        seed=irl_seed, log_dir=log_dir)

    # Save learnt reward & policy for debugging purposes
    joblib.dump(reward, osp.join(log_dir, 'reward.pkl'))
    joblib.dump(policy, osp.join(log_dir, 'policy.pkl'))

    eval_seed = create_seed(seed + 'eval')
    with make_envs(env_name, irl_algo.vectorized, parallel, eval_seed,
                   log_prefix=osp.join(mon_dir, 'eval')) as envs:
        value = irl_algo.value(envs, policy, discount=1.00, seed=eval_seed)

    return reward, value
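
# Note on seeding: each phase derives an independent but reproducible RNG
# stream from the experiment seed plus a phase tag, e.g. create_seed(seed +
# 'irl') vs create_seed(seed + 'eval'). create_seed itself is defined
# elsewhere in this codebase; the sketch below (hypothetical name, not the
# real implementation) shows one way such a helper could work, hashing the
# string key down to a 31-bit integer seed.
def _create_seed_sketch(key):
    """Hypothetical stand-in for create_seed: map a string key to an int seed."""
    import hashlib
    digest = hashlib.sha256(key.encode('utf-8')).digest()
    return int.from_bytes(digest[:4], 'big') % (2 ** 31)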
def _run_population_irl_finetune(irl, parallel, discount, seed, env, trajs,
                                 metainit, log_dir):
    # Setup
    utils.set_cuda_visible_devices()
    logger.debug('[IRL] finetune: algo = %s [discount=%f, seed=%s, parallel=%d]'
                 ', m = %d, env = %s',
                 irl, discount, seed, parallel, len(trajs), env)
    finetune_mon_prefix = osp.join(log_dir, 'mon')

    # Get algorithm from config
    irl_algo = config.POPULATION_IRL_ALGORITHMS[irl]

    # Seeding
    finetune_seed = create_seed(seed + 'irlfinetune')

    # Finetune IRL algorithm (i.e. run it) from meta-initialization
    with make_envs(env, irl_algo.vectorized, parallel, finetune_seed,
                   log_prefix=finetune_mon_prefix) as envs:
        r, p = irl_algo.finetune(metainit, envs, trajs, discount=discount,
                                 seed=finetune_seed, log_dir=log_dir)
    joblib.dump(p, osp.join(log_dir, 'policy.pkl'))

    # Compute value of finetuned policy. Seed the evaluation environments
    # with eval_seed (not finetune_seed), matching the other helpers.
    eval_seed = create_seed(seed + 'eval')
    with make_envs(env, irl_algo.vectorized, parallel, eval_seed,
                   log_prefix=finetune_mon_prefix) as envs:
        v = irl_algo.value(envs, p, discount=1.0, seed=eval_seed)

    return r, v
def _compute_value(rl, discount, parallel, seed, env_name, log_dir, policy):
    utils.set_cuda_visible_devices()
    # Note discount is not used, but is needed as a caching key.
    logger.debug('[VALUE] %s [discount=%f, seed=%s, parallel=%d] on %s',
                 rl, discount, seed, parallel, env_name)

    # Each RL algorithm specifies a method to compute the value of its policy
    rl_algo = config.RL_ALGORITHMS[rl]

    # Create and set up logging directory
    mon_dir = osp.join(log_dir, 'mon')
    os.makedirs(mon_dir, exist_ok=True)

    # Compute value of policy
    eval_seed = create_seed(seed + 'eval')
    with make_envs(env_name, rl_algo.vectorized, parallel, eval_seed,
                   log_prefix=osp.join(mon_dir, 'eval')) as envs:
        value = rl_algo.value(envs, policy, discount=1.00, seed=eval_seed)
    return value
def synthetic_data(rl, discount, parallel, seed, env_name, num_trajectories,
                   log_dir, policy):
    '''Precondition: policy produced by RL algorithm rl.'''
    # Setup
    utils.set_cuda_visible_devices()
    logger.debug('[SAMPLE] %s [discount=%f, seed=%s, parallel=%d] '
                 'for %d trajectories on %s',
                 rl, discount, seed, parallel, num_trajectories, env_name)

    # Create and set up logging directory
    mon_dir = osp.join(log_dir, 'mon')
    os.makedirs(mon_dir, exist_ok=True)

    # Sample from policy
    data_seed = create_seed(seed + 'data')
    rl_algo = config.RL_ALGORITHMS[rl]
    with make_envs(env_name, rl_algo.vectorized, parallel, data_seed,
                   log_prefix=osp.join(mon_dir, 'synthetic')) as envs:
        samples = rl_algo.sample(envs, policy, num_trajectories, data_seed)

    # Strip rewards: demonstrations expose only (observations, actions)
    return [(obs, acts) for (obs, acts, rews) in samples]
def _train_policy(rl, discount, parallel, seed, env_name, log_dir):
    # Setup
    utils.set_cuda_visible_devices()
    logger.debug('[TRAIN] %s [discount=%f, seed=%s, parallel=%d] on %s',
                 rl, discount, seed, parallel, env_name)
    mon_dir = osp.join(log_dir, 'mon')
    os.makedirs(mon_dir, exist_ok=True)

    train_seed = create_seed(seed + 'train')

    # Generate the policy
    rl_algo = config.RL_ALGORITHMS[rl]
    with make_envs(env_name, rl_algo.vectorized, parallel, train_seed,
                   log_prefix=osp.join(mon_dir, 'train')) as envs:
        # This nested parallelism is unfortunate. We're mostly doing this
        # as algorithms differ in their resource reservation.
        policy = rl_algo.train(envs, discount=discount, seed=train_seed,
                               log_dir=log_dir)

    joblib.dump(policy, osp.join(log_dir, 'policy.pkl'))  # save for debugging

    return policy
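
# The helpers above compose into the expert-demonstration half of the
# pipeline: train an expert with RL, sample trajectories from it, then hand
# the (observation, action) pairs to an IRL algorithm. A sketch under assumed
# arguments and log-directory layout (hypothetical; the real experiment
# driver lives elsewhere and may differ):
def _expert_pipeline_sketch(rl, irl, discount, parallel, seed, env_name,
                            num_trajectories, log_dir):
    """Hypothetical end-to-end run: RL expert -> demonstrations -> IRL."""
    policy = _train_policy(rl, discount, parallel, seed, env_name,
                           osp.join(log_dir, 'expert'))
    trajs = synthetic_data(rl, discount, parallel, seed, env_name,
                           num_trajectories, osp.join(log_dir, 'data'),
                           policy)
    reward, value = _run_single_irl_train(irl, parallel, discount, seed,
                                          env_name, osp.join(log_dir, 'irl'),
                                          trajs)
    return reward, value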
def _value_helper(irl, n, m, rl, parallel, discount, seed,
                  env_name, reward, log_dir):
    if reward is None:
        # reward will be None if the algorithm is a non-IRL imitation learner.
        # In this case, do not attempt to reoptimize.
        return None

    # Setup
    utils.set_cuda_visible_devices()
    logger.debug('[EVAL] %s [meta=%d, finetune=%d] '
                 'by %s [discount=%f, seed=%s, parallel=%d] '
                 'on %s (writing to %s)',
                 irl, n, m, rl, discount, seed, parallel, env_name, log_dir)
    mon_dir = osp.join(log_dir, 'mon')
    os.makedirs(mon_dir)

    # Look up the algorithm's hook for splicing the learnt reward into envs
    if irl in config.SINGLE_IRL_ALGORITHMS:
        reward_wrapper = config.SINGLE_IRL_ALGORITHMS[irl].reward_wrapper
    else:
        reward_wrapper = config.POPULATION_IRL_ALGORITHMS[irl].reward_wrapper
    rw = functools.partial(reward_wrapper, new_reward=reward)

    # Reoptimize a policy against the learnt reward
    rl_algo = config.RL_ALGORITHMS[rl]
    train_seed = create_seed(seed + 'eval_train')
    with make_envs(env_name, rl_algo.vectorized, parallel, train_seed,
                   post_wrapper=rw,
                   log_prefix=osp.join(mon_dir, 'train')) as envs:
        p = rl_algo.train(envs, discount=discount, seed=train_seed,
                          log_dir=log_dir)
    joblib.dump(p, osp.join(log_dir, 'policy.pkl'))

    # Evaluate the reoptimized policy on the original (true-reward) envs
    eval_seed = create_seed(seed + 'eval_eval')
    with make_envs(env_name, rl_algo.vectorized, parallel, eval_seed,
                   log_prefix=osp.join(mon_dir, 'eval')) as envs:
        v = rl_algo.value(envs, p, discount=1.00, seed=eval_seed)
    return v
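
# The reward_wrapper hook above lets each IRL algorithm substitute its learnt
# reward into the environment, so the RL reoptimization sees the learnt
# reward rather than the true one. The exact signature is algorithm-specific;
# below is a minimal sketch (hypothetical names throughout), assuming
# new_reward maps an observation to a scalar and using the old gym.Wrapper
# four-tuple step API.
import gym

class _LearntRewardWrapperSketch(gym.Wrapper):
    def __init__(self, env, new_reward):
        super().__init__(env)
        self._new_reward = new_reward

    def step(self, action):
        obs, _old_reward, done, info = self.env.step(action)
        # Replace the environment's reward with the learnt one.
        return obs, self._new_reward(obs), done, info

def _reward_wrapper_sketch(env, new_reward):
    """Hypothetical reward_wrapper: wrap env so it emits the learnt reward."""
    return _LearntRewardWrapperSketch(env, new_reward)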
def _run_population_irl_meta(irl, parallel, discount, seed, trajs, log_dir):
    # Setup
    utils.set_cuda_visible_devices()
    n = len(list(trajs.values())[0])
    logger.debug('[IRL] meta: algo = %s [discount=%f, seed=%s, parallel=%d], '
                 'n = %d, envs = %s',
                 irl, discount, seed, parallel, n, trajs.keys())
    meta_log_dir = osp.join(log_dir, 'meta:{}'.format(n))
    mon_dir = osp.join(meta_log_dir, 'mon')
    os.makedirs(mon_dir)

    # Get algorithm from config
    irl_algo = config.POPULATION_IRL_ALGORITHMS[irl]

    # Customize seeds
    irl_seed = create_seed(seed + 'irlmeta')

    # Set up environments for meta-learning
    ctxs = {}
    for env in trajs.keys():
        log_prefix = osp.join(mon_dir, sanitize_env_name(env) + '-')
        ctxs[env] = make_envs(env, irl_algo.vectorized, parallel, irl_seed,
                              log_prefix=log_prefix)
    meta_envs = {k: v.__enter__() for k, v in ctxs.items()}

    try:
        # Run metalearning
        subset = {k: v[:n] for k, v in trajs.items()}
        metainit = irl_algo.metalearn(meta_envs, subset, discount=discount,
                                      seed=irl_seed, log_dir=meta_log_dir)
    finally:
        # Make sure to exit out of all the environments, even on error
        for ctx in ctxs.values():
            ctx.__exit__(None, None, None)

    # Save metalearning initialization for debugging
    joblib.dump(metainit, osp.join(log_dir, 'metainit.pkl'))

    return metainit
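
# For population IRL, the meta/finetune split composes as: metalearn once
# across all training environments, then finetune per test environment with
# m trajectories. A sketch of that composition under assumed arguments and
# log-directory layout (hypothetical; the real driver may differ):
def _population_pipeline_sketch(irl, parallel, discount, seed, train_trajs,
                                test_env, test_trajs, m, log_dir):
    """Hypothetical meta-then-finetune run for one test environment."""
    metainit = _run_population_irl_meta(irl, parallel, discount, seed,
                                        train_trajs,
                                        osp.join(log_dir, 'meta'))
    reward, value = _run_population_irl_finetune(
        irl, parallel, discount, seed, test_env, test_trajs[:m], metainit,
        osp.join(log_dir, 'finetune:{}'.format(m)))
    return reward, value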