def main(exp_name=None, fusion=False):
    """Train a state-only AIRL model on CustomAnt from saved forward-RL demos.

    Args:
        exp_name: experiment subdirectory name under data/ant_state_irl.
        fusion: forwarded to AIRL's `fusion` option.
    """
    env = TfEnv(CustomGymEnv('airl/CustomAnt-v0',
                             record_video=False, record_log=False))
    # Load ~2 iterations worth of data from each forward RL experiment as demos.
    demos = load_latest_experts_multiple_runs('data/ant_data_collect', n=2)
    irl_model = AIRL(env=env, expert_trajs=demos, state_only=True,
                     fusion=fusion, max_itrs=10)
    policy = GaussianMLPPolicy(name='policy', env_spec=env.spec,
                               hidden_sizes=(32, 32))
    algo_kwargs = dict(
        env=env,
        policy=policy,
        irl_model=irl_model,
        n_itr=1000,
        batch_size=10000,
        max_path_length=500,
        discount=0.99,
        store_paths=True,
        irl_model_wt=1.0,
        entropy_weight=0.1,
        zero_environment_reward=True,
        baseline=LinearFeatureBaseline(env_spec=env.spec),
    )
    algo = IRLTRPO(**algo_kwargs)
    with rllab_logdir(algo=algo, dirname='data/ant_state_irl/%s' % exp_name):
        with tf.Session():
            algo.train()
def main(exp_name, ent_wt=1.0):
    """Collect Ant expert data by training a forward-RL TRPO policy.

    Args:
        exp_name: experiment subdirectory name under data/ant_data_collect.
        ent_wt: entropy bonus weight passed to TRPO.
    """
    tf.reset_default_graph()
    env = TfEnv(CustomGymEnv('airl/CustomAnt-v0',
                             record_video=False, record_log=False))
    policy = GaussianMLPPolicy(name='policy', env_spec=env.spec,
                               hidden_sizes=(32, 32))
    with tf.Session(config=get_session_config()) as sess:
        algo = TRPO(
            env=env,
            sess=sess,
            policy=policy,
            n_itr=1500,
            batch_size=20000,
            max_path_length=500,
            discount=0.99,
            store_paths=True,
            entropy_weight=ent_wt,
            baseline=LinearFeatureBaseline(env_spec=env.spec),
            exp_name=exp_name,
        )
        logdir = 'data/ant_data_collect/%s' % exp_name
        with rllab_logdir(algo=algo, dirname=logdir):
            algo.train()
def run_expt(config):
    """Run one IRL experiment described by ``config``.

    Args:
        config: dict with at least 'environment' and 'algo' keys; every
            key/value pair is echoed into the log-directory name.

    Returns:
        A copy of ``config`` with an added 'return' key holding the
        OriginalTaskAverageReturn of the final training iteration.
    """
    env_name = config['environment']
    env = get_env(env_name)
    experts = get_demos(env_name)
    irl_model = algo_string_to_model[config['algo']](env_spec=env.spec,
                                                     expert_trajs=experts)
    policy = GaussianMLPPolicy(name='policy', env_spec=env.spec,
                               hidden_sizes=(32, 32))
    # Use per-environment params: pendulum is cheap enough for a small batch;
    # pointmass works better with a stronger entropy bonus.
    algo = IRLTRPO(env=env,
                   policy=policy,
                   irl_model=irl_model,
                   n_itr=200,
                   batch_size=2000 if env_name == 'pendulum' else 10000,
                   max_path_length=100,
                   discount=0.99,
                   store_paths=True,
                   discrim_train_itrs=50,
                   irl_model_wt=1.0,
                   entropy_weight=1.0 if env_name == 'pointmass' else 0.1,
                   zero_environment_reward=True,
                   baseline=LinearFeatureBaseline(env_spec=env.spec))
    dirname = DATA_DIR + "/" + "___".join(
        "{}={}".format(k, v) for k, v in config.items())
    with rllab_logdir(algo=algo, dirname=dirname):
        with tf.Session():
            algo.train()
    # A little clumsy, but it's the easiest way: the rllab logger doesn't keep
    # data around after it's been written to disk, so re-read the progress CSV.
    train_results = pd.read_csv(dirname + '/progress.csv')
    # Report OriginalTaskAverageReturn for the last iteration.
    output = config.copy()
    output['return'] = train_results.iloc[-1]['OriginalTaskAverageReturn']
    return output
def main():
    """Train state-action AIRL (GCL-style) on Pendulum with saved experts."""
    env = TfEnv(GymEnv('Pendulum-v0', record_video=False, record_log=False))
    experts = load_latest_experts('data/pendulum', n=5)
    irl_model = AIRLStateAction(env_spec=env.spec, expert_trajs=experts)
    policy = GaussianMLPPolicy(name='policy', env_spec=env.spec,
                               hidden_sizes=(32, 32))
    baseline = LinearFeatureBaseline(env_spec=env.spec)
    algo = IRLTRPO(
        env=env,
        policy=policy,
        irl_model=irl_model,
        n_itr=200,
        batch_size=1000,
        max_path_length=100,
        discount=0.99,
        store_paths=True,
        discrim_train_itrs=50,
        irl_model_wt=1.0,
        # This should be 1.0, but 0.1 seems to work better in practice.
        entropy_weight=0.1,
        zero_environment_reward=True,
        baseline=baseline)
    with rllab_logdir(algo=algo, dirname='data/pendulum_gcl'):
        with tf.Session():
            algo.train()
def main(
        log_dir,
        env_name,
        ent_coef,
        n_steps,
        total_timesteps,
        num_vec,
):
    """Train a TRPO expert on ``env_name``, logging to ``log_dir``.

    Args:
        log_dir: rllab log directory.
        env_name: gym env id; "airl/"-prefixed ids use CustomGymEnv.
        ent_coef: entropy bonus weight.
        n_steps: steps per environment per iteration.
        total_timesteps: total environment steps across training.
        num_vec: number of parallel (vectorized) environments.
    """
    tf.reset_default_graph()
    # n_steps is the `batch_size // num_vec` in `imitation`.
    batch_size = n_steps * num_vec
    n_itr = int(math.ceil(total_timesteps / batch_size))
    env_cls = CustomGymEnv if env_name.startswith("airl/") else GymEnv
    env = TfEnv(env_cls(env_name, record_video=False, record_log=False))
    # NOTE: Haven't yet checked if hidden_sizes=(32, 32) matches the settings
    # in the `imitation` repo. We use the default Stable Baselines MLP policy.
    is_continuous = isinstance(env.spec.action_space, Box)
    policy_cls = GaussianMLPPolicy if is_continuous else CategoricalMLPPolicy
    policy = policy_cls(name='policy', env_spec=env.spec,
                        hidden_sizes=(32, 32))
    with tf.Session(config=get_session_config()) as sess:
        algo = TRPO(
            env=env,
            policy=policy,
            n_itr=n_itr,
            batch_size=batch_size,
            max_path_length=500,
            discount=0.99,
            store_paths=True,
            entropy_weight=ent_coef,
            baseline=LinearFeatureBaseline(env_spec=env.spec),
            # Maybe it will be the case the not every policy is compatible
            # with the VectorizedSampler. In that case, consider changing to
            # `sampler_cls=None` and adding a dummy `n_envs` kwargs to
            # BatchSampler.
            sampler_cls=VectorizedSampler,
            sampler_args=dict(n_envs=num_vec),
        )
        with rllab_logdir(algo=algo, dirname=log_dir):
            algo.train(sess)
def main():
    """Train a vanilla TRPO policy on Pendulum, storing paths for later IRL."""
    env = TfEnv(GymEnv('Pendulum-v0', record_video=False, record_log=False))
    policy = GaussianMLPPolicy(name='policy', env_spec=env.spec,
                               hidden_sizes=(32, 32))
    baseline = LinearFeatureBaseline(env_spec=env.spec)
    algo = TRPO(env=env,
                policy=policy,
                n_itr=200,
                batch_size=2000,
                max_path_length=100,
                discount=0.99,
                store_paths=True,
                baseline=baseline)
    with rllab_logdir(algo=algo, dirname='data/pendulum'):
        algo.train()
def main(env_name, n_itr, batch_size, max_path_length):
    """Train a TRPO expert for the env registered under ``env_name``.

    Args:
        env_name: key into env_names_to_ids; also names the log subdirectory.
        n_itr: number of TRPO iterations.
        batch_size: samples collected per iteration.
        max_path_length: episode-length cap.
    """
    env_id = env_names_to_ids[env_name]
    env = TfEnv(GymEnv(env_id, record_video=False, record_log=False))
    policy = GaussianMLPPolicy(name='policy', env_spec=env.spec,
                               hidden_sizes=(32, 32))
    baseline = LinearFeatureBaseline(env_spec=env.spec)
    algo = TRPO(env=env,
                policy=policy,
                n_itr=n_itr,
                batch_size=batch_size,
                max_path_length=max_path_length,
                discount=0.99,
                store_paths=True,
                baseline=baseline)
    with rllab_logdir(algo=algo, dirname=DATA_DIR + '/' + env_name):
        algo.train()
def main(exp_name, params_folder=None):
    """Transfer a learned AIRL reward to DisabledAnt and retrain the policy.

    Args:
        exp_name: experiment subdirectory name under data/ant_transfer.
        params_folder: folder (relative to DATA_DIR) holding IRL snapshots.
    """
    env = TfEnv(CustomGymEnv('airl/DisabledAnt-v0',
                             record_video=False, record_log=False))
    # Earlier IRL iterations overfit less; 100 seems to work well.
    irl_itr = 100
    params_file = os.path.join(DATA_DIR,
                               '%s/itr_%d.pkl' % (params_folder, irl_itr))
    prior_params = load_prior_params(params_file)
    irl_model = AIRL(env=env, expert_trajs=None, state_only=True)
    policy = GaussianMLPPolicy(name='policy', env_spec=env.spec,
                               hidden_sizes=(32, 32))
    algo = IRLTRPO(
        init_irl_params=prior_params,
        env=env,
        policy=policy,
        irl_model=irl_model,
        n_itr=1000,
        batch_size=10000,
        max_path_length=500,
        discount=0.99,
        store_paths=False,
        train_irl=False,  # reward is frozen; only the policy is optimized
        irl_model_wt=1.0,
        entropy_weight=0.1,
        zero_environment_reward=True,
        baseline=LinearFeatureBaseline(env_spec=env.spec),
        log_params_folder=params_folder,
        log_experiment_name=exp_name,
    )
    with rllab_logdir(algo=algo, dirname='data/ant_transfer/%s' % exp_name):
        with tf.Session():
            algo.train()
def airl(log_dir, *, tf_cfg, env_config, reward_model_cfg=None,
         policy_cfg=None, training_cfg=None, ablation='normal'):
    """Train AIRL inside a fresh TF environment context.

    Args:
        log_dir: rllab log directory.
        tf_cfg: TF session config for TfEnvContext.
        env_config: environment configuration for TfEnvContext.
        reward_model_cfg: reward model config overrides (default: empty).
        policy_cfg: policy config overrides (default: empty).
        training_cfg: training config overrides (default: empty).
        ablation: ablation mode forwarded to get_training_kwargs.

    Returns:
        (reward, policy) where reward = (reward_model_cfg, reward_params)
        and policy = (policy_cfg, policy_params). Parameters are returned
        explicitly because they don't survive across tensorflow sessions.
    """
    # Fix: the original signature used mutable `{}` defaults, which are
    # shared across calls; use None sentinels instead.
    reward_model_cfg = {} if reward_model_cfg is None else reward_model_cfg
    policy_cfg = {} if policy_cfg is None else policy_cfg
    training_cfg = {} if training_cfg is None else training_cfg
    with TfEnvContext(tf_cfg, env_config) as context:
        training_kwargs, policy_cfg, reward_model_cfg, training_cfg = get_training_kwargs(
            venv=context.env_context.environments,
            reward_model_cfg=reward_model_cfg,
            policy_cfg=policy_cfg,
            training_cfg=training_cfg,
            ablation=ablation,
        )
        print("Training arguments: ", training_kwargs)
        algo = IRLRunner(
            **training_kwargs,
            sampler_cls=sampling.PPOBatchSampler,
        )
        irl_model = algo.irl_model
        policy = algo.policy
        with rllab_logdir(algo=algo, dirname=log_dir):
            print("Training!")
            algo.buffered_train()
        # Need to return these explicitly because they don't survive
        # across tensorflow sessions.
        reward_params = irl_model.get_params()
        policy_params = policy.tensor_values()
        policy = policy_cfg, policy_params
        reward = reward_model_cfg, reward_params
        return reward, policy
def finetune(metainit, venv, trajectories, discount, seed, log_dir, *,
             tf_cfg, pol_itr=100, irl_itr=100, model_cfg=None,
             policy_cfg=None, training_cfg=None):
    """Fine-tune a meta-initialized AIRL reward and policy on one task.

    Args:
        metainit: (model_kwargs, reward_params) pair; only the reward
            parameters are used, to initialize the IRL model.
        venv: vectorized gym environment.
        trajectories: expert demonstrations (converted to rllab format).
        discount: RL discount factor.
        seed: TF random seed.
        log_dir: root log directory ('pol' and 'all' subdirectories).
        tf_cfg: TF session config.
        pol_itr: iterations of policy-only training (IRL model frozen).
        irl_itr: iterations of joint policy + IRL training.
        model_cfg: IRL model spec; defaults to state-only AIRL.
        policy_cfg: policy spec; defaults to a (32, 32) Gaussian MLP.
        training_cfg: overrides for the IRLTRPO training kwargs.

    Returns:
        (reward, policy_pkl): reward = (model_cfg, reward_params);
        policy_pkl is the pickled policy (pickled because its parameters
        will not survive across tf sessions).
    """
    # Fix: `training_cfg={}` was a mutable default shared across calls.
    training_cfg = {} if training_cfg is None else training_cfg
    envs = VecGymEnv(venv)
    envs = TfEnv(envs)
    experts = _convert_trajectories(trajectories)
    train_graph = tf.Graph()
    with train_graph.as_default():
        tf.set_random_seed(seed)
        if model_cfg is None:
            model_cfg = {
                'model': AIRLStateOnly,
                'state_only': True,
                'max_itrs': 10
            }
        model_kwargs = dict(model_cfg)
        model_cls = model_kwargs.pop('model')
        irl_model = model_cls(env_spec=envs.spec, expert_trajs=experts,
                              **model_kwargs)
        if policy_cfg is None:
            policy_cfg = {
                'policy': GaussianMLPPolicy,
                'hidden_sizes': (32, 32)
            }
        else:
            policy_cfg = dict(policy_cfg)
        policy_fn = policy_cfg.pop('policy')
        policy = policy_fn(name='policy', env_spec=envs.spec, **policy_cfg)
        training_kwargs = {
            'batch_size': 10000,
            'max_path_length': 500,
            'irl_model_wt': 1.0,
            'entropy_weight': 0.1,
            # paths substantially increase storage requirements
            'store_paths': False,
        }
        training_kwargs.update(training_cfg)
        _kwargs, reward_params = metainit
        algo = IRLTRPO(env=envs,
                       policy=policy,
                       irl_model=irl_model,
                       discount=discount,
                       sampler_args=dict(n_envs=venv.num_envs),
                       zero_environment_reward=True,
                       baseline=LinearFeatureBaseline(env_spec=envs.spec),
                       init_irl_params=reward_params,
                       train_irl=False,
                       n_itr=pol_itr,
                       **training_kwargs)
        with tf.Session(config=tf_cfg):
            # First round: just optimize the policy, do not update IRL model
            with rllab_logdir(algo=algo, dirname=osp.join(log_dir, 'pol')):
                with rl_logger.prefix('finetune policy |'):
                    algo.train()
                    pol_params = policy.get_param_values()
            # Second round: we have a good policy (generator), update IRL
            with rllab_logdir(algo=algo, dirname=osp.join(log_dir, 'all')):
                with rl_logger.prefix('finetune all |'):
                    algo.train_irl = True
                    algo.init_pol_params = pol_params
                    algo.n_itr = irl_itr
                    algo.train()
                    reward_params = irl_model.get_params()
            # Side-effect: forces policy to cache all parameters.
            # This ensures they are saved/restored during pickling.
            policy.get_params()
            # Must pickle policy rather than returning it directly,
            # since parameters in policy will not survive across tf sessions.
            policy_pkl = pickle.dumps(policy)
    reward = model_cfg, reward_params
    return reward, policy_pkl
def metalearn(venvs, trajectories, discount, seed, log_dir, *, tf_cfg,
              outer_itr=1000, lr=1e-2, model_cfg=None, policy_cfg=None,
              training_cfg=None, policy_per_task=False):
    """Reptile-style meta-learning of an AIRL reward model across tasks.

    Args:
        venvs: dict mapping task name -> vectorized gym environment.
        trajectories: dict mapping task name -> expert demonstrations.
        discount: RL discount factor.
        seed: TF random seed.
        log_dir: root log directory; one subdirectory per outer iteration.
        tf_cfg: TF session config.
        outer_itr: number of outer (meta) iterations.
        lr: Reptile step size for the meta reward update.
        model_cfg: IRL model spec; defaults to state-only AIRL.
        policy_cfg: policy spec; defaults to a (32, 32) Gaussian MLP.
        training_cfg: overrides for the per-task IRLTRPO training kwargs.
        policy_per_task: keep a separate policy parameter vector per task
            rather than one joint vector.

    Returns:
        reward = (model_kwargs, meta_reward_params), the meta-learned
        reward initialization.
    """
    # Fix: `training_cfg={}` was a mutable default shared across calls.
    training_cfg = {} if training_cfg is None else training_cfg
    envs = {k: TfEnv(VecGymEnv(v)) for k, v in venvs.items()}
    env_spec = list(envs.values())[0].spec
    num_envs = list(venvs.values())[0].num_envs
    tasks = list(envs.keys())
    experts = {k: _convert_trajectories(v) for k, v in trajectories.items()}
    train_graph = tf.Graph()
    with train_graph.as_default():
        tf.set_random_seed(seed)
        if model_cfg is None:
            model_cfg = {
                'model': AIRLStateOnly,
                'state_only': True,
                'max_itrs': 10
            }
        model_kwargs = dict(model_cfg)
        model_cls = model_kwargs.pop('model')
        irl_model = model_cls(env_spec=env_spec, **model_kwargs)
        if policy_cfg is None:
            policy_cfg = {
                'policy': GaussianMLPPolicy,
                'hidden_sizes': (32, 32)
            }
        else:
            policy_cfg = dict(policy_cfg)
        policy_fn = policy_cfg.pop('policy')
        policy = policy_fn(name='policy', env_spec=env_spec, **policy_cfg)
        pol_params = {}
        training_kwargs = {
            'n_itr': 10,
            'batch_size': 10000,
            'max_path_length': 500,
            'irl_model_wt': 1.0,
            'entropy_weight': 0.1,
            # paths substantially increase storage requirements
            'store_paths': False,
        }
        training_kwargs.update(training_cfg)
        algos = {
            k: IRLTRPO(env=env,
                       policy=policy,
                       irl_model=irl_model,
                       discount=discount,
                       sampler_args=dict(n_envs=num_envs),
                       zero_environment_reward=True,
                       baseline=LinearFeatureBaseline(env_spec=env_spec),
                       **training_kwargs)
            for k, env in envs.items()
        }
        with tf.Session(config=tf_cfg) as sess:
            sess.run(tf.global_variables_initializer())
            meta_reward_params = irl_model.get_params()
            for i in range(outer_itr):
                task = random.choice(tasks)
                pol_task = task if policy_per_task else None
                itr_logdir = osp.join(
                    log_dir, '{}_{}'.format(i, sanitize_env_name(task)))
                with rllab_logdir(algo=algos[task], dirname=itr_logdir):
                    with rl_logger.prefix('outer itr {} | task {}'.format(
                            i, task)):
                        irl_model.set_demos(experts[task])
                        # TODO: rather than specifying these as initializers,
                        # might be more efficient to have AIRL not overwrite
                        # these variables each call to train()?
                        algos[task].init_irl_params = meta_reward_params
                        algos[task].init_pol_params = pol_params.get(pol_task)
                        algos[task].train()
                        # Meta-update reward.
                        # {meta,task}_reward_params are lists of NumPy arrays.
                        task_reward_params = irl_model.get_params()
                        assert len(task_reward_params) == len(
                            meta_reward_params)
                        # Fix: the inner loop previously reused the names `i`
                        # and `task`, shadowing the outer iteration index and
                        # the current task name.
                        for j in range(len(task_reward_params)):
                            meta_p = meta_reward_params[j]
                            task_p = task_reward_params[j]
                            # Reptile update: meta <- meta + lr * (task - meta)
                            # TODO: use Adam optimizer?
                            meta_reward_params[j] = (
                                (1 - lr) * meta_p + lr * task_p)
                        # Store policy update (joint if not policy_per_task)
                        pol_params[pol_task] = policy.get_param_values()
    # NOTE(review): finetune() returns (model_cfg, reward_params) with the
    # 'model' key intact, while here 'model' has been popped from
    # model_kwargs; consumers seen so far (finetune) ignore the kwargs half,
    # so behavior is preserved — confirm before unifying.
    reward = model_kwargs, meta_reward_params
    return reward