# Shared imports for the scripts below, assumed from the inverse_rl / rllab
# codebase these snippets are built on. Project-specific helpers used further
# down (CustomGymEnv, AIRL, shaped_airl, load_latest_experts_walky,
# min_layers_hidsize_for, min_layers_hidsize_polstd_for, irltrpo_params_for,
# get_name, load_prior_params, get_session_config, rollout, log) are assumed
# importable from the surrounding project.
import os
import subprocess

import joblib
import numpy as np
import tensorflow as tf
import tqdm

from inverse_rl.algos.irl_trpo import IRLTRPO
from inverse_rl.models.imitation_learning import GAIL
from inverse_rl.utils.log_utils import (load_experts, load_latest_experts,
                                        rllab_logdir)
from rllab.baselines.linear_feature_baseline import LinearFeatureBaseline
from rllab.envs.gym_env import GymEnv
from sandbox.rocky.tf.envs.base import TfEnv
from sandbox.rocky.tf.policies.gaussian_mlp_policy import GaussianMLPPolicy


def main():
    env = TfEnv(GymEnv('HRI_AirSim_Landing-v0',
                       record_video=False,
                       record_log=False))
    ### VGG 11/29/18: Added support for CSV files.
    ## This call loads expert data saved as a pickle file:
    # experts = load_latest_experts('data/airsim_final', n=1)
    # This one uses CSV instead:
    experts = load_experts('data/airsim_human_data/log.csv',
                           pickle_format=False)

    irl_model = GAIL(env_spec=env.spec, expert_trajs=experts)
    policy = GaussianMLPPolicy(name='policy',
                               env_spec=env.spec,
                               hidden_sizes=(32, 32))
    algo = IRLTRPO(
        env=env,
        policy=policy,
        irl_model=irl_model,
        n_itr=5000,
        batch_size=60,
        max_path_length=60,
        discount=0.99,
        store_paths=True,
        discrim_train_itrs=100,
        irl_model_wt=1.0,
        entropy_weight=0.0,  # GAIL should not use entropy unless for exploration
        zero_environment_reward=True,
        baseline=LinearFeatureBaseline(env_spec=env.spec),
        n_parallel=0)

    with rllab_logdir(algo=algo, dirname='data/airsim_gail'):
        with tf.Session():
            algo.train()
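
# A minimal sketch of the expert-trajectory format that load_experts is
# assumed to produce from log.csv: inverse_rl imitation models such as GAIL
# consume expert_trajs as a list of per-trajectory dicts holding
# 'observations' and 'actions' arrays. The csv_to_expert_trajs helper and its
# column names are illustrative, not the actual log.csv schema.
import pandas as pd


def csv_to_expert_trajs(csv_path, obs_cols, act_cols, traj_col='episode'):
    """Group a flat CSV log into per-trajectory observation/action arrays."""
    df = pd.read_csv(csv_path)
    trajs = []
    for _, ep in df.groupby(traj_col):
        trajs.append({
            'observations': ep[obs_cols].to_numpy(dtype=np.float32),
            'actions': ep[act_cols].to_numpy(dtype=np.float32),
        })
    return trajs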
def main():
    env = TfEnv(GymEnv('Pendulum-v0', record_video=False, record_log=False))
    experts = load_latest_experts('data/pendulum', n=5)

    irl_model = GAIL(env_spec=env.spec, expert_trajs=experts)
    policy = GaussianMLPPolicy(name='policy',
                               env_spec=env.spec,
                               hidden_sizes=(32, 32))
    algo = IRLTRPO(
        env=env,
        policy=policy,
        irl_model=irl_model,
        n_itr=200,
        batch_size=1000,
        max_path_length=100,
        discount=0.99,
        store_paths=True,
        discrim_train_itrs=50,
        irl_model_wt=1.0,
        entropy_weight=0.0,  # GAIL should not use entropy unless for exploration
        zero_environment_reward=True,
        baseline=LinearFeatureBaseline(env_spec=env.spec))

    with rllab_logdir(algo=algo, dirname='data/pendulum_gail'):
        with tf.Session():
            algo.train()
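
# A sketch of how the 'data/pendulum' snapshots consumed by
# load_latest_experts above could be produced: train a TRPO expert with
# store_paths=True so each itr_N.pkl snapshot carries its sampled paths.
# This assumes the standard rllab TRPO; the hyperparameters mirror the GAIL
# script and are otherwise arbitrary.
from sandbox.rocky.tf.algos.trpo import TRPO


def collect_pendulum_experts():
    env = TfEnv(GymEnv('Pendulum-v0', record_video=False, record_log=False))
    policy = GaussianMLPPolicy(name='expert_policy',
                               env_spec=env.spec,
                               hidden_sizes=(32, 32))
    algo = TRPO(env=env,
                policy=policy,
                n_itr=200,
                batch_size=1000,
                max_path_length=100,
                discount=0.99,
                store_paths=True,  # saved paths become the "expert" data
                baseline=LinearFeatureBaseline(env_spec=env.spec))
    with rllab_logdir(algo=algo, dirname='data/pendulum'):
        with tf.Session():
            algo.train()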
def main(exp_name,
         rundir='data',
         irl_pkl='',
         ent_wt=1.0,
         trpo_anneal_steps=None,
         trpo_anneal_init_ent=None,
         trpo_step=0.01,
         init_pol_std=1.0,
         method=None,
         hid_size=None,
         hid_layers=None,
         switch_env=None):
    orig_env_name = get_name(irl_pkl)
    if switch_env is not None:
        this_env_name = switch_env
    else:
        this_env_name = orig_env_name
    print("Running on environment '%s'" % this_env_name)
    env = TfEnv(CustomGymEnv(this_env_name,
                             record_video=False,
                             record_log=False))

    if hid_size is None or hid_layers is None:
        assert hid_size is None and hid_layers is None, \
            "must specify both size & layers, not one or the other"
        hid_layers, hid_size, init_pol_std \
            = min_layers_hidsize_polstd_for(orig_env_name)
    env_trpo_params = irltrpo_params_for(orig_env_name, 'retrain')

    folder = os.path.dirname(irl_pkl)
    prior_params = load_prior_params(irl_pkl)
    expert_dir = os.path.join(rundir, 'env_%s/' % orig_env_name.lower())
    experts = load_latest_experts_walky(expert_dir, n=5)

    # For some reason IRLTRPO is responsible for setting weights in this code.
    # It would equally be possible to run global_variables_initializer()
    # ourselves and then do irl_model.set_params(prior_params) if we just
    # wanted to query energy, reward, etc. from the trained AIRL model without
    # using IRLTRPO.
    disc_net_kwargs = {
        'layers': hid_layers,
        'd_hidden': hid_size,
    }
    if method in {'airl', 'vairl'}:
        irl_model = AIRL(env=env,
                         expert_trajs=experts,
                         state_only=True,
                         freeze=True,
                         vairl=method == 'vairl',
                         vairl_beta=1e-4,
                         discrim_arch_args=disc_net_kwargs,
                         fitted_value_fn_arch_args=disc_net_kwargs)
    elif method in {'gail', 'vail'}:
        irl_model = GAIL(env,
                         expert_trajs=experts,
                         discrim_arch_args=disc_net_kwargs,
                         name=method,
                         freeze=True,
                         vail=method == 'vail')
    else:
        raise NotImplementedError("Don't know how to handle method '%s'" %
                                  method)

    pol_hid_sizes = (hid_size, ) * hid_layers
    policy = GaussianMLPPolicy(name='policy',
                               env_spec=env.spec,
                               hidden_sizes=pol_hid_sizes,
                               init_std=init_pol_std)
    irltrpo_kwargs = dict(
        env=env,
        policy=policy,
        irl_model=irl_model,
        discount=0.99,
        store_paths=True,
        discrim_train_itrs=50,
        irl_model_wt=1.0,
        entropy_weight=ent_wt,  # should be 1.0, but 0.1 seems to work better
        step_size=trpo_step,
        zero_environment_reward=True,
        baseline=LinearFeatureBaseline(env_spec=env.spec),
        init_irl_params=prior_params,
        force_batch_sampler=True,
        entropy_anneal_init_weight=trpo_anneal_init_ent,
        entropy_anneal_steps=trpo_anneal_steps,
        retraining=True)
    irltrpo_kwargs.update(env_trpo_params)
    algo = IRLTRPO(**irltrpo_kwargs)

    folder_suffix = ''
    if switch_env is not None:
        # Append the lower-case environment name to the retrain folder path.
        folder_suffix = '_%s' % switch_env.lower()
    with rllab_logdir(algo=algo,
                      dirname='%s/retrain%s' % (folder, folder_suffix)):
        with tf.Session():
            algo.train()
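
# A sketch of what load_prior_params is assumed to do, based on the snapshot
# format the evaluation script below relies on (joblib pickles keyed by
# 'irl_params' and 'policy'). The _sketch suffix marks this as a hypothetical
# stand-in for the project's real helper.
def load_prior_params_sketch(pkl_path):
    """Pull frozen discriminator weights out of an rllab itr_N.pkl snapshot."""
    data = joblib.load(pkl_path)
    return data['irl_params']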
def main(rundir='data',
         irl_pkl='',
         pol_pkl=None,
         method=None,
         hid_size=None,
         hid_layers=None,
         switch_env=None):
    print('irl_pkl =', irl_pkl, 'and pol_pkl =', pol_pkl)
    orig_env_name = get_name(irl_pkl)
    if switch_env is not None:
        this_env_name = switch_env
    else:
        this_env_name = orig_env_name
    print("Running on environment '%s'" % this_env_name)
    env = TfEnv(CustomGymEnv(this_env_name,
                             record_video=False,
                             record_log=False))

    if hid_size is None or hid_layers is None:
        # We want the hidden size & layer count for the *original*
        # environment, since that's what the IRL model we're trying to
        # reconstruct was trained on.
        assert hid_size is None and hid_layers is None, \
            "must specify both size & layers, not one or the other"
        hid_layers, hid_size = min_layers_hidsize_for(orig_env_name)
    # We want the trajectory length for the new environment rather than the
    # original one, though.
    traj_length = irltrpo_params_for(this_env_name,
                                     'retrain')['max_path_length']
    print('Horizon is', traj_length)

    expert_dir = os.path.join(rundir, 'env_%s/' % orig_env_name.lower())
    experts = load_latest_experts_walky(expert_dir, n=1)

    with tf.Session(config=get_session_config(), graph=tf.Graph()):
        irl_pkl_data = joblib.load(irl_pkl)
        disc_net_kwargs = {
            'layers': hid_layers,
            'd_hidden': hid_size,
        }
        if method in {'airl', 'vairl'}:
            irl_model = AIRL(env=env,
                             expert_trajs=experts,
                             state_only=True,
                             freeze=True,
                             vairl=method == 'vairl',
                             vairl_beta=1e-4,
                             discrim_arch_args=disc_net_kwargs,
                             fitted_value_fn_arch_args=disc_net_kwargs)
        elif method in {'gail', 'vail'}:
            irl_model = GAIL(env,
                             expert_trajs=experts,
                             discrim_arch_args=disc_net_kwargs,
                             name=method,
                             freeze=True,
                             vail=method == 'vail')
        else:
            raise NotImplementedError("Don't know how to handle method '%s'" %
                                      method)
        irl_model.set_params(irl_pkl_data['irl_params'])

        if pol_pkl is not None:
            with tf.variable_scope('please-work'):
                pol_pkl_data = joblib.load(pol_pkl)
                policy = pol_pkl_data['policy']
            print('Using policy loaded from %s' % pol_pkl)
        else:
            print('Using original IRL policy')
            policy = irl_pkl_data['policy']

        # Do a few rollouts with the given policy on the given reward, and
        # report both the IRL return and the environment return.
        n_rollouts = 30
        irl_rets = np.zeros((n_rollouts, ))
        env_rets = np.zeros((n_rollouts, ))
        for i in tqdm.trange(n_rollouts):
            path = rollout(env, policy, max_path_length=traj_length)
            env_rets[i] = np.sum(path['rewards'])
            irl_rew = irl_model.eval([path])
            irl_rets[i] = np.sum(irl_rew)
        print('Env mean %.2f (std %.2f)' % (np.mean(env_rets),
                                            np.std(env_rets)))
        print('IRL mean %.2f (std %.2f)' % (np.mean(irl_rets),
                                            np.std(irl_rets)))
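
# Optional follow-up diagnostic (not in the original script): with paired
# per-rollout returns in hand, one number summarizes how well the learned IRL
# reward tracks the environment reward.
def return_correlation(env_rets, irl_rets):
    """Pearson correlation between per-rollout environment and IRL returns."""
    return np.corrcoef(env_rets, irl_rets)[0, 1]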
        # Inside the per-file loading loop: keep only non-empty dataframes.
        if np.any(np.array(df.shape) == 0):  # skip empty/degenerate dataframes
            log.debug(file_name)
        else:
            log.debug(df.shape)
            trajectories.append(df)
            experts.append(data_dict)

    log.info("trajs : {}".format(len(trajectories)))
    tf.reset_default_graph()
    # if not osp.exists(gail_iter_path):
    #     os.makedirs(gail_iter_path)
    # rllog.set_snapshot_dir(gail_iter_path)
    with tf.Session():
        env = TfEnv(env)
        irl_model = GAIL(env_spec=env.spec, expert_trajs=experts)
        policy = GaussianMLPPolicy(name='policy',
                                   env_spec=env.spec,
                                   hidden_sizes=(64, 64))
        # policy._mean_network = iter_data['policy']._mean_network
        algo = IRLTRPO(
            env=env,
            policy=policy,
            irl_model=irl_model,
            n_itr=t_iter,
            batch_size=batch_size,
            max_path_length=max_path_length,
            discount=0.99,
            store_paths=True,
            discrim_train_itrs=75,
            irl_model_wt=1.0,
            entropy_weight=0.0,  # GAIL should not use entropy unless for exploration
            zero_environment_reward=True,
            # (assumed) closing arguments as in the sibling scripts
            baseline=LinearFeatureBaseline(env_spec=env.spec))
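
# The commented-out line above hints at warm-starting the policy from an
# earlier snapshot. A sketch of one way to do that, assuming rllab-style
# itr_N.pkl snapshots that store the policy under 'policy' (the snapshot path
# is hypothetical):
def load_snapshot_policy(snapshot_path='data/gail_iter/itr_99.pkl'):
    """Reload a previously trained policy instead of copying its network."""
    iter_data = joblib.load(snapshot_path)
    return iter_data['policy']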
def main(exp_name,
         rundir,
         ent_wt=1.0,
         env_name='Shaped_PM_MazeRoom_Small-v0',
         method="airl",
         beta=1e-2,
         disc_step=1e-3,
         disc_iters=100,
         disc_batch_size=32,
         disc_gp=None,
         trpo_step=1e-2,
         init_pol_std=1.0,
         adaptive_beta=False,
         target_kl=0.5,
         beta_step=1e-6,
         hid_size=None,
         hid_layers=None,
         max_traj=None):
    os.makedirs(rundir, exist_ok=True)
    if hid_size is None or hid_layers is None:
        assert hid_size is None and hid_layers is None, \
            "must specify both size & layers, not one or the other"
        hid_layers, hid_size, init_pol_std \
            = min_layers_hidsize_polstd_for(env_name)
    env_trpo_params = irltrpo_params_for(env_name, 'irl')

    env = TfEnv(CustomGymEnv(env_name, record_video=False, record_log=False))
    expert_dir = os.path.join(rundir, 'env_%s/' % env_name.lower())
    experts = load_latest_experts_walky(expert_dir, n=5, max_traj=max_traj)

    disc_net_kwargs = {
        'layers': hid_layers,
        'd_hidden': hid_size,
    }
    if method in {'airl', 'vairl'}:
        is_vairl = method == 'vairl'
        irl_model = shaped_airl.AIRL(
            env=env,
            expert_trajs=experts,
            state_only=True,
            fusion=True,
            discrim_arch_args=disc_net_kwargs,
            fitted_value_fn_arch_args=disc_net_kwargs,
            gp_coeff=disc_gp,
            # VAIRL flag
            vairl=is_vairl,
            # VAIRL fixed-beta settings
            vairl_beta=beta,
            # VAIRL adaptive-beta settings
            vairl_adaptive_beta=adaptive_beta,
            vairl_beta_step_size=beta_step,
            vairl_kl_target=target_kl)
    elif method in {'gail', 'vail'}:
        is_vail = method == 'vail'
        assert disc_gp is None, "no GAIL/VAIL support for GP coeff"
        irl_model = GAIL(
            env,
            expert_trajs=experts,
            discrim_arch_args=disc_net_kwargs,
            name=method,
            # VAIL settings (only adaptive beta for VAIL; no fixed beta like
            # VAIRL)
            vail=is_vail,
            vail_init_beta=beta,
            vail_beta_step_size=beta_step,
            vail_kl_target=target_kl)
    else:
        raise NotImplementedError("don't know how to handle method '%s'" %
                                  (method, ))

    pol_hid_sizes = (hid_size, ) * hid_layers
    policy = GaussianMLPPolicy(name='policy',
                               env_spec=env.spec,
                               hidden_sizes=pol_hid_sizes,
                               init_std=init_pol_std)
    irltrpo_kwargs = dict(
        env=env,
        policy=policy,
        irl_model=irl_model,
        discount=0.99,
        store_paths=True,
        discrim_train_itrs=disc_iters,
        irl_model_wt=1.0,
        irl_lr=disc_step,
        irl_batch_size=disc_batch_size,
        step_size=trpo_step,
        entropy_weight=ent_wt,  # should be 1.0, but 0.1 seems to work better
        force_batch_sampler=True,
        zero_environment_reward=True,
        baseline=LinearFeatureBaseline(env_spec=env.spec),
    )
    irltrpo_kwargs.update(env_trpo_params)
    n_itr = irltrpo_kwargs['n_itr']
    print('irltrpo_kwargs:', irltrpo_kwargs)
    algo = IRLTRPO(**irltrpo_kwargs)

    run_name = 'env_{env_name}_{method}'.format(env_name=env_name.lower(),
                                                method=method)
    exp_folder = os.path.join(rundir, '%s/%s' % (run_name, exp_name))
    with rllab_logdir(algo=algo, dirname=exp_folder):
        with tf.Session():
            algo.train()

    # After IRL training, immediately retrain a fresh policy against the
    # recovered (frozen) reward by shelling out to env_retrain.py.
    this_dir = os.path.dirname(__file__)
    maze_retrain_path = os.path.join(this_dir, 'env_retrain.py')
    latest_irl_snap = '%s/itr_%d.pkl' % (exp_folder, n_itr - 1)
    subproc_cmd = [
        # script
        'python', maze_retrain_path,
        # experiment info
        latest_irl_snap, '--rundir', rundir,
        # TRPO params
        '--trpo-step', '%f' % trpo_step,
        '--trpo-ent', '%f' % ent_wt,
        # network params
        '--hid-layers', '%d' % hid_layers,
        '--hid-size', '%d' % hid_size,
        # we don't care about the precise args relevant to the given method
        # because we're just reloading a frozen model
        '--method', method,
    ]
    subprocess.run(subproc_cmd, check=True)
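
# A hypothetical direct invocation of the IRL-then-retrain pipeline above;
# exp_name and rundir are arbitrary, and the remaining arguments fall back to
# their defaults.
if __name__ == '__main__':
    main(exp_name='vairl_run0',
         rundir='data',
         method='vairl',
         adaptive_beta=True)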