def run_task(variant):
    from rllab.baselines.linear_feature_baseline import LinearFeatureBaseline
    from sandbox.rocky.tf.algos.vpg import VPG
    from sandbox.rocky.tf.policies.gaussian_mlp_policy import GaussianMLPPolicy
    from rllab.envs.box2d.cartpole_env import CartpoleEnv
    from sandbox.rocky.tf.envs.base import TfEnv

    env_name = variant['Environment']
    if env_name == 'Cartpole':
        env = TfEnv(CartpoleEnv())
    else:
        # Fail fast instead of hitting an UnboundLocalError on `env` below.
        raise ValueError('Unknown environment: {}'.format(env_name))

    policy = GaussianMLPPolicy(name="policy", env_spec=env.spec,
                               hidden_sizes=(100, 100))
    baseline = LinearFeatureBaseline(env_spec=env.spec)
    algorithm = VPG(
        env=env,
        policy=policy,
        baseline=baseline,
        n_itr=100,
        start_itr=0,
        batch_size=1000,
        max_path_length=1000,
        discount=0.99,
    )
    algorithm.train()
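# A minimal launcher sketch, assuming rllab's standard run_experiment_lite
# entry point; the exp_prefix and seed values here are illustrative, not
# taken from the original experiment.
from rllab.misc.instrument import run_experiment_lite

run_experiment_lite(
    run_task,
    exp_prefix="cartpole_vpg",  # hypothetical experiment name
    variant={'Environment': 'Cartpole'},
    n_parallel=1,
    snapshot_mode="last",
    seed=1,
)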
def run_train_task(vv):
    env = TfEnv(normalize(vv['env_class'](
        fix_goal=vv['fix_goal'],
    )))
    policy = GaussianMLPPolicy(
        name="policy",
        env_spec=env.spec,
        hidden_sizes=vv['hidden_sizes'],
        hidden_nonlinearity=vv['hidden_nonlinearity'],
        adaptive_std=vv['adaptive_policy_std'],
    )
    baseline = LinearFeatureBaseline(env_spec=env.spec)
    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=vv['batch_size'],
        max_path_length=vv['path_length'],
        n_itr=vv['n_itr'],
        discount=vv['discount'],
        step_size=vv["step_size"],
        force_batch_sampler=True,
    )
    algo.train()
def run_task(args, *_):
    metaworld_train_env = benchmark.get_train_tasks()
    wrapped_train_env = MetaworldWrapper(metaworld_train_env)
    env = TfEnv(wrapped_train_env)

    metaworld_test_env = benchmark.get_test_tasks()
    wrapped_test_env = MetaworldWrapper(metaworld_test_env)
    test_env = TfEnv(wrapped_test_env)

    policy = GaussianMLPPolicy(
        name="policy",
        env_spec=env.spec,
        min_std=1e-2,
        hidden_sizes=(150, 100, 50),
    )
    baseline = LinearFeatureBaseline(env_spec=env.spec)
    algo = TRPO(
        env=env,
        test_env=test_env,
        policy=policy,
        baseline=baseline,
        batch_size=20000,  # batch_size=100,
        force_batch_sampler=True,
        max_path_length=50,
        discount=1,
        step_size=0.02,
    )
    algo.train()
def generate_expert_dp():
    env = TfEnv(normalize(InvertedPendulumEnv()))
    policy = GaussianMLPPolicy(
        name="expert_policy",
        env_spec=env.spec,
        # The neural network policy has two hidden layers, each with 64 hidden units.
        hidden_sizes=(64, 64),
        std_hidden_sizes=(64, 64),
        adaptive_std=True,
    )
    baseline = LinearFeatureBaseline(env_spec=env.spec)
    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=4000,
        max_path_length=100,
        n_itr=64,
        discount=0.995,
        step_size=0.01,
        optimizer=ConjugateGradientOptimizer(
            hvp_approach=FiniteDifferenceHvp(base_eps=1e-5)),
        gae_lambda=0.97,
    )
    with tf.Session() as sess:
        algo.train(sess=sess)
        t = rollout(env=env, agent=policy, max_path_length=100, animated=False)
        print(sum(t['rewards']))
        with open('expert_dp.pickle', 'wb') as handle:
            pickle.dump(policy, handle)
        while True:
            rollout(env=env, agent=policy, max_path_length=100, animated=False)
def run_task(args, *_):
    # env = TfEnv(normalize(dnc_envs.create_stochastic('pick')))  # Cannot be solved easily by TRPO
    # env = TfEnv(normalize(CartpoleEnv()))
    env = TfEnv(CartpoleEnv())

    # metaworld_env = ML1.get_train_tasks("pick-place-v1")
    # tasks = metaworld_env.sample_tasks(1)
    # metaworld_env.set_task(tasks[0])
    # metaworld_env._observation_space = convert_gym_space(metaworld_env.observation_space)
    # metaworld_env._action_space = convert_gym_space(metaworld_env.action_space)
    # env = TfEnv(normalize(metaworld_env))

    policy = GaussianMLPPolicy(
        name="policy",
        env_spec=env.spec,
        min_std=1e-2,
        hidden_sizes=(150, 100, 50),
    )
    baseline = LinearFeatureBaseline(env_spec=env.spec)
    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=20000,  # batch_size=100,
        force_batch_sampler=True,
        max_path_length=50,
        discount=1,
        step_size=0.02,
    )
    algo.train()
def main(exp_name=None, fusion=False):
    env = TfEnv(CustomGymEnv('CustomAnt-v0', record_video=False, record_log=False))

    # Load ~2 iterations' worth of data from each forward RL experiment as demos.
    experts = load_latest_experts_multiple_runs('data/ant_data_collect', n=2)

    irl_model = AIRL(env=env, expert_trajs=experts, state_only=True,
                     fusion=fusion, max_itrs=10)
    policy = GaussianMLPPolicy(name='policy', env_spec=env.spec, hidden_sizes=(32, 32))
    algo = IRLTRPO(
        env=env,
        policy=policy,
        irl_model=irl_model,
        n_itr=1000,
        batch_size=10000,
        max_path_length=500,
        discount=0.99,
        store_paths=True,
        irl_model_wt=1.0,
        entropy_weight=0.1,
        zero_environment_reward=True,
        baseline=LinearFeatureBaseline(env_spec=env.spec),
    )
    with rllab_logdir(algo=algo, dirname='data/ant_state_irl/%s' % exp_name):
        with tf.Session():
            algo.train()
def main(exp_name=None, fusion=False, visible_gpus='0', discount=0.99):
    env = TfEnv(GymEnv('Swimmer-v3', record_video=False, record_log=False))

    gpu_options = tf.GPUOptions(allow_growth=True,
                                visible_device_list=visible_gpus)
    tf_config = tf.ConfigProto(inter_op_parallelism_threads=1,
                               intra_op_parallelism_threads=1,
                               gpu_options=gpu_options)

    experts = load_latest_experts('data/swimmer', n=5, visible_gpus=visible_gpus)

    # The original referenced args.fusion and args.visible_gpus, but no `args`
    # is in scope here; use the function parameters instead.
    irl_model = AIRL(discount=discount, env=env, expert_trajs=experts,
                     state_only=False, fusion=fusion, max_itrs=10)
    policy = GaussianMLPPolicy(name='policy', env_spec=env.spec, hidden_sizes=(32, 32))
    algo = IRLTRPO(
        env=env,
        policy=policy,
        irl_model=irl_model,
        n_itr=1000,
        batch_size=10000,
        max_path_length=1000,
        discount=discount,
        store_paths=True,
        discrim_train_itrs=50,
        irl_model_wt=1.0,
        entropy_weight=0.1,  # this should be 1.0 but 0.1 seems to work better
        zero_environment_reward=True,
        baseline=LinearFeatureBaseline(env_spec=env.spec),
    )
    with rllab_logdir(algo=algo, dirname='data/swimmer_airl_state_action'):
        with tf.Session(config=tf_config) as sess:
            algo.train(sess)
def main(exp_name, params_folder=None):
    env = TfEnv(CustomGymEnv('DisabledAnt-v0', record_video=False, record_log=False))

    irl_itr = 100  # earlier IRL iterations overfit less; 100 seems to work well
    params_file = os.path.join(DATA_DIR, '%s/itr_%d.pkl' % (params_folder, irl_itr))
    prior_params = load_prior_params(params_file)

    irl_model = AIRL(env=env, expert_trajs=None, state_only=True)
    policy = GaussianMLPPolicy(name='policy', env_spec=env.spec, hidden_sizes=(32, 32))
    algo = IRLTRPO(
        init_irl_params=prior_params,
        env=env,
        policy=policy,
        irl_model=irl_model,
        n_itr=1000,
        batch_size=10000,
        max_path_length=500,
        discount=0.99,
        store_paths=False,
        train_irl=False,
        irl_model_wt=1.0,
        entropy_weight=0.1,
        zero_environment_reward=True,
        baseline=LinearFeatureBaseline(env_spec=env.spec),
        log_params_folder=params_folder,
        log_experiment_name=exp_name,
    )
    with rllab_logdir(algo=algo, dirname='data/ant_transfer/%s' % exp_name):
        with tf.Session():
            algo.train()
def get_algo(env_name, use_eval, init_path, horizon, batch_size, n_itr,
             discount, step_size, gae):
    env = get_env(env_name)
    policy = GaussianMLPPolicy(
        name='policy',
        env_spec=env.spec,
        hidden_sizes=(32, 32),
        # output_nonlinearity=tf.nn.tanh
    )
    baseline = LinearFeatureBaseline(env_spec=env.spec)
    kwargs = dict(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=batch_size,
        max_path_length=horizon,
        n_itr=n_itr,
        discount=discount,
        step_size=step_size,
        gae_lambda=gae,
    )
    if use_eval:
        kwargs["reset_init_path"] = os.path.join(config.PROJECT_PATH,
                                                 get_eval_data_path[env_name])
        kwargs["horizon"] = horizon
    if init_path is not None:
        kwargs["initialized_path"] = init_path
    return TRPO(**kwargs)
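# Illustrative call to get_algo; every argument value here is an assumption
# (env_name must be a key understood by get_env / get_eval_data_path).
algo = get_algo(env_name='half_cheetah', use_eval=False, init_path=None,
                horizon=100, batch_size=4000, n_itr=100, discount=0.99,
                step_size=0.01, gae=0.97)
algo.train()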
def run_task(args, *_):
    env = TfEnv(normalize(dnc_envs.create_stochastic('pick')))  # Cannot be solved easily by TRPO
    policy = GaussianMLPPolicy(
        name="policy",
        env_spec=env.spec,
        min_std=1e-2,
        hidden_sizes=(150, 100, 50),
    )
    baseline = LinearFeatureBaseline(env_spec=env.spec)
    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=50000,
        force_batch_sampler=True,
        max_path_length=50,
        discount=1,
        step_size=0.02,
    )
    algo.train()
def run_task(*_):
    env = TfEnv(normalize(GymEnv("Reacher-v1", force_reset=True, record_video=True)))
    # env = TfEnv(normalize(PusherEnv()))
    policy = GaussianMLPPolicy(
        name="policy",
        env_spec=env.spec,
        # The neural network policy has two hidden layers, each with 128 hidden units.
        hidden_sizes=(128, 128))
    baseline = LinearFeatureBaseline(env_spec=env.spec)
    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=100 * 500,
        max_path_length=100,
        n_itr=200,
        discount=0.99,
        step_size=0.01,
        force_batch_sampler=True,
        # optimizer=ConjugateGradientOptimizer(hvp_approach=FiniteDifferenceHvp(base_eps=1e-5)),
    )
    algo.train()
def test_training(self):
    env = TfEnv(normalize(PointEnv()))
    tf.set_random_seed(22)
    np.random.seed(22)
    policy = GaussianMLPPolicy(name="policy",
                               env_spec=env.spec,
                               hidden_sizes=(16, 16),
                               hidden_nonlinearity=tf.nn.tanh)
    baseline = LinearFeatureBaseline(env_spec=env.spec)
    dynamics_model = MLPDynamicsModel("dyn_model", env, hidden_sizes=(16, 16))

    # fit dynamics model
    algo = ModelTRPO(
        env=env,
        policy=policy,
        dynamics_model=dynamics_model,
        baseline=baseline,
        batch_size_env_samples=5000,
        initial_random_samples=10000,
        batch_size_dynamics_samples=40000,
        max_path_length=100,
        dynamic_model_epochs=(30, 10),
        num_gradient_steps_per_iter=2,
        n_itr=20,
        discount=0.99,
        step_size=0.001,
    )
    algo.train()
def run_task(vv):
    env = TfEnv(normalize(GymEnv('HalfCheetah-v1', record_video=False, record_log=False)))
    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        # The neural network policy should have two hidden layers, each with 32 hidden units.
        hidden_sizes=(32, 32),
        name="policy")
    baseline = LinearFeatureBaseline(env_spec=env.spec)
    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=4000,
        max_path_length=100,
        n_itr=40,
        discount=0.99,
        step_size=vv["step_size"],
        # Uncomment both lines (this and the plot parameter below) to enable plotting
        # plot=True,
    )
    algo.train()
def run_train_task(vv):
    env = TfEnv(normalize(vv['env_class'](
        fix_goal=vv['fix_goal'],
        reward_type=vv['reward_type'],
        init_puck_low=INIT_PUCK_TARGET - vv['init_slack'],
        init_puck_high=INIT_PUCK_TARGET + vv['init_slack'],
        puck_goal_low=PUCK_GOAL_TARGET - vv['goal_slack'],
        puck_goal_high=PUCK_GOAL_TARGET + vv['goal_slack'],
    )))
    policy = GaussianMLPPolicy(name="policy",
                               env_spec=env.spec,
                               hidden_sizes=vv['hidden_sizes'],
                               hidden_nonlinearity=vv['hidden_nonlinearity'],
                               adaptive_std=vv['adaptive_policy_std'])
    baseline = LinearFeatureBaseline(env_spec=env.spec)
    algo = TRPO(env=env,
                policy=policy,
                baseline=baseline,
                batch_size=vv['batch_size'],
                max_path_length=vv['path_length'],
                n_itr=vv['n_itr'],
                discount=vv['discount'],
                step_size=vv["step_size"],
                force_batch_sampler=True)
    algo.train()
def run_expt(config):
    env_name = config['environment']
    env = get_env(env_name)
    experts = get_demos(env_name)
    irl_model = algo_string_to_model[config['algo']](env_spec=env.spec,
                                                     expert_trajs=experts)
    policy = GaussianMLPPolicy(name='policy', env_spec=env.spec, hidden_sizes=(32, 32))
    # use params for each env
    algo = IRLTRPO(
        env=env,
        policy=policy,
        irl_model=irl_model,
        n_itr=200,
        batch_size=2000 if env_name == 'pendulum' else 10000,
        max_path_length=100,
        discount=0.99,
        store_paths=True,
        discrim_train_itrs=50,
        irl_model_wt=1.0,
        entropy_weight=1.0 if env_name == 'pointmass' else 0.1,
        zero_environment_reward=True,
        baseline=LinearFeatureBaseline(env_spec=env.spec))
    dirname = DATA_DIR + "/" + "___".join(
        [str(k) + "=" + str(v) for k, v in config.items()])
    with rllab_logdir(algo=algo, dirname=dirname):
        with tf.Session():
            algo.train()
    # A little clumsy, but it's the easiest way: the rllab logger doesn't keep
    # data around after it's been written to disk.
    train_results = pd.read_csv(dirname + '/progress.csv')
    # Return OriginalTaskAverageReturn from the last iteration.
    output = config.copy()
    output['return'] = train_results.iloc[-1]['OriginalTaskAverageReturn']
    return output
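# Illustrative invocation of run_expt; the 'environment' and 'algo' keys are
# the ones read above, but these particular values are assumptions (the value
# of 'algo' must be a key of algo_string_to_model).
result = run_expt({'environment': 'pendulum', 'algo': 'airl'})
print(result['return'])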
def main():
    env = TfEnv(GymEnv('Ant-v1', record_video=False, record_log=False))
    experts = load_latest_experts('data/ant', n=50)

    irl_model = GCLDiscrim(
        env_spec=env.spec,
        expert_trajs=experts,
        discrim_arch=disentangled_net)
    policy = GaussianMLPPolicy(name='policy', env_spec=env.spec, hidden_sizes=(32, 32))
    algo = IRLTRPO(
        env=env,
        policy=policy,
        irl_model=irl_model,
        n_itr=2000,
        batch_size=10000,
        max_path_length=1000,
        discount=0.995,
        store_paths=True,
        discrim_train_itrs=50,
        irl_model_wt=1.0,
        entropy_weight=0.1,
        zero_environment_reward=True,
        baseline=LinearFeatureBaseline(env_spec=env.spec),
    )
    with rllab_logdir(algo=algo, dirname='data/ant_airl'):
        with tf.Session():
            algo.train()
def main():
    env = TfEnv(GymEnv('HRI_AirSim_Landing-v0', record_video=False, record_log=False))

    ### VGG 11/29/18: added support for CSV files.
    ## This method loads expert data saved as a pickle file:
    # experts = load_latest_experts('data/airsim_final', n=1)
    # This one uses CSV:
    experts = load_experts('data/airsim_human_data/log.csv', pickle_format=False)

    irl_model = GAIL(env_spec=env.spec, expert_trajs=experts)
    policy = GaussianMLPPolicy(name='policy', env_spec=env.spec, hidden_sizes=(32, 32))
    algo = IRLTRPO(
        env=env,
        policy=policy,
        irl_model=irl_model,
        n_itr=5000,
        batch_size=60,
        max_path_length=60,
        discount=0.99,
        store_paths=True,
        discrim_train_itrs=100,
        irl_model_wt=1.0,
        entropy_weight=0.0,  # GAIL should not use entropy unless for exploration
        zero_environment_reward=True,
        baseline=LinearFeatureBaseline(env_spec=env.spec),
        n_parallel=0)
    with rllab_logdir(algo=algo, dirname='data/airsim_gail'):
        with tf.Session():
            algo.train()
def main(exp_name, ent_wt=1.0):
    tf.reset_default_graph()
    env = TfEnv(CustomGymEnv('airl/CustomAnt-v0', record_video=False, record_log=False))
    policy = GaussianMLPPolicy(name='policy', env_spec=env.spec, hidden_sizes=(32, 32))
    with tf.Session(config=get_session_config()) as sess:
        algo = TRPO(
            env=env,
            sess=sess,
            policy=policy,
            n_itr=1500,
            batch_size=20000,
            max_path_length=500,
            discount=0.99,
            store_paths=True,
            entropy_weight=ent_wt,
            baseline=LinearFeatureBaseline(env_spec=env.spec),
            exp_name=exp_name,
        )
        with rllab_logdir(algo=algo, dirname='data/ant_data_collect/%s' % exp_name):
            algo.train()
def main(num_examples=50, discount=0.99):
    env = TfEnv(GymEnv('Pendulum-v0', record_video=False, record_log=False))
    experts = load_latest_experts('data/pendulum', n=num_examples)

    irl_model = GCLDiscrimTrajectory(env_spec=env.spec, expert_trajs=experts)
    policy = GaussianMLPPolicy(name='policy', env_spec=env.spec, hidden_sizes=(32, 32))
    algo = IRLTRPO(
        env=env,
        policy=policy,
        irl_model=irl_model,
        n_itr=200,
        batch_size=2000,
        max_path_length=100,
        discount=discount,
        store_paths=True,
        discrim_train_itrs=50,
        irl_model_wt=1.0,
        entropy_weight=0.1,  # this should be 1.0 but 0.1 seems to work better
        zero_environment_reward=True,
        baseline=LinearFeatureBaseline(env_spec=env.spec))
    with rllab_logdir(algo=algo, dirname='data/pendulum_traj'):
        with tf.Session():
            algo.train()
def main(exp_name=None, fusion=False, visible_gpus='0', discount=0.99):
    env = TfEnv(CustomGymEnv('CustomAnt-v0', record_video=False, record_log=False))

    gpu_options = tf.GPUOptions(allow_growth=True, visible_device_list=visible_gpus)
    tf_config = tf.ConfigProto(inter_op_parallelism_threads=1,
                               intra_op_parallelism_threads=1,
                               gpu_options=gpu_options)

    # Load ~2 iterations' worth of data from each forward RL experiment as demos.
    experts = load_latest_experts_multiple_runs('data/ant_data_collect', n=2,
                                                visible_gpus=visible_gpus)

    irl_model = AIRL(discount=discount, env=env, expert_trajs=experts,
                     state_only=True, fusion=fusion, max_itrs=10)
    policy = GaussianMLPPolicy(name='policy', env_spec=env.spec, hidden_sizes=(32, 32))
    algo = IRLTRPO(
        env=env,
        policy=policy,
        irl_model=irl_model,
        n_itr=1000,
        batch_size=10000,
        max_path_length=500,
        discount=discount,
        store_paths=True,
        irl_model_wt=1.0,
        entropy_weight=0.1,
        zero_environment_reward=True,
        baseline=LinearFeatureBaseline(env_spec=env.spec),
    )
    with rllab_logdir(algo=algo, dirname='data/ant_state_irl/%s' % exp_name):
        with tf.Session(config=tf_config) as sess:
            algo.train(sess)
def run_train_task(vv):
    env = TfEnv(normalize(CassieEnv(
        fixed_gains=vv['fixed_gains'],
        stability_cost_coef=vv['stability_cost_coef'],
        ctrl_cost_coef=vv['ctrl_cost_coef'],
        alive_bonus=vv['alive_bonus'])))
    policy = GaussianMLPPolicy(name="policy",
                               env_spec=env.spec,
                               hidden_sizes=vv['hidden_sizes'],
                               hidden_nonlinearity=vv['hidden_nonlinearity'])
    baseline = LinearFeatureBaseline(env_spec=env.spec)
    algo = TRPO(env=env,
                policy=policy,
                baseline=baseline,
                batch_size=vv['batch_size'],
                max_path_length=vv['path_length'],
                n_itr=vv['n_itr'],
                discount=vv['discount'],
                step_size=vv["step_size"],
                force_batch_sampler=True)
    algo.train()
def main():
    env = TfEnv(GymEnv('Pendulum-v0', record_video=False, record_log=False))
    experts = load_latest_experts('data/pendulum', n=5)

    irl_model = GAIL(env_spec=env.spec, expert_trajs=experts)
    policy = GaussianMLPPolicy(name='policy', env_spec=env.spec, hidden_sizes=(32, 32))
    algo = IRLTRPO(
        env=env,
        policy=policy,
        irl_model=irl_model,
        n_itr=200,
        batch_size=1000,
        max_path_length=100,
        discount=0.99,
        store_paths=True,
        discrim_train_itrs=50,
        irl_model_wt=1.0,
        entropy_weight=0.0,  # GAIL should not use entropy unless for exploration
        zero_environment_reward=True,
        baseline=LinearFeatureBaseline(env_spec=env.spec))
    with rllab_logdir(algo=algo, dirname='data/pendulum_gail'):
        with tf.Session():
            algo.train()
def _build_policy_from_rllab(self, env, n_actions):
    """Return both rllab policy and policy model function."""
    sess = self.tf_sess
    scope_name = self.scope_name

    # Initialize training_policy to copy from policy
    training_policy = GaussianMLPPolicy(
        name=scope_name,
        env_spec=env.spec,
        hidden_sizes=self.policy_params["hidden_layers"],
        init_std=self.policy_opt_params["trpo"]["init_std"],
        output_nonlinearity=eval(self.policy_params["output_nonlinearity"]))
    training_policy_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                             scope='training_policy')
    sess.run([tf.variables_initializer(training_policy_vars)])

    def policy_model(x, stochastic=1.0, collect_summary=False):
        dist_info_sym = training_policy.dist_info_sym(x, dict())
        mean_var = dist_info_sym["mean"]
        log_std_var = dist_info_sym["log_std"]
        # Add Gaussian exploration noise scaled by the policy's std.
        mean_var += stochastic * tf.random_normal(
            shape=(tf.shape(x)[0], n_actions)) * tf.exp(log_std_var)
        return mean_var

    return training_policy, policy_model
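# Sketch of how the returned policy_model might be wired up; obs_dim is an
# assumed placeholder dimension, and this is TF1-style graph construction.
obs_ph = tf.placeholder(tf.float32, shape=(None, obs_dim))
actions_op = policy_model(obs_ph)                   # mean + exploration noise
greedy_op = policy_model(obs_ph, stochastic=0.0)    # deterministic mean action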
def main(eval_reward=False):
    env = TfEnv(GymEnv('Pendulum-v0', record_video=False, record_log=False))
    n_experts = 10
    experts = load_latest_experts('plotting/pendulum_final', n=n_experts)
    dirname = 'data/pendulum'  # dir to save logs and images

    irl_model = AIRLStateAction(env_spec=env.spec, expert_trajs=experts)
    policy = GaussianMLPPolicy(name='policy', env_spec=env.spec, hidden_sizes=(32, 32))
    algo = IRLTRPO(
        env=env,
        policy=policy,
        irl_model=irl_model,
        n_itr=1000,
        batch_size=1000,
        max_path_length=100,
        discount=0.99,
        store_paths=True,
        discrim_train_itrs=50,
        irl_model_wt=1.0,
        entropy_weight=0.1,  # this should be 1.0 but 0.1 seems to work better
        zero_environment_reward=True,
        baseline=LinearFeatureBaseline(env_spec=env.spec),
        eval_reward=eval_reward,  # was hard-coded to True, ignoring the argument
        fig_dir=dirname,
    )
    # with rllab_logdir(algo=algo, dirname='data/pendulum_gcl{}'.format(n_experts)):
    with rllab_logdir(algo=algo, dirname=dirname):
        with tf.Session():
            algo.fig_dirname = dirname
            algo.train()
def main():
    env = TfEnv(CustomGymEnv('PointMazeLeft-v0'))
    experts = load_latest_experts('data/point', n=50)

    irl_model = GCLDiscrimTrajectory(env_spec=env.spec, expert_trajs=experts)
    policy = GaussianMLPPolicy(name='policy', env_spec=env.spec, hidden_sizes=(32, 32))
    algo = IRLTRPO(
        env=env,
        policy=policy,
        irl_model=irl_model,
        n_itr=2000,
        batch_size=10000,
        max_path_length=100,
        discount=0.99,
        store_paths=True,
        discrim_train_itrs=50,
        irl_model_wt=1.0,
        entropy_weight=0.1,  # this should be 1.0 but 0.1 seems to work better
        zero_environment_reward=True,
        baseline=LinearFeatureBaseline(env_spec=env.spec),
    )
    with rllab_logdir(algo=algo, dirname='data/point_traj'):
        with tf.Session():
            algo.train()
            # Evaluate inside the session. The original called
            # test_pointmaze(sess.run(policy)) with `sess` unbound; a policy
            # object is not a fetchable tensor, so pass it directly.
            test_pointmaze(policy)
def main(exp_name, ent_wt=0.1, visible_gpus='0', discount=0.99):
    gpu_options = tf.GPUOptions(allow_growth=True, visible_device_list=visible_gpus)
    tf_config = tf.ConfigProto(inter_op_parallelism_threads=1,
                               intra_op_parallelism_threads=1,
                               gpu_options=gpu_options)
    env = TfEnv(GymEnv('Swimmer-v3', record_video=False, record_log=False))
    policy = GaussianMLPPolicy(name='policy', env_spec=env.spec, hidden_sizes=(32, 32))
    with tf.Session(config=tf_config) as sess:
        algo = TRPO(
            env=env,
            policy=policy,
            n_itr=3000,
            batch_size=20000,
            max_path_length=1000,
            discount=discount,
            store_paths=True,
            baseline=LinearFeatureBaseline(env_spec=env.spec),
            step_size=0.01,
            entropy_weight=ent_wt,
            sess=sess,
            exp_name=exp_name,
        )
        with rllab_logdir(algo=algo, dirname='data/swimmer'):
            algo.train(sess)
def main():
    env = TfEnv(GymEnv('HRI_AirSim_Landing-v0', record_video=False, record_log=False))
    experts = load_latest_experts('data/airsim', n=5)

    irl_model = AIRLStateAction(env_spec=env.spec, expert_trajs=experts)
    policy = GaussianMLPPolicy(name='policy', env_spec=env.spec, hidden_sizes=(32, 32))
    algo = IRLTRPO(
        env=env,
        policy=policy,
        irl_model=irl_model,
        n_itr=10,
        batch_size=100,
        max_path_length=100,
        discount=0.99,
        store_paths=True,
        discrim_train_itrs=50,
        irl_model_wt=1.0,
        entropy_weight=0.1,  # this should be 1.0 but 0.1 seems to work better
        zero_environment_reward=True,
        baseline=LinearFeatureBaseline(env_spec=env.spec))
    with rllab_logdir(algo=algo, dirname='data/airsim_gcl'):
        with tf.Session():
            algo.train()
def test_multiagent_ngsim_env(self):
    basedir = os.path.expanduser('~/.julia/packages/NGSIM/9OYUa/data')
    filename = 'trajdata_i101_trajectories-0750am-0805am.txt'
    filepaths = [os.path.join(basedir, filename)]
    n_veh = 5
    env = JuliaEnv(env_id='MultiagentNGSIMEnv',
                   env_params=dict(n_veh=n_veh,
                                   trajectory_filepaths=filepaths,
                                   H=200,
                                   primesteps=50),
                   using='AutoEnvs')
    low, high = env.action_space.low, env.action_space.high
    env = TfEnv(env)
    policy = GaussianMLPPolicy(name="policy",
                               env_spec=env.spec,
                               hidden_sizes=(32, 32),
                               std_hidden_sizes=(32, 32),
                               adaptive_std=True,
                               output_nonlinearity=None,
                               learn_std=True)
    baseline = ZeroBaseline(env_spec=env.spec)
    algo = TRPO(env=env,
                policy=policy,
                baseline=baseline,
                n_itr=1,
                batch_size=1000,
                sampler_args=dict(n_envs=n_veh))
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        try:
            algo.train(sess=sess)
        except Exception as e:
            self.fail('exception incorrectly raised: {}'.format(e))
def run_svrg(*_):
    envir = env_name_map[env_name]
    env = TfEnv(normalize(GymEnv(envir, record_video=False, force_reset=True)))
    policy = GaussianMLPPolicy(
        name="policy",
        env_spec=env.spec,
        hidden_sizes=(64, 64),
    )
    policy_tilde = GaussianMLPPolicy(
        name="policy_tilde",
        env_spec=env.spec,
        hidden_sizes=(64, 64),
    )
    baseline = LinearFeatureBaseline(env_spec=env.spec)
    algo = SVRGPG(env=env,
                  policy=policy,
                  policy_tilde=policy_tilde,
                  baseline=baseline,
                  batch_size=batch_size,
                  max_path_length=max_path_length,
                  n_itr=n_itr,
                  discount=0.995,
                  delta=delta,
                  optimizer_args=dict(
                      batch_size=mini_batch_size,
                      max_epochs=1,
                      epsilon=1e-8,
                      use_SGD=False,
                      cg_iters=cg_iters,
                      subsample_factor=subsample_factor,
                      max_batch=max_batch,
                  ))
    print("run svrg cg for env {:}".format(env_name))
    print("max_epochs {:}".format(max_epochs))
    print("cg_iters {:}".format(cg_iters))
    print("step size: {:}".format(delta))
    print("max_path_length: {:}".format(max_path_length))
    print("Num of Iterations: {:}".format(n_itr))
    print("Num of Examples: {:}".format(batch_size))
    print("sub sample rate: {:}".format(subsample_factor))
    print("batch size: {:}".format(mini_batch_size))
    print("max num batches: {:}".format(max_batch))
    return algo
def main(exp_name=None, fusion=True):
    # env = TfEnv(CustomGymEnv('CustomAnt-v0', record_video=False, record_log=False))
    env = TfEnv(CustomGymEnv('CustomAnt-v0', record_video=False, record_log=True))

    # Load ~2 iterations' worth of data from each forward RL experiment as demos.
    experts = load_latest_experts_multiple_runs('data/ant_data_collect', n=2)
    # experts = load_latest_experts('data/ant_data_collect', n=5)

    # qvar: inverse model q(a|s,s')
    qvar = GaussianMLPInversePolicy(name='qvar_model', env_spec=env.spec,
                                    hidden_sizes=(32, 32))
    qvar_model = Qvar(env=env, qvar=qvar, expert_trajs=experts, fusion=True,
                      max_itrs=10)

    # Empowerment-based Adversarial Inverse Reinforcement Learning; set score_discrim=True.
    irl_model = EAIRL(env=env, expert_trajs=experts, state_only=False,
                      fusion=fusion, max_itrs=10, score_discrim=True)

    # Empowerment-based potential function: gamma * Phi(s') - Phi(s).
    empw_model = Empowerment(env=env, fusion=True, max_itrs=4)
    t_empw_model = Empowerment(env=env, scope='t_efn', fusion=True, max_itrs=2,
                               name='empowerment2')

    policy = GaussianMLPPolicy(name='policy', env_spec=env.spec, hidden_sizes=(32, 32))
    algo = IRLTRPO(
        env=env,
        policy=policy,
        empw=empw_model,
        tempw=t_empw_model,
        qvar_model=qvar_model,
        irl_model=irl_model,
        n_itr=3000,  # 130,
        batch_size=20000,
        max_path_length=500,
        discount=0.99,
        store_paths=True,
        target_empw_update=5,
        irl_model_wt=1.0,
        entropy_weight=0.1,
        lambda_i=1.0,
        zero_environment_reward=True,
        baseline=LinearFeatureBaseline(env_spec=env.spec),
        plot=False)
    with rllab_logdir(algo=algo, dirname='data/ant_state_irl'):
        # For multiple runs, log per experiment instead:
        # with rllab_logdir(algo=algo, dirname='data/ant_state_irl/%s' % exp_name):
        with tf.Session():
            algo.train()