def _launch_ec2(func, exp_prefix, exp_name, params, run_experiment_kwargs):
    print("Launching task", exp_name)
    kwargs = dict(
        n_parallel=1,
        snapshot_mode="last",
        seed=params.get("seed", None),
        mode="ec2")
    kwargs.update(run_experiment_kwargs)
    kwargs.update(
        dict(
            exp_prefix=exp_prefix,
            exp_name=exp_name,
            variant=params,
            confirm_remote=False))
    run_experiment(func, **kwargs)
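# --- Hypothetical usage sketch (not from the original script) ---
# A minimal example of how the `_launch_ec2` helper above might be invoked.
# `my_run_task`, the hyperparameters, and the experiment names below are
# illustrative assumptions, not values taken from the original code.
def my_run_task(variant):
    # Build the environment/policy/algorithm from `variant` and train it here.
    pass


_launch_ec2(
    my_run_task,
    exp_prefix="demo_exp",                            # assumed prefix
    exp_name="demo_exp_step0.01_s1",                  # assumed name
    params=dict(step_size=0.01, seed=1),              # assumed hyperparameters
    run_experiment_kwargs=dict(snapshot_mode="all"),  # overrides the default "last"
)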
        hidden_nonlinearity=tf.nn.relu)

    ddpg = DDPG(
        env,
        actor=actor_net,
        actor_lr=1e-4,
        critic_lr=1e-3,
        critic=critic_net,
        plot=True,
        target_update_tau=1e-2,
        n_epochs=500,
        n_epoch_cycles=10,
        n_rollout_steps=50,
        n_train_steps=50,
        discount=0.9,
        replay_buffer_size=int(1e6),
        min_buffer_size=int(1e3),
        exploration_strategy=action_noise,
        actor_optimizer=tf.train.AdamOptimizer,
        critic_optimizer=tf.train.AdamOptimizer)

    ddpg.train(sess=sess)


run_experiment(
    run_task,
    n_parallel=1,
    exp_prefix="ddpg_point_compose",
    seed=1,
    plot=True,
)
skill_integration_idx = CategoricalMLPSkillIntegrator.get_index_of_method_str(
    skill_integration_method)

# Seed
seed = seed if args.seed == 'keep' \
       else None if args.seed == 'random' \
       else int(args.seed)

# Launch training
run_experiment(
    run_task,
    # Configure TF
    use_tf=True,
    use_gpu=True,
    # Name experiment
    exp_prefix='asa-resume-with-new-skill',
    exp_name=exp_name_direct or \
             (datetime.now().strftime('%Y_%m_%d-%H_%M')
              + '--resumed_' + snapshot_name
              + (('--' + exp_name_extra) if exp_name_extra else '')
              + '--skill_' + skill_policy_exp_name
              + '--integ' + str(skill_integration_idx)
              + '_' + skill_integration_method
              + (('--s' + str(seed)) if seed else '')
             ),
    # Number of parallel workers for sampling
    n_parallel=0,
    # Snapshot information
    snapshot_mode="all",
    # Specifies the seed for the experiment (random seed if None)
    seed=seed
)
    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=v.batch_size,  # 4096
        max_path_length=v.max_path_length,
        n_itr=1000,
        discount=0.99,
        step_size=0.01,
        plot=True,
        # optimizer_args=dict(max_grad_norm=0.5)
    )
    algo.train()


config = dict(
    batch_size=4096,
    max_path_length=100,  # 50
    policy_init_std=0.1,  # 1.0
)

run_experiment(
    run_task,
    exp_prefix='sawyer_reach_trpo_torque',
    n_parallel=8,
    seed=1,
    variant=config,
    plot=True,
)
from garage.theano.envs import TheanoEnv
from garage.theano.policies import GaussianMLPPolicy


def run(*_):
    """Stub method for running trpo."""
    env = TheanoEnv(
        ReacherEnv(control_method='position_control', sparse_reward=False))
    policy = GaussianMLPPolicy(env_spec=env.spec, hidden_sizes=(32, 32))
    baseline = LinearFeatureBaseline(env_spec=env.spec)
    algo = TRPO(
        env=env,
        policy=policy,
        batch_size=4000,
        max_path_length=100,
        baseline=baseline,
        n_itr=2500,
        discount=0.99,
        step_size=0.01,
        plot=True,
        force_batch_sampler=True,
    )
    algo.train()


run_experiment(
    run,
    n_parallel=2,
    plot=True,
)
        domain_name='cartpole', task_name='balance', visualize_reward=True))
    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=(32, 32),
    )
    baseline = LinearFeatureBaseline(env_spec=env.spec)
    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=4000,
        max_path_length=100,
        n_itr=400,
        discount=0.99,
        step_size=0.01,
        plot=True,
    )
    algo.train()


run_experiment(
    run_task,
    n_parallel=1,
    snapshot_mode="last",
    plot=True,
)
from garage.theano.optimizers import FiniteDifferenceHvp
from garage.theano.policies import GaussianGRUPolicy


def run_task(*_):
    env = TheanoEnv(normalize(CartpoleEnv()))
    policy = GaussianGRUPolicy(env_spec=env.spec)
    baseline = LinearFeatureBaseline(env_spec=env.spec)
    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=4000,
        max_path_length=100,
        n_itr=10,
        discount=0.99,
        step_size=0.01,
        optimizer=ConjugateGradientOptimizer(
            hvp_approach=FiniteDifferenceHvp(base_eps=1e-5)))
    algo.train()


run_experiment(
    run_task,
    n_parallel=1,
    seed=1,
)
    :return:
    """
    env = TfEnv(normalize(gym.make("InvertedDoublePendulum-v2")))
    policy = GaussianMLPPolicy(
        name="policy", env_spec=env.spec, hidden_sizes=(64, 64))
    baseline = GaussianMLPBaseline(env_spec=env.spec)
    algo = PPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=2048,
        max_path_length=100,
        n_itr=488,
        discount=0.99,
        step_size=0.01,
        optimizer_args=dict(batch_size=32, max_epochs=10),
        plot=False)
    algo.train()


run_experiment(
    run_task,
    n_parallel=1,
    snapshot_mode="last",
    seed=1,
    plot=False,
)
        env_spec=env,
        hidden_sizes=(64, 64),
        init_std=20,
        # std_share_network=False,
        # adaptive_std=True
    )
    baseline = GaussianMLPBaseline(env_spec=env, include_action_to_input=False)
    algo = PPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=1024,  # 4096
        max_path_length=100,
        n_itr=1500,
        discount=0.99,
        step_size=0.2,
        policy_ent_coeff=1e-6,
        plot=True,
    )
    algo.train(sess=sess)


run_experiment(
    run_task,
    n_parallel=4,
    exp_prefix="ppo_sawyer_compose",
    seed=2,
    plot=True,
)
name="Critic", hidden_sizes=[200, 100], hidden_nonlinearity=tf.nn.relu) ddpg = DDPG(env, actor=actor_net, actor_lr=1e-4, critic_lr=1e-3, critic=critic_net, plot=True, target_update_tau=1e-2, n_epochs=500, n_epoch_cycles=10, n_rollout_steps=200, n_train_steps=50, discount=0.9, replay_buffer_size=int(1e6), min_buffer_size=int(1e4), exploration_strategy=action_noise, actor_optimizer=tf.train.AdamOptimizer, critic_optimizer=tf.train.AdamOptimizer) ddpg.train(sess=sess) run_experiment( run_task, exp_prefix='ddpg_sawyer_reach', n_parallel=1, seed=1, plot=True, )
    algo = PPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=v.batch_size,  # 4096
        max_path_length=v.max_path_length,
        n_itr=2000,
        discount=0.99,
        step_size=0.2,
        optimizer_args=dict(batch_size=32, max_epochs=10),
        plot=True,
    )
    algo.train()


config = dict(
    batch_size=4096,
    max_path_length=500,  # 50
    policy_init_std=1.0,  # 1.0
)

run_experiment(
    run_task,
    exp_prefix='sawyer_pusher_ppo_done',
    n_parallel=4,
    seed=1,
    variant=config,
    plot=True,
)
        policy_ent_coeff=v.policy_ent_coeff,
        embedding_ent_coeff=v.embedding_ent_coeff,
        inference_ce_coeff=v.inference_ce_coeff,
        use_softplus_entropy=True,
    )
    algo.train()


config = dict(
    tasks=TASKS,
    latent_length=3,
    inference_window=6,
    batch_size=4096 * len(TASKS),
    policy_ent_coeff=5e-3,  # 1e-2
    embedding_ent_coeff=1e-3,  # 1e-3
    inference_ce_coeff=5e-3,  # 1e-4
    max_path_length=200,
    embedding_init_std=1.0,
    embedding_max_std=2.0,
    policy_init_std=1.0,
)

run_experiment(
    run_task,
    exp_prefix='sawyer_reach_embed_8goal',
    n_parallel=8,
    seed=1,
    variant=config,
    plot=False,
)
        init_std=0.5,  # TODO was 100
    )
    baseline = LinearFeatureBaseline(env_spec=env_spec_embed)
    algo = TRPOTaskEmbedding(
        env=env,
        policy=policy,
        baseline=baseline,
        inference=traj_embedding,
        batch_size=4000,
        max_path_length=100,
        n_itr=500,
        discount=0.99,
        step_size=0.01,
        plot=plot,
        plot_warmup_itrs=20,
        policy_ent_coeff=0.0,  # 0.001, # 0.1,
        embedding_ent_coeff=0.0,  # 0.1,
        inference_ce_ent_coeff=0.,  # 0.03, # 0.1, # 0.1,
    )
    algo.train()


run_experiment(
    run_task,
    exp_prefix='trpo_point_embed',
    n_parallel=16,
    plot=True,
)
        n_itr=500,
        discount=0.99,
        step_size=0.2,
        plot=True,
        policy_ent_coeff=v.policy_ent_coeff,
        embedding_ent_coeff=v.embedding_ent_coeff,
        inference_ce_coeff=v.inference_ce_coeff,
        num_tasks_held_out=1,
    )
    algo.train()


config = dict(
    tasks=MY_TASKS,
    latent_length=3,
    inference_window=2,
    batch_size=1024 * len(MY_TASKS),  # 4096
    policy_ent_coeff=1e-2,  # 1e-2
    # Uncommented so that v.embedding_ent_coeff above resolves.
    embedding_ent_coeff=1e-2,  # 1e-3
    inference_ce_coeff=1e-4,  # 1e-4
)

run_experiment(
    run_task,
    exp_prefix='point3d_embed',
    n_parallel=16,
    seed=1,
    variant=config,
    plot=True,
)
name="traj_embedding", embedding_spec=traj_embed_spec, hidden_sizes=(32, 32), adaptive_std=True, # Must be True for embedding learning ) baseline = LinearFeatureBaseline(env_spec=env_spec_embed) algo = TRPOTaskEmbedding( env=env, policy=policy, baseline=baseline, embedding=task_embedding, inference=traj_embedding, batch_size=4000, max_path_length=MAX_PATH_LENGTH, n_itr=400000000, discount=0.99, step_size=0.01, plot=True, ) algo.train() run_experiment( run_task, exp_prefix='trpo_pr2_clock_embed', n_parallel=N_PARALLEL, plot=True, )
# General experiment settings
seed = 2  # Will be ignored if --seed option is used
exp_name_direct = None  # If None, exp_name will be constructed from exp_name_extra and other info. Debug value: 'instant_run'
exp_name_extra = 'Full_stack_Minibot'  # Name of run

# Seed
seed = seed if args.seed == 'keep' \
       else None if args.seed == 'random' \
       else int(args.seed)

# Launch training
run_experiment(
    run_task,
    # Configure TF
    use_tf=True,
    use_gpu=True,
    # Name experiment
    exp_prefix='asa-full-stack',
    exp_name=exp_name_direct or \
             (datetime.now().strftime('%Y_%m_%d-%H_%M')
              + (('--' + exp_name_extra) if exp_name_extra else '')
              + (('--s' + str(seed)) if seed else '')
             ),
    # Number of parallel workers for sampling
    n_parallel=0,
    # Snapshot information
    snapshot_mode="all",
    # Specifies the seed for the experiment (random seed if None)
    seed=seed,
)
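# --- Hypothetical context (not from the original script) ---
# The seed expression above reads a --seed command-line option. A parser
# roughly like the following is assumed; this is an illustrative sketch,
# not the original code.
import argparse

parser = argparse.ArgumentParser()
parser.add_argument(
    '--seed', default='keep',
    help="'keep' to use the seed hard-coded in the script, "
         "'random' for a random seed, or an integer value")
args = parser.parse_args()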
    algo = PPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=v.batch_size,  # 4096
        max_path_length=v.max_path_length,
        n_itr=10000,
        discount=0.99,
        step_size=0.2,
        policy_ent_coeff=0.,
        optimizer_args=dict(batch_size=32, max_epochs=10),
        plot=True,
    )
    algo.train()


config = dict(
    batch_size=4096,
    max_path_length=150,  # 50
    policy_init_std=1,  # 1.0
)

run_experiment(
    run_task,
    exp_prefix='sawyer_reach_ppo_position',
    n_parallel=1,
    seed=1,
    variant=config,
    plot=True,
)
    baseline = LinearFeatureBaseline(env_spec=env.spec)
    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=4000,
        max_path_length=env.horizon,
        n_itr=50,
        discount=0.99,
        step_size=0.01,
        # Uncomment both lines (this and the plot parameter below) to enable
        # plotting
        plot=True,
    )
    algo.train()


run_experiment(
    run_task,
    # Number of parallel workers for sampling
    n_parallel=1,
    # Only keep the snapshot parameters for the last iteration
    snapshot_mode="last",
    # Specifies the seed for the experiment. If this is not provided, a random
    # seed will be used
    seed=1,
    # plot=True,
)
from garage.envs.box2d import CartpoleEnv
from garage.envs.mujoco import SwimmerEnv
from garage.tf.algos import VPG
from garage.tf.envs import TfEnv
from garage.tf.policies import GaussianMLPPolicy
from garage.misc.instrument import run_experiment

env = TfEnv(normalize(SwimmerEnv()))
policy = GaussianMLPPolicy(
    name="policy", env_spec=env.spec, hidden_sizes=(32, 32))
baseline = LinearFeatureBaseline(env_spec=env.spec)
algo = VPG(
    env=env,
    policy=policy,
    baseline=baseline,
    batch_size=5000,
    max_path_length=500,
    n_itr=40,
    discount=0.995,
    optimizer_args=dict(tf_optimizer_args=dict(learning_rate=1e-4, )))

# Note: passing algo.train() to run_experiment assumes the stubbed-call
# pattern (stub(globals()) invoked earlier in the script, as in other
# launchers in this collection).
run_experiment(
    algo.train(),
    n_parallel=1,
    snapshot_mode="last",
    seed=1,
    use_gpu=True,
    use_tf=True)
        max_path_length=100,
        n_itr=40,
        discount=0.99,
        step_size=v["step_size"],
        # Uncomment both lines (this and the plot parameter below) to enable
        # plotting
        plot=True,
    )
    algo.train()


for step_size in [0.01, 0.05, 0.1]:
    for seed in [1, 11, 21, 31, 41]:
        run_experiment(
            run_task,
            exp_prefix="first_exp",
            # Number of parallel workers for sampling
            n_parallel=1,
            # Only keep the snapshot parameters for the last iteration
            snapshot_mode="last",
            # Specifies the seed for the experiment. If this is not provided, a
            # random seed will be used
            seed=seed,
            # mode="local",
            mode="ec2",
            variant=dict(step_size=step_size, seed=seed)
            # plot=True,
            # terminate_machine=False,
        )
        # Stop after launching the first experiment; remove to launch the
        # full sweep.
        sys.exit()
        stop_ce_gradient=True,
    )
    algo.train()


config = dict(
    tasks=TASKS,
    latent_length=2,
    inference_window=2,
    batch_size=1024 * len(TASKS),
    policy_ent_coeff=192e-2,  # 2e-2
    embedding_ent_coeff=2.2e-3,  # 1e-2
    inference_ce_coeff=5e-2,  # 1e-2
    max_path_length=100,
    embedding_init_std=1.0,
    embedding_max_std=2.0,
    embedding_min_std=0.38,
    policy_init_std=1.0,
    policy_max_std=None,
    policy_min_std=None,
)

run_experiment(
    run_task,
    exp_prefix='ppo_point_embed_random_start_192_polent_300maxpath',
    n_parallel=2,
    seed=1,
    variant=config,
    plot=True,
)
    rospy.on_shutdown(pnp_env.shutdown)
    pnp_env.initialize()
    env = pnp_env
    policy = GaussianMLPPolicy(env_spec=env.spec, hidden_sizes=(32, 32))
    baseline = LinearFeatureBaseline(env_spec=env.spec)
    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=4000,
        max_path_length=100,
        n_itr=100,
        discount=0.99,
        step_size=0.01,
        plot=False,
        force_batch_sampler=True,
    )
    algo.train()


run_experiment(
    run_task,
    n_parallel=1,
    plot=False,
)
from garage.misc.instrument import run_experiment
from garage.misc.instrument import stub
from garage.tf.algos import TRPO
from garage.tf.envs import TfEnv
from garage.tf.policies import CategoricalMLPPolicy

stub(globals())

# Need to wrap in a tf environment and force_reset to true
# see https://github.com/openai/rllab/issues/87#issuecomment-282519288
env = TfEnv(gym.make("CartPole-v0"))

policy = CategoricalMLPPolicy(
    name="policy", env_spec=env.spec, hidden_sizes=(32, 32))
baseline = LinearFeatureBaseline(env_spec=env.spec)
algo = TRPO(
    env=env,
    policy=policy,
    baseline=baseline,
    batch_size=4000,
    max_path_length=200,
    n_itr=120,
    discount=0.99,
    step_size=0.01,
)

run_experiment(algo.train(), n_parallel=1, snapshot_mode="last", seed=1)
        env_spec=env.spec,
        hidden_sizes=(64, 64),
        init_std=20,
        std_share_network=False,
        adaptive_std=True)
    baseline = GaussianMLPBaseline(env_spec=env, include_action_to_input=False)
    algo = PPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=1024,  # 4096
        max_path_length=50,
        n_itr=1500,
        discount=0.99,
        step_size=0.2,
        policy_ent_coeff=1e-6,
        plot=True,
        use_mpc_es=True,
    )
    algo.train(sess=sess)


run_experiment(
    run_task,
    n_parallel=4,
    exp_prefix="ppo_point_compose_test_mpc",
    seed=2,
    plot=True,
)
def run_task(*_):
    env = FlatGoalEnv(SawyerPickEnv(), obs_keys=["state_observation"])
    env = TfEnv(normalize(env))
    policy = GaussianMLPPolicy(
        name="policy", env_spec=env.spec, hidden_sizes=(32, 32))
    baseline = LinearFeatureBaseline(env_spec=env.spec)
    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=4000,
        max_path_length=500,
        n_itr=500,
        discount=0.99,
        step_size=0.01,
        plot=True)
    algo.train()


run_experiment(
    run_task,
    n_parallel=16,
    exp_prefix="trpo_sawyer_multiworld",
    seed=1,
    plot=True,
)