Example #1
def _launch_ec2(func, exp_prefix, exp_name, params, run_experiment_kwargs):
    print("Launching task", exp_name)
    kwargs = dict(n_parallel=1,
                  snapshot_mode="last",
                  seed=params.get("seed", None),
                  mode="ec2")
    kwargs.update(run_experiment_kwargs)
    kwargs.update(
        dict(exp_prefix=exp_prefix,
             exp_name=exp_name,
             variant=params,
             confirm_remote=False))

    run_experiment(func, **kwargs)
Example #2
                                        hidden_nonlinearity=tf.nn.relu)

    ddpg = DDPG(env,
                actor=actor_net,
                actor_lr=1e-4,
                critic_lr=1e-3,
                critic=critic_net,
                plot=True,
                target_update_tau=1e-2,
                n_epochs=500,
                n_epoch_cycles=10,
                n_rollout_steps=50,
                n_train_steps=50,
                discount=0.9,
                replay_buffer_size=int(1e6),
                min_buffer_size=int(1e3),
                exploration_strategy=action_noise,
                actor_optimizer=tf.train.AdamOptimizer,
                critic_optimizer=tf.train.AdamOptimizer)

    ddpg.train(sess=sess)


run_experiment(
    run_task,
    n_parallel=1,
    exp_prefix="ddpg_point_compose",
    seed=1,
    plot=True,
)
Example #3
skill_integration_idx = CategoricalMLPSkillIntegrator.get_index_of_method_str(skill_integration_method)

# Seed
seed = (seed if args.seed == 'keep'
        else None if args.seed == 'random'
        else int(args.seed))

# Launch training
run_experiment(
        run_task,
        # Configure TF
        use_tf=True,
        use_gpu=True,
        # Name experiment
        exp_prefix='asa-resume-with-new-skill',
        exp_name=exp_name_direct or \
                 (datetime.now().strftime('%Y_%m_%d-%H_%M')
                  + '--resumed_' + snapshot_name
                  + (('--' + exp_name_extra) if exp_name_extra else '')
                  + '--skill_' + skill_policy_exp_name
                  + '--integ' + str(skill_integration_idx) + '_' + skill_integration_method
                  + (('--s' + str(seed)) if seed else '')
                 ),
        # Number of parallel workers for sampling
        n_parallel=0,
        # Snapshot information
        snapshot_mode="all",
        # Specifies the seed for the experiment  (random seed if None)
        seed=seed
)
Example #4
    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=v.batch_size,  # 4096
        max_path_length=v.max_path_length,
        n_itr=1000,
        discount=0.99,
        step_size=0.01,
        plot=True,
        #optimizer_args=dict(max_grad_norm=0.5)
    )
    algo.train()


config = dict(
    batch_size=4096,
    max_path_length=100,  # 50
    policy_init_std=0.1,  # 1.0
)

run_experiment(
    run_task,
    exp_prefix='sawyer_reach_trpo_torque',
    n_parallel=8,
    seed=1,
    variant=config,
    plot=True,
)
Example #5
from garage.theano.envs import TheanoEnv
from garage.theano.policies import GaussianMLPPolicy


def run(*_):
    """Stub method for running trpo."""
    env = TheanoEnv(
        ReacherEnv(control_method='position_control', sparse_reward=False))
    policy = GaussianMLPPolicy(env_spec=env.spec, hidden_sizes=(32, 32))
    baseline = LinearFeatureBaseline(env_spec=env.spec)
    algo = TRPO(
        env=env,
        policy=policy,
        batch_size=4000,
        max_path_length=100,
        baseline=baseline,
        n_itr=2500,
        discount=0.99,
        step_size=0.01,
        plot=True,
        force_batch_sampler=True,
    )
    algo.train()


run_experiment(
    run,
    n_parallel=2,
    plot=True,
)
Example #6
            domain_name='cartpole', task_name='balance',
            visualize_reward=True))

    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=(32, 32),
    )

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=4000,
        max_path_length=100,
        n_itr=400,
        discount=0.99,
        step_size=0.01,
        plot=True,
    )
    algo.train()


run_experiment(
    run_task,
    n_parallel=1,
    snapshot_mode="last",
    plot=True,
)
Example #7
from garage.theano.optimizers import FiniteDifferenceHvp
from garage.theano.policies import GaussianGRUPolicy


def run_task(*_):
    env = TheanoEnv(normalize(CartpoleEnv()))

    policy = GaussianGRUPolicy(env_spec=env.spec)

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=4000,
        max_path_length=100,
        n_itr=10,
        discount=0.99,
        step_size=0.01,
        optimizer=ConjugateGradientOptimizer(
            hvp_approach=FiniteDifferenceHvp(base_eps=1e-5)))
    algo.train()


run_experiment(
    run_task,
    n_parallel=1,
    seed=1,
)
Example #8
    :return:
    """
    env = TfEnv(normalize(gym.make("InvertedDoublePendulum-v2")))

    policy = GaussianMLPPolicy(
        name="policy", env_spec=env.spec, hidden_sizes=(64, 64))

    baseline = GaussianMLPBaseline(env_spec=env.spec)

    algo = PPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=2048,
        max_path_length=100,
        n_itr=488,
        discount=0.99,
        step_size=0.01,
        optimizer_args=dict(batch_size=32, max_epochs=10),
        plot=False)
    algo.train()


run_experiment(
    run_task,
    n_parallel=1,
    snapshot_mode="last",
    seed=1,
    plot=False,
)
Example #9
        env_spec=env,
        hidden_sizes=(64, 64),
        init_std=20,
        # std_share_network=False,
        # adaptive_std=True
    )
    baseline = GaussianMLPBaseline(env_spec=env, include_action_to_input=False)

    algo = PPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=1024,  # 4096
        max_path_length=100,
        n_itr=1500,
        discount=0.99,
        step_size=0.2,
        policy_ent_coeff=1e-6,
        plot=True,
    )
    algo.train(sess=sess)


run_experiment(
    run_task,
    n_parallel=4,
    exp_prefix="ppo_sawyer_compose",
    seed=2,
    plot=True,
)
Example #10
                                        name="Critic",
                                        hidden_sizes=[200, 100],
                                        hidden_nonlinearity=tf.nn.relu)

    ddpg = DDPG(env,
                actor=actor_net,
                actor_lr=1e-4,
                critic_lr=1e-3,
                critic=critic_net,
                plot=True,
                target_update_tau=1e-2,
                n_epochs=500,
                n_epoch_cycles=10,
                n_rollout_steps=200,
                n_train_steps=50,
                discount=0.9,
                replay_buffer_size=int(1e6),
                min_buffer_size=int(1e4),
                exploration_strategy=action_noise,
                actor_optimizer=tf.train.AdamOptimizer,
                critic_optimizer=tf.train.AdamOptimizer)
    ddpg.train(sess=sess)


run_experiment(
    run_task,
    exp_prefix='ddpg_sawyer_reach',
    n_parallel=1,
    seed=1,
    plot=True,
)
Example #11
    algo = PPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=v.batch_size,  # 4096
        max_path_length=v.max_path_length,
        n_itr=2000,
        discount=0.99,
        step_size=0.2,
        optimizer_args=dict(batch_size=32, max_epochs=10),
        plot=True,
    )
    algo.train()


config = dict(
    batch_size=4096,
    max_path_length=500,  # 50
    policy_init_std=1.0,  # 1.0
)

run_experiment(
    run_task,
    exp_prefix='sawyer_pusher_ppo_done',
    n_parallel=4,
    seed=1,
    variant=config,
    plot=True,
)
Example #12
        policy_ent_coeff=v.policy_ent_coeff,
        embedding_ent_coeff=v.embedding_ent_coeff,
        inference_ce_coeff=v.inference_ce_coeff,
        use_softplus_entropy=True,
    )
    algo.train()


config = dict(
    tasks=TASKS,
    latent_length=3,
    inference_window=6,
    batch_size=4096 * len(TASKS),
    policy_ent_coeff=5e-3,  # 1e-2
    embedding_ent_coeff=1e-3,  # 1e-3
    inference_ce_coeff=5e-3,  # 1e-4
    max_path_length=200,
    embedding_init_std=1.0,
    embedding_max_std=2.0,
    policy_init_std=1.0,
)

run_experiment(
    run_task,
    exp_prefix='sawyer_reach_embed_8goal',
    n_parallel=8,
    seed=1,
    variant=config,
    plot=False,
)
Example #13
        init_std=0.5,  # TODO was 100
    )

    baseline = LinearFeatureBaseline(env_spec=env_spec_embed)

    algo = TRPOTaskEmbedding(
        env=env,
        policy=policy,
        baseline=baseline,
        inference=traj_embedding,
        batch_size=4000,
        max_path_length=100,
        n_itr=500,
        discount=0.99,
        step_size=0.01,
        plot=plot,
        plot_warmup_itrs=20,
        policy_ent_coeff=0.0,  # 0.001,  #0.1,
        embedding_ent_coeff=0.0,  #0.1,
        inference_ce_ent_coeff=0.,  # 0.03,  #0.1,  # 0.1,
    )
    algo.train()


run_experiment(
    run_task,
    exp_prefix='trpo_point_embed',
    n_parallel=16,
    plot=True,
)
Example #14
        n_itr=500,
        discount=0.99,
        step_size=0.2,
        plot=True,
        policy_ent_coeff=v.policy_ent_coeff,
        embedding_ent_coeff=v.embedding_ent_coeff,
        inference_ce_coeff=v.inference_ce_coeff,
        num_tasks_held_out=1,
    )
    algo.train()


config = dict(
    tasks=MY_TASKS,
    latent_length=3,
    inference_window=2,
    batch_size=1024 * len(MY_TASKS),  # 4096
    policy_ent_coeff=1e-2,  # 1e-2 #
    embedding_ent_coeff=1e-2,  # 1e-3
    inference_ce_coeff=1e-4,  # 1e-4
)

run_experiment(
    run_task,
    exp_prefix='point3d_embed',
    n_parallel=16,
    seed=1,
    variant=config,
    plot=True,
)
Example #15
        name="traj_embedding",
        embedding_spec=traj_embed_spec,
        hidden_sizes=(32, 32),
        adaptive_std=True,  # Must be True for embedding learning
    )

    baseline = LinearFeatureBaseline(env_spec=env_spec_embed)

    algo = TRPOTaskEmbedding(
        env=env,
        policy=policy,
        baseline=baseline,
        embedding=task_embedding,
        inference=traj_embedding,
        batch_size=4000,
        max_path_length=MAX_PATH_LENGTH,
        n_itr=400000000,  # effectively run until stopped manually
        discount=0.99,
        step_size=0.01,
        plot=True,
    )
    algo.train()


run_experiment(
    run_task,
    exp_prefix='trpo_pr2_clock_embed',
    n_parallel=N_PARALLEL,
    plot=True,
)
Example #16
# General experiment settings
seed = 2  # Will be ignored if --seed option is used
exp_name_direct = None  # If None, exp_name will be constructed from exp_name_extra and other info. Debug value = 'instant_run'
exp_name_extra = 'Full_stack_Minibot'  # Name of run

# Seed
seed = (seed if args.seed == 'keep'
        else None if args.seed == 'random'
        else int(args.seed))

# Launch training
run_experiment(
        run_task,
        # Configure TF
        use_tf=True,
        use_gpu=True,
        # Name experiment
        exp_prefix='asa-full-stack',
        exp_name=exp_name_direct or \
                 (datetime.now().strftime('%Y_%m_%d-%H_%M')
                  + (('--' + exp_name_extra) if exp_name_extra else '')
                  + (('--s' + str(seed)) if seed else '')
                 ),
        # Number of parallel workers for sampling
        n_parallel=0,
        # Snapshot information
        snapshot_mode="all",
        # Specifies the seed for the experiment  (random seed if None)
        seed=seed,
)
Example #17
    algo = PPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=v.batch_size,  # 4096
        max_path_length=v.max_path_length,
        n_itr=10000,
        discount=0.99,
        step_size=0.2,
        policy_ent_coeff=0.,
        optimizer_args=dict(batch_size=32, max_epochs=10),
        plot=True,
    )
    algo.train()


config = dict(
    batch_size=4096,
    max_path_length=150,  # 50
    policy_init_std=1,  # 1.0
)

run_experiment(
    run_task,
    exp_prefix='sawyer_reach_ppo_position',
    n_parallel=1,
    seed=1,
    variant=config,
    plot=True,
)
Example #18
    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=4000,
        max_path_length=env.horizon,
        n_itr=50,
        discount=0.99,
        step_size=0.01,
        # To enable live plotting, also uncomment the plot argument in
        # run_experiment below
        plot=True,
    )
    algo.train()


run_experiment(
    run_task,
    # Number of parallel workers for sampling
    n_parallel=1,
    # Only keep the snapshot parameters for the last iteration
    snapshot_mode="last",
    # Specifies the seed for the experiment. If this is not provided, a random
    # seed will be used
    seed=1,
    # plot=True,
)
Example #19
# NOTE: the LinearFeatureBaseline and normalize import paths below are assumed
# for this garage version; the original snippet uses both without importing them
from garage.baselines import LinearFeatureBaseline
from garage.envs import normalize
from garage.envs.mujoco import SwimmerEnv
from garage.tf.algos import VPG
from garage.tf.envs import TfEnv
from garage.tf.policies import GaussianMLPPolicy
from garage.misc.instrument import run_experiment
from garage.misc.instrument import stub

# Stub the object construction below so that run_experiment receives a
# serializable StubMethodCall (as in Example #23) rather than the return
# value of algo.train()
stub(globals())

env = TfEnv(normalize(SwimmerEnv()))

policy = GaussianMLPPolicy(name="policy",
                           env_spec=env.spec,
                           hidden_sizes=(32, 32))

baseline = LinearFeatureBaseline(env_spec=env.spec)

algo = VPG(env=env,
           policy=policy,
           baseline=baseline,
           batch_size=5000,
           max_path_length=500,
           n_itr=40,
           discount=0.995,
           optimizer_args=dict(tf_optimizer_args=dict(learning_rate=1e-4, )))

run_experiment(algo.train(),
               n_parallel=1,
               snapshot_mode="last",
               seed=1,
               use_gpu=True,
               use_tf=True)
Example #20
        max_path_length=100,
        n_itr=40,
        discount=0.99,
        step_size=v["step_size"],
        # To enable live plotting, also uncomment the plot argument in
        # run_experiment below
        plot=True,
    )
    algo.train()


for step_size in [0.01, 0.05, 0.1]:
    for seed in [1, 11, 21, 31, 41]:
        run_experiment(
            run_task,
            exp_prefix="first_exp",
            # Number of parallel workers for sampling
            n_parallel=1,
            # Only keep the snapshot parameters for the last iteration
            snapshot_mode="last",
            # Specifies the seed for the experiment. If this is not provided, a
            # random seed will be used
            seed=seed,
            # mode="local",
            mode="ec2",
            variant=dict(step_size=step_size, seed=seed)
            # plot=True,
            # terminate_machine=False,
        )
        sys.exit()  # NOTE: stops after launching only the first (step_size, seed) combination
Example #21
        stop_ce_gradient=True,
    )
    algo.train()


config = dict(
    tasks=TASKS,
    latent_length=2,
    inference_window=2,
    batch_size=1024 * len(TASKS),
    policy_ent_coeff=192e-2,  # 2e-2
    embedding_ent_coeff=2.2e-3,  # 1e-2
    inference_ce_coeff=5e-2,  # 1e-2
    max_path_length=100,
    embedding_init_std=1.0,
    embedding_max_std=2.0,
    embedding_min_std=0.38,
    policy_init_std=1.0,
    policy_max_std=None,
    policy_min_std=None,
)

run_experiment(
    run_task,
    exp_prefix='ppo_point_embed_random_start_192_polent_300maxpath',
    n_parallel=2,
    seed=1,
    variant=config,
    plot=True,
)
Example #22
    rospy.on_shutdown(pnp_env.shutdown)

    pnp_env.initialize()

    env = pnp_env

    policy = GaussianMLPPolicy(env_spec=env.spec, hidden_sizes=(32, 32))

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=4000,
        max_path_length=100,
        n_itr=100,
        discount=0.99,
        step_size=0.01,
        plot=False,
        force_batch_sampler=True,
    )
    algo.train()


run_experiment(
    run_task,
    n_parallel=1,
    plot=False,
)
Example #23
from garage.misc.instrument import run_experiment
from garage.misc.instrument import stub
from garage.tf.algos import TRPO
from garage.tf.envs import TfEnv
from garage.tf.policies import CategoricalMLPPolicy

stub(globals())

# Need to wrap in a tf environment and force_reset to true
# see https://github.com/openai/rllab/issues/87#issuecomment-282519288
env = TfEnv(gym.make("CartPole-v0"))

policy = CategoricalMLPPolicy(name="policy",
                              env_spec=env.spec,
                              hidden_sizes=(32, 32))

baseline = LinearFeatureBaseline(env_spec=env.spec)

algo = TRPO(
    env=env,
    policy=policy,
    baseline=baseline,
    batch_size=4000,
    max_path_length=200,
    n_itr=120,
    discount=0.99,
    step_size=0.01,
)

run_experiment(algo.train(), n_parallel=1, snapshot_mode="last", seed=1)
Example #24
                               env_spec=env.spec,
                               hidden_sizes=(64, 64),
                               init_std=20,
                               std_share_network=False,
                               adaptive_std=True)
    baseline = GaussianMLPBaseline(env_spec=env, include_action_to_input=False)

    algo = PPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=1024,  # 4096
        max_path_length=50,
        n_itr=1500,
        discount=0.99,
        step_size=0.2,
        policy_ent_coeff=1e-6,
        plot=True,
        use_mpc_es=True,
    )
    algo.train(sess=sess)


run_experiment(
    run_task,
    n_parallel=4,
    exp_prefix="ppo_point_compose_test_mpc",
    seed=2,
    plot=True,
)
Example #25
def run_task(*_):
    env = FlatGoalEnv(SawyerPickEnv(), obs_keys=["state_observation"])
    env = TfEnv(normalize(env))

    policy = GaussianMLPPolicy(name="policy",
                               env_spec=env.spec,
                               hidden_sizes=(32, 32))

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algo = TRPO(env=env,
                policy=policy,
                baseline=baseline,
                batch_size=4000,
                max_path_length=500,
                n_itr=500,
                discount=0.99,
                step_size=0.01,
                plot=True)
    algo.train()


run_experiment(
    run_task,
    n_parallel=16,
    exp_prefix="trpo_sawyer_multiworld",
    seed=1,
    plot=True,
)