def run_task(*_): """Run task function.""" initial_goal = np.array([0.6, -0.1, 0.30]) rospy.init_node('trpo_real_sawyer_reacher_exp', anonymous=True) env = TheanoEnv( ReacherEnv( initial_goal, initial_joint_pos=INITIAL_ROBOT_JOINT_POS, simulated=False, robot_control_mode='position')) rospy.on_shutdown(env.shutdown) env.initialize() policy = GaussianMLPPolicy(env_spec=env.spec, hidden_sizes=(32, 32)) baseline = LinearFeatureBaseline(env_spec=env.spec) algo = TRPO( env=env, policy=policy, baseline=baseline, batch_size=4000, max_path_length=100, n_itr=100, discount=0.99, step_size=0.01, plot=False, force_batch_sampler=True, ) algo.train()
def run_task(*_):
    initial_goal = np.array([0.6, -0.1, 0.80])

    rospy.init_node('trpo_real_sawyer_pnp_exp', anonymous=True)

    pnp_env = TheanoEnv(
        PickAndPlaceEnv(
            initial_goal,
            initial_joint_pos=INITIAL_ROBOT_JOINT_POS,
            simulated=False))

    rospy.on_shutdown(pnp_env.shutdown)

    pnp_env.initialize()

    env = pnp_env

    policy = GaussianMLPPolicy(env_spec=env.spec, hidden_sizes=(32, 32))

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=4000,
        max_path_length=100,
        n_itr=100,
        discount=0.99,
        step_size=0.01,
        plot=False,
        force_batch_sampler=True,
    )
    algo.train()

def run_task(*_): env_name = "Ant" hidden_sizes = (32,32) env = TheanoEnv(normalize(SwimmerEnv())) policy = GaussianMLPPolicy(env_spec=env.spec, hidden_sizes=hidden_sizes) backup_policy = GaussianMLPPolicy(env.spec, hidden_sizes=hidden_sizes) mix_policy = GaussianMLPPolicy(env.spec, hidden_sizes=hidden_sizes) pos_eps_policy = GaussianMLPPolicy(env.spec, hidden_sizes=hidden_sizes) neg_eps_policy = GaussianMLPPolicy(env.spec, hidden_sizes=hidden_sizes) baseline = ZeroBaseline(env_spec=env.spec) algo = CAPG( env=env, policy=policy, backup_policy=backup_policy, mix_policy=mix_policy, pos_eps_policy=pos_eps_policy, neg_eps_policy=neg_eps_policy, n_timestep=5e6, learning_rate=0.01, batch_size=5000, minibatch_size=500, n_sub_itr = 10, baseline=baseline, max_path_length=500, discount=0.99, decay_learing_rate=True, log_dir='./logs/' + env_name, ) algo.train()
def run_task(*_):
    env = TheanoEnv(normalize(CartpoleEnv()))

    policy = DeterministicMLPPolicy(
        env_spec=env.spec,
        # The neural network policy should have two hidden layers,
        # each with 32 hidden units.
        hidden_sizes=(32, 32))

    es = OUStrategy(env_spec=env.spec)

    qf = ContinuousMLPQFunction(env_spec=env.spec)

    algo = DDPG(
        env=env,
        policy=policy,
        es=es,
        qf=qf,
        batch_size=32,
        max_path_length=100,
        epoch_length=1000,
        min_pool_size=10000,
        n_epochs=1000,
        discount=0.99,
        scale_reward=0.01,
        qf_learning_rate=1e-3,
        policy_learning_rate=1e-4,
        # Uncomment both lines (this and the plot parameter below) to enable
        # plotting
        plot=True,
    )
    algo.train()

def run_task(*_):
    # Please note that different environments with different action spaces may
    # require different policies. For example, with a Discrete action space a
    # CategoricalMLPPolicy works, but for a Box action space you may need to
    # use a GaussianMLPPolicy (see the trpo_gym_pendulum.py example).
    env = TheanoEnv(normalize(gym.make("CartPole-v0")))

    policy = CategoricalMLPPolicy(env_spec=env.spec, hidden_sizes=(32, 32))

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=4000,
        max_path_length=env.horizon,
        n_itr=50,
        discount=0.99,
        step_size=0.01,
        # Uncomment both lines (this and the plot parameter below) to enable
        # plotting
        plot=True,
    )
    algo.train()

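# As a point of comparison for the comment above: a continuous (Box) action
# space pairs with a GaussianMLPPolicy instead of a CategoricalMLPPolicy.
# The function below is a minimal illustrative sketch, not part of the
# original examples; it reuses the same garage Theano APIs seen throughout
# this file and the "Pendulum-v0" environment from the tests further down.
def run_pendulum_task(*_):
    env = TheanoEnv(normalize(gym.make("Pendulum-v0")))

    # Box action space, so a Gaussian policy is used rather than a
    # categorical one.
    policy = GaussianMLPPolicy(env_spec=env.spec, hidden_sizes=(32, 32))

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=4000,
        max_path_length=env.horizon,
        n_itr=50,
        discount=0.99,
        step_size=0.01,
    )
    algo.train()
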
def run_task(*_): env_name = "HumanoidStandup-v2" hidden_sizes = (100, 50, 25) env = TheanoEnv(normalize(gym.make(env_name))) print(env.spec.observation_space, env.spec.action_space) policy = GaussianMLPPolicy(env_spec=env.spec, hidden_sizes=hidden_sizes) backup_policy = GaussianMLPPolicy(env.spec, hidden_sizes=hidden_sizes) mix_policy = GaussianMLPPolicy(env.spec, hidden_sizes=hidden_sizes) pos_eps_policy = GaussianMLPPolicy(env.spec, hidden_sizes=hidden_sizes) neg_eps_policy = GaussianMLPPolicy(env.spec, hidden_sizes=hidden_sizes) baseline = LinearFeatureBaseline(env_spec=env.spec) algo = CAPG( env=env, policy=policy, backup_policy=backup_policy, mix_policy=mix_policy, pos_eps_policy=pos_eps_policy, neg_eps_policy=neg_eps_policy, n_timestep=5e6, learning_rate=0.05, batch_size=5000, minibatch_size=500, n_sub_itr=10, baseline=baseline, max_path_length=500, discount=0.99, decay_learing_rate=True, log_dir='./logs/' + env_name, ) algo.train()
def run_task(v):
    env = TheanoEnv(normalize(CartpoleEnv()))

    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        # The neural network policy should have two hidden layers,
        # each with 32 hidden units.
        hidden_sizes=(32, 32))

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=4000,
        max_path_length=100,
        n_itr=40,
        discount=0.99,
        step_size=v["step_size"],
        # Uncomment both lines (this and the plot parameter below) to enable
        # plotting
        plot=True,
    )
    algo.train()

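# Unlike the other run_task(*_) entry points, this one takes a dict-like
# argument v carrying the hyperparameters to sweep (here only "step_size").
# Below is a hedged sketch of how it might be launched with run_experiment
# (imported from garage.misc.instrument in the later scripts in this file).
# The keyword arguments shown (variant, n_parallel, snapshot_mode, seed) are
# assumptions based on the usual rllab-style launcher interface and may need
# adjusting for the installed version.
if __name__ == "__main__":
    for step_size in (0.005, 0.01, 0.05):
        run_experiment(
            run_task,
            # Forward the swept hyperparameter so run_task sees v["step_size"].
            variant=dict(step_size=step_size),
            n_parallel=1,
            snapshot_mode="last",
            seed=1,
        )
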
def test_flatten(self):
    env = TheanoEnv(
        normalize(
            gym.make('Pendulum-v0'),
            normalize_reward=True,
            normalize_obs=True,
            flatten_obs=True))
    for i in range(10):
        env.reset()
        for e in range(5):
            env.render()
            action = env.action_space.sample()
            next_obs, reward, done, info = env.step(action)
            assert next_obs.shape == env.observation_space.low.shape
            if done:
                break
    env.close()

def test_baseline(self, baseline_cls):
    env = TheanoEnv(CartpoleEnv())
    policy = GaussianMLPPolicy(env_spec=env.spec, hidden_sizes=(6, ))
    baseline = baseline_cls(env_spec=env.spec)
    algo = VPG(
        env=env,
        policy=policy,
        baseline=baseline,
        n_itr=1,
        batch_size=1000,
        max_path_length=100)
    algo.train()

def test_polopt_algo(self, algo_cls, env_cls, policy_cls):
    print("Testing %s, %s, %s" % (algo_cls.__name__, env_cls.__name__,
                                  policy_cls.__name__))
    env = TheanoEnv(env_cls())
    policy = policy_cls(env_spec=env.spec, )
    baseline = ZeroBaseline(env_spec=env.spec)
    algo = algo_cls(
        env=env,
        policy=policy,
        baseline=baseline,
        **(algo_args.get(algo_cls, dict())))
    algo.train()
    assert not np.any(np.isnan(policy.get_param_values()))

def test_trpo_relu_nan(self):
    env = TheanoEnv(DummyEnv())
    policy = GaussianMLPPolicy(env_spec=env.spec, hidden_sizes=(1, ))
    baseline = ZeroBaseline(env_spec=env.spec)
    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        n_itr=1,
        batch_size=1000,
        max_path_length=100,
        step_size=0.001)
    algo.train()
    assert not np.isnan(np.sum(policy.get_param_values()))

def test_adaptive_std():
    """Checks if the adaptive_std parameter works."""
    env = TheanoEnv(CartpoleEnv())
    # Pass the env spec, as in the other tests, rather than the env itself.
    policy = GaussianMLPPolicy(env_spec=env.spec, adaptive_std=True)
    baseline = ZeroBaseline(env_spec=env.spec)
    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=100,
        n_itr=1)
    algo.train()

def test_trpo_deterministic_nan():
    env = TheanoEnv(DummyEnv())
    policy = GaussianMLPPolicy(env_spec=env.spec, hidden_sizes=(1, ))
    policy._l_log_std.param.set_value([np.float32(np.log(1e-8))])
    baseline = ZeroBaseline(env_spec=env.spec)
    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        n_itr=10,
        batch_size=1000,
        max_path_length=100,
        step_size=0.01)
    algo.train()
    assert not np.isnan(np.sum(policy.get_param_values()))

def run_task(*_):
    env = TheanoEnv(normalize(CartpoleEnv()))
    policy = GaussianMLPPolicy(env_spec=env.spec, hidden_sizes=(32, 32))
    baseline = LinearFeatureBaseline(env_spec=env.spec)
    algo = InstrumentedTRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=4000,
        max_path_length=100,
        n_itr=4,
        discount=0.99,
        step_size=0.01,
        plot=True)
    algo.train()

def test_unflatten(self):
    env = TheanoEnv(
        normalize(
            gym.make('Blackjack-v0'),
            normalize_reward=True,
            normalize_obs=True,
            flatten_obs=False))
    for i in range(10):
        env.reset()
        for e in range(5):
            action = env.action_space.sample()
            next_obs, reward, done, info = env.step(action)
            # The flattened observation should be a vector of length flat_dim
            # (the original compared a shape tuple directly against an int).
            assert (env.observation_space.flatten(next_obs).shape
                    == (env.observation_space.flat_dim, ))  # yapf: disable
            if done:
                break
    env.close()

def run_pick_and_place(*_):
    initial_goal = np.array([0.6, -0.1, 0.80])
    env = TheanoEnv(PickAndPlaceEnv(initial_goal))
    policy = GaussianMLPPolicy(env_spec=env.spec, hidden_sizes=(32, 32))
    baseline = LinearFeatureBaseline(env_spec=env.spec)
    algo = TRPO(
        env=env,
        policy=policy,
        batch_size=4000,
        max_path_length=2000,
        baseline=baseline,
        n_itr=1000,
        discount=0.99,
        step_size=0.01,
        plot=True,
        force_batch_sampler=True,
    )
    algo.train()

def run_task(*_):
    env = TheanoEnv(normalize(CartpoleEnv()))

    policy = GaussianGRUPolicy(env_spec=env.spec, )

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=4000,
        max_path_length=100,
        n_itr=10,
        discount=0.99,
        step_size=0.01,
        optimizer=ConjugateGradientOptimizer(
            hvp_approach=FiniteDifferenceHvp(base_eps=1e-5)))
    algo.train()

def test_ddpg(self):
    env = TheanoEnv(CartpoleEnv())
    policy = DeterministicMLPPolicy(env.spec)
    qf = ContinuousMLPQFunction(env.spec)
    es = OUStrategy(env.spec)
    algo = DDPG(
        env=env,
        policy=policy,
        qf=qf,
        es=es,
        n_epochs=1,
        epoch_length=100,
        batch_size=32,
        min_pool_size=50,
        replay_pool_size=1000,
        eval_samples=100,
    )
    algo.train()

def run_block_stacking(*_):
    """Run TRPO with block stacking."""
    env = TheanoEnv(BlockStackingEnv())

    policy = GaussianMLPPolicy(env_spec=env.spec, hidden_sizes=(32, 32))

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algo = TRPO(
        env=env,
        policy=policy,
        batch_size=4000,
        max_path_length=2000,
        baseline=baseline,
        n_itr=1000,
        discount=0.99,
        step_size=0.01,
        plot=True,
        force_batch_sampler=True,
    )
    algo.train()

def run(*_):
    """Stub method for running TRPO."""
    env = TheanoEnv(
        ReacherEnv(control_method='position_control', sparse_reward=False))
    policy = GaussianMLPPolicy(env_spec=env.spec, hidden_sizes=(32, 32))
    baseline = LinearFeatureBaseline(env_spec=env.spec)
    algo = TRPO(
        env=env,
        policy=policy,
        batch_size=4000,
        max_path_length=100,
        baseline=baseline,
        n_itr=2500,
        discount=0.99,
        step_size=0.01,
        plot=True,
        force_batch_sampler=True,
    )
    algo.train()

def run_task(*_):
    env = TheanoEnv(normalize(gym.make("Acrobot-v1")))

    policy = CategoricalMLPPolicy(env_spec=env.spec, hidden_sizes=(32, 32))

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=4000,
        max_path_length=env.max_episode_steps,
        n_itr=50,
        discount=0.99,
        step_size=0.01,
        plot=True,
    )
    algo.train()

def test_ddpg(self):
    env = TheanoEnv(CartpoleEnv())
    policy = DeterministicMLPPolicy(env.spec)
    qf = ContinuousMLPQFunction(env.spec)
    es = OUStrategy(env.spec)
    replay_buffer = SimpleReplayBuffer(
        env_spec=env.spec, size_in_transitions=int(1000), time_horizon=100)
    algo = DDPG(
        env=env,
        policy=policy,
        qf=qf,
        es=es,
        pool=replay_buffer,
        n_epochs=1,
        epoch_length=100,
        batch_size=32,
        min_pool_size=50,
        eval_samples=100,
    )
    algo.train()

def run_task(*_):
    env = TheanoEnv(normalize(CartpoleEnv()))

    policy = GaussianMLPPolicy(env_spec=env.spec, hidden_sizes=(32, 32))

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=4000,
        max_path_length=100,
        n_itr=1000,
        discount=0.99,
        step_size=0.01,
        # Uncomment both lines (this and the plot parameter below) to enable
        # plotting
        # plot=True
    )
    algo.train()

def test_dm_control_theano_policy(self):
    task = ALL_TASKS[0]

    env = TheanoEnv(DmControlEnv(domain_name=task[0], task_name=task[1]))

    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=(32, 32),
    )

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=10,
        max_path_length=5,
        n_itr=1,
        discount=0.99,
        step_size=0.01,
    )
    algo.train()

from garage.algos import TRPO
from garage.baselines import LinearFeatureBaseline
from garage.envs import normalize
from garage.envs.point_env import PointEnv
from garage.policies import GaussianMLPPolicy
from garage.theano.envs import TheanoEnv

env = TheanoEnv(normalize(PointEnv()))
policy = GaussianMLPPolicy(env_spec=env.spec, )
baseline = LinearFeatureBaseline(env_spec=env.spec)
algo = TRPO(
    env=env,
    policy=policy,
    baseline=baseline,
)
algo.train()

from lasagne.updates import adam
import numpy as np
import theano
import theano.tensor as TT

from garage.baselines import LinearFeatureBaseline
from garage.envs import normalize
from garage.envs.box2d import CartpoleEnv
from garage.policies import GaussianMLPPolicy
from garage.theano.envs import TheanoEnv

# normalize() makes sure that the actions for the environment lie within the
# range [-1, 1] (only works for environments with continuous actions)
env = TheanoEnv(normalize(CartpoleEnv()))
# Initialize a neural network policy with a single hidden layer of 8 hidden
# units
policy = GaussianMLPPolicy(env.spec, hidden_sizes=(8, ))
# Initialize a linear baseline estimator using default hand-crafted features
baseline = LinearFeatureBaseline(env.spec)

# We will collect 100 trajectories per iteration
N = 100
# Each trajectory will have at most 100 time steps
T = 100
# Number of iterations
n_itr = 100
# Set the discount factor for the problem
discount = 0.99
# Learning rate for the gradient update
learning_rate = 0.1

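# The imports above (adam, theano.tensor) indicate that this example goes on
# to build the policy-gradient update by hand. As a minimal, hedged sketch of
# the sampling step such a loop needs, the helper below rolls out a single
# trajectory of at most T steps and computes discounted returns with plain
# numpy. It assumes the usual garage/rllab policy interface in which
# policy.get_action(obs) returns an (action, agent_info) pair; the helper is
# illustrative and not part of the original script.
def collect_trajectory(env, policy, T, discount):
    observations, actions, rewards = [], [], []
    obs = env.reset()
    for _ in range(T):
        action, _ = policy.get_action(obs)
        next_obs, reward, done, _ = env.step(action)
        observations.append(obs)
        actions.append(action)
        rewards.append(reward)
        obs = next_obs
        if done:
            break
    # Discounted return-to-go for every time step, computed backwards.
    returns = []
    running = 0.0
    for r in reversed(rewards):
        running = r + discount * running
        returns.append(running)
    returns.reverse()
    return (np.array(observations), np.array(actions), np.array(rewards),
            np.array(returns))
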
from garage.baselines import LinearFeatureBaseline
from garage.envs import normalize
from garage.envs.box2d import CartpoleEnv
from garage.theano.algos import TRPO
from garage.theano.envs import TheanoEnv
from garage.theano.policies import GaussianMLPPolicy

env = TheanoEnv(normalize(CartpoleEnv()))
policy = GaussianMLPPolicy(env_spec=env.spec, hidden_sizes=(32, 32))
baseline = LinearFeatureBaseline(env_spec=env.spec)
algo = TRPO(
    env=env,
    policy=policy,
    baseline=baseline,
    batch_size=4000,
    max_path_length=100,
    n_itr=40,
    discount=0.99,
    step_size=0.01,
    # plot=True
)
algo.train()

import numpy as np

from garage.baselines import LinearFeatureBaseline
from garage.envs import normalize
from garage.envs.box2d import CartpoleEnv
from garage.envs.mujoco import SwimmerEnv
from garage.theano.algos.capg import CAPG
from garage.theano.envs import TheanoEnv
from garage.theano.baselines import GaussianMLPBaseline
from garage.theano.policies import GaussianMLPPolicy
from garage.misc.instrument import run_experiment
from garage.misc.ext import set_seed

for batchsize in [5000]:
    for learning_rate in [0.05, 0.01]:
        for i in range(3):
            seed = np.random.randint(1, 10000)
            env_name = "SGD_Swimmer_-t"
            hidden_sizes = (32, 32)
            env = TheanoEnv(normalize(SwimmerEnv()))
            policy = GaussianMLPPolicy(
                env_spec=env.spec, hidden_sizes=hidden_sizes)
            backup_policy = GaussianMLPPolicy(
                env.spec, hidden_sizes=hidden_sizes)
            mix_policy = GaussianMLPPolicy(env.spec, hidden_sizes=hidden_sizes)
            pos_eps_policy = GaussianMLPPolicy(
                env.spec, hidden_sizes=hidden_sizes)
            neg_eps_policy = GaussianMLPPolicy(
                env.spec, hidden_sizes=hidden_sizes)
            baseline = LinearFeatureBaseline(env_spec=env.spec)
            algo = CAPG(
                env=env,
                policy=policy,

import gym
import numpy as np

from garage.baselines import LinearFeatureBaseline
from garage.envs import normalize
from garage.envs.box2d import CartpoleEnv
from garage.envs.mujoco import SwimmerEnv
from garage.theano.algos.capg import CAPG
from garage.theano.envs import TheanoEnv
from garage.theano.baselines import GaussianMLPBaseline
from garage.theano.policies import GaussianMLPPolicy
from garage.misc.instrument import run_experiment
from garage.misc.ext import set_seed

for learning_rate in [0.01]:
    seed = np.random.randint(1, 10000)
    env_name = "SGD_Hopper"
    hidden_sizes = (32, 32)
    env = TheanoEnv(normalize(gym.make("InvertedPendulum-v2")))
    policy = GaussianMLPPolicy(env_spec=env.spec, hidden_sizes=hidden_sizes)
    backup_policy = GaussianMLPPolicy(env.spec, hidden_sizes=hidden_sizes)
    mix_policy = GaussianMLPPolicy(env.spec, hidden_sizes=hidden_sizes)
    pos_eps_policy = GaussianMLPPolicy(env.spec, hidden_sizes=hidden_sizes)
    neg_eps_policy = GaussianMLPPolicy(env.spec, hidden_sizes=hidden_sizes)
    baseline = LinearFeatureBaseline(env_spec=env.spec)
    algo = CAPG(
        env=env,
        policy=policy,
        backup_policy=backup_policy,
        mix_policy=mix_policy,
        pos_eps_policy=pos_eps_policy,
        neg_eps_policy=neg_eps_policy,

import gym
import numpy as np

from garage.baselines import LinearFeatureBaseline
from garage.baselines import ZeroBaseline
from garage.envs import normalize
from garage.envs.box2d import CartpoleEnv
from garage.envs.mujoco import SwimmerEnv
from garage.theano.algos.capg_corrected import CAPG
from garage.theano.envs import TheanoEnv
from garage.theano.policies import GaussianMLPPolicy
from garage.misc.instrument import run_experiment
from garage.misc.ext import set_seed

for i in range(15):
    seed = np.random.randint(1, 10000)
    env_name = "CAPG_Walker2d"
    hidden_sizes = (64, 64)
    env = TheanoEnv(normalize(gym.make("Walker2d-v2")))
    policy = GaussianMLPPolicy(env_spec=env.spec, hidden_sizes=hidden_sizes)
    backup_policy = GaussianMLPPolicy(env.spec, hidden_sizes=hidden_sizes)
    mix_policy = GaussianMLPPolicy(env.spec, hidden_sizes=hidden_sizes)
    pos_eps_policy = GaussianMLPPolicy(env.spec, hidden_sizes=hidden_sizes)
    neg_eps_policy = GaussianMLPPolicy(env.spec, hidden_sizes=hidden_sizes)
    baseline = LinearFeatureBaseline(env_spec=env.spec)
    algo = CAPG(
        env=env,
        policy=policy,
        backup_policy=backup_policy,
        mix_policy=mix_policy,
        pos_eps_policy=pos_eps_policy,
        neg_eps_policy=neg_eps_policy,
