def run_task(*_):
    """Train TRPO with a categorical policy on gym's CartPole-v0.

    Note that environments with different action spaces may require
    different policies: a Discrete action space works with a
    CategoricalMLPPolicy, while a Box action space needs e.g. a
    GaussianMLPPolicy (see the trpo_gym_pendulum.py example).
    """
    cartpole = TheanoEnv(normalize(gym.make("CartPole-v0")))
    categorical_policy = CategoricalMLPPolicy(
        env_spec=cartpole.spec, hidden_sizes=(32, 32))
    feature_baseline = LinearFeatureBaseline(env_spec=cartpole.spec)
    trpo = TRPO(
        env=cartpole,
        policy=categorical_policy,
        baseline=feature_baseline,
        batch_size=4000,
        max_path_length=cartpole.horizon,
        n_itr=50,
        discount=0.99,
        step_size=0.01,
        plot=True,  # live plotting of rollouts during training
    )
    trpo.train()
def run_task(v):
    """Train TRPO on the Theano cartpole env with a variant step size.

    Args:
        v: variant dict; ``v["step_size"]`` supplies the TRPO KL step.
    """
    cartpole = TheanoEnv(normalize(CartpoleEnv()))
    # The neural network policy has two hidden layers of 32 units each.
    gaussian_policy = GaussianMLPPolicy(
        env_spec=cartpole.spec,
        hidden_sizes=(32, 32))
    feature_baseline = LinearFeatureBaseline(env_spec=cartpole.spec)
    trpo = TRPO(
        env=cartpole,
        policy=gaussian_policy,
        baseline=feature_baseline,
        batch_size=4000,
        max_path_length=100,
        n_itr=40,
        discount=0.99,
        step_size=v["step_size"],
        plot=True,  # live plotting of rollouts during training
    )
    trpo.train()
def run_task(*_):
    """Train TRPO on the dm_control cartpole balance task."""
    balance_env = normalize(
        DmControlEnv(
            domain_name='cartpole',
            task_name='balance',
            visualize_reward=True))
    gaussian_policy = GaussianMLPPolicy(
        env_spec=balance_env.spec,
        hidden_sizes=(32, 32),
    )
    feature_baseline = LinearFeatureBaseline(env_spec=balance_env.spec)
    trpo = TRPO(
        env=balance_env,
        policy=gaussian_policy,
        baseline=feature_baseline,
        batch_size=4000,
        max_path_length=100,
        n_itr=400,
        discount=0.99,
        step_size=0.01,
        plot=True,
    )
    trpo.train()
def run_task(*_):
    """Run TRPO on a simulated Sawyer reacher task via ROS/MoveIt."""
    initial_goal = np.array([0.6, -0.1, 0.40])

    # Bring up moveit_commander and a ROS node before touching the robot env.
    moveit_commander.roscpp_initialize(sys.argv)
    rospy.init_node('trpo_sim_sawyer_reacher_exp', anonymous=True)

    reacher = ReacherEnv(
        initial_goal,
        initial_joint_pos=INITIAL_ROBOT_JOINT_POS,
        simulated=True)
    # Make sure the env is torn down cleanly when ROS shuts down.
    rospy.on_shutdown(reacher.shutdown)
    reacher.initialize()

    gaussian_policy = GaussianMLPPolicy(
        env_spec=spec(reacher), hidden_sizes=(32, 32))
    feature_baseline = LinearFeatureBaseline(env_spec=spec(reacher))
    trpo = TRPO(
        env=reacher,
        policy=gaussian_policy,
        baseline=feature_baseline,
        batch_size=4000,
        max_path_length=100,
        n_itr=100,
        discount=0.99,
        step_size=0.01,
        plot=False,
        force_batch_sampler=True,
    )
    trpo.train()
def test_issue_3():
    """Regression test for https://github.com/garage/garage/issues/3.

    The ``adaptive_std`` parameter of GaussianMLPPolicy was not
    functioning properly; a single training iteration exercises it.
    """
    env = CartpoleEnv()
    # Fix: pass the environment *spec* as ``env_spec`` (the original
    # passed the env object itself, inconsistent with every other
    # construction of GaussianMLPPolicy in this codebase).
    policy = GaussianMLPPolicy(env_spec=env.spec, adaptive_std=True)
    baseline = ZeroBaseline(env_spec=env.spec)
    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=100,
        n_itr=1)
    algo.train()
def test_trpo_deterministic_nan():
    """TRPO must not produce NaN params when the policy std is ~zero."""
    dummy_env = TheanoEnv(DummyEnv())
    tiny_policy = GaussianMLPPolicy(env_spec=dummy_env.spec, hidden_sizes=(1, ))
    # Clamp the log-std to log(1e-8) so the policy is near-deterministic,
    # which historically triggered NaNs during optimization.
    tiny_policy._l_log_std.param.set_value([np.float32(np.log(1e-8))])
    zero_baseline = ZeroBaseline(env_spec=dummy_env.spec)
    trpo = TRPO(
        env=dummy_env,
        policy=tiny_policy,
        baseline=zero_baseline,
        n_itr=10,
        batch_size=1000,
        max_path_length=100,
        step_size=0.01)
    trpo.train()
    assert not np.isnan(np.sum(tiny_policy.get_param_values()))
def test_trpo_relu_nan():
    """TRPO with a ReLU hidden nonlinearity must not produce NaN params."""
    dummy_env = TheanoEnv(DummyEnv())
    relu_policy = GaussianMLPPolicy(
        env_spec=dummy_env.spec,
        hidden_nonlinearity=naive_relu,
        hidden_sizes=(1, ))
    zero_baseline = ZeroBaseline(env_spec=dummy_env.spec)
    trpo = TRPO(
        env=dummy_env,
        policy=relu_policy,
        baseline=zero_baseline,
        n_itr=1,
        batch_size=1000,
        max_path_length=100,
        step_size=0.001)
    trpo.train()
    assert not np.isnan(np.sum(relu_policy.get_param_values()))
def run_pick_and_place(*_):
    """Run TRPO on the pick-and-place task."""
    initial_goal = np.array([0.6, -0.1, 0.80])
    pick_place_env = PickAndPlaceEnv(initial_goal)
    gaussian_policy = GaussianMLPPolicy(
        env_spec=spec(pick_place_env), hidden_sizes=(32, 32))
    feature_baseline = LinearFeatureBaseline(env_spec=spec(pick_place_env))
    trpo = TRPO(
        env=pick_place_env,
        policy=gaussian_policy,
        baseline=feature_baseline,
        batch_size=4000,
        max_path_length=2000,
        n_itr=1000,
        discount=0.99,
        step_size=0.01,
        plot=True,
        force_batch_sampler=True,
    )
    trpo.train()
def run_task(*_):
    """Train TRPO with a recurrent (GRU) policy on the cartpole env.

    Uses a conjugate-gradient optimizer with a finite-difference
    Hessian-vector-product approximation.
    """
    cartpole = TheanoEnv(normalize(CartpoleEnv()))
    gru_policy = GaussianGRUPolicy(env_spec=cartpole.spec)
    feature_baseline = LinearFeatureBaseline(env_spec=cartpole.spec)
    trpo = TRPO(
        env=cartpole,
        policy=gru_policy,
        baseline=feature_baseline,
        batch_size=4000,
        max_path_length=100,
        n_itr=10,
        discount=0.99,
        step_size=0.01,
        optimizer=ConjugateGradientOptimizer(
            hvp_approach=FiniteDifferenceHvp(base_eps=1e-5)))
    trpo.train()
def run_block_stacking(*_):
    """Run TRPO on the block-stacking task."""
    stacking_env = BlockStackingEnv()
    gaussian_policy = GaussianMLPPolicy(
        env_spec=spec(stacking_env), hidden_sizes=(32, 32))
    feature_baseline = LinearFeatureBaseline(env_spec=spec(stacking_env))
    trpo = TRPO(
        env=stacking_env,
        policy=gaussian_policy,
        baseline=feature_baseline,
        batch_size=4000,
        max_path_length=2000,
        n_itr=1000,
        discount=0.99,
        step_size=0.01,
        plot=True,
        force_batch_sampler=True,
    )
    trpo.train()
def run_task(*_):
    """Train TRPO with a categorical policy on gym's Acrobot-v1."""
    acrobot = normalize(gym.make("Acrobot-v1"))
    categorical_policy = CategoricalMLPPolicy(
        env_spec=spec(acrobot), hidden_sizes=(32, 32))
    feature_baseline = LinearFeatureBaseline(env_spec=spec(acrobot))
    trpo = TRPO(
        env=acrobot,
        policy=categorical_policy,
        baseline=feature_baseline,
        batch_size=4000,
        max_path_length=horizon(acrobot),  # cap rollouts at the env horizon
        n_itr=50,
        discount=0.99,
        step_size=0.01,
        plot=True,
    )
    trpo.train()
def run(*_):
    """Stub method for running trpo."""
    reacher = TheanoEnv(
        ReacherEnv(control_method='position_control', sparse_reward=False))
    gaussian_policy = GaussianMLPPolicy(
        env_spec=reacher.spec, hidden_sizes=(32, 32))
    feature_baseline = LinearFeatureBaseline(env_spec=reacher.spec)
    trpo = TRPO(
        env=reacher,
        policy=gaussian_policy,
        baseline=feature_baseline,
        batch_size=4000,
        max_path_length=100,
        n_itr=2500,
        discount=0.99,
        step_size=0.01,
        plot=True,
        force_batch_sampler=True,
    )
    trpo.train()
def run_task(v):
    """Train TRPO on the cartpole env with a variant step size.

    Args:
        v: variant dict; ``v["step_size"]`` supplies the TRPO KL step.
    """
    cartpole = normalize(CartpoleEnv())
    gaussian_policy = GaussianMLPPolicy(
        env_spec=cartpole.spec, hidden_sizes=(32, 32))
    feature_baseline = LinearFeatureBaseline(env_spec=cartpole.spec)
    trpo = TRPO(
        env=cartpole,
        policy=gaussian_policy,
        baseline=feature_baseline,
        batch_size=4000,
        max_path_length=100,
        n_itr=40,
        discount=0.99,
        step_size=v["step_size"],
        # plot=True,
    )
    trpo.train()
def run_task(*_):
    """Train TRPO on the Theano cartpole env for 1000 iterations."""
    cartpole = TheanoEnv(normalize(CartpoleEnv()))
    gaussian_policy = GaussianMLPPolicy(
        env_spec=cartpole.spec, hidden_sizes=(32, 32))
    feature_baseline = LinearFeatureBaseline(env_spec=cartpole.spec)
    trpo = TRPO(
        env=cartpole,
        policy=gaussian_policy,
        baseline=feature_baseline,
        batch_size=4000,
        max_path_length=100,
        n_itr=1000,
        discount=0.99,
        step_size=0.01,
        # Uncomment the line below (and pass plot=True through the
        # launcher) to enable live plotting.
        # plot=True
    )
    trpo.train()
def run_task(*_):
    """Train TRPO on a one-hot multi-task PR2 arm-clock environment."""
    multi_task_env = normalize(
        OneHotMultiTaskEnv(
            task_env_cls=PR2ArmClockEnv,
            task_args=TASK_ARGS,
            task_kwargs=TASK_KWARGS))
    gaussian_policy = GaussianMLPPolicy(
        env_spec=multi_task_env.spec, hidden_sizes=(32, 32))
    feature_baseline = LinearFeatureBaseline(env_spec=multi_task_env.spec)
    trpo = TRPO(
        env=multi_task_env,
        policy=gaussian_policy,
        baseline=feature_baseline,
        batch_size=4000,
        max_path_length=100,
        n_itr=400000000,  # effectively run until interrupted
        discount=0.99,
        step_size=0.01,
        plot=True,
    )
    trpo.train()
def run_task(*_):
    """Train TRPO on the PR2 arm environment."""
    pr2_arm = normalize(PR2ArmEnv())
    # The neural network policy has two hidden layers of 32 units each.
    gaussian_policy = GaussianMLPPolicy(
        env_spec=pr2_arm.spec,
        hidden_sizes=(32, 32))
    feature_baseline = LinearFeatureBaseline(env_spec=pr2_arm.spec)
    trpo = TRPO(
        env=pr2_arm,
        policy=gaussian_policy,
        baseline=feature_baseline,
        batch_size=4000,
        max_path_length=100,
        n_itr=100,
        discount=0.99,
        step_size=0.01,
        plot=True,
        # optimizer=ConjugateGradientOptimizer(
        #     hvp_approach=FiniteDifferenceHvp(base_eps=1e-5))
    )
    trpo.train()
from garage.algos import TRPO
from garage.baselines import LinearFeatureBaseline
from garage.envs import normalize
from garage.envs.point_env import PointEnv
from garage.policies import GaussianMLPPolicy
from garage.theano.envs import TheanoEnv

# Minimal TRPO example: train a Gaussian MLP policy on a normalized
# point environment, using all of TRPO's default hyperparameters.
env = TheanoEnv(normalize(PointEnv()))
policy = GaussianMLPPolicy(env_spec=env.spec)
baseline = LinearFeatureBaseline(env_spec=env.spec)
algo = TRPO(
    env=env,
    policy=policy,
    baseline=baseline,
)
algo.train()