import unittest

import numpy as np

# The import paths and the TestTRPO wrapper below are assumptions: the
# methods take `self`, so they presumably live in a unittest.TestCase, and
# the module paths are inferred from the garage Theano tree used elsewhere
# in this document. DummyEnv is a fixture from the test suite; a minimal
# sketch follows the class.
from garage.baselines import ZeroBaseline
from garage.theano.algos import TRPO
from garage.theano.envs import TheanoEnv
from garage.theano.policies import GaussianMLPPolicy


class TestTRPO(unittest.TestCase):
    def test_trpo_relu_nan(self):
        env = TheanoEnv(DummyEnv())
        policy = GaussianMLPPolicy(env_spec=env.spec, hidden_sizes=(1, ))
        baseline = ZeroBaseline(env_spec=env.spec)
        algo = TRPO(
            env=env,
            policy=policy,
            baseline=baseline,
            n_itr=1,
            batch_size=1000,
            max_path_length=100,
            step_size=0.001)
        algo.train()
        # A training iteration must leave the policy parameters finite.
        assert not np.isnan(np.sum(policy.get_param_values()))

    def test_trpo_deterministic_nan(self):
        env = TheanoEnv(DummyEnv())
        policy = GaussianMLPPolicy(env_spec=env.spec, hidden_sizes=(1, ))
        # Make the policy nearly deterministic by forcing the log-std down
        # to log(1e-8); TRPO must remain numerically stable regardless.
        policy._l_log_std.param.set_value([np.float32(np.log(1e-8))])
        baseline = ZeroBaseline(env_spec=env.spec)
        algo = TRPO(
            env=env,
            policy=policy,
            baseline=baseline,
            n_itr=10,
            batch_size=1000,
            max_path_length=100,
            step_size=0.01)
        algo.train()
        assert not np.isnan(np.sum(policy.get_param_values()))
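# Both tests construct a DummyEnv that is not defined in this excerpt. The
# sketch below is a minimal stand-in, assuming the rllab-style Env/Step
# interface that garage's Theano branch inherits; the import paths, spaces,
# and dynamics are placeholders, not the suite's actual fixture. The name
# test_trpo_relu_nan also suggests the first test originally passed a ReLU
# hidden_nonlinearity to GaussianMLPPolicy, which this excerpt has lost.

import numpy as np

from garage.envs.base import Env, Step  # paths assumed
from garage.spaces import Box


class DummyEnv(Env):
    """Minimal one-dimensional continuous environment for smoke tests."""

    @property
    def observation_space(self):
        return Box(low=-np.inf, high=np.inf, shape=(1, ))

    @property
    def action_space(self):
        return Box(low=-5.0, high=5.0, shape=(1, ))

    def reset(self):
        return np.zeros(1)

    def step(self, action):
        # Terminate immediately with a random reward; this is enough to
        # exercise the sampler and optimizer without learning anything.
        return Step(observation=np.zeros(1),
                    reward=np.random.normal(),
                    done=True)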
from garage.envs import normalize
from garage.envs.box2d import CartpoleEnv
from garage.theano.envs import TheanoEnv
from garage.theano.policies import GaussianMLPPolicy
from garage.sampler import parallel_sampler

# normalize() makes sure that the actions for the environment lie within the
# range [-1, 1] (only works for environments with continuous actions).
env = TheanoEnv(normalize(CartpoleEnv()))

# Initialize a neural network policy with a single hidden layer of 8 hidden
# units.
policy = GaussianMLPPolicy(env.spec, hidden_sizes=(8, ))

# Set up the parallel sampler and draw example paths with the current
# policy parameters.
parallel_sampler.populate_task(env, policy)
parallel_sampler.initialize(10)
paths = parallel_sampler.sample_paths(policy.get_param_values(), 100)

# We will collect 100 trajectories per iteration
N = 100
# Each trajectory will have at most 100 time steps
T = 100
# Number of iterations
n_itr = 100
# Set the discount factor for the problem
discount = 0.99
# Learning rate for the gradient update
learning_rate = 0.01

# Construct the computation graph

# Create a Theano variable for storing the observations. We could have simply
# written `observations_var = TT.matrix('observations')` instead for this
# example.
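# The excerpt stops as the computation graph is being set up. Below is a
# minimal sketch of how the construction might continue, assuming the
# rllab-style `new_tensor_variable` spec API and a plain REINFORCE surrogate
# loss; these are carried over from the surrounding tutorial style, not
# verbatim from this document.

import theano
import theano.tensor as TT

# Delegating variable creation to the spaces lets the environment pick the
# correct dtype and shape; extra_dims=1 adds a leading batch dimension so a
# whole list of observations/actions fits in one variable.
observations_var = env.observation_space.new_tensor_variable(
    'observations', extra_dims=1)
actions_var = env.action_space.new_tensor_variable('actions', extra_dims=1)
returns_var = TT.vector('returns')

# Symbolic distribution parameters (mean and log-std for a Gaussian policy)
# at the given observations.
dist_info_vars = policy.dist_info_sym(observations_var)
dist = policy.distribution

# Negated REINFORCE surrogate: differentiating -E[log pi(a|s) * R] with
# respect to the policy parameters yields the vanilla policy-gradient
# estimator.
surr = -TT.mean(
    dist.log_likelihood_sym(actions_var, dist_info_vars) * returns_var)

params = policy.get_params(trainable=True)
grads = theano.grad(surr, params)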