Example #1
import numpy as np

# Assumed imports: the module paths follow the garage Theano tree and may
# vary across versions. DummyEnv is a fixture from the test suite itself.
from garage.baselines import ZeroBaseline
from garage.theano.algos import TRPO
from garage.theano.envs import TheanoEnv
from garage.theano.policies import GaussianMLPPolicy


def test_trpo_relu_nan(self):
    # Train TRPO for a single iteration on a dummy environment with a tiny
    # (one-hidden-unit) policy and check that no parameter became NaN.
    env = TheanoEnv(DummyEnv())
    policy = GaussianMLPPolicy(env_spec=env.spec, hidden_sizes=(1, ))
    baseline = ZeroBaseline(env_spec=env.spec)
    algo = TRPO(env=env,
                policy=policy,
                baseline=baseline,
                n_itr=1,
                batch_size=1000,
                max_path_length=100,
                step_size=0.001)
    algo.train()
    assert not np.isnan(np.sum(policy.get_param_values()))
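The final assertion relies on NaN propagating through arithmetic: if any single entry of the flattened parameter vector is NaN, the sum is NaN too, so one `np.isnan` check covers the whole policy. A minimal standalone illustration:

import numpy as np

# NaN propagates through a sum, so isnan(sum(...)) detects a NaN anywhere
# in the vector with a single check.
params = np.array([0.5, np.nan, 1.0])
assert np.isnan(np.sum(params))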
Example #2
# (Imports as in Example #1.)
def test_trpo_deterministic_nan(self):
    # Clamp the policy's log standard deviation to log(1e-8) so the Gaussian
    # policy is effectively deterministic, then check that TRPO still keeps
    # the parameters finite after ten iterations.
    env = TheanoEnv(DummyEnv())
    policy = GaussianMLPPolicy(env_spec=env.spec, hidden_sizes=(1, ))
    policy._l_log_std.param.set_value([np.float32(np.log(1e-8))])
    baseline = ZeroBaseline(env_spec=env.spec)
    algo = TRPO(env=env,
                policy=policy,
                baseline=baseline,
                n_itr=10,
                batch_size=1000,
                max_path_length=100,
                step_size=0.01)
    algo.train()
    assert not np.isnan(np.sum(policy.get_param_values()))
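Setting `log_std` to `log(1e-8)` collapses the Gaussian's samples onto its mean, which is the numerically fragile regime this test probes. A quick numpy sketch of that effect, using the standard reparameterized sampling form `mean + std * noise` for a Gaussian policy:

import numpy as np

# With std = 1e-8, sampled actions are indistinguishable from the mean.
mean, std = 0.3, np.exp(np.log(1e-8))
actions = mean + std * np.random.randn(1000)
assert np.allclose(actions, mean, atol=1e-6)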
Example #3
from garage.envs import normalize
from garage.envs.box2d import CartpoleEnv
from garage.theano.envs import TheanoEnv
from garage.theano.policies import GaussianMLPPolicy
from garage.sampler import parallel_sampler

# normalize() makes sure that the actions for the environment lie within the
# range [-1, 1] (it only works for environments with continuous actions)
env = TheanoEnv(normalize(CartpoleEnv()))
# Initialize a neural network policy with a single hidden layer of 8 hidden
# units
policy = GaussianMLPPolicy(env.spec, hidden_sizes=(8, ))
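# Quick sanity check of the policy's output. The return structure
# (action, dict of distribution parameters) is assumed here from the
# rllab/garage Gaussian policy convention.
obs = env.reset()
action, agent_info = policy.get_action(obs)
print(action, agent_info["mean"], agent_info["log_std"])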
# Set up the parallel sampler: hand the workers their own copies of the
# environment and policy, and start 10 worker processes.
parallel_sampler.populate_task(env, policy)
parallel_sampler.initialize(10)
# Sample trajectories with the current policy parameters, collecting up to
# 100 samples in total.
paths = parallel_sampler.sample_paths(policy.get_param_values(), 100)
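# Each element of `paths` is a dictionary of per-timestep arrays; the key
# names below are assumed from the rllab/garage sampler convention.
path = paths[0]
print(path["observations"].shape)  # (T, obs_dim)
print(path["actions"].shape)       # (T, action_dim)
print(path["rewards"].shape)       # (T,)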
# We will collect 100 trajectories per iteration
N = 100
# Each trajectory will have at most 100 time steps
T = 100
# Number of iterations
n_itr = 100
# Set the discount factor for the problem
discount = 0.99
# Learning rate for the gradient update
learning_rate = 0.01
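# A minimal sketch (not part of the original script) of the discounted
# return these hyperparameters imply: R_t = sum_k discount**k * r_{t+k},
# computed backwards over a path in O(T).
def discount_cumsum(rewards, discount):
    returns = [0.0] * len(rewards)
    running = 0.0
    for t in reversed(range(len(rewards))):
        running = rewards[t] + discount * running
        returns[t] = running
    return returns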

# Construct the computation graph

# Create a Theano variable for storing the observations. We could have simply
# written `observations_var = TT.matrix('observations')` instead for this