# abstract way allows us to delegate to the environment for handling
# the correct data type for the variable. For instance, for an
# environment with discrete observations, we might want to use integer
# types if the observations are represented as one-hot vectors.
observations_var = env.observation_space.new_tensor_variable(
    'observations',
    # It should have 1 extra dimension since we want to represent a list of
    # observations
    extra_dims=1)
actions_var = env.action_space.new_tensor_variable('actions', extra_dims=1)
advantages_var = TT.vector('advantages')

# policy.dist_info_sym returns a dictionary, whose values are symbolic
# expressions for quantities related to the distribution of the actions. For a
# Gaussian policy, it contains the mean and (log) standard deviation.
dist_info_vars = policy.dist_info_sym(observations_var)

# policy.distribution returns a distribution object under
# garage.distributions. It contains many utilities for computing
# distribution-related quantities, given the computed dist_info_vars.
# Below we use dist.log_likelihood_sym to compute the symbolic
# log-likelihood. For this example, the corresponding distribution is
# an instance of the class garage.distributions.DiagonalGaussian
dist = policy.distribution

# Note that we negate the objective, since most optimizers assume a
# minimization problem
surr = -TT.mean(
    dist.log_likelihood_sym(actions_var, dist_info_vars) * advantages_var)

# Get the list of trainable parameters.
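# --- Sketch of the step hinted at by the comment above (added for
# illustration, not taken verbatim from the source). It assumes the
# rllab/garage-style Theano policy API (policy.get_params(trainable=True))
# and that theano / TT are already imported as in the surrounding code.
# The plain-SGD update rule and the learning rate are illustrative
# assumptions only.
params = policy.get_params(trainable=True)
grads = theano.grad(surr, params)
learning_rate = 0.01  # hypothetical step size

# Compile a training function that takes a batch of observations, actions
# and advantages and applies one gradient step to the policy parameters.
f_train = theano.function(
    inputs=[observations_var, actions_var, advantages_var],
    outputs=surr,
    updates=[(p, p - learning_rate * g) for p, g in zip(params, grads)],
    allow_input_downcast=True)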
env = TheanoEnv(normalize(SwimmerEnv()))

# The current policy plus auxiliary copies of the same architecture used by
# the algorithm: a backup of the old policy (used for the KL term below) and
# additional helper policies.
policy = GaussianMLPPolicy(env_spec=env.spec, hidden_sizes=hidden_sizes)
backup_policy = GaussianMLPPolicy(env.spec, hidden_sizes=hidden_sizes)
mix_policy = GaussianMLPPolicy(env.spec, hidden_sizes=hidden_sizes)
pos_eps_policy = GaussianMLPPolicy(env.spec, hidden_sizes=hidden_sizes)
neg_eps_policy = GaussianMLPPolicy(env.spec, hidden_sizes=hidden_sizes)

# Symbolic inputs for a batch of observations, actions and per-step rewards.
observations_var = env.observation_space.new_tensor_variable(
    'observations', extra_dims=1)
actions_var = env.action_space.new_tensor_variable('actions', extra_dims=1)
rewards_var = tensor_utils.new_tensor(
    'rewards', ndim=1, dtype=theano.config.floatX)

dist = policy.distribution
dist_info_vars = policy.dist_info_sym(observations_var)
old_dist_info_vars = backup_policy.dist_info_sym(observations_var)

# KL divergence between the backup (old) policy and the current policy.
kl = dist.kl_sym(old_dist_info_vars, dist_info_vars)
mean_kl = TT.mean(kl)
max_kl = TT.max(kl)

# For testing: per-step and cumulative log-likelihoods of the sampled actions,
# and a surrogate objective that weights the cumulative terms by the rewards.
surr_ll = dist.log_likelihood_sym(actions_var, dist_info_vars)
surr_ll_cumsum = dist.log_likelihood_sym_cumsum(actions_var, dist_info_vars)
surr = TT.sum(surr_ll_cumsum * rewards_var)

# Compiled helpers for inspecting the log-likelihood terms.
f_surr_ll = theano.function(
    inputs=[observations_var, actions_var], outputs=surr_ll)
f_surr_ll_cumsum = theano.function(
    inputs=[observations_var, actions_var], outputs=surr_ll_cumsum)
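# --- Usage sketch (an assumption added for illustration, not part of the
# original source): evaluating the compiled helpers on a dummy trajectory.
# The horizon T, the dummy zero arrays, and the use of flat_dim to size them
# follow the rllab/garage space API; numpy is imported here for the example.
import numpy as np

T = 100  # hypothetical trajectory length
dummy_obs = np.zeros((T, env.observation_space.flat_dim),
                     dtype=theano.config.floatX)
dummy_acts = np.zeros((T, env.action_space.flat_dim),
                      dtype=theano.config.floatX)

ll = f_surr_ll(dummy_obs, dummy_acts)                # per-step log-likelihoods, shape (T,)
ll_cumsum = f_surr_ll_cumsum(dummy_obs, dummy_acts)  # running sums along the trajectory, shape (T,)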