def _policy_opt_input_values(self, samples_data):
    """Fill the policy optimization inputs with values from samples data.

    Args:
        samples_data (dict): Processed sample data.
            See metarl.tf.paths_to_tensors() for details.

    Returns:
        list(np.ndarray): Flatten policy optimization input values.

    """
    # One entry per policy state-info key, in the policy's key order.
    state_infos = []
    for key in self.policy.state_info_keys:
        state_infos.append(samples_data['agent_infos'][key])

    template = self._policy_opt_inputs
    field_values = {
        'obs_var': samples_data['observations'],
        'action_var': samples_data['actions'],
        'reward_var': samples_data['rewards'],
        'valid_var': samples_data['valids'],
        # Dual-related values are read from the algorithm instance rather
        # than from the samples.
        'feat_diff': self._feat_diff,
        'param_eta': self._param_eta,
        'param_v': self._param_v,
        'policy_state_info_vars_list': state_infos,
    }
    return flatten_inputs(template._replace(**field_values))
def _policy_opt_input_values(self, samples_data):
    """Fill the policy optimizer inputs with values from rollout samples.

    Args:
        samples_data (dict): Processed sample data.
            See metarl.tf.paths_to_tensors() for details.

    Returns:
        list(np.ndarray): Flatten policy optimization input values.

    """
    # One entry per policy state-info key, in the policy's key order.
    state_infos = []
    for key in self.policy.state_info_keys:
        state_infos.append(samples_data['agent_infos'][key])

    template = self._policy_opt_inputs
    field_values = {
        'obs_var': samples_data['observations'],
        'action_var': samples_data['actions'],
        'reward_var': samples_data['rewards'],
        'baseline_var': samples_data['baselines'],
        'valid_var': samples_data['valids'],
        'policy_state_info_vars_list': state_infos,
    }
    return flatten_inputs(template._replace(**field_values))
def _dual_opt_input_values(self, samples_data):
    """Fill the dual function optimization inputs from samples data.

    Args:
        samples_data (dict): Processed sample data.
            See process_samples() for details.

    Returns:
        list(np.ndarray): Flatten dual function optimization input values.

    """
    # One entry per policy state-info key, in the policy's key order.
    state_infos = []
    for key in self.policy.state_info_keys:
        state_infos.append(samples_data['agent_infos'][key])

    # One entry per distribution-info key of the policy distribution.
    old_dist_infos = []
    for key in self.policy.distribution.dist_info_keys:
        old_dist_infos.append(samples_data['agent_infos'][key])

    template = self._dual_opt_inputs
    field_values = {
        'reward_var': samples_data['rewards'],
        'valid_var': samples_data['valids'],
        # Dual-related values are read from the algorithm instance rather
        # than from the samples.
        'feat_diff': self._feat_diff,
        'param_eta': self._param_eta,
        'param_v': self._param_v,
        'policy_state_info_vars_list': state_infos,
        'policy_old_dist_info_vars_list': old_dist_infos,
    }
    return flatten_inputs(template._replace(**field_values))
def _build_entropy_term(self, i):
    """Build the policy entropy tensor and compile its evaluation function.

    The compiled function is stored in ``self._f_policy_entropy``.

    Args:
        i (namedtuple): Collection of variables to compute policy loss.

    Returns:
        tf.Tensor: Policy entropy.

    """
    dist = self.policy.distribution

    with tf.name_scope('policy_entropy'):
        if not self._use_neg_logli_entropy:
            entropy = dist.entropy()
        else:
            # Use the negative log-likelihood of the taken actions in
            # place of the distribution's entropy.
            entropy = -dist.log_prob(i.action_var,
                                     name='policy_log_likeli')

        # This prevents entropy from becoming negative for small policy std
        if self._use_softplus_entropy:
            entropy = tf.nn.softplus(entropy)

        if self._stop_entropy_gradient:
            entropy = tf.stop_gradient(entropy)

    # dense form, match the shape of advantage
    entropy = tf.reshape(entropy, [-1, self.max_path_length])

    self._f_policy_entropy = compile_function(
        flatten_inputs(self._policy_opt_inputs), entropy)

    return entropy
def init_opt(self):
    """Build the inputs and policy loss, then set up the optimizer."""
    loss_inputs, self._policy_opt_inputs, self._dual_opt_inputs = (
        self._build_inputs())

    policy_loss = self._build_policy_loss(loss_inputs)

    self._optimizer.update_opt(
        loss=policy_loss,
        target=self.policy,
        inputs=flatten_inputs(self._policy_opt_inputs))
def init_opt(self):
    """Initialize the optimizer with the policy loss and KL constraint."""
    loss_inputs, self._policy_opt_inputs = self._build_inputs()

    policy_loss, mean_kl = self._build_policy_loss(loss_inputs)

    # The mean KL is passed as a "less than or equal" constraint bounded
    # by the configured maximum KL step.
    self._optimizer.update_opt(
        loss=policy_loss,
        target=self.policy,
        leq_constraint=(mean_kl, self._max_kl_step),
        inputs=flatten_inputs(self._policy_opt_inputs),
        constraint_name='mean_kl')
def _build_policy_loss(self, i):
    """Build the policy loss tensor and compile the dual and KL functions.

    Besides returning the loss, this method (re)initializes the dual
    parameters ``self._param_eta`` and ``self._param_v`` and stores the
    compiled functions ``self._f_dual``, ``self._f_dual_grad`` and
    ``self._f_policy_kl``.

    Args:
        i (namedtuple): Collection of variables to compute policy loss.

    Returns:
        tf.Tensor: Policy loss.

    """
    pol_dist = self.policy.distribution

    # Initialize dual params
    self._param_eta = 15.
    self._param_v = np.random.rand(
        self._env_spec.observation_space.flat_dim * 2 + 4)

    with tf.name_scope('bellman_error'):
        delta_v = tf.boolean_mask(i.reward_var, i.valid_var) + tf.tensordot(
            i.feat_diff, i.param_v, 1)

    with tf.name_scope('policy_loss'):
        ll = pol_dist.log_prob(i.action_var)
        ll = tf.boolean_mask(ll, i.valid_var)
        # Log-likelihood weighted by exp(delta_v / eta); the maximum is
        # subtracted before exponentiating (presumably for numerical
        # stability).
        loss = -tf.reduce_mean(
            ll * tf.exp(delta_v / i.param_eta -
                        tf.reduce_max(delta_v / i.param_eta)))

        reg_params = self.policy.get_regularizable_vars()
        num_reg_params = len(reg_params)
        # Skip the L2 term when there is nothing to regularize: averaging
        # over zero parameters would be 0 / 0 and make the loss NaN.
        if num_reg_params:
            loss += self._l2_reg_loss * tf.reduce_sum(
                [tf.reduce_mean(tf.square(param))
                 for param in reg_params]) / num_reg_params

    with tf.name_scope('kl'):
        kl = self._old_policy.distribution.kl_divergence(pol_dist)
        pol_mean_kl = tf.reduce_mean(kl)

    with tf.name_scope('dual'):
        # The maximum subtracted inside the exponent is added back outside
        # the logarithm (scaled by eta).
        dual_loss = i.param_eta * self._epsilon + (
            i.param_eta * tf.math.log(
                tf.reduce_mean(
                    tf.exp(delta_v / i.param_eta -
                           tf.reduce_max(delta_v / i.param_eta)))) +
            i.param_eta * tf.reduce_max(delta_v / i.param_eta))

        dual_loss += self._l2_reg_dual * (tf.square(i.param_eta) +
                                          tf.square(1 / i.param_eta))

        dual_grad = tf.gradients(dual_loss, [i.param_eta, i.param_v])

    # yapf: disable
    self._f_dual = tensor_utils.compile_function(
        flatten_inputs(self._dual_opt_inputs),
        dual_loss,
        log_name='f_dual')
    # yapf: enable

    self._f_dual_grad = tensor_utils.compile_function(
        flatten_inputs(self._dual_opt_inputs),
        dual_grad,
        log_name='f_dual_grad')

    self._f_policy_kl = tensor_utils.compile_function(
        flatten_inputs(self._policy_opt_inputs),
        pol_mean_kl,
        log_name='f_policy_kl')

    return loss
def _build_policy_loss(self, i):
    """Build the policy loss and the mean KL, plus diagnostic callables.

    The diagnostic callables are stored in ``self._f_policy_kl``,
    ``self._f_rewards`` and ``self._f_returns``.

    Args:
        i (namedtuple): Collection of variables to compute policy loss.

    Returns:
        tf.Tensor: Policy loss.
        tf.Tensor: Mean policy KL divergence.

    """

    def _diagnostic(fetch):
        # Callable evaluating `fetch` in the default session, fed with the
        # flattened policy optimization inputs.
        return tf.compat.v1.get_default_session().make_callable(
            fetch, feed_list=flatten_inputs(self._policy_opt_inputs))

    entropy = self._build_entropy_term(i)

    rewards = i.reward_var
    if self._maximum_entropy:
        with tf.name_scope('augmented_rewards'):
            rewards = i.reward_var + (self._policy_ent_coeff * entropy)

    with tf.name_scope('policy_loss'):
        advantages = compute_advantages(self._discount,
                                        self._gae_lambda,
                                        self.max_path_length,
                                        i.baseline_var,
                                        rewards,
                                        name='adv')
        advantages = tf.reshape(advantages, [-1, self.max_path_length])

        # Optional advantage normalization.
        eps = tf.constant(1e-8, dtype=tf.float32)
        if self._center_adv:
            advantages = center_advs(advantages, axes=[0], eps=eps)
        if self._positive_adv:
            advantages = positive_advs(advantages, eps)

        with tf.name_scope('kl'):
            kl = self._old_policy.distribution.kl_divergence(
                self.policy.distribution)
            mean_kl = tf.reduce_mean(kl)

        # Vanilla objective: log-likelihood times advantage.
        with tf.name_scope('vanilla_loss'):
            log_likelihood = self.policy.distribution.log_prob(
                i.action_var, name='log_likelihood')
            vanilla = log_likelihood * advantages

        # Surrogate objective: likelihood ratio (new / old) times advantage.
        with tf.name_scope('surrogate_loss'):
            old_log_likelihood = self._old_policy.distribution.log_prob(
                i.action_var)
            ratio = tf.exp(log_likelihood - old_log_likelihood)
            surrogate = ratio * advantages

        # Select and finalize the objective.
        with tf.name_scope('loss'):
            if self._pg_loss == 'vanilla':
                # VPG uses the vanilla objective
                obj = tf.identity(vanilla, name='vanilla_obj')
            elif self._pg_loss == 'surrogate':
                # TRPO uses the standard surrogate objective
                obj = tf.identity(surrogate, name='surr_obj')
            elif self._pg_loss == 'surrogate_clip':
                clipped_ratio = tf.clip_by_value(ratio,
                                                 1 - self._lr_clip_range,
                                                 1 + self._lr_clip_range,
                                                 name='lr_clip')
                clipped_surrogate = clipped_ratio * advantages
                obj = tf.minimum(surrogate,
                                 clipped_surrogate,
                                 name='surr_obj')

            if self._entropy_regularzied:
                obj += self._policy_ent_coeff * entropy

            # Keep only the entries flagged valid.
            obj = tf.boolean_mask(obj, i.valid_var)

            # The objective is maximized by minimizing its negated mean.
            loss = -tf.reduce_mean(obj)

        # Diagnostic functions
        self._f_policy_kl = _diagnostic(mean_kl)
        self._f_rewards = _diagnostic(rewards)

        returns = discounted_returns(self._discount, self.max_path_length,
                                     rewards)
        self._f_returns = _diagnostic(returns)

        return loss, mean_kl
def _build_entropy_term(self, i):
    """Build the policy entropy tensor and compile its evaluation function.

    The compiled function is stored in ``self._f_policy_entropy``.

    Args:
        i (namedtuple): Collection of variables to compute policy loss.

    Returns:
        tf.Tensor: Policy entropy.

    """
    dist = self.policy.distribution

    with tf.name_scope('policy_entropy'):
        if self.policy.recurrent:
            dist_info = self.policy.dist_info_sym(
                i.obs_var,
                i.policy_state_info_vars,
                name='policy_dist_info_2')
            neg_log_likeli = -dist.log_likelihood_sym(
                i.action_var, dist_info, name='policy_log_likeli')

            if self._use_neg_logli_entropy:
                entropy = neg_log_likeli
            else:
                entropy = dist.entropy_sym(dist_info)
        else:
            dist_info_flat = self.policy.dist_info_sym(
                i.flat.obs_var,
                i.flat.policy_state_info_vars,
                name='policy_dist_info_flat_2')
            neg_log_likeli_flat = -dist.log_likelihood_sym(
                i.flat.action_var,
                dist_info_flat,
                name='policy_log_likeli_flat')

            dist_info_valid = filter_valids_dict(
                dist_info_flat,
                i.flat.valid_var,
                name='policy_dist_info_valid_2')
            neg_log_likeli_valid = -dist.log_likelihood_sym(
                i.valid.action_var,
                dist_info_valid,
                name='policy_log_likeli_valid')

            if self._maximum_entropy:
                # Maximum-entropy mode uses the flat (unfiltered) values
                # reshaped to one row per path.
                if self._use_neg_logli_entropy:
                    dense_source = neg_log_likeli_flat
                else:
                    dense_source = dist.entropy_sym(dist_info_flat)
                entropy = tf.reshape(dense_source,
                                     [-1, self.max_path_length])
            elif self._use_neg_logli_entropy:
                entropy = neg_log_likeli_valid
            else:
                entropy = dist.entropy_sym(dist_info_valid)

        # This prevents entropy from becoming negative for small policy std
        if self._use_softplus_entropy:
            entropy = tf.nn.softplus(entropy)

        if self._stop_entropy_gradient:
            entropy = tf.stop_gradient(entropy)

    self._f_policy_entropy = compile_function(
        flatten_inputs(self._policy_opt_inputs),
        entropy,
        log_name='f_policy_entropy')

    return entropy
def _build_policy_loss(self, i):
    """Build the policy loss and the mean KL, plus diagnostic functions.

    The diagnostic functions are stored in ``self._f_policy_kl``,
    ``self._f_rewards`` and ``self._f_returns``.

    Args:
        i (namedtuple): Collection of variables to compute policy loss.

    Returns:
        tf.Tensor: Policy loss.
        tf.Tensor: Mean policy KL divergence.

    """

    def _diagnostic(fetch, label):
        # Compiled function evaluating `fetch` from the flattened policy
        # optimization inputs.
        return compile_function(flatten_inputs(self._policy_opt_inputs),
                                fetch,
                                log_name=label)

    dist = self.policy.distribution
    recurrent = self.policy.recurrent
    entropy = self._build_entropy_term(i)

    rewards = i.reward_var
    if self._maximum_entropy:
        with tf.name_scope('augmented_rewards'):
            rewards = i.reward_var + (self._policy_ent_coeff * entropy)

    with tf.name_scope('policy_loss'):
        adv = compute_advantages(self.discount,
                                 self.gae_lambda,
                                 self.max_path_length,
                                 i.baseline_var,
                                 rewards,
                                 name='adv')
        adv_flat = flatten_batch(adv, name='adv_flat')
        adv_valid = filter_valids(adv_flat,
                                  i.flat.valid_var,
                                  name='adv_valid')

        if recurrent:
            adv = tf.reshape(adv, [-1, self.max_path_length])

        # Optional advantage normalization. The recurrent path normalizes
        # the per-path advantages, the other path the valid-only ones.
        eps = tf.constant(1e-8, dtype=tf.float32)
        if recurrent:
            if self.center_adv:
                adv = center_advs(adv, axes=[0], eps=eps)
            if self.positive_adv:
                adv = positive_advs(adv, eps)
        else:
            if self.center_adv:
                adv_valid = center_advs(adv_valid, axes=[0], eps=eps)
            if self.positive_adv:
                adv_valid = positive_advs(adv_valid, eps)

        # Pick the tensors each mode works on: recurrent policies use the
        # full inputs and mask with valid_var afterwards; otherwise only
        # the valid entries are used.
        if recurrent:
            dist_info = self.policy.dist_info_sym(
                i.obs_var, i.policy_state_info_vars, name='policy_dist_info')
            actions = i.action_var
            old_dist_info = i.policy_old_dist_info_vars
            advantages = adv
        else:
            dist_info_flat = self.policy.dist_info_sym(
                i.flat.obs_var,
                i.flat.policy_state_info_vars,
                name='policy_dist_info_flat')
            dist_info = filter_valids_dict(dist_info_flat,
                                           i.flat.valid_var,
                                           name='policy_dist_info_valid')
            actions = i.valid.action_var
            old_dist_info = i.valid.policy_old_dist_info_vars
            advantages = adv_valid

        # KL divergence between the old and the current policy.
        with tf.name_scope('kl'):
            kl = dist.kl_sym(old_dist_info, dist_info)
            if recurrent:
                mean_kl = tf.reduce_sum(
                    kl * i.valid_var) / tf.reduce_sum(i.valid_var)
            else:
                mean_kl = tf.reduce_mean(kl)

        # Vanilla objective: log-likelihood times advantage.
        with tf.name_scope('vanilla_loss'):
            log_likelihood = dist.log_likelihood_sym(actions,
                                                     dist_info,
                                                     name='log_likelihood')
            vanilla = log_likelihood * advantages
            if recurrent:
                vanilla = vanilla * i.valid_var

        # Surrogate objective: likelihood ratio times advantage.
        with tf.name_scope('surrogate_loss'):
            ratio = dist.likelihood_ratio_sym(actions,
                                              old_dist_info,
                                              dist_info,
                                              name='lr')
            surrogate = ratio * advantages
            if recurrent:
                surrogate = surrogate * i.valid_var

        # Select and finalize the objective.
        with tf.name_scope('loss'):
            if self._pg_loss == 'vanilla':
                # VPG uses the vanilla objective
                obj = tf.identity(vanilla, name='vanilla_obj')
            elif self._pg_loss == 'surrogate':
                # TRPO uses the standard surrogate objective
                obj = tf.identity(surrogate, name='surr_obj')
            elif self._pg_loss == 'surrogate_clip':
                clipped_ratio = tf.clip_by_value(ratio,
                                                 1 - self._lr_clip_range,
                                                 1 + self._lr_clip_range,
                                                 name='lr_clip')
                clipped_surrogate = clipped_ratio * advantages
                if recurrent:
                    clipped_surrogate = clipped_surrogate * i.valid_var
                obj = tf.minimum(surrogate,
                                 clipped_surrogate,
                                 name='surr_obj')

            if self._entropy_regularzied:
                obj += self._policy_ent_coeff * entropy

            # The objective is maximized by minimizing its negated mean;
            # the recurrent path averages over the valid entries only.
            if recurrent:
                loss = -tf.reduce_sum(obj) / tf.reduce_sum(i.valid_var)
            else:
                loss = -tf.reduce_mean(obj)

        # Diagnostic functions
        self._f_policy_kl = _diagnostic(mean_kl, 'f_policy_kl')
        self._f_rewards = _diagnostic(rewards, 'f_rewards')

        returns = discounted_returns(self.discount, self.max_path_length,
                                     rewards)
        self._f_returns = _diagnostic(returns, 'f_returns')

        return loss, mean_kl