def _build_ppo_loss(self):
    self.policy_optimizer = tf.train.AdamOptimizer(
        learning_rate=self._input_ph['lr'],
    )

    for i in range(self.params.num_subtasks):
        # clipped-surrogate PPO objective, built separately for each subtask
        self._update_operator['pol_loss_unclipped_{}'.format(TASKS(i))] = \
            -self._tensor['ratio_{}'.format(TASKS(i))] * \
            tf.reshape(self._input_ph['advantage'], [-1])
        self._update_operator['pol_loss_clipped_{}'.format(TASKS(i))] = \
            -self._tensor['ratio_clipped_{}'.format(TASKS(i))] * \
            tf.reshape(self._input_ph['advantage'], [-1])

        self._update_operator['surr_loss_{}'.format(TASKS(i))] = tf.reduce_mean(
            tf.maximum(self._update_operator['pol_loss_unclipped_{}'.format(TASKS(i))],
                       self._update_operator['pol_loss_clipped_{}'.format(TASKS(i))])
        )
        self._update_operator['loss_{}'.format(TASKS(i))] = \
            self._update_operator['surr_loss_{}'.format(TASKS(i))]

        # if self.params.use_kl_penalty:
        #     self._update_operator['kl_pen'] = \
        #         self._input_ph['kl_lambda'] * \
        #         self._tensor['kl']
        #     self._update_operator['kl_loss'] = self._kl_eta * \
        #         tf.square(tf.maximum(0., self._tensor['kl'] -
        #                              2. * self.params.target_kl))
        #     self._update_operator['loss'] += \
        #         self._update_operator['kl_pen'] + \
        #         self._update_operator['kl_loss']

        if self.params.use_weight_decay:
            self._update_operator['weight_decay_loss_{}'.format(TASKS(i))] = \
                tf_util.l2_loss(self._tensor['variable_list_{}'.format(TASKS(i))])
            self._update_operator['loss_{}'.format(TASKS(i))] += \
                self._update_operator['weight_decay_loss_{}'.format(TASKS(i))] * \
                self.params.weight_decay_coefficient

        # sparsity penalties on the policy masks: keep this subtask's mask
        # decorrelated from the other subtasks' masks, and keep it small
        self._update_operator['sparse_correlation_loss_{}'.format(TASKS(i))] = \
            tf_util.correlation_loss(
                self._tensor['policy_masks_{}'.format(TASKS(i))],
                [self._tensor['policy_masks_{}'.format(TASKS(j))]
                 for j in range(self.params.num_subtasks) if j != i],
                apply_sigmoid=True)
        self._update_operator['sparse_mask_loss_{}'.format(TASKS(i))] = \
            tf_util.l2_loss(self._tensor['policy_masks_{}'.format(TASKS(i))],
                            apply_sigmoid=True)
        self._update_operator['surr_loss_{}'.format(TASKS(i))] += \
            self._update_operator['sparse_correlation_loss_{}'.format(TASKS(i))] * \
            self.params.correlation_coefficient + \
            self._update_operator['sparse_mask_loss_{}'.format(TASKS(i))] * \
            self.params.mask_penalty

        self._update_operator['update_op_{}'.format(TASKS(i))] = \
            self.policy_optimizer.minimize(
                self._update_operator['surr_loss_{}'.format(TASKS(i))])
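# ---------------------------------------------------------------------------
# Hedged sketch: the two mask penalties above come from the project's tf_util
# module, which is not shown in this file. The helpers below are only a
# hypothetical illustration of what such functions could compute (names and
# exact formulas are assumptions, not the repo's implementation): the
# correlation term discourages a subtask's sigmoid-activated mask from
# overlapping with the other subtasks' masks, and the l2 term pushes the mask
# itself toward zero, i.e. toward sparsity. Assumes a single mask tensor per
# task and that tensorflow is already imported as tf at the top of the module.
# ---------------------------------------------------------------------------
def _sketch_correlation_loss(mask, other_masks, apply_sigmoid=False):
    # assumed behaviour: mean elementwise overlap with every other task's mask
    if apply_sigmoid:
        mask = tf.sigmoid(mask)
        other_masks = [tf.sigmoid(m) for m in other_masks]
    return tf.add_n([tf.reduce_mean(mask * other) for other in other_masks])


def _sketch_mask_l2_loss(mask, apply_sigmoid=False):
    # assumed behaviour: mean squared magnitude of the (activated) mask
    if apply_sigmoid:
        mask = tf.sigmoid(mask)
    return tf.reduce_mean(tf.square(mask))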
def _build_sac_loss(self):
    # advantage-weighted log-probability objective
    self._update_operator['loss'] = \
        tf.reduce_mean(self._tensor['log_p_n'] - self._input_ph['advantage'])

    if self.params.use_weight_decay:
        self._update_operator['weight_decay_loss'] = \
            tf_util.l2_loss(self._trainable_var_list)
        self._update_operator['loss'] += \
            self._update_operator['weight_decay_loss'] * \
            self.params.weight_decay_coefficient

    # gradients of the loss w.r.t. each masked weight matrix, keyed by the
    # corresponding sparse-mask placeholder
    self._update_operator['policy_gradients'] = {
        self._input_ph['policy_sparse_masks'][i]:
            tf.gradients(self._update_operator['loss'], self._MLP._w[i])
        for i in range(len(self._input_ph['policy_sparse_masks']))}

    self._update_operator['update_op'] = tf.train.AdamOptimizer(
        learning_rate=self._input_ph['lr'],
        # beta1=0.5, beta2=0.99, epsilon=1e-4
    ).minimize(self._update_operator['loss'])

    # returns the current parameter values as one flat vector
    self._update_operator['get_flat_param'] = \
        tf_util.GetFlat(self._session, self._trainable_var_list)  # deprecated, don't use
    # call this to set parameter values from a flat vector
    self._update_operator['set_from_flat_param'] = \
        tf_util.SetFromFlat(self._session, self._trainable_var_list)
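# ---------------------------------------------------------------------------
# Hedged sketch: GetFlat / SetFromFlat are helpers from the project's tf_util
# module (not shown here) that read and write all trainable variables as one
# flat vector. The classes below only illustrate the assumed behaviour and
# calling convention; they are not the repo's implementation.
# ---------------------------------------------------------------------------
class _SketchGetFlat(object):
    """Assumed behaviour: return every variable in var_list as one flat array."""

    def __init__(self, session, var_list):
        self._session = session
        self._flat_op = tf.concat([tf.reshape(v, [-1]) for v in var_list], axis=0)

    def __call__(self):
        return self._session.run(self._flat_op)


class _SketchSetFromFlat(object):
    """Assumed behaviour: split a flat vector back into the variables' shapes."""

    def __init__(self, session, var_list):
        self._session = session
        shapes = [v.get_shape().as_list() for v in var_list]
        sizes = [v.get_shape().num_elements() for v in var_list]
        self._theta_ph = tf.placeholder(tf.float32, [sum(sizes)])
        assigns = [
            tf.assign(var, tf.reshape(flat, shape))
            for var, flat, shape in
            zip(var_list, tf.split(self._theta_ph, sizes), shapes)
        ]
        self._assign_op = tf.group(*assigns)

    def __call__(self, theta):
        self._session.run(self._assign_op, feed_dict={self._theta_ph: theta})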
def _build_ppo_loss(self):
    self._update_operator['pol_loss_unclipped'] = \
        -self._tensor['ratio'] * \
        tf.reshape(self._input_ph['advantage'], [-1])
    self._update_operator['pol_loss_clipped'] = \
        -self._tensor['ratio_clipped'] * \
        tf.reshape(self._input_ph['advantage'], [-1])

    self._update_operator['surr_loss'] = tf.reduce_mean(
        tf.maximum(self._update_operator['pol_loss_unclipped'],
                   self._update_operator['pol_loss_clipped'])
    )
    self._update_operator['loss'] = self._update_operator['surr_loss']

    # if self.params.use_kl_penalty:
    #     self._update_operator['kl_pen'] = \
    #         self._input_ph['kl_lambda'] * \
    #         self._tensor['kl']
    #     self._update_operator['kl_loss'] = self._kl_eta * \
    #         tf.square(tf.maximum(0., self._tensor['kl'] -
    #                              2. * self.params.target_kl))
    #     self._update_operator['loss'] += \
    #         self._update_operator['kl_pen'] + \
    #         self._update_operator['kl_loss']

    if self.params.use_weight_decay:
        self._update_operator['weight_decay_loss'] = \
            tf_util.l2_loss(self._trainable_var_list)
        self._update_operator['loss'] += \
            self._update_operator['weight_decay_loss'] * \
            self.params.weight_decay_coefficient

    self._update_operator['update_op'] = tf.train.AdamOptimizer(
        learning_rate=self._input_ph['lr'],
        # beta1=0.5, beta2=0.99, epsilon=1e-4
    ).minimize(self._update_operator['surr_loss'])

    # returns the current parameter values as one flat vector
    self._update_operator['get_flat_param'] = \
        tf_util.GetFlat(self._session, self._trainable_var_list)
    # call this to set parameter values from a flat vector
    self._update_operator['set_from_flat_param'] = \
        tf_util.SetFromFlat(self._session, self._trainable_var_list)
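# ---------------------------------------------------------------------------
# Hedged sketch: the 'ratio' and 'ratio_clipped' tensors consumed above are
# built elsewhere in this class. The snippet below shows the standard PPO
# construction they presumably follow (the argument names and the default
# clip value are assumptions for illustration, not the repo's actual code):
#
#   ratio         = exp(log pi_new(a|s) - log pi_old(a|s))
#   ratio_clipped = clip(ratio, 1 - epsilon, 1 + epsilon)
# ---------------------------------------------------------------------------
def _sketch_build_ppo_ratio(log_p_new, log_p_old, clip_epsilon=0.2):
    # probability ratio between the current and the behaviour policy
    ratio = tf.exp(log_p_new - log_p_old)
    # clipped ratio used by the pessimistic max(unclipped, clipped) objective
    ratio_clipped = tf.clip_by_value(ratio, 1.0 - clip_epsilon, 1.0 + clip_epsilon)
    return ratio, ratio_clipped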
def _build_value_network_and_loss(self):
    # build the placeholders for training the value function
    self._input_ph['value_target'] = \
        tf.placeholder(tf.float32, [None, 1], name='value_target')
    self._input_ph['old_values'] = \
        tf.placeholder(tf.float32, [None, 1], name='old_value_est')

    # build the baseline value-function network
    network_shape = [self._observation_size] + \
        self.params.value_network_shape + [1]
    num_layer = len(network_shape) - 1
    act_type = \
        [self.params.value_activation_type] * (num_layer - 1) + [None]
    norm_type = \
        [self.params.value_normalizer_type] * (num_layer - 1) + [None]
    init_data = []
    for _ in range(num_layer):
        init_data.append(
            {'w_init_method': 'normc', 'w_init_para': {'stddev': 1.0},
             'b_init_method': 'constant', 'b_init_para': {'val': 0.0}}
        )

    if self.params.use_subtask_value:
        self._value_MLP = network_util.SparseMultitaskMLP(
            dims=network_shape, scope='value_mlp', train=True,
            activation_type=act_type, normalizer_type=norm_type,
            init_data=init_data, linear_last_layer=True,
            num_tasks=self.params.num_subtasks
        )
        for i in range(self.params.num_subtasks):
            self._tensor['value_weights_{}'.format(TASKS(i))] = self._value_MLP._w[i]
            self._tensor['value_masks_{}'.format(TASKS(i))] = self._value_MLP._sparse_mask[i]
        self._tensor['value_b'] = self._value_MLP._b

        output = self._value_MLP(self._tensor['net_input'])

        self.value_optimizer = tf.train.AdamOptimizer(
            learning_rate=self.params.value_lr,
            beta1=0.5, beta2=0.99, epsilon=1e-4
        )

        for i in range(self.params.num_subtasks):
            self._tensor['pred_value_{}'.format(TASKS(i))] = output[i]
            self._update_operator['vf_loss_{}'.format(TASKS(i))] = .5 * tf.reduce_mean(
                tf.square(
                    self._tensor['pred_value_{}'.format(TASKS(i))] -
                    self._input_ph['value_target']
                )
            )

            # sparsity penalties on the value-network masks
            self._update_operator['sparse_value_correlation_loss_{}'.format(TASKS(i))] = \
                tf_util.correlation_loss(
                    self._tensor['value_masks_{}'.format(TASKS(i))],
                    [self._tensor['value_masks_{}'.format(TASKS(j))]
                     for j in range(self.params.num_subtasks) if j != i])
            self._update_operator['sparse_value_mask_loss_{}'.format(TASKS(i))] = \
                tf_util.l2_loss(self._tensor['value_masks_{}'.format(TASKS(i))],
                                apply_sigmoid=True)
            self._update_operator['vf_loss_{}'.format(TASKS(i))] += \
                self._update_operator['sparse_value_correlation_loss_{}'.format(TASKS(i))] * \
                self.params.correlation_coefficient + \
                self._update_operator['sparse_value_mask_loss_{}'.format(TASKS(i))] * \
                self.params.mask_penalty

            self._update_operator['vf_update_op_{}'.format(TASKS(i))] = \
                self.value_optimizer.minimize(
                    self._update_operator['vf_loss_{}'.format(TASKS(i))])

            self._tensor['variable_list_{}'.format(TASKS(i))] = [
                *self._tensor['policy_weights_{}'.format(TASKS(i))],
                *self._tensor['policy_b'],
                *self._tensor['value_weights_{}'.format(TASKS(i))],
                *self._tensor['value_b'],
                self._tensor['action_logstd']
            ]
    else:
        self._value_MLP = network_util.MLP(
            dims=network_shape, scope='value_mlp', train=True,
            activation_type=act_type, normalizer_type=norm_type,
            init_data=init_data, linear_last_layer=True,
        )
        self._tensor['value_weights'] = self._value_MLP._w
        self._tensor['value_b'] = self._value_MLP._b

        self._tensor['pred_value'] = self._value_MLP(self._tensor['net_input'])

        self.value_optimizer = tf.train.AdamOptimizer(
            learning_rate=self.params.value_lr,
            beta1=0.5, beta2=0.99, epsilon=1e-4
        )
        self._update_operator['vf_loss'] = .5 * tf.reduce_mean(
            tf.square(
                self._tensor['pred_value'] - self._input_ph['value_target']
            )
        )
        self._update_operator['vf_update_op'] = \
            self.value_optimizer.minimize(self._update_operator['vf_loss'])

        for i in range(self.params.num_subtasks):
            self._tensor['variable_list_{}'.format(TASKS(i))] = [
                *self._tensor['policy_weights_{}'.format(TASKS(i))],
                *self._tensor['policy_b'],
                *self._tensor['value_weights'],
                *self._tensor['value_b'],
                self._tensor['action_logstd']
            ]
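# ---------------------------------------------------------------------------
# Hedged sketch: the init_data above requests a 'normc' weight initialiser
# from network_util, whose implementation is not shown here. The function
# below illustrates the column-normalised Gaussian initialiser that name
# usually refers to (e.g. in the OpenAI baselines code); whether network_util
# implements it exactly this way is an assumption.
# ---------------------------------------------------------------------------
import numpy as np  # used only by the sketch below


def _sketch_normc_initializer(stddev=1.0):
    # sample a Gaussian matrix and rescale every column to a fixed norm
    def _initializer(shape, dtype=tf.float32, partition_info=None):
        out = np.random.randn(*shape).astype(np.float32)
        out *= stddev / np.sqrt(np.square(out).sum(axis=0, keepdims=True))
        return tf.constant(out)
    return _initializer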