def __init__(self,
             input_shape,
             output_dim,
             name='CategoricalMLPRegressor',
             hidden_sizes=(32, 32),
             hidden_nonlinearity=tf.nn.tanh,
             hidden_w_init=tf.initializers.glorot_uniform(),
             hidden_b_init=tf.zeros_initializer(),
             output_nonlinearity=tf.nn.softmax,
             output_w_init=tf.initializers.glorot_uniform(),
             output_b_init=tf.zeros_initializer(),
             optimizer=None,
             optimizer_args=None,
             tr_optimizer=None,
             tr_optimizer_args=None,
             use_trust_region=True,
             max_kl_step=0.01,
             normalize_inputs=True,
             layer_normalization=False):
    """Initialize a categorical MLP regressor.

    Builds a normalized-input MLP (softmax output, so predictions form a
    categorical distribution) and two optimizers: a plain one and a
    trust-region one, selected at fit time via ``use_trust_region``.

    Args:
        input_shape (tuple[int]): Shape of the network input.
        output_dim (int): Dimension of the network output.
        name (str): Name of the regressor; also the TF variable scope.
        hidden_sizes (tuple[int]): Sizes of the MLP hidden layers.
        hidden_nonlinearity (callable): Activation for hidden layers.
        hidden_w_init (callable): Initializer for hidden-layer weights.
        hidden_b_init (callable): Initializer for hidden-layer biases.
        output_nonlinearity (callable): Activation for the output layer.
        output_w_init (callable): Initializer for output-layer weights.
        output_b_init (callable): Initializer for output-layer biases.
        optimizer (type): Optimizer class used when no trust region is
            applied; defaults to LbfgsOptimizer.
        optimizer_args (dict): Keyword arguments for ``optimizer``.
        tr_optimizer (type): Optimizer class used with a trust region;
            defaults to ConjugateGradientOptimizer.
        tr_optimizer_args (dict): Keyword arguments for ``tr_optimizer``.
        use_trust_region (bool): Whether to constrain updates by a
            KL-divergence trust region.
        max_kl_step (float): KL divergence constraint per step.
        normalize_inputs (bool): Whether to normalize inputs.
        layer_normalization (bool): Whether to use layer normalization.

    """
    super().__init__(input_shape, output_dim, name)
    self._use_trust_region = use_trust_region
    self._max_kl_step = max_kl_step
    self._normalize_inputs = normalize_inputs

    with tf.compat.v1.variable_scope(self._name, reuse=False) as vs:
        self._variable_scope = vs
        if optimizer_args is None:
            optimizer_args = dict()
        if tr_optimizer_args is None:
            tr_optimizer_args = dict()

        if optimizer is None:
            self._optimizer = make_optimizer(LbfgsOptimizer,
                                             **optimizer_args)
        else:
            self._optimizer = make_optimizer(optimizer, **optimizer_args)
        if tr_optimizer is None:
            self._tr_optimizer = make_optimizer(ConjugateGradientOptimizer,
                                                **tr_optimizer_args)
        else:
            self._tr_optimizer = make_optimizer(tr_optimizer,
                                                **tr_optimizer_args)
        # No fit has happened yet; first call may behave differently.
        self._first_optimized = False

    self.model = NormalizedInputMLPModel(
        input_shape,
        output_dim,
        hidden_sizes=hidden_sizes,
        hidden_nonlinearity=hidden_nonlinearity,
        hidden_w_init=hidden_w_init,
        hidden_b_init=hidden_b_init,
        output_nonlinearity=output_nonlinearity,
        output_w_init=output_w_init,
        output_b_init=output_b_init,
        layer_normalization=layer_normalization)

    self._initialize()
def update_opt(self, loss, target, inputs, extra_inputs=None, **kwargs):
    """Construct operation graph for the optimizer.

    Args:
        loss (tf.Tensor): Loss objective to minimize.
        target (object): Target object to optimize. The object should
            implement `get_params()` and `get_param_values`.
        inputs (list[tf.Tensor]): List of input placeholders.
        extra_inputs (list[tf.Tensor]): List of extra input placeholders.
        kwargs (dict): Extra unused keyword arguments. Some optimizers
            have extra input, e.g. KL constraint.

    """
    del kwargs
    with tf.name_scope(self._name):
        self._target = target
        # NOTE(review): self._learning_rate is unpacked with ** — it is
        # presumably a dict of optimizer keyword args (e.g.
        # {'learning_rate': ...}); confirm where it is assigned.
        tf_optimizer = make_optimizer(self._tf_optimizer,
                                      **self._learning_rate)
        self._train_op = tf_optimizer.minimize(
            loss, var_list=target.get_params())

        if extra_inputs is None:
            extra_inputs = list()
        self._input_vars = inputs + extra_inputs
        # Lazily compiled loss function over inputs + extra inputs.
        self._opt_fun = LazyDict(
            f_loss=lambda: tensor_utils.compile_function(
                inputs + extra_inputs, loss),
        )
def __init__(self,
             inner_algo,
             env,
             policy,
             meta_optimizer,
             meta_batch_size=40,
             inner_lr=0.1,
             outer_lr=1e-3,
             num_grad_updates=1,
             meta_evaluator=None,
             evaluate_every_n_epochs=1):
    """Set up the meta-learning wrapper around ``inner_algo``.

    Args:
        inner_algo (object): Inner (task-level) algorithm instance.
        env (object): Environment to train on.
        policy (object): Policy shared by inner and outer loops.
        meta_optimizer (type): Optimizer class for the meta (outer) step.
        meta_batch_size (int): Number of tasks per meta update.
        inner_lr (float): Learning rate for inner-loop adaptation.
        outer_lr (float): Learning rate for the meta optimizer.
        num_grad_updates (int): Inner-loop gradient steps per task.
        meta_evaluator (object): Optional evaluator for meta-testing.
        evaluate_every_n_epochs (int): Evaluation period in epochs.

    """
    # Pick the sampler implementation based on policy vectorization.
    self.sampler_cls = (OnPolicyVectorizedSampler
                        if policy.vectorized else BatchSampler)
    self.max_path_length = inner_algo.max_path_length

    self._meta_evaluator = meta_evaluator
    self._evaluate_every_n_epochs = evaluate_every_n_epochs
    self._policy = policy
    self._env = env
    self._inner_algo = inner_algo
    self._num_grad_updates = num_grad_updates
    self._meta_batch_size = meta_batch_size

    # Copy the value function and keep its initial weights — presumably
    # restored between task adaptations; confirm in the training loop.
    self._value_function = copy.deepcopy(inner_algo._value_function)
    self._initial_vf_state = self._value_function.state_dict()

    # Inner loop uses differentiable SGD so gradients can flow through
    # the adaptation step; the outer loop uses the meta optimizer.
    self._inner_optimizer = DifferentiableSGD(self._policy, lr=inner_lr)
    self._meta_optimizer = make_optimizer(meta_optimizer,
                                          module=policy,
                                          lr=_Default(outer_lr),
                                          eps=_Default(1e-5))
def test_torch_make_optimizer_with_tuple(self):
    """Test make_optimizer function with tuple as first argument."""
    optimizer_type = (torch.optim.Adam, {'lr': 0.1})
    module = torch.nn.Linear(2, 1)
    optimizer = make_optimizer(optimizer_type, module=module)
    # Check against the optimizer class itself, not the (cls, kwargs)
    # tuple: isinstance() with a non-type tuple element only worked by
    # accident (short-circuit on the first element) and raises TypeError
    # when the first check fails.
    assert isinstance(optimizer, optimizer_type[0])
    assert optimizer.defaults['lr'] == optimizer_type[1]['lr']
def __init__(self,
             optimizer,
             module,
             max_optimization_epochs=1,
             minibatch_size=None):
    """Wrap a torch optimizer for a module.

    Args:
        optimizer (object): Optimizer class or (class, kwargs) tuple,
            forwarded to ``make_optimizer``.
        module (torch.nn.Module): Module whose parameters are optimized.
        max_optimization_epochs (int): Maximum number of optimization
            epochs per call.
        minibatch_size (int): Minibatch size; None presumably means the
            whole batch at once — confirm against the caller.

    """
    self._minibatch_size = minibatch_size
    self._max_optimization_epochs = max_optimization_epochs
    self._optimizer = make_optimizer(optimizer, module=module)
def test_torch_make_optimizer_with_type(self):
    """Test make_optimizer function with type as first argument."""
    learning_rate = 0.123
    opt_cls = torch.optim.Adam
    net = torch.nn.Linear(2, 1)
    opt = make_optimizer(opt_cls, module=net, lr=learning_rate)
    # The returned object is an instance of the requested class and
    # carries the requested learning rate in its defaults.
    assert isinstance(opt, opt_cls)
    assert opt.defaults['lr'] == learning_rate
def test_tf_make_optimizer_raise_value_error(self):
    """Test make_optimizer raises value error."""
    learning_rate = 0.123
    opt_spec = (tf.compat.v1.train.AdamOptimizer, {
        'learning_rate': learning_rate
    })
    # Supplying kwargs alongside a (cls, kwargs) tuple is ambiguous and
    # must be rejected.
    with pytest.raises(ValueError):
        _ = make_optimizer(opt_spec, learning_rate=learning_rate)
def __init__(self,
             env_spec,
             policy,
             baseline,
             max_path_length=500,
             discount=0.99,
             gae_lambda=1,
             center_adv=True,
             positive_adv=False,
             fixed_horizon=False,
             epsilon=0.5,
             l2_reg_dual=0.,
             l2_reg_loss=0.,
             optimizer=LbfgsOptimizer,
             optimizer_args=None,
             dual_optimizer=scipy.optimize.fmin_l_bfgs_b,
             dual_optimizer_args=None,
             name='REPS'):
    """Initialize the REPS algorithm.

    Args:
        env_spec (object): Environment specification.
        policy (object): Policy to optimize; must support ``clone`` and
            ``vectorized``.
        baseline (object): Baseline for advantage estimation.
        max_path_length (int): Maximum rollout length.
        discount (float): Reward discount factor.
        gae_lambda (float): Lambda for generalized advantage estimation.
        center_adv (bool): Whether to center advantages.
        positive_adv (bool): Whether to shift advantages positive.
        fixed_horizon (bool): Whether to use a fixed horizon.
        epsilon (float): KL bound (dual constraint) for REPS.
        l2_reg_dual (float): L2 regularization on the dual variables.
        l2_reg_loss (float): L2 regularization on the loss.
        optimizer (type): Policy optimizer class.
        optimizer_args (dict): Keyword arguments for ``optimizer``.
        dual_optimizer (callable): Optimizer for the dual problem.
        dual_optimizer_args (dict): Keyword arguments for it.
        name (str): Name scope for the algorithm.

    """
    optimizer_args = optimizer_args or dict(max_opt_itr=_Default(50))
    dual_optimizer_args = dual_optimizer_args or dict(maxiter=50)

    self.policy = policy
    self.max_path_length = max_path_length

    self._env_spec = env_spec
    self._baseline = baseline
    self._discount = discount
    self._gae_lambda = gae_lambda
    self._center_adv = center_adv
    self._positive_adv = positive_adv
    self._fixed_horizon = fixed_horizon
    self._flatten_input = True
    self._name = name
    self._name_scope = tf.name_scope(self._name)
    # Snapshot of the policy used for KL computations against updates.
    self._old_policy = policy.clone('old_policy')

    # Lazily-built symbolic functions; populated by init_opt().
    self._feat_diff = None
    self._param_eta = None
    self._param_v = None
    self._f_dual = None
    self._f_dual_grad = None
    self._f_policy_kl = None

    self._optimizer = make_optimizer(optimizer, **optimizer_args)
    self._dual_optimizer = dual_optimizer
    self._dual_optimizer_args = dual_optimizer_args
    self._epsilon = float(epsilon)
    self._l2_reg_dual = float(l2_reg_dual)
    self._l2_reg_loss = float(l2_reg_loss)

    self._episode_reward_mean = collections.deque(maxlen=100)
    if policy.vectorized:
        self.sampler_cls = OnPolicyVectorizedSampler
    else:
        self.sampler_cls = BatchSampler

    self.init_opt()
def __init__(self,
             input_shape,
             output_dim,
             name='ContinuousMLPRegressor',
             hidden_sizes=(32, 32),
             hidden_nonlinearity=tf.nn.tanh,
             hidden_w_init=tf.initializers.glorot_uniform(),
             hidden_b_init=tf.zeros_initializer(),
             output_nonlinearity=None,
             output_w_init=tf.initializers.glorot_uniform(),
             output_b_init=tf.zeros_initializer(),
             optimizer=None,
             optimizer_args=None,
             normalize_inputs=True):
    """Initialize a continuous MLP regressor.

    Args:
        input_shape (tuple[int]): Shape of the network input.
        output_dim (int): Dimension of the network output.
        name (str): Name of the regressor; also the TF variable scope.
        hidden_sizes (tuple[int]): Sizes of the MLP hidden layers.
        hidden_nonlinearity (callable): Activation for hidden layers.
        hidden_w_init (callable): Initializer for hidden-layer weights.
        hidden_b_init (callable): Initializer for hidden-layer biases.
        output_nonlinearity (callable): Activation for the output layer
            (None means a linear output).
        output_w_init (callable): Initializer for output-layer weights.
        output_b_init (callable): Initializer for output-layer biases.
        optimizer (type): Optimizer class; defaults to LbfgsOptimizer.
        optimizer_args (dict): Keyword arguments for ``optimizer``.
        normalize_inputs (bool): Whether to normalize inputs.

    """
    super().__init__(input_shape, output_dim, name)
    self._normalize_inputs = normalize_inputs

    with tf.compat.v1.variable_scope(self._name, reuse=False) as vs:
        self._variable_scope = vs
        if optimizer_args is None:
            optimizer_args = dict()
        if optimizer is None:
            self._optimizer = make_optimizer(LbfgsOptimizer,
                                             **optimizer_args)
        else:
            self._optimizer = make_optimizer(optimizer, **optimizer_args)

    self.model = NormalizedInputMLPModel(
        input_shape=input_shape,
        output_dim=output_dim,
        hidden_sizes=hidden_sizes,
        hidden_nonlinearity=hidden_nonlinearity,
        hidden_w_init=hidden_w_init,
        hidden_b_init=hidden_b_init,
        output_nonlinearity=output_nonlinearity,
        output_w_init=output_w_init,
        output_b_init=output_b_init)

    self._initialize()
def test_tf_make_optimizer_with_tuple(self):
    """Test make_optimizer function with tuple as first argument."""
    lr = 0.123
    optimizer_type = (tf.compat.v1.train.AdamOptimizer, {
        'learning_rate': lr
    })
    optimizer = make_optimizer(optimizer_type)
    # Check against the optimizer class itself, not the (cls, kwargs)
    # tuple: isinstance() with a non-type tuple element only worked by
    # accident (short-circuit on the first element) and raises TypeError
    # when the first check fails.
    assert isinstance(optimizer, optimizer_type[0])
    self.sess.run(tf.compat.v1.global_variables_initializer())
    assert np.allclose(
        optimizer._lr, lr
    )  # Adam holds the value of learning rate in private variable self._lr
def test_tf_make_optimizer_with_type(self):
    """Test make_optimizer function with type as first argument."""
    learning_rate = 0.123
    opt_cls = tf.compat.v1.train.AdamOptimizer
    opt = make_optimizer(opt_cls,
                         learning_rate=learning_rate,
                         name='testOptimizer')
    assert isinstance(opt, opt_cls)
    self.sess.run(tf.compat.v1.global_variables_initializer())
    assert opt._name == 'testOptimizer'
    # Adam holds the value of learning rate in private variable self._lr
    assert np.allclose(opt._lr, learning_rate)
def init_opt(self):
    """Build the loss function and init the optimizer."""
    with tf.name_scope(self._name):
        # Create target policy and qf network
        self.target_policy_f_prob_online = tensor_utils.compile_function(
            inputs=[self._target_policy.model.networks['default'].input],
            outputs=self._target_policy.model.networks['default'].outputs)
        self.target_qf_f_prob_online = tensor_utils.compile_function(
            inputs=self._target_qf.model.networks['default'].inputs,
            outputs=self._target_qf.model.networks['default'].outputs)

        # Set up target init and update function
        with tf.name_scope('setup_target'):
            ops = tensor_utils.get_target_ops(
                self.policy.get_global_vars(),
                self._target_policy.get_global_vars(), self._tau)
            policy_init_ops, policy_update_ops = ops
            qf_init_ops, qf_update_ops = tensor_utils.get_target_ops(
                self.qf.get_global_vars(),
                self._target_qf.get_global_vars(), self._tau)
            target_init_op = policy_init_ops + qf_init_ops
            target_update_op = policy_update_ops + qf_update_ops

        f_init_target = tensor_utils.compile_function(
            inputs=[], outputs=target_init_op)
        f_update_target = tensor_utils.compile_function(
            inputs=[], outputs=target_update_op)

        with tf.name_scope('inputs'):
            obs_dim = self.env_spec.observation_space.flat_dim
            # input_y carries the TD targets computed by the caller.
            input_y = tf.compat.v1.placeholder(tf.float32,
                                               shape=(None, 1),
                                               name='input_y')
            obs = tf.compat.v1.placeholder(tf.float32,
                                           shape=(None, obs_dim),
                                           name='input_observation')
            actions = tf.compat.v1.placeholder(
                tf.float32,
                shape=(None, self.env_spec.action_space.flat_dim),
                name='input_action')

        # Set up policy training function: maximize Q(s, pi(s)).
        next_action = self.policy.get_action_sym(obs, name='policy_action')
        next_qval = self.qf.get_qval_sym(obs,
                                         next_action,
                                         name='policy_action_qval')
        with tf.name_scope('action_loss'):
            action_loss = -tf.reduce_mean(next_qval)
            if self._policy_weight_decay > 0.:
                regularizer = tf.keras.regularizers.l2(
                    self._policy_weight_decay)
                for var in self.policy.get_regularizable_vars():
                    policy_reg = regularizer(var)
                    action_loss += policy_reg

        with tf.name_scope('minimize_action_loss'):
            policy_optimizer = make_optimizer(
                self._policy_optimizer,
                learning_rate=self._policy_lr,
                name='PolicyOptimizer')
            policy_train_op = policy_optimizer.minimize(
                action_loss, var_list=self.policy.get_trainable_vars())

        f_train_policy = tensor_utils.compile_function(
            inputs=[obs], outputs=[policy_train_op, action_loss])

        # Set up qf training function: regress Q(s, a) onto input_y.
        qval = self.qf.get_qval_sym(obs, actions, name='q_value')
        with tf.name_scope('qval_loss'):
            qval_loss = tf.reduce_mean(
                tf.compat.v1.squared_difference(input_y, qval))
            if self._qf_weight_decay > 0.:
                regularizer = tf.keras.regularizers.l2(
                    self._qf_weight_decay)
                for var in self.qf.get_regularizable_vars():
                    qf_reg = regularizer(var)
                    qval_loss += qf_reg

        with tf.name_scope('minimize_qf_loss'):
            qf_optimizer = make_optimizer(self._qf_optimizer,
                                          learning_rate=self._qf_lr,
                                          name='QFunctionOptimizer')
            qf_train_op = qf_optimizer.minimize(
                qval_loss, var_list=self.qf.get_trainable_vars())

        f_train_qf = tensor_utils.compile_function(
            inputs=[input_y, obs, actions],
            outputs=[qf_train_op, qval_loss, qval])

        self.f_train_policy = f_train_policy
        self.f_train_qf = f_train_qf
        self.f_init_target = f_init_target
        self.f_update_target = f_update_target
def __init__(self,
             input_shape,
             output_dim,
             filters,
             strides,
             padding,
             hidden_sizes,
             hidden_nonlinearity=tf.nn.tanh,
             hidden_w_init=tf.initializers.glorot_uniform(),
             hidden_b_init=tf.zeros_initializer(),
             output_nonlinearity=None,
             output_w_init=tf.initializers.glorot_uniform(),
             output_b_init=tf.zeros_initializer(),
             name='GaussianCNNRegressor',
             learn_std=True,
             init_std=1.0,
             adaptive_std=False,
             std_share_network=False,
             std_filters=(),
             std_strides=(),
             std_padding='SAME',
             std_hidden_sizes=(),
             std_hidden_nonlinearity=None,
             std_output_nonlinearity=None,
             layer_normalization=False,
             normalize_inputs=True,
             normalize_outputs=True,
             subsample_factor=1.,
             optimizer=None,
             optimizer_args=None,
             use_trust_region=True,
             max_kl_step=0.01):
    """Initialize a Gaussian CNN regressor.

    Args:
        input_shape (tuple[int]): Shape of the network input.
        output_dim (int): Dimension of the network output.
        filters (tuple): CNN filter specifications for the mean network.
        strides (tuple): CNN strides for the mean network.
        padding (str): CNN padding mode for the mean network.
        hidden_sizes (tuple[int]): Sizes of the dense hidden layers.
        hidden_nonlinearity (callable): Activation for hidden layers.
        hidden_w_init (callable): Initializer for hidden-layer weights.
        hidden_b_init (callable): Initializer for hidden-layer biases.
        output_nonlinearity (callable): Activation for the output layer.
        output_w_init (callable): Initializer for output-layer weights.
        output_b_init (callable): Initializer for output-layer biases.
        name (str): Name of the regressor; also the TF variable scope.
        learn_std (bool): Whether the std deviation is trainable.
        init_std (float): Initial standard deviation.
        adaptive_std (bool): Whether std is produced by its own network.
        std_share_network (bool): Whether mean and std share a network.
        std_filters (tuple): CNN filters for the std network.
        std_strides (tuple): CNN strides for the std network.
        std_padding (str): CNN padding mode for the std network.
        std_hidden_sizes (tuple[int]): Dense sizes for the std network.
        std_hidden_nonlinearity (callable): Std hidden activation.
        std_output_nonlinearity (callable): Std output activation.
        layer_normalization (bool): Whether to use layer normalization.
        normalize_inputs (bool): Whether to normalize inputs.
        normalize_outputs (bool): Whether to normalize outputs.
        subsample_factor (float): Fraction of the data used per fit.
        optimizer (type): Optimizer class; defaults to
            PenaltyLbfgsOptimizer with a trust region, LbfgsOptimizer
            otherwise.
        optimizer_args (dict): Keyword arguments for ``optimizer``.
        use_trust_region (bool): Whether to constrain updates by a
            KL-divergence trust region.
        max_kl_step (float): KL divergence constraint per step.

    """
    super().__init__(input_shape, output_dim, name)
    self._use_trust_region = use_trust_region
    self._subsample_factor = subsample_factor
    self._max_kl_step = max_kl_step
    self._normalize_inputs = normalize_inputs
    self._normalize_outputs = normalize_outputs

    with tf.compat.v1.variable_scope(self._name, reuse=False) as vs:
        self._variable_scope = vs
        if optimizer_args is None:
            optimizer_args = dict()
        if optimizer is None:
            # Trust-region fitting needs the penalty variant.
            if use_trust_region:
                self._optimizer = make_optimizer(PenaltyLbfgsOptimizer,
                                                 **optimizer_args)
            else:
                self._optimizer = make_optimizer(LbfgsOptimizer,
                                                 **optimizer_args)
        else:
            self._optimizer = make_optimizer(optimizer, **optimizer_args)

    self.model = GaussianCNNRegressorModel(
        input_shape=input_shape,
        output_dim=output_dim,
        filters=filters,
        strides=strides,
        padding=padding,
        hidden_sizes=hidden_sizes,
        hidden_nonlinearity=hidden_nonlinearity,
        hidden_w_init=hidden_w_init,
        hidden_b_init=hidden_b_init,
        output_nonlinearity=output_nonlinearity,
        output_w_init=output_w_init,
        output_b_init=output_b_init,
        learn_std=learn_std,
        adaptive_std=adaptive_std,
        std_share_network=std_share_network,
        init_std=init_std,
        min_std=None,
        max_std=None,
        std_filters=std_filters,
        std_strides=std_strides,
        std_padding=std_padding,
        std_hidden_sizes=std_hidden_sizes,
        std_hidden_nonlinearity=std_hidden_nonlinearity,
        std_output_nonlinearity=std_output_nonlinearity,
        std_parameterization='exp',
        layer_normalization=layer_normalization)

    self._initialize()
def __init__(
        self,
        env_spec,
        policy,
        qf,
        replay_buffer,
        *,  # Everything after this is numbers.
        steps_per_epoch=20,
        n_train_steps=50,
        max_path_length=None,
        max_eval_path_length=None,
        buffer_batch_size=64,
        min_buffer_size=int(1e4),
        rollout_batch_size=1,
        exploration_policy=None,
        target_update_tau=0.01,
        discount=0.99,
        policy_weight_decay=0,
        qf_weight_decay=0,
        policy_optimizer=torch.optim.Adam,
        qf_optimizer=torch.optim.Adam,
        policy_lr=_Default(1e-4),
        qf_lr=_Default(1e-3),
        clip_pos_returns=False,
        clip_return=np.inf,
        max_action=None,
        reward_scale=1.,
        smooth_return=True):
    """Initialize an off-policy actor-critic algorithm (PyTorch).

    Args:
        env_spec (object): Environment specification.
        policy (object): Policy (actor) network.
        qf (object): Q-value (critic) network.
        replay_buffer (object): Replay buffer for off-policy training.
        steps_per_epoch (int): Training iterations per epoch.
        n_train_steps (int): Gradient steps per training iteration.
        max_path_length (int): Maximum rollout length.
        max_eval_path_length (int): Maximum evaluation rollout length.
        buffer_batch_size (int): Minibatch size sampled from the buffer.
        min_buffer_size (int): Transitions required before training.
        rollout_batch_size (int): Rollouts per sampling step.
        exploration_policy (object): Policy used to collect data.
        target_update_tau (float): Coefficient for target updates.
        discount (float): Reward discount factor.
        policy_weight_decay (float): L2 weight decay for the policy.
        qf_weight_decay (float): L2 weight decay for the Q-function.
        policy_optimizer (type): Optimizer class for the policy.
        qf_optimizer (type): Optimizer class for the Q-function.
        policy_lr (float): Policy learning rate.
        qf_lr (float): Q-function learning rate.
        clip_pos_returns (bool): Whether to clip positive returns.
        clip_return (float): Maximum absolute return.
        max_action (float): Action magnitude cap; defaults to the
            action space's upper bound.
        reward_scale (float): Scale applied to rewards.
        smooth_return (bool): Whether to smooth reported returns.

    """
    action_bound = env_spec.action_space.high
    self._tau = target_update_tau
    self._policy_weight_decay = policy_weight_decay
    self._qf_weight_decay = qf_weight_decay
    self._clip_pos_returns = clip_pos_returns
    self._clip_return = clip_return
    self._max_action = action_bound if max_action is None else max_action

    # Rolling statistics collected during training for logging.
    self._success_history = deque(maxlen=100)
    self._episode_rewards = []
    self._episode_policy_losses = []
    self._episode_qf_losses = []
    self._epoch_ys = []
    self._epoch_qs = []

    super().__init__(env_spec=env_spec,
                     policy=policy,
                     qf=qf,
                     n_train_steps=n_train_steps,
                     steps_per_epoch=steps_per_epoch,
                     max_path_length=max_path_length,
                     max_eval_path_length=max_eval_path_length,
                     buffer_batch_size=buffer_batch_size,
                     min_buffer_size=min_buffer_size,
                     rollout_batch_size=rollout_batch_size,
                     exploration_policy=exploration_policy,
                     replay_buffer=replay_buffer,
                     use_target=True,
                     discount=discount,
                     reward_scale=reward_scale,
                     smooth_return=smooth_return)

    # Deep copies serve as target networks; presumably soft-updated
    # using self._tau in the training step — confirm there.
    self._target_policy = copy.deepcopy(self.policy)
    self._target_qf = copy.deepcopy(self.qf)

    self._policy_optimizer = make_optimizer(policy_optimizer,
                                            module=self.policy,
                                            lr=policy_lr)
    self._qf_optimizer = make_optimizer(qf_optimizer,
                                        module=self.qf,
                                        lr=qf_lr)
def init_opt(self):
    """Build the loss function and init the optimizer."""
    with tf.name_scope(self._name):
        # Create target policy (actor) and qf (critic) networks
        self.target_policy_f_prob_online = tensor_utils.compile_function(
            inputs=[self._target_policy.model.networks['default'].input],
            outputs=self._target_policy.model.networks['default'].outputs)
        self.target_qf_f_prob_online = tensor_utils.compile_function(
            inputs=self._target_qf.model.networks['default'].inputs,
            outputs=self._target_qf.model.networks['default'].outputs)
        self.target_qf2_f_prob_online = tensor_utils.compile_function(
            inputs=self._target_qf2.model.networks['default'].inputs,
            outputs=self._target_qf2.model.networks['default'].outputs)

        # Set up target init and update functions
        with tf.name_scope('setup_target'):
            policy_init_op, policy_update_op = tensor_utils.get_target_ops(
                self.policy.get_global_vars(),
                self._target_policy.get_global_vars(), self._tau)
            qf_init_ops, qf_update_ops = tensor_utils.get_target_ops(
                self.qf.get_global_vars(),
                self._target_qf.get_global_vars(), self._tau)
            qf2_init_ops, qf2_update_ops = tensor_utils.get_target_ops(
                self.qf2.get_global_vars(),
                self._target_qf2.get_global_vars(), self._tau)
            target_init_op = policy_init_op + qf_init_ops + qf2_init_ops
            target_update_op = (policy_update_op + qf_update_ops +
                                qf2_update_ops)

        f_init_target = tensor_utils.compile_function(
            inputs=[], outputs=target_init_op)
        f_update_target = tensor_utils.compile_function(
            inputs=[], outputs=target_update_op)

        with tf.name_scope('inputs'):
            obs_dim = self.env_spec.observation_space.flat_dim
            # y carries the TD targets computed by the caller.
            y = tf.compat.v1.placeholder(tf.float32,
                                         shape=(None, 1),
                                         name='input_y')
            obs = tf.compat.v1.placeholder(tf.float32,
                                           shape=(None, obs_dim),
                                           name='input_observation')
            actions = tf.compat.v1.placeholder(
                tf.float32,
                shape=(None, self.env_spec.action_space.flat_dim),
                name='input_action')

        # Set up policy training function: maximize Q1(s, pi(s)).
        next_action = self.policy.get_action_sym(obs, name='policy_action')
        next_qval = self.qf.get_qval_sym(obs,
                                         next_action,
                                         name='policy_action_qval')
        with tf.name_scope('action_loss'):
            action_loss = -tf.reduce_mean(next_qval)

        with tf.name_scope('minimize_action_loss'):
            policy_optimizer = make_optimizer(
                self._policy_optimizer,
                learning_rate=self._policy_lr,
                name='PolicyOptimizer')
            policy_train_op = policy_optimizer.minimize(
                action_loss, var_list=self.policy.get_trainable_vars())

        f_train_policy = tensor_utils.compile_function(
            inputs=[obs], outputs=[policy_train_op, action_loss])

        # Set up qf training function: both critics regress onto y.
        qval = self.qf.get_qval_sym(obs, actions, name='q_value')
        q2val = self.qf2.get_qval_sym(obs, actions, name='q2_value')
        with tf.name_scope('qval1_loss'):
            qval1_loss = tf.reduce_mean(tf.math.squared_difference(
                y, qval))
        with tf.name_scope('qval2_loss'):
            qval2_loss = tf.reduce_mean(
                tf.math.squared_difference(y, q2val))

        with tf.name_scope('minimize_qf_loss'):
            # One optimizer instance builds both critics' train ops.
            qf_optimizer = make_optimizer(self._qf_optimizer,
                                          learning_rate=self._qf_lr,
                                          name='QFunctionOptimizer')
            qf_train_op = qf_optimizer.minimize(
                qval1_loss, var_list=self.qf.get_trainable_vars())
            qf2_train_op = qf_optimizer.minimize(
                qval2_loss, var_list=self.qf2.get_trainable_vars())

        f_train_qf = tensor_utils.compile_function(
            inputs=[y, obs, actions],
            outputs=[qf_train_op, qval1_loss, qval])
        f_train_qf2 = tensor_utils.compile_function(
            inputs=[y, obs, actions],
            outputs=[qf2_train_op, qval2_loss, q2val])

        self.f_train_policy = f_train_policy
        self.f_train_qf = f_train_qf
        self.f_init_target = f_init_target
        self.f_update_target = f_update_target
        self.f_train_qf2 = f_train_qf2
def __init__(self,
             env_spec,
             policy,
             baseline,
             scope=None,
             max_path_length=100,
             discount=0.99,
             gae_lambda=1,
             center_adv=True,
             positive_adv=False,
             fixed_horizon=False,
             pg_loss='surrogate',
             lr_clip_range=0.01,
             max_kl_step=0.01,
             optimizer=None,
             optimizer_args=None,
             policy_ent_coeff=0.0,
             use_softplus_entropy=False,
             use_neg_logli_entropy=False,
             stop_entropy_gradient=False,
             entropy_method='no_entropy',
             flatten_input=True,
             name='NPO'):
    """Initialize the NPO algorithm.

    Args:
        env_spec (object): Environment specification.
        policy (object): Policy to optimize; must support ``clone`` and
            ``vectorized``.
        baseline (object): Baseline for advantage estimation.
        scope (str): Scope for identifying the algorithm.
        max_path_length (int): Maximum rollout length.
        discount (float): Reward discount factor.
        gae_lambda (float): Lambda for generalized advantage estimation.
        center_adv (bool): Whether to center advantages.
        positive_adv (bool): Whether to shift advantages positive.
        fixed_horizon (bool): Whether to use a fixed horizon.
        pg_loss (str): Policy-gradient loss; one of 'vanilla',
            'surrogate', or 'surrogate_clip'.
        lr_clip_range (float): Likelihood-ratio clip range.
        max_kl_step (float): KL divergence constraint per step.
        optimizer (type): Optimizer class; defaults to LbfgsOptimizer.
        optimizer_args (dict): Keyword arguments for ``optimizer``.
        policy_ent_coeff (float): Entropy bonus coefficient.
        use_softplus_entropy (bool): Whether to softplus the entropy.
        use_neg_logli_entropy (bool): Whether to estimate entropy from
            negative log-likelihood.
        stop_entropy_gradient (bool): Whether to stop gradients through
            the entropy term.
        entropy_method (str): Entropy configuration name.
        flatten_input (bool): Whether to flatten observations.
        name (str): Name scope for the algorithm.

    """
    self.policy = policy
    self.scope = scope
    self.max_path_length = max_path_length

    self._env_spec = env_spec
    self._baseline = baseline
    self._discount = discount
    self._gae_lambda = gae_lambda
    self._center_adv = center_adv
    self._positive_adv = positive_adv
    self._fixed_horizon = fixed_horizon
    self._flatten_input = flatten_input
    self._name = name
    self._name_scope = tf.name_scope(self._name)
    # Snapshot of the policy used for KL computations against updates.
    self._old_policy = policy.clone('old_policy')
    self._use_softplus_entropy = use_softplus_entropy
    self._use_neg_logli_entropy = use_neg_logli_entropy
    self._stop_entropy_gradient = stop_entropy_gradient
    self._pg_loss = pg_loss
    if optimizer is None:
        if optimizer_args is None:
            optimizer_args = dict()
        optimizer = LbfgsOptimizer

    # Validate entropy settings before building anything.
    self._check_entropy_configuration(entropy_method, center_adv,
                                      stop_entropy_gradient,
                                      use_neg_logli_entropy,
                                      policy_ent_coeff)

    if pg_loss not in ['vanilla', 'surrogate', 'surrogate_clip']:
        raise ValueError('Invalid pg_loss')

    self._optimizer = make_optimizer(optimizer, **optimizer_args)
    self._lr_clip_range = float(lr_clip_range)
    self._max_kl_step = float(max_kl_step)
    self._policy_ent_coeff = float(policy_ent_coeff)

    # Lazily-built symbolic functions; populated by init_opt().
    self._f_rewards = None
    self._f_returns = None
    self._f_policy_kl = None
    self._f_policy_entropy = None

    self._episode_reward_mean = collections.deque(maxlen=100)
    if policy.vectorized:
        self.sampler_cls = OnPolicyVectorizedSampler
    else:
        self.sampler_cls = BatchSampler

    self.init_opt()
def test_torch_make_optimizer_raise_value_error(self):
    """Test make_optimizer raises value error."""
    net = torch.nn.Linear(2, 1)
    opt_spec = (torch.optim.Adam, {'lr': 0.1})
    # Supplying kwargs alongside a (cls, kwargs) tuple is ambiguous and
    # must be rejected.
    with pytest.raises(ValueError):
        _ = make_optimizer(opt_spec, module=net, lr=0.123)
def init_opt(self):
    """Initialize the networks and Ops.

    Assume discrete space for dqn, so action dimension
    will always be action_space.n
    """
    action_dim = self.env_spec.action_space.n

    self.episode_rewards = []
    self.episode_qf_losses = []

    # build q networks
    with tf.name_scope(self._name):
        action_t_ph = tf.compat.v1.placeholder(tf.int32,
                                               None,
                                               name='action')
        reward_t_ph = tf.compat.v1.placeholder(tf.float32,
                                               None,
                                               name='reward')
        done_t_ph = tf.compat.v1.placeholder(tf.float32, None, name='done')

        # Op that copies online Q weights to the target network.
        with tf.name_scope('update_ops'):
            target_update_op = tensor_utils.get_target_ops(
                self.qf.get_global_vars(),
                self._target_qf.get_global_vars())

        self._qf_update_ops = tensor_utils.compile_function(
            inputs=[], outputs=target_update_op)

        with tf.name_scope('td_error'):
            # Q-value of the selected action
            action = tf.one_hot(action_t_ph,
                                action_dim,
                                on_value=1.,
                                off_value=0.)
            q_selected = tf.reduce_sum(
                self.qf.q_vals * action,  # yapf: disable
                axis=1)

            # r + Q'(s', argmax_a(Q(s', _)) - Q(s, a)
            if self._double_q:
                # Double DQN: the online network picks the action, the
                # target network evaluates it.
                target_qval_with_online_q = self.qf.get_qval_sym(
                    self._target_qf.input, self.qf.name)
                future_best_q_val_action = tf.argmax(
                    target_qval_with_online_q, 1)
                future_best_q_val = tf.reduce_sum(
                    self._target_qf.q_vals *
                    tf.one_hot(future_best_q_val_action,
                               action_dim,
                               on_value=1.,
                               off_value=0.),
                    axis=1)
            else:
                # r + max_a(Q'(s', _)) - Q(s, a)
                future_best_q_val = tf.reduce_max(self._target_qf.q_vals,
                                                  axis=1)

            q_best_masked = (1.0 - done_t_ph) * future_best_q_val
            # if done, it's just reward
            # else reward + discount * future_best_q_val
            target_q_values = (reward_t_ph +
                               self.discount * q_best_masked)

            # td_error = q_selected - tf.stop_gradient(target_q_values)
            loss = tf.compat.v1.losses.huber_loss(
                q_selected, tf.stop_gradient(target_q_values))
            loss = tf.reduce_mean(loss)

        with tf.name_scope('optimize_ops'):
            qf_optimizer = make_optimizer(self._qf_optimizer,
                                          learning_rate=self._qf_lr)
            if self._grad_norm_clipping is not None:
                # Clip each gradient's norm before applying.
                gradients = qf_optimizer.compute_gradients(
                    loss, var_list=self.qf.get_trainable_vars())
                for i, (grad, var) in enumerate(gradients):
                    if grad is not None:
                        gradients[i] = (tf.clip_by_norm(
                            grad, self._grad_norm_clipping), var)
                optimize_loss = qf_optimizer.apply_gradients(gradients)
            else:
                optimize_loss = qf_optimizer.minimize(
                    loss, var_list=self.qf.get_trainable_vars())

        self._train_qf = tensor_utils.compile_function(
            inputs=[
                self.qf.input, action_t_ph, reward_t_ph, done_t_ph,
                self._target_qf.input
            ],
            outputs=[loss, optimize_loss])