def _line_search_inner_bisection(value_and_gradients_function,
                                 search_interval,
                                 active,
                                 f_lim):
  """Bisects the search interval once and folds the outcome into it.

  The objective is evaluated at the midpoint of `search_interval`. Members
  flagged in `active` whose midpoint evaluation is not finite are recorded as
  failed. If any active member remains, the bracketing interval is updated via
  `hzl.update` using the midpoint as the trial point.

  Args:
    value_and_gradients_function: Callable evaluated at the midpoint; its
      result is passed to `hzl.is_finite` and `hzl.update`.
    search_interval: Current search state. Its `left`, `right`, `converged`,
      `failed`, `iterations` and `func_evals` fields are read.
    active: Boolean mask of the members that should be bisected.
    f_lim: Forwarded unchanged to `hzl.update`.

  Returns:
    The updated search state. The midpoint evaluation is always counted in
    `func_evals`. When no member stays active after the midpoint check, the
    state is returned with only `failed` and `func_evals` changed.
  """
  left, right = search_interval.left, search_interval.right
  mid_x = (left.x + right.x) / 2
  mid_val = value_and_gradients_function(mid_x)
  mid_is_finite = hzl.is_finite(mid_val)

  # Split the active members by whether the midpoint could be evaluated.
  remaining = active & mid_is_finite
  newly_failed = active & ~mid_is_finite

  # The midpoint evaluation costs one function call regardless of outcome.
  bisected = search_interval._replace(
      failed=search_interval.failed | newly_failed,
      func_evals=search_interval.func_evals + 1)

  def _update_interval():
    """Shrinks the bracket around the midpoint for the remaining members."""
    updated = hzl.update(
        value_and_gradients_function, bisected.left, bisected.right,
        mid_val, f_lim, active=remaining)
    return HagerZhangLineSearchResult(
        converged=bisected.converged,
        failed=bisected.failed | updated.failed,
        iterations=bisected.iterations + updated.iteration,
        func_evals=bisected.func_evals + updated.num_evals,
        left=updated.left,
        right=updated.right)

  def _skip_update():
    """Nothing left to update; keep the bookkeeping-only state."""
    return bisected

  return prefer_static.cond(
      tf.reduce_any(input_tensor=remaining), _update_interval, _skip_update)
def _line_search_inner_bisection(value_and_gradients_function,
                                 val_left,
                                 val_right,
                                 f_lim):
  """Bisects `[val_left, val_right]` once and updates the bracket.

  The objective is evaluated at the midpoint of the two end points. If that
  evaluation is finite the bracket is updated via `hzl.update`; otherwise the
  original end points are returned and the step is flagged as failed.

  Args:
    value_and_gradients_function: Callable evaluated at the midpoint; the
      result must expose `f` and `df`.
    val_left: Left end of the interval; its `x` field is read.
    val_right: Right end of the interval; its `x` field is read.
    f_lim: Forwarded unchanged to `hzl.update`.

  Returns:
    An `_UpdateResult` with fields `failed`, `num_evals`, `left` and `right`.
    `num_evals` includes the midpoint evaluation made here.
  """
  mid_x = (val_left.x + val_right.x) / 2
  mid_eval = value_and_gradients_function(mid_x)
  val_mid = _FnDFn(
      x=mid_x, f=mid_eval.f, df=mid_eval.df, full_result=mid_eval)

  def _on_finite_midpoint():
    """Updates the bracket when the midpoint evaluation is usable."""
    updated = hzl.update(
        value_and_gradients_function, val_left, val_right, val_mid, f_lim)
    return _UpdateResult(
        failed=updated.failed,
        # One extra evaluation for the midpoint itself.
        num_evals=updated.num_evals + 1,
        left=updated.left,
        right=updated.right)

  def _on_bad_midpoint():
    """Keeps the old bracket and reports failure; one evaluation was spent."""
    return _UpdateResult(
        failed=True, num_evals=1, left=val_left, right=val_right)

  return prefer_static.cond(
      hzl.is_finite(val_mid),
      true_fn=_on_finite_midpoint,
      false_fn=_on_bad_midpoint)
def _fix_step_size(value_and_gradients_function,
                   val_c_input,
                   active,
                   step_size_shrink_param):
  """Shrinks the step size until the value and gradient there are finite.

  Args:
    value_and_gradients_function: Callable re-evaluated at each shrunk step.
    val_c_input: Evaluation at the initial step; its `x` field is the step
      size that gets shrunk.
    active: Boolean mask of the members that are eligible for fixing.
    step_size_shrink_param: Multiplicative factor applied to the step size of
      every member that still needs fixing.

  Returns:
    The final `tf.while_loop` variables as a tuple: the number of loop
    iterations executed, the last evaluation at the (possibly shrunk) step,
    and the mask of members whose evaluation is still not finite.
  """
  # Cap the number of shrink attempts at ceil(-log2(eps)) for the dtype of the
  # step size, i.e. the number of halvings needed to go from 1 down to eps.
  max_shrinks = np.ceil(-np.log2(_machine_eps(val_c_input.x.dtype)))

  def _keep_shrinking(i, val_c, to_fix):
    """Continues while under the cap and some member still needs fixing."""
    del val_c  # Unused.
    return (i < max_shrinks) & tf.reduce_any(input_tensor=to_fix)

  def _shrink_once(i, val_c, to_fix):
    """Shrinks the flagged members' steps and re-evaluates the objective."""
    candidate_x = tf.where(to_fix, val_c.x * step_size_shrink_param, val_c.x)
    candidate_val = value_and_gradients_function(candidate_x)
    unresolved = to_fix & ~hzl.is_finite(candidate_val)
    return (i + 1, candidate_val, unresolved)

  # Only active members with a non-finite initial evaluation need repairing.
  needs_fix = active & ~hzl.is_finite(val_c_input)
  return tf.while_loop(
      cond=_keep_shrinking,
      body=_shrink_once,
      loop_vars=(0, val_c_input, needs_fix))
def hager_zhang(value_and_gradients_function,
                initial_step_size=None,
                value_at_initial_step=None,
                value_at_zero=None,
                threshold_use_approximate_wolfe_condition=1e-6,
                shrinkage_param=0.66,
                expansion_param=5.0,
                sufficient_decrease_param=0.1,
                curvature_param=0.9,
                step_size_shrink_param=0.1,
                max_iterations=50,
                name=None):
  """The Hager Zhang line search algorithm.

  Performs an inexact line search based on the algorithm of
  [Hager and Zhang (2006)][2].
  The univariate objective function `value_and_gradients_function` is typically
  generated by projecting a multivariate objective function along a search
  direction. Suppose the multivariate function to be minimized is
  `g(x1,x2, .. xn)`. Let (d1, d2, ..., dn) be the direction along which we wish
  to perform a line search. Then the projected univariate function to be used
  for line search is

  ```None
    f(a) = g(x1 + d1 * a, x2 + d2 * a, ..., xn + dn * a)
  ```

  The directional derivative along (d1, d2, ..., dn) is needed for this
  procedure. This also corresponds to the derivative of the projected function
  `f(a)` with respect to `a`. Note that this derivative must be negative for
  `a = 0` if the direction is a descent direction.

  The usual stopping criteria for the line search is the satisfaction of the
  (weak) Wolfe conditions. For details of the Wolfe conditions, see
  ref. [3]. On a finite precision machine, the exact Wolfe conditions can
  be difficult to satisfy when one is very close to the minimum and as argued
  by [Hager and Zhang (2005)][1], one can only expect the minimum to be
  determined within square root of machine precision. To improve the situation,
  they propose to replace the Wolfe conditions with an approximate version
  depending on the derivative of the function which is applied only when one
  is very close to the minimum. The following algorithm implements this
  enhanced scheme.

  ### Usage:

  Primary use of line search methods is as an internal component of a class of
  optimization algorithms (called line search based methods as opposed to
  trust region methods). Hence, the end user will typically not want to access
  line search directly. In particular, inexact line search should not be
  confused with a univariate minimization method. The stopping criteria of line
  search is the satisfaction of Wolfe conditions and not the discovery of the
  minimum of the function.

  With this caveat in mind, the following example illustrates the standalone
  usage of the line search.

  ```python
    # Define value and gradient namedtuple
    ValueAndGradient = namedtuple('ValueAndGradient', ['x', 'f', 'df'])
    # Define a quadratic target with minimum at 1.3.
    def value_and_gradients_function(x):
      return ValueAndGradient(x=x, f=(x - 1.3) ** 2, df=2 * (x-1.3))
    # Set initial step size.
    step_size = tf.constant(0.1)
    ls_result = tfp.optimizer.linesearch.hager_zhang(
        value_and_gradients_function, initial_step_size=step_size)
    # Evaluate the results.
    with tf.Session() as session:
      results = session.run(ls_result)
      # Ensure convergence.
      assert results.converged
      # If the line search converged, the left and the right ends of the
      # bracketing interval are identical.
      assert results.left.x == results.right.x
      # Print the number of evaluations and the final step size.
      print("Final Step Size: %f, Evaluations: %d" % (results.left.x,
                                                      results.func_evals))
  ```

  ### References:
  [1]: William Hager, Hongchao Zhang. A new conjugate gradient method with
    guaranteed descent and an efficient line search. SIAM J. Optim., Vol 16. 1,
    pp. 170-172. 2005.
    https://www.math.lsu.edu/~hozhang/papers/cg_descent.pdf

  [2]: William Hager, Hongchao Zhang. Algorithm 851: CG_DESCENT, a conjugate
    gradient method with guaranteed descent. ACM Transactions on Mathematical
    Software, Vol 32., 1, pp. 113-137. 2006.
    http://users.clas.ufl.edu/hager/papers/CG/cg_compare.pdf

  [3]: Jorge Nocedal, Stephen Wright. Numerical Optimization. Springer Series in
    Operations Research. pp 33-36. 2006

  Args:
    value_and_gradients_function: A Python callable that accepts a real scalar
      tensor and returns a namedtuple with the fields 'x', 'f', and 'df' that
      correspond to scalar tensors of real dtype containing the point at which
      the function was evaluated, the value of the function, and its
      derivative at that point. The other namedtuple fields, if present,
      should be tensors or sequences (possibly nested) of tensors.
      In usual optimization application, this function would be generated by
      projecting the multivariate objective function along some specific
      direction. The direction is determined by some other procedure but should
      be a descent direction (i.e. the derivative of the projected univariate
      function must be negative at 0.).
      Alternatively, the function may represent the batching of `n` such line
      functions (e.g. projecting a single multivariate objective function along
      `n` distinct directions at once) accepting n points as input, i.e. a
      tensor of shape [n], and the fields 'x', 'f' and 'df' in the returned
      namedtuple should each be a tensor of shape [n], with the corresponding
      input points, function values, and derivatives at those input points.
    initial_step_size: (Optional) Scalar positive `Tensor` of real dtype, or
      a tensor of shape [n] in batching mode. The initial value (or values) to
      try to bracket the minimum. Default is `1.` as a float32.
      Note that this point need not necessarily bracket the minimum for the line
      search to work correctly but the supplied value must be greater than 0.
      A good initial value will make the search converge faster.
    value_at_initial_step: (Optional) The full return value of evaluating
      value_and_gradients_function at initial_step_size, i.e. a namedtuple with
      'x', 'f', 'df', if already known by the caller. If supplied the value of
      `initial_step_size` will be ignored, otherwise the tuple will be computed
      by evaluating value_and_gradients_function.
    value_at_zero: (Optional) The full return value of
      value_and_gradients_function at `0.`, i.e. a namedtuple with
      'x', 'f', 'df', if already known by the caller. If not supplied the tuple
      will be computed by evaluating value_and_gradients_function.
    threshold_use_approximate_wolfe_condition: Scalar positive `Tensor`
      of real dtype. Corresponds to the parameter 'epsilon' in
      [Hager and Zhang (2006)][2]. Used to estimate the
      threshold at which the line search switches to approximate Wolfe
      conditions.
    shrinkage_param: Scalar positive Tensor of real dtype. Must be less than
      `1.`. Corresponds to the parameter `gamma` in
      [Hager and Zhang (2006)][2].
      If the secant**2 step does not shrink the bracketing interval by this
      proportion, a bisection step is performed to reduce the interval width.
    expansion_param: Scalar positive `Tensor` of real dtype. Must be greater
      than `1.`. Used to expand the initial interval in case it does not bracket
      a minimum. Corresponds to `rho` in [Hager and Zhang (2006)][2].
    sufficient_decrease_param: Positive scalar `Tensor` of real dtype.
      Bounded above by the curvature param. Corresponds to `delta` in the
      terminology of [Hager and Zhang (2006)][2].
    curvature_param: Positive scalar `Tensor` of real dtype. Bounded above
      by `1.`. Corresponds to 'sigma' in the terminology of
      [Hager and Zhang (2006)][2].
    step_size_shrink_param: Positive scalar `Tensor` of real dtype. Bounded
      above by `1`. If the supplied step size is too big (i.e. either the
      objective value or the gradient at that point is infinite), this factor
      is used to shrink the step size until it is finite.
    max_iterations: Positive scalar `Tensor` of integral dtype or None. The
      maximum number of iterations to perform in the line search. The number of
      iterations used to bracket the minimum are also counted against this
      parameter.
    name: (Optional) Python str. The name prefixed to the ops created by this
      function. If not supplied, the default name 'hager_zhang' is used.

  Returns:
    results: A namedtuple containing the following attributes.
      converged: Boolean `Tensor` of shape [n]. Whether a point satisfying
        Wolfe/Approx wolfe was found.
      failed: Boolean `Tensor` of shape [n]. Whether line search failed e.g.
        if either the objective function or the gradient are not finite at
        an evaluation point.
      iterations: Scalar int32 `Tensor`. Number of line search iterations made.
      func_evals: Scalar int32 `Tensor`. Number of function evaluations made.
      left: A namedtuple, as returned by value_and_gradients_function,
        of the left end point of the final bracketing interval. Values are
        equal to those of `right` on batch members where converged is True.
        Otherwise, it corresponds to the last interval computed.
      right: A namedtuple, as returned by value_and_gradients_function,
        of the right end point of the final bracketing interval. Values are
        equal to those of `left` on batch members where converged is True.
        Otherwise, it corresponds to the last interval computed.
  """
  # Every tensor-valued argument is listed so the name scope is associated
  # with all of the op inputs, including the step-size shrink factor and the
  # iteration limit.
  with tf.compat.v1.name_scope(name, 'hager_zhang', [
      initial_step_size,
      value_at_initial_step,
      value_at_zero,
      threshold_use_approximate_wolfe_condition,
      shrinkage_param,
      expansion_param,
      sufficient_decrease_param,
      curvature_param,
      step_size_shrink_param,
      max_iterations
  ]):
    val_0, val_initial, f_lim, prepare_evals = _prepare_args(
        value_and_gradients_function,
        initial_step_size,
        value_at_initial_step,
        value_at_zero,
        threshold_use_approximate_wolfe_condition)

    # Inputs are usable only if the evaluation at zero is finite with a
    # negative derivative (descent direction) and the initial step is a finite
    # positive number.
    valid_inputs = (hzl.is_finite(val_0) & (val_0.df < 0) &
                    tf.math.is_finite(val_initial.x) & (val_initial.x > 0))

    # Note: _fix_step_size returns immediately if either all inputs are invalid
    # or none need fixing.
    fix_step_evals, val_c, fix_failed = _fix_step_size(
        value_and_gradients_function, val_initial, valid_inputs,
        step_size_shrink_param)

    # A member fails up front if its inputs were invalid or its step size
    # could not be repaired.
    failed = ~valid_inputs | fix_failed

    init_interval = HagerZhangLineSearchResult(
        converged=tf.zeros_like(failed),  # i.e. all False.
        failed=failed,
        func_evals=prepare_evals + fix_step_evals,
        iterations=tf.convert_to_tensor(value=0),
        left=val_0,
        right=val_c)

    def _apply_bracket_and_search():
      """Bracketing and searching to do for valid inputs."""
      return _bracket_and_search(
          value_and_gradients_function, init_interval, f_lim, max_iterations,
          shrinkage_param, expansion_param, sufficient_decrease_param,
          curvature_param)

    # Skip the search entirely when every member has already failed.
    return prefer_static.cond(
        tf.reduce_any(input_tensor=~failed),
        _apply_bracket_and_search,
        lambda: init_interval)
def _body(i, val_c, to_fix):
  """Runs one shrink-and-re-evaluate step of the step-size repair loop.

  Args:
    i: Loop counter.
    val_c: Evaluation at the current step; its `x` field is the step size.
    to_fix: Boolean mask of the members whose step must still be shrunk.

  Returns:
    A tuple of the incremented counter, the evaluation at the new step, and
    the mask of members whose new evaluation is still not finite.
  """
  # Only the flagged members are shrunk; the rest keep their current step.
  candidate_x = tf.where(to_fix, val_c.x * step_size_shrink_param, val_c.x)
  candidate_val = value_and_gradients_function(candidate_x)
  unresolved = to_fix & ~hzl.is_finite(candidate_val)
  return (i + 1, candidate_val, unresolved)
def _valid_inputs_fn():
  """Performs bracketing and line search if inputs are valid.

  If the value or gradient at the supplied step is not finite, the step is
  first repaired via `_fix_step_size`. When the (possibly repaired) step is
  finite and positive, bracketing and search are run. Otherwise a result
  pinned at `val_0` is returned with `converged=True` and `failed=False`.

  Returns:
    A `HagerZhangLineSearchResult` describing the final bracketing interval.
  """

  def _as_result(converged, failed, func_evals, iterations, left, right):
    """Packs the two interval end points into the flat result tuple."""
    return HagerZhangLineSearchResult(
        converged=converged,
        failed=failed,
        func_evals=func_evals,
        iterations=iterations,
        left_pt=left.x,
        objective_at_left_pt=left.f,
        grad_objective_at_left_pt=left.df,
        right_pt=right.x,
        objective_at_right_pt=right.f,
        grad_objective_at_right_pt=right.df,
        full_result=left.full_result)

  # A non-finite value or gradient at the supplied step triggers a repair
  # attempt.
  c_is_finite = (tf.math.is_finite(val_c_input.df) &
                 tf.math.is_finite(val_c_input.f))
  step_size_too_large = ~c_is_finite

  def _shrink_step():
    # NOTE(review): called with three arguments and unpacked into two results
    # below; confirm this matches the `_fix_step_size` that is in scope.
    return _fix_step_size(value_and_gradients_function,
                          val_c_input,
                          step_size_shrink_param)

  def _keep_step():
    # No repair needed, so no extra function evaluations were spent.
    return (val_c_input, 0)

  val_c, fix_evals = prefer_static.cond(
      step_size_too_large, _shrink_step, _keep_step)

  # Check whether the step is usable after the repair attempt.
  valid_at_c = hzl.is_finite(val_c) & (val_c.x > 0)

  def _return_zero_step():
    """Falls back to the point at zero when the step is still unusable."""
    return _as_result(
        converged=tf.convert_to_tensor(value=True, name='converged'),
        failed=tf.convert_to_tensor(value=False, name='failed'),
        func_evals=prepare_evals + fix_evals,
        iterations=tf.convert_to_tensor(value=0),
        left=val_0,
        right=val_0)

  def _search():
    """Bracketing and searching to do if all inputs are valid."""
    outcome = _bracket_and_search(
        value_and_gradients_function,
        val_0,
        val_c,
        f_lim,
        max_iterations,
        shrinkage_param=shrinkage_param,
        expansion_param=expansion_param,
        sufficient_decrease_param=sufficient_decrease_param,
        curvature_param=curvature_param)
    converged = tf.convert_to_tensor(
        value=outcome.found_wolfe, name='converged')
    failed = tf.convert_to_tensor(value=outcome.failed, name='failed')
    return _as_result(
        converged=converged,
        failed=failed,
        func_evals=outcome.num_evals + prepare_evals + fix_evals,
        iterations=outcome.iteration,
        left=outcome.left,
        right=outcome.right)

  return prefer_static.cond(
      valid_at_c, true_fn=_search, false_fn=_return_zero_step)