# NOTE: imports assumed by the code below. NumPy/SciPy are required, and the
# HANSO helper routines (qpspecial, getbundle, linesch_ww, hgprod) are
# expected to be importable as sibling modules, mirroring the
# ``from linesch_sw import linesch_sw`` convention used further down.
import time

import numpy as np
from scipy import linalg, sparse

from qpspecial import qpspecial
from getbundle import getbundle
from linesch_ww import linesch_ww
from hgprod import hgprod


def postprocess(x, g, dnorm, X, G, w, verbose=1):
    """Postprocessing of a set of sampled or bundled gradients.

    If x is not one of the columns of X, prepend it to X and g to G and
    recompute w and dnorm: this can only reduce dnorm. Also set loc.dnorm
    to dnorm and loc.evaldist to the max distance from x to the columns
    of X.

    Note: w is needed as an input argument for the usual case where w is
    not recomputed but is simply passed back to the output.
    """
    dist = [linalg.norm(x - X[..., j], 2) for j in xrange(X.shape[1])]
    evaldist = np.max(dist)  # for returning
    indx = np.argmin(dist)   # for checking whether x is a column of X
    mindist = dist[indx]
    if mindist == 0 and indx == 0:
        # x is already the first column: nothing to do
        pass
    elif mindist == 0 and indx > 0:
        # this should not happen in HANSO 2.0
        # swap x and g into the first positions of X and G
        # might be necessary after local bundle, which is not used in
        # HANSO 2.0
        X[..., [0, indx]] = X[..., [indx, 0]]
        G[..., [0, indx]] = G[..., [indx, 0]]
        w[..., [0, indx]] = w[..., [indx, 0]]
    else:
        # this cannot happen after BFGS, but it may happen after gradient
        # sampling, for example if max iterations exceeded: line search
        # found a lower point but quit before solving new QP
        # prepend x to X and g to G and recompute w
        X = np.vstack((x, X.T)).T
        if not np.any(np.isnan(g)):
            G = np.column_stack((g, G))
            w, d, _, _ = qpspecial(G, verbose=verbose)  # Anders Skajaa's
                                                        # QP code
            dnorm = linalg.norm(d, 2)

    return {"dnorm": dnorm, "evaldist": evaldist}, X, G, w
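
# Illustrative usage sketch (not part of HANSO): how postprocess might be
# called on a tiny hand-made bundle. Shapes follow the convention used above
# (one sampled point per column of X, matching gradient in the same column
# of G); the values below are made up for demonstration only.
def _example_postprocess_usage():
    x = np.array([1., 2.])                 # final iterate
    g = np.array([.1, -.2])                # gradient at x
    X = np.array([[1., 1.1],               # sampled points, one per column;
                  [2., 2.1]])              # first column is x itself
    G = np.array([[.1, .3],
                  [-.2, -.1]])             # gradients at those points
    w = np.array([.5, .5])                 # convex weights from the QP
    dnorm = linalg.norm(np.dot(G, w), 2)   # norm of the convex combination
    loc, X, G, w = postprocess(x, g, dnorm, X, G, w, verbose=0)
    # loc["dnorm"] can only have decreased; loc["evaldist"] is the largest
    # distance from x to any column of X
    return loc, X, G, w
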
def gradsampfixed(func, x0, grad=None, f0=None, g0=None, samprad=1e-4,
                  maxit=10, gradnormtol=1e-6, fvalquit=-np.inf,
                  cpumax=np.inf, verbose=2, ngrad=None, **kwargs):
    """Gradient sampling minimization with fixed sampling radius.

    Intended to be called by gradsamp1run only.

    Parameters
    ----------
    func : callable func(x)
        function to minimise.

    x0: 1D array of len nvar, optional (default None)
        initial point

    grad : callable grad(x, *args)
        the gradient of `func`. If None, then `func` returns the function
        value and the gradient (``f, g = func(x, *args)``), unless
        `approx_grad` is True in which case `func` returns only ``f``.

    f0: float, optional (default None)
        function value at x0

    g0: 1D array of length nvar = len(x0), optional (default None)
        gradient at x0

    samprad: float, optional (default 1e-4)
        radius around x0, for sampling gradients

    See for example bfgs1run for the meaning of the other params.

    See Also
    --------
    `bfgs` and `bfgs1run`

    """

    def _fg(x):
        return func(x) if grad is None else (func(x), grad(x))

    def _log(msg, level=0):
        if verbose > level:
            print msg

    _log('gradsamp: sampling radius = %7.1e' % samprad)
    x = np.array(x0)
    f0 = _fg(x0)[0] if f0 is None else f0
    g0 = _fg(x0)[1] if g0 is None else g0
    f = f0
    g = g0
    X = x
    G = np.array([g]).T
    w = 1
    quitall = 0
    cpufinish = time.time() + cpumax
    dnorm = np.inf
    for it in xrange(maxit):
        # evaluate gradients at randomly generated points near x
        # first column of Xnew and Gnew are respectively x and g
        Xnew, Gnew = getbundle(func, x, grad=grad, g0=g, samprad=samprad,
                               n=ngrad)

        # solve QP subproblem
        wnew, dnew, _, _ = qpspecial(Gnew, verbose=verbose)

        dnew = -dnew  # this is a descent direction
        gtdnew = np.dot(g.T, dnew)  # gradient value at current point
        dnormnew = linalg.norm(dnew, 2)
        if dnormnew < dnorm:
            # for returning, may not be the final one
            dnorm = dnormnew
            X = Xnew
            G = Gnew
            w = wnew
        if dnormnew < gradnormtol:
            # since dnormnew is first to satisfy tolerance, it must equal
            # dnorm
            _log(' tolerance met at iter %d, f = %g, dnorm = %5.1e' % (
                it, f, dnorm))
            return x, f, g, dnorm, X, G, w, quitall
        elif gtdnew >= 0 or np.isnan(gtdnew):
            # dnorm, not dnormnew, which may be bigger
            _log(' not descent direction, quit at iter %d, f = %g, '
                 'dnorm = %5.1e' % (it, f, dnorm))
            return x, f, g, dnorm, X, G, w, quitall

        # note that dnew is NOT normalized, but we set second Wolfe
        # parameter to 0 so that sign of derivative must change
        # and this is accomplished by expansion steps when necessary,
        # so it does not seem necessary to normalize d
        wolfe1 = 0
        wolfe2 = 0
        alpha, x, f, g, fail, _, _, _ = linesch_ww(
            func, x, dnew, grad=grad, func0=f, grad0=g, wolfe1=wolfe1,
            wolfe2=wolfe2, fvalquit=fvalquit, verbose=verbose)
        _log(' iter %d: step = %5.1e, f = %g, dnorm = %5.1e' % (
            it, alpha, f, dnormnew), level=1)
        if f < fvalquit:
            _log(' reached target objective, quit at iter %d ' % it)
            quitall = 1
            return x, f, g, dnorm, X, G, w, quitall

        # if fail == 1, the Wolfe conditions are not both satisfied;
        # DO NOT quit, because this typically means the gradient set is
        # not rich enough and we should continue sampling
        if fail == -1:
            # function apparently unbounded below
            _log(' f may be unbounded below, quit at iter %d, f = %g' % (
                it, f))
            quitall = 1
            return x, f, g, dnorm, X, G, w, quitall
        if time.time() > cpufinish:
            _log(' cpu time limit exceeded, quit at iter %d' % it)
            quitall = 1
            return x, f, g, dnorm, X, G, w, quitall

    _log(' %d iters reached, f = %g, dnorm = %5.1e' % (maxit, f, dnorm))
    return x, f, g, dnorm, np.array(X), np.array(G), w, quitall
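
# Illustrative usage sketch (not part of HANSO): running the fixed-radius
# gradient sampling loop on a simple nonsmooth function. This assumes the
# helper modules imported at the top of the file (getbundle, qpspecial,
# linesch_ww) are available; the test function and settings are made up.
def _example_gradsampfixed_usage():
    # f(x) = |x[0]| + 2 * |x[1]|, with an (arbitrary) subgradient at kinks
    func = lambda x: np.abs(x[0]) + 2. * np.abs(x[1])
    grad = lambda x: np.array([np.sign(x[0]), 2. * np.sign(x[1])])
    x0 = np.array([1., -1.])
    x, f, g, dnorm, X, G, w, quitall = gradsampfixed(
        func, x0, grad=grad, samprad=1e-2, maxit=20, ngrad=4, verbose=0)
    # dnorm is the norm of the smallest vector in the convex hull of the
    # sampled gradients, the stationarity measure used throughout HANSO
    return x, f, dnorm, quitall
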
def bfgs1run(func, x0, grad=None, maxit=100, nvec=0, verbose=1,
             funcrtol=1e-6, gradnormtol=1e-4, fvalquit=-np.inf,
             xnormquit=np.inf, cpumax=np.inf, strongwolfe=False, wolfe1=0,
             wolfe2=.5, quitLSfail=1, ngrad=None, evaldist=1e-4, H0=None,
             scale=1):
    """Make a single run of BFGS (with inexact line search) from one
    starting point. Intended to be called from bfgs.

    Parameters
    ----------
    func : callable func(x)
        function to minimise.

    x0: 1D array of len nvar, optional (default None)
        initial point

    grad : callable grad(x, *args)
        the gradient of `func`. If None, then `func` returns the function
        value and the gradient (``f, g = func(x, *args)``), unless
        `approx_grad` is True in which case `func` returns only ``f``.

    nvec: int, optional (default 0)
        0 for the full BFGS matrix update; otherwise the number of vectors
        to save and use in the limited memory updates

    maxit: int, optional (default 100)
        maximum number of BFGS iterates we are ready to pay for

    wolfe1: float, optional (default 0)
        param passed to linesch_ww[sw] function

    wolfe2: float, optional (default .5)
        param passed to linesch_ww[sw] function

    strongwolfe: boolean, optional (default False)
        False for weak Wolfe line search (default), True for strong Wolfe
        line search. Strong Wolfe line search is not recommended for use
        with BFGS; it is very complicated and bad if f is nonsmooth;
        however, it can be useful to simulate an exact line search.

    fvalquit: float, optional (default -inf)
        quit if f drops below this value

    gradnormtol: float, optional (default 1e-4)
        termination tolerance on d: smallest vector in convex hull of up
        to ngrad gradients

    xnormquit: float, optional (default inf)
        quit if norm(x) exceeds this value

    evaldist: float, optional (default 1e-4)
        the gradients used in the termination test qualify only if they
        are evaluated at points approximately within distance evaldist of x

    H0: 2D array of shape (nvar, nvar), optional (default identity matrix)
        for full BFGS: initial inverse Hessian approximation (must be
        positive definite, but this is not checked); this could be drawn
        from a Wishart distribution;
        for limited memory BFGS: same, but applied every iteration (must
        be sparse in this case)

    scale: boolean, optional (default True)
        for full BFGS: 1 to scale H0 at first iteration, 0 otherwise
        for limited memory BFGS: 1 to scale H0 every time, 0 otherwise

    cpumax: float, optional (default inf)
        quit if cpu time in secs exceeds this (applies to total running
        time)

    verbose: int, optional (default 1)
        verbosity level; a message is printed if verbose exceeds the
        level of the message

    quitLSfail: int, optional (default 1)
        1 to quit when the line search fails, 0 to continue (potentially
        useful if func is not numerically continuous)

    ngrad: int, optional (default min(100, 2 * nvar, nvar + 10))
        number of gradients willing to save and use in solving QP to check
        optimality tolerance on smallest vector in their convex hull; see
        also next two options

    Returns
    -------
    x: 1D array of same length nvar = len(x0)
        final iterate

    f: float
        final function value

    d: 1D array of same length nvar
        final smallest vector in convex hull of saved gradients

    H: 2D array of shape (nvar, nvar)
        final inverse Hessian approximation

    iter: int
        number of iterations

    info: int
        reason for termination
        0: tolerance on smallest vector in convex hull of saved gradients
           met
        1: max number of iterations reached
        2: f reached target value
        3: norm(x) exceeded limit
        4: cpu time exceeded limit
        5: f or g is inf or nan at initial point
        6: direction not a descent direction (because of rounding)
        7: line search bracketed minimizer but Wolfe conditions not
           satisfied
        8: line search did not bracket minimizer: f may be unbounded below
        9: relative tolerance on function value met on last iteration

    X: 2D array of shape (nvar, nG), nG <= ngrad
        iterates where the saved gradients were evaluated, one per column

    G: 2D array of shape (nvar, nG), nG <= ngrad
        gradients evaluated at these points, one per column

    w: 1D array
        weights defining convex combination d = G*w

    fevalrec: list of length iter
        record of all function evaluations in the line searches

    xrec: list of length iter
        record of x iterates

    Hrec: list of length iter
        record of H (inverse Hessian) iterates

    times: list of floats
        time consumed in each iteration

    Raises
    ------
    ImportError

    """

    def _fg(x):
        return func(x) if grad is None else (func(x), grad(x))

    def _log(msg, level=0):
        if verbose > level:
            print msg

    # sanitize input
    x0 = np.array(x0).ravel()
    nvar = np.prod(x0.shape)
    H0 = np.eye(nvar) if H0 is None else H0
    ngrad = min(100, min(2 * nvar, nvar + 10)) if ngrad is None else ngrad
    x = np.array(x0)
    H = np.array(H0)

    # initialize auxiliary variables
    S = []
    Y = []
    xrec = []
    fevalrec = []
    Hrec = []
    X = np.array([x]).T
    nG = 1
    w = 1

    # prepare for timing
    cpufinish = time.time() + cpumax
    time0 = time.time()
    times = []

    # first evaluation
    f, g = _fg(x)
    # times.append((time.time() - time0, f))

    # check that all is still well
    d = np.array(g)
    G = np.array([g]).T
    if np.isnan(f) or np.isinf(f):
        _log('bfgs1run: f is infinite or nan at initial iterate')
        info = 5
        return x, f, d, H, 0, info, X, G, w, fevalrec, xrec, Hrec, times
    if np.any(np.isnan(g)) or np.any(np.isinf(g)):
        _log('bfgs1run: grad is infinite or nan at initial iterate')
        info = 5
        return x, f, d, H, 0, info, X, G, w, fevalrec, xrec, Hrec, times

    # enter: main loop
    dnorm = linalg.norm(g, 2)  # initialize dnorm stopping criterion
    f_old = f
    for it in xrange(maxit):
        p = -np.dot(H, g) if nvec == 0 else -hgprod(H, g, S, Y)
        gtp = np.dot(g.T, p)
        if gtp >= 0 or np.any(np.isnan(gtp)):
            _log('bfgs1run: not descent direction, quitting after %d '
                 'iteration(s), f = %g, dnorm = %5.1e, gtp=%s' % (
                     it + 1, f, dnorm, gtp))
            info = 6
            times.append((time.time() - time0, f))
            return x, f, d, H, it, info, X, G, w, fevalrec, xrec, Hrec, times
        gprev = np.array(g)  # for BFGS update
        if strongwolfe:
            # strong Wolfe line search is not recommended except to simulate
            # exact line search
            _log("Starting inexact line search (strong Wolfe) ...")

            # have we coded strong Wolfe line search ?
            try:
                from linesch_sw import linesch_sw
            except ImportError:
                raise ImportError(
                    '"linesch_sw" is not in path: it can be obtained from'
                    ' the NLCG distribution')

            alpha, x, f, g, fail, _, _, fevalrecline = linesch_sw(
                func, x, p, grad=grad, wolfe1=wolfe1, wolfe2=wolfe2,
                fvalquit=fvalquit, verbose=verbose)
            # function values are not returned in strongwolfe, so set
            # fevalrecline to nan
            # fevalrecline = np.nan

            _log("... done.")

            # exact line search: increase alpha slightly to get to other
            # side of a discontinuity in the nonsmooth case
            if wolfe2 == 0:
                increase = 1e-8 * (1 + alpha)
                x = x + increase * p
                _log(' exact line sch simulation: slightly increasing step '
                     'from %g to %g' % (alpha, alpha + increase), level=1)
                f, g = _fg(x)
        else:
            _log("Starting inexact line search (weak Wolfe) ...")
            alpha, x, f, g, fail, _, _, fevalrecline = linesch_ww(
                func, x, p, grad=grad, wolfe1=wolfe1, wolfe2=wolfe2,
                fvalquit=fvalquit, verbose=verbose)
            _log("... done.")

        # for the optimality check: discard the saved gradients iff the
        # new point x is not sufficiently close to the previous point,
        # and replace them with the new gradient
        if alpha * linalg.norm(p, 2) > evaldist:
            nG = 1
            G = np.array([g]).T
            X = np.array([x]).T
        # otherwise add the new gradient to the set of saved gradients,
        # discarding the oldest if we already have ngrad saved gradients
        elif nG < ngrad:
            nG += 1
            G = np.vstack((g, G.T)).T
            X = np.vstack((x, X.T)).T
        else:  # nG = ngrad
            G = np.vstack((g, G[..., :ngrad - 1].T)).T
            X = np.vstack((x, X[..., :ngrad - 1].T)).T

        # optimality check: compute smallest vector in convex hull
        # of qualifying gradients: reduces to norm of latest gradient
        # if ngrad = 1, and the set must always have at least one gradient:
        # could gain efficiency here by updating previous QP solution
        if nG > 1:
            _log("Computing shortest l2-norm vector in convex hull of "
                 "cached gradients: G = %s ..." % G.T)
            w, d, _, _ = qpspecial(G, verbose=verbose)
            _log("... done.")
        else:
            w = 1
            d = np.array(g)

        dnorm = linalg.norm(d, 2)

        # XXX these recordings should be optional!
        xrec.append(x)
        fevalrec.append(fevalrecline)
        Hrec.append(H)

        if verbose > 1:
            nfeval = len(fevalrecline)
            _log('bfgs1run: iter %d: nfevals = %d, step = %5.1e, f = %g, '
                 'nG = %d, dnorm = %5.1e' % (it, nfeval, alpha, f, nG,
                                             dnorm), level=1)

        if f < fvalquit:
            # this is checked inside the line search
            _log('bfgs1run: reached target objective, quitting after'
                 ' %d iteration(s)' % (it + 1))
            info = 2
            times.append((time.time() - time0, f))
            return x, f, d, H, it, info, X, G, w, fevalrec, xrec, Hrec, times
        # this is not checked inside the line search
        elif linalg.norm(x, 2) > xnormquit:
            _log('bfgs1run: norm(x) exceeds specified limit, quitting after'
                 ' %d iteration(s)' % (it + 1))
            info = 3
            times.append((time.time() - time0, f))
            return x, f, d, H, it, info, X, G, w, fevalrec, xrec, Hrec, times

        # line search failed (Wolfe conditions not both satisfied)
        if fail == 1:
            if not quitLSfail:
                _log('bfgs1run: continue although line search failed',
                     level=1)
            else:
                # quit since line search failed
                _log('bfgs1run: line search failed. Quitting after %d '
                     'iteration(s), f = %g, dnorm = %5.1e' % (it + 1, f,
                                                              dnorm))
                info = 7
                times.append((time.time() - time0, f))
                return (x, f, d, H, it, info, X, G, w, fevalrec, xrec, Hrec,
                        times)
        # function apparently unbounded below
        elif fail == -1:
            _log('bfgs1run: f may be unbounded below, quitting after %d '
                 'iteration(s), f = %g' % (it + 1, f))
            info = 8
            times.append((time.time() - time0, f))
            return x, f, d, H, it, info, X, G, w, fevalrec, xrec, Hrec, times

        # are we trapped in a local minimum ?
        relative_change = np.abs(1 - 1. * f_old / f) if f != f_old else 0
        if relative_change < funcrtol:
            _log('bfgs1run: relative change in func over last iteration'
                 ' (%g) below tolerance (%g), quitting after %d'
                 ' iteration(s), f = %g' % (relative_change, funcrtol,
                                            it + 1, f))
            info = 9
            times.append((time.time() - time0, f))
            return (x, f, d, H, it, info, X, G, w, fevalrec, xrec, Hrec,
                    times)

        # check near-stationarity
        if dnorm <= gradnormtol:
            if nG == 1:
                _log('bfgs1run: gradient norm below tolerance, quitting '
                     'after %d iteration(s), f = %g' % (it + 1, f))
            else:
                _log('bfgs1run: norm of smallest vector in convex hull of'
                     ' gradients below tolerance, quitting after '
                     '%d iteration(s), f = %g' % (it + 1, f))
            info = 0
            times.append((time.time() - time0, f))
            return x, f, d, H, it, info, X, G, w, fevalrec, xrec, Hrec, times

        if time.time() > cpufinish:
            _log('bfgs1run: cpu time limit exceeded, quitting after %d '
                 'iteration(s)' % (it + 1))
            info = 4
            times.append((time.time() - time0, f))
            return x, f, d, H, it, info, X, G, w, fevalrec, xrec, Hrec, times

        s = (alpha * p).reshape((-1, 1))
        y = g - gprev
        sty = np.dot(s.T, y)  # successful line search ensures this is
                              # positive
        if nvec == 0:
            # perform rank two BFGS update to the inverse Hessian H
            if sty > 0:
                if it == 0 and scale:
                    # for full BFGS, Nocedal and Wright recommend
                    # scaling I before the first update only
                    H = (1. * sty / np.dot(y.T, y)) * H

                # for formula, see Nocedal and Wright's book
                # M = I - rho*s*y', H = M*H*M' + rho*s*s', so we have
                # H = H - rho*s*y'*H - rho*H*y*s' + rho^2*s*y'*H*y*s'
                # + rho*s*s'; note that the last two terms combine:
                # (rho^2*y'Hy + rho)ss'
                rho = 1. / sty
                Hy = np.dot(H, y).reshape((-1, 1))
                rhoHyst = rho * np.dot(Hy, s.T)

                # old version: update may not be symmetric because of
                # rounding
                # H = H - rhoHyst' - rhoHyst + rho*s*(y'*rhoHyst)
                #     + rho*s*s';
                # new in version 2.02: make H explicitly symmetric,
                # also saves one outer product;
                # in practice, makes little difference, except H=H' exactly
                ytHy = np.dot(y.T, Hy)  # could be < 0 if H not numerically
                                        # pos def
                sstfactor = np.max([rho * rho * ytHy + rho, 0])
                sscaled = np.sqrt(sstfactor) * s
                H = H - (rhoHyst.T + rhoHyst) + np.dot(sscaled, sscaled.T)

                # alternatively add the update terms together first: does
                # not seem to make significant difference
                # update = sscaled*sscaled' - (rhoHyst' + rhoHyst);
                # H = H + update;
            else:
                # should not happen unless line search fails, and in that
                # case should normally have quit
                _log('bfgs1run: sty <= 0, skipping BFGS update at iteration'
                     ' %d ' % it, level=1)
        else:
            # save s and y vectors for limited memory update
            s = alpha * p
            y = g - gprev
            if it < nvec:
                S = np.vstack((S.T, s)).T if len(S) else s
                Y = np.vstack((Y.T, y)).T if len(Y) else y
            # could be more efficient here by avoiding moving the columns
            else:
                S = np.vstack((S[..., 1:nvec].T, s)).T
                Y = np.vstack((Y[..., 1:nvec].T, y)).T
            if scale:
                # recommended by Nocedal-Wright
                H = (np.dot(s.T, y) / np.dot(y.T, y)) * H0

        f_old = f
        times.append((time.time() - time0, f))
    # end of 'for loop'

    _log('bfgs1run: %d iteration(s) reached, f = %g, dnorm = %5.1e' % (
        maxit, f, dnorm))
    info = 1  # quit since max iterations reached
    return x, f, d, H, it, info, X, G, w, fevalrec, xrec, Hrec, times
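
# Illustrative usage sketch (not part of HANSO): a single BFGS run on a
# smooth convex quadratic, mainly to show the calling convention and how the
# outputs unpack. Assumes linesch_ww and qpspecial are available; the test
# problem is made up.
def _example_bfgs1run_usage():
    A = np.diag([1., 10.])
    func = lambda x: .5 * np.dot(x, np.dot(A, x))
    grad = lambda x: np.dot(A, x)
    x0 = np.array([3., -2.])
    (x, f, d, H, it, info, X, G, w,
     fevalrec, xrec, Hrec, times) = bfgs1run(
        func, x0, grad=grad, maxit=50, verbose=0)
    # `info` encodes the termination reason (see the docstring above);
    # `d` is the smallest vector in the convex hull of the saved gradients
    return x, f, info
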
def bfgs1run(func, x0, grad=None, maxit=100, nvec=0, funcrtol=1e-6,
             gradnormtol=1e-4, fvalquit=-np.inf, xnormquit=np.inf,
             cpumax=np.inf, strongwolfe=False, wolfe1=0, wolfe2=.5,
             quitLSfail=1, ngrad=None, evaldist=1e-4, H0=None, scale=1,
             verbose=2, callback=None):
    """Make a single run of BFGS (with inexact line search) from one
    starting point. Intended to be called iteratively from bfgs on several
    starting points.

    Parameters
    ----------
    func : callable func(x)
        function to minimise.

    x0: 1D array of len nvar, optional (default None)
        initial point

    grad : callable grad(x, *args)
        the gradient of `func`. If None, then `func` returns the function
        value and the gradient (``f, g = func(x, *args)``), unless
        `approx_grad` is True in which case `func` returns only ``f``.

    nvec: int, optional (default 0)
        0 for the full BFGS matrix update; otherwise the number of vectors
        to save and use in the limited memory updates

    maxit: int, optional (default 100)
        maximum number of BFGS iterates we are ready to pay for

    wolfe1: float, optional (default 0)
        param passed to linesch_ww[sw] function

    wolfe2: float, optional (default .5)
        param passed to linesch_ww[sw] function

    strongwolfe: boolean, optional (default False)
        False for weak Wolfe line search (default), True for strong Wolfe
        line search. Strong Wolfe line search is not recommended for use
        with BFGS; it is very complicated and bad if f is nonsmooth;
        however, it can be useful to simulate an exact line search.

    fvalquit: float, optional (default -inf)
        quit if f drops below this value

    gradnormtol: float, optional (default 1e-4)
        termination tolerance on d: smallest vector in convex hull of up
        to ngrad gradients

    xnormquit: float, optional (default inf)
        quit if norm(x) exceeds this value

    evaldist: float, optional (default 1e-4)
        the gradients used in the termination test qualify only if they
        are evaluated at points approximately within distance evaldist of x

    H0: 2D array of shape (nvar, nvar), optional (default identity matrix)
        for full BFGS: initial inverse Hessian approximation (must be
        positive definite, but this is not checked); this could be drawn
        from a Wishart distribution;
        for limited memory BFGS: same, but applied every iteration (must
        be sparse in this case)

    scale: boolean, optional (default True)
        for full BFGS: 1 to scale H0 at first iteration, 0 otherwise
        for limited memory BFGS: 1 to scale H0 every time, 0 otherwise

    cpumax: float, optional (default inf)
        quit if cpu time in secs exceeds this (applies to total running
        time)

    verbose: int, optional (default 2)
        verbosity level; a message is printed if verbose exceeds the
        level of the message

    quitLSfail: int, optional (default 1)
        1 to quit when the line search fails, 0 to continue (potentially
        useful if func is not numerically continuous)

    ngrad: int, optional (default min(100, 2 * nvar, nvar + 10))
        number of gradients willing to save and use in solving QP to check
        optimality tolerance on smallest vector in their convex hull; see
        also next two options

    callback: callable, optional (default None)
        if given, called as ``callback(x)`` with the current iterate at
        the start of each iteration

    Returns
    -------
    x: 1D array of same length nvar = len(x0)
        final iterate

    f: float
        final function value

    d: 1D array of same length nvar
        final smallest vector in convex hull of saved gradients

    H: 2D array of shape (nvar, nvar)
        final inverse Hessian approximation

    iter: int
        number of iterations

    info: int
        reason for termination
        0: tolerance on smallest vector in convex hull of saved gradients
           met
        1: max number of iterations reached
        2: f reached target value
        3: norm(x) exceeded limit
        4: cpu time exceeded limit
        5: f or g is inf or nan at initial point
        6: direction not a descent direction (because of rounding)
        7: line search bracketed minimizer but Wolfe conditions not
           satisfied
        8: line search did not bracket minimizer: f may be unbounded below
        9: relative tolerance on function value met on last iteration

    X: 2D array of shape (nvar, nG), nG <= ngrad
        iterates where the saved gradients were evaluated, one per column

    G: 2D array of shape (nvar, nG), nG <= ngrad
        gradients evaluated at these points, one per column

    w: 1D array
        weights defining convex combination d = G*w

    fevalrec: list of length iter
        record of all function evaluations in the line searches

    xrec: list of length iter
        record of x iterates

    Hrec: list of length iter
        record of H (inverse Hessian) iterates

    times: list of floats
        time consumed in each iteration

    Raises
    ------
    ImportError

    """

    def _fg(x):
        return func(x) if grad is None else (func(x), grad(x))

    def _log(msg, level=0):
        if verbose > level:
            print msg

    # sanitize input
    x0 = x0.ravel()
    nvar = np.prod(x0.shape)
    H0 = sparse.eye(nvar) if H0 is None else H0
    ngrad = min(100, min(2 * nvar, nvar + 10)) if ngrad is None else ngrad
    x = x0
    H = H0

    # initialize auxiliary variables
    S = []
    Y = []
    xrec = []
    fevalrec = []
    Hrec = []
    X = x[:, np.newaxis]
    nG = 1
    w = 1

    # prepare for timing
    cpufinish = time.time() + cpumax
    time0 = time.time()
    times = []

    # first evaluation
    f, g = _fg(x)

    # check that all is still well
    d = g
    G = g[:, np.newaxis]
    if np.isnan(f) or np.isinf(f):
        _log('bfgs1run: f is infinite or nan at initial iterate')
        info = 5
        return x, f, d, H, 0, info, X, G, w, fevalrec, xrec, Hrec, times
    if np.any(np.isnan(g)) or np.any(np.isinf(g)):
        _log('bfgs1run: grad is infinite or nan at initial iterate')
        info = 5
        return x, f, d, H, 0, info, X, G, w, fevalrec, xrec, Hrec, times

    # enter: main loop
    dnorm = linalg.norm(g, 2)  # initialize dnorm stopping criterion
    f_old = f
    for it in xrange(maxit):
        times.append((time.time() - time0, f))
        if callback:
            callback(x)
        p = -H.dot(g) if nvec == 0 else -hgprod(
            H, g.copy(),  # we must not corrupt g!
            S, Y)
        gtp = g.T.dot(p)
        if np.isnan(gtp) or gtp >= 0:
            _log('bfgs1run: not descent direction, quitting after %d '
                 'iteration(s), f = %g, dnorm = %5.1e, gtp=%s' % (
                     it + 1, f, dnorm, gtp))
            info = 6
            return x, f, d, H, it, info, X, G, w, fevalrec, xrec, Hrec, times
        gprev = g  # for BFGS update
        if strongwolfe:
            # strong Wolfe line search is not recommended except to simulate
            # exact line search
            _log("Starting inexact line search (strong Wolfe) ...")

            # have we coded strong Wolfe line search ?
            try:
                from linesch_sw import linesch_sw
            except ImportError:
                raise ImportError(
                    '"linesch_sw" is not in path: it can be obtained from'
                    ' the NLCG distribution')

            alpha, x, f, g, fail, _, _, fevalrecline = linesch_sw(
                func, x, p, grad=grad, wolfe1=wolfe1, wolfe2=wolfe2,
                fvalquit=fvalquit, verbose=verbose)
            # function values are not returned in strongwolfe, so set
            # fevalrecline to nan
            # fevalrecline = np.nan

            _log("... done.")

            # exact line search: increase alpha slightly to get to other
            # side of a discontinuity in the nonsmooth case
            if wolfe2 == 0:
                increase = 1e-8 * (1 + alpha)
                x = x + increase * p
                _log(' exact line sch simulation: slightly increasing step '
                     'from %g to %g' % (alpha, alpha + increase), level=1)
                f, g = _fg(x)
        else:
            _log("Starting inexact line search (weak Wolfe) ...")
            alpha, x, f, g, fail, _, _, fevalrecline = linesch_ww(
                func, x, p, grad=grad, wolfe1=wolfe1, wolfe2=wolfe2,
                fvalquit=fvalquit, verbose=verbose)
            _log("... done.")

        # for the optimality check: discard the saved gradients iff the
        # new point x is not sufficiently close to the previous point,
        # and replace them with the new gradient
        if alpha * linalg.norm(p, 2) > evaldist:
            nG = 1
            G = g[:, np.newaxis]
            X = x[:, np.newaxis]
        # otherwise add the new gradient to the set of saved gradients,
        # discarding the oldest if we already have ngrad saved gradients
        elif nG < ngrad:
            nG = nG + 1
            G = np.column_stack((g, G))
            X = np.column_stack((x, X))
        else:  # nG = ngrad
            G = np.column_stack((g, G[:, :ngrad - 1]))
            X = np.column_stack((x, X[:, :ngrad - 1]))

        # optimality check: compute smallest vector in convex hull
        # of qualifying gradients: reduces to norm of latest gradient
        # if ngrad = 1, and the set must always have at least one gradient:
        # could gain efficiency here by updating previous QP solution
        if nG > 1:
            _log("Computing shortest l2-norm vector in convex hull of "
                 "cached gradients: G = %s ..." % G.T)
            w, d, _, _ = qpspecial(G, verbose=verbose)
            _log("... done.")
        else:
            w = 1
            d = g

        dnorm = linalg.norm(d, 2)

        # XXX these recordings should be optional!
        xrec.append(x)
        fevalrec.append(fevalrecline)
        Hrec.append(H)

        if verbose > 1:
            nfeval = len(fevalrecline)
            _log('bfgs1run: iter %d: nfevals = %d, step = %5.1e, f = %g, '
                 'nG = %d, dnorm = %5.1e' % (it, nfeval, alpha, f, nG,
                                             dnorm), level=1)

        if f < fvalquit:
            # this is checked inside the line search
            _log('bfgs1run: reached target objective, quitting after'
                 ' %d iteration(s)' % (it + 1))
            info = 2
            return x, f, d, H, it, info, X, G, w, fevalrec, xrec, Hrec, times
        # this is not checked inside the line search
        elif linalg.norm(x, 2) > xnormquit:
            _log('bfgs1run: norm(x) exceeds specified limit, quitting after'
                 ' %d iteration(s)' % (it + 1))
            info = 3
            return x, f, d, H, it, info, X, G, w, fevalrec, xrec, Hrec, times

        # line search failed (Wolfe conditions not both satisfied)
        if fail == 1:
            if not quitLSfail:
                _log('bfgs1run: continue although line search failed',
                     level=1)
            else:
                # quit since line search failed
                _log('bfgs1run: line search failed. Quitting after %d '
                     'iteration(s), f = %g, dnorm = %5.1e' % (it + 1, f,
                                                              dnorm))
                info = 7
                return (x, f, d, H, it, info, X, G, w, fevalrec, xrec, Hrec,
                        times)
        # function apparently unbounded below
        elif fail == -1:
            _log('bfgs1run: f may be unbounded below, quitting after %d '
                 'iteration(s), f = %g' % (it + 1, f))
            info = 8
            return x, f, d, H, it, info, X, G, w, fevalrec, xrec, Hrec, times

        # are we trapped in a local minimum ?
        relative_change = (np.abs(f - f_old) / np.max(np.abs(
            [f, f_old, 1])) if f != f_old else 0)
        if relative_change < funcrtol:
            _log('bfgs1run: relative change in func over last iteration'
                 ' (%g) below tolerance (%g), quitting after %d'
                 ' iteration(s), f = %g' % (relative_change, funcrtol,
                                            it + 1, f))
            info = 9
            return (x, f, d, H, it, info, X, G, w, fevalrec, xrec, Hrec,
                    times)

        # check near-stationarity
        if dnorm <= gradnormtol:
            if nG == 1:
                _log('bfgs1run: gradient norm below tolerance, quitting '
                     'after %d iteration(s), f = %g' % (it + 1, f))
            else:
                _log('bfgs1run: norm of smallest vector in convex hull of'
                     ' gradients below tolerance, quitting after '
                     '%d iteration(s), f = %g' % (it + 1, f))
            info = 0
            return x, f, d, H, it, info, X, G, w, fevalrec, xrec, Hrec, times

        if time.time() > cpufinish:
            _log('bfgs1run: cpu time limit exceeded, quitting after %d '
                 'iteration(s)' % (it + 1))
            info = 4
            return x, f, d, H, it, info, X, G, w, fevalrec, xrec, Hrec, times

        s = (alpha * p)[:, np.newaxis]
        y = (g - gprev)[:, np.newaxis]
        sty = s.T.dot(y)  # successful line search ensures this is positive
        if nvec == 0:
            # perform rank two BFGS update to the inverse Hessian H
            if sty > 0:
                if it == 0 and scale:
                    # for full BFGS, Nocedal and Wright recommend
                    # scaling I before the first update only
                    H = (sty / y.T.dot(y)).ravel()[0] * H

                # for formula, see Nocedal and Wright's book
                # M = I - rho*s*y', H = M*H*M' + rho*s*s', so we have
                # H = H - rho*s*y'*H - rho*H*y*s' + rho^2*s*y'*H*y*s'
                # + rho*s*s'; note that the last two terms combine:
                # (rho^2*y'Hy + rho)ss'
                rho = 1. / sty
                Hy = H.dot(y)
                rhoHyst = rho * Hy.dot(s.T)

                # old version: update may not be symmetric because of
                # rounding
                # H = H - rhoHyst' - rhoHyst + rho*s*(y'*rhoHyst)
                #     + rho*s*s';
                # new in version 2.02: make H explicitly symmetric,
                # also saves one outer product;
                # in practice, makes little difference, except H=H' exactly
                ytHy = y.T.dot(Hy)  # could be < 0 if H not numerically
                                    # pos def
                sstfactor = np.max([rho * rho * ytHy + rho, 0])
                sscaled = np.sqrt(sstfactor) * s
                H = sparse.csr_matrix(
                    H.toarray() - (rhoHyst.T + rhoHyst) +
                    sscaled.dot(sscaled.T))

                # alternatively add the update terms together first: does
                # not seem to make significant difference
                # update = sscaled*sscaled' - (rhoHyst' + rhoHyst);
                # H = H + update;
            else:
                # should not happen unless line search fails, and in that
                # case should normally have quit
                _log('bfgs1run: sty <= 0, skipping BFGS update at iteration'
                     ' %d ' % it, level=1)
        else:
            # save s and y vectors for limited memory update
            s = alpha * p
            y = g - gprev
            if it < nvec:
                S = np.column_stack([S, s]) if len(S) else s
                Y = np.column_stack([Y, y]) if len(Y) else y
            # could be more efficient here by avoiding moving the columns
            else:
                S = np.column_stack((S[:, 1:nvec], s))
                Y = np.column_stack((Y[:, 1:nvec], y))
            if scale:
                # recommended by Nocedal-Wright
                H = ((s.T.dot(y)) / (y.T.dot(y))) * H0

        f_old = f
    # end of 'for loop'

    _log('bfgs1run: %d iteration(s) reached, f = %g, dnorm = %5.1e' % (
        maxit, f, dnorm))
    info = 1  # quit since max iterations reached
    return x, f, d, H, it, info, X, G, w, fevalrec, xrec, Hrec, times
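
# Illustrative usage sketch (not part of HANSO): exercising the callback hook
# and the limited-memory branch (nvec > 0) of this implementation. Assumes
# linesch_ww, qpspecial and hgprod are available; the test problem is made up.
def _example_bfgs1run_limited_memory():
    func = lambda x: np.sum((x - 1.) ** 2)
    grad = lambda x: 2. * (x - 1.)
    x0 = np.zeros(5)
    trace = []  # collect one copy of the iterate per outer iteration
    out = bfgs1run(func, x0, grad=grad, nvec=3, maxit=50, verbose=0,
                   callback=lambda xk: trace.append(np.array(xk)))
    x, f, info = out[0], out[1], out[5]
    return x, f, info, len(trace)
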