def step(self, xk, data, trust):
    """ Computes the next step in the parameter space.

    The Hessian is first checked for small or negative eigenvalues (after
    removing the excised rows / columns); if the smallest one is below
    self.eps, a multiple of the identity is added to the Hessian.  A step is
    then obtained either by a Levenberg-type shifted Newton-Raphson solve or,
    when self.bhyp is set, by numerically minimizing the quadratic model plus
    the penalty term.  The shift is parameterized by L, and (L-1)**2 is added
    to the Hessian diagonal, so L = 1 is the unshifted step.

    If self.trust0 > 0 the step is restricted to the trust radius; otherwise
    a line search over L is performed on the actual objective function.

    @param[in] xk The current parameter vector
    @param[in] data Dictionary returned by the objective function; the keys
    'X', 'G', 'H' are read (plus 'X0', 'G0', 'H0' when self.bhyp is set).
    The arrays in it are not modified.
    @param[in] trust The trust radius
    @return dx The step, with zeros at the excised indices
    @return expect The expected change in the objective function
    @return bump True if the unshifted step was longer than the trust radius
    and had to be shortened

    """
    from scipy import optimize

    X, G, H = (data['X0'], data['G0'], data['H0']) if self.bhyp else (data['X'], data['G'], data['H'])
    # np.delete returns a new array, so the caller's Hessian is left untouched.
    H1 = np.delete(H, self.excision, axis=0)
    H1 = np.delete(H1, self.excision, axis=1)
    Eig = eig(H1)[0]            # Diagonalize Hessian
    Emin = min(Eig)
    if Emin < self.eps:         # Mix in SD step if Hessian minimum eigenvalue is negative
        # Experiment.
        Adj = max(self.eps, 0.01*abs(Emin)) - Emin
        print("Hessian has a small or negative eigenvalue (%.1e), mixing in some steepest descent (%.1e) to correct this." % (Emin, Adj))
        print("Eigenvalues are:")
        pvec1d(Eig)
        # Bind a new array rather than using += so that the Hessian stored
        # in the caller's data dictionary is not silently modified.
        H = H + Adj*np.eye(H.shape[0])

    if self.bhyp:
        G = np.delete(G, self.excision)
        H = np.delete(H, self.excision, axis=0)
        H = np.delete(H, self.excision, axis=1)
        xkd = np.delete(xk, self.excision)
        if self.Objective.Penalty.fmul != 0.0:
            warn_press_key("Using the multiplicative hyperbolic penalty is discouraged!")
        # This is the gradient and Hessian without the contributions from the hyperbolic constraint.
        Obj0 = {'X':X,'G':G,'H':H}

        class Hyper(object):
            """ Quadratic model of the objective plus the penalty term,
            evaluated as a function of the step dx.  The last evaluation is
            cached so that the value and gradient callbacks share one call
            to the penalty function. """
            def __init__(self, HL, Penalty):
                self.H = HL.copy()
                # Start with a cached step that no real step will match,
                # which forces the first evaluation.
                self.dx = 1e10 * np.ones(len(HL),dtype=float)
                self.Val = 0
                self.Grad = np.zeros(len(HL),dtype=float)
                # NOTE: Hess is never updated by _compute, so compute_hess
                # always returns zeros.  It is not used by the solvers below.
                self.Hess = np.zeros((len(HL),len(HL)),dtype=float)
                self.Penalty = Penalty

            def _compute(self, dx):
                self.dx = dx.copy()
                Tmp = np.mat(self.H)*col(dx)
                Reg_Term = self.Penalty.compute(xkd+flat(dx), Obj0)
                # The value is reported relative to the current full objective function.
                self.Val = (X + np.dot(dx, G) + 0.5*row(dx)*Tmp + Reg_Term[0] - data['X'])[0,0]
                self.Grad = flat(col(G) + Tmp) + Reg_Term[1]

            def compute_val(self, dx):
                if norm(dx - self.dx) > 1e-8:
                    self._compute(dx)
                return self.Val

            def compute_grad(self, dx):
                if norm(dx - self.dx) > 1e-8:
                    self._compute(dx)
                return self.Grad

            def compute_hess(self, dx):
                if norm(dx - self.dx) > 1e-8:
                    self._compute(dx)
                return self.Hess

        def hyper_solver(L):
            # Attempt to use plain Levenberg (identity shift rather than
            # scaling the Hessian diagonal).
            HL = H + (L-1)**2*np.eye(len(H))
            HYP = Hyper(HL, self.Objective.Penalty)

            def minimize_from(start):
                # Try BFGS first and fall back to the simplex method if it
                # fails.  Only Exception is caught, so that Ctrl-C and
                # SystemExit are not swallowed.
                try:
                    return optimize.fmin_bfgs(HYP.compute_val,start,fprime=HYP.compute_grad,gtol=1e-5,full_output=True,disp=0)
                except Exception:
                    return optimize.fmin(HYP.compute_val,start,full_output=True,disp=0)

            # Two starting points are tried: no step at all, and the step -xkd.
            Opt1 = minimize_from(np.zeros(len(xkd),dtype=float))
            Opt2 = minimize_from(-xkd)
            dx1, sol1 = Opt1[0], Opt1[1]
            dx2, sol2 = Opt2[0], Opt2[1]
            dxb, sol = (dx1, sol1) if sol1 <= sol2 else (dx2, sol2)
            for i in self.excision:    # Reinsert deleted coordinates - don't take a step in those directions
                dxb = np.insert(dxb, i, 0)
            return dxb, sol
    else:
        G = np.delete(G, self.excision)
        H = np.delete(H, self.excision, axis=0)
        H = np.delete(H, self.excision, axis=1)
        # Blank line in the output (kept from the original implementation).
        print("")

        def para_solver(L):
            # Levenberg-Marquardt
            # Attempt to use plain Levenberg (identity shift rather than
            # scaling the Hessian diagonal).
            HT = H + (L-1)**2*np.eye(len(H))
            Hi = invert_svd(np.mat(HT))
            dx = flat(-1 * Hi * col(G))
            # Expected change from the quadratic model, using the unshifted Hessian.
            sol = flat(0.5*row(dx)*np.mat(H)*col(dx))[0] + np.dot(dx,G)
            for i in self.excision:    # Reinsert deleted coordinates - don't take a step in those directions
                dx = np.insert(dx, i, 0)
            return dx, sol

    def solver(L):
        return hyper_solver(L) if self.bhyp else para_solver(L)

    def trust_fun(L):
        # Squared difference between the step length and the trust radius;
        # minimizing this over L gives a step on the trust boundary.
        N = norm(solver(L)[0])
        return (N - trust)**2

    def search_fun(L):
        # Evaluate ONLY the objective function.  Most useful when
        # the objective is cheap, but the derivative is expensive.
        dx, sol = solver(L) # dx is how much the step changes from the previous step.
        # This is our trial step.
        xk_ = dx + xk
        Result = self.Objective.Full(xk_,0,verbose=False)['X'] - data['X']
        print("Searching! Hessian diagonal addition = %.4e, L = % .4e, length %.4e, result %.4e" % ((L-1)**2,L,norm(dx),Result))
        return Result

    if self.trust0 > 0: # This is the trust region code.
        bump = False
        dx, expect = solver(1)
        dxnorm = norm(dx)
        if dxnorm > trust:
            bump = True
            # Tried a few optimizers here, seems like Brent works well.
            # Okay, the problem with Brent is that the tolerance is fractional.
            # If the optimized value is zero, then it takes a lot of meaningless steps.
            LOpt = optimize.brent(trust_fun,brack=(self.lmg,self.lmg*4),tol=1e-6)
            dx, expect = solver(LOpt)
            dxnorm = norm(dx)
            print("\rLevenberg-Marquardt: %s step found (length %.3e), % .8f added to Hessian diagonal" % ('hyperbolic-regularized' if self.bhyp else 'Newton-Raphson', dxnorm, (LOpt-1)**2))
    else: # This is the nonlinear search code.
        # First obtain a step that is the same length as the provided trust radius.
        LOpt = optimize.brent(trust_fun,brack=(self.lmg,self.lmg*4),tol=1e-6)
        bump = False
        # With full_output, brent returns the minimizer first and the
        # function value at the minimizer second.
        Result = optimize.brent(search_fun,brack=(LOpt,LOpt*4),tol=self.search_tol,full_output=1)
        dx, _ = solver(Result[0])
        expect = Result[1]

    ## Decide which parameters to redirect.
    ## Currently not used.
    if self.Objective.Penalty.ptyp in [3,4,5]:
        self.FF.make_redirect(dx+xk)

    return dx, expect, bump
def MainOptimizer(self,b_BFGS=0):
    """ The main ForceBalance adaptive trust-radius pseudo-Newton optimizer.
    Tried and true in many situations. :)

    Usually this function is called with the BFGS or NewtonRaphson
    method.  The NewtonRaphson method is consistently the best
    method I have, because I always provide at least an
    approximate Hessian to the objective function.  The BFGS
    method is vestigial and currently does not work.

    BFGS is a pseudo-Newton method in the sense that it builds an
    approximate Hessian matrix from the gradient information in previous
    steps; Newton-Raphson requires the actual Hessian matrix.
    However, the algorithms are similar in that they both compute the
    step by inverting the Hessian and multiplying by the gradient.

    The method adaptively changes the step size.  If the step is
    sufficiently good (i.e. the objective function goes down by a
    large fraction of the predicted decrease), then the step size
    is increased; if the step is bad, then it rejects the step and
    tries again.

    The optimization is terminated when the gradient norm, the step size
    or the spread of the objective function history falls below its
    tolerance, or when the maximum number of steps is reached.

    The module-level variables ITERATION_NUMBER and GOODSTEP are updated
    as the optimization proceeds, and self.chk is refreshed at the top of
    every iteration.

    @param[in] b_BFGS Switch to use BFGS (True) or Newton-Raphson (False)
    @return xk The parameter vector at the end of the optimization

    """
    global ITERATION_NUMBER
    global GOODSTEP

    if any('liquid' in tgt.name.lower() for tgt in self.Objective.Targets) and self.conv_obj < 1e-3:
        warn_press_key("Condensed phase targets detected - may not converge with current choice of convergence_objective (%.e)\nRecommended range is 1e-2 - 1e-1 for this option." % self.conv_obj)
    # Parameters for the adaptive trust radius
    a = self.adapt_fac  # Default value is 0.5, decrease to make more conservative.  Zero to turn off all adaptive.
    b = self.adapt_damp # Default value is 0.5, increase to make more conservative
    printcool( "Main Optimizer\n%s Mode%s" % ("BFGS" if b_BFGS else "Newton-Raphson", " (Static Radius)" if a == 0.0 else " (Adaptive Radius)"), ansi=1, bold=1)
    # First, set a bunch of starting values
    # Ord is passed to the objective function on every evaluation.
    Ord = 1 if b_BFGS else 2
    ITERATION_NUMBER = 0
    Best_Step = 1
    if all(i in self.chk for i in ['xk','X','G','H','ehist','x_best','xk_prev','trust']):
        print("Reading initial objective, gradient, Hessian from checkpoint file")
        xk, X, G, H, ehist = self.chk['xk'], self.chk['X'], self.chk['G'], self.chk['H'], self.chk['ehist']
        X_best, xk_prev, trust = self.chk['x_best'], self.chk['xk_prev'], self.chk['trust']
        # The checkpoint does not contain the full objective function
        # dictionary, but step() and the final report both read it.
        # Rebuild it at the restored parameters; without this a restart
        # fails with an unbound 'data'.
        # NOTE(review): this costs one extra objective evaluation on
        # restart; storing 'data' in the checkpoint would avoid it.
        data = self.Objective.Full(xk,Ord,verbose=True)
    else:
        xk = self.mvals0.copy()
        print("")
        data = self.Objective.Full(xk,Ord,verbose=True)
        X, G, H = data['X'], data['G'], data['H']
        ehist = np.array([X])
        xk_prev = xk.copy()
        trust = abs(self.trust0)
        X_best = X

    X_prev = X
    G_prev = G.copy()
    H_stor = H.copy()
    # Snapshot restored by the non-reevaluating rejection branch; set here
    # so that it exists even if the very first step is rejected.
    datastor = deepcopy(data)
    ndx = 0.0
    color = "\x1b[1m"
    nxk = norm(xk)
    ngr = norm(G)

    Quality = 0.0
    restep = False
    GOODSTEP = 1

    # Progress table: the header never changes, the row is filled in each time.
    header = "%6s%12s%12s%12s%14s%12s%12s" % ("Step", " |k| "," |dk| "," |grad| "," -=X2=- ","Delta(X2)", "StepQual")
    rowfmt = "%6i%12.3e%12.3e%12.3e%s%14.5e\x1b[0m%12.3e% 11.3f\n"

    while True: # Loop until convergence is reached.
        ## Put data into the checkpoint file
        self.chk = {'xk': xk, 'X' : X, 'G' : G, 'H': H, 'ehist': ehist,
                    'x_best': X_best,'xk_prev': xk_prev, 'trust': trust}
        if self.wchk_step:
            self.writechk()
        # Spread of the lowest self.hist objective function values.  This is
        # written with explicit branches because a spread of exactly zero
        # is falsy: the "cond and a or b" form would then fall back to the
        # spread of the whole history and hide the convergence.
        if len(ehist) > self.hist:
            stdfront = np.std(np.sort(ehist)[:self.hist])
        elif len(ehist) > 0:
            stdfront = np.std(ehist)
        else:
            stdfront = 0.0
        stdfront *= 2 # Factor of two is so [0,1] stdev is normalized to 1
        print(header)
        print(rowfmt % (ITERATION_NUMBER, nxk, ndx, ngr, color, X, stdfront, Quality))
        # Check the convergence criteria
        if ngr < self.conv_grd:
            print("Convergence criterion reached for gradient norm (%.2e)" % self.conv_grd)
            break
        if ITERATION_NUMBER == self.maxstep:
            print("Maximum number of optimization steps reached (%i)" % ITERATION_NUMBER)
            break
        # The step size and objective criteria are skipped right after a
        # rejected step, where the step length has been reset to zero.
        if ndx < self.conv_stp and ITERATION_NUMBER > 0 and not restep:
            print("Convergence criterion reached in step size (%.2e)" % self.conv_stp)
            break
        if stdfront < self.conv_obj and len(ehist) > self.hist and not restep:
            print("Convergence criterion reached for objective function (%.2e)" % self.conv_obj)
            break
        if self.print_grad:
            bar = printcool("Total Gradient",color=4)
            self.FF.print_map(vals=G,precision=8)
            print(bar)
        if self.print_hess:
            bar = printcool("Total Hessian",color=4)
            pmat2d(H,precision=8)
            print(bar)
        # Remember the objective function breakdown of the best step so far.
        if Best_Step:
            for key, val in self.Objective.ObjDict.items():
                self.Objective.ObjDict_Last[key] = val
        restep = False
        dx, dX_expect, bump = self.step(xk, data, trust)
        old_pk = self.FF.create_pvals(xk)
        old_xk = xk.copy()
        # Increment the iteration counter.
        ITERATION_NUMBER += 1
        # Take a step in the parameter space.
        xk += dx
        if self.print_vals:
            pk = self.FF.create_pvals(xk)
            dp = pk - old_pk
            bar = printcool("Mathematical Parameters (Current + Step = Next)",color=5)
            self.FF.print_map(vals=["% .4e %s %.4e = % .4e" % (old, '+' if d >= 0 else '-', abs(d), new) for old, d, new in zip(old_xk, dx, xk)])
            print(bar)
            bar = printcool("Physical Parameters (Current + Step = Next)",color=5)
            self.FF.print_map(vals=["% .4e %s %.4e = % .4e" % (old, '+' if d >= 0 else '-', abs(d), new) for old, d, new in zip(old_pk, dp, pk)])
            print(bar)
        # Evaluate the objective function and its derivatives.
        data = self.Objective.Full(xk,Ord,verbose=True)
        X, G, H = data['X'], data['G'], data['H']
        ndx = norm(dx)
        nxk = norm(xk)
        ngr = norm(G)

        # Step quality is the actual change divided by the expected change.
        dX_actual = X - X_prev
        try:
            Quality = dX_actual / dX_expect
        except (ZeroDivisionError, FloatingPointError):
            # Only a failed division is handled here; anything else
            # (including Ctrl-C) is allowed to propagate.
            print("Warning: Step size of zero detected (i.e. wrong direction). Try reducing the finite_difference_h parameter")
            Quality = 1.0 # This is a step length of zero.

        if Quality <= 0.25 and X < (X_prev + self.err_tol) and self.trust0 > 0:
            # If the step quality is bad, then we should decrease the trust radius.
            trust = max(ndx*(1./(1+a)), self.mintrust)
            print("Low quality step, reducing trust radius to % .4e" % trust)
        if Quality >= 0.75 and bump and self.trust0 > 0:
            # If the step quality is good, then we should increase the trust radius.
            # The 'a' factor is how much we should grow or shrink the trust radius each step
            # and the 'b' factor determines how closely we are tied down to the original value.
            # Recommend values 0.5 and 0.5
            trust += a*trust*np.exp(-b*(trust/self.trust0 - 1))
        if X > (X_prev + self.err_tol):
            Best_Step = 0
            # Toggle switch for rejection (experimenting with no rejection)
            Rejects = True
            GOODSTEP = 0
            Reevaluate = True
            trust = max(ndx*(1./(1+a)), self.mintrust)
            print("Rejecting step and reducing trust radius to % .4e" % trust)
            if Rejects:
                xk = xk_prev.copy()
                if Reevaluate:
                    restep = True
                    color = "\x1b[91m"
                    print(header)
                    print(rowfmt % (ITERATION_NUMBER, nxk, ndx, ngr, color, X, stdfront, Quality))
                    printcool("Objective function rises!\nRe-evaluating at the previous point..",color=1)
                    ITERATION_NUMBER += 1
                    data = self.Objective.Full(xk,Ord,verbose=True)
                    GOODSTEP = 1
                    X, G, H = data['X'], data['G'], data['H']
                    X_prev = X
                    dx *= 0
                    ndx = norm(dx)
                    nxk = norm(xk)
                    ngr = norm(G)
                    Quality = 0.0
                    color = "\x1b[0m"
                else:
                    # Restore the stored values instead of re-evaluating.
                    color = "\x1b[91m"
                    G = G_prev.copy()
                    H = H_stor.copy()
                    data = deepcopy(datastor)
                # NOTE(review): the original formatting was lost; continue is
                # taken to apply to both rejection paths - confirm.
                continue
        else:
            GOODSTEP = 1
            if X > X_best:
                Best_Step = 0
                color = "\x1b[95m"
            else:
                Best_Step = 1
                color = "\x1b[92m"
                X_best = X
            ehist = np.append(ehist, X)
        # Hessian update for BFGS.
        if b_BFGS:
            Hnew = H_stor.copy()
            Dx = col(xk - xk_prev)
            Dy = col(G - G_prev)
            Mat1 = (Dy*Dy.T)/(Dy.T*Dx)[0,0]
            Mat2 = ((Hnew*Dx)*(Hnew*Dx).T)/(Dx.T*Hnew*Dx)[0,0]
            Hnew += Mat1-Mat2
            H = Hnew.copy()
            data['H'] = H.copy()

        # Store the state of the accepted step for the next iteration.
        datastor = deepcopy(data)
        G_prev = G.copy()
        H_stor = H.copy()
        xk_prev = xk.copy()
        X_prev = X
        if len(self.FF.parmdestroy_this) > 0:
            self.FF.parmdestroy_save.append(self.FF.parmdestroy_this)
            self.FF.linedestroy_save.append(self.FF.linedestroy_this)

    printcool("Final objective function value\nFull: % .6e Un-penalized: % .6e" % (data['X'],data['X0']), '@', bold=True, color=2)
    return xk