def ls_wolfe12(f, fprime, xk, pk, gfk, old_fval, old_old_fval):
    """
    Same as line_search_wolfe1, but fall back to line_search_wolfe2, then to
    line_search_armijo, and finally to a plain backtracking search if no
    suitable step length is found.
    """
    ret = line_search_wolfe1(f, fprime, xk, pk, gfk,
                             old_fval, old_old_fval)
    alpha = ret[0]

    if alpha is None or alpha < 1e-12:
        # line search failed: try a different one
        ret = line_search_wolfe2(f, fprime, xk, pk, gfk,
                                 old_fval, old_old_fval)
        alpha = ret[0]

        if alpha is None or alpha < 1e-12:
            ret = line_search_armijo(f, xk, pk, gfk, old_fval)
            alpha = ret[0]

            if alpha is None or alpha < 1e-12:
                alpha = backtracking_line_search(f, gfk, xk, pk)

    return alpha
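# The fallback chain above ends in a plain backtracking search.  As a minimal,
# self-contained sketch of the Armijo (sufficient-decrease) rule these helpers
# rely on -- a hypothetical stand-alone function, not SciPy's own
# implementation -- shrink alpha until f(x + alpha*p) <= f(x) + c1*alpha*<g, p>.
import numpy as np


def armijo_backtracking(f, xk, pk, gfk, c1=1e-4, alpha0=1.0, tau=0.5, max_iter=50):
    """Return a step size satisfying the Armijo condition, or None."""
    fk = f(xk)
    slope = np.dot(gfk, pk)  # directional derivative along pk (negative for descent)
    alpha = alpha0
    for _ in range(max_iter):
        if f(xk + alpha * pk) <= fk + c1 * alpha * slope:
            return alpha
        alpha *= tau  # shrink the step and try again
    return None


# Quick check on a quadratic bowl: search from x0 along the negative gradient.
f = lambda x: 0.5 * np.dot(x, x)
x0 = np.array([1.0, -2.0])
g0 = x0  # gradient of 0.5*||x||^2 is x
print(armijo_backtracking(f, x0, -g0, g0))  # prints an accepted step, here 1.0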
def test_line_search_armijo(self):
    c = 0
    for name, f, fprime, x, p, old_f in self.line_iter():
        f0 = f(x)
        g0 = fprime(x)
        self.fcount = 0
        s, fc, fv = ls.line_search_armijo(f, x, p, g0, f0)
        c += 1
        assert_equal(self.fcount, fc)
        assert_equal(fv, f(x + s*p))
        assert_line_armijo(x, p, s, f, err_msg=name)
    assert_(c >= 9)
def test_line_search_armijo(self):
    c = 0
    for name, f, fprime, x, p, old_f in self.line_iter():
        f0 = f(x)
        g0 = fprime(x)
        self.fcount = 0
        s, fc, fv = ls.line_search_armijo(f, x, p, g0, f0)
        c += 1
        assert_equal(self.fcount, fc)
        assert_fp_equal(fv, f(x + s * p))
        assert_line_armijo(x, p, s, f, err_msg=name)
    assert_(c >= 9)
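# A minimal standalone call mirroring the tests above.  line_search_armijo is
# a private SciPy helper whose module has moved between releases
# (scipy.optimize.linesearch in older versions, scipy.optimize._linesearch in
# newer ones), so the import path below is an assumption about the installed
# version.
import numpy as np
from scipy.optimize._linesearch import line_search_armijo

f = lambda x: np.dot(x, x)  # simple convex quadratic
x = np.array([1.0, 2.0])
g = 2.0 * x                 # gradient of f at x
p = -g                      # steepest-descent direction

alpha, fc, fval = line_search_armijo(f, x, p, g, f(x))
print(alpha, fc, fval)      # step size, number of f evaluations, f(x + alpha*p)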
def lineSearch(encoded_d_k, fval_x_k):
    """
    Returns
    -------
    alpha_k : float or None
        alpha for which x_kp1 = x_k + alpha * d_k, or None if the line
        search algorithm did not converge.
    new_fval : float or None
        New function value f(x_kp1), or None if the line search algorithm
        did not converge.
    """
    d_k = np.frombuffer(base64.decodestring(encoded_d_k), dtype=np.float64)

    alpha_k, fc, new_fval = \
        line_search_armijo(costFunction, params, d_k, accruedGradients,
                           fval_x_k, args=(X, y), c1=1e-5)

    # cast to float because line_search_armijo returns a numpy float
    alpha_k = float(alpha_k) if alpha_k is not None else None
    new_fval = float(new_fval) if new_fval is not None else None

    return (alpha_k, new_fval)
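# The helper above receives its search direction d_k as a base64 string.  A
# tiny round-trip sketch of that encoding; note base64.decodestring is the
# legacy name and was removed in Python 3.9, where base64.decodebytes is the
# replacement used here.
import base64
import numpy as np

d_k = np.array([0.1, -0.2, 0.3], dtype=np.float64)
encoded = base64.encodebytes(d_k.tobytes())
decoded = np.frombuffer(base64.decodebytes(encoded), dtype=np.float64)
assert np.allclose(d_k, decoded)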
def fista(grad, obj, prox, x0, momentum=True, max_iter=100, step_size=None,
          early_stopping=True, eps=np.finfo(np.float32).eps, times=False,
          debug=False, verbose=0, name="Optimization"):
    """ F/ISTA algorithm. """
    if verbose and not debug:
        warnings.warn("Can't have verbose if cost-func is not computed, "
                      "enable it by setting debug=True")

    adaptive_step_size = False
    if step_size is None:
        adaptive_step_size = True
        step_size = 1.0

    # prepare the iterate
    t = t_old = 1
    z_old = np.zeros_like(x0)
    x = np.copy(x0)

    if adaptive_step_size and x.ndim > 1:
        raise ValueError("Backtracking line search needs a 1D gradient")

    # saving variables
    pobj_, times_ = [obj(x)], [0.0]

    # initial objective value for the backtracking line search
    if adaptive_step_size:
        old_fval = obj(x)

    # main loop
    for ii in range(max_iter):

        if times:
            t0 = time.time()

        grad_ = grad(x)

        # step-size
        if adaptive_step_size:
            step_size, _, old_fval = line_search_armijo(
                obj, x.ravel(), -grad_.ravel(), grad_.ravel(), old_fval,
                c1=1.0e-5, alpha0=step_size)
            if step_size is None:
                step_size = 0.0

        # descent step
        z = prox(x - step_size * grad_, step_size)

        # fista acceleration
        if momentum:
            t = 0.5 * (1.0 + np.sqrt(1.0 + 4.0 * t_old**2))
            x = z + (t_old - 1.0) / t * (z - z_old)
        else:
            x = z

        # savings
        if debug:
            if adaptive_step_size:
                pobj_.append(old_fval)
            else:
                pobj_.append(obj(x))

        # printing
        if debug and verbose > 0:
            print("[{0}] Iteration {1} / {2}, "
                  "loss = {3}".format(name, ii + 1, max_iter, pobj_[ii]))

        # early-stopping
        l1_diff = np.sum(np.abs(z - z_old))
        if l1_diff <= eps and early_stopping:
            if debug:
                print("---> [{0}] early-stopping "
                      "done at {1}/{2}".format(name, ii + 1, max_iter))
            break
        if l1_diff > np.finfo(np.float64).max:
            raise RuntimeError("[{}] {} has diverged.".format(
                name, ["ISTA", "FISTA"][momentum]))

        # update iterates
        t_old = t
        z_old = z

        # savings
        if times:
            times_.append(time.time() - t0)

    if not times and not debug:
        return x
    if times and not debug:
        return x, times_
    if not times and debug:
        return x, pobj_
    if times and debug:
        return x, pobj_, times_
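# Hypothetical usage sketch for the fista() helper above on a small Lasso
# problem 0.5*||A x - b||^2 + lmbd*||x||_1.  The names A, b, lmbd and the
# soft-thresholding prox are illustrative assumptions, not part of the
# original code.
import numpy as np

rng = np.random.RandomState(0)
A = rng.randn(20, 50)
b = rng.randn(20)
lmbd = 0.1

obj = lambda x: 0.5 * np.sum((A.dot(x) - b) ** 2) + lmbd * np.sum(np.abs(x))
grad = lambda x: A.T.dot(A.dot(x) - b)  # gradient of the smooth part only
prox = lambda x, step: np.sign(x) * np.maximum(np.abs(x) - lmbd * step, 0.0)

# step_size=None triggers the Armijo backtracking branch shown above
x_hat = fista(grad, obj, prox, x0=np.zeros(50), momentum=True, max_iter=200)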
def hfn(func, x0, hess_vec, tol=1e-5, max_iter=500, c1=1e-4, c2=0.9,
        disp=False, trace=False):
    if trace:
        hist = {}
        hist['f'] = []
        hist['norm_g'] = []
        hist['elaps_t'] = []
        start_time = time.clock()

    f = lambda x: func(x)[0]
    df = lambda x: func(x)[1]

    x = x0
    [loss, grad, extra] = func(x)
    grad_norm = linalg.norm(grad, inf)
    eps = min(1 / 2, sqrt(grad_norm)) * grad_norm

    for i in range(0, max_iter):
        # Start CG: solve for the (approximate) Newton direction z
        z = zeros(shape(x))
        g = grad
        d = -g
        u = hess_vec(x, d, extra)
        for j in range(0, 1000):
            gamma = g.transpose().dot(g) / (d.transpose().dot(u))
            z = z + gamma * d
            g1 = g + gamma * u
            b = True
            if linalg.norm(g1, inf) < eps:
                b = False
                break
            else:
                betta = g1.transpose().dot(g1) / (g.transpose().dot(g))
                d = -g1 + betta * d
                u = hess_vec(x, d, extra)
                g = g1
        if b:
            print('CG did not converge')

        # One-dimensional line search
        alpha = line_search_wolfe2(f=f, myfprime=df, xk=x, pk=z, gfk=grad,
                                   old_fval=loss, c1=c1, c2=c2)
        if alpha[0] is None:
            # fall back to Armijo backtracking (takes no gradient function)
            alpha = line_search_armijo(f=f, xk=x, pk=z, gfk=grad,
                                       old_fval=loss, c1=c1, alpha0=1)

        x = x + alpha[0] * z
        [loss, grad, extra] = func(x)
        grad_norm = linalg.norm(grad, inf)
        eps = min(1 / 2, sqrt(grad_norm)) * grad_norm

        if disp:
            print(str(1 + i) + ')', loss, grad_norm)
        if trace:
            hist['f'].append(loss)
            hist['norm_g'].append(grad_norm)
            current_time = time.clock() - start_time
            hist['elaps_t'].append(current_time)

        if grad_norm < tol:
            return x, loss, 0
    return x, loss, 1
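# hfn() above expects a hess_vec(x, d, extra) callback.  A common matrix-free
# way to supply one is a finite-difference Hessian-vector product built from
# the gradient; this is a sketch under that assumption, not part of the
# original routine.
import numpy as np


def make_hess_vec(grad_func, eps=1e-6):
    """Return hess_vec(x, d, extra) approximating H(x) @ d by central differences."""
    def hess_vec(x, d, extra=None):
        return (grad_func(x + eps * d) - grad_func(x - eps * d)) / (2.0 * eps)
    return hess_vec


# Example on f(x) = 0.5 * x^T A x, whose Hessian is exactly A.
A = np.diag([1.0, 10.0])
hess_vec = make_hess_vec(lambda x: A.dot(x))
print(hess_vec(np.zeros(2), np.array([1.0, 1.0])))  # approx A @ [1, 1] = [1, 10]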
def _update_sol(self, solver, objective, niter):
    if (niter % (self.k + 1)) == 0:  # Extrapolate at each k iterations

        self.buffer.append(solver.sol)

        # (Normalized) matrix of differences
        U = np.diff(self.buffer, axis=0)
        UU = np.dot(U, U.T)
        UU /= np.linalg.norm(UU)

        # If no parameter grid was provided, assemble one.
        if self.adaptive and (len(self.lambda_) <= 1):
            svals = np.sort(np.abs(np.linalg.eigvals(UU)))
            svals = np.log(svals)
            svals = 0.5 * (svals[:-1] + svals[1:])
            self.lambda_ = np.concatenate(([0.], np.exp(svals)))

        # Grid search for the best parameter for the extrapolation
        fvals = []
        c = np.zeros((self.k,))
        extrap = np.zeros(np.shape(solver.sol))
        for lambda_ in self.lambda_:
            # Coefficients of the extrapolation
            c[:] = np.linalg.solve(UU + lambda_ * np.eye(self.k),
                                   np.ones(self.k))
            c[:] /= np.sum(c)
            extrap[:] = np.dot(np.asarray(self.buffer[:-1]).T, c)
            fvals.append(np.sum([f.eval(extrap) for f in self.functions]))

        if self.forcedecrease and (min(fvals) > np.sum(objective[-1])):
            # If we have bad extrapolations, keep solution as is
            extrap[:] = solver.sol
        else:
            # Return the best extrapolation from the grid search
            lambda_ = self.lambda_[fvals.index(min(fvals))]

            # We can afford to solve the linear system here again because
            # self.k is normally very small. Alternatively, we could have
            # kept track of the best extrapolations during the grid search,
            # but that would require at least double the memory, as we'd
            # have to store both the current extrapolation and the best
            # extrapolation.
            c[:] = np.linalg.solve(UU + lambda_ * np.eye(self.k),
                                   np.ones(self.k))
            c[:] /= np.sum(c)
            extrap[:] = np.dot(np.asarray(self.buffer[:-1]).T, c)

            # Improve proposal with line search
            if self.dolinesearch:
                # Objective evaluation functional
                def f(x):
                    return np.sum([f.eval(x) for f in self.functions])

                # Solution at previous extrapolation
                xk = self.buffer[0]
                # Search direction
                pk = extrap - xk
                # Objective value during the previous extrapolation
                old_fval = np.sum(objective[-self.k])

                a, fc, fa = line_search_armijo(f=f, xk=xk, pk=pk, gfk=-pk,
                                               old_fval=old_fval, c1=1e-4,
                                               alpha0=1.)

                # New point proposal
                if a is None:
                    warnings.warn('Line search failed to find good step size')
                else:
                    extrap[:] = xk + a * pk

        # Clear buffer and parameter grid for next extrapolation process
        self.buffer = []
        self.lambda_ = [] if self.adaptive else self.lambda_

        return extrap
    else:
        # Gather points for future extrapolation
        self.buffer.append(copy.copy(solver.sol))
        return solver.sol
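# In the snippet above line_search_armijo is called with gfk=-pk although no
# gradient is available: the Armijo test then reads
#     f(xk + a*pk) <= f(xk) - c1 * a * ||pk||**2,
# i.e. it only asks for sufficient decrease along the proposed extrapolation.
# A minimal sketch of that reading on a toy objective (the private SciPy
# import path is an assumption; it varies between SciPy versions):
import numpy as np
from scipy.optimize._linesearch import line_search_armijo

f = lambda x: np.sum((x - 1.0) ** 2)
xk = np.array([3.0, 3.0])
pk = np.array([-1.0, -1.0])  # proposed move, not necessarily a gradient step

a, fc, fval = line_search_armijo(f, xk, pk, gfk=-pk, old_fval=f(xk),
                                 c1=1e-4, alpha0=1.0)
print(a, fval)  # accepted fraction of the proposed move and the new objective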
def opt_hyper_gaussian_natural_gradient(gpr, hyperparams, mean_key, variance_key,
                                        maxiter=500, inner_iter=10,
                                        Ifilter=None, gradcheck=False,
                                        bounds=None, callback=None,
                                        optimizer=OPT.fmin_tnc,
                                        gradient_tolerance=-1,
                                        messages=False, *args, **kw_args):

    def f(x, *args):
        x_ = X0
        x_[Ifilter_x] = x
        rv = gpr.LML(param_list_to_dict(x_, param_struct, skeys), *args, **kw_args)
        # LG.debug("L("+str(x_)+")=="+str(rv))
        if numpy.isnan(rv):
            return 1E6
        return rv

    def df(x, *args):
        x_ = X0
        x_[Ifilter_x] = x
        rv = gpr.LMLgrad(param_list_to_dict(x_, param_struct, skeys), *args, **kw_args)
        rv = param_dict_to_list(rv, skeys)
        # LG.debug("dL("+str(x_)+")=="+str(rv))
        if not numpy.isfinite(rv).all():  # numpy.isnan(rv).any()
            In = numpy.isnan(rv)
            rv[In] = 1E6
        return rv[Ifilter_x]

    # 0. store parameter structure
    skeys = numpy.sort(hyperparams.keys())
    param_struct = dict([(name, hyperparams[name].shape) for name in skeys])

    # 1. convert the dictionaries to parameter lists
    X0 = param_dict_to_list(hyperparams, skeys)
    if Ifilter is not None:
        Ifilter_x = numpy.array(param_dict_to_list(Ifilter, skeys), dtype='bool')
    else:
        Ifilter_x = numpy.ones(len(X0), dtype='bool')

    # 2. bounds
    if bounds is not None:
        # go through all hyperparams and build the bound array (flattened)
        _b = []
        for key in skeys:
            if key in bounds.keys():
                _b.extend(bounds[key])
            else:
                _b.extend([(-numpy.inf, +numpy.inf)] * hyperparams[key].size)
        bounds = numpy.array(_b)
        bounds = bounds[Ifilter_x]

    # 3. set the starting point of the optimization, truncate the non-used dimensions
    x = X0.copy()[Ifilter_x]
    LG.debug("startparameters for opt:" + str(x))

    if gradcheck:
        checkgrad(f, df, x)
        LG.info("check_grad (pre) (Enter to continue):" + str(OPT.check_grad(f, df, x)))
        raw_input()

    # LG.debug("start optimization")
    if gradient_tolerance < 0:
        gradient_tolerance = numpy.sqrt(numpy.finfo(float).eps)

    hyper_for_opt = hyperparams.copy()
    normal_keys = [mean_key, variance_key]
    hyperparam_keys = [v for v in hyperparams.keys() if v not in normal_keys]
    last_lml = gpr.LML(hyperparams)
    curr_lml = last_lml + 2 * gradient_tolerance
    direction_dict = dict([(k, numpy.zeros_like(v)) for k, v in hyperparams.iteritems()])
    Q = hyper_for_opt[variance_key].shape[1]

    while maxiter > 0 and numpy.abs(last_lml - curr_lml) > gradient_tolerance:
        print "Iteration loops left %s" % maxiter
        last_lml = gpr.LML(hyper_for_opt)

        # optimize the non-Gaussian parameters via the general optimizer interface
        # note: x is a subset of X, indexing the parameters that are optimized over;
        #       Ifilter_x picks the subset of X, yielding x
        hyper_for_opt = optimizer(f, x, fprime=df, args=[hyperparam_keys],
                                  maxfun=int(inner_iter),
                                  pgtol=gradient_tolerance,
                                  messages=messages, bounds=bounds)
        # optimizer = OPT.fmin_l_bfgs_b
        # opt_RV = optimizer(f, x, fprime=df, maxfun=int(maxiter), iprint=1,
        #                    bounds=bounds, factr=10.0, pgtol=1e-10)
        Xopt = X0.copy()
        Xopt[Ifilter_x] = hyper_for_opt[0]
        # convert into a dictionary
        hyper_for_opt = param_list_to_dict(hyper_for_opt[0], param_struct, skeys)
        curr_lml = gpr.LML(hyper_for_opt)

        # optimize the natural-gradient (Gaussian) parameters:
        print(" NIT NF F GTG")
        grad_mean_last = numpy.ones((1, Q))
        grad_variance_last = numpy.ones((0, Q))
        direction_last = 0
        for i in xrange(inner_iter):
            # mean = hyper_for_opt[mean_key]
            variance = hyper_for_opt[variance_key]
            grad_gaussian = gpr.LMLgrad(hyper_for_opt, hyperparam_keys=normal_keys)
            grad_mean = grad_gaussian[mean_key]
            grad_variance = grad_gaussian[variance_key]
            grad = numpy.append(grad_mean, grad_variance, 0)
            grad_ = numpy.append(grad_mean / variance, grad_variance / 2., 0)
            grad_last = numpy.append(grad_mean_last, grad_variance_last, 0)
            beta = ((grad_ * (grad - grad_last)) / (grad * grad_).sum(0)).sum(0)
            direction = -grad_ + beta * direction_last
            direction_dict[mean_key] = direction[:grad_mean.shape[0], :]
            direction_dict[variance_key] = direction[grad_mean.shape[0]:, :]
            alpha = line_search_armijo(f,
                                       param_dict_to_list(hyper_for_opt, skeys),
                                       param_dict_to_list(direction_dict, skeys),
                                       0, [normal_keys])
            hyper_for_opt[mean_key] = alpha[0] * direction[:grad_mean.shape[0], :]
            hyper_for_opt[variance_key] = alpha[0] * direction[grad_mean.shape[0]:, :]
            grad_mean_last = grad_mean
            grad_variance_last = grad_variance
            direction_last = direction

        maxiter -= 1

    # relate back to X
    Xopt = X0.copy()
    Xopt[Ifilter_x] = hyper_for_opt
    # convert into a dictionary
    opt_hyperparams = param_list_to_dict(Xopt, param_struct, skeys)
    # get the log marginal likelihood at the optimum:
    opt_lml = gpr.LML(opt_hyperparams, **kw_args)

    return hyper_for_opt, curr_lml
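# The beta computed above is a Polak-Ribiere-style coefficient for nonlinear
# conjugate gradients, evaluated block-wise on the natural-gradient terms.
# For ordinary 1-D gradients the textbook form reduces to the sketch below
# (illustration only, not part of the original optimizer):
import numpy as np


def polak_ribiere_beta(grad_new, grad_old):
    """beta = <g_new, g_new - g_old> / <g_old, g_old>."""
    return float(np.dot(grad_new, grad_new - grad_old) / np.dot(grad_old, grad_old))


g_old = np.array([1.0, -2.0])
g_new = np.array([0.5, -1.0])
beta = polak_ribiere_beta(g_new, g_old)
# new search direction: d = -g_new + beta * d_old
print(beta)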
def fista(grad, obj, prox, x0, momentum='fista', restarting=None,
          max_iter=100, step_size=None, early_stopping=True,
          eps=np.finfo(np.float64).eps, times=False, debug=False, verbose=0,
          name="Optimization"):
    """ ISTA-like algorithm. """
    # parameters checking
    if verbose and not debug:
        print(f"[{name}] Can't have verbose if cost-func is not computed, "
              f"enable it by setting debug=True")

    if momentum not in [None, 'fista', 'greedy']:
        raise ValueError(f"[{name}] momentum should be ['fista', 'ista', "
                         f"'greedy'], got {momentum}")

    if restarting not in [None, 'obj', 'descent']:
        raise ValueError(f"[{name}] restarting should be [None, 'obj', "
                         f"'descent'], got {restarting}")

    if momentum == 'ista' and restarting in ['obj', 'descent']:
        raise ValueError(f"[{name}] restarting can't be set to 'obj' or "
                         f"'descent' if momentum == 'ista'")

    # prepare the iterate
    x_old, x, y, y_old = np.copy(x0), np.copy(x0), np.copy(x0), np.copy(x0)
    pobj_, times_, diff_ = [obj(x)], [0.0], [0.0]
    t = t_old = 1

    # prepare the adaptative-step variables
    adaptive_step_size = False
    if step_size is None:
        adaptive_step_size = True
        step_size = 1.0
        old_fval = pobj_[0]

    # main loop
    for ii in range(max_iter):

        if times:
            t0 = time.time()

        grad_ = grad(y)

        # adaptative step-size
        if adaptive_step_size:
            step_size, _, old_fval = line_search_armijo(
                obj, y.ravel(), -grad_.ravel(), grad_.ravel(), old_fval,
                c1=1.0e-5, alpha0=step_size)
            if step_size is None:
                step_size = 0.0

        # main descent step
        x = prox(y - step_size * grad_, step_size)

        # fista acceleration
        if momentum is None:
            y = x
        elif momentum == 'fista':
            t = 0.5 * (1.0 + np.sqrt(1.0 + 4.0 * t_old**2))
            y = x + (t_old - 1.0) / t * (x - x_old)
        elif momentum == 'greedy':
            y = x + (x - x_old)

        diff_.append(np.linalg.norm(x - x_old))

        # savings times
        if times:
            # skip cost-function computation for benchmark
            delta_t = time.time() - t0

        # savings cost-function values
        if debug:
            pobj_.append(obj(x))

        # savings times, restart after cost-function computation
        if times:
            t0 = time.time()

        if restarting == 'obj' and (pobj_[-1] > pobj_[-2]):
            # restart if the cost function increases
            if momentum == 'fista':
                x = x_old
                t = 1.0
            elif momentum == 'greedy':
                y = x

        if restarting == 'descent' and np.sum((y_old - x) * (x - x_old)) > 0.0:
            # restart if x_k+1 - x_k has the same direction as x_k - x_k-1
            if momentum == 'fista':
                x = x_old
                t = 1.0
            elif momentum == 'greedy':
                y = x

        # variables updates k+1, k, k-1
        t_old = t
        x_old = x
        y_old = y

        # verbose at every 100th iteration
        if debug and verbose > 0 and ii % 100:
            print(f"\r[{name}] Iteration {100.0 * (ii + 1) / max_iter:.0f}%, "
                  f"loss = {pobj_[ii]:.3e}, "
                  f"grad-norm = {np.linalg.norm(grad_):.3e}",
                  end='', flush=True)

        # early-stopping on || x_k - x_k-1 || < eps
        if early_stopping and diff_[-1] <= eps:
            if debug:
                print(f"\r[{name}] early-stopping "
                      f"done at {100.0 * (ii + 1) / max_iter:.0f}%, "
                      f"loss = {pobj_[ii]:.3e}, "
                      f"grad-norm = {np.linalg.norm(grad_):.3e}")
                print("\n")
            break

        # divergence safeguarding
        if diff_[-1] > np.finfo(np.float64).max:
            raise RuntimeError(f"\n[{name}] algorithm has diverged.")

        # savings times
        if times:
            times_.append(delta_t + time.time() - t0)

    if not times and not debug:
        return x
    if times and not debug:
        return x, np.array(times_)
    if not times and debug:
        return x, np.array(pobj_)
    if times and debug:
        return x, np.array(pobj_), np.array(times_)