def maximize_likelihood(self, obsTime, shiftStart, shiftEnd):
    """Fit ``location`` and ``kappa`` by minimising ``self.neg_log_likelihood``.

    Degenerate samples are handled without running the optimiser:

    * no observations: the initial guess ``(12, 1)`` is returned with a
      ``nan`` objective value;
    * one distinct observation (a scalar, or an iterable whose values are
      all equal): the location is that value, ``kappa`` is 1000 and the
      objective value is ``-inf``.

    Otherwise SLSQP is run from ``(12, 1)`` with both parameters constrained
    to be non-negative and the location constrained to be at most 24.

    Parameters
    ----------
    obsTime : scalar or array-like
        Observed times.
    shiftStart, shiftEnd
        Forwarded unchanged to ``self.neg_log_likelihood``.

    Returns
    -------
    scipy.optimize.OptimizeResult
        The (possibly synthetic) optimisation result.

    Raises
    ------
    RuntimeError
        If the optimiser reports failure.

    Notes
    -----
    Sets ``self.location``, ``self.kappa``, ``self.AIC`` and ``self.negLL``
    and prints the AIC.
    """
    x0 = np.array((12., 1.))
    unique_times = np.unique(obsTime)

    if unique_times.size == 0:
        # The empty case must be tested first; previously it sat in an
        # unreachable ``elif`` and empty samples were sent to the optimiser.
        optResult = optimize.OptimizeResult()
        optResult.x = x0
        optResult.fun = np.nan
        optResult.success = True
    elif unique_times.size == 1:
        # Use the single distinct value rather than the whole input so that
        # ``x`` is always a flat (location, kappa) pair.
        optResult = optimize.OptimizeResult()
        optResult.x = np.array([unique_times[0], 1000])
        optResult.fun = -np.inf
        optResult.success = True
    else:
        constraints = (
            {"type": "ineq", "fun": lambda coeff: coeff},
            {"type": "ineq", "fun": lambda coeff: 24 - coeff[0]},
        )
        optResult = optimize.minimize(self.neg_log_likelihood,
                                      x0, (obsTime, shiftStart, shiftEnd),
                                      method='SLSQP',
                                      constraints=constraints,
                                      options={'disp': False, 'ftol': 1e-08})
        print(optResult)
        if optResult.fun < 0:
            # NOTE(review): the return value is discarded; presumably kept as
            # a debugging hook for negative objective values -- confirm.
            self.neg_log_likelihood(optResult.x, obsTime, shiftStart, shiftEnd)

    if not optResult.success:
        raise RuntimeError("Optimization was not successful")

    self.location, self.kappa = optResult.x
    # AIC = 2k + 2 * negLL with k = 2 fitted parameters.
    self.AIC = 2 * (optResult.fun + 2)
    self.negLL = optResult.fun
    print("AIC:", self.AIC)
    return optResult
def coord_descent(fun, init, args, **kwargs):
    """Minimise ``fun`` by cyclic coordinate descent with scalar line searches.

    Each sweep optimises one coordinate at a time with
    ``opt.minimize_scalar`` while the others are held fixed. Negative trial
    values are rejected with a constant penalty of ``1e6``.

    Parameters
    ----------
    fun : callable
        Objective, called as ``fun(x)``.
    init : array-like with a ``copy`` method
        Starting point; it is copied, not modified.
    args
        Accepted for signature compatibility; it is not forwarded to ``fun``.
    **kwargs
        ``maxiter`` (required): number of sweeps.
        ``callback`` (optional): called as ``callback(x)`` after each sweep.

    Returns
    -------
    scipy.optimize.OptimizeResult
        ``x``, ``nit`` (= ``maxiter``), ``nfev``, ``fun`` (length-1 array)
        and ``success=True``.
    """
    maxiter = kwargs['maxiter']
    x = init.copy()

    def coord_opt(alpha, scales, i):
        # Penalise negative values instead of evaluating the objective there.
        if alpha < 0:
            return 1e6
        scales[i] = alpha
        return fun(scales)

    nfev = 0
    last_fun = None
    for _ in range(maxiter):
        for i in range(len(x)):
            print("Optimizing variable {}".format(i))
            r = opt.minimize_scalar(lambda alpha: coord_opt(alpha, x, i))
            nfev += r.nfev
            # The line search leaves its last trial value in x[i]; overwrite
            # it with the reported optimum.
            x[i] = r.x
            last_fun = r.fun
        if 'callback' in kwargs:
            kwargs['callback'](x)

    if last_fun is None:
        # No line search ran (maxiter == 0 or empty x): evaluate the start
        # point directly instead of failing on an unbound result.
        last_fun = fun(x)
        nfev += 1

    res = opt.OptimizeResult()
    res.x = x
    res.nit = maxiter
    res.nfev = nfev
    res.fun = np.array([last_fun])
    res.success = True
    return res
def custmin(fun, bracket, args=(), maxfev=None, stepsize=0.1,
            maxiter=100, callback=None, **options):
    """Minimise a scalar function with a fixed-step neighbourhood search.

    Starting from the midpoint of ``bracket``, each iteration evaluates the
    two points one ``stepsize`` to either side of the current best and moves
    to whichever improves the objective. The search stops when neither
    neighbour improves, when ``maxiter`` iterations have run, or when
    ``maxfev`` evaluations have been spent.

    Parameters
    ----------
    fun : callable
        Objective, called as ``fun(x, *args)``.
    bracket : sequence
        Only ``bracket[0]`` and ``bracket[1]`` are used.
    args : tuple, optional
        Extra positional arguments for ``fun``.
    maxfev : int or None, optional
        Evaluation budget, checked after each iteration.
    stepsize : float, optional
        Distance to the probed neighbours.
    maxiter : int, optional
        Maximum number of iterations.
    callback : callable, optional
        Called as ``callback(bestx)`` after each iteration.
    **options
        Ignored.

    Returns
    -------
    scipy.optimize.OptimizeResult
        ``success`` is True when more than one iteration ran.
    """
    bestx = (bracket[1] + bracket[0]) / 2.0
    # The start point must receive the same extra arguments as every later
    # evaluation; it used to be called without them.
    besty = fun(bestx, *args)
    funcalls = 1
    niter = 0
    improved = True
    stop = False

    while improved and not stop and niter < maxiter:
        improved = False
        niter += 1
        # Both neighbours are taken relative to the best point at the start
        # of the iteration.
        for testx in [bestx - stepsize, bestx + stepsize]:
            testy = fun(testx, *args)
            funcalls += 1
            if testy < besty:
                besty = testy
                bestx = testx
                improved = True
        if callback is not None:
            callback(bestx)
        if maxfev is not None and funcalls >= maxfev:
            stop = True
            break

    return optimize.OptimizeResult(fun=besty, x=bestx, nit=niter,
                                   nfev=funcalls, success=(niter > 1))
def mcddp(x0, u_init, callback):
    """Monte-Carlo DDP.

    Repeatedly perturbs the best control sequence found so far with Gaussian
    noise, clips it to the model's control bounds, re-solves with
    ``ddp_solve`` and keeps the result when its cost is lower. The loop ends
    after 10 restarts, or once 2 consecutive restarts have failed to improve
    the cost by more than 1% (relative).

    Relies on ``cost``, ``model``, ``dt``, ``ddp_solve`` and ``callback_opt``
    from the enclosing scope.

    Returns a ``scipy.optimize.OptimizeResult`` with ``success=True`` and
    ``x`` set to the best control sequence.
    """
    rtol = 1e-2
    niter_max = 10
    niter_same_max = 2

    u_best = u_init
    c_best = cost(u_init.flatten(), x0)
    niter_same = 0

    for _ in range(niter_max):
        # Perturb around the incumbent and project back into the bounds.
        std = 1e-5 / dt * np.sqrt(model.u_upper - model.u_lower)
        candidate = np.random.normal(u_best, std, size=u_best.shape)
        candidate = np.clip(candidate, model.u_lower, model.u_upper)

        _, candidate = ddp_solve(x0, initial=candidate, dt=dt,
                                 callback=callback)
        c = cost(candidate.flatten(), x0)

        niter_same += 1
        if c < c_best:
            relative_gain = (c_best - c) / abs(c_best)
            if relative_gain > rtol:
                # Only a significant improvement resets the stall counter.
                niter_same = 0
            c_best = c
            u_best = candidate.copy()
            if callback_opt:
                callback_opt(x0, u_best)

        if niter_same >= niter_same_max:
            break

    return optimize.OptimizeResult(success=True, x=u_best)
def _optimize(self, objective):
    """Return the sampled point with the smallest objective value.

    The candidate points come from ``self._get_eval_points()``. When
    ``self.matrix_to_vector_transform`` is set, the first
    ``self._nb_samples`` candidates are converted to vectors before being
    evaluated.

    Parameters
    ----------
    objective : callable
        Objective to minimise; called once with all candidate points.

    Returns
    -------
    scipy.optimize.OptimizeResult
        ``x`` and ``fun`` are taken at the argmin along axis 0 of the
        evaluations; ``nfev`` is the number of candidates.
    """
    candidates = self._get_eval_points()

    if self.matrix_to_vector_transform is not None:
        # Flatten each sampled matrix into its vector representation.
        vectors = []
        for k in range(self._nb_samples):
            vectors.append(self.matrix_to_vector_transform(candidates[k]))
        candidates = np.array(vectors)

    values = objective(candidates)
    best = np.argmin(values, axis=0)

    return sc_opt.OptimizeResult(x=candidates[best, :],
                                 success=True,
                                 fun=values[best, :],
                                 nfev=candidates.shape[0],
                                 message="OK")
def custmin(fun, x0, args=(), maxfev=None, stepsize=0.1,
            maxiter=100, callback=None, **options):
    """Minimise a multivariate function with an axis-aligned pattern search.

    For every coordinate in turn, the two points one ``stepsize`` away along
    that axis are evaluated and the best point is updated on improvement.
    The search stops when a full pass yields no improvement, when
    ``maxiter`` passes have run, or when ``maxfev`` evaluations are spent.

    Parameters
    ----------
    fun : callable
        Objective, called as ``fun(x, *args)``.
    x0 : numpy.ndarray
        Starting point; it is not modified.
    args : tuple, optional
        Extra positional arguments for ``fun``.
    maxfev : int or None, optional
        Evaluation budget, checked after each coordinate.
    stepsize : float, optional
        Distance to the probed neighbours.
    maxiter : int, optional
        Maximum number of passes.
    callback : callable, optional
        Called as ``callback(bestx)`` after each coordinate.
    **options
        Ignored.

    Returns
    -------
    scipy.optimize.OptimizeResult
        ``success`` is True when more than one pass ran.
    """
    bestx = x0
    # The start point must receive the same extra arguments as every later
    # evaluation; it used to be called without them.
    besty = fun(x0, *args)
    funcalls = 1
    niter = 0
    improved = True
    stop = False

    while improved and not stop and niter < maxiter:
        improved = False
        niter += 1
        for dim in range(np.size(x0)):
            # Both probes are taken relative to the best point on entry to
            # this coordinate.
            for s in [bestx[dim] - stepsize, bestx[dim] + stepsize]:
                testx = np.copy(bestx)
                testx[dim] = s
                testy = fun(testx, *args)
                funcalls += 1
                if testy < besty:
                    besty = testy
                    bestx = testx
                    improved = True
            if callback is not None:
                callback(bestx)
            if maxfev is not None and funcalls >= maxfev:
                stop = True
                break

    return optimize.OptimizeResult(fun=besty, x=bestx, nit=niter,
                                   nfev=funcalls, success=(niter > 1))
def fit_lstsq(f, x0, jac=None, tol=1e-8, delta=None, iterations=None,
              callback=None, rcond=1e-2, lstsq=None):
    """Fit objective function ``f(x) = y`` using a naive repeated linear
    least-squares fit.

    Each iteration evaluates ``f`` at the current point, reports progress to
    ``callback`` (if given) and applies the update returned by
    ``fit_lstsq_oneshot``. Iteration stops when the last update is within
    ``tol`` of zero, or when ``iterations`` is set and the iteration counter
    exceeds it.

    Parameters
    ----------
    f : callable
        Objective, called as ``f(x)``.
    x0 : array-like
        Starting point. It is not modified; every update creates a new value.
    jac, delta, rcond, lstsq
        Forwarded to ``fit_lstsq_oneshot``.
    tol : float, optional
        Absolute tolerance on the update used as convergence criterion.
    iterations : int or None, optional
        Iteration limit; ``None`` means no limit.
    callback : callable, optional
        Called with an in-progress ``OptimizeResult`` at each iteration.

    Returns
    -------
    scipy.optimize.OptimizeResult
        With ``x``, ``fun``, ``chisq``, ``nit``, ``success`` and ``message``.
    """
    dx = 0
    for nit in count():
        y0 = f(x0)
        if callback is not None:
            chisq = reduced_chisq(y0)
            callback(
                sciopt.OptimizeResult(x=x0,
                                      fun=y0,
                                      chisq=chisq,
                                      nit=nit,
                                      dx=dx,
                                      success=False,
                                      message="In progress."))
        if nit > 0 and np.allclose(dx, 0, atol=tol):
            message = "Reached convergence"
            success = True
            break
        if iterations is not None and nit > iterations:
            message = "Reached max number of iterations"
            success = False
            break
        dx, _ = fit_lstsq_oneshot(lstsq, f, x0, y0=y0, jac=jac,
                                  delta=delta, rcond=rcond)
        # Rebind rather than ``x0 += dx``: the in-place form modified the
        # caller's array and every result already handed to ``callback``.
        x0 = x0 + dx
    chisq = reduced_chisq(y0)
    return sciopt.OptimizeResult(x=x0,
                                 fun=y0,
                                 chisq=chisq,
                                 nit=nit,
                                 success=success,
                                 message=message)
def local_optimization_step(fun, x0, *losargs, **loskwargs):
    """Run one ``inner_opt`` pass and report it as an ``OptimizeResult``.

    ``fun`` and the extra arguments are accepted but unused; the step works
    on ``loss_fn``, ``inner_opt``, ``constraint_solve``, ``constraint_check``,
    ``variables``, ``bounds``, ``args`` and ``vars_to_x`` from the enclosing
    scope.

    The step counts as failed only when the loss did not change *and* the
    check constraint is not satisfied.
    """
    loss_before = loss_fn(x0)
    inner_opt(constraint_solve, constraint_check, variables, bounds, args)

    solution, _, _ = vars_to_x(variables)
    loss_after = constraint_solve.to_diffsat(cache=True).loss(args)

    if loss_before == loss_after:
        # No progress: success then depends on the check constraint alone.
        succeeded = bool(constraint_check.to_diffsat(cache=True).satisfy(args))
    else:
        succeeded = True

    return spo.OptimizeResult(x=solution, success=succeeded, fun=loss_after)
def lm(fun, x0, jac, args=(), kwargs=None, ftol=1e-6, max_nfev=10000,
       x_scale=None, geodesic_accel=False, uphill_steps=False):
    """Minimise ``0.5 * ||fun(x)||^2`` with a damped Gauss-Newton iteration.

    A trial step is computed from the scaled normal equations
    ``(J^T J + lam * I) step = J^T r``. A step that does not lower the cost
    is rejected and ``lam`` is multiplied by 1.5; an accepted step divides
    ``lam`` by 5. Iteration stops when ``lam`` reaches ``1e6``, when the
    relative cost reduction of a trial step is at most ``ftol`` (that trial
    step is not applied), when the cost drops below ``1e-6``, or after
    ``max_nfev`` trial steps.

    Parameters
    ----------
    fun : callable
        Residual function, called as ``fun(x, *args, **kwargs)``.
    x0 : numpy.ndarray
        One-dimensional starting point.
    jac : callable
        Jacobian of ``fun``, called with the same arguments.
    args : tuple, optional
        Extra positional arguments for ``fun`` and ``jac``.
    kwargs : dict or None, optional
        Extra keyword arguments for ``fun`` and ``jac``; ``None`` means none.
    ftol : float, optional
        Relative cost-reduction threshold.
    max_nfev : int, optional
        Maximum number of trial steps.
    x_scale : numpy.ndarray or None, optional
        Per-parameter scale; defaults to ones.
    geodesic_accel, uphill_steps : bool, optional
        Accepted but currently unused.

    Returns
    -------
    scipy.optimize.OptimizeResult
        ``x`` is the last accepted point and ``fun`` its residual vector.
    """
    LAM_UP = 1.5
    LAM_DOWN = 5.

    # ``None`` replaces the former mutable ``{}`` default.
    if kwargs is None:
        kwargs = {}
    if x_scale is None:
        x_scale = np.ones(x0.shape[0], dtype=np.float64)

    x = x0
    xs = x / x_scale
    lam = 100.

    r = fun(x, *args, **kwargs)
    C = dot(r, r) / 2
    Js = jac(x, *args, **kwargs) * x_scale[newaxis, :]
    dC = dot(Js.T, r)
    JsTJs = dot(Js.T, Js)
    assert r.shape[0] == Js.shape[0]

    I = np.eye(Js.shape[1])

    for step in range(max_nfev):
        xs_new = xs - solve(JsTJs + lam * I, dC)
        x_new = xs_new * x_scale

        r_new = fun(x_new, *args, **kwargs)
        C_new = dot(r_new, r_new) / 2
        print('trying step: size {:.3g}, C {:.3g}, lam {:.3g}'.format(
            norm(x - x_new), C_new, lam
        ))

        if C_new >= C:
            # Rejected: increase the damping and retry from the same point.
            lam *= LAM_UP
            if lam >= 1e6:
                break
            continue

        relative_err = abs(C - C_new) / C
        if relative_err <= ftol:
            break

        xs = xs_new
        print(xs)
        x = xs * x_scale
        r = r_new
        C = C_new

        if C < 1e-6:
            break

        Js = jac(x, *args, **kwargs) * x_scale[newaxis, :]
        dC = dot(Js.T, r)
        JsTJs = dot(Js.T, Js)
        lam /= LAM_DOWN

    return opt.OptimizeResult(x=x, fun=r)
def pure_random_search(function=fn.eggholder, start_coordinates=[0, 0], iterations=100000, bounds=[(-512, 512), (-512, 512)], show_plots=True):
    """Minimise a 2-D function by uniform random sampling inside ``bounds``.

    The first point is ``start_coordinates``; each of the remaining
    ``iterations - 1`` points is drawn uniformly from the box, and the
    incumbent is replaced whenever a sample is at least as good.

    Returns a ``scipy.optimize.OptimizeResult`` with ``x`` (best point),
    ``fun`` (best value), ``iter_to_best`` (sample indices at which the
    incumbent changed, bracketed by 0 and ``iterations``) and ``f_points``
    (the corresponding objective values). When ``show_plots`` is true, all
    sampled points are plotted with pylab.
    """
    n = iterations

    # Sampled coordinates, kept only for plotting.
    x = np.zeros(n)
    y = np.zeros(n)
    x[0], y[0] = start_coordinates[0], start_coordinates[1]

    best_point = start_coordinates
    minimum = function(start_coordinates)

    iter_to_best = [0]
    f_points = [minimum]

    for i in range(1, n):
        x[i] = np.random.uniform(low=bounds[0][0], high=bounds[0][1])
        y[i] = np.random.uniform(low=bounds[1][0], high=bounds[1][1])

        curr_point = [x[i], y[i]]
        f_curr_point = function(curr_point)
        if f_curr_point <= minimum:
            f_points.append(f_curr_point)
            # Zero-based index of this draw among the random samples.
            iter_to_best.append(i - 1)
            minimum = f_curr_point
            best_point = curr_point

    # Close the trace with the final iteration and the last best value.
    iter_to_best.append(n)
    f_points.append(f_points[-1])

    result = optimize.OptimizeResult(x=best_point,
                                     fun=minimum,
                                     iter_to_best=iter_to_best,
                                     f_points=f_points)

    if show_plots:
        pylab.title("Pure Random Search ($n = " + str(n) + "$ steps)")
        pylab.plot(x, y, 'o', ms=0.1)
        pylab.show()
    return result
def _res2scipy(result, history):
    """Convert an optimiser result into a ``scipy.optimize.OptimizeResult``.

    ``x`` is a copy of ``result.optpar``, ``fun`` is ``result.optval`` and
    ``nfev`` is the length of ``history``. The ``success``, ``status`` and
    ``message`` fields are fixed placeholders that still need refinement.
    """
    return spopt.OptimizeResult(
        # Placeholder status fields.
        success=True,
        status=0,
        message='completed',
        # Values taken from the actual run.
        x=result.optpar.copy(),
        fun=result.optval,
        nfev=len(history),
    )
def single_objective(parameters_guess, bounds, fit_bead, fit_parameter_names, exp_dict, global_opts={}):
    r"""
    Evaluate the objective once for the supplied parameter set.

    No optimisation is performed: the objective is computed for
    ``parameters_guess`` and wrapped in a result object.

    Parameters
    ----------
    parameters_guess : numpy.ndarray
        Parameter values to evaluate.
    bounds : list[tuple]
        Minimum/maximum pairs, one per entry of ``fit_parameter_names``.
    fit_bead : str
        Name of the bead whose parameters are being fit.
    fit_parameter_names : list[str]
        Names of the parameters being fit (e.g. epsilon). Cross-interaction
        names combine the parameter name and the other bead type with an
        underscore (e.g. epsilon_CO2).
    exp_dict : dict
        Dictionary of experimental data objects.
    global_opts : dict, Optional, default={}
        Present for consistency with the other global methods; this method
        has no options and only logs a notice when any are given.

    Returns
    -------
    Objective : obj
        scipy OptimizeResult with ``x``, ``fun``, ``success=True``,
        ``nit=0`` and a message.
    """
    has_options = len(global_opts) > 0
    if has_options:
        logger.info(
            "The fitting method 'single_objective' does not have further options"
        )

    objective = ff.compute_obj(parameters_guess, fit_bead,
                               fit_parameter_names, exp_dict, bounds)

    return spo.OptimizeResult(
        x=parameters_guess,
        fun=objective,
        success=True,
        nit=0,
        message="Successfully computed objective function for provided parameter set.",
    )
def ddp_minimizer(cost, u, args, *a, callback=None, **kw):
    """Local minimiser that refines a flat control vector with ``ddp_solve``.

    ``u`` is reshaped to ``(n,) + action_shape`` and used as the initial
    guess. If the solver raises, the failure is logged and the unrefined
    controls are returned.

    Relies on ``action_shape``, ``dt``, ``ddp_solve`` and ``log`` from the
    enclosing scope.

    Parameters
    ----------
    cost : callable
        Called as ``cost(u_flat, x0)`` to evaluate the returned controls.
    u : numpy.ndarray
        Flat control vector.
    args : tuple
        Must contain exactly one element, the initial state ``x0``.
    *a, **kw
        Ignored.
    callback : callable, optional
        Forwarded to ``ddp_solve``.

    Returns
    -------
    scipy.optimize.OptimizeResult
        ``x`` is the flattened control sequence and ``fun`` its cost.
        NOTE(review): ``success`` is reported as True even when the solver
        failed; kept for backward compatibility -- confirm this is intended.
    """
    n = u.shape[0] // action_shape[0]
    U = u.reshape((n, ) + action_shape)
    (x0, ) = args
    try:
        _, U = ddp_solve(x0,
                         dt=dt,
                         callback=callback,
                         initial=U,
                         atol=5e0,
                         λ_base=3.0,
                         ln_λ=0,
                         ln_λ_max=15,
                         iter_max=500)
    except Exception:
        # Best effort: keep the initial controls. ``Exception`` (rather than
        # a bare ``except``) lets KeyboardInterrupt/SystemExit propagate.
        log.exception('ddp solve failed')
    return optimize.OptimizeResult(x=U.flatten(),
                                   fun=cost(U.flatten(), x0),
                                   success=True)
def minimize_pgd_madry(closure, x0, prox, lmo, step=None, max_iter=200, prox_args=(), callback=None):
    """Iterate a PGD-style update whose direction comes from an LMO.

    At each iteration the direction ``lmo(-grad, x)[0] + x`` is computed and
    the iterate is replaced by
    ``prox(x + step_size * direction, step_size, *prox_args)``.

    Parameters
    ----------
    closure : callable
        Called as ``closure(x)``; returns ``(value, gradient)``.
    x0 : torch.Tensor
        Initial guess; its first dimension is used as the batch size.
    prox : callable
        Called as ``prox(x, step_size, *prox_args)``.
    lmo : callable
        Called as ``lmo(-grad, x)``; only its first return value is used.
    step : None, number or torch.Tensor, optional
        ``None`` uses ``1 / utils.init_lipschitz(closure, x0)``; a number is
        broadcast to one step size per batch element; a tensor is used as is.
    max_iter : int, optional
        Maximum number of iterations.
    prox_args : tuple, optional
        Extra positional arguments for ``prox``.
    callback : callable, optional
        Called with ``locals()`` after each iteration; returning ``False``
        stops the loop.

    Returns
    -------
    scipy.optimize.OptimizeResult
        ``x``, ``nit`` (index of the last iteration, 0 if none ran),
        ``fval`` and ``grad`` evaluated at the final ``x``.

    Raises
    ------
    ValueError
        If ``step`` is not ``None``, a number or a tensor.
    """
    x = x0.detach().clone()
    batch_size = x.size(0)

    if step is None:
        # estimate lipschitz constant
        # TODO: this is not the optimal step-size (if there even is one.)
        # I don't recommend to use this.
        L = utils.init_lipschitz(closure, x0)
        step_size = 1. / L
    elif isinstance(step, Number):
        step_size = torch.ones(batch_size, device=x.device) * step
    elif isinstance(step, torch.Tensor):
        step_size = step
    else:
        raise ValueError(
            f"step must be a number or a torch Tensor, got {step} instead")

    # Keeps ``it`` defined for the result when max_iter == 0.
    it = 0
    for it in range(max_iter):
        x.requires_grad = True
        _, grad = closure(x)
        with torch.no_grad():
            update_direction, _ = lmo(-grad, x)
            update_direction += x
            x = prox(x + utils.bmul(step_size, update_direction),
                     step_size, *prox_args)

        if callback is not None:
            if callback(locals()) is False:
                break

    fval, grad = closure(x)
    return optimize.OptimizeResult(x=x, nit=it, fval=fval, grad=grad)
def _scipy_minimize(minimizer, f, x0, delta=None, jac=None, callback=None, **kwargs):
    """Run a scipy-style ``minimizer`` on the reduced chi-square of ``f``.

    The minimiser sees the scalar objective ``reduced_chisq(f(x))``. When no
    Jacobian is supplied but ``delta`` is, a two-point estimate via
    ``jac_twopoint`` is used. If ``callback`` is given, it receives a
    progress ``OptimizeResult`` (``x``, ``dx``, ``fun``, ``chisq``, ``nit``)
    updated after every minimiser callback.

    Returns the minimiser's result with ``fun`` replaced by ``f(result.x)``
    and ``chisq`` set to its reduced chi-square.
    """
    def chisq_objective(x):
        return reduced_chisq(f(x))

    progress = sciopt.OptimizeResult(x=x0,
                                     fun=None,
                                     chisq=None,
                                     nit=0,
                                     success=False,
                                     message="In progress.")

    def report_progress(x, *_):
        # Refresh the shared progress record, then hand it to the user.
        progress.nit += 1
        progress.dx = x - progress.x
        progress.x = x
        progress.fun = f(x)
        progress.chisq = reduced_chisq(progress.fun)
        callback(progress)

    if jac is None and delta is not None:
        jac = partial(jac_twopoint, chisq_objective, delta=delta)

    # Pass a falsy ``callback`` through unchanged; otherwise use the wrapper.
    minimizer_callback = report_progress if callback else callback

    result = minimizer(chisq_objective,
                       x0,
                       jac=jac,
                       callback=minimizer_callback,
                       **kwargs)
    result.fun = f(result.x)
    result.chisq = reduced_chisq(result.fun)
    return result
def _optimize(self, objective):
    """
    Minimize the objective function with the configured pymanopt solver.

    Parameters
    ----------
    objective : callable
        Objective to minimize. ``objective(x)[0]`` is used as the cost and
        ``objective(x)[1]`` as the gradient.

    Returns
    -------
    scipy.optimize.OptimizeResult
        Optimum found by the solver, with ``x``, ``fun``, ``nit``,
        ``message`` and ``success=True``.
    """
    # Initial value
    initial = self.get_initial()[0]
    if self.vector_to_matrix_transform is not None:
        initial = self.vector_to_matrix_transform(initial)

    # Compare solver names by value; ``is`` against a string literal tests
    # object identity and is not guaranteed to match equal strings.
    if self.solver_type in ('NelderMead', 'ParticleSwarm'):
        initial = None

    # Create tensorflow variable
    if self.matrix_manifold_dimension is None:
        x_tf = tf.Variable(tf.zeros(self.dimension, dtype=tf.float64))
    else:
        x_tf = tf.Variable(
            tf.zeros([
                self.matrix_manifold_dimension, self.matrix_manifold_dimension
            ],
                     dtype=tf.float64))

    # Cost function for pymanopt
    def objective_fct(x):
        if self.matrix_to_vector_transform_tf is not None:
            # Reshape x from matrix to vector form to compute the objective
            # function (tensorflow format)
            x = self.matrix_to_vector_transform_tf(
                x, self.matrix_manifold_dimension)
        return objective(x)[0]

    # Transform the cost function to tensorflow function
    cost = tf.py_function(objective_fct, [x_tf], tf.float64)

    # Gradient function for pymanopt
    def objective_grad(x):
        if self.matrix_to_vector_transform is not None:
            # Reshape x from matrix to vector form to compute the gradient
            x = self.matrix_to_vector_transform(x)

        # Compute the gradient
        grad = np.array(objective(x)[1])[0]

        if self.vector_to_matrix_transform is not None:
            # Reshape the gradient in matrix form for the optimization on
            # the manifold
            grad = self.vector_to_matrix_transform(grad)
        return grad

    # Define pymanopt problem
    problem = pyman.Problem(manifold=self.manifold,
                            cost=cost,
                            egrad=objective_grad,
                            arg=x_tf,
                            verbosity=2)

    # Optimize the parameters of the problem
    opt_x, opt_log = self.solver.solve(problem, x=initial)

    if self.matrix_to_vector_transform is not None:
        # Reshape the optimum from matrix to vector form
        opt_x = self.matrix_to_vector_transform(opt_x)

    # Format the result to fit with GPflowOpt
    result = sc_opt.OptimizeResult(
        x=opt_x,
        fun=opt_log['final_values']['f(x)'],
        nit=opt_log['final_values']['iterations'],
        message=opt_log['stoppingreason'],
        success=True)
    return result
def minimize_frank_wolfe(
    f_grad,
    x0,
    lmo,
    step="backtracking",
    lipschitz=None,
    max_iter=400,
    tol=1e-12,
    callback=None,
    verbose=0,
):
    r"""Frank-Wolfe algorithm.

    Implements the Frank-Wolfe algorithm, see :ref:`frank_wolfe` for a more
    detailed description.

    Args:
      f_grad: callable
        Takes as input the current iterate (a vector of same size as x0) and
        returns the function value and gradient of the objective function.

      x0: array-like
        Initial guess for solution. It is converted to a float array and
        copied.

      lmo: callable
        Takes as input ``(-gradient, x)`` and returns both the update
        direction and the maximum admissible step-size.

      step: str or callable, optional
        Step-size strategy to use. Should be one of

          - "backtracking", will use the backtracking line-search from [3]_

          - "DR", will use the Demyanov-Rubinov step-size. This step-size
            minimizes a quadratic upper bound on the objective using the
            gradient's Lipschitz constant, passed in keyword argument
            `lipschitz`.

          - "oblivious" (alias "sublinear"), will use a decreasing step-size
            of the form 2/(k+2).

          - callable, if step is a callable function, it will use the
            step-size returned by step(locals()).

      lipschitz: None or float, optional
        Estimate for the Lipschitz constant of the gradient. Required when
        step="DR". When omitted, an initial estimate is computed by finite
        differences and printed.

      max_iter: integer, optional
        Maximum number of iterations.

      tol: float, optional
        Tolerance of the stopping criterion. The algorithm will stop
        whenever the Frank-Wolfe gap is below tol or the maximum number of
        iterations is exceeded.

      callback: callable, optional
        Called with ``locals()`` at each iteration, before the iterate is
        updated, and once more after the loop. If the in-loop call returns
        False, the algorithm stops immediately.

      verbose: int, optional
        Verbosity level. Currently unused.

    Returns:
      res : scipy.optimize.OptimizeResult
        With attributes ``x`` (the solution array), ``nit`` (index of the
        last iteration) and ``certificate`` (last Frank-Wolfe gap, ``inf``
        if no iteration ran).

    Raises:
      ValueError: if ``tol`` is negative, ``step`` is not a recognised
        option, or ``step="DR"`` is used without ``lipschitz``.

    References:
      [1] Jaggi, Martin. `"Revisiting Frank-Wolfe: Projection-Free Sparse
      Convex Optimization."
      <http://proceedings.mlr.press/v28/jaggi13-supp.pdf>`_ ICML 2013.

      [2] Pedregosa, Fabian `"Notes on the Frank-Wolfe Algorithm"
      <http://fa.bianp.net/blog/2018/notes-on-the-frank-wolfe-algorithm-part-i/>`_,
      2018

      [3] Pedregosa, Fabian, Armin Askari, Geoffrey Negiar, and Martin Jaggi.
      `"Step-Size Adaptivity in Projection-Free Optimization."
      <https://arxiv.org/pdf/1806.05123.pdf>`_ arXiv:1806.05123 (2018).

    Examples:
      * :ref:`sphx_glr_auto_examples_frank_wolfe_plot_sparse_benchmark.py`
      * :ref:`sphx_glr_auto_examples_frank_wolfe_plot_vertex_overlap.py`
    """
    # ``np.float`` was removed from NumPy; the builtin is the same dtype.
    x0 = np.asanyarray(x0, dtype=float)
    if tol < 0:
        raise ValueError("Tol must be non-negative")
    x = x0.copy()
    lipschitz_t = None
    step_size = None
    if lipschitz is not None:
        lipschitz_t = lipschitz

    f_t, grad = f_grad(x)
    old_f_t = None

    # Defined up front so the result can be built when max_iter == 0.
    certificate = np.inf
    it = 0
    # NOTE: local names are part of the callback interface via locals().
    for it in range(max_iter):
        update_direction, max_step_size = lmo(-grad, x)
        norm_update_direction = linalg.norm(update_direction) ** 2
        certificate = np.dot(update_direction, -grad)

        # .. compute an initial estimate for the ..
        # .. Lipschitz estimate if not given ...
        if lipschitz_t is None:
            eps = 1e-3
            grad_eps = f_grad(x + eps * update_direction)[1]
            lipschitz_t = linalg.norm(grad - grad_eps) / (
                eps * np.sqrt(norm_update_direction)
            )
            print("Estimated L_t = %s" % lipschitz_t)

        if certificate <= tol:
            break

        if callable(step):
            step_size = step(locals())
            f_next, grad_next = f_grad(x + step_size * update_direction)
        elif step == "backtracking":
            step_size, lipschitz_t, f_next, grad_next = backtracking_step_size(
                x,
                f_t,
                old_f_t,
                f_grad,
                certificate,
                lipschitz_t,
                max_step_size,
                update_direction,
                norm_update_direction,
            )
        elif step == "DR":
            if lipschitz is None:
                raise ValueError('lipschitz needs to be specified with step="DR"')
            step_size = min(
                certificate / (norm_update_direction * lipschitz_t), max_step_size
            )
            f_next, grad_next = f_grad(x + step_size * update_direction)
        elif step in ("oblivious", "sublinear"):
            # .. without knowledge of the Lipschitz constant ..
            # .. we take the oblivious 2/(k+2) step-size ..
            step_size = 2.0 / (it + 2)
            f_next, grad_next = f_grad(x + step_size * update_direction)
        else:
            raise ValueError("Invalid option step=%s" % step)

        if callback is not None:
            # Honour the documented contract: a False return stops the run.
            if callback(locals()) is False:
                break
        x += step_size * update_direction

        old_f_t = f_t
        f_t, grad = f_next, grad_next

    if callback is not None:
        callback(locals())
    return optimize.OptimizeResult(x=x, nit=it, certificate=certificate)
def minimize_PGD(f, g=None, x0=None, tol=1e-6, max_iter=500, verbose=0,
                 callback=None, backtracking: bool = True, step_size=None,
                 max_iter_backtracking=100, backtracking_factor=0.6,
                 trace=False) -> optimize.OptimizeResult:
    """Proximal gradient descent.

    Solves problems of the form

            minimize_x f(x) + g(x)

    where we have access to the gradient of f and to the proximal operator
    of g.

    Arguments:
        f : loss function (smooth)
            Callable with a ``gradient(x)`` method and an ``n_features``
            attribute.

        g : penalty term (proximal)
            Callable with a ``prox(x, step_size)`` method. Defaults to
            ``ZeroLoss()``.

        x0 : array-like, optional
            Initial guess; copied. Defaults to zeros of length
            ``f.n_features``.

        tol : float
            The iteration stops when the norm of the gradient mapping falls
            below this value.

        max_iter : int
            Maximum number of iterations.

        verbose : int
            Verbosity level, from 0 (no output) to 2 (output on each
            iteration)

        callback : callable
            Called as ``callback(x)`` after each non-terminal iteration.

        backtracking : boolean
            Whether to perform backtracking (i.e. line-search) or not.

        step_size : float
            Starting value for the line-search procedure. When omitted it is
            estimated from a few random evaluations of ``f``.

        max_iter_backtracking : int
            Maximum number of line-search steps per iteration.

        backtracking_factor : float
            Factor by which the step size shrinks in the line search.

        trace : bool
            Whether to record iterates, objective values and timings.

    Returns:
        res : The optimization result represented as a
            ``scipy.optimize.OptimizeResult`` object, with attributes ``x``,
            ``success``, ``certificate``, ``nit``, ``trace_x``,
            ``trace_func`` and ``trace_time``.

    Raises:
        ValueError: if ``max_iter_backtracking`` is not positive.

    References:
        Beck, Amir, and Marc Teboulle. "Gradient-based algorithms with
        applications to signal recovery." Convex optimization in signal
        processing and communications (2009)
    """
    if x0 is None:
        xk = np.zeros(f.n_features)
    else:
        xk = np.array(x0, copy=True)
    if not max_iter_backtracking > 0:
        raise ValueError('Line search iterations need to be greater than 0')

    if g is None:
        g = ZeroLoss()

    if step_size is None:
        # sample to estimate Lipschitz constant
        step_size_n_sample = 5
        L = []
        for _ in range(step_size_n_sample):
            x_tmp = np.random.randn(f.n_features)
            x_tmp /= linalg.norm(x_tmp)
            L.append(linalg.norm(f(xk) - f(x_tmp)))
        # give it a generous upper bound
        step_size = 2. / np.mean(L)

    success = False
    # Defined up front so the result can be built when max_iter < 1.
    certificate = np.inf
    trace_func = []
    trace_time = []
    trace_x = []
    start_time = datetime.now()

    def _record():
        # Append the current iterate, objective and elapsed time to the trace.
        trace_x.append(xk.copy())
        trace_func.append(f(xk) + g(xk))
        trace_time.append((datetime.now() - start_time).total_seconds())

    if trace:
        _record()

    # .. a while loop instead of a for loop ..
    # .. allows for infinite or floating point max_iter ..
    it = 1
    while it <= max_iter:
        # .. compute gradient and step size
        current_step_size = step_size
        grad_fk = f.gradient(xk)
        x_next = g.prox(xk - current_step_size * grad_fk, current_step_size)
        incr = x_next - xk
        if backtracking:
            fk = f(xk)
            f_next = f(x_next)
            for _ in range(max_iter_backtracking):
                if f_next <= fk + grad_fk.dot(incr) + incr.dot(incr) / (
                        2.0 * current_step_size):
                    # .. step size found ..
                    break
                # .. backtracking, reduce step size ..
                current_step_size *= backtracking_factor
                x_next = g.prox(xk - current_step_size * grad_fk,
                                current_step_size)
                incr = x_next - xk
                f_next = f(x_next)
            else:
                warnings.warn(
                    "Maxium number of line-search iterations reached")
        # Gradient-mapping norm, scaled by the step that produced x_next
        # (the backtracked one, not the initial guess).
        certificate = np.linalg.norm((xk - x_next) / current_step_size)
        xk[:] = x_next

        if trace:
            _record()

        if verbose > 0:
            print("Iteration %s, step size: %s" % (it, current_step_size))

        if certificate < tol:
            if verbose:
                print("Achieved relative tolerance at iteration %s" % it)
            success = True
            break

        if callback is not None:
            callback(xk)
        it += 1

    # Warn only on genuine non-convergence; the previous ``it >= max_iter``
    # test also fired when convergence happened on the last iteration.
    if not success:
        warnings.warn(
            "proximal_gradient did not reach the desired tolerance level",
            RuntimeWarning)

    return optimize.OptimizeResult(x=xk,
                                   success=success,
                                   certificate=certificate,
                                   nit=it,
                                   trace_x=np.array(trace_x),
                                   trace_func=np.array(trace_func),
                                   trace_time=trace_time)
def minimize_frank_wolfe(closure, x0, lmo, step='sublinear', max_iter=200, callback=None):
    """Batched Frank-Wolfe for problems of the form

            min_x f(x)  s.t.  x in C

    where the constraint set C is accessed through its Linear Minimization
    Oracle and f through ``closure``.

    Args:
      closure: callable
        Called as ``closure(x)``; returns function values and gradient.

      x0: torch.Tensor of shape (batch_size, *).
        Initial guess.

      lmo: callable
        Called as ``lmo(-grad, x)``; returns update_direction and
        max_step_size.

      step: float or 'sublinear'
        A fixed step size, or the decreasing schedule ``2 / (it + 2)``. In
        both cases the step is capped by the LMO's max_step_size.

      max_iter: int
        Maximum number of iterations.

      callback: callable (optional)
        Called on locals() at the end of each iteration; returning False
        stops the loop. Often used for logging.

    Returns:
      scipy.optimize.OptimizeResult with ``x``, ``nit``, ``fval``, ``grad``
      (both evaluated at the final ``x``) and ``certificate``.

    Raises:
      ValueError: if ``step`` is neither a number nor 'sublinear'.
    """
    # NOTE: local names are part of the callback interface via locals().
    x = x0.detach().clone()
    batch_size = x.size(0)

    if isinstance(step, Number):
        step_size = step * torch.ones(batch_size,
                                      device=x.device,
                                      dtype=x.dtype)
    elif not step == 'sublinear':
        raise ValueError("step must be a float or 'sublinear'.")

    # Certificate starts at +inf for every batch element.
    cert = np.inf * torch.ones(batch_size, device=x.device)

    for it in range(max_iter):
        x.requires_grad = True
        fval, grad = closure(x)
        update_direction, max_step_size = lmo(-grad, x)
        cert = utils.bdot(-grad, update_direction)

        if step == 'sublinear':
            step_size = 2. / (it + 2) * torch.ones(batch_size,
                                                   dtype=x.dtype,
                                                   device=x.device)

        with torch.no_grad():
            # Never step past the boundary reported by the oracle.
            step_size = torch.min(step_size, max_step_size)
            x += utils.bmul(update_direction, step_size)

        if callback is not None and callback(locals()) is False:
            break

    fval, grad = closure(x)
    return optimize.OptimizeResult(x=x, nit=it, fval=fval, grad=grad,
                                   certificate=cert)
def minimize_vrtos(
    f_deriv,
    A,
    b,
    x0,
    step_size,
    prox_1=None,
    prox_2=None,
    alpha=0,
    max_iter=500,
    tol=1e-6,
    callback=None,
    verbose=0,
):
    r"""Variance-reduced three operator splitting (VRTOS) algorithm.

    The VRTOS algorithm can solve optimization problems of the form

        argmin_{x \in R^p} \sum_{i}^n_samples f(A_i^T x, b_i)
                           + alpha * ||x||_2^2 + pen1(x) + pen2(x)

    Parameters
    ----------
    f_deriv
        Derivative of f.

    A
        Data matrix of shape (n_samples, n_features); converted to CSR.

    b
        Targets; ``b.size`` must equal ``A.shape[0]``.

    x0: np.ndarray
        Starting point for optimization.

    step_size: float
        Step size for the optimization. Must be non-negative.

    prox_1, prox_2: callable, 2-element sequence or None
        Proximal operators of the two penalties. A 2-element sequence is
        read as ``(prox, blocks)``; otherwise the blocks default to a sparse
        identity. ``None`` means a no-op operator.

    alpha: float
        Passed to the epoch routine factory.

    max_iter: int
        Number of passes through the data.

    tol: float
        Currently unused.

    callback: callable, optional
        Called with ``locals()`` before the first epoch and after each one.

    verbose: bool
        Currently unused.

    Returns
    -------
    opt: OptimizeResult
        ``scipy.optimize.OptimizeResult`` with ``x`` (the solution array),
        ``success`` (always False in the current implementation), ``nit``
        and ``certificate``.

    References
    ----------
    Pedregosa, Fabian, Kilian Fatras, and Mattia Casotto. "Variance Reduced
    Three Operator Splitting." arXiv preprint arXiv:1806.07294 (2018).
    """
    # NOTE: local names are part of the callback interface via locals().
    n_samples, n_features = A.shape
    success = False

    # FIXME: just a workaround for now
    # FIXME: check if prox_1 is a tuple
    if hasattr(prox_1, "__len__") and len(prox_1) == 2:
        prox_1, blocks_1 = prox_1[0], prox_1[1]
    else:
        blocks_1 = sparse.eye(n_features, n_features, format="csr")
    if hasattr(prox_2, "__len__") and len(prox_2) == 2:
        prox_2, blocks_2 = prox_2[0], prox_2[1]
    else:
        blocks_2 = sparse.eye(n_features, n_features, format="csr")

    Y = np.zeros((2, x0.size))
    z = x0.copy()

    assert A.shape[0] == b.size

    if step_size < 0:
        raise ValueError

    # Missing proximal operators become compiled no-ops.
    if prox_1 is None:

        @utils.njit
        def prox_1(x, i, indices, indptr, d, step_size):
            pass

    if prox_2 is None:

        @utils.njit
        def prox_2(x, i, indices, indptr, d, step_size):
            pass

    A = sparse.csr_matrix(A)
    epoch_iteration = _factory_sparse_vrtos(f_deriv, prox_1, prox_2,
                                            blocks_1, blocks_2, A, b, alpha,
                                            step_size)

    # .. memory terms ..
    memory_gradient = np.zeros(n_samples)
    gradient_average = np.zeros(n_features)
    x1 = x0.copy()
    grad_tmp = np.zeros(n_features)

    # warm up for the JIT
    epoch_iteration(Y, x0, x1, z, memory_gradient, gradient_average,
                    np.array([0]), grad_tmp, step_size)

    # .. iterate on epochs ..
    if callback is not None:
        callback(locals())
    for it in range(max_iter):
        epoch_iteration(Y, x0, x1, z, memory_gradient, gradient_average,
                        np.random.permutation(n_samples), grad_tmp, step_size)

        certificate = np.linalg.norm(x0 - z) + np.linalg.norm(x1 - z)
        if callback is not None:
            callback(locals())

    return optimize.OptimizeResult(x=z, success=success, nit=it,
                                   certificate=certificate)
def minimize_three_split(closure, x0, prox1=None, prox2=None, tol=1e-6,
                         max_iter=1000, verbose=0, callback=None,
                         line_search=True, step=None,
                         max_iter_backtracking=100, backtracking_factor=0.7,
                         h_Lipschitz=None, *args_prox):
    """Davis-Yin three operator splitting method.

    This algorithm can solve problems of the form

                minimize_x f(x) + g(x) + h(x)

    where f is a smooth function and g and h are (possibly non-smooth)
    functions for which the proximal operator is known.

    Remark: this method returns x = prox1(...). If g and h are two indicator
    functions, this method only guarantees that x is feasible for the first.
    Therefore if one of the constraints is a hard constraint, make sure to
    pass it to prox1.

    Args:
      closure: callable
        Returns the function values and gradient of the objective function.
        With return_jac=False, returns only the function values.
        Shape of return value: (batch_size, *)

      x0 : torch.Tensor(shape: (batch_size, *))
        Initial guess

      prox1 : callable or None
        prox1(x, step_size, *args) returns the proximal operator of g at x
        with parameter step_size. ``None`` means the identity.

      prox2 : callable or None
        prox2(x, step_size, *args) returns the proximal operator of h at x
        with parameter step_size. ``None`` means the identity.

      tol: float
        Tolerance of the stopping criterion.

      max_iter : int
        Maximum number of iterations.

      verbose : int
        Verbosity level. Currently unused.

      callback : callable.
        callback function (optional).
        Called with locals() at each step of the algorithm.
        The algorithm will exit if callback returns False.

      line_search : boolean
        Whether to perform line-search to estimate the step sizes. Forced to
        True when ``step`` is None.

      step : float or None
        Starting value for the line-search procedure, broadcast to one step
        size per batch element. If None, the step sizes are estimated with
        ``utils.init_lipschitz``.

      max_iter_backtracking: int
        Maximum number of backtracking iterations. Used in line search.

      backtracking_factor: float
        The amount to backtrack by during line search.

      h_Lipschitz:
        Currently unused.

      args_prox: iterable (optional)
        Extra arguments passed to the prox functions.

    Returns:
      res : OptimizeResult
        ``scipy.optimize.OptimizeResult`` with ``x`` (the solution tensor),
        ``success`` (per-sample boolean tensor), ``nit``, ``fval`` and
        ``certificate``.

    Raises:
      ValueError: if ``max_iter_backtracking`` is not positive or ``step``
        is neither None nor a number.
    """
    success = torch.zeros(x0.size(0), dtype=bool)
    if not max_iter_backtracking > 0:
        raise ValueError("Line search iterations need to be greater than 0")

    # ``np.float`` was removed from NumPy; the builtin is the same dtype.
    LS_EPS = np.finfo(float).eps

    if prox1 is None:

        @torch.no_grad()
        def prox1(x, s=None, *args):
            return x

    if prox2 is None:

        @torch.no_grad()
        def prox2(x, s=None, *args):
            return x

    x = x0.detach().clone().requires_grad_(True)
    batch_size = x.size(0)

    if step is None:
        line_search = True
        step_size = 1.0 / utils.init_lipschitz(closure, x)
    elif isinstance(step, Number):
        step_size = step * torch.ones(batch_size,
                                      device=x.device,
                                      dtype=x.dtype)
    else:
        raise ValueError("step must be float or None.")

    z = prox2(x, step_size, *args_prox)
    z = z.clone().detach()
    z.requires_grad_(True)

    fval, grad = closure(z)

    x = prox1(z - utils.bmul(step_size, grad), step_size, *args_prox)
    u = torch.zeros_like(x)

    # NOTE: local names are part of the callback interface via locals().
    for it in range(max_iter):
        z.requires_grad_(True)
        fval, grad = closure(z)
        with torch.no_grad():
            x = prox1(z - utils.bmul(step_size, u + grad), step_size,
                      *args_prox)
            incr = x - z
            norm_incr = torch.norm(incr.view(incr.size(0), -1), dim=-1)
            rhs = fval + utils.bdot(grad, incr) + ((norm_incr**2) /
                                                   (2 * step_size))
            ls_tol = closure(x, return_jac=False)
            # Samples that still need backtracking.
            mask = torch.bitwise_and(norm_incr > 1e-7, line_search)
            ls = mask.detach().clone()
            # TODO: optimize code in this loop using mask
            for it_ls in range(max_iter_backtracking):
                if not (mask.any()):
                    break
                rhs[mask] = fval[mask] + utils.bdot(grad[mask], incr[mask])
                rhs[mask] += utils.bmul(norm_incr[mask]**2,
                                        1. / (2 * step_size[mask]))

                ls_tol[mask] = closure(x, return_jac=False)[mask] - rhs[mask]
                mask &= (ls_tol > LS_EPS)
                step_size[mask] *= backtracking_factor

            z = prox2(x + utils.bmul(step_size, u), step_size, *args_prox)
            u += utils.bmul(x - z, 1. / step_size)
            certificate = utils.bmul(norm_incr, 1. / step_size)

        if callback is not None:
            if callback(locals()) is False:
                break

        success = torch.bitwise_and(certificate < tol, it > 0)
        if success.all():
            break

    return optimize.OptimizeResult(x=x, success=success, nit=it, fval=fval,
                                   certificate=certificate)
def run_station(config_file, waveform_file, network, station, location, logger):
    """Analyse a single station and return its solution and configuration.

    For multiple stations, set up a config file and run a batch job using
    the mpi_job CLI instead of calling this function directly.

    :param config_file: Name of the JSON file holding the job settings
    :type config_file: str or pathlib.Path
    :param waveform_file: Event waveform source file for seismograms,
        generated using the `extract_event_traces.py` script
    :type waveform_file: str or pathlib.Path
    :param network: Network code of station to analyse
    :type network: str
    :param station: Station code to analyse
    :type station: str
    :param location: Location code of station to analyse. Can be ''
        (empty string) if not set.
    :type location: str
    :param logger: Output logging instance
    :type logger: logging.Logger
    :return: Pair (solution, configuration). The configuration gains the
        traceability keys "station_id", "waveform_file" and "event_ids".
        The solution is None when the configured solver type is unknown,
        and an unsuccessful OptimizeResult when the solver raised.
    :rtype: (solution, dict)
    """
    with open(config_file, 'r') as cf:
        config = json.load(cf)
    # end with

    station_id = "{}.{}.{}".format(network, station, location)
    logger.info("Network.Station.Location: {}".format(station_id))
    config["station_id"] = station_id

    # Only the MCMC solver is known; bail out early for anything else.
    solver_type = config['solver']['type']
    if solver_type.lower() != 'mcmc':
        logger.error("Unknown solver type: {}".format(solver_type))
        return (None, config)
    # end if
    runner = run_mcmc

    # Load input data
    logger.info('Ingesting waveform file {}'.format(waveform_file))
    waveform_data = NetworkEventDataset(waveform_file,
                                        network=network,
                                        station=station,
                                        location=location)
    config["waveform_file"] = waveform_file

    # Trim the entire dataset to the time window (relative to onset)
    # given in the "su_energy_opts" section of the config.
    time_window = config["su_energy_opts"]["time_window"]

    def _trim_to_window(stream):
        onset = stream[0].stats.onset
        return stream.trim(onset + time_window[0], onset + time_window[1])
    # end func

    waveform_data.apply(_trim_to_window)

    # Curate input data only if non-empty curation options are given
    curation_opts = config.get("curation_opts")
    if curation_opts:
        curate_seismograms(waveform_data, curation_opts, logger)
    # end if

    try:
        # Ordering of seismograms is important here, since storage of
        # sequential values in the solution depends on it.
        station_streams = waveform_data.station(station).values()
        soln = runner(station_streams, config, logger)
    except Exception as e:
        logger.error('Runner failed on station {}'.format(station_id))
        logger.exception(e)
        soln = optimize.OptimizeResult()
        soln.success = False
        soln.message = str(e)
    # end try

    # Add ordered event IDs so source waveforms can be re-extracted later
    # from the source file if necessary.
    try:
        collected_ids = []
        for st in waveform_data.station(station).values():
            collected_ids.append(st[0].stats.event_id)
        # end for
        ordered_event_ids = collected_ids
    except Exception as e:
        logger.error(
            'Event ID collection failed on station {}'.format(station_id))
        logger.exception(e)
        ordered_event_ids = []
    # end try
    config["event_ids"] = ordered_event_ids

    return soln, config
def minimize(X, f, length, *varargin):
    """Minimize a differentiable function with conjugate gradients.

    The search starts in the direction of steepest descent and then falls
    into three parts:

    1. An extrapolation loop uses points p1 and p2 to compute a cubic
       extrapolation p3 until it has gone far enough (Wolfe-Powell
       conditions).
    2. If necessary, an interpolation loop takes p2, p3 and p4, chooses the
       subinterval containing a (local) minimum and interpolates it until
       an acceptable point is found (Wolfe-Powell conditions). Points are
       always kept in order p0 <= p1 <= p2 < p3 < p4.
    3. A new search direction is computed with conjugate gradients
       (Polack-Ribiere flavour), reverting to steepest descent if the
       previous line search had a problem.

    The best value so far is returned if two consecutive line searches
    fail, or when the budget of line searches / function evaluations runs
    out. During extrapolation ``f`` may raise or return NaN/Inf; the step is
    then bisected and retried.

    Args:
        X: starting point; must support ``copy()``, ``dot`` and arithmetic
            (e.g. a 1-D numpy array).
        f: callable ``f(X, *varargin) -> (value, gradient)``.
        length: budget. If positive, the maximum number of line searches;
            if negative, its absolute value is the maximum number of
            function evaluations. Zero returns the starting point.
        *varargin: extra positional arguments forwarded to ``f``.

    Returns:
        scipy.optimize.OptimizeResult with ``x`` (best point found), ``fun``
        (its value), ``nit`` (run-length counter), ``nfev`` (number of
        successful evaluations of ``f``), ``success`` (always True),
        ``status`` (always 0) and an empty ``message``.
    """
    realmin = np.finfo(np.double).tiny
    INT = 0.1    # don't reevaluate within 0.1 of the limit of the bracket
    EXT = 3.0    # extrapolate maximum 3 times the current step-size
    MAX = 20     # max 20 function evaluations per line search
    RATIO = 10   # maximum allowed slope ratio
    # SIG and RHO control the Wolfe-Powell conditions. SIG is the maximum
    # allowed absolute ratio between previous and new slopes (derivatives in
    # the search direction); low positive values force higher precision in
    # the line searches. RHO is the minimum allowed fraction of the expected
    # decrease (from the slope at the initial point of the line search).
    # Constants must satisfy 0 < RHO < SIG < 1.
    SIG = 0.1
    RHO = SIG / 2

    red = 1.0
    funcalls = 0
    i = 0                          # zero the run length counter
    ls_failed = False              # no previous line search has failed
    f0, df0 = f(X, *varargin)      # get function value and gradient
    funcalls += 1
    if length < 0:
        i += 1                     # the initial evaluation counts as well
    s = -df0
    d0 = -s.dot(s)                 # initial search direction (steepest) and slope
    x3 = red / (1 - d0)            # initial step is red/(|s|+1)

    # Best point so far. Defined up front so that ``length == 0`` (no
    # iterations at all) returns the starting point instead of failing on
    # an undefined name.
    X0 = X.copy()
    F0 = f0

    while i < np.abs(length):
        if length > 0:
            i += 1                 # count line searches
        X0 = X.copy()
        F0 = f0
        dF0 = df0.copy()
        M = MAX if length > 0 else np.minimum(MAX, -length - i)

        # --- 1) extrapolation -------------------------------------------
        while True:
            x2 = 0
            f2 = f0
            d2 = d0
            f3 = f0
            df3 = df0.copy()
            success = False
            while not success and M > 0:
                try:
                    M -= 1
                    if length < 0:
                        i += 1     # count function evaluations
                    f3, df3 = f(X + x3 * s, *varargin)
                    funcalls += 1
                    if np.isnan(f3) or np.isinf(f3):
                        raise FloatingPointError(
                            'non-finite function value')
                    success = True
                except Exception:
                    # Only ordinary errors are absorbed; KeyboardInterrupt
                    # and SystemExit still propagate to the caller.
                    x3 = (x2 + x3) / 2.0          # bisect and try again
            if f3 < F0:                           # keep best values
                X0 = X + x3 * s
                F0 = f3
                dF0 = df3.copy()
            d3 = df3.dot(s)                       # new slope
            if d3 > SIG * d0 or f3 > f0 + x3 * RHO * d0 or M == 0:
                break                             # done extrapolating
            x1 = x2
            f1 = f2
            d1 = d2                               # move point 2 to point 1
            x2 = x3
            f2 = f3
            d2 = d3                               # move point 3 to point 2
            # make cubic extrapolation
            A = 6 * (f1 - f2) + 3 * (d2 + d1) * (x2 - x1)
            B = 3 * (f2 - f1) - (2 * d1 + d2) * (x2 - x1)
            # numerical error possible here; handled just below
            x3 = x1 - d1 * (x2 - x1)**2 / (
                B + np.sqrt(B * B - A * d1 * (x2 - x1)))
            if (not np.isreal(x3) or np.isnan(x3) or np.isinf(x3)
                    or x3 < 0):
                x3 = x2 * EXT                     # num prob | wrong sign?
            elif x3 > x2 * EXT:
                x3 = x2 * EXT                     # beyond extrapolation limit
            elif x3 < x2 + INT * (x2 - x1):
                x3 = x2 + INT * (x2 - x1)         # too close to previous point

        # --- 2) interpolation -------------------------------------------
        while ((np.abs(d3) > -SIG * d0 or f3 > f0 + x3 * RHO * d0)
               and M > 0):
            if d3 > 0 or f3 > f0 + x3 * RHO * d0:  # choose subinterval
                x4 = x3
                f4 = f3
                d4 = d3                           # move point 3 to point 4
            else:
                x2 = x3
                f2 = f3
                d2 = d3                           # move point 3 to point 2
            if f4 > f0:
                # quadratic interpolation
                x3 = x2 - (0.5 * d2 * (x4 - x2)**2) / (
                    f4 - f2 - d2 * (x4 - x2))
            else:
                # cubic interpolation
                A = 6 * (f2 - f4) / (x4 - x2) + 3 * (d4 + d2)
                B = 3 * (f4 - f2) - (2 * d2 + d4) * (x4 - x2)
                # numerical error possible here; handled just below
                x3 = x2 + (np.sqrt(B * B - A * d2 * (x4 - x2)**2) - B) / A
            if np.isnan(x3) or np.isinf(x3):
                x3 = (x2 + x4) / 2                # numerical problem: bisect
            # don't accept a point too close to either end of the bracket
            x3 = np.maximum(np.minimum(x3, x4 - INT * (x4 - x2)),
                            x2 + INT * (x4 - x2))
            f3, df3 = f(X + x3 * s, *varargin)
            funcalls += 1
            if f3 < F0:                           # keep best values
                X0 = X + x3 * s
                F0 = f3
                dF0 = df3.copy()
            M -= 1
            if length < 0:
                i += 1                            # count function evaluations
            d3 = df3.dot(s)                       # new slope

        # --- 3) new search direction ------------------------------------
        if np.abs(d3) < -SIG * d0 and f3 < f0 + x3 * RHO * d0:
            # line search succeeded: accept the point
            X = X + x3 * s
            f0 = f3
            # Polack-Ribiere CG direction
            s = (df3.dot(df3) - df0.dot(df3)) / (df0.dot(df0)) * s - df3
            df0 = df3.copy()                      # swap derivatives
            d3 = d0
            d0 = df0.dot(s)
            if d0 > 0:                            # new slope must be negative
                s = -df0
                d0 = -s.dot(s)                    # otherwise use steepest direction
            # slope ratio but max RATIO
            x3 *= np.minimum(RATIO, d3 / (d0 - realmin))
            ls_failed = False                     # this line search did not fail
        else:
            X = X0
            f0 = F0
            df0 = dF0.copy()                      # restore best point so far
            if ls_failed or i > np.abs(length):
                break                             # failed twice in a row, or out of budget
            s = -df0
            d0 = -s.dot(s)                        # try steepest
            x3 = 1.0 / (1.0 - d0)
            ls_failed = True                      # this line search failed

    return optimize.OptimizeResult(fun=F0,
                                   x=X0,
                                   nit=i,
                                   nfev=funcalls,
                                   success=True,
                                   status=0,
                                   message='')
def minimize_frank_wolfe(
        fun,
        x0,
        lmo,
        x0_rep=None,
        variant='vanilla',
        jac="2-point",
        step="backtracking",
        lipschitz=None,
        args=(),
        max_iter=400,
        tol=1e-12,
        callback=None,
        verbose=0,
        eps=1e-8,
):
    r"""Frank-Wolfe algorithm.

    Implements the Frank-Wolfe algorithm, see :ref:`frank_wolfe` for a more
    detailed description.

    Args:
      fun : callable
        The objective function to be minimized.
            ``fun(x, *args) -> float``
        where x is an 1-D array with shape (n,) and `args`
        is a tuple of the fixed parameters needed to completely
        specify the function.

      x0: array-like
        Initial guess for solution. It is converted to a float array and
        copied, so the caller's array is not modified.

      lmo: callable
        Called as ``lmo(-grad, x, active_set)``. Must return the tuple
        ``(update_direction, fw_vertex_rep, away_vertex_rep,
        max_step_size)``, i.e. the update direction, the representations of
        the Frank-Wolfe and away vertices, and the maximum admissible
        step-size.

      x0_rep: immutable
        Is used to initialize the active set when variant == 'pairwise'.

      variant: {'vanilla', 'pairwise'}
        Determines which Frank-Wolfe variant to use, along with lmo.
        Pairwise sets up and updates an active set of vertices. This is
        needed to make sure to not move out of the constraint set when
        using a pairwise LMO.

      jac : {callable, '2-point', bool}, optional
        Method for computing the gradient vector. If it is a callable,
        it should be a function that returns the gradient vector:
            ``jac(x, *args) -> array_like, shape (n,)``
        where x is an array with shape (n,) and `args` is a tuple with
        the fixed parameters. Alternatively, the '2-point' select a finite
        difference scheme for numerical estimation of the gradient.
        If `jac` is a Boolean and is True, `fun` is assumed to return the
        gradient along with the objective function. If False, the gradient
        will be estimated using '2-point' finite difference estimation.

      step: str or callable, optional
        Step-size strategy to use. Should be one of

            - "backtracking", will use the backtracking line-search from
              [PANJ2020]_

            - "DR", will use the Demyanov-Rubinov step-size. This step-size
              minimizes a quadratic upper bound on the objective using the
              gradient's lipschitz constant, passed in keyword argument
              `lipschitz`. [P2018]_

            - "sublinear", will use a decreasing step-size of the form
              2/(k+2). [J2013]_

            - callable, if step is a callable function, it will use the
              step-size returned by ``step(locals())``.

      lipschitz: None or float, optional
        Estimate for the Lipschitz constant of the gradient. Required when
        step="DR". When omitted, an initial estimate is computed from a
        finite difference of the gradient along the first update direction.

      args: tuple, optional
        Extra arguments forwarded to `fun` (and `jac`).

      max_iter: integer, optional
        Maximum number of iterations.

      tol: float, optional
        Tolerance of the stopping criterion. The algorithm will stop
        whenever the Frank-Wolfe gap is below tol or the maximum number of
        iterations is exceeded.

      callback: callable, optional
        Callback to execute at each iteration, called with ``locals()``. If
        the callable returns False then the algorithm will immediately
        return. It is also called once more after the main loop.

      eps: float or ndarray
        If jac is approximated, use this value for the step size.

      verbose: int, optional
        Verbosity level. A non-zero value prints the initial Lipschitz
        estimate.

    Returns:
      scipy.optimize.OptimizeResult
        The optimization result represented as a
        ``scipy.optimize.OptimizeResult`` object. It carries ``x`` (the
        solution array), ``nit`` (index of the last iteration),
        ``certificate`` (the last Frank-Wolfe gap; ``inf`` if no iteration
        was run) and ``active_set``.

    Raises:
      ValueError: if `tol` is negative, if `variant` or `step` is not one
        of the supported options, or if step="DR" without `lipschitz`.

    References:
      .. [J2013] Jaggi, Martin. `"Revisiting Frank-Wolfe: Projection-Free
         Sparse Convex Optimization."
         <http://proceedings.mlr.press/v28/jaggi13-supp.pdf>`_ ICML 2013.

      .. [P2018] Pedregosa, Fabian `"Notes on the Frank-Wolfe Algorithm"
         <http://fa.bianp.net/blog/2018/notes-on-the-frank-wolfe-algorithm-part-i/>`_,
         2018

      .. [PANJ2020] Pedregosa, Fabian, Armin Askari, Geoffrey Negiar, and
         Martin Jaggi. `"Step-Size Adaptivity in Projection-Free
         Optimization." <https://arxiv.org/pdf/1806.05123.pdf>`_
         arXiv:1806.05123 (2020).

    Examples:
      * :ref:`sphx_glr_auto_examples_frank_wolfe_plot_sparse_benchmark.py`
      * :ref:`sphx_glr_auto_examples_frank_wolfe_plot_vertex_overlap.py`
    """
    # ``np.float`` was only an alias of the builtin and has been removed
    # from NumPy; the builtin ``float`` selects the same float64 dtype.
    x0 = np.asanyarray(x0, dtype=float)
    if tol < 0:
        raise ValueError("Tol must be non-negative")
    x = x0.copy()

    if variant == 'vanilla':
        active_set = None
    elif variant == 'pairwise':
        active_set = defaultdict(float)
        active_set[x0_rep] = 1.
    else:
        raise ValueError("Variant must be one of {'vanilla', 'pairwise'}.")

    lipschitz_t = None
    step_size = None
    if lipschitz is not None:
        lipschitz_t = lipschitz

    func_and_grad = utils.build_func_grad(jac, fun, args, eps)

    f_t, grad = func_and_grad(x)
    old_f_t = None

    # Defaults so that the result is well defined when max_iter == 0.
    it = 0
    certificate = np.inf

    for it in range(max_iter):
        update_direction, fw_vertex_rep, away_vertex_rep, max_step_size = lmo(
            -grad, x, active_set)
        norm_update_direction = linalg.norm(update_direction)**2
        certificate = np.dot(update_direction, -grad)

        # .. compute an initial estimate for the ..
        # .. Lipschitz estimate if not given ...
        if lipschitz_t is None:
            # NOTE: rebinding ``eps`` here does not affect ``func_and_grad``,
            # which was already built with the caller's value.
            eps = 1e-3
            grad_eps = func_and_grad(x + eps * update_direction)[1]
            lipschitz_t = linalg.norm(grad - grad_eps) / (
                eps * np.sqrt(norm_update_direction))
            if verbose:
                print("Estimated L_t = %s" % lipschitz_t)

        if certificate <= tol:
            break

        if callable(step):
            step_size = step(locals())
            f_next, grad_next = func_and_grad(x + step_size * update_direction)
        elif step == "backtracking":
            step_size, lipschitz_t, f_next, grad_next = backtracking_step_size(
                x,
                f_t,
                old_f_t,
                func_and_grad,
                certificate,
                lipschitz_t,
                max_step_size,
                update_direction,
                norm_update_direction,
            )
        elif step == "DR":
            if lipschitz is None:
                raise ValueError(
                    'lipschitz needs to be specified with step="DR"')
            step_size = min(
                certificate / (norm_update_direction * lipschitz_t),
                max_step_size)
            f_next, grad_next = func_and_grad(x + step_size * update_direction)
        elif step == "sublinear":
            # .. without knowledge of the Lipschitz constant ..
            # .. we take the sublinear 2/(k+2) step-size ..
            step_size = 2.0 / (it + 2)
            f_next, grad_next = func_and_grad(x + step_size * update_direction)
        else:
            raise ValueError("Invalid option step=%s" % step)

        if callback is not None:
            if callback(locals()) is False:  # pylint: disable=g-bool-id-comparison
                break

        x += step_size * update_direction
        if variant == 'pairwise':
            update_active_set(active_set, fw_vertex_rep, away_vertex_rep,
                              step_size)
        old_f_t = f_t
        f_t, grad = f_next, grad_next

    if callback is not None:
        callback(locals())
    return optimize.OptimizeResult(x=x,
                                   nit=it,
                                   certificate=certificate,
                                   active_set=active_set)
def minimize_svrg(
        f_deriv,
        A,
        b,
        x0,
        step_size,
        alpha=0,
        prox=None,
        max_iter=500,
        tol=1e-6,
        verbose=False,
        callback=None,
):
    r"""Stochastic variance-reduced gradient (SVRG) solver on sparse data.

    Every outer iteration takes a snapshot of the iterate, computes the
    full gradient at that snapshot, and then sweeps once over the shuffled
    samples using the difference between the sample gradient at the current
    iterate and at the snapshot. The objective has the form

        \sum_{i}^n_samples f(A_i^T x, b_i) + (L2 term weighted by alpha) + g(x)

    where the L2 term enters the update through the gradient ``alpha * x``
    and g is handled through ``prox``.

    Args:
      f_deriv
          derivative of f, called as ``f_deriv(A_i^T x, b_i)``. It is
          called from inside ``utils.njit``-decorated functions.

      A
          data matrix of shape (n_samples, n_features); converted to CSR.

      b
          targets, indexed by sample.

      x0: np.ndarray
          Starting point for optimization. It is copied.

      step_size: float
          Step size for the optimization. ``None`` is not supported and
          raises ValueError.

      alpha: float
          Weight of the L2 term.

      prox: callable, pair (callable, blocks) or None
          Proximal step called as
          ``prox(x, i, bs_indices, bs_indptr, d, step_size)`` after each
          sample update. When a length-2 sequence is given, its second
          element is used as the sparse block structure; otherwise each
          feature is its own block. None means no proximal step.

      max_iter: int
          Maximum number of passes through the data in the optimization.

      tol: float
          Tolerance criterion. The algorithm stops when the L1 norm of the
          change of ``x`` over one pass is below tol.

      verbose: bool
          Verbosity level. Currently unused.

      callback: callable or None
          Called with ``locals()`` once before the first pass and after
          each pass. Its return value is ignored.

    Returns:
      opt: OptimizeResult
          The optimization result represented as a
          ``scipy.optimize.OptimizeResult`` object with attributes ``x``
          (the solution array), ``success`` (True if the tolerance was
          reached), ``nit`` (index of the last pass) and ``message``
          (empty).

    Raises:
      ValueError: if ``step_size`` is None.

    References:
      The sparse/block update scheme is the one the original notes of this
      routine attribute to
      Fabian Pedregosa, Remi Leblond, and Simon Lacoste-Julien. "Breaking
      the Nonsmooth Barrier: A Scalable Parallel Method for Composite
      Optimization." Advances in Neural Information Processing Systems
      (NIPS) 2017.
    """
    x = np.ascontiguousarray(x0).copy()
    n_samples, n_features = A.shape
    A = sparse.csr_matrix(A)

    if step_size is None:
        # a line search would be needed here, which is not implemented
        raise ValueError("step_size must be given (got None)")

    if hasattr(prox, "__len__") and len(prox) == 2:
        blocks = prox[1]
        prox = prox[0]
    else:
        blocks = sparse.eye(n_features, n_features, format="csr")

    if prox is None:

        @utils.njit
        def prox(x, i, indices, indptr, d, step_size):
            pass

    A_data = A.data
    A_indices = A.indices
    A_indptr = A.indptr
    n_samples, n_features = A.shape

    rblocks_indices = blocks.T.tocsr().indices
    blocks_indptr = blocks.indptr
    bs_data, bs_indices, bs_indptr = _support_matrix(A_indices, A_indptr,
                                                     rblocks_indices,
                                                     blocks.shape[0])
    csr_blocks_1 = sparse.csr_matrix((bs_data, bs_indices, bs_indptr))

    # .. diagonal reweighting ..
    # ``np.float`` was only an alias of the builtin and has been removed
    # from NumPy; the builtin ``float`` selects the same float64 dtype.
    d = np.array(csr_blocks_1.sum(0), dtype=float).ravel()
    idx = d != 0
    d[idx] = n_samples / d[idx]
    d[~idx] = 1

    @utils.njit
    def full_grad(x):
        grad = np.zeros(x.size)
        for i in range(n_samples):
            p = 0.0
            for j in range(A_indptr[i], A_indptr[i + 1]):
                j_idx = A_indices[j]
                p += x[j_idx] * A_data[j]
            grad_i = f_deriv(p, b[i])
            # .. gradient estimate (XXX difference) ..
            for j in range(A_indptr[i], A_indptr[i + 1]):
                j_idx = A_indices[j]
                grad[j_idx] += grad_i * A_data[j] / n_samples
        return grad

    @utils.njit(nogil=True)
    def _svrg_epoch(x, x_snapshot, idx, gradient_average, grad_tmp,
                    step_size):

        # .. inner iteration ..
        for i in idx:
            p = 0.0
            p_old = 0.0
            for j in range(A_indptr[i], A_indptr[i + 1]):
                j_idx = A_indices[j]
                p += x[j_idx] * A_data[j]
                p_old += x_snapshot[j_idx] * A_data[j]

            grad_i = f_deriv(p, b[i])
            old_grad_i = f_deriv(p_old, b[i])
            for j in range(A_indptr[i], A_indptr[i + 1]):
                j_idx = A_indices[j]
                grad_tmp[j_idx] = (grad_i - old_grad_i) * A_data[j]

            # .. update coefficients ..
            # .. first iterate on blocks ..
            for h_j in range(bs_indptr[i], bs_indptr[i + 1]):
                h = bs_indices[h_j]
                # .. then iterate on features inside block ..
                for b_j in range(blocks_indptr[h], blocks_indptr[h + 1]):
                    bias_term = d[h] * (gradient_average[b_j] +
                                        alpha * x[b_j])
                    x[b_j] -= step_size * (grad_tmp[b_j] + bias_term)
            prox(x, i, bs_indices, bs_indptr, d, step_size)

    idx = np.arange(n_samples)
    grad_tmp = np.zeros(n_features)
    success = False
    if callback is not None:
        callback(locals())

    # Default so that ``nit`` is defined when max_iter == 0.
    it = 0
    for it in range(max_iter):
        x_snapshot = x.copy()
        gradient_average = full_grad(x_snapshot)
        np.random.shuffle(idx)
        _svrg_epoch(x, x_snapshot, idx, gradient_average, grad_tmp,
                    step_size)
        if callback is not None:
            callback(locals())

        if np.abs(x - x_snapshot).sum() < tol:
            success = True
            break
    message = ""
    return optimize.OptimizeResult(x=x,
                                   success=success,
                                   nit=it,
                                   message=message)
def minimize_pgd(closure,
                 x0,
                 prox,
                 step='backtracking',
                 max_iter=200,
                 max_iter_backtracking=1000,
                 backtracking_factor=.6,
                 tol=1e-8,
                 *prox_args,
                 callback=None):
    """Perform Projected Gradient Descent on a batch of objectives.

    The objectives have the form f(x) + g(x). We suppose we have access to
    gradient computation for f through closure, and to the proximal
    operator of g in prox.

    Args:
      closure: callable
        ``closure(x)`` returns ``(fval, grad)``;
        ``closure(x, return_jac=False)`` returns the value only.

      x0: torch.Tensor of shape (batch_size, *).
        Starting point. It is detached and cloned.

      prox: callable
        proximal operator of g, called as
        ``prox(x, step_size, *prox_args)``.

      step: 'backtracking' or float or torch.tensor of shape (batch_size,)
        or None.
        step size to be used. If None, will be estimated at the beginning
        from ``utils.init_lipschitz``. If 'backtracking', will be estimated
        at each step using backtracking line search.

      max_iter: int
        number of iterations to perform.

      max_iter_backtracking: int
        max number of iterations in the backtracking line search

      backtracking_factor: float
        factor by which to multiply the step sizes during line search

      tol: float
        stops the algorithm when the certificate is smaller than tol
        for all datapoints in the batch

      prox_args: tuple (optional)
        additional args for prox

      callback: callable (optional)
        Any callable called on locals() at the end of each iteration.
        Often used for logging. Returning False stops the iterations.

    Returns:
      scipy.optimize.OptimizeResult with ``x``, ``nit`` (index of the last
      iteration), ``fval`` and ``grad`` (evaluated at the returned ``x``)
      and ``certificate`` (None if no iteration was run).

    Raises:
      ValueError: if ``step`` is none of the supported options.
    """
    x = x0.detach().clone()
    batch_size = x.size(0)

    # Compare against the string only when ``step`` is a string, so that a
    # tensor-valued step is never compared element-wise with a str.
    backtracking = isinstance(step, str) and step == 'backtracking'

    if step is None:
        # estimate lipschitz constant
        L = utils.init_lipschitz(closure, x0)
        step_size = 1. / L

    elif backtracking:
        L = 1.8 * utils.init_lipschitz(closure, x0)
        step_size = 1. / L

    elif torch.is_tensor(step):
        # Per-datapoint step sizes; cloned so the caller's tensor is not
        # modified.
        step_size = step.detach().clone().to(device=x.device, dtype=x.dtype)

    elif isinstance(step, (int, float)) and not isinstance(step, bool):
        step_size = step * torch.ones(batch_size, device=x.device,
                                      dtype=x.dtype)

    else:
        raise ValueError("step must be float or backtracking or None")

    # Defaults so that the result is well defined when max_iter == 0.
    it = 0
    cert = None

    for it in range(max_iter):
        x.requires_grad = True

        fval, grad = closure(x)

        x_next = prox(x - utils.bmul(step_size, grad), step_size, *prox_args)
        update_direction = x_next - x

        if backtracking:
            # Try a slightly larger step first, then shrink where needed.
            step_size *= 1.1
            mask = torch.ones(batch_size, dtype=bool, device=x.device)

            with torch.no_grad():
                for _ in range(max_iter_backtracking):
                    f_next = closure(x_next, return_jac=False)
                    rhs = (fval
                           + utils.bdot(grad, update_direction)
                           + utils.bmul(utils.bdot(update_direction,
                                                   update_direction),
                                        1. / (2. * step_size))
                           )
                    # datapoints violating the sufficient-decrease bound
                    mask = f_next > rhs

                    if not mask.any():
                        break

                    step_size[mask] *= backtracking_factor
                    # prox receives the full batch, so it must also receive
                    # the full per-datapoint step sizes (not the masked
                    # subset, whose length would not match the batch).
                    x_next = prox(x - utils.bmul(step_size, grad),
                                  step_size, *prox_args)
                    update_direction[mask] = x_next[mask] - x[mask]
                else:
                    warnings.warn("Maximum number of line-search iterations "
                                  "reached.")

        with torch.no_grad():
            cert = torch.norm(utils.bmul(update_direction, 1. / step_size),
                              dim=-1)
            x.copy_(x_next)
            if (cert < tol).all():
                break

        if callback is not None:
            if callback(locals()) is False:
                break

    fval, grad = closure(x)
    return optimize.OptimizeResult(x=x, nit=it, fval=fval, grad=grad,
                                   certificate=cert)
def fmin_CondatVu(fun, fun_deriv, g_prox, h_prox, L, x0, alpha=1.0, beta=1.0,
                  tol=1e-12, max_iter=10000, verbose=0, callback=None,
                  step_size_x=1e-3, step_size_y=1e3, max_iter_ls=20,
                  g_prox_args=(), h_prox_args=()):
    """Condat-Vu primal-dual splitting method.

    This method for optimization problems of the form

            minimize_x f(x) + alpha * g(x) + beta * h(L x)

    where f is a smooth function and g and h are (possibly non-smooth)
    functions for which the proximal operator is known.

    Parameters
    ----------
    fun : callable
        f(x) returns the value of f at x. Not evaluated by this routine.

    fun_deriv : callable
        fun_deriv(x) returns the gradient of f.

    g_prox : callable of the form g_prox(step, x, *g_prox_args), or None
        Returns the proximal operator of g at x with parameter ``step``.
        Note that the step comes first. None means the identity.

    h_prox : callable of the form h_prox(step, x, *h_prox_args), or None
        Returns the proximal operator of h at x with parameter ``step``.
        Note that the step comes first. None means the identity.

    L : ndarray or sparse matrix
        Linear operator inside the h term. Must support ``L.dot`` and
        ``L.T.dot``.

    x0 : array-like
        Initial guess. It is copied.

    alpha, beta : float
        Weights of the g and h terms.

    tol : float
        The iterations stop when the squared norm of the primal plus dual
        increment falls below this value.

    max_iter : int
        Bound on the iteration counter. The counter starts at 1 and the
        loop runs while it is below ``max_iter``.

    verbose : int
        Verbosity level, from 0 (no output) to 2 (output on each iteration)

    callback : callable
        callback function (optional), called with the current primal
        iterate after each iteration.

    step_size_x, step_size_y : float
        Primal and dual step sizes.

    max_iter_ls : int
        Must be positive. It is only validated; no line search is
        performed by this routine.

    g_prox_args, h_prox_args : tuple
        Extra positional arguments forwarded to ``g_prox`` / ``h_prox``.

    Returns
    -------
    res : OptimizeResult
        The optimization result represented as a
        ``scipy.optimize.OptimizeResult`` object with attributes ``x`` (the
        solution array), ``success`` (True if the tolerance was reached)
        and ``nit`` (final value of the iteration counter).

    Raises
    ------
    ValueError
        If ``max_iter_ls`` is not positive.

    References
    ----------
    Condat, Laurent. "A primal-dual splitting method for convex
    optimization involving Lipschitzian, proximable and linear composite
    terms." Journal of Optimization Theory and Applications (2013).

    Chambolle, Antonin, and Thomas Pock. "On the ergodic convergence rates
    of a first-order primal-dual algorithm." Mathematical Programming
    (2015)
    """
    xk = np.array(x0, copy=True)
    yk = L.dot(xk)
    success = False
    if not max_iter_ls > 0:
        raise ValueError('Line search iterations need to be greater than 0')

    if g_prox is None:
        def g_prox(step_size, x, *args): return x
    if h_prox is None:
        def h_prox(step_size, x, *args): return x

    # proximal operator of the conjugate of h, obtained from h_prox
    def h_prox_conj(step_size, x, *args):
        return x - step_size * h_prox(beta / step_size, x / step_size, *args)
    it = 1
    # .. main iteration ..
    while it < max_iter:

        grad_fk = fun_deriv(xk)
        # primal step
        x_next = g_prox(step_size_x * alpha,
                        xk - step_size_x * grad_fk - step_size_x * L.T.dot(yk),
                        *g_prox_args)
        # dual step, evaluated at the extrapolated point 2 * x_next - xk
        y_next = h_prox_conj(step_size_y,
                             yk + step_size_y * L.dot(2 * x_next - xk),
                             *h_prox_args)

        incr = linalg.norm(x_next - xk) ** 2 + linalg.norm(y_next - yk) ** 2
        yk = y_next
        xk = x_next

        if verbose > 0:
            print("Iteration %s, increment: %s" % (it, incr))

        if callback is not None:
            callback(xk)

        if incr < tol:
            if verbose:
                print("Achieved relative tolerance at iteration %s" % it)
            success = True
            break

        it += 1

    if it >= max_iter:
        # The message names this routine (it used to name a different one).
        warnings.warn(
            "fmin_CondatVu did not reach the desired tolerance level",
            RuntimeWarning)

    return optimize.OptimizeResult(
        x=xk, success=success, nit=it)
def minimize_saga(
        f_deriv,
        A,
        b,
        x0,
        step_size,
        prox=None,
        alpha=0,
        max_iter=500,
        tol=1e-6,
        verbose=1,
        callback=None,
):
    r"""Stochastic average gradient augmented (SAGA) algorithm.

    This algorithm can solve linearly-parametrized loss functions of the
    form

        minimize_x \sum_{i}^n_samples f(A_i^T x, b_i) + alpha ||x||_2^2 + g(x)

    where g is a function for which we have access to its proximal operator.

    .. warning::
        This function is experimental, API is likely to change.

    Args:
      f_deriv
          derivative of the loss f, called as ``f_deriv(A_i^T x, b_i)``. It
          is called from inside a ``utils.njit``-decorated function.

      A
          data matrix of shape (n_samples, n_features); converted to CSR.

      b
          targets, indexed by sample.

      x0: np.ndarray
          Starting point for optimization. It is copied.

      step_size: float
          Step size for the optimization. ``None`` is not supported and
          raises ValueError.

      prox: callable, pair (callable, blocks) or None
          Proximal step called as
          ``prox(x, i, bs_indices, bs_indptr, d, step_size)`` after each
          sample update. When a length-2 sequence is given, its second
          element is used as the sparse block structure; otherwise each
          feature is its own block. None means no proximal step.

      alpha: float
          Weight of the L2 term; it enters the update through the gradient
          ``alpha * x``.

      max_iter: int
          Maximum number of passes through the data in the optimization.

      tol: float
          Tolerance criterion. The algorithm stops when the L1 norm of the
          change of ``x`` over one pass is below tol.

      verbose: bool
          Verbosity level. Currently unused.

      callback: callable or None
          Called with ``locals()`` once before the first pass and after
          each pass. Its return value is ignored.

    Returns:
      opt: OptimizeResult
          The optimization result represented as a
          ``scipy.optimize.OptimizeResult`` object with attributes ``x``
          (the solution array), ``success`` (True if the tolerance was
          reached) and ``nit`` (index of the last pass).

    Raises:
      ValueError: if ``step_size`` is None.

    References:
      This variant of the SAGA algorithm is described in:

      `"Breaking the Nonsmooth Barrier: A Scalable Parallel Method for
      Composite Optimization."
      <https://arxiv.org/pdf/1707.06468.pdf>`_, Fabian Pedregosa, Remi
      Leblond, and Simon Lacoste-Julien. Advances in Neural Information
      Processing Systems (NIPS) 2017.
    """
    # convert any input to CSR sparse matrix representation. In the future
    # we might want to implement also a version for dense data (numpy
    # arrays) to better exploit data locality
    x = np.ascontiguousarray(x0).copy()
    n_samples, n_features = A.shape
    A = sparse.csr_matrix(A)

    if step_size is None:
        # a line search would be needed here, which is not implemented
        raise ValueError("step_size must be given (got None)")

    if hasattr(prox, "__len__") and len(prox) == 2:
        blocks = prox[1]
        prox = prox[0]
    else:
        blocks = sparse.eye(n_features, n_features, format="csr")

    if prox is None:

        @utils.njit
        def prox(x, i, indices, indptr, d, step_size):
            pass

    A_data = A.data
    A_indices = A.indices
    A_indptr = A.indptr
    n_samples, n_features = A.shape

    rblocks_indices = blocks.T.tocsr().indices
    blocks_indptr = blocks.indptr
    bs_data, bs_indices, bs_indptr = _support_matrix(A_indices, A_indptr,
                                                     rblocks_indices,
                                                     blocks.shape[0])
    csr_blocks_1 = sparse.csr_matrix((bs_data, bs_indices, bs_indptr))

    # .. diagonal reweighting ..
    # ``np.float`` was only an alias of the builtin and has been removed
    # from NumPy; the builtin ``float`` selects the same float64 dtype.
    d = np.array(csr_blocks_1.sum(0), dtype=float).ravel()
    idx = d != 0
    d[idx] = n_samples / d[idx]
    d[~idx] = 1

    @utils.njit(nogil=True)
    def _saga_epoch(x, idx, memory_gradient, gradient_average, grad_tmp,
                    step_size):
        # .. inner iteration of the SAGA algorithm..
        for i in idx:

            # .. gradient estimate ..
            p = 0.0
            for j in range(A_indptr[i], A_indptr[i + 1]):
                j_idx = A_indices[j]
                p += x[j_idx] * A_data[j]
            grad_i = f_deriv(p, b[i])
            for j in range(A_indptr[i], A_indptr[i + 1]):
                j_idx = A_indices[j]
                grad_tmp[j_idx] = (grad_i - memory_gradient[i]) * A_data[j]

            # .. update coefficients ..
            # .. first iterate on blocks ..
            for h_j in range(bs_indptr[i], bs_indptr[i + 1]):
                h = bs_indices[h_j]
                # .. then iterate on features inside block ..
                for b_j in range(blocks_indptr[h], blocks_indptr[h + 1]):
                    bias_term = d[h] * (gradient_average[b_j] +
                                        alpha * x[b_j])
                    x[b_j] -= step_size * (grad_tmp[b_j] + bias_term)
            prox(x, i, bs_indices, bs_indptr, d, step_size)

            # .. update memory terms ..
            for j in range(A_indptr[i], A_indptr[i + 1]):
                j_idx = A_indices[j]
                tmp = (grad_i - memory_gradient[i]) * A_data[j]
                tmp /= n_samples
                gradient_average[j_idx] += tmp
                grad_tmp[j_idx] = 0
            memory_gradient[i] = grad_i

    # .. initialize memory terms ..
    memory_gradient = np.zeros(n_samples)
    gradient_average = np.zeros(n_features)
    grad_tmp = np.zeros(n_features)
    idx = np.arange(n_samples)
    success = False
    if callback is not None:
        callback(locals())

    # Default so that ``nit`` is defined when max_iter == 0.
    it = 0
    for it in range(max_iter):
        x_old = x.copy()
        np.random.shuffle(idx)
        _saga_epoch(x, idx, memory_gradient, gradient_average, grad_tmp,
                    step_size)
        if callback is not None:
            callback(locals())

        diff_norm = np.abs(x - x_old).sum()
        if diff_norm < tol:
            success = True
            break
    return optimize.OptimizeResult(x=x, success=success, nit=it)
def noop_min(fun, x0, args, **options):
    """Return ``x0`` unchanged as the result of a do-nothing minimizer.

    The objective is evaluated exactly once, at ``x0``.

    :param fun: objective, called as ``fun(x0, *args)``
    :param x0: starting point; returned as the solution
    :param args: extra positional arguments for ``fun``. ``None`` means
        no extra arguments; a value that is not a tuple is passed as a
        single extra argument.
    :param options: accepted and ignored
    :return: OptimizeResult with ``x=x0``, ``fun`` set to the objective
        value at ``x0``, ``success=True`` and ``nfev=1``
    """
    # Forward the extra arguments: previously they were dropped, so an
    # objective that requires them could not be evaluated.
    if args is None:
        args = ()
    elif not isinstance(args, tuple):
        args = (args,)
    return op.OptimizeResult(x=x0, fun=fun(x0, *args), success=True, nfev=1)
def load_mcmc_solution(h5_file, job_timestamp=None, logger=None):
    """Load Monte Carlo Markov Chain solution from HDF5 file.

    If `job_timestamp` is None and the file holds more than one job, the
    available jobs are printed and the user is prompted on stdin to choose one;
    the prompt repeats until a valid index is entered. If the file holds exactly
    one job, that job is selected automatically.

    Stations whose data raises a `TypeError` while loading are skipped (the
    error is logged when `logger` is given), so the returned list may cover
    fewer stations than the file contains.

    :param h5_file: File from which to load solution
    :type h5_file: str or pathlib.Path
    :param job_timestamp: Timestamp of job whose solution is to be loaded
    :type job_timestamp: str or NoneType
    :param logger: Output logging instance
    :type logger: logging.Logger
    :return: List of (solution, job configuration) pairs, one per station
        loaded, and the job timestamp
    :rtype: (list[(scipy.optimize.OptimizeResult, dict)], str)
    :raises IndexError: If no job timestamp is given and the file has no jobs.
    """
    assert isinstance(job_timestamp, (str, type(None)))

    # TODO: migrate this to member of a new class for encapsulating an MCMC solution

    def read_value(dataset):
        """Read the full contents of a dataset.

        Uses ``dataset[()]`` rather than the ``Dataset.value`` property, which
        was deprecated in h5py 2.x and removed in h5py 3.0.

        :param dataset: The h5py.Dataset node to read.
        :type dataset: h5py.Dataset
        :return: Dataset contents
        """
        # NOTE(review): with h5py >= 3 string datasets read this way may come
        # back as bytes rather than str - confirm whether consumers of
        # `message` / `version` need decoding.
        return dataset[()]
    # end func

    def read_data_empty(dataset):
        """
        Read dataset that might be empty. If empty, return None.

        See also function `write_data_empty`.

        :param dataset: The h5py.Dataset node to read.
        :type dataset: h5py.Dataset
        :return: Dataset value or None
        :rtype: numpy.array or NoneType
        """
        if not dataset.shape:
            return None
        # end if
        return read_value(dataset)
    # end func

    def read_list_dataset(source_node):
        """Read list from a dataset node containing ordered collection of items.

        See also function `write_list_dataset`.

        :param source_node: Group whose child names are integer indices.
        :return: Child dataset values ordered by their integer index
        :rtype: list
        """
        indexed = [(int(idx), read_value(ds)) for idx, ds in source_node.items()]
        # Sort items by idx, then throw away the idx values.
        indexed.sort(key=lambda item: item[0])
        return [value for _, value in indexed]
    # end func

    def describe_job(job_node):
        """Format the optional `job_tracking` attribute of a job for display.

        :param job_node: Job root node
        :return: '(key: value, ...)' string, or '' if there is nothing to show
        :rtype: str
        """
        if 'job_tracking' not in job_node.attrs:
            return ''
        # end if
        job_tracking = json.loads(job_node.attrs['job_tracking'])
        if not job_tracking:
            return job_tracking
        # end if
        return '(' + ', '.join([': '.join([k, str(v)]) for k, v in job_tracking.items()]) + ')'
    # end func

    def choose_timestamp(h5f):
        """Select the job to load, prompting the user if there are several.

        :param h5f: Open HDF5 file
        :type h5f: h5py.File
        :return: Selected job timestamp
        :rtype: str
        :raises IndexError: If the file contains no jobs.
        """
        timestamps = list(h5f.keys())
        if not timestamps:
            # Same exception type as the former bare `timestamps[0]` lookup,
            # but with a message that identifies the actual problem.
            raise IndexError('No job datasets found in {}'.format(h5_file))
        # end if
        if len(timestamps) == 1:
            return timestamps[0]
        # end if
        while True:
            for i, ts in enumerate(timestamps):
                print('[{}]'.format(i), ts, describe_job(h5f[ts]))
            # end for
            index = input('Choose dataset number to load: ')
            if index.isdigit() and (0 <= int(index) < len(timestamps)):
                return timestamps[int(index)]
            # end if
        # end while
    # end func

    soln_configs = []
    with h5py.File(h5_file, 'r') as h5f:
        if job_timestamp is None:
            job_timestamp = choose_timestamp(h5f)
        # end if

        job_root = h5f[job_timestamp]
        # source_data_file = job_root.attrs['input_file']
        for station_id, station_node in job_root.items():
            if logger:
                logger.info('Loading {}'.format(station_id.replace('_', '.')))
            # end if
            job_config = json.loads(station_node.attrs['config'])
            format_version = station_node.attrs['format_version']
            job_config.update({'format_version': format_version})
            if logger:
                logger.info('H5 storage format version: {}'.format(format_version))
            # end if
            try:
                soln = optimize.OptimizeResult()
                soln.x = read_value(station_node['x'])
                soln.num_input_seismograms = read_value(station_node['num_input_seismograms'])
                soln.clusters = read_list_dataset(station_node['clusters'])
                assert len(soln.x) == len(soln.clusters)
                soln.cluster_funvals = read_list_dataset(station_node['cluster_energy'])
                soln.esu = read_list_dataset(station_node['per_event_energy'])
                # Subsurface seismograms
                soln.subsurface = {
                    layer_name: read_list_dataset(layer_node)
                    for layer_name, layer_node in station_node['subsurface'].items()
                }
                soln.bins = read_value(station_node['bins'])
                soln.distribution = read_value(station_node['distribution'])
                soln.acceptance_rate = read_value(station_node['acceptance_rate'])
                soln.success = bool(read_value(station_node['success']))
                soln.status = int(read_value(station_node['status']))
                soln.message = read_value(station_node['message'])
                soln.fun = read_value(station_node['fun'])
                soln.jac = read_data_empty(station_node['jac'])
                soln.nfev = int(read_value(station_node['nfev']))
                soln.njev = int(read_value(station_node['njev']))
                soln.nit = int(read_value(station_node['nit']))
                soln.maxcv = read_data_empty(station_node['maxcv'])
                soln.samples = read_data_empty(station_node['samples'])
                soln.sample_funvals = read_data_empty(station_node['sample_energies'])
                bounds = read_value(station_node['bounds'])
                soln.bounds = optimize.Bounds(bounds[0], bounds[1])
                soln.version = read_value(station_node['version'])
                if 'rnd_seed' in station_node:
                    soln.rnd_seed = int(read_value(station_node['rnd_seed']))
                else:
                    soln.rnd_seed = None
                # end if
                soln_configs.append((soln, job_config))
            except TypeError as exc:
                # Best effort: a station that cannot be decoded is skipped so
                # that the remaining stations are still returned.
                if logger:
                    logger.error('Error loading station {} solution'.format(station_id))
                    logger.error(repr(exc))
                # end if
            # end try
        # end for
    # end with

    return soln_configs, job_timestamp
# end func