def wrapped(beta, X, y, z, u, rho):
    """Local augmented objective for one ADMM block.

    Evaluates ``func(beta, X, y)`` plus the scaled-form quadratic
    penalty ``(rho / 2) * ||beta - z + u||^2``.
    """
    # scipy/optimizer hands us host arrays; move them to X's backend.
    beta = maybe_to_cupy(beta, X)
    z = maybe_to_cupy(z, X)
    u = maybe_to_cupy(u, X)
    residual = beta - z + u
    penalty = (rho / 2) * np.dot(residual, residual)
    return normalize_to_array(func(beta, X, y) + penalty)
def compute_loss_grad(beta, X, y):
    """Evaluate the pointwise loss and its gradient in one dask pass.

    Returns a ``(loss, gradient)`` pair of plain host arrays, as
    required by scipy's L-BFGS driver.
    """
    beta = maybe_to_cupy(beta, X)
    # When a distributed client exists, ship beta to the workers once
    # instead of embedding it in every task.
    if dask_distributed_client:
        remote_beta = scatter_array(beta, dask_distributed_client)
    else:
        remote_beta = beta
    loss, gradient = compute(pointwise_loss(remote_beta, X, y),
                             pointwise_gradient(remote_beta, X, y))
    return normalize_to_array(loss), normalize_to_array(gradient.copy())
def test_basic_unreg_descent(func, kwargs, N, nchunks, family, is_cupy):
    """Fit with no regularization and check the solution is better than
    a random coefficient vector under the family's pointwise loss."""
    true_beta = np.random.normal(size=2)
    width = len(true_beta)
    rows_per_chunk = N // nchunks
    X = da.random.random((N, width), chunks=(rows_per_chunk, width))
    y = make_y(X, beta=np.array(true_beta), chunks=(rows_per_chunk,))

    if is_cupy:
        cupy = pytest.importorskip('cupy')
        X, y = to_dask_cupy_array_xy(X, y, cupy)

    X, y = persist(X, y)
    result = func(X, y, family=family, **kwargs)

    # A random probe vector should incur strictly higher loss than the fit.
    probe = maybe_to_cupy(np.random.normal(size=2), X)
    fitted_loss = family.pointwise_loss(result, X, y).compute()
    probe_loss = family.pointwise_loss(probe, X, y).compute()
    assert fitted_loss < probe_loss
def lbfgs(X, y, regularizer=None, lamduh=1.0, max_iter=100, tol=1e-4,
          family=Logistic, verbose=False, **kwargs):
    """L-BFGS solver using the scipy.optimize implementation.

    Parameters
    ----------
    X : array-like, shape (n_samples, n_features)
    y : array-like, shape (n_samples,)
    regularizer : str or Regularizer, optional
        When given, its penalty is folded into the loss and gradient.
    lamduh : float
        Regularization strength.
    max_iter : int
        Maximum number of iterations to attempt before declaring
        failure to converge.
    tol : float
        Maximum allowed change from prior iteration required to
        declare convergence.
    family : Family
    verbose : bool, default False
        Whether to print diagnostic information during convergence.

    Returns
    -------
    beta : array-like, shape (n_features,)
    """
    client = get_distributed_client()

    loss_fn = family.pointwise_loss
    grad_fn = family.pointwise_gradient
    if regularizer is not None:
        regularizer = Regularizer.get(regularizer)
        loss_fn = regularizer.add_reg_f(loss_fn, lamduh)
        grad_fn = regularizer.add_reg_grad(grad_fn, lamduh)

    p = X.shape[1]
    beta0 = np.zeros(p)

    def compute_loss_grad(beta, X, y):
        # scipy passes a host numpy array; move it to X's backend first.
        beta = maybe_to_cupy(beta, X)
        # With a distributed client, scatter beta once instead of
        # serializing it into every task.
        scattered = scatter_array(beta, client) if client else beta
        loss, gradient = compute(loss_fn(scattered, X, y),
                                 grad_fn(scattered, X, y))
        return normalize_to_array(loss), normalize_to_array(gradient.copy())

    # Graph optimization (fusion) slows this workload down — disable it.
    with dask.config.set(fuse_ave_width=0):
        beta, _, _ = fmin_l_bfgs_b(
            compute_loss_grad, beta0, fprime=None, args=(X, y),
            iprint=(verbose > 0) - 1, pgtol=tol, maxiter=max_iter)

    return maybe_to_cupy(beta, X)
def admm(X, y, regularizer='l1', lamduh=0.1, rho=1, over_relax=1,
         max_iter=250, abstol=1e-4, reltol=1e-2, family=Logistic, **kwargs):
    """Alternating Direction Method of Multipliers.

    Solves a regularized GLM fit by splitting the data into chunks,
    solving a local subproblem per chunk (x-update), applying the
    regularizer's proximal operator to the consensus variable
    (z-update), and accumulating scaled dual variables (u-update).

    Parameters
    ----------
    X : array-like, shape (n_samples, n_features)
    y : array-like, shape (n_samples,)
    regularizer : str or Regularizer
    lamduh : float
        Regularization strength.
    rho : float
        ADMM penalty parameter.
    over_relax : float
        Over-relaxation parameter mixing new betas with the consensus z.
    max_iter : int
        Maximum number of iterations to attempt before declaring
        failure to converge.
    abstol, reltol : float
        Absolute / relative convergence tolerances.
    family : Family

    Returns
    -------
    beta : array-like, shape (n_features,)
        The consensus variable z after convergence (or max_iter).
    """
    pointwise_loss = family.pointwise_loss
    pointwise_gradient = family.pointwise_gradient
    regularizer = Regularizer.get(regularizer)

    def create_local_gradient(func):
        # Gradient of the local augmented objective:
        # grad f(beta) + rho * (beta - z + u).
        @functools.wraps(func)
        def wrapped(beta, X, y, z, u, rho):
            beta = maybe_to_cupy(beta, X)
            z = maybe_to_cupy(z, X)
            u = maybe_to_cupy(u, X)
            res = func(beta, X, y) + rho * (beta - z + u)
            return normalize_to_array(res)
        return wrapped

    def create_local_f(func):
        # Local augmented objective:
        # f(beta) + (rho / 2) * ||beta - z + u||^2.
        @functools.wraps(func)
        def wrapped(beta, X, y, z, u, rho):
            beta = maybe_to_cupy(beta, X)
            z = maybe_to_cupy(z, X)
            u = maybe_to_cupy(u, X)
            res = func(beta, X, y) + (rho / 2) * np.dot(beta - z + u, beta - z + u)
            return normalize_to_array(res)
        return wrapped

    f = create_local_f(pointwise_loss)
    fprime = create_local_gradient(pointwise_gradient)

    # Non-dask inputs are treated as a single chunk.
    nchunks = getattr(X, 'npartitions', 1)
    # nchunks = X.npartitions
    (n, p) = X.shape
    # XD = X.to_delayed().flatten().tolist()
    # yD = y.to_delayed().flatten().tolist()
    # One delayed data chunk per partition; rechunk so each partition
    # spans all columns.
    if isinstance(X, da.Array):
        XD = X.rechunk((None, X.shape[-1])).to_delayed().flatten().tolist()
    else:
        XD = [X]
    # NOTE(review): the (None, y.shape[-1]) chunk spec mirrors X's, but y
    # is documented as 1-d — confirm this spec is intended for 1-d y.
    if isinstance(y, da.Array):
        yD = y.rechunk((None, y.shape[-1])).to_delayed().flatten().tolist()
    else:
        yD = [y]

    # Consensus variable, per-chunk scaled duals, and per-chunk betas.
    z = np.zeros(p)
    u = np.array([np.zeros(p) for i in range(nchunks)])
    betas = np.array([np.ones(p) for i in range(nchunks)])

    for k in range(max_iter):
        # x-update step: solve each chunk's local subproblem in parallel.
        new_betas = [
            delayed(local_update)(xx, yy, bb, z, uu, rho, f=f, fprime=fprime)
            for xx, yy, bb, uu in zip(XD, yD, betas, u)
        ]
        new_betas = np.array(da.compute(*new_betas))

        # Over-relaxation: mix the fresh betas with the previous consensus.
        beta_hat = over_relax * new_betas + (1 - over_relax) * z

        # z-update step: proximal operator of the regularizer applied to
        # the mean of (beta_hat + u) across chunks.
        zold = z.copy()
        ztilde = np.mean(beta_hat + np.array(u), axis=0)
        z = regularizer.proximal_operator(ztilde, lamduh / (rho * nchunks))

        # u-update step: accumulate each chunk's consensus violation
        # (in-place; z broadcasts across the chunk axis).
        u += beta_hat - z

        # check for convergence: primal residual (betas vs consensus) and
        # dual residual (change in consensus), against mixed
        # absolute/relative tolerances.
        primal_res = np.linalg.norm(new_betas - z)
        dual_res = np.linalg.norm(rho * (z - zold))

        eps_pri = np.sqrt(p * nchunks) * abstol + reltol * np.maximum(
            np.linalg.norm(new_betas), np.sqrt(nchunks) * np.linalg.norm(z))
        eps_dual = np.sqrt(p * nchunks) * abstol + \
            reltol * np.linalg.norm(rho * u)

        if primal_res < eps_pri and dual_res < eps_dual:
            break

    return maybe_to_cupy(z, X)