def solve(self, X0, B, mmv, max_iter, callback=None):
    """Solve a linear system with the conjugate gradient method.

    Each column of `B` is handled as its own right-hand side: residual
    norms and step sizes are reduced over dimension 0 only.

    Parameters
    ----------
    X0 : torch.Tensor or None
        Starting point. When None the iterations start from a zero tensor
        with the size, dtype and device of `B`. When given, it is updated
        in-place and is the very object which gets returned.
    B : torch.Tensor
        Right-hand side of the system.
    mmv : callable
        Applies the system matrix to a tensor and returns the product.
    max_iter : int
        Maximum number of iterations to run.
    callback : callable or None
        If not None it is invoked at the end of every completed iteration as
        ``callback(iteration, X, elapsed)``. `iteration` starts from 1 and
        `elapsed` is the cumulative solver time in seconds (time spent inside
        the callback is not counted). It is not invoked for the iteration
        at which convergence is detected.

    Returns
    -------
    X : torch.Tensor
        The solution reached at convergence or after `max_iter` iterations.
    """
    tic = time.time()

    if X0 is not None:
        R = B - mmv(X0)
        X = X0
    else:
        R = copy_same_stride(B)
        X = create_same_stride(B.size(), B, B.dtype, B.device)
        X.fill_(0.0)

    m_eps = self.params.cg_epsilon(X.dtype)

    P = R
    Rsold = R.pow(2).sum(dim=0)

    e_train = time.time() - tic

    for it in range(1, max_iter + 1):
        with TicToc("Chol Iter", debug=False):
            tic = time.time()
            AP = mmv(P)
            # m_eps guards against a vanishing denominator.
            alpha = Rsold / (torch.sum(P * AP, dim=0) + m_eps)
            X.addmm_(P, torch.diag(alpha))

            if it % self.params.cg_full_gradient_every != 0:
                # Cheap recursive update of the residual.
                R = R - torch.mm(AP, torch.diag(alpha))
            else:
                # Periodically recompute the residual from scratch.
                if X.is_cuda:
                    # addmm_ may not be finished yet causing mmv to get stale inputs.
                    torch.cuda.synchronize()
                R = B - mmv(X)

            Rsnew = R.pow(2).sum(dim=0)
            if Rsnew.abs().max().sqrt() < self.params.cg_tolerance:
                print("Stopping conjugate gradient descent at "
                      "iteration %d. Solution has converged." % it)
                break

            P = R + torch.mm(P, torch.diag(Rsnew / (Rsold + m_eps)))
            if P.is_cuda:
                # P must be synced so that it's correct for mmv in next iter.
                torch.cuda.synchronize()
            Rsold = Rsnew

        e_train += time.time() - tic
        with TicToc("Chol callback", debug=False):
            if callback is not None:
                callback(it, X, e_train)

    return X
def solve(self, X, M, Y, _lambda, initial_solution, max_iter, callback=None):
    """Set up the preconditioned linear system and solve it with `self.optimizer`.

    Parameters
    ----------
    X
        The data tensor. If `M` is None, `X` is used directly as the
        precomputed kernel matrix (``Knm``).
    M
        The centers, or None when `X` already is the kernel matrix.
    Y
        The targets. They are not modified by this method.
    _lambda
        Penalty, forwarded to the matrix-vector product function.
    initial_solution
        Starting point handed to the optimizer (may be None).
    max_iter
        Maximum number of optimizer iterations.
    callback
        Optional callback forwarded to the optimizer.

    Returns
    -------
    The solution returned by ``self.optimizer.solve``.
    """
    n = X.size(0)
    Knm = X if M is None else None

    cuda_inputs: bool = X.is_cuda
    device = X.device
    stream = get_non_default_stream(device) if cuda_inputs else None

    # Note that if we don't have CUDA this still works with stream=None.
    with torch.cuda.stream(stream):
        with TicToc("ConjGrad preparation", False):
            # Out-of-place division: Y must be preserved.
            y_over_n = Y / n
            if self.is_weighted:
                y_weights = self.weight_fn(Y)
                # We own y_over_n, so it can be scaled in-place.
                y_over_n.mul_(y_weights)

            # Right-hand side of the linear system
            if Knm is None:
                rhs = self.kernel.dmmv(X, M, None, y_over_n, opt=self.params)
            else:
                rhs = incore_fmmv(Knm, y_over_n, None, transpose=True, opt=self.params)
            rhs = self.preconditioner.apply_t(rhs)

            # Matrix-vector product with every argument but the solution bound.
            mmv_kwargs = dict(penalty=_lambda, X=X, M=M, Knm=Knm)
            if self.is_weighted:
                mmv = functools.partial(self.weighted_falkon_mmv,
                                        y_weights=y_weights, **mmv_kwargs)
            else:
                mmv = functools.partial(self.falkon_mmv, **mmv_kwargs)

        # Run the conjugate gradient solver
        beta = self.optimizer.solve(initial_solution, rhs, mmv, max_iter, callback)
    return beta
def solve(self, X0, B, mmv, max_iter, callback=None):
    """Solve a linear system with the conjugate gradient method.

    Each column of `B` is handled as its own right-hand side: residual
    norms and step sizes are reduced over dimension 0 only.

    Parameters
    ----------
    X0 : torch.Tensor or None
        Starting point. When None the iterations start from a zero tensor
        with the size, dtype and device of `B`. When given, it is updated
        in-place and is the very object which gets returned.
    B : torch.Tensor
        Right-hand side of the system.
    mmv : callable
        Applies the system matrix to a tensor and returns the product.
    max_iter : int
        Maximum number of iterations to run.
    callback : callable or None
        If not None it is invoked at the end of every completed iteration as
        ``callback(iteration, X, elapsed)``. `iteration` starts from 1 and
        `elapsed` is the cumulative solver time in seconds (time spent inside
        the callback is not counted). It is not invoked for the iteration
        at which convergence is detected.

    Returns
    -------
    X : torch.Tensor
        The solution reached at convergence or after `max_iter` iterations.
    """
    t_start = time.time()

    if X0 is None:
        R = copy_same_stride(B)
        X = create_same_stride(B.size(), B, B.dtype, B.device)
        X.fill_(0.0)
    else:
        R = B - mmv(X0)
        X = X0

    m_eps = self.params.cg_epsilon(X.dtype)

    P = R
    Rsold = torch.sum(R.pow(2), dim=0)

    e_train = time.time() - t_start

    for i in range(max_iter):
        with TicToc("Chol Iter", debug=False):
            t_start = time.time()
            AP = mmv(P)
            # m_eps guards against a vanishing denominator.
            alpha = Rsold / (torch.sum(P * AP, dim=0) + m_eps)
            X.addmm_(P, torch.diag(alpha))

            if (i + 1) % self.params.cg_full_gradient_every == 0:
                # Periodically recompute the residual from scratch.
                if X.is_cuda:
                    # CUDA kernels are launched asynchronously: if `mmv` works
                    # on other streams, addmm_ may not be finished yet and mmv
                    # would read a stale X.
                    torch.cuda.synchronize()
                R = B - mmv(X)
            else:
                # Cheap recursive update of the residual.
                R = R - torch.mm(AP, torch.diag(alpha))

            Rsnew = torch.sum(R.pow(2), dim=0)
            if Rsnew.abs().max().sqrt() < self.params.cg_tolerance:
                print("Stopping conjugate gradient descent at "
                      "iteration %d. Solution has converged." % (i + 1))
                break

            P = R + torch.mm(P, torch.diag(Rsnew / (Rsold + m_eps)))
            if P.is_cuda:
                # P must be fully computed before mmv reads it in the next
                # iteration (same asynchronous-launch hazard as above).
                torch.cuda.synchronize()
            Rsold = Rsnew

        e_iter = time.time() - t_start
        e_train += e_iter
        with TicToc("Chol callback", debug=False):
            if callback is not None:
                callback(i + 1, X, e_train)

    return X
def solve(self, X, M, Y, _lambda, initial_solution, max_iter, callback=None):
    """Set up the preconditioned linear system and solve it with `self.optimizer`.

    Parameters
    ----------
    X
        The data tensor. If `M` is None, `X` is used directly as the
        precomputed kernel matrix (``Knm``).
    M
        The centers, or None when `X` already is the kernel matrix.
    Y
        The targets.
    _lambda
        Penalty applied inside the matrix-vector product.
    initial_solution
        Starting point handed to the optimizer (may be None).
    max_iter
        Maximum number of optimizer iterations.
    callback
        Optional callback forwarded to the optimizer.

    Returns
    -------
    The solution returned by ``self.optimizer.solve``.
    """
    num_samples = X.size(0)
    precond = self.preconditioner

    with TicToc("ConjGrad preparation", False):
        Knm = X if M is None else None

        # Right-hand side of the linear system
        if Knm is None:
            rhs = self.kernel.dmmv(X, M, None, Y / num_samples, opt=self.params)
        else:
            rhs = incore_fmmv(Knm, Y / num_samples, None, transpose=True, opt=self.params)
        rhs = precond.apply_t(rhs)

        # The side stream is only needed (and only read) for CUDA inputs.
        side_stream = torch.cuda.Stream(X.device) if X.is_cuda else None

        def mmv(sol):
            """Matrix-vector product performed at each solver iteration."""
            with TicToc("MMV", False):
                v = precond.invA(sol)
                v_t = precond.invT(v)

                if Knm is None:
                    cc = self.kernel.dmmv(X, M, v_t, None, opt=self.params)
                else:
                    cc = incore_fdmmv(Knm, v_t, None, opt=self.params)

                if not X.is_cuda:
                    # CPU path: out-of-place arithmetic.
                    return precond.invAt(precond.invTt(cc / num_samples) + _lambda * v)

                with torch.cuda.stream(side_stream), torch.cuda.device(X.device):
                    # We must sync before calls to prec.inv* which use a different stream
                    scaled_cc = cc.div_(num_samples)
                    scaled_v = v.mul_(_lambda)
                    side_stream.synchronize()
                    scaled_cc = precond.invTt(scaled_cc).add_(scaled_v)
                    side_stream.synchronize()
                    return precond.invAt(scaled_cc)

    # Run the conjugate gradient solver
    beta = self.optimizer.solve(initial_solution, rhs, mmv, max_iter, callback)
    return beta
def run_logistic_falkon(dset: Dataset, algorithm: Algorithm, dtype: Optional[DataType],
                        iter_list: List[int], penalty_list: List[float], num_centers: int,
                        kernel_sigma: float, kernel: str, seed: int):
    """Train a LogisticFalkon model on a dataset and evaluate it.

    The random seeds of torch and numpy are set to `seed`. The training data
    returned by the dataset loader is pinned before fitting. After training,
    `test_model` is called on both the test and the training data.

    Parameters
    ----------
    dset
        Dataset identifier, used to look up the error functions and the loader.
    algorithm
        Algorithm identifier, only used in messages.
    dtype
        Data type of the loaded data. ``DataType.float64`` if None.
    iter_list, penalty_list
        Forwarded to the `LogisticFalkon` constructor.
    num_centers
        Forwarded to the model as ``M``.
    kernel_sigma
        Parameter of the chosen kernel.
    kernel
        One of 'gaussian', 'laplacian', 'linear' (case-insensitive).
    seed
        Random seed.

    Raises
    ------
    ValueError
        If `kernel` is not one of the recognised names.
    """
    import torch
    import falkon
    from falkon import kernels
    from falkon.models import logistic_falkon
    from falkon.gsc_losses import LogisticLoss
    from falkon.utils import TicToc

    torch.manual_seed(seed)
    np.random.seed(seed)

    # Default data type
    dtype = DataType.float64 if dtype is None else dtype

    # Kernel: the constructors are wrapped so that only the chosen one runs.
    kernel_factories = {
        'gaussian': lambda: kernels.GaussianKernel(kernel_sigma),
        'laplacian': lambda: kernels.LaplacianKernel(kernel_sigma),
        'linear': lambda: kernels.LinearKernel(beta=1.0, sigma=kernel_sigma),
    }
    make_kernel = kernel_factories.get(kernel.lower())
    if make_kernel is None:
        raise ValueError("Kernel %s not understood for algorithm %s" % (kernel, algorithm))
    k = make_kernel()

    # Model
    opt = falkon.FalkonOptions(
        compute_arch_speed=False,
        no_single_kernel=True,
        pc_epsilon_32=1e-6,
        pc_epsilon_64=1e-13,
        debug=True,
    )
    loss = LogisticLoss(kernel=k)
    flk = logistic_falkon.LogisticFalkon(
        kernel=k,
        loss=loss,
        penalty_list=penalty_list,
        iter_list=iter_list,
        M=num_centers,
        seed=seed,
        error_fn=None,
        error_every=1,
        options=opt,
    )

    # Error metrics
    err_fns = get_err_fns(dset)

    # Data
    load_fn = get_load_fn(dset)
    Xtr, Ytr, Xts, Yts, kwargs = load_fn(dtype=dtype.to_numpy_dtype(), as_torch=True)
    Xtr, Ytr = Xtr.pin_memory(), Ytr.pin_memory()
    # Bind the dataset-specific keyword arguments to each error function.
    err_fns = [functools.partial(fn, **kwargs) for fn in err_fns]

    with TicToc("LOGISTIC FALKON ALGORITHM"):
        flk.error_fn = err_fns[0]
        print("Starting to train model %s on data %s" % (flk, dset), flush=True)
        flk.fit(Xtr, Ytr, Xts, Yts)
    test_model(flk, f"{algorithm} on {dset}", Xts, Yts, Xtr, Ytr, err_fns)
def init(self, X: Union[torch.Tensor, SparseTensor]):
    """Initialize the preconditioner matrix.

    This method must be called before the preconditioner can be used.

    The kernel matrix between the centers is computed into a CPU buffer and
    then overwritten step by step (see the comments below). On exit the
    following attributes are set:

     - ``self.fC``: the buffer holding the result of the last factorization,
     - ``self.dT``: the diagonal read right after the first Cholesky,
     - ``self.dA``: the diagonal read right after the second Cholesky.

    Parameters
    ----------
    X : MxD tensor
        The matrix of Nystroem centers
    """
    dtype = X.dtype
    eps = self.params.pc_epsilon(X.dtype)
    M = X.size(0)

    with TicToc("Kernel", debug=self.params.debug):
        # The buffer is always allocated on the CPU (pinned if CUDA is used).
        if isinstance(X, torch.Tensor):
            C = create_same_stride((M, M), X, dtype=dtype, device='cpu',
                                   pin_memory=self._use_cuda)
        else:
            # If sparse tensor we need fortran for kernel calculation
            C = create_fortran((M, M), dtype=dtype, device='cpu',
                               pin_memory=self._use_cuda)
        self.kernel(X, X, out=C, opt=self.params)
    # NumPy array obtained from C (Tensor.numpy() shares memory with the
    # tensor; the C.diag() calls below rely on seeing the in-place updates
    # done on self.fC -- NOTE(review): this assumes the wrappers below work
    # in-place on the buffer, confirm).
    self.fC = C.numpy()
    if not is_f_contig(C):
        # Transposed view so that the array is handed over in Fortran order.
        self.fC = self.fC.T

    with TicToc("Cholesky 1", debug=self.params.debug):
        # Compute T: lower(fC) = T.T
        inplace_add_diag(self.fC, eps * M)
        self.fC = potrf_wrapper(self.fC, clean=False, upper=False,
                                use_cuda=self._use_cuda, opt=self.params)
        # Save the diagonal which will be overwritten when computing A
        self.dT = C.diag()

    with TicToc("Copy triangular", debug=self.params.debug):
        # Copy lower(fC) to upper(fC):  upper(fC) = T.
        copy_triang(self.fC, upper=False)

    # The CUDA and CPU code paths call LAUUM on opposite triangles.
    if self._use_cuda:
        with TicToc("LAUUM", debug=self.params.debug):
            # Product upper(fC) @ upper(fC).T : lower(fC) = T @ T.T
            self.fC = lauum_wrapper(self.fC, upper=True, use_cuda=self._use_cuda,
                                    opt=self.params)
    else:
        with TicToc("LAUUM", debug=self.params.debug):
            # Product lower(fC).T @ lower(fC) : lower(fC) = T @ T.T
            self.fC = lauum_wrapper(self.fC, upper=False, use_cuda=self._use_cuda,
                                    opt=self.params)

    with TicToc("Cholesky 2", debug=self.params.debug):
        # lower(fC) = 1/M * T @ T.T
        self.fC = mul_triang(self.fC, upper=False, preserve_diag=False, multiplier=1 / M)
        # lower(fC) = 1/M * T @ T.T + lambda * I
        inplace_add_diag(self.fC, self._lambda)
        # Cholesky on lower(fC) : lower(fC) = A.T
        self.fC = potrf_wrapper(self.fC, clean=False, upper=False,
                                use_cuda=self._use_cuda, opt=self.params)
        # Diagonal after the second factorization.
        self.dA = C.diag()
def falkon_mmv(self, sol, penalty, X, M, Knm):
    """Preconditioned matrix-vector product used by the solver iterations.

    Computes ``AT^-1 @ (TT^-1 @ (cc / n) + penalty * v)`` where
    ``v = prec.invA(sol)``, ``cc`` is the kernel product applied to
    ``prec.invT(v)`` and ``n`` is the first dimension of `Knm` (if given)
    or of `X`.

    Parameters
    ----------
    sol
        The current solution.
    penalty
        Regularization multiplying ``v``.
    X, M
        Data and centers, used for the kernel product when `Knm` is None.
    Knm
        Precomputed kernel matrix or None.

    Returns
    -------
    The result of the product (output of ``prec.invAt``).
    """
    if Knm is None:
        num_samples = X.shape[0]
    else:
        num_samples = Knm.shape[0]
    precond = self.preconditioner

    with TicToc("MMV", False):
        v = precond.invA(sol)
        v_t = precond.invT(v)

        if Knm is None:
            cc = self.kernel.dmmv(X, M, v_t, None, opt=self.params)
        else:
            cc = incore_fdmmv(Knm, v_t, None, opt=self.params)

        # Both intermediate results are scaled in-place.
        scaled_cc = cc.div_(num_samples)
        scaled_v = v.mul_(penalty)
        inner = precond.invTt(scaled_cc).add_(scaled_v)
        return precond.invAt(inner)
def mmv(sol):
    """Preconditioned matrix-vector product for one solver iteration.

    `prec`, `Knm`, `X`, `M`, `n`, `_lambda`, `s1` and `self` are read from
    the enclosing scope. For CUDA inputs the arithmetic is done in-place
    within stream `s1`, synchronizing the stream after each step.
    """
    with TicToc("MMV", False):
        v = prec.invA(sol)
        v_t = prec.invT(v)

        if Knm is None:
            cc = self.kernel.dmmv(X, M, v_t, None, opt=self.params)
        else:
            cc = incore_fdmmv(Knm, v_t, None, opt=self.params)

        if not X.is_cuda:
            # CPU path: out-of-place arithmetic, no stream handling needed.
            return prec.invAt(prec.invTt(cc / n) + _lambda * v)

        with torch.cuda.stream(s1):
            scaled_cc = cc.div_(n)
            scaled_v = v.mul_(_lambda)
            s1.synchronize()
            scaled_cc = prec.invTt(scaled_cc).add_(scaled_v)
            s1.synchronize()
            result = prec.invAt(scaled_cc)
            s1.synchronize()
            return result
def mmv(sol):
    """Preconditioned matrix-vector product for one solver iteration.

    `prec`, `Knm`, `X`, `M`, `n`, `_lambda`, `s1` and `self` are read from
    the enclosing scope. For CUDA inputs the arithmetic is done in-place
    within stream `s1` on the device of `X`.
    """
    with TicToc("MMV", False):
        v = prec.invA(sol)
        v_t = prec.invT(v)

        if Knm is None:
            cc = self.kernel.dmmv(X, M, v_t, None, opt=self.params)
        else:
            cc = incore_fdmmv(Knm, v_t, None, opt=self.params)

        if not X.is_cuda:
            # CPU path: out-of-place arithmetic, no stream handling needed.
            return prec.invAt(prec.invTt(cc / n) + _lambda * v)

        with torch.cuda.stream(s1), torch.cuda.device(X.device):
            # We must sync before calls to prec.inv* which use a different stream
            scaled_cc = cc.div_(n)
            scaled_v = v.mul_(_lambda)
            s1.synchronize()
            scaled_cc = prec.invTt(scaled_cc).add_(scaled_v)
            s1.synchronize()
            return prec.invAt(scaled_cc)
def run_falkon(dset: Dataset, algorithm: Algorithm, dtype: Optional[DataType],
               num_iter: int, num_centers: int, kernel_sigma: float, penalty: float,
               kernel: str, kfold: int, seed: int):
    """Train a Falkon model on a dataset and evaluate it.

    With ``kfold == 1`` a single train/test split is loaded, the training
    data is pinned and the model is fitted once. Otherwise the model is
    fitted on each fold produced by the cross-validation loader, and mean
    and standard deviation of every error metric are printed at the end.

    Parameters
    ----------
    dset
        Dataset identifier, used to look up the error functions and the loader.
    algorithm
        Algorithm identifier, only used in messages.
    dtype
        Data type of the loaded data. ``DataType.float64`` if None.
    num_iter
        Forwarded to the model as ``maxiter``.
    num_centers
        Forwarded to the model as ``M``.
    kernel_sigma
        Parameter of the chosen kernel.
    penalty
        Forwarded to the model as ``penalty``.
    kernel
        One of 'gaussian', 'laplacian', 'linear' (case-insensitive).
    kfold
        1 for a single split, otherwise the number of cross-validation folds.
    seed
        Random seed for torch and numpy.

    Raises
    ------
    ValueError
        If `kernel` is not one of the recognised names.
    """
    import torch
    from falkon import kernels
    from falkon.models import falkon
    from falkon.utils import TicToc

    torch.manual_seed(seed)
    np.random.seed(seed)

    # Data types
    if dtype is None:
        dtype = DataType.float64

    # Arguments
    kernel_name = kernel.lower()
    if kernel_name == 'gaussian':
        k = kernels.GaussianKernel(kernel_sigma)
    elif kernel_name == 'laplacian':
        k = kernels.LaplacianKernel(kernel_sigma)
    elif kernel_name == 'linear':
        k = kernels.LinearKernel(beta=1.0, sigma=kernel_sigma)
    else:
        raise ValueError("Kernel %s not understood for algorithm %s" % (kernel, algorithm))

    # NOTE(review): `falkon` is the `falkon.models.falkon` module here; this
    # assumes it exposes `FalkonOptions` -- confirm.
    opt = falkon.FalkonOptions(
        compute_arch_speed=False,
        no_single_kernel=True,
        pc_epsilon_32=1e-6,
        pc_epsilon_64=1e-13,
        debug=True)
    flk = falkon.Falkon(
        kernel=k,
        penalty=penalty,
        M=num_centers,
        maxiter=num_iter,
        seed=seed,
        error_fn=None,
        error_every=1,
        options=opt)

    # Error metrics (unbound: dataset-specific kwargs are attached below)
    err_fns = get_err_fns(dset)

    if kfold == 1:
        # Load data
        load_fn = get_load_fn(dset)
        Xtr, Ytr, Xts, Yts, kwargs = load_fn(dtype=dtype.to_numpy_dtype(), as_torch=True)
        Xtr = Xtr.pin_memory()
        Ytr = Ytr.pin_memory()
        # Throw-away CUDA allocation made before the timed section.
        temp_test = torch.empty(3, 3).cuda()
        del temp_test
        err_fns = [functools.partial(fn, **kwargs) for fn in err_fns]
        with TicToc("FALKON ALGORITHM"):
            flk.error_fn = err_fns[0]
            print("Starting to train model %s on data %s" % (flk, dset), flush=True)
            flk.fit(Xtr, Ytr, Xts, Yts)
        test_model(flk, f"{algorithm} on {dset}", Xts, Yts, Xtr, Ytr, err_fns)
    else:
        print("Will train model %s on data %s with %d-fold CV" % (flk, dset, kfold), flush=True)
        load_fn = get_cv_fn(dset)
        test_errs, train_errs = [], []
        folds = load_fn(k=kfold, dtype=dtype.to_numpy_dtype(), as_torch=True)
        for fold, (Xtr, Ytr, Xts, Yts, kwargs) in enumerate(folds):
            # Bind this fold's kwargs to the *base* error functions, so that
            # nothing leaks from one fold into the next.
            fold_err_fns = [functools.partial(fn, **kwargs) for fn in err_fns]
            with TicToc("FALKON ALGORITHM (fold %d)" % (fold)):
                # The error function goes in `error_fn` (as in the single
                # split branch); `error_every` is the reporting interval.
                flk.error_fn = fold_err_fns[0]
                flk.fit(Xtr, Ytr, Xts, Yts)
            c_test_errs, c_train_errs = test_model(
                flk, f"{algorithm} on {dset}", Xts, Yts, Xtr, Ytr, fold_err_fns)
            train_errs.append(c_train_errs)
            test_errs.append(c_test_errs)

        print("Full errors: Test %s - Train %s" % (test_errs, train_errs))
        print()
        print("%d-Fold Error Report" % (kfold))
        for err_fn_i in range(len(err_fns)):
            fold_test = [e[err_fn_i] for e in test_errs]
            fold_train = [e[err_fn_i] for e in train_errs]
            print("Final test errors: %.4f +- %.4f" % (np.mean(fold_test), np.std(fold_test)))
            print("Final train errors: %.4f +- %.4f" % (np.mean(fold_train), np.std(fold_train)))
            print()
def fit(self,
        X: torch.Tensor,
        Y: torch.Tensor,
        Xts: Optional[torch.Tensor] = None,
        Yts: Optional[torch.Tensor] = None):
    """Fits the Falkon KRR model.

    For every tensor below, passing it in Fortran order (i.e.
    column-contiguous) avoids an extra copy of the data.

    Parameters
    -----------
    X : torch.Tensor (2D)
        Training data, of shape [num_samples, num_dimensions].
    Y : torch.Tensor (1D or 2D)
        Training targets, of shape [num_samples, num_outputs]. For
        classification problems Y can be encoded as a one-hot vector.
        A 1D tensor is reshaped to a single column.
    Xts : torch.Tensor (2D) or None
        Validation data, of shape [num_test_samples, num_dimensions].
        If validation data is provided and `error_fn` was specified when
        creating the model, the validation error is printed during the
        optimization iterations.
    Yts : torch.Tensor (1D or 2D) or None
        Validation targets, of shape [num_test_samples, num_outputs].
        Used together with `Xts`.

    Returns
    --------
    model: Falkon
        The fitted model

    Raises
    ------
    ValueError
        If X and Y have a different number of samples, or Y is neither
        1D nor 2D.
    TypeError
        If X and Y have different data types.
    """
    if X.size(0) != Y.size(0):
        raise ValueError("X and Y must have the same number of "
                         "samples (found %d and %d)" % (X.size(0), Y.size(0)))
    if Y.dim() == 1:
        Y = torch.unsqueeze(Y, 1)
    if Y.dim() != 2:
        raise ValueError("Y is expected 1D or 2D. Found %dD." % (Y.dim()))
    if not check_same_dtype(X, Y):
        raise TypeError("X and Y must have the same data-type.")

    dtype = X.dtype

    # CUDA is used for the preconditioner / the iterations only when the
    # problem is large enough.
    _use_cuda_preconditioner = (
        self.use_cuda_
        and (not self.options.cpu_preconditioner)
        and self.M >= get_min_cuda_preconditioner_size(dtype)
    )
    _use_cuda_mmv = (
        self.use_cuda_
        and X.shape[0] * X.shape[1] * self.M / self.num_gpus
        >= get_min_cuda_mmv_size(dtype)
    )

    self.fit_times_ = []
    self.ny_points_ = None
    self.alpha_ = None

    prep_start = time.time()
    ny_points = self.center_selection.select(X, None, self.M)
    if self.use_cuda_:
        ny_points = ny_points.pin_memory()

    with TicToc("Calcuating Preconditioner of size %d" % (self.M), debug=self.options.debug):
        pc_opt: FalkonOptions = dataclasses.replace(
            self.options, use_cpu=not _use_cuda_preconditioner)
        if pc_opt.debug:
            pc_device = "CPU" if pc_opt.use_cpu else ("%d GPUs" % self.num_gpus)
            print("Preconditioner will run on %s" % pc_device)
        precond = falkon.preconditioner.FalkonPreconditioner(self.penalty, self.kernel, pc_opt)
        precond.init(ny_points)

    if _use_cuda_mmv:
        # Cache must be emptied to ensure enough memory is visible to the optimizer
        torch.cuda.empty_cache()
        X = X.pin_memory()

    # Decide whether it is worthwhile to pre-compute the K_NM kernel matrix.
    # Pre-computing saves the kernel evaluations at every CG iteration but
    # costs N*M memory. The heuristic is:
    #  - if the data dimension is large, check whether RAM is sufficient;
    #  - if RAM is sufficient pre-compute, otherwise do not.
    Knm = None
    if X.size(1) > 1200:
        necessary_ram = X.size(0) * ny_points.size(0) * sizeof_dtype(dtype)
        k_opt = dataclasses.replace(self.options, use_cpu=True)
        cpu_info = get_device_info(k_opt)
        available_ram = min(k_opt.max_cpu_mem, cpu_info[-1].free_memory) * 0.9
        del k_opt

        if available_ram > necessary_ram:
            if self.options.debug:
                print("%d*%d Kernel matrix will be stored" %
                      (X.size(0), ny_points.size(0)))
            Knm = self.kernel(X, ny_points, opt=self.options)
            # TODO: Maybe we should do the same for Kts, but this complicates
            #       checks for fitting in memory
        elif self.options.debug:
            print(
                "Cannot store full kernel matrix: not enough memory (have %.2fGB, need %.2fGB)" %
                (available_ram / 2 ** 30, necessary_ram / 2 ** 30))

    # First entry of fit_times_ is the preparation time
    self.fit_times_.append(time.time() - prep_start)

    # Callback run at the end of each conjugate gradient iteration: it
    # computes and displays the validation (or training) error.
    report_cback = None
    if self.error_fn is not None and self.error_every is not None:
        def report_cback(it, beta, train_time):
            self.fit_times_.append(self.fit_times_[0] + train_time)
            if it % self.error_every != 0:
                print("Iteration %3d - Elapsed %.1fs" % (it, self.fit_times_[-1]), flush=True)
                return

            alpha = precond.apply(beta)
            # The error is computed on validation data when available,
            # on training data otherwise.
            if Xts is not None and Yts is not None:
                err_str = "validation"
                pred = self._predict(Xts, ny_points, alpha)
                err = self.error_fn(Yts, pred)
            else:
                err_str = "training"
                pred = self._predict(X, ny_points, alpha)
                err = self.error_fn(Y, pred)
            # Error functions may return a (value, name) pair.
            err_name = "error"
            if isinstance(err, tuple) and len(err) == 2:
                err, err_name = err
            print("Iteration %3d - Elapsed %.1fs - %s %s: %.4f" %
                  (it, self.fit_times_[-1], err_str, err_name, err), flush=True)

    # Start with the falkon algorithm
    with TicToc('Computing Falkon iterations', debug=self.options.debug):
        o_opt: FalkonOptions = dataclasses.replace(self.options, use_cpu=not _use_cuda_mmv)
        if o_opt.debug:
            optim_device = "CPU" if o_opt.use_cpu else ("%d GPUs" % self.num_gpus)
            print("Optimizer will run on %s" % optim_device, flush=True)
        optim = falkon.optim.FalkonConjugateGradient(self.kernel, precond, o_opt)
        # With a stored kernel matrix the solver receives it in place of
        # the data, and no centers.
        if Knm is None:
            solve_data, solve_centers = X, ny_points
        else:
            solve_data, solve_centers = Knm, None
        beta = optim.solve(
            solve_data, solve_centers, Y, self.penalty, initial_solution=None,
            max_iter=self.maxiter, callback=report_cback)

        self.alpha_ = precond.apply(beta)
        self.ny_points_ = ny_points
    return self
def fit(self,
        X: torch.Tensor,
        Y: torch.Tensor,
        Xts: Optional[torch.Tensor] = None,
        Yts: Optional[torch.Tensor] = None):
    """Fits the Falkon Kernel Logistic Regression model.

    For every tensor below, passing it in Fortran order (i.e.
    column-contiguous) avoids an extra copy of the data.

    Parameters
    -----------
    X : torch.Tensor (2D)
        Training data, of shape [num_samples, num_dimensions].
    Y : torch.Tensor (1D or 2D)
        Training targets, of shape [num_samples, num_outputs]. For
        classification problems Y can be encoded as a one-hot vector.
    Xts : torch.Tensor (2D) or None
        Validation data, of shape [num_test_samples, num_dimensions].
        If validation data is provided and `error_fn` was specified when
        creating the model, the validation error is printed during the
        optimization iterations.
    Yts : torch.Tensor (1D or 2D) or None
        Validation targets, of shape [num_test_samples, num_outputs].
        Used together with `Xts`.

    Returns
    --------
    model: LogisticFalkon
        The fitted model
    """
    X, Y, Xts, Yts = self._check_fit_inputs(X, Y, Xts, Yts)

    dtype = X.dtype
    self.fit_times_ = []

    tic = time.time()
    ny_X, ny_Y = self.center_selection.select(X, Y, self.M)
    if self.use_cuda_:
        ny_X = ny_X.pin_memory()

    # beta is the temporary iterative solution
    beta = torch.zeros(ny_X.shape[0], 1, dtype=dtype)
    optim = ConjugateGradient(opt=self.options)
    precond = None

    # Callback which computes and displays error and loss, on validation
    # data when available and on training data otherwise.
    report_cback = None
    if self.error_fn is not None and self.error_every is not None:
        def report_cback(iteration, x, pc, train_time):
            self.fit_times_.append(train_time)
            if iteration % self.error_every != 0:
                print("Iteration %3d - Elapsed %.1fs" % (iteration, self.fit_times_[-1]),
                      flush=True)
                return

            coeff = pc.invT(x)
            if Xts is not None and Yts is not None:
                err_str = "validation"
                pred = self._predict(Xts, ny_X, coeff)
                err = self.error_fn(Yts, pred)
                loss = torch.mean(self.loss(Yts, pred)).item()
            else:
                err_str = "training"
                pred = self._predict(X, ny_X, coeff)
                err = self.error_fn(Y, pred)
                loss = torch.mean(self.loss(Y, pred)).item()
            # Error functions may return a (value, name) pair.
            err_name = "error"
            if isinstance(err, tuple) and len(err) == 2:
                err, err_name = err
            print(
                f"Iteration {iteration:3d} - Elapsed {self.fit_times_[-1]:.2f}s - "
                f"{err_str} loss {loss:.4f} - "
                f"{err_str} {err_name} {err:.4f} ",
                flush=True)

    elapsed = 0.0
    # One outer step per penalty; iter_list[it] bounds its CG iterations.
    for it, penalty in enumerate(self.penalty_list):
        max_iter = self.iter_list[it]
        print("Iteration %d - penalty %e - sub-iterations %d" % (it, penalty, max_iter),
              flush=True)

        with TicToc("Preconditioner", self.options.debug):
            # The preconditioner object is created once and re-initialized
            # at every outer step.
            if precond is None:
                precond = falkon.preconditioner.LogisticPreconditioner(
                    self.kernel, self.loss, self.options)
            precond.init(ny_X, ny_Y, beta, penalty, X.shape[0])
        if self.use_cuda_:
            torch.cuda.empty_cache()

        with TicToc("Gradient", self.options.debug):
            # Gradient computation
            knmp_grad, inner_mmv = self.loss.knmp_grad(
                X, ny_X, Y, precond.invT(beta), opt=self.options)
            grad_p = precond.invAt(precond.invTt(knmp_grad).add_(penalty * beta))

        with TicToc("Optim", self.options.debug):
            # MMV operation for CG
            def mmv(sol):
                sol_a = precond.invA(sol)
                knmp_hess = self.loss.knmp_hess(
                    X, ny_X, Y, inner_mmv, precond.invT(sol_a), opt=self.options)
                return precond.invAt(precond.invTt(knmp_hess).add_(sol_a.mul_(penalty)))

            optim_out = optim.solve(X0=None, B=grad_p, mmv=mmv, max_iter=max_iter,
                                    callback=None)
            beta -= precond.invA(optim_out)

        # The time spent inside the callback is excluded from the elapsed time.
        elapsed += time.time() - tic
        if report_cback is not None:
            report_cback(it, beta, precond, train_time=elapsed)
        tic = time.time()

    elapsed += time.time() - tic
    if report_cback is not None:
        report_cback(len(self.penalty_list), beta, precond, train_time=elapsed)
    self.alpha_ = precond.invT(beta)
    self.ny_points_ = ny_X
    return self
def fit(self,
        X: torch.Tensor,
        Y: torch.Tensor,
        Xts: Optional[torch.Tensor] = None,
        Yts: Optional[torch.Tensor] = None):
    """Fits the Falkon KRR model.

    For every tensor below, passing it in Fortran order (i.e.
    column-contiguous) avoids an extra copy of the data.

    Parameters
    -----------
    X : torch.Tensor
        Training data, of shape [num_samples, num_dimensions].
    Y : torch.Tensor
        Training targets, of shape [num_samples, num_outputs]. For
        classification problems Y can be encoded as a one-hot vector.
    Xts : torch.Tensor or None
        Validation data, of shape [num_test_samples, num_dimensions].
        If validation data is provided and `error_fn` was specified when
        creating the model, the validation error is printed during the
        optimization iterations.
    Yts : torch.Tensor or None
        Validation targets, of shape [num_test_samples, num_outputs].
        Used together with `Xts`.

    Returns
    --------
    model: Falkon
        The fitted model
    """
    X, Y, Xts, Yts = self._check_fit_inputs(X, Y, Xts, Yts)

    dtype = X.dtype
    self.fit_times_ = []
    self.ny_points_ = None
    self.alpha_ = None

    # Start training timer
    prep_start = time.time()

    # Pick Nystrom centers. The indices of the centers are only needed to
    # compute their weights.
    if self.weight_fn is None:
        # noinspection PyTypeChecker
        ny_points = self.center_selection.select(X, None)
        ny_indices = None
    else:
        # noinspection PyTupleAssignmentBalance
        ny_points, ny_indices = self.center_selection.select_indices(X, None)
    num_centers = ny_points.shape[0]

    # Decide whether to use CUDA for preconditioning and iterations, based on number of centers
    _use_cuda_preconditioner = (
        self.use_cuda_
        and (not self.options.cpu_preconditioner)
        and num_centers >= get_min_cuda_preconditioner_size(dtype, self.options)
    )
    _use_cuda_mmv = (
        self.use_cuda_
        and X.shape[0] * X.shape[1] * num_centers / self.num_gpus
        >= get_min_cuda_mmv_size(dtype, self.options)
    )

    if self.use_cuda_:
        ny_points = ny_points.pin_memory()

    with TicToc("Calcuating Preconditioner of size %d" % (num_centers),
                debug=self.options.debug):
        pc_opt: FalkonOptions = dataclasses.replace(
            self.options, use_cpu=not _use_cuda_preconditioner)
        if pc_opt.debug:
            pc_device = "CPU" if pc_opt.use_cpu else ("%d GPUs" % self.num_gpus)
            print("Preconditioner will run on %s" % pc_device)
        precond = falkon.preconditioner.FalkonPreconditioner(
            self.penalty, self.kernel, pc_opt)
        if self.weight_fn is not None:
            ny_weight_vec = self.weight_fn(Y[ny_indices])
        else:
            ny_weight_vec = None
        precond.init(ny_points, weight_vec=ny_weight_vec)

    if _use_cuda_mmv:
        # Cache must be emptied to ensure enough memory is visible to the optimizer
        torch.cuda.empty_cache()
        X = X.pin_memory()

    # K_NM storage decision, based on 90% of the usable CPU memory
    k_opt = dataclasses.replace(self.options, use_cpu=True)
    cpu_info = get_device_info(k_opt)
    available_ram = min(k_opt.max_cpu_mem, cpu_info[-1].free_memory) * 0.9
    Knm = None
    if self._can_store_knm(X, ny_points, available_ram):
        Knm = self.kernel(X, ny_points, opt=self.options)

    # First entry of fit_times_ is the preparation time
    self.fit_times_.append(time.time() - prep_start)

    # Callback run at the end of each conjugate gradient iteration, which
    # computes and displays the validation error.
    validation_cback = None
    if self.error_fn is not None and self.error_every is not None:
        validation_cback = self._get_callback_fn(X, Y, Xts, Yts, ny_points, precond)

    # Start with the falkon algorithm
    with TicToc('Computing Falkon iterations', debug=self.options.debug):
        o_opt: FalkonOptions = dataclasses.replace(self.options, use_cpu=not _use_cuda_mmv)
        if o_opt.debug:
            optim_device = "CPU" if o_opt.use_cpu else ("%d GPUs" % self.num_gpus)
            print("Optimizer will run on %s" % optim_device, flush=True)
        optim = falkon.optim.FalkonConjugateGradient(
            self.kernel, precond, o_opt, weight_fn=self.weight_fn)
        # With a stored kernel matrix the solver receives it in place of
        # the data, and no centers.
        if Knm is None:
            solve_data, solve_centers = X, ny_points
        else:
            solve_data, solve_centers = Knm, None
        beta = optim.solve(solve_data, solve_centers, Y, self.penalty,
                           initial_solution=None, max_iter=self.maxiter,
                           callback=validation_cback)

        self.alpha_ = precond.apply(beta)
        self.ny_points_ = ny_points
    return self
def init(self, X: Union[torch.Tensor, SparseTensor], weight_vec: Optional[torch.Tensor] = None):
    """Initialize the preconditioner matrix.

    This method must be called before the preconditioner can be used.

    The kernel matrix between the centers is computed into a buffer on the
    device of `X` and then overwritten step by step (see the comments
    below). On exit the following attributes are set:

     - ``self.fC``: the buffer holding the result of the last factorization,
     - ``self.dT``: the diagonal read right after the first Cholesky,
     - ``self.dA``: the diagonal read right after the second Cholesky.

    Parameters
    ----------
    X : torch.Tensor
        The (M x D) matrix of Nystroem centers
    weight_vec
        An optional vector of size (M x 1) which is used for reweighted least-squares.
        This vector should contain the weights corresponding to the Nystrom centers.
        NOTE: its square root is taken in-place (``sqrt_``), so the tensor
        passed by the caller is modified.

    Raises
    ------
    RuntimeError
        If `X` is a CUDA tensor but the preconditioner was configured
        without CUDA.
    ValueError
        If `weight_vec` is on a different device than `X`, or its first
        dimension differs from that of `X`.
    """
    if X.is_cuda and not self._use_cuda:
        raise RuntimeError("use_cuda is set to False, but data is CUDA tensor. "
                           "Check your options.")
    if weight_vec is not None and not check_same_device(X, weight_vec):
        raise ValueError(f"Weights and data are not on the same device "
                         f"({weight_vec.device}, {X.device})")
    if weight_vec is not None and weight_vec.shape[0] != X.shape[0]:
        raise ValueError(f"Weights and Nystrom centers should have the same first dimension. "
                         f"Found instead {weight_vec.shape[0]}, {X.shape[0]}.")

    dtype = X.dtype
    dev = X.device
    eps = self.params.pc_epsilon(X.dtype)
    M = X.size(0)

    with TicToc("Kernel", debug=self.params.debug):
        # The buffer lives on the same device as the centers.
        if isinstance(X, torch.Tensor):
            C = create_same_stride((M, M), X, dtype=dtype, device=dev,
                                   pin_memory=self._use_cuda)
        else:
            # If sparse tensor we need fortran for kernel calculation
            C = create_fortran((M, M), dtype=dtype, device=dev, pin_memory=self._use_cuda)
        self.kernel(X, X, out=C, opt=self.params)
    if not is_f_contig(C):
        # Transposed view so that the matrix is handed over in Fortran order.
        C = C.T

    with TicToc("Cholesky 1", debug=self.params.debug):
        # Compute T: lower(fC) = T.T
        inplace_add_diag_th(C, eps * M)
        C = potrf_wrapper(C, clean=False, upper=False,
                          use_cuda=self._use_cuda, opt=self.params)
        # Save the diagonal which will be overwritten when computing A
        self.dT = C.diag()

    with TicToc("Copy triangular", debug=self.params.debug):
        # Copy lower(fC) to upper(fC):  upper(fC) = T.
        copy_triang(C, upper=False)

    # Weighted least-squares needs to weight the A matrix. We can weigh once before LAUUM,
    # but since CUDA-LAUUM touches both sides of C, weighting before LAUUM will also modify
    # the matrix T. Therefore for CUDA inputs we weigh twice after LAUUM!
    if weight_vec is not None and not self._use_cuda:
        with TicToc("Weighting(CPU)", debug=self.params.debug):
            # In-place square root: modifies the caller's tensor.
            weight_vec.sqrt_()
            vec_mul_triang(C, weight_vec, side=1, upper=False)

    # The CUDA and CPU code paths call LAUUM on opposite triangles.
    if self._use_cuda:
        with TicToc("LAUUM(CUDA)", debug=self.params.debug):
            # Product upper(fC) @ upper(fC).T, store in lower(fC) = T @ T.T
            C = lauum_wrapper(C, upper=True, use_cuda=self._use_cuda, opt=self.params)
    else:
        with TicToc("LAUUM(CPU)", debug=self.params.debug):
            # Product lower(fC).T @ lower(fC), store in lower(fC) = T @ T.T
            C = lauum_wrapper(C, upper=False, use_cuda=self._use_cuda, opt=self.params)

    if weight_vec is not None and self._use_cuda:
        with TicToc("Weighting(CUDA)", debug=self.params.debug):
            # In-place square root: modifies the caller's tensor.
            weight_vec.sqrt_()
            # Weigh on both sides (side=0 and side=1) after LAUUM.
            vec_mul_triang(C, weight_vec, side=0, upper=False)
            vec_mul_triang(C, weight_vec, side=1, upper=False)

    with TicToc("Cholesky 2", debug=self.params.debug):
        # lower(fC) = 1/M * T @ T.T
        mul_triang(C, upper=False, preserve_diag=False, multiplier=1 / M)
        # lower(fC) = 1/M * T @ T.T + lambda * I
        inplace_add_diag_th(C, self._lambda)
        # Cholesky on lower(fC) : lower(fC) = A.T
        C = potrf_wrapper(C, clean=False, upper=False,
                          use_cuda=self._use_cuda, opt=self.params)
        # Diagonal after the second factorization.
        self.dA = C.diag()

    self.fC = C
def fit(self, X: torch.Tensor, Y: torch.Tensor, Xts: Optional[torch.Tensor] = None, Yts: Optional[torch.Tensor] = None):
    """Fits the Falkon KRR model.

    The procedure selects the Nystrom centers, builds the preconditioner on
    a dedicated CUDA stream, optionally precomputes the kernel matrix between
    the data and the centers, and finally runs the preconditioned conjugate
    gradient solver. The results are stored in `alpha_` and `ny_points_`.

    Parameters
    -----------
    X : torch.Tensor
        The tensor of training data, of shape [num_samples, num_dimensions].
        If X is in Fortran order (i.e. column-contiguous) then we can avoid
        an extra copy of the data. Must be a CUDA tensor.
    Y : torch.Tensor
        The tensor of training targets, of shape [num_samples, num_outputs].
        If X and Y represent a classification problem, Y can be encoded as a one-hot
        vector.
        If Y is in Fortran order (i.e. column-contiguous) then we can avoid an
        extra copy of the data. Must be a CUDA tensor.
    Xts : torch.Tensor or None
        Tensor of validation data, of shape [num_test_samples, num_dimensions].
        If validation data is provided and `error_fn` was specified when
        creating the model, they will be used to print the validation error
        during the optimization iterations.
        If Xts is in Fortran order (i.e. column-contiguous) then we can avoid an
        extra copy of the data. Must be a CUDA tensor.
    Yts : torch.Tensor or None
        Tensor of validation targets, of shape [num_test_samples, num_outputs].
        If validation data is provided and `error_fn` was specified when
        creating the model, they will be used to print the validation error
        during the optimization iterations.
        If Yts is in Fortran order (i.e. column-contiguous) then we can avoid an
        extra copy of the data. Must be a CUDA tensor.

    Returns
    --------
    model: InCoreFalkon
        The fitted model
    """
    # Fix a synchronization bug which occurs when re-using center selector.
    torch.cuda.synchronize()
    X, Y, Xts, Yts = self._check_fit_inputs(X, Y, Xts, Yts)

    # Discard whatever a previous call to fit may have left behind.
    self.fit_times_ = []
    self.ny_points_ = None
    self.alpha_ = None

    # Start training timer
    fit_start = time.time()

    # Pick Nystrom centers. The center indices are only needed when the
    # weights of the centers must be computed from their targets.
    use_weights = self.weight_fn is not None
    if use_weights:
        # noinspection PyTupleAssignmentBalance
        ny_points, ny_indices = self.center_selection.select_indices(X, None)
    else:
        # noinspection PyTypeChecker
        ny_points = self.center_selection.select(X, None)
        ny_indices = None
    num_centers = ny_points.shape[0]

    # The preconditioner is computed on its own stream.
    pc_stream = torch.cuda.Stream(X.device)
    precond_title = "Calcuating Preconditioner of size %d" % (num_centers)
    with TicToc(precond_title, debug=self.options.debug), torch.cuda.stream(pc_stream):
        precond = falkon.preconditioner.FalkonPreconditioner(
            self.penalty, self.kernel, self.options)
        center_weights = self.weight_fn(Y[ny_indices]) if use_weights else None
        precond.init(ny_points, weight_vec=center_weights)
    pc_stream.synchronize()

    # Cache must be emptied to ensure enough memory is visible to the optimizer
    torch.cuda.empty_cache()

    # K_NM storage decision: keep the full kernel matrix in memory only
    # if it fits in 90% of the memory we are allowed to use.
    device_info = get_device_info(self.options)[X.device.index]
    usable_mem = min(self.options.max_gpu_mem, device_info.free_memory) * 0.9
    knm = (
        self.kernel(X, ny_points, opt=self.options)
        if self._can_store_knm(X, ny_points, usable_mem)
        else None
    )
    self.fit_times_.append(time.time() - fit_start)  # Preparation time

    # The callback runs at the end of conjugate gradient iterations:
    # it computes and displays the validation error.
    cg_callback = None
    if self.error_fn is not None and self.error_every is not None:
        cg_callback = self._get_callback_fn(X, Y, Xts, Yts, ny_points, precond)

    # Start with the falkon algorithm
    with TicToc('Computing Falkon iterations', debug=self.options.debug):
        cg_solver = falkon.optim.FalkonConjugateGradient(
            self.kernel, precond, self.options, weight_fn=self.weight_fn)
        # With a precomputed K_NM the solver receives it in place of the
        # data, and no centers; otherwise it gets the raw data and centers.
        if knm is not None:
            solve_data, solve_centers = knm, None
        else:
            solve_data, solve_centers = X, ny_points
        beta = cg_solver.solve(
            solve_data, solve_centers, Y, self.penalty,
            initial_solution=None, max_iter=self.maxiter, callback=cg_callback)

        self.alpha_ = precond.apply(beta)
        self.ny_points_ = ny_points
    return self
def fit(self, X: torch.Tensor, Y: torch.Tensor, Xts: Optional[torch.Tensor] = None, Yts: Optional[torch.Tensor] = None):
    """Fit the model by solving one preconditioned problem per penalty value.

    After validating the inputs and selecting the Nystrom centers, the
    method walks through `self.penalty_list`. For each penalty it
    (re-)initializes the logistic preconditioner, computes the
    preconditioned gradient, solves a linear system with conjugate gradient
    (using Hessian-vector products supplied by the loss) and subtracts the
    result from the running solution `beta`. The final coefficients are
    stored in `alpha_` and the centers in `ny_points_`.

    Progress reporting is optional: when `error_fn` or `error_every` is not
    set, no error is computed or printed, and only the elapsed time of each
    step is appended to `fit_times_`.

    Parameters
    ----------
    X : torch.Tensor
        Training data. Its first dimension is the number of samples.
    Y : torch.Tensor
        Training targets, 1D or 2D, with the same first dimension and
        the same data-type as `X`. A 1D tensor is reshaped to a column.
    Xts : torch.Tensor or None
        Optional validation data. When both `Xts` and `Yts` are given the
        reported error is computed on them instead of the training set.
    Yts : torch.Tensor or None
        Optional validation targets.

    Raises
    ------
    ValueError
        If `X` and `Y` have a different number of samples, or `Y` is
        neither 1D nor 2D.
    TypeError
        If `X` and `Y` have different data-types.
    """
    if X.size(0) != Y.size(0):
        raise ValueError("X and Y must have the same number of "
                         "samples (found %d and %d)" %
                         (X.size(0), Y.size(0)))
    if Y.dim() == 1:
        Y = torch.unsqueeze(Y, 1)
    if Y.dim() != 2:
        raise ValueError("Y is expected 1D or 2D. Found %dD." % (Y.dim()))
    if not check_same_dtype(X, Y):
        raise TypeError("X and Y must have the same data-type.")

    dtype = X.dtype
    self.fit_times_ = []

    t_s = time.time()
    ny_X, ny_Y = self.center_selection.select(X, Y, self.M)
    if self.use_cuda_:
        ny_X = ny_X.pin_memory()

    # beta is the temporary iterative solution
    beta = torch.zeros(ny_X.shape[0], 1, dtype=dtype)
    optim = ConjugateGradient(opt=self.options)

    cback = None
    precond = None
    if self.error_fn is not None and self.error_every is not None:
        def cback(it, x, pc, train_time):
            self.fit_times_.append(train_time)
            if it % self.error_every != 0:
                print("Iteration %3d - Elapsed %.1fs" %
                      (it, self.fit_times_[-1]), flush=True)
                return

            err_str = "training" if Xts is None or Yts is None else "validation"
            coeff = pc.invT(x)
            # Compute error: can be train or test;
            if Xts is not None and Yts is not None:
                pred = self._predict(Xts, ny_X, coeff)
                err = self.error_fn(Yts, pred)
                loss = torch.mean(self.loss(Yts, pred)).item()
            else:
                pred = self._predict(X, ny_X, coeff)
                err = self.error_fn(Y, pred)
                loss = torch.mean(self.loss(Y, pred)).item()
            err_name = "error"
            if isinstance(err, tuple) and len(err) == 2:
                err, err_name = err
            print(
                f"Iteration {it:3d} - Elapsed {self.fit_times_[-1]:.2f}s - "
                f"{err_str} loss {loss:.4f} - "
                f"{err_str} {err_name} {err:.4f} ",
                flush=True)

    t_elapsed = 0.0
    for it, penalty in enumerate(self.penalty_list):
        max_iter = self.iter_list[it]
        print("Iteration %d - penalty %e - sub-iterations %d" %
              (it, penalty, max_iter), flush=True)

        with TicToc("Preconditioner", self.options.debug):
            if precond is None:
                precond = falkon.preconditioner.LogisticPreconditioner(
                    self.kernel, self.loss, self.options)
            precond.init(ny_X, ny_Y, beta, penalty, X.shape[0])
        if self.use_cuda_:
            torch.cuda.empty_cache()

        with TicToc("Gradient", self.options.debug):
            # Gradient computation
            knmp_grad, inner_mmv = self.loss.knmp_grad(
                X, ny_X, Y, precond.invT(beta), opt=self.options)
            grad_p = precond.invAt(
                precond.invTt(knmp_grad).add_(penalty * beta))

        # Matrix-vector product handed to the conjugate gradient solver
        def mmv(sol):
            sol_a = precond.invA(sol)
            knmp_hess = self.loss.knmp_hess(
                X, ny_X, Y, inner_mmv, precond.invT(sol_a), opt=self.options)
            return precond.invAt(
                precond.invTt(knmp_hess).add_(sol_a.mul_(penalty)))

        with TicToc("Optim", self.options.debug):
            optim_out = optim.solve(X0=None, B=grad_p, mmv=mmv,
                                    max_iter=max_iter, callback=None)
            beta -= precond.invA(optim_out)

        # The time spent inside the callback is excluded from the timings:
        # the timer is restarted right after it returns.
        t_elapsed += time.time() - t_s
        if cback is not None:
            cback(it, beta, precond, train_time=t_elapsed)
        else:
            # No error reporting was requested: only keep track of the time.
            self.fit_times_.append(t_elapsed)
        t_s = time.time()
    t_elapsed += time.time() - t_s

    if cback is not None:
        cback(len(self.penalty_list), beta, precond, train_time=t_elapsed)
    else:
        self.fit_times_.append(t_elapsed)
    self.alpha_ = precond.invT(beta)
    self.ny_points_ = ny_X
def init(self, X: Union[torch.Tensor, SparseTensor], Y: torch.Tensor, alpha: torch.Tensor, penalty: float, N: int) -> None:
    """Initialize the preconditioner matrix.

    This method must be called before the preconditioner becomes usable.
    The first call computes the kernel between the centers and its Cholesky
    factor; later calls reuse that factor and only recompute the parts
    which depend on `alpha` and `penalty`.

    Parameters
    ----------
    X : MxD tensor
        Matrix of Nystroem centers
    Y : Mx1 tensor
        Vector of targets corresponding to the Nystroem centers `X`
    alpha : Mx1 tensor
        Parameter vector (of the same dimension as `Y`) which gives the current
        solution to the optimization problem.
    penalty : float
        Regularization amount
    N : int
        Number of points in the full data-set.

    Raises
    ------
    ValueError
        If the second dimension of `Y` is not 1.

    Notes
    -----
    If `debug=True` is present in the options, this method will print a lot of extra
    information pertaining timings of the various preconditioner operations. This can be
    useful to help understand how the preconditioner works.
    """
    if Y.shape[1] != 1:
        raise ValueError(
            "Logistic preconditioner can only deal with 1D outputs.")

    debug = self.params.debug
    dtype = X.dtype
    num_centers = X.size(0)
    jitter = self.params.pc_epsilon(dtype)

    if self.fC is not None:
        # Every iteration after the first: T is already stored in upper(fC).
        if not self._use_cuda:
            # Copy non-necessary for cuda since LAUUM will do the copying
            with TicToc("Copy triangular", debug=debug):
                # Copy upper(fC) to lower(fC): lower(fC) = T.T
                copy_triang(self.fC, upper=True)  # does not copy the diagonal
        # Setting diagonal necessary for trmm
        inplace_set_diag(self.fC, self.dT)
    else:
        # This is done only at the first iteration of the logistic-falkon algorithm
        # It sets the `T` variable from the paper (chol(kMM)) to the upper part of `self.fC`
        with TicToc("Kernel", debug=debug):
            if isinstance(X, torch.Tensor):
                kmm = create_same_stride((num_centers, num_centers), X, dtype=dtype,
                                         device='cpu', pin_memory=self._use_cuda)
            else:
                # If sparse tensor we need fortran for kernel calculation
                kmm = create_fortran((num_centers, num_centers), dtype=dtype,
                                     device='cpu', pin_memory=self._use_cuda)
            self.kernel(X, X, out=kmm, opt=self.params)
        kmm_np = kmm.numpy()
        self.fC = kmm_np if is_f_contig(kmm) else kmm_np.T
        with TicToc("Add diag", debug=debug):
            # Compute T: lower(fC) = T.T
            inplace_add_diag(self.fC, jitter * num_centers)
        with TicToc("Cholesky 1", debug=debug):
            self.fC = potrf_wrapper(self.fC, clean=True, upper=False,
                                    use_cuda=self._use_cuda, opt=self.params)
            # Save the diagonal which will be overwritten when computing A
            self.dT = kmm.diag()
        with TicToc("Copy triangular", debug=debug):
            # Copy lower(fC) to upper(fC): upper(fC) = T.
            copy_triang(self.fC, upper=False)

    # Compute W
    with TicToc("TRMM", debug=debug):
        # T is on upper(fC). Compute T.T @ alpha
        # (on a clone, so that the caller's `alpha` is not passed to `_trmm`)
        t_alpha = self._trmm(alpha.clone())
    with TicToc("W (ddf)", debug=debug):
        loss_weights = self.loss.ddf(Y, t_alpha)
    with TicToc("W-Multiply", debug=debug):
        loss_weights.sqrt_()
        flat_weights = loss_weights.numpy().reshape(-1)
        self.fC = vec_mul_triang(self.fC, flat_weights, side=0, upper=False)

    with TicToc("LAUUM", debug=debug):
        # With CUDA:    product upper(fC) @ upper(fC).T : lower(fC) = T @ T.T
        # Without CUDA: product lower(fC).T @ lower(fC) : lower(fC) = T @ T.T
        self.fC = lauum_wrapper(self.fC, upper=bool(self._use_cuda),
                                use_cuda=self._use_cuda, opt=self.params)

    # NOTE: Here the multiplier is 1/N instead of the more common 1/M!
    mul_triang(self.fC, upper=False, preserve_diag=False, multiplier=1 / N)

    with TicToc("Add diag", debug=debug):
        # lower(fC) = 1/N * T @ T.T + lambda * I
        inplace_add_diag(self.fC, penalty)

    with TicToc("Cholesky 2", debug=debug):
        # Cholesky on lower(fC) : lower(fC) = A.T
        self.fC = potrf_wrapper(self.fC, clean=False, upper=False,
                                use_cuda=self._use_cuda, opt=self.params)
        self.dA = torch.from_numpy(self.fC).diag()