def init(self, X: Union[torch.Tensor, SparseTensor]) -> None:
    """Initialize the preconditioner matrix.

    This method must be called before the preconditioner can be used.

    Builds, inside the single M x M buffer ``self.fC``, two Cholesky factors:
    first T (from the kernel matrix of the centers), then A (from the
    regularized 1/M * T @ T.T product). Both live in the same buffer, so the
    diagonal of T is saved to ``self.dT`` before it is overwritten, and the
    diagonal of A ends up in ``self.dA``.

    Parameters
    ----------
    X : MxD tensor
        The matrix of Nystroem centers
    """
    dtype = X.dtype
    # Small diagonal jitter, scaled by M below, to keep the kernel matrix
    # numerically positive-definite for the Cholesky factorization.
    eps = self.params.pc_epsilon(X.dtype)
    M = X.size(0)

    with TicToc("Kernel", debug=self.params.debug):
        if isinstance(X, torch.Tensor):
            C = create_same_stride((M, M), X, dtype=dtype, device='cpu',
                                   pin_memory=self._use_cuda)
        else:
            # If sparse tensor we need fortran for kernel calculation
            C = create_fortran((M, M), dtype=dtype, device='cpu',
                               pin_memory=self._use_cuda)
        # Kernel evaluated between the centers and themselves: C = k(X, X).
        self.kernel(X, X, out=C, opt=self.params)
        # fC is a numpy view on C's storage (no copy); transpose if needed so
        # that fC is F-contiguous as expected by the LAPACK-style wrappers.
        self.fC = C.numpy()
        if not is_f_contig(C):
            self.fC = self.fC.T
    with TicToc("Cholesky 1", debug=self.params.debug):
        # Compute T: lower(fC) = T.T
        inplace_add_diag(self.fC, eps * M)
        self.fC = potrf_wrapper(self.fC, clean=False, upper=False,
                                use_cuda=self._use_cuda, opt=self.params)
        # Save the diagonal which will be overwritten when computing A
        self.dT = C.diag()
    with TicToc("Copy triangular", debug=self.params.debug):
        # Copy lower(fC) to upper(fC): upper(fC) = T.
        copy_triang(self.fC, upper=False)
    if self._use_cuda:
        with TicToc("LAUUM", debug=self.params.debug):
            # Product upper(fC) @ upper(fC).T : lower(fC) = T @ T.T
            self.fC = lauum_wrapper(self.fC, upper=True,
                                    use_cuda=self._use_cuda, opt=self.params)
    else:
        with TicToc("LAUUM", debug=self.params.debug):
            # Product lower(fC).T @ lower(fC) : lower(fC) = T @ T.T
            self.fC = lauum_wrapper(self.fC, upper=False,
                                    use_cuda=self._use_cuda, opt=self.params)
    with TicToc("Cholesky 2", debug=self.params.debug):
        # lower(fC) = 1/M * [email protected]
        self.fC = mul_triang(self.fC, upper=False, preserve_diag=False,
                             multiplier=1 / M)
        # lower(fC) = 1/M * [email protected] + lambda * I
        inplace_add_diag(self.fC, self._lambda)
        # Cholesky on lower(fC) : lower(fC) = A.T
        self.fC = potrf_wrapper(self.fC, clean=False, upper=False,
                                use_cuda=self._use_cuda, opt=self.params)
        # The diagonal of A; note C still aliases fC's storage at this point.
        self.dA = C.diag()
def gen_random_pd(t, dtype, F=False, seed=0):
    """Generate a random symmetric positive-definite ``t x t`` matrix.

    Parameters
    ----------
    t : int
        Number of rows (and columns) of the generated matrix.
    dtype
        dtype of the generated matrix (forwarded to :func:`gen_random`).
    F : bool
        Whether the matrix should be Fortran-ordered
        (forwarded to :func:`gen_random`).
    seed : int
        Random seed, for reproducibility.

    Returns
    -------
    A : ``t x t`` ndarray
        A symmetric matrix, made positive-definite via a diagonal shift.
    """
    A = gen_random(t, t, dtype, F, seed)
    # Mirror the upper triangle onto the lower one so that A is symmetric.
    copy_triang(A, upper=True)
    # Shift all entries, then boost the diagonal (by 4*t per element) to make
    # the matrix diagonally dominant and therefore positive-definite.
    A += 2
    A.flat[::t + 1] += t * 4
    return A
def test_up(self, mat, order, dtype):
    """`copy_triang(upper=True)` must mirror the upper triangle downwards."""
    mat = fix_mat(mat, order=order, dtype=dtype, numpy=True)
    probe = mat.copy(order="K")
    strict_lower = np.tril_indices(self.t, -1)
    # Zero out the strictly-lower triangle: only upper + diagonal survive.
    probe[strict_lower] = 0

    copy_triang(probe, upper=True)

    # Every entry should have been filled in from the upper triangle.
    assert np.sum(probe == 0) == 0
    np.testing.assert_array_equal(np.triu(mat), np.triu(probe))
    np.testing.assert_array_equal(np.tril(probe), np.triu(probe).T)
    np.testing.assert_array_equal(np.diag(mat), np.diag(probe))

    # Zero the strictly-lower triangle again, then copy the other way.
    # With upper=False there is nothing below the diagonal to propagate,
    # so only the diagonal is guaranteed to be preserved.
    probe[strict_lower] = 0
    copy_triang(probe, upper=False)
    np.testing.assert_array_equal(np.diag(mat), np.diag(probe))
def par_lauum_f_lower(A: torch.Tensor, block_allocs: List[BlockAlloc], my_rows: List[int],
                      barrier: threading.Barrier, device_id: int, cublas_handle,
                      independent_output: bool):
    """Worker for a blocked, out-of-core LAUUM on one CUDA device (F-order variant).

    Processes the block-rows listed in `my_rows` of the triangular factor
    stored in `A`, overwriting the corresponding blocks of `A` with blocks of
    the product of the factor with its transpose. Work proceeds one
    block-column `b` at a time; for each column the owned rows `r >= b` are
    updated with cuBLAS SYRK/TRMM/GEMM calls on device, plus a CPU LAPACK
    LAUUM for the diagonal block. `barrier` synchronizes the worker threads
    between block-columns when outputs are not independent.

    NOTE(review): `A` is presumably expected F-contiguous here (the `_f_`
    variant; cf. `par_lauum_c_lower`) — confirm against the dispatcher.
    """
    N = A.shape[0]
    # Pick double/single precision variants of each routine based on A.dtype.
    lauum_fn = choose_fn(A.dtype, scll.dlauum, scll.slauum, "Lapack LAUUM")
    trmm_fn = choose_fn(A.dtype, cublasDtrmm, cublasStrmm, "cuBlas TRMM")
    gemm_fn = choose_fn(A.dtype, cublasDgemm, cublasSgemm, "cuBlas GEMM")
    syrk_fn = choose_fn(A.dtype, cublasDsyrk, cublasSsyrk, "cuBlas SYRK")

    tc_device = torch.device('cuda:%d' % (device_id))
    # Two streams: s1 carries compute + most copies, s2 only the final
    # device-to-host copy in the non-independent-output path.
    s1 = torch.cuda.Stream(device=tc_device)
    s2 = torch.cuda.Stream(device=tc_device)
    cublasSetStream(cublas_handle, s1._as_parameter_)

    max_block_size = max(ba.length for ba in block_allocs)
    my_rows = sorted(my_rows)

    with torch.cuda.device(tc_device), torch.cuda.stream(s1):
        # Preallocate 2 columns
        whole_col_b = create_fortran((A.shape[0], max_block_size), A.dtype, tc_device)
        whole_col_r = create_fortran((A.shape[0], max_block_size), A.dtype, tc_device)
        # Pinned CPU staging buffer for one block-sized tile.
        temp_bb = create_fortran((max_block_size, max_block_size), A.dtype, 'cpu',
                                 pin_memory=True)

        for b in range(len(block_allocs)):
            bb = block_allocs[b]
            # Load col b.
            # Instead of loading the whole column only load the last rows
            # as necessary by inspecting the minimum value in my_rows which is >= b.
            try:
                min_row = min([r for r in my_rows if r >= b])
                b_start = block_allocs[min_row].start
                col_b = copy_to_device(N - b_start, bb.length, A, b_start, bb.start,
                                       whole_col_b, 0, 0, s1)
            except ValueError:
                # No column here. `b_start`/`col_b` stay undefined, but in that
                # case every r in my_rows is < b so the loop below only hits
                # `continue` and never reads them.
                pass
            if not independent_output:
                barrier.wait()

            for r in my_rows:
                if r < b:
                    continue
                if r == b:
                    # SYRK on g_b[bb.length:, :] with output replacing g_b[:bb.length, :]
                    # C = beta*C + alpha * op(A) @ op(A).T
                    if b_start + bb.length < N:
                        syrk_fn(cublas_handle, uplo='L', trans='T',
                                n=bb.length, k=col_b.shape[0] - bb.length,
                                alpha=1.0, A=col_b[bb.length:, :].data_ptr(), lda=col_b.stride(1),
                                beta=0.0, C=col_b.data_ptr(), ldc=col_b.stride(1))
                    # CPU LAUUM on A[bb.start:bb.end, bb.start:bb.end].
                    # This is a bit messy, should do cleanup.
                    Abb = A[bb.start:bb.end, bb.start:bb.end]  # L\U
                    if independent_output:
                        Abb_np = Abb.numpy().copy(order="F")
                        # Make symmetric: L\L
                        copy_triang(Abb_np, upper=False)
                        uu, info = lauum_fn(Abb_np, lower=1, overwrite_c=True)  # LAU\L
                        Abb.copy_(torch.from_numpy(uu.T))  # L\LAU
                    else:
                        uu, info = lauum_fn(Abb.numpy(), lower=1, overwrite_c=False)  # LAU\L
                        # Zero upper(uu) so the SYRK result added below is not
                        # double-counted in the unprocessed triangle.
                        if b_start + bb.length < N:
                            zero_triang(uu, upper=True)
                        Abb.copy_(torch.from_numpy(uu))

                    if b_start + bb.length < N:
                        # It is IMPORTANT to do the copy on s1 and then sync it.
                        tbb = copy_to_host(bb.length, bb.length, col_b, 0, 0,
                                           temp_bb, 0, 0, s1)
                        s1.synchronize()
                        if independent_output:
                            Abb.add_(torch.triu(tbb.T))
                        else:
                            Abb.add_(tbb)
                else:  # r > b
                    br = block_allocs[r]

                    # Load column r. Since r > b this column will be shorter than column b
                    col_r = copy_to_device(N - br.start, br.length, A, br.start, br.start,
                                           whole_col_r, 0, 0, s1)
                    # Restrict column b to only the last 'r' rows
                    ccb = col_b[br.start - b_start:, :]

                    # TRMM on g_r[0:br.length, :] which is triangular (r*r)
                    # and cur_g_b[0:br.length, :]
                    # output is a r*b matrix and should be stored in a separate g_out block
                    # Could store output in the first rows of g_b
                    # C = alpha * op(A) @ B -- A triangular
                    trmm_fn(handle=cublas_handle, side='L', uplo='L', trans='T', diag='N',
                            m=br.length, n=bb.length, alpha=1.0,
                            A=col_r.data_ptr(), lda=col_r.stride(1),
                            B=ccb.data_ptr(), ldb=ccb.stride(1),
                            C=ccb.data_ptr(), ldc=ccb.stride(1))

                    # GEMM on g_r[br.length:, :].T and cur_g_b[bb.length:, :]
                    # output is the same r*b matrix as before, outputs need to be summed.
                    # C = alpha * op(A) @ op(B) + beta * C
                    if br.end < N:
                        gemm_fn(handle=cublas_handle, transa='T', transb='N',
                                m=br.length, n=bb.length, k=col_r.shape[0] - br.length,
                                alpha=1.0, A=col_r[br.length:, :].data_ptr(), lda=col_r.stride(1),
                                B=ccb[br.length:, :].data_ptr(), ldb=ccb.stride(1),
                                beta=1.0, C=ccb.data_ptr(), ldc=ccb.stride(1))
                    # Copy back to A[r, b]
                    if independent_output:
                        # Transposed write-back must go through the CPU buffer.
                        _temp_cpu = copy_to_host(br.length, bb.length, ccb, 0, 0,
                                                 temp_bb, 0, 0, s1)
                        s1.synchronize()
                        A[bb.start:bb.end, br.start:br.end].copy_(_temp_cpu.T)
                    else:
                        # Sync s1 so compute is done before s2's copy reads ccb.
                        s1.synchronize()
                        copy_to_host(br.length, bb.length, ccb, 0, 0, A,
                                     br.start, bb.start, s2)
        # Drain any copy still in flight on s2 before the worker returns.
        s2.synchronize()
def par_lauum_c_lower(A: torch.Tensor, block_allocs: List[BlockAlloc], my_rows: List[int],
                      barrier: threading.Barrier, device_id: int, cublas_handle,
                      independent_output: bool):
    """Worker for a blocked, out-of-core LAUUM on one CUDA device (C-order variant).

    Same algorithm as `par_lauum_f_lower` but for a C-contiguous `A`: device
    block-columns are kept in flat 1D buffers with leading dimension
    `max_block_size`, and host<->device transfers go through raw
    `cuda_memcpy2d_async` / `cublasGetMatrixAsync` calls instead of the
    strided tensor-copy helpers. Processes the block-rows in `my_rows`,
    replacing blocks of `A` with blocks of the factor-times-transpose
    product; `barrier` synchronizes workers between block-columns when
    outputs are not independent.
    """
    N = A.shape[0]
    dts = sizeof_dtype(A.dtype)
    # Pick double/single precision variants of each routine based on A.dtype.
    lauum_fn = choose_fn(A.dtype, scll.dlauum, scll.slauum, "Lapack LAUUM")
    trmm_fn = choose_fn(A.dtype, cublasDtrmm, cublasStrmm, "cuBlas TRMM")
    gemm_fn = choose_fn(A.dtype, cublasDgemm, cublasSgemm, "cuBlas GEMM")
    syrk_fn = choose_fn(A.dtype, cublasDsyrk, cublasSsyrk, "cuBlas SYRK")

    tc_device = torch.device('cuda:%d' % (device_id))
    # s1 carries compute and most copies; s2 only the final result copy-out.
    s1 = torch.cuda.Stream(device=tc_device)
    s2 = torch.cuda.Stream(device=tc_device)
    s1_cuda, s2_cuda = s1._as_parameter_, s2._as_parameter_
    cublasSetStream(cublas_handle, s1_cuda)

    max_block_size = max(ba.length for ba in block_allocs)
    my_rows = sorted(my_rows)

    with torch.cuda.device(tc_device), torch.cuda.stream(s1):
        # Preallocate 2 block-columns. The single block is a CPU buffer
        whole_col_b = create_fortran((A.shape[0] * max_block_size, ), A.dtype, tc_device)
        whole_col_r = create_fortran((A.shape[0] * max_block_size, ), A.dtype, tc_device)
        temp_bb = create_fortran((max_block_size, max_block_size), A.dtype, 'cpu',
                                 pin_memory=True).T

        for b in range(len(block_allocs)):
            bb = block_allocs[b]
            # Load col b.
            # Instead of loading the whole column only load the last rows
            # as necessary by inspecting the minimum value in my_rows which is >= b.
            try:
                min_row = min([r for r in my_rows if r >= b])
                b_start = block_allocs[min_row].start
                cuda_memcpy2d_async(
                    dst=whole_col_b.data_ptr(), dpitch=max_block_size * dts,
                    src=A[b_start, bb.start].data_ptr(), spitch=A.shape[1] * dts,
                    width=bb.length * dts, height=N - b_start, stream=s1_cuda)
            except ValueError:
                # all of `my_rows` are smaller than `b`. `b_start` stays
                # undefined, but then the loop below only ever `continue`s.
                pass
            if not independent_output:
                barrier.wait()

            for r in my_rows:
                if r < b:
                    continue
                if r == b:
                    is_last_row = b_start + bb.length == N
                    # SYRK on g_b[bb.length:, :] with output replacing g_b[:bb.length, :]
                    # C = beta*C + alpha * op(A) @ op(A).T
                    if not is_last_row:
                        syrk_fn(cublas_handle, uplo='U', trans='N',
                                n=bb.length, k=N - b_start - bb.length, alpha=1.0,
                                A=whole_col_b[bb.length * max_block_size:].data_ptr(),
                                lda=max_block_size,
                                beta=0.0, C=whole_col_b.data_ptr(), ldc=max_block_size)
                    # Run LAUUM on CPU on Abb.T (transpose because LAPACK works in F-order)
                    # Result will be on upper(uu). So if we copy back to lower(A), we must copy
                    # back uu.T -- otherwise we should copy back uu directly.
                    Abb = A[bb.start:bb.end, bb.start:bb.end]
                    if independent_output:
                        Abb_np = Abb.T.numpy().copy(order="F")  # U\L
                        copy_triang(Abb_np, upper=True)  # L\L
                        uu, info = lauum_fn(Abb_np, lower=1, overwrite_c=True)  # LAU\L
                        Abb.copy_(torch.from_numpy(uu.T))  # L \ LAU
                    else:
                        uu, info = lauum_fn(Abb.T.numpy(), lower=0, overwrite_c=False)
                        # Zeroing must happen if the SYRK output is to be added: otherwise the
                        # non-processed part of Abb (i.e. upper(Abb) if not independent_output)
                        # will be multiplied by 2.
                        if not is_last_row:
                            zero_triang(uu, upper=False)
                        Abb.copy_(torch.from_numpy(uu.T))

                    if not is_last_row:
                        # Stage the SYRK result on the pinned CPU buffer.
                        cuda_memcpy2d_async(
                            dst=temp_bb.data_ptr(), dpitch=max_block_size * dts,
                            src=whole_col_b.data_ptr(), spitch=max_block_size * dts,
                            width=bb.length * dts, height=bb.length, stream=s1_cuda)
                        s1.synchronize()  # TODO: Check if failure when this commented out.
                        if independent_output:
                            Abb.add_(torch.triu(temp_bb[:bb.length, :bb.length].T))
                        else:
                            Abb.add_(temp_bb[:bb.length, :bb.length])
                else:  # r > b
                    br = block_allocs[r]

                    # Load column r. Since r > b this column will be shorter than column b
                    cuda_memcpy2d_async(
                        dst=whole_col_r.data_ptr(), dpitch=max_block_size * dts,
                        src=A[br.start, br.start].data_ptr(), spitch=A.shape[1] * dts,
                        width=br.length * dts, height=N - br.start, stream=s1_cuda)
                    #s1.synchronize()

                    # Restrict column b to only the last 'r' rows
                    ccb = whole_col_b[(br.start - b_start) * max_block_size:]

                    # TRMM on g_r[0:br.length, :] which is triangular (r*r)
                    # and cur_g_b[0:br.length, :]
                    # output is a r*b matrix and should be stored in a separate g_out block
                    # Could store output in the first rows of g_b
                    # C = alpha * op(A) @ B -- A triangular
                    trmm_fn(handle=cublas_handle, side='R', uplo='U', trans='T', diag='N',
                            m=bb.length, n=br.length, alpha=1.0,
                            A=whole_col_r.data_ptr(), lda=max_block_size,
                            B=ccb.data_ptr(), ldb=max_block_size,
                            C=ccb.data_ptr(), ldc=max_block_size)

                    # GEMM on g_r[br.length:, :].T and cur_g_b[bb.length:, :]
                    # output is the same r*b matrix as before, outputs need to be summed.
                    # C = alpha * op(A) @ op(B) + beta * C
                    if br.end < N:
                        gemm_fn(handle=cublas_handle, transa='N', transb='T',
                                m=bb.length, n=br.length, k=N - br.start - br.length,
                                alpha=1.0,
                                A=ccb[br.length * max_block_size:].data_ptr(),
                                lda=max_block_size,
                                B=whole_col_r[br.length * max_block_size:].data_ptr(),
                                ldb=max_block_size,
                                beta=1.0, C=ccb.data_ptr(), ldc=max_block_size)
                    # Copy back to A[r, b]
                    if independent_output:
                        # Copy must be transposed, copy to temp_bb first.
                        cublasGetMatrixAsync(
                            rows=bb.length, cols=br.length, elem_size=dts,
                            A=ccb.data_ptr(), lda=max_block_size,
                            B=temp_bb.data_ptr(), ldb=max_block_size, stream=s1_cuda)
                        s1.synchronize()
                        A[bb.start:bb.end, br.start:br.end].copy_(
                            temp_bb[:br.length, :bb.length].T)
                    else:
                        # Sync s1 so compute is done before s2's copy reads ccb.
                        s1.synchronize()
                        cublasGetMatrixAsync(
                            rows=bb.length, cols=br.length, elem_size=dts,
                            A=ccb.data_ptr(), lda=max_block_size,
                            B=A[br.start, bb.start].data_ptr(), ldb=A.shape[0],
                            stream=s2_cuda)
        # Drain any copy still in flight on s2 before the worker returns.
        s2.synchronize()
def init(self, X: Union[torch.Tensor, SparseTensor], Y: torch.Tensor,
         alpha: torch.Tensor, penalty: float, N: int) -> None:
    """Initialize the preconditioner matrix.

    This method must be called before the preconditioner becomes usable.

    On the first call (``self.fC is None``) the Cholesky factor T of the
    kernel matrix of the centers is computed and cached in the shared M x M
    buffer ``self.fC``; on later calls T is restored from the buffer. Then a
    loss-dependent reweighting W is applied and the second factor A is
    computed in the lower triangle of the same buffer.

    Parameters
    ----------
    X : MxD tensor
        Matrix of Nystroem centers
    Y : Mx1 tensor
        Vector of targets corresponding to the Nystroem centers `X`
    alpha : Mx1 tensor
        Parameter vector (of the same dimension as `Y`) which gives the current
        solution to the optimization problem.
    penalty : float
        Regularization amount
    N : int
        Number of points in the full data-set.

    Notes
    -----
    If `debug=True` is present in the options, this method will print a lot of extra
    information pertaining timings of the various preconditioner operations. This can
    be useful to help understand how the preconditioner works.
    """
    if Y.shape[1] != 1:
        raise ValueError(
            "Logistic preconditioner can only deal with 1D outputs.")

    dtype = X.dtype
    M = X.size(0)

    # Diagonal jitter (scaled by M) keeping the kernel matrix factorizable.
    eps = self.params.pc_epsilon(dtype)

    if self.fC is None:
        # This is done only at the first iteration of the logistic-falkon algorithm
        # It sets the `T` variable from the paper (chol(kMM)) to the upper part of `self.fC`
        with TicToc("Kernel", debug=self.params.debug):
            if isinstance(X, torch.Tensor):
                C = create_same_stride((M, M), X, dtype=dtype, device='cpu',
                                       pin_memory=self._use_cuda)
            else:
                # If sparse tensor we need fortran for kernel calculation
                C = create_fortran((M, M), dtype=dtype, device='cpu',
                                   pin_memory=self._use_cuda)
            self.kernel(X, X, out=C, opt=self.params)
            # fC is a numpy view on C's storage; keep it F-contiguous.
            self.fC = C.numpy()
            if not is_f_contig(C):
                self.fC = self.fC.T
        with TicToc("Add diag", debug=self.params.debug):
            # Compute T: lower(fC) = T.T
            inplace_add_diag(self.fC, eps * M)
        with TicToc("Cholesky 1", debug=self.params.debug):
            self.fC = potrf_wrapper(self.fC, clean=True, upper=False,
                                    use_cuda=self._use_cuda, opt=self.params)
            # Save the diagonal which will be overwritten when computing A
            self.dT = C.diag()
        with TicToc("Copy triangular", debug=self.params.debug):
            # Copy lower(fC) to upper(fC): upper(fC) = T.
            copy_triang(self.fC, upper=False)
    else:
        if not self._use_cuda:
            # Copy non-necessary for cuda since LAUUM will do the copying
            with TicToc("Copy triangular", debug=self.params.debug):
                # Copy upper(fC) to lower(fC): lower(fC) = T.T
                copy_triang(self.fC, upper=True)  # does not copy the diagonal
        # Setting diagonal necessary for trmm
        inplace_set_diag(self.fC, self.dT)

    # Compute W: the sqrt of the second derivative of the loss evaluated at
    # the current solution (the reweighting specific to logistic-falkon).
    with TicToc("TRMM", debug=self.params.debug):
        # T is on upper(fC). Compute T.T @ alpha
        alpha = self._trmm(alpha.clone())
    with TicToc("W (ddf)", debug=self.params.debug):
        W = self.loss.ddf(Y, alpha)
    with TicToc("W-Multiply", debug=self.params.debug):
        W.sqrt_()
        self.fC = vec_mul_triang(self.fC, W.numpy().reshape(-1),
                                 side=0, upper=False)

    if self._use_cuda:
        with TicToc("LAUUM", debug=self.params.debug):
            # Product upper(fC) @ upper(fC).T : lower(fC) = T @ T.T
            self.fC = lauum_wrapper(self.fC, upper=True,
                                    use_cuda=self._use_cuda, opt=self.params)
    else:
        with TicToc("LAUUM", debug=self.params.debug):
            # Product lower(fC).T @ lower(fC) : lower(fC) = T @ T.T
            self.fC = lauum_wrapper(self.fC, upper=False,
                                    use_cuda=self._use_cuda, opt=self.params)

    # NOTE: Here the multiplier is 1/N instead of the more common 1/M!
    mul_triang(self.fC, upper=False, preserve_diag=False, multiplier=1 / N)

    with TicToc("Add diag", debug=self.params.debug):
        # lower(fC) = 1/N * [email protected] + lambda * I
        inplace_add_diag(self.fC, penalty)
    with TicToc("Cholesky 2", debug=self.params.debug):
        # Cholesky on lower(fC) : lower(fC) = A.T
        self.fC = potrf_wrapper(self.fC, clean=False, upper=False,
                                use_cuda=self._use_cuda, opt=self.params)

    self.dA = torch.from_numpy(self.fC).diag()