Example #1
def fdmmv_cpu_sparse(X1: SparseTensor,
                     X2: SparseTensor,
                     v: Optional[torch.Tensor],
                     w: Optional[torch.Tensor],
                     kernel,
                     out: Optional[torch.Tensor] = None,
                     opt: Optional[BaseOptions] = None):
    opt = _setup_opt(opt, is_cpu=True)

    # Parameter validation
    if v is None and w is None:
        raise ValueError("One of v and w must be specified to run fMMV.")
    T = v.size(1) if v is not None else w.size(1)
    ntot, dtot = X1.size()
    M = X2.size(0)
    dtype = X1.dtype

    # Create output matrix
    if out is None:
        out = torch.empty(M, T, dtype=dtype)
    out.fill_(0)

    avail_mem = _get_cpu_ram(opt, 0.95) / sizeof_dtype(dtype)
    # Narrow X1 : n
    # ker_chunk : n*M
    # w_blk     : n*T
    n = avail_mem / (M + T + 1)
    n = int(math.floor(n))
    if n < 1:
        raise MemoryError(("Available memory %.2fGB is insufficient "
                           "for blockwise fdMMv.") %
                          (avail_mem * sizeof_dtype(dtype) / 2**30))

    # Allocate fixed arrays
    ker_chunk = create_same_stride((n, M), out, dtype, device='cpu')
    w_blk = create_same_stride((n, T), out, dtype, device='cpu')
    # Run blocked fdmmv
    for i in range(0, ntot, n):
        ic = min(n, ntot - i)
        X1_chunk = X1.narrow_rows(i, ic)
        cur_ker_chunk = ker_chunk[:ic]
        cur_ker_chunk.fill_(0.0)
        ddd = kernel._prepare_sparse(X1_chunk, X2)
        kernel._apply_sparse(X1_chunk, X2.transpose_csc(), cur_ker_chunk)
        kernel._finalize(cur_ker_chunk, ddd)

        # Multiply by the vector v
        cur_w_blk = w_blk[:ic]  # n x T
        cur_w_blk.fill_(0.0)
        if w is not None:
            cur_w_blk.copy_(w[i:i + ic, :])
        if v is not None:
            # w_blk + c_out * v => (n x T) + (n x M)*(M x T)
            cur_w_blk.addmm_(cur_ker_chunk, v)
        out.addmm_(cur_ker_chunk.T, cur_w_blk)
    del ker_chunk, w_blk
    return out
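A quick standalone sketch (not part of the library) of the row-block sizing implied by the memory accounting above: each block row needs roughly one element for the narrowed X1 slice, M elements of ker_chunk and T elements of w_blk, so the number of rows per block is the available element count divided by M + T + 1.

import math

def rows_per_block(avail_elements, M, T):
    # Per block row: ~1 element for the narrowed X1 slice (an approximation for
    # the sparse data), M elements of ker_chunk and T elements of w_blk.
    n = int(math.floor(avail_elements / (M + T + 1)))
    if n < 1:
        raise MemoryError("Not enough memory for a single block row.")
    return n

# Example: 2 GiB worth of float64 elements, M = 10_000 centers, T = 10 targets.
print(rows_per_block(2 * 2**30 / 8, M=10_000, T=10))  # ~26814 rows per block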
Example #2
def _parallel_potrf_runner(A: torch.Tensor, opt: CholeskyOptions,
                           gpu_info) -> torch.Tensor:
    num_gpus = len(gpu_info)
    N = A.shape[0]
    dt = A.dtype
    # Calculate the maximum block size such that we don't run out of GPU
    # RAM on **any** available GPU. We need a total of 2 whole columns and 1 tile:
    # block-size^2 * ((N / block-size) * 2 + 1) floats
    # (plus the cuSOLVER buffer which is small).
    # block_size < (sqrt((2*N)^2 + 4R) - 2*N) / 2
    dts = sizeof_dtype(dt)
    avail_ram = min([g.actual_free_mem for g in gpu_info]) / dts
    max_block_size = (math.sqrt(4 * N**2 + 4 * avail_ram) - 2 * N) / 2
    max_block_size = int(math.floor(max_block_size))
    if max_block_size < 1:
        raise RuntimeError("Cannot run parallel POTRF with minimum "
                           "available memory of %.2fMB" %
                           (avail_ram * dts / 2**20))

    block_sizes = calc_block_sizes(max_block_size, num_gpus, N,
                                   opt.chol_par_blk_multiplier)
    block_allocations = []
    cur_n = 0
    for i, bs in enumerate(block_sizes):
        block_allocations.append((cur_n, cur_n + bs, bs, i % num_gpus, i))
        cur_n += bs

    device_info = []
    for g in range(num_gpus):
        device_info.append((0.0, initialization.cusolver_handle(g), g))

    parallel_potrf(device_info, block_allocations, A)
    return A
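A standalone check of the block-size bound used above (illustrative numbers): two whole block columns plus one tile occupy 2*N*B + B^2 elements, and the largest B satisfying 2*N*B + B^2 <= R is the floored positive root of B^2 + 2*N*B - R = 0.

import math

def max_potrf_block(N, R):
    # Positive root of B^2 + 2*N*B - R = 0, floored to an integer block size.
    return int(math.floor((math.sqrt(4 * N**2 + 4 * R) - 2 * N) / 2))

N, R = 50_000, 4 * 2**30 / 8      # 50k x 50k matrix, 4 GiB of float64 elements
B = max_potrf_block(N, R)
# The chosen block size fits the memory model, the next larger one does not.
assert 2 * N * B + B**2 <= R < 2 * N * (B + 1) + (B + 1)**2
print(B)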
Example #3
def fmmv_cpu(X1, X2, v, kernel, out, opt):
    """Blockwise kernel-vector product

    This function computes ``kernel(X1, X2) @ v`` in a blockwise fashion, to avoid having the
    whole N*M kernel matrix in memory at once.
    Note that while the principle is that of matrix-vector product, `v` can have more than
    one column.

    Parameters
    -----------
    X1
        [N, D] array
    X2
        [M, D] array
    v
        [M, T] array
    kernel
        Class representing the desired kernel function
    out : torch.Tensor or None
        [N, T] array for storing the kernel-vector product output.
        If None, will be allocated within the function.
    opt
        Basic options dictionary, used for determining available memory.
    """
    opt = _setup_opt(opt, is_cpu=True)

    ntot, dtot = X1.size(0), X1.size(1)
    M, T = v.size()
    dtype = v.dtype

    # Create output matrix
    if out is None:
        out = torch.empty(ntot, T, dtype=dtype)

    avail_mem = _get_cpu_ram(opt, 0.95) / sizeof_dtype(dtype)
    # The only necessary memory allocation is that of the temporary kernel
    # buffer `temp_out`, of size n*M (plus any kernel-specific extra memory).
    extra_mem = kernel.extra_mem()
    n, d = select_dim_over_nd(max_n=ntot, max_d=dtot, coef_nd=extra_mem.get('nd', 0),
                              coef_n=M + extra_mem.get('n', 0) + extra_mem.get('nm', 0) * M,
                              coef_d=extra_mem.get('d', 0) + extra_mem.get('md', 0) * M,
                              rest=extra_mem.get('m', 0), max_mem=avail_mem)

    # Run batched matrix multiplication
    for i in range(0, ntot, n):
        ic = min(n, ntot - i)

        ddd = kernel._prepare(X1.narrow(0, i, ic), X2)  # , v=v)
        temp_out = torch.zeros(ic, M, dtype=dtype)
        for k in range(0, dtot, d):
            kc = min(d, dtot - k)
            X1d = X1[i: i + ic, k: k + kc]
            X2d = X2[:, k: k + kc]
            kernel._apply(X1d, X2d.T, temp_out)

        # temp_out = fnc(X1*X2', X1, X2)
        kernel._finalize(temp_out, ddd)

        torch.mm(temp_out, v, out=out[i: i + ic, :])
    return out
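The blockwise strategy can be checked against a direct computation on small data. The sketch below uses a plain Gaussian kernel written directly in torch (an illustrative stand-in, not the falkon kernel classes) to show that accumulating row blocks reproduces kernel(X1, X2) @ v.

import torch

def gaussian_kernel(a, b, sigma=1.0):
    # ||a_i - b_j||^2 expanded as ||a||^2 - 2 a.b + ||b||^2, then exponentiated.
    d2 = (a * a).sum(1, keepdim=True) - 2 * a @ b.T + (b * b).sum(1)
    return torch.exp(-0.5 * d2 / sigma**2)

torch.manual_seed(0)
X1 = torch.randn(500, 8, dtype=torch.float64)
X2 = torch.randn(60, 8, dtype=torch.float64)
v = torch.randn(60, 3, dtype=torch.float64)

full = gaussian_kernel(X1, X2) @ v              # direct N*M kernel
blocked = torch.empty(500, 3, dtype=torch.float64)
n = 128                                         # row-block size
for i in range(0, 500, n):
    ic = min(n, 500 - i)
    blocked[i:i + ic] = gaussian_kernel(X1[i:i + ic], X2) @ v

assert torch.allclose(full, blocked)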
Example #4
def fmm_cpu_sparse(X1: SparseTensor, X2: SparseTensor,
                   kernel: 'falkon.kernels.Kernel',
                   out: Optional[torch.Tensor],
                   opt: BaseOptions) -> torch.Tensor:
    opt = _setup_opt(opt, is_cpu=True)
    ntot, dtot = X1.size()
    mtot = X2.size(0)

    if out is None:
        out = torch.empty(ntot, mtot, dtype=X1.dtype)

    if sizeof_dtype(X1.dtype) < 8 and opt.no_single_kernel:
        avail_mem = _get_cpu_ram(opt, 0.9)
        if avail_mem <= 0:
            raise MemoryError("Memory insufficient for kernel evaluation.")

        blockwise_fmm_cpu_sparse(X1, X2, kernel, out, avail_mem)
    else:
        # Do the kernel computation on the spot
        out.fill_(0.0)
        ddd = kernel._prepare_sparse(X1, X2)
        kernel._apply_sparse(X1, X2.transpose_csc(), out)
        kernel._finalize(out, ddd)

    return out
Example #5
    def test_no_overwrite(self, dtype, order, get_mat, expected_lower,
                          expected_upper, device):
        omat = get_mat(order=order, dtype=dtype)
        mat = get_mat(order=order, dtype=dtype, device=device)

        # For cuda inputs we must add to available GPU memory the amount used by the
        # input matrix, since overwrite=False and a full copy must be performed.
        mgpu_slack = 0
        if device.startswith("cuda"):
            mgpu_slack = self.basic_opt.max_gpu_mem + mat.shape[0]**2 * sizeof_dtype(mat.dtype)

        with memory_checker(self.basic_opt, extra_mem=mgpu_slack) as new_opt:
            act_up = gpu_lauum(mat, upper=True, overwrite=False, opt=new_opt)
            torch.cuda.synchronize()
        np.testing.assert_allclose(expected_upper,
                                   act_up.cpu().numpy(),
                                   rtol=self.rtol[dtype])
        np.testing.assert_allclose(omat, mat.cpu())

        with memory_checker(self.basic_opt, extra_mem=mgpu_slack) as new_opt:
            act_lo = gpu_lauum(mat, upper=False, overwrite=False, opt=new_opt)
            torch.cuda.synchronize()
        np.testing.assert_allclose(expected_lower,
                                   act_lo.cpu().numpy(),
                                   rtol=self.rtol[dtype])
        np.testing.assert_allclose(omat, mat.cpu())
Example #6
def fmm_cuda_sparse(X1: SparseTensor,
                    X2: SparseTensor,
                    kernel: 'falkon.kernels.Kernel',
                    out: Optional[torch.Tensor] = None,
                    opt: Optional[BaseOptions] = None) -> torch.Tensor:
    opt = _setup_opt(opt)
    _check_contiguity((out, 'out'))
    N = X1.size(0)
    M = X2.size(0)
    if out is None:
        out = create_fortran((N, M), X1.dtype, 'cpu', pin_memory=True)
    gpu_info = _get_gpu_info(opt, slack=0.9)
    block_sizes = calc_gpu_block_sizes(gpu_info, N)

    # If float32 we need to upcast to float64 to avoid numerical precision errors
    # in the kernel
    gpu_dtype = X1.dtype
    if sizeof_dtype(X1.dtype) < 8 and opt.no_single_kernel:
        gpu_dtype = torch.float64

    # Create the arguments passed to each subprocess
    args = []
    for i, g in enumerate(gpu_info):
        bwidth = block_sizes[i + 1] - block_sizes[i]
        if bwidth <= 0:
            continue
        args.append((ArgsFmm(X1=X1.narrow_rows(block_sizes[i], bwidth),
                             X2=X2,
                             out=out.narrow(0, block_sizes[i], bwidth),
                             kernel=kernel,
                             gpu_dtype=gpu_dtype,
                             max_mem=g.usable_ram), g.Id))
    _start_wait_processes(_sparse_fmm, args)
    torch.cuda.empty_cache()
    return out
Example #7
    def _can_store_knm(self, X, ny_points, available_ram, store_threshold=1200):
        """Decide whether it's worthwile to pre-compute the k_NM kernel.

        Notes
        -----
        If we precompute K_NM, each CG iteration costs
        Given a single kernel evaluation between two D-dimensional vectors
        costs D, at CG iteration we must perform N*M kernel evaluations.
        Other than the kernel evaluations we must perform two matrix-vector
        products 2(N*M*T) and a bunch of triangular solves.

        So if we precompute we have 2*(N*M*T), othewise we also have N*M*D
        but precomputing costs us N*M memory.
        So heuristic is the following:
         - If D is large (> `store_threshold`) check if RAM is sufficient
         - If RAM is sufficient precompute
         - Otherwise do not precompute
        """
        if self.options.never_store_kernel:
            return False
        dts = sizeof_dtype(X.dtype)
        if X.size(1) > store_threshold:
            necessary_ram = X.size(0) * ny_points.size(0) * dts
            if available_ram > necessary_ram:
                if self.options.debug:
                    print("%d*%d Kernel matrix will be stored" %
                          (X.size(0), ny_points.size(0)))
                return True
            if self.options.debug:
                print(
                    "Cannot store full kernel matrix: not enough memory (have %.2fGB, need %.2fGB)" %
                    (available_ram / 2 ** 30, necessary_ram / 2 ** 30))
            return False
        else:
            return False
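As a rough order-of-magnitude illustration of this heuristic (the numbers are made up, not taken from the library):

# N training points, M Nystroem centers, float32 storage (4 bytes per element).
N, M, D, T = 1_000_000, 10_000, 2_000, 1
necessary_ram = N * M * 4                     # ~37.3 GiB to hold K_NM
print("K_NM needs %.1f GiB" % (necessary_ram / 2**30))
# Without precomputation every CG iteration re-evaluates the kernel at a cost
# of roughly N*M*D = 2e13 multiply-adds, on top of the 2*N*M*T matrix-vector
# work, so storing K_NM trades ~37 GiB of RAM for skipping that term.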
Example #8
def test_size_of_dtype():
    assert 8 == sizeof_dtype(np.float64)
    assert 4 == sizeof_dtype(np.float32)
    with pytest.raises(TypeError):
        sizeof_dtype(np.int32)

    assert 8 == sizeof_dtype(torch.float64)
    assert 4 == sizeof_dtype(torch.float32)
    with pytest.raises(TypeError):
        sizeof_dtype(torch.int32)
Example #9
def _parallel_lauum_runner(A, write_opposite: bool, opt: LauumOptions,
                           gpu_info):
    # Choose target:
    if is_f_contig(A):
        target = par_lauum_f_lower
    elif is_contig(A):
        target = par_lauum_c_lower
    else:
        raise NotImplementedError(
            "Parallel LAUUM is only implemented for contiguous matrices")

    num_gpus = len(gpu_info)
    if num_gpus < 1:
        raise ValueError(
            "Parallel LAUUM should only be run when some GPU is available.")
    N = A.shape[0]
    dt = A.dtype
    dts = sizeof_dtype(dt)
    avail_ram = min([g.actual_free_mem for g in gpu_info]) / dts
    # Each GPU should be able to hold in memory 2 block columns
    max_block_size = int(math.floor(avail_ram / (2 * N)))
    if max_block_size < 1:
        raise RuntimeError("Cannot run parallel LAUUM with minimum "
                           "available memory of %.2fMB" %
                           (avail_ram * dts / 2**20))

    block_sizes = calc_block_sizes2(max_block_size, num_gpus, N,
                                    opt.lauum_par_blk_multiplier)
    block_allocations: List[BlockAlloc] = []
    cur_n = 0
    for bs in block_sizes:
        block_allocations.append(
            BlockAlloc(start=cur_n, end=cur_n + bs, length=bs))
        cur_n += bs

    barrier = threading.Barrier(num_gpus, timeout=1000)
    threads = []
    for g in gpu_info:
        gid_allocs = [
            i for i in range(len(block_allocations)) if i % num_gpus == g.Id
        ]
        cublas_handle = initialization.cublas_handle(g.Id)
        if cublas_handle is None:
            raise RuntimeError("CUBLAS must be initialized "
                               "on device %d before running parallel LAUUM." %
                               (g.Id))
        t = PropagatingThread(target=target,
                              name="GPU-%d" % (g.Id),
                              args=(A, block_allocations, gid_allocs, barrier,
                                    g.Id, cublas_handle, write_opposite))
        threads.append(t)

    for t in threads:
        t.start()
    for t in threads:
        t.join()
    return A
Example #10
def fmm_cuda(X1: torch.Tensor,
             X2: torch.Tensor,
             kernel: 'falkon.kernels.Kernel',
             out: Optional[torch.Tensor] = None,
             opt: Optional[BaseOptions] = None) -> torch.Tensor:
    """
    performs fnc(X1*X2', X1, X2) in blocks on multiple GPUs
    """
    opt = _setup_opt(opt)
    _check_contiguity((X1, 'X1'), (X2, 'X2'), (out, 'out'))

    N = X1.shape[0]
    M = X2.shape[0]
    device = X1.device
    if out is None:
        out = create_same_stride((N, M),
                                 X1,
                                 X1.dtype,
                                 device=device,
                                 pin_memory=False)
    gpu_info = _get_gpu_info(opt, slack=0.9)
    block_sizes = calc_gpu_block_sizes(gpu_info, N)

    # If float32 we need to upcast to float64 to avoid numerical precision errors
    # in the kernel
    gpu_dtype = X1.dtype
    if sizeof_dtype(X1.dtype) < 8 and opt.no_single_kernel:
        gpu_dtype = torch.float64

    if device.type == 'cuda':
        sync_current_stream(device)
        single_gpu_info = [g for g in gpu_info if g.Id == device.index][0]
        args = ArgsFmm(X1=X1,
                       X2=X2,
                       out=out,
                       kernel=kernel,
                       gpu_dtype=gpu_dtype,
                       max_mem=single_gpu_info.usable_ram,
                       num_streams=opt.num_fmm_streams)
        _call_direct(_generic_fmm, (args, device.index))
    else:
        # Create the arguments passed to each subprocess
        args = []
        for i, g in enumerate(gpu_info):
            bwidth = block_sizes[i + 1] - block_sizes[i]
            if bwidth <= 0:
                continue
            args.append((ArgsFmm(X1=X1.narrow(0, block_sizes[i], bwidth),
                                 X2=X2,
                                 out=out.narrow(0, block_sizes[i], bwidth),
                                 kernel=kernel,
                                 gpu_dtype=gpu_dtype,
                                 max_mem=g.usable_ram,
                                 num_streams=opt.num_fmm_streams), g.Id))
        _start_wait_processes(_generic_fmm, args)
    return out
Example #11
def generic_fmmv(proc_idx, queue, device_id):
    a: ArgsFmmv = queue.get()

    X1, X2, v, out = a.X1, a.X2, a.v, a.out
    kernel, max_mem = a.kernel, a.max_mem
    dtype = X1.dtype
    ntot, dtot = X1.size()
    M, T = v.size()

    # GPU Memory Usage:
    # ker_gpu  : n*M
    # v_gpu    : M*T
    # X1s_gpu  : n*d
    # X2s_gpu  : M*d
    # mmv_gpu  : n*T
    # ----------
    # total : n*d + n*(M+T) + d*M + M*T
    avail_mem = max_mem / sizeof_dtype(dtype)
    n, d = select_dim_over_d(maxD=dtot,
                             maxN=ntot,
                             coef_nd=1,
                             coef_n=M + T,
                             coef_d=M,
                             rest=M * T,
                             tot=avail_mem)

    ddev = torch.device('cuda:%d' % int(device_id))
    with tcd.device(ddev):
        ker_gpu = torch.empty(n, M, dtype=dtype, device=ddev)
        v_gpu = v.to(device=ddev)  # M x T
        X1s_gpu = create_same_stride((n, d), X1, dtype, ddev)
        X2s_gpu = create_same_stride((M, d), X2, dtype, ddev)
        mmv_gpu = create_same_stride((n, T), out, dtype, ddev)

        for i in range(0, ntot, n):
            ic = min(n, ntot - i)
            ddd = kernel._prepare(X1.narrow(0, i, ic), X2)
            c_g_ker = ker_gpu.narrow(0, 0, ic)
            c_g_ker.fill_(0.0)
            for k in range(0, dtot, d):
                kc = min(d, dtot - k)
                c_g_X1s = copy_to_device_noorder(ic, kc, X1, i, k, X1s_gpu, 0,
                                                 0)
                c_g_X2s = copy_to_device_noorder(M, kc, X2, 0, k, X2s_gpu, 0,
                                                 0)
                kernel._apply(c_g_X1s, c_g_X2s.T, c_g_ker)
            kernel._finalize(c_g_ker, ddd)
            # Multiply by the vector v
            c_g_mmv = mmv_gpu[:ic, :]
            torch.mm(c_g_ker, v_gpu, out=c_g_mmv)  # n x T
            # Copy back to host
            copy_to_host_noorder(ic, T, c_g_mmv, 0, 0, out, i, 0)
    return out
Example #12
    def test_ooc_mem(self, pd_data, dtype, order, upper, clean, overwrite):
        # 1600 is the minimum memory the fn seems to use (even for the 4x4 data)
        max_mem = max(pd_data.shape[0] * sizeof_dtype(dtype) * 1000, 1600)
        opt = dataclasses.replace(self.basic_options, max_gpu_mem=max_mem)
        run_potrf_test(pd_data,
                       dtype=dtype,
                       order=order,
                       upper=upper,
                       clean=clean,
                       overwrite=overwrite,
                       start_cuda=False,
                       opt=opt)
Example #13
def can_do_ic(A: torch.Tensor, device: DeviceInfo):
    # noinspection PyUnresolvedReferences
    avail_ram = device.actual_free_mem
    # The multiplier here is a bit tricky since setting it too high results
    # in hard-to-debug cuda errors
    avail_ram *= 0.85

    if A.is_cuda:
        needed_ram = 100 * 8  # Not very much indeed
    else:
        needed_ram = A.shape[0] * A.shape[1] * sizeof_dtype(A.dtype)

    return avail_ram >= needed_ram
Example #14
def fmm_cpu(
        X1: torch.Tensor,
        X2: torch.Tensor,
        kernel: 'falkon.kernels.Kernel',
        out: Optional[torch.Tensor],
        opt: BaseOptions) -> torch.Tensor:
    """Compute kernel value on matrices X1 and X2: ``out = kernel(X1, X2)``

    Parameters
    -----------
    X1
        [N, D] array
    X2
        [M, D] array
    kernel
        Class representing the desired kernel function
    out
        Array for storing the kernel output. If None, will be allocated within the function.
    opt
        Basic options dictionary, used for determining available memory. Additionally, the
        :attr:`~falkon.options.FalkonOptions.no_single_kernel` option is used to determine the
        accumulator data type.

    Returns
    --------
    out
        [N, M] array. The kernel between X1 and X2.
    """
    opt = _setup_opt(opt, is_cpu=True)
    ntot, dtot = X1.size()
    mtot = X2.size(0)

    if out is None:
        out = torch.empty(ntot, mtot, dtype=X1.dtype)

    if sizeof_dtype(X1.dtype) < 8 and opt.no_single_kernel:
        avail_mem = _get_cpu_ram(opt, 0.9)
        if avail_mem <= 0:
            raise MemoryError("Memory insufficient for kernel evaluation.")

        blockwise_fmm_cpu(X1, X2, kernel, out, avail_mem)
    else:
        # Do the kernel computation on the spot
        out.fill_(0.0)
        ddd = kernel._prepare(X1, X2)
        kernel._apply(X1, X2.T, out)
        kernel._finalize(out, ddd)

    return out
Example #15
def fmm_cpu(X1: torch.Tensor, X2: torch.Tensor,
            kernel: 'falkon.kernels.Kernel', out: Optional[torch.Tensor],
            opt: BaseOptions) -> torch.Tensor:
    """Compute kernel value on matrices X1 and X2: `out = kernel(X1, X2)`

    Parameters
    -----------
    X1 : [N, D] array
    X2 : [M, D] array
    kernel : Kernel
        Class representing the desired kernel function
    out : Optional([N, M] array)
        Array for storing the kernel output. If None, will be allocated within the function.
    opt : BaseOptions
        Basic options dictionary, used for determining available memory. Additionally, the
        `no_single_kernel` option is used to determine the accumulator data type.

    Returns
    --------
    out : [N, M] array
        The kernel between X1 and X2.
    """
    opt = _setup_opt(opt, is_cpu=True)
    ntot, dtot = X1.size()
    mtot = X2.size(0)

    if out is None:
        out = torch.empty(ntot, mtot, dtype=X1.dtype)

    if sizeof_dtype(X1.dtype) < 8 and opt.no_single_kernel:
        avail_mem = _get_cpu_ram(opt, 0.9)
        if avail_mem <= 0:
            raise MemoryError("Memory insufficient for kernel evaluation.")

        blockwise_fmm_cpu(X1, X2, kernel, out, avail_mem)
    else:
        # Do the kernel computation on the spot
        out.fill_(0.0)
        ddd = kernel._prepare(X1, X2)
        kernel._apply(X1, X2.T, out)
        kernel._finalize(out, ddd)

    return out
Example #16
def fmmv_cpu_sparse(X1: SparseTensor, X2: SparseTensor, v: torch.Tensor,
                    kernel: 'falkon.kernels.Kernel',
                    out: Optional[torch.Tensor], opt: BaseOptions):
    opt = _setup_opt(opt, is_cpu=True)

    dtype = X1.dtype
    ntot, dtot = X1.size()
    mtot, T = v.size()

    # Create output matrix
    if out is None:
        out = torch.empty(ntot, T, dtype=dtype)
    out.fill_(0.0)

    avail_mem = _get_cpu_ram(opt, 0.95) / sizeof_dtype(dtype)
    # Narrowing X1, X2: n + m
    # Prepare - not computable, depends on kernel
    # ker_chunk : n*m
    # finalize : 0 (if can be implemented in place, kernel-dependent)
    n, m = select_dim_over_m(maxM=mtot,
                             maxN=ntot,
                             coef_nm=1,
                             coef_n=1,
                             coef_m=1,
                             tot=avail_mem)

    ker_chunk = create_same_stride((n, m), out, dtype, device='cpu')
    for i in range(0, ntot, n):
        ic = min(n, ntot - i)
        cur_out = out[i:i + ic, :]
        X1_chunk = X1.narrow_rows(i, ic)
        for j in range(0, mtot, m):
            jc = min(m, mtot - j)
            X2_chunk = X2.narrow_rows(j, jc)
            cur_ker_chunk = ker_chunk[:ic, :jc]
            cur_ker_chunk.fill_(0.0)

            ddd = kernel._prepare_sparse(X1_chunk, X2_chunk)
            kernel._apply_sparse(X1_chunk, X2_chunk.transpose_csc(),
                                 cur_ker_chunk)
            kernel._finalize(cur_ker_chunk, ddd)

            # Multiply by the vector v
            cur_out.addmm_(cur_ker_chunk, v.narrow(0, j, jc))
    return out
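The nested loops above implement plain block matrix multiplication: each output row block accumulates a contribution from every column block. A dense torch sketch of the same accumulation pattern (illustrative only, no sparse kernels involved):

import torch

torch.manual_seed(0)
K = torch.randn(300, 200, dtype=torch.float64)   # stands in for kernel(X1, X2)
v = torch.randn(200, 4, dtype=torch.float64)

out = torch.zeros(300, 4, dtype=torch.float64)
n, m = 64, 50                                    # row- and column-block sizes
for i in range(0, 300, n):
    ic = min(n, 300 - i)
    for j in range(0, 200, m):
        jc = min(m, 200 - j)
        # out[i:i+ic] += K[i:i+ic, j:j+jc] @ v[j:j+jc]
        out[i:i + ic].addmm_(K[i:i + ic, j:j + jc], v.narrow(0, j, jc))

assert torch.allclose(out, K @ v)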
Example #17
    def test_diff_blk_sizes(self, dtype, order, get_mat, device):
        omat = get_mat(order=order, dtype=dtype)
        mat = get_mat(order=order, dtype=dtype, device=device)

        # For cuda inputs we must add to available GPU memory the amount used by the
        # input matrix, since overwrite=False and a full copy must be performed.
        mgpu_slack = 0
        if device.startswith("cuda"):
            mgpu_slack = mat.shape[0]**2 * sizeof_dtype(mat.dtype)

        opt_v1 = dataclasses.replace(self.basic_opt, max_gpu_mem=2*2**20 + mgpu_slack)
        act_up_v1 = gpu_lauum(mat, upper=True, overwrite=False, opt=opt_v1)
        opt_v2 = dataclasses.replace(self.basic_opt, max_gpu_mem=4*2**20 + mgpu_slack)
        act_up_v2 = gpu_lauum(mat, upper=True, overwrite=False, opt=opt_v2)
        opt_v3 = dataclasses.replace(self.basic_opt, max_gpu_mem=6*2**20 + mgpu_slack)
        act_up_v3 = gpu_lauum(mat, upper=True, overwrite=False, opt=opt_v3)

        np.testing.assert_allclose(act_up_v3.cpu().numpy(), act_up_v1.cpu().numpy(), rtol=self.rtol[dtype])
        np.testing.assert_allclose(act_up_v3.cpu().numpy(), act_up_v2.cpu().numpy(), rtol=self.rtol[dtype])
Example #18
    def test_ic_mem(self, pd_data, dtype, order, upper, clean, overwrite,
                    start_cuda):
        if start_cuda:
            max_mem = 2000
        else:
            # 1600 is needed!
            max_mem = max(
                1600, pd_data.shape[0] * pd_data.shape[1] *
                sizeof_dtype(dtype) * 1.5)
        opt = dataclasses.replace(self.basic_options, max_gpu_mem=max_mem)

        run_potrf_test(pd_data,
                       dtype=dtype,
                       order=order,
                       upper=upper,
                       clean=clean,
                       overwrite=overwrite,
                       start_cuda=start_cuda,
                       opt=opt)
Example #19
def _single_gpu_method(proc_idx, queue, device_id):
    a: ArgsFmmv = queue.get()
    backend = a.backend
    X1 = a.X1
    X2 = a.X2
    v = a.v
    oout = a.out
    other_vars = a.other_vars
    fn = a.function
    R = a.gpu_ram

    N, D = X1.shape
    M = X2.shape[0]
    T = v.shape[1]

    # Second round of subdivision (only if necessary due to RAM constraints)
    n, m = _estimate_split(N, M, D, T, R, sizeof_dtype(X1.dtype))

    # Process the two rounds of splitting with a nested loop.
    for mi in range(0, M, m):
        ml = min(m, M - mi)
        if ml != M and mi > 0:  # Then we must create a temporary output array
            out = torch.empty_like(oout)
        else:
            out = oout

        cX2 = X2[mi:mi + ml, :]
        cv = v[mi:mi + ml, :]

        for ni in range(0, N, n):
            nl = min(n, N - ni)
            cX1 = X1[ni:ni + nl, :]
            cout = out[ni: ni + nl, :]

            variables = [cX1, cX2, cv] + other_vars
            fn(*variables, out=cout, device_id=device_id, backend=backend)
        torch.cuda.synchronize(device_id)
        if ml != M and mi > 0:
            oout.add_(out)

    return oout
Example #20
    def test_write_opposite(self, dtype, order, get_mat, expected_lower, expected_upper, device):
        omat = get_mat(order=order, dtype=dtype)
        mat = get_mat(order=order, dtype=dtype, device=device)

        mgpu_slack = mat.shape[0]**2 * sizeof_dtype(mat.dtype)
        with memory_checker(self.basic_opt, extra_mem=mgpu_slack) as new_opt:
            act_up = gpu_lauum(mat, upper=True, overwrite=False, write_opposite=True, opt=new_opt)
        act_up = act_up.cpu()
        np.testing.assert_allclose(np.triu(omat, k=1), np.triu(act_up.numpy(), k=1),
                                   rtol=self.rtol[dtype])
        np.testing.assert_allclose(np.tril(act_up.numpy()), np.triu(expected_upper).T,
                                   rtol=self.rtol[dtype])

        mat = get_mat(order=order, dtype=dtype, device=device)
        with memory_checker(self.basic_opt) as new_opt:
            act_lo = gpu_lauum(mat, upper=False, overwrite=True, write_opposite=True, opt=new_opt)
            torch.cuda.synchronize()
        act_lo = act_lo.cpu()
        np.testing.assert_allclose(np.tril(omat, k=-1), np.tril(act_lo.numpy(), k=-1),
                                   rtol=self.rtol[dtype])
        np.testing.assert_allclose(np.triu(act_lo.numpy()), np.tril(expected_lower).T,
                                   rtol=self.rtol[dtype])
Example #21
def distk_fmmv(proc_idx, queue, device_id):
    a: ArgsFmmv = queue.get()
    X1, X2, v, out = a.X1, a.X2, a.v, a.out
    kernel: L2DistanceKernel = a.kernel
    max_mem = a.max_mem

    N, D = X1.shape
    M = X2.shape[0]
    T = v.shape[1]
    dtype = X1.dtype

    # GPU memory usage:
    # X1s : n x D
    # X2s : m x D
    # vs  : m x T
    # nm  : n x m
    # out : n x T
    # -----------
    # total: n*m + n * (D + T) + m * (D + T) = R
    avail_mem = max_mem / sizeof_dtype(dtype)
    #if sizeof_dtype(dtype) == 4:
    #    avail_mem /= 2
    n, m = select_dim_over_m(maxM=M,
                             maxN=N,
                             coef_nm=1.0,
                             coef_n=D + T,
                             coef_m=D + T,
                             tot=avail_mem)

    ddev = torch.device('cuda:%d' % int(device_id))
    with tcd.device(ddev):
        nm_gpu = create_same_stride((n, m), X1, dtype, ddev)
        out_gpu = create_same_stride((n, T), out, dtype, ddev)
        X1s_gpu = create_same_stride((n, D), X1, dtype, ddev)
        X2s_gpu = create_same_stride((m, D), X2, dtype, ddev)
        vs_gpu = create_same_stride((m, T), v, dtype, ddev)

        for i in range(0, N, n):
            nb = min(n, N - i)
            cur_X1s_gpu = copy_to_device_noorder(nb, D, X1, i, 0, X1s_gpu, 0,
                                                 0)
            sq1 = torch.norm(cur_X1s_gpu, p=2, dim=1, keepdim=True).pow_(2)
            cur_out_gpu = out_gpu.narrow(0, 0, nb)  # n x T
            cur_out_gpu.fill_(0.0)

            for j in range(0, M, m):
                mb = min(m, M - j)
                cur_X2s_gpu = copy_to_device_noorder(mb, D, X2, j, 0, X2s_gpu,
                                                     0, 0)
                cur_vs_gpu = copy_to_device_noorder(mb, T, v, j, 0, vs_gpu, 0,
                                                    0)  # m x T
                cur_nm_gpu = nm_gpu[:nb, :mb]  # n x m

                sq2 = torch.norm(cur_X2s_gpu, p=2, dim=1, keepdim=True).pow_(2)
                torch.mm(cur_X1s_gpu, cur_X2s_gpu.T, out=cur_nm_gpu)

                cur_nm_gpu.mul_(-2.0)
                cur_nm_gpu.add_(sq1)
                cur_nm_gpu.add_(sq2.T)
                cur_nm_gpu.clamp_min_(0)
                kernel._transform(cur_nm_gpu)

                # Multiply by the vector v
                # FIXME: This is the cause of mapping errors in case of float32 calculations.
                cur_out_gpu.addmm_(cur_nm_gpu, cur_vs_gpu)  # n x T
            # send result to CPU
            copy_to_host_noorder(nb, T, out_gpu, 0, 0, out, i, 0)

    return out
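The inner block above builds squared distances through the expansion ||x - y||^2 = ||x||^2 - 2*x.y + ||y||^2, clamping the small negatives that cancellation can produce. A minimal dense check of that expansion (using torch.cdist as the reference):

import torch

torch.manual_seed(0)
A = torch.randn(100, 16, dtype=torch.float64)
B = torch.randn(80, 16, dtype=torch.float64)

sq1 = A.norm(p=2, dim=1, keepdim=True).pow(2)    # ||a_i||^2 as a column vector
sq2 = B.norm(p=2, dim=1, keepdim=True).pow(2)    # ||b_j||^2, row vector after .T
d2 = (A @ B.T).mul(-2.0).add(sq1).add(sq2.T).clamp_min(0)

assert torch.allclose(d2, torch.cdist(A, B).pow(2))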
Example #22
def fdmmv_cpu(X1, X2, v, w, kernel, out, opt):
    """Calculate a double kernel-vector product.

    This function computes the following quantity: ``kernel(X1, X2).T @ (kernel(X1, X2) @ v + w)``,
    where one of `v` or `w` may be None.
    All arrays passed to this function must be 2-dimensional, although
    the second dimension can be unitary.

    The expression is not computed directly. We separate the computation
    into smaller blocks so as to reduce the total memory consumption (the
    large N*M kernel matrix is never wholly stored in RAM.)

    Parameters
    -----------
    X1
        [N, D] array
    X2
        [M, D] array
    v : torch.Tensor or None
        [M, T] array. But note that at least one of v or w must be specified.
    w : torch.Tensor or None
        [N, T] array. But note that at least one of v or w must be specified.
    kernel
        Class representing the desired kernel function
    out : torch.Tensor or None
        [M, T] array for storing the kernel-vector product output.
        If None, will be allocated within the function.
    opt
        Basic options dictionary, used for determining available memory.
    """
    opt = _setup_opt(opt, is_cpu=True)

    # Parameter validation
    if v is None and w is None:
        raise ValueError("One of v and w must be specified to run fMMV.")
    T = v.shape[1] if v is not None else w.shape[1]
    ntot, dtot = X1.size()
    M = X2.size(0)
    dtype = X1.dtype

    # Create output matrix
    if out is None:
        out = torch.empty(M, T, dtype=dtype)
    out.fill_(0)

    avail_mem = _get_cpu_ram(opt, 0.95) / sizeof_dtype(dtype)
    # The only necessary temporary matrices are `temp_out` of size n*M and
    # `w_blk` of size n*T
    n, d = select_dim_over_d(maxD=dtot,
                             maxN=ntot,
                             coef_nd=0,
                             coef_n=M + T,
                             coef_d=0,
                             rest=0,
                             tot=avail_mem)

    # Run Batched Matrix Computation
    for i in range(0, ntot, n):
        ic = min(n, ntot - i)

        ddd = kernel._prepare(X1[i:i + ic, :], X2)
        temp_out = torch.zeros(ic, M, dtype=dtype)
        for k in range(0, dtot, d):
            kc = min(d, dtot - k)
            X1d = X1[i:i + ic, k:k + kc]
            X2d = X2[:, k:k + kc]
            kernel._apply(X1d, X2d.T, temp_out)
        kernel._finalize(temp_out, ddd)  # fnc(X1*X2', X1, X2) [n x M]

        w_blk = torch.zeros(ic, T, dtype=dtype)  # n x T
        if w is not None:
            w_blk.copy_(w[i:i + ic, :])
        if v is not None:
            # w_blk + c_out * v => (n x T) + (n x M)*(M x T)
            w_blk.addmm_(temp_out, v)

        out.add_(torch.mm(temp_out.T, w_blk))
    return out
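A small dense check (K is a stand-in random matrix, not a falkon kernel) that accumulating kernel(X1, X2).T @ (kernel(X1, X2) @ v + w) over row blocks, exactly as in the loop above, matches the direct computation:

import torch

torch.manual_seed(0)
K = torch.randn(400, 50, dtype=torch.float64)    # stands in for kernel(X1, X2)
v = torch.randn(50, 3, dtype=torch.float64)
w = torch.randn(400, 3, dtype=torch.float64)

direct = K.T @ (K @ v + w)

blocked = torch.zeros(50, 3, dtype=torch.float64)
n = 128                                          # row-block size
for i in range(0, 400, n):
    ic = min(n, 400 - i)
    Ki = K[i:i + ic]
    w_blk = w[i:i + ic] + Ki @ v                 # (n x T) + (n x M) @ (M x T)
    blocked.add_(Ki.T @ w_blk)                   # accumulate the M x T output

assert torch.allclose(direct, blocked)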
Example #23
def _sparse_fmm(proc_idx, queue, device_id):
    a: ArgsFmm = queue.get()
    X1: SparseTensor = a.X1
    X2: SparseTensor = a.X2
    out = a.out
    kernel, gpu_dtype = a.kernel, a.gpu_dtype
    max_mem = a.max_mem

    ntot, dtot = X1.shape
    mtot = X2.size(0)

    avail_mem = max_mem / sizeof_dtype(gpu_dtype)
    # Memory usage:
    # X1_chunk : ntot + 2 * D * ntot * density
    # X2_chunk : dtot + 2 * D * mtot * density (because it is transposed)
    # sparse_out : ntot + 2 * ntot * mtot * density (assume density=1 here)
    # ker_gpu  : mtot * ntot
    n, m = select_dim_over_nm_v2(max_n=ntot,
                                 max_m=mtot,
                                 coef_nm=3,
                                 coef_n=2 + 2 * dtot * X1.density,
                                 coef_m=2 * dtot * X2.density,
                                 rest=dtot,
                                 max_mem=avail_mem)

    tc_device = torch.device('cuda:%d' % (int(device_id)))
    with torch.cuda.device(tc_device):
        # Initialize GPU buffers
        g_out = create_same_stride((n, m), out, gpu_dtype, tc_device)
        cpu_buf = None
        if X1.dtype != gpu_dtype:
            cpu_buf = create_same_stride((n, m),
                                         out,
                                         gpu_dtype,
                                         'cpu',
                                         pin_memory=True)

        for j in range(0, mtot, m):
            jc = min(m, mtot - j)

            X2_chunk = X2.narrow_rows(j, jc).to(dtype=gpu_dtype)
            X2_chunk_d = SparseTensor.from_scipy(
                X2_chunk.transpose_csc().to_scipy().tocsr(copy=False)) \
                .index_to_int() \
                .to(device=tc_device)
            for i in range(0, ntot, n):
                ic = min(n, ntot - i)

                X1_chunk = X1.narrow_rows(i, ic).to(dtype=gpu_dtype)
                X1_chunk_d = X1_chunk.index_to_int().to(device=tc_device)
                cur_g_out = g_out.narrow(0, 0, ic).narrow(1, 0, jc)
                cur_g_out.fill_(0.0)

                ddd = kernel._prepare_sparse(X1_chunk, X2_chunk)
                cur_g_out = kernel._apply_sparse(X1_chunk_d, X2_chunk_d,
                                                 cur_g_out)
                cur_g_out = kernel._finalize(cur_g_out, ddd)
                copy_to_host_noorder(ic, jc, cur_g_out, 0, 0, out, i, j,
                                     cpu_buf)
                del ddd, X1_chunk_d, X1_chunk
            del X2_chunk, X2_chunk_d
        del g_out
    return out
Example #24
def _generic_fmm(proc_idx, queue, device_id):
    # Unpack the function arguments
    a: ArgsFmm = queue.get()
    X1: torch.Tensor = a.X1
    X2: torch.Tensor = a.X2
    cuda_inputs = X1.is_cuda
    out = a.out
    kernel, gpu_dtype = a.kernel, a.gpu_dtype
    max_mem = a.max_mem
    num_streams = a.num_streams

    # flags and local variables
    change_dtype = gpu_dtype != X1.dtype
    X1_equal_X2 = _gpu_tns_same_memory(X1, X2)
    use_gpu_bufs = change_dtype or not cuda_inputs
    stride = "F" if is_f_contig(out, strict=True) else "C"
    j_iter = 0
    dts = sizeof_dtype(gpu_dtype)
    tc_device = torch.device('cuda:%d' % (int(device_id)))
    avail_mem = max_mem / dts

    # Choose block sizes n, m such that we won't run out of GPU memory
    ntot, d = X1.shape
    mtot = X2.shape[0]
    extra_mem = kernel.extra_mem()
    if cuda_inputs and not change_dtype:
        # No allocation will be performed by us. Only in-kernel stuff.
        n, m = select_dim_over_nm(max_n=ntot,
                                  max_m=mtot,
                                  d=d,
                                  coef_nd=extra_mem.get('nd', 0),
                                  coef_md=extra_mem.get('md', 0),
                                  coef_nm=extra_mem.get('nm', 0),
                                  coef_n=extra_mem.get('n', 0),
                                  coef_m=extra_mem.get('m', 0),
                                  rest=extra_mem.get('d', 0),
                                  max_mem=avail_mem)
    else:
        n, m = select_dim_over_nm(
            max_n=ntot,
            max_m=mtot,
            d=d,
            coef_nd=num_streams * (extra_mem.get('nd', 0) + 1),
            coef_md=num_streams * (extra_mem.get('md', 0) + 1),
            coef_nm=num_streams * (extra_mem.get('nm', 0) + 1),
            coef_n=extra_mem.get('n', 0),
            coef_m=extra_mem.get('m', 0),
            rest=extra_mem.get('d', 0),
            max_mem=avail_mem)

    # Create streams
    streams = [tcd.Stream(device=tc_device) for _ in range(num_streams)]

    # Create buffers
    if use_gpu_bufs:
        gX1 = create_same_stride((n, d), X1, gpu_dtype, tc_device)
        gX2_list = [
            create_same_stride((m, d), X2, gpu_dtype, tc_device)
            for _ in range(num_streams)
        ]
        gout_list = [
            create_same_stride((n, m), out, gpu_dtype, tc_device)
            for _ in range(num_streams)
        ]
    if not cuda_inputs:
        cpu_buf_list = [
            create_same_stride((n, m), out, gpu_dtype, 'cpu', pin_memory=True)
            for _ in range(num_streams)
        ]

    # Define helpers for the copy-back operations (from cpu_buf to output)
    copy_ops = [None] * num_streams

    def wrap_copy_op(stream_idx):
        if copy_ops[stream_idx] is not None:
            copy_ops[stream_idx]()
            copy_ops[stream_idx] = None

    def do_copy_op(output, buf, i_, ic_, j_, jc_):
        # This function will also do the type conversion
        output[i_:i_ + ic_, j_:j_ + jc_].copy_(buf[:ic_, :jc_])

    # Kernel computation begin
    with tcd.device(tc_device):
        for i in range(0, ntot, n):
            ic = min(n, ntot - i)

            with tcd.stream(streams[j_iter % len(streams)]):
                X1_chunk = X1.narrow(0, i, ic)
                if use_gpu_bufs:
                    cur_gX1 = gX1.narrow(0, 0, ic)
                    cur_gX1.copy_(X1_chunk, non_blocking=True)
                else:
                    cur_gX1 = X1_chunk

            for j in range(0, mtot, m):
                jc = min(m, mtot - j)
                # Choose the buffers for this inner iteration
                stream_id = j_iter % len(streams)
                stream = streams[stream_id]
                if use_gpu_bufs:
                    gX2 = gX2_list[stream_id]
                    gout = gout_list[stream_id]
                if not cuda_inputs:
                    cpu_buf = cpu_buf_list[stream_id]

                # Sync for buffers we must use now (e.g. 2 previous iters)
                with tcd.stream(stream):  # Inner-loop
                    stream.synchronize()
                    wrap_copy_op(stream_id)

                    if X1_equal_X2 and j < i:  # Shortcut for symmetric kernels
                        jc = min(m, mtot - j)
                        out[i:i + ic, j:j + jc].copy_(out[j:j + jc,
                                                          i:i + ic].T,
                                                      non_blocking=True)
                        j_iter += 1
                        continue

                    # Copy (CPU->GPU)
                    X2_chunk = X2.narrow(0, j, jc)
                    if use_gpu_bufs:
                        cur_gX2 = gX2.narrow(0, 0, jc)
                        cur_gX2.copy_(X2_chunk, non_blocking=True)
                    else:
                        cur_gX2 = X2_chunk

                    if use_gpu_bufs:
                        cur_gout = gout[:ic, :jc]
                    else:
                        cur_gout = out[i:i + ic, j:j + jc]
                    cur_gout.fill_(0.0)

                    # Compute
                    ddd = kernel._prepare(cur_gX1, cur_gX2)
                    kernel._apply(cur_gX1, cur_gX2.T, cur_gout)
                    cur_gout = kernel._finalize(cur_gout, ddd)

                    # Copy Back (GPU->CPU)
                    if not cuda_inputs:
                        # copy_ does not care about the contiguity of copies, as long as it's
                        # consistent. However, for C-contiguous inputs it creates an undesired
                        # intermediate array, so we use cuda_memcpy2d_async which works well
                        # with C-contiguous arrays.
                        if stride == "F":
                            copy_to_host(ic,
                                         jc,
                                         cur_gout,
                                         0,
                                         0,
                                         cpu_buf,
                                         0,
                                         0,
                                         s=stream)
                        else:
                            cuda_memcpy2d_async(dst=cpu_buf.data_ptr(),
                                                dpitch=cpu_buf.stride(0) * dts,
                                                src=cur_gout.data_ptr(),
                                                spitch=cur_gout.stride(0) *
                                                dts,
                                                width=jc * dts,
                                                height=ic,
                                                stream=stream._as_parameter_)
                        copy_ops[stream_id] = partial(do_copy_op, out, cpu_buf,
                                                      i, ic, j, jc)
                    elif change_dtype:
                        out.narrow(0, i,
                                   ic).narrow(1, j,
                                              jc).copy_(cur_gout,
                                                        non_blocking=True)
                j_iter += 1

            for i in range(num_streams):
                streams[i].synchronize()
                wrap_copy_op(i)

    return out
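The `X1_equal_X2 and j < i` branch above relies on the symmetry of kernel(X, X): the (i, j) block of the output equals the transpose of the (j, i) block, so below-diagonal blocks can be filled by copying already computed ones. A tiny sketch of that property on a symmetric kernel matrix:

import torch

torch.manual_seed(0)
X = torch.randn(200, 5, dtype=torch.float64)
K = torch.exp(-torch.cdist(X, X).pow(2))          # any symmetric kernel matrix

n = 64                                            # block size
for i in range(0, 200, n):
    ic = min(n, 200 - i)
    for j in range(0, i, n):                      # only blocks with j < i
        jc = min(n, 200 - j)
        # The (i, j) block is the transpose of the already-computed (j, i) block.
        assert torch.allclose(K[i:i + ic, j:j + jc], K[j:j + jc, i:i + ic].T)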
Example #25
def par_lauum_c_lower(A: torch.Tensor, block_allocs: List[BlockAlloc],
                      my_rows: List[int], barrier: threading.Barrier,
                      device_id: int, cublas_handle, independent_output: bool):
    N = A.shape[0]
    dts = sizeof_dtype(A.dtype)

    lauum_fn = choose_fn(A.dtype, scll.dlauum, scll.slauum, "Lapack LAUUM")
    trmm_fn = choose_fn(A.dtype, cublasDtrmm, cublasStrmm, "cuBlas TRMM")
    gemm_fn = choose_fn(A.dtype, cublasDgemm, cublasSgemm, "cuBlas GEMM")
    syrk_fn = choose_fn(A.dtype, cublasDsyrk, cublasSsyrk, "cuBlas SYRK")

    tc_device = torch.device('cuda:%d' % (device_id))
    s1 = torch.cuda.Stream(device=tc_device)
    s2 = torch.cuda.Stream(device=tc_device)
    s1_cuda, s2_cuda = s1._as_parameter_, s2._as_parameter_
    cublasSetStream(cublas_handle, s1_cuda)

    max_block_size = max(ba.length for ba in block_allocs)
    my_rows = sorted(my_rows)

    with torch.cuda.device(tc_device), torch.cuda.stream(s1):
        # Preallocate 2 block-columns. The single block is a CPU buffer
        whole_col_b = create_fortran((A.shape[0] * max_block_size, ), A.dtype,
                                     tc_device)
        whole_col_r = create_fortran((A.shape[0] * max_block_size, ), A.dtype,
                                     tc_device)
        temp_bb = create_fortran((max_block_size, max_block_size),
                                 A.dtype,
                                 'cpu',
                                 pin_memory=True).T
        for b in range(len(block_allocs)):
            bb = block_allocs[b]
            # Load col b.
            # Instead of loading the whole column only load the last rows
            # as necessary by inspecting the minimum value in my_rows which is >= b.
            try:
                min_row = min([r for r in my_rows if r >= b])
                b_start = block_allocs[min_row].start
                cuda_memcpy2d_async(dst=whole_col_b.data_ptr(),
                                    dpitch=max_block_size * dts,
                                    src=A[b_start, bb.start].data_ptr(),
                                    spitch=A.shape[1] * dts,
                                    width=bb.length * dts,
                                    height=N - b_start,
                                    stream=s1_cuda)
            except ValueError:
                # all of `my_rows` are smaller than `b`.
                pass
            if not independent_output:
                barrier.wait()

            for r in my_rows:
                if r < b:
                    continue
                if r == b:
                    is_last_row = b_start + bb.length == N
                    # SYRK on g_b[bb.length:, :] with output replacing g_b[:bb.length, :]
                    # C = beta*C + alpha * op(A) @ op(A).T
                    if not is_last_row:
                        syrk_fn(cublas_handle,
                                uplo='U',
                                trans='N',
                                n=bb.length,
                                k=N - b_start - bb.length,
                                alpha=1.0,
                                A=whole_col_b[bb.length *
                                              max_block_size:].data_ptr(),
                                lda=max_block_size,
                                beta=0.0,
                                C=whole_col_b.data_ptr(),
                                ldc=max_block_size)
                    # Run LAUUM on CPU on Abb.T (transpose because LAPACK works in F-order)
                    # Result will be on upper(uu). So if we copy back to lower(A), we must copy
                    # back uu.T -- otherwise we should copy back uu directly.
                    Abb = A[bb.start:bb.end, bb.start:bb.end]
                    if independent_output:
                        Abb_np = Abb.T.numpy().copy(order="F")  # U\L
                        copy_triang(Abb_np, upper=True)  # L\L
                        uu, info = lauum_fn(Abb_np, lower=1,
                                            overwrite_c=True)  # LAU\L
                        Abb.copy_(torch.from_numpy(uu.T))  # L \ LAU
                    else:
                        uu, info = lauum_fn(Abb.T.numpy(),
                                            lower=0,
                                            overwrite_c=False)
                        # Zeroing must happen if the SYRK output is to be added: otherwise the
                        # non-processed part of Abb (i.e. upper(Abb) if not independent_output)
                        # will be multiplied by 2.
                        if not is_last_row:
                            zero_triang(uu, upper=False)
                        Abb.copy_(torch.from_numpy(uu.T))

                    if not is_last_row:
                        cuda_memcpy2d_async(dst=temp_bb.data_ptr(),
                                            dpitch=max_block_size * dts,
                                            src=whole_col_b.data_ptr(),
                                            spitch=max_block_size * dts,
                                            width=bb.length * dts,
                                            height=bb.length,
                                            stream=s1_cuda)
                        # TODO: Check whether this fails when the synchronize is removed.
                        s1.synchronize()
                        if independent_output:
                            Abb.add_(
                                torch.triu(temp_bb[:bb.length, :bb.length].T))
                        else:
                            Abb.add_(temp_bb[:bb.length, :bb.length])
                else:  # r > b
                    br = block_allocs[r]

                    # Load column r. Since r > b this column will be shorter than column b
                    cuda_memcpy2d_async(dst=whole_col_r.data_ptr(),
                                        dpitch=max_block_size * dts,
                                        src=A[br.start, br.start].data_ptr(),
                                        spitch=A.shape[1] * dts,
                                        width=br.length * dts,
                                        height=N - br.start,
                                        stream=s1_cuda)
                    #s1.synchronize()
                    # Restrict column b to only the last 'r' rows
                    ccb = whole_col_b[(br.start - b_start) * max_block_size:]

                    # TRMM on g_r[0:br.length, :] which is triangular (r*r)
                    #         and cur_g_b[0:br.length, :]
                    #         output is a r*b matrix and should be stored in a separate g_out block
                    # Could store output in the first rows of g_b
                    # C = alpha * op(A) @ B -- A triangular
                    trmm_fn(handle=cublas_handle,
                            side='R',
                            uplo='U',
                            trans='T',
                            diag='N',
                            m=bb.length,
                            n=br.length,
                            alpha=1.0,
                            A=whole_col_r.data_ptr(),
                            lda=max_block_size,
                            B=ccb.data_ptr(),
                            ldb=max_block_size,
                            C=ccb.data_ptr(),
                            ldc=max_block_size)

                    # GEMM on g_r[br.length:, :].T and cur_g_b[bb.length:, :]
                    #         output  is the same r*b matrix as before, outputs need to be summed.
                    # C = alpha * op(A) @ op(B) + beta * C
                    if br.end < N:
                        gemm_fn(handle=cublas_handle,
                                transa='N',
                                transb='T',
                                m=bb.length,
                                n=br.length,
                                k=N - br.start - br.length,
                                alpha=1.0,
                                A=ccb[br.length * max_block_size:].data_ptr(),
                                lda=max_block_size,
                                B=whole_col_r[br.length *
                                              max_block_size:].data_ptr(),
                                ldb=max_block_size,
                                beta=1.0,
                                C=ccb.data_ptr(),
                                ldc=max_block_size)

                    # Copy back to A[r, b]
                    if independent_output:
                        # Copy must be transposed, copy to temp_bb first.
                        cublasGetMatrixAsync(rows=bb.length,
                                             cols=br.length,
                                             elem_size=dts,
                                             A=ccb.data_ptr(),
                                             lda=max_block_size,
                                             B=temp_bb.data_ptr(),
                                             ldb=max_block_size,
                                             stream=s1_cuda)
                        s1.synchronize()
                        A[bb.start:bb.end, br.start:br.end].copy_(
                            temp_bb[:br.length, :bb.length].T)
                    else:
                        s1.synchronize()
                        cublasGetMatrixAsync(rows=bb.length,
                                             cols=br.length,
                                             elem_size=dts,
                                             A=ccb.data_ptr(),
                                             lda=max_block_size,
                                             B=A[br.start,
                                                 bb.start].data_ptr(),
                                             ldb=A.shape[0],
                                             stream=s2_cuda)
            s2.synchronize()
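For reference, LAUUM overwrites a triangular factor with the product U @ U.T (upper case) or L.T @ L (lower case). A small check of the CPU primitive used above (scipy's dlauum, as in `scll.dlauum`), assuming standard LAPACK semantics:

import numpy as np
from scipy.linalg import lapack

rng = np.random.default_rng(0)
L = np.asfortranarray(np.tril(rng.standard_normal((6, 6))))

# lower=1: the lower triangle of the result holds lower(L.T @ L).
uu, info = lapack.dlauum(L, lower=1)
assert info == 0
np.testing.assert_allclose(np.tril(uu), np.tril(L.T @ L), atol=1e-12)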
Example #26
def _parallel_lauum_runner(A, write_opposite: bool, gpu_info):
    # Choose target:
    if is_f_contig(A):
        target = par_lauum_f_lower
    elif is_contig(A):
        target = par_lauum_c_lower
    else:
        raise NotImplementedError(
            "Parallel LAUUM is only implemented for contiguous matrices")

    N = A.shape[0]
    dt = A.dtype
    dts = sizeof_dtype(dt)
    if A.is_cuda:
        sync_current_stream(A.device)
        gpu_info = [g for g in gpu_info if g.Id == A.device.index]
        avail_ram = gpu_info[0].actual_free_mem / dts
        if target.__name__ == "par_lauum_f_lower":
            # Each GPU should hold in memory two additional blocks plus 1 full
            # block column (=> quadratic equation 2*B^2 + B*N - R <= 0, with R
            # the available element count).
            max_block_size = int(
                math.floor((-N + math.sqrt(N**2 + 8 * avail_ram)) / 4))
        else:
            # Same RAM requirements as the out-of-core version
            max_block_size = int(
                math.floor((-2 * N + math.sqrt(4 * N**2 + 8 * avail_ram)) / 4))
        if max_block_size < 1:
            raise RuntimeError("Cannot run parallel LAUUM with minimum "
                               "available memory of %.2fMB" %
                               (avail_ram * dts / 2**20))
        # All computations on the same device (where data is stored). No multi-GPU support!
        block_sizes = calc_block_sizes3(max_block_size, 1, N)
    else:
        avail_ram = min([g.actual_free_mem for g in gpu_info]) / dts
        # Each GPU should be able to hold in memory 2 block columns
        # plus two blocks (=> quadratic equation 2*B^2 + 2*B*N - R <= 0, with R
        # the available element count).
        # An additional block is needed whenever write_opposite is True, due to
        # copying blocks between matrices with different strides!
        if write_opposite:
            max_block_size = int(
                math.floor(
                    (-2 * N + math.sqrt(4 * N**2 + 12 * avail_ram)) / 6))
        else:
            max_block_size = int(
                math.floor((-2 * N + math.sqrt(4 * N**2 + 8 * avail_ram)) / 4))
        if max_block_size < 1:
            raise RuntimeError("Cannot run parallel LAUUM with minimum "
                               "available memory of %.2fMB" %
                               (avail_ram * dts / 2**20))

        block_sizes = calc_block_sizes3(max_block_size, len(gpu_info), N)

    # Create BlockAlloc objects describing the subdivision of input
    block_allocations: List[BlockAlloc] = []
    cur_n = 0
    for bs in block_sizes:
        block_allocations.append(
            BlockAlloc(start=cur_n, end=cur_n + bs, length=bs))
        cur_n += bs

    num_gpus = len(gpu_info)
    if num_gpus < 1:
        raise ValueError(
            "Parallel LAUUM can only run when a GPU is available.")
    barrier = threading.Barrier(num_gpus, timeout=1000)
    threads = []
    for _gpu_idx, g in enumerate(gpu_info):
        # Assign rows to GPUs round-robin. Use _gpu_idx instead of g.Id since the latter
        # may not contain all integers from 0.
        gid_allocs = [
            i for i in range(len(block_allocations))
            if i % num_gpus == _gpu_idx
        ]
        cublas_handle = initialization.cublas_handle(g.Id)
        if cublas_handle is None:
            raise RuntimeError("CUBLAS must be initialized "
                               "on device %d before running parallel LAUUM." %
                               (g.Id))
        t = PropagatingThread(target=target,
                              name="GPU-%d" % (g.Id),
                              args=(A, block_allocations, gid_allocs, barrier,
                                    g.Id, cublas_handle, write_opposite))
        threads.append(t)

    for t in threads:
        t.start()
    for t in threads:
        t.join()
    return A
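
The two closed-form block sizes above are just the positive roots of the memory-budget quadratics described in the comments: two block columns of height N plus two (or, with write_opposite, three) B x B scratch blocks must fit in the available element count. A small standalone sketch of that solve; the helper name is made up and not part of the library:

import math

def max_lauum_block_size(N: int, avail_ram: float, extra_blocks: int = 0) -> int:
    # Largest B such that (2 + extra_blocks) * B^2 + 2*B*N <= avail_ram.
    # extra_blocks=0 reproduces (-2*N + sqrt(4*N^2 + 8*R)) / 4,
    # extra_blocks=1 reproduces (-2*N + sqrt(4*N^2 + 12*R)) / 6.
    a = 2 + extra_blocks      # number of B x B scratch blocks
    b = 2 * N                 # two block columns of height N
    c = -avail_ram
    return int(math.floor((-b + math.sqrt(b * b - 4 * a * c)) / (2 * a)))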
Example #27
0
def sparse_fmmv(proc_idx, queue, device_id):
    a: ArgsFmmv = queue.get()

    X1: SparseTensor = a.X1
    X2: SparseTensor = a.X2
    v, out = a.v, a.out
    kernel, max_mem = a.kernel, a.max_mem
    dtype = X1.dtype
    ntot, dtot = X1.shape
    mtot, T = v.size()

    avail_mem = max_mem / sizeof_dtype(dtype)
    # Memory needs:
    # X1_chunk : N + 2*D*N*density
    # X2_chunk : D + 2*D*M*density (because it is stored transposed)
    # sparse_out : N + 2*N*M*(density) (assume density = 1)
    # ker_gpu  : M*N
    # mmv_gpu  : N*T
    # v_gpu    : M*T
    # Other: GPU buffer
    n, m = select_dim_over_m(
        maxM=mtot,
        maxN=ntot,
        tot=avail_mem,
        coef_nm=3,
        coef_n=2 + 2 * dtot * X1.density + T,
        coef_m=2 * dtot * X2.density + T,
        rest=dtot,
    )

    ddev = torch.device('cuda:%d' % int(device_id))
    with tcd.device(ddev):
        v_gpu = v.to(device=ddev)  # M x T
        mmv_gpu = create_same_stride((n, T), out, dtype, ddev)
        # ker_gpu should be fortran-ordered due to cusparse csr2dense function
        ker_gpu = create_fortran((n, m), dtype=dtype, device=ddev)

        for i in range(0, ntot, n):
            ic = min(n, ntot - i)

            cur_mmv_gpu = mmv_gpu[:ic]  # n x T
            cur_mmv_gpu.fill_(0.0)

            X1_chunk = X1.narrow_rows(i, ic)
            X1_chunk_d = X1_chunk.index_to_int().to(device=ddev)
            for j in range(0, mtot, m):
                jc = min(m, mtot - j)

                X2_chunk = X2.narrow_rows(j, jc)
                # Prepare sparse on CPU
                ddd = kernel._prepare_sparse(X1_chunk, X2_chunk)

                # Transpose X2-chunk and convert it to CSR. This uses lots of RAM
                X2_chunk_d = SparseTensor.from_scipy(
                    X2_chunk.transpose_csc().to_scipy().tocsr(copy=False)) \
                    .index_to_int() \
                    .to(device=ddev)

                cur_ker_gpu = ker_gpu[:ic, :jc]
                cur_ker_gpu.fill_(0.0)
                # Run the matrix multiplication (kernel apply)
                cur_ker_gpu = kernel._apply_sparse(X1_chunk_d, X2_chunk_d,
                                                   cur_ker_gpu)
                cur_ker_gpu = kernel._finalize(cur_ker_gpu, ddd)

                # Multiply by the vector v
                cur_mmv_gpu.addmm_(cur_ker_gpu, v_gpu.narrow(0, j, jc))
                del ddd, X2_chunk, X2_chunk_d

            # send result to CPU
            copy_to_host_noorder(ic, T, cur_mmv_gpu, 0, 0, out, i, 0)
            del X1_chunk, X1_chunk_d
    return out
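
Stripped of the chunking and the host/device transfers, the double loop above computes out = K(X1, X2) @ v one (n x m) tile at a time. A dense, unblocked reference of the same product, with a stand-in Gaussian kernel in place of the library's kernel object (both the function and the kernel choice are illustrative only):

import torch

def fmmv_dense_reference(X1, X2, v, kernel_fn):
    # out[i] = sum_j k(X1[i], X2[j]) * v[j]  ==  K(X1, X2) @ v
    return kernel_fn(X1, X2) @ v

gauss = lambda A, B: torch.exp(-torch.cdist(A, B) ** 2)
out_ref = fmmv_dense_reference(torch.randn(100, 5), torch.randn(30, 5),
                               torch.randn(30, 2), gauss)  # 100 x 2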
Example #28
0
def distk_fdmmv(proc_idx, queue, device_id):
    a: ArgsFdmmv = queue.get()
    X1, X2, v, w, out = a.X1, a.X2, a.v, a.w, a.out
    kernel: L2DistanceKernel = a.kernel
    max_mem = a.max_mem
    N, D = X1.size()
    M = X2.size(0)
    T = v.size(1) if v is not None else w.size(1)
    dtype = X1.dtype

    # Memory usage:
    # v    : M x T
    # K    : n x M
    # X1ss : n x d
    # X2s  : M x d
    # Kv   : n x T
    # out  : M x T
    # sq1  : n x 1
    # sq2  : M x 1
    # ------------
    # total : n*d + M*d + n*(M + T + 1) + 2*M*T + M
    avail_mem = max_mem / sizeof_dtype(dtype)
    # FIXME: There seems to be a bug where if we let avail_mem like it is
    #        for 32-bit data-types some copy fails. In such case we need
    #        to free up some more memory and then everything runs fine.
    rest_coef = 2 * M * T if v is not None else M * T
    n, d = select_dim_over_d(maxD=D,
                             maxN=N,
                             coef_nd=1,
                             coef_n=M + T + 1,
                             coef_d=M,
                             rest=rest_coef + M,
                             tot=avail_mem)

    ddev = torch.device('cuda:%d' % int(device_id))
    s1 = tcd.Stream()
    s2 = tcd.Stream()

    with tcd.device(ddev), tcd.stream(s1):
        if v is not None:
            v_gpu = create_same_stride((M, T), v, dtype, ddev)
            copy_to_device_noorder(M, T, v, 0, 0, v_gpu, 0, 0)
        K_gpu = create_same_stride((n, M), X1, dtype, ddev)
        X1ss_gpu = create_same_stride((n, d), X1, dtype, ddev)
        X2s_gpu = create_same_stride((M, d), X2, dtype, ddev)
        Kv_gpu = create_same_stride((n, T), X1, dtype, ddev)
        if out.is_cuda:
            out_gpu = out
        else:
            out_gpu = create_same_stride((M, T), out, dtype, ddev)
        out_gpu.fill_(0.0)
        sq1_gpu = create_same_stride((n, ), X1, dtype, ddev)
        sq2_gpu = create_same_stride((M, ), X1, dtype, ddev)

        #if (d == D):
        #    with torch.cuda.stream(s2):
        #        cur_X2s_gpu = copy_to_device_noorder(M, d, X2, 0, 0, X2s_gpu, 0, 0, s=s2)
        #        torch.norm(cur_X2s_gpu, p=2, dim=1, keepdim=True, out=sq2_gpu).pow_(2)

        for i in range(0, N, n):
            nb = min(N - i, n)

            cur_K_gpu = K_gpu.narrow(0, 0, nb)  # nb x M
            cur_K_gpu.fill_(0.0)

            for j in range(0, D, d):
                db = min(D - j, d)
                # Parallelize two matrix transfers (probably pointless)
                #if d < D:
                with torch.cuda.stream(s2):
                    cur_X2s_gpu = copy_to_device_noorder(M,
                                                         db,
                                                         X2,
                                                         0,
                                                         j,
                                                         X2s_gpu,
                                                         0,
                                                         0,
                                                         s=s2)
                    torch.norm(cur_X2s_gpu,
                               p=2,
                               dim=1,
                               keepdim=True,
                               out=sq2_gpu).pow_(2)
                cur_X1ss_gpu = copy_to_device_noorder(nb,
                                                      db,
                                                      X1,
                                                      i,
                                                      j,
                                                      X1ss_gpu,
                                                      0,
                                                      0,
                                                      s=s1)
                torch.norm(cur_X1ss_gpu, p=2, dim=1, keepdim=True,
                           out=sq1_gpu).pow_(2)

                s2.synchronize()
                s1.synchronize()
                cur_K_gpu.addmm_(mat1=cur_X1ss_gpu,
                                 mat2=cur_X2s_gpu.T,
                                 alpha=-2.0)
                cur_K_gpu.add_(sq1_gpu)
                cur_K_gpu.add_(sq2_gpu.T)
                cur_K_gpu.clamp_min_(0)

            cur_K_gpu = kernel._transform(cur_K_gpu)

            if w is not None:
                # Copy the current slice of w to the GPU (into Kv_gpu).
                cur_Kv_gpu = copy_to_device_noorder(nb,
                                                    T,
                                                    w,
                                                    i,
                                                    0,
                                                    Kv_gpu,
                                                    0,
                                                    0,
                                                    s=s1)  # n x T
                if v is not None:
                    cur_Kv_gpu.addmm_(cur_K_gpu, v_gpu)
            else:
                # v cannot be None if w is None
                cur_Kv_gpu = Kv_gpu.narrow(0, 0, nb)  # n x T
                torch.mm(cur_K_gpu, v_gpu, out=cur_Kv_gpu)  # n x T

            # Multiply transposed kernel with the Kv result.
            out_gpu.addmm_(cur_K_gpu.T, cur_Kv_gpu)  # M x T
            s1.synchronize()
        s1.synchronize()

        if not out.is_cuda:
            copy_to_host_noorder(M, T, out_gpu, 0, 0, out, 0, 0)
    return out
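
The feature-chunked inner loop builds each kernel tile from the expansion ||x - y||^2 = ||x||^2 + ||y||^2 - 2<x, y>: addmm_ accumulates the -2 * X1ss @ X2s.T term, the two squared-norm vectors are added by broadcasting, and the clamp guards against small negative values from round-off. A minimal single-tile sketch of the same identity, without streams or chunking:

import torch

def sq_dist_tile(X1_blk, X2_blk):
    # ||x||^2 + ||y||^2 - 2 x.y, clamped at zero against round-off
    sq1 = X1_blk.pow(2).sum(dim=1, keepdim=True)   # n x 1
    sq2 = X2_blk.pow(2).sum(dim=1, keepdim=True)   # M x 1
    return (sq1 + sq2.T - 2.0 * (X1_blk @ X2_blk.T)).clamp_min_(0)

a = torch.randn(8, 4, dtype=torch.float64)
b = torch.randn(5, 4, dtype=torch.float64)
assert torch.allclose(sq_dist_tile(a, b), torch.cdist(a, b) ** 2)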
Example #29
0
def sparse_fdmmv(proc_idx, queue, device_id):
    a: ArgsFdmmv = queue.get()
    X1: SparseTensor = a.X1
    X2: SparseTensor = a.X2
    v, w, out = a.v, a.w, a.out
    kernel, max_mem = a.kernel, a.max_mem
    dtype = X1.dtype
    N, D = X1.shape
    M = X2.size(0)
    if v is None:
        T = w.size(1)
    else:
        T = v.size(1)

    # Memory needs:
    # X1_chunk : ntot + 2 * D * ntot * density
    # X2       : dtot + 2 * D * M * density (because it is stored transposed)
    # sparse_out : ntot + 2 * ntot * M * density (assume here density = 1)
    # ker_gpu  : M * ntot
    # w_gpu    : ntot * T
    # v_gpu    : M * T
    # out_gpu  : M * T
    avail_mem = max_mem / sizeof_dtype(dtype)
    den = 2 * D * X1.density + 2 + 3 * M + T
    sub = D + 2 * D * M * X2.density + M * T
    if v is not None:
        sub += M * T
    n = (avail_mem - sub) / den
    n = min(int(n), N)
    if n < 1:
        raise MemoryError("Not enough memory to run sparse dfmmv")

    ddev = torch.device('cuda:%d' % int(device_id))
    with tcd.device(ddev):
        # Initialize GPU data
        w_gpu = create_same_stride((n, T), out, dtype, ddev)
        if out.is_cuda:
            out_gpu = out
        else:
            out_gpu = create_same_stride((M, T), out, dtype, ddev)
        out_gpu.fill_(0.0)
        ker_gpu = create_fortran((n, M), dtype, ddev)
        if v is not None:
            v_gpu = v.to(device=ddev)  # M x T

        X2_d = SparseTensor.from_scipy(
            X2.transpose_csc().to_scipy().tocsr(copy=False)) \
            .index_to_int() \
            .to(device=ddev)

        for i in range(0, N, n):
            ic = min(n, N - i)
            X1_chunk = X1.narrow_rows(i, ic)
            X1_chunk_d = X1_chunk.index_to_int().to(device=ddev)

            ker_chunk = ker_gpu[:ic]
            ker_chunk.fill_(0.0)

            # TODO: This is wasteful (X2 will be prepared many times over)
            ddd = kernel._prepare_sparse(X1_chunk, X2)
            ker_chunk = kernel._apply_sparse(X1_chunk_d, X2_d, ker_chunk)
            ker_chunk = kernel._finalize(ker_chunk, ddd)

            if w is not None:
                c_g_w = copy_to_device_noorder(ic, T, w, i, 0, w_gpu, 0, 0)
            else:
                c_g_w = w_gpu.narrow(0, 0, ic)
                c_g_w.fill_(0.0)

            if v is not None:
                c_g_w.addmm_(ker_chunk, v_gpu)
            out_gpu.addmm_(ker_chunk.T, c_g_w)
            del ddd, X1_chunk, X1_chunk_d

        if not out.is_cuda:
            copy_to_host_noorder(M, T, out_gpu, 0, 0, out, 0, 0)
    return out
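
Here only the row-chunk size n has to be chosen, because the transposed X2, v and the output all stay resident on the GPU; the arithmetic above is a straight linear-budget solve with the per-row and fixed costs listed in the comment. A restatement of that solve as a standalone helper; the name and signature are made up for illustration:

def sparse_fdmmv_chunk_rows(avail_mem, D, M, T, x1_density, x2_density, has_v):
    # Per-row cost: X1 chunk (1 + 2*D*density), sparse output (1 + 2*M),
    # kernel tile (M) and w buffer (T)  ->  matches `den` above.
    per_row = 2 * D * x1_density + 2 + 3 * M + T
    # Fixed cost: transposed X2, out_gpu, and (optionally) v on the GPU.
    fixed = D + 2 * D * M * x2_density + M * T
    if has_v:
        fixed += M * T
    return int((avail_mem - fixed) // per_row)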
Example #30
0
def generic_fdmmv(proc_idx, queue, device_id):
    a: ArgsFdmmv = queue.get()
    X1, X2, v, w, out = a.X1, a.X2, a.v, a.w, a.out
    kernel, max_mem = a.kernel, a.max_mem
    dtype = X1.dtype
    N, D = X1.size()
    M = X2.size(0)
    if v is None:
        T = w.size(1)
    else:
        T = v.size(1)

    # Memory usage:
    # v    : M x T
    # K    : n x M
    # X1d  : n x d
    # X2d  : M x d
    # Kv   : n x T
    # out2 : M x T
    # sq1  : n x 1
    # sq2  : M x 1
    # ------------
    # total : n*d + M*d + n*(M + T + 1) + 2*M*T + M
    avail_mem = max_mem / sizeof_dtype(dtype)
    # FIXME: There seems to be a bug where if we let avail_mem like it is
    #        for 32-bit data-types some copy fails. In such case we need
    #        to free up some more memory and then everything runs fine.
    if sizeof_dtype(dtype) == 4:
        avail_mem /= 2
    rest_coef = 2 * M * T if v is not None else M * T
    n, d = select_dim_over_d(maxD=D,
                             maxN=N,
                             coef_nd=1,
                             coef_n=M + T + 1,
                             coef_d=M,
                             rest=rest_coef + M,
                             tot=avail_mem)

    ddev = torch.device('cuda:%d' % int(device_id))
    with tcd.device(ddev):
        # Initialize GPU data
        ker_gpu = create_same_stride((n, M), out, dtype=dtype, device=ddev)
        X1s_gpu = create_same_stride((n, d), X1, dtype, ddev)
        X2s_gpu = create_same_stride((M, d), X2, dtype, ddev)
        w_gpu = create_same_stride((n, T), ker_gpu, dtype, ddev)
        if out.is_cuda:
            out_gpu = out
        else:
            out_gpu = create_same_stride((M, T), out, dtype, ddev)
        out_gpu.fill_(0.0)
        if v is not None:
            v_gpu = v.to(device=ddev)  # M x T

        for i in range(0, N, n):
            ic = min(n, N - i)
            ddd = kernel._prepare(X1.narrow(0, i, ic), X2)

            c_g_ker = ker_gpu.narrow(0, 0, ic)
            c_g_ker.fill_(0.0)
            for k in range(0, D, d):
                kc = min(d, D - k)
                c_g_X1s = copy_to_device_noorder(ic, kc, X1, i, k, X1s_gpu, 0,
                                                 0)
                c_g_X2s = copy_to_device_noorder(M, kc, X2, 0, k, X2s_gpu, 0,
                                                 0)
                kernel._apply(c_g_X1s, c_g_X2s.T, c_g_ker)
            kernel._finalize(c_g_ker, ddd)

            if w is not None:
                c_g_w = copy_to_device_noorder(ic, T, w, i, 0, w_gpu, 0, 0)
            else:
                c_g_w = w_gpu.narrow(0, 0, ic)
                c_g_w.fill_(0.0)
            if v is not None:
                c_g_w.addmm_(c_g_ker, v_gpu)
            out_gpu.addmm_(c_g_ker.T, c_g_w)

        if not out.is_cuda:
            copy_to_host_noorder(M, T, out_gpu, 0, 0, out, 0, 0)
    return out
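
All the fdmmv workers in these examples (dense, L2-distance and sparse) accumulate the same quantity over row chunks: out = K.T @ (K @ v + w), with the K @ v term dropped when v is None and w treated as zero when absent. A compact dense restatement, using a generic kernel_fn as a stand-in for the kernel object's _prepare/_apply/_finalize pipeline (illustrative only):

import torch
from typing import Callable, Optional

def fdmmv_dense_reference(X1: torch.Tensor, X2: torch.Tensor,
                          v: Optional[torch.Tensor],
                          w: Optional[torch.Tensor],
                          kernel_fn: Callable) -> torch.Tensor:
    K = kernel_fn(X1, X2)                       # N x M
    T = v.shape[1] if v is not None else w.shape[1]
    tmp = w.clone() if w is not None else torch.zeros(K.shape[0], T, dtype=K.dtype)
    if v is not None:
        tmp += K @ v                            # N x T
    return K.T @ tmp                            # M x T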