Example #1
def fmm_cpu_sparse(X1: SparseTensor, X2: SparseTensor,
                   kernel: 'falkon.kernels.Kernel',
                   out: Optional[torch.Tensor],
                   opt: BaseOptions) -> torch.Tensor:
    opt = _setup_opt(opt, is_cpu=True)
    ntot, dtot = X1.size()
    mtot = X2.size(0)

    if out is None:
        out = torch.empty(ntot, mtot, dtype=X1.dtype)

    if sizeof_dtype(X1.dtype) < 8 and opt.no_single_kernel:
        avail_mem = _get_cpu_ram(opt, 0.9)
        if avail_mem <= 0:
            raise MemoryError("Memory insufficient for kernel evaluation.")

        blockwise_fmm_cpu_sparse(X1, X2, kernel, out, avail_mem)
    else:
        # Do the kernel computation on the spot
        out.fill_(0.0)
        ddd = kernel._prepare_sparse(X1, X2)
        kernel._apply_sparse(X1, X2.transpose_csc(), out)
        kernel._finalize(out, ddd)

    return out
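
The `_prepare`/`_apply`/`_finalize` split above lets the kernel defer the expensive O(N*M*D) product to `_apply` and keep the cheap per-row statistics separate. A minimal dense sketch of that contract for a Gaussian kernel, in plain torch (illustrative only; the real falkon kernel classes implement these hooks internally):

import torch

def gaussian_prepare(X1, X2):
    # Per-row squared norms, needed to turn inner products into squared distances.
    return X1.square().sum(1, keepdim=True), X2.square().sum(1, keepdim=True)

def gaussian_apply(X1, X2T, out):
    # out += X1 @ X2.T -- the only O(N*M*D) step.
    out.addmm_(X1, X2T)

def gaussian_finalize(out, prep, sigma=1.0):
    sq1, sq2 = prep
    # ||x - y||^2 = ||x||^2 + ||y||^2 - 2<x, y>, then exponentiate in place.
    out.mul_(-2.0).add_(sq1).add_(sq2.T)
    out.mul_(-0.5 / sigma ** 2).exp_()

X1, X2 = torch.randn(50, 4), torch.randn(30, 4)
out = torch.zeros(50, 30)
prep = gaussian_prepare(X1, X2)
gaussian_apply(X1, X2.T, out)
gaussian_finalize(out, prep)
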
Example #2
def fmmv_cuda_sparse(X1: SparseTensor,
                     X2: SparseTensor,
                     v: torch.Tensor,
                     kernel,
                     out: Optional[torch.Tensor] = None,
                     opt: Optional[BaseOptions] = None) -> torch.Tensor:
    opt = _setup_opt(opt)
    _check_contiguity((v, 'v'), (out, 'out'))

    N = X1.size(0)
    # Create output matrix
    if out is None:
        out = create_fortran((N, v.size(1)), v.dtype, 'cpu', pin_memory=True)
    out.fill_(0.0)

    gpu_info = _get_gpu_info(opt, slack=0.9)
    block_sizes = calc_gpu_block_sizes(gpu_info, N)

    # Build the argument list for the per-GPU worker processes
    args = []  # Arguments passed to each subprocess
    for i, g in enumerate(gpu_info):
        bwidth = block_sizes[i + 1] - block_sizes[i]
        if bwidth <= 0:
            continue
        args.append((ArgsFmmv(X1=X1.narrow_rows(block_sizes[i], bwidth),
                              X2=X2,
                              v=v,
                              out=out.narrow(0, block_sizes[i], bwidth),
                              kernel=kernel,
                              max_mem=g.usable_ram), g.Id))

    _start_wait_processes(sparse_fmmv, args)
    return out
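
The row split above comes from calc_gpu_block_sizes; as a stand-in for intuition, an even split over devices can be computed as below (the real helper presumably also weights blocks by per-device speed and memory, so this is only an assumption-laden sketch):

import numpy as np

def even_block_sizes(num_devices, N):
    # Boundaries [0, b_1, ..., N]; device i gets rows block_sizes[i]:block_sizes[i+1].
    return np.linspace(0, N, num_devices + 1).astype(np.int64)

sizes = even_block_sizes(3, 1000)   # array([   0,  333,  666, 1000])
widths = np.diff(sizes)             # rows per device: [333, 333, 334]
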
Example #3
def fmmv_cpu(X1, X2, v, kernel, out, opt):
    """Blockwise kernel-vector product

    This function computes ``kernel(X1, X2) @ v`` in a blockwise fashion, to avoid having the
    whole N*M kernel matrix in memory at once.
    Note that while the principle is that of matrix-vector product, `v` can have more than
    one column.

    Parameters
    -----------
    X1
        [N, D] array
    X2
        [M, D] array
    v
        [M, T] array
    kernel
        Class representing the desired kernel function
    out : torch.Tensor or None
        [N, T] array for storing the kernel-vector product output.
        If None, will be allocated within the function.
    opt
        Basic options dictionary, used for determining available memory.
    """
    opt = _setup_opt(opt, is_cpu=True)

    ntot, dtot = X1.size(0), X1.size(1)
    M, T = v.size()
    dtype = v.dtype

    # Create output matrix
    if out is None:
        out = torch.empty(ntot, T, dtype=dtype)

    avail_mem = _get_cpu_ram(opt, 0.95) / sizeof_dtype(dtype)
    # The only necessary memory allocation is for the temporary kernel
    # block `temp_out` of size n*M
    extra_mem = kernel.extra_mem()
    n, d = select_dim_over_nd(max_n=ntot, max_d=dtot, coef_nd=extra_mem.get('nd', 0),
                              coef_n=M + extra_mem.get('n', 0) + extra_mem.get('nm', 0) * M,
                              coef_d=extra_mem.get('d', 0) + extra_mem.get('md', 0) * M,
                              rest=extra_mem.get('m', 0), max_mem=avail_mem)

    # Run batched matrix multiplication
    for i in range(0, ntot, n):
        ic = min(n, ntot - i)

        ddd = kernel._prepare(X1.narrow(0, i, ic), X2)
        temp_out = torch.zeros(ic, M, dtype=dtype)
        for k in range(0, dtot, d):
            kc = min(d, dtot - k)
            X1d = X1[i: i + ic, k: k + kc]
            X2d = X2[:, k: k + kc]
            kernel._apply(X1d, X2d.T, temp_out)

        # temp_out = fnc(X1*X2', X1, X2)
        kernel._finalize(temp_out, ddd)

        torch.mm(temp_out, v, out=out[i: i + ic, :])
    return out
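
The row blocking is exact: rows i..i+ic of ``kernel(X1, X2) @ v`` depend only on the corresponding kernel rows. A self-contained check in plain torch, with a toy Gaussian kernel standing in for the kernel class (not falkon API):

import torch

def gauss(A, B, sigma=1.0):
    return torch.exp(-torch.cdist(A, B).square() / (2 * sigma ** 2))

X1 = torch.randn(100, 5, dtype=torch.float64)
X2 = torch.randn(40, 5, dtype=torch.float64)
v = torch.randn(40, 3, dtype=torch.float64)

direct = gauss(X1, X2) @ v
blocked = torch.empty_like(direct)
n = 32  # row block size, playing the role of select_dim_over_nd's output
for i in range(0, X1.shape[0], n):
    ic = min(n, X1.shape[0] - i)
    # Only an ic x M kernel block is ever materialized.
    blocked[i:i + ic] = gauss(X1[i:i + ic], X2) @ v

assert torch.allclose(direct, blocked)
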
Example #4
def fdmmv_cpu_sparse(X1: SparseTensor,
                     X2: SparseTensor,
                     v: Optional[torch.Tensor],
                     w: Optional[torch.Tensor],
                     kernel,
                     out: Optional[torch.Tensor] = None,
                     opt: Optional[BaseOptions] = None):
    opt = _setup_opt(opt, is_cpu=True)

    # Parameter validation
    if v is None and w is None:
        raise ValueError("One of v and w must be specified to run fdMMV.")
    T = v.size(1) if v is not None else w.size(1)
    ntot, dtot = X1.size()
    M = X2.size(0)
    dtype = X1.dtype

    # Create output matrix
    if out is None:
        out = torch.empty(M, T, dtype=dtype)
    out.fill_(0)

    avail_mem = _get_cpu_ram(opt, 0.95) / sizeof_dtype(dtype)
    # Narrow X1 : n
    # ker_chunk : n*M
    # w_blk     : n*T
    n = avail_mem / (M + T + 1)
    n = int(math.floor(n))
    if n < 1:
        raise MemoryError(("Available memory %.2fGB is insufficient "
                           "for blockwise fdMMv.") %
                          (avail_mem * sizeof_dtype(dtype) / 2**30))

    # Allocate fixed arrays
    ker_chunk = create_same_stride((n, M), out, dtype, device='cpu')
    w_blk = create_same_stride((n, T), out, dtype, device='cpu')
    # Run blocked fdmmv
    for i in range(0, ntot, n):
        ic = min(n, ntot - i)
        X1_chunk = X1.narrow_rows(i, ic)
        cur_ker_chunk = ker_chunk[:ic]
        cur_ker_chunk.fill_(0.0)
        ddd = kernel._prepare_sparse(X1_chunk, X2)
        kernel._apply_sparse(X1_chunk, X2.transpose_csc(), cur_ker_chunk)
        kernel._finalize(cur_ker_chunk, ddd)

        # Multiply by the vector v
        cur_w_blk = w_blk[:ic]  # n x T
        cur_w_blk.fill_(0.0)
        if w is not None:
            cur_w_blk.copy_(w[i:i + ic, :])
        if v is not None:
            # w_blk += ker_chunk @ v => (n x T) + (n x M) * (M x T)
            cur_w_blk.addmm_(cur_ker_chunk, v)
        out.addmm_(cur_ker_chunk.T, cur_w_blk)
    del ker_chunk, w_blk
    return out
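
The accumulation into `out` is correct because the double product decomposes over row blocks: ``kernel(X1, X2).T @ (kernel(X1, X2) @ v + w)`` equals the sum over blocks of ``K_blk.T @ (K_blk @ v + w_blk)``. A dense sanity check of that identity (illustrative only):

import torch

torch.manual_seed(0)
K = torch.randn(100, 40, dtype=torch.float64)  # stands in for the kernel matrix
v = torch.randn(40, 3, dtype=torch.float64)
w = torch.randn(100, 3, dtype=torch.float64)

direct = K.T @ (K @ v + w)
out = torch.zeros(40, 3, dtype=torch.float64)
n = 32
for i in range(0, K.shape[0], n):
    Kb = K[i:i + n]
    out.addmm_(Kb.T, Kb @ v + w[i:i + n])

assert torch.allclose(direct, out)
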
Example #5
def fdmmv_cuda_sparse(X1: SparseTensor,
                      X2: SparseTensor,
                      v: Optional[torch.Tensor],
                      w: Optional[torch.Tensor],
                      kernel,
                      out: Optional[torch.Tensor] = None,
                      opt: Optional[BaseOptions] = None) -> torch.Tensor:
    opt = _setup_opt(opt)
    _check_contiguity((v, 'v'), (w, 'w'), (out, 'out'))
    if v is None and w is None:
        raise ValueError("one of 'v' or 'w' must not be None.")

    T = v.size(1) if v is not None else w.size(1)
    M = X2.size(0)
    N = X1.size(0)

    gpu_info = _get_gpu_info(opt, slack=0.95)
    block_sizes = calc_gpu_block_sizes(gpu_info, N)

    if out is None:
        out = create_C((M, T), X1.dtype, 'cpu', pin_memory=True)

    wrlk = []  # outputs for each subprocess.
    args = []
    for i, g in enumerate(gpu_info):
        bwidth = block_sizes[i + 1] - block_sizes[i]
        if bwidth <= 0:
            continue
        cur_out_gpu = create_C((M, T), X1.dtype,
                               f'cuda:{gpu_info[i].Id}')  # M x T
        wrlk.append(cur_out_gpu)

        cur_w = None

        if w is not None:
            cur_w = w.narrow(0, block_sizes[i], bwidth)
        args.append((ArgsFdmmv(X1=X1.narrow_rows(block_sizes[i], bwidth),
                               X2=X2,
                               v=v,
                               w=cur_w,
                               out=cur_out_gpu,
                               kernel=kernel,
                               max_mem=g.usable_ram), g.Id))

    _start_wait_processes(sparse_fdmmv, args)

    if len(wrlk) > 1:
        # noinspection PyTypeChecker
        fastest_device: int = np.argmax([d.speed for d in gpu_info])
        out.copy_(
            torch.cuda.comm.reduce_add(
                wrlk, destination=gpu_info[fastest_device].Id))
    else:
        out.copy_(wrlk[0])
    return out
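
Each worker produces a full M x T partial output from its slice of X1's rows, and since the fdmmv result is a sum over row blocks, the partials only need to be added together; torch.cuda.comm.reduce_add performs that sum on the fastest device. For intuition, a CPU analogue of the reduction:

import torch

partials = [torch.randn(8, 3, dtype=torch.float64) for _ in range(4)]  # one M x T block per device
reduced = torch.stack(partials).sum(dim=0)  # what reduce_add computes across GPUs
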
Example #6
def fmmv_cuda(X1: torch.Tensor,
              X2: torch.Tensor,
              v: torch.Tensor,
              kernel,
              out: Optional[torch.Tensor] = None,
              opt: Optional[BaseOptions] = None) -> torch.Tensor:
    """
    X1 : N x D
    X2 : M x D
    v  : M x T

    performs  fnc(X1*X2', X1, X2) * v   : N x T
    in blocks on multiple GPUs
    """
    opt = _setup_opt(opt)
    _check_contiguity((X1, 'X1'), (X2, 'X2'), (v, 'v'), (out, 'out'))

    N = X1.size(0)
    # Create output matrix
    if out is None:
        out = create_same_stride((N, v.size(1)),
                                 X1,
                                 v.dtype,
                                 'cpu',
                                 pin_memory=True)
    out.fill_(0.0)

    gpu_info = _get_gpu_info(opt, slack=0.9)
    block_sizes = calc_gpu_block_sizes(gpu_info, N)

    # Build the argument list for the per-GPU worker processes
    args = []  # Arguments passed to each subprocess
    for i, g in enumerate(gpu_info):
        bwidth = block_sizes[i + 1] - block_sizes[i]
        if bwidth <= 0:
            continue
        args.append((ArgsFmmv(X1=X1.narrow(0, block_sizes[i], bwidth),
                              X2=X2,
                              v=v,
                              out=out.narrow(0, block_sizes[i], bwidth),
                              kernel=kernel,
                              max_mem=g.usable_ram), g.Id))

    # If using torch multiprocessing with spawn/forkserver here we must make sure
    # that any consumer of the queues is on a different process than the queue producer.
    # This is due to passing in a CUDA tensor to the queue
    # https://pytorch.org/docs/stable/multiprocessing.html#sharing-cuda-tensors
    # Thus we cannot run the first task on the current process.
    if kernel.kernel_type == "l2distance" and kernel.name == "gaussian":
        target = distk_fmmv
    else:
        target = generic_fmmv
    _start_wait_processes(target, args)
    return out
Example #7
def fmmv_cuda(X1: torch.Tensor,
              X2: torch.Tensor,
              v: torch.Tensor,
              kernel,
              out: Optional[torch.Tensor] = None,
              opt: Optional[BaseOptions] = None) -> torch.Tensor:
    """
    X1 : N x D
    X2 : M x D
    v  : M x T

    performs  fnc(X1*X2', X1, X2) * v   : N x T
    in blocks on multiple GPUs
    """
    opt = _setup_opt(opt)
    _check_contiguity((X1, 'X1'), (X2, 'X2'), (v, 'v'), (out, 'out'))
    device = X1.device

    N = X1.size(0)
    # Create output matrix
    if out is None:
        out = create_same_stride((N, v.size(1)), X1, v.dtype, device=device,
                                 pin_memory=device.type != 'cuda')
    out.fill_(0.0)

    if kernel.kernel_type == "l2distance" and kernel.name == "gaussian":
        target = distk_fmmv
    else:
        target = generic_fmmv

    gpu_info = _get_gpu_info(opt, slack=0.9)

    if device.type == 'cuda':
        single_gpu_info = [g for g in gpu_info if g.Id == device.index][0]
        args = ArgsFmmv(X1=X1, X2=X2, v=v, out=out, kernel=kernel,
                        max_mem=single_gpu_info.usable_ram)
        _call_direct(target, (args, device.index))
    else:
        block_sizes = calc_gpu_block_sizes(gpu_info, N)
        # Build the argument list for the per-GPU worker processes
        args = []  # Arguments passed to each subprocess
        for i, g in enumerate(gpu_info):
            bwidth = block_sizes[i + 1] - block_sizes[i]
            if bwidth <= 0:
                continue
            args.append((ArgsFmmv(
                X1=X1.narrow(0, block_sizes[i], bwidth),
                X2=X2, v=v,
                out=out.narrow(0, block_sizes[i], bwidth),
                kernel=kernel, max_mem=g.usable_ram), g.Id))

        _start_wait_processes(target, args)
    return out
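
The ``pin_memory=device.type != 'cuda'`` choice above reflects that only page-locked (pinned) host memory supports fast asynchronous device-to-host copies, and CUDA tensors themselves cannot be pinned. A minimal sketch of that pattern, assuming a CUDA device is available:

import torch

if torch.cuda.is_available():
    dev_out = torch.randn(1000, 10, device='cuda')
    host_out = torch.empty(1000, 10, pin_memory=True)  # page-locked staging buffer
    host_out.copy_(dev_out, non_blocking=True)         # async copy needs pinned memory
    torch.cuda.synchronize()                           # wait before reading host_out
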
Example #8
def fmm_cpu(
        X1: torch.Tensor,
        X2: torch.Tensor,
        kernel: 'falkon.kernels.Kernel',
        out: Optional[torch.Tensor],
        opt: BaseOptions) -> torch.Tensor:
    """Compute kernel value on matrices X1 and X2: ``out = kernel(X1, X2)``

    Parameters
    -----------
    X1
        [N, D] array
    X2
        [M, D] array
    kernel
        Class representing the desired kernel function
    out
        Array for storing the kernel output. If None, will be allocated within the function.
    opt
        Basic options dictionary, used for determining available memory. Additionally, the
        :attr:`~falkon.options.FalkonOptions.no_single_kernel` option is used to determine the
        accumulator data type.

    Returns
    --------
    out
        [N, M] array. The kernel between X1 and X2.
    """
    opt = _setup_opt(opt, is_cpu=True)
    ntot, dtot = X1.size()
    mtot = X2.size(0)

    if out is None:
        out = torch.empty(ntot, mtot, dtype=X1.dtype)

    if sizeof_dtype(X1.dtype) < 8 and opt.no_single_kernel:
        avail_mem = _get_cpu_ram(opt, 0.9)
        if avail_mem <= 0:
            raise MemoryError("Memory insufficient for kernel evaluation.")

        blockwise_fmm_cpu(X1, X2, kernel, out, avail_mem)
    else:
        # Do the kernel computation on the spot
        out.fill_(0.0)
        ddd = kernel._prepare(X1, X2)
        kernel._apply(X1, X2.T, out)
        kernel._finalize(out, ddd)

    return out
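
The ``sizeof_dtype(X1.dtype) < 8`` branch exists because the expanded squared-distance formula used by l2-distance kernels cancels catastrophically in float32; as the docstring notes, `no_single_kernel` then selects a wider accumulator, presumably inside the blockwise path. A small demonstration of the cancellation (illustrative, not falkon code):

import torch

x = torch.randn(1, 1000)
y = x + 1e-4  # two nearly identical rows; true squared distance is ~1e-5
for dt in (torch.float32, torch.float64):
    a, b = x.to(dt), y.to(dt)
    # Expanded form ||a||^2 + ||b||^2 - 2<a, b>: large terms nearly cancel.
    expanded = (a.square().sum() + b.square().sum() - 2 * (a * b).sum()).item()
    stable = (a - b).square().sum().item()  # accurate in both precisions
    print(dt, expanded, stable)
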
Example #9
def fmmv_cpu_sparse(X1: SparseTensor, X2: SparseTensor, v: torch.Tensor,
                    kernel: 'falkon.kernels.Kernel',
                    out: Optional[torch.Tensor], opt: BaseOptions):
    opt = _setup_opt(opt, is_cpu=True)

    dtype = X1.dtype
    ntot, dtot = X1.size()
    mtot, T = v.size()

    # Create output matrix
    if out is None:
        out = torch.empty(ntot, T, dtype=dtype)
    out.fill_(0.0)

    avail_mem = _get_cpu_ram(opt, 0.95) / sizeof_dtype(dtype)
    # Narrowing X1, X2: n + m
    # Prepare - not computable, depends on kernel
    # ker_chunk : n*m
    # finalize : 0 (if can be implemented in place, kernel-dependent)
    n, m = select_dim_over_m(maxM=mtot,
                             maxN=ntot,
                             coef_nm=1,
                             coef_n=1,
                             coef_m=1,
                             tot=avail_mem)

    ker_chunk = create_same_stride((n, m), out, dtype, device='cpu')
    for i in range(0, ntot, n):
        ic = min(n, ntot - i)
        cur_out = out[i:i + ic, :]
        X1_chunk = X1.narrow_rows(i, ic)
        for j in range(0, mtot, m):
            jc = min(m, mtot - j)
            X2_chunk = X2.narrow_rows(j, jc)
            cur_ker_chunk = ker_chunk[:ic, :jc]
            cur_ker_chunk.fill_(0.0)

            ddd = kernel._prepare_sparse(X1_chunk, X2_chunk)
            kernel._apply_sparse(X1_chunk, X2_chunk.transpose_csc(),
                                 cur_ker_chunk)
            kernel._finalize(cur_ker_chunk, ddd)

            # Multiply by the vector v
            cur_out.addmm_(cur_ker_chunk, v.narrow(0, j, jc))
    return out
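
Tiling over both N and M is exact for the same reason as the row-only case: each column block j contributes ``K[:, j] @ v[j]`` to the output rows it touches. A dense check of the two-way tiling in plain torch (illustrative only):

import torch

torch.manual_seed(0)
K = torch.randn(90, 70, dtype=torch.float64)  # stands in for the kernel matrix
v = torch.randn(70, 2, dtype=torch.float64)

out = torch.zeros(90, 2, dtype=torch.float64)
n, m = 32, 25
for i in range(0, K.shape[0], n):
    for j in range(0, K.shape[1], m):
        # Accumulate this tile's contribution into the matching output rows.
        out[i:i + n].addmm_(K[i:i + n, j:j + m], v[j:j + m])

assert torch.allclose(out, K @ v)
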
Example #10
def fmm_cpu(X1: torch.Tensor, X2: torch.Tensor,
            kernel: 'falkon.kernels.Kernel', out: Optional[torch.Tensor],
            opt: BaseOptions) -> torch.Tensor:
    """Compute kernel value on matrices X1 and X2: `out = kernel(X1, X2)`

    Parameters
    -----------
    X1 : [N, D] array
    X2 : [M, D] array
    kernel : Kernel
        Class representing the desired kernel function
    out : Optional([N, M] array)
        Array for storing the kernel output. If None, will be allocated within the function.
    opt : Union(Dict, CompOpt)
        Options dictionary. Supported options are
         - 'final_type', the data-type of the output array. If 'out' is not None and its
            data-type clashes with the setting of 'final_type', the out matrix will not be
            modified.

    Returns
    --------
    out : [N, M] array
        The kernel between X1 and X2.
    """
    opt = _setup_opt(opt, is_cpu=True)
    ntot, dtot = X1.size()
    mtot = X2.size(0)

    if out is None:
        out = torch.empty(ntot, mtot, dtype=X1.dtype)

    if sizeof_dtype(X1.dtype) < 8 and opt.no_single_kernel:
        avail_mem = _get_cpu_ram(opt, 0.9)
        if avail_mem <= 0:
            raise MemoryError("Memory insufficient for kernel evaluation.")

        blockwise_fmm_cpu(X1, X2, kernel, out, avail_mem)
    else:
        # Do the kernel computation on the spot
        out.fill_(0.0)
        ddd = kernel._prepare(X1, X2)
        kernel._apply(X1, X2.T, out)
        kernel._finalize(out, ddd)

    return out
Example #11
def fdmmv_cpu(X1, X2, v, w, kernel, out, opt):
    """Calculate a double kernel-vector product.

    This function computes the following quantity:
    ``kernel(X1, X2).T @ (kernel(X1, X2) @ v + w)``,
    where one of `v` or `w` may be None.
    All arrays passed to this function must be 2-dimensional, although
    the second dimension can be unitary.

    The expression is not computed directly. We separate the computation
    into smaller blocks so as to reduce the total memory consumption (the
    large N*M kernel matrix is never wholly stored in RAM).

    Parameters
    -----------
    X1
        [N, D] array
    X2
        [M, D] array
    v : torch.Tensor or None
        [M, T] array. But note that at least one of v or w must be specified.
    w : torch.Tensor or None
        [N, T] array. But note that at least one of v or w must be specified.
    kernel
        Class representing the desired kernel function
    out : torch.Tensor or None
        [M, T] array for storing the kernel-vector product output.
        If None, will be allocated within the function.
    opt
        Basic options dictionary, used for determining available memory.
    """
    opt = _setup_opt(opt, is_cpu=True)

    # Parameter validation
    if v is None and w is None:
        raise ValueError("One of v and w must be specified to run fdMMV.")
    T = v.shape[1] if v is not None else w.shape[1]
    ntot, dtot = X1.size()
    M = X2.size(0)
    dtype = X1.dtype

    # Create output matrix
    if out is None:
        out = torch.empty(M, T, dtype=dtype)
    out.fill_(0)

    avail_mem = _get_cpu_ram(opt, 0.95) / sizeof_dtype(dtype)
    # The only necessary temporary matrices are: `temp_out` of size n*M and
    # temp_w_block of size n*T
    n, d = select_dim_over_d(maxD=dtot,
                             maxN=ntot,
                             coef_nd=0,
                             coef_n=M + T,
                             coef_d=0,
                             rest=0,
                             tot=avail_mem)

    # Run Batched Matrix Computation
    for i in range(0, ntot, n):
        ic = min(n, ntot - i)

        ddd = kernel._prepare(X1[i:i + ic, :], X2)
        temp_out = torch.zeros(ic, M, dtype=dtype)
        for k in range(0, dtot, d):
            kc = min(d, dtot - k)
            X1d = X1[i:i + ic, k:k + kc]
            X2d = X2[:, k:k + kc]
            kernel._apply(X1d, X2d.T, temp_out)
        kernel._finalize(temp_out, ddd)  # fnc(X1*X2', X1, X2) [n x M]

        w_blk = torch.zeros(ic, T, dtype=dtype)  # n x T
        if w is not None:
            w_blk.copy_(w[i:i + ic, :])
        if v is not None:
            # w_blk += temp_out @ v => (n x T) + (n x M) * (M x T)
            w_blk.addmm_(temp_out, v)

        out.add_(torch.mm(temp_out.T, w_blk))
    return out
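
The select_dim_over_d call above with ``coef_n=M + T`` solves for the largest n such that the two scratch buffers (`temp_out`, n x M, and `w_blk`, n x T) fit in the element budget. A simplified stand-in for that arithmetic (the `pick_block_rows` helper below is hypothetical, for illustration only):

def pick_block_rows(avail_elems, M, T, ntot):
    # temp_out: n*M elements, w_blk: n*T elements  =>  n*(M + T) <= avail_elems
    n = int(avail_elems // (M + T))
    if n < 1:
        raise MemoryError("Not enough memory for a single row block.")
    return min(n, ntot)

pick_block_rows(avail_elems=1_000_000, M=5000, T=10, ntot=200_000)  # -> 199
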
Example #12
def fdmmv_cuda(X1: torch.Tensor,
               X2: torch.Tensor,
               v: Optional[torch.Tensor],
               w: Optional[torch.Tensor],
               kernel,
               out: Optional[torch.Tensor] = None,
               opt: Optional[BaseOptions] = None) -> torch.Tensor:
    """
    X1 : N x D
    X2 : M x D
    v  : M x T
    w  : N x T

    performs fnc(X1*X2', X1, X2)' * ( fnc(X1*X2', X1, X2) * v  +  w )  : M x T
    in blocks on multiple GPUs

    Assume all inputs have the same data type
    """
    opt = _setup_opt(opt)
    _check_contiguity((X1, 'X1'), (X2, 'X2'), (v, 'v'), (w, 'w'), (out, 'out'))
    if v is None and w is None:
        raise ValueError("one of 'v' or 'w' must not be None.")

    T = v.size(1) if v is not None else w.size(1)
    M = X2.size(0)
    N = X1.size(0)

    gpu_info = _get_gpu_info(opt, slack=0.9)
    block_sizes = calc_gpu_block_sizes(gpu_info, N)

    if out is None:
        out = create_same_stride((M, T), X1, X1.dtype, 'cpu', pin_memory=True)

    wrlk = []  # outputs for each subprocess.
    args = []
    for i, g in enumerate(gpu_info):
        bwidth = block_sizes[i + 1] - block_sizes[i]
        if bwidth <= 0:
            continue

        cur_out_gpu = create_same_stride((M, T), X1, X1.dtype,
                                         f'cuda:{gpu_info[i].Id}')  # M x T
        wrlk.append(cur_out_gpu)

        cur_w = None
        if w is not None:
            cur_w = w.narrow(0, block_sizes[i], bwidth)
        args.append((ArgsFdmmv(X1=X1.narrow(0, block_sizes[i], bwidth),
                               X2=X2,
                               v=v,
                               w=cur_w,
                               out=cur_out_gpu,
                               kernel=kernel,
                               max_mem=g.usable_ram), g.Id))

    # If using torch multiprocessing with spawn/forkserver here we must make sure
    # that any consumer of the queues is on a different process than the queue producer.
    # This is due to passing in a CUDA tensor to the queue
    # https://pytorch.org/docs/stable/multiprocessing.html#sharing-cuda-tensors
    # Thus we cannot run the first task on the current process.
    if kernel.kernel_type == "l2distance" and kernel.name == "gaussian":
        target = distk_fdmmv
    else:
        target = generic_fdmmv
    _start_wait_processes(target, args)

    if len(wrlk) > 1:
        # noinspection PyTypeChecker
        fastest_device: int = np.argmax([d.speed for d in gpu_info])
        out.copy_(
            tcd.comm.reduce_add(wrlk, destination=gpu_info[fastest_device].Id))
    else:
        out.copy_(wrlk[0])
    return out
Example #13
def fmmv_cpu(X1, X2, v, kernel, out, opt):
    """Blockwise kernel-vector product

    This function computes
    ```
    kernel(X1, X2) @ v
    ```
    in a blockwise fashion, to avoid having the whole N*M kernel
    matrix in memory at once.
    Note that while the principle is that of matrix-vector product,
    `v` can have more than one column.

    Parameters
    -----------
     - X1 : [N, D] array
     - X2 : [M, D] array
     - v  : [M, T] array
     - kernel : Kernel
        Class representing the desired kernel function
     - out : [N, T] array (optional)
        Array for storing the kernel-vector product output.
        If None, will be allocated within the function.
     - opt : Union(Dict, CompOpt)
        Options dictionary. Supported options are
         - 'max_cpu_mem', sets the maximum amount of RAM the program should use.
         - 'final_type', the data-type of the output array. If `out` is not None and its
            data-type clashes with the setting of 'final_type', the `out` matrix will not be
            modified.
    """
    opt = _setup_opt(opt, is_cpu=True)

    ntot, dtot = X1.size(0), X1.size(1)
    M, T = v.size()
    dtype = v.dtype

    # Create output matrix
    if out is None:
        out = torch.empty(ntot, T, dtype=dtype)

    avail_mem = _get_cpu_ram(opt, 0.95) / sizeof_dtype(dtype)
    # The only necessary memory allocation is for the temporary kernel
    # block `temp_out` of size n*M
    n, d = select_dim_over_d(maxD=dtot,
                             maxN=ntot,
                             coef_nd=0,
                             coef_n=M,
                             coef_d=0,
                             rest=0,
                             tot=avail_mem)

    # Run batched matrix multiplication
    for i in range(0, ntot, n):
        ic = min(n, ntot - i)

        ddd = kernel._prepare(X1.narrow(0, i, ic), X2)
        temp_out = torch.zeros(ic, M, dtype=dtype)
        for k in range(0, dtot, d):
            kc = min(d, dtot - k)
            X1d = X1[i:i + ic, k:k + kc]
            X2d = X2[:, k:k + kc]
            kernel._apply(X1d, X2d.T, temp_out)

        # temp_out = fnc(X1*X2', X1, X2)
        kernel._finalize(temp_out, ddd)

        torch.mm(temp_out, v, out=out[i:i + ic, :])
    return out
Example #14
def fdmmv_cuda(X1: torch.Tensor,
               X2: torch.Tensor,
               v: Optional[torch.Tensor],
               w: Optional[torch.Tensor],
               kernel,
               out: Optional[torch.Tensor] = None,
               opt: Optional[BaseOptions] = None) -> torch.Tensor:
    """
    X1 : N x D
    X2 : M x D
    v  : M x T
    w  : N x T

    performs fnc(X1*X2', X1, X2)' * ( fnc(X1*X2', X1, X2) * v  +  w )  : M x T
    in blocks on multiple GPUs

    Assume all inputs have the same data type
    """
    opt = _setup_opt(opt)
    _check_contiguity((X1, 'X1'), (X2, 'X2'), (v, 'v'), (w, 'w'), (out, 'out'))
    device = X1.device
    if v is None and w is None:
        raise ValueError("one of 'v' or 'w' must not be None.")

    T = v.size(1) if v is not None else w.size(1)
    M = X2.size(0)
    N = X1.size(0)

    if out is None:
        out = create_same_stride((M, T),
                                 X1,
                                 X1.dtype,
                                 device=device,
                                 pin_memory=device.type != 'cuda')

    gpu_info = _get_gpu_info(opt, slack=0.9)

    if kernel.kernel_type == "l2distance" and kernel.name == "gaussian":
        target = distk_fdmmv
    else:
        target = generic_fdmmv

    if device.type == 'cuda':
        single_gpu_info = [g for g in gpu_info if g.Id == device.index][0]
        args = ArgsFdmmv(X1=X1,
                         X2=X2,
                         v=v,
                         w=w,
                         out=out,
                         kernel=kernel,
                         max_mem=single_gpu_info.usable_ram)
        _call_direct(target, (args, device.index))
    else:
        block_sizes = calc_gpu_block_sizes(gpu_info, N)
        wrlk = []  # outputs for each subprocess.
        args = []
        for i, g in enumerate(gpu_info):
            bwidth = block_sizes[i + 1] - block_sizes[i]
            if bwidth <= 0:
                continue

            cur_out_gpu = create_same_stride((M, T), X1, X1.dtype,
                                             f'cuda:{gpu_info[i].Id}')  # M x T
            wrlk.append(cur_out_gpu)

            cur_w = None
            if w is not None:
                cur_w = w.narrow(0, block_sizes[i], bwidth)
            args.append((ArgsFdmmv(X1=X1.narrow(0, block_sizes[i], bwidth),
                                   X2=X2,
                                   v=v,
                                   w=cur_w,
                                   out=cur_out_gpu,
                                   kernel=kernel,
                                   max_mem=g.usable_ram), g.Id))
        _start_wait_processes(target, args)
        if len(wrlk) > 1:
            # noinspection PyTypeChecker
            fastest_device: int = np.argmax([d.speed for d in gpu_info])
            out.copy_(
                tcd.comm.reduce_add(wrlk,
                                    destination=gpu_info[fastest_device].Id))
        else:
            out.copy_(wrlk[0])
    return out