Example #1
def fmmv_cuda_sparse(X1: SparseTensor,
                     X2: SparseTensor,
                     v: torch.Tensor,
                     kernel,
                     out: Optional[torch.Tensor] = None,
                     opt: Optional[BaseOptions] = None) -> torch.Tensor:
    opt = _setup_opt(opt)
    _check_contiguity((v, 'v'), (out, 'out'))

    N = X1.size(0)
    # Create output matrix
    if out is None:
        out = create_fortran((N, v.size(1)), v.dtype, 'cpu', pin_memory=True)
    out.fill_(0.0)

    gpu_info = _get_gpu_info(opt, slack=0.9)
    block_sizes = calc_gpu_block_sizes(gpu_info, N)

    # Create queues
    args = []  # Arguments passed to each subprocess
    for i, g in enumerate(gpu_info):
        bwidth = block_sizes[i + 1] - block_sizes[i]
        if bwidth <= 0:
            continue
        args.append((ArgsFmmv(X1=X1.narrow_rows(block_sizes[i], bwidth),
                              X2=X2,
                              v=v,
                              out=out.narrow(0, block_sizes[i], bwidth),
                              kernel=kernel,
                              max_mem=g.usable_ram), g.Id))

    _start_wait_processes(sparse_fmmv, args)
    return out
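
The function above relies on Tensor.narrow returning a view: each worker process fills its own slice of the preallocated, pinned output, so nothing has to be copied back explicitly. Below is a minimal sketch of that pattern in plain PyTorch; the boundary list is a hypothetical stand-in for what calc_gpu_block_sizes produces (one cumulative row index per device, plus the final N).

import torch

N, M, T = 1_000, 50, 4
v = torch.randn(M, T)
out = torch.zeros(N, T)  # preallocated output shared by every block

# Hypothetical boundary array mimicking the block_sizes contract used above:
# block i owns rows block_sizes[i] : block_sizes[i + 1].
block_sizes = [0, 400, 1_000]

for i in range(len(block_sizes) - 1):
    start = block_sizes[i]
    bwidth = block_sizes[i + 1] - start
    out_block = out.narrow(0, start, bwidth)  # a view into `out`, not a copy
    # Stand-in for the per-GPU sparse kernel-vector product: writing into the
    # view updates the corresponding rows of `out` in place.
    out_block.copy_(torch.randn(bwidth, M) @ v)

assert out.shape == (N, T)

In the real function each view is handed to a separate process pinned to one GPU; the pinned host memory lets the device-to-host copies of each block proceed asynchronously.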
Example #2
def fdmmv_cuda_sparse(X1: SparseTensor,
                      X2: SparseTensor,
                      v: Optional[torch.Tensor],
                      w: Optional[torch.Tensor],
                      kernel,
                      out: Optional[torch.Tensor] = None,
                      opt: Optional[BaseOptions] = None) -> torch.Tensor:
    opt = _setup_opt(opt)
    _check_contiguity((v, 'v'), (w, 'w'), (out, 'out'))
    if v is None and w is None:
        raise ValueError("one of 'v' or 'w' must not be None.")

    T = v.size(1) if v is not None else w.size(1)
    M = X2.size(0)
    N = X1.size(0)

    gpu_info = _get_gpu_info(opt, slack=0.95)
    block_sizes = calc_gpu_block_sizes(gpu_info, N)

    if out is None:
        out = create_C((M, T), X1.dtype, 'cpu', pin_memory=True)

    wrlk = []  # outputs for each subprocess.
    args = []
    for i, g in enumerate(gpu_info):
        bwidth = block_sizes[i + 1] - block_sizes[i]
        if bwidth <= 0:
            continue
        cur_out_gpu = create_C((M, T), X1.dtype,
                               f'cuda:{gpu_info[i].Id}')  # M x T
        wrlk.append(cur_out_gpu)

        cur_w = None

        if w is not None:
            cur_w = w.narrow(0, block_sizes[i], bwidth)
        args.append((ArgsFdmmv(X1=X1.narrow_rows(block_sizes[i], bwidth),
                               X2=X2,
                               v=v,
                               w=cur_w,
                               out=cur_out_gpu,
                               kernel=kernel,
                               max_mem=g.usable_ram), g.Id))

    _start_wait_processes(sparse_fdmmv, args)

    if len(wrlk) > 1:
        # noinspection PyTypeChecker
        fastest_device: int = np.argmax([d.speed for d in gpu_info])
        out.copy_(
            torch.cuda.comm.reduce_add(
                wrlk, destination=gpu_info[fastest_device].Id))
    else:
        out.copy_(wrlk[0])
    return out
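
Unlike the fmmv case, each GPU here computes a full M x T partial result, so the per-device outputs must be summed rather than concatenated. Below is a minimal, guarded sketch of that reduction using only stock PyTorch; it falls back to a plain CPU sum when fewer than two CUDA devices are visible.

import torch
import torch.cuda.comm

M, T = 50, 4
n_dev = torch.cuda.device_count()
if n_dev >= 2:
    # One partial result per device, analogous to `wrlk` above.
    partials = [torch.randn(M, T, device=f'cuda:{i}') for i in range(n_dev)]
    # Sum every partial onto a single destination device.
    total = torch.cuda.comm.reduce_add(partials, destination=0)
    out = torch.empty(M, T, pin_memory=True)
    out.copy_(total)
else:
    # Logical equivalent without multiple GPUs: an element-wise sum.
    partials = [torch.randn(M, T) for _ in range(3)]
    out = torch.stack(partials).sum(dim=0)
assert out.shape == (M, T)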
Example #3
def fmmv_cuda(X1: torch.Tensor,
              X2: torch.Tensor,
              v: torch.Tensor,
              kernel,
              out: Optional[torch.Tensor] = None,
              opt: Optional[BaseOptions] = None) -> torch.Tensor:
    """
    X1 : N x D
    X2 : M x D
    v  : M x T

    performs  fnc(X1*X2', X1, X2) * v   : N x T
    in blocks on multiple GPUs
    """
    opt = _setup_opt(opt)
    _check_contiguity((X1, 'X1'), (X2, 'X2'), (v, 'v'), (out, 'out'))

    N = X1.size(0)
    # Create output matrix
    if out is None:
        out = create_same_stride((N, v.size(1)),
                                 X1,
                                 v.dtype,
                                 'cpu',
                                 pin_memory=True)
    out.fill_(0.0)

    gpu_info = _get_gpu_info(opt, slack=0.9)
    block_sizes = calc_gpu_block_sizes(gpu_info, N)

    # Create queues
    args = []  # Arguments passed to each subprocess
    for i, g in enumerate(gpu_info):
        bwidth = block_sizes[i + 1] - block_sizes[i]
        if bwidth <= 0:
            continue
        args.append((ArgsFmmv(X1=X1.narrow(0, block_sizes[i], bwidth),
                              X2=X2,
                              v=v,
                              out=out.narrow(0, block_sizes[i], bwidth),
                              kernel=kernel,
                              max_mem=g.usable_ram), g.Id))

    # When using torch multiprocessing with spawn/forkserver, any consumer of
    # the queues must run in a different process than the queue producer,
    # because CUDA tensors are passed through the queues
    # (https://pytorch.org/docs/stable/multiprocessing.html#sharing-cuda-tensors).
    # Thus we cannot run the first task on the current process.
    if kernel.kernel_type == "l2distance" and kernel.name == "gaussian":
        target = distk_fmmv
    else:
        target = generic_fmmv
    _start_wait_processes(target, args)
    return out
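
The docstring's fnc(X1*X2', X1, X2) * v is a kernel-matrix / vector product evaluated block-by-block, so the full N x M kernel matrix never has to be materialised at once. Below is a single-device sketch of the same computation, with a Gaussian kernel written directly via torch.cdist (the bandwidth and block size are arbitrary, and the normalisation may differ from the library's own Gaussian kernel).

import torch

def gaussian_mmv(X1, X2, v, sigma=1.0, block=256):
    """exp(-||x1 - x2||^2 / (2 sigma^2)) @ v, computed one row block at a time."""
    N = X1.size(0)
    out = torch.zeros(N, v.size(1), dtype=v.dtype)
    for start in range(0, N, block):
        stop = min(start + block, N)
        d2 = torch.cdist(X1[start:stop], X2).pow_(2)   # block x M squared distances
        K = torch.exp(d2.div_(-2 * sigma ** 2))        # block x M kernel block
        out[start:stop] = K @ v
    return out

X1, X2, v = torch.randn(1_000, 5), torch.randn(300, 5), torch.randn(300, 2)
assert gaussian_mmv(X1, X2, v).shape == (1_000, 2)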
Example #4
def fmmv_cuda(X1: torch.Tensor,
              X2: torch.Tensor,
              v: torch.Tensor,
              kernel,
              out: Optional[torch.Tensor] = None,
              opt: Optional[BaseOptions] = None) -> torch.Tensor:
    """
    X1 : N x D
    X2 : M x D
    v  : M x T

    performs  fnc(X1*X2', X1, X2) * v   : N x T
    in blocks on multiple GPUs
    """
    opt = _setup_opt(opt)
    _check_contiguity((X1, 'X1'), (X2, 'X2'), (v, 'v'), (out, 'out'))
    device = X1.device

    N = X1.size(0)
    # Create output matrix
    if out is None:
        out = create_same_stride((N, v.size(1)), X1, v.dtype, device=device,
                                 pin_memory=device.type != 'cuda')
    out.fill_(0.0)

    if kernel.kernel_type == "l2distance" and kernel.name == "gaussian":
        target = distk_fmmv
    else:
        target = generic_fmmv

    gpu_info = _get_gpu_info(opt, slack=0.9)

    if device.type == 'cuda':
        single_gpu_info = [g for g in gpu_info if g.Id == device.index][0]
        args = ArgsFmmv(X1=X1, X2=X2, v=v, out=out, kernel=kernel,
                        max_mem=single_gpu_info.usable_ram)
        _call_direct(target, (args, device.index))
    else:
        block_sizes = calc_gpu_block_sizes(gpu_info, N)
        # Create queues
        args = []  # Arguments passed to each subprocess
        for i, g in enumerate(gpu_info):
            bwidth = block_sizes[i + 1] - block_sizes[i]
            if bwidth <= 0:
                continue
            args.append((ArgsFmmv(
                X1=X1.narrow(0, block_sizes[i], bwidth),
                X2=X2, v=v,
                out=out.narrow(0, block_sizes[i], bwidth),
                kernel=kernel, max_mem=g.usable_ram), g.Id))

        _start_wait_processes(target, args)
    return out
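
Compared with Example #3, this version runs directly in-process when the inputs already live on a GPU; only CPU-resident inputs get a pinned output and a multi-process split across devices. Below is a small sketch of that allocation choice (assuming a CUDA-capable PyTorch build, since pinning host memory requires one).

import torch

def allocate_output(X1: torch.Tensor, n: int, t: int) -> torch.Tensor:
    device = X1.device
    # Pin the output only when it lives in host memory: pinned pages make the
    # device-to-host copies from the worker GPUs asynchronous. A CUDA-resident
    # output needs no pinning.
    return torch.empty(n, t, dtype=X1.dtype, device=device,
                       pin_memory=(device.type != 'cuda'))

cpu_out = allocate_output(torch.randn(10, 3), 10, 2)
assert cpu_out.is_pinned()
if torch.cuda.is_available():
    gpu_out = allocate_output(torch.randn(10, 3, device='cuda'), 10, 2)
    assert gpu_out.device.type == 'cuda'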
Example #5
def fdmmv_cuda(X1: torch.Tensor,
               X2: torch.Tensor,
               v: Optional[torch.Tensor],
               w: Optional[torch.Tensor],
               kernel,
               out: Optional[torch.Tensor] = None,
               opt: Optional[BaseOptions] = None) -> torch.Tensor:
    """
    X1 : N x D
    X2 : M x D
    v  : M x T
    w  : N x T

    performs fnc(X1*X2', X1, X2)' * ( fnc(X1*X2', X1, X2) * v  +  w )  : M x T
    in blocks on multiple GPUs

    Assume all inputs have the same data type
    """
    opt = _setup_opt(opt)
    _check_contiguity((X1, 'X1'), (X2, 'X2'), (v, 'v'), (w, 'w'), (out, 'out'))
    if v is None and w is None:
        raise ValueError("one of 'v' or 'w' must not be None.")

    T = v.size(1) if v is not None else w.size(1)
    M = X2.size(0)
    N = X1.size(0)

    gpu_info = _get_gpu_info(opt, slack=0.9)
    block_sizes = calc_gpu_block_sizes(gpu_info, N)

    if out is None:
        out = create_same_stride((M, T), X1, X1.dtype, 'cpu', pin_memory=True)

    wrlk = []  # outputs for each subprocess.
    args = []
    for i, g in enumerate(gpu_info):
        bwidth = block_sizes[i + 1] - block_sizes[i]
        if bwidth <= 0:
            continue

        cur_out_gpu = create_same_stride((M, T), X1, X1.dtype,
                                         f'cuda:{gpu_info[i].Id}')  # M x T
        wrlk.append(cur_out_gpu)

        cur_w = None
        if w is not None:
            cur_w = w.narrow(0, block_sizes[i], bwidth)
        args.append((ArgsFdmmv(X1=X1.narrow(0, block_sizes[i], bwidth),
                               X2=X2,
                               v=v,
                               w=cur_w,
                               out=cur_out_gpu,
                               kernel=kernel,
                               max_mem=g.usable_ram), g.Id))

    # When using torch multiprocessing with spawn/forkserver, any consumer of
    # the queues must run in a different process than the queue producer,
    # because CUDA tensors are passed through the queues
    # (https://pytorch.org/docs/stable/multiprocessing.html#sharing-cuda-tensors).
    # Thus we cannot run the first task on the current process.
    if kernel.kernel_type == "l2distance" and kernel.name == "gaussian":
        target = distk_fdmmv
    else:
        target = generic_fdmmv
    _start_wait_processes(target, args)

    if len(wrlk) > 1:
        # noinspection PyTypeChecker
        fastest_device: int = np.argmax([d.speed for d in gpu_info])
        out.copy_(
            tcd.comm.reduce_add(wrlk, destination=gpu_info[fastest_device].Id))
    else:
        out.copy_(wrlk[0])
    return out
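
To make the docstring formula concrete: with K = fnc(X1*X2', X1, X2) of shape N x M, the operation is K' @ (K @ v + w), yielding an M x T result. Below is a dense single-device reference (Gaussian kernel via torch.cdist, arbitrary bandwidth) that can serve as a correctness check for the blocked multi-GPU path.

import torch

def gaussian_dmmv_reference(X1, X2, v=None, w=None, sigma=1.0):
    """Reference for K.T @ (K @ v + w), with K the dense N x M Gaussian kernel."""
    K = torch.exp(torch.cdist(X1, X2).pow_(2).div_(-2 * sigma ** 2))  # N x M
    T = (v if v is not None else w).size(1)
    inner = torch.zeros(X1.size(0), T, dtype=X1.dtype)                # N x T
    if v is not None:
        inner = inner + K @ v
    if w is not None:
        inner = inner + w
    return K.t() @ inner                                              # M x T

X1, X2 = torch.randn(200, 5), torch.randn(80, 5)
v, w = torch.randn(80, 3), torch.randn(200, 3)
assert gaussian_dmmv_reference(X1, X2, v, w).shape == (80, 3)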
Example #6
def run_keops_mmv(X1: torch.Tensor,
                  X2: torch.Tensor,
                  v: torch.Tensor,
                  other_vars: List[torch.Tensor],
                  out: Optional[torch.Tensor],
                  formula: str,
                  aliases: List[str],
                  axis: int,
                  reduction: str = 'Sum',
                  opt: Optional[FalkonOptions] = None) -> torch.Tensor:
    if opt is None:
        opt = FalkonOptions()
    # Choose backend
    N, D = X1.shape
    T = v.shape[1]
    backend = _decide_backend(opt, D)
    dtype = _keops_dtype(X1.dtype)
    device = X1.device

    if not check_same_device(X1, X2, v, out, *other_vars):
        raise RuntimeError("All input tensors must be on the same device.")
    if (device.type == 'cuda') and (not backend.startswith("GPU")):
        warnings.warn(
            "KeOps backend was chosen to be CPU, but GPU input tensors found. "
            "Defaulting to 'GPU_1D' backend. To force usage of the CPU backend, "
            "please pass CPU tensors; to avoid this warning if the GPU backend is "
            "desired, check your options (i.e. set 'use_cpu=False').")
        backend = "GPU_1D"

    # Define formula wrapper
    fn = Genred(formula,
                aliases,
                reduction_op=reduction,
                axis=axis,
                dtype=dtype,
                dtype_acc=opt.keops_acc_dtype,
                sum_scheme=opt.keops_sum_scheme)

    # Create output matrix
    if out is None:
        # noinspection PyArgumentList
        out = torch.empty(N,
                          T,
                          dtype=X1.dtype,
                          device=device,
                          pin_memory=(backend != 'CPU')
                          and (device.type == 'cpu'))

    if backend.startswith("GPU") and device.type == 'cpu':
        # slack is high due to imprecise memory usage estimates for keops
        gpu_info = _get_gpu_info(opt, slack=opt.keops_memory_slack)
        block_sizes = calc_gpu_block_sizes(gpu_info, N)

        # Create queues
        args = []  # Arguments passed to each subprocess
        for i, g in enumerate(gpu_info):
            # First round of subdivision
            bwidth = block_sizes[i + 1] - block_sizes[i]
            if bwidth <= 0:
                continue
            args.append((ArgsFmmv(X1=X1.narrow(0, block_sizes[i], bwidth),
                                  X2=X2,
                                  v=v,
                                  out=out.narrow(0, block_sizes[i], bwidth),
                                  other_vars=other_vars,
                                  function=fn,
                                  backend=backend,
                                  gpu_ram=g.usable_ram), g.Id))
        _start_wait_processes(_single_gpu_method, args)
    else:  # Run on CPU or GPU with CUDA inputs
        variables = [X1, X2, v] + other_vars
        if device.type == 'cuda':
            with torch.cuda.device(device):
                sync_current_stream(device)
                out = fn(*variables, out=out, backend=backend)
        else:
            out = fn(*variables, out=out, backend=backend)

    return out
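
For readers unfamiliar with KeOps, Genred compiles a symbolic formula into a lazy reduction over the i or j index; the function above only wraps it with device dispatch and blocking. Below is a minimal sketch of a Gaussian kernel-vector product written this way (pykeops must be installed; keyword arguments such as dtype, dtype_acc and sum_scheme used above belong to the pykeops versions this code targets and may differ in newer releases).

import torch
from pykeops.torch import Genred

# Sum_j exp(-||x_i - y_j||^2) * v_j  ->  one output row per index i
formula = 'Exp(-SqDist(x, y)) * v'
aliases = ['x = Vi(3)',   # x_i varies over the i axis, dimension 3
           'y = Vj(3)',   # y_j varies over the j axis, dimension 3
           'v = Vj(2)']   # v_j varies over the j axis, dimension 2
fn = Genred(formula, aliases, reduction_op='Sum', axis=1)

x, y, v = torch.randn(1_000, 3), torch.randn(500, 3), torch.randn(500, 2)
out = fn(x, y, v, backend='auto')   # 1000 x 2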
Example #7
def fdmmv_cuda(X1: torch.Tensor,
               X2: torch.Tensor,
               v: Optional[torch.Tensor],
               w: Optional[torch.Tensor],
               kernel,
               out: Optional[torch.Tensor] = None,
               opt: Optional[BaseOptions] = None) -> torch.Tensor:
    """
    X1 : N x D
    X2 : M x D
    v  : M x T
    w  : N x T

    performs fnc(X1*X2', X1, X2)' * ( fnc(X1*X2', X1, X2) * v  +  w )  : M x T
    in blocks on multiple GPUs

    Assume all inputs have the same data type
    """
    opt = _setup_opt(opt)
    _check_contiguity((X1, 'X1'), (X2, 'X2'), (v, 'v'), (w, 'w'), (out, 'out'))
    device = X1.device
    if v is None and w is None:
        raise ValueError("one of 'v' or 'w' must not be None.")

    T = v.size(1) if v is not None else w.size(1)
    M = X2.size(0)
    N = X1.size(0)

    if out is None:
        out = create_same_stride((M, T),
                                 X1,
                                 X1.dtype,
                                 device=device,
                                 pin_memory=device.type != 'cuda')

    gpu_info = _get_gpu_info(opt, slack=0.9)

    if kernel.kernel_type == "l2distance" and kernel.name == "gaussian":
        target = distk_fdmmv
    else:
        target = generic_fdmmv

    if device.type == 'cuda':
        single_gpu_info = [g for g in gpu_info if g.Id == device.index][0]
        args = ArgsFdmmv(X1=X1,
                         X2=X2,
                         v=v,
                         w=w,
                         out=out,
                         kernel=kernel,
                         max_mem=single_gpu_info.usable_ram)
        _call_direct(target, (args, device.index))
    else:
        block_sizes = calc_gpu_block_sizes(gpu_info, N)
        wrlk = []  # outputs for each subprocess.
        args = []
        for i, g in enumerate(gpu_info):
            bwidth = block_sizes[i + 1] - block_sizes[i]
            if bwidth <= 0:
                continue

            cur_out_gpu = create_same_stride((M, T), X1, X1.dtype,
                                             f'cuda:{gpu_info[i].Id}')  # M x T
            wrlk.append(cur_out_gpu)

            cur_w = None
            if w is not None:
                cur_w = w.narrow(0, block_sizes[i], bwidth)
            args.append((ArgsFdmmv(X1=X1.narrow(0, block_sizes[i], bwidth),
                                   X2=X2,
                                   v=v,
                                   w=cur_w,
                                   out=cur_out_gpu,
                                   kernel=kernel,
                                   max_mem=g.usable_ram), g.Id))
        _start_wait_processes(target, args)
        if len(wrlk) > 1:
            # noinspection PyTypeChecker
            fastest_device: int = np.argmax([d.speed for d in gpu_info])
            out.copy_(
                tcd.comm.reduce_add(wrlk,
                                    destination=gpu_info[fastest_device].Id))
        else:
            out.copy_(wrlk[0])
    return out