Ejemplo n.º 1
0
def fdmmv_cuda_sparse(X1: SparseTensor,
                      X2: SparseTensor,
                      v: Optional[torch.Tensor],
                      w: Optional[torch.Tensor],
                      kernel,
                      out: Optional[torch.Tensor] = None,
                      opt: Optional[BaseOptions] = None) -> torch.Tensor:
    """
    Run `sparse_fdmmv` on sparse inputs, on a single GPU or split over several.

    With N = X1.size(0), M = X2.size(0) and T the number of columns of `v`
    (or of `w` when `v` is None), the result is an M x T tensor.

    - If `X1` lives on a CUDA device, the whole problem is run directly on
      that device.
    - Otherwise the rows of `X1` (and the matching rows of `w`, which is
      narrowed along dim 0 with the same blocks) are partitioned across the
      available GPUs. Each GPU that receives a non-empty block writes into its
      own M x T buffer, and the buffers are summed into `out`.

    Parameters
    ----------
    X1, X2 : SparseTensor
        Sparse input matrices.
    v, w : torch.Tensor or None
        At least one of the two must be given.
    kernel
        Forwarded unchanged to `sparse_fdmmv`.
    out : torch.Tensor, optional
        Destination tensor; allocated with `create_C` when omitted.
    opt : BaseOptions, optional
        Options, normalised through `_setup_opt` and used to query the GPUs.

    Returns
    -------
    torch.Tensor
        `out`, holding the M x T result.

    Raises
    ------
    ValueError
        If both `v` and `w` are None.
    IndexError
        If `X1` is on a CUDA device that is not among the GPUs returned by
        `_get_gpu_info`.
    """
    opt = _setup_opt(opt)
    _check_contiguity((v, 'v'), (w, 'w'), (out, 'out'))
    device = X1.device
    if v is None and w is None:
        raise ValueError("one of 'v' or 'w' must not be None.")
    T = v.size(1) if v is not None else w.size(1)
    M = X2.size(0)
    N = X1.size(0)
    # Create output matrix
    if out is None:
        out = create_C((M, T), X1.dtype, device, pin_memory=device.type != 'cuda')

    gpu_info = _get_gpu_info(opt, slack=0.95)

    if device.type == 'cuda':
        # Data already sits on one GPU: run there, no splitting or reduction.
        single_gpu_info = [g for g in gpu_info if g.Id == device.index][0]
        args = ArgsFdmmv(X1=X1, X2=X2, v=v, w=w, out=out, kernel=kernel,
                         max_mem=single_gpu_info.usable_ram)
        _call_direct(sparse_fdmmv, (args, device.index))
        return out

    block_sizes = calc_gpu_block_sizes(gpu_info, N)
    active_gpus = []   # GPUs that were assigned a non-empty block of rows
    partial_outs = []  # per-GPU M x T partial outputs, aligned with active_gpus
    args = []
    for i, g in enumerate(gpu_info):
        bwidth = block_sizes[i + 1] - block_sizes[i]
        if bwidth <= 0:
            continue
        cur_out_gpu = create_C((M, T), X1.dtype, f'cuda:{g.Id}')  # M x T
        active_gpus.append(g)
        partial_outs.append(cur_out_gpu)

        cur_w = None
        if w is not None:
            cur_w = w.narrow(0, block_sizes[i], bwidth)
        args.append((ArgsFdmmv(
            X1=X1.narrow_rows(block_sizes[i], bwidth),
            X2=X2, v=v, w=cur_w, out=cur_out_gpu,
            kernel=kernel, max_mem=g.usable_ram), g.Id))
    _start_wait_processes(sparse_fdmmv, args)
    if len(partial_outs) > 1:
        # Reduce onto the fastest GPU *among those holding a partial output*.
        # Choosing from the full gpu_info list could select a GPU that was
        # skipped above (empty block), and reduce_add needs the destination to
        # hold one of the inputs. max() keeps the first maximum, like argmax.
        fastest_gpu = max(active_gpus, key=lambda gpu: gpu.speed)
        out.copy_(
            torch.cuda.comm.reduce_add(
                partial_outs, destination=fastest_gpu.Id))
    else:
        out.copy_(partial_outs[0])
    return out
Ejemplo n.º 2
0
def fmmv_cuda(X1: torch.Tensor,
              X2: torch.Tensor,
              v: torch.Tensor,
              kernel,
              out: Optional[torch.Tensor] = None,
              opt: Optional[BaseOptions] = None) -> torch.Tensor:
    """
    Blocked kernel-vector product on one or more GPUs.

    X1 : N x D
    X2 : M x D
    v  : M x T

    Computes  fnc(X1*X2', X1, X2) * v  (N x T) and returns it in `out`.

    When `X1` is already on a CUDA device the computation runs directly on
    that device. Otherwise the rows of `X1` (and the matching rows of `out`)
    are split across the available GPUs and each block is handled by a
    separate worker.

    `out` is allocated when omitted and is always zero-filled before use.
    """
    opt = _setup_opt(opt)
    _check_contiguity((X1, 'X1'), (X2, 'X2'), (v, 'v'), (out, 'out'))
    device = X1.device
    on_gpu = device.type == 'cuda'
    num_rows = X1.size(0)

    # Allocate the result if the caller did not supply one, then reset it.
    if out is None:
        out = create_same_stride((num_rows, v.size(1)), X1, v.dtype, device=device,
                                 pin_memory=not on_gpu)
    out.fill_(0.0)

    # The Gaussian L2-distance kernel has a dedicated worker.
    is_gaussian = kernel.kernel_type == "l2distance" and kernel.name == "gaussian"
    target = distk_fmmv if is_gaussian else generic_fmmv

    gpu_info = _get_gpu_info(opt, slack=0.9)

    if on_gpu:
        # Single-device path: use the memory budget of the GPU holding X1.
        matching = [g for g in gpu_info if g.Id == device.index]
        direct_args = ArgsFmmv(X1=X1, X2=X2, v=v, out=out, kernel=kernel,
                               max_mem=matching[0].usable_ram)
        _call_direct(target, (direct_args, device.index))
        return out

    # Multi-device path: one job per GPU that receives a non-empty row block.
    block_sizes = calc_gpu_block_sizes(gpu_info, num_rows)
    jobs = []
    for idx, gpu in enumerate(gpu_info):
        start = block_sizes[idx]
        length = block_sizes[idx + 1] - start
        if length > 0:
            job = ArgsFmmv(X1=X1.narrow(0, start, length),
                           X2=X2,
                           v=v,
                           out=out.narrow(0, start, length),
                           kernel=kernel,
                           max_mem=gpu.usable_ram)
            jobs.append((job, gpu.Id))

    _start_wait_processes(target, jobs)
    return out
Ejemplo n.º 3
0
def fmmv_cuda_sparse(X1: SparseTensor,
                     X2: SparseTensor,
                     v: torch.Tensor,
                     kernel,
                     out: Optional[torch.Tensor] = None,
                     opt: Optional[BaseOptions] = None) -> torch.Tensor:
    """
    Run `sparse_fmmv` on sparse inputs, on a single GPU or split over several.

    The result has shape (X1.size(0), v.size(1)) and is returned in `out`,
    which is allocated with `create_fortran` when omitted and is always
    zero-filled before use.

    When `X1` is on a CUDA device the worker is called directly on that
    device. Otherwise the rows of `X1` (and the matching rows of `out`) are
    split across the available GPUs, one worker per non-empty block.

    NOTE(review): presumably the sparse analogue of a kernel-vector product;
    confirm against `sparse_fmmv`.
    """
    opt = _setup_opt(opt)
    _check_contiguity((v, 'v'), (out, 'out'))
    device = X1.device
    on_gpu = device.type == 'cuda'
    num_rows = X1.size(0)

    # Allocate the result if the caller did not supply one, then reset it.
    if out is None:
        out = create_fortran((num_rows, v.size(1)), X1.dtype, device,
                             pin_memory=not on_gpu)
    out.fill_(0.0)

    gpu_info = _get_gpu_info(opt, slack=0.9)

    if on_gpu:
        # Single-device path: use the memory budget of the GPU holding X1.
        matching = [g for g in gpu_info if g.Id == device.index]
        direct_args = ArgsFmmv(X1=X1, X2=X2, v=v, out=out, kernel=kernel,
                               max_mem=matching[0].usable_ram)
        _call_direct(sparse_fmmv, (direct_args, device.index))
        return out

    # Multi-device path: one job per GPU that receives a non-empty row block.
    block_sizes = calc_gpu_block_sizes(gpu_info, num_rows)
    jobs = []
    for idx, gpu in enumerate(gpu_info):
        start = block_sizes[idx]
        length = block_sizes[idx + 1] - start
        if length > 0:
            job = ArgsFmmv(X1=X1.narrow_rows(start, length),
                           X2=X2,
                           v=v,
                           out=out.narrow(0, start, length),
                           kernel=kernel,
                           max_mem=gpu.usable_ram)
            jobs.append((job, gpu.Id))

    _start_wait_processes(sparse_fmmv, jobs)
    return out
Ejemplo n.º 4
0
def fdmmv_cuda(X1: torch.Tensor,
               X2: torch.Tensor,
               v: Optional[torch.Tensor],
               w: Optional[torch.Tensor],
               kernel,
               out: Optional[torch.Tensor] = None,
               opt: Optional[BaseOptions] = None) -> torch.Tensor:
    """
    X1 : N x D
    X2 : M x D
    v  : M x T
    w  : N x T

    performs fnc(X1*X2', X1, X2)' * ( fnc(X1*X2', X1, X2) * v  +  w )  : M x T
    in blocks on multiple GPUs

    Assume all inputs have the same data type

    At least one of `v` and `w` must be given. If `X1` is on a CUDA device
    the computation runs directly on that device. Otherwise the rows of `X1`
    (and of `w`) are partitioned across the available GPUs; each GPU that
    receives a non-empty block writes into its own M x T buffer and the
    buffers are summed into `out`.

    Parameters
    ----------
    kernel
        Kernel object; a dedicated worker is used when its `kernel_type` is
        "l2distance" and its `name` is "gaussian", a generic one otherwise.
    out : torch.Tensor, optional
        Destination tensor; allocated with `create_same_stride` when omitted.
    opt : BaseOptions, optional
        Options, normalised through `_setup_opt` and used to query the GPUs.

    Returns
    -------
    torch.Tensor
        `out`, holding the M x T result.

    Raises
    ------
    ValueError
        If both `v` and `w` are None.
    IndexError
        If `X1` is on a CUDA device that is not among the GPUs returned by
        `_get_gpu_info`.
    """
    opt = _setup_opt(opt)
    _check_contiguity((X1, 'X1'), (X2, 'X2'), (v, 'v'), (w, 'w'), (out, 'out'))
    device = X1.device
    if v is None and w is None:
        raise ValueError("one of 'v' or 'w' must not be None.")

    T = v.size(1) if v is not None else w.size(1)
    M = X2.size(0)
    N = X1.size(0)

    if out is None:
        out = create_same_stride((M, T),
                                 X1,
                                 X1.dtype,
                                 device=device,
                                 pin_memory=device.type != 'cuda')

    gpu_info = _get_gpu_info(opt, slack=0.9)

    if kernel.kernel_type == "l2distance" and kernel.name == "gaussian":
        target = distk_fdmmv
    else:
        target = generic_fdmmv

    if device.type == 'cuda':
        # Data already sits on one GPU: run there, no splitting or reduction.
        single_gpu_info = [g for g in gpu_info if g.Id == device.index][0]
        args = ArgsFdmmv(X1=X1,
                         X2=X2,
                         v=v,
                         w=w,
                         out=out,
                         kernel=kernel,
                         max_mem=single_gpu_info.usable_ram)
        _call_direct(target, (args, device.index))
        return out

    block_sizes = calc_gpu_block_sizes(gpu_info, N)
    active_gpus = []   # GPUs that were assigned a non-empty block of rows
    partial_outs = []  # per-GPU M x T partial outputs, aligned with active_gpus
    args = []
    for i, g in enumerate(gpu_info):
        bwidth = block_sizes[i + 1] - block_sizes[i]
        if bwidth <= 0:
            continue

        cur_out_gpu = create_same_stride((M, T), X1, X1.dtype,
                                         f'cuda:{g.Id}')  # M x T
        active_gpus.append(g)
        partial_outs.append(cur_out_gpu)

        cur_w = None
        if w is not None:
            cur_w = w.narrow(0, block_sizes[i], bwidth)
        args.append((ArgsFdmmv(X1=X1.narrow(0, block_sizes[i], bwidth),
                               X2=X2,
                               v=v,
                               w=cur_w,
                               out=cur_out_gpu,
                               kernel=kernel,
                               max_mem=g.usable_ram), g.Id))
    _start_wait_processes(target, args)
    if len(partial_outs) > 1:
        # Reduce onto the fastest GPU *among those holding a partial output*.
        # Choosing from the full gpu_info list could select a GPU that was
        # skipped above (empty block), and reduce_add needs the destination to
        # hold one of the inputs. max() keeps the first maximum, like argmax.
        fastest_gpu = max(active_gpus, key=lambda gpu: gpu.speed)
        out.copy_(
            tcd.comm.reduce_add(partial_outs,
                                destination=fastest_gpu.Id))
    else:
        out.copy_(partial_outs[0])
    return out