Example #1
def generic_fmmv(proc_idx, queue, device_id):
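    """GPU worker: computes ``kernel(X1, X2) @ v`` blockwise on a single device."""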
    a: ArgsFmmv = queue.get()

    X1, X2, v, out = a.X1, a.X2, a.v, a.out
    kernel, max_mem = a.kernel, a.max_mem
    dtype = X1.dtype
    ntot, dtot = X1.size()
    M, T = v.size()

    # GPU Memory Usage:
    # ker_gpu  : n*M
    # v_gpu    : M*T
    # X1s_gpu  : n*d
    # X2s_gpu  : M*d
    # mmv_gpu  : n*T
    # ----------
    # total : n*d + n*(M+T) + d*M + M*T
    avail_mem = max_mem / sizeof_dtype(dtype)
    n, d = select_dim_over_d(maxD=dtot,
                             maxN=ntot,
                             coef_nd=1,
                             coef_n=M + T,
                             coef_d=M,
                             rest=M * T,
                             tot=avail_mem)

    ddev = torch.device('cuda:%d' % int(device_id))
    with tcd.device(ddev):
        ker_gpu = torch.empty(n, M, dtype=dtype, device=ddev)
        v_gpu = v.to(device=ddev)  # M x T
        X1s_gpu = create_same_stride((n, d), X1, dtype, ddev)
        X2s_gpu = create_same_stride((M, d), X2, dtype, ddev)
        mmv_gpu = create_same_stride((n, T), out, dtype, ddev)

        for i in range(0, ntot, n):
            ic = min(n, ntot - i)
            ddd = kernel._prepare(X1.narrow(0, i, ic), X2)
            c_g_ker = ker_gpu.narrow(0, 0, ic)
            c_g_ker.fill_(0.0)
            for k in range(0, dtot, d):
                kc = min(d, dtot - k)
                c_g_X1s = copy_to_device_noorder(ic, kc, X1, i, k, X1s_gpu, 0, 0)
                c_g_X2s = copy_to_device_noorder(M, kc, X2, 0, k, X2s_gpu, 0, 0)
                kernel._apply(c_g_X1s, c_g_X2s.T, c_g_ker)
            kernel._finalize(c_g_ker, ddd)
            # Multiply by the vector v
            c_g_mmv = mmv_gpu[:ic, :]
            torch.mm(c_g_ker, v_gpu, out=c_g_mmv)  # n x T
            # Copy back to host
            copy_to_host_noorder(ic, T, c_g_mmv, 0, 0, out, i, 0)
    return out
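
The worker above tiles both the rows of X1 and the feature dimension, but the underlying idea is plain row-blocked evaluation of the kernel. A minimal self-contained sketch of that idea, using a Gaussian kernel and plain PyTorch (`gaussian_mmv_blocked`, `sigma` and `n_block` are illustrative names, not part of the library):

import torch

def gaussian_mmv_blocked(X1, X2, v, sigma=1.0, n_block=128):
    # Accumulate kernel(X1, X2) @ v over row blocks of X1, so the full
    # N x M kernel matrix never exists in memory at once.
    N, T = X1.shape[0], v.shape[1]
    out = torch.empty(N, T, dtype=X1.dtype)
    for i in range(0, N, n_block):
        ic = min(n_block, N - i)
        d2 = torch.cdist(X1[i:i + ic], X2).pow_(2)  # ic x M squared distances
        out[i:i + ic] = torch.exp(-d2 / (2 * sigma ** 2)) @ v  # ic x T
    return out

X1 = torch.randn(500, 10, dtype=torch.float64)
X2 = torch.randn(200, 10, dtype=torch.float64)
v = torch.randn(200, 3, dtype=torch.float64)
full = torch.exp(-torch.cdist(X1, X2).pow(2) / 2) @ v
assert torch.allclose(gaussian_mmv_blocked(X1, X2, v), full)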
Example #2
def fmmv_cpu(X1, X2, v, kernel, out, opt):
    """Blockwise kernel-vector product

    This function computes ``kernel(X1, X2) @ v`` in a blockwise fashion, to avoid having the
    whole N*M kernel matrix in memory at once.
    Note that while the principle is that of matrix-vector product, `v` can have more than
    one column.

    Parameters
    ----------
    X1
        [N, D] array
    X2
        [M, D] array
    v
        [M, T] array
    kernel
        Class representing the desired kernel function
    out : torch.Tensor or None
        [N, T] array for storing the kernel-vector product output.
        If None, will be allocated within the function.
    opt
        Basic options dictionary, used for determining available memory.
    """
    opt = _setup_opt(opt, is_cpu=True)

    ntot, dtot = X1.size(0), X1.size(1)
    M, T = v.size()
    dtype = v.dtype

    # Create output matrix
    if out is None:
        out = torch.empty(ntot, T, dtype=dtype)

    avail_mem = _get_cpu_ram(opt, 0.95) / sizeof_dtype(dtype)
    # The only necessary allocation is the temporary kernel block
    # `temp_out`, of size n*M
    n, d = select_dim_over_d(maxD=dtot,
                             maxN=ntot,
                             coef_nd=0,
                             coef_n=M,
                             coef_d=0,
                             rest=0,
                             tot=avail_mem)

    # Run batched matrix multiplication
    for i in range(0, ntot, n):
        ic = min(n, ntot - i)

        ddd = kernel._prepare(X1.narrow(0, i, ic), X2)
        temp_out = torch.zeros(ic, M, dtype=dtype)
        for k in range(0, dtot, d):
            kc = min(d, dtot - k)
            X1d = X1[i:i + ic, k:k + kc]
            X2d = X2[:, k:k + kc]
            kernel._apply(X1d, X2d.T, temp_out)

        # temp_out = fnc(X1*X2', X1, X2)
        kernel._finalize(temp_out, ddd)

        torch.mm(temp_out, v, out=out[i:i + ic, :])
    return out
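
All of these functions size their blocks through `select_dim_over_d`, whose arguments spell out a memory model: the chosen block sizes `n` and `d` must satisfy `coef_nd*n*d + coef_n*n + coef_d*d + rest <= tot`. The helper's real implementation is not shown in these snippets; the following is only a guess at the constraint it solves, greedy in `d`:

def select_dim_over_d_sketch(maxD, maxN, coef_nd, coef_n, coef_d, rest, tot):
    # Pick block sizes (n, d) satisfying
    #   coef_nd*n*d + coef_n*n + coef_d*d + rest <= tot,
    # keeping d as large as possible and solving for n.
    d = maxD
    while d > 0:
        denom = coef_nd * d + coef_n
        budget = tot - rest - coef_d * d
        if denom > 0 and budget >= denom:  # room for at least n = 1
            return min(maxN, int(budget // denom)), d
        d //= 2
    raise MemoryError("not enough memory for even a 1 x 1 block")

For fmmv_cpu above (coef_nd=0, coef_d=0, rest=0) this reduces to n = min(ntot, avail_mem // M) rows per block, with d = dtot.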
Example #3
def fdmmv_cpu(X1, X2, v, w, kernel, out, opt):
    """Calculate a double kernel-vector product.

    This function computes the following quantity: ``kernel(X1, X2).T @ (kernel(X1, X2) @ v + w)``
    Where one of `v` or `w` can be empty.
    All arrays passed to this function must be 2-dimensional, although
    the second dimension can be unitary.

    The expression is not computed directly. We separate the computation
    into smaller blocks so as to reduce the total memory consumption (the
    large N*M kernel matrix is never wholly stored in RAM.)

    Parameters
    ----------
    X1
        [N, D] array
    X2
        [M, D] array
    v : torch.Tensor or None
        [M, T] array. At least one of `v` or `w` must be specified.
    w : torch.Tensor or None
        [N, T] array. At least one of `v` or `w` must be specified.
    kernel
        Class representing the desired kernel function
    out : torch.Tensor or None
        [M, T] array for storing the kernel-vector product output.
        If None, will be allocated within the function.
    opt
        Basic options dictionary, used for determining available memory.
    """
    opt = _setup_opt(opt, is_cpu=True)

    # Parameter validation
    if v is None and w is None:
        raise ValueError("One of v and w must be specified to run fMMV.")
    T = v.shape[1] if v is not None else w.shape[1]
    ntot, dtot = X1.size()
    M = X2.size(0)
    dtype = X1.dtype

    # Create output matrix
    if out is None:
        out = torch.empty(M, T, dtype=dtype)
    out.fill_(0)

    avail_mem = _get_cpu_ram(opt, 0.95) / sizeof_dtype(dtype)
    # The only necessary temporary matrices are `temp_out` of size n*M and
    # `w_blk` of size n*T
    n, d = select_dim_over_d(maxD=dtot,
                             maxN=ntot,
                             coef_nd=0,
                             coef_n=M + T,
                             coef_d=0,
                             rest=0,
                             tot=avail_mem)

    # Run Batched Matrix Computation
    for i in range(0, ntot, n):
        ic = min(n, ntot - i)

        ddd = kernel._prepare(X1[i:i + ic, :], X2)
        temp_out = torch.zeros(ic, M, dtype=dtype)
        for k in range(0, dtot, d):
            kc = min(d, dtot - k)
            X1d = X1[i:i + ic, k:k + kc]
            X2d = X2[:, k:k + kc]
            kernel._apply(X1d, X2d.T, temp_out)
        kernel._finalize(temp_out, ddd)  # fnc(X1*X2', X1, X2) [n x M]

        w_blk = torch.zeros(ic, T, dtype=dtype)  # n x T
        if w is not None:
            w_blk.copy_(w[i:i + ic, :])
        if v is not None:
            # w_blk + temp_out @ v => (n x T) + (n x M)*(M x T)
            w_blk.addmm_(temp_out, v)

        out.add_(torch.mm(temp_out.T, w_blk))
    return out
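
The per-block accumulation is valid because of a simple identity: splitting `K = kernel(X1, X2)` into row blocks `K_i` (with matching row blocks `w_i` of `w`) gives `K.T @ (K @ v + w) = sum_i K_i.T @ (K_i @ v + w_i)`. A small numerical check of that identity:

import torch

torch.manual_seed(0)
K = torch.randn(100, 30, dtype=torch.float64)
v = torch.randn(30, 4, dtype=torch.float64)
w = torch.randn(100, 4, dtype=torch.float64)

direct = K.T @ (K @ v + w)
blocked = torch.zeros(30, 4, dtype=torch.float64)
for i in range(0, 100, 32):  # row blocks of size 32
    K_i, w_i = K[i:i + 32], w[i:i + 32]
    blocked += K_i.T @ (K_i @ v + w_i)
assert torch.allclose(direct, blocked)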
Example #4
def distk_fdmmv(proc_idx, queue, device_id):
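    """GPU worker: blockwise ``kernel(X1, X2).T @ (kernel(X1, X2) @ v + w)``
    for kernels built on squared L2 distances."""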
    a: ArgsFdmmv = queue.get()
    X1, X2, v, w, out = a.X1, a.X2, a.v, a.w, a.out
    kernel: L2DistanceKernel = a.kernel
    max_mem = a.max_mem
    N, D = X1.size()
    M = X2.size(0)
    T = v.size(1) if v is not None else w.size(1)
    dtype = X1.dtype

    # Memory usage:
    # v    : M x T
    # K    : n x M
    # X1ss : n x d
    # X2s  : M x d
    # Kv   : n x T
    # out  : M x T
    # sq1  : n x 1
    # sq2  : M x 1
    # ------------
    # total : n*d + M*d + n*(M + T + 1) + 2*M*T + M
    avail_mem = max_mem / sizeof_dtype(dtype)
    # FIXME: There seems to be a bug where, if we leave avail_mem as it is
    #        for 32-bit data-types, some copy fails. In that case we need
    #        to free up some more memory, and then everything runs fine.
    if sizeof_dtype(dtype) == 4:
        avail_mem /= 2
    rest_coef = 2 * M * T if v is not None else M * T
    n, d = select_dim_over_d(maxD=D,
                             maxN=N,
                             coef_nd=1,
                             coef_n=M + T + 1,
                             coef_d=M,
                             rest=rest_coef + M,
                             tot=avail_mem)

    ddev = torch.device('cuda:%d' % int(device_id))
    s1 = tcd.Stream()
    s2 = tcd.Stream()

    with tcd.device(ddev), tcd.stream(s1):
        if v is not None:
            v_gpu = create_same_stride((M, T), v, dtype, ddev)
            copy_to_device_noorder(M, T, v, 0, 0, v_gpu, 0, 0)
        K_gpu = create_same_stride((n, M), X1, dtype, ddev)
        X1ss_gpu = create_same_stride((n, d), X1, dtype, ddev)
        X2s_gpu = create_same_stride((M, d), X2, dtype, ddev)
        Kv_gpu = create_same_stride((n, T), X1, dtype, ddev)
        if out.is_cuda:
            out_gpu = out
        else:
            out_gpu = create_same_stride((M, T), out, dtype, ddev)
        out_gpu.fill_(0.0)
        sq1_gpu = create_same_stride((n, ), X1, dtype, ddev)
        sq2_gpu = create_same_stride((M, ), X1, dtype, ddev)

        for i in range(0, N, n):
            nb = min(N - i, n)

            cur_K_gpu = K_gpu.narrow(0, 0, nb)  # nb x M
            cur_K_gpu.fill_(0.0)

            for j in range(0, D, d):
                db = min(D - j, d)
                # Parallelize the two matrix transfers (probably pointless)
                with torch.cuda.stream(s2):
                    cur_X2s_gpu = copy_to_device_noorder(
                        M, db, X2, 0, j, X2s_gpu, 0, 0, s=s2)
                    torch.norm(cur_X2s_gpu, p=2, dim=1, keepdim=True,
                               out=sq2_gpu).pow_(2)
                cur_X1ss_gpu = copy_to_device_noorder(
                    nb, db, X1, i, j, X1ss_gpu, 0, 0, s=s1)
                torch.norm(cur_X1ss_gpu, p=2, dim=1, keepdim=True,
                           out=sq1_gpu).pow_(2)

                s2.synchronize()
                s1.synchronize()
                cur_K_gpu.addmm_(mat1=cur_X1ss_gpu,
                                 mat2=cur_X2s_gpu.T,
                                 alpha=-2.0)
                cur_K_gpu.add_(sq1_gpu)
                cur_K_gpu.add_(sq2_gpu.T)
                cur_K_gpu.clamp_min_(0)

            cur_K_gpu = kernel._transform(cur_K_gpu)

            if w is not None:
                # Copy the current split of w to GPU into cur_Kv_gpu.
                cur_Kv_gpu = copy_to_device_noorder(
                    nb, T, w, i, 0, Kv_gpu, 0, 0, s=s1)  # n x T
                if v is not None:
                    cur_Kv_gpu.addmm_(cur_K_gpu, v_gpu)
            else:
                # v cannot be None if w is None
                cur_Kv_gpu = Kv_gpu.narrow(0, 0, nb)  # n x T
                torch.mm(cur_K_gpu, v_gpu, out=cur_Kv_gpu)  # n x T

            # Multiply transposed kernel with the Kv result.
            out_gpu.addmm_(cur_K_gpu.T, cur_Kv_gpu)  # M x T
            s1.synchronize()
        s1.synchronize()

        if not out.is_cuda:
            copy_to_host_noorder(M, T, out_gpu, 0, 0, out, 0, 0)
    return out
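
The inner loop assembles squared Euclidean distances from the expansion `||x - y||^2 = ||x||^2 - 2*x.y + ||y||^2`, accumulated one feature block at a time; `clamp_min_(0)` guards against small negative values from round-off. A standalone check of the expansion against `torch.cdist`:

import torch

torch.manual_seed(0)
X1 = torch.randn(50, 8, dtype=torch.float64)
X2 = torch.randn(20, 8, dtype=torch.float64)

sq1 = X1.pow(2).sum(dim=1, keepdim=True)         # n x 1 squared norms
sq2 = X2.pow(2).sum(dim=1, keepdim=True)         # M x 1 squared norms
D2 = (sq1 - 2 * X1 @ X2.T + sq2.T).clamp_min(0)  # n x M squared distances
assert torch.allclose(D2, torch.cdist(X1, X2).pow(2))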
Example #5
def generic_fdmmv(proc_idx, queue, device_id):
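    """GPU worker: computes ``kernel(X1, X2).T @ (kernel(X1, X2) @ v + w)``
    blockwise for a generic kernel."""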
    a: ArgsFdmmv = queue.get()
    X1, X2, v, w, out = a.X1, a.X2, a.v, a.w, a.out
    kernel, max_mem = a.kernel, a.max_mem
    dtype = X1.dtype
    N, D = X1.size()
    M = X2.size(0)
    if v is None:
        T = w.size(1)
    else:
        T = v.size(1)

    # Memory usage:
    # v    : M x T
    # K    : n x M
    # X1d  : n x d
    # X2d  : M x d
    # Kv   : n x T
    # out2 : M x T
    # sq1  : n x 1
    # sq2  : M x 1
    # ------------
    # total : n*d + M*d + n*(M + T + 1) + 2*M*T + M
    avail_mem = max_mem / sizeof_dtype(dtype)
    # FIXME: There seems to be a bug where, if we leave avail_mem as it is
    #        for 32-bit data-types, some copy fails. In that case we need
    #        to free up some more memory, and then everything runs fine.
    if sizeof_dtype(dtype) == 4:
        avail_mem /= 2
    rest_coef = 2 * M * T if v is not None else M * T
    n, d = select_dim_over_d(maxD=D,
                             maxN=N,
                             coef_nd=1,
                             coef_n=M + T + 1,
                             coef_d=M,
                             rest=rest_coef + M,
                             tot=avail_mem)

    ddev = torch.device('cuda:%d' % int(device_id))
    with tcd.device(ddev):
        # Initialize GPU data
        ker_gpu = create_same_stride((n, M), out, dtype=dtype, device=ddev)
        X1s_gpu = create_same_stride((n, d), X1, dtype, ddev)
        X2s_gpu = create_same_stride((M, d), X2, dtype, ddev)
        w_gpu = create_same_stride((n, T), ker_gpu, dtype, ddev)
        if out.is_cuda:
            out_gpu = out
        else:
            out_gpu = create_same_stride((M, T), out, dtype, ddev)
        out_gpu.fill_(0.0)
        if v is not None:
            v_gpu = v.to(device=ddev)  # M x T

        for i in range(0, N, n):
            ic = min(n, N - i)
            ddd = kernel._prepare(X1.narrow(0, i, ic), X2)

            c_g_ker = ker_gpu.narrow(0, 0, ic)
            c_g_ker.fill_(0.0)
            for k in range(0, D, d):
                kc = min(d, D - k)
                c_g_X1s = copy_to_device_noorder(ic, kc, X1, i, k, X1s_gpu, 0, 0)
                c_g_X2s = copy_to_device_noorder(M, kc, X2, 0, k, X2s_gpu, 0, 0)
                kernel._apply(c_g_X1s, c_g_X2s.T, c_g_ker)
            kernel._finalize(c_g_ker, ddd)

            if w is not None:
                c_g_w = copy_to_device_noorder(ic, T, w, i, 0, w_gpu, 0, 0)
            else:
                c_g_w = w_gpu.narrow(0, 0, ic)
                c_g_w.fill_(0.0)
            if v is not None:
                c_g_w.addmm_(c_g_ker, v_gpu)
            out_gpu.addmm_(c_g_ker.T, c_g_w)

        if not out.is_cuda:
            copy_to_host_noorder(M, T, out_gpu, 0, 0, out, 0, 0)
    return out
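
Both GPU workers take ``(proc_idx, queue, device_id)`` and pull their arguments off a multiprocessing queue; the dispatch side is not shown in these snippets. A hypothetical sketch of that driver pattern (`Args` and `worker` here are stand-ins, not the library's `ArgsFdmmv` or its real scheduler):

import torch
import torch.multiprocessing as mp

class Args:
    # Stand-in for the library's argument bundles (e.g. ArgsFdmmv).
    def __init__(self, **kw):
        self.__dict__.update(kw)

def worker(proc_idx, queue, device_id):
    a = queue.get()
    # ... run the blockwise computation on torch.device('cuda:%d' % device_id) ...
    print(f"worker {proc_idx}: got X1 {tuple(a.X1.shape)} for GPU {device_id}")

if __name__ == "__main__":
    mp.set_start_method("spawn", force=True)
    queue, procs = mp.Queue(), []
    for proc_idx in range(torch.cuda.device_count()):
        # One row-split of the data per device.
        queue.put(Args(X1=torch.randn(100, 5), X2=torch.randn(50, 5)))
        p = mp.Process(target=worker, args=(proc_idx, queue, proc_idx))
        p.start()
        procs.append(p)
    for p in procs:
        p.join()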
Example #6
def fmmv_cpu(X1, X2, v, kernel, out, opt):
    """Blockwise kernel-vector product

    This function computes
    ```
    kernel(X1, X2) @ v
    ```
    in a blockwise fashion, to avoid having the whole N*M kernel
    matrix in memory at once.
    Note that while the principle is that of matrix-vector product,
    `v` can have more than one column.

    Parameters
    ----------
     - X1 : [N, D] array
     - X2 : [M, D] array
     - v  : [M, T] array
     - kernel : Kernel
        Class representing the desired kernel function
     - out : [N, T] array (optional)
        Array for storing the kernel-vector product output.
        If None, will be allocated within the function.
     - opt : Union(Dict, CompOpt)
        Options dictionary. Supported options are
         - 'max_cpu_mem': sets the maximum amount of RAM the program should use.
         - 'final_type': the data-type of the output array. If `out` is not None and
            its data-type clashes with the setting of 'final_type', the `out` matrix
            will not be modified.
    """
    opt = _setup_opt(opt, is_cpu=True)

    ntot, dtot = X1.size(0), X1.size(1)
    M, T = v.size()
    dtype = v.dtype

    # Create output matrix
    if out is None:
        out = torch.empty(ntot, T, dtype=dtype)

    avail_mem = _get_cpu_ram(opt, 0.95) / sizeof_dtype(dtype)
    # The only necessary allocation is the temporary kernel block
    # `temp_out`, of size n*M
    n, d = select_dim_over_d(maxD=dtot,
                             maxN=ntot,
                             coef_nd=0,
                             coef_n=M,
                             coef_d=0,
                             rest=0,
                             tot=avail_mem)

    # Run batched matrix multiplication
    for i in range(0, ntot, n):
        ic = min(n, ntot - i)

        ddd = kernel._prepare(X1.narrow(0, i, ic), X2)
        temp_out = torch.zeros(ic, M, dtype=dtype)
        for k in range(0, dtot, d):
            kc = min(d, dtot - k)
            X1d = X1[i:i + ic, k:k + kc]
            X2d = X2[:, k:k + kc]
            kernel._apply(X1d, X2d.T, temp_out)

        # temp_out = fnc(X1*X2', X1, X2)
        kernel._finalize(temp_out, ddd)

        torch.mm(temp_out, v, out=out[i:i + ic, :])
    return out