Example #1
def generic_fmmv(proc_idx, queue, device_id):
    a: ArgsFmmv = queue.get()

    X1, X2, v, out = a.X1, a.X2, a.v, a.out
    kernel, max_mem = a.kernel, a.max_mem
    dtype = X1.dtype
    ntot, dtot = X1.size()
    M, T = v.size()

    # GPU Memory Usage:
    # ker_gpu  : n*M
    # v_gpu    : M*T
    # X1s_gpu  : n*d
    # X2s_gpu  : M*d
    # mmv_gpu  : n*T
    # ----------
    # total : n*d + n*(M+T) + d*M + M*T
    avail_mem = max_mem / sizeof_dtype(dtype)
    n, d = select_dim_over_d(maxD=dtot,
                             maxN=ntot,
                             coef_nd=1,
                             coef_n=M + T,
                             coef_d=M,
                             rest=M * T,
                             tot=avail_mem)

    ddev = torch.device('cuda:%d' % int(device_id))
    with tcd.device(ddev):
        ker_gpu = torch.empty(n, M, dtype=dtype, device=ddev)
        v_gpu = v.to(device=ddev)  # M x T
        X1s_gpu = create_same_stride((n, d), X1, dtype, ddev)
        X2s_gpu = create_same_stride((M, d), X2, dtype, ddev)
        mmv_gpu = create_same_stride((n, T), out, dtype, ddev)

        for i in range(0, ntot, n):
            ic = min(n, ntot - i)
            ddd = kernel._prepare(X1.narrow(0, i, ic), X2)
            c_g_ker = ker_gpu.narrow(0, 0, ic)
            c_g_ker.fill_(0.0)
            for k in range(0, dtot, d):
                kc = min(d, dtot - k)
                c_g_X1s = copy_to_device_noorder(ic, kc, X1, i, k, X1s_gpu, 0, 0)
                c_g_X2s = copy_to_device_noorder(M, kc, X2, 0, k, X2s_gpu, 0, 0)
                kernel._apply(c_g_X1s, c_g_X2s.T, c_g_ker)
            kernel._finalize(c_g_ker, ddd)
            # Multiply by the vector v
            c_g_mmv = mmv_gpu[:ic, :]
            torch.mm(c_g_ker, v_gpu, out=c_g_mmv)  # n x T
            # Copy back to host
            copy_to_host_noorder(ic, T, c_g_mmv, 0, 0, out, i, 0)
    return out
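
select_dim_over_d picks n and d so that the buffers listed in the comment fit the available budget, i.e. so that coef_nd*n*d + coef_n*n + coef_d*d + rest <= tot. The helper below is only a rough sketch of one plausible selection strategy, not Falkon's actual implementation: keep d as large as possible and give whatever budget remains to n.

def select_dim_over_d_sketch(maxD, maxN, coef_nd, coef_n, coef_d, rest, tot):
    # Solve coef_nd*n*d + coef_n*n + coef_d*d + rest <= tot for n and d.
    d = maxD
    n = int((tot - rest - coef_d * d) / (coef_nd * d + coef_n))
    if n < 1:
        # Not even one row fits at full d: shrink d and go one row at a time.
        d = int((tot - rest - coef_n) / (coef_nd + coef_d))
        n = 1
    if d < 1:
        raise MemoryError("Not enough memory to process even a single element")
    return min(n, maxN), min(d, maxD)
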
Example #2
def _sparse_fmm(proc_idx, queue, device_id):
    a: ArgsFmm = queue.get()
    X1: SparseTensor = a.X1
    X2: SparseTensor = a.X2
    out = a.out
    kernel, gpu_dtype = a.kernel, a.gpu_dtype
    max_mem = a.max_mem

    ntot, dtot = X1.shape
    mtot = X2.size(0)

    avail_mem = max_mem / sizeof_dtype(gpu_dtype)
    # Memory usage:
    # X1_chunk : ntot + 2 * dtot * ntot * density
    # X2_chunk : dtot + 2 * dtot * mtot * density (because it is transposed)
    # sparse_out : ntot + 2 * ntot * mtot * density (assume density=1 here)
    # ker_gpu  : mtot * ntot
    n, m = select_dim_over_nm_v2(max_n=ntot,
                                 max_m=mtot,
                                 coef_nm=3,
                                 coef_n=2 + 2 * dtot * X1.density,
                                 coef_m=2 * dtot * X2.density,
                                 rest=dtot,
                                 max_mem=avail_mem)

    tc_device = torch.device('cuda:%d' % (int(device_id)))
    with torch.cuda.device(tc_device):
        # Initialize GPU buffers
        g_out = create_same_stride((n, m), out, gpu_dtype, tc_device)
        cpu_buf = None
        if X1.dtype != gpu_dtype:
            cpu_buf = create_same_stride((n, m),
                                         out,
                                         gpu_dtype,
                                         'cpu',
                                         pin_memory=True)

        for j in range(0, mtot, m):
            jc = min(m, mtot - j)

            X2_chunk = X2.narrow_rows(j, jc).to(dtype=gpu_dtype)
            X2_chunk_d = SparseTensor.from_scipy(
                X2_chunk.transpose_csc().to_scipy().tocsr(copy=False)) \
                .index_to_int() \
                .to(device=tc_device)
            for i in range(0, ntot, n):
                ic = min(n, ntot - i)

                X1_chunk = X1.narrow_rows(i, ic).to(dtype=gpu_dtype)
                X1_chunk_d = X1_chunk.index_to_int().to(device=tc_device)
                cur_g_out = g_out.narrow(0, 0, ic).narrow(1, 0, jc)
                cur_g_out.fill_(0.0)

                ddd = kernel._prepare_sparse(X1_chunk, X2_chunk)
                cur_g_out = kernel._apply_sparse(X1_chunk_d, X2_chunk_d,
                                                 cur_g_out)
                cur_g_out = kernel._finalize(cur_g_out, ddd)
                copy_to_host_noorder(ic, jc, cur_g_out, 0, 0, out, i, j,
                                     cpu_buf)
                del ddd, X1_chunk_d, X1_chunk
            del X2_chunk, X2_chunk_d
        del g_out
    return out
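
The transpose_csc().to_scipy().tocsr() chain re-packs the transposed X2 chunk in CSR layout before it is moved to the GPU. A small scipy-only illustration of the same idea, using scipy.sparse directly instead of Falkon's SparseTensor wrapper:

import numpy as np
import scipy.sparse as sp

# Transposing a CSR matrix yields its CSC form sharing the same index arrays;
# .tocsr() then re-packs the transposed matrix row-major, which is the layout
# the GPU-side sparse multiplication expects for its second operand.
X2_chunk = sp.random(4, 6, density=0.5, format="csr", dtype=np.float32)
X2_chunk_T = X2_chunk.T.tocsr(copy=False)  # shape (6, 4), CSR layout
assert (X2_chunk_T.toarray() == X2_chunk.toarray().T).all()
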
Example #3
def sparse_fmmv(proc_idx, queue, device_id):
    a: ArgsFmmv = queue.get()

    X1: SparseTensor = a.X1
    X2: SparseTensor = a.X2
    v, out = a.v, a.out
    kernel, max_mem = a.kernel, a.max_mem
    dtype = X1.dtype
    ntot, dtot = X1.shape
    mtot, T = v.size()

    avail_mem = max_mem / sizeof_dtype(dtype)
    # Memory needs:
    # X1_chunk : N + 2*D*N*density
    # X2_chunk : D + 2*D*M*density (because it is transposed)
    # sparse_out : N + 2*N*M*(density) (assume density = 1)
    # ker_gpu  : M*N
    # mmv_gpu  : N*T
    # v_gpu    : M*T
    # Other: GPU buffer
    n, m = select_dim_over_m(
        maxM=mtot,
        maxN=ntot,
        tot=avail_mem,
        coef_nm=3,
        coef_n=2 + 2 * dtot * X1.density + T,
        coef_m=2 * dtot * X2.density + T,
        rest=dtot,
    )

    ddev = torch.device('cuda:%d' % int(device_id))
    with tcd.device(ddev):
        v_gpu = v.to(device=ddev)  # M x T
        mmv_gpu = create_same_stride((n, T), out, dtype, ddev)
        # ker_gpu must be Fortran-ordered because of the cuSPARSE csr2dense function
        ker_gpu = create_fortran((n, m), dtype=dtype, device=ddev)

        for i in range(0, ntot, n):
            ic = min(n, ntot - i)

            cur_mmv_gpu = mmv_gpu[:ic]  # n x T
            cur_mmv_gpu.fill_(0.0)

            X1_chunk = X1.narrow_rows(i, ic)
            X1_chunk_d = X1_chunk.index_to_int().to(device=ddev)
            for j in range(0, mtot, m):
                jc = min(m, mtot - j)

                X2_chunk = X2.narrow_rows(j, jc)
                # Prepare sparse on CPU
                ddd = kernel._prepare_sparse(X1_chunk, X2_chunk)

                # Transpose X2-chunk and convert it to CSR. This uses lots of RAM
                X2_chunk_d = SparseTensor.from_scipy(
                    X2_chunk.transpose_csc().to_scipy().tocsr(copy=False)) \
                    .index_to_int() \
                    .to(device=ddev)

                cur_ker_gpu = ker_gpu[:ic, :jc]
                cur_ker_gpu.fill_(0.0)
                # Run the matrix multiplication (kernel apply)
                cur_ker_gpu = kernel._apply_sparse(X1_chunk_d, X2_chunk_d,
                                                   cur_ker_gpu)
                cur_ker_gpu = kernel._finalize(cur_ker_gpu, ddd)

                # Multiply by the vector v
                cur_mmv_gpu.addmm_(cur_ker_gpu, v_gpu.narrow(0, j, jc))
                del ddd, X2_chunk, X2_chunk_d

            # send result to CPU
            copy_to_host_noorder(ic, T, cur_mmv_gpu, 0, 0, out, i, 0)
            del X1_chunk, X1_chunk_d
    return out
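
Inside the j-loop, each (ic x jc) kernel block is multiplied with the matching jc rows of v and accumulated into cur_mmv_gpu with addmm_; summed over all j-blocks this reproduces the full kernel-vector product for the current rows. A dense, CPU-only check of that accumulation pattern:

import torch

K = torch.randn(8, 10, dtype=torch.float64)
v = torch.randn(10, 3, dtype=torch.float64)
acc = torch.zeros(8, 3, dtype=torch.float64)
m = 4  # column-block size
for j in range(0, 10, m):
    jc = min(m, 10 - j)
    # acc += K[:, j:j+jc] @ v[j:j+jc]
    acc.addmm_(K[:, j:j + jc], v[j:j + jc])
assert torch.allclose(acc, K @ v)
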
Example #4
def distk_fdmmv(proc_idx, queue, device_id):
    a: ArgsFdmmv = queue.get()
    X1, X2, v, w, out = a.X1, a.X2, a.v, a.w, a.out
    kernel: L2DistanceKernel = a.kernel
    max_mem = a.max_mem
    N, D = X1.size()
    M = X2.size(0)
    T = v.size(1) if v is not None else w.size(1)
    dtype = X1.dtype

    # Memory usage:
    # v    : M x T
    # K    : n x M
    # X1ss : n x d
    # X2s  : M x d
    # Kv   : n x T
    # out  : M x T
    # sq1  : n x 1
    # sq2  : M x 1
    # ------------
    # total : n*d + M*d + n*(M + T + 1) + 2*M*T + M
    avail_mem = max_mem / sizeof_dtype(dtype)
    # FIXME: There seems to be a bug where, if we leave avail_mem as it is
    #        for 32-bit data types, some copy fails. In that case we need to
    #        free up some more memory, and then everything runs fine.
    rest_coef = 2 * M * T if v is not None else M * T
    n, d = select_dim_over_d(maxD=D,
                             maxN=N,
                             coef_nd=1,
                             coef_n=M + T + 1,
                             coef_d=M,
                             rest=rest_coef + M,
                             tot=avail_mem)

    ddev = torch.device('cuda:%d' % int(device_id))
    s1 = tcd.Stream()
    s2 = tcd.Stream()

    with tcd.device(ddev), tcd.stream(s1):
        if v is not None:
            v_gpu = create_same_stride((M, T), v, dtype, ddev)
            copy_to_device_noorder(M, T, v, 0, 0, v_gpu, 0, 0)
        K_gpu = create_same_stride((n, M), X1, dtype, ddev)
        X1ss_gpu = create_same_stride((n, d), X1, dtype, ddev)
        X2s_gpu = create_same_stride((M, d), X2, dtype, ddev)
        Kv_gpu = create_same_stride((n, T), X1, dtype, ddev)
        if out.is_cuda:
            out_gpu = out
        else:
            out_gpu = create_same_stride((M, T), out, dtype, ddev)
        out_gpu.fill_(0.0)
        sq1_gpu = create_same_stride((n, ), X1, dtype, ddev)
        sq2_gpu = create_same_stride((M, ), X1, dtype, ddev)

        #if (d == D):
        #    with torch.cuda.stream(s2):
        #        cur_X2s_gpu = copy_to_device_noorder(M, d, X2, 0, 0, X2s_gpu, 0, 0, s=s2)
        #        torch.norm(cur_X2s_gpu, p=2, dim=1, keepdim=True, out=sq2_gpu).pow_(2)

        for i in range(0, N, n):
            nb = min(N - i, n)

            cur_K_gpu = K_gpu.narrow(0, 0, nb)  # nb x M
            cur_K_gpu.fill_(0.0)

            for j in range(0, D, d):
                db = min(D - j, d)
                # Parallelize two matrix transfers (probably pointless)
                #if d < D:
                with torch.cuda.stream(s2):
                    cur_X2s_gpu = copy_to_device_noorder(M,
                                                         db,
                                                         X2,
                                                         0,
                                                         j,
                                                         X2s_gpu,
                                                         0,
                                                         0,
                                                         s=s2)
                    torch.norm(cur_X2s_gpu,
                               p=2,
                               dim=1,
                               keepdim=True,
                               out=sq2_gpu).pow_(2)
                cur_X1ss_gpu = copy_to_device_noorder(nb,
                                                      db,
                                                      X1,
                                                      i,
                                                      j,
                                                      X1ss_gpu,
                                                      0,
                                                      0,
                                                      s=s1)
                torch.norm(cur_X1ss_gpu, p=2, dim=1, keepdim=True,
                           out=sq1_gpu).pow_(2)

                s2.synchronize()
                s1.synchronize()
                cur_K_gpu.addmm_(mat1=cur_X1ss_gpu,
                                 mat2=cur_X2s_gpu.T,
                                 alpha=-2.0)
                cur_K_gpu.add_(sq1_gpu)
                cur_K_gpu.add_(sq2_gpu.T)
                cur_K_gpu.clamp_min_(0)

            cur_K_gpu = kernel._transform(cur_K_gpu)

            if w is not None:
                # Copy the current slice of w to the GPU, into cur_Kv_gpu
                cur_Kv_gpu = copy_to_device_noorder(nb,
                                                    T,
                                                    w,
                                                    i,
                                                    0,
                                                    Kv_gpu,
                                                    0,
                                                    0,
                                                    s=s1)  # n x T
                if v is not None:
                    cur_Kv_gpu.addmm_(cur_K_gpu, v_gpu)
            else:
                # v cannot be None if w is None
                cur_Kv_gpu = Kv_gpu.narrow(0, 0, nb)  # n x T
                torch.mm(cur_K_gpu, v_gpu, out=cur_Kv_gpu)  # n x T

            # Multiply transposed kernel with the Kv result.
            out_gpu.addmm_(cur_K_gpu.T, cur_Kv_gpu)  # M x T
            s1.synchronize()
        s1.synchronize()

        if not out.is_cuda:
            copy_to_host_noorder(M, T, out_gpu, 0, 0, out, 0, 0)
    return out
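
The two streams s1 and s2 let the transfer of the X2 block overlap with work on the X1 block; both streams are synchronized before their results are combined. A minimal standalone sketch of that pattern (it assumes a CUDA device is available, and the tensor names are illustrative only):

import torch

if torch.cuda.is_available():
    dev = torch.device('cuda:0')
    s1 = torch.cuda.Stream(device=dev)
    s2 = torch.cuda.Stream(device=dev)
    x_cpu = torch.randn(1024, 128, pin_memory=True)
    with torch.cuda.device(dev):
        y_gpu = torch.randn(1024, 128, device=dev)
        with torch.cuda.stream(s2):
            x_gpu = x_cpu.to(dev, non_blocking=True)      # H2D copy runs on s2
        with torch.cuda.stream(s1):
            y_sq = y_gpu.pow(2).sum(dim=1, keepdim=True)  # compute runs on s1
        s2.synchronize()  # x_gpu is now ready
        s1.synchronize()  # y_sq is now ready
        res = x_gpu * y_sq  # safe to combine on the current stream
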
Example #5
def distk_fmmv(proc_idx, queue, device_id):
    a: ArgsFmmv = queue.get()
    X1, X2, v, out = a.X1, a.X2, a.v, a.out
    kernel: L2DistanceKernel = a.kernel
    max_mem = a.max_mem

    N, D = X1.shape
    M = X2.shape[0]
    T = v.shape[1]
    dtype = X1.dtype

    # GPU memory usage:
    # X1s : n x D
    # X2s : m x D
    # vs  : m x T
    # nm  : n x m
    # out : n x T
    # -----------
    # total: n*m + n * (D + T) + m * (D + T) = R
    avail_mem = max_mem / sizeof_dtype(dtype)
    #if sizeof_dtype(dtype) == 4:
    #    avail_mem /= 2
    n, m = select_dim_over_m(maxM=M,
                             maxN=N,
                             coef_nm=1.0,
                             coef_n=D + T,
                             coef_m=D + T,
                             tot=avail_mem)

    ddev = torch.device('cuda:%d' % int(device_id))
    with tcd.device(ddev):
        nm_gpu = create_same_stride((n, m), X1, dtype, ddev)
        out_gpu = create_same_stride((n, T), out, dtype, ddev)
        X1s_gpu = create_same_stride((n, D), X1, dtype, ddev)
        X2s_gpu = create_same_stride((m, D), X2, dtype, ddev)
        vs_gpu = create_same_stride((m, T), v, dtype, ddev)

        for i in range(0, N, n):
            nb = min(n, N - i)
            cur_X1s_gpu = copy_to_device_noorder(nb, D, X1, i, 0, X1s_gpu, 0,
                                                 0)
            sq1 = torch.norm(cur_X1s_gpu, p=2, dim=1, keepdim=True).pow_(2)
            cur_out_gpu = out_gpu.narrow(0, 0, nb)  # n x T
            cur_out_gpu.fill_(0.0)

            for j in range(0, M, m):
                mb = min(m, M - j)
                cur_X2s_gpu = copy_to_device_noorder(mb, D, X2, j, 0, X2s_gpu,
                                                     0, 0)
                cur_vs_gpu = copy_to_device_noorder(mb, T, v, j, 0, vs_gpu, 0,
                                                    0)  # m x T
                cur_nm_gpu = nm_gpu[:nb, :mb]  # n x m

                sq2 = torch.norm(cur_X2s_gpu, p=2, dim=1, keepdim=True).pow_(2)
                torch.mm(cur_X1s_gpu, cur_X2s_gpu.T, out=cur_nm_gpu)

                cur_nm_gpu.mul_(-2.0)
                cur_nm_gpu.add_(sq1)
                cur_nm_gpu.add_(sq2.T)
                cur_nm_gpu.clamp_min_(0)
                kernel._transform(cur_nm_gpu)

                # Multiply by the vector v
                # FIXME: This is the cause of mapping errors in the case of float32 calculations.
                cur_out_gpu.addmm_(cur_nm_gpu, cur_vs_gpu)  # n x T
            # send result to CPU
            copy_to_host_noorder(nb, T, out_gpu, 0, 0, out, i, 0)

    return out
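
The mul_(-2.0) / add_(sq1) / add_(sq2.T) / clamp_min_(0) sequence implements the usual expansion of the squared Euclidean distance, ||x - y||^2 = ||x||^2 + ||y||^2 - 2<x, y>, clamped at zero to absorb round-off. A dense check of that identity:

import torch

X1 = torch.randn(5, 7, dtype=torch.float64)
X2 = torch.randn(6, 7, dtype=torch.float64)
sq1 = X1.pow(2).sum(dim=1, keepdim=True)  # n x 1
sq2 = X2.pow(2).sum(dim=1, keepdim=True)  # m x 1
D2 = (sq1 + sq2.T - 2.0 * X1 @ X2.T).clamp_min_(0)
assert torch.allclose(D2, torch.cdist(X1, X2).pow(2))
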
Example #6
def _generic_fmm(proc_idx, queue, device_id):
    a: ArgsFmm = queue.get()
    X1: torch.Tensor = a.X1
    X2: torch.Tensor = a.X2
    out = a.out
    kernel, gpu_dtype = a.kernel, a.gpu_dtype
    max_mem = a.max_mem

    ntot, dtot = X1.shape
    mtot = X2.shape[0]

    # This function is slightly faster if we limit the size of the processed
    # blocks, especially on a cold run, since pinned-memory allocation is
    # extremely slow. We avoid doing so when memory is already constrained.
    if max_mem > 4 * 2**30:
        max_mem /= 4
    avail_mem = max_mem / sizeof_dtype(gpu_dtype)
    # Memory usage:
    # - gOut    : n x m
    # - g_ssX1  : n x d
    # - g_sX2   : m x d
    # total : n*d + m*d + n*m
    n, d, m = select_dim_fMM(avail_mem, ntot, dtot, mtot)

    tc_device = torch.device('cuda:%d' % (int(device_id)))
    with torch.cuda.device(tc_device):
        # Initialize GPU buffers
        g_out = create_same_stride((n, m), out, gpu_dtype, tc_device)
        g_X1d = create_same_stride((n, d), X1, gpu_dtype, tc_device)
        g_X2d = create_same_stride((m, d), X2, gpu_dtype, tc_device)
        cpu_buf = None
        if X1.dtype != gpu_dtype:
            cpu_buf = create_same_stride((n, m),
                                         out,
                                         gpu_dtype,
                                         'cpu',
                                         pin_memory=True)

        for j in range(0, mtot, m):
            jc = min(m, mtot - j)
            X2_chunk = cast_tensor(X2.narrow(0, j, jc),
                                   dtype=gpu_dtype,
                                   warn=False).pin_memory()

            for i in range(0, ntot, n):
                ic = min(n, ntot - i)

                if _gpu_tns_same_memory(X1, X2) and j < i:
                    out[i:i + ic, j:j + jc].copy_(out[j:j + jc, i:i + ic].T)
                else:
                    X1_chunk = cast_tensor(X1.narrow(0, i, ic),
                                           dtype=gpu_dtype,
                                           warn=False).pin_memory()

                    ddd = kernel._prepare(X1_chunk, X2_chunk)

                    cur_g_out = g_out.narrow(0, 0, ic).narrow(1, 0, jc)
                    cur_g_out.fill_(0.0)

                    for k in range(0, dtot, d):
                        kc = min(d, dtot - k)
                        # Move to GPU
                        cur_g_X1d = g_X1d.narrow(0, 0, ic).narrow(1, 0, kc)
                        cur_g_X1d.copy_(X1_chunk.narrow(1, k, kc))
                        cur_g_X2d = g_X2d.narrow(0, 0, jc).narrow(1, 0, kc)
                        cur_g_X2d.copy_(X2_chunk.narrow(1, k, kc))
                        # Apply
                        a.kernel._apply(cur_g_X1d, cur_g_X2d.T, cur_g_out)

                    a.kernel._finalize(cur_g_out, ddd)
                    copy_to_host_noorder(ic, jc, cur_g_out, 0, 0, out, i, j,
                                         cpu_buf)
                    del ddd
        del g_out, g_X1d, g_X2d
    return out
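
When X1 and X2 share the same memory the kernel matrix is symmetric, so one triangle can be filled by transposing already-computed blocks of the other instead of recomputing them. A dense toy version of that idea, iterating over row blocks in the outer loop so that the source block is always ready, and using square blocks so the transposed copy lines up:

import torch

X = torch.randn(6, 4, dtype=torch.float64)
out = torch.empty(6, 6, dtype=torch.float64)
b = 3  # block size
for i in range(0, 6, b):        # row blocks
    for j in range(0, 6, b):    # column blocks
        if j < i:
            # Lower-triangle block: reuse the transposed upper-triangle block.
            out[i:i + b, j:j + b].copy_(out[j:j + b, i:i + b].T)
        else:
            out[i:i + b, j:j + b] = X[i:i + b] @ X[j:j + b].T
assert torch.allclose(out, X @ X.T)
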
Example #7
def distk_fdmmv(proc_idx, queue, device_id):
    a: ArgsFdmmv = queue.get()
    X1, X2, v, w, out = a.X1, a.X2, a.v, a.w, a.out
    kernel: L2DistanceKernel = a.kernel
    max_mem = a.max_mem
    N, D = X1.size()
    M = X2.size(0)
    T = v.shape[1] if v is not None else w.shape[1]
    dtype = X1.dtype
    cuda_inputs = X1.is_cuda

    # Memory usage:
    # v    : M x T
    # K    : n x M
    # X1ss : n x d
    # X2s  : M x d
    # Kv   : n x T
    # out  : M x T
    # sq1  : n x 1
    # sq2  : M x 1
    # ------------
    # total : n*d + M*d + n*(M + T + 1) + 2*M*T + M
    avail_mem = max_mem / sizeof_dtype(dtype)
    rest_coef = 2 * M * T if v is not None else M * T
    n, d = select_dim_over_nd(max_n=N,
                              max_d=D,
                              coef_nd=1,
                              coef_n=M + T + 1,
                              coef_d=M,
                              rest=rest_coef + M,
                              max_mem=avail_mem)
    ddev = torch.device('cuda:%d' % int(device_id))
    s1 = tcd.Stream(ddev)
    s2 = tcd.Stream(ddev)

    with tcd.device(ddev), tcd.stream(s1):
        # First collect necessary memory
        mem_needed = n * M + n * T + n + M
        if not cuda_inputs:
            mem_needed += n * d + M * d
            if v is not None:
                mem_needed += M * T
        if not out.is_cuda:
            mem_needed += M * T
        # Create flat tensor
        flat_gpu_tn = torch.empty(size=(mem_needed, ),
                                  dtype=dtype,
                                  device=ddev)
        # Extract the sub-tensors
        flat_offset = 0
        if v is not None:
            if not cuda_inputs:
                v_gpu = extract_same_stride(flat_gpu_tn,
                                            size=(M, T),
                                            other=v,
                                            offset=flat_offset)
                flat_offset += np.prod(v_gpu.shape)
                copy_to_device_noorder(M, T, v, 0, 0, v_gpu, 0, 0)
            else:
                v_gpu = v
        K_gpu = extract_same_stride(flat_gpu_tn,
                                    size=(n, M),
                                    other=X1,
                                    offset=flat_offset)
        flat_offset += np.prod(K_gpu.shape)
        Kv_gpu = extract_same_stride(flat_gpu_tn,
                                     size=(n, T),
                                     other=X1,
                                     offset=flat_offset)
        flat_offset += np.prod(Kv_gpu.shape)
        if out.is_cuda:
            out_gpu = out
        else:
            out_gpu = extract_same_stride(flat_gpu_tn,
                                          size=(M, T),
                                          other=out,
                                          offset=flat_offset)
            flat_offset += np.prod(out_gpu.shape)
        out_gpu.fill_(0.0)
        if not cuda_inputs:
            X1ss_gpu = extract_same_stride(flat_gpu_tn,
                                           size=(n, d),
                                           other=X1,
                                           offset=flat_offset)
            flat_offset += np.prod(X1ss_gpu.shape)
            X2s_gpu = extract_same_stride(flat_gpu_tn,
                                          size=(M, d),
                                          other=X2,
                                          offset=flat_offset)
            flat_offset += np.prod(X2s_gpu.shape)
        sq1_gpu = extract_same_stride(flat_gpu_tn,
                                      size=(n, ),
                                      other=X1,
                                      offset=flat_offset)
        flat_offset += np.prod(sq1_gpu.shape)
        sq2_gpu = extract_same_stride(flat_gpu_tn,
                                      size=(M, ),
                                      other=X1,
                                      offset=flat_offset)

        for i in range(0, N, n):
            nb = min(N - i, n)

            cur_K_gpu = K_gpu[:nb]  # nb x M
            cur_K_gpu.fill_(0.0)

            for j in range(0, D, d):
                db = min(D - j, d)
                # Wait until the previous add_(sq2_gpu.T) op completes, to avoid overwriting sq2_gpu.
                s1.synchronize()
                # Parallelize two matrix transfers
                with tcd.stream(s2):
                    if cuda_inputs:
                        cur_X2s_gpu = X2[:, j:j + db]
                    else:
                        cur_X2s_gpu = copy_to_device_noorder(M,
                                                             db,
                                                             X2,
                                                             0,
                                                             j,
                                                             X2s_gpu,
                                                             0,
                                                             0,
                                                             s=s2)
                    torch.norm(cur_X2s_gpu,
                               p=2,
                               dim=1,
                               keepdim=True,
                               out=sq2_gpu).pow_(2)
                if cuda_inputs:
                    cur_X1ss_gpu = X1[i:i + nb, j:j + db]
                else:
                    cur_X1ss_gpu = copy_to_device_noorder(nb,
                                                          db,
                                                          X1,
                                                          i,
                                                          j,
                                                          X1ss_gpu,
                                                          0,
                                                          0,
                                                          s=s1)
                torch.norm(cur_X1ss_gpu, p=2, dim=1, keepdim=True,
                           out=sq1_gpu).pow_(2)

                # Make sure cur_X2s_gpu and sq2_gpu are ready before using them.
                s2.synchronize()
                cur_K_gpu.addmm_(mat1=cur_X1ss_gpu,
                                 mat2=cur_X2s_gpu.T,
                                 alpha=-2.0)
                cur_K_gpu.add_(sq1_gpu)
                cur_K_gpu.add_(sq2_gpu.T)
                cur_K_gpu.clamp_min_(0)

            cur_K_gpu = kernel._transform(cur_K_gpu)

            if w is not None:
                cur_Kv_gpu = copy_to_device_noorder(nb,
                                                    T,
                                                    w,
                                                    i,
                                                    0,
                                                    Kv_gpu,
                                                    0,
                                                    0,
                                                    s=s1)  # n x T
                if v is not None:
                    cur_Kv_gpu.addmm_(cur_K_gpu, v_gpu)
            else:
                # v cannot be None if w is None
                cur_Kv_gpu = Kv_gpu.narrow(0, 0, nb)  # n x T
                torch.mm(cur_K_gpu, v_gpu, out=cur_Kv_gpu)  # n x T

            # Multiply transposed kernel with the Kv result.
            out_gpu.addmm_(cur_K_gpu.T, cur_Kv_gpu)  # M x T

        if not out.is_cuda:
            copy_to_host_noorder(M, T, out_gpu, 0, 0, out, 0, 0)
        s1.synchronize()
    return out
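
Compared to the earlier version, this one allocates a single flat GPU tensor and carves every working buffer out of it via offsets, avoiding one cudaMalloc per buffer. A contiguous-only toy version of that carving (Falkon's extract_same_stride additionally matches the memory layout of its other argument):

import torch

def extract_view(flat, size, offset):
    # Return a view of `flat` with the requested shape, plus the next free offset.
    numel = 1
    for s in size:
        numel *= s
    return flat[offset:offset + numel].view(size), offset + numel

flat = torch.empty(4 * 3 + 4 * 2)       # one allocation backing both buffers
a, off = extract_view(flat, (4, 3), 0)
b, off = extract_view(flat, (4, 2), off)
assert a.data_ptr() == flat.data_ptr()  # the views share the flat storage
assert off == flat.numel()
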
Example #8
def sparse_fmmv(proc_idx, queue, device_id):
    a: ArgsFmmv = queue.get()

    X1: SparseTensor = a.X1
    X2: SparseTensor = a.X2
    v, out = a.v, a.out
    kernel, max_mem = a.kernel, a.max_mem
    dtype = X1.dtype
    cuda_inputs = X1.is_cuda
    ntot, dtot = X1.shape
    mtot, T = v.size()

    avail_mem = max_mem / sizeof_dtype(dtype)
    # Memory needs:
    # X1_chunk : N + 2*D*N*density
    # X2_chunk : D + 2*D*M*density (because it is transposed)
    # sparse_out : N + 2*N*M*(density) (assume density = 1)
    # ker_gpu  : M*N
    # mmv_gpu  : N*T
    # v_gpu    : M*T
    # Other: GPU buffer
    n, m = select_dim_over_nm_v2(max_n=ntot,
                                 max_m=mtot,
                                 coef_nm=3,
                                 coef_n=2 + 2 * dtot * X1.density + T,
                                 coef_m=2 * dtot * X2.density + T,
                                 rest=dtot,
                                 max_mem=avail_mem)

    ddev = torch.device('cuda:%d' % int(device_id))
    with tcd.device(ddev):
        # First collect necessary memory
        mem_needed = mtot * T + n * T + n * m
        # Create flat tensor
        flat_gpu_tn = torch.empty(size=(mem_needed, ),
                                  dtype=dtype,
                                  device=ddev)
        # Extract the sub-tensors
        flat_offset = 0
        v_gpu = extract_same_stride(flat_gpu_tn,
                                    size=(mtot, T),
                                    other=v,
                                    offset=flat_offset)
        flat_offset += np.prod(v_gpu.shape)
        copy_to_device_noorder(mtot, T, v, 0, 0, v_gpu, 0, 0)
        mmv_gpu = extract_same_stride(flat_gpu_tn,
                                      size=(n, T),
                                      other=out,
                                      offset=flat_offset)
        flat_offset += np.prod(mmv_gpu.shape)
        # ker_gpu must be Fortran-ordered because of the cuSPARSE csr2dense function
        ker_gpu = extract_fortran(flat_gpu_tn, size=(n, m), offset=flat_offset)
        flat_offset += np.prod(ker_gpu.shape)

        for i in range(0, ntot, n):
            ic = min(n, ntot - i)

            cur_mmv_gpu = mmv_gpu[:ic]  # n x T
            cur_mmv_gpu.fill_(0.0)

            X1_chunk = X1.narrow_rows(i, ic)
            X1_chunk_d = X1_chunk.index_to_int().to(device=ddev)
            for j in range(0, mtot, m):
                jc = min(m, mtot - j)

                X2_chunk = X2.narrow_rows(j, jc)
                # Prepare sparse on CPU
                ddd = kernel._prepare_sparse(X1_chunk, X2_chunk)

                # Transpose X2-chunk and convert it to CSR. This uses lots of RAM
                X2_chunk_d = SparseTensor.from_scipy(
                    X2_chunk.transpose_csc().to_scipy().tocsr(copy=False)) \
                    .index_to_int() \
                    .to(device=ddev)

                cur_ker_gpu = ker_gpu[:ic, :jc]
                cur_ker_gpu.fill_(0.0)
                # Run the matrix multiplication (kernel apply)
                cur_ker_gpu = kernel._apply_sparse(X1_chunk_d, X2_chunk_d,
                                                   cur_ker_gpu)
                cur_ker_gpu = kernel._finalize(cur_ker_gpu, ddd)

                # Multiply by the vector v
                cur_mmv_gpu.addmm_(cur_ker_gpu, v_gpu.narrow(0, j, jc))
                del ddd, X2_chunk, X2_chunk_d

            # send result to CPU
            if not cuda_inputs:
                copy_to_host_noorder(ic, T, cur_mmv_gpu, 0, 0, out, i, 0)
            del X1_chunk, X1_chunk_d
    return out
Example #9
def distk_fmmv(proc_idx, queue, device_id):
    a: ArgsFmmv = queue.get()
    X1, X2, v, out = a.X1, a.X2, a.v, a.out
    kernel: L2DistanceKernel = a.kernel
    max_mem = a.max_mem

    N, D = X1.shape
    M = X2.shape[0]
    T = v.shape[1]
    dtype = X1.dtype
    cuda_inputs = X1.is_cuda

    # GPU memory usage:
    # X1s : n x D
    # X2s : m x D
    # vs  : m x T
    # nm  : n x m
    # out : n x T
    # -----------
    # total: n*m + n * (D + T) + m * (D + T) = R
    avail_mem = max_mem / sizeof_dtype(dtype)
    n, m = select_dim_over_nm_v2(max_n=N,
                                 max_m=M,
                                 coef_nm=1,
                                 coef_n=D + T,
                                 coef_m=D + T,
                                 rest=0,
                                 max_mem=avail_mem)

    ddev = torch.device('cuda:%d' % int(device_id))
    with tcd.device(ddev):
        mem_needed = n * m
        if not cuda_inputs:
            mem_needed += n * T + n * D + m * D + m * T
        flat_gpu_tn = torch.empty(size=(mem_needed, ),
                                  dtype=dtype,
                                  device=ddev)

        flat_offset = 0
        nm_gpu = extract_same_stride(flat_gpu_tn,
                                     size=(n, m),
                                     other=X1,
                                     offset=flat_offset)
        flat_offset += np.prod(nm_gpu.shape)
        if not cuda_inputs:
            out_gpu = extract_same_stride(flat_gpu_tn,
                                          size=(n, T),
                                          other=out,
                                          offset=flat_offset)
            flat_offset += np.prod(out_gpu.shape)
            X1s_gpu = extract_same_stride(flat_gpu_tn,
                                          size=(n, D),
                                          other=X1,
                                          offset=flat_offset)
            flat_offset += np.prod(X1s_gpu.shape)
            X2s_gpu = extract_same_stride(flat_gpu_tn,
                                          size=(m, D),
                                          other=X2,
                                          offset=flat_offset)
            flat_offset += np.prod(X2s_gpu.shape)
            vs_gpu = extract_same_stride(flat_gpu_tn,
                                         size=(m, T),
                                         other=v,
                                         offset=flat_offset)
            flat_offset += np.prod(vs_gpu.shape)

        for i in range(0, N, n):
            nb = min(n, N - i)
            if cuda_inputs:
                cur_X1s_gpu = X1.narrow(0, i, nb)  # n x D
            else:
                cur_X1s_gpu = copy_to_device_noorder(nb, D, X1, i, 0, X1s_gpu,
                                                     0, 0)
            sq1 = torch.norm(cur_X1s_gpu, p=2, dim=1, keepdim=True).pow_(2)
            if cuda_inputs:
                cur_out_gpu = out.narrow(0, i, nb)  # n x T
            else:
                cur_out_gpu = out_gpu.narrow(0, 0, nb)  # n x T
            cur_out_gpu.fill_(0.0)

            for j in range(0, M, m):
                mb = min(m, M - j)
                if cuda_inputs:
                    cur_X2s_gpu = X2.narrow(0, j, mb)  # m x D
                    cur_vs_gpu = v.narrow(0, j, mb)  # m x T
                else:
                    cur_X2s_gpu = copy_to_device_noorder(
                        mb, D, X2, j, 0, X2s_gpu, 0, 0)  # m x D
                    cur_vs_gpu = copy_to_device_noorder(
                        mb, T, v, j, 0, vs_gpu, 0, 0)  # m x T
                cur_nm_gpu = nm_gpu[:nb, :mb]  # n x m

                sq2 = torch.norm(cur_X2s_gpu, p=2, dim=1, keepdim=True).pow_(2)
                torch.mm(cur_X1s_gpu, cur_X2s_gpu.T, out=cur_nm_gpu)

                cur_nm_gpu.mul_(-2.0)
                cur_nm_gpu.add_(sq1)
                cur_nm_gpu.add_(sq2.T)
                cur_nm_gpu.clamp_min_(0)
                kernel._transform(cur_nm_gpu)

                # Multiply by the vector v
                cur_out_gpu.addmm_(cur_nm_gpu, cur_vs_gpu)  # n x T
            if not cuda_inputs:
                # send result to CPU
                copy_to_host_noorder(nb, T, out_gpu, 0, 0, out, i, 0)
    return out
Example #10
def generic_fmmv(proc_idx, queue, device_id):
    a: ArgsFmmv = queue.get()

    X1, X2, v, out = a.X1, a.X2, a.v, a.out
    kernel, max_mem = a.kernel, a.max_mem
    dtype = X1.dtype
    cuda_inputs = X1.is_cuda
    ntot, dtot = X1.size()
    M, T = v.size()

    # GPU Memory Usage:
    # ker_gpu  : n*M
    # v_gpu    : M*T
    # X1s_gpu  : n*d
    # X2s_gpu  : M*d
    # mmv_gpu  : n*T
    # ----------
    # total : n*d + n*(M+T) + d*M + M*T
    avail_mem = max_mem / sizeof_dtype(dtype)
    extra_mem = kernel.extra_mem()
    n, d = select_dim_over_nd(
        max_n=ntot,
        max_d=dtot,
        coef_nd=1 + extra_mem.get('nd', 0),
        coef_n=M + T + extra_mem.get('n', 0) + extra_mem.get('nm', 0) * M,
        coef_d=M + extra_mem.get('d', 0) + extra_mem.get('md', 0) * M,
        rest=M * T + extra_mem.get('m', 0),
        max_mem=avail_mem)

    ddev = torch.device('cuda:%d' % int(device_id))
    with tcd.device(ddev):
        # First collect necessary memory
        mem_needed = n * M
        if not cuda_inputs:
            mem_needed += M * T + n * d + M * d + n * T
        # Create flat tensor
        flat_gpu_tn = torch.empty(size=(mem_needed, ),
                                  dtype=dtype,
                                  device=ddev)
        # Extract the sub-tensors
        flat_offset = 0
        ker_gpu = extract_same_stride(flat_gpu_tn,
                                      size=(n, M),
                                      other=X1,
                                      offset=flat_offset)
        flat_offset += np.prod(ker_gpu.shape)
        if not cuda_inputs:
            X1s_gpu = extract_same_stride(flat_gpu_tn,
                                          size=(n, d),
                                          other=X1,
                                          offset=flat_offset)
            flat_offset += np.prod(X1s_gpu.shape)
            X2s_gpu = extract_same_stride(flat_gpu_tn,
                                          size=(M, d),
                                          other=X2,
                                          offset=flat_offset)
            flat_offset += np.prod(X2s_gpu.shape)
            mmv_gpu = extract_same_stride(flat_gpu_tn,
                                          size=(n, T),
                                          other=out,
                                          offset=flat_offset)
            flat_offset += np.prod(mmv_gpu.shape)
            v_gpu = extract_same_stride(flat_gpu_tn,
                                        size=(M, T),
                                        other=v,
                                        offset=flat_offset)
            flat_offset += np.prod(v_gpu.shape)
            copy_to_device_noorder(M, T, v, 0, 0, v_gpu, 0, 0)
        else:
            v_gpu = v

        for i in range(0, ntot, n):
            ic = min(n, ntot - i)
            ddd = kernel._prepare(X1.narrow(0, i, ic), X2)
            c_g_ker = ker_gpu.narrow(0, 0, ic)
            c_g_ker.fill_(0.0)
            for k in range(0, dtot, d):
                kc = min(d, dtot - k)
                if cuda_inputs:
                    c_g_X1s = X1[i:i + ic, k:k + kc]
                    c_g_X2s = X2[:, k:k + kc]
                else:
                    c_g_X1s = copy_to_device_noorder(ic, kc, X1, i, k, X1s_gpu,
                                                     0, 0)
                    c_g_X2s = copy_to_device_noorder(M, kc, X2, 0, k, X2s_gpu,
                                                     0, 0)
                kernel._apply(c_g_X1s, c_g_X2s.T, c_g_ker)
            kernel._finalize(c_g_ker, ddd)
            # Multiply by the vector v
            if cuda_inputs:
                c_g_mmv = out[i:i + ic, :]
            else:
                c_g_mmv = mmv_gpu[:ic, :]
            torch.mm(c_g_ker, v_gpu, out=c_g_mmv)  # n x T
            # Copy back to host
            if not cuda_inputs:
                copy_to_host_noorder(ic, T, c_g_mmv, 0, 0, out, i, 0)
    return out
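
The k-loop streams the feature dimension through the GPU in slices of at most d columns. For kernels whose _apply step reduces to a plain matrix product (an assumption made here purely for illustration), accumulating the per-slice products gives the same result as the full product, which is what makes the chunking invisible in the final output:

import torch

X1 = torch.randn(5, 11, dtype=torch.float64)
X2 = torch.randn(4, 11, dtype=torch.float64)
acc = torch.zeros(5, 4, dtype=torch.float64)
d = 4  # feature-block size
for k in range(0, 11, d):
    kc = min(d, 11 - k)
    # Accumulate the contribution of feature columns k .. k+kc.
    acc.addmm_(X1[:, k:k + kc], X2[:, k:k + kc].T)
assert torch.allclose(acc, X1 @ X2.T)
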
Example #11
def _generic_fmm(proc_idx, queue, device_id):
    a: ArgsFmm = queue.get()
    X1: torch.Tensor = a.X1
    X2: torch.Tensor = a.X2
    cuda_inputs = X1.is_cuda
    out = a.out
    kernel, gpu_dtype = a.kernel, a.gpu_dtype
    max_mem = a.max_mem
    change_dtype = gpu_dtype != X1.dtype

    ntot, dtot = X1.shape
    mtot = X2.shape[0]

    # This function is slightly faster if we limit the size of the processed
    # blocks, especially on a cold run, since pinned-memory allocation is
    # extremely slow. We avoid doing so when memory is already constrained.
    if max_mem > 4 * 2**30:
        max_mem /= 4
    avail_mem = max_mem / sizeof_dtype(gpu_dtype)
    # Memory usage:
    # - gOut    : n x m
    # - g_ssX1  : n x d
    # - g_sX2   : m x d
    # total : n*d + m*d + n*m
    if cuda_inputs and not change_dtype:
        # No allocation will be performed, so no need to split at all!
        n, d, m = ntot, dtot, mtot
    else:
        n, d, m = select_dim_fMM(avail_mem, ntot, dtot, mtot)

    tc_device = torch.device('cuda:%d' % (int(device_id)))
    s1 = torch.cuda.Stream(device=tc_device)
    with torch.cuda.device(tc_device), torch.cuda.stream(s1):
        # Initialize GPU buffers
        if not cuda_inputs or change_dtype:
            g_X1d = create_same_stride((n, d), X1, gpu_dtype, tc_device)
            g_X2d = create_same_stride((m, d), X2, gpu_dtype, tc_device)
            g_out = create_same_stride((n, m), out, gpu_dtype, tc_device)
        if not cuda_inputs:
            cpu_buf = None
            if change_dtype:
                cpu_buf = create_same_stride((n, m),
                                             out,
                                             gpu_dtype,
                                             'cpu',
                                             pin_memory=True)

        for j in range(0, mtot, m):
            jc = min(m, mtot - j)
            X2_chunk = X2.narrow(0, j, jc)

            for i in range(0, ntot, n):
                ic = min(n, ntot - i)

                if _gpu_tns_same_memory(X1, X2) and j < i:
                    out[i:i + ic, j:j + jc].copy_(out[j:j + jc, i:i + ic].T)
                    continue

                X1_chunk = X1.narrow(0, i, ic)
                ddd = kernel._prepare(X1_chunk, X2_chunk)
                if not cuda_inputs or change_dtype:
                    cur_g_out = g_out.narrow(0, 0, ic).narrow(1, 0, jc)
                else:
                    cur_g_out = out.narrow(0, i, ic).narrow(1, j, jc)
                cur_g_out.fill_(0.0)

                for k in range(0, dtot, d):
                    kc = min(d, dtot - k)
                    # Move to GPU and type-convert
                    if (not cuda_inputs) or change_dtype:
                        cur_g_X1d = g_X1d.narrow(0, 0, ic).narrow(1, 0, kc)
                        cur_g_X1d.copy_(X1_chunk.narrow(1, k, kc))
                        cur_g_X2d = g_X2d.narrow(0, 0, jc).narrow(1, 0, kc)
                        cur_g_X2d.copy_(X2_chunk.narrow(1, k, kc))
                    else:
                        cur_g_X1d = X1_chunk.narrow(1, k, kc)
                        cur_g_X2d = X2_chunk.narrow(1, k, kc)

                    # Apply
                    a.kernel._apply(cur_g_X1d, cur_g_X2d.T, cur_g_out)

                a.kernel._finalize(cur_g_out, ddd)
                if not cuda_inputs:
                    copy_to_host_noorder(ic, jc, cur_g_out, 0, 0, out, i, j,
                                         cpu_buf, s1)
                elif change_dtype:
                    out.narrow(0, i, ic).narrow(1, j, jc).copy_(cur_g_out)
                del ddd
    return out
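
The staging copies in the change_dtype path (cur_g_X1d.copy_(X1_chunk.narrow(1, k, kc)) and the matching X2 copy) rely on Tensor.copy_ converting dtype, and device when needed, in a single step. A small CPU-only check of that behaviour:

import torch

src = torch.randn(4, dtype=torch.float64)  # e.g. float64 input data
dst = torch.empty(4, dtype=torch.float32)  # staging buffer in the GPU dtype
dst.copy_(src)                             # casts (and would transfer) in one call
assert dst.dtype == torch.float32
assert torch.allclose(dst.double(), src, atol=1e-6)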