Example 1
    def test_dim_over_nm_v2_zero(self, avail_mem):
        tot_n = 400_000
        tot_m = 2_000
        n, m = select_dim_over_nm_v2(tot_n,
                                     tot_m,
                                     coef_nm=0,
                                     coef_n=0,
                                     coef_m=0,
                                     rest=0,
                                     max_mem=avail_mem)
        created = 0
        do_check(created, avail_mem, n, tot_n, m, tot_m)

        n, m = select_dim_over_nm_v2(tot_n,
                                     tot_m,
                                     coef_nm=1.3,
                                     coef_n=0,
                                     coef_m=0,
                                     rest=9890,
                                     max_mem=avail_mem)
        created = 1.3 * n * m + 9890
        do_check(created, avail_mem, n, tot_n, m, tot_m)

        n, m = select_dim_over_nm_v2(tot_n,
                                     tot_m,
                                     coef_nm=0,
                                     coef_n=2.0,
                                     coef_m=0,
                                     rest=9890,
                                     max_mem=avail_mem)
        created = 2 * n + 9890
        do_check(created, avail_mem, n, tot_n, m, tot_m)
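
The three calls above exercise the selector with different coefficient combinations, and created recomputes the memory estimate for the returned block sizes. A minimal sketch of the cost model these tests appear to assume (an illustration written for this note, not the library's implementation):

def estimated_mem(n, m, coef_nm, coef_n, coef_m, rest):
    # number of elements a single (n, m) block is expected to occupy
    return coef_nm * n * m + coef_n * n + coef_m * m + rest

Under this reading, select_dim_over_nm_v2 returns block sizes n <= tot_n and m <= tot_m for which estimated_mem(n, m, ...) stays within max_mem.
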
Example 2
    def test_dim_over_nm_v2_notenough(self, avail_mem):
        tot_n = 40_000
        tot_m = 2_000
        tot_d = 30_720
        tot_t = 10
        with pytest.raises(MemoryError):
            select_dim_over_nm_v2(tot_n,
                                  tot_m,
                                  coef_nm=1.0,
                                  coef_n=tot_d + tot_t,
                                  coef_m=tot_d + tot_t,
                                  rest=0,
                                  max_mem=avail_mem)
        with pytest.raises(MemoryError):
            select_dim_over_nm_v2(tot_n,
                                  tot_m,
                                  coef_nm=0.1,
                                  coef_n=0,
                                  coef_m=0,
                                  rest=12312,
                                  max_mem=avail_mem)
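
Both parameter combinations are chosen so that no admissible (n, m) block fits within the avail_mem fixture, and the selector is expected to signal this by raising MemoryError rather than returning unusable block sizes. A hypothetical sketch of the kind of feasibility check that would produce this error (an assumption for illustration, not the library's code):

def check_feasible(coef_nm, coef_n, coef_m, rest, max_mem):
    # even a 1 x 1 block has to fit in the budget, otherwise no tiling can work
    min_cost = coef_nm + coef_n + coef_m + rest
    if min_cost > max_mem:
        raise MemoryError("not enough memory to process even a single-element block")
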
Example 3
    def test_dim_over_nm_v2(self, avail_mem):
        tot_n = 40_000
        tot_m = 2_000
        tot_d = 30_720
        tot_t = 10

        n, m = select_dim_over_nm_v2(tot_n,
                                     tot_m,
                                     coef_nm=1.0,
                                     coef_n=tot_d + tot_t,
                                     coef_m=tot_d + tot_t,
                                     rest=0,
                                     max_mem=avail_mem)
        created = n * m + n * tot_t + n * tot_d + m * tot_d + m * tot_t
        do_check(created, avail_mem, n, tot_n, m, tot_m)
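
Here all three coefficient types are combined: created expands to 1.0 * n * m + (tot_d + tot_t) * n + (tot_d + tot_t) * m, matching the arguments of the call. do_check itself is not shown in these snippets; a plausible sketch of the invariants it verifies (an assumption for illustration only):

def do_check(created, avail_mem, n, tot_n, m, tot_m):
    # the recomputed memory estimate must fit within the budget
    assert created <= avail_mem
    # block sizes must be positive and never exceed the full problem sizes
    assert 0 < n <= tot_n
    assert 0 < m <= tot_m
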
Example 4
def fmmv_cpu_sparse(X1: SparseTensor,
                    X2: SparseTensor,
                    v: torch.Tensor,
                    kernel: 'falkon.kernels.Kernel',
                    out: Optional[torch.Tensor],
                    opt: BaseOptions):
    opt = _setup_opt(opt, is_cpu=True)

    dtype = X1.dtype
    ntot, dtot = X1.size()
    mtot, T = v.size()

    # Create output matrix
    if out is None:
        out = torch.empty(ntot, T, dtype=dtype)
    out.fill_(0.0)

    avail_mem = _get_cpu_ram(opt, 0.95) / sizeof_dtype(dtype)
    # Memory usage (in elements):
    # Narrowing X1, X2 : n + m
    # Prepare          : not computable, depends on the kernel
    # ker_chunk        : n * m
    # finalize         : 0 (assuming it can be done in place; kernel-dependent)
    n, m = select_dim_over_nm_v2(max_n=ntot, max_m=mtot, coef_nm=1, coef_n=1, coef_m=1, rest=0,
                                 max_mem=avail_mem)

    ker_chunk = create_same_stride((n, m), out, dtype, device='cpu')
    for i in range(0, ntot, n):
        ic = min(n, ntot - i)
        cur_out = out[i:i + ic, :]
        X1_chunk = X1.narrow_rows(i, ic)
        for j in range(0, mtot, m):
            jc = min(m, mtot - j)
            X2_chunk = X2.narrow_rows(j, jc)
            cur_ker_chunk = ker_chunk[:ic, :jc]
            cur_ker_chunk.fill_(0.0)

            ddd = kernel._prepare_sparse(X1_chunk, X2_chunk)
            kernel._apply_sparse(X1_chunk, X2_chunk.transpose_csc(), cur_ker_chunk)
            kernel._finalize(cur_ker_chunk, ddd)

            # Multiply by the vector v
            cur_out.addmm_(cur_ker_chunk, v.narrow(0, j, jc))
    return out
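
A minimal usage sketch for this routine follows. The data shapes, density, kernel choice and import paths are assumptions made for illustration (GaussianKernel and FalkonOptions come from the public falkon API; SparseTensor.from_scipy is used in the snippets on this page), not a prescribed way of calling it:

import scipy.sparse
import torch
from falkon import FalkonOptions
from falkon.kernels import GaussianKernel
from falkon.sparse.sparse_tensor import SparseTensor

# random sparse CSR inputs; scipy.sparse.random returns float64, matching v below
X1 = SparseTensor.from_scipy(scipy.sparse.random(1_000, 500, density=0.01, format='csr'))
X2 = SparseTensor.from_scipy(scipy.sparse.random(200, 500, density=0.01, format='csr'))
v = torch.randn(200, 3, dtype=torch.float64)

out = fmmv_cpu_sparse(X1, X2, v, GaussianKernel(sigma=2.0), out=None, opt=FalkonOptions())
print(out.shape)  # torch.Size([1000, 3])
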
Example 5
def _sparse_fmm(proc_idx, queue, device_id):
    a: ArgsFmm = queue.get()
    X1: SparseTensor = a.X1
    X2: SparseTensor = a.X2
    out = a.out
    kernel, gpu_dtype = a.kernel, a.gpu_dtype
    max_mem = a.max_mem

    ntot, dtot = X1.shape
    mtot = X2.size(0)

    avail_mem = max_mem / sizeof_dtype(gpu_dtype)
    # Memory usage:
    # X1_chunk : ntot + 2 * D * ntot * density
    # X2_chunk : dtot + 2 * D * mtot * density (because it is transposed)
    # sparse_out : ntot + 2 * ntot * mtot * density (density is assumed to be 1 here)
    # ker_gpu  : mtot * ntot
    n, m = select_dim_over_nm_v2(max_n=ntot,
                                 max_m=mtot,
                                 coef_nm=3,
                                 coef_n=2 + 2 * dtot * X1.density,
                                 coef_m=2 * dtot * X2.density,
                                 rest=dtot,
                                 max_mem=avail_mem)

    tc_device = torch.device('cuda:%d' % (int(device_id)))
    with torch.cuda.device(tc_device):
        # Initialize GPU buffers
        g_out = create_same_stride((n, m), out, gpu_dtype, tc_device)
        cpu_buf = None
        if X1.dtype != gpu_dtype:
            cpu_buf = create_same_stride((n, m),
                                         out,
                                         gpu_dtype,
                                         'cpu',
                                         pin_memory=True)

        for j in range(0, mtot, m):
            jc = min(m, mtot - j)

            X2_chunk = X2.narrow_rows(j, jc).to(dtype=gpu_dtype)
            X2_chunk_d = SparseTensor.from_scipy(
                X2_chunk.transpose_csc().to_scipy().tocsr(copy=False)) \
                .index_to_int() \
                .to(device=tc_device)
            for i in range(0, ntot, n):
                ic = min(n, ntot - i)

                X1_chunk = X1.narrow_rows(i, ic).to(dtype=gpu_dtype)
                X1_chunk_d = X1_chunk.index_to_int().to(device=tc_device)
                cur_g_out = g_out.narrow(0, 0, ic).narrow(1, 0, jc)
                cur_g_out.fill_(0.0)

                ddd = kernel._prepare_sparse(X1_chunk, X2_chunk)
                cur_g_out = kernel._apply_sparse(X1_chunk_d, X2_chunk_d,
                                                 cur_g_out)
                cur_g_out = kernel._finalize(cur_g_out, ddd)
                copy_to_host_noorder(ic, jc, cur_g_out, 0, 0, out, i, j,
                                     cpu_buf)
                del ddd, X1_chunk_d, X1_chunk
            del X2_chunk, X2_chunk_d
        del g_out
    return out
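
The coefficients passed to select_dim_over_nm_v2 here appear to group the per-block terms listed in the memory-usage comment above; a sketch of that grouping (an interpretation of the comment, not an authoritative derivation):

# per n-block:      X1_chunk ~ n * (1 + 2 * dtot * X1.density), sparse_out rows ~ n
# per m-block:      X2_chunk non-zeros ~ m * 2 * dtot * X2.density
# per (n, m) block: sparse_out values ~ 2 * n * m, ker_gpu ~ n * m
# fixed:            X2_chunk row pointers ~ dtot
coef_nm = 3                            # 2*n*m (sparse_out) + n*m (ker_gpu)
coef_n = 2 + 2 * dtot * X1.density     # X1_chunk rows/values + sparse_out rows
coef_m = 2 * dtot * X2.density         # X2_chunk values and indices
rest = dtot                            # X2_chunk row-pointer array
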
Example 6
def sparse_fmmv(proc_idx, queue, device_id):
    a: ArgsFmmv = queue.get()

    X1: SparseTensor = a.X1
    X2: SparseTensor = a.X2
    v, out = a.v, a.out
    kernel, max_mem = a.kernel, a.max_mem
    dtype = X1.dtype
    cuda_inputs = X1.is_cuda
    ntot, dtot = X1.shape
    mtot, T = v.size()

    avail_mem = max_mem / sizeof_dtype(dtype)
    # Memory needs:
    # X1_chunk : N + 2*D*N*density
    # X2_chunk : D + 2*D*M*density (because it is transposed)
    # sparse_out : N + 2*N*M*density (density is assumed to be 1)
    # ker_gpu  : M*N
    # mmv_gpu  : N*T
    # v_gpu    : M*T
    # Other: GPU buffer
    n, m = select_dim_over_nm_v2(max_n=ntot,
                                 max_m=mtot,
                                 coef_nm=3,
                                 coef_n=2 + 2 * dtot * X1.density + T,
                                 coef_m=2 * dtot * X2.density + T,
                                 rest=dtot,
                                 max_mem=avail_mem)

    ddev = torch.device('cuda:%d' % int(device_id))
    with tcd.device(ddev):
        # First collect necessary memory
        mem_needed = mtot * T + n * T + n * m
        # Create flat tensor
        flat_gpu_tn = torch.empty(size=(mem_needed, ),
                                  dtype=dtype,
                                  device=ddev)
        # Extract the sub-tensors
        flat_offset = 0
        v_gpu = extract_same_stride(flat_gpu_tn,
                                    size=(mtot, T),
                                    other=v,
                                    offset=flat_offset)
        flat_offset += np.prod(v_gpu.shape)
        copy_to_device_noorder(mtot, T, v, 0, 0, v_gpu, 0, 0)
        mmv_gpu = extract_same_stride(flat_gpu_tn,
                                      size=(n, T),
                                      other=out,
                                      offset=flat_offset)
        flat_offset += np.prod(mmv_gpu.shape)
        # ker_gpu must be Fortran-ordered because the cuSPARSE csr2dense routine
        # works with column-major dense matrices
        ker_gpu = extract_fortran(flat_gpu_tn, size=(n, m), offset=flat_offset)
        flat_offset += np.prod(ker_gpu.shape)

        for i in range(0, ntot, n):
            ic = min(n, ntot - i)

            cur_mmv_gpu = mmv_gpu[:ic]  # n x T
            cur_mmv_gpu.fill_(0.0)

            X1_chunk = X1.narrow_rows(i, ic)
            X1_chunk_d = X1_chunk.index_to_int().to(device=ddev)
            for j in range(0, mtot, m):
                jc = min(m, mtot - j)

                X2_chunk = X2.narrow_rows(j, jc)
                # Prepare sparse on CPU
                ddd = kernel._prepare_sparse(X1_chunk, X2_chunk)

                # Transpose X2-chunk and convert it to CSR. This uses lots of RAM
                X2_chunk_d = SparseTensor.from_scipy(
                    X2_chunk.transpose_csc().to_scipy().tocsr(copy=False)) \
                    .index_to_int() \
                    .to(device=ddev)

                cur_ker_gpu = ker_gpu[:ic, :jc]
                cur_ker_gpu.fill_(0.0)
                # Run the matrix multiplication (kernel apply)
                cur_ker_gpu = kernel._apply_sparse(X1_chunk_d, X2_chunk_d,
                                                   cur_ker_gpu)
                cur_ker_gpu = kernel._finalize(cur_ker_gpu, ddd)

                # Multiply by the vector v
                cur_mmv_gpu.addmm_(cur_ker_gpu, v_gpu.narrow(0, j, jc))
                del ddd, X2_chunk, X2_chunk_d

            # send result to CPU
            if not cuda_inputs:
                copy_to_host_noorder(ic, T, cur_mmv_gpu, 0, 0, out, i, 0)
            del X1_chunk, X1_chunk_d
    return out
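
This variant pre-allocates one flat GPU tensor and carves v_gpu, mmv_gpu and ker_gpu out of it, so all temporaries live in a single allocation. A standalone illustration of the same carving pattern with plain torch operations (extract_same_stride and extract_fortran are not reproduced; this only sketches the idea):

import torch

n, m, T = 10, 4, 3
flat = torch.empty(n * T + m * T + n * m, dtype=torch.float64)

offset = 0
mmv = flat[offset:offset + n * T].view(n, T)      # row-major (n, T) block
offset += mmv.numel()
v_buf = flat[offset:offset + m * T].view(m, T)    # row-major (m, T) block
offset += v_buf.numel()
ker = flat[offset:offset + n * m].view(m, n).t()  # (n, m) view with Fortran-order strides
offset += ker.numel()
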
Example 7
def distk_fmmv(proc_idx, queue, device_id):
    a: ArgsFmmv = queue.get()
    X1, X2, v, out = a.X1, a.X2, a.v, a.out
    kernel: L2DistanceKernel = a.kernel
    max_mem = a.max_mem

    N, D = X1.shape
    M = X2.shape[0]
    T = v.shape[1]
    dtype = X1.dtype
    cuda_inputs = X1.is_cuda

    # GPU memory usage:
    # X1s : n x D
    # X2s : m x D
    # vs  : m x T
    # nm  : n x m
    # out : n x T
    # -----------
    # total: n*m + n * (D + T) + m * (D + T)  (must fit within avail_mem)
    avail_mem = max_mem / sizeof_dtype(dtype)
    n, m = select_dim_over_nm_v2(max_n=N,
                                 max_m=M,
                                 coef_nm=1,
                                 coef_n=D + T,
                                 coef_m=D + T,
                                 rest=0,
                                 max_mem=avail_mem)

    ddev = torch.device('cuda:%d' % int(device_id))
    with tcd.device(ddev):
        mem_needed = n * m
        if not cuda_inputs:
            mem_needed += n * T + n * D + m * D + m * T
        flat_gpu_tn = torch.empty(size=(mem_needed, ),
                                  dtype=dtype,
                                  device=ddev)

        flat_offset = 0
        nm_gpu = extract_same_stride(flat_gpu_tn,
                                     size=(n, m),
                                     other=X1,
                                     offset=flat_offset)
        flat_offset += np.prod(nm_gpu.shape)
        if not cuda_inputs:
            out_gpu = extract_same_stride(flat_gpu_tn,
                                          size=(n, T),
                                          other=out,
                                          offset=flat_offset)
            flat_offset += np.prod(out_gpu.shape)
            X1s_gpu = extract_same_stride(flat_gpu_tn,
                                          size=(n, D),
                                          other=X1,
                                          offset=flat_offset)
            flat_offset += np.prod(X1s_gpu.shape)
            X2s_gpu = extract_same_stride(flat_gpu_tn,
                                          size=(m, D),
                                          other=X2,
                                          offset=flat_offset)
            flat_offset += np.prod(X2s_gpu.shape)
            vs_gpu = extract_same_stride(flat_gpu_tn,
                                         size=(m, T),
                                         other=v,
                                         offset=flat_offset)
            flat_offset += np.prod(vs_gpu.shape)

        for i in range(0, N, n):
            nb = min(n, N - i)
            if cuda_inputs:
                cur_X1s_gpu = X1.narrow(0, i, nb)  # n x D
            else:
                cur_X1s_gpu = copy_to_device_noorder(nb, D, X1, i, 0, X1s_gpu,
                                                     0, 0)
            sq1 = torch.norm(cur_X1s_gpu, p=2, dim=1, keepdim=True).pow_(2)
            if cuda_inputs:
                cur_out_gpu = out.narrow(0, i, nb)  # n x T
            else:
                cur_out_gpu = out_gpu.narrow(0, 0, nb)  # n x T
            cur_out_gpu.fill_(0.0)

            for j in range(0, M, m):
                mb = min(m, M - j)
                if cuda_inputs:
                    cur_X2s_gpu = X2.narrow(0, j, mb)  # m x D
                    cur_vs_gpu = v.narrow(0, j, mb)  # m x T
                else:
                    cur_X2s_gpu = copy_to_device_noorder(
                        mb, D, X2, j, 0, X2s_gpu, 0, 0)  # m x D
                    cur_vs_gpu = copy_to_device_noorder(
                        mb, T, v, j, 0, vs_gpu, 0, 0)  # m x T
                cur_nm_gpu = nm_gpu[:nb, :mb]  # n x m

                sq2 = torch.norm(cur_X2s_gpu, p=2, dim=1, keepdim=True).pow_(2)
                torch.mm(cur_X1s_gpu, cur_X2s_gpu.T, out=cur_nm_gpu)

                cur_nm_gpu.mul_(-2.0)
                cur_nm_gpu.add_(sq1)
                cur_nm_gpu.add_(sq2.T)
                cur_nm_gpu.clamp_min_(0)
                kernel._transform(cur_nm_gpu)

                # Multiply by the vector v
                cur_out_gpu.addmm_(cur_nm_gpu, cur_vs_gpu)  # n x T
            if not cuda_inputs:
                # send result to CPU
                copy_to_host_noorder(nb, T, out_gpu, 0, 0, out, i, 0)
    return out
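
The inner tile computation builds squared Euclidean distances through the expansion ||x - y||^2 = ||x||^2 + ||y||^2 - 2 x.y before handing them to kernel._transform. A small self-contained check of the same update on plain tensors (illustration only):

import torch

X1b = torch.randn(5, 3, dtype=torch.float64)
X2b = torch.randn(4, 3, dtype=torch.float64)
sq1 = torch.norm(X1b, p=2, dim=1, keepdim=True).pow(2)  # (5, 1) squared row norms
sq2 = torch.norm(X2b, p=2, dim=1, keepdim=True).pow(2)  # (4, 1) squared row norms
d2 = (X1b @ X2b.T).mul(-2.0).add(sq1).add(sq2.T).clamp_min(0)
assert torch.allclose(d2, torch.cdist(X1b, X2b).pow(2))
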