Exemple #1
0
def main():

    from pycuda.tools import DeviceMemoryPool, PageLockedMemoryPool
    dev_pool = DeviceMemoryPool()
    pagelocked_pool = PageLockedMemoryPool()

    from scipy.io import mmread
    csr_mat = mmread(args[0]).tocsr().astype(numpy.float32)

    inv_mat_diag = 1 / csr_mat.diagonal()

    print "building..."
    from pycuda.sparse.packeted import PacketedSpMV
    spmv = PacketedSpMV(csr_mat, options.is_symmetric, csr_mat.dtype)
    rhs = numpy.random.rand(spmv.shape[0]).astype(spmv.dtype)

    from pycuda.sparse.operator import DiagonalPreconditioner
    if True:
        precon = DiagonalPreconditioner(
            spmv.permute(
                gpuarray.to_gpu(inv_mat_diag, allocator=dev_pool.allocate)))
    else:
        precon = None

    from pycuda.sparse.cg import solve_pkt_with_cg
    print "start solve"
    for i in range(4):
        start = drv.Event()
        stop = drv.Event()
        start.record()

        rhs_gpu = gpuarray.to_gpu(rhs, dev_pool.allocate)
        res_gpu, it_count, res_count = \
                solve_pkt_with_cg(spmv, rhs_gpu, precon,
                        tol=1e-7 if spmv.dtype == numpy.float64 else 5e-5,
                        pagelocked_allocator=pagelocked_pool.allocate)
        res = res_gpu.get()

        stop.record()
        stop.synchronize()

        elapsed = stop.time_since(start) * 1e-3
        est_flops = (csr_mat.nnz * 2 * (it_count + res_count) +
                     csr_mat.shape[0] * (2 + 2 + 2 + 2 + 2) * it_count)

        if precon is not None:
            est_flops += csr_mat.shape[0] * it_count

        print "residual norm: %g" % (la.norm(csr_mat * res - rhs) /
                                     la.norm(rhs))
        print(
            "size: %d, elapsed: %g s, %d it, %d residual, it/second: %g, "
            "%g gflops/s" % (csr_mat.shape[0], elapsed, it_count, res_count,
                             it_count / elapsed, est_flops / elapsed / 1e9))

    # TODO: mixed precision
    # TODO: benchmark
    pagelocked_pool.stop_holding()
    dev_pool.stop_holding()
Exemple #2
0
def _solve_cuda(lap_sparse, B, return_full_prob=False, maxiter=100, tol=5e-5):
    """
    solves lap_sparse X_i = B_i for each phase i, using the conjugate
    gradient method. For each pixel, the label i corresponding to the
    maximal X_i is returned.
    """
    print("using gpu mode")
    dev_pool = DeviceMemoryPool()
    pagelocked_pool = PageLockedMemoryPool()
    csr_mat = lap_sparse
    csr_mat = csr_mat.astype(np.float32)
    inv_mat_diag = 1 / csr_mat.diagonal()
    spmv = PacketedSpMV(csr_mat, True, csr_mat.dtype)
    X = []
    for i in range(len(B)):
        rhs = -B[i].astype(spmv.dtype)
        if True:
            precon = DiagonalPreconditioner(
                spmv.permute(
                    gpuarray.to_gpu(inv_mat_diag,
                                    allocator=dev_pool.allocate)))
        else:
            precon = None
        print("start solve")
        start = drv.Event()
        stop = drv.Event()
        start.record()
        rhs_gpu = gpuarray.to_gpu(rhs, dev_pool.allocate)
        tol = 1e-7 if spmv.dtype == np.float64 else tol
        res_gpu, it_count, res_count = solve_pkt_with_cg(
            spmv,
            rhs_gpu,
            precon,
            tol=tol,
            pagelocked_allocator=pagelocked_pool.allocate)
        res = res_gpu.get()
        stop.record()
        stop.synchronize()
        elapsed = stop.time_since(start) * 1e-3
        est_flops = (csr_mat.nnz * 2 * (it_count + res_count) +
                     csr_mat.shape[0] * (2 + 2 + 2 + 2 + 2) * it_count)
        if precon is not None:
            est_flops += csr_mat.shape[0] * it_count
        print("size: %d, elapsed: %g s, %d it, %d residual, it/second: %g, "
              "%g gflops/s" % (csr_mat.shape[0], elapsed, it_count, res_count,
                               it_count / elapsed, est_flops / elapsed / 1e9))
        x0 = res[0]
        X.append(x0)
    pagelocked_pool.stop_holding()
    dev_pool.stop_holding()
    if not return_full_prob:
        X = np.array(X)
        X = np.argmax(X, axis=0)
    return X
Exemple #3
0
    def test_mempool(self):
        from pycuda.tools import bitlog2
        from pycuda.tools import DeviceMemoryPool

        pool = DeviceMemoryPool()
        queue = []
        free, total = drv.mem_get_info()

        e0 = bitlog2(free)

        for e in range(e0 - 6, e0 - 4):
            for i in range(100):
                queue.append(pool.allocate(1 << e))
                if len(queue) > 10:
                    queue.pop(0)
        del queue
        pool.stop_holding()
Exemple #4
0
    def test_mempool(self):
        from pycuda.tools import bitlog2
        from pycuda.tools import DeviceMemoryPool

        pool = DeviceMemoryPool()
        maxlen = 10
        queue = []
        free, total = drv.mem_get_info()

        e0 = bitlog2(free)

        for e in range(e0-6, e0-4):
            for i in range(100):
                queue.append(pool.allocate(1<<e))
                if len(queue) > 10:
                    queue.pop(0)
        del queue
        pool.stop_holding()
Exemple #5
0
def test1d(wavelet='haar', use_float32=False, depth=1, num_rows=512, row_size=512,
           iterations=20, gpu_input=False, gpu_output=False, gpu_mempool=False):
    try:
        dtype = numpy.float64
        if use_float32:
            dtype = numpy.float32
        img = (numpy.array(scipy.misc.ascent(), dtype=dtype)-128.)/128.
        resized_img = resize(img, (num_rows, row_size), mode='constant')

        if gpu_input:
            cont_input_array = numpy.ascontiguousarray(resized_img, dtype=dtype)
            img_array_gpu = gpuarray.to_gpu(cont_input_array)
        else:
            img_array_gpu = resized_img

        if gpu_mempool:
            dev_mem_pool = DeviceMemoryPool()
            gpu_alloc = dev_mem_pool.allocate
        else:
            gpu_alloc = cuda.mem_alloc

        pwt = PycudaWaveletTransform(wavelet=wavelet, use_float32=use_float32)

        # Forward Transform
        print('---------FORWARD DWT---------')
        t = time.time()
        for _ in range(iterations):
            dec_cpu = pywt.wavedec(resized_img, wavelet=wavelet, mode='periodization', level=depth)
        t = time.time()-t
        print('PyWavelets:\t\t\t\t{:.3f} ms'.format((t*1000.)/iterations))

        t = time.time()
        for _ in range(iterations):
            dec_gpu = pwt.dwt1d(img_array_gpu, depth=depth, gpu_output=gpu_output, gpu_allocator=gpu_alloc)
        t = time.time()-t
        print('PycudaWaveletTransform:\t{:.3f} ms'.format((t*1000.)/iterations))

        for i, (d1, d2) in enumerate(zip(dec_gpu, dec_cpu)):
            if i == 0:
                result1 = d1.get() if gpu_output else d1
                result2 = d2
            else:
                result1 = numpy.concatenate((result1, d1.get() if gpu_output else d1), axis=1)
                result2 = numpy.concatenate((result2, d2), axis=1)
        print('RMSE: {} \n'.format(rmse(result1, result2)))

        dec_cpu_g = []
        if gpu_input:
            for d in dec_cpu:
                cont_array = numpy.ascontiguousarray(d, dtype=dtype)
                dec_cpu_g.append(gpuarray.to_gpu(cont_array))
        else:
            dec_cpu_g = dec_cpu

        # Inverse Transform
        print('---------INVERSE DWT---------')
        t = time.time()
        for _ in range(iterations):
            rec_cpu = pywt.waverec(dec_cpu, wavelet=wavelet, mode='periodization')
        t = time.time()-t
        print('PyWavelets:\t\t\t\t{:.3f} ms'.format((t*1000.)/iterations))

        t = time.time()
        for _ in range(iterations):
            rec_gpu = pwt.idwt1d(dec_cpu_g, gpu_output=gpu_output, gpu_allocator=gpu_alloc)
        t = time.time()-t
        print('PycudaWaveletTransform:\t{:.3f} ms'.format((t*1000.)/iterations))

        print('RMSE: {} '.format(rmse(rec_gpu.get() if gpu_output else rec_gpu, rec_cpu)))

        if gpu_mempool:
            dev_mem_pool.stop_holding()

    except Exception as e:
        tb = traceback.format_exc()
        print("%s",tb)
def main_cg():
    from optparse import OptionParser

    parser = OptionParser(
            usage="%prog [options] MATRIX-MARKET-FILE")
    parser.add_option("-s", "--is-symmetric", action="store_true",
            help="Specify that the input matrix is already symmetric")
    options, args = parser.parse_args()

    from pycuda.tools import DeviceMemoryPool, PageLockedMemoryPool
    dev_pool = DeviceMemoryPool()
    pagelocked_pool = PageLockedMemoryPool()

    from scipy.io import mmread
    csr_mat = mmread(args[0]).tocsr().astype(numpy.float32)

    inv_mat_diag = 1/csr_mat.diagonal()

    print "building..."
    from pycuda.sparse.packeted import PacketedSpMV
    spmv = PacketedSpMV(csr_mat, options.is_symmetric, csr_mat.dtype)
    rhs = numpy.random.rand(spmv.shape[0]).astype(spmv.dtype)

    from pycuda.sparse.operator import DiagonalPreconditioner
    if True:
        precon = DiagonalPreconditioner(
                spmv.permute(gpuarray.to_gpu(
                    inv_mat_diag, allocator=dev_pool.allocate)))
    else:
        precon = None

    from pycuda.sparse.cg import solve_pkt_with_cg
    print "start solve"
    for i in range(4):
        start = drv.Event()
        stop = drv.Event()
        start.record()

        rhs_gpu = gpuarray.to_gpu(rhs, dev_pool.allocate)
        res_gpu, it_count, res_count = \
                solve_pkt_with_cg(spmv, rhs_gpu, precon,
                        tol=1e-7 if spmv.dtype == numpy.float64 else 5e-5,
                        pagelocked_allocator=pagelocked_pool.allocate)
        res = res_gpu.get()

        stop.record()
        stop.synchronize()

        elapsed = stop.time_since(start)*1e-3
        est_flops = (csr_mat.nnz*2*(it_count+res_count)
            + csr_mat.shape[0]*(2+2+2+2+2)*it_count)

        if precon is not None:
            est_flops += csr_mat.shape[0] * it_count

        print "residual norm: %g" % (la.norm(csr_mat*res - rhs)/la.norm(rhs))
        print ("size: %d, elapsed: %g s, %d it, %d residual, it/second: %g, "
                "%g gflops/s" % (
                    csr_mat.shape[0],
                    elapsed, it_count, res_count, it_count/elapsed,
                    est_flops/elapsed/1e9))

    # TODO: mixed precision
    # TODO: benchmark
    pagelocked_pool.stop_holding()
    dev_pool.stop_holding()
Exemple #7
0
def test2d(wavelet='haar',
           use_float32=False,
           depth=1,
           num_slices=1,
           row_size=512,
           col_size=512,
           iterations=20,
           gpu_input=False,
           gpu_output=False,
           gpu_mempool=False):
    try:
        dtype = numpy.float64
        if use_float32:
            dtype = numpy.float32

        # Prepare Image Array
        img = (numpy.array(scipy.misc.ascent(), dtype=dtype) - 128.) / 128.
        resized_img = resize(img, (col_size, row_size), mode='constant')

        img_array = numpy.empty([num_slices, col_size, row_size], dtype=dtype)
        for s in range(num_slices):
            img_array[s, :, :] = resized_img[:, :]

        if gpu_input:
            cont_input_array = numpy.ascontiguousarray(img_array, dtype=dtype)
            img_array_gpu = gpuarray.to_gpu(cont_input_array)
        else:
            img_array_gpu = img_array

        if gpu_mempool:
            dev_mem_pool = DeviceMemoryPool()
            gpu_alloc = dev_mem_pool.allocate
        else:
            gpu_alloc = cuda.mem_alloc

        pwt = PycudaWaveletTransform(wavelet=wavelet, use_float32=use_float32)

        # Forward Transform
        print('---------FORWARD 2D DWT---------')
        t = time.time()
        for _ in range(iterations):
            dec_cpu = [
                pywt.wavedec2(img_array[s],
                              wavelet=wavelet,
                              mode='periodization',
                              level=depth) for s in range(num_slices)
            ]
        t = time.time() - t
        print('PyWavelets:\t\t\t\t{:.3f} ms'.format((t * 1000.) / iterations))

        t = time.time()
        for _ in range(iterations):
            dec_gpu = pwt.dwt2d(img_array_gpu,
                                depth=depth,
                                gpu_output=gpu_output,
                                gpu_allocator=gpu_alloc)
        t = time.time() - t
        print('PycudaWaveletTransform:\t{:.3f} ms'.format(
            (t * 1000.) / iterations))

        dec_cpu_g = []
        for ig, vg in enumerate(dec_gpu):
            if ig == 0:
                a = numpy.empty_like(vg.get() if gpu_output else vg)
                for ic, vc in enumerate(dec_cpu):
                    a[ic, :, :] = vc[0]
                dec_cpu_g.append(a)
            else:
                dl = []
                for id, vd in enumerate(vg):
                    d = numpy.empty_like(vd.get() if gpu_output else vd)
                    for ic, vc in enumerate(dec_cpu):
                        d[ic, :, :] = vc[ig][id]
                    dl.append(d)
                dec_cpu_g.append(dl)

        for i, (d1, d2) in enumerate(zip(dec_gpu, dec_cpu_g)):
            if i == 0:
                result1 = d1.get().flatten() if gpu_output else d1.flatten()
                result2 = d2.flatten()
            else:
                for d in d1:
                    result1 = numpy.concatenate(
                        (result1,
                         d.get().flatten() if gpu_output else d.flatten()))
                for d in d2:
                    result2 = numpy.concatenate((result2, d.flatten()))
        print('RMSE: {} \n'.format(rmse(result1, result2)))

        if gpu_input:
            for ig, vg in enumerate(dec_cpu_g):
                if ig == 0:
                    cont_array = numpy.ascontiguousarray(vg, dtype=dtype)
                    dec_cpu_g[ig] = gpuarray.to_gpu(cont_array)
                else:
                    for id, vd in enumerate(vg):
                        cont_array = numpy.ascontiguousarray(vd, dtype=dtype)
                        dec_cpu_g[ig][id] = gpuarray.to_gpu(cont_array)

        # Inverse Transform
        print('---------INVERSE 2D DWT---------')
        t = time.time()
        for _ in range(iterations):
            rec_cpu = [
                pywt.waverec2(d, wavelet=wavelet, mode='periodization')
                for d in dec_cpu
            ]
        t = time.time() - t
        print('PyWavelets:\t\t\t\t{:.3f} ms'.format((t * 1000.) / iterations))

        t = time.time()
        for _ in range(iterations):
            rec_gpu = pwt.idwt2d(dec_cpu_g,
                                 gpu_output=gpu_output,
                                 gpu_allocator=gpu_alloc)
        t = time.time() - t
        print('PycudaWaveletTransform:\t{:.3f} ms'.format(
            (t * 1000.) / iterations))

        rec_cpu_g = numpy.empty_like(rec_gpu.get() if gpu_output else rec_gpu)
        for ic, vc in enumerate(rec_cpu):
            rec_cpu_g[ic, :, :] = vc
        print('RMSE: {} '.format(
            rmse(rec_gpu.get() if gpu_output else rec_gpu, rec_cpu_g)))

        if gpu_mempool:
            dev_mem_pool.stop_holding()

    except Exception as e:
        tb = traceback.format_exc()
        print("%s", tb)