Example #1
def main():
    from optparse import OptionParser

    parser = OptionParser(usage="%prog [options] MATRIX-MARKET-FILE")
    parser.add_option("-s", "--is-symmetric", action="store_true",
            help="Specify that the input matrix is already symmetric")
    options, args = parser.parse_args()

    from pycuda.tools import DeviceMemoryPool, PageLockedMemoryPool
    dev_pool = DeviceMemoryPool()
    pagelocked_pool = PageLockedMemoryPool()

    from scipy.io import mmread
    csr_mat = mmread(args[0]).tocsr().astype(numpy.float32)

    inv_mat_diag = 1 / csr_mat.diagonal()

    print "building..."
    from pycuda.sparse.packeted import PacketedSpMV
    spmv = PacketedSpMV(csr_mat, options.is_symmetric, csr_mat.dtype)
    rhs = numpy.random.rand(spmv.shape[0]).astype(spmv.dtype)

    from pycuda.sparse.operator import DiagonalPreconditioner
    if True:
        precon = DiagonalPreconditioner(
            spmv.permute(
                gpuarray.to_gpu(inv_mat_diag, allocator=dev_pool.allocate)))
    else:
        precon = None

    from pycuda.sparse.cg import solve_pkt_with_cg
    print "start solve"
    for i in range(4):
        start = drv.Event()
        stop = drv.Event()
        start.record()

        rhs_gpu = gpuarray.to_gpu(rhs, dev_pool.allocate)
        res_gpu, it_count, res_count = \
                solve_pkt_with_cg(spmv, rhs_gpu, precon,
                        tol=1e-7 if spmv.dtype == numpy.float64 else 5e-5,
                        pagelocked_allocator=pagelocked_pool.allocate)
        res = res_gpu.get()

        stop.record()
        stop.synchronize()

        elapsed = stop.time_since(start) * 1e-3
        est_flops = (csr_mat.nnz * 2 * (it_count + res_count) +
                     csr_mat.shape[0] * (2 + 2 + 2 + 2 + 2) * it_count)

        if precon is not None:
            est_flops += csr_mat.shape[0] * it_count

        print "residual norm: %g" % (la.norm(csr_mat * res - rhs) /
                                     la.norm(rhs))
        print(
            "size: %d, elapsed: %g s, %d it, %d residual, it/second: %g, "
            "%g gflops/s" % (csr_mat.shape[0], elapsed, it_count, res_count,
                             it_count / elapsed, est_flops / elapsed / 1e9))

    # TODO: mixed precision
    # TODO: benchmark
    pagelocked_pool.stop_holding()
    dev_pool.stop_holding()
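Example #1 is a function body only; it assumes the usual module-level setup from PyCUDA's sparse solver demo. A minimal sketch of those surroundings (the import names match how the snippet uses them; pycuda.autoinit stands in for whatever context setup the original file does):

import numpy
import numpy.linalg as la

import pycuda.autoinit  # creates a CUDA context on import
import pycuda.driver as drv
import pycuda.gpuarray as gpuarray

if __name__ == "__main__":
    main()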
Example #2
def _solve_cuda(lap_sparse, B, return_full_prob=False, maxiter=100, tol=5e-5):
    """
    Solves lap_sparse X_i = B_i for each phase i using the conjugate
    gradient method on the GPU. For each pixel, the label i corresponding
    to the maximal X_i is returned.
    """
    print("using gpu mode")
    dev_pool = DeviceMemoryPool()
    pagelocked_pool = PageLockedMemoryPool()
    csr_mat = lap_sparse
    csr_mat = csr_mat.astype(np.float32)
    inv_mat_diag = 1 / csr_mat.diagonal()
    spmv = PacketedSpMV(csr_mat, True, csr_mat.dtype)
    X = []
    for i in range(len(B)):
        rhs = -B[i].astype(spmv.dtype)
        if True:
            precon = DiagonalPreconditioner(
                spmv.permute(
                    gpuarray.to_gpu(inv_mat_diag,
                                    allocator=dev_pool.allocate)))
        else:
            precon = None
        print("start solve")
        start = drv.Event()
        stop = drv.Event()
        start.record()
        rhs_gpu = gpuarray.to_gpu(rhs, dev_pool.allocate)
        tol = 1e-7 if spmv.dtype == np.float64 else tol
        res_gpu, it_count, res_count = solve_pkt_with_cg(
            spmv,
            rhs_gpu,
            precon,
            tol=tol,
            pagelocked_allocator=pagelocked_pool.allocate)
        res = res_gpu.get()
        stop.record()
        stop.synchronize()
        elapsed = stop.time_since(start) * 1e-3
        est_flops = (csr_mat.nnz * 2 * (it_count + res_count) +
                     csr_mat.shape[0] * (2 + 2 + 2 + 2 + 2) * it_count)
        if precon is not None:
            est_flops += csr_mat.shape[0] * it_count
        print("size: %d, elapsed: %g s, %d it, %d residual, it/second: %g, "
              "%g gflops/s" % (csr_mat.shape[0], elapsed, it_count, res_count,
                               it_count / elapsed, est_flops / elapsed / 1e9))
        # res_gpu.get() already returned the full solution vector;
        # keep it whole so the per-pixel argmax below works
        x0 = res
        X.append(x0)
    pagelocked_pool.stop_holding()
    dev_pool.stop_holding()
    if not return_full_prob:
        X = np.array(X)
        X = np.argmax(X, axis=0)
    return X
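A hedged sketch of how _solve_cuda might be driven, with a hypothetical 1-D Laplacian standing in for the real segmentation Laplacian (the function itself needs the module-level names used above: np, PacketedSpMV, solve_pkt_with_cg, the pool classes, and an active CUDA context):

import numpy as np
import scipy.sparse as sparse

# Toy stand-in: a small SPD tridiagonal system with one RHS per phase.
n = 256
lap_sparse = sparse.diags([-1.0, 2.0, -1.0], [-1, 0, 1],
                          shape=(n, n), format="csr")
B = [np.random.rand(n).astype(np.float32) for _ in range(2)]

labels = _solve_cuda(lap_sparse, B)
print(labels.shape)  # one label index per pixel: (256,)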
Example #3
    def __init__(self,
                 stream,
                 nx,
                 ny,
                 x_halo,
                 y_halo,
                 cpu_data=None,
                 dtype=np.float32):
        self.logger = logging.getLogger(__name__)
        self.nx = nx
        self.ny = ny
        self.x_halo = x_halo
        self.y_halo = y_halo

        nx_halo = nx + 2 * x_halo
        ny_halo = ny + 2 * y_halo

        #self.logger.debug("Allocating [%dx%d] buffer", self.nx, self.ny)
        #Should perhaps use pycuda.driver.mem_alloc_data.pitch() here
        self.data = pycuda.gpuarray.zeros((ny_halo, nx_halo), dtype)

        #For returning to download
        self.memorypool = PageLockedMemoryPool()

        #If we don't have any data, just allocate and return
        if cpu_data is None:
            return

        #Make sure data is in proper format
        assert cpu_data.shape == (ny_halo, nx_halo) \
            or cpu_data.shape == (self.ny, self.nx), \
            "Wrong shape of data %s vs %s / %s" % (
                str(cpu_data.shape), str((self.ny, self.nx)),
                str((ny_halo, nx_halo)))
        assert cpu_data.itemsize == 4, "Wrong size of data type"
        assert not np.isfortran(cpu_data), \
            "Wrong datatype (Fortran, expected C)"

        #Create copy object from host to device
        x = (nx_halo - cpu_data.shape[1]) // 2
        y = (ny_halo - cpu_data.shape[0]) // 2
        self.upload(stream,
                    cpu_data,
                    extent=[x, y, cpu_data.shape[1], cpu_data.shape[0]])
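Example #3 shows only the constructor; the upload it calls and the rest of the class appear in Example #7. A construction sketch under those assumptions (pycuda.autoinit provides the context):

import numpy as np
import pycuda.autoinit
import pycuda.driver as cuda

stream = cuda.Stream()
nx, ny, halo = 64, 48, 2
field = np.ones((ny, nx), dtype=np.float32)

# Interior-sized data is centered inside the halo by the constructor.
arr = CudaArray2D(stream, nx, ny, halo, halo, cpu_data=field)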
Example #4
def main_cg():
    from optparse import OptionParser

    parser = OptionParser(
            usage="%prog [options] MATRIX-MARKET-FILE")
    parser.add_option("-s", "--is-symmetric", action="store_true",
            help="Specify that the input matrix is already symmetric")
    options, args = parser.parse_args()

    from pycuda.tools import DeviceMemoryPool, PageLockedMemoryPool
    dev_pool = DeviceMemoryPool()
    pagelocked_pool = PageLockedMemoryPool()

    from scipy.io import mmread
    csr_mat = mmread(args[0]).tocsr().astype(numpy.float32)

    inv_mat_diag = 1/csr_mat.diagonal()

    print "building..."
    from pycuda.sparse.packeted import PacketedSpMV
    spmv = PacketedSpMV(csr_mat, options.is_symmetric, csr_mat.dtype)
    rhs = numpy.random.rand(spmv.shape[0]).astype(spmv.dtype)

    from pycuda.sparse.operator import DiagonalPreconditioner
    if True:
        precon = DiagonalPreconditioner(
                spmv.permute(gpuarray.to_gpu(
                    inv_mat_diag, allocator=dev_pool.allocate)))
    else:
        precon = None

    from pycuda.sparse.cg import solve_pkt_with_cg
    print "start solve"
    for i in range(4):
        start = drv.Event()
        stop = drv.Event()
        start.record()

        rhs_gpu = gpuarray.to_gpu(rhs, dev_pool.allocate)
        res_gpu, it_count, res_count = \
                solve_pkt_with_cg(spmv, rhs_gpu, precon,
                        tol=1e-7 if spmv.dtype == numpy.float64 else 5e-5,
                        pagelocked_allocator=pagelocked_pool.allocate)
        res = res_gpu.get()

        stop.record()
        stop.synchronize()

        elapsed = stop.time_since(start)*1e-3
        est_flops = (csr_mat.nnz*2*(it_count+res_count)
            + csr_mat.shape[0]*(2+2+2+2+2)*it_count)

        if precon is not None:
            est_flops += csr_mat.shape[0] * it_count

        print "residual norm: %g" % (la.norm(csr_mat*res - rhs)/la.norm(rhs))
        print ("size: %d, elapsed: %g s, %d it, %d residual, it/second: %g, "
                "%g gflops/s" % (
                    csr_mat.shape[0],
                    elapsed, it_count, res_count, it_count/elapsed,
                    est_flops/elapsed/1e9))

    # TODO: mixed precision
    # TODO: benchmark
    pagelocked_pool.stop_holding()
    dev_pool.stop_holding()
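The est_flops expression is a back-of-the-envelope operation count. Spelled out on made-up sizes (reading the (2+2+2+2+2) factor as five length-n vector operations at two flops per element is our interpretation, not stated in the original):

# Hypothetical problem sizes, purely illustrative
nnz, n = 1_000_000, 100_000
it_count, res_count = 80, 4

# One SpMV costs ~2 flops per nonzero (multiply + add); there is one
# SpMV per iteration plus one per residual evaluation.
spmv_flops = nnz * 2 * (it_count + res_count)

# Roughly five length-n vector ops (dots/axpys) per iteration,
# each ~2 flops per element: the (2+2+2+2+2) factor.
vector_flops = n * (2 + 2 + 2 + 2 + 2) * it_count

# The diagonal preconditioner adds one multiply per row per iteration.
precon_flops = n * it_count

est_flops = spmv_flops + vector_flops + precon_flops
print("%.2f GFLOP total" % (est_flops / 1e9))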
Example #5
    def __init__(self,
                 stream,
                 nx,
                 ny,
                 nz,
                 x_halo,
                 y_halo,
                 z_halo,
                 cpu_data=None,
                 dtype=np.float32):
        self.logger = logging.getLogger(__name__)
        self.nx = nx
        self.ny = ny
        self.nz = nz
        self.x_halo = x_halo
        self.y_halo = y_halo
        self.z_halo = z_halo

        nx_halo = nx + 2 * x_halo
        ny_halo = ny + 2 * y_halo
        nz_halo = nz + 2 * z_halo

        #self.logger.debug("Allocating [%dx%dx%d] buffer", self.nx, self.ny, self.nz)
        #Should perhaps use pycuda.driver.mem_alloc_data.pitch() here
        self.data = pycuda.gpuarray.zeros((nz_halo, ny_halo, nx_halo), dtype)

        #For returning to download
        self.memorypool = PageLockedMemoryPool()

        #If we don't have any data, just allocate and return
        if cpu_data is None:
            return

        #Make sure data is in proper format
        assert cpu_data.shape == (nz_halo, ny_halo, nx_halo) \
            or cpu_data.shape == (self.nz, self.ny, self.nx), \
            "Wrong shape of data %s vs %s / %s" % (
                str(cpu_data.shape), str((self.nz, self.ny, self.nx)),
                str((nz_halo, ny_halo, nx_halo)))
        assert cpu_data.itemsize == 4, "Wrong size of data type"
        assert not np.isfortran(cpu_data), \
            "Wrong datatype (Fortran, expected C)"

        #Create copy object from host to device
        copy = cuda.Memcpy3D()
        copy.set_src_host(cpu_data)
        copy.set_dst_device(self.data.gpudata)

        #Set offsets of destination (x offset in bytes, y and z in rows/slices)
        x_offset = (nx_halo - cpu_data.shape[2]) // 2
        y_offset = (ny_halo - cpu_data.shape[1]) // 2
        z_offset = (nz_halo - cpu_data.shape[0]) // 2
        copy.dst_x_in_bytes = x_offset * cpu_data.itemsize
        copy.dst_y = y_offset
        copy.dst_z = z_offset

        #Set pitch (bytes per row) and height (rows per slice) of both sides;
        #for a C-contiguous 3D array the row pitch is strides[1], not strides[0]
        copy.dst_pitch = self.data.strides[1]
        copy.dst_height = ny_halo
        copy.src_pitch = cpu_data.strides[1]
        copy.src_height = cpu_data.shape[1]

        #Set width in bytes to copy for each row and
        #number of rows to copy
        width = max(self.nx, cpu_data.shape[2])
        height = max(self.ny, cpu_data.shape[1])
        depth = max(self.nz, cpu_data.shape[0])
        copy.width_in_bytes = width * cpu_data.itemsize
        copy.height = height
        copy.depth = depth

        #Perform the copy
        copy(stream)
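Memcpy3D needs consistent pitch (bytes per row) and height (rows per slice) bookkeeping on both sides. A standalone sketch of the byte-layout arithmetic for the (nz_halo, ny_halo, nx_halo) float32 array used above (our illustration, not part of the original):

import numpy as np

nx_halo, ny_halo, nz_halo = 68, 68, 20
a = np.zeros((nz_halo, ny_halo, nx_halo), dtype=np.float32)

itemsize = a.itemsize       # 4 bytes for float32
row_pitch = a.strides[1]    # nx_halo * itemsize: bytes per row
slice_pitch = a.strides[0]  # ny_halo * row_pitch: bytes per z-slice

assert row_pitch == nx_halo * itemsize
assert slice_pitch == ny_halo * row_pitch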
Example #6
class CudaArray3D:
    """
    Uploads initial data to the CUDA device
    """
    def __init__(self,
                 stream,
                 nx,
                 ny,
                 nz,
                 x_halo,
                 y_halo,
                 z_halo,
                 cpu_data=None,
                 dtype=np.float32):
        self.logger = logging.getLogger(__name__)
        self.nx = nx
        self.ny = ny
        self.nz = nz
        self.x_halo = x_halo
        self.y_halo = y_halo
        self.z_halo = z_halo

        nx_halo = nx + 2 * x_halo
        ny_halo = ny + 2 * y_halo
        nz_halo = nz + 2 * z_halo

        #self.logger.debug("Allocating [%dx%dx%d] buffer", self.nx, self.ny, self.nz)
        #Should perhaps use pycuda.driver.mem_alloc_data.pitch() here
        self.data = pycuda.gpuarray.zeros((nz_halo, ny_halo, nx_halo), dtype)

        #For returning to download
        self.memorypool = PageLockedMemoryPool()

        #If we don't have any data, just allocate and return
        if cpu_data is None:
            return

        #Make sure data is in proper format
        assert cpu_data.shape == (nz_halo, ny_halo, nx_halo) \
            or cpu_data.shape == (self.nz, self.ny, self.nx), \
            "Wrong shape of data %s vs %s / %s" % (
                str(cpu_data.shape), str((self.nz, self.ny, self.nx)),
                str((nz_halo, ny_halo, nx_halo)))
        assert cpu_data.itemsize == 4, "Wrong size of data type"
        assert not np.isfortran(cpu_data), \
            "Wrong datatype (Fortran, expected C)"

        #Create copy object from host to device
        copy = cuda.Memcpy3D()
        copy.set_src_host(cpu_data)
        copy.set_dst_device(self.data.gpudata)

        #Set offsets of destination (x offset in bytes, y and z in rows/slices)
        x_offset = (nx_halo - cpu_data.shape[2]) // 2
        y_offset = (ny_halo - cpu_data.shape[1]) // 2
        z_offset = (nz_halo - cpu_data.shape[0]) // 2
        copy.dst_x_in_bytes = x_offset * cpu_data.itemsize
        copy.dst_y = y_offset
        copy.dst_z = z_offset

        #Set pitch (bytes per row) and height (rows per slice) of both sides;
        #for a C-contiguous 3D array the row pitch is strides[1], not strides[0]
        copy.dst_pitch = self.data.strides[1]
        copy.dst_height = ny_halo
        copy.src_pitch = cpu_data.strides[1]
        copy.src_height = cpu_data.shape[1]

        #Set width in bytes to copy for each row and
        #number of rows to copy
        width = max(self.nx, cpu_data.shape[2])
        height = max(self.ny, cpu_data.shape[1])
        depth = max(self.nz, cpu_data.shape[0])
        copy.width_in_bytes = width * cpu_data.itemsize
        copy.height = height
        copy.depth = depth

        #Perform the copy
        copy(stream)

        #self.logger.debug("Buffer <%s> [%dx%d]: Allocated ", int(self.data.gpudata), self.nx, self.ny)

    def __del__(self, *args):
        #self.logger.debug("Buffer <%s> [%dx%d]: Releasing ", int(self.data.gpudata), self.nx, self.ny)
        self.data.gpudata.free()
        self.data = None

    """
    Enables downloading data from GPU to Python
    """

    def download(self, stream, asynch=False):
        #self.logger.debug("Downloading [%dx%d] buffer", self.nx, self.ny)
        #Allocate host memory
        #cpu_data = cuda.pagelocked_empty((self.ny, self.nx), np.float32)
        #cpu_data = np.empty((self.nz, self.ny, self.nx), dtype=np.float32)
        cpu_data = self.memorypool.allocate((self.nz, self.ny, self.nx),
                                            dtype=np.float32)

        #Create copy object from device to host (Memcpy3D: this buffer is
        #3D, and Memcpy2D has no z offset or depth)
        copy = cuda.Memcpy3D()
        copy.set_src_device(self.data.gpudata)
        copy.set_dst_host(cpu_data)

        #Set offsets, pitch (bytes per row) and height (rows per slice)
        #of the source; for a C-contiguous 3D array the row pitch is
        #strides[1], not strides[0]
        copy.src_x_in_bytes = self.x_halo * cpu_data.itemsize
        copy.src_y = self.y_halo
        copy.src_z = self.z_halo
        copy.src_pitch = self.data.strides[1]
        copy.src_height = self.ny + 2 * self.y_halo

        #Destination host memory is tightly packed
        copy.dst_pitch = self.nx * cpu_data.itemsize
        copy.dst_height = self.ny

        #Set width in bytes to copy for each row,
        #number of rows per slice and number of slices to copy
        copy.width_in_bytes = self.nx * cpu_data.itemsize
        copy.height = self.ny
        copy.depth = self.nz

        copy(stream)
        if not asynch:
            stream.synchronize()

        return cpu_data
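A round-trip sketch for CudaArray3D, assuming the module-level imports the class relies on and a context from pycuda.autoinit:

import numpy as np
import pycuda.autoinit
import pycuda.driver as cuda

stream = cuda.Stream()
nx, ny, nz = 32, 32, 8
data = np.random.rand(nz, ny, nx).astype(np.float32)

vol = CudaArray3D(stream, nx, ny, nz, 1, 1, 1, cpu_data=data)
out = vol.download(stream)  # asynch=False, so this synchronizes
assert out.shape == (nz, ny, nx)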
Example #7
class CudaArray2D:
    """
    Uploads initial data to the CUDA device
    """
    def __init__(self,
                 stream,
                 nx,
                 ny,
                 x_halo,
                 y_halo,
                 cpu_data=None,
                 dtype=np.float32):
        self.logger = logging.getLogger(__name__)
        self.nx = nx
        self.ny = ny
        self.x_halo = x_halo
        self.y_halo = y_halo

        nx_halo = nx + 2 * x_halo
        ny_halo = ny + 2 * y_halo

        #self.logger.debug("Allocating [%dx%d] buffer", self.nx, self.ny)
        #Should perhaps use pycuda.driver.mem_alloc_data.pitch() here
        self.data = pycuda.gpuarray.zeros((ny_halo, nx_halo), dtype)

        #For returning to download
        self.memorypool = PageLockedMemoryPool()

        #If we don't have any data, just allocate and return
        if cpu_data is None:
            return

        #Make sure data is in proper format
        assert cpu_data.shape == (ny_halo, nx_halo) \
            or cpu_data.shape == (self.ny, self.nx), \
            "Wrong shape of data %s vs %s / %s" % (
                str(cpu_data.shape), str((self.ny, self.nx)),
                str((ny_halo, nx_halo)))
        assert cpu_data.itemsize == 4, "Wrong size of data type"
        assert not np.isfortran(cpu_data), \
            "Wrong datatype (Fortran, expected C)"

        #Create copy object from host to device
        x = (nx_halo - cpu_data.shape[1]) // 2
        y = (ny_halo - cpu_data.shape[0]) // 2
        self.upload(stream,
                    cpu_data,
                    extent=[x, y, cpu_data.shape[1], cpu_data.shape[0]])
        #self.logger.debug("Buffer <%s> [%dx%d]: Allocated ", int(self.data.gpudata), self.nx, self.ny)

    def __del__(self, *args):
        #self.logger.debug("Buffer <%s> [%dx%d]: Releasing ", int(self.data.gpudata), self.nx, self.ny)
        self.data.gpudata.free()
        self.data = None

    """
    Enables downloading data from GPU to Python
    """

    def download(self, stream, cpu_data=None, asynch=False, extent=None):
        if (extent is None):
            x = self.x_halo
            y = self.y_halo
            nx = self.nx
            ny = self.ny
        else:
            x, y, nx, ny = extent

        if (cpu_data is None):
            #self.logger.debug("Downloading [%dx%d] buffer", self.nx, self.ny)
            #Allocate host memory
            #The following fails, don't know why (crashes python)
            #cpu_data = cuda.pagelocked_empty((self.ny, self.nx), np.float32)
            #Non-pagelocked: cpu_data = np.empty((ny, nx), dtype=np.float32)
            cpu_data = self.memorypool.allocate((ny, nx), dtype=np.float32)

        assert nx == cpu_data.shape[1]
        assert ny == cpu_data.shape[0]
        assert x + nx <= self.nx + 2 * self.x_halo
        assert y + ny <= self.ny + 2 * self.y_halo

        #Create copy object from device to host
        copy = cuda.Memcpy2D()
        copy.set_src_device(self.data.gpudata)
        copy.set_dst_host(cpu_data)

        #Set offsets and pitch of source
        copy.src_x_in_bytes = int(x) * self.data.strides[1]
        copy.src_y = int(y)
        copy.src_pitch = self.data.strides[0]

        #Set width in bytes to copy for each row and
        #number of rows to copy
        copy.width_in_bytes = int(nx) * cpu_data.itemsize
        copy.height = int(ny)

        copy(stream)
        if not asynch:
            stream.synchronize()

        return cpu_data

    def upload(self, stream, cpu_data, extent=None):
        if (extent is None):
            x = self.x_halo
            y = self.y_halo
            nx = self.nx
            ny = self.ny
        else:
            x, y, nx, ny = extent

        assert (nx == cpu_data.shape[1])
        assert (ny == cpu_data.shape[0])
        assert (x + nx <= self.nx + 2 * self.x_halo)
        assert (y + ny <= self.ny + 2 * self.y_halo)

        #Create copy object from host to device
        copy = cuda.Memcpy2D()
        copy.set_dst_device(self.data.gpudata)
        copy.set_src_host(cpu_data)

        #Set offsets and pitch of destination
        copy.dst_x_in_bytes = int(x) * self.data.strides[1]
        copy.dst_y = int(y)
        copy.dst_pitch = self.data.strides[0]

        #Set width in bytes to copy for each row and
        #number of rows to copy
        copy.width_in_bytes = int(nx) * cpu_data.itemsize
        copy.height = int(ny)

        copy(stream)
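And the 2D equivalent for Example #7, again a sketch assuming pycuda.autoinit and the module imports used by the class:

import numpy as np
import pycuda.autoinit
import pycuda.driver as cuda

stream = cuda.Stream()
nx, ny, halo = 64, 48, 2
field = np.random.rand(ny, nx).astype(np.float32)

arr = CudaArray2D(stream, nx, ny, halo, halo, cpu_data=field)
out = arr.download(stream)  # blocks until the copy completes
assert np.allclose(out, field)  # the interior round-trips unchanged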