Example #1
    def __init__(self, device):

        self.api_id = get_id()

        self._device = device
        self.max_work_group_size = device.max_threads_per_block
        self.max_work_item_sizes = [
            device.max_block_dim_x, device.max_block_dim_y,
            device.max_block_dim_z
        ]

        self.max_num_groups = [
            device.max_grid_dim_x, device.max_grid_dim_y, device.max_grid_dim_z
        ]

        # there is no corresponding constant in the API at the moment
        self.local_mem_banks = 16 if device.compute_capability()[0] < 2 else 32

        self.warp_size = device.warp_size

        devdata = DeviceData(device)
        self.min_mem_coalesce_width = {
            size: devdata.align_words(word_size=size)
            for size in [4, 8, 16]}
        self.local_mem_size = device.max_shared_memory_per_block

        self.compute_units = device.multiprocessor_count
Example #2
    def __init__(self, device):

        self.api_id = get_id()

        self._device = device
        self.max_work_group_size = device.max_threads_per_block
        self.max_work_item_sizes = [
            device.max_block_dim_x,
            device.max_block_dim_y,
            device.max_block_dim_z]

        self.max_num_groups = [
            device.max_grid_dim_x,
            device.max_grid_dim_y,
            device.max_grid_dim_z]

        # there is no corresponding constant in the API at the moment
        self.local_mem_banks = 16 if device.compute_capability()[0] < 2 else 32

        self.warp_size = device.warp_size

        devdata = DeviceData(device)
        self.min_mem_coalesce_width = {
            size: devdata.align_words(word_size=size) for size in [4, 8, 16]}
        self.local_mem_size = device.max_shared_memory_per_block

        self.compute_units = device.multiprocessor_count
Example #3
    def __init__(self, device, stream, mempool):

        self._stream = stream
        self._recreate_stream = stream is None

        devdata = DeviceData(device)

        self.min_mem_coalesce_width = {}
        for size in [4, 8, 16]:
            self.min_mem_coalesce_width[size] = devdata.align_words(
                word_size=size)

        self.num_smem_banks = devdata.smem_granularity
        self.max_registers = device.get_attribute(
            device_attribute.MAX_REGISTERS_PER_BLOCK)
        self.max_grid_x = 2**log2(
            device.get_attribute(device_attribute.MAX_GRID_DIM_X))
        self.max_grid_y = 2**log2(
            device.get_attribute(device_attribute.MAX_GRID_DIM_Y))
        self.max_block_size = device.get_attribute(
            device_attribute.MAX_BLOCK_DIM_X)
        self.max_shared_mem = device.get_attribute(
            device_attribute.MAX_SHARED_MEMORY_PER_BLOCK)

        if mempool is None:
            self.allocate = cuda.mem_alloc
        else:
            self._mempool = mempool
            self.allocate = mempool.allocate
Example #4
def _splay_backend(n, dev):
    # heavily modified from cublas
    from pycuda.tools import DeviceData
    devdata = DeviceData(dev)

    min_threads = devdata.warp_size
    max_threads = 128
    max_blocks = 4 * devdata.thread_blocks_per_mp \
            * dev.get_attribute(drv.device_attribute.MULTIPROCESSOR_COUNT)

    if n < min_threads:
        block_count = 1
        threads_per_block = min_threads
    elif n < (max_blocks * min_threads):
        block_count = (n + min_threads - 1) // min_threads
        threads_per_block = min_threads
    elif n < (max_blocks * max_threads):
        block_count = max_blocks
        grp = (n + min_threads - 1) // min_threads
        threads_per_block = ((grp + max_blocks - 1) // max_blocks) * min_threads
    else:
        block_count = max_blocks
        threads_per_block = max_threads

    # print "n:%d bc:%d tpb:%d" % (n, block_count, threads_per_block)
    return (block_count, 1), (threads_per_block, 1, 1)
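A minimal usage sketch for the splay helper above (hedged: assumes a context created via pycuda.autoinit; the element count 10**6 is only illustrative):

import pycuda.autoinit
import pycuda.driver as drv

# returns ((block_count, 1), (threads_per_block, 1, 1)) for an n-element array
grid, block = _splay_backend(10**6, pycuda.autoinit.device)
print(grid, block)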
Example #5
def setDevice(ndev=None):
    '''To use CUDA or OpenCL you need a context and a device to establish
    the communication.'''
    cuda.init()
    nDevices = cuda.Device.count()
    print("Available Devices:")
    for i in range(nDevices):
        dev = cuda.Device(i)
        try:
            mem = cuda.mem_get_info()[-i - 1]
        except Exception:
            mem = 0
        print("  Device {0}: {1}, Total (MB) {2:.1f}, Free (MB) {3:.1f}".format(
            i, dev.name(), dev.total_memory() / 2.**20, mem / 2.**20))
    devNumber = 0
    if nDevices > 1:
        if ndev is None:
            devNumber = int(input("Select device number: "))
        else:
            devNumber = ndev
    dev = cuda.Device(devNumber)
    #cuda.Context.pop()  # Disable previous CUDA context
    ctxCUDA = dev.make_context()
    devdata = DeviceData(dev)
    print("Using device {0}: {1}".format(devNumber, dev.name()))
    return ctxCUDA, dev, devdata
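A brief usage sketch for setDevice (hedged: assumes pycuda.driver is imported as cuda and DeviceData comes from pycuda.tools, as the snippet implies):

ctxCUDA, dev, devdata = setDevice(0)   # pick device 0 without prompting
# ... build kernels, allocate memory, launch work ...
ctxCUDA.pop()                          # release the context when finished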
Example #6
    def __init__(self, mat, dtype):
        self.dtype = np.dtype(dtype)
        self.index_dtype = np.dtype(np.int32)
        self.shape = mat.shape

        self.block_size = 128

        from scipy.sparse import coo_matrix
        coo_mat = coo_matrix(mat, dtype=self.dtype)

        self.row_gpu = gpuarray.to_gpu(coo_mat.row.astype(self.index_dtype))
        self.col_gpu = gpuarray.to_gpu(coo_mat.col.astype(self.index_dtype))
        self.data_gpu = gpuarray.to_gpu(coo_mat.data)
        self.nnz = coo_mat.nnz

        from pycuda.tools import DeviceData
        dev = drv.Context.get_device()
        devdata = DeviceData()
        max_threads = (devdata.warps_per_mp * devdata.warp_size *
                       dev.multiprocessor_count)
        max_blocks = 4 * max_threads // self.block_size
        warps_per_block = self.block_size // dev.warp_size

        if self.nnz:

            def divide_into(x, y):
                return (x + y - 1) // y

            num_units = self.nnz // dev.warp_size
            num_warps = min(num_units, warps_per_block * max_blocks)
            self.num_blocks = divide_into(num_warps, warps_per_block)
            num_iters = divide_into(num_units, num_warps)

            self.interval_size = dev.warp_size * num_iters
            self.tail = num_units * dev.warp_size
Example #7
def orinfo(n):
    orec = OccupancyRecord(DeviceData(), n)
    return """occupancy record information
        thread blocks per multiprocessor - %d
        warps per multiprocessor - %d
        limited by - %s
        occupancy - %f
    """ % (orec.tb_per_mp, orec.warps_per_mp, orec.limited_by, orec.occupancy)
Example #8
def make_GPU_gradient(mesh, context):
    '''Prepare to compute gradient on the GPU w.r.t. the given mesh.
    Return gradient function.
    '''
    mx = int(getattr(mesh, 'nx', 1))
    my = int(getattr(mesh, 'ny', 1))
    mz = int(getattr(mesh, 'nz', 1))

    dxInv = np.array(1./getattr(mesh, 'dx', 1), dtype=np.float64)
    dyInv = np.array(1./getattr(mesh, 'dy', 1), dtype=np.float64)
    dzInv = np.array(1./getattr(mesh, 'dz', 1), dtype=np.float64)

    sizeof_double = 8
    with open(where + 'gradient2.cu') as fdlib:
        source = fdlib.read()
    module = SourceModule(source)

    mx_ptr = module.get_global("mx")[0]
    my_ptr = module.get_global("my")[0]
    mz_ptr = module.get_global("mz")[0]
    cuda.memcpy_htod(mx_ptr, np.array(mx, dtype=np.int32))
    cuda.memcpy_htod(my_ptr, np.array(my, dtype=np.int32))
    cuda.memcpy_htod(mz_ptr, np.array(mz, dtype=np.int32))

    dxInv_ptr = module.get_global("dxInv")[0]
    dyInv_ptr = module.get_global("dyInv")[0]
    dzInv_ptr = module.get_global("dzInv")[0]
    cuda.memcpy_htod(dxInv_ptr, dxInv)
    cuda.memcpy_htod(dyInv_ptr, dyInv)
    cuda.memcpy_htod(dzInv_ptr, dzInv)

    deriv_x = module.get_function("gradient_x")
    deriv_y = module.get_function("gradient_y")
    deriv_z = module.get_function("gradient_z")

    block, grid = mesh.get_domain_decomposition(DeviceData().max_threads)

    d_deriv_x = gpuarray.empty(shape=(1, mesh.n_nodes), dtype=np.float64)
    d_deriv_y = gpuarray.empty_like(d_deriv_x)
    d_deriv_z = gpuarray.empty_like(d_deriv_x)

    def _gradient(scalar_values):
        '''Calculate three-dimensional gradient for GPUArray
        scalar_values.
        '''
        deriv_x(scalar_values, d_deriv_x, block=block, grid=grid)
        deriv_y(scalar_values, d_deriv_y, block=block, grid=grid)
        deriv_z(scalar_values, d_deriv_z, block=block, grid=grid)
        context.synchronize()

        return (d_deriv_x, d_deriv_y, d_deriv_z)[:mesh.dimension]
    return _gradient
Example #9
	def __init__(self, device, stream, mempool):

		self._stream = stream
		self._recreate_stream = stream is None

		devdata = DeviceData(device)

		self.min_mem_coalesce_width = {}
		for size in [4, 8, 16]:
			self.min_mem_coalesce_width[size] = devdata.align_words(word_size=size)

		self.num_smem_banks = devdata.smem_granularity
		self.max_registers = device.get_attribute(device_attribute.MAX_REGISTERS_PER_BLOCK)
		self.max_grid_x = 2 ** log2(device.get_attribute(device_attribute.MAX_GRID_DIM_X))
		self.max_grid_y = 2 ** log2(device.get_attribute(device_attribute.MAX_GRID_DIM_Y))
		self.max_block_size = device.get_attribute(device_attribute.MAX_BLOCK_DIM_X)
		self.max_shared_mem = device.get_attribute(device_attribute.MAX_SHARED_MEMORY_PER_BLOCK)

		if mempool is None:
			self.allocate = cuda.mem_alloc
		else:
			self._mempool = mempool
			self.allocate = mempool.allocate
Example #10
    def _compile_gpu(self):

        codegen = CudaKernelGenerator(self)

        # verify all gpuarray have equal length
        num = [len(self[key]) for key in codegen.args]
        assert num.count(num[0]) == len(num), "Variables have unequal length."
        num = num[0]

        try:
            mod = SourceModule(codegen.src, options=["--ptxas-options=-v"])
            kernel = mod.get_function(self.__class__.__name__)
            kernel.prepare(codegen.arg_ctype)
        except Exception:
            for i, line in enumerate(codegen.src.split('\n')):
                print("{}: {}".format(i, line))
            raise

        deviceData = DeviceData()
        maxThreads = int(deviceData.registers // kernel.num_regs)
        maxThreads = int(2**int(np.log2(maxThreads)))
        threadsPerBlock = int(min(256, maxThreads, deviceData.max_threads))
        numBlocks = (num - 1) // threadsPerBlock + 1
        deviceNumBlocks = 6 * drv.Context.get_device().MULTIPROCESSOR_COUNT

        block = (threadsPerBlock, 1, 1)
        grid = (int(min(numBlocks, deviceNumBlocks)), 1)

        address = [self[key].gpudata for key in codegen.args]

        self.gpu = SimpleNamespace(
            args=codegen.args,
            arg_address=address,
            arg_ctype=codegen.arg_ctype,
            block=block,
            grid=grid,
            kernel=kernel,
            num_thread=num,
            src=codegen.src)

        self.update = self._update_gpu
Example #11
    def __init__(self, mat, is_symmetric, dtype):
        from pycuda.tools import DeviceData

        devdata = DeviceData()

        # all row indices in the data structure generation code are
        # "unpermuted" unless otherwise specified
        self.dtype = np.dtype(dtype)
        self.index_dtype = np.int32
        self.packed_index_dtype = np.uint32
        self.threads_per_packet = devdata.max_threads

        h, w = self.shape = mat.shape
        if h != w:
            raise ValueError("only square matrices are supported")

        self.rows_per_packet = (devdata.shared_memory -
                                100) // (2 * self.dtype.itemsize)

        self.block_count = (h + self.rows_per_packet -
                            1) // self.rows_per_packet

        # get metis partition -------------------------------------------------
        from scipy.sparse import csr_matrix

        csr_mat = csr_matrix(mat, dtype=self.dtype)

        from pymetis import part_graph

        if not is_symmetric:
            # make sure adjacency graph is undirected
            adj_mat = csr_mat + csr_mat.T
        else:
            adj_mat = csr_mat

        while True:
            cut_count, dof_to_packet_nr = part_graph(int(self.block_count),
                                                     xadj=adj_mat.indptr,
                                                     adjncy=adj_mat.indices)

            # build packet_nr_to_dofs
            packet_nr_to_dofs = {}
            for i, packet_nr in enumerate(dof_to_packet_nr):
                try:
                    dof_packet = packet_nr_to_dofs[packet_nr]
                except KeyError:
                    packet_nr_to_dofs[packet_nr] = dof_packet = []

                dof_packet.append(i)

            packet_nr_to_dofs = [
                packet_nr_to_dofs.get(i) for i in range(len(packet_nr_to_dofs))
            ]

            too_big = False
            for packet_dofs in packet_nr_to_dofs:
                if len(packet_dofs) >= self.rows_per_packet:
                    too_big = True
                    break

            if too_big:
                old_block_count = self.block_count
                self.block_count = int(2 + 1.05 * self.block_count)
                print(("Metis produced a big block at block count "
                       "%d--retrying with %d" %
                       (old_block_count, self.block_count)))
                continue

            break

        assert len(packet_nr_to_dofs) == self.block_count

        # permutations, base rows ---------------------------------------------
        (
            new2old_fetch_indices,
            old2new_fetch_indices,
            packet_base_rows,
        ) = self.find_simple_index_stuff(packet_nr_to_dofs)

        # find local row cost and remaining_coo -------------------------------
        local_row_costs, remaining_coo = self.find_local_row_costs_and_remaining_coo(
            csr_mat, dof_to_packet_nr, old2new_fetch_indices)
        local_nnz = np.sum(local_row_costs)

        assert remaining_coo.nnz == csr_mat.nnz - local_nnz

        # find thread assignment for each block -------------------------------
        thread_count = len(packet_nr_to_dofs) * self.threads_per_packet
        thread_assignments, thread_costs = self.find_thread_assignment(
            packet_nr_to_dofs, local_row_costs, thread_count)

        max_thread_costs = np.max(thread_costs)

        # build data structure ------------------------------------------------
        from .pkt_build import build_pkt_data_structure

        build_pkt_data_structure(
            self,
            packet_nr_to_dofs,
            max_thread_costs,
            old2new_fetch_indices,
            csr_mat,
            thread_count,
            thread_assignments,
            local_row_costs,
        )

        self.packet_base_rows = gpuarray.to_gpu(packet_base_rows)
        self.new2old_fetch_indices = gpuarray.to_gpu(new2old_fetch_indices)
        self.old2new_fetch_indices = gpuarray.to_gpu(old2new_fetch_indices)

        from .coordinate import CoordinateSpMV

        self.remaining_coo_gpu = CoordinateSpMV(remaining_coo, dtype)
Example #12
from pycuda import gpuarray as gu
from pycuda import driver
from pycuda.cumath import fabs
from pycuda.tools import DeviceData
from pytom.tompy.tools import paste_in_center

from typing import Union, Tuple

from pycuda.reduction import ReductionKernel
from pycuda.elementwise import ElementwiseKernel
from pycuda.compiler import SourceModule
from skcuda.misc import sum, max, min
from skcuda.fft import fft, ifft, Plan
from skcuda.linalg import conj

max_threads = DeviceData().max_threads

cc_mod = SourceModule("""
__global__ void update_scores_angles(float *scores, float *angles, float *ccc_map, float angleId, int num_elements, int dimx)
{ 
   const int idx = (threadIdx.x + blockIdx.x*dimx)*dimx;

   
   for (int i=0; i < dimx; i++) {
       if (idx +i < num_elements){
           if (scores[idx+i] < ccc_map[idx+i]) {
               scores[idx+i] = ccc_map[idx+i];
               angles[idx+i] = angleId;
            }
       }       
   }
}
""")
Example #13
            'gridDim.z'
        ]
        upVar = [
            str(cuB[0]),
            str(cuB[1]),
            str(cuB[2]),
            str(cuG[0]),
            str(cuG[1]),
            str(cuG[2])
        ]
        dicVarOptim = dict(zip(downVar, upVar))
        for i in downVar:
            kFile = kFile.replace(i, dicVarOptim[i])
    if compiling:
        kFile = SourceModule(kFile, include_dirs=[myDir])
    return kFile


def getFreeMemory(show=True):
    '''Return the free memory of the device. Useful for keeping track of
    available device memory.'''
    Mb = 1024. * 1024.
    Mbytes = float(cuda.mem_get_info()[0]) / Mb
    if show:
        print("Free Global Memory: %f Mbytes" % Mbytes)

    return Mbytes


ctx, device = setDevice()
devdata = DeviceData(device)
Example #14
import pycuda.autoinit
from pycuda.compiler import SourceModule
from pycuda.tools import DeviceData

dd = DeviceData()
print(dd.shared_memory)
print(dd.max_threads)
print(dir(dd))

rows = 1000
cols = 1000
nagents = 10000
WALL_COLLISION_REWARD = -1.1
ROBOT_COLLISION_REWARD = -3
GOAL_REWARD = 2
view_size = 11
binwidth = 100
from math import ceil

binr = ceil(rows / binwidth)
binc = ceil(cols / binwidth)
nbins = binr * binc


def findNextPowerOf2(n):
    # Round n up to the next power of two (returns n itself if it is already
    # a power of two). Repeatedly clears the lowest set bit of n - 1 until a
    # single bit remains, then doubles it. Assumes n >= 2.
    n = n - 1
    while n & n - 1:
        n = n & n - 1
    return n << 1
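A quick check of the helper above with a few illustrative values (hedged: the bit trick returns 0 for n = 1):

print(findNextPowerOf2(10))   # 16
print(findNextPowerOf2(16))   # 16
print(findNextPowerOf2(100))  # 128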