Example #1
    def execute(self):
        # Zero the scratch buffer that the update-output kernels
        # accumulate into via atomics
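        # the count is in 32-bit words: one per element of U (assumed float32)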
        drv.memset_d32(self.kernels[0][5], 0, int(np.prod(self.U.shape)))

        for kernel in self.kernels:
            kernel[0].prepared_async_call(*kernel[1:])
Example #2
def time_inference(engine, batch_size):
    assert engine.get_nb_bindings() == 2

    input_index = engine.get_binding_index(INPUT_LAYERS[0])
    output_index = engine.get_binding_index(OUTPUT_LAYERS[0])

    input_dim = engine.get_binding_dimensions(input_index).to_DimsCHW()
    output_dim = engine.get_binding_dimensions(output_index).to_DimsCHW()

    insize = batch_size * input_dim.C() * input_dim.H() * input_dim.W() * 4
    outsize = batch_size * output_dim.C() * output_dim.H() * output_dim.W() * 4

    d_input = cuda.mem_alloc(insize)
    d_output = cuda.mem_alloc(outsize)

    bindings = [int(d_input), int(d_output)]

    context = engine.create_execution_context()
    context.set_profiler(G_PROFILER)

    cuda.memset_d32(d_input, 0, insize // 4)

    for i in range(TIMING_INTERATIONS):
        context.execute(batch_size, bindings)

    context.destroy()
    return
Example #3
def main(context, stream, plan1, N1, N2, g_buf1, g_buf2):
    # N1 = number of FFTs applied
    # N2 = length of each FFT
    x = np.linspace(0, 2 * np.pi, N2)

    y = np.sin(2 * x)
    ys = y
    ys = ys.reshape(1,N2)
    y = np.concatenate((y,np.zeros(nearest_2power(N2)-N2)))
    y = y.reshape(1,nearest_2power(N2))

    for i in xrange(N1-1): #append N1-1 sines
        yi = np.sin(2 * (i+2) * x)
        yis = yi
        yis = yis.reshape(1,N2)
        ys = np.concatenate(((ys),(yis)),0)
        yi = np.concatenate((yi,np.zeros(nearest_2power(N2)-N2)))
        yi = yi.reshape(1,nearest_2power(N2))
        y = np.concatenate(((y),(yi)),0)
        
    y = y.transpose()
    yim= np.zeros(y.shape)
    y = np.array(y,np.float64)
    yw = y.transpose()
    yimw = yim.transpose()

    aw = np.fft.fft(ys,int(nearest_2power(N2)),1)
    bw = np.real(np.fft.ifft(aw,int(nearest_2power(N2)),1))
    aw0 = np.fft.fft(y,int(nearest_2power(N2)),0)
    bw0 = np.real(np.fft.ifft(aw0,int(nearest_2power(N2)),0))
    
    gpu_testmat = gpuarray.to_gpu(y)
    gpu_testmatim = gpuarray.to_gpu(yim)
    plan1.execute(gpu_testmat, gpu_testmatim, batch=N1) 
    gfft = gpu_testmat.get() #get fft result
    plan1.execute(gpu_testmat, gpu_testmatim, inverse=True, batch=N1) 
    gifft = np.real(gpu_testmat.get()) #get ifft result
    
    cuda.memcpy_htod(g_buf1, y)
    # an all-zero bit pattern is 0.0 as a float64; memset_d32 writes 32-bit
    # words, so we need twice as many words as there are float64 elements
    cuda.memset_d32(g_buf2, 0, yim.size*2)
    plan1.execute(g_buf1, g_buf2, batch=N1) 
    grfft=np.empty_like(y)
    cuda.memcpy_dtoh(grfft, g_buf1)  #fft result
    plan1.execute(g_buf1, g_buf2, inverse=True, batch=N1) 
    grifft=np.empty_like(y)
    cuda.memcpy_dtoh(grifft, g_buf1) #ifft result

    if Plot:
        np.set_printoptions(threshold=np.nan)
        
        #plot cuda fft results
        f, axarr = plt.subplots(5, sharex=False)
        axarr[0].plot(y)
        axarr[1].plot(gfft)
        axarr[2].plot(gifft)
        axarr[3].plot(grfft)
        axarr[4].plot(grifft)
        plt.show()
        raise SystemExit    
Example #4
def pycuda_zeros(arr, shape):
    if arr is None or arr.shape != shape:
        arr = gpuarray.zeros(shape, dtype=np.float32)
    elif not isinstance(arr, gpuarray.GPUArray):
        arr = to_gpuarray(arr)
    pycu.memset_d32(arr.gpudata, 0, arr.size)
    return arr
Example #6
    def _malloc_impl(self, nbytes):
        import pycuda.driver as cuda

        # Allocate
        data = cuda.mem_alloc(nbytes)

        # Zero
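        # (the count is in 32-bit words, hence nbytes // 4; nbytes is assumed
        # to be a multiple of 4)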
        cuda.memset_d32(data, 0, nbytes // 4)

        return data
Example #8
	def gfx_init( self ) :
		try :
			print 'compiling'
			self.prog = sh.compile_program_vfg( 'shad/balls' )

			print 'compiled'

			self.loc_mmv = sh.get_loc(self.prog,'modelview' )
			self.loc_mp  = sh.get_loc(self.prog,'projection')
			self.l_color = sh.get_loc(self.prog,'color'     )
			self.l_size  = sh.get_loc(self.prog,'ballsize'  )

		except ValueError as ve :
			print "Shader compilation failed: " + str(ve)
			sys.exit(0)    

#        glUseProgram( self.prog )
#        glUniform1i( pointsid , 0 );
#        glUseProgram( 0 )

		#
		# cuda init
		#
		self.grid = (int(self.BOX),int(self.BOX))
		self.block = (1,1,int(self.BOX))

		print 'CUDA: block %s , grid %s' % (str(self.block),str(self.grid))
#        print cuda_driver.device_attribute.MAX_THREADS_PER_BLOCK
#        print cuda_driver.device_attribute.MAX_BLOCK_DIM_X
#        print cuda_driver.device_attribute.MAX_BLOCK_DIM_Y
#        print cuda_driver.device_attribute.MAX_BLOCK_DIM_Z

		floatbytes = np.dtype(np.float32).itemsize

		self.gpos = glGenBuffers(1)
		glBindBuffer( GL_ARRAY_BUFFER , self.gpos )
		glBufferData( GL_ARRAY_BUFFER , self.pos.nbytes, self.pos, GL_STREAM_DRAW )
		glBindBuffer( GL_ARRAY_BUFFER , 0 )

		self.df1 = cuda_driver.mem_alloc( self.f.nbytes )
		self.df2 = cuda_driver.mem_alloc( self.f.nbytes )

		cuda_driver.memcpy_htod( self.df1 , self.f )
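		# zero df2: the count is in 32-bit words, one per (presumably float32) entry of f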
		cuda_driver.memset_d32( self.df2 , 0 , self.NUM*self.Q )

		mod = cuda_driver.module_from_file( 'lbm_kernel.cubin' )

		self.collision = mod.get_function("collision_step")
		self.collision.prepare( "Piii" )

		self.streaming = mod.get_function("streaming_step")
		self.streaming.prepare( "PPiii" )

		self.colors = mod.get_function("colors")
		self.colors.prepare( "PPiii" )
Example #9
def execute(positions, num_particles, num_frames):
    #Get host positions:
    cpuPos = numpy.array(positions, dtype=numpy.float32)
    #Allocate position space on device:
    devPos = cuda.mem_alloc(cpuPos.nbytes)
    #Copy positions:
    cuda.memcpy_htod(devPos, cpuPos)

    #Allocate device velocities:
    devVels = cuda.mem_alloc(2 * num_particles * numpy.float32().nbytes)
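    # one 32-bit word per float32 velocity component, so the word count
    # equals the element count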
    cuda.memset_d32(devVels, 0, 2 * num_particles)
    # #Copy velocities:
    # cuda.memcpy_htod(devVels, cpuVels)

    #Allocate and initialize device in-bounds flags to true:
    #inBounds = numpy.zeros(num_particles, dtype=bool)
    devInBounds = cuda.mem_alloc(num_particles * numpy.bool8().nbytes)
    cuda.memset_d8(devInBounds, True, num_particles)

    # inB = numpy.zeros(num_particles, dtype=numpy.bool)
    # cuda.memcpy_dtoh(inB, devInBounds)
    # print inB

    # cuda.memcpy_htod(devInBounds, inBounds)
    # numBlocks = 1#(num_particles // 512) + 1;
    grid_dim = ((num_particles // NUM_THREADS) + 1, 1)
    print grid_dim
    runframe = module.get_function("runframe")
    frames = [None] * num_frames
    for i in range(num_frames):
        runframe(devPos,
                 devVels,
                 devInBounds,
                 numpy.int32(num_particles),
                 grid=grid_dim,
                 block=(NUM_THREADS, 1, 1))
        #Get the positions from device:
        cuda.memcpy_dtoh(cpuPos, devPos)
        frames[i] = cpuPos.copy()
        #frames[i] = copy(cpuPos)
        #write_frame(out, cpuPos, num_particles)

    #Simulation destination file:
    # out = open(OUTPUT_FILE, 'w')
    # write_header(out, num_particles)
    # for frame in frames:
    #     write_frame(out, frame, num_particles)

    #clean up...
    #out.close()
    devPos.free()
    devVels.free()
    devInBounds.free()
Example #11
def leapfrogStationary(d_x, d_t, v, xmin, xmax, alpha):

  # --- Allocate device memory space for solution
  d_u  = cuda.mem_alloc((N + 1) * (M + 1) * 4)
  d_u1 = cuda.mem_alloc((N + 1)           * 4)
  d_u2 = cuda.mem_alloc((N + 1)           * 4)
  d_u3 = cuda.mem_alloc((N + 1)           * 4)
  # --- Set memory to zero
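  # (counts are in 32-bit words: one word per float32 element)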
  cuda.memset_d32(d_u , 0x00, (N + 1) * (M + 1))
  cuda.memset_d32(d_u1, 0x00, (N + 1)          )
  cuda.memset_d32(d_u2, 0x00, (N + 1)          )
  cuda.memset_d32(d_u3, 0x00, (N + 1)          )
  # u     = np.zeros(((M + 1), N + 1))

  blockDim  = (BLOCKSIZE, 1, 1)
  gridDim   = (int(iDivUp(N + 1, BLOCKSIZE)), 1, 1)

  # --- Step0
  setStep0(d_u1, d_u, d_t, d_x, np.float32(v), np.float32(xmin), np.float32(xmax), np.int32(N), block = blockDim, grid = gridDim)

  # --- Step1
  setStep1(d_u1, d_u2, d_u, d_t, d_x, np.float32(v), np.float32(xmin), np.float32(xmax), np.float32(alpha), np.float32(dt), np.int32(N), block = blockDim, grid = gridDim)

  for l in range(1, M - 1):

    updateShared(d_u1, d_u2, d_u3, d_u, d_t, d_x, np.float32(v), np.float32(xmin), np.float32(xmax), np.float32(alpha), np.int32(l), np.int32(N), block = blockDim, grid = gridDim)
    # updateNoShared(d_u1, d_u2, d_u3, d_u, d_t, d_x, np.float32(v), np.float32(xmin), np.float32(xmax), np.float32(alpha), np.int32(l), np.int32(N), block = blockDim, grid = gridDim)
    # updateNoSharedNotWorking(d_u1, d_u2, d_u3, d_u, d_t, d_x, np.float32(v), np.float32(xmin), np.float32(xmax), np.float32(alpha), np.int32(l), np.int32(N), block = blockDim, grid = gridDim)

    cuda.memcpy_dtod(d_u1, d_u2, (N + 1) * 4)
    cuda.memcpy_dtod(d_u2, d_u3, (N + 1) * 4)

  return d_u
Example #12
def render():
  global invViewMatrix_h, c_invViewMatrix
  global gl_PBO, cuda_PBO
  global width_GL, height_GL, density, brightness, transferOffset, transferScale
  global block2D_GL, grid2D_GL
  global tex, transferTex
  global testData_d
  cuda.memcpy_htod( c_invViewMatrix,  invViewMatrix_h)
  for i in range(nTextures):
    if i == 0: tex.set_array(plotData_dArray)
    if i == 1: tex.set_array(plotData_dArray_1)
    # map PBO to get CUDA device pointer
    cuda_PBO_map = cuda_PBO[i].map()
    cuda_PBO_ptr, cuda_PBO_size = cuda_PBO_map.device_ptr_and_size()
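    # clear the PBO: one 32-bit word per pixel (assuming an RGBA8 buffer)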
    cuda.memset_d32( cuda_PBO_ptr, 0, width_GL*height_GL )
    renderKernel( np.intp(cuda_PBO_ptr), np.int32(width_GL), np.int32(height_GL), density, brightness, transferOffset, transferScale, grid=grid2D_GL, block = block2D_GL, texrefs=[tex, transferTex] )
    cuda_PBO_map.unmap()
Example #13
    def _assign(self, value):

        if isinstance(value, (int, float)):

            # if we have a contiguous array, then use the speedy driver kernel
            if self.is_contiguous:

                value = self.dtype.type(value)

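                # reinterpret the scalar's raw bytes as an unsigned integer of
                # matching width for the driver-level memset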
                if self.dtype.itemsize == 1:
                    drv.memset_d8( self.gpudata,
                                   unpack_from('B', value)[0],
                                   self.size)
                elif self.dtype.itemsize == 2:
                    drv.memset_d16(self.gpudata,
                                   unpack_from('H', value)[0],
                                   self.size)
                else:
                    drv.memset_d32(self.gpudata,
                                   unpack_from('I', value)[0],
                                   self.size)

            # otherwise use our copy kernel
            else:
                OpTreeNode.build("assign", self, value)

        elif isinstance(value, GPUTensor):
            # TODO: add an is_binary_compat like function
            if self.is_contiguous and value.is_contiguous and self.dtype == value.dtype:
                drv.memcpy_dtod(self.gpudata, value.gpudata, self.nbytes)
            else:
                OpTreeNode.build("assign", self, value)

        # collapse and execute an op tree as a kernel
        elif isinstance(value, OpTreeNode):
            OpTreeNode.build("assign", self, value)

        # assign to numpy array (same as set())
        elif isinstance(value, np.ndarray):
            self.set(value)

        else:
            raise TypeError("Invalid type for assignment: %s" % type(value))

        return self
Example #14
    def __init__(self,
                 kernel_set="fgemm_int64_wide32",
                 locks=1024,
                 calc_partials=True,
                 bench=False):

        m = re.search(r'wide(\d+)', kernel_set)
        if m:
            self.width = int(m.group(1))
        else:
            raise ValueError("Invalid kernel_set")

        self.locks = locks
        self.module = drv.module_from_file("kernels/" + kernel_set + ".cubin")
        self.mode = 0 if calc_partials else 4
        self.fgemm = dict()
        for op in ("nt", "nn", "tn"):
            mod = self.module.get_function(kernel_set + "_" + op)
            mod.prepare("PPPIIIIIIHH")
            self.fgemm[op] = mod

        fprop_conv = self.module.get_function("fprop_conv_float32_K64N64T64")
        fprop_conv.prepare("PPPIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII")
        self.fgemm["fprop_conv"] = fprop_conv

        bprop_conv = self.module.get_function(
            "bprop_conv_float32_CRST64N64T64")
        bprop_conv.prepare("PPPPIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII")
        self.fgemm["bprop_conv"] = bprop_conv

        update_conv = self.module.get_function(
            "update_conv_float32_CRST64K64T64")
        update_conv.prepare("PPPPIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII")
        self.fgemm["update_conv"] = update_conv

        self.gpulock = drv.mem_alloc(locks * 4)
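        # one 32-bit lock word per lock; the memset count is in words, not bytes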
        drv.memset_d32(self.gpulock, 0, locks)

        self.bench = bench
        if bench:
            self.start = drv.Event()
            self.end = drv.Event()
Example #16
def leapfrog(d_x, d_t, v, alpha):

  # --- Allocate device memory space for solution
  d_u = cuda.mem_alloc((N + 1) * (M + 1) * 4)
  # --- Set memory to zero
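  # (count is in 32-bit words: one per float32 element)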
  cuda.memset_d32(d_u, 0x00, (N + 1) * (M + 1))

  # --- Initial condition
  blockDim  = (BLOCKSIZE, 1, 1)
  gridDim   = (int(iDivUp(N + 1, BLOCKSIZE)), 1, 1)
  initialConditionKernel(d_u, d_t, d_x, np.float32(v), np.int32(N), block = blockDim, grid = gridDim)

  # --- First step
  matsunoFirstStep(d_u, d_t, d_x, np.float32(v), np.float32(alpha), np.int32(N), block = blockDim, grid = gridDim)

  Q = (1. - alpha) / (1. + alpha)
  for l in range(1, M):
    updateKernel(d_u, d_t, d_x, np.float32(v), np.float32(alpha), np.float32(Q), np.int32(l), np.int32(N), block = blockDim, grid = gridDim)    # --- Boundary condition

  return d_u
Example #17
    def __init__(self, backend, dtype, ioshape, initval, iopacking, tags):
        super(CUDAMatrixBase, self).__init__(backend, ioshape, iopacking, tags)

        # Data type info
        self.dtype = dtype
        self.itemsize = np.dtype(dtype).itemsize

        # Dimensions
        nrow, ncol = backend.compact_shape(ioshape, iopacking)
        self.nrow = nrow
        self.ncol = ncol

        # Compute the size, in bytes, of the minor dimension
        colsz = self.ncol*self.itemsize

        if 'align' in tags:
            # Allocate a 2D array aligned to the major dimension
            self.data, self.pitch = cuda.mem_alloc_pitch(colsz, nrow,
                                                         self.itemsize)
            self._nbytes = nrow*self.pitch

            # Ensure that the pitch is a multiple of itemsize
            assert (self.pitch % self.itemsize) == 0
        else:
            # Allocate a standard, tightly packed, array
            self._nbytes = colsz*nrow
            self.data = cuda.mem_alloc(self._nbytes)
            self.pitch = colsz

        self.leaddim = self.pitch // self.itemsize
        self.leadsubdim = self.soa_shape[-1]
        self.traits = (nrow, self.leaddim, self.leadsubdim, self.dtype)

        # Zero the entire matrix (incl. slack)
        assert (self._nbytes % 4) == 0
        cuda.memset_d32(self.data, 0, self._nbytes // 4)

        # Process any initial values
        if initval is not None:
            self.set(initval)
Example #18
def cu_lpf(stimulus, dt, freq):
    """
    CUDA implementation of low-pass-filter.

    stimulus: ndarray
        The input to be filtered.
    dt: float
        The sampling interval of the input.
    freq: float
        The cut-off frequency of the low pass filter.
    """
    num = len(stimulus)
    num_fft = int(num / 2 + 1)
    idtype = stimulus.dtype
    odtype = np.complex128 if idtype == np.float64 else np.complex64

    if not isinstance(stimulus, gpuarray.GPUArray):
        d_stimulus = gpuarray.to_gpu(stimulus)
    else:
        d_stimulus = stimulus

    plan = Plan(stimulus.shape, idtype, odtype)
    d_fstimulus = gpuarray.empty(num_fft, odtype)
    fft(d_stimulus, d_fstimulus, plan)

    df = 1.0 / dt / num
    idx = int(freq // df)

    unit = int(d_fstimulus.dtype.itemsize / 4)
    offset = int(d_fstimulus.gpudata) + d_fstimulus.dtype.itemsize * idx

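    # zero every spectral bin from the cut-off index upward; the count is in
    # 32-bit words, with `unit` words per complex sample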
    cuda.memset_d32(offset, 0, unit * (num_fft - idx))

    plan = Plan(stimulus.shape, odtype, idtype)
    d_lpf_stimulus = gpuarray.empty(num, idtype)
    ifft(d_fstimulus, d_lpf_stimulus, plan, False)

    return d_lpf_stimulus.get()
Example #20
"""Animation (skip if GPU)."""

anim = animation.ArtistAnimation(fig, imSequence, interval = 50, blit = True)
# anim.save('waveEquation2D.mp4')

rc('animation', html = 'jshtml')
anim

"""Allocate solution on device."""

# --- Allocate device memory space for solution
d_u     = cuda.mem_alloc(Nx * Ny * 8)
d_uold  = cuda.mem_alloc(Nx * Ny * 8)
d_unew  = cuda.mem_alloc(Nx * Ny * 8)
# --- Set memory to zero (the buffers hold float64 values, i.e. two 32-bit words per element)
cuda.memset_d32(d_u,    0x00, Nx * Ny * 2)
cuda.memset_d32(d_uold, 0x00, Nx * Ny * 2)
cuda.memset_d32(d_unew, 0x00, Nx * Ny * 2)

"""Transfering the initial condition from host to device."""

cuda.memcpy_htod(d_uold, u_old)
cuda.memcpy_htod(d_u,    u)

"""Solution at the subsequent steps."""

fig = plt.figure()

blockDim  = (BLOCKSIZEX, BLOCKSIZEY, 1)
gridDim   = (int(iDivUp(Nx, BLOCKSIZEX)), int(iDivUp(Ny, BLOCKSIZEY)), 1)
Example #21
def basic_add_performance_2():
    """Measures memory latency for certain operations."""

    base_src = Template("""
    .entry $FNAME ( .param .u32 out )
    {
        .reg .u32 base, off, clka, clkb, clkoa, clkob, clks, tmp, iter;
        .reg .pred p;

        mov.u32         iter,   $RUNS;
        mov.u32         clks,   0;
        mov.u32         tmp,    0;

        ld.const.u32    base,   [scratch];
        $MULT
        mov.u32         lcg_state,  scratch;

    warmup:
        mov.u32         clka,   %clock;
        $OPER
        sub.u32         iter,   iter,   1;
        setp.ne.u32     p,      iter,   0;
    @p  bra.uni         warmup;

        mov.u32         clkoa,  %clock;
        mov.u32         iter,   $RUNS;
    loop:
        //call.uni        (tmp),  lcg_rounds, (100);
        $LCGROUNDS
        mov.u32         clka,   %clock;
        $OPER
        xor.b32         clka,   clka,   tmp;
        mov.u32         clkb,   %clock;
        xor.b32         clka,   clka,   tmp;
        sub.u32         clka,   clkb,   clka;
        add.u32         clks,   clks,   clka;
        sub.u32         iter,   iter,   1;
        setp.ne.u32     p,      iter,   0;
    @p  bra.uni         loop;
        mov.u32         clkob,  %clock;
        sub.u32         clkoa,  clkob,  clkoa;

        mov.u32         iter,   $RUNS;
    cooldown:
        $OPER
        sub.u32         iter,   iter,   1;
        setp.ne.u32     p,      iter,   0;
    @p  bra.uni         cooldown;

        ld.param.u32    base,   [out];
        call.uni        (off),  get_gtid,   ();
        shr.u32         off,    off,    5;
        mad24.lo.u32    base,   off,    8,  base;
        call.uni        (tmp),  lcg_rounds, (1);
        st.volatile.global.b32  [base], tmp;
        st.volatile.global.b32  [base], clks;

        add.u32         base,   base,   4;
        st.global.b32   [base], clkoa;
    }
    """)

    addrtypes = {
            'single': {'label': "all conflicts",  'ADDRTYPE': "single",
                       'MULT': "mov.u32 off, %smid;" +
                               "mad24.lo.u32 base, off, 128, base;"},
            'uncoa':  {'label': "uncoalesced",    'ADDRTYPE': "uncoa",
                       'MULT': "call.uni        (off),  get_gtid,   ();" +
                               "mad24.lo.u32 base, off, 128, base;"},
            'coa':    {'label': "coalesced",      'ADDRTYPE': "coa",
                       'MULT': "call.uni        (off),  get_gtid,   ();" +
                               "mad24.lo.u32 base, off, 4, base;"},
            }

    # Evil, I know, DRY and all
    addrtypesorder = ['single', 'uncoa', 'coa']

    opertypes = {
            'atomic':       "atom.global.add.u32 tmp, [base], tmp;",
            'red':          "red.global.add.u32     [base], clks;",
            'store':        "st.global.u32 [base], clks;",
            'load':         "ld.global.u32 tmp, [base];",
            'load_store': """
                ld.global.u32 tmp, [base];
                add.u32 tmp, tmp, clks;
                st.global.u32 [base], tmp;
                """
            }

    opertypesorder = ['load', 'store', 'load_store', 'red', 'atomic']

    lcgtext = "mad.lo.u32  lcg_state,  lcg_state, 1664525, 1013904223;\n"*50

    order = []
    for va in addrtypesorder:
        for k in sorted(opertypes.keys()):
            order.append((va, k))

    runs = 512
    rounds = 4
    mod = stdlib + "\n.const .u32 scratch;"
    for (addr, oper) in order:
        c = dict(addrtypes[addr])
        c['otype'] = oper
        c['OPER'] = opertypes[oper]
        c['RUNS'] = runs
        c['FNAME'] = "%s_%s" % (addr, oper)
        c['LCGROUNDS'] = lcgtext
        mod += base_src.substitute(c)
    for i in enumerate(mod.split('\n')):
        print "%3d %s" % i
    disassemble(mod)
    mod = cuda.module_from_buffer(mod)
    figs = []
    barwidth = 0.3

    scratch = cuda.mem_alloc(1024*16*30*128)
    scratchptr = mod.get_global('scratch')
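    # write the scratch buffer's device address into the 32-bit constant
    # 'scratch' (a memset of a single word)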
    cuda.memset_d32(scratchptr[0], int(scratch), 1)

    def plot(title, names, vals, errs):
        N=len(vals[0])
        bw=2*.9/len(names)
        fig = plt.figure()
        ax = fig.add_subplot(111, title=title)
        ax.set_ylabel('Clocks')
        ax.set_xlabel('Warps/SM')
        ax.set_xticks(range(N))
        ax.set_xticklabels([1<<i for i in range(N)])
        for idx, (name,val,err) in enumerate(zip(names, vals, errs)):
            ax.bar([i+bw*(idx/2)-.45 for i in range(N)], val, bw, yerr=err,
                     color=colors[idx], label=name, zorder=-idx)
        ax.axis(ymin=0)
        ax.legend(loc=0)
        return fig

    for addr in addrtypesorder:
        addrlbl = addrtypes[addr]['label']
        print "Access pattern:", addrlbl
        interms, interes, totalms, totales = [], [], [], []
        for operidx, oper in enumerate(opertypesorder):
            interm, intere, totalm, totale = [], [], [], []
            for dim in ((1, 1), (2, 1), (4, 1), (8, 1), (8, 2), (8, 4)):
                vals = numpy.zeros( (dim[0] * dim[1] * 30, 2) )
                fn = mod.get_function('%s_%s' % (addr, oper))
                for round in range(rounds+1):
                    a = numpy.zeros_like(vals).astype(numpy.int32)
                    fn(cuda.InOut(a), block=(32 * dim[0], 1, 1),
                                      grid=(30 * dim[1], 1))
                    if round != 0: vals += a
                    time.sleep(.005)
                means = scipy.mean(vals, axis=0) / (runs*rounds)
                stds = scipy.std(vals, axis=0) / (runs*rounds)
                # this is just gross
                interm.append(means[0])
                totalm.append(means[1])
                intere.append(stds[0])
                totale.append(stds[1])
                print "%16s: %1.7f±%1.6f" % (oper, means[0], stds[0])
                print "%16s: %1.7f±%1.6f" % (oper+' total', means[1], stds[1])
            interms.append(interm)
            interes.append(intere)
            interms.append(totalm)
            interes.append(totale)

        names = []
        for i in opertypesorder:
            names.append(i)
            names.append(i + ' total')

        fig1 = plot('Compute memory latency, %s access pattern' % addrlbl,
                    names, interms, interes)
        figs.append((addr, fig1))

    return figs
Example #22
def kmeans(objects, numClusters, threshold):
    """
    objects: numCoords x numObjs
    """

    event = cuda.Event()
    """ Step 0 cast to float, copy to device """
    objects = objects.astype(np.float32)
    objects_gpu = cuda.mem_alloc(objects.nbytes)
    cuda.memcpy_htod(objects_gpu, objects)
    numCoords, numObjs = objects.shape
    """ Step 1. Load cuda module """
    src = open("cuda_kmeans.cu").read()
    mod = SourceModule(src, include_dirs=[os.getcwd()])
    find_nearest_cluster = mod.get_function("find_nearest_cluster")
    compute_delta = mod.get_function("compute_delta")
    reduce_clusterSize = mod.get_function("reduce_clusterSize")
    reduce_centroids = mod.get_function("reduce_centroids")
    update_centroids_clusterSize = mod.get_function(
        "update_centroids_clusterSize")
    """ Step 2. define some constant """
    # For find_nearest_cluster
    threadsPer_FNC_Block = 128
    num_FNC_Blocks = int(math.ceil(float(numObjs) / threadsPer_FNC_Block))
    # SDSize = shared memory size
    FNC_SDSize = threadsPer_FNC_Block * 2 + numClusters * numCoords * 4
    # For compute_delta
    threadsPer_CD_Block = 128 if num_FNC_Blocks > 128 else nextPowerOfTwo(
        num_FNC_Blocks)
    num_CD_Blocks = int(math.ceil(float(num_FNC_Blocks) / threadsPer_CD_Block))
    CD_SDSize = threadsPer_CD_Block * 4
    """ Step 3. Init centroids using first K elements, define some variables """
    centroids = init_centroids(objects, numClusters)
    centroids_gpu = cuda.mem_alloc(centroids.nbytes)
    cuda.memcpy_htod(centroids_gpu, centroids)

    _, interm_gpu = getHostDevicePair((num_FNC_Blocks, ), np.int32,
                                      0)  # interm means intermediate
    membership, membership_gpu = getHostDevicePair(
        (numObjs, ), np.int32, -1)  # initialize membership to -1
    reduceInterm, reduceInterm_gpu = getHostDevicePair((num_CD_Blocks, ),
                                                       np.int32, 0)
    clusterSize, clusterSize_gpu = getHostDevicePair((numClusters, ), np.int32,
                                                     0)
    # seg means segregated
    segClusterSize, segClusterSize_gpu = getHostDevicePair(
        (num_FNC_Blocks, numClusters), np.int32, 0)
    _, segCentroids_gpu = getHostDevicePair(
        (num_FNC_Blocks, numCoords, numClusters), np.int32, 0)

    for loop in range(500):
        find_nearest_cluster(np.int32(numCoords),
                             np.int32(numObjs),
                             np.int32(numClusters),
                             objects_gpu,
                             centroids_gpu,
                             membership_gpu,
                             interm_gpu,
                             block=(threadsPer_FNC_Block, 1, 1),
                             grid=(num_FNC_Blocks, 1),
                             shared=FNC_SDSize)
        event.synchronize()
        """validating centroids"""
        """
        cuda.memcpy_dtoh(membership, membership_gpu)   
        cent_valid = np.zeros_like(centroids)
        clusterSize_valid = np.zeros_like(clusterSize)
        for i in range(numObjs):
            clusterSize_valid[membership[i]] += 1
            cent_valid[:,membership[i]] += objects[:,i] 
        
        cent_valid = cent_valid / clusterSize_valid
        print("\nvalid")
        print(cent_valid)
        """

        compute_delta(interm_gpu,
                      reduceInterm_gpu,
                      np.int32(num_FNC_Blocks),
                      block=(threadsPer_CD_Block, 1, 1),
                      grid=(num_CD_Blocks, 1),
                      shared=CD_SDSize)
        event.synchronize()

        cuda.memcpy_dtoh(reduceInterm, reduceInterm_gpu)
        event.synchronize()
        delta = reduceInterm.sum()

        # reset clusterSize, centroids, and their per-block (seg) partials to 0
        cuda.memset_d32(clusterSize_gpu, 0, numClusters)
        cuda.memset_d32(segClusterSize_gpu, 0, num_FNC_Blocks * numClusters)
        cuda.memset_d32(centroids_gpu, 0, numCoords * numClusters)
        cuda.memset_d32(segCentroids_gpu, 0,
                        num_FNC_Blocks * numCoords * numClusters)

        event.synchronize()

        update_centroids_clusterSize(objects_gpu,
                                     membership_gpu,
                                     segCentroids_gpu,
                                     segClusterSize_gpu,
                                     np.int32(numCoords),
                                     np.int32(numObjs),
                                     np.int32(numClusters),
                                     block=(threadsPer_FNC_Block, 1, 1),
                                     grid=(num_FNC_Blocks, 1))
        event.synchronize()

        reduce_clusterSize(segClusterSize_gpu,
                           clusterSize_gpu,
                           np.int32(num_FNC_Blocks),
                           np.int32(numClusters),
                           block=(numClusters, 1, 1))
        event.synchronize()

        reduce_centroids(segCentroids_gpu,
                         centroids_gpu,
                         clusterSize_gpu,
                         np.int32(num_FNC_Blocks),
                         np.int32(numClusters),
                         np.int32(numCoords),
                         block=(numClusters, 1, 1),
                         grid=(numCoords, 1))
        event.synchronize()
        """
        cuda.memcpy_dtoh(centroids, centroids_gpu)    
        print("computed centroids")
        print(centroids)         
        """

        delta /= float(numObjs)
        if delta <= threshold:
            break

    loop += 1
    cuda.memcpy_dtoh(centroids, centroids_gpu)
    #print(centroids)

    print("Looped for", loop, "iterations")
    return centroids
Example #23
    def __init__(self,
                 default_dtype=np.float32,
                 stochastic_round=False,
                 deterministic=None,
                 device_id=0,
                 bench=False,
                 scratch_size=0,
                 hist_bins=64,
                 hist_offset=-48,
                 compat_mode=None,
                 enable_winograd=True,
                 cache_dir=os.path.join(os.path.expanduser('~'),
                                        'nervana/cache')):
        if default_dtype not in [np.float16, np.float32]:
            raise ValueError('Default data type for nervanagpu '
                             'backend must be float16 or 32')

        if default_dtype is np.float32:
            if stochastic_round:
                if stochastic_round is True:
                    raise ValueError('Default rounding bit width is not '
                                     'supported for fp32.  Please specify '
                                     'number of bits to round to.')
                logger.warn(
                    'Using 32 bit floating point and setting stochastic '
                    'rounding to %d bits' % stochastic_round)

        # context
        drv.init()
        self.device_type = 1
        self.device_id = device_id if device_id is not None else 0
        self.ctx = drv.Device(device_id).make_context()

        # super class init
        super(NervanaGPU, self).__init__(default_dtype,
                                         compat_mode=compat_mode,
                                         deterministic=deterministic)

        # log
        logger.info("Initialized NervanaGPU")

        # stochastic_round
        assert stochastic_round is False, "Are you sure about using SR globally in the backend?"
        if stochastic_round:
            if stochastic_round is True:
                stochastic_round = 10
        else:
            stochastic_round = 0

        # attributes
        self.scratch_size = scratch_size
        self.scratch_offset = 0
        self.round_mode = stochastic_round
        self.bench = bench
        self.stream = None
        self.buf = {}
        self.buf_active = {}
        self.warmup = False

        # store histograms for batched memcpy
        self.hist_bins = hist_bins
        self.hist_offset = hist_offset
        self.hist_map = dict()
        self.hist_idx = 0
        self.hist_max = 4 * 4096
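        # zero hist_max histograms of hist_bins 32-bit counters each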
        self.hist_base = drv.mem_alloc(self.hist_bins * self.hist_max * 4)
        drv.memset_d32(self.hist_base, 0, self.hist_bins * self.hist_max)

        self.compute_capability = (4, 0)
        self.use_cudac_kernels = True

        self.enable_winograd = enable_winograd
        self.cache_dir = cache_dir
        if not os.path.isdir(self.cache_dir):
            os.makedirs(self.cache_dir)
Example #24
    def __init__(self, jitfunc1, jitfunc2, fd1_d, fd2_d, model, dx, source_dt,
                 sources, pad_width, pml_width=None):
        super(PycudaPropagator, self).__init__(model.astype(np.float32),
                                               np.float32(dx),
                                               np.float32(source_dt),
                                               sources,
                                               np.int32(pad_width),
                                               pml_width=pml_width)
        self.jitfunc1 = jitfunc1
        self.jitfunc2 = jitfunc2

        # allocate and copy model to GPU

        self.model.padded_property_gpu = {}
        self.model.padded_property_gpu['vp2dt2'] = \
                drv.mem_alloc(self.model.padded_property['vp2dt2'].nbytes)
        drv.memcpy_htod(self.model.padded_property_gpu['vp2dt2'],
                self.model.padded_property['vp2dt2'])

        # allocate and initialize wavefields
        self.wavefield.current_gpu = \
                drv.mem_alloc(self.wavefield.current.nbytes)
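        # wavefields are float32, so the element count equals the 32-bit word count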
        drv.memset_d32(self.wavefield.current_gpu, 0,
                       self.wavefield.current.size)
        self.wavefield.previous_gpu = \
                drv.mem_alloc(self.wavefield.previous.nbytes)
        drv.memset_d32(self.wavefield.previous_gpu, 0,
                       self.wavefield.previous.size)

        # allocate and initialize PML arrays
        self.pml.sigma_gpu = []
        for dim in range(self.geometry.ndim):
            self.pml.phi[dim].current_gpu = \
                    drv.mem_alloc(self.pml.phi[dim].current.nbytes)
            drv.memset_d32(self.pml.phi[dim].current_gpu, 0,
                           self.pml.phi[dim].current.size)
            self.pml.phi[dim].previous_gpu = \
                    drv.mem_alloc(self.pml.phi[dim].previous.nbytes)
            drv.memset_d32(self.pml.phi[dim].previous_gpu, 0,
                           self.pml.phi[dim].previous.size)
            self.pml.sigma_gpu.append(drv.mem_alloc(self.pml.sigma[dim].nbytes))
            drv.memcpy_htod(self.pml.sigma_gpu[dim], self.pml.sigma[dim])

        # allocate and copy sources arrays
        self.sources.amplitude_gpu \
                = drv.mem_alloc(self.sources.amplitude.nbytes)
        drv.memcpy_htod(self.sources.amplitude_gpu,
                        self.sources.amplitude)

        self.sources.padded_locations_gpu \
                = drv.mem_alloc(self.sources.padded_locations.nbytes)
        drv.memcpy_htod(self.sources.padded_locations_gpu,
                        self.sources.padded_locations)


        # create and copy finite difference coeffs to constant memory
        self.fd1_d = fd1_d
        fd1 = np.array([8/12, -1/12], np.float32) / dx
        drv.memcpy_htod(self.fd1_d, fd1)

        self.fd2_d = fd2_d
        if self.geometry.ndim == 1:
            fd2 = np.array([-5/2, 4/3, -1/12], np.float32) / dx**2
        elif self.geometry.ndim == 2:
            fd2 = np.array([-10/2, 4/3, -1/12], np.float32) / dx**2
        drv.memcpy_htod(self.fd2_d, fd2)

        # set block and grid dimensions
        threadsperblockx = 32
        blockspergridx = ((self.geometry.propagation_shape_padded[-1]
                           + (threadsperblockx - 1))
                          // threadsperblockx)
        if self.geometry.ndim == 1:
            threadsperblockz = 1
            blockspergridz = self.sources.num_shots
        elif self.geometry.ndim == 2:
            threadsperblockz = 32
            blockspergridz = ((self.geometry.propagation_shape_padded[-2]
                               + (threadsperblockz - 1))
                              // threadsperblockz) * self.sources.num_shots

        self.griddim = int(blockspergridx), int(blockspergridz)
        self.blockdim = int(threadsperblockx), int(threadsperblockz), 1
Example #26
def ssf_cuda(q, r, block_size=128, copy=True):
    import pycuda.driver as cuda
    import pycuda.gpuarray as ga
    from time import time
    from numpy import prod, float32, int32

    nq, dim = q.shape
    npart = r.shape[0]

    global timer_copy, timer_memory, timer_zero, timer_exp, timer_sum

    # CUDA execution dimensions
    block = (block_size, 1, 1)
    grid = (60, 1)

    # access module functions, textures and constants
    if 'compute_ssf' not in globals():
        global compute_ssf, finalise_ssf, tex_q, dim_ptr, npart_ptr, nq_ptr
        compute_ssf = ssf_module.get_function('compute_ssf')
        finalise_ssf = ssf_module.get_function('finalise_ssf')
        tex_q = ssf_module.get_texref('tex_q')
        dim_ptr = ssf_module.get_global('dim')[0]
        npart_ptr = ssf_module.get_global('npart')[0]
        nq_ptr = ssf_module.get_global('nq')[0]

    # set device constants
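    # memset_d32 with a count of 1 writes a single 32-bit word, used here to
    # set the integer constants dim, npart and nq in device memory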
    t1 = time()
    cuda.memset_d32(dim_ptr, dim, 1)
    cuda.memset_d32(npart_ptr, npart, 1)
    cuda.memset_d32(nq_ptr, nq, 1)
    t2 = time()
    timer_copy += t2 - t1

    # copy particle positions to device
    # (x0, x1, x2, ..., xN, y0, y1, y2, ..., yN, z0, z1, z2, ..., zN)
    if copy:
        global gpu_r
        t1 = time()
        gpu_r = ga.to_gpu(r.T.flatten().astype(float32))
        t2 = time()
        timer_copy += t2 - t1

    # allocate space for results
    t1 = time()
    gpu_sin = ga.empty(int(nq * prod(grid)), float32)
    gpu_cos = ga.empty(int(nq * prod(grid)), float32)
    gpu_ssf = ga.empty(int(prod(grid)), float32)
    t2 = time()
    timer_memory += t2 - t1

    # copy group of wavevectors with (almost) equal magnitude
    t1 = time()
    gpu_q = ga.to_gpu(q.flatten().astype(float32))
    gpu_q.bind_to_texref_ext(tex_q)
    t2 = time()
    timer_copy += t2 - t1

    # compute exp(iq·r) for each particle
    t1 = time()
    compute_ssf(gpu_sin, gpu_cos, gpu_r,
                block=block, grid=grid, texrefs=[tex_q])
    t2 = time()
    # compute sum(sin(q·r))^2 + sum(cos(q·r))^2 per wavevector
    # and sum over wavevectors
    finalise_ssf(gpu_sin, gpu_cos, gpu_ssf, int32(prod(grid)),
                 block=block, grid=grid)
    result = sum(gpu_ssf.get())
    t3 = time()
    timer_exp += t2 - t1
    timer_sum += t3 - t2

    # normalize result with #wavevectors and #particles
    return result / (nq * npart)
Example #27
def go_sort(count, stream=None):
    grids = count / 8192

    keys = np.fromstring(np.random.bytes(count * 2), dtype=np.uint16)
    #keys = np.arange(count, dtype=np.uint16)
    #np.random.shuffle(keys)
    mkeys = np.reshape(keys, (grids, 8192))
    vals = np.arange(count, dtype=np.uint32)
    dkeys = cuda.to_device(keys)
    dvals = cuda.to_device(vals)
    print 'Done seeding'

    dpfxs = cuda.mem_alloc(grids * 256 * 4)
    doffsets = cuda.mem_alloc(count * 2)
    launch('prefix_scan_8_0',
           doffsets,
           dpfxs,
           dkeys,
           block=(512, 1, 1),
           grid=(grids, 1),
           stream=stream,
           l1=1)

    dsplit = cuda.mem_alloc(grids * 256 * 4)
    launch('better_split',
           dsplit,
           dpfxs,
           block=(32, 1, 1),
           grid=(grids / 32, 1),
           stream=stream)

    # This stage will be rejiggered along with the split
    launch('prefix_sum',
           dpfxs,
           np.int32(grids * 256),
           block=(256, 1, 1),
           grid=(1, 1),
           stream=stream,
           l1=1)

    launch('convert_offsets',
           doffsets,
           dsplit,
           dkeys,
           i32(0),
           block=(1024, 1, 1),
           grid=(grids, 1),
           stream=stream)
    if not stream:
        offsets = cuda.from_device(doffsets, (grids, 8192), np.uint16)
        split = cuda.from_device(dsplit, (grids, 256), np.uint32)
        pfxs = cuda.from_device(dpfxs, (grids, 256), np.uint32)
        tkeys = py_radix_sort_maybe(mkeys, offsets, pfxs, split, 0)
        #print frle(tkeys & 0xff)

    d_skeys = cuda.mem_alloc(count * 2)
    d_svals = cuda.mem_alloc(count * 4)
    if not stream:
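        # keys are uint16, so count / 2 32-bit words cover them; fill the
        # value buffer with the sentinel 0xffffffff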
        cuda.memset_d32(d_skeys, 0, count / 2)
        cuda.memset_d32(d_svals, 0xffffffff, count)
    launch('radix_sort_maybe',
           d_skeys,
           d_svals,
           dkeys,
           dvals,
           doffsets,
           dpfxs,
           dsplit,
           i32(0),
           block=(1024, 1, 1),
           grid=(grids, 1),
           stream=stream,
           l1=1)

    if not stream:
        skeys = cuda.from_device_like(d_skeys, keys)
        svals = cuda.from_device_like(d_svals, vals)

        # Test integrity of sort (keys and values kept together):
        #   skeys[i] = keys[svals[i]] for all i
        print 'Integrity: ',
        if np.all(svals < len(keys)) and np.all(skeys == keys[svals]):
            print 'pass'
        else:
            print 'FAIL'

    dkeys, d_skeys = d_skeys, dkeys
    dvals, d_svals = d_svals, dvals

    if not stream:
        cuda.memset_d32(d_skeys, 0, count / 2)
        cuda.memset_d32(d_svals, 0xffffffff, count)

    launch('prefix_scan_8_8',
           doffsets,
           dpfxs,
           dkeys,
           block=(512, 1, 1),
           grid=(grids, 1),
           stream=stream,
           l1=1)
    launch('better_split',
           dsplit,
           dpfxs,
           block=(32, 1, 1),
           grid=(grids / 32, 1),
           stream=stream)
    launch('prefix_sum',
           dpfxs,
           np.int32(grids * 256),
           block=(256, 1, 1),
           grid=(1, 1),
           stream=stream,
           l1=1)
    if not stream:
        pre_offsets = cuda.from_device(doffsets, (grids, 8192), np.uint16)
    launch('convert_offsets',
           doffsets,
           dsplit,
           dkeys,
           i32(8),
           block=(1024, 1, 1),
           grid=(grids, 1),
           stream=stream)
    if not stream:
        offsets = cuda.from_device(doffsets, (grids, 8192), np.uint16)
        split = cuda.from_device(dsplit, (grids, 256), np.uint32)
        pfxs = cuda.from_device(dpfxs, (grids, 256), np.uint32)
        tkeys = np.reshape(tkeys, (grids, 8192))

        new_offs = py_convert_offsets(pre_offsets, split, tkeys, 8)
        print np.nonzero(new_offs != offsets)
        fkeys = py_radix_sort_maybe(tkeys, new_offs, pfxs, split, 8)
        #print frle(fkeys)

    launch('radix_sort_maybe',
           d_skeys,
           d_svals,
           dkeys,
           dvals,
           doffsets,
           dpfxs,
           dsplit,
           i32(8),
           block=(1024, 1, 1),
           grid=(grids, 1),
           stream=stream,
           l1=1)

    if not stream:
        #print cuda.from_device(doffsets, (4, 8192), np.uint16)
        #print cuda.from_device(dkeys, (4, 8192), np.uint16)
        #print cuda.from_device(d_skeys, (4, 8192), np.uint16)

        skeys = cuda.from_device_like(d_skeys, keys)
        svals = cuda.from_device_like(d_svals, vals)

        print 'Integrity: ',
        if np.all(svals < len(keys)) and np.all(skeys == keys[svals]):
            print 'pass'
        else:
            print 'FAIL'

        sorted_keys = np.sort(keys)
        # Test that ordering is correct. (Note that we don't need 100%
        # correctness, so this test should be made "soft".)
        print 'Order: ', 'pass' if np.all(skeys == sorted_keys) else 'FAIL'