def sort(self):
    """Radix-sort the keys/values and publish the results on the host.

    Runs the device radix sort over ``keyBits`` bits, stages the sorted
    device buffers back into the host ``sortedkeys``/``sortedvalues``
    arrays, and finally writes the first ``n`` entries into the public
    ``keys``/``values`` arrays.
    """
    bits = self.keyBits
    self.radixSortKeysOnly(bits)

    # stage the sorted device buffers back to the host
    clu.enqueue_copy(self.queue, src=self.dkeys, dst=self.sortedkeys)
    clu.enqueue_copy(self.queue, src=self.dvalues, dst=self.sortedvalues)

    # only the first n entries are published
    # NOTE(review): presumably the device buffers are padded past n — confirm
    valid = self.n
    self.keys[:] = self.sortedkeys[:valid]
    self.values[:] = self.sortedvalues[:valid]
def test_move_particles(self):
    """Move the particles, set the dirty flag to true and recompute.

    New x/y coordinates are written directly to the device buffers, the
    particle array is marked dirty, and ``particles.update()`` is called;
    the recomputed cell indices and cell ids are then verified.
    """
    particles = self.particles
    particles.setup_cl(self.ctx)

    # FIX: the original fetched ``particles.arrays[0]`` twice; one
    # redundant assignment removed.
    pa = particles.arrays[0]
    domain_manager = particles.domain_manager

    q = cl.CommandQueue(self.ctx)

    device_x = pa.get_cl_buffer('x')
    device_y = pa.get_cl_buffer('y')

    # place the four particles at the corners of the unit square
    xnew = numpy.array([1, 0, 0, 1]).astype(numpy.float32)
    ynew = numpy.array([0, 0, 1, 1]).astype(numpy.float32)

    enqueue_copy(q, src=xnew, dst=device_x)
    enqueue_copy(q, src=ynew, dst=device_y)

    # mark the array dirty so update() re-bins the particles
    pa.set_dirty(True)
    particles.update()

    cellids = domain_manager.cellids['test']
    ix = domain_manager.ix['test']
    iy = domain_manager.iy['test']
    # iz is fetched but not asserted: this is a 2D test and z is untouched
    iz = domain_manager.iz['test']

    # read the device results back into the host arrays
    domain_manager.enqueue_copy()

    # x cell indices for the four corner positions
    self.assertEqual(ix[0], 1)
    self.assertEqual(ix[1], 0)
    self.assertEqual(ix[2], 0)
    self.assertEqual(ix[3], 1)

    # y cell indices
    self.assertEqual(iy[0], 0)
    self.assertEqual(iy[1], 0)
    self.assertEqual(iy[2], 1)
    self.assertEqual(iy[3], 1)

    # flattened cell ids
    self.assertEqual(cellids[0], 1)
    self.assertEqual(cellids[1], 0)
    self.assertEqual(cellids[2], 2)
    self.assertEqual(cellids[3], 3)
def test_create_cl_buffers(self):
    """Test the creation of the OpenCL arrays.

    For every particle property, a ``cl_<prop>`` buffer must exist and
    its device contents must match the host-side PySPH array (after the
    single-precision down-conversion the buffers use).
    """
    pa = self.pa

    # create the OpenCL buffers
    pa.setup_cl(self.ctx, self.queue)

    for prop in pa.properties:
        cl_prop = 'cl_' + prop

        # FIX: dict.has_key() is Python-2-only and deprecated; use ``in``
        self.assertTrue(cl_prop in pa.cl_properties)

        # get the OpenCL buffer for the property
        # (renamed from ``buffer`` to avoid shadowing the builtin)
        cl_buffer = pa.get_cl_buffer(prop)

        # get the PySPH numpy array for the property
        pysph_arr = pa.get(prop)

        # dummy host array to read the OpenCL buffer contents into
        _array = numpy.ones_like(pysph_arr)

        carray = pa.properties[prop]
        dtype = carray.get_c_type()

        # in single precision, doubles/longs are stored on the device as
        # 32 bit types, so convert both sides before comparing
        if pa.cl_precision == "single":
            if dtype == "double":
                _array = _array.astype(numpy.float32)
                pysph_arr = pysph_arr.astype(numpy.float32)
            if dtype == "long":
                _array = _array.astype(numpy.int32)
                pysph_arr = pysph_arr.astype(numpy.int32)

        cl_utils.enqueue_copy(self.queue, dst=_array, src=cl_buffer)

        self.assertEqual(len(_array), len(pysph_arr))

        for i in range(len(_array)):
            self.assertAlmostEqual(_array[i], pysph_arr[i], 10)
def _permute(self, bits):
    """Launch the permute kernel.

    Using the host-scanned thread histograms, this kernel shuffles the
    keys and values to perform the actual sort for this radix pass.

    We first copy the scanned histograms to the device, compute the
    local memory size and then launch the kernel.  After the launch, the
    sorted keys and values are read back to the host and also copied
    into the device key/value buffers, ready for the next pass.

    ``bits`` is the bit offset of the current radix pass.
    """
    # FIX: removed unused local ``ctx = self.context``
    q = self.queue

    # copy the scanned histograms to the device
    clu.enqueue_copy(q, src=self.histograms, dst=self.dscanedhistograms)

    # global and local launch sizes
    global_sizes = (self.nelements / self.radices,)
    local_sizes = (self.group_size,)

    # local memory for the permute kernel
    # NOTE(review): the factor 2 presumably means 2-byte counters —
    # confirm against the kernel source
    local_mem_size = self.group_size * self.radices * 2
    local_mem = cl.LocalMemory(size=local_mem_size)

    # enqueue the kernel for execution
    self.program.permute(q, global_sizes, local_sizes,
                         self.dkeys, self.dvalues,
                         self.dscanedhistograms, bits, local_mem,
                         self.dsortedkeys, self.dsortedvalues).wait()

    # read the sorted results back to the host ...
    clu.enqueue_copy(q, src=self.dsortedkeys, dst=self.sortedkeys)
    clu.enqueue_copy(q, src=self.dsortedvalues, dst=self.sortedvalues)

    # ... and make them the unsorted input for the next pass
    clu.enqueue_copy(q, src=self.dsortedkeys, dst=self.dkeys)
    clu.enqueue_copy(q, src=self.dsortedvalues, dst=self.dvalues)
def _histogram(self, bits):
    """Launch the histogram kernel.

    Each thread loads its work region (256 values) into shared memory
    and computes the histogram/frequency of occurrence of each element.
    The implementation sorts 32 bit keys 8 bits at a time, so the
    histogram bins/buckets per thread are also 256.

    After the kernel launch, the computed thread histograms are read
    back to the host, where they will be scanned.

    ``bits`` is the bit offset of the current radix pass.
    """
    # FIX: removed unused local ``ctx = self.context`` and a dead
    # commented-out host->device copy (the unsorted data already
    # resides in dkeys).
    q = self.queue

    # global/local launch sizes
    global_sizes = (self.nelements / self.radices,)
    local_sizes = (self.group_size,)

    # local memory for the histogram kernel
    # NOTE(review): the factor 2 presumably means 2-byte counters —
    # confirm against the kernel source
    local_mem_size = self.group_size * self.radices * 2
    local_mem = cl.LocalMemory(size=local_mem_size)

    # enqueue the kernel for execution
    self.program.histogram(q, global_sizes, local_sizes,
                           self.dkeys, self.dhistograms,
                           bits, local_mem).wait()

    # read the per-thread histograms back to the host buffer
    clu.enqueue_copy(q, src=self.dhistograms, dst=self.histograms)
def enqueue_copy(self):
    """Copy the buffer contents to the host.

    For every particle array, the device buffers holding the cell ids,
    indices and cell counts are read back into the corresponding host
    arrays.  This is a no-op when OpenCL is not in use.
    """
    if not self.with_cl:
        return

    # (host dict, device dict) pairs to transfer, keyed by array name
    transfers = ((self.cellids, self.dcellids),
                 (self.indices, self.dindices),
                 (self.cell_counts, self.dcell_counts))

    for pa in self.arrays:
        name = pa.name
        for host, device in transfers:
            enqueue_copy(self.queue, dst=host[name], src=device[name])
def _cl_update(self):
    """Update the data structures.

    The following three steps are performed in order:

    (a) The particles are binned using a standard algorithm like the
        one for linked lists.

    (b) Sort the resulting cellids (keys) and indices (values) using
        the RadixSort objects.

    (c) Compute the cell counts by examining the sorted cellids.
    """
    # FIX: removed unused locals ``ctx``, ``cellc`` and the
    # ``narrays`` indirection.
    q = self.queue

    # cell grid dimensions and minimum cell indices
    ncx, ncy, ncz = self.ncx, self.ncy, self.ncz
    mcx, mcy, mcz = self.mcx, self.mcy, self.mcz

    for i in range(self.narrays):
        pa = self.arrays[i]
        np = pa.get_number_of_particles()

        # launch parameters for this array: one work item per particle
        global_sizes = (np, 1, 1)
        local_sizes = (1, 1, 1)

        x = pa.get_cl_buffer("x")
        y = pa.get_cl_buffer("y")
        z = pa.get_cl_buffer("z")

        # host/device arrays for this particle array
        cellids = self.cellids[pa.name]
        indices = self.indices[pa.name]
        dcellids = self.dcellids[pa.name]
        dindices = self.dindices[pa.name]
        dcell_counts = self.dcell_counts[pa.name]

        # (a) bin the particles to get device cellids
        self.prog.bin(q, global_sizes, local_sizes,
                      x, y, z, dcellids,
                      self.cell_size,
                      ncx, ncy, ncz,
                      mcx, mcy, mcz).wait()

        # read the cellids into the host array
        clu.enqueue_copy(q, src=dcellids, dst=cellids)

        # (b) sort the keys (cellids) and values (indices)
        rsort = self.rsort[pa.name]
        rsort.initialize(cellids, indices, self.context)
        rsort.sort()

        sortedcellids = rsort.dkeys

        # (c) compute the per-cell particle counts
        self.prog.compute_cell_counts(q, global_sizes, local_sizes,
                                      sortedcellids, dcell_counts,
                                      numpy.uint32(self.ncells),
                                      numpy.uint32(np)).wait()

        # read the result back to host
        # THIS MAY NEED TO BE DONE OR WE COULD SIMPLY LET IT RESIDE
        # ON THE DEVICE.
        clu.enqueue_copy(q, src=dcell_counts,
                         dst=self.cell_counts[pa.name])
def enqueue_copy(self):
    """Copy the buffer contents to the host.

    The buffers copied are cellids, head, next, dix, diy, diz.
    This is a no-op when OpenCL is not in use.
    """
    if not self.with_cl:
        return

    # (host dict, device dict) pairs to transfer, keyed by array name
    transfers = ((self.cellids, self.dcellids),
                 (self.head, self.dhead),
                 (self.Next, self.dnext),
                 (self.ix, self.dix),
                 (self.iy, self.diy),
                 (self.iz, self.diz))

    for pa in self.arrays:
        name = pa.name
        for host, device in transfers:
            enqueue_copy(self.queue, dst=host[name], src=device[name])