def _get_exec_configs(self, threads_max, padding, smem_per_thread, \
        shape_filter):
    """ Find all valid execution configurations. """

    # Padding of the kernel.
    y_pad = sum(padding[0:2])
    z_pad = sum(padding[2:4])

    # Shared memory requirements.
    smem_size = lambda b_shape: smem_per_thread * \
                    (b_shape[0] * b_shape[1])

    # The kind of shapes that we are interested in.
    if shape_filter == 'skinny': # Only z-dominant shapes.
        my_filter = lambda b_shape: (b_shape[0] < b_shape[1]) and \
                        (b_shape[1] > 8) and ((b_shape[1] % 16) == 0)
    elif shape_filter == 'square': # Only square-ish shapes.
        my_filter = lambda b_shape: (b_shape[0] < 2 * b_shape[1]) and \
                        (b_shape[1] < 2 * b_shape[0]) and \
                        (b_shape[0] > 8) and \
                        (b_shape[1] > 8)
    elif shape_filter == 'all': # All shapes okay.
        my_filter = lambda b_shape: b_shape[1] > 1 # Must be greater than 1.
    else:
        raise TypeError('Unrecognized shape filter.')

    # Function defining valid block shapes.
    smem_max = get_space_info()['max_shared_mem']
    is_valid_shape = lambda b_shape: (smem_size(b_shape) < smem_max) and \
                        my_filter(b_shape) and \
                        (b_shape[0] * b_shape[1]) <= threads_max

    # Create a list of all valid block shapes.
    valid_block_shapes = []
    z_max = get_space_info()['max_block_z']
    y_max = get_space_info()['max_block_y']
    for j in range(y_pad + 1, y_max + 1):
        for k in range(z_pad + 1, z_max + 1):
            if is_valid_shape((j, k)):
                valid_block_shapes.append((j, k)) # Block shape is (yy,zz).

    # A hack for profiling.
    # valid_block_shapes = ((31,16),)
    # valid_block_shapes = ((17,22),)

    if not valid_block_shapes: # Make sure the list is not empty.
        raise TypeError('No valid shapes found.')

    # Create a list of all possible execution configurations.
    # Note that the convention for both block_shape and grid_shape is
    # (yy,zz). Among other things, this leads to the (slightly)
    # tricky computation of grid_shape.
    sp_shape = get_space_info()['shape'] # Shape of the space.
    return [{'block_shape': vbs, \
             'grid_shape': (int((sp_shape[1]-1)/(vbs[0]-y_pad)) + 1, \
                            int((sp_shape[2]-1)/(vbs[1]-z_pad)) + 1), \
             'smem_size': smem_size(vbs)} for vbs in valid_block_shapes]
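# Illustrative sketch (not from the original source): a worked example of the
# grid_shape ceiling-division above, using hypothetical numbers. With a space
# shape of (100, 50, 50), a block shape vbs = (16, 32), and padding sums
# y_pad = z_pad = 2, each block covers 16 - 2 = 14 usable cells in y and
# 32 - 2 = 30 in z, so
#     grid_shape = (int((50 - 1) / 14) + 1, int((50 - 1) / 30) + 1) = (4, 2)
# i.e. just enough blocks to tile the (y, z) cross-section of the space once
# the padded border threads are discounted.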
def test_partition(self):
    """ Make sure the x_ranges span the entire space without any gaps. """
    shapes = ((200, 30, 10), (33, 10, 10), (130, 5, 5), (111, 2, 2))
    for shape in shapes:
        space.initialize_space(shape)
        x = comm.gather(space.get_space_info()['x_range'])
        if comm.Get_rank() == 0:
            self.assertEqual(x[0][0], 0)
            self.assertEqual(x[-1][-1], space.get_space_info()['shape'][0])
            for k in range(len(x) - 1):
                self.assertEqual(x[k][1], x[k + 1][0])
def __init__(self, dtype, op='sum'):
    """ Create an Out.

    Input variables
    dtype -- numpy dtype.

    Keyword variables
    op -- type of reduction operation to perform. Default='sum'.
        At this time, only the "sum" operation is supported.

    """
    self._set_gce_type('out')
    self._get_dtype(dtype) # Validate dtype.

    if op not in ('sum',): # Validate op.
        raise TypeError('Invalid op.')
    self.op = op

    # Obtain the neutral value and store it in the result variable.
    neutral_val = {'sum': 0}

    # Create the intermediary values.
    shape = get_space_info()['shape']
    self.to_gpu((neutral_val[op] * \
        np.ones((1, shape[1], shape[2]))).astype(self.dtype))
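# Hypothetical usage sketch (not part of the original source): an Out holds
# the result of a sum-reduction over the space. This assumes the enclosing
# class is named Out and that a space has already been initialized.
#
#     initialize_space((100, 2, 3))
#     total = Out(np.float64)               # Result of a 'sum' reduction.
#     total_c = Out(np.complex128, op='sum')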
def test_get_info(self):
    """ Test the get_space_info function. """
    # # We should get an error if we haven't initialized a space yet.
    # self.assertRaises(TypeError, space.get_space_info)
    shape = (100, 2, 3)
    space.initialize_space(shape)
    info = space.get_space_info()
    self.assertEqual(info['shape'], shape)
def get_cpu_raw(cpu_data, k):
    # Make sure overlapped data is accurate as well.
    xr = space.get_space_info()['x_range']
    if comm.Get_rank() == 0:
        pad_back = cpu_data[-k:, :, :]
    else:
        pad_back = cpu_data[xr[0]-k:xr[0], :, :]

    if comm.Get_rank() == comm.Get_size() - 1:
        pad_front = cpu_data[:k, :, :]
    else:
        pad_front = cpu_data[xr[1]:xr[1]+k, :, :]

    return np.concatenate((pad_back, cpu_data[xr[0]:xr[1], :, :], \
                           pad_front), axis=0)
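# Worked example (hypothetical numbers, not from the original source): with
# cpu_data.shape == (200, 30, 10), four MPI ranks, and rank 1 owning
# x_range == (50, 100), get_cpu_raw(cpu_data, 1) returns rows 49 through 100
# inclusive, i.e. an array of shape (52, 30, 10): one overlap row borrowed
# from the previous rank, the 50 locally-owned rows, and one overlap row from
# the next rank. The first and last ranks instead wrap around periodically,
# taking their back/front padding from the far end of the global array.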
def test_ecc_disabled(self):
    """ Make sure ECC is disabled. """
    space.initialize_space((100, 2, 3))
    self.assertTrue(space.get_space_info()['ecc_enabled'] == False, \
        'ECC enabled! Should be disabled for best performance.')
def __init__(self, array_or_dtype, x_overlap=0):
    """ Create a spatial grid on the GPU(s).

    Input variables
    array_or_dtype -- can either be a numpy array of the same shape as the
        global space, or a numpy dtype. If a valid array is passed, it will
        be loaded on to the GPU. If a dtype is passed, then an array of
        zeros, of that dtype, will be loaded onto the GPU.

    Optional variables
    x_overlap -- the number of adjacent cells in either the negative or
        positive x-direction that need to simultaneously be accessed along
        with the current cell. Must be a non-negative integer. Default
        value is 0.

    """

    shape = get_space_info()['shape'] # Get the shape of the space.
    xr = get_space_info()['x_range'] # Get the local x_range.
    all_x_ranges = get_space_info()['all_x_ranges'] # Get all x_ranges.
    local_shape = (xr[1]-xr[0], shape[1], shape[2])

    self._set_gce_type('grid') # Set the gce type to grid.

    # Make sure overlap option is valid.
    if type(x_overlap) is not int:
        raise TypeError('x_overlap must be an integer.')
    elif x_overlap < 0:
        raise TypeError('x_overlap must be a non-negative integer.')

    if comm.rank == 0:
        # Process the array_or_dtype input variable.
        if type(array_or_dtype) is np.ndarray: # Input is an array.
            array = array_or_dtype

            # Make sure the array is of the correct shape.
            if array.shape != shape:
                raise TypeError('Shape of array does not match shape of space.')

            # Make sure the array is of a valid datatype.
            self._get_dtype(array.dtype.type)

        elif type(array_or_dtype) is type: # Input is a datatype.
            self._get_dtype(array_or_dtype) # Validate the dtype.
            array = np.zeros(shape, dtype=self.dtype) # Make a zeros array.

        else: # Invalid input.
            raise TypeError('Input variable must be a numpy array or dtype.')

        # Prepare array to be scattered.
        array = [array[r[0]:r[1], :, :] for r in all_x_ranges]

    else:
        array = None

    array = comm.scatter(array)
    self._get_dtype(array.dtype.type)

    # # Narrow down the array to local x_range.
    # array = array[xr[0]:xr[1],:,:]

    # Add padding to array, if needed.
    self._xlap = x_overlap
    if self._xlap != 0:
        padding = np.empty((self._xlap,) + shape[1:3], dtype=array.dtype)
        array = np.concatenate((padding, array, padding), axis=0)

    self.to_gpu(array) # Load onto device.

    # Determine information needed for synchronization.
    if self._xlap != 0:
        # Calculates the pointer to the x offset in a grid.
        ptr_dx = lambda x_pos: self.data.ptr + self.data.dtype.itemsize * \
                                x_pos * shape[1] * shape[2]

        # Pointers to different sections of the grid that are relevant
        # for synchronization.
        self._sync_ptrs = {'forw_src': ptr_dx(xr[1]-xr[0]), \
                           'back_dest': ptr_dx(0), \
                           'back_src': ptr_dx(self._xlap), \
                           'forw_dest': ptr_dx(xr[1]-xr[0] + self._xlap)}

        # Buffers used during synchronization.
        self._sync_buffers = [drv.pagelocked_empty( \
                                (self._xlap, shape[1], shape[2]), \
                                self.dtype) for k in range(4)]

        # Streams used during synchronization.
        self._sync_streams = [drv.Stream() for k in range(4)]

        # Used to identify neighboring MPI nodes with whom to synchronize.
        self._sync_adj = get_space_info()['mpi_adj']

        # Offset in bytes to the true start of the grid.
        # This is used to "hide" overlap areas from the kernel.
        self._xlap_offset = self.data.dtype.itemsize * \
                            self._xlap * shape[1] * shape[2]

        self.synchronize() # Synchronize the grid.

    comm.Barrier() # Wait for all grids to synchronize before proceeding.
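# Hypothetical usage sketch (not part of the original source), assuming the
# enclosing class is named Grid and a space has already been initialized:
#
#     shape = get_space_info()['shape']
#     g0 = Grid(np.float32)                            # Zero-filled grid.
#     g1 = Grid(np.random.randn(*shape).astype(np.float64), x_overlap=1)
#
# Passing x_overlap=1 allocates one padded cell on each x-boundary of the
# local slab so that neighboring MPI nodes can exchange boundary data via
# synchronize().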
def __init__(self, code, *vars, **kwargs):
    """ Prepare a cuda function that will execute on the GCE space.

    Input variables:
    code -- The looped cuda code to be executed.
    vars -- (name, gce_type, numpy_type) of the input arguments.

    Keyword variables:
    pre_loop -- Cuda code that is executed before the loop code.
    shape_filter -- Can be either 'all', 'skinny', or 'square'.
    padding -- (yn, yp, zn, zp), describes the number of "extra" threads
        to be run on the border of each thread block.
    smem_per_thread -- Number of bytes of shared memory needed by a thread.

    """

    # Make sure there are no extraneous keyword arguments.
    if any([key not in \
            ('pre_loop', 'shape_filter', 'padding', 'smem_per_thread') \
            for key in kwargs.keys()]):
        raise TypeError('Invalid key used.')

    # Process keyword arguments.
    pre_code = kwargs.get('pre_loop', '')
    shape_filter = kwargs.get('shape_filter', 'skinny')
    padding = kwargs.get('padding', (0, 0, 0, 0))
    smem_per_thread = kwargs.get('smem_per_thread', 0)

    # Dictionary for conversion from numpy to cuda types.
    cuda_types = {np.float32: 'float', np.float64: 'double', \
                  np.complex64: 'pycuda::complex<float>', \
                  np.complex128: 'pycuda::complex<double>'}

    # Dictionary for conversion from numpy to alternate type for Consts.
    alt_types = {np.float32: 'float', np.float64: 'double', \
                 np.complex64: 'float2', np.complex128: 'double2'}

    # Process vars.
    params = [{'name': v[0], \
               'gce_type': v[1], \
               'dtype': v[2], \
               'cuda_type': cuda_types[v[2]]} for v in vars]

    # for k in range(len(params)): # We need size information for consts.
    #     if params[k]['gce_type'] == 'const':
    #         params[k]['num_elems'] = vars[k][3]
    #         params[k]['alt_type'] = alt_types[params[k]['dtype']]

    # Get the template and render it using jinja2.
    shape = get_space_info()['shape'] # Shape of the space.
    template = _jinja_env.get_template(_template_file)
    cuda_source = template.render(params=params, \
                                  padding=padding, \
                                  dims=get_space_info()['shape'], \
                                  x_range=get_space_info()['x_range'], \
                                  preloop_code=pre_code, \
                                  loop_code=code, \
                                  flat_tag='_f')

    # Write out the source code for debugging purposes.
    open('/tmp/gce_kernel.cu', 'w').write(cuda_source)

    # Compile the code into a callable cuda function.
    mod = compiler.SourceModule(cuda_source)
    # mod = compiler.SourceModule(cuda_source, \
    #     options=['-Xptxas', '-dlcm=cg']) # Global skips L1 cache.
    self.fun = mod.get_function('_gce_kernel')

    # Prefer 48KB of L1 cache when possible.
    self.fun.set_cache_config(drv.func_cache.PREFER_L1)

    # Get address of global variable in module.
    # Note: contains a work-around for problems with complex types.
    my_get_global = lambda name: mod.get_global('_' + name + '_temp')

    # Useful information about the kernel.
    self._kernel_info = {'max_threads': self.fun.max_threads_per_block, \
                         'const_bytes': self.fun.const_size_bytes, \
                         'local_bytes': self.fun.local_size_bytes, \
                         'num_regs': self.fun.num_regs}

    # Get some valid execution configurations.
    self.exec_configs = self._get_exec_configs( \
                            self.fun.max_threads_per_block, \
                            padding, smem_per_thread, shape_filter)

    # Prepare the function by telling pycuda the types of the inputs.
    arg_types = []
    for p in params:
        if p['gce_type'] == 'number':
            arg_types.append(p['dtype'])
        # elif p['gce_type'] == 'const':
        #     arg_types.append(p['dtype'])
        #     # pass # Consts don't actually get passed in.
        else:
            arg_types.append(np.intp)
    self.fun.prepare([np.int32, np.int32] + arg_types)

    # Define the function which we will use to execute the kernel.
    # TODO: Make a shortcut version with lower overhead.

    # Used for asynchronous execution and timing.
    stream = drv.Stream()
    start, start2, pad_done, sync_done, comp_done, all_done = \
        [drv.Event() for k in range(6)]

    # Kernel execution over a range of x-values.
    def execute_range(x_start, x_end, gpu_params, cfg, stream):
        """ Defines asynchronous kernel execution for a range of x. """
        self.fun.prepared_async_call( \
            cfg['grid_shape'][::-1], \
            cfg['block_shape'][::-1] + (1,), \
            stream, \
            *([np.int32(x_start), np.int32(x_end)] + gpu_params), \
            shared_size=cfg['smem_size'])

    x_start, x_end = get_space_info()['x_range'] # This node's range.

    def execute(cfg, *args, **kwargs):
        # Parse keyword arguments.
        post_sync_grids = kwargs.get('post_sync', None)

        # Parse the inputs.
        gpu_params = []
        for k in range(len(params)):
            if params[k]['gce_type'] == 'number':
                gpu_params.append(params[k]['dtype'](args[k]))
            elif params[k]['gce_type'] == 'const': # Load Const.
                gpu_params.append(args[k].data.ptr)
                # Const no longer actually "const" in cuda code.
                # d_ptr, size_in_bytes = my_get_global(params[k]['name'])
                # drv.memcpy_dtod(d_ptr, args[k].data.gpudata, size_in_bytes)
            elif params[k]['gce_type'] == 'grid':
                if args[k]._xlap == 0:
                    gpu_params.append(args[k].data.ptr)
                else:
                    gpu_params.append(args[k].data.ptr + \
                                      args[k]._xlap_offset)
            elif params[k]['gce_type'] == 'out':
                args[k].data.fill(args[k].dtype(0)) # Initialize the Out.
                gpu_params.append(args[k].data.ptr)
            else:
                raise TypeError('Invalid input type.')

        # See if we need to synchronize grids after kernel execution.
        if post_sync_grids is None:
            sync_pad = 0
        else:
            sync_pad = max([g._xlap for g in post_sync_grids])

        start2.record(stream)
        comm.Barrier()
        start.record(stream)

        # Execute kernel in padded regions first.
        execute_range(x_start, x_start + sync_pad, gpu_params, cfg, stream)
        execute_range(x_end - sync_pad, x_end, gpu_params, cfg, stream)
        pad_done.record(stream) # Just for timing purposes.
        stream.synchronize() # Wait for execution to finish.

        # Begin kernel execution in remaining "core" region.
        execute_range(x_start + sync_pad, x_end - sync_pad, \
                      gpu_params, cfg, stream)
        comp_done.record(stream) # Timing only.

        # While core kernel is executing, perform synchronization.
        if post_sync_grids is not None: # Synchronization needed.
            for grid in post_sync_grids:
                grid.synchronize_start() # Start synchronization.

            # Keep on checking until everything is done.
            while not (all([grid.synchronize_isdone() \
                            for grid in post_sync_grids]) and \
                       stream.is_done()):
                pass

        else: # Nothing to synchronize.
            stream.synchronize() # Just wait for execution to finish.

        sync_done.record() # Timing.

        # Obtain the result for all Outs.
        batch_reduce(*[args[k] for k in range(len(params)) \
                       if params[k]['gce_type'] == 'out'])

        all_done.record() # Timing.
        all_done.synchronize()

        # The delay between sync_done and comp_done should be small.
        # Otherwise, the parallelization efficiency is suffering.
        print "(%d)" % comm.Get_rank(),
        for milliseconds in [event_done.time_since(start) for event_done in \
                (start2, pad_done, sync_done, comp_done, all_done)]:
            print "%1.4f " % milliseconds,
        print cfg['block_shape']

        # Return the time needed to execute the function.
        return comp_done.time_since(start)

    self.execute = execute # Save execution function in Kernel instance.
    self.min_exec_time = float('inf') # Stores the fastest execution time.
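# Hypothetical usage sketch (not part of the original source), assuming the
# enclosing class is named Kernel. The variable specification follows the
# (name, gce_type, numpy_type) convention from the docstring above; the cuda
# snippet and parameter names are made up for illustration only.
#
#     fn = Kernel('x(0,0,0) = a * y(0,0,0);', \
#                 ('a', 'number', np.float32), \
#                 ('x', 'grid', np.float32), \
#                 ('y', 'grid', np.float32), \
#                 shape_filter='square', \
#                 padding=(1, 1, 1, 1), \
#                 smem_per_thread=4)
#
# Each entry of fn.exec_configs can then be passed to fn.execute() along with
# the matching arguments in order to time that particular block shape.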