Example #1
    def _get_exec_configs(self, threads_max, padding, smem_per_thread, \
                        shape_filter):
        """ Find all valid execution configurations. """

        # Padding of the kernel.
        y_pad = sum(padding[0:2])
        z_pad = sum(padding[2:4])

        # Shared memory requirements.
        smem_size = lambda b_shape: smem_per_thread * \
                                            (b_shape[0] * b_shape[1])

        # The kind of shapes that we are interested in.
        if shape_filter == 'skinny': # Only z-dominant shapes.
            my_filter = lambda b_shape: (b_shape[0] < b_shape[1]) and \
                                        (b_shape[1] > 8) and \
                                        ((b_shape[1] % 16) == 0)
        elif shape_filter == 'square': # Only square-ish shapes.
            my_filter = lambda b_shape: (b_shape[0] < 2 * b_shape[1]) and \
                                        (b_shape[1] < 2 * b_shape[0]) and \
                                        (b_shape[0] > 8) and \
                                        (b_shape[1] > 8)
        elif shape_filter == 'all': # All shapes okay.
            my_filter = lambda b_shape: b_shape[1] > 1 # Must be greater than 1.
        else:
            raise TypeError('Unrecognized shape filter.')
        
        # Function defining valid block shapes.
        smem_max = get_space_info()['max_shared_mem']
        is_valid_shape = lambda b_shape: (smem_size(b_shape) < smem_max) and \
                                            my_filter(b_shape) and \
                                            (b_shape[0] * b_shape[1]) <= \
                                                threads_max 

        # Create a list of all valid block shapes.
        valid_block_shapes = []
        z_max = get_space_info()['max_block_z']
        y_max = get_space_info()['max_block_y']
        for j in range(y_pad+1, y_max+1):
            for k in range(z_pad+1, z_max+1):
                if is_valid_shape((j,k)):
                    valid_block_shapes.append((j,k)) # Block shape is (yy,zz).

        # A hack for profiling
        # valid_block_shapes = ((31,16),)
        # valid_block_shapes = ((17,22),)

        if not valid_block_shapes: # Make sure the list is not empty.
            raise TypeError('No valid shapes found.')

        # Create a list of all possible execution configurations.
        # Note that the convention for both block_shape and grid_shape is 
        # (yy,zz). Among other things, this leads to the (slightly)
        # tricky computation of grid_shape.
        sp_shape = get_space_info()['shape'] # Shape of the space.
        return [{   'block_shape': vbs, \
                    'grid_shape': (int((sp_shape[1]-1)/(vbs[0]-y_pad)) + 1, \
                                    int((sp_shape[2]-1)/(vbs[1]-z_pad)) + 1), \
                    'smem_size': smem_size(vbs)}
                for vbs in valid_block_shapes]
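
The grid_shape expression above is a ceiling division: each block effectively
covers (block extent - padding) cells, so the grid needs
ceil(space_extent / effective_extent) blocks per axis. A minimal standalone
sketch of the same arithmetic, using made-up sizes rather than any real space:

# Sketch only: the ceil-division behind grid_shape, with hypothetical numbers.
def grid_dim(space_extent, block_extent, pad):
    """ Blocks needed when each block covers (block_extent - pad) cells. """
    return int((space_extent - 1) / (block_extent - pad)) + 1

# Covering 100 cells with 16-wide blocks that lose 2 cells to padding:
# each block covers 14 cells, so 8 blocks are needed (8 * 14 = 112 >= 100).
assert grid_dim(100, 16, 2) == 8
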
Example #2
 def test_partition(self):
     """ Make sure the x_ranges span the entire space without any gaps. """
     shapes = ((200,30,10), (33,10,10), (130,5,5), (111,2,2))
     for shape in shapes:
         space.initialize_space(shape)
         x = comm.gather(space.get_space_info()['x_range'])
         if comm.Get_rank() == 0:
             self.assertEqual(x[0][0], 0)
             self.assertEqual(x[-1][-1], space.get_space_info()['shape'][0])
             for k in range(len(x)-1):
                 self.assertEqual(x[k][1], x[k+1][0])
Example #3
 def test_partition(self):
     """ Make sure the x_ranges span the entire space without any gaps. """
     shapes = ((200, 30, 10), (33, 10, 10), (130, 5, 5), (111, 2, 2))
     for shape in shapes:
         space.initialize_space(shape)
         x = comm.gather(space.get_space_info()['x_range'])
         if comm.Get_rank() == 0:
             self.assertEqual(x[0][0], 0)
             self.assertEqual(x[-1][-1], space.get_space_info()['shape'][0])
             for k in range(len(x) - 1):
                 self.assertEqual(x[k][1], x[k + 1][0])
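
The invariant both versions of this test assert (rank 0 starts at zero, the
last rank ends at shape[0], and adjacent ranges share an endpoint) is what a
contiguous block partition produces. A hypothetical helper, not part of the
space module, that would satisfy these checks:

# Hedged sketch: one way to produce contiguous, gap-free x_ranges.
def partition_x(nx, num_ranks):
    """ Split [0, nx) into num_ranks contiguous (start, end) ranges. """
    bounds = [(k * nx) // num_ranks for k in range(num_ranks + 1)]
    return [(bounds[k], bounds[k + 1]) for k in range(num_ranks)]

# Adjacent ranges share endpoints, so the gathered checks above would pass:
# partition_x(111, 4) -> [(0, 27), (27, 55), (55, 83), (83, 111)]
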
Example #4
    def __init__(self, dtype, op='sum'):
        """ Create an Out.

        Input variables
        dtype -- numpy dtype.

        Keyword variables
        op -- type of reduction operation to perform. Default='sum'.
            At this time, only the "sum" operation is supported.
        """

        self._set_gce_type('out')
        self._get_dtype(dtype) # Validate dtype.

        if op not in ('sum',): # Validate op.
            raise TypeError('Invalid op.')
        self.op = op

        # Obtain the neutral value and store it in the result variable.
        neutral_val = {'sum': 0}

        # Create the intermediary values.
        shape = get_space_info()['shape']
        self.to_gpu((neutral_val[op] * \
                        np.ones((1, shape[1], shape[2]))).astype(self.dtype))
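
The neutral_val lookup is what makes the reduction op pluggable: the result
buffer starts at the value that leaves the reduction unchanged, which for
'sum' is 0. A hedged sketch of how the table could grow if other ops were
ever supported (only 'sum' is actually supported above):

# Sketch only: each op's neutral element leaves that reduction unchanged.
neutral_val = {'sum': 0,                # x + 0 == x (the supported case)
               'prod': 1,               # hypothetical: x * 1 == x
               'max': float('-inf')}    # hypothetical: max(x, -inf) == x
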
Example #5
    def __init__(self, dtype, op='sum'):
        """ Create an Out.

        Input variables
        dtype -- numpy dtype.

        Keyword variables
        op -- type of reduction operation to perform. Default='sum'.
            At this time, only the "sum" operation is supported.
        """

        self._set_gce_type('out')
        self._get_dtype(dtype)  # Validate dtype.

        if op not in ('sum', ):  # Validate op.
            raise TypeError('Invalid op.')
        self.op = op

        # Obtain the neutral value and store it in the result variable.
        neutral_val = {'sum': 0}

        # Create the intermediary values.
        shape = get_space_info()['shape']
        self.to_gpu((neutral_val[op] * \
                        np.ones((1, shape[1], shape[2]))).astype(self.dtype))
Example #6
    def test_get_info(self):
        """ Test the get_space_info function. """
#         # We should get an error if we haven't initialized a space yet.
#         self.assertRaises(TypeError, space.get_space_info)

        shape = (100,2,3)
        space.initialize_space(shape)
        info = space.get_space_info()
        self.assertEqual(info['shape'], shape)
Example #7
    def test_get_info(self):
        """ Test the get_space_info function. """
        # We should get an error if we haven't initialized a space yet.
        # self.assertRaises(TypeError, space.get_space_info)

        shape = (100, 2, 3)
        space.initialize_space(shape)
        info = space.get_space_info()
        self.assertEqual(info['shape'], shape)
Example #8
def get_cpu_raw(cpu_data, k):
    # Make sure overlapped data is accurate as well.
    xr = space.get_space_info()['x_range']
    if comm.Get_rank() == 0:
        pad_back = cpu_data[-k:, :, :]
    else:
        pad_back = cpu_data[xr[0] - k:xr[0], :, :]

    if comm.Get_rank() == comm.Get_size() - 1:
        pad_front = cpu_data[:k, :, :]
    else:
        pad_front = cpu_data[xr[1]:xr[1] + k, :, :]

    return np.concatenate((pad_back, cpu_data[xr[0]:xr[1],:,:], \
                                pad_front), axis=0)
Example #9
def get_cpu_raw(cpu_data, k):
    # Make sure overlapped data is accurate as well.
    xr = space.get_space_info()['x_range']
    if comm.Get_rank() == 0:
        pad_back = cpu_data[-k:,:,:]
    else:
        pad_back = cpu_data[xr[0]-k:xr[0],:,:]

    if comm.Get_rank() == comm.Get_size() - 1:
        pad_front = cpu_data[:k,:,:]
    else:
        pad_front = cpu_data[xr[1]:xr[1]+k,:,:]

    return np.concatenate((pad_back, cpu_data[xr[0]:xr[1],:,:], \
                                pad_front), axis=0)
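
Rank 0 wraps its back-padding to the end of the global array and the last
rank wraps its front-padding to the beginning, so the padded local block
behaves periodically in x. A small self-contained numpy sketch of the
interior-rank slicing, with a hypothetical x_range and overlap depth:

# Sketch: the interior-rank slicing on a toy axis, no MPI involved.
import numpy as np

cpu_data = np.arange(7 * 2 * 2).reshape((7, 2, 2))
xr, k = (2, 5), 1  # Hypothetical local x_range and overlap depth.

pad_back = cpu_data[xr[0] - k:xr[0], :, :]   # Cells owned by the previous rank.
pad_front = cpu_data[xr[1]:xr[1] + k, :, :]  # Cells owned by the next rank.
raw = np.concatenate((pad_back, cpu_data[xr[0]:xr[1], :, :], pad_front), axis=0)
assert raw.shape[0] == (xr[1] - xr[0]) + 2 * k  # Owned cells plus both pads.
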
Example #10
 def test_ecc_disabled(self):
     """ Make sure ECC is disabled. """
     space.initialize_space((100, 2, 3))
     self.assertFalse(space.get_space_info()['ecc_enabled'], \
         'ECC enabled! Should be disabled for best performance.')
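
How 'ecc_enabled' gets populated is not shown here; one plausible
implementation, assuming pycuda's standard device-attribute API, would be:

# Assumed sketch: querying the ECC state of device 0 via pycuda.
import pycuda.driver as drv

drv.init()
dev = drv.Device(0)
ecc_enabled = bool(dev.get_attribute(drv.device_attribute.ECC_ENABLED))
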
Example #11
    def __init__(self, array_or_dtype, x_overlap=0):
        """ Create a spatial grid on the GPU(s).

        Input variables
        array_or_dtype -- can either be a numpy array of the same shape as
            the global space, or a numpy dtype. If a valid array is passed,
            it will be loaded onto the GPU. If a dtype is passed, then
            an array of zeros of that dtype will be loaded onto the GPU.

        Optional variables
        x_overlap -- the number of adjacent cells in either the negative or
            positive x-direction that need to simultaneously be accessed along
            with the current cell. Must be a non-negative integer. Default
            value is 0.

        """

        shape = get_space_info()['shape'] # Get the shape of the space.
        xr = get_space_info()['x_range'] # Get the local x_range.
        all_x_ranges = get_space_info()['all_x_ranges'] # Get all x_ranges.
        local_shape = (xr[1]-xr[0], shape[1], shape[2])

        self._set_gce_type('grid') # Set the gce type to grid.

        # Make sure overlap option is valid.
        if type(x_overlap) is not int:
            raise TypeError('x_overlap must be an integer.')
        elif x_overlap < 0:
            raise TypeError('x_overlap must be a non-negative integer.')

        if comm.rank == 0:
            # Process the array_or_dtype input variable.
            if type(array_or_dtype) is np.ndarray: # Input is an array.
                array = array_or_dtype

                # Make sure the array is of the correct shape.
                if array.shape != shape:
                    raise TypeError('Shape of array does not match shape of space.')

                # Make sure the array is of a valid datatype.
                self._get_dtype(array.dtype.type)


            elif type(array_or_dtype) is type: # Input is a datatype.
                self._get_dtype(array_or_dtype) # Validate the dtype.
                array = np.zeros(shape, dtype=self.dtype) # Make a zeros array.

            else: # Invalid input.
                raise TypeError('Input variable must be a numpy array or dtype')

            # Prepare array to be scattered.
            array = [array[r[0]:r[1],:,:] for r in all_x_ranges]

        else:
            array = None

        array = comm.scatter(array)
        self._get_dtype(array.dtype.type)

#         # Narrow down the array to local x_range.
#         array = array[xr[0]:xr[1],:,:]

        # Add padding to array, if needed.
        self._xlap = x_overlap
        if self._xlap != 0:
            padding = np.empty((self._xlap,) + shape[1:3], dtype=array.dtype)
            array = np.concatenate((padding, array, padding), axis=0)

        self.to_gpu(array) # Load onto device.

        # Determine information needed for synchronization.
        if self._xlap != 0:
            # Calculates the pointer to the x offset in a grid.
            ptr_dx = lambda x_pos: self.data.ptr + self.data.dtype.itemsize * \
                                        x_pos * shape[1] * shape[2]
            
            # Pointers to different sections of the grid that are relevant
            # for synchronization.
            self._sync_ptrs = { 'forw_src': ptr_dx(xr[1]-xr[0]), \
                                'back_dest': ptr_dx(0), \
                                'back_src': ptr_dx(self._xlap), \
                                'forw_dest': ptr_dx(xr[1]-xr[0] + self._xlap)}

            # Buffers used during synchronization.
            self._sync_buffers = [drv.pagelocked_empty( \
                                    (self._xlap, shape[1], shape[2]), \
                                    self.dtype) for k in range(4)]

            # Streams used during synchronization.
            self._sync_streams = [drv.Stream() for k in range(4)]

            # Used to identify neighboring MPI nodes with whom to synchronize.
            self._sync_adj = get_space_info()['mpi_adj']

            # Offset in bytes to the true start of the grid.
            # This is used to "hide" overlap areas from the kernel.
            self._xlap_offset = self.data.dtype.itemsize * \
                                self._xlap * shape[1] * shape[2]

            self.synchronize() # Synchronize the grid.
            comm.Barrier() # Wait for all grids to synchronize before proceeding.
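
The ptr_dx arithmetic above is plain byte offsetting into a C-ordered array:
each x slice is a contiguous run of shape[1] * shape[2] elements. A minimal
numpy illustration of the same offset computation, no GPU required:

# Sketch: byte address of the x-th y-z plane in a C-ordered array.
import numpy as np

shape = (8, 4, 4)  # Hypothetical local (x, y, z) extents.
a = np.zeros(shape, dtype=np.float32)

def ptr_dx(x_pos):
    """ Byte address of a[x_pos, 0, 0], mirroring the GPU pointer math. """
    return a.ctypes.data + a.dtype.itemsize * x_pos * shape[1] * shape[2]

# The offset of slice 3 is three full y-z planes of 4-byte floats.
assert ptr_dx(3) - a.ctypes.data == 3 * 4 * 4 * 4
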
Example #12
 def test_ecc_disabled(self):
     """ Make sure ECC is disabled. """
     space.initialize_space((100, 2, 3))
     self.assertFalse(space.get_space_info()['ecc_enabled'], \
         'ECC enabled! Should be disabled for best performance.')
Example #13
    def __init__(self, code, *vars, **kwargs):
        """ Prepare a cuda function that will execute on the GCE space.

        Input variables:
        code -- The looped cuda code to be executed.
        vars -- (name, gce_type, numpy_type) of the input arguments.

        Keyword variables:
        pre_loop -- Cuda code that is executed before the loop code.
        shape_filter -- Can be either 'all', 'skinny', or 'square'.
        padding -- (yn, yp, zn, zp), describes the number of "extra" threads
            to be run on the border of each thread block.
        smem_per_thread -- Number of bytes of shared memory needed by a thread.
        """

        # Make sure there are no extraneous keyword arguments.
        if any([key not in \
                ('pre_loop', 'shape_filter', 'padding', 'smem_per_thread') 
                for key in kwargs.keys()]):
            raise TypeError('Invalid key used.')

        # Process keyword arguments.
        pre_code = kwargs.get('pre_loop', '') 
        shape_filter = kwargs.get('shape_filter', 'skinny')
        padding = kwargs.get('padding', (0,0,0,0))
        smem_per_thread = kwargs.get('smem_per_thread', 0)

        # Dictionary for conversion from numpy to cuda types.
        cuda_types = {np.float32: 'float', np.float64: 'double', \
                        np.complex64: 'pycuda::complex<float>', \
                        np.complex128: 'pycuda::complex<double>'}
        # Dictionary for conversion from numpy to alternate type for Consts.
        alt_types = {np.float32: 'float', np.float64: 'double', \
                        np.complex64: 'float2', np.complex128: 'double2'}

        # Process vars.
        params = [{'name': v[0], \
                'gce_type': v[1], \
                'dtype': v[2], \
                'cuda_type': cuda_types[v[2]]} for v in vars]
#         for k in range(len(params)): # We need size information for consts.
#             if params[k]['gce_type'] is 'const':
#                 params[k]['num_elems'] = vars[k][3]
#                 params[k]['alt_type'] = alt_types[params[k]['dtype']]

        # Get the template and render it using jinja2.
        shape = get_space_info()['shape'] # Shape of the space.
        template = _jinja_env.get_template(_template_file)
        cuda_source = template.render(  params=params, \
                                        padding=padding, \
                                        dims=get_space_info()['shape'], \
                                        x_range=get_space_info()['x_range'], \
                                        preloop_code=pre_code, \
                                        loop_code=code, \
                                        flat_tag='_f')

        # Write out the source code for debugging purposes.
        with open('/tmp/gce_kernel.cu', 'w') as f:
            f.write(cuda_source)

        # Compile the code into a callable cuda function.
        mod = compiler.SourceModule(cuda_source)
        # mod = compiler.SourceModule(cuda_source, options=['-Xptxas', '-dlcm=cg']) # Global skips L1 cache.
        self.fun = mod.get_function('_gce_kernel')

        # Prefer 48KB of L1 cache when possible.
        self.fun.set_cache_config(drv.func_cache.PREFER_L1)

        # Get address of global variable in module.
        # Note: contains a work-around for problems with complex types.
        my_get_global = lambda name: mod.get_global('_' + name + '_temp') 

        # Useful information about the kernel.
        self._kernel_info = {'max_threads': self.fun.max_threads_per_block, \
                            'const_bytes': self.fun.const_size_bytes, \
                            'local_bytes': self.fun.local_size_bytes, \
                            'num_regs': self.fun.num_regs}

        # Get some valid execution configurations. 
        self.exec_configs = self._get_exec_configs( \
                                        self.fun.max_threads_per_block, \
                                        padding, smem_per_thread, shape_filter)

        # Prepare the function by telling pycuda the types of the inputs.
        arg_types = []
        for p in params:
            if p['gce_type'] == 'number':
                arg_types.append(p['dtype'])
#             elif p['gce_type'] is 'const':
#                 arg_types.append(p['dtype'])
#                 # pass # Consts don't actually get passed in.
            else:
                arg_types.append(np.intp)
        self.fun.prepare([np.int32, np.int32] + arg_types)

        # Define the function which we will use to execute the kernel.
        # TODO: Make a shortcut version with lower overhead.
        # Used for asynchronous execution and timing.
        stream = drv.Stream()
        start, start2, pad_done, sync_done, comp_done, all_done = \
            [drv.Event() for k in range(6)]

        # Kernel execution over a range of x-values.
        def execute_range(x_start, x_end, gpu_params, cfg, stream):
            """ Defines asynchronous kernel execution for a range of x. """
            self.fun.prepared_async_call( \
                cfg['grid_shape'][::-1], \
                cfg['block_shape'][::-1] + (1,), \
                stream, \
                *([np.int32(x_start), np.int32(x_end)] + gpu_params), \
                shared_size=cfg['smem_size'])

        x_start, x_end = get_space_info()['x_range'] # This node's range.

        def execute(cfg, *args, **kwargs):

            # Parse keyword arguments.
            post_sync_grids = kwargs.get('post_sync', None)

            # Parse the inputs.
            gpu_params = []
            for k in range(len(params)):
                if params[k]['gce_type'] == 'number':
                    gpu_params.append(params[k]['dtype'](args[k]))
                elif params[k]['gce_type'] == 'const': # Load Const.
                    gpu_params.append(args[k].data.ptr)
                    # Const no longer actually "const" in cuda code.
#                     d_ptr, size_in_bytes = my_get_global(params[k]['name'])
#                     drv.memcpy_dtod(d_ptr, args[k].data.gpudata, size_in_bytes)
                elif params[k]['gce_type'] == 'grid':
                    if args[k]._xlap == 0:
                        gpu_params.append(args[k].data.ptr)
                    else:
                        gpu_params.append(args[k].data.ptr + \
                                            args[k]._xlap_offset)
                elif params[k]['gce_type'] == 'out':
                    args[k].data.fill(args[k].dtype(0)) # Initialize the Out.
                    gpu_params.append(args[k].data.ptr)
                else:
                    raise TypeError('Invalid input type.')

            # See if we need to synchronize grids after kernel execution.
            if post_sync_grids is None:
                sync_pad = 0
            else:
                sync_pad = max([g._xlap for g in post_sync_grids])

            start2.record(stream)
            comm.Barrier()
            start.record(stream)
            
            # Execute kernel in padded regions first.
            execute_range(x_start, x_start + sync_pad, gpu_params, cfg, stream)
            execute_range(x_end - sync_pad, x_end, gpu_params, cfg, stream)
            pad_done.record(stream) # Just for timing purposes.
            stream.synchronize() # Wait for execution to finish.

            # Begin kernel execution in remaining "core" region.
            execute_range(x_start + sync_pad, x_end - sync_pad, gpu_params, cfg, stream)
            comp_done.record(stream) # Timing only.

            # While core kernel is executing, perform synchronization.
            if post_sync_grids is not None: # Synchronization needed.
                for grid in post_sync_grids:
                    grid.synchronize_start() # Start synchronization.

                # Keep on checking until everything is done.
                while not (all([grid.synchronize_isdone() \
                                for grid in post_sync_grids]) and \
                        stream.is_done()):
                    pass

            else: # Nothing to synchronize.
                stream.synchronize() # Just wait for execution to finish.

            sync_done.record() # Timing.

            # Obtain the result for all Outs.
            batch_reduce(*[args[k] for k in range(len(params)) \
                                if params[k]['gce_type'] == 'out'])
            all_done.record() # Timing.
            all_done.synchronize()

            # The delay between sync_done and comp_done should be small.
            # Otherwise, the parallelization efficiency is suffering.
            print "(%d)" % comm.Get_rank(),
            for milliseconds in [event_done.time_since(start) for event_done in \
                            (start2, pad_done, sync_done, comp_done, all_done)]:
                print "%1.4f " % milliseconds,
            print cfg['block_shape']

            return comp_done.time_since(start) # Return time needed to execute the function.
        
        self.execute = execute # Save execution function in Kernel instance.
        self.min_exec_time = float('inf') # Stores the fastest execution time.
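
Since execute() returns the kernel time and min_exec_time starts at infinity,
the intended usage appears to be auto-tuning over the candidate
configurations. A hedged sketch of such a selection loop, assuming a Kernel
instance k and an argument list args that matches its parameter list:

# Hypothetical auto-tuning loop over the configurations found earlier.
best_cfg = None
for cfg in k.exec_configs:
    t = k.execute(cfg, *args)  # Kernel execution time in milliseconds.
    if t < k.min_exec_time:
        k.min_exec_time, best_cfg = t, cfg
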
Example #14
    def __init__(self, code, *vars, **kwargs):
        """ Prepare a cuda function that will execute on the GCE space.

        Input variables:
        code -- The looped cuda code to be executed.
        vars -- (name, gce_type, numpy_type) of the input arguments.

        Keyword variables:
        pre_loop -- Cuda code that is executed before the loop code.
        shape_filter -- Can be either 'all', 'skinny', or 'square'.
        padding -- (yn, yp, zn, zp), describes the number of "extra" threads
            to be run on the border of each thread block.
        smem_per_thread -- Number of bytes of shared memory needed by a thread.
        """

        # Make sure there are no extraneous keyword arguments.
        if any([key not in \
                ('pre_loop', 'shape_filter', 'padding', 'smem_per_thread')
                for key in kwargs.keys()]):
            raise TypeError('Invalid key used.')

        # Process keyword arguments.
        pre_code = kwargs.get('pre_loop', '')
        shape_filter = kwargs.get('shape_filter', 'skinny')
        padding = kwargs.get('padding', (0, 0, 0, 0))
        smem_per_thread = kwargs.get('smem_per_thread', 0)

        # Dictionary for conversion from numpy to cuda types.
        cuda_types = {np.float32: 'float', np.float64: 'double', \
                        np.complex64: 'pycuda::complex<float>', \
                        np.complex128: 'pycuda::complex<double>'}
        # Dictionary for conversion from numpy to alternate type for Consts.
        alt_types = {np.float32: 'float', np.float64: 'double', \
                        np.complex64: 'float2', np.complex128: 'double2'}

        # Process vars.
        params = [{'name': v[0], \
                'gce_type': v[1], \
                'dtype': v[2], \
                'cuda_type': cuda_types[v[2]]} for v in vars]
        #         for k in range(len(params)): # We need size information for consts.
        #             if params[k]['gce_type'] is 'const':
        #                 params[k]['num_elems'] = vars[k][3]
        #                 params[k]['alt_type'] = alt_types[params[k]['dtype']]

        # Get the template and render it using jinja2.
        shape = get_space_info()['shape']  # Shape of the space.
        template = _jinja_env.get_template(_template_file)
        cuda_source = template.render(  params=params, \
                                        padding=padding, \
                                        dims=get_space_info()['shape'], \
                                        x_range=get_space_info()['x_range'], \
                                        preloop_code=pre_code, \
                                        loop_code=code, \
                                        flat_tag='_f')

        # Write out the source code for debugging purposes.
        with open('/tmp/gce_kernel.cu', 'w') as f:
            f.write(cuda_source)

        # Compile the code into a callable cuda function.
        mod = compiler.SourceModule(cuda_source)
        # mod = compiler.SourceModule(cuda_source, options=['-Xptxas', '-dlcm=cg']) # Global skips L1 cache.
        self.fun = mod.get_function('_gce_kernel')

        # Prefer 48KB of L1 cache when possible.
        self.fun.set_cache_config(drv.func_cache.PREFER_L1)

        # Get address of global variable in module.
        # Note: contains a work-around for problems with complex types.
        my_get_global = lambda name: mod.get_global('_' + name + '_temp')

        # Useful information about the kernel.
        self._kernel_info = {'max_threads': self.fun.max_threads_per_block, \
                            'const_bytes': self.fun.const_size_bytes, \
                            'local_bytes': self.fun.local_size_bytes, \
                            'num_regs': self.fun.num_regs}

        # Get some valid execution configurations.
        self.exec_configs = self._get_exec_configs( \
                                        self.fun.max_threads_per_block, \
                                        padding, smem_per_thread, shape_filter)

        # Prepare the function by telling pycuda the types of the inputs.
        arg_types = []
        for p in params:
            if p['gce_type'] == 'number':
                arg_types.append(p['dtype'])
#             elif p['gce_type'] is 'const':
#                 arg_types.append(p['dtype'])
#                 # pass # Consts don't actually get passed in.
            else:
                arg_types.append(np.intp)
        self.fun.prepare([np.int32, np.int32] + arg_types)

        # Define the function which we will use to execute the kernel.
        # TODO: Make a shortcut version with lower overhead.
        # Used for asynchronous execution and timing.
        stream = drv.Stream()
        start, start2, pad_done, sync_done, comp_done, all_done = \
            [drv.Event() for k in range(6)]

        # Kernel execution over a range of x-values.
        def execute_range(x_start, x_end, gpu_params, cfg, stream):
            """ Defines asynchronous kernel execution for a range of x. """
            self.fun.prepared_async_call( \
                cfg['grid_shape'][::-1], \
                cfg['block_shape'][::-1] + (1,), \
                stream, \
                *([np.int32(x_start), np.int32(x_end)] + gpu_params), \
                shared_size=cfg['smem_size'])

        x_start, x_end = get_space_info()['x_range']  # This node's range.

        def execute(cfg, *args, **kwargs):

            # Parse keyword arguments.
            post_sync_grids = kwargs.get('post_sync', None)

            # Parse the inputs.
            gpu_params = []
            for k in range(len(params)):
                if params[k]['gce_type'] == 'number':
                    gpu_params.append(params[k]['dtype'](args[k]))
                elif params[k]['gce_type'] == 'const':  # Load Const.
                    gpu_params.append(args[k].data.ptr)
                    # Const no longer actually "const" in cuda code.
#                     d_ptr, size_in_bytes = my_get_global(params[k]['name'])
#                     drv.memcpy_dtod(d_ptr, args[k].data.gpudata, size_in_bytes)
                elif params[k]['gce_type'] == 'grid':
                    if args[k]._xlap == 0:
                        gpu_params.append(args[k].data.ptr)
                    else:
                        gpu_params.append(args[k].data.ptr + \
                                            args[k]._xlap_offset)
                elif params[k]['gce_type'] == 'out':
                    args[k].data.fill(args[k].dtype(0))  # Initialize the Out.
                    gpu_params.append(args[k].data.ptr)
                else:
                    raise TypeError('Invalid input type.')

            # See if we need to synchronize grids after kernel execution.
            if post_sync_grids is None:
                sync_pad = 0
            else:
                sync_pad = max([g._xlap for g in post_sync_grids])

            start2.record(stream)
            comm.Barrier()
            start.record(stream)

            # Execute kernel in padded regions first.
            execute_range(x_start, x_start + sync_pad, gpu_params, cfg, stream)
            execute_range(x_end - sync_pad, x_end, gpu_params, cfg, stream)
            pad_done.record(stream)  # Just for timing purposes.
            stream.synchronize()  # Wait for execution to finish.

            # Begin kernel execution in remaining "core" region.
            execute_range(x_start + sync_pad, x_end - sync_pad, gpu_params,
                          cfg, stream)
            comp_done.record(stream)  # Timing only.

            # While core kernel is executing, perform synchronization.
            if post_sync_grids is not None:  # Synchronization needed.
                for grid in post_sync_grids:
                    grid.synchronize_start()  # Start synchronization.

                # Keep on checking until everything is done.
                while not (all([grid.synchronize_isdone() \
                                for grid in post_sync_grids]) and \
                        stream.is_done()):
                    pass

            else:  # Nothing to synchronize.
                stream.synchronize()  # Just wait for execution to finish.

            sync_done.record()  # Timing.

            # Obtain the result for all Outs.
            batch_reduce(*[args[k] for k in range(len(params)) \
                                if params[k]['gce_type'] == 'out'])
            all_done.record()  # Timing.
            all_done.synchronize()

            # The delay between sync_done and comp_done should be small.
            # Otherwise, the parallelization efficiency is suffering.
            print "(%d)" % comm.Get_rank(),
            for milliseconds in [event_done.time_since(start) for event_done in \
                            (start2, pad_done, sync_done, comp_done, all_done)]:
                print "%1.4f " % milliseconds,
            print cfg['block_shape']

            return comp_done.time_since(
                start)  # Return time needed to execute the function.

        self.execute = execute  # Save execution function in Kernel instance.
        self.min_exec_time = float('inf')  # Stores the fastest execution time.
Example #15
    def _get_exec_configs(self, threads_max, padding, smem_per_thread, \
                        shape_filter):
        """ Find all valid execution configurations. """

        # Padding of the kernel.
        y_pad = sum(padding[0:2])
        z_pad = sum(padding[2:4])

        # Shared memory requirements.
        smem_size = lambda b_shape: smem_per_thread * \
                                            (b_shape[0] * b_shape[1])

        # The kind of shapes that we are interested in.
        if shape_filter == 'skinny':  # Only z-dominant shapes.
            my_filter = lambda b_shape: (b_shape[0] < b_shape[1]) and \
                                        (b_shape[1] > 8) and \
                                        ((b_shape[1] % 16) == 0)
        elif shape_filter == 'square':  # Only square-ish shapes.
            my_filter = lambda b_shape: (b_shape[0] < 2 * b_shape[1]) and \
                                        (b_shape[1] < 2 * b_shape[0]) and \
                                        (b_shape[0] > 8) and \
                                        (b_shape[1] > 8)
        elif shape_filter == 'all':  # All shapes okay.
            my_filter = lambda b_shape: b_shape[1] > 1  # Must be greater than 1.
        else:
            raise TypeError('Unrecognized shape filter.')

        # Function defining valid block shapes.
        smem_max = get_space_info()['max_shared_mem']
        is_valid_shape = lambda b_shape: (smem_size(b_shape) < smem_max) and \
                                            my_filter(b_shape) and \
                                            (b_shape[0] * b_shape[1]) <= \
                                                threads_max

        # Create a list of all valid block shapes.
        valid_block_shapes = []
        z_max = get_space_info()['max_block_z']
        y_max = get_space_info()['max_block_y']
        for j in range(y_pad + 1, y_max + 1):
            for k in range(z_pad + 1, z_max + 1):
                if is_valid_shape((j, k)):
                    valid_block_shapes.append((j, k))  # Block shape is (yy,zz).

        # A hack for profiling
        # valid_block_shapes = ((31,16),)
        # valid_block_shapes = ((17,22),)

        if not valid_block_shapes:  # Make sure the list is not empty.
            raise TypeError('No valid shapes found.')

        # Create a list of all possible execution configurations.
        # Note that the convention for both block_shape and grid_shape is
        # (yy,zz). Among other things, this leads to the (slightly)
        # tricky computation of grid_shape.
        sp_shape = get_space_info()['shape']  # Shape of the space.
        return [{   'block_shape': vbs, \
                    'grid_shape': (int((sp_shape[1]-1)/(vbs[0]-y_pad)) + 1, \
                                    int((sp_shape[2]-1)/(vbs[1]-z_pad)) + 1), \
                    'smem_size': smem_size(vbs)}
                for vbs in valid_block_shapes]