def lower_parfor_rollback(lowerer, parfor):
    """Attempt to lower *parfor* as a gufunc offloaded to the current SYCL
    device, falling back to default CPU parallelization on failure.

    On success (and with ``config.DEBUG`` set) the target device's filter
    string is printed. On any failure a ``NumbaPerformanceWarning`` is
    emitted describing the fallback, and the original exception is
    re-raised so the caller can perform the actual CPU rollback.
    """
    try:
        _lower_parfor_gufunc(lowerer, parfor)
        if config.DEBUG:
            device_filter_str = (
                dpctl.get_current_queue().get_sycl_device().filter_string)
            msg = "Parfor offloaded to " + device_filter_str
            print(msg, parfor.loc)
    except Exception:
        device_filter_str = (
            dpctl.get_current_queue().get_sycl_device().filter_string)
        msg = (
            "Failed to offload parfor to " + device_filter_str + ". Falling "
            "back to default CPU parallelization. Please file a bug report "
            "at https://github.com/IntelPython/numba-dppy. To help us debug "
            "the issue, please add the traceback to the bug report.")
        if not config.DEBUG:
            msg += " Set the environment variable NUMBA_DPPY_DEBUG to 1 to "
            msg += "generate a traceback."
        warnings.warn(NumbaPerformanceWarning(msg, parfor.loc))
        # Bare `raise` re-raises the active exception with its original
        # traceback intact (clearer than `except ... as e: raise e`).
        raise
def auto_device(obj, stream=0, copy=True, user_explicit=False):
    """
    Create a DeviceRecord or DeviceArray like obj and optionally copy data
    from host to device. If obj already represents device memory, it is
    returned and no copy is made.

    Returns a ``(devobj, new_alloc)`` tuple where ``new_alloc`` indicates
    whether a new device allocation (and therefore a potential copy) was
    made for *obj*.
    """
    if _driver.is_device_memory(obj):
        # Already device memory: pass through untouched.
        return obj, False
    elif hasattr(obj, '__cuda_array_interface__'):
        # Foreign device array (e.g. CuPy): wrap without copying.
        return numba.cuda.as_cuda_array(obj), False
    else:
        if isinstance(obj, np.void):
            devobj = from_record_like(obj, stream=stream)
        else:
            # This allows you to pass non-array objects like constants and
            # objects implementing the array interface
            # https://docs.scipy.org/doc/numpy-1.13.0/reference/arrays.interface.html
            # into this function (with no overhead -- copies -- for `obj`s
            # that are already `ndarray`s.
            #
            # np.asanyarray copies only when necessary and preserves
            # ndarray subclasses; unlike np.array(obj, copy=False,
            # subok=True), it does not raise under NumPy 2.0 when a copy
            # is required.
            obj = np.asanyarray(obj)
            sentry_contiguous(obj)
            devobj = from_array_like(obj, stream=stream)
        if copy:
            if config.CUDA_WARN_ON_IMPLICIT_COPY:
                # Only warn about implicit host->device copies the user did
                # not explicitly request.
                if (not user_explicit and
                        (not isinstance(obj, DeviceNDArray)
                         and isinstance(obj, np.ndarray))):
                    msg = ("Host array used in CUDA kernel will incur "
                           "copy overhead to/from device.")
                    warn(NumbaPerformanceWarning(msg))
            devobj.copy_to_device(obj, stream=stream)
        return devobj, True
def matmul_typer(self, a, b, out=None):
    """
    Typer function for Numpy matrix multiplication.

    Validates the operand types/dimensionalities and returns the result
    type (an array type, or a scalar dtype for vector * vector). Returns
    ``None`` when *a* or *b* is not an array so other typers can apply.
    """
    if not isinstance(a, types.Array) or not isinstance(b, types.Array):
        return
    if not all(x.ndim in (1, 2) for x in (a, b)):
        raise TypingError("%s only supported on 1-D and 2-D arrays"
                          % (self.func_name, ))
    # Output dimensionality
    ndims = set([a.ndim, b.ndim])
    if ndims == set([2]):
        # M * M
        out_ndim = 2
    elif ndims == set([1, 2]):
        # M * V and V * M
        out_ndim = 1
    elif ndims == set([1]):
        # V * V
        out_ndim = 0

    if out is not None:
        # Validate that `out` is an array *before* reading `out.ndim`,
        # otherwise a non-array argument raises AttributeError instead of
        # the intended TypeError.
        if not isinstance(out, types.Array) or out.layout != "C":
            raise TypeError("output must be a C-contiguous array")
        if out_ndim == 0:
            raise TypeError(
                "explicit output unsupported for vector * vector")
        elif out.ndim != out_ndim:
            raise TypeError("explicit output has incorrect dimensionality")
        all_args = (a, b, out)
    else:
        all_args = (a, b)

    if not (config.DISABLE_PERFORMANCE_WARNINGS
            or all(x.layout in "CF" for x in (a, b))):
        msg = "%s is faster on contiguous arrays, called on %s" % (
            self.func_name,
            (a, b),
        )
        warnings.warn(NumbaPerformanceWarning(msg))
    if not all(x.dtype == a.dtype for x in all_args):
        raise TypingError("%s arguments must all have the same dtype"
                          % (self.func_name, ))
    if not isinstance(a.dtype, (types.Float, types.Complex)):
        raise TypingError("%s only supported on float and complex arrays"
                          % (self.func_name, ))
    # `is not None` rather than truthiness: type objects should not be
    # relied on for boolean coercion.
    if out is not None:
        return out
    elif out_ndim > 0:
        return types.Array(a.dtype, out_ndim, "C")
    else:
        return a.dtype
def __init__(self, dispatcher, griddim, blockdim, stream, sharedmem):
    """Record the launch configuration and, when enabled, warn about
    grids too small to occupy the device (fewer than 2 blocks per SM).
    """
    self.dispatcher = dispatcher
    self.griddim = griddim
    self.blockdim = blockdim
    self.stream = stream
    self.sharedmem = sharedmem

    if config.CUDA_LOW_OCCUPANCY_WARNINGS:
        device_sms = get_context().device.MULTIPROCESSOR_COUNT
        # Heuristic threshold: two resident blocks per SM.
        threshold = 2 * device_sms
        nblocks = griddim[0] * griddim[1] * griddim[2]
        if nblocks < threshold:
            template = ("Grid size ({grid}) < 2 * SM count ({sm}) "
                        "will likely result in GPU under utilization due "
                        "to low occupancy.")
            warn(NumbaPerformanceWarning(
                template.format(grid=nblocks, sm=threshold)))
def __init__(self, dispatcher, griddim, blockdim, stream, sharedmem):
    """Record the launch configuration and, when enabled, warn when the
    grid is small enough to likely under-utilize the GPU.
    """
    self.dispatcher = dispatcher
    self.griddim = griddim
    self.blockdim = blockdim
    self.stream = stream
    self.sharedmem = sharedmem

    if config.CUDA_LOW_OCCUPANCY_WARNINGS:
        # Warn when the grid has fewer than 128 blocks. This number is
        # chosen somewhat heuristically - ideally the minimum is 2 times
        # the number of SMs, but the number of SMs varies between devices -
        # some very small GPUs might only have 4 SMs, but an H100-SXM5 has
        # 132. In general kernels should be launched with large grids
        # (hundreds or thousands of blocks), so warning when fewer than 128
        # blocks are used will likely catch most beginner errors, where the
        # grid tends to be very small (single-digit or low tens of blocks).
        nblocks = griddim[0] * griddim[1] * griddim[2]
        if nblocks < 128:
            warn(NumbaPerformanceWarning(
                f"Grid size {nblocks} will likely result in GPU "
                "under-utilization due to low occupancy."))