def __call__(self, *args):
    """Pack ``args`` into the kernarg buffer and dispatch the kernel.

    When ``self.stream`` is ``None``, ``hsa.implicit_sync()`` is called
    before the dispatch and the kernarg buffer is freed right after it.
    Otherwise a barrier on the stream's last signal is inserted into the
    queue, a fresh signal is passed to the dispatch and then added to the
    stream, and the free is registered as a stream callback instead.
    """
    self._sentry_resource_limit()
    ctx, symbol, kernargs, kernarg_region = self.bind()

    # Flatten the Python arguments into ctypes values; ``writebacks``
    # collects callables that are run after the dispatch call below.
    flat_values = []
    writebacks = []
    for arg_type, arg in zip(self.argument_types, args):
        _unpack_argument(arg_type, arg, flat_values, writebacks)

    # Store every value into the kernarg buffer, padding the running
    # offset by whatever _calc_padding_for_alignment reports for the
    # value's size.
    offset = 0
    for value in flat_values:
        size = ctypes.sizeof(value)
        offset += _calc_padding_for_alignment(size, offset)
        slot = ctypes.cast(kernargs.value + offset,
                           ctypes.POINTER(type(value)))
        slot[0] = value
        offset += size

    # Prepare the launch on the context's default queue.
    queue = ctx.default_queue
    stream = self.stream
    if stream is None:
        hsa.implicit_sync()
        completion = None
    else:
        completion = hsa.create_signal(1)
        queue.insert_barrier(stream._get_last_signal())

    queue.dispatch(
        symbol,
        kernargs,
        workgroup_size=self.local_size,
        grid_size=self.global_size,
        signal=completion,
    )
    if stream is not None:
        stream._add_signal(completion)

    # Retrieve auto-converted arrays.
    for writeback in writebacks:
        writeback()

    # Release the kernarg buffer: right away without a stream, otherwise
    # through a callback handed to the stream.
    if kernargs is None:
        return
    if stream is None:
        kernarg_region.free(kernargs)
    else:
        stream._add_callback(lambda: kernarg_region.free(kernargs))
def __call__(self, *args):
    """Pack ``args`` into the kernarg buffer and dispatch the kernel.

    The values produced by ``_unpack_argument`` are written one after
    another into ``kernargs`` (as returned by ``self.bind()``), each at an
    offset padded according to ``_calc_padding_for_alignment``. The kernel
    is then dispatched on the context's default queue with
    ``self.local_size`` / ``self.global_size``.

    The kernarg buffer is always handed back to ``kernarg_region``, even
    when unpacking, packing or the dispatch itself raises.

    Raises:
        AssertionError: if the packed arguments do not fit into
            ``ctypes.sizeof(kernargs)`` bytes. The check runs *before*
            each write, so nothing is stored past the end of the buffer.
    """
    ctx, symbol, kernargs, kernarg_region = self.bind()
    try:
        # Unpack pyobject values into ctypes scalar values
        expanded_values = []
        for ty, val in zip(self.argument_types, args):
            _unpack_argument(ty, val, expanded_values)

        # Insert kernel arguments
        capacity = ctypes.sizeof(kernargs)
        base = 0
        for av in expanded_values:
            # Adjust for alignment
            size = ctypes.sizeof(av)
            base += _calc_padding_for_alignment(size, base)

            # Validate before touching memory: the store below goes
            # through a raw pointer and would otherwise overrun the
            # buffer. Raised explicitly (same type and message as the
            # former ``assert``) so the check survives ``python -O``.
            if base + size > capacity:
                raise AssertionError("Kernel argument size is invalid")

            # Move to offset and assign value
            offseted = ctypes.addressof(kernargs) + base
            asptr = ctypes.cast(offseted, ctypes.POINTER(type(av)))
            asptr[0] = av

            # Increment offset
            base += size

        # Actual kernel launch
        qq = ctx.default_queue
        qq.dispatch(
            symbol,
            kernargs,
            workgroup_size=self.local_size,
            grid_size=self.global_size,
        )
    finally:
        # Free kernel region on success and on failure so the
        # allocation is not leaked when an exception propagates.
        kernarg_region.free(kernargs)