Ejemplo n.º 1
0
    def __call__(self, *args):
        """Launch the kernel with the given positional arguments.

        The call checks resource limits, binds the kernel, packs ``args``
        into the kernel-argument buffer and dispatches on the context's
        default queue.

        When no stream is configured, ``hsa.implicit_sync()`` is called
        before the dispatch and the kernel-argument buffer is freed right
        after it.  When a stream is configured, a barrier on the stream's
        last signal is inserted ahead of the dispatch, the new completion
        signal is registered with the stream, and freeing the buffer is
        deferred to a stream callback.
        """
        self._sentry_resource_limit()

        ctx, symbol, kernargs, kernarg_region = self.bind()

        # Flatten the Python-level arguments into ctypes scalars.  Actions
        # that must run once the launch has been issued (e.g. retrieving
        # auto-converted arrays) are collected in `writebacks`.
        scalars = []
        writebacks = []
        for argty, argval in zip(self.argument_types, args):
            _unpack_argument(argty, argval, scalars, writebacks)

        # Store every scalar in the kernarg buffer.  Each value is padded
        # according to its own size before being written.
        offset = 0
        for scalar in scalars:
            nbytes = ctypes.sizeof(scalar)
            offset += _calc_padding_for_alignment(nbytes, offset)
            slot = ctypes.cast(kernargs.value + offset,
                               ctypes.POINTER(type(scalar)))
            slot[0] = scalar
            offset += nbytes

        queue = ctx.default_queue
        stream = self.stream

        if stream is None:
            # No stream: synchronise implicitly and dispatch without a
            # completion signal.
            hsa.implicit_sync()
            signal = None
        else:
            # Stream: order this launch behind the stream's previous work.
            signal = hsa.create_signal(1)
            queue.insert_barrier(stream._get_last_signal())

        queue.dispatch(symbol,
                       kernargs,
                       workgroup_size=self.local_size,
                       grid_size=self.global_size,
                       signal=signal)

        if stream is not None:
            stream._add_signal(signal)

        # Run the deferred actions registered while unpacking.
        for writeback in writebacks:
            writeback()

        # Nothing to release when bind() produced no kernarg buffer.
        if kernargs is None:
            return

        if stream is None:
            kernarg_region.free(kernargs)
        else:
            # The buffer must outlive the stream's pending work, so hand
            # the release over to the stream.
            stream._add_callback(lambda: kernarg_region.free(kernargs))
Ejemplo n.º 2
0
    def __call__(self, *args):
        """Launch the kernel with the given positional arguments.

        Binds the kernel, converts ``args`` into ctypes scalar values,
        copies them into the kernel-argument buffer, dispatches on the
        context's default queue and finally frees the buffer.

        Raises:
            AssertionError: if the packed arguments need more bytes than
                ``ctypes.sizeof(kernargs)``.  The check is made *before*
                anything is written, so an oversized argument list can no
                longer write past the end of the buffer.
        """
        ctx, symbol, kernargs, kernarg_region = self.bind()

        # Unpack pyobject values into ctypes scalar values
        expanded_values = []
        for ty, val in zip(self.argument_types, args):
            _unpack_argument(ty, val, expanded_values)

        # Pass 1: work out where each value goes without touching memory.
        # Each value is padded according to its own size.
        placements = []
        base = 0
        for av in expanded_values:
            size = ctypes.sizeof(av)
            base += _calc_padding_for_alignment(size, base)
            placements.append((base, av))
            base += size

        # Validate the total size up front; checking only after the copy
        # would report the overflow after memory had already been clobbered.
        assert base <= ctypes.sizeof(
            kernargs), "Kernel argument size is invalid"

        # Pass 2: copy the values into the kernarg buffer.
        for offset, av in placements:
            address = ctypes.addressof(kernargs) + offset
            slot = ctypes.cast(address, ctypes.POINTER(type(av)))
            slot[0] = av

        # Actual Kernel launch
        qq = ctx.default_queue

        # Dispatch
        qq.dispatch(symbol, kernargs, workgroup_size=self.local_size,
                    grid_size=self.global_size)

        # Free kernel region.  Deliberately not in a ``finally``: if the
        # dispatch raises, the kernel may still be using the buffer.
        kernarg_region.free(kernargs)
Ejemplo n.º 3
0
    def __call__(self, *args):
        """Run the kernel on ``args``.

        Steps, in order: resource-limit check, bind, argument unpacking,
        copying the unpacked values into the kernel-argument buffer,
        dispatch on ``ctx.default_queue``, post-launch callbacks, and
        release of the kernel-argument buffer.

        Without a stream, ``hsa.implicit_sync()`` runs before the dispatch
        and the buffer is freed immediately afterwards.  With a stream, the
        dispatch is preceded by a barrier on the stream's last signal, its
        own signal is added to the stream, and the buffer is freed through
        a stream callback instead.
        """
        self._sentry_resource_limit()

        ctx, symbol, kernargs, kernarg_region = self.bind()

        def release_kernargs():
            # Give the kernarg buffer back to the region it came from.
            return kernarg_region.free(kernargs)

        # ctypes values to be passed to the kernel, in order
        packed = []
        # callables to invoke once the kernel has been dispatched
        post_launch = []
        for expected_type, value in zip(self.argument_types, args):
            _unpack_argument(expected_type, value, packed, post_launch)

        # Copy the packed values into the kernarg buffer one after another,
        # inserting the padding reported for each value's size.
        cursor = 0
        for item in packed:
            width = ctypes.sizeof(item)
            padding = _calc_padding_for_alignment(width, cursor)
            address = kernargs.value + cursor + padding
            ctypes.cast(address, ctypes.POINTER(type(item)))[0] = item
            cursor += padding + width

        queue = ctx.default_queue

        if self.stream is None:
            hsa.implicit_sync()

        # Only stream launches carry a completion signal.
        if self.stream is None:
            completion = None
        else:
            completion = hsa.create_signal(1)
            queue.insert_barrier(self.stream._get_last_signal())

        queue.dispatch(
            symbol,
            kernargs,
            workgroup_size=self.local_size,
            grid_size=self.global_size,
            signal=completion,
        )

        if self.stream is not None:
            self.stream._add_signal(completion)

        # Retrieve auto-converted arrays
        for callback in post_launch:
            callback()

        # Release the kernarg buffer, if bind() handed one out.
        if kernargs is not None:
            if self.stream is not None:
                self.stream._add_callback(release_kernargs)
            else:
                release_kernargs()