Exemple #1
0
    def execute(self, repeat=1, unbind=True):
        """Launch every entry of ``self.kernels`` ``repeat`` times.

        On each pass ``drv.memset_d32_async`` is first called with
        ``self.bsum_zero`` as its arguments when that attribute is truthy.
        Every entry of ``self.kernels`` is then dispatched: element 0 is
        handed to ``kernel_specs.get_kernel`` and the remaining elements
        become the positional arguments of ``prepared_async_call``, along
        with ``shared_size=self.shared``.

        Args:
            repeat: Number of passes to run; ``0`` launches nothing.
            unbind: When true, set ``self.bsum_zero`` to ``None`` and
                overwrite slots 3 through 10 of every kernel parameter
                list with ``None`` once all passes have been issued.
        """
        for _ in range(repeat):
            zero_args = self.bsum_zero
            if zero_args:
                drv.memset_d32_async(*zero_args)

            for params in self.kernels:
                kernel_specs.get_kernel(params[0]).prepared_async_call(
                    *params[1:], shared_size=self.shared)

        if not unbind:
            return

        # NOTE(review): presumably this drops references to the bound
        # buffers so they can be released -- confirm against the bind code.
        self.bsum_zero = None
        for params in self.kernels:
            params[3:11] = [None] * 8
    def execute(self, repeat=1, unbind=True):
        """Issue the stored kernel launches, optionally unbinding afterwards.

        One pass consists of:

        * ``drv.memset_d32_async(*self.bsum_zero)`` if ``self.bsum_zero``
          is truthy;
        * for each list in ``self.kernels``, a lookup of its first element
          through ``kernel_specs.get_kernel`` followed by
          ``prepared_async_call`` on the result, passing the rest of the
          list positionally and ``self.shared`` as ``shared_size``.

        Args:
            repeat: How many passes to issue.
            unbind: If true, ``self.bsum_zero`` becomes ``None`` and
                indices 3..10 of each kernel parameter list are replaced
                with ``None`` after the final pass.
        """
        for _ in range(repeat):
            if self.bsum_zero:
                drv.memset_d32_async(*self.bsum_zero)

            for launch_spec in self.kernels:
                kernel_specs.get_kernel(launch_spec[0]).prepared_async_call(
                    *launch_spec[1:], shared_size=self.shared)

        if not unbind:
            return

        # NOTE(review): the cleared slots presumably hold per-call buffer
        # arguments -- verify which positions the bind step fills.
        self.bsum_zero = None
        cleared = [None] * 8
        for launch_spec in self.kernels:
            launch_spec[3:11] = cleared
Exemple #3
0
    def execute(self, repeat=1, unbind=True):
        """Run the stored kernel sequence ``repeat`` times.

        Each pass performs, in order:

        1. ``drv.memset_d32_async(*self.zero_args)`` unless ``self.determ``
           is truthy;
        2. for every entry of ``self.kernels``, a lookup of element 0 via
           ``kernel_specs.get_kernel`` and a ``prepared_async_call`` with
           the remaining elements as positional arguments;
        3. ``_fp_convert(*self.convert_args)`` when ``self.convert_args``
           is truthy.

        Args:
            repeat: Number of passes to issue; ``0`` launches nothing.
            unbind: When true, reset ``self.zero_args`` and
                ``self.convert_args`` to ``None`` and overwrite slots 3
                through 7 of every kernel parameter list with ``None``
                after the last pass.
        """
        for _ in range(repeat):
            if not self.determ:
                drv.memset_d32_async(*self.zero_args)

            for params in self.kernels:
                kernel_specs.get_kernel(params[0]).prepared_async_call(
                    *params[1:])

            convert_args = self.convert_args
            if convert_args:
                _fp_convert(*convert_args)

        if not unbind:
            return

        # NOTE(review): presumably releases references to bound buffers;
        # a later execute() would need them rebound first -- confirm.
        self.zero_args = None
        self.convert_args = None
        for params in self.kernels:
            params[3:8] = [None] * 5
Exemple #4
0
    def execute(self, repeat=1, unbind=True):
        """Issue the kernel launches and, if requested, unbind arguments.

        A single pass calls ``drv.memset_d32_async`` with ``self.zero_args``
        when ``self.determ`` is falsy, then launches each entry of
        ``self.kernels`` (first element resolved through
        ``kernel_specs.get_kernel``, the rest passed to
        ``prepared_async_call``), and finally calls ``_fp_convert`` with
        ``self.convert_args`` when that attribute is truthy.

        Args:
            repeat: How many passes to issue.
            unbind: If true, ``self.zero_args`` and ``self.convert_args``
                are set to ``None`` and indices 3..7 of each kernel
                parameter list are replaced with ``None`` afterwards.
        """
        for _ in range(repeat):
            if not self.determ:
                drv.memset_d32_async(*self.zero_args)

            for launch_spec in self.kernels:
                kernel_specs.get_kernel(launch_spec[0]).prepared_async_call(
                    *launch_spec[1:])

            if self.convert_args:
                _fp_convert(*self.convert_args)

        if not unbind:
            return

        # NOTE(review): the cleared slots presumably hold per-call buffer
        # arguments -- verify which positions the bind step fills.
        self.zero_args = None
        self.convert_args = None
        cleared = [None] * 5
        for launch_spec in self.kernels:
            launch_spec[3:8] = cleared
Exemple #5
0
    def execute(self, repeat=1, unbind=True):
        """Run the shuffle kernel followed by the main kernel, ``repeat`` times.

        Both kernels are resolved once up front: the shuffle kernel through
        ``_get_transpose_kernel(self.dtype_str)`` and the main kernel
        through ``kernel_specs.get_kernel(self.kernel[0])``. Each pass then
        calls ``drv.memset_d8_async(*self.zero_args)`` when ``self.beta``
        is falsy, launches the shuffle kernel with ``self.shuffle_args``
        and launches the main kernel with ``self.kernel[1:]``.

        Args:
            repeat: Number of passes to issue; ``0`` launches nothing.
            unbind: When true, set ``self.zero_args`` to ``None`` and
                overwrite ``self.shuffle_args[2:5]`` and
                ``self.kernel[3:8]`` with ``None`` after the last pass.
        """
        shuffle_fn = _get_transpose_kernel(self.dtype_str)
        main_fn = kernel_specs.get_kernel(self.kernel[0])

        for _ in range(repeat):
            # let atomic adds accumulate on top
            if not self.beta:
                drv.memset_d8_async(*self.zero_args)

            shuffle_fn.prepared_async_call(*self.shuffle_args)
            main_fn.prepared_async_call(*self.kernel[1:])

        if not unbind:
            return

        # NOTE(review): presumably releases references to bound buffers so
        # they can be freed -- confirm against the bind code.
        self.zero_args = None
        self.shuffle_args[2:5] = [None] * 3
        self.kernel[3:8] = [None] * 5
Exemple #6
0
    def execute(self, repeat=1, unbind=True):
        """Issue the shuffle and main kernel launches, then optionally unbind.

        Before looping, the shuffle kernel is obtained from
        ``_get_transpose_kernel(self.dtype_str)`` and the main kernel from
        ``kernel_specs.get_kernel(self.kernel[0])``. One pass is:

        * ``drv.memset_d8_async(*self.zero_args)`` if ``self.beta`` is
          falsy;
        * ``prepared_async_call`` on the shuffle kernel with
          ``self.shuffle_args``;
        * ``prepared_async_call`` on the main kernel with every element of
          ``self.kernel`` after the first.

        Args:
            repeat: How many passes to issue.
            unbind: If true, ``self.zero_args`` becomes ``None`` and
                indices 2..4 of ``self.shuffle_args`` plus indices 3..7 of
                ``self.kernel`` are replaced with ``None`` afterwards.
        """
        transpose = _get_transpose_kernel(self.dtype_str)
        compute = kernel_specs.get_kernel(self.kernel[0])

        for _ in range(repeat):
            # let atomic adds accumulate on top
            if not self.beta:
                drv.memset_d8_async(*self.zero_args)

            transpose.prepared_async_call(*self.shuffle_args)
            compute.prepared_async_call(*self.kernel[1:])

        if not unbind:
            return

        # NOTE(review): the cleared slots presumably hold per-call buffer
        # arguments -- verify which positions the bind step fills.
        self.zero_args = None
        for target, start, count in ((self.shuffle_args, 2, 3),
                                     (self.kernel, 3, 5)):
            target[start:start + count] = [None] * count
Exemple #7
0
    def execute(self, repeat=1, unbind=True):
        """Run the shuffle kernel and every stored kernel ``repeat`` times.

        The shuffle kernel is resolved once through
        ``_get_shuffle_kernel(self.dtype_str)``. Each pass then:

        1. calls ``drv.memset_d32_async(*self.bsum_zero)`` when
           ``self.bsum_zero`` is truthy;
        2. launches the shuffle kernel with ``self.shuffle_args``;
        3. for every entry of ``self.kernels``, resolves element 0 via
           ``kernel_specs.get_kernel`` and calls ``prepared_async_call``
           with the remaining elements and ``shared_size=self.shared``.

        Args:
            repeat: Number of passes to issue; ``0`` launches nothing.
            unbind: When true, set ``self.bsum_zero`` to ``None`` and
                overwrite ``self.shuffle_args[2:5]`` and slots 3 through
                10 of every kernel parameter list with ``None`` after the
                last pass.
        """
        shuffle_fn = _get_shuffle_kernel(self.dtype_str)

        for _ in range(repeat):
            zero_args = self.bsum_zero
            if zero_args:
                drv.memset_d32_async(*zero_args)

            shuffle_fn.prepared_async_call(*self.shuffle_args)

            for params in self.kernels:
                kernel_specs.get_kernel(params[0]).prepared_async_call(
                    *params[1:], shared_size=self.shared)

        if not unbind:
            return

        # NOTE(review): presumably releases references to bound buffers so
        # they can be freed -- confirm against the bind code.
        self.bsum_zero = None
        self.shuffle_args[2:5] = [None] * 3
        for params in self.kernels:
            params[3:11] = [None] * 8
    def execute(self, repeat=1, unbind=True):
        """Issue the shuffle and kernel launches, then optionally unbind.

        ``_get_shuffle_kernel(self.dtype_str)`` supplies the shuffle kernel
        before the loop starts. A single pass calls
        ``drv.memset_d32_async`` with ``self.bsum_zero`` if that attribute
        is truthy, launches the shuffle kernel with ``self.shuffle_args``,
        and then launches each entry of ``self.kernels`` (first element
        resolved through ``kernel_specs.get_kernel``, the rest passed to
        ``prepared_async_call`` together with ``shared_size=self.shared``).

        Args:
            repeat: How many passes to issue.
            unbind: If true, ``self.bsum_zero`` becomes ``None``, indices
                2..4 of ``self.shuffle_args`` are replaced with ``None``,
                and so are indices 3..10 of each kernel parameter list.
        """
        shuffler = _get_shuffle_kernel(self.dtype_str)

        for _ in range(repeat):
            if self.bsum_zero:
                drv.memset_d32_async(*self.bsum_zero)

            shuffler.prepared_async_call(*self.shuffle_args)

            for launch_spec in self.kernels:
                kernel_specs.get_kernel(launch_spec[0]).prepared_async_call(
                    *launch_spec[1:], shared_size=self.shared)

        if not unbind:
            return

        # NOTE(review): the cleared slots presumably hold per-call buffer
        # arguments -- verify which positions the bind step fills.
        self.bsum_zero = None
        self.shuffle_args[2:5] = [None] * 3
        cleared = [None] * 8
        for launch_spec in self.kernels:
            launch_spec[3:11] = cleared