Example no. 1
    def execute(self, repeat=1, unbind=True):
        for r in range(repeat):
            drv.memset_d32_async(*self.zero_args)
            self.kernel.prepared_async_call(*self.launch_args)
            if self.convert_args:
                _fp_convert(*self.convert_args)

        if unbind:
            self.zero_args = self.convert_args = None
            self.launch_args[2:9] = (None,) * 7
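
All of these `execute` variants share one PyCUDA idiom: the kernel is prepared once (its argument formats fixed up front) and then launched repeatedly with `prepared_async_call`, whose leading arguments are the grid shape, the block shape, and a stream, followed by the kernel arguments themselves. The sketch below reconstructs that pattern with a throwaway kernel; the kernel source, argument layout, and sizes are illustrative assumptions, not neon's.

    import numpy as np
    import pycuda.autoinit
    import pycuda.driver as drv
    from pycuda.compiler import SourceModule

    # Illustrative kernel; the real ones in these examples come precompiled
    # from kernel_specs modules.
    mod = SourceModule("""
    __global__ void scale(float *out, float factor, int n)
    {
        int i = blockIdx.x * blockDim.x + threadIdx.x;
        if (i < n) out[i] *= factor;
    }
    """)
    kernel = mod.get_function("scale")
    kernel.prepare("Pfi")                # pointer, float, int

    n = 1024
    buf = drv.mem_alloc(n * 4)
    stream = drv.Stream()

    # Assumed launch_args layout: (grid, block, stream, *kernel_arguments).
    launch_args = [(n // 256, 1), (256, 1, 1), stream, int(buf), 2.0, n]

    drv.memset_d32_async(buf, 0, n, stream)    # like *self.zero_args above
    kernel.prepared_async_call(*launch_args)
    stream.synchronize()

Under that layout, the unbind branch's `launch_args[2:9] = (None,) * 7` nulls everything from the stream onward, presumably so the device buffers those slots reference can be freed between runs.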
Example no. 2
    def execute(self, repeat=1, unbind=True):
        print('repeat', repeat)
        for r in range(repeat):
            if self.bsum_zero:
                drv.memset_d32_async(*self.bsum_zero)
            print('calling kernel', self.kernel, 'args', self.launch_args,
                  'shared_size', self.shared)
            self.kernel.prepared_async_call(*self.launch_args,
                                            shared_size=self.shared)
        if unbind:
            self.bsum_zero = None
            self.launch_args[2:9] = (None,) * 7
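
`shared_size` sets the dynamic shared-memory byte count for the launch, the same quantity as the optional third launch-configuration parameter in CUDA C. A self-contained sketch, again with an illustrative kernel rather than one of neon's:

    import numpy as np
    import pycuda.autoinit
    import pycuda.driver as drv
    from pycuda.compiler import SourceModule

    # Illustrative block-sum kernel; scratch[] is sized at launch time
    # through the shared_size keyword.
    mod = SourceModule("""
    __global__ void block_sum(const float *x, float *out)
    {
        extern __shared__ float scratch[];
        int t = threadIdx.x;
        scratch[t] = x[blockIdx.x * blockDim.x + t];
        __syncthreads();
        for (int s = blockDim.x / 2; s > 0; s >>= 1) {
            if (t < s) scratch[t] += scratch[t + s];
            __syncthreads();
        }
        if (t == 0) out[blockIdx.x] = scratch[0];
    }
    """)
    kernel = mod.get_function("block_sum")
    kernel.prepare("PP")

    x_gpu = drv.to_device(np.ones(512, dtype=np.float32))
    out_gpu = drv.mem_alloc(2 * 4)
    stream = drv.Stream()

    # 256 threads per block -> 256 floats of dynamic shared memory per block.
    kernel.prepared_async_call((2, 1), (256, 1, 1), stream,
                               int(x_gpu), int(out_gpu),
                               shared_size=256 * 4)
    stream.synchronize()
    print(drv.from_device(out_gpu, (2,), np.float32))    # [256. 256.]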
Example no. 3
    def execute(self, repeat=1, unbind=True):

        for r in range(repeat):

            if self.bsum_zero:
                drv.memset_d32_async(*self.bsum_zero)

            for kernel_params in self.kernels:
                kernel = kernel_specs.get_kernel(kernel_params[0])
                kernel.prepared_async_call(*kernel_params[1:], shared_size=self.shared)

        if unbind:
            self.bsum_zero = None
            for kernel_params in self.kernels:
                kernel_params[3:11] = (None,) * 8
Example no. 4
    def execute(self, repeat=1, unbind=True):

        for r in range(repeat):

            if self.bsum_zero:
                drv.memset_d32_async(*self.bsum_zero)

            for kernel_params in self.kernels:
                kernel = kernel_specs.get_kernel(kernel_params[0])
                kernel.prepared_async_call(*kernel_params[1:], shared_size=self.shared)

        if unbind:
            self.bsum_zero = None
            for kernel_params in self.kernels:
                kernel_params[3:11] = (None,) * 8
Example no. 5
    def execute(self, repeat=1, unbind=True):

        shuffle_kernel = _get_shuffle_kernel(self.dtype.str[1:])

        for r in range(repeat):

            if self.bsum_zero:
                drv.memset_d32_async(*self.bsum_zero)

            shuffle_kernel.prepared_async_call(*self.shuffle_args)
            self.kernel.prepared_async_call(*self.launch_args, shared_size=self.shared)

        if unbind:
            self.bsum_zero = None
            self.shuffle_args[2:5] = (None,) * 3
            self.launch_args[2:9] = (None,) * 7
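
`self.dtype.str[1:]` strips the byte-order prefix from the numpy dtype string, leaving the kind-and-width tag that the cached shuffle kernel is keyed on (later examples pass the same thing as `self.dtype_str`). A quick host-side check:

    import numpy as np

    # dtype.str carries a byte-order prefix ('<' on little-endian machines);
    # the [1:] slice drops it, leaving the tag _get_shuffle_kernel expects.
    for dt in (np.float16, np.float32):
        s = np.dtype(dt).str
        print(s, '->', s[1:])    # '<f2' -> 'f2', '<f4' -> 'f4'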
Example no. 6
    def execute(self, repeat=1, unbind=True):
        C = self.shuffle_args[12]
        assert C >= 4, "C dim must be 4 or greater for CUDA C backprop kernel"

        shuffle_kernel = _get_shuffle_kernel(self.dtype.str[1:])

        for r in range(repeat):
            if self.bsum_zero:
                drv.memset_d32_async(*self.bsum_zero)
            shuffle_kernel.prepared_async_call(*self.shuffle_args)
            self.kernel.prepared_async_call(*self.launch_args, shared_size=self.shared)

        if unbind:
            self.bsum_zero = None
            self.shuffle_args[2:5] = (None,) * 3
            self.launch_args[2:9] = (None,) * 7
Example no. 7
    def _assign(self, value):

        stream = self.backend.stream
        if isinstance(value, (int, float)):

            # if we have a contiguous array, then use the speedy driver kernel
            if self.is_contiguous:

                value = self.dtype.type(value)

                if self.dtype.itemsize == 1:
                    drv.memset_d8_async(self.gpudata,
                                        unpack_from('B', value)[0], self.size,
                                        stream)
                elif self.dtype.itemsize == 2:
                    drv.memset_d16_async(self.gpudata,
                                         unpack_from('H', value)[0], self.size,
                                         stream)
                else:
                    drv.memset_d32_async(self.gpudata,
                                         unpack_from('I', value)[0], self.size,
                                         stream)

            # otherwise use our copy kernel
            else:
                OpTreeNode.build("assign", self, value)

        elif isinstance(value, GPUTensor):
            # TODO: add an is_binary_compat like function
            if self.is_contiguous and value.is_contiguous and self.dtype == value.dtype:
                drv.memcpy_dtod_async(self.gpudata, value.gpudata, self.nbytes,
                                      stream)
            else:
                OpTreeNode.build("assign", self, value)

        # collapse and execute an op tree as a kernel
        elif isinstance(value, OpTreeNode):
            OpTreeNode.build("assign", self, value)

        # assign to numpy array (same as set())
        elif isinstance(value, np.ndarray):
            self.set(value, device=None)

        else:
            raise TypeError("Invalid type for assignment: %s" % type(value))

        return self
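
The `unpack_from` calls reinterpret the numpy scalar's raw bytes as an unsigned integer of matching width, which is the fill pattern the driver's memset entry points expect. The reinterpretation is easy to check on the host; this snippet uses `tobytes()` for portability, and 1.5 is an arbitrary example value:

    import numpy as np
    from struct import unpack_from

    # Reinterpret an fp16 scalar's bytes as the 16-bit pattern that
    # memset_d16_async would write for it.
    value = np.float16(1.5)
    bits = unpack_from('H', value.tobytes())[0]
    print(hex(bits))    # 0x3e00: sign 0, exponent 01111, mantissa 1000000000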
Example no. 8
    def _assign(self, value):

        stream = self.backend.stream
        if isinstance(value, (int, float)):

            # if we have a contiguous array, then use the speedy driver kernel
            if self.is_contiguous:

                value = self.dtype.type(value)

                if self.dtype.itemsize == 1:
                    drv.memset_d8_async(self.gpudata,
                                        unpack_from('B', value)[0],
                                        self.size, stream)
                elif self.dtype.itemsize == 2:
                    drv.memset_d16_async(self.gpudata,
                                         unpack_from('H', value)[0],
                                         self.size, stream)
                else:
                    drv.memset_d32_async(self.gpudata,
                                         unpack_from('I', value)[0],
                                         self.size, stream)

            # otherwise use our copy kernel
            else:
                OpTreeNode.build("assign", self, value)

        elif isinstance(value, GPUTensor):
            # TODO: add an is_binary_compat like function
            if self.is_contiguous and value.is_contiguous and self.dtype == value.dtype:
                drv.memcpy_dtod_async(self.gpudata, value.gpudata, self.nbytes, stream)
            else:
                OpTreeNode.build("assign", self, value)

        # collapse and execute an op tree as a kernel
        elif isinstance(value, OpTreeNode):
            OpTreeNode.build("assign", self, value)

        # assign to numpy array (same as set())
        elif isinstance(value, np.ndarray):
            self.set(value, device=None)

        else:
            raise TypeError("Invalid type for assignment: %s" % type(value))

        return self
Example no. 9
    def execute(self, repeat=1, unbind=True):

        shuffle_kernel = _get_shuffle_kernel(self.dtype.str[1:])

        for r in range(repeat):

            if self.bsum_zero:
                drv.memset_d32_async(*self.bsum_zero)

            shuffle_kernel.prepared_async_call(*self.shuffle_args)
            self.kernel.prepared_async_call(*self.launch_args,
                                            shared_size=self.shared)

        if unbind:
            self.bsum_zero = None
            self.shuffle_args[2:5] = (None,) * 3
            self.launch_args[2:9] = (None,) * 7
Example no. 10
    def execute(self, repeat=1, unbind=True):

        for r in range(repeat):

            if not self.determ:
                drv.memset_d32_async(*self.zero_args)

            for kernel_params in self.kernels:
                kernel = kernel_specs.get_kernel(kernel_params[0])
                kernel.prepared_async_call(*kernel_params[1:])

            if self.convert_args:
                _fp_convert(*self.convert_args)

        if unbind:
            self.zero_args = self.convert_args = None
            for kernel_params in self.kernels:
                kernel_params[3:8] = (None,) * 5
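
The `zero_args` memset clears a buffer that the update kernels accumulate into, and issuing it on the same stream orders it before the launches without any host synchronization; the deterministic path skips it, presumably because it does not accumulate with atomics. A sketch of that zero-then-accumulate pattern, with a toy atomicAdd kernel standing in for the real update kernels:

    import numpy as np
    import pycuda.autoinit
    import pycuda.driver as drv
    from pycuda.compiler import SourceModule

    # Toy stand-in for the update kernels: every element is atomically
    # added into a single accumulator slot.
    mod = SourceModule("""
    __global__ void accumulate(float *acc, const float *x, int n)
    {
        int i = blockIdx.x * blockDim.x + threadIdx.x;
        if (i < n) atomicAdd(acc, x[i]);
    }
    """)
    kernel = mod.get_function("accumulate")
    kernel.prepare("PPi")

    n = 4096
    x_gpu = drv.to_device(np.ones(n, dtype=np.float32))
    acc_gpu = drv.mem_alloc(4)
    stream = drv.Stream()

    # zero_args presumably packs (ptr, value, count, stream), matching
    # memset_d32_async's signature; clearing on the stream orders the
    # memset before the dependent launch.
    drv.memset_d32_async(acc_gpu, 0, 1, stream)
    kernel.prepared_async_call((n // 256, 1), (256, 1, 1), stream,
                               int(acc_gpu), int(x_gpu), n)
    stream.synchronize()
    print(drv.from_device(acc_gpu, (1,), np.float32))    # [4096.]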
Example no. 11
    def execute(self, repeat=1, unbind=True):

        for r in range(repeat):

            if not self.determ:
                drv.memset_d32_async(*self.zero_args)

            for kernel_params in self.kernels:
                kernel = kernel_specs.get_kernel(kernel_params[0])
                kernel.prepared_async_call(*kernel_params[1:])

            if self.convert_args:
                _fp_convert(*self.convert_args)

        if unbind:
            self.zero_args = self.convert_args = None
            for kernel_params in self.kernels:
                kernel_params[3:8] = (None,) * 5
Example no. 12
    def execute(self, repeat=1, unbind=True):

        shuffle_kernel = _get_shuffle_kernel(self.dtype_str)

        for r in range(repeat):

            if self.bsum_zero:
                drv.memset_d32_async(*self.bsum_zero)

            shuffle_kernel.prepared_async_call(*self.shuffle_args)

            for kernel_params in self.kernels:
                kernel = kernel_specs.get_kernel(kernel_params[0])
                kernel.prepared_async_call(*kernel_params[1:], shared_size=self.shared)

        if unbind:
            self.bsum_zero = None
            self.shuffle_args[2:5] = (None,) * 3
            for kernel_params in self.kernels:
                kernel_params[3:11] = (None,) * 8
Example no. 13
    def execute(self, repeat=1, unbind=True):

        shuffle_kernel = _get_shuffle_kernel(self.dtype_str)

        for r in range(repeat):

            if self.bsum_zero:
                drv.memset_d32_async(*self.bsum_zero)

            shuffle_kernel.prepared_async_call(*self.shuffle_args)

            for kernel_params in self.kernels:
                kernel = kernel_specs.get_kernel(kernel_params[0])
                kernel.prepared_async_call(*kernel_params[1:], shared_size=self.shared)

        if unbind:
            self.bsum_zero = None
            self.shuffle_args[2:5] = (None,) * 3
            for kernel_params in self.kernels:
                kernel_params[3:11] = (None,) * 8