Example #1
    def _assign(self, value):

        stream = self.backend.stream
        if isinstance(value, (int, float)):

            # if we have a contiguous array, then use the speedy driver kernel
            if self.is_contiguous:

                value = self.dtype.type(value)

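                # pick the driver memset width that matches the dtype's element size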
                if self.dtype.itemsize == 1:
                    drv.memset_d8_async(self.gpudata,
                                        unpack_from('B', value)[0], self.size,
                                        stream)
                elif self.dtype.itemsize == 2:
                    drv.memset_d16_async(self.gpudata,
                                         unpack_from('H', value)[0], self.size,
                                         stream)
                else:
                    drv.memset_d32_async(self.gpudata,
                                         unpack_from('I', value)[0], self.size,
                                         stream)

            # otherwise use our copy kernel
            else:
                OpTreeNode.build("assign", self, value)

        elif isinstance(value, GPUTensor):
            # TODO: add an is_binary_compat like function
            if self.is_contiguous and value.is_contiguous and self.dtype == value.dtype:
                drv.memcpy_dtod_async(self.gpudata, value.gpudata, self.nbytes,
                                      stream)
            else:
                OpTreeNode.build("assign", self, value)

        # collapse and execute an op tree as a kernel
        elif isinstance(value, OpTreeNode):
            OpTreeNode.build("assign", self, value)

        # assign to numpy array (same as set())
        elif isinstance(value, np.ndarray):
            self.set(value, device=None)

        else:
            raise TypeError("Invalid type for assignment: %s" % type(value))

        return self
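
The scalar fast path above works by bit-casting the fill value to the tensor's dtype and handing the resulting byte pattern to the matching-width driver memset (cuMemsetD8/D16/D32). A minimal sketch of that bit-cast, runnable without a GPU; the helper name memset_pattern is ours, not part of the example's codebase:

from struct import unpack
import numpy as np

def memset_pattern(value, dtype):
    # Cast the Python scalar to the target dtype, then reinterpret its raw
    # bytes as the unsigned integer a width-matched memset expects.
    value = dtype.type(value)
    fmt = {1: 'B', 2: 'H', 4: 'I'}[dtype.itemsize]
    return unpack(fmt, value.tobytes())[0]

print(hex(memset_pattern(1.0, np.dtype('float16'))))  # 0x3c00
print(hex(memset_pattern(1.0, np.dtype('float32'))))  # 0x3f800000

Non-contiguous tensors skip this path and fall back to OpTreeNode.build("assign", ...), which compiles an elementwise kernel instead.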
Example #2
    def execute(self, repeat=1, unbind=True):

        shuffle_kernel = _get_transpose_kernel(self.dtype_str)

        kernel = kernel_specs.get_kernel(self.kernel[0])
        for r in range(repeat):

            # zero the output unless beta is set, in which case
            # atomic adds accumulate on top of the existing values
            if not self.beta:
                drv.memset_d8_async(*self.zero_args)

            shuffle_kernel.prepared_async_call(*self.shuffle_args)

            kernel.prepared_async_call(*self.kernel[1:])

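        # drop the stored references to device buffers so they can be freed or rebound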
        if unbind:
            self.zero_args = None
            self.shuffle_args[2:5] = (None,) * 3
            self.kernel[3:8] = (None,) * 5
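
Both snippets lean on pycuda's prepare()/prepared_async_call() pattern: argument types are declared once, and each launch then passes only grid, block, stream, and the raw arguments, which is what makes unpacking a stored list like *self.kernel[1:] work. A minimal self-contained sketch, assuming pycuda is installed; the scale kernel and its argument layout are illustrative, not the ones used above:

import numpy as np
import pycuda.autoinit  # noqa: F401 (creates a CUDA context)
import pycuda.driver as drv
from pycuda.compiler import SourceModule

mod = SourceModule("""
__global__ void scale(float *x, float a, int n)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) x[i] *= a;
}
""")
kernel = mod.get_function("scale")
kernel.prepare("PfI")  # pointer, float32, unsigned int

x = np.arange(8, dtype=np.float32)
x_gpu = drv.mem_alloc(x.nbytes)
drv.memcpy_htod(x_gpu, x)

stream = drv.Stream()
# grid, block, stream, then the prepared arguments -- the same shape as the
# stored argument lists unpacked with * in the example above
kernel.prepared_async_call((1, 1), (8, 1, 1), stream, x_gpu, np.float32(2.0), 8)
stream.synchronize()

drv.memcpy_dtoh(x, x_gpu)
print(x)  # [ 0.  2.  4.  6.  8. 10. 12. 14.]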