def execute(self, repeat=1, unbind=True):
    """Run the prepared launch sequence ``repeat`` times.

    Every pass calls ``drv.memset_d32_async`` with ``self.zero_args``,
    launches ``self.kernel`` with ``self.launch_args`` and, when
    ``self.convert_args`` is truthy, calls ``_fp_convert`` with it.

    Args:
        repeat: Number of passes to issue.
        unbind: When true, reset ``zero_args`` and ``convert_args`` to
            ``None`` and overwrite slots 2 through 8 of ``launch_args``
            with ``None`` after the last pass.
    """
    for _ in range(repeat):
        drv.memset_d32_async(*self.zero_args)
        self.kernel.prepared_async_call(*self.launch_args)
        if self.convert_args:
            _fp_convert(*self.convert_args)

    if not unbind:
        return

    # Release the bound arguments; the object must be re-bound before reuse.
    self.zero_args = None
    self.convert_args = None
    self.launch_args[2:9] = [None] * 7
def execute(self, repeat=1, unbind=True):
    """Launch ``self.kernel`` ``repeat`` times, then optionally unbind.

    Every pass calls ``drv.memset_d32_async`` with ``self.bsum_zero`` when
    that attribute is truthy, then launches ``self.kernel`` with
    ``self.launch_args`` and ``shared_size=self.shared``.

    Diagnostics that used to be printed to stdout on every call are now
    emitted through ``logging`` at DEBUG level, so normal runs stay quiet
    and the information is still available when debugging.

    Args:
        repeat: Number of passes to issue.
        unbind: When true, reset ``bsum_zero`` to ``None`` and overwrite
            slots 2 through 8 of ``launch_args`` with ``None`` after the
            last pass.
    """
    # Function-scope import keeps this method self-contained.
    import logging

    log = logging.getLogger(__name__)
    log.debug('repeat %s', repeat)

    for _ in range(repeat):
        if self.bsum_zero:
            drv.memset_d32_async(*self.bsum_zero)
        # Lazy %-style arguments: nothing is formatted unless DEBUG is on.
        log.debug('calling kernel %s args %s shared_size %s',
                  self.kernel, self.launch_args, self.shared)
        self.kernel.prepared_async_call(*self.launch_args, shared_size=self.shared)

    if unbind:
        self.bsum_zero = None
        self.launch_args[2:9] = (None,) * 7
def execute(self, repeat=1, unbind=True):
    """Launch every kernel listed in ``self.kernels``, ``repeat`` times.

    Each entry of ``self.kernels`` is a mutable sequence whose first item
    is passed to ``kernel_specs.get_kernel`` and whose remaining items are
    the positional launch arguments. Before the kernels of a pass are
    launched, ``drv.memset_d32_async`` is called with ``self.bsum_zero``
    when that attribute is truthy.

    Args:
        repeat: Number of passes to issue.
        unbind: When true, reset ``bsum_zero`` to ``None`` and overwrite
            slots 3 through 10 of every kernel entry with ``None`` after
            the last pass.
    """
    for _ in range(repeat):
        if self.bsum_zero:
            drv.memset_d32_async(*self.bsum_zero)

        for spec in self.kernels:
            launcher = kernel_specs.get_kernel(spec[0])
            launcher.prepared_async_call(*spec[1:], shared_size=self.shared)

    if not unbind:
        return

    self.bsum_zero = None
    for spec in self.kernels:
        spec[3:11] = [None] * 8
def execute(self, repeat=1, unbind=True):
    """Run the shuffle kernel followed by ``self.kernel``, ``repeat`` times.

    The shuffle kernel is looked up once through ``_get_shuffle_kernel``
    using ``self.dtype.str`` with its first character dropped. Every pass
    calls ``drv.memset_d32_async`` with ``self.bsum_zero`` when that
    attribute is truthy, launches the shuffle kernel with
    ``self.shuffle_args`` and then launches ``self.kernel`` with
    ``self.launch_args`` and ``shared_size=self.shared``.

    Args:
        repeat: Number of passes to issue.
        unbind: When true, reset ``bsum_zero`` to ``None`` and overwrite
            slots 2 through 4 of ``shuffle_args`` and slots 2 through 8 of
            ``launch_args`` with ``None`` after the last pass.
    """
    dtype_code = self.dtype.str[1:]
    shuffle = _get_shuffle_kernel(dtype_code)

    for _ in range(repeat):
        if self.bsum_zero:
            drv.memset_d32_async(*self.bsum_zero)
        shuffle.prepared_async_call(*self.shuffle_args)
        self.kernel.prepared_async_call(*self.launch_args, shared_size=self.shared)

    if not unbind:
        return

    self.bsum_zero = None
    self.shuffle_args[2:5] = [None] * 3
    self.launch_args[2:9] = [None] * 7
def execute(self, repeat=1, unbind=True):
    """Validate the C dimension, then run shuffle and main kernels.

    ``self.shuffle_args[12]`` is read as ``C`` and must be at least 4.
    The shuffle kernel is looked up through ``_get_shuffle_kernel`` using
    ``self.dtype.str`` with its first character dropped. Every pass calls
    ``drv.memset_d32_async`` with ``self.bsum_zero`` when that attribute
    is truthy, launches the shuffle kernel with ``self.shuffle_args`` and
    then launches ``self.kernel`` with ``self.launch_args`` and
    ``shared_size=self.shared``.

    Args:
        repeat: Number of passes to issue.
        unbind: When true, reset ``bsum_zero`` to ``None`` and overwrite
            slots 2 through 4 of ``shuffle_args`` and slots 2 through 8 of
            ``launch_args`` with ``None`` after the last pass.

    Raises:
        AssertionError: If ``C`` is not at least 4.
    """
    C = self.shuffle_args[12]
    # Raised explicitly rather than with an ``assert`` statement so the
    # check is not stripped under ``python -O``. The exception type is
    # kept as AssertionError so existing callers see no difference.
    if not C >= 4:
        raise AssertionError("C dim must be 4 or greater for CUDA C backprop kernel")

    shuffle_kernel = _get_shuffle_kernel(self.dtype.str[1:])

    for _ in range(repeat):
        if self.bsum_zero:
            drv.memset_d32_async(*self.bsum_zero)
        shuffle_kernel.prepared_async_call(*self.shuffle_args)
        self.kernel.prepared_async_call(*self.launch_args, shared_size=self.shared)

    if unbind:
        self.bsum_zero = None
        self.shuffle_args[2:5] = (None,) * 3
        self.launch_args[2:9] = (None,) * 7
def _assign(self, value):
    """Assign ``value`` to this tensor and return the tensor.

    Dispatch by the type of ``value``:

    * ``int`` / ``float``: when the tensor is contiguous and its element
      size is 1, 2 or 4 bytes, the scalar is converted with
      ``self.dtype.type`` and written with the matching
      ``drv.memset_d*_async`` call on ``self.backend.stream``. Otherwise
      the assignment goes through ``OpTreeNode.build("assign", ...)``.
    * ``GPUTensor``: ``drv.memcpy_dtod_async`` when both tensors are
      contiguous and share a dtype, otherwise ``OpTreeNode.build``.
    * ``OpTreeNode``: ``OpTreeNode.build("assign", self, value)``.
    * ``np.ndarray``: ``self.set(value, device=None)``.

    Args:
        value: Scalar, ``GPUTensor``, ``OpTreeNode`` or NumPy array.

    Returns:
        ``self``.

    Raises:
        TypeError: If ``value`` is none of the supported types.
    """
    stream = self.backend.stream

    if isinstance(value, (int, float)):
        # Driver memset entry point and struct format code for each
        # element width that can be filled directly. Other widths (for
        # example 8-byte dtypes) must not use the 32-bit memset: it would
        # write only the first four bytes of the pattern and cover only
        # part of the buffer.
        memset_by_itemsize = {
            1: (drv.memset_d8_async, 'B'),
            2: (drv.memset_d16_async, 'H'),
            4: (drv.memset_d32_async, 'I'),
        }
        fast_path = memset_by_itemsize.get(self.dtype.itemsize) if self.is_contiguous else None

        if fast_path is not None:
            memset, fmt = fast_path
            # Reinterpret the typed scalar's bytes as an unsigned integer
            # pattern of the same width.
            scalar = self.dtype.type(value)
            memset(self.gpudata, unpack_from(fmt, scalar)[0], self.size, stream)
        else:
            # otherwise use our copy kernel
            OpTreeNode.build("assign", self, value)

    elif isinstance(value, GPUTensor):
        # TODO: add an is_binary_compat like function
        if self.is_contiguous and value.is_contiguous and self.dtype == value.dtype:
            drv.memcpy_dtod_async(self.gpudata, value.gpudata, self.nbytes, stream)
        else:
            OpTreeNode.build("assign", self, value)

    # collapse and execute an op tree as a kernel
    elif isinstance(value, OpTreeNode):
        OpTreeNode.build("assign", self, value)

    # assign to numpy array (same as set())
    elif isinstance(value, np.ndarray):
        self.set(value, device=None)

    else:
        raise TypeError("Invalid type for assignment: %s" % type(value))

    return self
def _assign(self, value):
    """Assign ``value`` to this tensor and return the tensor.

    Supported values and the path each one takes:

    * ``int`` / ``float``: a contiguous tensor with 1-, 2- or 4-byte
      elements is filled with ``drv.memset_d8_async``,
      ``drv.memset_d16_async`` or ``drv.memset_d32_async`` on
      ``self.backend.stream``. Any other case goes through
      ``OpTreeNode.build("assign", ...)``.
    * ``GPUTensor``: ``drv.memcpy_dtod_async`` when both tensors are
      contiguous and share a dtype, otherwise ``OpTreeNode.build``.
    * ``OpTreeNode``: ``OpTreeNode.build("assign", self, value)``.
    * ``np.ndarray``: ``self.set(value, device=None)``.

    Args:
        value: Scalar, ``GPUTensor``, ``OpTreeNode`` or NumPy array.

    Returns:
        ``self``.

    Raises:
        TypeError: If ``value`` is none of the supported types.
    """
    stream = self.backend.stream

    if isinstance(value, (int, float)):
        # if we have a contiguous array, then use the speedy driver kernel.
        # Only 1-, 2- and 4-byte elements qualify: sending a wider dtype
        # through the 32-bit memset would write a truncated pattern over
        # only part of the buffer.
        if self.is_contiguous and self.dtype.itemsize in (1, 2, 4):
            typed = self.dtype.type(value)
            width = self.dtype.itemsize
            if width == 1:
                drv.memset_d8_async(
                    self.gpudata, unpack_from('B', typed)[0], self.size, stream)
            elif width == 2:
                drv.memset_d16_async(
                    self.gpudata, unpack_from('H', typed)[0], self.size, stream)
            else:
                drv.memset_d32_async(
                    self.gpudata, unpack_from('I', typed)[0], self.size, stream)
        # otherwise use our copy kernel
        else:
            OpTreeNode.build("assign", self, value)

    elif isinstance(value, GPUTensor):
        # TODO: add an is_binary_compat like function
        same_layout = (self.is_contiguous and value.is_contiguous
                       and self.dtype == value.dtype)
        if same_layout:
            drv.memcpy_dtod_async(self.gpudata, value.gpudata, self.nbytes, stream)
        else:
            OpTreeNode.build("assign", self, value)

    # collapse and execute an op tree as a kernel
    elif isinstance(value, OpTreeNode):
        OpTreeNode.build("assign", self, value)

    # assign to numpy array (same as set())
    elif isinstance(value, np.ndarray):
        self.set(value, device=None)

    else:
        raise TypeError("Invalid type for assignment: %s" % type(value))

    return self
def execute(self, repeat=1, unbind=True):
    """Issue the shuffle kernel and ``self.kernel`` for ``repeat`` passes.

    ``_get_shuffle_kernel`` is called once with ``self.dtype.str`` minus
    its first character. In every pass, a truthy ``self.bsum_zero`` is
    forwarded to ``drv.memset_d32_async``; the shuffle kernel is then
    launched with ``self.shuffle_args`` and ``self.kernel`` with
    ``self.launch_args`` plus ``shared_size=self.shared``.

    Args:
        repeat: Number of passes to issue.
        unbind: When true, reset ``bsum_zero`` to ``None`` and overwrite
            slots 2 through 4 of ``shuffle_args`` and slots 2 through 8 of
            ``launch_args`` with ``None`` after the last pass.
    """
    shuffle = _get_shuffle_kernel(self.dtype.str[1:])

    for _ in range(repeat):
        bsum_zero = self.bsum_zero
        if bsum_zero:
            drv.memset_d32_async(*bsum_zero)
        shuffle.prepared_async_call(*self.shuffle_args)
        self.kernel.prepared_async_call(
            *self.launch_args,
            shared_size=self.shared
        )

    if unbind:
        self.bsum_zero = None
        # Blank the bound slots of each argument list in turn.
        self.shuffle_args[2:5] = [None, None, None]
        self.launch_args[2:9] = [None for _ in range(7)]
def execute(self, repeat=1, unbind=True):
    """Launch every kernel in ``self.kernels`` for ``repeat`` passes.

    Every pass calls ``drv.memset_d32_async`` with ``self.zero_args``
    unless ``self.determ`` is truthy, launches each entry of
    ``self.kernels`` (first item looked up through
    ``kernel_specs.get_kernel``, remaining items used as positional launch
    arguments) and, when ``self.convert_args`` is truthy, calls
    ``_fp_convert`` with it.

    Args:
        repeat: Number of passes to issue.
        unbind: When true, reset ``zero_args`` and ``convert_args`` to
            ``None`` and overwrite slots 3 through 7 of every kernel entry
            with ``None`` after the last pass.
    """
    for _ in range(repeat):
        if not self.determ:
            drv.memset_d32_async(*self.zero_args)

        for spec in self.kernels:
            launcher = kernel_specs.get_kernel(spec[0])
            launcher.prepared_async_call(*spec[1:])

        if self.convert_args:
            _fp_convert(*self.convert_args)

    if not unbind:
        return

    self.zero_args = None
    self.convert_args = None
    for spec in self.kernels:
        spec[3:8] = [None] * 5
def execute(self, repeat=1, unbind=True):
    """Run the kernel list ``repeat`` times, then optionally unbind.

    A pass consists of: ``drv.memset_d32_async(*self.zero_args)`` when
    ``self.determ`` is falsy; one launch per entry of ``self.kernels``,
    where item 0 is resolved with ``kernel_specs.get_kernel`` and items
    1 onward are the positional launch arguments; and
    ``_fp_convert(*self.convert_args)`` when ``self.convert_args`` is
    truthy.

    Args:
        repeat: Number of passes to issue.
        unbind: When true, reset ``zero_args`` and ``convert_args`` to
            ``None`` and overwrite slots 3 through 7 of every kernel entry
            with ``None`` after the last pass.
    """
    for _ in range(repeat):
        needs_zeroing = not self.determ
        if needs_zeroing:
            drv.memset_d32_async(*self.zero_args)

        for params in self.kernels:
            kernel_specs.get_kernel(params[0]).prepared_async_call(*params[1:])

        convert_args = self.convert_args
        if convert_args:
            _fp_convert(*convert_args)

    if unbind:
        self.zero_args = self.convert_args = None
        cleared = (None, None, None, None, None)
        for params in self.kernels:
            params[3:8] = cleared
def execute(self, repeat=1, unbind=True):
    """Run the shuffle kernel and every listed kernel, ``repeat`` times.

    The shuffle kernel is obtained once from ``_get_shuffle_kernel`` using
    ``self.dtype_str``. Every pass calls ``drv.memset_d32_async`` with
    ``self.bsum_zero`` when that attribute is truthy, launches the shuffle
    kernel with ``self.shuffle_args`` and then launches each entry of
    ``self.kernels``: item 0 is resolved with ``kernel_specs.get_kernel``
    and the remaining items are positional launch arguments, passed
    together with ``shared_size=self.shared``.

    Args:
        repeat: Number of passes to issue.
        unbind: When true, reset ``bsum_zero`` to ``None``, overwrite
            slots 2 through 4 of ``shuffle_args`` with ``None`` and
            overwrite slots 3 through 10 of every kernel entry with
            ``None`` after the last pass.
    """
    shuffle = _get_shuffle_kernel(self.dtype_str)

    for _ in range(repeat):
        if self.bsum_zero:
            drv.memset_d32_async(*self.bsum_zero)

        shuffle.prepared_async_call(*self.shuffle_args)

        for spec in self.kernels:
            launcher = kernel_specs.get_kernel(spec[0])
            launcher.prepared_async_call(*spec[1:], shared_size=self.shared)

    if not unbind:
        return

    self.bsum_zero = None
    self.shuffle_args[2:5] = [None] * 3
    for spec in self.kernels:
        spec[3:11] = [None] * 8