def index_block_batches_T(indexed, tensor, batch, patchsize, nblocks, gpuid):
    # -- prepare data --
    print("indexed.shape ", indexed.shape)
    print("tensor.shape ", tensor.shape)
    print("batch.shape ", batch.shape)
    batchsize = batch.shape[2]
    indexed = indexed[:, :, :batchsize]
    numba.cuda.select_device(gpuid)
    indexed_nba = cuda.as_cuda_array(indexed)
    batch_nba = cuda.as_cuda_array(batch)
    tensor_nba = cuda.as_cuda_array(tensor)

    # -- prepare cuda --
    npix = tensor.shape[1]
    threads_per_block = 64
    blocks = npix // threads_per_block + 1

    # -- run cuda --
    index_block_batches_cuda_T[blocks, threads_per_block](
        indexed_nba, tensor_nba, batch_nba, patchsize, nblocks)
    return indexed
def index_along_frames(patches, dframes):
    # -- set cuda device --
    device = patches.device
    gpuid = device.index
    numba.cuda.select_device(gpuid)

    # -- get shapes --
    naligns, npatches, nframes, nftrs = patches.shape
    naligns, npatches = dframes.shape
    fpatches = torch.zeros((naligns, npatches, nftrs))
    fpatches = fpatches.to(device)
    # print("naligns,npatches,nframes,nftrs",naligns,npatches,nframes,nftrs)

    # -- create numba cuda --
    dframes_nba = cuda.as_cuda_array(dframes)
    patches_nba = cuda.as_cuda_array(patches)
    fpatches_nba = cuda.as_cuda_array(fpatches)

    # -- exec indexing cuda-kernel --
    threads_per_block = (32, 32)
    blocks_aligns = naligns // threads_per_block[0] + (
        naligns % threads_per_block[0] != 0)
    blocks_patches = npatches // threads_per_block[1] + (
        npatches % threads_per_block[1] != 0)
    blocks = (blocks_aligns, blocks_patches)
    index_along_frames_cuda[blocks, threads_per_block](fpatches_nba,
                                                       patches_nba,
                                                       dframes_nba)
    return fpatches
def get_input(type, nrows, ncols, dtype, order='C', out_dtype=False):
    rand_mat = (cp.random.rand(nrows, ncols) * 10)
    rand_mat = cp.array(rand_mat, order=order).astype(dtype)

    if type == 'numpy':
        result = np.array(cp.asnumpy(rand_mat), order=order)

    if type == 'cupy':
        result = rand_mat

    if type == 'numba':
        result = nbcuda.as_cuda_array(rand_mat)

    if type == 'cudf':
        result = cudf.DataFrame()
        result = result.from_gpu_matrix(nbcuda.as_cuda_array(rand_mat))

    if type == 'pandas':
        result = cudf.DataFrame()
        result = result.from_gpu_matrix(nbcuda.as_cuda_array(rand_mat))
        result = result.to_pandas()

    if type == 'cuml':
        result = CumlArray(data=rand_mat, dtype=dtype, shape=rand_mat.shape,
                           order=order if order != 'K' else None)

    if out_dtype:
        return result, np.array(cp.asnumpy(rand_mat).astype(out_dtype),
                                order=order)
    else:
        return result, np.array(cp.asnumpy(rand_mat), order=order)
def test_array_views(self):
    """Views created via the array interface support:
        - Direct views and strided slices
        - Reads and writes via __getitem__ / __setitem__
    """
    h_arr = np.random.random(10)
    c_arr = cuda.to_device(h_arr)

    arr = cuda.as_cuda_array(c_arr)

    # __getitem__ interface accesses expected data

    # Direct views
    np.testing.assert_array_equal(arr.copy_to_host(), h_arr)
    np.testing.assert_array_equal(arr[:].copy_to_host(), h_arr)

    # Slicing
    np.testing.assert_array_equal(arr[:5].copy_to_host(), h_arr[:5])

    # Strided view
    np.testing.assert_array_equal(arr[::2].copy_to_host(), h_arr[::2])

    # View of strided array
    arr_strided = cuda.as_cuda_array(c_arr[::2])
    np.testing.assert_array_equal(arr_strided.copy_to_host(), h_arr[::2])

    # A strided-view-of-array and view-of-strided-array have the same
    # shape, strides, itemsize, and alloc_size
    self.assertEqual(arr[::2].shape, arr_strided.shape)
    self.assertEqual(arr[::2].strides, arr_strided.strides)
    self.assertEqual(arr[::2].dtype.itemsize, arr_strided.dtype.itemsize)
    self.assertEqual(arr[::2].alloc_size, arr_strided.alloc_size)
    self.assertEqual(arr[::2].nbytes,
                     arr_strided.size * arr_strided.dtype.itemsize)

    # __setitem__ interface propagates into external array

    # Writes to a slice
    arr[:5] = np.pi
    np.testing.assert_array_equal(
        c_arr.copy_to_host(),
        np.concatenate((np.full(5, np.pi), h_arr[5:])))

    # Writes to a slice from a view
    arr[:5] = arr[5:]
    np.testing.assert_array_equal(c_arr.copy_to_host(),
                                  np.concatenate((h_arr[5:], h_arr[5:])))

    # Writes through a view
    arr[:] = cuda.to_device(h_arr)
    np.testing.assert_array_equal(c_arr.copy_to_host(), h_arr)

    # Writes to a strided slice
    arr[::2] = np.pi
    np.testing.assert_array_equal(
        c_arr.copy_to_host()[::2],
        np.full(5, np.pi),
    )
    np.testing.assert_array_equal(c_arr.copy_to_host()[1::2], h_arr[1::2])
def test_consume_no_sync(self):
    # Create a foreign array with no stream
    f_arr = ForeignArray(cuda.device_array(10))

    with patch.object(cuda.cudadrv.driver.Stream, 'synchronize',
                      return_value=None) as mock_sync:
        cuda.as_cuda_array(f_arr)

    # Ensure the synchronize method of a stream was not called
    mock_sync.assert_not_called()
def test_consume_sync(self):
    # Create a foreign array with a stream
    s = cuda.stream()
    f_arr = ForeignArray(cuda.device_array(10, stream=s))

    with patch.object(cuda.cudadrv.driver.Stream, 'synchronize',
                      return_value=None) as mock_sync:
        cuda.as_cuda_array(f_arr)

    # Ensure the synchronize method of a stream was called
    mock_sync.assert_called_once_with()
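# The tests above and below refer to a ForeignArray helper that is not shown
# in this collection. A minimal sketch of such a wrapper (an assumption, not
# necessarily the exact definition these tests use) is an object that exposes
# only the CUDA Array Interface of the array it wraps, so Numba treats it as
# a foreign device array; MyArray in the later tests plays the same role.
class ForeignArray:
    def __init__(self, arr):
        self._arr = arr
        # Re-export the wrapped array's interface so cuda.as_cuda_array()
        # and cuda.is_cuda_array() recognise this object.
        self.__cuda_array_interface__ = arr.__cuda_array_interface__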
def create_input(input_type, dtype, shape, order):
    rand_ary = cp.ones(shape, dtype=dtype, order=order)

    if input_type == 'numpy':
        return np.array(cp.asnumpy(rand_ary), dtype=dtype, order=order)
    elif input_type == 'numba':
        return cuda.as_cuda_array(rand_ary)
    elif input_type == 'series':
        return cudf.Series(cuda.as_cuda_array(rand_ary))
    else:
        return rand_ary
def get_small_dataset(output_type):
    ary = [[1.0, 4.0, 4.0], [2.0, 2.0, 2.0], [5.0, 1.0, 1.0]]
    ary = cp.asarray(ary)

    if output_type == 'numba':
        return as_cuda_array(ary)
    elif output_type == 'cupy':
        return ary
    elif output_type == 'numpy':
        return cp.asnumpy(ary)
    else:
        return cudf.DataFrame.from_gpu_matrix(as_cuda_array(ary))
def __init__(self, minibatch: int, maxT: int, maxU: int, alphabet_size: int,
             workspace, blank: int, num_threads: int, stream):
    """
    Helper class to launch the CUDA Kernels to compute the Transducer Loss.

    Args:
        minibatch: Int representing the batch size.
        maxT: The maximum possible acoustic sequence length. Represents T in
            the logprobs tensor.
        maxU: The maximum possible target sequence length. Represents U in
            the logprobs tensor.
        alphabet_size: The vocabulary dimension V+1 (inclusive of RNNT blank).
        workspace: An allocated chunk of memory that will be sliced off and
            reshaped into required blocks used as working memory.
        blank: Index of the RNNT blank token in the vocabulary. Generally the
            first or last token in the vocab.
        num_threads: Number of OMP threads to launch.
        stream: Numba Cuda Stream.
    """
    self.minibatch_ = minibatch
    self.maxT_ = maxT
    self.maxU_ = maxU
    self.alphabet_size_ = alphabet_size
    # a flat vector of floatX numbers that represents allocated memory slices
    self.gpu_workspace = cuda.as_cuda_array(workspace)
    self.blank_ = blank
    self.num_threads_ = num_threads
    self.stream_ = stream  # type: cuda.cudadrv.driver.Stream

    if num_threads > 0:
        numba.set_num_threads(min(multiprocessing.cpu_count(), num_threads))
    else:
        self.num_threads_ = numba.get_num_threads()
def apply_binarize(in_col, width):
    buf = rmm.DeviceBuffer(size=(in_col.size * width))
    out = cuda.as_cuda_array(buf).view("int8").reshape((in_col.size, width))
    if out.size > 0:
        out[:] = 0
        binarize.forall(out.size)(in_col, out, width)
    return out
def __call__(self, tensor, mode=0):
    r"""
    Converts float weights to quantized weights.

    Args:
        - tensor: input data
        - mode: GFPQ mode for param
            GFPQ_MODE_INIT(0): There is no valid parameter in param[].
                Generate the parameter and fill it into param[].
            GFPQ_MODE_UPDATE(1): There is a parameter in param[]. Generate a
                new parameter and update param[] when the new parameter is
                better.
            GFPQ_MODE_APPLY_ONLY(2): There is a parameter in param[]. Don't
                generate a parameter; just use param[].
    """
    data_cuda_array = cuda.as_cuda_array(tensor.data.detach())
    data_p = data_cuda_array.device_ctypes_pointer
    self._param.mode = mode
    ret = self._libquant.HI_GFPQ_QuantAndDeQuant_GPU_PY(
        data_p, data_cuda_array.size, self._bit_width,
        ctypes.byref(self._param), self._stream.handle, self._cublas_handle)
    assert ret == 0, "HI_GFPQ_QuantAndDeQuant failed(%d)\n" % (ret)
    return tensor
def convert_dtype(X, to_dtype=np.float32):
    """
    Convert X to be of dtype `to_dtype`.

    Supported float dtypes for overflow checking.
    Todo: support other dtypes if needed.
    """
    if isinstance(X, np.ndarray):
        dtype = X.dtype
        if dtype != to_dtype:
            X_m = X.astype(to_dtype)
            if len(X[X == np.inf]) > 0:
                raise TypeError("Data type conversion resulted "
                                "in data loss.")
            return X_m
    elif isinstance(X, cudf.Series) or isinstance(X, cudf.DataFrame):
        return X.astype(to_dtype)
    elif cuda.is_cuda_array(X):
        X_m = cp.asarray(X)
        X_m = X_m.astype(to_dtype)
        return cuda.as_cuda_array(X_m)
    else:
        raise TypeError("Received unsupported input type: %s" % type(X))

    return X
def check_ipc_handle_serialization(self, index_arg=None, foreign=False):
    # prepare data for IPC
    arr = np.arange(10, dtype=np.intp)
    devarr = cuda.to_device(arr)
    if index_arg is not None:
        devarr = devarr[index_arg]
    if foreign:
        devarr = cuda.as_cuda_array(ForeignArray(devarr))
    expect = devarr.copy_to_host()

    # create IPC handle
    ctx = cuda.current_context()
    ipch = ctx.get_ipc_handle(devarr.gpu_data)

    # pickle
    buf = pickle.dumps(ipch)
    ipch_recon = pickle.loads(buf)
    self.assertIs(ipch_recon.base, None)
    self.assertEqual(tuple(ipch_recon.handle), tuple(ipch.handle))
    self.assertEqual(ipch_recon.size, ipch.size)

    # spawn new process for testing
    ctx = mp.get_context('spawn')
    result_queue = ctx.Queue()
    args = (ipch, result_queue)
    proc = ctx.Process(target=serialize_ipc_handle_test, args=args)
    proc.start()
    succ, out = result_queue.get()
    if not succ:
        self.fail(out)
    else:
        np.testing.assert_equal(expect, out)
    proc.join(3)
def convert_dtype(X, to_dtype=np.float32, legacy=True):
    """
    Convert X to be of dtype `to_dtype`, raising a TypeError if the
    conversion would lose information.
    """
    would_lose_info = _typecast_will_lose_information(X, to_dtype)
    if would_lose_info:
        raise TypeError("Data type conversion would lose information.")

    if isinstance(X, np.ndarray):
        dtype = X.dtype
        if dtype != to_dtype:
            X_m = X.astype(to_dtype)
            return X_m
    elif isinstance(X, (cudf.Series, cudf.DataFrame, pd.Series, pd.DataFrame)):
        return X.astype(to_dtype, copy=False)
    elif cuda.is_cuda_array(X):
        X_m = cp.asarray(X)
        X_m = X_m.astype(to_dtype, copy=False)
        if legacy:
            return cuda.as_cuda_array(X_m)
        else:
            return CumlArray(data=X_m)
    else:
        raise TypeError("Received unsupported input type: %s" % type(X))

    return X
def predict(self, features_gen=None, as_cuda_array=False, flatten=True):
    if features_gen is None:
        features_gen = self._build_features()
    predicted_cva = torch.empty(
        (self.diffusion_engine.num_defs_per_path
         * self.diffusion_engine.num_paths, 1),
        dtype=torch.float32, device=self.device)
    with cuda.devices.gpus[self.device.index]:
        d_predicted_cva = cuda.as_cuda_array(
            predicted_cva.view(self.diffusion_engine.num_defs_per_path,
                               self.diffusion_engine.num_paths))
    if as_cuda_array:
        out = d_predicted_cva
    else:
        out = cuda.pinned_array((self.diffusion_engine.num_defs_per_path,
                                 self.diffusion_engine.num_paths),
                                dtype=np.float32)
    if flatten:
        out = out.reshape(-1)
    while True:
        # receive the time step from the caller, then yield the prediction
        t = yield
        self._predict(t, features_gen, predicted_cva)
        if not as_cuda_array:
            d_predicted_cva.copy_to_host(out)
        yield out
def get_input(type, nrows, ncols, dtype, order='C', out_dtype=False):
    rand_mat = (cp.random.rand(nrows, ncols) * 10)
    rand_mat = cp.array(rand_mat, dtype=dtype, order=order)

    if type == 'numpy':
        result = np.array(cp.asnumpy(rand_mat), order=order)

    if type == 'cupy':
        result = rand_mat

    if type == 'numba':
        result = nbcuda.as_cuda_array(rand_mat)

    if type == 'cudf':
        result = cudf.DataFrame(rand_mat)

    if type == 'pandas':
        result = pdDF(cp.asnumpy(rand_mat))

    if type == 'cuml':
        result = CumlArray(data=rand_mat)

    if out_dtype:
        return result, np.array(cp.asnumpy(rand_mat).astype(out_dtype),
                                order=order)
    else:
        return result, np.array(cp.asnumpy(rand_mat), order=order)
def test_consume_sync_disabled(self):
    # Create a foreign array with a stream
    s = cuda.stream()
    f_arr = ForeignArray(cuda.device_array(10, stream=s))

    # Set sync to false before testing. The test suite should generally be
    # run with sync enabled, but stash the old value just in case it is
    # not.
    with override_config('CUDA_ARRAY_INTERFACE_SYNC', False):
        with patch.object(cuda.cudadrv.driver.Stream, 'synchronize',
                          return_value=None) as mock_sync:
            cuda.as_cuda_array(f_arr)

        # Ensure the synchronize method of a stream was not called
        mock_sync.assert_not_called()
def test_consume_no_stream(self):
    # Create a foreign array with no stream
    f_arr = ForeignArray(cuda.device_array(10))

    # Ensure that the imported array has no default stream
    c_arr = cuda.as_cuda_array(f_arr)
    self.assertEqual(c_arr.stream, 0)
def deserialize(cls, header, frames):
    # Deserialize the mask, value, and offset frames
    arrays = []
    for each_frame in frames:
        if hasattr(each_frame, "__cuda_array_interface__"):
            each_frame = cuda.as_cuda_array(each_frame)
        elif isinstance(each_frame, memoryview):
            each_frame = np.asarray(each_frame)
            each_frame = cudautils.to_device(each_frame)
        arrays.append(libcudf.cudf.get_ctype_ptr(each_frame))

    # Use from_offsets to get nvstring data.
    # Note: array items = [nbuf, sbuf, obuf]
    scount = header["nvstrings"]
    data = nvstrings.from_offsets(
        arrays[1],
        arrays[2],
        scount,
        nbuf=arrays[0],
        ncount=header["null_count"],
        bdevmem=True,
    )
    return column.as_column(data)
def auto_device(self, obj, stream=0, copy=True):
    """
    Create a DeviceRecord or DeviceArray like obj and optionally copy data
    from host to device. If obj already represents device memory, it is
    returned and no copy is made. Uses RMM for device memory allocation if
    necessary.
    """
    if cuda.driver.is_device_memory(obj):
        return obj, False
    if hasattr(obj, '__cuda_array_interface__'):
        new_dev_array = cuda.as_cuda_array(obj)
        # Allocate new output array using rmm and copy the numba device
        # array to an rmm owned device array
        out_dev_array = self.device_array_like(new_dev_array)
        out_dev_array.copy_to_device(new_dev_array)
        return out_dev_array, False
    else:
        if isinstance(obj, np.void):
            # raise NotImplementedError("DeviceRecord type not supported "
            #                           "by RMM")
            devobj = cuda.devicearray.from_record_like(obj, stream=stream)
        else:
            if not isinstance(obj, np.ndarray):
                obj = np.asarray(obj)
            cuda.devicearray.sentry_contiguous(obj)
            devobj = self.device_array_like(obj, stream=stream)

        if copy:
            devobj.copy_to_device(obj, stream=stream)
        return devobj, True
def device_to_host(obj: object) -> DeviceSerialized:
    header, frames = serialize(obj, serializers=["cuda", "pickle"])
    is_cuda = [hasattr(f, "__cuda_array_interface__") for f in frames]
    frames = [
        cuda.as_cuda_array(f).copy_to_host() if ic else f
        for ic, f in zip(is_cuda, frames)
    ]
    return DeviceSerialized(header, frames, is_cuda)
def uniform_pix_sampling(aligned, S=2):
    nframes, nimages, ncolor, h, w = aligned.shape
    device = aligned.device
    gpuid = device.index
    numba.cuda.select_device(gpuid)

    sims = torch.zeros((S, nimages, ncolor, h, w)).to(device)
    # rands = np.random.choice(nframes,(h,w))
    aligned_nba = cuda.as_cuda_array(aligned)
    sims_nba = cuda.as_cuda_array(sims)
    index_bursts_by_frames(sims_nba, aligned_nba)

    masks = torch.zeros((S, nimages, ncolor, h, w)).to(device)
    masks_nba = cuda.as_cuda_array(masks)
    fill_masks(masks_nba, aligned_nba)

    return sims, masks
def get_input(type, nrows, ncols, dtype, order='C', out_dtype=False):
    if has_cupy:
        import cupy as cp
        rand_mat = (cp.random.rand(nrows, ncols) * 10)
        rand_mat = cp.array(rand_mat, order=order).astype(dtype)

        if type == 'numpy':
            result = np.array(cp.asnumpy(rand_mat), order=order)

        if type == 'cupy':
            result = rand_mat

        if type == 'numba':
            result = cuda.as_cuda_array(rand_mat)

        if type == 'dataframe':
            X_df = cudf.DataFrame()
            result = X_df.from_gpu_matrix(cuda.as_cuda_array(rand_mat))

        if out_dtype:
            return result, np.array(cp.asnumpy(rand_mat).astype(out_dtype),
                                    order=order)
        else:
            return result, np.array(cp.asnumpy(rand_mat), order=order)
    else:
        rand_mat = (np.random.rand(nrows, ncols) * 10)
        rand_mat = np.array(rand_mat, order=order).astype(dtype)

        if type == 'numpy':
            result = deepcopy(rand_mat)

        if type == 'cupy':
            result = None

        if type == 'numba':
            result = cuda.to_device(rand_mat)

        if type == 'dataframe':
            X_df = cudf.DataFrame()
            result = X_df.from_gpu_matrix(cuda.to_device(rand_mat))

        if out_dtype:
            return result, rand_mat.astype(out_dtype)
        else:
            return result, rand_mat
def as_contiguous(arr):
    assert arr.ndim == 1
    cupy_dtype = arr.dtype
    if np.issubdtype(cupy_dtype, np.datetime64):
        cupy_dtype = np.dtype("int64")
        arr = arr.view("int64")
    out = cupy.ascontiguousarray(cupy.asarray(arr))
    return cuda.as_cuda_array(out).view(arr.dtype)
def convert_dtype(X, to_dtype=np.float32):
    """
    Convert X to be of dtype `to_dtype`.

    Supported float dtypes for overflow checking.
    Todo: support other dtypes if needed.
    """
    # Using cuDF for converting numba and device array interface inputs
    # if CuPy not installed, temporary while CuPy conda package
    # causes nccl conflicts
    if isinstance(X, np.ndarray):
        dtype = X.dtype
        if dtype != to_dtype:
            X_m = X.astype(to_dtype)
            if len(X[X == np.inf]) > 0:
                raise TypeError("Data type conversion resulted "
                                "in data loss.")
            return X_m
    elif isinstance(X, cudf.Series):
        return X.astype(to_dtype)
    elif cuda.is_cuda_array(X):
        if has_cupy():
            import cupy as cp
            X_m = cp.asarray(X)
            X_m = X_m.astype(to_dtype)
            return cuda.as_cuda_array(X_m)
        else:
            warnings.warn("Using cuDF for dtype conversion, install "
                          "CuPy for faster data conversion.")
            if (len(X.shape) == 1):
                return cudf.Series(X).astype(to_dtype).to_gpu_array()
            else:
                X_df = cudf.DataFrame()
                X = X_df.from_gpu_matrix(X)
                X = convert_dtype(X, to_dtype=to_dtype)
                return X.as_gpu_matrix()
    elif isinstance(X, cudf.DataFrame):
        dtype = np.dtype(X[X.columns[0]]._column.dtype)
        if dtype != to_dtype:
            new_cols = [(col, X._cols[col].astype(to_dtype))
                        for col in X._cols]
            overflowed = sum([len(colval[colval >= np.inf])
                              for colname, colval in new_cols])
            if overflowed > 0:
                raise TypeError("Data type conversion resulted "
                                "in data loss.")
            return cudf.DataFrame(new_cols)
    else:
        raise TypeError("Received unsupported input type: %s" % type(X))

    return X
def test_consume_stream(self):
    # Create a foreign array with a stream
    s = cuda.stream()
    f_arr = ForeignArray(cuda.device_array(10, stream=s))

    # Ensure that an imported array has the stream as its default stream
    c_arr = cuda.as_cuda_array(f_arr)
    self.assertTrue(c_arr.stream.external)
    self.assertEqual(c_arr.stream.handle.value, s.handle.value)
def full(size, value, dtype):
    cupy_dtype = dtype
    if np.issubdtype(cupy_dtype, np.datetime64):
        time_unit, _ = np.datetime_data(cupy_dtype)
        cupy_dtype = np.int64
        value = np.datetime64(value, time_unit).view(cupy_dtype)

    out = cupy.full(size, value, cupy_dtype)
    return cuda.as_cuda_array(out).view(dtype)
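# A brief usage sketch of the datetime64 branch in full() above. The call is
# hypothetical and only illustrates the behaviour: CuPy has no datetime
# dtype, so the fill value is converted to int64 ticks, filled on the
# device, and the resulting device array is viewed back as datetime64.
import numpy as np

col = full(3, np.datetime64("2001-01-01"), np.dtype("datetime64[s]"))
host = col.copy_to_host()
# host is a length-3 datetime64[s] array whose elements all equal 2001-01-01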
def as_device_array(self, obj):
    # We don't want to call as_cuda_array on objects that are already Numba
    # device arrays, because this results in exporting the array as a
    # Producer then importing it as a Consumer, which causes a
    # synchronization on the array's stream (if it has one) by default.
    # When we have a Numba device array, we can simply return it.
    if cuda.cudadrv.devicearray.is_cuda_ndarray(obj):
        return obj
    return cuda.as_cuda_array(obj)
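# Illustration of the distinction drawn in the comment above: a Numba device
# array is detected with is_cuda_ndarray() and passed through unchanged,
# while a foreign object must go through cuda.as_cuda_array(), which may
# synchronize on the exporting stream. Minimal sketch, assuming the
# ForeignArray wrapper shown earlier in this collection.
import numpy as np
from numba import cuda
from numba.cuda.cudadrv import devicearray

d_arr = cuda.to_device(np.arange(4))
assert devicearray.is_cuda_ndarray(d_arr)                    # pass through
assert not devicearray.is_cuda_ndarray(ForeignArray(d_arr))  # needs conversion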
def _build_labels_backward(self, as_cuda_tensor):
    d_spread_integral_now = self.diffusion_engine.d_spread_integrals[0, 1:]
    d_spread_integral_next = self.diffusion_engine.d_spread_integrals[1, 1:]
    d_mtm_next = self.diffusion_engine.d_mtm_by_cpty[0]
    d_rate_integral_now = self.diffusion_engine.d_dom_rate_integral[0]
    d_rate_integral_next = self.diffusion_engine.d_dom_rate_integral[1]
    d_def = self.diffusion_engine.d_def_indicators[0]
    d_labels_by_cpty = self.diffusion_engine.d_mtm_by_cpty[1]

    t_out = torch.empty((self.diffusion_engine.num_defs_per_path,
                         self.diffusion_engine.num_paths),
                        dtype=torch.float32, device=self.device)
    with cuda.devices.gpus[self.device.index]:
        d_out = cuda.as_cuda_array(t_out)

    if as_cuda_tensor:
        out = t_out
    else:
        out = cuda.pinned_array((self.diffusion_engine.num_defs_per_path,
                                 self.diffusion_engine.num_paths),
                                dtype=np.float32)
    out[:] = 0

    if as_cuda_tensor:
        yield out.view(-1, 1)
    else:
        yield out.reshape(-1, 1)

    d_spread_integral_next.copy_to_device(
        self.diffusion_engine.spread_integrals[
            self.diffusion_engine.num_coarse_steps, 1:])
    d_rate_integral_next.copy_to_device(
        self.diffusion_engine.dom_rate_integral[
            self.diffusion_engine.num_coarse_steps])

    accumulate = False
    for t in range(self.diffusion_engine.num_coarse_steps - 1, -1, -1):
        d_spread_integral_now.copy_to_device(
            self.diffusion_engine.spread_integrals[t, 1:])
        d_rate_integral_now.copy_to_device(
            self.diffusion_engine.dom_rate_integral[t])
        d_mtm_next.copy_to_device(self.diffusion_engine.mtm_by_cpty[t + 1])
        d_def.copy_to_device(self.diffusion_engine.def_indicators[t])
        self.__cuda_build_labels_backward(d_spread_integral_now,
                                          d_spread_integral_next,
                                          d_rate_integral_now,
                                          d_rate_integral_next,
                                          d_mtm_next, d_labels_by_cpty,
                                          t > 0, accumulate)
        self.__cuda_aggregate_survival(d_labels_by_cpty, d_def, d_out)
        if as_cuda_tensor:
            yield out.view(-1, 1)
        else:
            d_out.copy_to_host(out)
            yield out.reshape(-1, 1)
        if not accumulate:
            accumulate = True
def create_input(input_type, dtype, shape, order):
    float_dtypes = [np.float16, np.float32, np.float64]
    if dtype in float_dtypes:
        rand_ary = np.random.random(shape)
    else:
        rand_ary = cp.random.randint(100, size=shape)

    rand_ary = cp.array(rand_ary, dtype=dtype, order=order)

    if input_type == 'numpy':
        return np.array(cp.asnumpy(rand_ary), dtype=dtype, order=order)
    elif input_type == 'numba':
        return cuda.as_cuda_array(rand_ary)
    elif input_type == 'series':
        return cudf.Series(cuda.as_cuda_array(rand_ary))
    else:
        return rand_ary
def test_as_cuda_array(self):
    h_arr = np.arange(10)
    self.assertFalse(cuda.is_cuda_array(h_arr))
    d_arr = cuda.to_device(h_arr)
    self.assertTrue(cuda.is_cuda_array(d_arr))
    my_arr = MyArray(d_arr)
    self.assertTrue(cuda.is_cuda_array(my_arr))
    wrapped = cuda.as_cuda_array(my_arr)
    self.assertTrue(cuda.is_cuda_array(wrapped))

    # Their values must equal the original array
    np.testing.assert_array_equal(wrapped.copy_to_host(), h_arr)
    np.testing.assert_array_equal(d_arr.copy_to_host(), h_arr)

    # d_arr and wrapped must be the same buffer
    self.assertEqual(wrapped.device_ctypes_pointer.value,
                     d_arr.device_ctypes_pointer.value)
def test_kernel_arg(self):
    h_arr = np.arange(10)
    d_arr = cuda.to_device(h_arr)
    my_arr = MyArray(d_arr)
    wrapped = cuda.as_cuda_array(my_arr)

    @cuda.jit
    def mutate(arr, val):
        arr[cuda.grid(1)] += val

    val = 7
    mutate.forall(wrapped.size)(wrapped, val)

    np.testing.assert_array_equal(wrapped.copy_to_host(), h_arr + val)
    np.testing.assert_array_equal(d_arr.copy_to_host(), h_arr + val)
def test_ownership(self):
    # Get the deallocation queue
    ctx = cuda.current_context()
    deallocs = ctx.deallocations
    # Flush all deallocations
    deallocs.clear()
    self.assertEqual(len(deallocs), 0)

    # Make new device array
    d_arr = cuda.to_device(np.arange(100))
    # Convert it
    cvted = cuda.as_cuda_array(d_arr)
    # Drop reference to the original object such that
    # only `cvted` has a reference to it.
    del d_arr
    # There shouldn't be any new deallocations
    self.assertEqual(len(deallocs), 0)

    # Try to access the memory and verify its content
    np.testing.assert_equal(cvted.copy_to_host(), np.arange(100))

    # Drop last reference to the memory
    del cvted
    self.assertEqual(len(deallocs), 1)
    # Flush
    deallocs.clear()
def test_array_views(self):
    """Views created via the array interface support:
        - Direct views and strided slices
        - Reads and writes via __getitem__ / __setitem__
    """
    h_arr = np.random.random(10)
    c_arr = cuda.to_device(h_arr)

    arr = cuda.as_cuda_array(c_arr)

    # __getitem__ interface accesses expected data

    # Direct views
    np.testing.assert_array_equal(arr.copy_to_host(), h_arr)
    np.testing.assert_array_equal(arr[:].copy_to_host(), h_arr)

    # Slicing
    np.testing.assert_array_equal(arr[:5].copy_to_host(), h_arr[:5])

    # Strided view
    np.testing.assert_array_equal(arr[::2].copy_to_host(), h_arr[::2])

    # View of strided array
    arr_strided = cuda.as_cuda_array(c_arr[::2])
    np.testing.assert_array_equal(arr_strided.copy_to_host(), h_arr[::2])

    # A strided-view-of-array and view-of-strided-array have the same
    # shape, strides, itemsize, and alloc_size
    self.assertEqual(arr[::2].shape, arr_strided.shape)
    self.assertEqual(arr[::2].strides, arr_strided.strides)
    self.assertEqual(arr[::2].dtype.itemsize, arr_strided.dtype.itemsize)
    self.assertEqual(arr[::2].alloc_size, arr_strided.alloc_size)
    self.assertEqual(arr[::2].nbytes,
                     arr_strided.size * arr_strided.dtype.itemsize)

    # __setitem__ interface propagates into external array

    # Writes to a slice
    arr[:5] = np.pi
    np.testing.assert_array_equal(
        c_arr.copy_to_host(),
        np.concatenate((np.full(5, np.pi), h_arr[5:]))
    )

    # Writes to a slice from a view
    arr[:5] = arr[5:]
    np.testing.assert_array_equal(
        c_arr.copy_to_host(),
        np.concatenate((h_arr[5:], h_arr[5:]))
    )

    # Writes through a view
    arr[:] = cuda.to_device(h_arr)
    np.testing.assert_array_equal(c_arr.copy_to_host(), h_arr)

    # Writes to a strided slice
    arr[::2] = np.pi
    np.testing.assert_array_equal(
        c_arr.copy_to_host()[::2],
        np.full(5, np.pi),
    )
    np.testing.assert_array_equal(
        c_arr.copy_to_host()[1::2],
        h_arr[1::2]
    )
def as_device_array(self, obj):
    return cuda.as_cuda_array(obj)