def init_once():
    """Initialize UCX and pick the CUDA array allocator, at most once.

    The module-level ``ucp`` name doubles as the "already initialized"
    flag: when it is not ``None`` the function returns immediately.
    Otherwise it imports and initializes ``ucp`` with the scrubbed Dask UCX
    configuration, binds the module-level ``cuda_array(n)`` allocator
    (RMM preferred, then Numba, else a stub that raises ``RuntimeError``
    when called), and finally sets up an RMM pool if ``rmm.pool-size`` is
    configured.

    Raises
    ------
    ImportError
        If ``rmm.pool-size`` is configured but RMM cannot be imported.
    """
    global ucp, cuda_array
    if ucp is not None:
        return

    import ucp as _ucp

    ucp = _ucp

    # remove/process dask.ucx flags for valid ucx options
    ucx_config = _scrub_ucx_config()

    ucp.init(options=ucx_config, env_takes_precedence=True)

    # Find the function, `cuda_array()`, to use when allocating new CUDA arrays
    try:
        import rmm

        if hasattr(rmm, "DeviceBuffer"):

            def cuda_array(n):
                return rmm.DeviceBuffer(size=n)

        else:  # pre-0.11.0

            def cuda_array(n):
                return rmm.device_array(n, dtype=np.uint8)

    except ImportError:
        try:
            import numba.cuda

            def cuda_array(n):
                return numba.cuda.device_array((n,), dtype=np.uint8)

        except ImportError:

            def cuda_array(n):
                raise RuntimeError(
                    "In order to send/recv CUDA arrays, Numba or RMM is required"
                )

    pool_size_str = dask.config.get("rmm.pool-size")
    if pool_size_str is not None:
        # `rmm` is a local name that is only bound when the import above
        # succeeded. Import it again here so that a missing RMM surfaces as
        # a clear ImportError instead of an UnboundLocalError.
        import rmm

        pool_size = parse_bytes(pool_size_str)
        rmm.reinitialize(
            pool_allocator=True, managed_memory=False, initial_pool_size=pool_size
        )
def test_mr_devicebuffer_lifetime():
    """A DeviceBuffer must outlive neither its memory resource nor its stream.

    Regression test: the pool MR and the stream used for the allocation
    have to stay alive until the buffer itself is destroyed, even after
    the pool is no longer the current device resource.
    """
    # Install a brand-new pool MR layered on the current resource. The pool
    # is deliberately not bound to any Python name.
    upstream = rmm.mr.get_current_device_resource()
    rmm.mr.set_current_device_resource(rmm.mr.PoolMemoryResource(upstream))

    # A fresh, non-default stream for the allocation
    side_stream = rmm._cuda.stream.Stream()

    # Allocate from the pool on that stream
    dbuf = rmm.DeviceBuffer(size=10, stream=side_stream)

    # Swap in a different MR, so the pool is no longer the current resource
    rmm.mr.set_current_device_resource(rmm.mr.CudaMemoryResource())

    # Make sure anything collectable has really been collected
    gc.collect()

    # Destroying the buffer used to crash here; the pool MR must still be
    # alive at this point.
    del dbuf
def test_rmm_device_buffer(size):
    """Exercise the basic ``rmm.DeviceBuffer`` API for a buffer of ``size`` bytes.

    Covers size/pointer properties, ``__cuda_array_interface__``, the
    round trip through ``bytes``, and resizing.
    """
    buf = rmm.DeviceBuffer(size=size)

    # Size and pointer properties
    if not size:
        assert buf.ptr == 0
        assert buf.size == 0
    else:
        assert buf.ptr != 0
        assert buf.size == size
    assert len(buf) == buf.size
    assert buf.nbytes == buf.size
    assert buf.capacity() >= buf.size
    assert buf.__sizeof__() == buf.size

    # `__cuda_array_interface__`
    expected_keys = {"data", "shape", "strides", "typestr", "version"}
    cai = buf.__cuda_array_interface__
    assert isinstance(cai, dict)
    assert set(cai) == expected_keys
    assert cai["data"] == (buf.ptr, False)
    assert cai["shape"] == (buf.size,)
    assert cai["strides"] is None
    assert cai["typestr"] == "|u1"
    assert cai["version"] == 0

    # Device -> host bytes
    raw = buf.tobytes()
    assert isinstance(raw, bytes)
    assert len(raw) == len(buf)

    # Host bytes -> device
    round_trip = rmm.DeviceBuffer.to_device(raw)
    assert isinstance(round_trip, rmm.DeviceBuffer)
    assert len(round_trip) == len(raw)

    # Resizing
    buf.resize(2)
    assert buf.size == 2
    assert buf.capacity() >= buf.size
def register_am_allocators(args, worker):
    """Register AM buffer allocators on ``worker`` when ``args.enable_am`` is set.

    A NumPy-backed allocator is always registered for
    ``ucx_api.AllocatorType.HOST``. A ``ucx_api.AllocatorType.CUDA``
    allocator is registered only when ``args.object_type`` is ``"cupy"``
    or ``"rmm"``. Does nothing (and imports nothing) when
    ``args.enable_am`` is falsy.
    """
    if not args.enable_am:
        return

    import numpy as np

    def allocate_host(nbytes):
        return np.empty(nbytes, dtype=np.uint8)

    worker.register_am_allocator(allocate_host, ucx_api.AllocatorType.HOST)

    object_type = args.object_type
    if object_type == "cupy":
        import cupy as cp

        def allocate_cupy(nbytes):
            return cp.empty(nbytes, dtype=cp.uint8)

        worker.register_am_allocator(allocate_cupy, ucx_api.AllocatorType.CUDA)
    elif object_type == "rmm":
        import rmm

        def allocate_rmm(nbytes):
            return rmm.DeviceBuffer(size=nbytes)

        worker.register_am_allocator(allocate_rmm, ucx_api.AllocatorType.CUDA)
def test_dev_buf_circle_ref_dealloc():
    """Deallocating a DeviceBuffer caught in a reference cycle must not crash."""
    rmm.mr.set_current_device_resource(rmm.mr.CudaMemoryResource())

    dbuf = rmm.DeviceBuffer(size=1_000_000)

    # Put the buffer inside a self-referential list so that only the cyclic
    # garbage collector can reclaim it.
    cycle = [dbuf]
    cycle.append(cycle)

    # Dropping our names does not free the buffer yet, because of the
    # cycle; it lingers until `gc.collect()` runs below.
    del dbuf
    del cycle

    rmm.mr.set_current_device_resource(rmm.mr.CudaMemoryResource())

    # At this point the buffer should hold the only remaining reference to
    # the *original* memory resource. The cyclic collector clears that
    # reference through `tp_clear` before `tp_dealloc` actually frees the
    # buffer, and freeing needs the MR alive. This is the scenario that
    # used to segfault.
    gc.collect()
def test_statistics_resource_adaptor():
    """Check the counters of (stacked) ``StatisticsResourceAdaptor`` objects."""

    def check(adaptor, cur_bytes, cur_count, peak_bytes, peak_count,
              total_bytes, total_count):
        # Compare the adaptor's counters against the expected snapshot
        assert adaptor.allocation_counts == {
            "current_bytes": cur_bytes,
            "current_count": cur_count,
            "peak_bytes": peak_bytes,
            "peak_count": peak_count,
            "total_bytes": total_bytes,
            "total_count": total_count,
        }

    base_mr = rmm.mr.CudaMemoryResource()
    mr = rmm.mr.StatisticsResourceAdaptor(base_mr)
    rmm.mr.set_current_device_resource(mr)

    # Ten 1000-byte buffers, then drop every odd-indexed one (9, 7, ..., 1)
    buffers = [rmm.DeviceBuffer(size=1000) for _ in range(10)]
    for idx in reversed(range(1, 10, 2)):
        del buffers[idx]

    check(mr, 5000, 5, 10000, 10, 10000, 10)

    # Push a new Tracking adaptor on top of the first one
    mr2 = rmm.mr.StatisticsResourceAdaptor(mr)
    rmm.mr.set_current_device_resource(mr2)

    buffers.extend(rmm.DeviceBuffer(size=1000) for _ in range(2))

    # The outer adaptor only sees the two new buffers; the inner one sees all
    check(mr2, 2000, 2, 2000, 2, 2000, 2)
    check(mr, 7000, 7, 10000, 10, 12000, 12)

    # Release everything and verify that only the "current" counters drop
    del buffers
    gc.collect()

    check(mr2, 0, 0, 2000, 2, 2000, 2)
    check(mr, 0, 0, 10000, 10, 12000, 12)
def test_reinitialize_max_pool_size_exceeded():
    """Growing a buffer past ``maximum_pool_size`` must raise ``MemoryError``."""
    pool_limit = 1 << 23
    rmm.reinitialize(
        pool_allocator=True,
        initial_pool_size=0,
        maximum_pool_size=pool_limit,
    )

    # Ask for twice the pool limit (1 << 24 bytes)
    with pytest.raises(MemoryError):
        rmm.DeviceBuffer().resize(pool_limit << 1)
def test_reinitialize_max_pool_size():
    """A buffer one byte below ``maximum_pool_size`` must fit in the pool."""
    pool_limit = 1 << 23
    rmm.reinitialize(
        pool_allocator=True,
        initial_pool_size=0,
        maximum_pool_size=pool_limit,
    )

    rmm.DeviceBuffer().resize(pool_limit - 1)
def cuda_array(size):
    """Return a new ``rmm.DeviceBuffer`` created with ``size=size``."""
    buffer = rmm.DeviceBuffer(size=size)
    return buffer
def init_once():
    """Initialize UCX (creating a CUDA context first if needed), at most once.

    The module-level ``ucp`` name doubles as the "already initialized"
    flag: when it is not ``None`` the function returns immediately.

    Steps, in order:

    1. Scrub the Dask UCX configuration into valid UCX options.
    2. If ``distributed.comm.ucx.create-cuda-context`` is ``True`` or the
       UCX ``TLS`` option contains ``cuda_copy``, create a CUDA context via
       Numba *before* UCX is initialized, recording the state seen before
       and after in the module-level ``pre_existing_cuda_context`` and
       ``cuda_context_created``. Warnings are emitted if a context already
       existed or ended up on an unexpected device.
    3. Import and initialize ``ucp``.
    4. Bind the module-level ``device_array(n)`` allocator (RMM preferred,
       then Numba, else a stub that raises ``RuntimeError`` when called).
    5. Set up an RMM pool if ``distributed.rmm.pool-size`` is configured.

    Raises
    ------
    ImportError
        If a CUDA context is required but Numba is missing, or if an RMM
        pool size is configured but RMM cannot be imported.
    """
    global ucp, device_array
    global ucx_create_endpoint, ucx_create_listener
    global pre_existing_cuda_context, cuda_context_created

    if ucp is not None:
        return

    # remove/process dask.ucx flags for valid ucx options
    ucx_config = _scrub_ucx_config()

    # We ensure the CUDA context is created before initializing UCX. This can't
    # be safely handled externally because communications in Dask start before
    # preload scripts run.
    if dask.config.get("distributed.comm.ucx.create-cuda-context") is True or (
        "TLS" in ucx_config and "cuda_copy" in ucx_config["TLS"]
    ):
        try:
            import numba.cuda
        except ImportError as err:
            raise ImportError(
                "CUDA support with UCX requires Numba for context management"
            ) from err

        # The first entry of CUDA_VISIBLE_DEVICES (default "0") is the device
        # the context is expected to live on.
        cuda_visible_device = int(
            os.environ.get("CUDA_VISIBLE_DEVICES", "0").split(",")[0]
        )
        pre_existing_cuda_context = has_cuda_context()
        if pre_existing_cuda_context is not False:
            warnings.warn(
                f"A CUDA context for device {pre_existing_cuda_context} already exists on process "
                f"ID {os.getpid()}. This is often the result of a CUDA-enabled library calling a "
                "CUDA runtime function before Dask-CUDA can spawn worker processes. Please make "
                "sure any such function calls don't happen at import time or in the global scope "
                "of a program."
            )

        numba.cuda.current_context()

        cuda_context_created = has_cuda_context()
        if (
            cuda_context_created is not False
            and cuda_context_created != cuda_visible_device
        ):
            # Every fragment containing a placeholder needs its own `f`
            # prefix; otherwise "{cuda_context_created}" is emitted verbatim.
            warnings.warn(
                f"Worker with process ID {os.getpid()} should have a CUDA context assigned to "
                f"device {cuda_visible_device}, but instead the CUDA context is on device "
                f"{cuda_context_created}. This is often the result of a CUDA-enabled library "
                "calling a CUDA runtime function before Dask-CUDA can spawn worker processes. "
                "Please make sure any such function calls don't happen at import time or in "
                "the global scope of a program."
            )

    import ucp as _ucp

    ucp = _ucp

    ucp.init(options=ucx_config, env_takes_precedence=True)

    # Find the function, `device_array()`, to use when allocating new CUDA arrays
    try:
        import rmm

        def device_array(n):
            return rmm.DeviceBuffer(size=n)

    except ImportError:
        try:
            import numba.cuda

            def numba_device_array(n):
                a = numba.cuda.device_array((n,), dtype="u1")
                weakref.finalize(a, numba.cuda.current_context)
                return a

            device_array = numba_device_array

        except ImportError:

            def device_array(n):
                raise RuntimeError(
                    "In order to send/recv CUDA arrays, Numba or RMM is required"
                )

    pool_size_str = dask.config.get("distributed.rmm.pool-size")
    if pool_size_str is not None:
        # `rmm` is a local name that is only bound when the import above
        # succeeded. Import it again here so that a missing RMM surfaces as
        # a clear ImportError instead of an UnboundLocalError.
        import rmm

        pool_size = parse_bytes(pool_size_str)
        rmm.reinitialize(
            pool_allocator=True, managed_memory=False, initial_pool_size=pool_size
        )
def init_once():
    """Initialize UCX and bind the module-level allocators/factories, once.

    The module-level ``ucp`` name doubles as the "already initialized"
    flag: when it is not ``None`` the function returns immediately.
    Otherwise the function:

    * imports and initializes ``ucp`` with the scrubbed Dask UCX options;
    * binds ``host_array(n)`` (NumPy if importable, else ``bytearray``);
    * binds ``device_array(n)`` (RMM preferred, then Numba, else a stub
      that raises ``RuntimeError`` when called);
    * sets up an RMM pool if ``rmm.pool-size`` is configured;
    * binds ``ucx_create_endpoint`` / ``ucx_create_listener`` to the
      ``EndpointReuse`` variants when that class is importable and
      ``ucx.reuse-endpoints`` is truthy, otherwise to the plain ``ucp``
      functions.

    Raises
    ------
    ImportError
        If ``rmm.pool-size`` is configured but RMM cannot be imported.
    """
    global ucp, host_array, device_array, ucx_create_endpoint, ucx_create_listener
    if ucp is not None:
        return

    import ucp as _ucp

    ucp = _ucp

    # remove/process dask.ucx flags for valid ucx options
    ucx_config = _scrub_ucx_config()

    ucp.init(options=ucx_config, env_takes_precedence=True)

    # Find the function, `host_array()`, to use when allocating new host arrays
    try:
        import numpy

        def host_array(n):
            return numpy.empty((n,), dtype="u1")

    except ImportError:

        def host_array(n):
            return bytearray(n)

    # Find the function, `device_array()`, to use when allocating new CUDA arrays
    try:
        import rmm

        if hasattr(rmm, "DeviceBuffer"):

            def device_array(n):
                return rmm.DeviceBuffer(size=n)

        else:  # pre-0.11.0
            import numba.cuda

            def rmm_device_array(n):
                a = rmm.device_array(n, dtype="u1")
                weakref.finalize(a, numba.cuda.current_context)
                return a

            device_array = rmm_device_array

    except ImportError:
        try:
            import numba.cuda

            def numba_device_array(n):
                a = numba.cuda.device_array((n,), dtype="u1")
                weakref.finalize(a, numba.cuda.current_context)
                return a

            device_array = numba_device_array

        except ImportError:

            def device_array(n):
                raise RuntimeError(
                    "In order to send/recv CUDA arrays, Numba or RMM is required"
                )

    pool_size_str = dask.config.get("rmm.pool-size")
    if pool_size_str is not None:
        # `rmm` is a local name that is only bound when the import above
        # succeeded. Import it again here so that a missing RMM surfaces as
        # a clear ImportError instead of an UnboundLocalError.
        import rmm

        pool_size = parse_bytes(pool_size_str)
        rmm.reinitialize(
            pool_allocator=True, managed_memory=False, initial_pool_size=pool_size
        )

    try:
        from ucp.endpoint_reuse import EndpointReuse
    except ImportError:
        # Endpoint reuse is not available in this ucp build
        ucx_create_endpoint = ucp.create_endpoint
        ucx_create_listener = ucp.create_listener
    else:
        if dask.config.get("ucx.reuse-endpoints"):
            ucx_create_endpoint = EndpointReuse.create_endpoint
            ucx_create_listener = EndpointReuse.create_listener
        else:
            ucx_create_endpoint = ucp.create_endpoint
            ucx_create_listener = ucp.create_listener
def init_once():
    """Initialize UCX and bind the module-level allocators/factories, once.

    The module-level ``ucp`` name doubles as the "already initialized"
    flag: when it is not ``None`` the function returns immediately.
    Otherwise the function:

    * scrubs the Dask UCX configuration into valid UCX options;
    * creates a CUDA context through Numba *before* UCX is initialized when
      the ``TLS`` option contains ``cuda_copy``;
    * imports and initializes ``ucp``;
    * binds ``host_array(n)`` (NumPy if importable, else ``bytearray``);
    * binds ``device_array(n)`` (RMM preferred, then Numba, else a stub
      that raises ``RuntimeError`` when called);
    * sets up an RMM pool if ``rmm.pool-size`` is configured;
    * binds ``ucx_create_endpoint`` / ``ucx_create_listener``. The plain
      ``ucp`` functions are used when ``EndpointReuse`` is not importable,
      when ``ucx.reuse-endpoints`` is ``False``, or when it is ``None``
      and the UCX version is at least 1.11.0. The ``EndpointReuse``
      variants are used otherwise.

    Raises
    ------
    ImportError
        If ``cuda_copy`` is requested but Numba is missing, or if
        ``rmm.pool-size`` is configured but RMM cannot be imported.
    """
    global ucp, host_array, device_array, ucx_create_endpoint, ucx_create_listener
    if ucp is not None:
        return

    # remove/process dask.ucx flags for valid ucx options
    ucx_config = _scrub_ucx_config()

    # We ensure the CUDA context is created before initializing UCX. This can't
    # be safely handled externally because communications in Dask start before
    # preload scripts run.
    if "TLS" in ucx_config and "cuda_copy" in ucx_config["TLS"]:
        try:
            import numba.cuda
        except ImportError as err:
            raise ImportError(
                "CUDA support with UCX requires Numba for context management"
            ) from err

        numba.cuda.current_context()

    import ucp as _ucp

    ucp = _ucp

    ucp.init(options=ucx_config, env_takes_precedence=True)

    # Find the function, `host_array()`, to use when allocating new host arrays
    try:
        import numpy

        def host_array(n):
            return numpy.empty((n,), dtype="u1")

    except ImportError:

        def host_array(n):
            return bytearray(n)

    # Find the function, `device_array()`, to use when allocating new CUDA arrays
    try:
        import rmm

        if hasattr(rmm, "DeviceBuffer"):

            def device_array(n):
                return rmm.DeviceBuffer(size=n)

        else:  # pre-0.11.0
            import numba.cuda

            def rmm_device_array(n):
                a = rmm.device_array(n, dtype="u1")
                weakref.finalize(a, numba.cuda.current_context)
                return a

            device_array = rmm_device_array

    except ImportError:
        try:
            import numba.cuda

            def numba_device_array(n):
                a = numba.cuda.device_array((n,), dtype="u1")
                weakref.finalize(a, numba.cuda.current_context)
                return a

            device_array = numba_device_array

        except ImportError:

            def device_array(n):
                raise RuntimeError(
                    "In order to send/recv CUDA arrays, Numba or RMM is required"
                )

    pool_size_str = dask.config.get("rmm.pool-size")
    if pool_size_str is not None:
        # `rmm` is a local name that is only bound when the import above
        # succeeded. Import it again here so that a missing RMM surfaces as
        # a clear ImportError instead of an UnboundLocalError.
        import rmm

        pool_size = parse_bytes(pool_size_str)
        rmm.reinitialize(
            pool_allocator=True, managed_memory=False, initial_pool_size=pool_size
        )

    try:
        from ucp.endpoint_reuse import EndpointReuse
    except ImportError:
        # Endpoint reuse is not available in this ucp build
        ucx_create_endpoint = ucp.create_endpoint
        ucx_create_listener = ucp.create_listener
    else:
        reuse_endpoints = dask.config.get("ucx.reuse-endpoints")
        # Unset (None) means "decide by UCX version": no reuse from 1.11.0 on.
        if (
            reuse_endpoints is None and ucp.get_ucx_version() >= (1, 11, 0)
        ) or reuse_endpoints is False:
            ucx_create_endpoint = ucp.create_endpoint
            ucx_create_listener = ucp.create_listener
        else:
            ucx_create_endpoint = EndpointReuse.create_endpoint
            ucx_create_listener = EndpointReuse.create_listener
def _concat(cls, objs, dtype=None):
    """Concatenate the columns in ``objs`` into a single column.

    Parameters
    ----------
    objs : list of columns
        Columns to concatenate. Entries of this list may be replaced in
        place (cast to a common dtype, or swapped for an all-null column
        shaped like the first non-null column).
    dtype : optional
        Only consulted when ``objs`` is empty, to build the empty result.

    Returns
    -------
    The concatenated column; an empty masked column of ``dtype`` when
    ``objs`` is empty.

    Raises
    ------
    ValueError
        If, after normalization, the columns do not all share one dtype.
    MemoryError
        If the result would exceed ``libcudfxx.MAX_COLUMN_SIZE`` rows or,
        for string columns, ``libcudfxx.MAX_STRING_COLUMN_BYTES`` bytes.
    """
    from cudf.core.column import (
        CategoricalColumn,
        NumericalColumn,
        StringColumn,
    )
    from cudf.core.series import Series

    # Nothing to concatenate: return an empty column of the requested dtype
    if len(objs) == 0:
        dtype = pd.api.types.pandas_dtype(dtype)
        if is_categorical_dtype(dtype):
            dtype = CategoricalDtype()
        return column_empty(0, dtype=dtype, masked=True)

    # When every column that holds at least one valid value is a
    # non-datetime `NumericalColumn`, promote all inputs to one common
    # dtype. Pure-null columns can always be cast, so they are ignored
    # when deciding.
    with_values = [o for o in objs if len(o) != o.null_count]
    blockers = [
        o
        for o in with_values
        if not isinstance(o, NumericalColumn)
        or np.issubdtype(o.dtype, np.datetime64)
    ]
    if len(with_values) > 0 and len(blockers) == 0:
        # Let NumPy pick the common dtype, then cast every input to it
        common_dtype = np.find_common_type([o.dtype for o in with_values], [])
        for pos, column in enumerate(objs):
            objs[pos] = column.astype(common_dtype)

    # The first column that is not entirely null acts as the template;
    # fall back to the very first column if there is none.
    head = next((o for o in objs if len(o) != o.null_count), objs[0])

    # All-null columns whose dtype disagrees with the template are replaced
    # by an all-null column of the template's dtype and the same length.
    for pos, column in enumerate(objs):
        if pd.api.types.is_dtype_equal(column.dtype, head.dtype):
            continue
        if len(column) == column.null_count:
            from cudf.core.column import column_empty_like

            objs[pos] = column_empty_like(
                head, dtype=head.dtype, masked=True, newsize=len(column)
            )

    # Categoricals: re-encode every input against the union of categories
    if all(isinstance(o, CategoricalColumn) for o in objs):
        all_categories = ColumnBase._concat([o.categories for o in objs])
        cats = Series(all_categories).drop_duplicates()._column
        objs = [o.cat()._set_categories(cats, is_unique=True) for o in objs]

    head = objs[0]
    if any(not (o.dtype == head.dtype) for o in objs):
        raise ValueError("All series must be of same type")

    newsize = sum(len(o) for o in objs)
    if newsize > libcudfxx.MAX_COLUMN_SIZE:
        raise MemoryError("Result of concat cannot have "
                          "size > {}".format(libcudfxx.MAX_COLUMN_SIZE_STR))

    # Strings take a dedicated path through nvstrings
    if all(isinstance(o, StringColumn) for o in objs):
        result_nbytes = sum(o._nbytes for o in objs)
        if result_nbytes > libcudfxx.MAX_STRING_COLUMN_BYTES:
            raise MemoryError(
                "Result of concat cannot have > {} bytes".format(
                    libcudfxx.MAX_STRING_COLUMN_BYTES_STR))
        return as_column(nvstrings.from_strings(*[o.nvstrings for o in objs]))

    # Zero-length inputs contribute nothing; drop them
    objs = [o for o in objs if len(o) > 0]

    has_nulls = any(o.nullable for o in objs)

    if is_categorical_dtype(head):
        # Categorical output carries its codes as a child column
        data = None
        codes_dtype = head.codes.dtype
        children = (column_empty(newsize, dtype=codes_dtype, masked=True),)
    else:
        item_dtype = head.dtype
        data = Buffer(rmm.DeviceBuffer(size=newsize * item_dtype.itemsize))
        children = None

    # Only allocate an output mask when some input can hold nulls
    mask = Buffer(utils.make_mask(newsize)) if has_nulls else None

    col = build_column(
        data=data, dtype=head.dtype, mask=mask, children=children
    )

    # Perform the actual concatenation
    if newsize > 0:
        col = libcudf.concat._column_concat(objs, col)
    return col