def test_redirected_logger():
    """Every enabled log level must reach a redirected ``sys.stdout``."""
    captured = StringIO()

    with logger.set_level(logger.level_trace):
        # We do not test trace because CUML_LOG_TRACE is not compiled by
        # default
        cases = (
            (logger.debug, "This is a debug message"),
            (logger.info, "This is an info message"),
            (logger.warn, "This is a warn message"),
            (logger.error, "This is an error message"),
            (logger.critical, "This is a critical message"),
        )
        for emit, test_msg in cases:
            # Only the logging call itself runs under the redirect; the
            # buffer is inspected afterwards.
            with redirect_stdout(captured):
                emit(test_msg)
            assert test_msg in captured.getvalue()

    # Check that logging does not error with sys.stdout of None
    with redirect_stdout(None):
        logger.debug("This is a debug message")
def init(self, workers=None):
    """
    Initializes the underlying comms. NCCL is required but
    UCX is only initialized if `comms_p2p == True`

    Parameters
    ----------
    workers : iterable of worker addresses, optional (default=None)
        Workers to initialize the comms on. When None, the keys of
        ``self.client.has_what()`` are used.

    Notes
    -----
    If the comms are already initialized a warning is emitted and the
    call returns without modifying any state.
    """
    # Guard first: a redundant call must not overwrite the worker list
    # that destroy() later uses to tear the comms down.
    if self.nccl_initialized:
        warnings.warn("CommsContext has already been initialized.")
        return

    candidates = self.client.has_what().keys() \
        if workers is None else workers
    # dict.fromkeys drops duplicates while keeping first-seen order, so the
    # worker ordering is stable from run to run (a set's order is not).
    self.worker_addresses = list(dict.fromkeys(candidates))

    worker_info = self.worker_info(self.worker_addresses)
    # Keep only the requested workers, in the order stored above.
    worker_info = {w: worker_info[w] for w in self.worker_addresses}

    self.uniqueId = nccl.get_unique_id()

    self.client.run(_func_init_all,
                    self.sessionId,
                    self.uniqueId,
                    self.comms_p2p,
                    worker_info,
                    self.streams_per_handle,
                    workers=self.worker_addresses,
                    wait=True)

    self.nccl_initialized = True

    if self.comms_p2p:
        self.ucx_initialized = True

    logger.debug("Initialization complete.")
def test_concat_memory_leak(large_clf, estimator_type):
    """Repeated treelite-handle concatenation must not grow process memory.

    Skipped when ``psutil`` is unavailable.
    """
    import gc
    import os
    try:
        import psutil
    except ImportError:
        pytest.skip("psutil not installed")

    proc = psutil.Process(os.getpid())

    X, y = large_clf
    X = X.astype(np.float32)

    # Pick the estimator class and the matching label dtype
    if estimator_type == 'classification':
        make_model, label_dtype = curfc, np.int32
    elif estimator_type == 'regression':
        make_model, label_dtype = curfr, np.float32
    else:
        assert False

    # Build a series of RF models
    n_models = 10
    base_models = [
        make_model(max_depth=10, n_estimators=100, random_state=123)
        for _ in range(n_models)
    ]
    y = y.astype(label_dtype)

    # Pre-fit once - this is our baseline and memory usage
    # should not significantly exceed it after later fits
    for fitted in base_models:
        fitted.fit(X, y)

    # Just concatenate over and over in a loop
    init_model, *concat_models = base_models
    other_handles = [
        other._obtain_treelite_handle() for other in concat_models
    ]
    init_model._concatenate_treelite_handle(other_handles)

    gc.collect()
    initial_baseline_mem = proc.memory_info().rss

    for rep in range(10):
        init_model._concatenate_treelite_handle(other_handles)
        gc.collect()
        used_mem = proc.memory_info().rss
        delta = used_mem - initial_baseline_mem
        logger.debug("memory at rep %2d: %d m" % (rep, delta / 1e6))

    gc.collect()
    used_mem = proc.memory_info().rss
    delta = used_mem - initial_baseline_mem
    logger.info("Final memory delta: %d" % (delta / 1e6))
    assert delta < 1e6
def check_order(arr_order):
    """Report whether an array of order ``arr_order`` needs converting.

    Reads ``order`` and ``fail_on_order`` from the enclosing scope.
    Returns True when the orders differ (and ``order`` is not 'K') and
    conversion is permitted; raises ValueError when ``fail_on_order`` is
    set; returns False otherwise.
    """
    # 'K' keeps whatever order the input has, so nothing can mismatch.
    if order == 'K' or arr_order == order:
        return False

    if fail_on_order:
        raise ValueError("Expected " + order_to_str(order) +
                         " major order, but got the opposite.")

    debug("Expected " + order_to_str(order) + " major order, "
          "but got the opposite. Converting data, this will "
          "result in additional memory utilization.")
    return True
def __init__(self, data=None, convert_to_dtype=False,
             convert_index=np.int32, convert_format=True):
    """
    Wrap a CuPy or SciPy sparse matrix as CSR component arrays.

    Parameters
    ----------
    data : cupyx.scipy.sparse or scipy.sparse matrix
        Input sparse matrix.
    convert_to_dtype : dtype or False (default: False)
        Dtype for the stored values. When falsy, ``data.dtype`` is used.
    convert_index : dtype or False (default: np.int32)
        Dtype for ``indptr`` and ``indices``. When falsy, the dtype of
        ``data.indptr`` is used.
    convert_format : bool (default: True)
        When the input is not CSR, convert it with ``tocsr()`` if True,
        otherwise raise.

    Raises
    ------
    ValueError
        If ``data`` is not a sparse matrix, or is not CSR while
        ``convert_format`` is False.
    """
    if not cpx.scipy.sparse.isspmatrix(data) and \
            not (has_scipy() and scipy.sparse.isspmatrix(data)):
        raise ValueError("A sparse matrix is expected as input. "
                         "Received %s" % type(data))

    check_classes = [cpx.scipy.sparse.csr_matrix]
    if has_scipy():
        check_classes.append(scipy.sparse.csr_matrix)

    if not isinstance(data, tuple(check_classes)):
        if convert_format:
            # The hint names the real keyword argument, `convert_format`.
            debug('Received sparse matrix in %s format but CSR is '
                  'expected. Data will be converted to CSR, but this '
                  'will require additional memory copies. If this '
                  'conversion is not desired, set '
                  'convert_format=False to raise an exception '
                  'instead.' % type(data))
            data = data.tocsr()  # currently only CSR is supported
        else:
            raise ValueError("Expected CSR matrix but received %s"
                             % type(data))

    # Falsy conversion targets mean "keep what the input already has".
    if not convert_to_dtype:
        convert_to_dtype = data.dtype

    if not convert_index:
        convert_index = data.indptr.dtype

    # Note: Only 32-bit indexing is supported currently.
    # In CUDA11, Cusparse provides 64-bit function calls
    # but these are not yet used in RAFT/Cuml
    self.indptr, _, _, _ = cuml.common.input_to_cuml_array(
        data.indptr, check_dtype=convert_index,
        convert_to_dtype=convert_index)

    self.indices, _, _, _ = cuml.common.input_to_cuml_array(
        data.indices, check_dtype=convert_index,
        convert_to_dtype=convert_index)

    self.data, _, _, _ = cuml.common.input_to_cuml_array(
        data.data, check_dtype=data.dtype,
        convert_to_dtype=convert_to_dtype)

    self.shape = data.shape
    self.dtype = self.data.dtype
    self.nnz = data.nnz
def run_classification(datatype, penalty, loss, dims, nclasses):
    """Fit cuML and sklearn LinearSVC on the same data and compare scores.

    Parameters
    ----------
    datatype, nclasses
        Forwarded to ``make_classification_dataset``.
    penalty, loss : str
        Forwarded to both LinearSVC implementations.
    dims : tuple (nrows, ncols)
        Shape of the generated dataset.

    The test is skipped for the hinge/l1 combination, and when sklearn
    does not finish within a timeout derived from the cuML run time.
    """
    # Skip before doing any work: no data is needed to decide this.
    if loss == 'hinge' and penalty == 'l1':
        pytest.skip(
            "sklearn does not support this combination of loss and penalty")

    t = time.perf_counter()
    nrows, ncols = dims
    X_train, X_test, y_train, y_test = make_classification_dataset(
        datatype, nrows, ncols, nclasses)
    logger.debug(f"Data generation time: {time.perf_counter() - t} s.")

    # solving in primal is not supported by sklearn for this loss type.
    skdual = loss == 'hinge' and penalty == 'l2'

    # limit the max iterations for sklearn to reduce the max test time
    cuit = 10000
    # sklearn's max_iter must be an integer; the true division yields a
    # float, so truncate explicitly.
    skit = int(max(10, min(cuit, cuit * 1000 / nrows)))

    t = time.perf_counter()
    handle = cuml.Handle(n_streams=0)
    cum = cu.LinearSVC(
        handle=handle,
        loss=loss, penalty=penalty, max_iter=cuit)
    cum.fit(X_train, y_train)
    cus = cum.score(X_test, y_test)
    handle.sync()
    t = time.perf_counter() - t
    logger.debug(f"Cuml time: {t} s.")
    # From here on `t` is the time budget granted to sklearn.
    t = max(5, t * SKLEARN_TIMEOUT_FACTOR)

    # cleanup cuml objects so that we can more easily fork the process
    # and test sklearn
    del cum
    X_train = X_train.get()
    X_test = X_test.get()
    y_train = y_train.get()
    y_test = y_test.get()
    gc.collect()

    try:
        def run_sklearn():
            skm = sk.LinearSVC(
                loss=loss, penalty=penalty, max_iter=skit, dual=skdual)
            skm.fit(X_train, y_train)
            return skm.score(X_test, y_test)

        sks = with_timeout(timeout=t, target=run_sklearn)
        good_enough(cus, sks, nrows)
    except TimeoutError:
        pytest.skip(f"sklearn did not finish within {t} seconds.")
def test_logger():
    """Smoke-test every log level plus the level and pattern contexts."""
    messages = (
        (logger.trace, "This is a trace message"),
        (logger.debug, "This is a debug message"),
        (logger.info, "This is an info message"),
        (logger.warn, "This is a warn message"),
        (logger.error, "This is a error message"),
        (logger.critical, "This is a critical message"),
    )
    for emit, text in messages:
        emit(text)

    # At warn level, warn is enabled and the less severe info is not.
    with logger.set_level(logger.level_warn):
        assert logger.should_log_for(logger.level_warn)
        assert not logger.should_log_for(logger.level_info)

    with logger.set_pattern("%v"):
        logger.info("This is an info message")
def destroy(self):
    """
    Tear down the comms on the stored workers and reset the
    initialization flags.
    """
    run_on_workers = self.client.run
    run_on_workers(_func_destroy_all,
                   self.sessionId,
                   self.comms_p2p,
                   wait=True,
                   workers=self.worker_addresses)

    logger.debug("Destroying comms.")

    # Mark both backends as uninitialized.
    self.nccl_initialized = self.ucx_initialized = False
def test_log_flush():
    """A logged message reaches the byte buffer only after ``flush()``."""
    raw_bytes = BytesIO()
    text_stream = TextIOWrapper(raw_bytes)

    def buffered_text():
        # Decode whatever has reached the underlying byte buffer so far.
        return raw_bytes.getvalue().decode('utf-8')

    with logger.set_level(logger.level_trace):
        test_msg = "This is a debug message"
        with redirect_stdout(text_stream):
            logger.debug(test_msg)
            assert test_msg not in buffered_text()
            logger.flush()
            assert test_msg in buffered_text()

    # Check that logging flush does not error with sys.stdout of None
    with redirect_stdout(None):
        logger.flush()
def _to_dask_cudf(futures, client=None):
    """
    Convert a list of futures containing cudf Dataframes into a
    Dask.Dataframe

    :param futures: list[cudf.Dataframe] list of futures containing
        dataframes
    :param client: dask.distributed.Client Optional client to use
    :return: dask.Dataframe a dask.Dataframe
    """
    if client is None:
        active_client = default_client()
    else:
        active_client = client

    # Keep only the futures whose result type is not NoneType
    none_type = type(None)
    dfs = [fut for fut in futures if fut.type != none_type]  # NOQA

    if logger.should_log_for(logger.level_debug):
        logger.debug("to_dask_cudf dfs=%s" % str(dfs))

    # The metadata is computed from the first remaining partition.
    meta = active_client.submit(_get_meta, dfs[0], pure=False).result()
    return dd.from_delayed(dfs, meta=meta)
def test_umap_mnmg(n_parts, n_rows, sampling_ratio, supervised,
                   dataset, n_neighbors, client):
    """Distributed UMAP trustworthiness may trail the local one by <= 0.1."""
    local_X, local_y = _load_dataset(dataset, n_rows)

    mnmg_score = _umap_mnmg_trustworthiness(local_X, local_y,
                                            n_neighbors, supervised,
                                            n_parts, sampling_ratio)
    local_score = _local_umap_trustworthiness(local_X, local_y,
                                              n_neighbors, supervised)

    logger.debug(
        "\nLocal UMAP trustworthiness score : {:.2f}".format(local_score))
    logger.debug(
        "UMAP MNMG trustworthiness score : {:.2f}".format(mnmg_score))

    assert local_score - mnmg_score <= 0.1
def tree_reduce(objs, func=sum):
    """
    Performs a binary tree reduce on an associative
    and commutative function in parallel across
    Dask workers. Since this supports dask.delayed
    objects, which have not yet been scheduled on workers,
    it does not take locality into account. As a result,
    any local reductions should be performed before
    this function is called.

    Parameters
    ----------
    func : Python function or dask.delayed function
        Function to use for reduction. The reduction function
        accepts a list of objects to reduce as an argument and
        produces a single reduced object
    objs : array-like of dask.delayed or future
           objects to reduce.

    Returns
    -------
    reduced_result : dask.delayed or future
        if func is delayed, the result will be delayed
        if func is a future, the result will be a future
    """
    reducer = func if isinstance(func, Delayed) else dask.delayed(func)

    while len(objs) > 1:
        # One tree level: reduce each adjacent pair (a trailing odd
        # element is passed through the reducer on its own).
        next_level = [
            reducer(dask.delayed(objs[start:start + 2], pure=False))
            for start in range(0, len(objs), 2)
        ]
        wait(next_level)
        objs = next_level

    logger.debug(str(objs))
    return first(objs)
def __init__(self, comms_p2p=False, client=None, streams_per_handle=0):
    """
    Construct a new CommsContext instance

    :param comms_p2p: bool Should p2p comms be initialized?
    :param client: Dask client to use; the default client is looked up
        when None
    :param streams_per_handle: stored on the instance for later use
    """
    if client is None:
        client = default_client()
    self.client = client

    self.comms_p2p = comms_p2p
    self.streams_per_handle = streams_per_handle

    # Random identifier for this comms session
    self.sessionId = uuid.uuid4().bytes

    self.nccl_initialized = False
    self.ucx_initialized = False

    # p2p was requested but UCX support is missing: warn and fall back.
    if comms_p2p and not (is_ucx_enabled() and has_ucp()):
        warnings.warn("ucx-py not found. UCP Integration will "
                      "be disabled.")
        self.comms_p2p = False

    logger.debug("Initializing comms!")
def transform(self, X):
    """
    Transform X using one-hot encoding.

    Parameters
    ----------
    X : cudf.DataFrame or cupy.ndarray
        The data to encode.

    Returns
    -------
    X_out : sparse matrix if sparse=True else a 2-d array
        Transformed input.

    Raises
    ------
    TypeError
        Re-raised with the per-column index dtypes appended when a
        TypeError occurs while building the category indices. The
        original exception is chained as ``__cause__``.
    """
    self._check_is_fitted()
    X = self._check_input(X)

    # Per-feature column/row index arrays; kept as lists for the whole
    # call so the error handler below can always inspect them.
    cols, rows = list(), list()
    col_idx = None
    j = 0  # running column offset of the current feature in the output

    try:
        for feature in X.columns:
            encoder = self._encoders[feature]
            col_idx = encoder.transform(X[feature])
            idx_to_keep = cp.asarray(col_idx.notnull().to_gpu_array())
            col_idx = cp.asarray(col_idx.dropna().to_gpu_array())

            # Simple test to auto upscale col_idx type as needed
            # First, determine the maximum value we will add assuming
            # monotonically increasing up to len(encoder.classes_)
            # Ensure we dont go negative by clamping to 0
            max_value = int(max(len(encoder.classes_) - 1, 0) + j)

            # If we exceed the max value, upconvert
            if max_value > np.iinfo(col_idx.dtype).max:
                wider_dtype = np.min_scalar_type(max_value)
                col_idx = col_idx.astype(wider_dtype)
                # Implicit concatenation keeps source indentation out of
                # the logged message.
                logger.debug("Upconverting column: '{}', to dtype: '{}', "
                             "to support up to {} classes".format(
                                 feature, wider_dtype, max_value))

            # increase indices to take previous features into account
            col_idx += j

            # Filter out rows with null values
            row_idx = cp.arange(len(X))[idx_to_keep]

            if self.drop_idx_ is not None:
                drop_idx = self.drop_idx_[feature] + j
                # builtin bool instead of the deprecated cp.bool alias
                mask = cp.ones(col_idx.shape, dtype=bool)
                mask[col_idx == drop_idx] = False
                col_idx = col_idx[mask]
                row_idx = row_idx[mask]
                # account for dropped category in indices
                col_idx[col_idx > drop_idx] -= 1
                # account for dropped category in current cats number
                j -= 1
            j += len(encoder.classes_)
            cols.append(col_idx)
            rows.append(row_idx)

        all_cols = cp.concatenate(cols)
        all_rows = cp.concatenate(rows)
        val = cp.ones(all_rows.shape[0], dtype=self.dtype)
        ohe = cupyx.scipy.sparse.coo_matrix((val, (all_rows, all_cols)),
                                            shape=(len(X), j),
                                            dtype=self.dtype)

        if not self.sparse:
            ohe = ohe.toarray()

        return ohe
    except TypeError as e:
        # Report the columns processed so far plus the one being handled
        # when the error was raised, if it is not already in the list.
        reported = list(cols)
        if col_idx is not None and \
                not any(col_idx is seen for seen in cols):
            reported.append(col_idx)

        # Build a string showing what the types are
        input_types_str = ", ".join(str(x.dtype) for x in reported)

        raise TypeError(
            "A TypeError occurred while calculating column "
            "category indices, most likely due to integer overflow. This "
            "can occur when columns have a large difference in the number "
            "of categories, resulting in different category code dtypes "
            "for different columns. "
            "Calculated column code dtypes: {}.\n"
            "Internal Error: {}".format(input_types_str, repr(e))) from e
def cuda_kernel_factory(nvrtc_kernel_str, dtypes, kernel_name=None):
    """
    A factory wrapper function to perform some of the boiler-plate involved in
    making cuPy RawKernels type-agnostic.

    Until a better method is created, either by RAPIDS or cuPy, this function
    will perform a string search and replace of c-based datatype primitives
    in ``nvrtc_kernel_str`` using a numerical placeholder (eg. {0}, {1}) for
    the dtype in the corresponding index of tuple ``dtypes``.

    Note that the extern, function scope, and function name should not be
    included in the kernel string. These will be added by this function and
    the function name will be made unique, based on the given dtypes.

    Example
    -------

    The following kernel string with dtypes = [float, double, int]

    ({0} *a, {1} *b, {2} *c) {}

    Will become

    (float *a, double *b, int *c) {}

    Parameters
    ----------

    nvrtc_kernel_str : string valid nvrtc kernel string without extern, scope,
                       or function name.
    dtypes : tuple of dtypes to search and replace.
    kernel_name : string prefix and function name to use. Note that when
                  this not set (or is set to None), a UUID will
                  be used, which will stop this function from
                  being memoized.

    Returns
    -------

    raw_kernel : cupy.RawKernel object ready for use, compiled under the
                 unique name ``<kernel_name or uuid>_<dtype strings>``
    """
    dtype_strs = get_dtype_strs(dtypes)

    # Substitute each positional placeholder "{i}" with its C type name.
    source = nvrtc_kernel_str
    for position, _ in enumerate(dtypes):
        source = source.replace("{%d}" % position, dtype_strs[position])

    # Unique function name: caller prefix (or a fresh UUID) + dtype suffix.
    prefix = uuid1() if kernel_name is None else kernel_name
    suffix = "".join(dtype_strs).replace(" ", "_")
    unique_name = f"{prefix}_{suffix}"

    source = "%s\nvoid %s%s" % (extern_prefix, unique_name, source)

    if logger.should_log_for(logger.LEVEL_DEBUG):
        logger.debug(str(source))

    return cp.RawKernel(source, unique_name)
def input_to_cuml_array(X, order='F', deepcopy=False, check_dtype=False, convert_to_dtype=False, check_cols=False, check_rows=False, fail_on_order=False, force_contiguous=True):
    """
    Convert input X to CumlArray.

    Acceptable input formats:

    * cuDF Dataframe - returns a deep copy always.
    * cuDF Series - returns by reference or a deep copy depending on
        `deepcopy`.
    * Numpy array - returns a copy in device always
    * cuda array interface compliant array (like Cupy) - returns a
        reference unless `deepcopy`=True.
    * numba device array - returns a reference unless deepcopy=True

    Parameters
    ----------

    X : cuDF.DataFrame, cuDF.Series, NumPy array, Pandas DataFrame, Pandas
        Series or any cuda_array_interface (CAI) compliant array like CuPy,
        Numba or pytorch.

    order: 'F', 'C' or 'K' (default: 'F')
        Whether to return a F-major ('F'),  C-major ('C') array or Keep ('K')
        the order of X. Used to check the order of the input. If
        fail_on_order=True, the method will raise ValueError,
        otherwise it will convert X to be of order `order` if needed.

    deepcopy: boolean (default: False)
        Set to True to always return a deep copy of X.
        NOTE(review): in this function `copy.deepcopy` is only applied in
        the array-interface branch; a CumlArray input is returned as-is.

    check_dtype: np.dtype (default: False)
        Set to a np.dtype (or a list of them) to throw an error if X is not
        of dtype `check_dtype`. Ignored when `convert_to_dtype` is set.

    convert_to_dtype: np.dtype (default: False)
        Set to a dtype if you want X to be converted to that dtype if it is
        not that dtype already.

    check_cols: int (default: False)
        Set to an int `i` to check that input X has `i` columns. Set to False
        (default) to not check at all.

    check_rows: int (default: False)
        Set to an int `i` to check that input X has `i` rows. Set to False
        (default) to not check at all.

    fail_on_order: boolean (default: False)
        Set to True if you want the method to raise a ValueError if X is not
        of order `order`.

    force_contiguous: boolean (default: True)
        Set to True to force CumlArray produced to be contiguous. If `X` is
        non contiguous then a contiguous copy will be done.
        If False, and `X` doesn't need to be converted and is not contiguous,
        the underlying memory underneath the CumlArray will be non
        contiguous.  Only affects CAI inputs. Only affects CuPy and Numba
        device array views, all other input methods produce contiguous
        CumlArrays.
        Currently this argument is overridden to True inside the function,
        so the value passed by the caller has no effect.

    Returns
    -------
    `cuml_array`: namedtuple('cuml_array', 'array n_rows n_cols dtype')

        A new CumlArray and associated data.

    Raises
    ------
    ValueError
        If the order differs and `fail_on_order` is set, if a cuDF Series
        contains nulls, or if a `check_cols` / `check_rows` check fails.
    TypeError
        If the input format is not supported, or the dtype is not one of
        `check_dtype`.

    """
    def check_order(arr_order):
        # Returns True when the array must be converted to `order`;
        # 'K' accepts any input order.
        if order != 'K' and arr_order != order:
            if fail_on_order:
                raise ValueError("Expected " + order_to_str(order) +
                                 " major order, but got the opposite.")
            else:
                debug("Expected " + order_to_str(order) + " major order, "
                      "but got the opposite. Converting data, this will "
                      "result in additional memory utilization.")
                return True
        return False

    # dtype conversion

    # force_contiguous set to True always for now
    # upcoming CumlArray improvements will affect this
    # https://github.com/rapidsai/cuml/issues/2412
    force_contiguous = True

    if convert_to_dtype:
        X = convert_dtype(X, to_dtype=convert_to_dtype)
        # The data was just converted, so the dtype check is skipped.
        check_dtype = False

    # format conversion

    if (isinstance(X, cudf.Series)):
        if X.null_count != 0:
            raise ValueError("Error: cuDF Series has missing/null values, "
                             "which are not supported by cuML.")

    # converting pandas to numpy before sending it to CumlArray
    if isinstance(X, pd.DataFrame) or isinstance(X, pd.Series):
        # pandas doesn't support custom order in to_numpy
        X = cp.asarray(X.to_numpy(copy=False), order=order)

    if isinstance(X, cudf.DataFrame):
        # 'K' has no meaning for a dataframe here, so 'F' is used.
        if order == 'K':
            X_m = CumlArray(data=X.as_gpu_matrix(order='F'))
        else:
            X_m = CumlArray(data=X.as_gpu_matrix(order=order))

    elif isinstance(X, CumlArray):
        X_m = X

    elif hasattr(X, "__array_interface__") or \
            hasattr(X, "__cuda_array_interface__"):

        # Since we create the array with the correct order here, do the order
        # check now if necessary
        interface = getattr(X, "__array_interface__", None) or getattr(
            X, "__cuda_array_interface__", None)

        arr_info = ArrayInfo.from_interface(interface)

        # Called for its raise/debug side effect; the return value is
        # not needed because cp.array below already applies `order`.
        check_order(arr_info.order)

        make_copy = False

        if force_contiguous or hasattr(X, "__array_interface__"):
            if not _check_array_contiguity(X):
                debug("Non contiguous array or view detected, a "
                      "contiguous copy of the data will be done.")
                # X = cp.array(X, order=order, copy=True)
                make_copy = True

        cp_arr = cp.array(X, copy=make_copy, order=order)

        X_m = CumlArray(data=cp_arr)

        if deepcopy:
            X_m = copy.deepcopy(X_m)

    else:
        msg = "X matrix format " + str(X.__class__) + " not supported"
        raise TypeError(msg)

    if check_dtype:
        # Accept a single dtype or a list of allowed dtypes.
        if not isinstance(check_dtype, list):
            check_dtype = [check_dtype]

        check_dtype = [np.dtype(dtype) for dtype in check_dtype]

        if X_m.dtype not in check_dtype:
            type_str = X_m.dtype
            # Drop the reference to the converted array before raising.
            del X_m
            raise TypeError("Expected input to be of type in " +
                            str(check_dtype) + " but got " + str(type_str))

    # Checks based on parameters

    n_rows = X_m.shape[0]

    # 1-D inputs are reported as having a single column.
    if len(X_m.shape) > 1:
        n_cols = X_m.shape[1]
    else:
        n_cols = 1

    # With a single row or column the final order check is disabled by
    # switching to 'K'.
    if n_cols == 1 or n_rows == 1:
        order = 'K'

    if check_cols:
        if n_cols != check_cols:
            raise ValueError("Expected " + str(check_cols) +
                             " columns but got " + str(n_cols) +
                             " columns.")

    if check_rows:
        if n_rows != check_rows:
            raise ValueError("Expected " + str(check_rows) +
                             " rows but got " + str(n_rows) +
                             " rows.")

    # Final order check on the produced array; convert when required.
    if (check_order(X_m.order)):
        X_m = cp.array(X_m, copy=False, order=order)
        X_m = CumlArray(data=X_m)

    return cuml_array(array=X_m,
                      n_rows=n_rows,
                      n_cols=n_cols,
                      dtype=X_m.dtype)
def random_state():
    """Draw a random seed in [0, 1_000_000], log it at debug level, and
    return it.

    The log level is raised to debug for the duration of the message so
    the seed is shown regardless of the current level.
    """
    # Integer bound: random.randint rejects float arguments such as 1e6
    # on Python 3.12+ (and deprecates them from 3.10).
    seed = random.randint(0, 10 ** 6)
    with logger.set_level(logger.level_debug):
        logger.debug("Random seed: {}".format(seed))
    return seed
def make_blobs(n_samples=100, n_features=2, centers=None, cluster_std=1.0,
               n_parts=None, center_box=(-10, 10), shuffle=True,
               random_state=None, return_centers=False,
               verbosity=logger.LEVEL_INFO, order='F', dtype='float32',
               client=None):
    """
    Makes labeled Dask-Cupy arrays containing blobs
    for a randomly generated set of centroids.

    This function calls `make_blobs` from `cuml.datasets` on each Dask worker
    and aggregates them into a single Dask Dataframe.

    For more information on Scikit-learn's `make_blobs:
    <https://scikit-learn.org/stable/modules/generated/sklearn.datasets.make_blobs.html>`_.

    Parameters
    ----------

    n_samples : int
        number of rows
    n_features : int
        number of features
    centers : int or array of shape [n_centers, n_features],
        optional (default=None)
        The number of centers to generate, or the fixed center locations.
        If n_samples is an int and centers is None, 3 centers are generated.
        If n_samples is array-like, centers must be either None or an array
        of length equal to the length of n_samples.
    cluster_std : float (default = 1.0)
        standard deviation of points around centroid
    n_parts : int (default = None)
        number of partitions to generate (this can be greater
        than the number of workers)
    center_box : tuple (int, int) (default = (-10, 10))
        the bounding box which constrains all the centroids
    shuffle : bool (default=True)
        Shuffles the samples on each worker.
    random_state : int (default = None)
        sets random seed (or use None to reinitialize each time)
    return_centers : bool, optional (default=False)
        If True, then return the centers of each cluster
    verbosity : int (default = cuml.logger.LEVEL_INFO)
        Logging level. Accepted for API compatibility; it is not read by
        this function.
    order: str, optional (default='F')
        The order of the generated samples
    dtype : str, optional (default='float32')
        Dtype of the generated samples
    client : dask.distributed.Client (optional)
        Dask client to use

    Returns
    -------
    X : dask.array backed by CuPy array of shape [n_samples, n_features]
        The input samples.
    y : dask.array backed by CuPy array of shape [n_samples]
        The output values.
    centers : dask.array backed by CuPy array of shape
        [n_centers, n_features], optional
        The centers of the underlying blobs. It is returned only if
        return_centers is True.

    Notes
    -----
    Every partition receives at least one row, so when ``n_samples`` is
    smaller than ``n_parts`` more than ``n_samples`` rows are generated.
    """

    client = get_client(client=client)

    generator = _create_rs_generator(random_state=random_state)

    workers = list(client.scheduler_info()['workers'].keys())

    n_parts = n_parts if n_parts is not None else len(workers)
    # Assign partitions to workers round-robin.
    parts_workers = (workers * n_parts)[:n_parts]

    centers, _ = _get_centers(generator, centers, center_box,
                              n_samples, n_features, dtype)

    rows_per_part = max(1, int(n_samples / n_parts))

    worker_rows = [rows_per_part] * n_parts

    # The last partition absorbs the rows left over by the even split.
    # The leftover is n_samples % n_parts; taking the modulus by
    # rows_per_part instead silently drops rows whenever the leftover is
    # >= rows_per_part (e.g. 11 samples over 4 partitions).
    worker_rows[-1] += n_samples % n_parts

    worker_rows = tuple(worker_rows)

    logger.debug("Generating %d samples across %d partitions on "
                 "%d workers (total=%d samples)" %
                 (math.ceil(n_samples / len(workers)),
                  n_parts, len(workers), n_samples))

    # One seed per partition so each worker draws different data.
    seeds = generator.randint(n_samples, size=len(parts_workers))
    parts = [client.submit(_create_local_data,
                           part_rows,
                           n_features,
                           centers,
                           cluster_std,
                           shuffle,
                           int(seeds[idx]),
                           order,
                           dtype,
                           pure=False,
                           workers=[parts_workers[idx]])
             for idx, part_rows in enumerate(worker_rows)]

    # Split each generated partition into its samples and its labels.
    X = [client.submit(_get_X, f, pure=False) for f in parts]
    y = [client.submit(_get_labels, f, pure=False) for f in parts]

    X_del = _create_delayed(X, dtype, worker_rows, n_features)
    y_del = _create_delayed(y, dtype, worker_rows)

    X_final = da.concatenate(X_del, axis=0)
    y_final = da.concatenate(y_del, axis=0)

    if return_centers:
        return X_final, y_final, centers
    else:
        return X_final, y_final
async def _func_init_all(sessionId, uniqueId, comms_p2p,
                         worker_info, streams_per_handle):
    """Worker-side initialization of NCCL and, optionally, UCX endpoints.

    Stores the NCCL unique id, this worker's rank and the worker count in
    the session state, initializes NCCL, then builds either the p2p
    handle (after creating UCX endpoints) or the plain handle.
    """
    state = worker_state(sessionId)
    state["nccl_uid"] = uniqueId
    state["wid"] = worker_info[get_worker().address]["rank"]
    state["nworkers"] = len(worker_info)

    if logger.should_log_for(logger.level_debug):
        logger.debug("Initializing NCCL")
        t0 = time.time()

    _func_init_nccl(sessionId, uniqueId)

    if logger.should_log_for(logger.level_debug):
        took = time.time() - t0
        logger.debug("NCCL Initialization took: %f seconds." % took)

    # Without p2p only the plain handle is needed.
    if not comms_p2p:
        _func_build_handle(sessionId, streams_per_handle)
        return

    logger.debug("Initializing UCX Endpoints")

    if logger.should_log_for(logger.level_debug):
        t0 = time.time()
    await _func_ucp_create_endpoints(sessionId, worker_info)

    if logger.should_log_for(logger.level_debug):
        took = time.time() - t0
        logger.debug("Done initializing UCX endpoints. Took: %f seconds." %
                     took)
        logger.debug("Building handle")

    _func_build_handle_p2p(sessionId, streams_per_handle)

    logger.debug("Done building handle.")