def test_pca_fit_transform_fp32(nrows, ncols, n_parts, client=None):
    """Smoke-test ``fit_transform`` of the dask PCA on float32 blobs.

    Parameters
    ----------
    nrows, ncols, n_parts
        Forwarded positionally to ``make_blobs``.
    client : optional
        Existing client to reuse.  When ``None``, a ``LocalCUDACluster``
        (one thread per worker) and a ``Client`` are created here and are
        closed before returning, even when the test body raises.
    """
    from contextlib import ExitStack

    with ExitStack() as owned:
        if client is None:
            # Only resources created here are registered for cleanup; a
            # caller-supplied client is left open.
            cluster = owned.enter_context(
                LocalCUDACluster(threads_per_worker=1))
            client = owned.enter_context(Client(cluster))

        from cuml.dask.decomposition import PCA as daskPCA
        from cuml.dask.datasets import make_blobs

        X_cudf, _ = make_blobs(nrows, ncols, 1, n_parts,
                               cluster_std=1.5, verbose=False,
                               random_state=10, dtype=np.float32)
        wait(X_cudf)

        cupca = daskPCA(n_components=20, whiten=True)
        cupca.fit_transform(X_cudf)
def test_pagerank():
    """Compare dask-cugraph PageRank on the karate dataset with NetworkX.

    The test passes when fewer than 1% of the result rows differ from the
    NetworkX value by more than ``tol * 1.1``.  The cluster and client are
    closed even when the comparison fails.
    """
    gc.collect()
    input_data_path = r"../datasets/karate.csv"

    # Networkx Call
    pd_df = pd.read_csv(input_data_path, delimiter=' ',
                        names=['src', 'dst', 'value'])
    G = nx.Graph()
    for src, dst in zip(pd_df['src'], pd_df['dst']):
        G.add_edge(src, dst)
    nx_pr = nx.pagerank(G, alpha=0.85)
    nx_pr = sorted(nx_pr.items(), key=lambda x: x[0])

    # Cugraph snmg pagerank Call
    with LocalCUDACluster(threads_per_worker=1) as cluster, \
            Client(cluster):
        chunksize = dcg.get_chunksize(input_data_path)
        ddf = dask_cudf.read_csv(input_data_path, chunksize=chunksize,
                                 delimiter=' ',
                                 names=['src', 'dst', 'value'],
                                 dtype=['int32', 'int32', 'float32'])
        pr = dcg.pagerank(ddf, alpha=0.85, max_iter=50)
        res_df = pr.compute()

        tol = 1.0e-05
        pagerank = res_df['pagerank']
        # Row i of the result is compared against the i-th NetworkX entry
        # after sorting by vertex id.
        err = sum(1 for i in range(len(res_df))
                  if abs(pagerank[i] - nx_pr[i][1]) > tol * 1.1)
        print("Mismatches:", err)
        assert err < (0.01 * len(res_df))
def create_cuml_distributed(X_train, y_train):
    """Fit a ``distributed_cuml_Rf`` model on a temporary local CUDA cluster.

    A ``LocalCUDACluster`` and ``Client`` are started for the duration of
    the call and are closed on exit, including when fitting raises.
    Progress and elapsed time are printed.

    Returns
    -------
    The fitted model.
    """
    start_time = datetime.now()
    print('init dask cluster')

    with LocalCUDACluster(threads_per_worker=1) as cluster, \
            Client(cluster) as client:
        workers = client.has_what().keys()
        n_workers = len(workers)

        X_train_cudf = cudf.DataFrame.from_pandas(pd.DataFrame(X_train))
        y_train_cudf = cudf.Series(y_train)
        X_train_dask = dask_cudf.from_cudf(X_train_cudf,
                                           npartitions=n_workers)
        y_train_dask = dask_cudf.from_cudf(y_train_cudf,
                                           npartitions=n_workers)
        X_train_ddask, y_train_ddask = dask_utils.persist_across_workers(
            client, [X_train_dask, y_train_dask], workers=workers)
        print('cuml distributed initialized', datetime.now() - start_time)

        model = distributed_cuml_Rf(n_estimators=500, n_streams=64)
        # NOTE(review): fit receives the original host inputs, not the
        # persisted X_train_ddask / y_train_ddask built above — confirm
        # whether the distributed collections were meant to be used.
        model.fit(X_train, y_train)
        wait(model.rfs)
        print('cuml distributed finished', datetime.now() - start_time)

    return model
def fit(self, data, args):
    """Train an XGBoost model with dask on a temporary local CUDA cluster.

    ``args.gpus`` selects the number of workers (a negative value passes
    ``n_workers=None`` to ``LocalCUDACluster``), ``args.root`` is used as
    the cluster's ``local_directory`` and ``args.ntrees`` as the number of
    boosting rounds.  The training data is sliced into one chunk per
    worker.  The trained booster is stored in ``self.model``.

    The cluster and client are closed on exit, including when training
    raises.

    Returns
    -------
    ``t.interval`` of the ``Timer`` wrapped around ``xgb.dask.train``.
    """
    params = self.configure(data, args)
    n_workers = None if args.gpus < 0 else args.gpus

    with LocalCUDACluster(n_workers=n_workers,
                          local_directory=args.root) as cluster, \
            Client(cluster) as client:
        n_partitions = len(client.scheduler_info()['workers'])
        X_sliced, y_sliced = self.get_slices(n_partitions,
                                             data.X_train, data.y_train)

        X = da.concatenate(
            [da.from_array(sub_array) for sub_array in X_sliced])
        X = X.rechunk((X_sliced[0].shape[0], data.X_train.shape[1]))
        y = da.concatenate(
            [da.from_array(sub_array) for sub_array in y_sliced])
        # Keep the label chunks aligned with the feature row chunks.
        y = y.rechunk(X.chunksize[0])

        dtrain = xgb.dask.DaskDMatrix(client, X, y)
        with Timer() as t:
            output = xgb.dask.train(client, params, dtrain,
                                    num_boost_round=args.ntrees)
        self.model = output['booster']

    return t.interval
def fit(self, data, args):
    """Train an XGBoost model with dask on a temporary local CUDA cluster.

    NumPy training data is wrapped in dask arrays and any other input in
    dask dataframes, both using partitions of 10000 rows.  ``args.gpus``
    selects the number of workers (negative means ``n_workers=None``),
    ``args.root`` is the cluster's ``local_directory`` and ``args.ntrees``
    the number of boosting rounds.  The trained booster is stored in
    ``self.model``.

    The cluster and client are closed on exit, including when training
    raises.

    Returns
    -------
    ``t.interval`` of the ``Timer`` wrapped around ``xgb.dask.train``.
    """
    params = self.configure(data, args)
    n_workers = None if args.gpus < 0 else args.gpus

    with LocalCUDACluster(n_workers=n_workers,
                          local_directory=args.root,
                          threads_per_worker=1) as cluster, \
            Client(cluster) as client:
        partition_size = 10000
        if isinstance(data.X_train, np.ndarray):
            X = da.from_array(data.X_train,
                              (partition_size, data.X_train.shape[1]))
            y = da.from_array(data.y_train, partition_size)
        else:
            X = dd.from_pandas(data.X_train, chunksize=partition_size)
            y = dd.from_pandas(data.y_train, chunksize=partition_size)

        dtrain = xgb.dask.DaskDMatrix(client, X, y)
        with Timer() as t:
            output = xgb.dask.train(client, params, dtrain,
                                    num_boost_round=args.ntrees)
        self.model = output['booster']

    return t.interval
def test_send_recv(n_trials):
    """Submit ``func_test_send_recv`` once per comms worker and collect results.

    A p2p ``CommsContext`` is initialised, then the default comms handle is
    used to address one task at each of its ``worker_addresses``.  The
    comms handle, client and cluster are released even when a task or the
    final assertion fails.
    """
    with LocalCUDACluster(threads_per_worker=1) as cluster, \
            Client(cluster) as client:
        cb = CommsContext(comms_p2p=True)
        cb.init()
        try:
            cb = default_comms()

            start = time.time()
            dfs = [client.submit(func_test_send_recv,
                                 cb.sessionId,
                                 n_trials,
                                 random.random(),
                                 workers=[w])
                   for w in cb.worker_addresses]
            wait(dfs)
            print("Time: " + str(time.time() - start))

            result = [future.result() for future in dfs]
            print(str(result))

            # Only checks that at least one result came back.
            assert result
        finally:
            cb.destroy()
def test_consolidation(graph_file):
    """Check that a cugraph DiGraph built from dask_cudf matches NetworkX.

    The same edge list file is loaded once through
    ``utils.read_csv_for_nx`` into a NetworkX ``DiGraph`` and once through
    ``dask_cudf.read_csv`` into a cugraph ``DiGraph``; the two are compared
    with ``compare_graphs``.  The cluster and client are closed even when
    the comparison fails.
    """
    gc.collect()

    with LocalCUDACluster() as cluster, Client(cluster):
        chunksize = dcg.get_chunksize(graph_file)

        M = utils.read_csv_for_nx(graph_file)
        df = pd.DataFrame()
        df['source'] = pd.Series(M['0'])
        df['target'] = pd.Series(M['1'])

        ddf = dask_cudf.read_csv(graph_file, chunksize=chunksize,
                                 delimiter=' ',
                                 names=['source', 'target', 'weight'],
                                 dtype=['int32', 'int32', 'float32'],
                                 header=None)

        Gnx = nx.from_pandas_edgelist(df, source='source', target='target',
                                      create_using=nx.DiGraph)
        G = cugraph.from_cudf_edgelist(ddf, source='source',
                                       destination='target',
                                       create_using=cugraph.DiGraph)

        assert compare_graphs(Gnx, G)
        Gnx.clear()
        G.clear()
def get_cuda_cluster():
    """Yield a ``LocalCUDACluster`` with at most two workers, then close it.

    The worker count is the number of comma-separated entries in
    ``CUDA_VISIBLE_DEVICES`` (treated as ``"0"`` when unset), capped at 2.
    The cluster is closed when the generator is resumed, closed, or has an
    exception thrown into it at the ``yield``.
    """
    from dask_cuda import LocalCUDACluster

    visible_devices = os.environ.get("CUDA_VISIBLE_DEVICES", "0")
    n_workers = min(2, len(visible_devices.split(",")))
    cluster = LocalCUDACluster(n_workers=n_workers)
    try:
        yield cluster
    finally:
        cluster.close()
def cluster():
    """Yield a TCP ``LocalCUDACluster`` on scheduler port 0, then close it.

    Progress messages are printed around start-up and shutdown.  Shutdown
    runs in a ``finally`` so the cluster is also closed when an exception
    is thrown into the generator or the generator is closed early.
    """
    print("Starting cluster")
    cuda_cluster = LocalCUDACluster(protocol="tcp", scheduler_port=0)
    try:
        yield cuda_cluster
    finally:
        print("Closing cluster")
        cuda_cluster.close()
        print("Closed cluster")
class MGContext:
    """Utility Context Manager to start a multi GPU context using dask_cuda

    Parameters:
    -----------

    number_of_devices : int
        Number of devices to use, verification must be done prior to call to
        ensure that there are enough devices available. If not specified, the
        cluster will be initialized to use all visible devices.
    rmm_managed_memory : bool
        True to enable managed memory (UVM) in RMM as part of the cluster.
        Default is False.
    p2p : bool
        Initialize UCX endpoints if True. Default is False.
    """

    def __init__(self, number_of_devices=None, rmm_managed_memory=False,
                 p2p=False):
        self._number_of_devices = number_of_devices
        self._rmm_managed_memory = rmm_managed_memory
        self._client = None
        self._p2p = p2p
        # The cluster is started at construction time; the client and comms
        # are only set up on __enter__.
        self._cluster = CUDACluster(
            n_workers=self._number_of_devices,
            rmm_managed_memory=self._rmm_managed_memory)

    @property
    def client(self):
        """The dask client, or None before the context is entered."""
        return self._client

    @property
    def cluster(self):
        """The cluster created in the constructor."""
        return self._cluster

    def __enter__(self):
        try:
            self._prepare_mg()
        except BaseException:
            # __exit__ is not invoked when __enter__ raises, so release
            # whatever was already started before propagating.
            self._close()
            raise
        return self

    def _prepare_mg(self):
        """Connect the client, then initialise comms."""
        self._prepare_client()
        self._prepare_comms()

    def _prepare_client(self):
        self._client = Client(self._cluster)
        self._client.wait_for_workers(self._number_of_devices)

    def _prepare_comms(self):
        Comms.initialize(p2p=self._p2p)

    def _close(self):
        """Tear down comms, client and cluster, in that order.

        Each step runs even if the previous one raised, so a failing
        ``Comms.destroy()`` cannot leave the client or cluster open.
        """
        try:
            Comms.destroy()
        finally:
            try:
                if self._client is not None:
                    self._client.close()
            finally:
                if self._cluster is not None:
                    self._cluster.close()

    def __exit__(self, type, value, traceback):
        self._close()
def predict_xgboost_gpu(xgb_model, X, data_chunksize=None, n_gpus=None,
                        n_threads_per_gpu=1, gpu_cluster=None, client=None):
    '''
    Predicts the output for the input features X using the 'xgb_model'
    running on the GPU.

    :param xgb_model: a dask XGBoost model to use for predictions
    :param X: the input features to use for predictions, must be either a
        numpy ndarray or a pandas DataFrame
    :param data_chunksize: chunk size to be used on the dask collection
        built from X; leave the default value None to use the number of
        rows divided by the number of devices of the cluster (at least 1)
    :param n_gpus: number of GPUs to be used. Default value None selects
        all available devices;
    :param n_threads_per_gpu: number of threads per GPU;
    :param gpu_cluster: an existing dask cluster object to use. This param
        should be used if you call this method many times in quick
        succession. Note that this function doesn't close an externally
        created cluster.
    :param client: an existing dask client object to use. This param
        should be used if you call this method many times in quick
        succession. Note that this function doesn't close an externally
        created client.
    :return: If the input features X is a pandas DataFrame, returns a
        single-column DataFrame containing the predictions; otherwise
        returns the result of ``DataFrame.to_numpy()`` on that frame.
    '''
    owns_cluster = gpu_cluster is None
    owns_client = client is None

    if owns_cluster:
        local_gpus = LocalCUDACluster(n_workers=n_gpus,
                                      threads_per_worker=n_threads_per_gpu)
    else:
        local_gpus = gpu_cluster

    # Resources created here are released in the ``finally`` clauses so a
    # failing prediction does not leave a cluster or client running.
    try:
        local_dask_client = Client(local_gpus) if owns_client else client
        try:
            if data_chunksize is None:
                # Never fall to 0 when X has fewer rows than devices.
                data_chunksize = max(
                    1,
                    X.shape[0] // len(local_gpus.cuda_visible_devices))

            ndarray = not isinstance(X, pd.DataFrame)
            if ndarray:
                X = from_array(X, chunksize=data_chunksize)
            else:
                X = from_pandas(X, chunksize=data_chunksize)

            y_predicted = dask_xgboost_predict(local_dask_client,
                                               xgb_model, X)
            y_predicted = pd.DataFrame(y_predicted)
        finally:
            if owns_client:
                local_dask_client.close()
    finally:
        if owns_cluster:
            local_gpus.close()

    if ndarray:
        return y_predicted.to_numpy()
    return y_predicted
def test_pagerank():
    """Compare dask-cugraph PageRank on a hibench sample with NetworkX.

    Timings for reading, running, computing and writing are printed, the
    result is written to ``~/pagerank.csv``, and the test passes when fewer
    than 2% of the result rows differ from the NetworkX value by more than
    ``tol * 1.1``.  The cluster and client are closed even when a step or
    the final comparison fails.
    """
    gc.collect()
    input_data_path = r"../datasets/hibench_small/1/part-00000.csv"

    # Networkx Call
    pd_df = pd.read_csv(input_data_path, delimiter='\t',
                        names=['src', 'dst'])
    G = nx.DiGraph()
    for src, dst in zip(pd_df['src'], pd_df['dst']):
        G.add_edge(src, dst)
    nx_pr = nx.pagerank(G, alpha=0.85)
    nx_pr = sorted(nx_pr.items(), key=lambda x: x[0])

    # Cugraph snmg pagerank Call
    with LocalCUDACluster(threads_per_worker=1) as cluster, \
            Client(cluster) as client:
        t0 = time.time()
        chunksize = dcg.get_chunksize(input_data_path)
        ddf = dask_cudf.read_csv(input_data_path, chunksize=chunksize,
                                 delimiter='\t', names=['src', 'dst'],
                                 dtype=['int32', 'int32'])
        y = ddf.to_delayed()
        x = client.compute(y)
        wait(x)
        t1 = time.time()
        print("Reading Csv time: ", t1 - t0)

        new_ddf = dcg.drop_duplicates(x)
        t2 = time.time()
        pr = dcg.pagerank(new_ddf, alpha=0.85, max_iter=50)
        wait(pr)
        t3 = time.time()
        print("Running PR algo time: ", t3 - t2)

        t4 = time.time()
        res_df = pr.compute()
        t5 = time.time()
        print("Compute time: ", t5 - t4)
        print(res_df)

        t6 = time.time()
        # For bigdatax4, chunksize=100000000 to avoid oom on write csv
        res_df.to_csv('~/pagerank.csv', header=False, index=False)
        t7 = time.time()
        print("Write csv time: ", t7 - t6)

        # Comparison
        tol = 1.0e-05
        pagerank = res_df['pagerank']
        err = sum(1 for i in range(len(res_df))
                  if abs(pagerank[i] - nx_pr[i][1]) > tol * 1.1)
        print("Mismatches:", err)
        assert err < (0.02 * len(res_df))
def test_tree_stats(self) -> None:
    """Tree statistics must match between a 1-worker and a 2-worker run."""
    stats = []
    # First a single worker ("local"), then two workers ("distributed").
    for worker_count in (1, 2):
        with LocalCUDACluster(n_workers=worker_count) as cluster, \
                Client(cluster) as client:
            stats.append(run_tree_stats(client, "gpu_hist"))

    local, distributed = stats
    assert local == distributed
def test_default_comms_no_exist():
    """``default_comms()`` returns a handle, and the same session twice.

    The cluster and client are closed even when an assertion fails.
    """
    with LocalCUDACluster(threads_per_worker=1) as cluster, \
            Client(cluster):
        cb = default_comms()
        assert cb is not None

        cb2 = default_comms()
        assert cb.sessionId == cb2.sessionId
def client_connection():
    """Yield a client on a local CUDA cluster with p2p comms initialised.

    Teardown (``Comms.destroy()``, then closing the client and cluster) is
    guaranteed: it also runs when ``Comms.initialize`` fails part-way, when
    an exception is thrown into the generator at the ``yield``, or when the
    generator is closed early.
    """
    with LocalCUDACluster() as cluster, Client(cluster) as client:
        Comms.initialize(p2p=True)
        try:
            yield client
        finally:
            Comms.destroy()
def ucx_cluster():
    """Yield a UCX ``LocalCUDACluster``, then close it.

    ``initialize.initialize`` is called first with a CUDA context and the
    same ``enable_tcp_over_ucx`` / ``enable_nvlink`` / ``enable_infiniband``
    values (taken from the enclosing scope) that are passed to the cluster.
    The cluster is closed in a ``finally`` so it is also released when an
    exception is thrown into the generator or it is closed early.
    """
    initialize.initialize(create_cuda_context=True,
                          enable_tcp_over_ucx=enable_tcp_over_ucx,
                          enable_nvlink=enable_nvlink,
                          enable_infiniband=enable_infiniband)
    cluster = LocalCUDACluster(protocol="ucx",
                               enable_tcp_over_ucx=enable_tcp_over_ucx,
                               enable_nvlink=enable_nvlink,
                               enable_infiniband=enable_infiniband)
    try:
        yield cluster
    finally:
        cluster.close()
class MGContext:
    """Utility Context Manager to start a multi GPU context using dask_cuda

    Parameters:
    -----------

    number_of_devices : int
        Number of devices to use, verification must be done prior to call to
        ensure that there are enough devices available.
    rmm_managed_memory : bool
        Forwarded to ``CUDACluster`` as ``rmm_managed_memory``.
        Default is False.
    """

    def __init__(self, number_of_devices=None, rmm_managed_memory=False):
        self._number_of_devices = number_of_devices
        self._rmm_managed_memory = rmm_managed_memory
        # Nothing is started until the context is entered.
        self._cluster = None
        self._client = None

    @property
    def client(self):
        """The dask client, or None before the context is entered."""
        return self._client

    @property
    def cluster(self):
        """The cluster, or None before the context is entered."""
        return self._cluster

    def __enter__(self):
        try:
            self._prepare_mg()
        except BaseException:
            # __exit__ is not invoked when __enter__ raises, so release
            # whatever was already started before propagating.
            self._close()
            raise
        return self

    def _prepare_mg(self):
        """Start the cluster, connect the client, then initialise comms."""
        self._prepare_cluster()
        self._prepare_client()
        self._prepare_comms()

    def _prepare_cluster(self):
        self._cluster = CUDACluster(
            n_workers=self._number_of_devices,
            rmm_managed_memory=self._rmm_managed_memory)

    def _prepare_client(self):
        self._client = Client(self._cluster)
        self._client.wait_for_workers(self._number_of_devices)

    def _prepare_comms(self):
        Comms.initialize()

    def _close(self):
        """Tear down comms, client and cluster, in that order.

        Each step runs even if the previous one raised, so a failing
        ``Comms.destroy()`` cannot leave the client or cluster open.
        """
        try:
            Comms.destroy()
        finally:
            try:
                if self._client is not None:
                    self._client.close()
            finally:
                if self._cluster is not None:
                    self._cluster.close()

    def __exit__(self, type, value, traceback):
        self._close()
def client_connection():
    """Yield a client on a local CUDA cluster with comms initialised.

    Teardown (``Comms.destroy()``, then closing the client and cluster) is
    guaranteed: it also runs when ``Comms.initialize`` fails part-way, when
    an exception is thrown into the generator at the ``yield``, or when the
    generator is closed early.
    """
    # setup
    with LocalCUDACluster() as cluster, Client(cluster) as client:
        Comms.initialize()
        try:
            yield client
        finally:
            # teardown
            Comms.destroy()
def test_pca_fit(nrows, ncols, n_parts, client=None):
    """Compare the dask TruncatedSVD with scikit-learn's on generated blobs.

    Parameters
    ----------
    nrows, ncols, n_parts
        Forwarded positionally to ``make_blobs``.
    client : optional
        Existing client to reuse.  When ``None``, a ``LocalCUDACluster``
        and ``Client`` are created here and closed before the fitted
        attributes are compared, even when fitting raises.
    """
    from contextlib import ExitStack

    with ExitStack() as owned:
        if client is None:
            # Only resources created here are registered for cleanup.
            cluster = owned.enter_context(
                LocalCUDACluster(threads_per_worker=1))
            client = owned.enter_context(Client(cluster))

        from cuml.dask.decomposition import TruncatedSVD as daskTPCA
        from sklearn.decomposition import TruncatedSVD
        from cuml.dask.datasets import make_blobs

        X_cudf, _ = make_blobs(nrows, ncols, 1, n_parts,
                               cluster_std=0.5, verbose=False,
                               random_state=10, dtype=np.float32)
        wait(X_cudf)
        X = X_cudf.compute().to_pandas().values

        cutsvd = daskTPCA(n_components=5)
        cutsvd.fit(X_cudf)

        sktsvd = TruncatedSVD(n_components=5, algorithm="arpack")
        sktsvd.fit(X)

    all_attr = [
        'singular_values_',
        'components_',
        'explained_variance_',
        'explained_variance_ratio_',
    ]

    for attr in all_attr:
        # Components are compared without regard to sign.
        with_sign = attr not in ['components_']
        cuml_res = getattr(cutsvd, attr)
        # NOTE(review): numpy arrays have no ``as_matrix``; this branch
        # would raise if taken — confirm the intended type check.
        if type(cuml_res) == np.ndarray:
            cuml_res = cuml_res.as_matrix()
        skl_res = getattr(sktsvd, attr)

        # Singular values use a looser tolerance than the other attributes.
        tolerance = 1 if attr == 'singular_values_' else 1e-1
        assert array_equal(cuml_res, skl_res, tolerance,
                           with_sign=with_sign)
def test_default_comms():
    """``default_comms()`` returns the session of the initialised context.

    The comms handle, client and cluster are released even when the
    assertion fails.
    """
    with LocalCUDACluster(threads_per_worker=1) as cluster, \
            Client(cluster) as client:
        cb = CommsContext(comms_p2p=True, client=client)
        cb.init()

        # Fall back to destroying ``cb`` if ``default_comms()`` itself
        # fails; on the normal path the default handle is destroyed.
        comms = cb
        try:
            comms = default_comms()
            assert cb.sessionId == comms.sessionId
        finally:
            comms.destroy()
def test_splitting():
    """Run dask PageRank over a dataset split into 32 CSV files.

    Timings for reading, PageRank, compute and CSV writing are printed and
    the result is written to ``~/pagerank.csv``.  The cluster and client
    are closed even when a step raises.
    """
    gc.collect()

    # This is an experimental setup for 300GB bigdatax8 dataset.
    # This test can be run on 16 32GB gpus. The dataset is split into 32 files.
    input_data_path = r"/datasets/pagerank_demo/1/Input-bigdatax8/edges/"
    # file-00000.csv ... file-00031.csv
    input_files = [f'file-{index:05d}.csv' for index in range(32)]

    # Cugraph snmg pagerank Call
    with LocalCUDACluster(threads_per_worker=1) as cluster, \
            Client(cluster):
        files = [input_data_path + f for f in input_files]

        # Read 2 files per gpu/worker and concatenate the dataframe
        # This is a work around for large files to fit memory requirements
        # of cudf.read_csv
        t0 = time.time()
        new_ddf = dcg.read_split_csv(files)
        t1 = time.time()
        print("Reading Csv time: ", t1 - t0)

        t2 = time.time()
        pr = dcg.pagerank(new_ddf, alpha=0.85, max_iter=3)
        wait(pr)
        t3 = time.time()
        print("Pagerank (Dask) time: ", t3 - t2)

        t4 = time.time()
        res_df = pr.compute()
        t5 = time.time()
        print("Compute time: ", t5 - t4)
        print(res_df)

        t6 = time.time()
        res_df.to_csv('~/pagerank.csv', chunksize=40000000, header=False,
                      index=False)
        t7 = time.time()
        print("Write csv time: ", t7 - t6)
def test_pca_fit(nrows, ncols, n_parts, client=None):
    """Compare the dask PCA with scikit-learn's PCA on generated blobs.

    Parameters
    ----------
    nrows, ncols, n_parts
        Forwarded positionally to ``make_blobs``.
    client : optional
        Existing client to reuse.  When ``None``, a ``LocalCUDACluster``
        and ``Client`` are created here and closed before the fitted
        attributes are compared, even when fitting raises.
    """
    from contextlib import ExitStack

    with ExitStack() as owned:
        if client is None:
            # Only resources created here are registered for cleanup.
            cluster = owned.enter_context(
                LocalCUDACluster(threads_per_worker=1))
            client = owned.enter_context(Client(cluster))

        from cuml.dask.decomposition import PCA as daskPCA
        from sklearn.decomposition import PCA
        from cuml.dask.datasets import make_blobs

        X_cudf, _ = make_blobs(nrows, ncols, 1, n_parts,
                               cluster_std=0.5, verbose=False,
                               random_state=10, dtype=np.float32)
        wait(X_cudf)
        X = X_cudf.compute().to_pandas().values

        cupca = daskPCA(n_components=5, whiten=True)
        cupca.fit(X_cudf)

        skpca = PCA(n_components=5, whiten=True, svd_solver="full")
        skpca.fit(X)

        from cuml.test.utils import array_equal

    all_attr = [
        'singular_values_',
        'components_',
        'explained_variance_',
        'explained_variance_ratio_',
    ]

    for attr in all_attr:
        # Components are compared without regard to sign.
        with_sign = attr not in ['components_']
        cuml_res = getattr(cupca, attr)
        # NOTE(review): numpy arrays have no ``as_matrix``; this branch
        # would raise if taken — confirm the intended type check.
        if type(cuml_res) == np.ndarray:
            cuml_res = cuml_res.as_matrix()
        skl_res = getattr(skpca, attr)
        assert array_equal(cuml_res, skl_res, 1e-3, with_sign=with_sign)
def test_end_to_end(nrows, ncols, nclusters, n_parts, client=None):
    """cuML dask KMeans labels must match dask-ml KMeans labels exactly.

    Both models are fitted on blobs from ``dask_make_blobs`` and their
    predictions are compared with ``adjusted_rand_score``, which must be
    1.0.

    Parameters
    ----------
    nrows, ncols, nclusters, n_parts
        Forwarded positionally to ``dask_make_blobs``; ``nclusters`` is
        also the ``n_clusters`` of both models.
    client : optional
        Existing client to reuse.  When ``None``, a ``LocalCUDACluster``
        and ``Client`` are created here and closed before the final
        assertion, even when fitting or predicting raises.
    """
    from contextlib import ExitStack

    with ExitStack() as owned:
        if client is None:
            # Only resources created here are registered for cleanup.
            cluster = owned.enter_context(
                LocalCUDACluster(threads_per_worker=1))
            client = owned.enter_context(Client(cluster))

        from cuml.dask.cluster import KMeans as cumlKMeans
        from dask_ml.cluster import KMeans as dmlKMeans
        from cuml.test.dask.utils import dask_make_blobs

        X_df, X_cudf = dask_make_blobs(nrows, ncols, nclusters, n_parts,
                                       cluster_std=0.1, verbose=True,
                                       random_state=10)
        wait(X_cudf)

        cumlModel = cumlKMeans(verbose=0, init="k-means||",
                               n_clusters=nclusters, random_state=10)
        daskmlModel1 = dmlKMeans(init="k-means||", n_clusters=nclusters,
                                 random_state=10)

        cumlModel.fit(X_cudf)
        daskmlModel1.fit(X_df)

        cumlLabels = cumlModel.predict(X_cudf)
        daskmlLabels1 = daskmlModel1.predict(X_df)

        from sklearn.metrics import adjusted_rand_score

        cumlPred = cumlLabels.compute().to_pandas().values
        daskmlPred1 = daskmlLabels1.compute()

        score = adjusted_rand_score(cumlPred, daskmlPred1)

    assert 1.0 == score
def __init__(
    self,
    cloud_type="Azure",
    model_type="RandomForest",
    data_type="Parquet",
    compute_type="single-GPU",
    verbose_estimator=False,
    CSP_paths=default_azureml_paths,
):
    """Store the run configuration and, for multi-GPU runs, start a cluster.

    All arguments are stored on the instance under the same names and a
    summary line is written through ``log_to_file``.  When
    ``compute_type`` contains ``"multi"``, a ``LocalCUDACluster`` with one
    thread per worker is started and ``self.client``, ``self.workers`` and
    ``self.n_workers`` are set.
    """
    self.CSP_paths = CSP_paths
    self.cloud_type, self.model_type = cloud_type, model_type
    self.data_type, self.compute_type = data_type, compute_type
    self.verbose_estimator = verbose_estimator

    summary = f"\n> RapidsCloudML\n\tCompute, Data , Model, Cloud types {self.compute_type, self.data_type, self.model_type, self.cloud_type}"
    self.log_to_file(summary)

    # Nothing more to set up unless the multi-GPU option was requested.
    if "multi" not in self.compute_type:
        return

    self.log_to_file("\n\tMulti-GPU selected")

    # By default this uses every GPU on the local host.
    local_cluster = LocalCUDACluster(threads_per_worker=1)
    self.client = Client(local_cluster)

    # Ask the client which workers are connected.
    self.workers = self.client.has_what().keys()
    self.n_workers = len(self.workers)
    self.log_to_file(f"\n\tClient information {self.client}")
async def run():
    """Time 100 computations of ``a + a.T`` on an asynchronous UCX cluster.

    The UCX transport switches (``enable_tcp_over_ucx``,
    ``enable_infiniband``, ``enable_nvlink``) come from the enclosing
    scope and are passed both to ``initialize`` and to the cluster.
    """
    transport_flags = dict(
        enable_tcp_over_ucx=enable_tcp_over_ucx,
        enable_infiniband=enable_infiniband,
        enable_nvlink=enable_nvlink,
    )
    initialize(create_cuda_context=True, **transport_flags)

    async with LocalCUDACluster(
        interface="enp1s0f0",
        protocol="ucx",
        asynchronous=True,
        **transport_flags,
    ) as cluster, Client(cluster, asynchronous=True) as client:
        # Random state backed by cupy.random.RandomState.
        generator = da.random.RandomState(
            RandomState=cupy.random.RandomState)
        side, chunk = int(4e3), int(1e3)
        a = generator.normal(10, 1, (side, side), chunks=(chunk, chunk))
        x = a + a.T

        for iteration in range(100):
            print("Running iteration:", iteration)
            started = time.time()
            await client.compute(x)
            print("Time for iteration", iteration, ":",
                  time.time() - started)
def test_with_asyncio():
    """Run the asyncio dask-array workflow against a local CUDA cluster.

    The scheduler address of a fresh client is handed to
    ``run_from_dask_array_asyncio``; the result must contain an
    ``xgboost.Booster`` under ``'booster'`` and a dict under ``'history'``.
    """
    with LocalCUDACluster() as cluster, Client(cluster) as client:
        scheduler_address = client.scheduler.address
        output = asyncio.run(
            run_from_dask_array_asyncio(scheduler_address))

        booster = output['booster']
        assert isinstance(booster, xgboost.Booster)
        history = output['history']
        assert isinstance(history, dict)
def test_empty_dmatrix(self):
    """Run the empty-DMatrix regression and classification checks on GPU."""
    with LocalCUDACluster() as cluster, Client(cluster) as client:
        parameters = dict(tree_method='gpu_hist', debug_synchronize=True)
        # Regression first, then classification, sharing one parameter dict.
        for check in (run_empty_dmatrix_reg, run_empty_dmatrix_cls):
            check(client, parameters)
def test_gpu_hist(self, params, num_rounds, dataset):
    """Run ``run_gpu_hist`` on two workers for both dask DMatrix types."""
    with LocalCUDACluster(n_workers=2) as cluster, \
            Client(cluster) as client:
        # Each class is looked up on ``dxgb`` right before its own run.
        for dmatrix_name in ("DaskDMatrix", "DaskDeviceQuantileDMatrix"):
            dmatrix_type = getattr(dxgb, dmatrix_name)
            run_gpu_hist(params, num_rounds, dataset, dmatrix_type, client)
def setup(dask_scheduler_file=None, rmm_pool_size=None):
    """Create a dask client (and optionally a local cluster) and start comms.

    Parameters
    ----------
    dask_scheduler_file : optional
        When truthy, ``initialize`` is called and the client connects
        through this scheduler file; no cluster is created.
    rmm_pool_size : optional
        Forwarded to ``LocalCUDACluster`` when a local cluster is created.

    Returns
    -------
    tuple
        ``(client, cluster)``; ``cluster`` is ``None`` when a scheduler
        file was used.  Nothing is closed here — the caller owns both.
    """
    if dask_scheduler_file:
        cluster = None
        # Env var UCX_MAX_RNDV_RAILS=1 must be set too.
        initialize(
            enable_tcp_over_ucx=True,
            enable_nvlink=True,
            enable_infiniband=False,
            enable_rdmacm=False,
            #net_devices="mlx5_0:1",
        )
        client = Client(scheduler_file=dask_scheduler_file)
    else:
        # The temporary directory serves as the cluster's local_directory.
        tempdir_object = tempfile.TemporaryDirectory()
        cluster = LocalCUDACluster(local_directory=tempdir_object.name,
                                   rmm_pool_size=rmm_pool_size)
        client = Client(cluster)
        # add the obj to the client so it doesn't get deleted until
        # the 'client' obj gets cleaned up
        client.tempdir_object = tempdir_object
        # NOTE(review): assumed to apply only to the local-cluster branch —
        # confirm against the original indentation.
        client.wait_for_workers(len(get_visible_devices()))
    Comms.initialize(p2p=True)
    return (client, cluster)
async def test_with_subset_of_cuda_visible_devices():
    """Workers must cycle through a restricted ``CUDA_VISIBLE_DEVICES``.

    The variable is set to ``"2,3,6,7"`` for the duration of the test.
    Afterwards the value it had before the test is restored, or the
    variable is removed if it was not set.
    """
    previous = os.environ.get("CUDA_VISIBLE_DEVICES")
    os.environ["CUDA_VISIBLE_DEVICES"] = "2,3,6,7"
    try:
        async with LocalCUDACluster(
            scheduler_port=0,
            asynchronous=True,
            device_memory_limit=1,
        ) as cluster:
            async with Client(cluster, asynchronous=True) as client:
                assert len(cluster.workers) == 4

                # CUDA_VISIBLE_DEVICES cycles properly
                def get_visible_devices():
                    return os.environ["CUDA_VISIBLE_DEVICES"]

                result = await client.run(get_visible_devices)

                assert all(len(v.split(",")) == 4
                           for v in result.values())
                # At every position, the four workers together cover all
                # four device ids.
                for i in range(4):
                    assert {int(v.split(",")[i])
                            for v in result.values()} == {2, 3, 6, 7}
    finally:
        # Restore the caller's environment instead of always deleting.
        if previous is None:
            os.environ.pop("CUDA_VISIBLE_DEVICES", None)
        else:
            os.environ["CUDA_VISIBLE_DEVICES"] = previous