def read_dask_cudf_csv_file(csv_file, read_weights_in_sp=True, single_partition=True):
    print('Reading ' + str(csv_file) + '...')
    if read_weights_in_sp is True:
        if single_partition:
            chunksize = os.path.getsize(csv_file)
            return dask_cudf.read_csv(csv_file, chunksize=chunksize,
                                      delimiter=' ',
                                      names=['src', 'dst', 'weight'],
                                      dtype=['int32', 'int32', 'float32'],
                                      header=None)
        else:
            return dask_cudf.read_csv(csv_file,
                                      delimiter=' ',
                                      names=['src', 'dst', 'weight'],
                                      dtype=['int32', 'int32', 'float32'],
                                      header=None)
    else:
        if single_partition:
            chunksize = os.path.getsize(csv_file)
            return dask_cudf.read_csv(csv_file, chunksize=chunksize,
                                      delimiter=' ',
                                      names=['src', 'dst', 'weight'],
                                      dtype=['int32', 'int32', 'float32'],
                                      header=None)
        else:
            return dask_cudf.read_csv(csv_file,
                                      delimiter=' ',
                                      names=['src', 'dst', 'weight'],
                                      dtype=['int32', 'int32', 'float64'],
                                      header=None)

def to_ddf(self, columns=None):
    if columns:
        return dask_cudf.read_csv(self.paths, chunksize=self.part_size,
                                  **self.csv_kwargs)[columns]
    return dask_cudf.read_csv(self.paths, chunksize=self.part_size,
                              **self.csv_kwargs)

def test_read_csv(tmp_path):
    df = dask.datasets.timeseries(
        dtypes={"x": int, "y": int}, freq="120s"
    ).reset_index(drop=True)

    df.to_csv(tmp_path / "data-*.csv", index=False)

    df2 = dask_cudf.read_csv(tmp_path / "data-*.csv")
    dd.assert_eq(df, df2)

    # file path test
    stmp_path = str(tmp_path / "data-*.csv")
    df3 = dask_cudf.read_csv(f"file://{stmp_path}")
    dd.assert_eq(df2, df3)

def test_consolidation(graph_file):
    gc.collect()

    cluster = LocalCUDACluster()
    client = Client(cluster)
    chunksize = dcg.get_chunksize(graph_file)

    M = utils.read_csv_for_nx(graph_file)

    df = pd.DataFrame()
    df['source'] = pd.Series(M['0'])
    df['target'] = pd.Series(M['1'])

    ddf = dask_cudf.read_csv(graph_file, chunksize=chunksize,
                             delimiter=' ',
                             names=['source', 'target', 'weight'],
                             dtype=['int32', 'int32', 'float32'],
                             header=None)

    Gnx = nx.from_pandas_edgelist(df, source='source', target='target',
                                  create_using=nx.DiGraph)
    G = cugraph.from_cudf_edgelist(ddf, source='source', destination='target',
                                   create_using=cugraph.DiGraph)

    assert compare_graphs(Gnx, G)
    Gnx.clear()
    G.clear()
    client.close()
    cluster.close()

async def start_client(scheduler_addr, train_dir, model_file, num_workers, fs):
    async with Client(scheduler_addr, asynchronous=True) as client:
        dask.config.set({'distributed.scheduler.work-stealing': False})
        print(dask.config.get('distributed.scheduler.work-stealing'))
        dask.config.set({'distributed.scheduler.bandwidth': 1})
        print(dask.config.get('distributed.scheduler.bandwidth'))
        await client.wait_for_workers(num_workers)

        colnames = ['label'] + ['feature-%02d' % i for i in range(1, 29)]
        df = dask_cudf.read_csv(train_dir, header=None, names=colnames,
                                chunksize=None)

        start_time = time.time()
        dtrain = await xgb.dask.DaskDeviceQuantileDMatrix(
            client, df[df.columns.difference(['label'])], df['label'])
        output = await xgb.dask.train(client,
                                      {'verbosity': 2,
                                       'learning_rate': 0.1,
                                       'max_depth': 8,
                                       'objective': 'reg:squarederror',
                                       'subsample': 0.6,
                                       'gamma': 1,
                                       'verbose_eval': True,
                                       'tree_method': 'gpu_hist',
                                       'nthread': 1},
                                      dtrain,
                                      num_boost_round=100,
                                      evals=[(dtrain, 'train')])
        print("[debug:leader]: ------ training finished")
        output['booster'].save_model('/tmp/tmp.model')
        history = output['history']
        print('[debug:leader]: ------ Training evaluation history:', history)
        fs.put('/tmp/tmp.model', model_file)
        print("[debug:leader]: ------model saved")
        print("[debug:leader]: ------ %s seconds ---" % (time.time() - start_time))

        output = await xgb.dask.train(client,
                                      {'verbosity': 2,
                                       'learning_rate': 0.1,
                                       'max_depth': 8,
                                       'objective': 'reg:squarederror',
                                       'subsample': 0.5,
                                       'gamma': 0.9,
                                       'verbose_eval': True,
                                       'tree_method': 'gpu_hist',
                                       'nthread': 1},
                                      dtrain,
                                      num_boost_round=100,
                                      evals=[(dtrain, 'train')])
        print("[debug:leader]: ------ training finished")
        output['booster'].save_model('/tmp/tmp.model')
        history = output['history']
        print('[debug:leader]: ------ Training evaluation history:', history)
        fs.put('/tmp/tmp.model', model_file + '2')
        print("[debug:leader]: ------model saved")
        print("[debug:leader]: ------ %s [2nd]seconds ---" % (time.time() - start_time))

        await client.shutdown()

def test_compute_local_data(client_connection):
    gc.collect()

    input_data_path = r"../datasets/karate.csv"
    chunksize = dcg.get_chunksize(input_data_path)
    ddf = dask_cudf.read_csv(
        input_data_path,
        chunksize=chunksize,
        delimiter=" ",
        names=["src", "dst", "value"],
        dtype=["int32", "int32", "float32"],
    )

    dg = cugraph.DiGraph()
    dg.from_dask_cudf_edgelist(ddf, source="src", destination="dst",
                               edge_attr="value")

    # Compute_local_data
    dg.compute_local_data(by="dst")
    data = dg.local_data["data"]
    by = dg.local_data["by"]

    assert by == "dst"
    assert Comms.is_initialized()

    global_num_edges = data.local_data["edges"].sum()
    assert global_num_edges == dg.number_of_edges()
    global_num_verts = data.local_data["verts"].sum()
    assert global_num_verts == dg.number_of_nodes()

def test_pagerank():
    gc.collect()
    input_data_path = r"../datasets/karate.csv"

    # Networkx Call
    pd_df = pd.read_csv(input_data_path, delimiter=' ',
                        names=['src', 'dst', 'value'])
    G = nx.Graph()
    for i in range(0, len(pd_df)):
        G.add_edge(pd_df['src'][i], pd_df['dst'][i])
    nx_pr = nx.pagerank(G, alpha=0.85)
    nx_pr = sorted(nx_pr.items(), key=lambda x: x[0])

    # Cugraph snmg pagerank Call
    cluster = LocalCUDACluster(threads_per_worker=1)
    client = Client(cluster)

    chunksize = dcg.get_chunksize(input_data_path)
    ddf = dask_cudf.read_csv(input_data_path, chunksize=chunksize,
                             delimiter=' ',
                             names=['src', 'dst', 'value'],
                             dtype=['int32', 'int32', 'float32'])

    pr = dcg.pagerank(ddf, alpha=0.85, max_iter=50)
    res_df = pr.compute()

    err = 0
    tol = 1.0e-05
    for i in range(len(res_df)):
        if (abs(res_df['pagerank'][i] - nx_pr[i][1]) > tol * 1.1):
            err = err + 1
    print("Mismatches:", err)
    assert err < (0.01 * len(res_df))

    client.close()
    cluster.close()

def test_from_edgelist(dask_client):
    input_data_path = (RAPIDS_DATASET_ROOT_DIR_PATH / "karate.csv").as_posix()
    print(f"dataset={input_data_path}")
    chunksize = dcg.get_chunksize(input_data_path)
    ddf = dask_cudf.read_csv(
        input_data_path,
        chunksize=chunksize,
        delimiter=" ",
        names=["src", "dst", "value"],
        dtype=["int32", "int32", "float32"],
    )

    dg1 = cugraph.from_edgelist(
        ddf, source="src", destination="dst", edge_attr="value",
        create_using=cugraph.DiGraph)

    dg2 = cugraph.DiGraph()
    dg2.from_dask_cudf_edgelist(ddf, source="src", destination="dst",
                                edge_attr="value")

    assert dg1.EdgeList == dg2.EdgeList

def test_dask_mg_degree(client_connection):
    gc.collect()

    input_data_path = r"../datasets/karate.csv"
    chunksize = cugraph.dask.get_chunksize(input_data_path)

    ddf = dask_cudf.read_csv(input_data_path, chunksize=chunksize,
                             delimiter=' ',
                             names=['src', 'dst', 'value'],
                             dtype=['int32', 'int32', 'float32'])

    df = cudf.read_csv(input_data_path,
                       delimiter=' ',
                       names=['src', 'dst', 'value'],
                       dtype=['int32', 'int32', 'float32'])

    dg = cugraph.DiGraph()
    dg.from_dask_cudf_edgelist(ddf, 'src', 'dst')

    g = cugraph.DiGraph()
    g.from_cudf_edgelist(df, 'src', 'dst')

    merge_df = dg.in_degree().merge(g.in_degree(), on="vertex",
                                    suffixes=['_dg', '_g']).compute()

    assert merge_df['degree_dg'].equals(merge_df['degree_g'])

def test_compute_local_data(client_connection):
    gc.collect()

    input_data_path = r"../datasets/karate.csv"
    chunksize = dcg.get_chunksize(input_data_path)
    ddf = dask_cudf.read_csv(input_data_path, chunksize=chunksize,
                             delimiter=' ',
                             names=['src', 'dst', 'value'],
                             dtype=['int32', 'int32', 'float32'])

    dg = cugraph.DiGraph()
    dg.from_dask_cudf_edgelist(ddf, source='src', destination='dst',
                               edge_attr='value')

    # Compute_local_data
    dg.compute_local_data(by='dst')
    data = dg.local_data['data']
    by = dg.local_data['by']

    assert by == 'dst'
    assert Comms.is_initialized()

    global_num_edges = data.local_data['edges'].sum()
    assert global_num_edges == dg.number_of_edges()
    global_num_verts = data.local_data['verts'].sum()
    assert global_num_verts == dg.number_of_nodes()

def test_dask_mg_degree(client_connection):
    gc.collect()

    input_data_path = r"../datasets/karate.csv"
    chunksize = cugraph.dask.get_chunksize(input_data_path)

    ddf = dask_cudf.read_csv(
        input_data_path,
        chunksize=chunksize,
        delimiter=" ",
        names=["src", "dst", "value"],
        dtype=["int32", "int32", "float32"],
    )

    df = cudf.read_csv(
        input_data_path,
        delimiter=" ",
        names=["src", "dst", "value"],
        dtype=["int32", "int32", "float32"],
    )

    dg = cugraph.DiGraph()
    dg.from_dask_cudf_edgelist(ddf, "src", "dst")

    g = cugraph.DiGraph()
    g.from_cudf_edgelist(df, "src", "dst")

    merge_df = (
        dg.in_degree()
        .merge(g.in_degree(), on="vertex", suffixes=["_dg", "_g"])
        .compute()
    )

    assert merge_df["degree_dg"].equals(merge_df["degree_g"])

def fetch_data(self):
    """
    Fetch data using dask based on provided config object
    """
    df = None
    input_format = self.config["input_format"].lower()
    filepath = self.config["input_path"]
    kwargs = self.config.copy()
    del kwargs["type"]
    del kwargs["input_format"]
    del kwargs["input_path"]
    if "csv" == input_format:
        df = dask_cudf.read_csv(filepath, **kwargs)
    elif "parquet" == input_format:
        df = dask_cudf.read_parquet(filepath, **kwargs)
    elif "orc" == input_format:
        df = dask_cudf.read_orc(filepath, engine="cudf")
    elif "json" == input_format:
        df = dask_cudf.read_json(filepath, **kwargs)
    else:
        raise NotImplementedError(
            "%s is not a supported input_format" % (input_format))
    self.has_data = False
    return df

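# A minimal sketch of a config dict that fetch_data above could consume. The
# keys "type", "input_format", and "input_path" come from the code; the "type"
# value, the file glob, and the extra reader kwargs ("names", "delimiter") are
# hypothetical placeholders that are simply forwarded to dask_cudf.read_csv.
example_config = {
    "type": "fs",                        # deleted before kwargs are forwarded
    "input_format": "csv",               # selects the dask_cudf.read_csv branch
    "input_path": "/data/events-*.csv",  # hypothetical input glob
    "names": ["ts", "src", "dst"],       # forwarded to dask_cudf.read_csv
    "delimiter": ",",
}
# With this config, fetch_data would effectively run:
#   dask_cudf.read_csv("/data/events-*.csv", names=["ts", "src", "dst"], delimiter=",")
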
def test_from_edgelist(client_connection):
    # FIXME: update this to allow dataset to be parameterized and have dataset
    # part of test param id (see other tests)
    input_data_path = r"../datasets/karate.csv"
    print(f"dataset={input_data_path}")
    chunksize = dcg.get_chunksize(input_data_path)
    ddf = dask_cudf.read_csv(
        input_data_path,
        chunksize=chunksize,
        delimiter=" ",
        names=["src", "dst", "value"],
        dtype=["int32", "int32", "float32"],
    )

    dg1 = cugraph.from_edgelist(
        ddf, source="src", destination="dst", edge_attr="value",
        create_using=cugraph.DiGraph)

    dg2 = cugraph.DiGraph()
    dg2.from_dask_cudf_edgelist(ddf, source="src", destination="dst",
                                edge_attr="value")

    assert dg1.EdgeList == dg2.EdgeList

def test_dask_katz_centrality(client_connection):
    gc.collect()

    input_data_path = r"../datasets/karate.csv"
    chunksize = dcg.get_chunksize(input_data_path)

    ddf = dask_cudf.read_csv(
        input_data_path,
        chunksize=chunksize,
        delimiter=" ",
        names=["src", "dst", "value"],
        dtype=["int32", "int32", "float32"],
    )

    df = cudf.read_csv(
        input_data_path,
        delimiter=" ",
        names=["src", "dst", "value"],
        dtype=["int32", "int32", "float32"],
    )

    g = cugraph.DiGraph()
    g.from_cudf_edgelist(df, "src", "dst")

    dg = cugraph.DiGraph()
    dg.from_dask_cudf_edgelist(ddf, "src", "dst")

    largest_out_degree = g.degrees().nlargest(n=1, columns="out_degree")
    largest_out_degree = largest_out_degree["out_degree"].iloc[0]
    katz_alpha = 1 / (largest_out_degree + 1)

    mg_res = dcg.katz_centrality(dg, alpha=katz_alpha, tol=1e-6)
    mg_res = mg_res.compute()

    import networkx as nx
    from cugraph.tests import utils
    NM = utils.read_csv_for_nx(input_data_path)
    Gnx = nx.from_pandas_edgelist(
        NM, create_using=nx.DiGraph(), source="0", target="1"
    )
    nk = nx.katz_centrality(Gnx, alpha=katz_alpha)
    import pandas as pd
    pdf = pd.DataFrame(nk.items(), columns=['vertex', 'katz_centrality'])
    exp_res = cudf.DataFrame(pdf)

    err = 0
    tol = 1.0e-05
    compare_res = exp_res.merge(
        mg_res, on="vertex", suffixes=["_local", "_dask"]
    )
    for i in range(len(compare_res)):
        diff = abs(
            compare_res["katz_centrality_local"].iloc[i]
            - compare_res["katz_centrality_dask"].iloc[i]
        )
        if diff > tol * 1.1:
            err = err + 1
    assert err == 0

def read_csv(self, files, **kwargs):
    if "dtype" in kwargs:
        kwargs["dtype"] = OrderedDict([
            (col, ("str" if dtype == "category" else dtype))
            for (col, dtype) in kwargs["dtype"].items()
        ])
    kwargs["chunksize"] = None
    return dask_cudf.read_csv(files, **kwargs)

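# A short usage sketch for the read_csv wrapper above, assuming an object
# ("reader") that exposes it; the file glob and dtype mapping are hypothetical.
# Any column declared as "category" is rewritten to "str" before the call, and
# chunksize is forced to None so dask_cudf chooses the default partitioning.
ddf = reader.read_csv(
    "data-*.csv",                                # hypothetical input glob
    dtype={"id": "int64", "label": "category"},  # "category" becomes "str"
)
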
def test_read_csv(s3_base, s3so):
    with s3_context(
        s3_base=s3_base, bucket="daskcsv", files={"a.csv": b"a,b\n1,2\n3,4\n"}
    ):
        df = dask_cudf.read_csv(
            "s3://daskcsv/*.csv", chunksize="50 B", storage_options=s3so
        )
        assert df.a.sum().compute() == 4

def test_csv_roundtrip(tmp_path):
    df = cudf.DataFrame({"x": [1, 2, 3, 4], "id": ["a", "b", "c", "d"]})
    ddf = dask_cudf.from_cudf(df, npartitions=2)
    csv_path = str(tmp_path / "data-*.csv")
    ddf.to_csv(csv_path, index=False)

    ddf2 = dask_cudf.read_csv(csv_path)
    dd.assert_eq(ddf, ddf2, check_divisions=False, check_index=False)

def test_read_csv_compression(tmp_path):
    df = pd.DataFrame(dict(x=np.arange(20), y=np.arange(20)))
    df.to_csv(tmp_path / "data.csv.gz", index=False)

    with pytest.warns(UserWarning) as w:
        df2 = dask_cudf.read_csv(tmp_path / "*.csv.gz", chunksize="50 B")

    assert len(w) == 1
    msg = str(w[0].message)
    assert "gzip" in msg

    assert df2.npartitions == 1
    dd.assert_eq(df2, df, check_index=False)

    with warnings.catch_warnings(record=True) as record:
        df2 = dask_cudf.read_csv(tmp_path / "*.csv.gz", chunksize=None)

    assert not record

def test_dask_bfs(dask_client):
    gc.collect()

    input_data_path = (RAPIDS_DATASET_ROOT_DIR_PATH /
                       "netscience.csv").as_posix()
    print(f"dataset={input_data_path}")
    chunksize = dcg.get_chunksize(input_data_path)

    ddf = dask_cudf.read_csv(
        input_data_path,
        chunksize=chunksize,
        delimiter=" ",
        names=["src", "dst", "value"],
        dtype=["int32", "int32", "float32"],
    )

    def modify_dataset(df):
        temp_df = cudf.DataFrame()
        temp_df['src'] = df['src'] + 1000
        temp_df['dst'] = df['dst'] + 1000
        temp_df['value'] = df['value']
        return cudf.concat([df, temp_df])

    meta = ddf._meta
    ddf = ddf.map_partitions(modify_dataset, meta=meta)

    df = cudf.read_csv(
        input_data_path,
        delimiter=" ",
        names=["src", "dst", "value"],
        dtype=["int32", "int32", "float32"],
    )

    df = modify_dataset(df)

    g = cugraph.DiGraph()
    g.from_cudf_edgelist(df, "src", "dst")

    dg = cugraph.DiGraph()
    dg.from_dask_cudf_edgelist(ddf, "src", "dst")

    expected_dist = cugraph.bfs(g, [0, 1000])
    result_dist = dcg.bfs(dg, [0, 1000])
    result_dist = result_dist.compute()

    compare_dist = expected_dist.merge(
        result_dist, on="vertex", suffixes=["_local", "_dask"]
    )

    err = 0
    for i in range(len(compare_dist)):
        if (compare_dist["distance_local"].iloc[i] !=
                compare_dist["distance_dask"].iloc[i]):
            err = err + 1
    assert err == 0

def test_pagerank():
    gc.collect()
    input_data_path = r"../datasets/hibench_small/1/part-00000.csv"

    # Networkx Call
    pd_df = pd.read_csv(input_data_path, delimiter='\t', names=['src', 'dst'])
    G = nx.DiGraph()
    for i in range(0, len(pd_df)):
        G.add_edge(pd_df['src'][i], pd_df['dst'][i])
    nx_pr = nx.pagerank(G, alpha=0.85)
    nx_pr = sorted(nx_pr.items(), key=lambda x: x[0])

    # Cugraph snmg pagerank Call
    cluster = LocalCUDACluster(threads_per_worker=1)
    client = Client(cluster)

    t0 = time.time()
    chunksize = dcg.get_chunksize(input_data_path)
    ddf = dask_cudf.read_csv(input_data_path, chunksize=chunksize,
                             delimiter='\t',
                             names=['src', 'dst'],
                             dtype=['int32', 'int32'])
    y = ddf.to_delayed()
    x = client.compute(y)
    wait(x)
    t1 = time.time()
    print("Reading Csv time: ", t1 - t0)

    new_ddf = dcg.drop_duplicates(x)
    t2 = time.time()
    pr = dcg.pagerank(new_ddf, alpha=0.85, max_iter=50)
    wait(pr)
    t3 = time.time()
    print("Running PR algo time: ", t3 - t2)

    t4 = time.time()
    res_df = pr.compute()
    t5 = time.time()
    print("Compute time: ", t5 - t4)
    print(res_df)

    t6 = time.time()
    # For bigdatax4, chunksize=100000000 to avoid oom on write csv
    res_df.to_csv('~/pagerank.csv', header=False, index=False)
    t7 = time.time()
    print("Write csv time: ", t7 - t6)

    # Comparison
    err = 0
    tol = 1.0e-05
    for i in range(len(res_df)):
        if (abs(res_df['pagerank'][i] - nx_pr[i][1]) > tol * 1.1):
            err = err + 1
    print("Mismatches:", err)
    assert err < (0.02 * len(res_df))

    client.close()
    cluster.close()

def test_read_csv_w_bytes(tmp_path):
    df = dask.datasets.timeseries(
        dtypes={"x": int, "y": int}, freq="120s"
    ).reset_index(drop=True)
    df = pd.DataFrame(dict(x=np.arange(20), y=np.arange(20)))
    df.to_csv(tmp_path / "data-*.csv", index=False)

    df2 = dask_cudf.read_csv(tmp_path / "*.csv", chunksize="50 B")
    assert df2.npartitions == 3
    dd.assert_eq(df2, df, check_index=False)

def test_dask_pagerank(client_connection, personalization_perc):
    gc.collect()

    # FIXME: update this to allow dataset to be parameterized and have dataset
    # part of test param id (see other tests)
    input_data_path = r"../datasets/karate.csv"
    print(f"dataset={input_data_path}")
    chunksize = dcg.get_chunksize(input_data_path)
    ddf = dask_cudf.read_csv(
        input_data_path,
        chunksize=chunksize,
        delimiter=" ",
        names=["src", "dst", "value"],
        dtype=["int32", "int32", "float32"],
    )

    df = cudf.read_csv(
        input_data_path,
        delimiter=" ",
        names=["src", "dst", "value"],
        dtype=["int32", "int32", "float32"],
    )

    g = cugraph.DiGraph()
    g.from_cudf_edgelist(df, "src", "dst")

    dg = cugraph.DiGraph()
    dg.from_dask_cudf_edgelist(ddf, "src", "dst")

    personalization = None
    if personalization_perc != 0:
        personalization, p = personalize(g.nodes(), personalization_perc)

    expected_pr = cugraph.pagerank(g, personalization=personalization,
                                   tol=1e-6)
    result_pr = dcg.pagerank(dg, personalization=personalization, tol=1e-6)
    result_pr = result_pr.compute()

    err = 0
    tol = 1.0e-05

    assert len(expected_pr) == len(result_pr)

    compare_pr = expected_pr.merge(result_pr, on="vertex",
                                   suffixes=["_local", "_dask"])

    for i in range(len(compare_pr)):
        diff = abs(compare_pr["pagerank_local"].iloc[i] -
                   compare_pr["pagerank_dask"].iloc[i])
        if diff > tol * 1.1:
            err = err + 1
    assert err == 0

def test_dask_pagerank(client_connection, personalization_perc):
    gc.collect()

    input_data_path = r"../datasets/karate.csv"
    chunksize = dcg.get_chunksize(input_data_path)
    ddf = dask_cudf.read_csv(
        input_data_path,
        chunksize=chunksize,
        delimiter=" ",
        names=["src", "dst", "value"],
        dtype=["int32", "int32", "float32"],
    )

    df = cudf.read_csv(
        input_data_path,
        delimiter=" ",
        names=["src", "dst", "value"],
        dtype=["int32", "int32", "float32"],
    )

    g = cugraph.DiGraph()
    g.from_cudf_edgelist(df, "src", "dst")

    dg = cugraph.DiGraph()
    dg.from_dask_cudf_edgelist(ddf, "src", "dst")

    # Pre compute local data and personalize
    personalization = None
    if personalization_perc != 0:
        dg.compute_local_data(by="dst")
        personalization = personalize(dg.number_of_vertices(),
                                      personalization_perc)

    expected_pr = cugraph.pagerank(g, personalization=personalization,
                                   tol=1e-6)
    result_pr = dcg.pagerank(dg, personalization=personalization, tol=1e-6)
    result_pr = result_pr.compute()

    err = 0
    tol = 1.0e-05

    assert len(expected_pr) == len(result_pr)

    compare_pr = expected_pr.merge(result_pr, on="vertex",
                                   suffixes=["_local", "_dask"])

    for i in range(len(compare_pr)):
        diff = abs(compare_pr["pagerank_local"].iloc[i] -
                   compare_pr["pagerank_dask"].iloc[i])
        if diff > tol * 1.1:
            err = err + 1
    assert err == 0

def read_dask_cudf_csv_file(csv_file, read_weights_in_sp=True,
                            single_partition=True):
    print("Reading " + str(csv_file) + "...")

    if read_weights_in_sp is True:
        if single_partition:
            chunksize = os.path.getsize(csv_file)
            return dask_cudf.read_csv(
                csv_file,
                chunksize=chunksize,
                delimiter=" ",
                names=["src", "dst", "weight"],
                dtype=["int32", "int32", "float32"],
                header=None,
            )
        else:
            return dask_cudf.read_csv(
                csv_file,
                delimiter=" ",
                names=["src", "dst", "weight"],
                dtype=["int32", "int32", "float32"],
                header=None,
            )
    else:
        if single_partition:
            chunksize = os.path.getsize(csv_file)
            return dask_cudf.read_csv(
                csv_file,
                chunksize=chunksize,
                delimiter=" ",
                names=["src", "dst", "weight"],
                dtype=["int32", "int32", "float32"],
                header=None,
            )
        else:
            return dask_cudf.read_csv(
                csv_file,
                delimiter=" ",
                names=["src", "dst", "weight"],
                dtype=["int32", "int32", "float64"],
                header=None,
            )

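# A minimal usage sketch for read_dask_cudf_csv_file above, reusing the karate
# dataset path seen in the surrounding tests. With single_partition=True the
# chunksize is set to the full file size, so the returned dask_cudf DataFrame
# is expected to hold the whole edge list in a single partition.
edges = read_dask_cudf_csv_file("../datasets/karate.csv",
                                read_weights_in_sp=True,
                                single_partition=True)
print(edges.npartitions)  # expected: 1 when the whole file fits in one chunk
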
def test_dask_bfs_multi_column_depthlimit(dask_client):
    gc.collect()

    input_data_path = (RAPIDS_DATASET_ROOT_DIR_PATH /
                       "netscience.csv").as_posix()
    print(f"dataset={input_data_path}")
    chunksize = dcg.get_chunksize(input_data_path)

    ddf = dask_cudf.read_csv(
        input_data_path,
        chunksize=chunksize,
        delimiter=" ",
        names=["src_a", "dst_a", "value"],
        dtype=["int32", "int32", "float32"],
    )
    ddf['src_b'] = ddf['src_a'] + 1000
    ddf['dst_b'] = ddf['dst_a'] + 1000

    df = cudf.read_csv(
        input_data_path,
        delimiter=" ",
        names=["src_a", "dst_a", "value"],
        dtype=["int32", "int32", "float32"],
    )
    df['src_b'] = df['src_a'] + 1000
    df['dst_b'] = df['dst_a'] + 1000

    g = cugraph.DiGraph()
    g.from_cudf_edgelist(df, ["src_a", "src_b"], ["dst_a", "dst_b"])

    dg = cugraph.DiGraph()
    dg.from_dask_cudf_edgelist(ddf, ["src_a", "src_b"], ["dst_a", "dst_b"])

    start = cudf.DataFrame()
    start['a'] = [0]
    start['b'] = [1000]

    depth_limit = 18
    expected_dist = cugraph.bfs(g, start, depth_limit=depth_limit)
    result_dist = dcg.bfs(dg, start, depth_limit=depth_limit)
    result_dist = result_dist.compute()

    compare_dist = expected_dist.merge(
        result_dist, on=["0_vertex", "1_vertex"],
        suffixes=["_local", "_dask"]
    )

    err = 0
    for i in range(len(compare_dist)):
        if (compare_dist["distance_local"].iloc[i] <= depth_limit and
                compare_dist["distance_dask"].iloc[i] <= depth_limit and
                compare_dist["distance_local"].iloc[i] !=
                compare_dist["distance_dask"].iloc[i]):
            err = err + 1
    assert err == 0

def test_read_csv(tmp_path):
    df = dask.datasets.timeseries(
        dtypes={"x": int, "y": int}, freq="120s"
    ).reset_index(drop=True)

    df.to_csv(tmp_path / "data-*.csv", index=False)

    df2 = dask_cudf.read_csv(tmp_path / "data-*.csv")
    dd.assert_eq(df, df2)

    # file path test
    stmp_path = str(tmp_path / "data-*.csv")
    df3 = dask_cudf.read_csv(f"file://{stmp_path}")
    dd.assert_eq(df2, df3)

    # file list test
    list_paths = [
        os.path.join(tmp_path, fname) for fname in sorted(os.listdir(tmp_path))
    ]
    df4 = dask_cudf.read_csv(list_paths)
    dd.assert_eq(df, df4)

def test_csv_reader_usecols(tmp_path, dtype):
    df = cudf.DataFrame(
        {
            "a": [1, 2, 3, 4] * 100,
            "b": ["a", "b", "c", "d"] * 100,
            "c": [10, 11, 12, 13] * 100,
        }
    )
    csv_path = str(tmp_path / "usecols_data.csv")
    df.to_csv(csv_path, index=False)
    ddf = dask_cudf.from_cudf(df[["b", "c"]], npartitions=5)
    ddf2 = dask_cudf.read_csv(csv_path, usecols=["b", "c"], dtype=dtype)

    dd.assert_eq(ddf, ddf2, check_divisions=False, check_index=False)

def using_quantile_device_dmatrix(client: Client, train_dir, model_file, fs,
                                  do_wait=False):
    '''`DaskDeviceQuantileDMatrix` is a data type specialized for the
    `gpu_hist` tree method that reduces memory overhead. When training on a
    GPU pipeline, it's preferred over `DaskDMatrix`.

    .. versionadded:: 1.2.0

    '''
    colnames = ['label'] + ['feature-%02d' % i for i in range(1, 29)]
    df = dask_cudf.read_csv(train_dir, header=None, names=colnames,
                            chunksize=None)
    X = df[df.columns.difference(['label'])]
    y = df['label']
    print("[INFO]: ------ CSV files are read from " + train_dir)

    if do_wait is True:
        df = df.persist()
        X = X.persist()
        wait(df)
        wait(X)
        print("[INFO]: ------ Long waited but the data is ready now")

    # `DaskDeviceQuantileDMatrix` is used instead of `DaskDMatrix`, be careful
    # that it can not be used for anything else than training.
    start_time = time.time()
    dtrain = dxgb.DaskDeviceQuantileDMatrix(client, X, y)
    print("[INFO]: ------ QuantileDMatrix is formed in {} seconds ---".format(
        time.time() - start_time))

    del df
    del X
    del y

    start_time = time.time()
    output = xgb.dask.train(client,
                            {'verbosity': 2,
                             'learning_rate': 0.1,
                             'max_depth': 8,
                             'objective': 'reg:squarederror',
                             'subsample': 0.5,
                             'gamma': 0.9,
                             'verbose_eval': True,
                             'tree_method': 'gpu_hist',
                             # 'nthread': 1
                             },
                            dtrain,
                            num_boost_round=100,
                            evals=[(dtrain, 'train')])
    print("[INFO]: ------ Training is completed in {} seconds ---".format(
        time.time() - start_time))

    history = output['history']
    print('[INFO]: ------ Training evaluation history:', history)

    output['booster'].save_model('/tmp/tmp.model')
    fs.put('/tmp/tmp.model', model_file)
    print("[INFO]: ------ Model saved here:{}".format(model_file))

def test_dask_pagerank(dask_client, personalization_perc):
    gc.collect()

    input_data_path = (RAPIDS_DATASET_ROOT_DIR_PATH / "karate.csv").as_posix()
    print(f"dataset={input_data_path}")
    chunksize = dcg.get_chunksize(input_data_path)
    ddf = dask_cudf.read_csv(
        input_data_path,
        chunksize=chunksize,
        delimiter=" ",
        names=["src", "dst", "value"],
        dtype=["int32", "int32", "float32"],
    )

    df = cudf.read_csv(
        input_data_path,
        delimiter=" ",
        names=["src", "dst", "value"],
        dtype=["int32", "int32", "float32"],
    )

    g = cugraph.DiGraph()
    g.from_cudf_edgelist(df, "src", "dst")

    dg = cugraph.DiGraph()
    dg.from_dask_cudf_edgelist(ddf, "src", "dst")

    personalization = None
    if personalization_perc != 0:
        personalization, p = personalize(g.nodes(), personalization_perc)

    expected_pr = cugraph.pagerank(g, personalization=personalization,
                                   tol=1e-6)
    result_pr = dcg.pagerank(dg, personalization=personalization, tol=1e-6)
    result_pr = result_pr.compute()

    err = 0
    tol = 1.0e-05

    assert len(expected_pr) == len(result_pr)

    compare_pr = expected_pr.merge(result_pr, on="vertex",
                                   suffixes=["_local", "_dask"])

    for i in range(len(compare_pr)):
        diff = abs(compare_pr["pagerank_local"].iloc[i] -
                   compare_pr["pagerank_dask"].iloc[i])
        if diff > tol * 1.1:
            err = err + 1
    assert err == 0

def test_dask_dataset(datasets, engine, num_files):
    paths = glob.glob(str(datasets[engine]) + "/*." + engine.split("-")[0])
    paths = paths[:num_files]
    if engine == "parquet":
        ddf0 = dask_cudf.read_parquet(paths)[mycols_pq]
        dataset = nvtabular.io.Dataset(paths)
        result = dataset.to_ddf(columns=mycols_pq)
    else:
        ddf0 = dask_cudf.read_csv(paths, header=False,
                                  names=allcols_csv)[mycols_csv]
        dataset = nvtabular.io.Dataset(paths, header=False, names=allcols_csv)
        result = dataset.to_ddf(columns=mycols_csv)
    assert_eq(ddf0, result)