Example #1
def read_dask_cudf_csv_file(csv_file,
                            read_weights_in_sp=True,
                            single_partition=True):
    print('Reading ' + str(csv_file) + '...')
    if read_weights_in_sp is True:
        if single_partition:
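            # Using the full file size as the chunk size makes dask_cudf read the CSV into a single partition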
            chunksize = os.path.getsize(csv_file)
            return dask_cudf.read_csv(csv_file,
                                      chunksize=chunksize,
                                      delimiter=' ',
                                      names=['src', 'dst', 'weight'],
                                      dtype=['int32', 'int32', 'float32'],
                                      header=None)
        else:
            return dask_cudf.read_csv(csv_file,
                                      delimiter=' ',
                                      names=['src', 'dst', 'weight'],
                                      dtype=['int32', 'int32', 'float32'],
                                      header=None)
    else:
        if single_partition:
            chunksize = os.path.getsize(csv_file)
            return dask_cudf.read_csv(csv_file,
                                      chunksize=chunksize,
                                      delimiter=' ',
                                      names=['src', 'dst', 'weight'],
                                      dtype=['int32', 'int32', 'float64'],
                                      header=None)
        else:
            return dask_cudf.read_csv(csv_file,
                                      delimiter=' ',
                                      names=['src', 'dst', 'weight'],
                                      dtype=['int32', 'int32', 'float64'],
                                      header=None)
Example #2
 def to_ddf(self, columns=None):
     if columns:
         return dask_cudf.read_csv(self.paths,
                                   chunksize=self.part_size,
                                   **self.csv_kwargs)[columns]
     return dask_cudf.read_csv(self.paths,
                               chunksize=self.part_size,
                               **self.csv_kwargs)
Example #3
def test_read_csv(tmp_path):
    df = dask.datasets.timeseries(
        dtypes={"x": int, "y": int}, freq="120s"
    ).reset_index(drop=True)

    df.to_csv(tmp_path / "data-*.csv", index=False)

    df2 = dask_cudf.read_csv(tmp_path / "data-*.csv")
    dd.assert_eq(df, df2)

    # file path test
    stmp_path = str(tmp_path / "data-*.csv")
    df3 = dask_cudf.read_csv(f"file://{stmp_path}")
    dd.assert_eq(df2, df3)
Example #4
def test_consolidation(graph_file):
    gc.collect()

    cluster = LocalCUDACluster()
    client = Client(cluster)
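    # dcg.get_chunksize returns a chunk size (in bytes) sized to spread the CSV across the available GPU workers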
    chunksize = dcg.get_chunksize(graph_file)

    M = utils.read_csv_for_nx(graph_file)

    df = pd.DataFrame()
    df['source'] = pd.Series(M['0'])
    df['target'] = pd.Series(M['1'])

    ddf = dask_cudf.read_csv(graph_file,
                             chunksize=chunksize,
                             delimiter=' ',
                             names=['source', 'target', 'weight'],
                             dtype=['int32', 'int32', 'float32'],
                             header=None)

    Gnx = nx.from_pandas_edgelist(df,
                                  source='source',
                                  target='target',
                                  create_using=nx.DiGraph)
    G = cugraph.from_cudf_edgelist(ddf,
                                   source='source',
                                   destination='target',
                                   create_using=cugraph.DiGraph)

    assert compare_graphs(Gnx, G)
    Gnx.clear()
    G.clear()
    client.close()
    cluster.close()
Example #5
async def start_client(scheduler_addr, train_dir, model_file, num_workers, fs):
    async with Client(scheduler_addr, asynchronous=True) as client:
        dask.config.set({'distributed.scheduler.work-stealing': False})
        print(dask.config.get('distributed.scheduler.work-stealing'))
        dask.config.set({'distributed.scheduler.bandwidth': 1})
        print(dask.config.get('distributed.scheduler.bandwidth'))
        await client.wait_for_workers(num_workers)
        colnames = ['label'] + ['feature-%02d' % i for i in range(1, 29)]
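        # chunksize=None disables byte-range splitting, so each input CSV file becomes one partition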
        df = dask_cudf.read_csv(train_dir,
                                header=None,
                                names=colnames,
                                chunksize=None)
        start_time = time.time()
        dtrain = await xgb.dask.DaskDeviceQuantileDMatrix(
            client, df[df.columns.difference(['label'])], df['label'])
        output = await xgb.dask.train(client, {
            'verbosity': 2,
            'learning_rate': 0.1,
            'max_depth': 8,
            'objective': 'reg:squarederror',
            'subsample': 0.6,
            'gamma': 1,
            'verbose_eval': True,
            'tree_method': 'gpu_hist',
            'nthread': 1
        },
                                      dtrain,
                                      num_boost_round=100,
                                      evals=[(dtrain, 'train')])
        print("[debug:leader]: ------ training finished")
        output['booster'].save_model('/tmp/tmp.model')
        history = output['history']
        print('[debug:leader]: ------ Training evaluation history:', history)
        fs.put('/tmp/tmp.model', model_file)
        print("[debug:leader]: ------model saved")
        print("[debug:leader]: ------ %s seconds ---" %
              (time.time() - start_time))
        output = await xgb.dask.train(client, {
            'verbosity': 2,
            'learning_rate': 0.1,
            'max_depth': 8,
            'objective': 'reg:squarederror',
            'subsample': 0.5,
            'gamma': 0.9,
            'verbose_eval': True,
            'tree_method': 'gpu_hist',
            'nthread': 1
        },
                                      dtrain,
                                      num_boost_round=100,
                                      evals=[(dtrain, 'train')])
        print("[debug:leader]: ------ training finished")
        output['booster'].save_model('/tmp/tmp.model')
        history = output['history']
        print('[debug:leader]: ------ Training evaluation history:', history)
        fs.put('/tmp/tmp.model', model_file + '2')
        print("[debug:leader]: ------model saved")
        print("[debug:leader]: ------ %s [2nd]seconds ---" %
              (time.time() - start_time))
        await client.shutdown()
Example #6
def test_compute_local_data(client_connection):

    gc.collect()

    input_data_path = r"../datasets/karate.csv"
    chunksize = dcg.get_chunksize(input_data_path)
    ddf = dask_cudf.read_csv(
        input_data_path,
        chunksize=chunksize,
        delimiter=" ",
        names=["src", "dst", "value"],
        dtype=["int32", "int32", "float32"],
    )

    dg = cugraph.DiGraph()
    dg.from_dask_cudf_edgelist(ddf,
                               source="src",
                               destination="dst",
                               edge_attr="value")

    # Compute_local_data
    dg.compute_local_data(by="dst")
    data = dg.local_data["data"]
    by = dg.local_data["by"]

    assert by == "dst"
    assert Comms.is_initialized()

    global_num_edges = data.local_data["edges"].sum()
    assert global_num_edges == dg.number_of_edges()
    global_num_verts = data.local_data["verts"].sum()
    assert global_num_verts == dg.number_of_nodes()
Example #7
def test_pagerank():
    gc.collect()
    input_data_path = r"../datasets/karate.csv"
    # NetworkX call
    pd_df = pd.read_csv(input_data_path,
                        delimiter=' ',
                        names=['src', 'dst', 'value'])
    G = nx.Graph()
    for i in range(0, len(pd_df)):
        G.add_edge(pd_df['src'][i], pd_df['dst'][i])
    nx_pr = nx.pagerank(G, alpha=0.85)
    nx_pr = sorted(nx_pr.items(), key=lambda x: x[0])
    # cuGraph SNMG PageRank call
    cluster = LocalCUDACluster(threads_per_worker=1)
    client = Client(cluster)
    chunksize = dcg.get_chunksize(input_data_path)
    ddf = dask_cudf.read_csv(input_data_path,
                             chunksize=chunksize,
                             delimiter=' ',
                             names=['src', 'dst', 'value'],
                             dtype=['int32', 'int32', 'float32'])

    pr = dcg.pagerank(ddf, alpha=0.85, max_iter=50)
    res_df = pr.compute()

    err = 0
    tol = 1.0e-05
    for i in range(len(res_df)):
        if (abs(res_df['pagerank'][i] - nx_pr[i][1]) > tol * 1.1):
            err = err + 1
    print("Mismatches:", err)
    assert err < (0.01 * len(res_df))

    client.close()
    cluster.close()
Example #8
def test_from_edgelist(dask_client):
    input_data_path = (RAPIDS_DATASET_ROOT_DIR_PATH / "karate.csv").as_posix()
    print(f"dataset={input_data_path}")
    chunksize = dcg.get_chunksize(input_data_path)
    ddf = dask_cudf.read_csv(
        input_data_path,
        chunksize=chunksize,
        delimiter=" ",
        names=["src", "dst", "value"],
        dtype=["int32", "int32", "float32"],
    )

    dg1 = cugraph.from_edgelist(ddf,
                                source="src",
                                destination="dst",
                                edge_attr="value",
                                create_using=cugraph.DiGraph)

    dg2 = cugraph.DiGraph()
    dg2.from_dask_cudf_edgelist(ddf,
                                source="src",
                                destination="dst",
                                edge_attr="value")

    assert dg1.EdgeList == dg2.EdgeList
Example #9
def test_dask_mg_degree(client_connection):
    gc.collect()

    input_data_path = r"../datasets/karate.csv"

    chunksize = cugraph.dask.get_chunksize(input_data_path)

    ddf = dask_cudf.read_csv(input_data_path,
                             chunksize=chunksize,
                             delimiter=' ',
                             names=['src', 'dst', 'value'],
                             dtype=['int32', 'int32', 'float32'])

    df = cudf.read_csv(input_data_path,
                       delimiter=' ',
                       names=['src', 'dst', 'value'],
                       dtype=['int32', 'int32', 'float32'])

    dg = cugraph.DiGraph()
    dg.from_dask_cudf_edgelist(ddf, 'src', 'dst')

    g = cugraph.DiGraph()
    g.from_cudf_edgelist(df, 'src', 'dst')

    merge_df = dg.in_degree().merge(g.in_degree(),
                                    on="vertex",
                                    suffixes=['_dg', '_g']).compute()

    assert merge_df['degree_dg'].equals(merge_df['degree_g'])
Example #10
def test_compute_local_data(client_connection):

    gc.collect()

    input_data_path = r"../datasets/karate.csv"
    chunksize = dcg.get_chunksize(input_data_path)
    ddf = dask_cudf.read_csv(input_data_path, chunksize=chunksize,
                             delimiter=' ',
                             names=['src', 'dst', 'value'],
                             dtype=['int32', 'int32', 'float32'])

    dg = cugraph.DiGraph()
    dg.from_dask_cudf_edgelist(ddf, source='src', destination='dst',
                               edge_attr='value')

    # Compute_local_data
    dg.compute_local_data(by='dst')
    data = dg.local_data['data']
    by = dg.local_data['by']

    assert by == 'dst'
    assert Comms.is_initialized()

    global_num_edges = data.local_data['edges'].sum()
    assert global_num_edges == dg.number_of_edges()
    global_num_verts = data.local_data['verts'].sum()
    assert global_num_verts == dg.number_of_nodes()
Example #11
def test_dask_mg_degree(client_connection):
    gc.collect()

    input_data_path = r"../datasets/karate.csv"

    chunksize = cugraph.dask.get_chunksize(input_data_path)

    ddf = dask_cudf.read_csv(
        input_data_path,
        chunksize=chunksize,
        delimiter=" ",
        names=["src", "dst", "value"],
        dtype=["int32", "int32", "float32"],
    )

    df = cudf.read_csv(
        input_data_path,
        delimiter=" ",
        names=["src", "dst", "value"],
        dtype=["int32", "int32", "float32"],
    )

    dg = cugraph.DiGraph()
    dg.from_dask_cudf_edgelist(ddf, "src", "dst")

    g = cugraph.DiGraph()
    g.from_cudf_edgelist(df, "src", "dst")

    merge_df = (dg.in_degree().merge(g.in_degree(),
                                     on="vertex",
                                     suffixes=["_dg", "_g"]).compute())

    assert merge_df["degree_dg"].equals(merge_df["degree_g"])
Example #12
    def fetch_data(self):
        """
        Fetch data using dask based on provided config object
        """
        df = None
        input_format = self.config["input_format"].lower()
        filepath = self.config["input_path"]
        kwargs = self.config.copy()
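        # Drop the keys that are not reader options; everything else is forwarded to the dask_cudf reader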
        del kwargs["type"]
        del kwargs["input_format"]
        del kwargs["input_path"]

        if "csv" == input_format:
            df = dask_cudf.read_csv(filepath, **kwargs)
        elif "parquet" == input_format:
            df = dask_cudf.read_parquet(filepath, **kwargs)
        elif "orc" == input_format:
            df = dask_cudf.read_orc(filepath, engine="cudf")
        elif "json" == input_format:
            df = dask_cudf.read_json(filepath, **kwargs)
        else:
            raise NotImplementedError("%s is not a supported input_format" % (input_format))

        self.has_data = False
        return df
Example #13
def test_from_edgelist(client_connection):
    # FIXME: update this to allow dataset to be parameterized and have dataset
    # part of test param id (see other tests)
    input_data_path = r"../datasets/karate.csv"
    print(f"dataset={input_data_path}")
    chunksize = dcg.get_chunksize(input_data_path)
    ddf = dask_cudf.read_csv(
        input_data_path,
        chunksize=chunksize,
        delimiter=" ",
        names=["src", "dst", "value"],
        dtype=["int32", "int32", "float32"],
    )

    dg1 = cugraph.from_edgelist(ddf,
                                source="src",
                                destination="dst",
                                edge_attr="value",
                                create_using=cugraph.DiGraph)

    dg2 = cugraph.DiGraph()
    dg2.from_dask_cudf_edgelist(ddf,
                                source="src",
                                destination="dst",
                                edge_attr="value")

    assert dg1.EdgeList == dg2.EdgeList
Example #14
def test_dask_katz_centrality(client_connection):
    gc.collect()

    input_data_path = r"../datasets/karate.csv"
    chunksize = dcg.get_chunksize(input_data_path)

    ddf = dask_cudf.read_csv(
        input_data_path,
        chunksize=chunksize,
        delimiter=" ",
        names=["src", "dst", "value"],
        dtype=["int32", "int32", "float32"],
    )

    df = cudf.read_csv(
        input_data_path,
        delimiter=" ",
        names=["src", "dst", "value"],
        dtype=["int32", "int32", "float32"],
    )

    g = cugraph.DiGraph()
    g.from_cudf_edgelist(df, "src", "dst")

    dg = cugraph.DiGraph()
    dg.from_dask_cudf_edgelist(ddf, "src", "dst")

    largest_out_degree = g.degrees().nlargest(n=1, columns="out_degree")
    largest_out_degree = largest_out_degree["out_degree"].iloc[0]
    katz_alpha = 1 / (largest_out_degree + 1)

    mg_res = dcg.katz_centrality(dg, alpha=katz_alpha, tol=1e-6)
    mg_res = mg_res.compute()

    import networkx as nx
    from cugraph.tests import utils
    NM = utils.read_csv_for_nx(input_data_path)
    Gnx = nx.from_pandas_edgelist(
        NM, create_using=nx.DiGraph(), source="0", target="1"
    )
    nk = nx.katz_centrality(Gnx, alpha=katz_alpha)
    import pandas as pd
    pdf = pd.DataFrame(nk.items(), columns=['vertex', 'katz_centrality'])
    exp_res = cudf.DataFrame(pdf)
    err = 0
    tol = 1.0e-05

    compare_res = exp_res.merge(
        mg_res, on="vertex", suffixes=["_local", "_dask"]
    )

    for i in range(len(compare_res)):
        diff = abs(
            compare_res["katz_centrality_local"].iloc[i]
            - compare_res["katz_centrality_dask"].iloc[i]
        )
        if diff > tol * 1.1:
            err = err + 1
    assert err == 0
Example #15
 def read_csv(self, files, **kwargs):
     if "dtype" in kwargs:
         kwargs["dtype"] = OrderedDict([
             (col, ("str" if dtype == "category" else dtype))
             for (col, dtype) in kwargs["dtype"].items()
         ])
     kwargs["chunksize"] = None
     return dask_cudf.read_csv(files, **kwargs)
Example #16
def test_read_csv(s3_base, s3so):
    with s3_context(
        s3_base=s3_base, bucket="daskcsv", files={"a.csv": b"a,b\n1,2\n3,4\n"}
    ):
        df = dask_cudf.read_csv(
            "s3://daskcsv/*.csv", chunksize="50 B", storage_options=s3so
        )
        assert df.a.sum().compute() == 4
Example #17
def test_csv_roundtrip(tmp_path):
    df = cudf.DataFrame({"x": [1, 2, 3, 4], "id": ["a", "b", "c", "d"]})
    ddf = dask_cudf.from_cudf(df, npartitions=2)
    csv_path = str(tmp_path / "data-*.csv")
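    # The "*" in the file name is replaced by the partition number, so each partition is written to its own CSV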
    ddf.to_csv(csv_path, index=False)

    ddf2 = dask_cudf.read_csv(csv_path)
    dd.assert_eq(ddf, ddf2, check_divisions=False, check_index=False)
Example #18
def test_read_csv_compression(tmp_path):
    df = pd.DataFrame(dict(x=np.arange(20), y=np.arange(20)))
    df.to_csv(tmp_path / "data.csv.gz", index=False)

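    # gzip files cannot be split into byte ranges, so dask_cudf is expected to warn and fall back to one partition per file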
    with pytest.warns(UserWarning) as w:
        df2 = dask_cudf.read_csv(tmp_path / "*.csv.gz", chunksize="50 B")

    assert len(w) == 1
    msg = str(w[0].message)
    assert "gzip" in msg

    assert df2.npartitions == 1
    dd.assert_eq(df2, df, check_index=False)

    with warnings.catch_warnings(record=True) as record:
        df2 = dask_cudf.read_csv(tmp_path / "*.csv.gz", chunksize=None)

        assert not record
Example #19
def test_dask_bfs(dask_client):
    gc.collect()

    input_data_path = (RAPIDS_DATASET_ROOT_DIR_PATH /
                       "netscience.csv").as_posix()

    print(f"dataset={input_data_path}")
    chunksize = dcg.get_chunksize(input_data_path)

    ddf = dask_cudf.read_csv(
        input_data_path,
        chunksize=chunksize,
        delimiter=" ",
        names=["src", "dst", "value"],
        dtype=["int32", "int32", "float32"],
    )

    def modify_dataset(df):
        temp_df = cudf.DataFrame()
        temp_df['src'] = df['src'] + 1000
        temp_df['dst'] = df['dst'] + 1000
        temp_df['value'] = df['value']
        return cudf.concat([df, temp_df])

    meta = ddf._meta
    ddf = ddf.map_partitions(modify_dataset, meta=meta)

    df = cudf.read_csv(
        input_data_path,
        delimiter=" ",
        names=["src", "dst", "value"],
        dtype=["int32", "int32", "float32"],
    )

    df = modify_dataset(df)

    g = cugraph.DiGraph()
    g.from_cudf_edgelist(df, "src", "dst")

    dg = cugraph.DiGraph()
    dg.from_dask_cudf_edgelist(ddf, "src", "dst")

    expected_dist = cugraph.bfs(g, [0, 1000])
    result_dist = dcg.bfs(dg, [0, 1000])
    result_dist = result_dist.compute()

    compare_dist = expected_dist.merge(result_dist,
                                       on="vertex",
                                       suffixes=["_local", "_dask"])

    err = 0

    for i in range(len(compare_dist)):
        if (compare_dist["distance_local"].iloc[i] !=
                compare_dist["distance_dask"].iloc[i]):
            err = err + 1
    assert err == 0
Example #20
def test_pagerank():
    gc.collect()
    input_data_path = r"../datasets/hibench_small/1/part-00000.csv"

    # NetworkX call
    pd_df = pd.read_csv(input_data_path, delimiter='\t', names=['src', 'dst'])
    G = nx.DiGraph()
    for i in range(0, len(pd_df)):
        G.add_edge(pd_df['src'][i], pd_df['dst'][i])
    nx_pr = nx.pagerank(G, alpha=0.85)
    nx_pr = sorted(nx_pr.items(), key=lambda x: x[0])

    # cuGraph SNMG PageRank call
    cluster = LocalCUDACluster(threads_per_worker=1)
    client = Client(cluster)

    t0 = time.time()
    chunksize = dcg.get_chunksize(input_data_path)
    ddf = dask_cudf.read_csv(input_data_path,
                             chunksize=chunksize,
                             delimiter='\t',
                             names=['src', 'dst'],
                             dtype=['int32', 'int32'])
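    # Trigger the lazy read on the workers so the timing below reflects only the CSV read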
    y = ddf.to_delayed()
    x = client.compute(y)
    wait(x)
    t1 = time.time()
    print("Reading Csv time: ", t1 - t0)
    new_ddf = dcg.drop_duplicates(x)
    t2 = time.time()
    pr = dcg.pagerank(new_ddf, alpha=0.85, max_iter=50)
    wait(pr)
    t3 = time.time()
    print("Running PR algo time: ", t3 - t2)
    t4 = time.time()
    res_df = pr.compute()
    t5 = time.time()
    print("Compute time: ", t5 - t4)
    print(res_df)
    t6 = time.time()
    # For bigdatax4, chunksize=100000000 to avoid OOM when writing the CSV
    res_df.to_csv('~/pagerank.csv', header=False, index=False)
    t7 = time.time()
    print("Write csv time: ", t7 - t6)

    # Comparison
    err = 0
    tol = 1.0e-05
    for i in range(len(res_df)):
        if (abs(res_df['pagerank'][i] - nx_pr[i][1]) > tol * 1.1):
            err = err + 1
    print("Mismatches:", err)
    assert err < (0.02 * len(res_df))

    client.close()
    cluster.close()
Example #21
def test_read_csv_w_bytes(tmp_path):
    df = pd.DataFrame(dict(x=np.arange(20), y=np.arange(20)))
    df.to_csv(tmp_path / "data-*.csv", index=False)

    df2 = dask_cudf.read_csv(tmp_path / "*.csv", chunksize="50 B")
    assert df2.npartitions == 3
    dd.assert_eq(df2, df, check_index=False)
Example #22
def test_dask_pagerank(client_connection, personalization_perc):
    gc.collect()

    # FIXME: update this to allow dataset to be parameterized and have dataset
    # part of test param id (see other tests)
    input_data_path = r"../datasets/karate.csv"
    print(f"dataset={input_data_path}")
    chunksize = dcg.get_chunksize(input_data_path)

    ddf = dask_cudf.read_csv(
        input_data_path,
        chunksize=chunksize,
        delimiter=" ",
        names=["src", "dst", "value"],
        dtype=["int32", "int32", "float32"],
    )

    df = cudf.read_csv(
        input_data_path,
        delimiter=" ",
        names=["src", "dst", "value"],
        dtype=["int32", "int32", "float32"],
    )

    g = cugraph.DiGraph()
    g.from_cudf_edgelist(df, "src", "dst")

    dg = cugraph.DiGraph()
    dg.from_dask_cudf_edgelist(ddf, "src", "dst")

    personalization = None
    if personalization_perc != 0:
        personalization, p = personalize(g.nodes(), personalization_perc)

    expected_pr = cugraph.pagerank(g,
                                   personalization=personalization,
                                   tol=1e-6)
    result_pr = dcg.pagerank(dg, personalization=personalization, tol=1e-6)
    result_pr = result_pr.compute()

    err = 0
    tol = 1.0e-05

    assert len(expected_pr) == len(result_pr)

    compare_pr = expected_pr.merge(result_pr,
                                   on="vertex",
                                   suffixes=["_local", "_dask"])

    for i in range(len(compare_pr)):
        diff = abs(compare_pr["pagerank_local"].iloc[i] -
                   compare_pr["pagerank_dask"].iloc[i])
        if diff > tol * 1.1:
            err = err + 1
    assert err == 0
Example #23
def test_dask_pagerank(client_connection, personalization_perc):
    gc.collect()

    input_data_path = r"../datasets/karate.csv"
    chunksize = dcg.get_chunksize(input_data_path)

    ddf = dask_cudf.read_csv(
        input_data_path,
        chunksize=chunksize,
        delimiter=" ",
        names=["src", "dst", "value"],
        dtype=["int32", "int32", "float32"],
    )

    df = cudf.read_csv(
        input_data_path,
        delimiter=" ",
        names=["src", "dst", "value"],
        dtype=["int32", "int32", "float32"],
    )

    g = cugraph.DiGraph()
    g.from_cudf_edgelist(df, "src", "dst")

    dg = cugraph.DiGraph()
    dg.from_dask_cudf_edgelist(ddf, "src", "dst")

    # Precompute local data and personalize
    personalization = None
    if personalization_perc != 0:
        dg.compute_local_data(by="dst")
        personalization = personalize(dg.number_of_vertices(),
                                      personalization_perc)

    expected_pr = cugraph.pagerank(g,
                                   personalization=personalization,
                                   tol=1e-6)
    result_pr = dcg.pagerank(dg, personalization=personalization, tol=1e-6)
    result_pr = result_pr.compute()

    err = 0
    tol = 1.0e-05

    assert len(expected_pr) == len(result_pr)

    compare_pr = expected_pr.merge(result_pr,
                                   on="vertex",
                                   suffixes=["_local", "_dask"])

    for i in range(len(compare_pr)):
        diff = abs(compare_pr["pagerank_local"].iloc[i] -
                   compare_pr["pagerank_dask"].iloc[i])
        if diff > tol * 1.1:
            err = err + 1
    assert err == 0
Example #24
def read_dask_cudf_csv_file(csv_file,
                            read_weights_in_sp=True,
                            single_partition=True):
    print("Reading " + str(csv_file) + "...")
    if read_weights_in_sp is True:
        if single_partition:
            chunksize = os.path.getsize(csv_file)
            return dask_cudf.read_csv(
                csv_file,
                chunksize=chunksize,
                delimiter=" ",
                names=["src", "dst", "weight"],
                dtype=["int32", "int32", "float32"],
                header=None,
            )
        else:
            return dask_cudf.read_csv(
                csv_file,
                delimiter=" ",
                names=["src", "dst", "weight"],
                dtype=["int32", "int32", "float32"],
                header=None,
            )
    else:
        if single_partition:
            chunksize = os.path.getsize(csv_file)
            return dask_cudf.read_csv(
                csv_file,
                chunksize=chunksize,
                delimiter=" ",
                names=["src", "dst", "weight"],
                dtype=["int32", "int32", "float32"],
                header=None,
            )
        else:
            return dask_cudf.read_csv(
                csv_file,
                delimiter=" ",
                names=["src", "dst", "weight"],
                dtype=["int32", "int32", "float64"],
                header=None,
            )
Example #25
def test_dask_bfs_multi_column_depthlimit(dask_client):
    gc.collect()

    input_data_path = (RAPIDS_DATASET_ROOT_DIR_PATH /
                       "netscience.csv").as_posix()
    print(f"dataset={input_data_path}")
    chunksize = dcg.get_chunksize(input_data_path)

    ddf = dask_cudf.read_csv(
        input_data_path,
        chunksize=chunksize,
        delimiter=" ",
        names=["src_a", "dst_a", "value"],
        dtype=["int32", "int32", "float32"],
    )
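    # Add a second pair of vertex columns so vertices are identified by two columns (multi-column vertices)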
    ddf['src_b'] = ddf['src_a'] + 1000
    ddf['dst_b'] = ddf['dst_a'] + 1000

    df = cudf.read_csv(
        input_data_path,
        delimiter=" ",
        names=["src_a", "dst_a", "value"],
        dtype=["int32", "int32", "float32"],
    )
    df['src_b'] = df['src_a'] + 1000
    df['dst_b'] = df['dst_a'] + 1000

    g = cugraph.DiGraph()
    g.from_cudf_edgelist(df, ["src_a", "src_b"], ["dst_a", "dst_b"])

    dg = cugraph.DiGraph()
    dg.from_dask_cudf_edgelist(ddf, ["src_a", "src_b"], ["dst_a", "dst_b"])

    start = cudf.DataFrame()
    start['a'] = [0]
    start['b'] = [1000]

    depth_limit = 18
    expected_dist = cugraph.bfs(g, start, depth_limit=depth_limit)
    result_dist = dcg.bfs(dg, start, depth_limit=depth_limit)
    result_dist = result_dist.compute()

    compare_dist = expected_dist.merge(result_dist,
                                       on=["0_vertex", "1_vertex"],
                                       suffixes=["_local", "_dask"])

    err = 0
    for i in range(len(compare_dist)):
        if (compare_dist["distance_local"].iloc[i] <= depth_limit
                and compare_dist["distance_dask"].iloc[i] <= depth_limit
                and compare_dist["distance_local"].iloc[i] !=
                compare_dist["distance_dask"].iloc[i]):
            err = err + 1
    assert err == 0
Example #26
def test_read_csv(tmp_path):
    df = dask.datasets.timeseries(
        dtypes={"x": int, "y": int}, freq="120s"
    ).reset_index(drop=True)

    df.to_csv(tmp_path / "data-*.csv", index=False)

    df2 = dask_cudf.read_csv(tmp_path / "data-*.csv")
    dd.assert_eq(df, df2)

    # file path test
    stmp_path = str(tmp_path / "data-*.csv")
    df3 = dask_cudf.read_csv(f"file://{stmp_path}")
    dd.assert_eq(df2, df3)

    # file list test
    list_paths = [
        os.path.join(tmp_path, fname) for fname in sorted(os.listdir(tmp_path))
    ]
    df4 = dask_cudf.read_csv(list_paths)
    dd.assert_eq(df, df4)
Example #27
def test_csv_reader_usecols(tmp_path, dtype):
    df = cudf.DataFrame({
        "a": [1, 2, 3, 4] * 100,
        "b": ["a", "b", "c", "d"] * 100,
        "c": [10, 11, 12, 13] * 100,
    })
    csv_path = str(tmp_path / "usecols_data.csv")
    df.to_csv(csv_path, index=False)
    ddf = dask_cudf.from_cudf(df[["b", "c"]], npartitions=5)
    ddf2 = dask_cudf.read_csv(csv_path, usecols=["b", "c"], dtype=dtype)

    dd.assert_eq(ddf, ddf2, check_divisions=False, check_index=False)
Example #28
def using_quantile_device_dmatrix(client: Client, train_dir, model_file, fs, do_wait=False):
    '''`DaskDeviceQuantileDMatrix` is a data type specialized for the `gpu_hist` tree
    method that reduces memory overhead. When training on a GPU pipeline, it is
    preferred over `DaskDMatrix`.

    .. versionadded:: 1.2.0
    '''
    colnames = ['label'] + ['feature-%02d' % i for i in range(1, 29)]
    df = dask_cudf.read_csv(train_dir, header=None, names=colnames, chunksize=None)
    X = df[df.columns.difference(['label'])]
    y = df['label']
    print("[INFO]: ------ CSV files are read from " + train_dir)

    if do_wait is True:
        df = df.persist()
        X = X.persist()
        wait(df)
        wait(X)
        print("[INFO]: ------ Long waited but the data is ready now")

    # `DaskDeviceQuantileDMatrix` is used instead of `DaskDMatrix`; be careful,
    # it cannot be used for anything other than training.
    start_time = time.time()
    dtrain = dxgb.DaskDeviceQuantileDMatrix(client, X, y)
    print("[INFO]: ------ QuantileDMatrix is formed in {} seconds ---".format(time.time() - start_time))

    del df
    del X
    del y

    start_time = time.time()
    output = xgb.dask.train(client,
                            {'verbosity': 2,
                             'learning_rate': 0.1,
                             'max_depth': 8,
                             'objective': 'reg:squarederror',
                             'subsample': 0.5,
                             'gamma': 0.9,
                             'verbose_eval': True,
                             'tree_method': 'gpu_hist',
                             # 'nthread': 1
                             },
                            dtrain,
                            num_boost_round=100,
                            evals=[(dtrain, 'train')])
    print("[INFO]: ------ Training is completed in {} seconds ---".format(time.time() - start_time))

    history = output['history']
    print('[INFO]: ------ Training evaluation history:', history)

    output['booster'].save_model('/tmp/tmp.model')
    fs.put('/tmp/tmp.model', model_file)
    print("[INFO]: ------ Model saved here: {}".format(model_file))
Example #29
def test_dask_pagerank(dask_client, personalization_perc):
    gc.collect()

    input_data_path = (RAPIDS_DATASET_ROOT_DIR_PATH / "karate.csv").as_posix()
    print(f"dataset={input_data_path}")
    chunksize = dcg.get_chunksize(input_data_path)

    ddf = dask_cudf.read_csv(
        input_data_path,
        chunksize=chunksize,
        delimiter=" ",
        names=["src", "dst", "value"],
        dtype=["int32", "int32", "float32"],
    )

    df = cudf.read_csv(
        input_data_path,
        delimiter=" ",
        names=["src", "dst", "value"],
        dtype=["int32", "int32", "float32"],
    )

    g = cugraph.DiGraph()
    g.from_cudf_edgelist(df, "src", "dst")

    dg = cugraph.DiGraph()
    dg.from_dask_cudf_edgelist(ddf, "src", "dst")

    personalization = None
    if personalization_perc != 0:
        personalization, p = personalize(g.nodes(), personalization_perc)

    expected_pr = cugraph.pagerank(g,
                                   personalization=personalization,
                                   tol=1e-6)
    result_pr = dcg.pagerank(dg, personalization=personalization, tol=1e-6)
    result_pr = result_pr.compute()

    err = 0
    tol = 1.0e-05

    assert len(expected_pr) == len(result_pr)

    compare_pr = expected_pr.merge(result_pr,
                                   on="vertex",
                                   suffixes=["_local", "_dask"])

    for i in range(len(compare_pr)):
        diff = abs(compare_pr["pagerank_local"].iloc[i] -
                   compare_pr["pagerank_dask"].iloc[i])
        if diff > tol * 1.1:
            err = err + 1
    assert err == 0
Example #30
def test_dask_dataset(datasets, engine, num_files):
    paths = glob.glob(str(datasets[engine]) + "/*." + engine.split("-")[0])
    paths = paths[:num_files]
    if engine == "parquet":
        ddf0 = dask_cudf.read_parquet(paths)[mycols_pq]
        dataset = nvtabular.io.Dataset(paths)
        result = dataset.to_ddf(columns=mycols_pq)
    else:
        ddf0 = dask_cudf.read_csv(paths, header=False, names=allcols_csv)[mycols_csv]
        dataset = nvtabular.io.Dataset(paths, header=False, names=allcols_csv)
        result = dataset.to_ddf(columns=mycols_csv)

    assert_eq(ddf0, result)