Example #1
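The snippets below are pytest tests collected from the RAPIDS cuGraph and cuDF test suites. They assume roughly the imports sketched here; the module paths are best-effort guesses, since several of them have moved between RAPIDS releases. Names such as read_csv, graph_file, input_data_path, dask_client, rdata, dtype, check_exact, check_dtype, and directed are pytest fixtures or parametrized arguments supplied by the surrounding test modules; calc_random_walks and check_random_walks (Example #8) are helpers defined elsewhere in their test module.

import gc
import random

import numpy as np
import pandas as pd
import pytest

import cudf
import cugraph
import cugraph.dask            # get_chunksize (Example #7)
import dask.dataframe
import dask_cudf

# Assumed import paths; these vary across RAPIDS versions:
from cudf.testing import assert_series_equal
from cugraph.structure.number_map import NumberMap
from cugraph.testing import utils  # older releases: cugraph.tests.utils
from cugraph.testing.utils import RAPIDS_DATASET_ROOT_DIR_PATH
from cugraph.dask.structure import replication  # Example #5
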
def test_sorensen_multi_column(read_csv):
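    # The read_csv fixture yields the parsed edge list (the second
    # element is unused here).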

    M, _ = read_csv

    cu_M = cudf.DataFrame()
    cu_M["src_0"] = cudf.Series(M["0"])
    cu_M["dst_0"] = cudf.Series(M["1"])
    cu_M["src_1"] = cu_M["src_0"] + 1000
    cu_M["dst_1"] = cu_M["dst_0"] + 1000
    G1 = cugraph.Graph()
    G1.from_cudf_edgelist(cu_M, source=["src_0", "src_1"],
                          destination=["dst_0", "dst_1"])

    vertex_pair = cu_M[["src_0", "src_1", "dst_0", "dst_1"]]
    vertex_pair = vertex_pair[:5]

    df_res = cugraph.sorensen(G1, vertex_pair)

    G2 = cugraph.Graph()
    G2.from_cudf_edgelist(cu_M, source="src_0",
                          destination="dst_0")
    df_exp = cugraph.sorensen(G2, vertex_pair[["src_0", "dst_0"]])

    # Compare the multi-column result against the single-column baseline
    actual = df_res.sort_values("0_source").reset_index()
    expected = df_exp.sort_values("source").reset_index()
    assert_series_equal(actual["sorensen_coeff"], expected["sorensen_coeff"])
Example #2
def test_renumber_files_col(graph_file):
    gc.collect()

    M = utils.read_csv_for_nx(graph_file)
    sources = cudf.Series(M["0"])
    destinations = cudf.Series(M["1"])

    translate = 1000

    gdf = cudf.DataFrame()
    gdf["src"] = cudf.Series([x + translate for x in sources.values_host])
    gdf["dst"] = cudf.Series([x + translate for x in destinations.values_host])

    exp_src = cudf.Series([x + translate for x in sources.values_host])
    exp_dst = cudf.Series([x + translate for x in destinations.values_host])

    renumbered_df, renumber_map = NumberMap.renumber(
        gdf, ["src"], ["dst"], preserve_order=True
    )
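    # NumberMap.renumber returns the edge list rewritten with internal
    # vertex ids, plus a NumberMap that can reverse the mapping.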

    unrenumbered_df = renumber_map.unrenumber(
        renumbered_df, renumber_map.renumbered_src_col_name,
        preserve_order=True
    )
    unrenumbered_df = renumber_map.unrenumber(
        unrenumbered_df, renumber_map.renumbered_dst_col_name,
        preserve_order=True
    )

    assert_series_equal(exp_src,
                        unrenumbered_df[renumber_map.renumbered_src_col_name],
                        check_names=False)
    assert_series_equal(exp_dst,
                        unrenumbered_df[renumber_map.renumbered_dst_col_name],
                        check_names=False)
Example #3
def test_series_different_type_cases(dtype, check_exact, check_dtype):
    data = [0, 1, 2, 3]

    psr1 = pd.Series(data, dtype="uint8")
    psr2 = pd.Series(data, dtype=dtype)

    sr1 = cudf.from_pandas(psr1)
    sr2 = cudf.from_pandas(psr2)

    kind = None
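    # Mirror pandas: capture whatever pandas raises for this dtype/option
    # combination, then require cudf's assert_series_equal to raise the
    # same exception type (or, if pandas passed, to pass too).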
    try:
        pd.testing.assert_series_equal(psr1,
                                       psr2,
                                       check_exact=check_exact,
                                       check_dtype=check_dtype)
    except BaseException as e:
        kind = type(e)

    if kind is not None:
        with pytest.raises(kind):
            assert_series_equal(sr1,
                                sr2,
                                check_exact=check_exact,
                                check_dtype=check_dtype)
    else:
        assert_series_equal(sr1,
                            sr2,
                            check_exact=check_exact,
                            check_dtype=check_dtype)
Example #4
def test_renumber_ips_cols():

    source_list = [
        "192.168.1.1",
        "172.217.5.238",
        "216.228.121.209",
        "192.16.31.23",
    ]
    dest_list = [
        "172.217.5.238",
        "216.228.121.209",
        "192.16.31.23",
        "192.168.1.1",
    ]

    pdf = pd.DataFrame({"source_list": source_list, "dest_list": dest_list})

    gdf = cudf.from_pandas(pdf)

    gdf["source_as_int"] = gdf["source_list"].str.ip2int()
    gdf["dest_as_int"] = gdf["dest_list"].str.ip2int()

    renumbered_gdf, renumber_map = NumberMap.renumber(
        gdf, ["source_as_int"], ["dest_as_int"], preserve_order=True
    )

    input_check = renumbered_gdf.merge(gdf, on=["source_list", "dest_list"])
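    # from_internal_vertex_id appends an external-id column (named by
    # external_column_names) for each internal id, enabling a round-trip
    # check against the pre-renumbering values.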

    output_check = renumber_map.from_internal_vertex_id(
        renumbered_gdf, renumber_map.renumbered_src_col_name,
        external_column_names=["check_src"]
    )
    output_check = renumber_map.from_internal_vertex_id(
        output_check, renumber_map.renumbered_dst_col_name,
        external_column_names=["check_dst"]
    )

    merged = output_check.merge(input_check, on=["source_list", "dest_list"])

    assert_series_equal(
        merged["check_src"], merged["source_as_int"], check_names=False
    )
    assert_series_equal(
        merged["check_dst"], merged["dest_as_int"], check_names=False
    )
Example #5
def test_replicate_cudf_series(input_data_path, dask_client):
    gc.collect()
    df = cudf.read_csv(
        input_data_path,
        delimiter=" ",
        names=["src", "dst", "value"],
        dtype=["int32", "int32", "float32"],
    )
    for column in df.columns.values:
        series = df[column]
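        # replicate_cudf_series distributes the series to the Dask
        # workers; each future resolves to that worker's local copy.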
        worker_to_futures = replication.replicate_cudf_series(series)
        for worker in worker_to_futures:
            replicated_series = worker_to_futures[worker].result()
            assert_series_equal(series, replicated_series, check_names=False)
        # FIXME: If we do not clear this dictionary, when comparing
        # results for the 2nd column, one of the workers still
        # has a value from the 1st column
        worker_to_futures = {}
Example #6
def test_basic_assert_series_equal(
    rdata,
    rname,
    check_names,
    check_category_order,
    check_categorical,
    dtype,
):

    p_left = pd.Series([1, 2, 3], name="a", dtype=dtype)
    p_right = pd.Series(rdata, name=rname, dtype=dtype)

    left = cudf.from_pandas(p_left)
    right = cudf.from_pandas(p_right)

    kind = None
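    # Same mirror-pandas pattern: cudf must raise exactly when pandas
    # does for these name/category options, and pass when pandas passes.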
    try:
        pd.testing.assert_series_equal(
            p_left,
            p_right,
            check_names=check_names,
            check_categorical=check_categorical,
            check_category_order=check_category_order,
        )
    except BaseException as e:
        kind = type(e)

    if kind is not None:
        with pytest.raises(kind):
            assert_series_equal(
                left,
                right,
                check_names=check_names,
                check_categorical=check_categorical,
                check_category_order=check_category_order,
            )
    else:
        assert_series_equal(
            left,
            right,
            check_names=check_names,
            check_categorical=check_categorical,
            check_category_order=check_category_order,
        )
Example #7
def test_dask_mg_degree(dask_client):
    gc.collect()

    input_data_path = (RAPIDS_DATASET_ROOT_DIR_PATH /
                       "karate-asymmetric.csv").as_posix()
    print(f"dataset={input_data_path}")

    chunksize = cugraph.dask.get_chunksize(input_data_path)

    ddf = dask_cudf.read_csv(
        input_data_path,
        chunksize=chunksize,
        delimiter=" ",
        names=["src", "dst", "value"],
        dtype=["int32", "int32", "float32"],
    )

    df = cudf.read_csv(
        input_data_path,
        delimiter=" ",
        names=["src", "dst", "value"],
        dtype=["int32", "int32", "float32"],
    )

    dg = cugraph.DiGraph()
    dg.from_dask_cudf_edgelist(ddf, "src", "dst")

    g = cugraph.DiGraph()
    g.from_cudf_edgelist(df, "src", "dst")
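    # Built from the same edge list, the multi-GPU (dask_cudf) graph and
    # the single-GPU graph must report identical in- and out-degrees.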

    merge_df_in = (dg.in_degree().merge(g.in_degree(),
                                        on="vertex",
                                        suffixes=["_dg", "_g"]).compute())

    merge_df_out = (dg.out_degree().merge(g.out_degree(),
                                          on="vertex",
                                          suffixes=["_dg", "_g"]).compute())

    assert_series_equal(merge_df_in["degree_dg"],
                        merge_df_in["degree_g"],
                        check_names=False)
    assert_series_equal(merge_df_out["degree_dg"],
                        merge_df_out["degree_g"],
                        check_names=False)
Example #8
def test_random_walks_coalesced(graph_file, directed):
    max_depth = random.randint(2, 10)
    df_G = utils.read_csv_file(graph_file)
    df_G.rename(columns={"0": "src", "1": "dst", "2": "weight"}, inplace=True)
    path_data, seeds = calc_random_walks(graph_file,
                                         directed,
                                         max_depth=max_depth)
    check_random_walks(path_data, seeds, df_G)

    # Check path query output
    df = cugraph.rw_path(len(seeds), path_data[2])
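    # A coalesced path with n vertices carries n - 1 edge weights, so the
    # expected offsets are exclusive prefix sums of the path sizes in
    # path_data[2].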
    v_offsets = [0] + path_data[2].cumsum()[:-1].to_numpy().tolist()
    w_offsets = [0] + (path_data[2] - 1).cumsum()[:-1].to_numpy().tolist()

    assert_series_equal(df['weight_sizes'],
                        path_data[2] - 1,
                        check_names=False)
    assert df['vertex_offsets'].to_numpy().tolist() == v_offsets
    assert df['weight_offsets'].to_numpy().tolist() == w_offsets
Example #9
def test_mg_renumber(graph_file, dask_client):

    M = utils.read_csv_for_nx(graph_file)
    sources = cudf.Series(M["0"])
    destinations = cudf.Series(M["1"])

    translate = 1000

    gdf = cudf.DataFrame()
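    # Two external-id columns per endpoint: the translated ids plus the
    # originals, exercising multi-column renumbering.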
    gdf["src_old"] = sources
    gdf["dst_old"] = destinations
    gdf["src"] = sources + translate
    gdf["dst"] = destinations + translate

    ddf = dask.dataframe.from_pandas(
        gdf, npartitions=len(dask_client.scheduler_info()['workers']))

    # preserve_order is not supported for MG
    renumbered_df, renumber_map = NumberMap.renumber(ddf, ["src", "src_old"],
                                                     ["dst", "dst_old"],
                                                     preserve_order=False)
    unrenumbered_df = renumber_map.unrenumber(
        renumbered_df,
        renumber_map.renumbered_src_col_name,
        preserve_order=False)
    unrenumbered_df = renumber_map.unrenumber(
        unrenumbered_df,
        renumber_map.renumbered_dst_col_name,
        preserve_order=False)

    # sort needed only for comparisons, since preserve_order is False
    gdf = gdf.sort_values(by=["src", "src_old", "dst", "dst_old"])
    gdf = gdf.reset_index()
    unrenumbered_df = unrenumbered_df.compute()
    src = renumber_map.renumbered_src_col_name
    dst = renumber_map.renumbered_dst_col_name
    unrenumbered_df = unrenumbered_df.sort_values(
        by=[f"0_{src}", f"1_{src}", f"0_{dst}", f"1_{dst}"])
    unrenumbered_df = unrenumbered_df.reset_index()

    assert_series_equal(gdf["src"],
                        unrenumbered_df[f"0_{src}"],
                        check_names=False)
    assert_series_equal(gdf["src_old"],
                        unrenumbered_df[f"1_{src}"],
                        check_names=False)
    assert_series_equal(gdf["dst"],
                        unrenumbered_df[f"0_{dst}"],
                        check_names=False)
    assert_series_equal(gdf["dst_old"],
                        unrenumbered_df[f"1_{dst}"],
                        check_names=False)
Example #10
def test_datetime_like_compatibility(rdata, check_datetimelike_compat):
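    # psr1 is datetime64[ns]; psr2 parses rdata as datetimes and casts to
    # string. check_datetimelike_compat controls whether pandas (and so
    # cudf) tolerates the dtype mismatch when the instants agree.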
    psr1 = pd.Series([0, 1, 2, 3], dtype="datetime64[ns]")
    psr2 = pd.Series(rdata, dtype="datetime64[ns]").astype("str")

    sr1 = cudf.from_pandas(psr1)
    sr2 = cudf.from_pandas(psr2)

    kind = None
    try:
        pd.testing.assert_series_equal(
            psr1, psr2, check_datetimelike_compat=check_datetimelike_compat)
    except BaseException as e:
        kind = type(e)

    if kind is not None:
        with pytest.raises(kind):
            assert_series_equal(
                sr1, sr2, check_datetimelike_compat=check_datetimelike_compat)
    else:
        assert_series_equal(
            sr1, sr2, check_datetimelike_compat=check_datetimelike_compat)
Example #11
def test_enable_batch_adjlist_replication_weights(graph_file, directed,
                                                  dask_client):
    gc.collect()
    df = cudf.read_csv(
        graph_file,
        delimiter=" ",
        names=["src", "dst", "value"],
        dtype=["int32", "int32", "float32"],
    )
    G = cugraph.DiGraph() if directed else cugraph.Graph()
    G.from_cudf_edgelist(df,
                         source="src",
                         destination="dst",
                         edge_attr="value")
    G.enable_batch()
    G.view_adj_list()
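    # enable_batch replicates the graph across the Dask workers;
    # batch_adjlists maps each worker to futures for its replicated
    # (offsets, indices, weights) CSR arrays.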
    adjlist = G.adjlist
    offsets = adjlist.offsets
    indices = adjlist.indices
    weights = adjlist.weights
    for worker in G.batch_adjlists:
        (rep_offsets, rep_indices, rep_weights) = G.batch_adjlists[worker]
        assert_series_equal(offsets, rep_offsets.result(), check_names=False)
        assert_series_equal(indices, rep_indices.result(), check_names=False)
        assert_series_equal(weights, rep_weights.result(), check_names=False)
Example #12
def test_woverlap_multi_column(graph_file):

    M = utils.read_csv_for_nx(graph_file)

    cu_M = cudf.DataFrame()
    cu_M["src_0"] = cudf.Series(M["0"])
    cu_M["dst_0"] = cudf.Series(M["1"])
    cu_M["src_1"] = cu_M["src_0"] + 1000
    cu_M["dst_1"] = cu_M["dst_0"] + 1000
    G1 = cugraph.Graph()
    G1.from_cudf_edgelist(cu_M,
                          source=["src_0", "src_1"],
                          destination=["dst_0", "dst_1"])

    G2 = cugraph.Graph()
    G2.from_cudf_edgelist(cu_M, source="src_0", destination="dst_0")

    vertex_pair = cu_M[["src_0", "src_1", "dst_0", "dst_1"]]
    vertex_pair = vertex_pair[:5]

    weight_arr = cudf.Series(np.ones(G2.number_of_vertices(),
                                     dtype=np.float32))

    weights = cudf.DataFrame()
    weights['vertex'] = G2.nodes()
    weights['vertex_'] = weights['vertex'] + 1000
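    # 'vertex_' appears to supply the second vertex-id column that G1's
    # composite (two-column) vertices require, mirroring the +1000 offset
    # used for src_1/dst_1.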
    weights['weight'] = weight_arr

    df_res = cugraph.overlap_w(G1, weights, vertex_pair)

    weights = weights[['vertex', 'weight']]
    df_exp = cugraph.overlap_w(G2, weights, vertex_pair[["src_0", "dst_0"]])

    # Compare the multi-column result against the single-column baseline
    actual = df_res.sort_values("0_source").reset_index()
    expected = df_exp.sort_values("source").reset_index()
    assert_series_equal(actual["overlap_coeff"], expected["overlap_coeff"])
Example #13
def test_renumber_negative_col():
    source_list = [4, 6, 8, -20, 1]
    dest_list = [1, 29, 35, 0, 77]

    df = pd.DataFrame({"source_list": source_list, "dest_list": dest_list})

    gdf = cudf.DataFrame.from_pandas(df[["source_list", "dest_list"]])
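    # Keep untouched copies of the ids: renumbering will replace the
    # source_list/dest_list columns with internal ids, and negative
    # external ids must survive the round trip.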
    gdf["original_src"] = gdf["source_list"]
    gdf["original_dst"] = gdf["dest_list"]

    renumbered_gdf, renumber_map = NumberMap.renumber(
        gdf, ["source_list"], ["dest_list"], preserve_order=True
    )

    input_check = renumbered_gdf.merge(
        gdf, on=["original_src", "original_dst"]
    )

    output_check = renumber_map.from_internal_vertex_id(
        renumbered_gdf, renumber_map.renumbered_src_col_name,
        external_column_names=["check_src"]
    )
    output_check = renumber_map.from_internal_vertex_id(
        output_check, renumber_map.renumbered_dst_col_name,
        external_column_names=["check_dst"]
    )

    merged = output_check.merge(
        input_check, on=["original_src", "original_dst"]
    )

    assert_series_equal(
        merged["check_src"], merged["original_src"], check_names=False
    )
    assert_series_equal(
        merged["check_dst"], merged["original_dst"], check_names=False
    )
Example #14
def test_renumber_files_multi_col(graph_file):
    gc.collect()

    M = utils.read_csv_for_nx(graph_file)
    sources = cudf.Series(M["0"])
    destinations = cudf.Series(M["1"])

    translate = 1000

    gdf = cudf.DataFrame()
    gdf["src_old"] = sources
    gdf["dst_old"] = destinations
    gdf["src"] = sources + translate
    gdf["dst"] = destinations + translate

    renumbered_df, renumber_map = NumberMap.renumber(
        gdf, ["src", "src_old"], ["dst", "dst_old"], preserve_order=True
    )

    unrenumbered_df = renumber_map.unrenumber(
        renumbered_df, renumber_map.renumbered_src_col_name,
        preserve_order=True
    )
    unrenumbered_df = renumber_map.unrenumber(
        unrenumbered_df, renumber_map.renumbered_dst_col_name,
        preserve_order=True
    )
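    # unrenumber expands each internal id back into its two original
    # external columns, emitted as 0_<col> and 1_<col>.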

    src = renumber_map.renumbered_src_col_name
    dst = renumber_map.renumbered_dst_col_name
    assert_series_equal(
        gdf["src"], unrenumbered_df[f"0_{src}"], check_names=False
    )
    assert_series_equal(
        gdf["src_old"], unrenumbered_df[f"1_{src}"], check_names=False
    )
    assert_series_equal(
        gdf["dst"], unrenumbered_df[f"0_{dst}"], check_names=False
    )
    assert_series_equal(
        gdf["dst_old"], unrenumbered_df[f"1_{dst}"], check_names=False
    )