Esempio n. 1
0
def _compare_bfs(G, Gnx, source):
    """Check cugraph's BFS from ``source`` against NetworkX.

    Runs ``cugraph.bfs`` on ``G`` without the shortest-path counter and
    compares the result with
    ``nx.single_source_shortest_path_length(Gnx, source)``.

    For every vertex NetworkX reports, the helper verifies that:

    * cugraph returned the vertex,
    * both libraries agree on its distance, and
    * (for every vertex except ``source``) the predecessor cugraph
      reported is known to NetworkX and lies exactly one hop closer to
      ``source``.

    Raises
    ------
    AssertionError
        If the result does not have exactly three columns, or if any of
        the checks above failed for at least one vertex.
    """
    df = cugraph.bfs(G, source, return_sp_counter=False)
    # Without the counter the frame must hold exactly three columns:
    # 'vertex', 'distance' and 'predecessor'. A fourth column would mean
    # 'sp_counter' was produced, which triggers atomic operations in BFS
    # and is precisely what this call is supposed to avoid.
    # NOTE: 'predecessor' is always returned even though the C++ function
    # accepts a nullptr for it.
    assert len(df.columns) == 3, ("The result of the BFS has an invalid "
                                  "number of columns")

    vertices = df["vertex"].to_array()
    cu_distances = dict(zip(vertices, df["distance"].to_array()))
    cu_predecessors = dict(zip(vertices, df["predecessor"].to_array()))

    nx_distances = nx.single_source_shortest_path_length(Gnx, source)
    # FIXME: only the vertices reported by NetworkX are verified; extra
    #        vertices returned by cugraph (e.g. unreached ones carrying
    #        the dtype's maximum as distance) are not inspected.

    missing_vertex_error = 0
    distance_mismatch_error = 0
    invalid_predecessor_error = 0

    for vertex in nx_distances:
        if vertex not in cu_distances:
            missing_vertex_error += 1
            continue

        result = cu_distances[vertex]
        expected = nx_distances[vertex]
        if result != expected:
            print("[ERR] Mismatch on distances: "
                  "vid = {}, cugraph = {}, nx = {}".format(
                      vertex, result, expected))
            distance_mismatch_error += 1

        if vertex not in cu_predecessors:
            missing_vertex_error += 1
            continue

        # The source has no meaningful predecessor to validate.
        if vertex == source:
            continue

        pred = cu_predecessors[vertex]
        if pred not in nx_distances:
            invalid_predecessor_error += 1
        elif nx_distances[pred] + 1 != result:
            # The graph is unweighted, so a valid predecessor is exactly
            # one hop closer to the source than the vertex itself.
            print("[ERR] Invalid on predecessors: "
                  "vid = {}, cugraph = {}".format(vertex, pred))
            invalid_predecessor_error += 1

    assert missing_vertex_error == 0, "There are missing vertices"
    assert distance_mismatch_error == 0, "There are invalid distances"
    assert invalid_predecessor_error == 0, "There are invalid predecessors"
Esempio n. 2
0
 def breadth_first_search(graph: CuGraph, source_node: NodeID,
                          depth_limit: int) -> CuDFVector:
     """Return the vertices reachable from ``source_node`` in BFS order.

     Parameters
     ----------
     graph : CuGraph
         Wrapper whose ``value`` attribute is handed to ``cugraph.bfs``.
     source_node : NodeID
         Vertex the traversal starts from.
     depth_limit : int
         Maximum distance (in hops) from ``source_node`` that is kept in
         the result. A negative value or ``None`` is treated as "no
         limit" (assumed convention -- TODO confirm against callers).

     Returns
     -------
     CuDFVector
         Vertices sorted by increasing BFS distance, with a fresh
         0-based index.
     """
     bfs_df = cugraph.bfs(graph.value, source_node)
     # Keep the source (distance 0) and every vertex whose predecessor is
     # itself a vertex of the result; rows with any other predecessor
     # value are dropped.
     reached = (bfs_df.predecessor.isin(bfs_df.vertex) |
                (bfs_df.distance == 0))
     bfs_df = bfs_df[reached]
     # Honour the depth limit, which was previously accepted but ignored.
     if depth_limit is not None and depth_limit >= 0:
         bfs_df = bfs_df[bfs_df.distance <= depth_limit]
     bfs_ordered_vertices = bfs_df.sort_values(
         "distance").vertex.reset_index(drop=True)
     return CuDFVector(bfs_ordered_vertices)
Esempio n. 3
0
def test_dask_bfs(dask_client):
    """Compare dask-based BFS distances with single-process cugraph BFS.

    The netscience edge list is read twice, once through ``dask_cudf`` and
    once through ``cudf``. Each copy is extended with a duplicate of its
    edges whose endpoints are shifted by 1000, and BFS is run from the
    start vertices ``[0, 1000]`` on both graphs. The distances of all
    vertices present in both results must be equal.
    """
    gc.collect()

    input_data_path = (RAPIDS_DATASET_ROOT_DIR_PATH /
                       "netscience.csv").as_posix()

    print(f"dataset={input_data_path}")
    chunksize = dcg.get_chunksize(input_data_path)

    column_names = ["src", "dst", "value"]
    column_types = ["int32", "int32", "float32"]

    def modify_dataset(edges):
        # Append a copy of the edges with both endpoints shifted by 1000.
        shifted = cudf.DataFrame()
        shifted['src'] = edges['src'] + 1000
        shifted['dst'] = edges['dst'] + 1000
        shifted['value'] = edges['value']
        return cudf.concat([edges, shifted])

    ddf = dask_cudf.read_csv(
        input_data_path,
        chunksize=chunksize,
        delimiter=" ",
        names=column_names,
        dtype=column_types,
    )
    ddf = ddf.map_partitions(modify_dataset, meta=ddf._meta)

    df = modify_dataset(
        cudf.read_csv(
            input_data_path,
            delimiter=" ",
            names=column_names,
            dtype=column_types,
        )
    )

    g = cugraph.DiGraph()
    g.from_cudf_edgelist(df, "src", "dst")

    dg = cugraph.DiGraph()
    dg.from_dask_cudf_edgelist(ddf, "src", "dst")

    expected_dist = cugraph.bfs(g, [0, 1000])
    result_dist = dcg.bfs(dg, [0, 1000]).compute()

    compare_dist = expected_dist.merge(result_dist,
                                       on="vertex",
                                       suffixes=["_local", "_dask"])

    local_distances = compare_dist["distance_local"]
    dask_distances = compare_dist["distance_dask"]

    # Count the rows on which the two implementations disagree.
    err = sum(
        1
        for row in range(len(compare_dist))
        if local_distances.iloc[row] != dask_distances.iloc[row]
    )
    assert err == 0
Esempio n. 4
0
def test_dask_bfs_multi_column_depthlimit(dask_client):
    """Compare dask and local BFS on multi-column vertices with a depth limit.

    Vertices are identified by the column pairs ``(src_a, src_b)`` and
    ``(dst_a, dst_b)``, where the ``*_b`` column is the ``*_a`` column plus
    1000. BFS is started from the vertex ``(0, 1000)`` with
    ``depth_limit=18`` on both graphs. Rows are counted as errors only
    when both distances are within the limit and differ.
    """
    gc.collect()

    input_data_path = (RAPIDS_DATASET_ROOT_DIR_PATH /
                       "netscience.csv").as_posix()
    print(f"dataset={input_data_path}")
    chunksize = dcg.get_chunksize(input_data_path)

    column_names = ["src_a", "dst_a", "value"]
    column_types = ["int32", "int32", "float32"]
    source_columns = ["src_a", "src_b"]
    destination_columns = ["dst_a", "dst_b"]

    def add_shifted_columns(frame):
        # Second vertex-id column: first column shifted by 1000.
        frame['src_b'] = frame['src_a'] + 1000
        frame['dst_b'] = frame['dst_a'] + 1000

    ddf = dask_cudf.read_csv(
        input_data_path,
        chunksize=chunksize,
        delimiter=" ",
        names=column_names,
        dtype=column_types,
    )
    add_shifted_columns(ddf)

    df = cudf.read_csv(
        input_data_path,
        delimiter=" ",
        names=column_names,
        dtype=column_types,
    )
    add_shifted_columns(df)

    g = cugraph.DiGraph()
    g.from_cudf_edgelist(df, source_columns, destination_columns)

    dg = cugraph.DiGraph()
    dg.from_dask_cudf_edgelist(ddf, source_columns, destination_columns)

    start = cudf.DataFrame()
    start['a'] = [0]
    start['b'] = [1000]

    depth_limit = 18
    expected_dist = cugraph.bfs(g, start, depth_limit=depth_limit)
    result_dist = dcg.bfs(dg, start, depth_limit=depth_limit).compute()

    compare_dist = expected_dist.merge(result_dist,
                                       on=["0_vertex", "1_vertex"],
                                       suffixes=["_local", "_dask"])

    local_distances = compare_dist["distance_local"]
    dask_distances = compare_dist["distance_dask"]

    err = 0
    for row in range(len(compare_dist)):
        local_value = local_distances.iloc[row]
        # Rows beyond the depth limit on either side are not compared.
        if not local_value <= depth_limit:
            continue
        dask_value = dask_distances.iloc[row]
        if dask_value <= depth_limit and local_value != dask_value:
            err += 1
    assert err == 0
Esempio n. 5
0
def cugraph_call(cu_M, start_vertex):
    """Run a timed cugraph BFS over the edge list ``cu_M``.

    A directed graph is built from columns ``'0'`` (source), ``'1'``
    (target) and ``'2'`` (edge attribute) of ``cu_M``. The elapsed
    wall-clock time of the BFS call is printed.

    Returns
    -------
    tuple
        ``(vertices, distances)``, each obtained through ``to_array()``
        on the corresponding result column.
    """
    G = cugraph.DiGraph()
    G.from_cudf_edgelist(cu_M, source='0', target='1', edge_attr='2')

    started = time.time()
    df = cugraph.bfs(G, start_vertex)
    elapsed = time.time() - started
    print(f'Time : {elapsed}')

    vertices = df['vertex'].to_array()
    distances = df['distance'].to_array()
    return vertices, distances
Esempio n. 6
0
def _compare_bfs_spc(G, Gnx, source):
    """Check cugraph's BFS shortest-path counters against NetworkX.

    Runs ``cugraph.bfs`` with ``return_sp_counter=True`` and compares the
    ``sp_counter`` column with the counters produced by
    ``nxacb._single_source_shortest_path_basic(Gnx, source)``. Distances
    and predecessors are not checked here; only shortest-path counting is.

    Raises
    ------
    AssertionError
        If the result does not have exactly four columns, if a NetworkX
        vertex is absent from the cugraph result, or if any counter
        differs beyond the ``DEFAULT_EPSILON`` relative tolerance.
    """
    df = cugraph.bfs(G, source, return_sp_counter=True)
    # With the counter enabled the frame must hold exactly 4 columns:
    # 'vertex', 'distance', 'predecessor', 'sp_counter'
    assert len(df.columns) == 4, (
        "The result of the BFS has an invalid " "number of columns"
    )

    _, _, nx_sp_counter = nxacb._single_source_shortest_path_basic(Gnx, source)
    # NetworkX counters ordered by vertex identifier.
    sorted_nx = [nx_sp_counter[vid] for vid in sorted(nx_sp_counter)]

    # `df` comes back in no guaranteed vertex order, so it is sorted on
    # 'vertex' to line up positionally with `sorted_nx`. Filtering with
    # isin would also work but would slow the comparison down.
    by_vertex = df.sort_values("vertex")
    sorted_df = by_vertex.rename(columns={"sp_counter": "cu_spc"}, copy=False)

    # Every vertex known to NetworkX must appear in the cugraph result.
    cu_vertices = set(sorted_df['vertex'].values_host)
    shared_vertices = cu_vertices.intersection(nx_sp_counter)
    assert len(shared_vertices) == len(
        nx_sp_counter
    ), "There are missing vertices"

    # Both sides are ordered by vertex identifier, so the NetworkX
    # counters can be attached as a plain column.
    sorted_df["nx_spc"] = sorted_nx

    # isclose (rather than allclose) keeps the offending rows available
    # for printing instead of collapsing everything to one boolean.
    mismatch_mask = ~cupy.isclose(
        sorted_df["cu_spc"], sorted_df["nx_spc"], rtol=DEFAULT_EPSILON
    )
    shortest_path_counter_errors = sorted_df[mismatch_mask]
    mismatch_count = len(shortest_path_counter_errors)
    if mismatch_count > 0:
        print(shortest_path_counter_errors)
    assert mismatch_count == 0, (
        "Shortest path counters " "are too different"
    )
Esempio n. 7
0
def cugraph_call(cu_M, start_vertex):
    """Run a timed cugraph BFS on a graph built with ``add_edge_list``.

    Columns ``'0'``, ``'1'`` and ``'2'`` of ``cu_M`` are passed as the
    sources, destinations and values of the edge list. The elapsed
    wall-clock time of the BFS call is printed.

    Returns
    -------
    tuple
        ``(vertices, distances)``, each obtained through ``to_array()``
        on the corresponding result column.
    """
    G = cugraph.Graph()
    # Device data: edge endpoints and values straight from the frame.
    G.add_edge_list(cu_M['0'], cu_M['1'], cu_M['2'])

    started = time.time()
    df = cugraph.bfs(G, start_vertex)
    elapsed = time.time() - started
    print(f'Time : {elapsed}')

    vertices = df['vertex'].to_array()
    distances = df['distance'].to_array()
    return vertices, distances
Esempio n. 8
0
def cugraph_call(M, start_vertex):
    """Run a timed cugraph BFS on a graph built with ``add_adj_list``.

    ``M`` is converted with ``tocsr()``; its ``indptr``, ``indices`` and
    ``data`` arrays are wrapped in ``cudf.Series`` and used as the
    adjacency list. The elapsed wall-clock time of the BFS call is
    printed.

    Returns
    -------
    tuple
        ``(vertices, distances)``, each obtained through ``to_array()``
        on the corresponding result column.
    """
    csr = M.tocsr()

    G = cugraph.Graph()
    # Device data: offsets, column indices and values of the CSR matrix.
    G.add_adj_list(
        cudf.Series(csr.indptr),
        cudf.Series(csr.indices),
        cudf.Series(csr.data),
    )

    started = time.time()
    df = cugraph.bfs(G, start_vertex)
    elapsed = time.time() - started
    print(f'Time : {elapsed}')

    vertices = df['vertex'].to_array()
    distances = df['distance'].to_array()
    return vertices, distances
Esempio n. 9
0
def test_dask_bfs(client_connection):
    """Compare dask-based BFS distances with single-process cugraph BFS.

    The netscience edge list is loaded both through ``dask_cudf`` and
    through ``cudf``; BFS is run from vertex 0 on each graph and the
    distances of all vertices present in both results must be equal.
    """
    gc.collect()

    # FIXME: update this to allow dataset to be parameterized and have dataset
    # part of test param id (see other tests)
    input_data_path = r"../datasets/netscience.csv"
    print(f"dataset={input_data_path}")
    chunksize = dcg.get_chunksize(input_data_path)

    column_names = ["src", "dst", "value"]
    column_types = ["int32", "int32", "float32"]

    ddf = dask_cudf.read_csv(
        input_data_path,
        chunksize=chunksize,
        delimiter=" ",
        names=column_names,
        dtype=column_types,
    )
    df = cudf.read_csv(
        input_data_path,
        delimiter=" ",
        names=column_names,
        dtype=column_types,
    )

    g = cugraph.DiGraph()
    g.from_cudf_edgelist(df, "src", "dst", renumber=True)

    dg = cugraph.DiGraph()
    dg.from_dask_cudf_edgelist(ddf, "src", "dst")

    expected_dist = cugraph.bfs(g, 0)
    result_dist = dcg.bfs(dg, 0, True).compute()

    compare_dist = expected_dist.merge(result_dist,
                                       on="vertex",
                                       suffixes=["_local", "_dask"])

    local_distances = compare_dist["distance_local"]
    dask_distances = compare_dist["distance_dask"]

    # Count the rows on which the two implementations disagree.
    err = sum(
        1
        for row in range(len(compare_dist))
        if local_distances.iloc[row] != dask_distances.iloc[row]
    )
    assert err == 0
Esempio n. 10
0
def test_dask_bfs():
    """Compare dask-based BFS distances with single-process cugraph BFS.

    The test starts its own ``LocalCUDACluster``, ``Client`` and ``Comms``
    session, loads the netscience edge list through both ``dask_cudf`` and
    ``cudf``, runs BFS from vertex 0 on each graph and requires equal
    distances for all vertices present in both results.

    Teardown runs in a ``finally`` clause so that the cluster, the client
    and the comms session are released even when a step raises or the
    final assertion fails.
    """
    gc.collect()
    cluster = LocalCUDACluster()
    client = None
    comms_initialized = False
    try:
        client = Client(cluster)
        Comms.initialize()
        comms_initialized = True

        input_data_path = r"../datasets/netscience.csv"
        chunksize = dcg.get_chunksize(input_data_path)

        ddf = dask_cudf.read_csv(input_data_path,
                                 chunksize=chunksize,
                                 delimiter=' ',
                                 names=['src', 'dst', 'value'],
                                 dtype=['int32', 'int32', 'float32'])

        df = cudf.read_csv(input_data_path,
                           delimiter=' ',
                           names=['src', 'dst', 'value'],
                           dtype=['int32', 'int32', 'float32'])

        g = cugraph.DiGraph()
        g.from_cudf_edgelist(df, 'src', 'dst', renumber=True)

        dg = cugraph.DiGraph()
        dg.from_dask_cudf_edgelist(ddf, renumber=True)

        expected_dist = cugraph.bfs(g, 0)
        result_dist = dcg.bfs(dg, 0, True)

        compare_dist = expected_dist.merge(result_dist,
                                           on="vertex",
                                           suffixes=['_local', '_dask'])

        local_distances = compare_dist['distance_local']
        dask_distances = compare_dist['distance_dask']

        # Count the rows on which the two implementations disagree.
        err = sum(
            1
            for row in range(len(compare_dist))
            if local_distances.iloc[row] != dask_distances.iloc[row]
        )
        assert err == 0
    finally:
        # Release resources in reverse order of acquisition, skipping
        # whatever was never successfully set up.
        if comms_initialized:
            Comms.destroy()
        if client is not None:
            client.close()
        cluster.close()
Esempio n. 11
0
def data(df):
    """Compute BFS-based groups and PageRank for an edge list.

    ``df`` is a pandas frame with ``'from'`` and ``'to'`` columns. It is
    moved to cudf, both columns are cast to ``int32``, and a graph is
    built from them. BFS is started from the value in the first row and
    first column of the frame.

    Each vertex is assigned a ``group`` from its BFS distance:
    2 when the distance is below 3, 0 when it equals 3, and 1 when it is
    above 3.

    Returns
    -------
    tuple
        ``(out_bfs, out_page)`` as pandas frames: ``out_bfs`` holds the
        ``'vertex'`` and ``'group'`` columns, ``out_page`` is the
        PageRank result.
    """
    net = cudf.from_pandas(df)

    for column in ('to', 'from'):
        net[column] = net[column].astype('int32')

    # BFS start vertex: first cell of the frame.
    n = net.iloc[0, 0]

    G = cugraph.Graph()
    G.add_edge_list(net['from'], net['to'], None)
    bfs_result = cugraph.bfs(G, n, directed=True)
    pagerank_result = cugraph.pagerank(G)
    out_bfs = bfs_result.to_pandas()
    out_page = pagerank_result.to_pandas()

    distance = out_bfs['distance']
    group_rules = (
        (distance < 3, 2),
        (distance == 3, 0),
        (distance > 3, 1),
    )
    for mask, group in group_rules:
        out_bfs.loc[mask, 'group'] = group

    return out_bfs[['vertex', 'group']], out_page
Esempio n. 12
0
def test_bfs_paths_array():
    """Check ``get_traversed_path_list`` on a BFS result of the karate graph.

    BFS is run from vertex 16. The path list for vertex 0 must contain
    three entries, and asking for vertex 100 must raise a ``ValueError``
    whose message mentions that the vertex is "not in the result set".

    Only the call expected to fail sits inside ``pytest.raises``. That
    keeps an unrelated ``ValueError`` raised during setup from making the
    test pass, and lets the message check after the block actually run.
    """
    gc.collect()

    graph_file = '../datasets/karate.csv'

    cu_M = utils.read_csv_file(graph_file)

    G = cugraph.Graph()
    G.from_cudf_edgelist(cu_M, source='0', destination='1', edge_attr='2')

    # Run BFS starting at vertex 16.
    df = cugraph.bfs(G, 16)

    # Path list for vertex 0.
    answer = cugraph.utils.get_traversed_path_list(df, 0)
    assert len(answer) == 3

    # Vertex 100 is expected to be absent from the BFS result.
    with pytest.raises(ValueError) as ErrorMsg:
        cugraph.utils.get_traversed_path_list(df, 100)

    assert "not in the result set" in str(ErrorMsg.value)
Esempio n. 13
0
def test_bfs_paths():
    """Check ``get_traversed_path`` on a BFS result of the karate graph.

    BFS is run from vertex 16. The path frame for vertex 0 must contain
    three rows, and asking for vertex 100 must raise a ``ValueError``
    whose message mentions that the vertex is "not in the result set".

    Only the call expected to fail sits inside ``pytest.raises``. That
    keeps an unrelated ``ValueError`` raised during setup from making the
    test pass, and lets the message check after the block actually run.
    """
    gc.collect()

    graph_file = PurePath(utils.RAPIDS_DATASET_ROOT_DIR) / "karate.csv"

    cu_M = utils.read_csv_file(graph_file)

    G = cugraph.Graph()
    G.from_cudf_edgelist(cu_M, source='0', destination='1', edge_attr='2')

    # Run BFS starting at vertex 16.
    df = cugraph.bfs(G, 16)

    # Path frame for vertex 0.
    p_df = cugraph.utils.get_traversed_path(df, 0)
    assert len(p_df) == 3

    # Vertex 100 is expected to be absent from the BFS result.
    with pytest.raises(ValueError) as ErrorMsg:
        cugraph.utils.get_traversed_path(df, 100)

    assert "not in the result set" in str(ErrorMsg.value)
Esempio n. 14
0
def bfs(G, start):
    """Run ``cugraph.bfs`` on ``G`` from ``start`` and return its result."""
    result = cugraph.bfs(G, start=start)
    return result
Esempio n. 15
0
def strong_connected_component(source, destination):
    """
    Generate the strongly connected components
    using the FW-BW-TRIM approach (but skipping the trimming).

    A pivot vertex is taken from the remaining vertices; a BFS from it on
    the forward graph and another on the reversed graph are intersected.
    When the intersection holds more than one vertex it is recorded as a
    component, otherwise the pivot is recorded as a single-vertex
    component. The loop ends once no vertices remain.

    Parameters
    ----------
    source : cudf.Series
        A cudf series that contains the source side of an edge list

    destination : cudf.Series
        A cudf series that contains the destination side of an edge list

    Returns
    -------
    cdf : cudf.DataFrame - a dataframe for components
        df['vertex']   - the vertex ID
        df['id']       - the component ID

    sdf : cudf.DataFrame - a dataframe with single vertex components
        df['vertex']   - the vertex ID

    count - int - the number of components found

    NOTE(review): the first two return values are whatever
    ``_compress_array`` produces from the collected frames; the layout
    described above is presumed, confirm against that helper.

    Examples
    --------
    >>> M = read_mtx_file(graph_file)
    >>> sources = cudf.Series(M.row)
    >>> destinations = cudf.Series(M.col)

    >>> components, single_components, count = \
            scc.strong_connected_component(sources, destinations)
    """
    # Referenced by name (``@max_value``) inside the query strings below,
    # which linters cannot see -- hence the NOQA marker.
    max_value = np.iinfo(np.int32).max  # NOQA

    # create the FW and BW graphs - this version does not modify the graphs
    G_fw = cugraph.Graph()
    G_bw = cugraph.Graph()

    # The backward graph is the forward graph with every edge reversed.
    G_fw.add_edge_list(source, destination)
    G_bw.add_edge_list(destination, source)

    # get a list of vertices and sort the list on out_degree
    d = G_fw.degrees()
    d = d.sort_values(by='out_degree', ascending=False)

    num_verts = len(d)

    # create space for the answers (one slot per vertex is the upper bound)
    components = [None] * num_verts
    single_components = [None] * num_verts

    # Counts - aka array indices of the next free slot in each list
    count = 0
    single_count = 0

    # remove vertices that cannot be in a component: a vertex without
    # incoming or without outgoing edges is set aside up front
    bad = d.query('in_degree == 0 or out_degree == 0')

    if len(bad):
        bad = bad.drop(['in_degree', 'out_degree'])

        single_components[single_count] = bad
        single_count = single_count + 1
        d = _filter_list(d, bad)

    # ----- Start processing -----
    while len(d) > 0:

        # NOTE(review): `d` was sorted and filtered without a visible index
        # reset, so whether `[0]` picks the first remaining row depends on
        # the indexing semantics of the cudf version in use -- confirm.
        v = d['vertex'][0]

        # compute the forward BFS, dropping rows whose distance equals
        # max_value (presumably the marker for unreached vertices)
        bfs_fw = cugraph.bfs(G_fw, v)
        bfs_fw = bfs_fw.query("distance != @max_value")

        # Now backwards
        bfs_bw = cugraph.bfs(G_bw, v)
        bfs_bw = bfs_bw.query("distance != @max_value")

        # intersection: vertices kept by both the forward and backward BFS
        common = bfs_fw.merge(bfs_bw, on='vertex', how='inner')

        if len(common) > 1:
            # The pivot vertex doubles as the component identifier.
            common['id'] = v
            components[count] = common
            d = _filter_list(d, common)
            count = count + 1

        else:
            # v is an isolated vertex
            vdf = cudf.DataFrame()
            vdf['vertex'] = v

            single_components[single_count] = vdf
            single_count = single_count + 1
            # Drop the first remaining row before picking the next pivot.
            d = d.iloc[1:]

    # end of loop until vertex queue is empty

    comp = _compress_array(components, count)
    sing = _compress_array(single_components, single_count)

    return comp, sing, count
Esempio n. 16
0
def test_scipy_api_compat():
    """Exercise the keyword-argument compatibility rules of ``cugraph.bfs``.

    The first dataset is loaded both as a ``cugraph.Graph`` and as a
    ``cp_coo_matrix``. For the graph input, some keyword combinations must
    raise ``TypeError`` while others must succeed; for the matrix input,
    the listed combinations must succeed and requesting the shortest-path
    counter must yield a 3-element tuple.
    """
    graph_file = utils.DATASETS[0]

    input_cugraph_graph = utils.create_obj_from_csv(
        graph_file, cugraph.Graph, edgevals=True)
    input_coo_matrix = utils.create_obj_from_csv(
        graph_file, cp_coo_matrix, edgevals=True)

    # Graph input: passing `directed`, or omitting the start vertex
    # entirely, must be rejected.
    for rejected_kwargs in ({"start": 0, "directed": False}, {}):
        with pytest.raises(TypeError):
            cugraph.bfs(input_cugraph_graph, **rejected_kwargs)

    # Graph input: these combinations must work.
    for accepted_kwargs in (
        {"i_start": 0},
        {"i_start": 0, "return_sp_counter": True},
    ):
        cugraph.bfs(input_cugraph_graph, **accepted_kwargs)

    # cannot have start and i_start
    with pytest.raises(TypeError):
        cugraph.bfs(input_cugraph_graph, start=0, i_start=0)

    # Matrix input: these combinations must work.
    for matrix_kwargs in (
        {"i_start": 0},
        {"i_start": 0, "directed": True},
        {"i_start": 0, "directed": False},
    ):
        cugraph.bfs(input_coo_matrix, **matrix_kwargs)

    result = cugraph.bfs(input_coo_matrix, i_start=0, return_sp_counter=True)
    assert type(result) is tuple
    assert len(result) == 3