def test_sorensen_multi_column(read_csv):
    """Sorensen on a multi-column-vertex graph must match the coefficients
    computed on the equivalent single-column graph."""
    M, _ = read_csv

    edges = cudf.DataFrame()
    edges["src_0"] = cudf.Series(M["0"])
    edges["dst_0"] = cudf.Series(M["1"])
    # Second id column is just the first shifted by 1000.
    edges["src_1"] = edges["src_0"] + 1000
    edges["dst_1"] = edges["dst_0"] + 1000

    multi_G = cugraph.Graph()
    multi_G.from_cudf_edgelist(
        edges, source=["src_0", "src_1"], destination=["dst_0", "dst_1"]
    )
    pairs = edges[["src_0", "src_1", "dst_0", "dst_1"]][:5]
    df_multi = cugraph.sorensen(multi_G, pairs)

    single_G = cugraph.Graph()
    single_G.from_cudf_edgelist(edges, source="src_0", destination="dst_0")
    df_single = cugraph.sorensen(single_G, pairs[["src_0", "dst_0"]])

    # Calculating mismatch: align rows before comparing coefficients.
    actual = df_multi.sort_values("0_source").reset_index()
    expected = df_single.sort_values("source").reset_index()
    assert_series_equal(actual["sorensen_coeff"], expected["sorensen_coeff"])
def test_renumber_files_col(graph_file):
    """Renumber then unrenumber a single-column edge list and verify the
    translated vertex ids come back unchanged and in order.

    Fix: the original built ``src``/``dst`` (and the expected series, twice
    more) with host-side list comprehensions over ``values_host``; a
    vectorized ``Series + scalar`` stays on-device and avoids four
    GPU<->CPU round trips while producing the same values.
    """
    gc.collect()
    M = utils.read_csv_for_nx(graph_file)
    sources = cudf.Series(M["0"])
    destinations = cudf.Series(M["1"])
    translate = 1000

    gdf = cudf.DataFrame()
    gdf["src"] = sources + translate
    gdf["dst"] = destinations + translate
    # Expected values after the renumber/unrenumber round trip.
    exp_src = sources + translate
    exp_dst = destinations + translate

    renumbered_df, renumber_map = NumberMap.renumber(
        gdf, ["src"], ["dst"], preserve_order=True
    )

    unrenumbered_df = renumber_map.unrenumber(
        renumbered_df, renumber_map.renumbered_src_col_name,
        preserve_order=True
    )
    unrenumbered_df = renumber_map.unrenumber(
        unrenumbered_df, renumber_map.renumbered_dst_col_name,
        preserve_order=True
    )

    assert_series_equal(
        exp_src, unrenumbered_df[renumber_map.renumbered_src_col_name],
        check_names=False
    )
    assert_series_equal(
        exp_dst, unrenumbered_df[renumber_map.renumbered_dst_col_name],
        check_names=False
    )
def test_series_different_type_cases(dtype, check_exact, check_dtype):
    """cudf's assert_series_equal must mirror pandas for mixed-dtype series:
    raise the same exception type, or pass when pandas passes."""
    data = [0, 1, 2, 3]
    psr1 = pd.Series(data, dtype="uint8")
    psr2 = pd.Series(data, dtype=dtype)
    sr1 = cudf.from_pandas(psr1)
    sr2 = cudf.from_pandas(psr2)

    # Run pandas first to learn whether this combination should raise.
    expected_error = None
    try:
        pd.testing.assert_series_equal(
            psr1, psr2, check_exact=check_exact, check_dtype=check_dtype
        )
    except BaseException as err:  # mirror whatever pandas raises
        expected_error = type(err)

    if expected_error is None:
        assert_series_equal(
            sr1, sr2, check_exact=check_exact, check_dtype=check_dtype
        )
    else:
        with pytest.raises(expected_error):
            assert_series_equal(
                sr1, sr2, check_exact=check_exact, check_dtype=check_dtype
            )
def test_renumber_ips_cols():
    """Renumbering integer-encoded IP columns must round-trip back to the
    original integer values."""
    source_list = [
        "192.168.1.1",
        "172.217.5.238",
        "216.228.121.209",
        "192.16.31.23",
    ]
    dest_list = [
        "172.217.5.238",
        "216.228.121.209",
        "192.16.31.23",
        "192.168.1.1",
    ]
    host_frame = pd.DataFrame(
        {"source_list": source_list, "dest_list": dest_list}
    )
    gdf = cudf.from_pandas(host_frame)

    # Encode the dotted-quad strings as integers for renumbering.
    gdf["source_as_int"] = gdf["source_list"].str.ip2int()
    gdf["dest_as_int"] = gdf["dest_list"].str.ip2int()

    renumbered_gdf, renumber_map = NumberMap.renumber(
        gdf, ["source_as_int"], ["dest_as_int"], preserve_order=True
    )
    input_check = renumbered_gdf.merge(gdf, on=["source_list", "dest_list"])

    # Map the internal ids back out and compare against the inputs.
    output_check = renumber_map.from_internal_vertex_id(
        renumbered_gdf,
        renumber_map.renumbered_src_col_name,
        external_column_names=["check_src"],
    )
    output_check = renumber_map.from_internal_vertex_id(
        output_check,
        renumber_map.renumbered_dst_col_name,
        external_column_names=["check_dst"],
    )
    merged = output_check.merge(input_check, on=["source_list", "dest_list"])

    assert_series_equal(
        merged["check_src"], merged["source_as_int"], check_names=False
    )
    assert_series_equal(
        merged["check_dst"], merged["dest_as_int"], check_names=False
    )
def test_replicate_cudf_series(input_data_path, dask_client):
    """Every worker's replicated copy of each column must equal the original
    series."""
    gc.collect()
    frame = cudf.read_csv(
        input_data_path,
        delimiter=" ",
        names=["src", "dst", "value"],
        dtype=["int32", "int32", "float32"],
    )
    for column_name in frame.columns.values:
        column = frame[column_name]
        futures_by_worker = replication.replicate_cudf_series(column)
        for _worker, future in futures_by_worker.items():
            assert_series_equal(column, future.result(), check_names=False)
        # FIXME: If we do not clear this dictionary, when comparing
        # results for the 2nd column, one of the workers still
        # has a value from the 1st column
        futures_by_worker = {}
def test_basic_assert_series_equal(
    rdata,
    rname,
    check_names,
    check_category_order,
    check_categorical,
    dtype,
):
    """cudf's assert_series_equal must mirror pandas across the
    name/categorical comparison options: same exception type, or both pass."""
    p_left = pd.Series([1, 2, 3], name="a", dtype=dtype)
    p_right = pd.Series(rdata, name=rname, dtype=dtype)
    g_left = cudf.from_pandas(p_left)
    g_right = cudf.from_pandas(p_right)

    # Same keyword set for both the pandas and cudf calls.
    opts = dict(
        check_names=check_names,
        check_categorical=check_categorical,
        check_category_order=check_category_order,
    )

    raised = None
    try:
        pd.testing.assert_series_equal(p_left, p_right, **opts)
    except BaseException as err:  # capture whatever pandas raises
        raised = type(err)

    if raised is None:
        assert_series_equal(g_left, g_right, **opts)
    else:
        with pytest.raises(raised):
            assert_series_equal(g_left, g_right, **opts)
def test_dask_mg_degree(dask_client):
    """Multi-GPU in/out degrees must match the single-GPU DiGraph results."""
    gc.collect()

    input_data_path = (
        RAPIDS_DATASET_ROOT_DIR_PATH / "karate-asymmetric.csv"
    ).as_posix()
    print(f"dataset={input_data_path}")

    chunksize = cugraph.dask.get_chunksize(input_data_path)
    ddf = dask_cudf.read_csv(
        input_data_path,
        chunksize=chunksize,
        delimiter=" ",
        names=["src", "dst", "value"],
        dtype=["int32", "int32", "float32"],
    )
    df = cudf.read_csv(
        input_data_path,
        delimiter=" ",
        names=["src", "dst", "value"],
        dtype=["int32", "int32", "float32"],
    )

    dg = cugraph.DiGraph()
    dg.from_dask_cudf_edgelist(ddf, "src", "dst")
    g = cugraph.DiGraph()
    g.from_cudf_edgelist(df, "src", "dst")

    # Join MG and SG degree frames on vertex so rows line up for comparison.
    in_joined = dg.in_degree().merge(
        g.in_degree(), on="vertex", suffixes=["_dg", "_g"]
    ).compute()
    out_joined = dg.out_degree().merge(
        g.out_degree(), on="vertex", suffixes=["_dg", "_g"]
    ).compute()

    assert_series_equal(
        in_joined["degree_dg"], in_joined["degree_g"], check_names=False
    )
    assert_series_equal(
        out_joined["degree_dg"], out_joined["degree_g"], check_names=False
    )
def test_random_walks_coalesced(graph_file, directed):
    """rw_path sizes/offsets must be consistent with the coalesced random
    walk output."""
    depth = random.randint(2, 10)
    df_G = utils.read_csv_file(graph_file)
    df_G.rename(columns={"0": "src", "1": "dst", "2": "weight"}, inplace=True)
    path_data, seeds = calc_random_walks(graph_file, directed, max_depth=depth)
    check_random_walks(path_data, seeds, df_G)

    # Check path query output
    df = cugraph.rw_path(len(seeds), path_data[2])
    sizes = path_data[2]
    # Offsets are the exclusive prefix sums of the per-path sizes; a path of
    # k vertices has k-1 weights.
    expected_v_offsets = [0] + sizes.cumsum()[:-1].to_numpy().tolist()
    expected_w_offsets = [0] + (sizes - 1).cumsum()[:-1].to_numpy().tolist()

    assert_series_equal(df['weight_sizes'], sizes - 1, check_names=False)
    assert df['vertex_offsets'].to_numpy().tolist() == expected_v_offsets
    assert df['weight_offsets'].to_numpy().tolist() == expected_w_offsets
def test_mg_renumber(graph_file, dask_client):
    """Round-trip a multi-GPU renumbering: renumber a two-column
    (src, src_old)/(dst, dst_old) edge list, unrenumber it, and verify all
    four original id columns are recovered (after sorting, since MG cannot
    preserve order)."""
    M = utils.read_csv_for_nx(graph_file)
    sources = cudf.Series(M["0"])
    destinations = cudf.Series(M["1"])
    translate = 1000
    gdf = cudf.DataFrame()
    gdf["src_old"] = sources
    gdf["dst_old"] = destinations
    gdf["src"] = sources + translate
    gdf["dst"] = destinations + translate
    # One partition per scheduler worker.
    ddf = dask.dataframe.from_pandas(
        gdf, npartitions=len(dask_client.scheduler_info()['workers']))
    # preserve_order is not supported for MG
    renumbered_df, renumber_map = NumberMap.renumber(ddf,
                                                     ["src", "src_old"],
                                                     ["dst", "dst_old"],
                                                     preserve_order=False)
    unrenumbered_df = renumber_map.unrenumber(
        renumbered_df, renumber_map.renumbered_src_col_name,
        preserve_order=False)
    unrenumbered_df = renumber_map.unrenumber(
        unrenumbered_df, renumber_map.renumbered_dst_col_name,
        preserve_order=False)
    # sort needed only for comparisons, since preserve_order is False
    gdf = gdf.sort_values(by=["src", "src_old", "dst", "dst_old"])
    gdf = gdf.reset_index()
    unrenumbered_df = unrenumbered_df.compute()
    src = renumber_map.renumbered_src_col_name
    dst = renumber_map.renumbered_dst_col_name
    # Multi-column external ids come back as "0_<col>" / "1_<col>".
    unrenumbered_df = unrenumbered_df.sort_values(
        by=[f"0_{src}", f"1_{src}", f"0_{dst}", f"1_{dst}"])
    unrenumbered_df = unrenumbered_df.reset_index()
    assert_series_equal(gdf["src"], unrenumbered_df[f"0_{src}"],
                        check_names=False)
    assert_series_equal(gdf["src_old"], unrenumbered_df[f"1_{src}"],
                        check_names=False)
    assert_series_equal(gdf["dst"], unrenumbered_df[f"0_{dst}"],
                        check_names=False)
    assert_series_equal(gdf["dst_old"], unrenumbered_df[f"1_{dst}"],
                        check_names=False)
def test_datetime_like_compaibility(rdata, check_datetimelike_compat):
    """cudf must mirror pandas when comparing a datetime series to its
    string representation under check_datetimelike_compat."""
    # NOTE(review): "compaibility" is a typo for "compatibility"; renaming
    # would change the collected test id, so the name is left as-is.
    psr1 = pd.Series([0, 1, 2, 3], dtype="datetime64[ns]")
    psr2 = pd.Series(rdata, dtype="datetime64[ns]").astype("str")
    sr1 = cudf.from_pandas(psr1)
    sr2 = cudf.from_pandas(psr2)

    expected_exc = None
    try:
        pd.testing.assert_series_equal(
            psr1, psr2, check_datetimelike_compat=check_datetimelike_compat
        )
    except BaseException as err:  # mirror whatever pandas raises
        expected_exc = type(err)

    if expected_exc is None:
        assert_series_equal(
            sr1, sr2, check_datetimelike_compat=check_datetimelike_compat
        )
    else:
        with pytest.raises(expected_exc):
            assert_series_equal(
                sr1, sr2, check_datetimelike_compat=check_datetimelike_compat
            )
def test_enable_batch_adjlist_replication_weights(graph_file, directed,
                                                  dask_client):
    """Each worker's replicated adjacency list (offsets, indices, weights)
    must equal the master copy after enable_batch()."""
    gc.collect()
    edges = cudf.read_csv(
        graph_file,
        delimiter=" ",
        names=["src", "dst", "value"],
        dtype=["int32", "int32", "float32"],
    )
    G = cugraph.DiGraph() if directed else cugraph.Graph()
    G.from_cudf_edgelist(
        edges, source="src", destination="dst", edge_attr="value"
    )
    G.enable_batch()
    G.view_adj_list()

    master = G.adjlist
    for _worker, futures in G.batch_adjlists.items():
        rep_offsets, rep_indices, rep_weights = futures
        assert_series_equal(master.offsets, rep_offsets.result(),
                            check_names=False)
        assert_series_equal(master.indices, rep_indices.result(),
                            check_names=False)
        assert_series_equal(master.weights, rep_weights.result(),
                            check_names=False)
def test_woverlap_multi_column(graph_file):
    """Weighted overlap on a multi-column-vertex graph must match the
    single-column result."""
    M = utils.read_csv_for_nx(graph_file)
    edges = cudf.DataFrame()
    edges["src_0"] = cudf.Series(M["0"])
    edges["dst_0"] = cudf.Series(M["1"])
    # Second id column is the first shifted by 1000.
    edges["src_1"] = edges["src_0"] + 1000
    edges["dst_1"] = edges["dst_0"] + 1000

    G1 = cugraph.Graph()
    G1.from_cudf_edgelist(
        edges, source=["src_0", "src_1"], destination=["dst_0", "dst_1"]
    )
    G2 = cugraph.Graph()
    G2.from_cudf_edgelist(edges, source="src_0", destination="dst_0")

    pairs = edges[["src_0", "src_1", "dst_0", "dst_1"]][:5]

    # Unit weight per vertex; the multi-column graph also needs the shifted
    # second vertex column alongside it.
    weights = cudf.DataFrame()
    weights['vertex'] = G2.nodes()
    weights['vertex_'] = weights['vertex'] + 1000
    weights['weight'] = cudf.Series(
        np.ones(G2.number_of_vertices(), dtype=np.float32)
    )

    df_multi = cugraph.overlap_w(G1, weights, pairs)
    df_single = cugraph.overlap_w(
        G2, weights[['vertex', 'weight']], pairs[["src_0", "dst_0"]]
    )

    # Calculating mismatch: align rows before comparing coefficients.
    actual = df_multi.sort_values("0_source").reset_index()
    expected = df_single.sort_values("source").reset_index()
    assert_series_equal(actual["overlap_coeff"], expected["overlap_coeff"])
def test_renumber_negative_col():
    """Renumbering must round-trip vertex ids that include negative values."""
    source_list = [4, 6, 8, -20, 1]
    dest_list = [1, 29, 35, 0, 77]
    host_frame = pd.DataFrame(
        {"source_list": source_list, "dest_list": dest_list}
    )
    gdf = cudf.DataFrame.from_pandas(host_frame[["source_list", "dest_list"]])
    # Keep untouched copies to compare against after the round trip.
    gdf["original_src"] = gdf["source_list"]
    gdf["original_dst"] = gdf["dest_list"]

    renumbered_gdf, renumber_map = NumberMap.renumber(
        gdf, ["source_list"], ["dest_list"], preserve_order=True
    )
    input_check = renumbered_gdf.merge(
        gdf, on=["original_src", "original_dst"]
    )

    # Map the internal ids back out and compare against the originals.
    output_check = renumber_map.from_internal_vertex_id(
        renumbered_gdf,
        renumber_map.renumbered_src_col_name,
        external_column_names=["check_src"],
    )
    output_check = renumber_map.from_internal_vertex_id(
        output_check,
        renumber_map.renumbered_dst_col_name,
        external_column_names=["check_dst"],
    )
    merged = output_check.merge(
        input_check, on=["original_src", "original_dst"]
    )

    assert_series_equal(
        merged["check_src"], merged["original_src"], check_names=False
    )
    assert_series_equal(
        merged["check_dst"], merged["original_dst"], check_names=False
    )
def test_renumber_files_multi_col(graph_file):
    """Unrenumbering a two-column renumbering (with preserve_order) must
    recover both id columns in their original order."""
    gc.collect()
    M = utils.read_csv_for_nx(graph_file)
    sources = cudf.Series(M["0"])
    destinations = cudf.Series(M["1"])
    translate = 1000

    gdf = cudf.DataFrame()
    gdf["src_old"] = sources
    gdf["dst_old"] = destinations
    gdf["src"] = sources + translate
    gdf["dst"] = destinations + translate

    renumbered_df, renumber_map = NumberMap.renumber(
        gdf, ["src", "src_old"], ["dst", "dst_old"], preserve_order=True
    )

    restored = renumber_map.unrenumber(
        renumbered_df, renumber_map.renumbered_src_col_name,
        preserve_order=True
    )
    restored = renumber_map.unrenumber(
        restored, renumber_map.renumbered_dst_col_name, preserve_order=True
    )

    src = renumber_map.renumbered_src_col_name
    dst = renumber_map.renumbered_dst_col_name
    # Multi-column external ids are restored as "0_<col>" / "1_<col>".
    for original_col, restored_col in (
        ("src", f"0_{src}"),
        ("src_old", f"1_{src}"),
        ("dst", f"0_{dst}"),
        ("dst_old", f"1_{dst}"),
    ):
        assert_series_equal(
            gdf[original_col], restored[restored_col], check_names=False
        )