def test_renumber_common_col_names():
    """
    Ensure that commonly-used column names in the input do not conflict
    with names used internally by NumberMap.
    """
    # test multi-column ("legacy" renumbering code path)
    gdf = cudf.DataFrame({"src": [0, 1, 2],
                          "dst": [1, 2, 3],
                          "weights": [0.1, 0.2, 0.3],
                          "col_a": [8, 1, 82],
                          "col_b": [1, 82, 3],
                          "col_c": [9, 7, 2],
                          "col_d": [1, 2, 3]})

    renumbered_df, renumber_map = NumberMap.renumber(
        gdf, ["col_a", "col_b"], ["col_c", "col_d"])

    assert renumber_map.renumbered_src_col_name != "src"
    assert renumber_map.renumbered_dst_col_name != "dst"
    assert renumber_map.renumbered_src_col_name in renumbered_df.columns
    assert renumber_map.renumbered_dst_col_name in renumbered_df.columns

    # test experimental renumbering code path
    gdf = cudf.DataFrame({"src": [0, 1, 2],
                          "dst": [1, 2, 3],
                          "weights": [0.1, 0.2, 0.3],
                          "col_a": [0, 1, 2],
                          "col_b": [1, 2, 3]})

    renumbered_df, renumber_map = NumberMap.renumber(gdf, "col_a", "col_b")

    assert renumber_map.renumbered_src_col_name != "src"
    assert renumber_map.renumbered_dst_col_name != "dst"
    assert renumber_map.renumbered_src_col_name in renumbered_df.columns
    assert renumber_map.renumbered_dst_col_name in renumbered_df.columns
def test_mg_renumber_common_col_names(graph_file, dask_client):
    """
    Ensure that commonly-used column names in the input do not conflict
    with names used internally by NumberMap.
    """
    M = utils.read_csv_for_nx(graph_file)
    sources = cudf.Series(M["0"])
    destinations = cudf.Series(M["1"])

    numbers = range(len(sources))
    offset_numbers = [n + 1 for n in numbers]
    floats = [float(n) for n in numbers]

    # test multi-column ("legacy" renumbering code path)
    gdf = cudf.DataFrame({"src": numbers,
                          "dst": numbers,
                          "weights": floats,
                          "col_a": sources,
                          "col_b": sources,
                          "col_c": destinations,
                          "col_d": destinations})
    ddf = dask.dataframe.from_pandas(
        gdf, npartitions=len(dask_client.scheduler_info()['workers']))

    renumbered_df, renumber_map = NumberMap.renumber(ddf,
                                                     ["col_a", "col_b"],
                                                     ["col_c", "col_d"])

    assert renumber_map.renumbered_src_col_name != "src"
    assert renumber_map.renumbered_dst_col_name != "dst"
    assert renumber_map.renumbered_src_col_name in renumbered_df.columns
    assert renumber_map.renumbered_dst_col_name in renumbered_df.columns

    # test experimental renumbering code path
    gdf = cudf.DataFrame({"src": numbers,
                          "dst": offset_numbers,
                          "weights": floats,
                          "col_a": sources,
                          "col_b": destinations})
    ddf = dask.dataframe.from_pandas(
        gdf, npartitions=len(dask_client.scheduler_info()['workers']))

    renumbered_df, renumber_map = NumberMap.renumber(ddf, "col_a", "col_b")

    assert renumber_map.renumbered_src_col_name != "src"
    assert renumber_map.renumbered_dst_col_name != "dst"
    assert renumber_map.renumbered_src_col_name in renumbered_df.columns
    assert renumber_map.renumbered_dst_col_name in renumbered_df.columns
def test_mg_renumber(graph_file, client_connection):
    gc.collect()

    M = utils.read_csv_for_nx(graph_file)
    sources = cudf.Series(M["0"])
    destinations = cudf.Series(M["1"])

    translate = 1000

    gdf = cudf.DataFrame()
    gdf["src_old"] = sources
    gdf["dst_old"] = destinations
    gdf["src"] = sources + translate
    gdf["dst"] = destinations + translate

    ddf = dask.dataframe.from_pandas(gdf, npartitions=2)

    numbering = NumberMap()
    numbering.from_dataframe(ddf, ["src", "src_old"], ["dst", "dst_old"])
    renumbered_df = numbering.add_internal_vertex_id(
        numbering.add_internal_vertex_id(ddf, "src_id", ["src", "src_old"]),
        "dst_id",
        ["dst", "dst_old"],
    )

    check_src = numbering.from_internal_vertex_id(renumbered_df,
                                                  "src_id").compute()
    check_dst = numbering.from_internal_vertex_id(renumbered_df,
                                                  "dst_id").compute()

    assert check_src["0"].to_pandas().equals(check_src["src"].to_pandas())
    assert check_src["1"].to_pandas().equals(check_src["src_old"].to_pandas())
    assert check_dst["0"].to_pandas().equals(check_dst["dst"].to_pandas())
    assert check_dst["1"].to_pandas().equals(check_dst["dst_old"].to_pandas())
def test_renumber_files_col(graph_file):
    gc.collect()

    M = utils.read_csv_for_nx(graph_file)
    sources = cudf.Series(M["0"])
    destinations = cudf.Series(M["1"])

    translate = 1000

    gdf = cudf.DataFrame()
    gdf["src"] = cudf.Series([x + translate for x in sources.values_host])
    gdf["dst"] = cudf.Series([x + translate
                              for x in destinations.values_host])

    exp_src = cudf.Series([x + translate for x in sources.values_host])
    exp_dst = cudf.Series([x + translate for x in destinations.values_host])

    renumbered_df, renumber_map = NumberMap.renumber(
        gdf, ["src"], ["dst"], preserve_order=True
    )

    unrenumbered_df = renumber_map.unrenumber(
        renumbered_df, renumber_map.renumbered_src_col_name,
        preserve_order=True
    )
    unrenumbered_df = renumber_map.unrenumber(
        unrenumbered_df, renumber_map.renumbered_dst_col_name,
        preserve_order=True
    )

    assert_series_equal(exp_src,
                        unrenumbered_df[renumber_map.renumbered_src_col_name],
                        check_names=False)
    assert_series_equal(exp_dst,
                        unrenumbered_df[renumber_map.renumbered_dst_col_name],
                        check_names=False)
def test_mg_renumber3(graph_file, client_connection):
    gc.collect()

    M = utils.read_csv_for_nx(graph_file)
    sources = cudf.Series(M["0"])
    destinations = cudf.Series(M["1"])

    translate = 1000

    gdf = cudf.DataFrame()
    gdf["src_old"] = sources
    gdf["dst_old"] = destinations
    gdf["src"] = sources + translate
    gdf["dst"] = destinations + translate
    # np.float is a removed numpy alias; use np.float64 explicitly
    gdf["weight"] = gdf.index.astype(np.float64)

    ddf = dask.dataframe.from_pandas(gdf, npartitions=2)

    ren2, num2 = NumberMap.renumber(ddf,
                                    ["src", "src_old"],
                                    ["dst", "dst_old"])

    test_df = gdf[["src", "src_old"]].head()

    # This call raises an exception in branch-0.15 prior to this PR
    test_df = num2.add_internal_vertex_id(test_df, "src",
                                          ["src", "src_old"])
    assert True
def test_mg_renumber2(graph_file, client_connection):
    gc.collect()

    M = utils.read_csv_for_nx(graph_file)
    sources = cudf.Series(M["0"])
    destinations = cudf.Series(M["1"])

    translate = 1000

    gdf = cudf.DataFrame()
    gdf["src_old"] = sources
    gdf["dst_old"] = destinations
    gdf["src"] = sources + translate
    gdf["dst"] = destinations + translate
    # np.float is a removed numpy alias; use np.float64 explicitly
    gdf["weight"] = gdf.index.astype(np.float64)

    ddf = dask.dataframe.from_pandas(gdf, npartitions=2)

    ren2, num2 = NumberMap.renumber(ddf,
                                    ["src", "src_old"],
                                    ["dst", "dst_old"])

    check_src = num2.from_internal_vertex_id(ren2, "src").compute()
    check_src = check_src.sort_values("weight").reset_index(drop=True)
    check_dst = num2.from_internal_vertex_id(ren2, "dst").compute()
    check_dst = check_dst.sort_values("weight").reset_index(drop=True)

    assert check_src["0"].to_pandas().equals(gdf["src"].to_pandas())
    assert check_src["1"].to_pandas().equals(gdf["src_old"].to_pandas())
    assert check_dst["0"].to_pandas().equals(gdf["dst"].to_pandas())
    assert check_dst["1"].to_pandas().equals(gdf["dst_old"].to_pandas())
def test_mg_renumber_add_internal_vertex_id(graph_file, dask_client):
    M = utils.read_csv_for_nx(graph_file)
    sources = cudf.Series(M["0"])
    destinations = cudf.Series(M["1"])

    translate = 1000

    gdf = cudf.DataFrame()
    gdf["src_old"] = sources
    gdf["dst_old"] = destinations
    gdf["src"] = sources + translate
    gdf["dst"] = destinations + translate
    # np.float is a removed numpy alias; use np.float64 explicitly
    gdf["weight"] = gdf.index.astype(np.float64)

    ddf = dask.dataframe.from_pandas(
        gdf, npartitions=len(dask_client.scheduler_info()['workers']))

    ren2, num2 = NumberMap.renumber(ddf,
                                    ["src", "src_old"],
                                    ["dst", "dst_old"])

    test_df = gdf[["src", "src_old"]].head()

    # simply check that this does not raise an exception
    num2.add_internal_vertex_id(test_df,
                                num2.renumbered_src_col_name,
                                ["src", "src_old"])
def test_renumber_ips_str_cols():
    source_list = [
        "192.168.1.1",
        "172.217.5.238",
        "216.228.121.209",
        "192.16.31.23",
    ]
    dest_list = [
        "172.217.5.238",
        "216.228.121.209",
        "192.16.31.23",
        "192.168.1.1",
    ]

    pdf = pd.DataFrame({"source_list": source_list, "dest_list": dest_list})
    gdf = cudf.from_pandas(pdf)

    # renumber directly on the string columns; gdf has no *_as_int columns
    renumbered_gdf, renumber_map = NumberMap.renumber(gdf,
                                                      ["source_list"],
                                                      ["dest_list"])

    check_src = renumber_map.from_internal_vertex_id(
        renumbered_gdf['src'])["0"]
    check_dst = renumber_map.from_internal_vertex_id(
        renumbered_gdf['dst'])["0"]

    assert check_src.equals(gdf["source_list"])
    assert check_dst.equals(gdf["dest_list"])
def test_renumber_files_multi_col(graph_file):
    gc.collect()

    M = utils.read_csv_for_nx(graph_file)
    sources = cudf.Series(M["0"])
    destinations = cudf.Series(M["1"])

    translate = 1000

    gdf = cudf.DataFrame()
    gdf["src_old"] = sources
    gdf["dst_old"] = destinations
    gdf["src"] = sources + translate
    gdf["dst"] = destinations + translate

    renumbered_df, renumber_map = NumberMap.renumber(gdf,
                                                     ["src", "src_old"],
                                                     ["dst", "dst_old"],
                                                     preserve_order=True)

    unrenumbered_df = renumber_map.unrenumber(renumbered_df, "src",
                                              preserve_order=True)
    unrenumbered_df = renumber_map.unrenumber(unrenumbered_df, "dst",
                                              preserve_order=True)

    assert gdf["src"].equals(unrenumbered_df["0_src"])
    assert gdf["src_old"].equals(unrenumbered_df["1_src"])
    assert gdf["dst"].equals(unrenumbered_df["0_dst"])
    assert gdf["dst_old"].equals(unrenumbered_df["1_dst"])
def test_renumber_files(graph_file):
    gc.collect()

    M = utils.read_csv_for_nx(graph_file)
    sources = cudf.Series(M["0"])
    destinations = cudf.Series(M["1"])

    translate = 1000

    df = cudf.DataFrame()
    df["src"] = cudf.Series([x + translate for x in sources.values_host])
    df["dst"] = cudf.Series([x + translate
                             for x in destinations.values_host])

    exp_src = cudf.Series([x + translate for x in sources.values_host])
    exp_dst = cudf.Series([x + translate for x in destinations.values_host])

    renumbered_df, renumber_map = NumberMap.renumber(df, "src", "dst",
                                                     preserve_order=True)

    unrenumbered_df = renumber_map.unrenumber(renumbered_df, "src",
                                              preserve_order=True)
    unrenumbered_df = renumber_map.unrenumber(unrenumbered_df, "dst",
                                              preserve_order=True)

    assert exp_src.equals(unrenumbered_df["src"])
    assert exp_dst.equals(unrenumbered_df["dst"])
def test_renumber_files_multi_col(graph_file):
    gc.collect()

    M = utils.read_csv_for_nx(graph_file)
    sources = cudf.Series(M["0"])
    destinations = cudf.Series(M["1"])

    translate = 1000

    gdf = cudf.DataFrame()
    gdf["src_old"] = sources
    gdf["dst_old"] = destinations
    gdf["src"] = sources + translate
    gdf["dst"] = destinations + translate

    numbering = NumberMap()
    numbering.from_dataframe(gdf, ["src", "src_old"], ["dst", "dst_old"])

    renumbered_df = numbering.add_internal_vertex_id(
        numbering.add_internal_vertex_id(gdf, "src_id", ["src", "src_old"]),
        "dst_id",
        ["dst", "dst_old"],
    )

    check_src = numbering.from_internal_vertex_id(renumbered_df, "src_id")
    check_dst = numbering.from_internal_vertex_id(renumbered_df, "dst_id")

    assert check_src["src"].equals(check_src["0"])
    assert check_src["src_old"].equals(check_src["1"])
    assert check_dst["dst"].equals(check_dst["0"])
    assert check_dst["dst_old"].equals(check_dst["1"])
def test_renumber_ips_str_cols():
    source_list = [
        "192.168.1.1",
        "172.217.5.238",
        "216.228.121.209",
        "192.16.31.23",
    ]
    dest_list = [
        "172.217.5.238",
        "216.228.121.209",
        "192.16.31.23",
        "192.168.1.1",
    ]

    pdf = pd.DataFrame({"source_list": source_list, "dest_list": dest_list})
    gdf = cudf.from_pandas(pdf)

    numbering = NumberMap()
    numbering.from_dataframe(gdf, ["source_list"], ["dest_list"])

    src = numbering.to_internal_vertex_id(gdf["source_list"])
    dst = numbering.to_internal_vertex_id(gdf["dest_list"])

    check_src = numbering.from_internal_vertex_id(src)["0"]
    check_dst = numbering.from_internal_vertex_id(dst)["0"]

    assert check_src.equals(gdf["source_list"])
    assert check_dst.equals(gdf["dest_list"])
def test_renumber_files_col(graph_file):
    gc.collect()

    M = utils.read_csv_for_nx(graph_file)
    sources = cudf.Series(M["0"])
    destinations = cudf.Series(M["1"])

    translate = 1000

    gdf = cudf.DataFrame()
    gdf['src'] = cudf.Series([x + translate for x in sources.values_host])
    gdf['dst'] = cudf.Series([x + translate
                              for x in destinations.values_host])

    numbering = NumberMap()
    numbering.from_dataframe(gdf, ["src"], ["dst"])

    renumbered_df = numbering.add_internal_vertex_id(
        numbering.add_internal_vertex_id(gdf, "src_id", ["src"]),
        "dst_id",
        ["dst"])

    check_src = numbering.from_internal_vertex_id(renumbered_df, "src_id")
    check_dst = numbering.from_internal_vertex_id(renumbered_df, "dst_id")

    assert check_src["src"].equals(check_src["0"])
    assert check_dst["dst"].equals(check_dst["0"])
def compute_renumber_edge_list(self, transposed=False):
    """
    Compute a renumbered edge list

    This function works in the MNMG pipeline and will transform
    the input dask_cudf.DataFrame into a renumbered edge list
    in the prescribed direction.

    This function will be called by the algorithms to ensure
    that the graph is renumbered properly.  The graph object will
    cache the most recent renumbering attempt.  For benchmarking
    purposes, this function can be called prior to calling a
    graph algorithm so we can measure the cost of computing
    the renumbering separately from the cost of executing the
    algorithm.

    When creating a CSR-like structure, set transposed to False.
    When creating a CSC-like structure, set transposed to True.

    Parameters
    ----------
    transposed : (optional) bool
        If True, renumber with the intent to make a CSC-like structure.
        If False, renumber with the intent to make a CSR-like structure.
        Defaults to False.
    """
    # FIXME: What to do about edge_attr???
    #        currently ignored for MNMG

    if not self.distributed:
        raise Exception(
            "compute_renumber_edge_list should only be used "
            "for distributed graphs"
        )

    if not self.renumbered:
        self.edgelist = self.EdgeList(self.input_df)
        self.renumber_map = None
    else:
        if self.edgelist is not None:
            if type(self) is Graph:
                return

            if self.store_transposed == transposed:
                return

            del self.edgelist

        renumbered_ddf, number_map = NumberMap.renumber(
            self.input_df,
            self.source_columns,
            self.destination_columns,
            store_transposed=transposed
        )
        self.edgelist = self.EdgeList(renumbered_ddf)
        self.renumber_map = number_map
        self.store_transposed = transposed
def from_dask_cudf_edgelist(self, input_ddf, source='source',
                            destination='destination',
                            edge_attr=None, renumber=True):
    """
    Initializes the distributed graph from the dask_cudf.DataFrame
    edgelist.  Undirected Graphs are not currently supported.

    By default, renumbering is enabled to map the source and destination
    vertices into an index in the range [0, V) where V is the number
    of vertices.  If the input vertices are a single column of integers
    in the range [0, V), renumbering can be disabled and the original
    external vertex ids will be used.

    Parameters
    ----------
    input_ddf : dask_cudf.DataFrame
        The edgelist as a dask_cudf.DataFrame
    source : str
        source argument is source column name
    destination : str
        destination argument is destination column name.
    edge_attr : str
        edge_attr argument is the weights column name.
    renumber : bool
        If source and destination indices are not in range 0 to V where
        V is number of vertices, renumber argument should be True.
    """
    if self.edgelist is not None or self.adjlist is not None:
        raise Exception('Graph already has values')
    if type(self) is Graph:
        raise Exception('Undirected distributed graph not supported')

    if isinstance(input_ddf, dask_cudf.DataFrame):
        self.distributed = True
        self.local_data = None
        rename_map = {source: 'src', destination: 'dst'}
        if edge_attr is not None:
            rename_map[edge_attr] = 'weights'
        input_ddf = input_ddf.rename(columns=rename_map)
        if renumber:
            renumbered_ddf, number_map = NumberMap.renumber(
                input_ddf, "src", "dst")
            self.edgelist = self.EdgeList(renumbered_ddf)
            self.renumber_map = number_map
            self.renumbered = True
        else:
            self.edgelist = self.EdgeList(input_ddf)
            self.renumber_map = None
            self.renumbered = False
    else:
        raise Exception('input should be a dask_cudf dataFrame')
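# ---------------------------------------------------------------------------
# Minimal usage sketch for from_dask_cudf_edgelist (not from the source).
# A Dask client and cugraph's communicator are assumed to already be
# initialized; the CSV path and column names below are made up.
import dask_cudf
import cugraph

ddf = dask_cudf.read_csv("edges.csv", names=["src", "dst", "wt"],
                         dtype=["int32", "int32", "float32"])

# Undirected distributed graphs are rejected above, so use a DiGraph.
# renumber=True (the default) maps arbitrary vertex ids into [0, V).
G = cugraph.DiGraph()
G.from_dask_cudf_edgelist(ddf, source="src", destination="dst",
                          edge_attr="wt", renumber=True)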
def test_mg_renumber(graph_file, dask_client):
    M = utils.read_csv_for_nx(graph_file)
    sources = cudf.Series(M["0"])
    destinations = cudf.Series(M["1"])

    translate = 1000

    gdf = cudf.DataFrame()
    gdf["src_old"] = sources
    gdf["dst_old"] = destinations
    gdf["src"] = sources + translate
    gdf["dst"] = destinations + translate
    ddf = dask.dataframe.from_pandas(
        gdf, npartitions=len(dask_client.scheduler_info()['workers']))

    # preserve_order is not supported for MG
    renumbered_df, renumber_map = NumberMap.renumber(ddf,
                                                     ["src", "src_old"],
                                                     ["dst", "dst_old"],
                                                     preserve_order=False)
    unrenumbered_df = renumber_map.unrenumber(
        renumbered_df, renumber_map.renumbered_src_col_name,
        preserve_order=False)
    unrenumbered_df = renumber_map.unrenumber(
        unrenumbered_df, renumber_map.renumbered_dst_col_name,
        preserve_order=False)

    # sort needed only for comparisons, since preserve_order is False
    gdf = gdf.sort_values(by=["src", "src_old", "dst", "dst_old"])
    gdf = gdf.reset_index()
    unrenumbered_df = unrenumbered_df.compute()
    src = renumber_map.renumbered_src_col_name
    dst = renumber_map.renumbered_dst_col_name
    unrenumbered_df = unrenumbered_df.sort_values(
        by=[f"0_{src}", f"1_{src}", f"0_{dst}", f"1_{dst}"])
    unrenumbered_df = unrenumbered_df.reset_index()

    assert_series_equal(gdf["src"], unrenumbered_df[f"0_{src}"],
                        check_names=False)
    assert_series_equal(gdf["src_old"], unrenumbered_df[f"1_{src}"],
                        check_names=False)
    assert_series_equal(gdf["dst"], unrenumbered_df[f"0_{dst}"],
                        check_names=False)
    assert_series_equal(gdf["dst_old"], unrenumbered_df[f"1_{dst}"],
                        check_names=False)
def compute_renumber_edge_list(self, transposed=False):
    """
    Compute a renumbered edge list

    This function works in the MNMG pipeline and will transform
    the input dask_cudf.DataFrame into a renumbered edge list
    in the prescribed direction.

    This function will be called by the algorithms to ensure
    that the graph is renumbered properly.  The graph object will
    cache the most recent renumbering attempt.  For benchmarking
    purposes, this function can be called prior to calling a
    graph algorithm so we can measure the cost of computing
    the renumbering separately from the cost of executing the
    algorithm.

    When creating a CSR-like structure, set transposed to False.
    When creating a CSC-like structure, set transposed to True.

    Parameters
    ----------
    transposed : (optional) bool
        If True, renumber with the intent to make a CSC-like structure.
        If False, renumber with the intent to make a CSR-like structure.
        Defaults to False.
    """
    # FIXME: What to do about edge_attr???
    #        currently ignored for MNMG

    # FIXME: this is confusing - in the code below,
    #        self.properties.renumbered needs to be interpreted as
    #        "needs to be renumbered", everywhere else it means
    #        "has been renumbered".
    if not self.properties.renumbered:
        self.edgelist = self.EdgeList(self.input_df)
        self.renumber_map = None
    else:
        if self.edgelist is not None:
            if self.properties.directed is False:
                return

            if self.properties.store_transposed == transposed:
                return

            del self.edgelist

        renumbered_ddf, number_map, aggregate_segment_offsets = \
            NumberMap.renumber_and_segment(self.input_df,
                                           self.source_columns,
                                           self.destination_columns,
                                           store_transposed=transposed)
        self.edgelist = self.EdgeList(renumbered_ddf)
        self.renumber_map = number_map
        self.aggregate_segment_offsets = aggregate_segment_offsets
        self.properties.store_transposed = transposed
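# ---------------------------------------------------------------------------
# Benchmarking sketch (not from the source): as the docstring notes,
# renumbering can be triggered ahead of an algorithm so its cost is timed
# separately.  "G" is assumed to be a distributed graph built as in the
# from_dask_cudf_edgelist sketch above; whether MG PageRank is reached via
# cugraph.dask.pagerank depends on the release.
import time
import cugraph.dask as dcg

t0 = time.time()
G.compute_renumber_edge_list(transposed=False)  # CSR-like orientation
print(f"renumbering: {time.time() - t0:.3f} s")

t0 = time.time()
pr_df = dcg.pagerank(G)                         # renumbering already cached
print(f"pagerank:    {time.time() - t0:.3f} s")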
def test_renumber_ips_cols():
    source_list = [
        "192.168.1.1",
        "172.217.5.238",
        "216.228.121.209",
        "192.16.31.23",
    ]
    dest_list = [
        "172.217.5.238",
        "216.228.121.209",
        "192.16.31.23",
        "192.168.1.1",
    ]

    pdf = pd.DataFrame({"source_list": source_list, "dest_list": dest_list})
    gdf = cudf.from_pandas(pdf)

    gdf["source_as_int"] = gdf["source_list"].str.ip2int()
    gdf["dest_as_int"] = gdf["dest_list"].str.ip2int()

    renumbered_gdf, renumber_map = NumberMap.renumber(
        gdf, ["source_as_int"], ["dest_as_int"], preserve_order=True
    )

    input_check = renumbered_gdf.merge(gdf, on=["source_list", "dest_list"])

    output_check = renumber_map.from_internal_vertex_id(
        renumbered_gdf, renumber_map.renumbered_src_col_name,
        external_column_names=["check_src"]
    )
    output_check = renumber_map.from_internal_vertex_id(
        output_check, renumber_map.renumbered_dst_col_name,
        external_column_names=["check_dst"]
    )

    merged = output_check.merge(input_check, on=["source_list", "dest_list"])

    assert_series_equal(
        merged["check_src"], merged["source_as_int"], check_names=False
    )
    assert_series_equal(
        merged["check_dst"], merged["dest_as_int"], check_names=False
    )
def test_renumber_negative_col():
    source_list = [4, 6, 8, -20, 1]
    dest_list = [1, 29, 35, 0, 77]

    df = pd.DataFrame({"source_list": source_list, "dest_list": dest_list})

    gdf = cudf.DataFrame.from_pandas(df[["source_list", "dest_list"]])

    renumbered_gdf, renumber_map = NumberMap.renumber(gdf,
                                                      "source_list",
                                                      "dest_list")

    check_src = renumber_map.from_internal_vertex_id(
        renumbered_gdf['src'])["0"]
    check_dst = renumber_map.from_internal_vertex_id(
        renumbered_gdf['dst'])["0"]

    assert check_src.equals(gdf["source_list"])
    assert check_dst.equals(gdf["dest_list"])
def test_renumber_unrenumber_non_default_vert_names():
    """
    Test that renumbering a dataframe with generated src/dst column names
    can be used for unrenumbering results.
    """
    input_gdf = cudf.DataFrame({"dst": [1, 2, 3],
                                "weights": [0.1, 0.2, 0.3],
                                "col_a": [99, 199, 2],
                                "col_b": [199, 2, 32]})
    renumbered_df, number_map = NumberMap.renumber(input_gdf,
                                                   "col_a", "col_b")

    some_result_gdf = cudf.DataFrame({"vertex": [0, 1, 2, 3]})
    expected_values = [99, 199, 2, 32]
    some_result_gdf = number_map.unrenumber(some_result_gdf, "vertex")

    assert sorted(expected_values) == \
        sorted(some_result_gdf["vertex"].to_arrow().to_pylist())
def test_renumber_files_multi_col(graph_file):
    gc.collect()

    M = utils.read_csv_for_nx(graph_file)
    sources = cudf.Series(M["0"])
    destinations = cudf.Series(M["1"])

    translate = 1000

    gdf = cudf.DataFrame()
    gdf["src_old"] = sources
    gdf["dst_old"] = destinations
    gdf["src"] = sources + translate
    gdf["dst"] = destinations + translate

    renumbered_df, renumber_map = NumberMap.renumber(
        gdf, ["src", "src_old"], ["dst", "dst_old"], preserve_order=True
    )

    unrenumbered_df = renumber_map.unrenumber(
        renumbered_df, renumber_map.renumbered_src_col_name,
        preserve_order=True
    )
    unrenumbered_df = renumber_map.unrenumber(
        unrenumbered_df, renumber_map.renumbered_dst_col_name,
        preserve_order=True
    )

    src = renumber_map.renumbered_src_col_name
    dst = renumber_map.renumbered_dst_col_name

    assert_series_equal(
        gdf["src"], unrenumbered_df[f"0_{src}"], check_names=False
    )
    assert_series_equal(
        gdf["src_old"], unrenumbered_df[f"1_{src}"], check_names=False
    )
    assert_series_equal(
        gdf["dst"], unrenumbered_df[f"0_{dst}"], check_names=False
    )
    assert_series_equal(
        gdf["dst_old"], unrenumbered_df[f"1_{dst}"], check_names=False
    )
def test_renumber_negative_col():
    source_list = [4, 6, 8, -20, 1]
    dest_list = [1, 29, 35, 0, 77]

    df = pd.DataFrame({"source_list": source_list, "dest_list": dest_list})

    gdf = cudf.DataFrame.from_pandas(df[["source_list", "dest_list"]])

    numbering = NumberMap()
    numbering.from_dataframe(gdf, ["source_list"], ["dest_list"])

    src = numbering.to_internal_vertex_id(gdf["source_list"])
    dst = numbering.to_internal_vertex_id(gdf["dest_list"])

    check_src = numbering.from_internal_vertex_id(src)["0"]
    check_dst = numbering.from_internal_vertex_id(dst)["0"]

    assert check_src.equals(gdf["source_list"])
    assert check_dst.equals(gdf["dest_list"])
def test_mg_renumber(graph_file, client_connection):
    gc.collect()

    M = utils.read_csv_for_nx(graph_file)
    sources = cudf.Series(M["0"])
    destinations = cudf.Series(M["1"])

    translate = 1000

    gdf = cudf.DataFrame()
    gdf["src_old"] = sources
    gdf["dst_old"] = destinations
    gdf["src"] = sources + translate
    gdf["dst"] = destinations + translate
    ddf = dask.dataframe.from_pandas(gdf, npartitions=2)

    # preserve_order is not supported for MG
    renumbered_df, renumber_map = NumberMap.renumber(ddf,
                                                     ["src", "src_old"],
                                                     ["dst", "dst_old"],
                                                     preserve_order=False)
    unrenumbered_df = renumber_map.unrenumber(renumbered_df, "src",
                                              preserve_order=False)
    unrenumbered_df = renumber_map.unrenumber(unrenumbered_df, "dst",
                                              preserve_order=False)

    # sort needed only for comparisons, since preserve_order is False
    gdf = gdf.sort_values(by=["src", "src_old", "dst", "dst_old"])
    gdf = gdf.reset_index()
    unrenumbered_df = unrenumbered_df.compute()
    unrenumbered_df = unrenumbered_df.sort_values(
        by=["0_src", "1_src", "0_dst", "1_dst"])
    unrenumbered_df = unrenumbered_df.reset_index()

    assert gdf["src"].equals(unrenumbered_df["0_src"])
    assert gdf["src_old"].equals(unrenumbered_df["1_src"])
    assert gdf["dst"].equals(unrenumbered_df["0_dst"])
    assert gdf["dst_old"].equals(unrenumbered_df["1_dst"])
def test_renumber_negative_col():
    source_list = [4, 6, 8, -20, 1]
    dest_list = [1, 29, 35, 0, 77]

    df = pd.DataFrame({"source_list": source_list, "dest_list": dest_list})

    gdf = cudf.DataFrame.from_pandas(df[["source_list", "dest_list"]])
    gdf["original_src"] = gdf["source_list"]
    gdf["original_dst"] = gdf["dest_list"]

    renumbered_gdf, renumber_map = NumberMap.renumber(
        gdf, ["source_list"], ["dest_list"], preserve_order=True
    )

    input_check = renumbered_gdf.merge(
        gdf, on=["original_src", "original_dst"]
    )

    output_check = renumber_map.from_internal_vertex_id(
        renumbered_gdf, renumber_map.renumbered_src_col_name,
        external_column_names=["check_src"]
    )
    output_check = renumber_map.from_internal_vertex_id(
        output_check, renumber_map.renumbered_dst_col_name,
        external_column_names=["check_dst"]
    )

    merged = output_check.merge(
        input_check, on=["original_src", "original_dst"]
    )

    assert_series_equal(
        merged["check_src"], merged["original_src"], check_names=False
    )
    assert_series_equal(
        merged["check_dst"], merged["original_dst"], check_names=False
    )
def from_cudf_edgelist(
    self,
    input_df,
    source="source",
    destination="destination",
    edge_attr=None,
    renumber=True,
):
    """
    Initialize a graph from the edge list. It is an error to call this
    method on an initialized Graph object. The passed input_df argument
    wraps gdf_column objects that represent a graph using the edge list
    format. source argument is source column name and destination
    argument is destination column name.

    By default, renumbering is enabled to map the source and destination
    vertices into an index in the range [0, V) where V is the number
    of vertices.  If the input vertices are a single column of integers
    in the range [0, V), renumbering can be disabled and the original
    external vertex ids will be used.

    If weights are present, edge_attr argument is the weights column name.

    Parameters
    ----------
    input_df : cudf.DataFrame or dask_cudf.DataFrame
        This cudf.DataFrame wraps source, destination and weight
        gdf_column of size E (E: number of edges).
        The 'src' column contains the source index for each edge.
        Source indices are in the range [0, V) (V: number of vertices).
        The 'dst' column contains the destination index for each edge.
        Destination indices are in the range [0, V) (V: number of
        vertices).  If renumbering needs to be done, renumber argument
        should be passed as True.
        For weighted graphs, dataframe contains 'weight' column
        containing the weight value for each edge.
        If a dask_cudf.DataFrame is passed it will be reinterpreted as
        a cudf.DataFrame. For the distributed path please use
        from_dask_cudf_edgelist.
    source : str
        source argument is source column name
    destination : str
        destination argument is destination column name.
    edge_attr : str
        edge_attr argument is the weights column name.
    renumber : bool
        If source and destination indices are not in range 0 to V where
        V is number of vertices, renumber argument should be True.

    Examples
    --------
    >>> df = cudf.read_csv('datasets/karate.csv', delimiter=' ',
    >>>                    dtype=['int32', 'int32', 'float32'],
    >>>                    header=None)
    >>> G = cugraph.Graph()
    >>> G.from_cudf_edgelist(df, source='0', destination='1',
    >>>                      edge_attr='2', renumber=False)
    """
    if self.edgelist is not None or self.adjlist is not None:
        raise Exception("Graph already has values")

    # Consolidation
    if isinstance(input_df, cudf.DataFrame):
        if len(input_df[source]) > 2147483100:
            raise Exception('cudf dataFrame edge list is too big '
                            'to fit in a single GPU')
        elist = input_df
    elif isinstance(input_df, dask_cudf.DataFrame):
        if len(input_df[source]) > 2147483100:
            raise Exception('dask_cudf dataFrame edge list is too big '
                            'to fit in a single GPU')
        elist = input_df.compute().reset_index(drop=True)
    else:
        raise Exception('input should be a cudf.DataFrame or '
                        'a dask_cudf dataFrame')

    renumber_map = None
    if renumber:
        # FIXME: Should SG do lazy evaluation like MG?
        elist, renumber_map = NumberMap.renumber(
            elist, source, destination, store_transposed=False
        )
        source = 'src'
        destination = 'dst'
        self.renumbered = True
        self.renumber_map = renumber_map
    else:
        if type(source) is list and type(destination) is list:
            raise Exception('set renumber to True for multi column ids')

    source_col = elist[source]
    dest_col = elist[destination]

    if self.multi:
        if type(edge_attr) is not list:
            raise Exception("edge_attr should be a list of column names")
        value_col = {}
        for col_name in edge_attr:
            value_col[col_name] = elist[col_name]
    elif edge_attr is not None:
        value_col = elist[edge_attr]
    else:
        value_col = None

    if not self.symmetrized and not self.multi:
        if value_col is not None:
            source_col, dest_col, value_col = symmetrize(
                source_col, dest_col, value_col
            )
        else:
            source_col, dest_col = symmetrize(source_col, dest_col)

    self.edgelist = Graph.EdgeList(source_col, dest_col, value_col)

    if self.batch_enabled:
        self._replicate_edgelist()

    self.renumber_map = renumber_map
def __from_edgelist(
    self,
    input_df,
    source="source",
    destination="destination",
    edge_attr=None,
    renumber=True,
):
    # Verify column names present in input DataFrame
    s_col = source
    d_col = destination
    if not isinstance(s_col, list):
        s_col = [s_col]
    if not isinstance(d_col, list):
        d_col = [d_col]
    if not (set(s_col).issubset(set(input_df.columns)) and
            set(d_col).issubset(set(input_df.columns))):
        # FIXME: Raise concrete Exceptions
        raise Exception("source column names and/or destination column "
                        "names not found in input. Recheck the source and "
                        "destination parameters")

    # FIXME: check if the consolidated graph fits on the
    #        device before gathering all the edge lists

    # Consolidation
    if isinstance(input_df, cudf.DataFrame):
        if len(input_df[source]) > 2147483100:
            raise Exception("cudf dataFrame edge list is too big "
                            "to fit in a single GPU")
        elist = input_df
    elif isinstance(input_df, dask_cudf.DataFrame):
        if len(input_df[source]) > 2147483100:
            raise Exception("dask_cudf dataFrame edge list is too big "
                            "to fit in a single GPU")
        elist = input_df.compute().reset_index(drop=True)
    else:
        raise Exception("input should be a cudf.DataFrame or "
                        "a dask_cudf dataFrame")

    # Renumbering
    self.renumber_map = None
    if renumber:
        # FIXME: Should SG do lazy evaluation like MG?
        elist, renumber_map = NumberMap.renumber(elist, source,
                                                 destination,
                                                 store_transposed=False)
        source = "src"
        destination = "dst"
        self.properties.renumbered = True
        self.renumber_map = renumber_map
    else:
        if type(source) is list and type(destination) is list:
            raise Exception("set renumber to True for multi column ids")

    # Populate graph edgelist
    source_col = elist[source]
    dest_col = elist[destination]

    if edge_attr is not None:
        self.weighted = True
        value_col = elist[edge_attr]
    else:
        value_col = None

    # TODO: Update Symmetrize to work on Graph and/or DataFrame
    if value_col is not None:
        source_col, dest_col, value_col = symmetrize(
            source_col, dest_col, value_col,
            multi=self.properties.multi_edge,
            symmetrize=not self.properties.directed)
        if isinstance(value_col, cudf.DataFrame):
            value_dict = {}
            for i in value_col.columns:
                value_dict[i] = value_col[i]
            value_col = value_dict
    else:
        source_col, dest_col = symmetrize(
            source_col, dest_col,
            multi=self.properties.multi_edge,
            symmetrize=not self.properties.directed)

    self.edgelist = simpleGraphImpl.EdgeList(source_col, dest_col,
                                             value_col)

    if self.batch_enabled:
        self._replicate_edgelist()
def test_renumber_series(graph_file):
    gc.collect()

    M = utils.read_csv_for_nx(graph_file)
    sources = cudf.Series(M["0"])
    destinations = cudf.Series(M["1"])

    translate = 1000

    df = cudf.DataFrame()
    df["src"] = cudf.Series([x + translate for x in sources.values_host])
    df["dst"] = cudf.Series([x + translate
                             for x in destinations.values_host])

    numbering_series_1 = NumberMap()
    numbering_series_1.from_series(df["src"])

    numbering_series_2 = NumberMap()
    numbering_series_2.from_series(df["dst"])

    renumbered_src = numbering_series_1.add_internal_vertex_id(
        df["src"], "src_id")
    renumbered_dst = numbering_series_2.add_internal_vertex_id(
        df["dst"], "dst_id")

    check_src = numbering_series_1.from_internal_vertex_id(
        renumbered_src, "src_id")
    check_dst = numbering_series_2.from_internal_vertex_id(
        renumbered_dst, "dst_id")

    assert check_src["0_y"].equals(check_src["0_x"])
    assert check_dst["0_y"].equals(check_dst["0_x"])
def get_traversed_cost(df, source, source_col, dest_col, value_col):
    """
    Take the DataFrame result from a BFS or SSSP function call and sum
    the given weights along the path to the starting vertex.
    The source_col, dest_col identifiers need to match with the vertex
    and predecessor columns of df.

    Parameters
    ----------
    df : cudf.DataFrame
        The dataframe containing the results of a BFS or SSSP call
    source : int
        Index of the source vertex.
    source_col : cudf.Series
        This cudf.Series wraps a gdf_column of size E (E: number of
        edges).  The gdf column contains the source index for each edge.
        Source indices must be an integer type.
    dest_col : cudf.Series
        This cudf.Series wraps a gdf_column of size E (E: number of
        edges).  The gdf column contains the destination index for each
        edge.  Destination indices must be an integer type.
    value_col : cudf.Series
        This cudf.Series wraps a gdf_column of size E (E: number of
        edges).  The gdf column contains values associated with this
        edge.  Weight should be a floating type.

    Returns
    -------
    df : cudf.DataFrame
        DataFrame containing two columns 'vertex' and 'info'.
        Unreachable vertices will have value the max value of the
        weight type.
    """
    if 'vertex' not in df.columns:
        raise ValueError("DataFrame does not appear to be a BFS or "
                         "SSSP result - 'vertex' column missing")
    if 'distance' not in df.columns:
        raise ValueError("DataFrame does not appear to be a BFS or "
                         "SSSP result - 'distance' column missing")
    if 'predecessor' not in df.columns:
        raise ValueError("DataFrame does not appear to be a BFS or "
                         "SSSP result - 'predecessor' column missing")

    src, dst, val = symmetrize(source_col, dest_col, value_col)

    symmetrized_df = cudf.DataFrame()
    symmetrized_df['source'] = src
    symmetrized_df['destination'] = dst
    symmetrized_df['weights'] = val

    input_df = df.merge(symmetrized_df,
                        left_on=['vertex', 'predecessor'],
                        right_on=['source', 'destination'],
                        how="left")

    # Set unreachable vertex weights to max float and source vertex
    # weight to 0
    max_val = np.finfo(val.dtype).max
    input_df[['weights']] = input_df[['weights']].fillna(max_val)
    input_df.loc[input_df['vertex'] == source, 'weights'] = 0

    # Renumber
    renumbered_gdf, renumber_map = NumberMap.renumber(input_df,
                                                      ["vertex"],
                                                      ["predecessor"],
                                                      preserve_order=True)
    renumbered_gdf = renumbered_gdf.rename(columns={'src': 'vertex',
                                                    'dst': 'predecessor'})
    stop_vertex = renumber_map.to_internal_vertex_id(
        cudf.Series(-1)).values[0]

    out_df = path_retrieval_wrapper.get_traversed_cost(renumbered_gdf,
                                                       stop_vertex)

    # Unrenumber
    out_df['vertex'] = renumber_map.unrenumber(renumbered_gdf, 'vertex',
                                               preserve_order=True)["vertex"]
    return out_df
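# ---------------------------------------------------------------------------
# Usage sketch for get_traversed_cost (not from the source).  The tiny edge
# list and the single-GPU graph construction are made up for illustration;
# cugraph.sssp produces the 'vertex'/'distance'/'predecessor' columns that
# the helper checks for above.
import cudf
import cugraph

edges = cudf.DataFrame({"src": [0, 0, 1, 2],
                        "dst": [1, 2, 3, 3],
                        "wt": [1.0, 4.0, 2.0, 1.0]})

G = cugraph.Graph()
G.from_cudf_edgelist(edges, source="src", destination="dst", edge_attr="wt")

sssp_df = cugraph.sssp(G, source=0)

# Sum the edge weights along each shortest path back to vertex 0.
cost_df = get_traversed_cost(sssp_df, 0,
                             edges["src"], edges["dst"], edges["wt"])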