Esempio n. 1
0
def test_renumber_common_col_names():
    """
    Ensure that commonly-used column names in the input do not conflict with
    names used internally by NumberMap.
    """

    def _assert_no_name_clash(result_df, nmap):
        # The generated internal column names must not collide with the
        # user's "src"/"dst" columns and must appear in the output.
        assert nmap.renumbered_src_col_name != "src"
        assert nmap.renumbered_dst_col_name != "dst"
        assert nmap.renumbered_src_col_name in result_df.columns
        assert nmap.renumbered_dst_col_name in result_df.columns

    # Multi-column vertices exercise the "legacy" renumbering code path.
    legacy_input = cudf.DataFrame({"src": [0, 1, 2],
                                   "dst": [1, 2, 3],
                                   "weights": [0.1, 0.2, 0.3],
                                   "col_a": [8, 1, 82],
                                   "col_b": [1, 82, 3],
                                   "col_c": [9, 7, 2],
                                   "col_d": [1, 2, 3]})
    result_df, nmap = NumberMap.renumber(
        legacy_input, ["col_a", "col_b"], ["col_c", "col_d"])
    _assert_no_name_clash(result_df, nmap)

    # Single-column vertices exercise the experimental renumbering path.
    experimental_input = cudf.DataFrame({"src": [0, 1, 2],
                                         "dst": [1, 2, 3],
                                         "weights": [0.1, 0.2, 0.3],
                                         "col_a": [0, 1, 2],
                                         "col_b": [1, 2, 3]})
    result_df, nmap = NumberMap.renumber(experimental_input,
                                         "col_a", "col_b")
    _assert_no_name_clash(result_df, nmap)
Esempio n. 2
0
def test_mg_renumber_common_col_names(graph_file, dask_client):
    """
    Ensure that commonly-used column names in the input do not conflict with
    names used internally by NumberMap.
    """
    M = utils.read_csv_for_nx(graph_file)
    srcs = cudf.Series(M["0"])
    dsts = cudf.Series(M["1"])

    seq = range(len(srcs))
    shifted = [i + 1 for i in seq]
    as_floats = [float(i) for i in seq]

    npartitions = len(dask_client.scheduler_info()['workers'])

    def _check(result_df, nmap):
        # Internal names must neither shadow "src"/"dst" nor be missing
        # from the renumbered output.
        assert nmap.renumbered_src_col_name != "src"
        assert nmap.renumbered_dst_col_name != "dst"
        assert nmap.renumbered_src_col_name in result_df.columns
        assert nmap.renumbered_dst_col_name in result_df.columns

    # Multi-column vertices -> "legacy" renumbering code path.
    gdf = cudf.DataFrame({
        "src": seq,
        "dst": seq,
        "weights": as_floats,
        "col_a": srcs,
        "col_b": srcs,
        "col_c": dsts,
        "col_d": dsts,
    })
    ddf = dask.dataframe.from_pandas(gdf, npartitions=npartitions)
    result_df, nmap = NumberMap.renumber(ddf, ["col_a", "col_b"],
                                         ["col_c", "col_d"])
    _check(result_df, nmap)

    # Single-column vertices -> experimental renumbering code path.
    gdf = cudf.DataFrame({
        "src": seq,
        "dst": shifted,
        "weights": as_floats,
        "col_a": srcs,
        "col_b": dsts,
    })
    ddf = dask.dataframe.from_pandas(gdf, npartitions=npartitions)
    result_df, nmap = NumberMap.renumber(ddf, "col_a", "col_b")
    _check(result_df, nmap)
Esempio n. 3
0
def test_mg_renumber(graph_file, client_connection):
    """
    Round-trip a multi-column MG renumbering: internal vertex ids added
    with add_internal_vertex_id() must map back to the original external
    vertex columns.
    """
    gc.collect()

    M = utils.read_csv_for_nx(graph_file)
    src_series = cudf.Series(M["0"])
    dst_series = cudf.Series(M["1"])

    shift = 1000

    gdf = cudf.DataFrame()
    gdf["src_old"] = src_series
    gdf["dst_old"] = dst_series
    gdf["src"] = src_series + shift
    gdf["dst"] = dst_series + shift

    ddf = dask.dataframe.from_pandas(gdf, npartitions=2)

    nmap = NumberMap()
    nmap.from_dataframe(ddf, ["src", "src_old"], ["dst", "dst_old"])

    with_src_ids = nmap.add_internal_vertex_id(ddf, "src_id",
                                               ["src", "src_old"])
    renumbered_df = nmap.add_internal_vertex_id(with_src_ids, "dst_id",
                                                ["dst", "dst_old"])

    check_src = nmap.from_internal_vertex_id(renumbered_df,
                                             "src_id").compute()
    check_dst = nmap.from_internal_vertex_id(renumbered_df,
                                             "dst_id").compute()

    # Multi-column externals come back as columns named "0" and "1".
    assert check_src["0"].to_pandas().equals(check_src["src"].to_pandas())
    assert check_src["1"].to_pandas().equals(check_src["src_old"].to_pandas())
    assert check_dst["0"].to_pandas().equals(check_dst["dst"].to_pandas())
    assert check_dst["1"].to_pandas().equals(check_dst["dst_old"].to_pandas())
Esempio n. 4
0
def test_renumber_files_col(graph_file):
    """
    Renumber single src/dst columns with preserve_order=True and verify
    that unrenumbering restores the original (translated) vertex values.
    """
    gc.collect()

    M = utils.read_csv_for_nx(graph_file)
    sources = cudf.Series(M["0"])
    destinations = cudf.Series(M["1"])

    shift = 1000

    gdf = cudf.DataFrame()
    gdf["src"] = cudf.Series([v + shift for v in sources.values_host])
    gdf["dst"] = cudf.Series([v + shift for v in destinations.values_host])

    expected_src = cudf.Series([v + shift for v in sources.values_host])
    expected_dst = cudf.Series([v + shift for v in destinations.values_host])

    renumbered_df, renumber_map = NumberMap.renumber(
        gdf, ["src"], ["dst"], preserve_order=True
    )

    # Undo the renumbering; preserve_order keeps rows aligned with the
    # expected series, so no sorting is needed before comparing.
    round_trip = renumber_map.unrenumber(
        renumbered_df, renumber_map.renumbered_src_col_name,
        preserve_order=True
    )
    round_trip = renumber_map.unrenumber(
        round_trip, renumber_map.renumbered_dst_col_name,
        preserve_order=True
    )

    assert_series_equal(expected_src,
                        round_trip[renumber_map.renumbered_src_col_name],
                        check_names=False)
    assert_series_equal(expected_dst,
                        round_trip[renumber_map.renumbered_dst_col_name],
                        check_names=False)
Esempio n. 5
0
def test_mg_renumber3(graph_file, client_connection):
    """
    Regression test: add_internal_vertex_id() on a multi-column MG
    renumbering must not raise when given a subset dataframe (this raised
    an exception in branch-0.15 prior to the fixing PR).
    """
    gc.collect()

    M = utils.read_csv_for_nx(graph_file)
    sources = cudf.Series(M["0"])
    destinations = cudf.Series(M["1"])

    translate = 1000

    gdf = cudf.DataFrame()
    gdf["src_old"] = sources
    gdf["dst_old"] = destinations
    gdf["src"] = sources + translate
    gdf["dst"] = destinations + translate
    # BUG FIX: np.float was deprecated in NumPy 1.20 and removed in 1.24;
    # use the explicit np.float64 dtype instead.
    gdf["weight"] = gdf.index.astype(np.float64)

    ddf = dask.dataframe.from_pandas(gdf, npartitions=2)

    ren2, num2 = NumberMap.renumber(ddf, ["src", "src_old"],
                                    ["dst", "dst_old"])

    test_df = gdf[["src", "src_old"]].head()

    # Simply check that this call does not raise an exception.
    num2.add_internal_vertex_id(test_df, "src", ["src", "src_old"])
Esempio n. 6
0
def test_mg_renumber2(graph_file, client_connection):
    """
    Verify that from_internal_vertex_id() recovers the original external
    multi-column vertex ids after an MG renumbering.
    """
    gc.collect()

    M = utils.read_csv_for_nx(graph_file)
    sources = cudf.Series(M["0"])
    destinations = cudf.Series(M["1"])

    translate = 1000

    gdf = cudf.DataFrame()
    gdf["src_old"] = sources
    gdf["dst_old"] = destinations
    gdf["src"] = sources + translate
    gdf["dst"] = destinations + translate
    # BUG FIX: np.float was deprecated in NumPy 1.20 and removed in 1.24;
    # use the explicit np.float64 dtype instead.  The weight column doubles
    # as a row id used below to restore the input row order.
    gdf["weight"] = gdf.index.astype(np.float64)

    ddf = dask.dataframe.from_pandas(gdf, npartitions=2)

    ren2, num2 = NumberMap.renumber(ddf, ["src", "src_old"],
                                    ["dst", "dst_old"])

    # Renumbering may shuffle rows across partitions; sort by the weight
    # (original row id) so results align with gdf for comparison.
    check_src = num2.from_internal_vertex_id(ren2, "src").compute()
    check_src = check_src.sort_values("weight").reset_index(drop=True)
    check_dst = num2.from_internal_vertex_id(ren2, "dst").compute()
    check_dst = check_dst.sort_values("weight").reset_index(drop=True)

    # Multi-column externals come back as columns named "0" and "1".
    assert check_src["0"].to_pandas().equals(gdf["src"].to_pandas())
    assert check_src["1"].to_pandas().equals(gdf["src_old"].to_pandas())
    assert check_dst["0"].to_pandas().equals(gdf["dst"].to_pandas())
    assert check_dst["1"].to_pandas().equals(gdf["dst_old"].to_pandas())
Esempio n. 7
0
def test_mg_renumber_add_internal_vertex_id(graph_file, dask_client):
    """
    add_internal_vertex_id() on a multi-column MG renumbering must accept
    a subset dataframe without raising.
    """
    M = utils.read_csv_for_nx(graph_file)
    sources = cudf.Series(M["0"])
    destinations = cudf.Series(M["1"])

    translate = 1000

    gdf = cudf.DataFrame()
    gdf["src_old"] = sources
    gdf["dst_old"] = destinations
    gdf["src"] = sources + translate
    gdf["dst"] = destinations + translate
    # BUG FIX: np.float was deprecated in NumPy 1.20 and removed in 1.24;
    # use the explicit np.float64 dtype instead.
    gdf["weight"] = gdf.index.astype(np.float64)

    ddf = dask.dataframe.from_pandas(
        gdf, npartitions=len(dask_client.scheduler_info()['workers']))

    ren2, num2 = NumberMap.renumber(ddf, ["src", "src_old"],
                                    ["dst", "dst_old"])

    test_df = gdf[["src", "src_old"]].head()

    # simply check that this does not raise an exception
    num2.add_internal_vertex_id(test_df, num2.renumbered_src_col_name,
                                ["src", "src_old"])
Esempio n. 8
0
def test_renumber_ips_str_cols():
    """
    Renumber string (IP address) vertex columns directly and verify that
    mapping the internal ids back returns the original strings.
    """
    source_list = [
        "192.168.1.1",
        "172.217.5.238",
        "216.228.121.209",
        "192.16.31.23",
    ]
    dest_list = [
        "172.217.5.238",
        "216.228.121.209",
        "192.16.31.23",
        "192.168.1.1",
    ]

    pdf = pd.DataFrame({"source_list": source_list, "dest_list": dest_list})

    gdf = cudf.from_pandas(pdf)

    # BUG FIX: the original renumbered ["source_as_int"]/["dest_as_int"],
    # columns that are never created in this test (they belong to the
    # ip2int variant).  Renumber the string columns that actually exist.
    renumbered_gdf, renumber_map = NumberMap.renumber(gdf, ["source_list"],
                                                      ["dest_list"])

    check_src = renumber_map.from_internal_vertex_id(
        renumbered_gdf['src'])["0"]
    check_dst = renumber_map.from_internal_vertex_id(
        renumbered_gdf['dst'])["0"]

    assert check_src.equals(gdf["source_list"])
    assert check_dst.equals(gdf["dest_list"])
Esempio n. 9
0
def test_renumber_files_multi_col(graph_file):
    """
    Renumber multi-column vertices with preserve_order=True and verify
    that unrenumbering "src"/"dst" restores both original vertex columns.
    """
    gc.collect()

    M = utils.read_csv_for_nx(graph_file)
    src_series = cudf.Series(M["0"])
    dst_series = cudf.Series(M["1"])

    shift = 1000

    gdf = cudf.DataFrame()
    gdf["src_old"] = src_series
    gdf["dst_old"] = dst_series
    gdf["src"] = src_series + shift
    gdf["dst"] = dst_series + shift

    renumbered_df, renumber_map = NumberMap.renumber(gdf, ["src", "src_old"],
                                                     ["dst", "dst_old"],
                                                     preserve_order=True)

    unrenumbered_df = renumber_map.unrenumber(renumbered_df, "src",
                                              preserve_order=True)
    unrenumbered_df = renumber_map.unrenumber(unrenumbered_df, "dst",
                                              preserve_order=True)

    # Multi-column vertices unrenumber into "<i>_<name>" columns.
    for original, restored in (("src", "0_src"), ("src_old", "1_src"),
                               ("dst", "0_dst"), ("dst_old", "1_dst")):
        assert gdf[original].equals(unrenumbered_df[restored])
Esempio n. 10
0
def test_renumber_files(graph_file):
    """
    Renumber single-column src/dst with preserve_order=True and verify
    the round trip reproduces the translated inputs.
    """
    gc.collect()

    M = utils.read_csv_for_nx(graph_file)
    sources = cudf.Series(M["0"])
    destinations = cudf.Series(M["1"])

    shift = 1000

    df = cudf.DataFrame()
    df["src"] = cudf.Series([v + shift for v in sources.values_host])
    df["dst"] = cudf.Series([v + shift for v in destinations.values_host])

    expected_src = cudf.Series([v + shift for v in sources.values_host])
    expected_dst = cudf.Series([v + shift for v in destinations.values_host])

    renumbered_df, renumber_map = NumberMap.renumber(df, "src", "dst",
                                                     preserve_order=True)

    # Undo the renumbering for both endpoint columns, keeping row order.
    round_trip = renumber_map.unrenumber(renumbered_df, "src",
                                         preserve_order=True)
    round_trip = renumber_map.unrenumber(round_trip, "dst",
                                         preserve_order=True)

    assert expected_src.equals(round_trip["src"])
    assert expected_dst.equals(round_trip["dst"])
Esempio n. 11
0
def test_renumber_files_multi_col(graph_file):
    """
    Multi-column renumbering via the NumberMap instance API: internal ids
    added with add_internal_vertex_id() must map back to the originals.
    """
    gc.collect()

    M = utils.read_csv_for_nx(graph_file)
    src_series = cudf.Series(M["0"])
    dst_series = cudf.Series(M["1"])

    shift = 1000

    gdf = cudf.DataFrame()
    gdf["src_old"] = src_series
    gdf["dst_old"] = dst_series
    gdf["src"] = src_series + shift
    gdf["dst"] = dst_series + shift

    nmap = NumberMap()
    nmap.from_dataframe(gdf, ["src", "src_old"], ["dst", "dst_old"])

    with_src_ids = nmap.add_internal_vertex_id(gdf, "src_id",
                                               ["src", "src_old"])
    renumbered_df = nmap.add_internal_vertex_id(with_src_ids, "dst_id",
                                                ["dst", "dst_old"])

    check_src = nmap.from_internal_vertex_id(renumbered_df, "src_id")
    check_dst = nmap.from_internal_vertex_id(renumbered_df, "dst_id")

    # Multi-column externals come back as columns named "0" and "1".
    assert check_src["src"].equals(check_src["0"])
    assert check_src["src_old"].equals(check_src["1"])
    assert check_dst["dst"].equals(check_dst["0"])
    assert check_dst["dst_old"].equals(check_dst["1"])
Esempio n. 12
0
def test_renumber_ips_str_cols():
    """
    Renumber string (IP address) columns via the NumberMap instance API
    and verify the internal-id round trip returns the original strings.
    """
    src_ips = [
        "192.168.1.1",
        "172.217.5.238",
        "216.228.121.209",
        "192.16.31.23",
    ]
    dst_ips = [
        "172.217.5.238",
        "216.228.121.209",
        "192.16.31.23",
        "192.168.1.1",
    ]

    gdf = cudf.from_pandas(
        pd.DataFrame({"source_list": src_ips, "dest_list": dst_ips}))

    nmap = NumberMap()
    nmap.from_dataframe(gdf, ["source_list"], ["dest_list"])

    internal_src = nmap.to_internal_vertex_id(gdf["source_list"])
    internal_dst = nmap.to_internal_vertex_id(gdf["dest_list"])

    assert nmap.from_internal_vertex_id(internal_src)["0"].equals(
        gdf["source_list"])
    assert nmap.from_internal_vertex_id(internal_dst)["0"].equals(
        gdf["dest_list"])
Esempio n. 13
0
def test_renumber_files_col(graph_file):
    """
    Single-column renumbering via the NumberMap instance API: internal
    ids must map back to the original translated vertex values.
    """
    gc.collect()

    M = utils.read_csv_for_nx(graph_file)
    sources = cudf.Series(M["0"])
    destinations = cudf.Series(M["1"])

    shift = 1000

    gdf = cudf.DataFrame()
    gdf['src'] = cudf.Series([v + shift for v in sources.values_host])
    gdf['dst'] = cudf.Series([v + shift for v in destinations.values_host])

    nmap = NumberMap()
    nmap.from_dataframe(gdf, ["src"], ["dst"])

    with_src_ids = nmap.add_internal_vertex_id(gdf, "src_id", ["src"])
    renumbered_df = nmap.add_internal_vertex_id(with_src_ids, "dst_id",
                                                ["dst"])

    check_src = nmap.from_internal_vertex_id(renumbered_df, "src_id")
    check_dst = nmap.from_internal_vertex_id(renumbered_df, "dst_id")

    # Single-column externals come back as a column named "0".
    assert check_src["src"].equals(check_src["0"])
    assert check_dst["dst"].equals(check_dst["0"])
Esempio n. 14
0
    def compute_renumber_edge_list(self, transposed=False):
        """
        Compute a renumbered edge list

        This function works in the MNMG pipeline and will transform
        the input dask_cudf.DataFrame into a renumbered edge list
        in the prescribed direction.

        This function will be called by the algorithms to ensure
        that the graph is renumbered properly.  The graph object will
        cache the most recent renumbering attempt.  For benchmarking
        purposes, this function can be called prior to calling a
        graph algorithm so we can measure the cost of computing
        the renumbering separately from the cost of executing the
        algorithm.

        When creating a CSR-like structure, set transposed to False.
        When creating a CSC-like structure, set transposed to True.

        Parameters
        ----------
        transposed : (optional) bool
            If True, renumber with the intent to make a CSC-like
            structure.  If False, renumber with the intent to make
            a CSR-like structure.  Defaults to False.
        """
        # FIXME:  What to do about edge_attr???
        #         currently ignored for MNMG

        # Renumbering applies only to distributed (dask_cudf) graphs.
        if not self.distributed:
            raise Exception(
                "compute_renumber_edge_list should only be used "
                "for distributed graphs"
            )

        if not self.renumbered:
            # NOTE(review): in this branch self.renumbered appears to mean
            # "should be renumbered" rather than "has been renumbered" --
            # confirm against callers.  No renumbering requested: wrap the
            # input edge list as-is and keep no map.
            self.edgelist = self.EdgeList(self.input_df)
            self.renumber_map = None
        else:
            if self.edgelist is not None:
                # A cached renumbering exists; reuse it when possible.
                if type(self) is Graph:
                    # Presumably an (undirected) Graph can reuse any cached
                    # renumbering regardless of direction -- TODO confirm.
                    return

                if self.store_transposed == transposed:
                    # Cached result already matches the requested direction.
                    return

                # Cached result has the wrong direction; drop it and redo.
                del self.edgelist

            renumbered_ddf, number_map = NumberMap.renumber(
                self.input_df, self.source_columns,
                self.destination_columns,
                store_transposed=transposed
            )
            # Cache the renumbered edge list, the map, and the direction
            # used, so a matching later request can return early above.
            self.edgelist = self.EdgeList(renumbered_ddf)
            self.renumber_map = number_map
            self.store_transposed = transposed
Esempio n. 15
0
    def from_dask_cudf_edgelist(self,
                                input_ddf,
                                source='source',
                                destination='destination',
                                edge_attr=None,
                                renumber=True):
        """
        Initializes the distributed graph from the dask_cudf.DataFrame
        edgelist. Undirected Graphs are not currently supported.

        By default, renumbering is enabled to map the source and destination
        vertices into an index in the range [0, V) where V is the number
        of vertices.  If the input vertices are a single column of integers
        in the range [0, V), renumbering can be disabled and the original
        external vertex ids will be used.

        Parameters
        ----------
        input_ddf : dask_cudf.DataFrame
            The edgelist as a dask_cudf.DataFrame
        source : str
            source argument is source column name
        destination : str
            destination argument is destination column name.
        edge_attr : str
            edge_attr argument is the weights column name.
        renumber : bool
            If source and destination indices are not in range 0 to V where V
            is number of vertices, renumber argument should be True.
        """
        # Guard clauses: refuse re-initialization, undirected graphs, and
        # non-dask_cudf input (same checks, same order, as before).
        if self.edgelist is not None or self.adjlist is not None:
            raise Exception('Graph already has values')
        if type(self) is Graph:
            raise Exception('Undirected distributed graph not supported')
        if not isinstance(input_ddf, dask_cudf.DataFrame):
            raise Exception('input should be a dask_cudf dataFrame')

        self.distributed = True
        self.local_data = None

        # Normalize user column names to the internal src/dst/weights scheme.
        column_names = {source: 'src', destination: 'dst'}
        if edge_attr is not None:
            column_names[edge_attr] = 'weights'
        input_ddf = input_ddf.rename(columns=column_names)

        if renumber:
            renumbered_ddf, number_map = NumberMap.renumber(
                input_ddf, "src", "dst")
            self.edgelist = self.EdgeList(renumbered_ddf)
            self.renumber_map = number_map
            self.renumbered = True
        else:
            self.edgelist = self.EdgeList(input_ddf)
            self.renumber_map = None
            self.renumbered = False
Esempio n. 16
0
def test_mg_renumber(graph_file, dask_client):
    """
    MG renumber/unrenumber round trip for multi-column vertices.
    preserve_order is not supported for MG, so both frames are sorted
    identically before comparison.
    """
    M = utils.read_csv_for_nx(graph_file)
    sources = cudf.Series(M["0"])
    destinations = cudf.Series(M["1"])

    shift = 1000

    gdf = cudf.DataFrame()
    gdf["src_old"] = sources
    gdf["dst_old"] = destinations
    gdf["src"] = sources + shift
    gdf["dst"] = destinations + shift

    ddf = dask.dataframe.from_pandas(
        gdf, npartitions=len(dask_client.scheduler_info()['workers']))

    # preserve_order is not supported for MG
    renumbered_df, renumber_map = NumberMap.renumber(ddf, ["src", "src_old"],
                                                     ["dst", "dst_old"],
                                                     preserve_order=False)
    unrenumbered_df = renumber_map.unrenumber(
        renumbered_df,
        renumber_map.renumbered_src_col_name,
        preserve_order=False)
    unrenumbered_df = renumber_map.unrenumber(
        unrenumbered_df,
        renumber_map.renumbered_dst_col_name,
        preserve_order=False)

    src = renumber_map.renumbered_src_col_name
    dst = renumber_map.renumbered_dst_col_name

    # Row order is not preserved; sort both frames before comparing.
    gdf = gdf.sort_values(by=["src", "src_old", "dst", "dst_old"])
    gdf = gdf.reset_index()
    unrenumbered_df = unrenumbered_df.compute()
    unrenumbered_df = unrenumbered_df.sort_values(
        by=[f"0_{src}", f"1_{src}", f"0_{dst}", f"1_{dst}"])
    unrenumbered_df = unrenumbered_df.reset_index()

    for original, restored in (("src", f"0_{src}"),
                               ("src_old", f"1_{src}"),
                               ("dst", f"0_{dst}"),
                               ("dst_old", f"1_{dst}")):
        assert_series_equal(gdf[original],
                            unrenumbered_df[restored],
                            check_names=False)
Esempio n. 17
0
    def compute_renumber_edge_list(self, transposed=False):
        """
        Compute a renumbered edge list
        This function works in the MNMG pipeline and will transform
        the input dask_cudf.DataFrame into a renumbered edge list
        in the prescribed direction.
        This function will be called by the algorithms to ensure
        that the graph is renumbered properly.  The graph object will
        cache the most recent renumbering attempt.  For benchmarking
        purposes, this function can be called prior to calling a
        graph algorithm so we can measure the cost of computing
        the renumbering separately from the cost of executing the
        algorithm.
        When creating a CSR-like structure, set transposed to False.
        When creating a CSC-like structure, set transposed to True.

        Parameters
        ----------
        transposed : (optional) bool
            If True, renumber with the intent to make a CSC-like
            structure.  If False, renumber with the intent to make
            a CSR-like structure.  Defaults to False.
        """
        # FIXME:  What to do about edge_attr???
        #         currently ignored for MNMG

        # FIXME: this is confusing - in the code below,
        # self.properties.renumbered needs to be interpreted as "needs to be
        # renumbered", everywhere else it means "has been renumbered".
        if not self.properties.renumbered:
            # No renumbering requested: wrap the input edge list as-is and
            # keep no map.
            self.edgelist = self.EdgeList(self.input_df)
            self.renumber_map = None
        else:
            if self.edgelist is not None:
                # A cached renumbering exists; reuse it when possible.
                if self.properties.directed is False:
                    # Undirected: direction is irrelevant, any cached
                    # renumbering will do.
                    return

                if self.properties.store_transposed == transposed:
                    # Cached result already matches the requested direction.
                    return

                # Cached result has the wrong direction; drop it and redo.
                del self.edgelist

            # renumber_and_segment also returns per-partition segment
            # offsets, which are cached alongside the edge list and map.
            renumbered_ddf, number_map, aggregate_segment_offsets = \
                NumberMap.renumber_and_segment(self.input_df,
                                               self.source_columns,
                                               self.destination_columns,
                                               store_transposed=transposed)
            self.edgelist = self.EdgeList(renumbered_ddf)
            self.renumber_map = number_map
            self.aggregate_segment_offsets = aggregate_segment_offsets
            # Record the direction used so a matching later request can
            # return early above.
            self.properties.store_transposed = transposed
Esempio n. 18
0
def test_renumber_ips_cols():
    """
    Renumber integer-converted IP columns with preserve_order=True and
    verify the internal->external mapping recovers the integer ips.
    """
    src_ips = [
        "192.168.1.1",
        "172.217.5.238",
        "216.228.121.209",
        "192.16.31.23",
    ]
    dst_ips = [
        "172.217.5.238",
        "216.228.121.209",
        "192.16.31.23",
        "192.168.1.1",
    ]

    gdf = cudf.from_pandas(
        pd.DataFrame({"source_list": src_ips, "dest_list": dst_ips}))

    # Convert the dotted-quad strings to integers for renumbering.
    gdf["source_as_int"] = gdf["source_list"].str.ip2int()
    gdf["dest_as_int"] = gdf["dest_list"].str.ip2int()

    renumbered_gdf, renumber_map = NumberMap.renumber(
        gdf, ["source_as_int"], ["dest_as_int"], preserve_order=True
    )

    input_check = renumbered_gdf.merge(gdf, on=["source_list", "dest_list"])

    output_check = renumber_map.from_internal_vertex_id(
        renumbered_gdf, renumber_map.renumbered_src_col_name,
        external_column_names=["check_src"]
    )
    output_check = renumber_map.from_internal_vertex_id(
        output_check, renumber_map.renumbered_dst_col_name,
        external_column_names=["check_dst"]
    )

    merged = output_check.merge(input_check, on=["source_list", "dest_list"])

    assert_series_equal(
        merged["check_src"], merged["source_as_int"], check_names=False
    )
    assert_series_equal(
        merged["check_dst"], merged["dest_as_int"], check_names=False
    )
Esempio n. 19
0
def test_renumber_negative_col():
    """
    Renumbering must handle negative external vertex ids; mapping the
    internal ids back must reproduce the original columns.
    """
    df = pd.DataFrame({"source_list": [4, 6, 8, -20, 1],
                       "dest_list": [1, 29, 35, 0, 77]})

    gdf = cudf.DataFrame.from_pandas(df[["source_list", "dest_list"]])

    renumbered_gdf, renumber_map = NumberMap.renumber(gdf, "source_list",
                                                      "dest_list")

    assert renumber_map.from_internal_vertex_id(
        renumbered_gdf['src'])["0"].equals(gdf["source_list"])
    assert renumber_map.from_internal_vertex_id(
        renumbered_gdf['dst'])["0"].equals(gdf["dest_list"])
Esempio n. 20
0
def test_renumber_unrenumber_non_default_vert_names():
    """
    Test that renumbering a dataframe with generated src/dst column names can
    be used for unrenumbering results.
    """
    input_gdf = cudf.DataFrame({"dst": [1, 2, 3],
                                "weights": [0.1, 0.2, 0.3],
                                "col_a": [99, 199, 2],
                                "col_b": [199, 2, 32]})

    _, number_map = NumberMap.renumber(input_gdf, "col_a", "col_b")

    # A made-up algorithm result keyed by internal vertex id.
    result_gdf = cudf.DataFrame({"vertex": [0, 1, 2, 3]})
    result_gdf = number_map.unrenumber(result_gdf, "vertex")

    # Order is not guaranteed; compare as sorted value lists.
    produced = sorted(result_gdf["vertex"].to_arrow().to_pylist())
    assert produced == sorted([99, 199, 2, 32])
Esempio n. 21
0
def test_renumber_files_multi_col(graph_file):
    """
    preserve_order renumber/unrenumber round trip using the generated
    renumbered src/dst column names.
    """
    gc.collect()

    M = utils.read_csv_for_nx(graph_file)
    src_series = cudf.Series(M["0"])
    dst_series = cudf.Series(M["1"])

    shift = 1000

    gdf = cudf.DataFrame()
    gdf["src_old"] = src_series
    gdf["dst_old"] = dst_series
    gdf["src"] = src_series + shift
    gdf["dst"] = dst_series + shift

    renumbered_df, renumber_map = NumberMap.renumber(
        gdf, ["src", "src_old"], ["dst", "dst_old"], preserve_order=True
    )

    unrenumbered_df = renumber_map.unrenumber(
        renumbered_df, renumber_map.renumbered_src_col_name,
        preserve_order=True
    )
    unrenumbered_df = renumber_map.unrenumber(
        unrenumbered_df, renumber_map.renumbered_dst_col_name,
        preserve_order=True
    )

    src = renumber_map.renumbered_src_col_name
    dst = renumber_map.renumbered_dst_col_name
    # Multi-column vertices unrenumber into "<i>_<name>" columns.
    for original, restored in (("src", f"0_{src}"),
                               ("src_old", f"1_{src}"),
                               ("dst", f"0_{dst}"),
                               ("dst_old", f"1_{dst}")):
        assert_series_equal(gdf[original], unrenumbered_df[restored],
                            check_names=False)
Esempio n. 22
0
def test_renumber_negative_col():
    """
    Instance-API renumbering must handle negative external vertex ids.
    """
    df = pd.DataFrame({"source_list": [4, 6, 8, -20, 1],
                       "dest_list": [1, 29, 35, 0, 77]})

    gdf = cudf.DataFrame.from_pandas(df[["source_list", "dest_list"]])

    nmap = NumberMap()
    nmap.from_dataframe(gdf, ["source_list"], ["dest_list"])

    internal_src = nmap.to_internal_vertex_id(gdf["source_list"])
    internal_dst = nmap.to_internal_vertex_id(gdf["dest_list"])

    assert nmap.from_internal_vertex_id(internal_src)["0"].equals(
        gdf["source_list"])
    assert nmap.from_internal_vertex_id(internal_dst)["0"].equals(
        gdf["dest_list"])
Esempio n. 23
0
def test_mg_renumber(graph_file, client_connection):
    """
    MG renumber/unrenumber round trip for multi-column vertices.
    preserve_order is unsupported for MG, so both frames are sorted
    identically before comparison.
    """
    gc.collect()

    M = utils.read_csv_for_nx(graph_file)
    sources = cudf.Series(M["0"])
    destinations = cudf.Series(M["1"])

    shift = 1000

    gdf = cudf.DataFrame()
    gdf["src_old"] = sources
    gdf["dst_old"] = destinations
    gdf["src"] = sources + shift
    gdf["dst"] = destinations + shift

    ddf = dask.dataframe.from_pandas(gdf, npartitions=2)

    # preserve_order is not supported for MG
    renumbered_df, renumber_map = NumberMap.renumber(ddf, ["src", "src_old"],
                                                     ["dst", "dst_old"],
                                                     preserve_order=False)
    unrenumbered_df = renumber_map.unrenumber(renumbered_df, "src",
                                              preserve_order=False)
    unrenumbered_df = renumber_map.unrenumber(unrenumbered_df, "dst",
                                              preserve_order=False)

    # Row order is not preserved; sort both frames before comparing.
    gdf = gdf.sort_values(by=["src", "src_old", "dst", "dst_old"])
    gdf = gdf.reset_index()
    unrenumbered_df = unrenumbered_df.compute()
    unrenumbered_df = unrenumbered_df.sort_values(
        by=["0_src", "1_src", "0_dst", "1_dst"])
    unrenumbered_df = unrenumbered_df.reset_index()

    for original, restored in (("src", "0_src"), ("src_old", "1_src"),
                               ("dst", "0_dst"), ("dst_old", "1_dst")):
        assert gdf[original].equals(unrenumbered_df[restored])
Esempio n. 24
0
def test_renumber_negative_col():
    """
    Renumbering must round-trip correctly when the input contains
    negative vertex ids.
    """
    srcs = [4, 6, 8, -20, 1]
    dsts = [1, 29, 35, 0, 77]

    pdf = pd.DataFrame({"source_list": srcs, "dest_list": dsts})

    gdf = cudf.DataFrame.from_pandas(pdf[["source_list", "dest_list"]])
    # Keep copies of the inputs so results can be compared after renumbering.
    gdf["original_src"] = gdf["source_list"]
    gdf["original_dst"] = gdf["dest_list"]

    renumbered_gdf, renumber_map = NumberMap.renumber(
        gdf, ["source_list"], ["dest_list"], preserve_order=True
    )

    input_check = renumbered_gdf.merge(gdf,
                                       on=["original_src", "original_dst"])

    # Map the internal ids back to external ids for both endpoints.
    output_check = renumber_map.from_internal_vertex_id(
        renumbered_gdf, renumber_map.renumbered_src_col_name,
        external_column_names=["check_src"]
    )
    output_check = renumber_map.from_internal_vertex_id(
        output_check, renumber_map.renumbered_dst_col_name,
        external_column_names=["check_dst"]
    )

    merged = output_check.merge(input_check,
                                on=["original_src", "original_dst"])

    for check_name, orig_name in (("check_src", "original_src"),
                                  ("check_dst", "original_dst")):
        assert_series_equal(merged[check_name], merged[orig_name],
                            check_names=False)
Esempio n. 25
0
    def from_cudf_edgelist(
        self,
        input_df,
        source="source",
        destination="destination",
        edge_attr=None,
        renumber=True,
    ):
        """
        Initialize a graph from the edge list. It is an error to call this
        method on an initialized Graph object. The passed input_df argument
        wraps gdf_column objects that represent a graph using the edge list
        format. source argument is source column name and destination argument
        is destination column name.

        By default, renumbering is enabled to map the source and destination
        vertices into an index in the range [0, V) where V is the number
        of vertices.  If the input vertices are a single column of integers
        in the range [0, V), renumbering can be disabled and the original
        external vertex ids will be used.

        If weights are present, edge_attr argument is the weights column name.

        Parameters
        ----------
        input_df : cudf.DataFrame or dask_cudf.DataFrame
            This cudf.DataFrame wraps source, destination and weight
            gdf_column of size E (E: number of edges)
            The 'src' column contains the source index for each edge.
            Source indices are in the range [0, V) (V: number of vertices).
            The 'dst' column contains the destination index for each edge.
            Destination indices are in the range [0, V) (V: number of
            vertices).
            If renumbering needs to be done, renumber
            argument should be passed as True.
            For weighted graphs, dataframe contains 'weight' column
            containing the weight value for each edge.
            If a dask_cudf.DataFrame is passed it will be reinterpreted as
            a cudf.DataFrame. For the distributed path please use
            from_dask_cudf_edgelist.
        source : str
            source argument is source column name
        destination : str
            destination argument is destination column name.
        edge_attr : str or list of str
            edge_attr argument is the weights column name. When the graph
            is a multigraph (self.multi is True) this must be a list of
            column names.
        renumber : bool
            If source and destination indices are not in range 0 to V where V
            is number of vertices, renumber argument should be True.

        Examples
        --------
        >>> df = cudf.read_csv('datasets/karate.csv', delimiter=' ',
        >>>                   dtype=['int32', 'int32', 'float32'], header=None)
        >>> G = cugraph.Graph()
        >>> G.from_cudf_edgelist(df, source='0', destination='1',
                                 edge_attr='2', renumber=False)

        """
        if self.edgelist is not None or self.adjlist is not None:
            raise Exception("Graph already has values")

        # Consolidation: gather the edge list into a single cudf.DataFrame.
        # The length guard keeps the edge list within cudf's 32-bit
        # column-size limit so it fits on a single GPU.
        # NOTE: messages use implicit string concatenation; the previous
        # backslash continuations embedded indentation spaces in the text.
        if isinstance(input_df, cudf.DataFrame):
            if len(input_df[source]) > 2147483100:
                raise Exception("cudf dataFrame edge list is too big "
                                "to fit in a single GPU")
            elist = input_df
        elif isinstance(input_df, dask_cudf.DataFrame):
            if len(input_df[source]) > 2147483100:
                raise Exception("dask_cudf dataFrame edge list is too big "
                                "to fit in a single GPU")
            elist = input_df.compute().reset_index(drop=True)
        else:
            raise Exception("input should be a cudf.DataFrame or "
                            "a dask_cudf dataFrame")

        renumber_map = None
        if renumber:
            # FIXME: Should SG do lazy evaluation like MG?
            elist, renumber_map = NumberMap.renumber(
                elist, source, destination,
                store_transposed=False
            )
            # NumberMap.renumber emits renumbered columns named 'src'/'dst'.
            source = 'src'
            destination = 'dst'
            self.renumbered = True
        else:
            if type(source) is list and type(destination) is list:
                raise Exception('set renumber to True for multi column ids')

        source_col = elist[source]
        dest_col = elist[destination]

        if self.multi:
            # Multigraphs carry one value column per edge attribute.
            if type(edge_attr) is not list:
                raise Exception("edge_attr should be a list of column names")
            value_col = {}
            for col_name in edge_attr:
                value_col[col_name] = elist[col_name]
        elif edge_attr is not None:
            value_col = elist[edge_attr]
        else:
            value_col = None

        # Undirected, non-multi graphs store both directions of each edge.
        if not self.symmetrized and not self.multi:
            if value_col is not None:
                source_col, dest_col, value_col = symmetrize(
                    source_col, dest_col, value_col
                )
            else:
                source_col, dest_col = symmetrize(source_col, dest_col)

        self.edgelist = Graph.EdgeList(
            source_col, dest_col, value_col
        )

        if self.batch_enabled:
            self._replicate_edgelist()

        # Single assignment covers both paths: the map when renumbering
        # ran, None otherwise (previously also set redundantly above).
        self.renumber_map = renumber_map
Esempio n. 26
0
    def __from_edgelist(
        self,
        input_df,
        source="source",
        destination="destination",
        edge_attr=None,
        renumber=True,
    ):
        """
        Populate this graph's edge list from an edge-list DataFrame
        (internal helper). Optionally renumbers vertex ids and, for
        undirected graphs, symmetrizes the edges.
        """
        # Verify column names present in input DataFrame
        src_names = source if isinstance(source, list) else [source]
        dst_names = destination if isinstance(destination, list) \
            else [destination]
        available = set(input_df.columns)
        if not (set(src_names).issubset(available)
                and set(dst_names).issubset(available)):
            # FIXME: Raise concrete Exceptions
            raise Exception("source column names and/or destination column "
                            "names not found in input. Recheck the source and "
                            "destination parameters")

        # FIXME: check if the consolidated graph fits on the
        # device before gathering all the edge lists

        # Consolidation
        if isinstance(input_df, cudf.DataFrame):
            if len(input_df[source]) > 2147483100:
                raise Exception("cudf dataFrame edge list is too big "
                                "to fit in a single GPU")
            elist = input_df
        elif isinstance(input_df, dask_cudf.DataFrame):
            if len(input_df[source]) > 2147483100:
                raise Exception("dask_cudf dataFrame edge list is too big "
                                "to fit in a single GPU")
            elist = input_df.compute().reset_index(drop=True)
        else:
            raise Exception("input should be a cudf.DataFrame or "
                            "a dask_cudf dataFrame")

        # Renumbering
        self.renumber_map = None
        if renumber:
            # FIXME: Should SG do lazy evaluation like MG?
            elist, map_obj = NumberMap.renumber(elist, source, destination,
                                                store_transposed=False)
            # NumberMap.renumber emits renumbered columns named 'src'/'dst'.
            source, destination = "src", "dst"
            self.properties.renumbered = True
            self.renumber_map = map_obj
        else:
            if type(source) is list and type(destination) is list:
                raise Exception("set renumber to True for multi column ids")

        # Populate graph edgelist
        src_series = elist[source]
        dst_series = elist[destination]

        if edge_attr is not None:
            self.weighted = True
            wt_col = elist[edge_attr]
        else:
            wt_col = None

        # TODO: Update Symmetrize to work on Graph and/or DataFrame
        if wt_col is None:
            src_series, dst_series = symmetrize(
                src_series,
                dst_series,
                multi=self.properties.multi_edge,
                symmetrize=not self.properties.directed)
        else:
            src_series, dst_series, wt_col = symmetrize(
                src_series,
                dst_series,
                wt_col,
                multi=self.properties.multi_edge,
                symmetrize=not self.properties.directed)
            if isinstance(wt_col, cudf.DataFrame):
                # Multiple weight columns come back as a DataFrame; the
                # EdgeList constructor takes a dict of Series instead.
                wt_col = {name: wt_col[name] for name in wt_col.columns}

        self.edgelist = simpleGraphImpl.EdgeList(src_series, dst_series,
                                                 wt_col)

        if self.batch_enabled:
            self._replicate_edgelist()
Esempio n. 27
0
def test_renumber_series(graph_file):
    """
    Round-trip test for NumberMap built from a single Series: adding an
    internal vertex id and mapping it back must reproduce the input.
    """
    gc.collect()

    M = utils.read_csv_for_nx(graph_file)
    src_series = cudf.Series(M["0"])
    dst_series = cudf.Series(M["1"])

    offset = 1000

    df = cudf.DataFrame()
    df["src"] = cudf.Series([v + offset for v in src_series.values_host])
    df["dst"] = cudf.Series([v + offset for v in dst_series.values_host])

    # Build an independent NumberMap for each endpoint column.
    src_numbering = NumberMap()
    src_numbering.from_series(df["src"])

    dst_numbering = NumberMap()
    dst_numbering.from_series(df["dst"])

    renumbered_src = src_numbering.add_internal_vertex_id(df["src"], "src_id")
    renumbered_dst = dst_numbering.add_internal_vertex_id(df["dst"], "dst_id")

    check_src = src_numbering.from_internal_vertex_id(renumbered_src,
                                                      "src_id")
    check_dst = dst_numbering.from_internal_vertex_id(renumbered_dst,
                                                      "dst_id")

    # The mapped-back column ("0_y") must equal the original column ("0_x").
    assert check_src["0_y"].equals(check_src["0_x"])
    assert check_dst["0_y"].equals(check_dst["0_x"])
Esempio n. 28
0
def get_traversed_cost(df, source, source_col, dest_col, value_col):
    """
    Take the DataFrame result from a BFS or SSSP function call and sums
    the given weights along the path to the starting vertex.
    The source_col, dest_col identifiers need to match with the vertex and
    predecessor columns of df.

    Parameters
    ----------
    df : cudf.DataFrame
        The dataframe containing the results of a BFS or SSSP call
    source : int
        Index of the source vertex.
    source_col : cudf.Series
        This cudf.Series wraps a gdf_column of size E (E: number of edges).
        The gdf column contains the source index for each edge.
        Source indices must be an integer type.
    dest_col : cudf.Series
        This cudf.Series wraps a gdf_column of size E (E: number of edges).
        The gdf column contains the destination index for each edge.
        Destination indices must be an integer type.
    value_col : cudf.Series
        This cudf.Series wraps a gdf_column of size E (E: number of edges).
        The gdf column contains values associated with this edge.
        Weight should be a floating type.

    Returns
    -------
    df : cudf.DataFrame
        DataFrame containing two columns 'vertex' and 'info'.
        Unreachable vertices will have value the max value of the weight type.
    """

    if 'vertex' not in df.columns:
        raise ValueError("DataFrame does not appear to be a BFS or "
                         "SSSP result - 'vertex' column missing")
    if 'distance' not in df.columns:
        raise ValueError("DataFrame does not appear to be a BFS or "
                         "SSSP result - 'distance' column missing")
    if 'predecessor' not in df.columns:
        raise ValueError("DataFrame does not appear to be a BFS or "
                         "SSSP result - 'predecessor' column missing")

    # Symmetrize so (vertex, predecessor) pairs match an edge regardless of
    # the direction in which it was stored.
    src, dst, val = symmetrize(source_col, dest_col, value_col)

    symmetrized_df = cudf.DataFrame()
    symmetrized_df['source'] = src
    symmetrized_df['destination'] = dst
    symmetrized_df['weights'] = val

    # Attach the weight of the (vertex, predecessor) edge to each result row.
    input_df = df.merge(symmetrized_df,
                        left_on=['vertex', 'predecessor'],
                        right_on=['source', 'destination'],
                        how="left")

    # Set unreachable vertex weights to max float and source vertex weight to 0
    # NOTE(review): np.finfo assumes a floating-point weight dtype — integer
    # weights would raise here; confirm callers always pass floats.
    max_val = np.finfo(val.dtype).max
    input_df[['weights']] = input_df[['weights']].fillna(max_val)
    input_df.loc[input_df['vertex'] == source, 'weights'] = 0

    # Renumber so vertex ids form the compact range the wrapper expects;
    # preserve_order keeps rows aligned with input_df.
    renumbered_gdf, renumber_map = NumberMap.renumber(input_df, ["vertex"],
                                                      ["predecessor"],
                                                      preserve_order=True)
    renumbered_gdf = renumbered_gdf.rename(columns={
        'src': 'vertex',
        'dst': 'predecessor'
    })
    # -1 marks "no predecessor"; its internal id tells the wrapper where
    # to stop accumulating along a path.
    stop_vertex = renumber_map.to_internal_vertex_id(cudf.Series(-1)).values[0]

    out_df = path_retrieval_wrapper.get_traversed_cost(renumbered_gdf,
                                                       stop_vertex)

    # Unrenumber: rows of renumbered_gdf are order-aligned with out_df
    # (preserve_order=True above), so its unrenumbered 'vertex' column
    # applies row-for-row — presumably intentional; verify against wrapper.
    out_df['vertex'] = renumber_map.unrenumber(renumbered_gdf,
                                               'vertex',
                                               preserve_order=True)["vertex"]
    return out_df