Example #1
def test_serialize_empty_string():
    pd_series = pd.Series([], dtype='str')
    gd_series = cudf.Series([], dtype='str')

    recreated = deserialize(*serialize(gd_series))
    pd.util.testing.assert_series_equal(recreated.to_pandas(), pd_series)
Example #2
def test_text_subword_tokenize(tmpdir):
    sr = cudf.Series([
        "This is a test",
        "A test this is",
        "Is test a this",
        "Test   test",
        "this   This",
    ])
    hash_file = tmpdir.mkdir("nvtext").join("tmp_hashed_vocab.txt")
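    # construct a minimal hashed-vocabulary file by hand; the layout below
    # (hash coefficients, bin table, then special-token ids) mirrors what
    # cudf's hash_vocab utility emits -- the trailing 100/101/102 appear
    # to be the [UNK]/[CLS]/[SEP] ids of the standard BERT vocabulary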
    content = "1\n0\n23\n"
    coefficients = [65559] * 23
    for c in coefficients:
        content = content + str(c) + " 0\n"
    # based on values from the bert_hash_table.txt file for the
    # test words used here: 'this', 'is', 'a', 'test'
    table = [0] * 23
    table[0] = 3015668
    table[1] = 6205475701751155871
    table[5] = 6358029
    table[16] = 451412625363
    table[20] = 6206321707968235495
    content = content + "23\n"
    for v in table:
        content = content + str(v) + "\n"
    content = content + "100\n101\n102\n\n"
    hash_file.write(content)

    tokens, masks, metadata = sr.str.subword_tokenize(str(hash_file), 8, 8)
    expected_tokens = cupy.asarray(
        [
            2023, 2003, 1037, 3231, 0, 0, 0, 0,
            1037, 3231, 2023, 2003, 0, 0, 0, 0,
            2003, 3231, 1037, 2023, 0, 0, 0, 0,
            3231, 3231, 0, 0, 0, 0, 0, 0,
            2023, 2023, 0, 0, 0, 0, 0, 0,
        ],
        dtype=np.uint32,
    )
    assert_eq(expected_tokens, tokens)

    expected_masks = cupy.asarray(
        [
            1, 1, 1, 1, 0, 0, 0, 0,
            1, 1, 1, 1, 0, 0, 0, 0,
            1, 1, 1, 1, 0, 0, 0, 0,
            1, 1, 0, 0, 0, 0, 0, 0,
            1, 1, 0, 0, 0, 0, 0, 0,
        ],
        dtype=np.uint32,
    )
    assert_eq(expected_masks, masks)

    expected_metadata = cupy.asarray(
        [0, 0, 3, 1, 0, 3, 2, 0, 3, 3, 0, 1, 4, 0, 1], dtype=np.uint32)
    assert_eq(expected_metadata, metadata)
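The flattened outputs above can be viewed per input row; a minimal sketch
(assuming the tokens/masks/metadata arrays from this test and max_seq_len=8;
the triplet layout of metadata is [row id, first valid position, last valid
position]):

seqs = tokens.reshape(-1, 8)    # one 8-wide token row per input string
valid = masks.reshape(-1, 8)    # 1 marks real tokens, 0 marks padding
meta = metadata.reshape(-1, 3)  # (row id, start token pos, stop token pos)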
Example #3
def test_ends_with_empty_suffix():
    test_strs = cudf.Series(["happy", "sad"])
    expect = np.asarray([True, True])
    got = porter_stemmer_rules.ends_with_suffix(test_strs, "").values.get()
    np.testing.assert_array_equal(got, expect)
Example #4
def test_serialize_generic_index():
    index = cudf.core.index.GenericIndex(cudf.Series(np.arange(10)))
    outindex = cudf.core.index.GenericIndex.deserialize(*index.serialize())
    assert_eq(index, outindex)
Example #5
def test_character_tokenize_series():
    sr = cudf.Series([
        "hello world",
        "sdf",
        None,
        "goodbye, one-two:three~four+five_six@sev"
        "en#eight^nine heŒŽ‘•™œ$µ¾ŤƠé DŽ",
    ])
    expected = cudf.Series([
        "h", "e", "l", "l", "o", " ", "w", "o", "r", "l", "d",
        "s", "d", "f",
        "g", "o", "o", "d", "b", "y", "e", ",", " ", "o", "n", "e",
        "-", "t", "w", "o", ":", "t", "h", "r", "e", "e", "~", "f",
        "o", "u", "r", "+", "f", "i", "v", "e", "_", "s", "i", "x",
        "@", "s", "e", "v", "e", "n", "#", "e", "i", "g", "h", "t",
        "^", "n", "i", "n", "e", " ", "h", "e", "Œ", "Ž", "‘", "•",
        "™", "œ", "$", "µ", "¾", "Ť", "Ơ", "é", " ", "DŽ",
    ])

    actual = sr.str.character_tokenize()
    assert_eq(expected, actual)

    sr = cudf.Series([""])
    expected = cudf.Series([], dtype="object")

    actual = sr.str.character_tokenize()
    assert_eq(expected, actual)

    sr = cudf.Series(["a"])
    expected = cudf.Series(["a"])

    actual = sr.str.character_tokenize()
    assert_eq(expected, actual)
Example #6
def test_error_with_null_cudf_series(func):
    s_1 = cudf.Series([1, 2])
    s_2 = cudf.Series([1, 2, None])
    with pytest.raises(ValueError):
        func(s_1, s_2)
Example #7
# Copyright (c) 2018, NVIDIA CORPORATION.

import msgpack
import numpy as np
import pandas as pd
import pytest

import cudf
from cudf.tests import utils
from cudf.tests.utils import assert_eq


@pytest.mark.parametrize(
    "df",
    [
        lambda: cudf.Series([1, 2, 3]),
        lambda: cudf.Series([1, 2, 3], index=[4, 5, 6]),
        lambda: cudf.Series([1, None, 3]),
        lambda: cudf.Series([1, 2, 3], index=[4, 5, None]),
        lambda: cudf.Series([1, 2, 3])[:2],
        lambda: cudf.Series([1, 2, 3])[:2]._column,
        lambda: cudf.Series(["a", "bb", "ccc"]),
        lambda: cudf.Series(["a", None, "ccc"]),
        lambda: cudf.DataFrame({"x": [1, 2, 3]}),
        lambda: cudf.DataFrame({
            "x": [1, 2, 3],
            "y": [1.0, None, 3.0]
        }),
        lambda: cudf.DataFrame({
            "x": [1, 2, 3],
            "y": [1.0, 2.0, 3.0]
Example #8
def test_array_func_missing_cudf_index(np_ar, func):
    cudf_index = cudf.core.index.as_index(cudf.Series(np_ar))
    with pytest.raises(TypeError):
        func(cudf_index)
Example #9
def random_walks(
    G,
    start_vertices,
    max_depth=None
):
    """
    Compute random walks for each node in 'start_vertices'.

    Parameters
    ----------
    G : cuGraph.Graph or networkx.Graph
        The graph can be either directed (DiGraph) or undirected (Graph).
        Weights in the graph are ignored; use the weight parameter if
        weights need to be considered (currently not supported).

    start_vertices : int or list or cudf.Series or cudf.DataFrame
        A single node, or a list or cudf.Series of nodes, from which to
        run the random walks. For multi-column vertices a cudf.DataFrame
        must be used

    max_depth : int
        The maximum depth of the random walks


    Returns
    -------
    random_walks_edge_lists : cudf.DataFrame
        GPU data frame containing the source identifiers, destination
        identifiers, and edge weights of all random walks

    seeds_offsets : cudf.Series
        Series containing the starting offset in the returned edge list
        for each vertex in start_vertices.
    """
    if max_depth is None:
        raise TypeError("must specify a 'max_depth'")

    G, _ = cugraph.utilities.check_nx_graph(G)

    if isinstance(start_vertices, int):
        start_vertices = [start_vertices]

    if isinstance(start_vertices, list):
        start_vertices = cudf.Series(start_vertices)

    if G.renumbered:
        if isinstance(start_vertices, cudf.DataFrame):
            start_vertices = G.lookup_internal_vertex_id(
                start_vertices,
                start_vertices.columns)
        else:
            start_vertices = G.lookup_internal_vertex_id(start_vertices)

    vertex_set, edge_set, sizes = random_walks_wrapper.random_walks(
        G, start_vertices, max_depth)

    if G.renumbered:
        df_ = cudf.DataFrame()
        df_['vertex_set'] = vertex_set
        df_ = G.unrenumber(df_, 'vertex_set', preserve_order=True)
        vertex_set = cudf.Series(df_['vertex_set'])

    edge_list = defaultdict(list)
    next_path_idx = 0
    offsets = [0]

    df = cudf.DataFrame()
    for s in sizes.values_host:
        for i in range(next_path_idx, s+next_path_idx-1):
            edge_list['src'].append(vertex_set.values_host[i])
            edge_list['dst'].append(vertex_set.values_host[i+1])
        next_path_idx += s
        df = df.append(edge_list, ignore_index=True)
        offsets.append(df.index[-1]+1)
        edge_list['src'].clear()
        edge_list['dst'].clear()
    df['weight'] = edge_set
    offsets = cudf.Series(offsets)

    return df, offsets
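A minimal usage sketch (assumes a toy edge list; the 'src'/'dst' column
names are illustrative):

import cudf
import cugraph

edges = cudf.DataFrame({'src': [0, 1, 2], 'dst': [1, 2, 0]})
G = cugraph.Graph()
G.from_cudf_edgelist(edges, source='src', destination='dst')

# two walks of at most three hops, starting from vertices 0 and 1
df, offsets = random_walks(G, start_vertices=[0, 1], max_depth=3)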
Example #10
    def read_partition(fs,
                       piece,
                       columns,
                       index,
                       categories=(),
                       partitions=(),
                       **kwargs):
        if columns is not None:
            columns = [c for c in columns]
        if isinstance(index, list):
            columns += index

        if isinstance(piece, str):
            # `piece` is a file-path string
            piece = pq.ParquetDatasetPiece(piece,
                                           open_file_func=partial(fs.open,
                                                                  mode="rb"))
        else:
            # `piece` = (path, row_group, partition_keys)
            (path, row_group, partition_keys) = piece
            piece = pq.ParquetDatasetPiece(
                path,
                row_group=row_group,
                partition_keys=partition_keys,
                open_file_func=partial(fs.open, mode="rb"),
            )

        strings_to_cats = kwargs.get("strings_to_categorical", False)
        if cudf.utils.ioutils._is_local_filesystem(fs):
            df = cudf.read_parquet(
                piece.path,
                engine="cudf",
                columns=columns,
                row_group=piece.row_group,
                strings_to_categorical=strings_to_cats,
                **kwargs.get("read", {}),
            )
        else:
            with fs.open(piece.path, mode="rb") as f:
                df = cudf.read_parquet(
                    f,
                    engine="cudf",
                    columns=columns,
                    row_group=piece.row_group,
                    strings_to_categorical=strings_to_cats,
                    **kwargs.get("read", {}),
                )

        if index and index[0] in df.columns:
            df = df.set_index(index[0])

        if len(piece.partition_keys) > 0:
            if partitions is None:
                raise ValueError("Must pass partition sets")
            for i, (name, index2) in enumerate(piece.partition_keys):
                categories = [
                    val.as_py() for val in partitions.levels[i].dictionary
                ]
                sr = cudf.Series(index2).astype(type(index2)).repeat(len(df))
                df[name] = build_categorical_column(
                    categories=categories,
                    codes=as_column(sr._column.base_data,
                                    dtype=sr._column.dtype),
                    size=sr._column.size,
                    offset=sr._column.offset,
                    ordered=False,
                )

        return df
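For reference, the local-filesystem branch above boils down to a plain
cudf.read_parquet call; a minimal sketch with a hypothetical file name and
column list:

import cudf

# hypothetical inputs, shown only to illustrate the fast local path
df = cudf.read_parquet("part.0.parquet", engine="cudf", columns=["x", "y"])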
Example #11
def test_array_func_cudf_index(np_ar, func):
    cudf_index = cudf.core.index.as_index(cudf.Series(np_ar))
    expect = func(np_ar)
    got = func(cudf_index)
    assert_eq(expect, got)
Example #12
def gen_rand_series(dtype, size, **kwargs):
    values = gen_rand(dtype, size, **kwargs)
    if kwargs.get("has_nulls", False):
        return cudf.Series.from_masked_array(values, random_bitmask(size))

    return cudf.Series(values)
Example #13
def test_serialize_generic_index():
    index = cudf.dataframe.index.GenericIndex(cudf.Series(np.arange(10)))
    outindex = deserialize(*serialize(index))
    assert index == outindex
Example #14
def test_serialize_series():
    sr = cudf.Series(np.arange(100))
    outsr = deserialize(*serialize(sr))
    pd.util.testing.assert_series_equal(sr.to_pandas(), outsr.to_pandas())
Example #15
def test_list_to_pandas_nullable_true():
    df = cudf.DataFrame({"a": cudf.Series([[1, 2, 3]])})
    actual = df.to_pandas(nullable=True)
    expected = pd.DataFrame({"a": pd.Series([[1, 2, 3]])})

    assert_eq(actual, expected)
Example #16
    c = cudf.core.dtypes.ListDtype("int32")

    assert hash(a) != hash(c)


@pytest.mark.parametrize(
    "data",
    [
        [[]],
        [[1, 2, 3], [4, 5]],
        [[1, 2, 3], [], [4, 5]],
        [[1, 2, 3], None, [4, 5]],
        [[None, None], [None]],
        [[[[[[1, 2, 3]]]]]],
        cudf.Series([[1, 2]]).iloc[0:0],
        cudf.Series([None, [1, 2]]).iloc[0:1],
    ],
)
def test_len(data):
    gsr = cudf.Series(data)
    psr = gsr.to_pandas()

    expect = psr.map(lambda x: len(x) if x is not None else None)
    got = gsr.list.len()

    assert_eq(expect, got, check_dtype=False)


@pytest.mark.parametrize(
    ("data", "idx"),
Example #17
def sum_of_squares(x):
    x = x.astype("f8")._column
    outcol = cudf._gdf.apply_reduce(libgdf.gdf_sum_squared_generic, x)
    return cudf.Series(outcol)
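This example targets long-removed internal APIs (cudf._gdf and libgdf); an
equivalent reduction using only the public cudf API might look like this
(a sketch, returning a scalar rather than a wrapped column):

import cudf

def sum_of_squares_public(x):
    # square the float64-cast values and reduce on the GPU
    s = cudf.Series(x).astype("f8")
    return (s * s).sum()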
Example #18
def test_take_invalid(invalid, exception):
    gs = cudf.Series([[0, 1], [2, 3]])
    with exception:
        gs.list.take(invalid)
Example #19
            "a_1.0": [1, 0, 0, 0],
            "a_2.0": [0, 1, 0, 0],
            "a_nan": [0, 0, 1, 0],
            "a_null": [0, 0, 0, 1],
        },
        dtype="uint8",
    )
    actual = cudf.get_dummies(df, dummy_na=True, columns=["a"])

    utils.assert_eq(expected, actual)


@pytest.mark.parametrize(
    "data",
    [
        cudf.Series(["abc", "l", "a", "abc", "z", "xyz"]),
        cudf.Index([None, 1, 2, 3.3, None, 0.2]),
        cudf.Series([0.1, 2, 3, None, np.nan]),
        cudf.Series([23678, 324, 1, 324], name="abc"),
    ],
)
@pytest.mark.parametrize("prefix_sep", ["-", "#"])
@pytest.mark.parametrize("prefix", [None, "hi"])
@pytest.mark.parametrize("dtype", ["uint8", "int16"])
def test_get_dummies_array_like(data, prefix_sep, prefix, dtype):
    expected = cudf.get_dummies(
        data, prefix=prefix, prefix_sep=prefix_sep, dtype=dtype
    )
    if isinstance(data, (cudf.Series, cudf.Index)):
        pd_data = data.to_pandas()
    else:
Example #20
def test_get(data, index, expect):
    sr = cudf.Series(data)
    expect = cudf.Series(expect)
    got = sr.list.get(index)
    assert_eq(expect, got)
Example #21
def test_serialize_series():
    sr = cudf.Series(np.arange(100))
    outsr = cudf.Series.deserialize(*sr.serialize())
    assert_eq(sr, outsr)
Example #22
def test_get_nulls():
    with pytest.raises(IndexError, match="list index out of range"):
        sr = cudf.Series([[], [], []])
        sr.list.get(100)
Example #23
def test_groupby_external_series_incorrect_length(series):
    pdf = pd.DataFrame({"x": [1.0, 2.0, 3.0], "y": [1, 2, 1]})
    gdf = DataFrame.from_pandas(pdf)
    pxx = pdf.groupby(pd.Series(series)).x.sum()
    gxx = gdf.groupby(cudf.Series(series)).x.sum()
    assert_eq(pxx, gxx)
Example #24
def test_create_list_series(data):
    expect = pd.Series(data)
    got = cudf.Series(data)
    assert_eq(expect, got)
Example #25
def test_text_replace_tokens():
    sr = cudf.Series(["this is me", "theme music", ""])
    targets = cudf.Series(["is", "me"])

    expected = cudf.Series(["this _ _", "theme music", ""])
    actual = sr.str.replace_tokens(targets, "_")

    assert_eq(expected, actual)

    replacements = cudf.Series(["IS", "ME"])
    expected = cudf.Series(["this IS ME", "theme music", ""])
    actual = sr.str.replace_tokens(targets, replacements)

    assert_eq(expected, actual)

    sr = cudf.Series([
        "this is a small text ☕",
        "this \t\t is ; ; - + a looooooooooonnnnnnnggggggg text \n\t",
        "emptyme",
    ])
    targets = cudf.Series(
        ["a", "☕", "\t", "looooooooooonnnnnnnggggggg", "emptyme"])
    replacements = cudf.Series(["the", "🚒", "🚒🚒🚒🚒", "🔥🔥", ""])

    expected = cudf.Series([
        "this is the small text 🚒",
        "this \t\t is ; ; - + the 🔥🔥 text \n\t",
        "",
    ])
    actual = sr.str.replace_tokens(targets, replacements)

    assert_eq(expected, actual)

    sr = cudf.Series(
        ["All-we-need;is;🔥", "\tall-we-need0is;🌊", "all;we:need+is;🌬"])
    targets = cudf.Series(["🌬", "🔥", "🌊"])
    replacements = "🚰"

    expected = cudf.Series(
        ["All-we-need;is;🚰", "\tall-we-need0is;🚰", "all;we:need+is;🚰"])
    actual = sr.str.replace_tokens(targets, replacements, delimiter=";")

    assert_eq(expected, actual)
    assert_eq(sr, sr.str.replace_tokens(targets, replacements))
    assert_eq(sr, sr.str.replace_tokens([""], [""]))
Example #26
def test_contains_scalar(data, scalar, expect):
    sr = cudf.Series(data)
    expect = cudf.Series(expect)
    got = sr.list.contains(cudf.Scalar(scalar, sr.dtype.element_type))
    assert_eq(expect, got)
Example #27
def bfs_df_pregel(_df, start, src_col='src', dst_col='dst', copy_data=True):
    """
    This function executes an unweighted Breadth-First-Search (BFS) traversal
    to find the distances and predecessors from a specified starting vertex

    NOTE: Only reachable vertices are returned
    NOTE: data is not sorted

    Parameters
    ----------
    _df : cudf.DataFrame
        A dataframe containing the source and destination edge list

    start : same type as 'src_col' and 'dst_col'
        The index of the graph vertex from which the traversal begins

    src_col : string
        The source column name

    dst_col : string
        The destination column name

    copy_data : bool
        Whether the input dataframe may be modified in place or a copy
        should be made


    Returns
    -------
    df : cudf.DataFrame
        df['vertex'][i] gives the vertex id of the i'th vertex
        df['distance'][i] gives the path distance for the i'th vertex
            from the starting vertex
        df['predecessor'][i] gives for the i'th vertex the vertex it was
        reached from in the traversal

    Examples
    --------
    >>> data_df = cudf.read_csv('datasets/karate.csv',
    ...                         delimiter=' ', header=None)
    >>> df = cugraph.pregel_bfs(data_df, 1, '0', '1')

    """

    # extract the src and dst into a dataframe that can be modified
    if copy_data:
        coo_data = _df[[src_col, dst_col]]
    else:
        coo_data = _df

    coo_data.rename(columns={src_col: 'src', dst_col: 'dst'}, inplace=True)

    # convert the "start" vertex into a series
    frontier = cudf.Series(start).to_frame('dst')

    # create the answer DF
    answer = cudf.DataFrame()
    answer['vertex'] = start
    answer['distance'] = 0
    answer['predecessor'] = -1

    # init some variables
    distance = 0
    done = False

    while not done:

        # ---------------------------------
        # update the distance and add it to the dataframe
        distance = distance + 1
        frontier['distance'] = distance

        # -----------------------------------
        # Remove all instances of the frontier vertices from the 'dst' side
        # we do not want to hop to a vertex that has already been seen
        coo_data = coo_data.merge(frontier, on=['dst'], how='left')
        coo_data = coo_data[coo_data.distance.isnull()]
        coo_data.drop(columns='distance', inplace=True)

        # now update column names for finding source vertices
        frontier.rename(columns={'dst': 'src'}, inplace=True)

        # ---------------------------------
        # merge the list of vertices and distances with the COO list
        # there are two sets of results that we get from the "hop_df" merge
        # (A) the set of edges that start with a vertex in the frontier set
        #     - this goes into the answer set
        #     - this also forms the next frontier set
        # (B) the set of edges that did not start with a frontier vertex
        #     - these form the new coo_data set
        hop_df = coo_data.merge(frontier, on=['src'], how='left')

        # ---------------------------------
        # (A) get the data where the 'src' was in the frontier list
        # create a new dataframe of vertices to hop out from (the 'dst')
        one_hop = hop_df.query("distance == @distance")
        frontier = one_hop['dst'].to_frame('dst')

        # ---------------------------------
        # (B) get all the edges that were not touched
        coo_data = hop_df[hop_df.distance.isnull()]
        coo_data.drop(columns='distance', inplace=True)

        # ---------------------------------
        # update the answer
        one_hop.rename(
            columns={'dst': 'vertex', 'src': 'predecessor'}, inplace=True)

        # remove duplicates: the smallest vertex wins
        aggsOut = OrderedDict()
        aggsOut['predecessor'] = 'min'
        aggsOut['distance'] = 'min'
        _a = one_hop.groupby(['vertex'], as_index=False).agg(aggsOut)

        answer = cudf.concat([answer, _a])

        if len(coo_data) == 0:
            done = True

        if not done and len(frontier) == 0:
            done = True

    # all done, return the answer
    return answer
Example #28
def test_contains_null_search_key(data, expect):
    sr = cudf.Series(data)
    expect = cudf.Series(expect, dtype="bool")
    got = sr.list.contains(cudf.Scalar(cudf.NA, sr.dtype.element_type))
    assert_eq(expect, got)
Example #29
        {
            "join_col": exp_join_data,
            "B_x": exp_other_data,
            "B_y": exp_other_data,
        }
    )
    expect = expect.set_index("join_col")
    got = gdf_l.join(gdf_r, how="inner", lsuffix="_x", rsuffix="_y")

    assert_eq(expect, got)


@pytest.mark.parametrize(
    "lhs",
    [
        cudf.Series([1, 2, 3], name="a"),
        cudf.DataFrame({"a": [2, 3, 4], "c": [4, 5, 6]}),
    ],
)
@pytest.mark.parametrize(
    "rhs",
    [
        cudf.Series([1, 2, 3], name="b"),
        cudf.DataFrame({"b": [2, 3, 4], "c": [4, 5, 6]}),
    ],
)
@pytest.mark.parametrize(
    "how", ["left", "inner", "outer", "leftanti", "leftsemi"]
)
@pytest.mark.parametrize(
    "kwargs",
Example #30
import cudf
import torch
import s3fs
import transformers
from clx.analytics.cybert import Cybert

S3_BASE_PATH = "models.huggingface.co/bert/raykallen/cybert_apache_parser"
CONFIG_FILENAME = "config.json"
MODEL_FILENAME = "pytorch_model.bin"

fs = s3fs.S3FileSystem(anon=True)
fs.get(S3_BASE_PATH + "/" + MODEL_FILENAME, MODEL_FILENAME)
fs.get(S3_BASE_PATH + "/" + CONFIG_FILENAME, CONFIG_FILENAME)

cyparse = Cybert()

input_logs = cudf.Series(
    ['109.169.248.247 - -', 'POST /administrator/index.php HTTP/1.1 200 4494'])


def get_expected_preprocess():
    tokens = torch.tensor([[
        11523, 119, 20065, 119, 27672, 119, 26049, 118, 118, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
    ],
                           [
                               153, 9025, 1942, 120, 11065, 120, 7448, 119,
                               185, 16194, 145, 20174, 2101, 120, 122, 119,
                               122, 2363, 3140, 1580, 1527, 0, 0, 0, 0, 0, 0,