def test_serialize_empty_string():
    pd_series = pd.Series([], dtype='str')
    gd_series = cudf.Series([], dtype='str')

    recreated = deserialize(*serialize(gd_series))
    pd.util.testing.assert_series_equal(recreated.to_pandas(), pd_series)
def test_text_subword_tokenize(tmpdir):
    sr = cudf.Series(
        [
            "This is a test",
            "A test this is",
            "Is test a this",
            "Test test",
            "this This",
        ]
    )
    hash_file = tmpdir.mkdir("nvtext").join("tmp_hashed_vocab.txt")
    content = "1\n0\n23\n"
    coefficients = [65559] * 23
    for c in coefficients:
        content = content + str(c) + " 0\n"
    # based on values from the bert_hash_table.txt file for the
    # test words used here: 'this' 'is' 'a' 'test'
    table = [0] * 23
    table[0] = 3015668
    table[1] = 6205475701751155871
    table[5] = 6358029
    table[16] = 451412625363
    table[20] = 6206321707968235495
    content = content + "23\n"
    for v in table:
        content = content + str(v) + "\n"
    content = content + "100\n101\n102\n\n"
    hash_file.write(content)

    tokens, masks, metadata = sr.str.subword_tokenize(str(hash_file), 8, 8)

    expected_tokens = cupy.asarray(
        [
            2023, 2003, 1037, 3231, 0, 0, 0, 0,
            1037, 3231, 2023, 2003, 0, 0, 0, 0,
            2003, 3231, 1037, 2023, 0, 0, 0, 0,
            3231, 3231, 0, 0, 0, 0, 0, 0,
            2023, 2023, 0, 0, 0, 0, 0, 0,
        ],
        dtype=np.uint32,
    )
    assert_eq(expected_tokens, tokens)

    expected_masks = cupy.asarray(
        [
            1, 1, 1, 1, 0, 0, 0, 0,
            1, 1, 1, 1, 0, 0, 0, 0,
            1, 1, 1, 1, 0, 0, 0, 0,
            1, 1, 0, 0, 0, 0, 0, 0,
            1, 1, 0, 0, 0, 0, 0, 0,
        ],
        dtype=np.uint32,
    )
    assert_eq(expected_masks, masks)

    expected_metadata = cupy.asarray(
        [0, 0, 3, 1, 0, 3, 2, 0, 3, 3, 0, 1, 4, 0, 1], dtype=np.uint32
    )
    assert_eq(expected_metadata, metadata)
def test_ends_with_empty_suffix():
    test_strs = cudf.Series(["happy", "sad"])
    expect = np.asarray([True, True])
    got = porter_stemmer_rules.ends_with_suffix(test_strs, "").values.get()
    np.testing.assert_array_equal(got, expect)
def test_serialize_generic_index():
    index = cudf.core.index.GenericIndex(cudf.Series(np.arange(10)))
    outindex = cudf.core.index.GenericIndex.deserialize(*index.serialize())
    assert_eq(index, outindex)
def test_character_tokenize_series():
    sr = cudf.Series(
        [
            "hello world",
            "sdf",
            None,
            "goodbye, one-two:three~four+five_six@sev"
            "en#eight^nine heŒŽ‘•™œ$µ¾ŤƠé DŽ",
        ]
    )
    expected = cudf.Series(
        [
            "h", "e", "l", "l", "o", " ", "w", "o", "r", "l", "d",
            "s", "d", "f",
            "g", "o", "o", "d", "b", "y", "e", ",", " ", "o", "n", "e",
            "-", "t", "w", "o", ":", "t", "h", "r", "e", "e", "~", "f",
            "o", "u", "r", "+", "f", "i", "v", "e", "_", "s", "i", "x",
            "@", "s", "e", "v", "e", "n", "#", "e", "i", "g", "h", "t",
            "^", "n", "i", "n", "e", " ", "h", "e", "Œ", "Ž", "‘", "•",
            "™", "œ", "$", "µ", "¾", "Ť", "Ơ", "é", " ", "DŽ",
        ]
    )

    actual = sr.str.character_tokenize()
    assert_eq(expected, actual)

    sr = cudf.Series([""])
    expected = cudf.Series([], dtype="object")

    actual = sr.str.character_tokenize()
    assert_eq(expected, actual)

    sr = cudf.Series(["a"])
    expected = cudf.Series(["a"])

    actual = sr.str.character_tokenize()
    assert_eq(expected, actual)
def test_error_with_null_cudf_series(func):
    s_1 = cudf.Series([1, 2])
    s_2 = cudf.Series([1, 2, None])

    with pytest.raises(ValueError):
        func(s_1, s_2)
# Copyright (c) 2018, NVIDIA CORPORATION.

import msgpack
import numpy as np
import pandas as pd
import pytest

import cudf
from cudf.tests import utils
from cudf.tests.utils import assert_eq


@pytest.mark.parametrize(
    "df",
    [
        lambda: cudf.Series([1, 2, 3]),
        lambda: cudf.Series([1, 2, 3], index=[4, 5, 6]),
        lambda: cudf.Series([1, None, 3]),
        lambda: cudf.Series([1, 2, 3], index=[4, 5, None]),
        lambda: cudf.Series([1, 2, 3])[:2],
        lambda: cudf.Series([1, 2, 3])[:2]._column,
        lambda: cudf.Series(["a", "bb", "ccc"]),
        lambda: cudf.Series(["a", None, "ccc"]),
        lambda: cudf.DataFrame({"x": [1, 2, 3]}),
        lambda: cudf.DataFrame({"x": [1, 2, 3], "y": [1.0, None, 3.0]}),
        lambda: cudf.DataFrame({"x": [1, 2, 3], "y": [1.0, 2.0, 3.0]
def test_array_func_missing_cudf_index(np_ar, func):
    cudf_index = cudf.core.index.as_index(cudf.Series(np_ar))
    with pytest.raises(TypeError):
        func(cudf_index)
def random_walks(G, start_vertices, max_depth=None):
    """
    Compute random walks for each node in 'start_vertices'.

    Parameters
    ----------
    G : cuGraph.Graph or networkx.Graph
        The graph can be either directed (DiGraph) or undirected (Graph).
        Weights in the graph are ignored.
        Use weight parameter if weights need to be considered
        (currently not supported).

    start_vertices : int or list or cudf.Series or cudf.DataFrame
        A single node or a list or a cudf.Series of nodes from which to run
        the random walks. In case of multi-column vertices it should be
        a cudf.DataFrame.

    max_depth : int
        The maximum depth of the random walks.

    Returns
    -------
    random_walks_edge_lists : cudf.DataFrame
        GPU data frame containing all random walks sources identifiers,
        destination identifiers, edge weights.

    seeds_offsets : cudf.Series
        Series containing the starting offset in the returned edge list
        for each vertex in start_vertices.
    """
    if max_depth is None:
        raise TypeError("must specify a 'max_depth'")

    G, _ = cugraph.utilities.check_nx_graph(G)

    if isinstance(start_vertices, int):
        start_vertices = [start_vertices]

    if isinstance(start_vertices, list):
        start_vertices = cudf.Series(start_vertices)

    if G.renumbered is True:
        if isinstance(start_vertices, cudf.DataFrame):
            start_vertices = G.lookup_internal_vertex_id(
                start_vertices, start_vertices.columns
            )
        else:
            start_vertices = G.lookup_internal_vertex_id(start_vertices)

    vertex_set, edge_set, sizes = random_walks_wrapper.random_walks(
        G, start_vertices, max_depth
    )

    if G.renumbered:
        df_ = cudf.DataFrame()
        df_['vertex_set'] = vertex_set
        df_ = G.unrenumber(df_, 'vertex_set', preserve_order=True)
        vertex_set = cudf.Series(df_['vertex_set'])

    edge_list = defaultdict(list)
    next_path_idx = 0
    offsets = [0]
    df = cudf.DataFrame()

    for s in sizes.values_host:
        for i in range(next_path_idx, s + next_path_idx - 1):
            edge_list['src'].append(vertex_set.values_host[i])
            edge_list['dst'].append(vertex_set.values_host[i + 1])
        next_path_idx += s
        df = df.append(edge_list, ignore_index=True)
        offsets.append(df.index[-1] + 1)
        edge_list['src'].clear()
        edge_list['dst'].clear()

    df['weight'] = edge_set
    offsets = cudf.Series(offsets)

    return df, offsets
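# Illustrative usage sketch (not from the source): build a small cuGraph Graph
# from a cudf edge list and request depth-4 walks from two seed vertices. The
# edge data and variable names below are made-up examples.
import cudf
import cugraph

edge_df = cudf.DataFrame({"src": [0, 1, 1, 2], "dst": [1, 2, 3, 3]})
G = cugraph.Graph()
G.from_cudf_edgelist(edge_df, source="src", destination="dst")

# walks_df holds the per-hop (src, dst, weight) rows of every walk; offsets
# gives the starting position of each seed's walk within walks_df.
walks_df, offsets = random_walks(G, start_vertices=[0, 1], max_depth=4)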
def read_partition(
    fs, piece, columns, index, categories=(), partitions=(), **kwargs
):
    if columns is not None:
        columns = [c for c in columns]
    if isinstance(index, list):
        columns += index

    if isinstance(piece, str):
        # `piece` is a file-path string
        piece = pq.ParquetDatasetPiece(
            piece, open_file_func=partial(fs.open, mode="rb")
        )
    else:
        # `piece` = (path, row_group, partition_keys)
        (path, row_group, partition_keys) = piece
        piece = pq.ParquetDatasetPiece(
            path,
            row_group=row_group,
            partition_keys=partition_keys,
            open_file_func=partial(fs.open, mode="rb"),
        )

    strings_to_cats = kwargs.get("strings_to_categorical", False)
    if cudf.utils.ioutils._is_local_filesystem(fs):
        df = cudf.read_parquet(
            piece.path,
            engine="cudf",
            columns=columns,
            row_group=piece.row_group,
            strings_to_categorical=strings_to_cats,
            **kwargs.get("read", {}),
        )
    else:
        with fs.open(piece.path, mode="rb") as f:
            df = cudf.read_parquet(
                f,
                engine="cudf",
                columns=columns,
                row_group=piece.row_group,
                strings_to_categorical=strings_to_cats,
                **kwargs.get("read", {}),
            )

    if index and index[0] in df.columns:
        df = df.set_index(index[0])

    if len(piece.partition_keys) > 0:
        if partitions is None:
            raise ValueError("Must pass partition sets")
        for i, (name, index2) in enumerate(piece.partition_keys):
            categories = [
                val.as_py() for val in partitions.levels[i].dictionary
            ]
            sr = cudf.Series(index2).astype(type(index2)).repeat(len(df))
            df[name] = build_categorical_column(
                categories=categories,
                codes=as_column(sr._column.base_data, dtype=sr._column.dtype),
                size=sr._column.size,
                offset=sr._column.offset,
                ordered=False,
            )

    return df
def test_array_func_cudf_index(np_ar, func):
    cudf_index = cudf.core.index.as_index(cudf.Series(np_ar))
    expect = func(np_ar)
    got = func(cudf_index)
    assert_eq(expect, got)
def gen_rand_series(dtype, size, **kwargs):
    values = gen_rand(dtype, size, **kwargs)
    if kwargs.get("has_nulls", False):
        return cudf.Series.from_masked_array(values, random_bitmask(size))

    return cudf.Series(values)
def test_serialize_generic_index():
    index = cudf.dataframe.index.GenericIndex(cudf.Series(np.arange(10)))
    outindex = deserialize(*serialize(index))
    assert index == outindex
def test_serialize_series():
    sr = cudf.Series(np.arange(100))
    outsr = deserialize(*serialize(sr))
    pd.util.testing.assert_series_equal(sr.to_pandas(), outsr.to_pandas())
def test_list_to_pandas_nullable_true():
    df = cudf.DataFrame({"a": cudf.Series([[1, 2, 3]])})
    actual = df.to_pandas(nullable=True)
    expected = pd.DataFrame({"a": pd.Series([[1, 2, 3]])})

    assert_eq(actual, expected)
    c = cudf.core.dtypes.ListDtype("int32")
    assert hash(a) != hash(c)


@pytest.mark.parametrize(
    "data",
    [
        [[]],
        [[1, 2, 3], [4, 5]],
        [[1, 2, 3], [], [4, 5]],
        [[1, 2, 3], None, [4, 5]],
        [[None, None], [None]],
        [[[[[[1, 2, 3]]]]]],
        cudf.Series([[1, 2]]).iloc[0:0],
        cudf.Series([None, [1, 2]]).iloc[0:1],
    ],
)
def test_len(data):
    gsr = cudf.Series(data)
    psr = gsr.to_pandas()

    expect = psr.map(lambda x: len(x) if x is not None else None)
    got = gsr.list.len()

    assert_eq(expect, got, check_dtype=False)


@pytest.mark.parametrize(
    ("data", "idx"),
def sum_of_squares(x):
    x = x.astype("f8")._column
    outcol = cudf._gdf.apply_reduce(libgdf.gdf_sum_squared_generic, x)
    return cudf.Series(outcol)
def test_take_invalid(invalid, exception):
    gs = cudf.Series([[0, 1], [2, 3]])
    with exception:
        gs.list.take(invalid)
"a_1.0": [1, 0, 0, 0], "a_2.0": [0, 1, 0, 0], "a_nan": [0, 0, 1, 0], "a_null": [0, 0, 0, 1], }, dtype="uint8", ) actual = cudf.get_dummies(df, dummy_na=True, columns=["a"]) utils.assert_eq(expected, actual) @pytest.mark.parametrize( "data", [ cudf.Series(["abc", "l", "a", "abc", "z", "xyz"]), cudf.Index([None, 1, 2, 3.3, None, 0.2]), cudf.Series([0.1, 2, 3, None, np.nan]), cudf.Series([23678, 324, 1, 324], name="abc"), ], ) @pytest.mark.parametrize("prefix_sep", ["-", "#"]) @pytest.mark.parametrize("prefix", [None, "hi"]) @pytest.mark.parametrize("dtype", ["uint8", "int16"]) def test_get_dummies_array_like(data, prefix_sep, prefix, dtype): expected = cudf.get_dummies( data, prefix=prefix, prefix_sep=prefix_sep, dtype=dtype ) if isinstance(data, (cudf.Series, cudf.Index)): pd_data = data.to_pandas() else:
def test_get(data, index, expect):
    sr = cudf.Series(data)
    expect = cudf.Series(expect)
    got = sr.list.get(index)
    assert_eq(expect, got)
def test_serialize_series():
    sr = cudf.Series(np.arange(100))
    outsr = cudf.Series.deserialize(*sr.serialize())
    assert_eq(sr, outsr)
def test_get_nulls():
    with pytest.raises(IndexError, match="list index out of range"):
        sr = cudf.Series([[], [], []])
        sr.list.get(100)
def test_groupby_external_series_incorrect_length(series):
    pdf = pd.DataFrame({"x": [1.0, 2.0, 3.0], "y": [1, 2, 1]})
    gdf = DataFrame.from_pandas(pdf)

    pxx = pdf.groupby(pd.Series(series)).x.sum()
    gxx = gdf.groupby(cudf.Series(series)).x.sum()
    assert_eq(pxx, gxx)
def test_create_list_series(data):
    expect = pd.Series(data)
    got = cudf.Series(data)
    assert_eq(expect, got)
def test_text_replace_tokens():
    sr = cudf.Series(["this is me", "theme music", ""])
    targets = cudf.Series(["is", "me"])

    expected = cudf.Series(["this _ _", "theme music", ""])
    actual = sr.str.replace_tokens(targets, "_")

    assert_eq(expected, actual)

    replacements = cudf.Series(["IS", "ME"])
    expected = cudf.Series(["this IS ME", "theme music", ""])
    actual = sr.str.replace_tokens(targets, replacements)

    assert_eq(expected, actual)

    sr = cudf.Series(
        [
            "this is a small text ☕",
            "this \t\t is ; ; - + a looooooooooonnnnnnnggggggg text \n\t",
            "emptyme",
        ],
    )
    targets = cudf.Series(
        ["a", "☕", "\t", "looooooooooonnnnnnnggggggg", "emptyme"]
    )
    replacements = cudf.Series(["the", "🚒", "🚒🚒🚒🚒", "🔥🔥", ""])

    expected = cudf.Series(
        [
            "this is the small text 🚒",
            "this \t\t is ; ; - + the 🔥🔥 text \n\t",
            "",
        ]
    )
    actual = sr.str.replace_tokens(targets, replacements)

    assert_eq(expected, actual)

    sr = cudf.Series(
        ["All-we-need;is;🔥", "\tall-we-need0is;🌊", "all;we:need+is;🌬"]
    )
    targets = cudf.Series(["🌬", "🔥", "🌊"])
    replacements = "🚰"

    expected = cudf.Series(
        ["All-we-need;is;🚰", "\tall-we-need0is;🚰", "all;we:need+is;🚰"]
    )
    actual = sr.str.replace_tokens(targets, replacements, delimiter=";")

    assert_eq(expected, actual)

    assert_eq(sr, sr.str.replace_tokens(targets, replacements))
    assert_eq(sr, sr.str.replace_tokens([""], [""]))
def test_contains_scalar(data, scalar, expect):
    sr = cudf.Series(data)
    expect = cudf.Series(expect)
    got = sr.list.contains(cudf.Scalar(scalar, sr.dtype.element_type))
    assert_eq(expect, got)
def bfs_df_pregel(_df, start, src_col='src', dst_col='dst', copy_data=True):
    """
    This function executes an unweighted Breadth-First-Search (BFS) traversal
    to find the distances and predecessors from a specified starting vertex

    NOTE: Only reachable vertices are returned
    NOTE: data is not sorted

    Parameters
    ----------
    _df : cudf.dataframe
        a dataframe containing the source and destination edge list

    start : same type as 'src' and 'dst'
        The index of the graph vertex from which the traversal begins

    src_col : string
        the source column name

    dst_col : string
        the destination column name

    copy_data : Bool
        whether we can manipulate the dataframe or if a copy should be made

    Returns
    -------
    df : cudf.DataFrame
        df['vertex'][i] gives the vertex id of the i'th vertex
        df['distance'][i] gives the path distance for the i'th vertex from
        the starting vertex
        df['predecessor'][i] gives for the i'th vertex the vertex it was
        reached from in the traversal

    Examples
    --------
    >>> data_df = cudf.read_csv('datasets/karate.csv', delimiter=' ',
    >>>                         header=None)
    >>> df = cugraph.pregel_bfs(data_df, 1, '0', '1')
    """

    # extract the src and dst into a dataframe that can be modified
    if copy_data:
        coo_data = _df[[src_col, dst_col]]
    else:
        coo_data = _df

    coo_data.rename(columns={src_col: 'src', dst_col: 'dst'}, inplace=True)

    # convert the "start" vertex into a series
    frontier = cudf.Series(start).to_frame('dst')

    # create the answer DF
    answer = cudf.DataFrame()
    answer['vertex'] = start
    answer['distance'] = 0
    answer['predecessor'] = -1

    # init some variables
    distance = 0
    done = False

    while not done:

        # ---------------------------------
        # update the distance and add it to the dataframe
        distance = distance + 1
        frontier['distance'] = distance

        # -----------------------------------
        # Remove all instances of the frontier vertices from the 'dst' side:
        # we do not want to hop to a vertex that has already been seen
        coo_data = coo_data.merge(frontier, on=['dst'], how='left')
        coo_data = coo_data[coo_data.distance.isnull()]
        coo_data.drop('distance', inplace=True)

        # now update column names for finding source vertices
        frontier.rename(columns={'dst': 'src'}, inplace=True)

        # ---------------------------------
        # merge the list of vertices and distances with the COO list
        # there are two sets of results that we get from the "hop_df" merge
        # (A) the set of edges that start with a vertex in the frontier set
        #     - this goes into the answer set
        #     - this also forms the next frontier set
        # (B) the set of edges that did not start with a frontier vertex
        #     - this forms the new set of coo_data
        hop_df = coo_data.merge(frontier, on=['src'], how='left')

        # ---------------------------------
        # (A) get the data where the 'src' was in the frontier list
        # create a new dataframe of vertices to hop out from (the 'dst')
        one_hop = hop_df.query("distance == @distance")
        frontier = one_hop['dst'].to_frame('dst')

        # ---------------------------------
        # (B) get all the edges that were not touched
        coo_data = hop_df[hop_df.distance.isnull()]
        coo_data.drop('distance', inplace=True)

        # ---------------------------------
        # update the answer
        one_hop.rename(
            columns={'dst': 'vertex', 'src': 'predecessor'}, inplace=True
        )

        # remove duplicates. smallest vertex wins
        aggsOut = OrderedDict()
        aggsOut['predecessor'] = 'min'
        aggsOut['distance'] = 'min'
        _a = one_hop.groupby(['vertex'], as_index=False).agg(aggsOut)

        answer = cudf.concat([answer, _a])

        if len(coo_data) == 0:
            done = True

        if not done and len(frontier) == 0:
            done = True

    # all done, return the answer
    return answer
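# Illustrative usage sketch (not from the source): run the pregel-style BFS
# over a toy edge list starting from vertex 1. The column names and data are
# made-up examples; the result lists only vertices reachable from the start.
import cudf

edges = cudf.DataFrame({"src": [1, 1, 2, 3], "dst": [2, 3, 4, 4]})
bfs_result = bfs_df_pregel(edges, start=1, src_col="src", dst_col="dst")
# bfs_result has columns: vertex, distance, predecessor
print(bfs_result)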
def test_contains_null_search_key(data, expect):
    sr = cudf.Series(data)
    expect = cudf.Series(expect, dtype="bool")
    got = sr.list.contains(cudf.Scalar(cudf.NA, sr.dtype.element_type))
    assert_eq(expect, got)
{ "join_col": exp_join_data, "B_x": exp_other_data, "B_y": exp_other_data, } ) expect = expect.set_index("join_col") got = gdf_l.join(gdf_r, how="inner", lsuffix="_x", rsuffix="_y") assert_eq(expect, got) @pytest.mark.parametrize( "lhs", [ cudf.Series([1, 2, 3], name="a"), cudf.DataFrame({"a": [2, 3, 4], "c": [4, 5, 6]}), ], ) @pytest.mark.parametrize( "rhs", [ cudf.Series([1, 2, 3], name="b"), cudf.DataFrame({"b": [2, 3, 4], "c": [4, 5, 6]}), ], ) @pytest.mark.parametrize( "how", ["left", "inner", "outer", "leftanti", "leftsemi"] ) @pytest.mark.parametrize( "kwargs",
import torch

import s3fs
import transformers

import cudf
from clx.analytics.cybert import Cybert

S3_BASE_PATH = "models.huggingface.co/bert/raykallen/cybert_apache_parser"
CONFIG_FILENAME = "config.json"
MODEL_FILENAME = "pytorch_model.bin"

fs = s3fs.S3FileSystem(anon=True)
fs.get(S3_BASE_PATH + "/" + MODEL_FILENAME, MODEL_FILENAME)
fs.get(S3_BASE_PATH + "/" + CONFIG_FILENAME, CONFIG_FILENAME)

cyparse = Cybert()
input_logs = cudf.Series(
    [
        "109.169.248.247 - -",
        "POST /administrator/index.php HTTP/1.1 200 4494",
    ]
)


def get_expected_preprocess():
    tokens = torch.tensor(
        [
            [
                11523, 119, 20065, 119, 27672, 119, 26049, 118, 118, 0, 0, 0,
                0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
            ],
            [
                153, 9025, 1942, 120, 11065, 120, 7448, 119, 185, 16194, 145,
                20174, 2101, 120, 122, 119, 122, 2363, 3140, 1580, 1527, 0, 0,
                0, 0, 0, 0,