def test_multiindex_sort_values(pmidx, ascending, return_indexer):
    """Check that ``sort_values`` on the cudf conversion of ``pmidx`` matches.

    ``pmidx`` is converted with ``cudf.from_pandas`` and both objects are
    sorted with the same ``ascending`` / ``return_indexer`` arguments.  When
    ``return_indexer`` is truthy each result is indexed as a pair: element 1
    (the indexer) is compared first, then element 0 (the sorted values).
    """
    midx = cudf.from_pandas(pmidx)

    expected = pmidx.sort_values(
        ascending=ascending, return_indexer=return_indexer
    )
    actual = midx.sort_values(
        ascending=ascending, return_indexer=return_indexer
    )

    if return_indexer:
        # Compare the indexers, then fall through to compare the sorted part.
        assert_eq(expected[1], actual[1])
        expected, actual = expected[0], actual[0]

    assert_eq(expected, actual)
def test_series_setitem_categorical():
    """Apply the same ``__setitem__`` calls to a pandas and a cudf series.

    The series is categorical; after every assignment the two series are
    compared with ``assert_eq``.  Scalar, list and slice keys are exercised,
    and the category ``"e"`` is added before it is first assigned.
    """
    psr = pd.Series(["a", "b", "a", "c", "d"], dtype="category")
    gsr = cudf.from_pandas(psr)

    def assign_and_compare(key, value):
        # Reads psr/gsr from the enclosing scope at call time, so the
        # rebinding done by add_categories below is picked up.
        psr[key] = value
        gsr[key] = value
        assert_eq(psr, gsr)

    assign_and_compare(0, "d")

    psr = psr.cat.add_categories(["e"])
    gsr = gsr.cat.add_categories(["e"])

    assign_and_compare(0, "e")
    assign_and_compare([0, 1], "b")
    assign_and_compare(slice(0, 3), "e")
def test_concat_join_one_df(ignore_index, sort, join, axis):
    """Concatenate a single-frame list with pandas and with ``gd``.

    Both ``concat`` calls receive the same ``sort``, ``join``,
    ``ignore_index`` and ``axis`` keyword arguments, and the results are
    compared with ``assert_eq``.
    """
    pdf1 = pd.DataFrame({
        "x": range(10),
        "y": list(map(float, range(10))),
        "z": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
    })
    gdf1 = gd.from_pandas(pdf1)

    concat_kwargs = {
        "sort": sort,
        "join": join,
        "ignore_index": ignore_index,
        "axis": axis,
    }
    expected = pd.concat([pdf1], **concat_kwargs)
    got = gd.concat([gdf1], **concat_kwargs)

    assert_eq(expected, got)
def test_serialize(df):
    """Round-trip the object built by ``df`` through serialize/deserialize.

    ``df`` is called with no arguments to build the object under test.  If
    the module name of the object's type does not contain ``"cudf"`` the
    object is converted with ``cudf.from_pandas`` first.  The header must be
    accepted by ``msgpack.dumps`` and the deserialized object must compare
    equal to the original.
    """
    a = df()
    if "cudf" not in type(a).__module__:
        a = cudf.from_pandas(a)

    header, frames = a.serialize()
    msgpack.dumps(header)  # ensure that header is msgpack serializable

    # Count the frames that are neither ``bytes`` nor ``memoryview``.
    ndevice = sum(
        1 for frame in frames if not isinstance(frame, (bytes, memoryview))
    )

    # Indices etc. will not be DeviceNDArray
    # but data should be...
    # NOTE(review): this check looks at ``df`` (the callable passed in), not
    # at ``a`` (the object it built), and tests ``_cols`` while reading
    # ``_data`` -- confirm that is intended.
    if hasattr(df, "_cols"):
        assert ndevice >= len(df._data)
    else:
        assert ndevice > 0

    b = type(a).deserialize(header, frames)
    assert_eq(a, b)
def get_mean_reciprocal_rank(sub):
    """Return the mean of ``1 / (1 + order)`` over the matching rows.

    ``sub`` is a pandas dataframe that should have the columns ``'row_id'``,
    ``'prob'``, ``'reference'`` and ``'item_id'``, sorted by ``prob`` in
    descending order for each group.  It is moved to the GPU with
    ``gd.from_pandas``, an ``order`` column is produced per ``row_id`` group
    by ``apply_grouped``, and the result is brought back to pandas.  Only
    rows where ``reference == item_id`` contribute to the returned mean.
    """
    gpu_sub = gd.from_pandas(sub)

    def get_order_in_group(prob, row_id, order):
        # Kernel handed to apply_grouped: each thread starts at its own
        # threadIdx.x and steps by blockDim.x, storing i into order[i].
        for i in range(cuda.threadIdx.x, len(prob), cuda.blockDim.x):
            order[i] = i

    grouped = gpu_sub.groupby('row_id', method="cudf")
    ranked = grouped.apply_grouped(
        get_order_in_group,
        incols=['prob', 'row_id'],
        outcols={'order': np.int32},
        tpb=32,
    ).to_pandas()

    # Turn the stored order value into a reciprocal rank.
    ranked['order'] = 1.0 / (1 + ranked['order'])
    hits = ranked[ranked['reference'] == ranked['item_id']]
    return hits['order'].mean()
def overlap_coefficient(G, ebunch=None):
    """
    NetworkX similar API.  See 'jaccard' for a description.

    ``G`` is passed through ``check_nx_graph``, which also reports whether
    the input was a NetworkX graph.  For NetworkX input, ``ebunch`` (when
    given) is turned into a cudf dataframe of vertex pairs and the result of
    ``overlap`` is converted with ``df_edge_score_to_dictionary``; otherwise
    the value returned by ``overlap`` is returned unchanged.
    """
    G, isNx = check_nx_graph(G)
    from_networkx = isNx is True

    vertex_pair = None
    if from_networkx and ebunch is not None:
        vertex_pair = cudf.from_pandas(pd.DataFrame(ebunch))

    scores = overlap(G, vertex_pair)
    if not from_networkx:
        return scores

    return df_edge_score_to_dictionary(
        scores, k="overlap_coeff", src="source", dst="destination"
    )
def test_to_numeric_downcast_string_large_float(data, downcast):
    """Exercise ``cudf.to_numeric`` with a ``downcast`` argument.

    For ``downcast == "float"`` the pandas and cudf results are expected to
    differ (``assert_eq`` must raise ``AssertionError``).  For every other
    ``downcast`` value cudf must emit a ``UserWarning`` and return a series
    equal to ``[inf, -inf]``.
    """
    ps = pd.Series(data)
    gs = cudf.from_pandas(ps)

    if downcast != "float":
        expected = pd.Series([np.inf, -np.inf])
        warning_pattern = (
            "Downcasting from float to int "
            "will be limited by float32 precision."
        )
        with pytest.warns(UserWarning, match=warning_pattern):
            got = cudf.to_numeric(gs, downcast=downcast)

        assert_eq(expected, got)
        return

    expected = pd.to_numeric(ps, downcast=downcast)
    got = cudf.to_numeric(gs, downcast=downcast)

    # Pandas bug: https://github.com/pandas-dev/pandas/issues/19729
    with pytest.raises(AssertionError, match="Series are different"):
        assert_eq(expected, got)
def test_string_index():
    """Assign several string-valued indexes to a pandas and a cudf frame.

    The same index object is assigned to both frames each time, and the
    frames are compared after every assignment.  The index is, in turn, a
    list, a numpy array, a ``StringIndex`` and a ``StringColumn``.
    """
    from cudf.dataframe.index import StringIndex, StringColumn

    pdf = pd.DataFrame(np.random.rand(5, 5))
    gdf = cudf.from_pandas(pdf)

    labels = ("a", "b", "c", "d", "e")

    def set_index_and_compare(new_index):
        pdf.index = new_index
        gdf.index = new_index
        assert_eq(pdf, gdf)

    # A fresh list is built from ``labels`` for every case.
    set_index_and_compare(list(labels))
    set_index_and_compare(np.array(list(labels)))
    set_index_and_compare(StringIndex(list(labels), name="name"))
    set_index_and_compare(StringColumn(list(labels), name="name"))
def test_parquet_write_partitioned(tmpdir_factory, cols):
    """Write the same frame with pandas and cudf using ``partition_cols``.

    Checks that write_to_dataset is wrapping to_parquet as expected: both
    output directories are read back with pandas and compared.
    """
    gdf_dir = str(tmpdir_factory.mktemp("gdf_dir"))
    pdf_dir = str(tmpdir_factory.mktemp("pdf_dir"))

    size = 100
    # Columns are built in the order a, b, c so the two random draws happen
    # in the same sequence as a dict literal would evaluate them.
    col_a = np.arange(0, stop=size, dtype="int64")
    col_b = np.random.choice(list("abcd"), size=size)
    col_c = np.random.choice(np.arange(4), size=size)
    pdf = pd.DataFrame({"a": col_a, "b": col_b, "c": col_c})

    write_kwargs = {"index": False, "partition_cols": cols}
    pdf.to_parquet(pdf_dir, **write_kwargs)
    gdf = cudf.from_pandas(pdf)
    gdf.to_parquet(gdf_dir, **write_kwargs)

    # Use pandas since dataset may be partitioned
    expect = pd.read_parquet(pdf_dir)
    got = pd.read_parquet(gdf_dir)
    assert_eq(expect, got)
def test_csv_writer_chunksize(chunksize, tmpdir):
    """Write a CSV with pandas and cudf using ``chunksize`` and compare.

    Both files must exist afterwards, and reading each back with
    ``pd.read_csv`` must give equal frames.
    """
    pandas_path = tmpdir.join("pdf_df_4.csv")
    cudf_path = tmpdir.join("gdf_df_4.csv")

    pdf = make_numpy_mixed_dataframe()
    pdf["Date"] = pdf["Date"].astype("datetime64")
    # Increase the df len as chunked logic only gets applied from chunksize >=8
    pdf = pd.concat([pdf] * 5)
    gdf = cudf.from_pandas(pdf)

    pdf.to_csv(
        pandas_path,
        date_format="%Y-%m-%dT%H:%M:%SZ",
        chunksize=chunksize,
    )
    gdf.to_csv(cudf_path, chunksize=chunksize)

    for written in (pandas_path, cudf_path):
        assert os.path.exists(written)

    expect = pd.read_csv(pandas_path)
    got = pd.read_csv(cudf_path)
    assert_eq(expect, got)
def test_series_datetime_value_counts(data, nulls, normalize, dropna):
    """Compare ``value_counts`` between a pandas series and its cudf copy.

    A copy of ``data`` optionally gets ``None`` written at one random
    position (``nulls == "one"``) or at two random positions
    (``nulls == "some"``) before conversion.  Results are compared once
    sorted by index and once with the index dropped, ignoring dtype.
    """
    psr = data.copy()

    if len(data) > 0:
        null_positions = None
        if nulls == "one":
            null_positions = np.random.randint(0, len(data))
        elif nulls == "some":
            null_positions = np.random.randint(0, len(data), 2)
        if null_positions is not None:
            psr[null_positions] = None

    gsr = cudf.from_pandas(psr)

    count_kwargs = {"dropna": dropna, "normalize": normalize}
    expected = psr.value_counts(**count_kwargs)
    got = gsr.value_counts(**count_kwargs)

    assert_eq(expected.sort_index(), got.sort_index(), check_dtype=False)
    assert_eq(
        expected.reset_index(drop=True),
        got.reset_index(drop=True),
        check_dtype=False,
    )
def test_groupby_split_out(split_out, column):
    """Compare a grouped mean with ``split_out`` between dask and dask_cudf.

    The frame is grouped by ``column`` and the mean of ``a`` is computed.
    Both results are sorted by value (the dask result additionally has NaNs
    dropped) and compared without checking the index.
    """
    df = pd.DataFrame({
        "a": np.arange(8),
        "b": [1, 0, 0, 2, 1, 1, 2, 0],
        "c": [0, 1] * 4,
        "d": ["dog", "cat", "cat", "dog", "dog", "dog", "cat", "bird"],
    }).fillna(0)
    df["e"] = df["d"].astype("category")
    gdf = cudf.from_pandas(df)

    ddf = dd.from_pandas(df, npartitions=3)
    gddf = dask_cudf.from_cudf(gdf, npartitions=3)

    cpu_mean = ddf.groupby(column).a.mean(split_out=split_out)
    ddf_result = cpu_mean.compute().sort_values().dropna()

    gpu_mean = gddf.groupby(column).a.mean(split_out=split_out)
    gddf_result = gpu_mean.compute().sort_values()

    dd.assert_eq(gddf_result, ddf_result, check_index=False)
def test_index_sort_values(data, ascending, return_indexer):
    """Check that ``sort_values`` on the cudf conversion of ``data`` matches.

    Both objects are sorted with identical keyword arguments.  With a truthy
    ``return_indexer`` each result is indexed as a pair, and element 1 (the
    indexer) and element 0 (the sorted values) are compared in that order.
    """
    gdi = cudf.from_pandas(data)

    sort_kwargs = {"ascending": ascending, "return_indexer": return_indexer}
    expected = data.sort_values(**sort_kwargs)
    actual = gdi.sort_values(**sort_kwargs)

    if not return_indexer:
        assert_eq(expected, actual)
        return

    # Indexer first, then the sorted values.
    assert_eq(expected[1], actual[1])
    assert_eq(expected[0], actual[0])
def read_partition(cls, fs, piece, columns):
    """Read one avro piece into a cudf dataframe.

    Parameters
    ----------
    fs
        Filesystem object; ``fs.du`` and ``fs.open`` are used here.
    piece : dict
        Must contain ``"path"``.  When it contains ``"rows"`` the block range
        in ``piece["blocks"]`` is read through ``ua`` and converted from
        pandas; otherwise the whole file is read with ``cudf.io.read_avro``.
    columns : list or None
        Columns to return; ``None`` selects every column of the frame.

    Returns
    -------
    The dataframe restricted to ``columns``.
    """
    path = piece["path"]

    if "rows" not in piece:
        df = cudf.io.read_avro(path)
    else:
        # See: (https://github.com/rapidsai/cudf/issues/6529)
        # Using `uavro` library for now. This means we must convert
        # data to pandas, and then to cudf (which is much slower
        # than `cudf.read_avro`). TODO: Once `num_rows` is fixed,
        # this can be changed to:
        #
        #   skiprows, num_rows = piece["rows"]
        #   df = cudf.io.read_avro(
        #       path, skiprows=skiprows, num_rows=num_rows
        #   )
        #
        # NOTE(review): the branch is selected on "rows" but the values are
        # taken from piece["blocks"] -- presumably both keys are always
        # present together; confirm against the code that builds pieces.
        block_offset, part_blocks = piece["blocks"]
        file_size = fs.du(path)

        with fs.open(path, "rb") as fo:
            header = ua.core.read_header(fo)
            ua.core.scan_blocks(fo, header, file_size)

            # Keep only the blocks that belong to this piece.
            selected = header["blocks"][block_offset : block_offset + part_blocks]
            header["blocks"] = selected

            # Adjust the total row count
            header["nrows"] = sum(block["nrows"] for block in selected)

            # Read in as pandas and convert to cudf (avoid block scan)
            pandas_df = ua.core.filelike_to_dataframe(
                fo, file_size, header, scan=False
            )
            df = cudf.from_pandas(pandas_df)

    # Deal with column selection
    if columns is None:
        columns = list(df.columns)
    return df[columns]
def test_networkx_compatibility(graph_file):
    """Build the same directed graph with networkx and cugraph and compare.

    The edge list read from ``graph_file`` is loaded twice into each
    library: first with the ``weight`` column as an edge attribute, then
    without any edge attribute.  Each pair is checked with
    ``compare_graphs`` and then cleared.
    """
    gc.collect()

    # test from_cudf_edgelist()
    M = utils.read_csv_for_nx(graph_file)

    df = pd.DataFrame()
    df['source'] = pd.Series(M['0'])
    df['target'] = pd.Series(M['1'])
    df['weight'] = pd.Series(M.weight)
    gdf = cudf.from_pandas(df)

    # First pass: weighted edges; second pass: no edge attribute at all.
    for extra_kwargs in ({'edge_attr': 'weight'}, {}):
        Gnx = nx.from_pandas_edgelist(
            df,
            source='source',
            target='target',
            create_using=nx.DiGraph,
            **extra_kwargs,
        )
        G = cugraph.from_cudf_edgelist(
            gdf,
            source='source',
            destination='target',
            create_using=cugraph.DiGraph,
            **extra_kwargs,
        )

        assert compare_graphs(Gnx, G)

        Gnx.clear()
        G.clear()
def transform(self, input_df: XDataFrame) -> XDataFrame:
    """Apply ``self._lambda_func`` to the selected columns.

    The work is done on a pandas frame: a pandas input is copied, a cudf
    input is converted with ``to_pandas`` and converted back at the end.
    Each result is stored under the name
    ``self._output_prefix + col + self._output_suffix``.  If ``self._fillna``
    is not ``None`` the column is filled with it before the function is
    applied.

    Args:
        input_df (XDataFrame): Input data frame.

    Returns:
        XDataFrame : Only the generated columns when ``self._drop_origin``
        is truthy, otherwise the whole frame including the new columns.

    Raises:
        RuntimeError: If ``input_df`` is neither a pandas nor a cudf frame.
    """
    if isinstance(input_df, pd.DataFrame):
        new_df = input_df.copy()
    elif cudf_is_available() and isinstance(input_df, cudf.DataFrame):
        new_df = input_df.to_pandas()
    else:
        raise RuntimeError("Unexpected data type: {}".format(type(input_df)))

    # Fall back to every column when no explicit input columns were set.
    target_cols = self._input_cols
    if not target_cols:
        target_cols = new_df.columns.tolist()
        if len(self._exclude_cols) > 0:
            target_cols = [
                name for name in target_cols if name not in self._exclude_cols
            ]

    generated_cols = []
    for col in target_cols:
        new_col = self._output_prefix + col + self._output_suffix
        values = new_df[col]
        if self._fillna is not None:
            values = values.fillna(self._fillna)
        new_df[new_col] = values.apply(self._lambda_func)
        generated_cols.append(new_col)

    # Hand a cudf frame back to callers that passed one in.
    if cudf_is_available() and isinstance(input_df, cudf.DataFrame):
        new_df = cudf.from_pandas(new_df)

    if self._drop_origin:
        return new_df[generated_cols]
    return new_df
def test_groupby_size():
    """Compare ``groupby(...).size()`` between pandas and cudf.

    Three kinds of grouping key are used: a single column name, a list of
    column names, and a pandas series.  Dtypes are not compared.
    """
    pdf = pd.DataFrame({
        "a": [1, 1, 3, 4],
        "b": ["bob", "bob", "alice", "cooper"],
        "c": [1, 2, 3, 4],
    })
    gdf = cudf.from_pandas(pdf)

    sr = pd.Series(range(len(pdf)))

    for key in ("a", ["a", "b", "c"], sr):
        assert_eq(
            pdf.groupby(key).size(),
            gdf.groupby(key).size(),
            check_dtype=False,
        )
def test_rollling_series_basic(data, index, agg, nulls, center):
    """Compare a rolling aggregation between pandas and cudf series.

    ``data`` optionally gets ``None`` written at one or two random positions
    (in place), or is replaced by an all-``None`` list, depending on
    ``nulls``.  Every window size from 1 to ``len(data)`` is combined with
    every ``min_periods`` from 1 to the window size; the aggregation named
    by ``agg`` is looked up with ``getattr`` and NaNs are filled with -1
    before comparing (dtype ignored).
    """
    if len(data) > 0:
        if nulls == "one":
            data[np.random.randint(0, len(data))] = None
        elif nulls == "some":
            for position in np.random.randint(0, len(data), (2, )):
                data[position] = None
        elif nulls == "all":
            data = [None] * len(data)

    psr = pd.Series(data, index=index)
    gsr = cudf.from_pandas(psr)

    for window_size in range(1, len(data) + 1):
        for min_periods in range(1, window_size + 1):
            pandas_rolling = psr.rolling(window_size, min_periods, center)
            expected = getattr(pandas_rolling, agg)().fillna(-1)

            cudf_rolling = gsr.rolling(window_size, min_periods, center)
            got = getattr(cudf_rolling, agg)().fillna(-1)

            assert_eq(expected, got, check_dtype=False)
def test_inplace_predict_cudf(self):
    """Check ``inplace_predict`` on a cudf frame against ``predict``.

    A booster is trained with ``tree_method='gpu_hist'`` on random data held
    in a cudf dataframe.  In-place predictions on the frame must be close to
    predictions made through a ``DMatrix``; the comparison is then repeated
    via ``run_threaded_predict`` and finally ``run_inplace_base_margin`` is
    invoked with a random base margin.
    """
    import cupy as cp
    import cudf
    import pandas as pd

    rows = 1000
    cols = 10
    rng = np.random.RandomState(1994)
    cp.cuda.runtime.setDevice(0)

    # Draw the features before the labels so the random stream is consumed
    # in the same order.
    features = rng.randn(rows, cols)
    y = rng.randn(rows)
    X = cudf.from_pandas(pd.DataFrame(features))

    dtrain = xgb.DMatrix(X, y)
    booster = xgb.train(
        {'tree_method': 'gpu_hist'}, dtrain, num_boost_round=10
    )

    test = xgb.DMatrix(X)
    predt_from_array = booster.inplace_predict(X)
    predt_from_dmatrix = booster.predict(test)
    cp.testing.assert_allclose(predt_from_array, predt_from_dmatrix)

    def predict_df(x):
        # column major array
        from_values = booster.inplace_predict(x.values)
        copied_predt = cp.array(booster.predict(xgb.DMatrix(x)))
        assert cp.all(copied_predt == from_values)

        from_frame = booster.inplace_predict(x)
        return cp.all(copied_predt == from_frame)

    for _ in range(10):
        run_threaded_predict(X, rows, predict_df)

    base_margin = cudf.Series(rng.randn(rows))
    self.run_inplace_base_margin(booster, dtrain, X, base_margin)
def put(self, pandas_df):
    """
    Convert a pandas object to a cudf.DataFrame and store it.

    Parameters
    ----------
    pandas_df : pandas.DataFrame/pandas.Series
        The pandas object to store. A pandas.Series is first turned into
        a pandas.DataFrame with ``to_frame``; the frame is then converted
        with ``cudf.from_pandas``.

    Returns
    -------
    Whatever ``self.store_new_df`` returns for the new cudf.DataFrame
    (the oid/key under which it was added to the cudf_dataframe_dict).
    """
    frame = pandas_df
    if isinstance(frame, pandas.Series):
        frame = frame.to_frame()

    gpu_frame = cudf.from_pandas(frame)
    return self.store_new_df(gpu_frame)
def transform(self, y: cudf.Series) -> cudf.Series:
    """
    Transform an input into its categorical keys.

    This is intended for use with small inputs relative to the size of the
    dataset. For fitting and transforming an entire dataset, prefer
    `fit_transform`.

    Parameters
    ----------
    y : cudf.Series
        Input keys to be transformed. Its values should match the
        categories given to `fit`. A pandas series is converted with
        `cudf.from_pandas` first.

    Returns
    -------
    encoded : cudf.Series
        The ordinally encoded input series, carrying the index of `y`

    Raises
    ------
    KeyError
        if the encoded series has nulls and `handle_unknown` is 'error',
        i.e. a category appears that was not seen in `fit`
    """
    if isinstance(y, pdSeries):
        y = cudf.from_pandas(y)

    self._check_is_fitted()

    y = y.astype('category')
    recoded = y.cat.set_categories(self.classes_)
    encoded = cudf.Series(recoded._column.codes, index=y.index)

    if encoded.has_nulls and self.handle_unknown == 'error':
        raise KeyError("Attempted to encode unseen key")

    return encoded
def parquet_writer_test_rowgroup_index_compression(
    pdf, compression, row_group_size
):
    """Write ``pdf`` to parquet with pandas and cudf and cross-read both.

    Each file is written with the given ``compression`` and
    ``row_group_size``.  The cudf-written file read by cudf is compared with
    the pandas-written file read by pandas, and then the readers are swapped.
    """
    pd_file_name = "cpu_pdf.parquet"
    gd_file_name = "gpu_pdf.parquet"

    gdf = cudf.from_pandas(pdf)

    write_kwargs = {
        "compression": compression,
        "row_group_size": row_group_size,
    }
    pdf.to_parquet(pd_file_name, **write_kwargs)
    gdf.to_parquet(gd_file_name, **write_kwargs)

    # Each library reads the file it wrote itself.
    actual = cudf.read_parquet(gd_file_name)
    expected = pd.read_parquet(pd_file_name)
    assert_eq(actual, expected)

    # Each library reads the file written by the other one.
    actual = cudf.read_parquet(pd_file_name)
    expected = pd.read_parquet(gd_file_name)
    assert_eq(actual, expected)
def test_dropna_series(data, nulls):
    """Compare ``dropna`` between a pandas series and its cudf copy.

    Depending on ``nulls``, ``None`` is written to one random position, two
    random positions, or every position before conversion.  Dtype is only
    checked when the cudf series is not entirely null.
    """
    psr = pd.Series(data)

    if len(data) > 0:
        # NOTE(review): random positions are drawn from [0, 4) regardless of
        # len(data) -- presumably every parametrized input has at least four
        # elements; confirm.
        if nulls == "one":
            psr[np.random.randint(0, 4)] = None
        elif nulls == "some":
            first, second = np.random.randint(0, 4, (2, ))
            psr[first] = None
            psr[second] = None
        elif nulls == "all":
            psr[:] = None

    gsr = cudf.from_pandas(psr)

    # Skip the dtype comparison when every value is null.
    check_dtype = gsr.null_count != len(gsr)

    assert_eq(psr.dropna(), gsr.dropna(), check_dtype=check_dtype)
def test_dropna_thresh_cols(thresh, subset, inplace):
    """Compare column-wise ``dropna`` with ``thresh`` between pandas and cudf.

    With ``inplace`` truthy the frames themselves are compared after the
    call; otherwise the values returned by ``dropna`` are compared.
    """
    pdf = pd.DataFrame(
        {"a": [1, 2], "b": [3, 4], "c": [5, None], "d": [np.nan, np.nan]}
    )
    gdf = cudf.from_pandas(pdf)

    dropna_kwargs = {
        "axis": 1,
        "thresh": thresh,
        "subset": subset,
        "inplace": inplace,
    }
    pandas_result = pdf.dropna(**dropna_kwargs)
    cudf_result = gdf.dropna(**dropna_kwargs)

    if inplace:
        # The call mutated the frames; its return value is not used.
        expected, actual = pdf, gdf
    else:
        expected, actual = pandas_result, cudf_result

    assert_eq(
        expected,
        actual,
    )
def test_fillna_method_numerical(data, container, data_dtype, method, inplace):
    """Compare ``fillna(method=...)`` between pandas and cudf containers.

    ``data`` is wrapped in ``container`` (replicated into columns a/b/c when
    the container is ``pd.DataFrame``) and cast to ``data_dtype``.  For
    non-float dtypes the dtype is first replaced by its entry in
    ``cudf.utils.dtypes.cudf_dtypes_to_pandas_dtypes``.  With ``inplace``
    truthy the containers themselves are compared after the call; otherwise
    the returned values are compared.  Dtypes are not compared.
    """
    if container == pd.DataFrame:
        data = {"a": data, "b": data, "c": data}

    pdata = container(data)

    # ``kind`` is a single character, so compare it directly instead of
    # using ``not in ("f")``, which is a substring test on a plain string.
    if np.dtype(data_dtype).kind != "f":
        data_dtype = cudf.utils.dtypes.cudf_dtypes_to_pandas_dtypes[
            np.dtype(data_dtype)
        ]
    pdata = pdata.astype(data_dtype)

    # Explicitly using nan_as_null=True
    gdata = cudf.from_pandas(pdata, nan_as_null=True)

    expected = pdata.fillna(method=method, inplace=inplace)
    actual = gdata.fillna(method=method, inplace=inplace)

    if inplace:
        # The calls mutated the containers; compare those instead.
        expected = pdata
        actual = gdata

    assert_eq(expected, actual, check_dtype=False)
def test_rolling_dataframe_numba_udf_basic(data, center):
    """Compare ``rolling(...).apply(udf)`` between pandas and cudf frames.

    Every window size from 1 to ``len(data)`` is combined with every
    ``min_periods`` from 1 to the window size.  NaNs are filled with -1
    before comparing and dtypes are ignored.
    """
    pdf = pd.DataFrame(data)
    gdf = cudf.from_pandas(pdf)

    def some_func(A):
        # Mean of the squared window values.
        b = 0
        for a in A:
            b = b + a ** 2
        return b / len(A)

    for window_size in range(1, len(data) + 1):
        for min_periods in range(1, window_size + 1):
            pandas_rolling = pdf.rolling(window_size, min_periods, center)
            expected = pandas_rolling.apply(some_func).fillna(-1)

            cudf_rolling = gdf.rolling(window_size, min_periods, center)
            got = cudf_rolling.apply(some_func).fillna(-1)

            assert_eq(expected, got, check_dtype=False)
def test_rollling_series_numba_udf_basic(data, index, center):
    """Compare ``rolling(...).apply(udf)`` between pandas and cudf series.

    Every window size from 1 to ``len(data)`` is combined with every
    ``min_periods`` from 1 to the window size.  NaNs are filled with -1
    before comparing and dtypes are ignored.
    """
    psr = pd.Series(data, index=index)
    gsr = cudf.from_pandas(psr)

    def some_func(A):
        # Largest square root in the window, with a floor of 0.
        b = 0
        for a in A:
            b = max(b, math.sqrt(a))
        return b

    for window_size in range(1, len(data) + 1):
        for min_periods in range(1, window_size + 1):
            pandas_rolling = psr.rolling(window_size, min_periods, center)
            expected = pandas_rolling.apply(some_func).fillna(-1)

            cudf_rolling = gsr.rolling(window_size, min_periods, center)
            got = cudf_rolling.apply(some_func).fillna(-1)

            assert_eq(expected, got, check_dtype=False)
def test_as_array():
    """Check that ``df_utils.as_array`` returns a numpy array.

    The ``id`` column of the bank dataset is passed as a pandas series, as
    its ``.values`` and as a plain list.  When ``cudf`` and ``cupy`` can both
    be imported, the cudf series and its ``.values`` are checked as well.
    """
    pd_df = dsutils.load_bank()
    pd_series = pd_df['id']

    assert isinstance(df_utils.as_array(pd_series), np.ndarray)
    assert isinstance(df_utils.as_array(pd_series.values), np.ndarray)
    assert isinstance(df_utils.as_array(pd_series.values.tolist()), np.ndarray)

    # The GPU checks are skipped when either import fails.  The handler stays
    # as broad as before: any exception raised while importing is treated as
    # "not installed".
    try:
        import cudf
        import cupy  # noqa: F401 -- imported only to probe availability
    except Exception:
        return

    cudf_series = cudf.from_pandas(pd_df)['id']
    assert isinstance(df_utils.as_array(cudf_series), np.ndarray)
    assert isinstance(df_utils.as_array(cudf_series.values), np.ndarray)
def test_renumber_ips_cols():
    """Renumber integer-encoded IP columns and check the mapping.

    The source and destination address strings are converted with
    ``str.ip2int`` and passed to ``cugraph.renumber_from_cudf``.  For every
    row, looking up the renumbered id in column ``'0'`` of the returned
    numbering frame must give back the original integer value.
    """
    source_list = [
        '192.168.1.1',
        '172.217.5.238',
        '216.228.121.209',
        '192.16.31.23',
    ]
    dest_list = [
        '172.217.5.238',
        '216.228.121.209',
        '192.16.31.23',
        '192.168.1.1',
    ]

    pdf = pd.DataFrame({'source_list': source_list, 'dest_list': dest_list})
    gdf = cudf.from_pandas(pdf)

    gdf['source_as_int'] = gdf['source_list'].str.ip2int()
    gdf['dest_as_int'] = gdf['dest_list'].str.ip2int()

    src, dst, number_df = cugraph.renumber_from_cudf(
        gdf, ['source_as_int'], ['dest_as_int']
    )

    for row in range(len(gdf)):
        assert number_df['0'][src[row]] == gdf['source_as_int'][row]
        assert number_df['0'][dst[row]] == gdf['dest_as_int'][row]
def test_multiindex_loc(pdf, gdf, pdfIndex):
    """Compare ``.loc`` with tuple keys between pandas and cudf frames.

    ``pdfIndex`` is converted with ``cudf.from_pandas`` and the two indexes
    are compared; each is then installed on its frame.  A fixed sequence of
    full and partial tuple keys is looked up in both frames.
    """
    gdfIndex = cudf.from_pandas(pdfIndex)
    assert_eq(pdfIndex, gdfIndex)

    pdf.index = pdfIndex
    gdf.index = gdfIndex

    lookup_keys = (
        # return 2 rows, 0 remaining keys = dataframe with entire index
        ('a', 'store', 'clouds', 'fire'),
        # return 2 rows, 1 remaining key = dataframe with n-k index columns
        ('a', 'store', 'storm'),
        # return 2 rows, 2 remaining keys = dataframe with n-k index columns
        ('a', 'store'),
        ('b', 'house'),
        # return 2 rows, n-1 remaining keys = dataframe with n-k index columns
        ('a', ),
        # return 1 row, 0 remaining keys = dataframe with entire index
        ('a', 'store', 'storm', 'smoke'),
        # return 1 row and 1 remaining key = series
        ('c', 'forest', 'clear'),
    )

    for key in lookup_keys:
        assert_eq(pdf.loc[key], gdf.loc[key])