def test_searchsorted(side, obj_class, vals_class):
    """Compare cudf ``searchsorted`` with pandas on nullable float64 data.

    Parameters
    ----------
    side
        Forwarded as the second positional argument of ``searchsorted`` on
        both the pandas and the cudf object.
    obj_class
        ``"index"`` re-indexes the sorted reference Series, ``"column"``
        replaces it with its underlying ``_column``; any other value keeps
        the Series as is.
    vals_class
        ``"index"`` re-indexes the Series of lookup values; any other value
        keeps it as is.
    """
    nelem = 1000
    column_data = gen_rand("float64", nelem)
    column_mask = random_bitmask(nelem)

    values_data = gen_rand("float64", nelem)
    values_mask = random_bitmask(nelem)

    sr = cudf.Series.from_masked_array(column_data, column_mask)
    vals = cudf.Series.from_masked_array(values_data, values_mask)

    # Insertion points are looked up in the sorted reference data.
    sr = sr.sort_values()

    # Reference object can be Series, Index, or Column.
    # ``reset_index`` returns a new object, so the result has to be rebound;
    # the bare call used previously had no effect.
    # NOTE(review): the result is still a Series (with a fresh default
    # index), not an Index object -- confirm whether ``cudf.Index`` was the
    # intended reference type for the "index" case.
    if obj_class == "index":
        sr = sr.reset_index(drop=True)
    elif obj_class == "column":
        sr = sr._column

    # Values can be Series or Index (same caveat as above).
    if vals_class == "index":
        vals = vals.reset_index(drop=True)

    psr = sr.to_pandas()
    pvals = vals.to_pandas()

    expect = psr.searchsorted(pvals, side)
    got = sr.searchsorted(vals, side)

    assert_eq(expect, cupy.asnumpy(got))
def test_dataframe_masked_slicing(nelem, slice_start, slice_end):
    """Slicing a DataFrame with masked columns must agree with pandas.

    Two columns are filled from ``range`` objects, each gets a random
    bitmask via ``set_mask``, and ``[slice_start:slice_end]`` is applied once
    after and once before the conversion to pandas.
    """
    gdf = cudf.DataFrame()
    gdf["a"] = list(range(nelem))
    gdf["b"] = list(range(nelem, 2 * nelem))

    # Give each column its own random mask, "a" first and then "b".
    for name in ("a", "b"):
        gdf[name] = gdf[name]._column.set_mask(utils.random_bitmask(nelem))

    window = slice(slice_start, slice_end)
    expect = gdf.to_pandas()[window]
    got = gdf[window].to_pandas()

    assert_eq(expect, got, check_dtype=False)
def test_null_series(nrows, dtype):
    """Check that the repr of a masked cudf Series matches the pandas repr.

    Parameters
    ----------
    nrows
        Value applied (as ``int``) to the pandas ``display.max_rows`` option
        while both reprs are rendered.
    dtype
        Target dtype string; the masked Series is cast to it with ``astype``.

    The comparison is whitespace-insensitive (both reprs are ``split()``),
    and pandas null markers are rewritten to ``<NA>`` first.
    """
    size = 5
    mask = utils.random_bitmask(size)
    data = cudf.Series(np.random.randint(1, 9, size))
    column = data.set_mask(mask)
    sr = cudf.Series(column).astype(dtype)

    if dtype != "category" and cudf.dtype(dtype).kind in {"u", "i"}:
        # Integer dtypes: build the pandas Series from the raw host data,
        # using the mapped pandas dtype when the table has one, then mark
        # the null positions with pd.NA.
        ps = pd.Series(
            sr._column.data_array_view.copy_to_host(),
            dtype=cudf_dtypes_to_pandas_dtypes.get(
                cudf.dtype(dtype), cudf.dtype(dtype)
            ),
        )
        ps[sr.isnull().to_pandas()] = pd.NA
    else:
        ps = sr.to_pandas()

    # Render both reprs under the requested row limit. The context manager
    # restores the previous option value even if rendering raises, so a
    # failing test can no longer leak the setting into later tests.
    with pd.option_context("display.max_rows", int(nrows)):
        psrepr = repr(ps)
        gsrepr = repr(sr)

    for marker in ("NaN", "NaT", "None"):
        psrepr = psrepr.replace(marker, "<NA>")

    if dtype.startswith(("int", "uint", "long")):
        # A line ending in the column's default NA value is treated as null.
        psrepr = psrepr.replace(
            str(sr._column.default_na_value()) + "\n", "<NA>\n"
        )

    # Map pandas' capitalised dtype names onto the lower-case spelling.
    if "UInt" in psrepr:
        psrepr = psrepr.replace("UInt", "uint")
    elif "Int" in psrepr:
        psrepr = psrepr.replace("Int", "int")

    assert psrepr.split() == gsrepr.split()
def test_validity_ceil(nelem):
    """``Series.ceil`` on masked data: valid rows are ceiled, nulls stay null.

    The call is expected to emit a ``FutureWarning``. Nulls in the result are
    filled with a sentinel so the output can be compared element-wise with
    ``np.ceil`` of the host data.

    NOTE(review): a second ``test_validity_ceil`` appears later in this view;
    if both definitions live in the same module, the later one replaces this
    one.
    """
    sentinel = -100000

    # Data
    data = np.random.random(nelem) * 100
    mask = utils.random_bitmask(nelem)
    valid = np.asarray(
        utils.expand_bits_to_bytes(mask)[:nelem], dtype=np.bool_
    )[: data.size]
    sr = Series.from_masked_array(data, mask)

    # Result
    with pytest.warns(
        FutureWarning, match="Series.ceil and DataFrame.ceil are deprecated"
    ):
        res = sr.ceil()
    got = res.fillna(sentinel).to_numpy()

    # Reference: ceil everything on the host, then overwrite the null slots.
    expect = np.ceil(data)
    expect[~valid] = sentinel

    # Check
    print("expect")
    print(expect)
    print("got")
    print(got)
    np.testing.assert_array_equal(expect, got)
def test_applymap_round(nelem, masked):
    """``Series.applymap`` with a rounding function must match ``np.round``.

    Parameters
    ----------
    nelem
        Number of random values generated.
    masked
        When true, a random bitmask is applied to the Series and the masked
        positions are set to NaN in the host reference data.
    """
    # Generate data
    np.random.seed(0)
    data = np.random.random(nelem) * 100

    if masked:
        # Build the mask, blank out the masked host values, mask the Series.
        bitmask = utils.random_bitmask(nelem)
        valid = np.asarray(
            utils.expand_bits_to_bytes(bitmask), dtype=np.bool_
        )[:nelem]
        data[~valid] = np.nan
        sr = Series(data).set_mask(bitmask)
    else:
        sr = Series(data)

    def round_half_up(x):
        return floor(x) + 1 if x - floor(x) >= 0.5 else floor(x)

    # Call applymap
    out = sr.applymap(round_half_up)

    if masked:
        # Fill masked values so they line up with the NaNs in ``data``.
        out = out.fillna(np.nan)

    # Check
    np.testing.assert_array_almost_equal(np.round(data), out.to_array())
def test_serialize_masked_series():
    """A masked Series must survive a serialize/deserialize round trip."""
    nelem = 50
    data = np.random.random(nelem)
    mask = utils.random_bitmask(nelem)

    # Count the zero entries of the expanded mask to get the null count.
    expanded = utils.expand_bits_to_bytes(mask)[:nelem]
    null_count = utils.count_zero(expanded)
    assert null_count >= 0

    sr = cudf.Series.from_masked_array(data, mask, null_count=null_count)

    payload = sr.serialize()
    outsr = cudf.Series.deserialize(*payload)

    assert_eq(sr, outsr)
def test_sum_masked(nelem):
    """``Series.sum`` on masked data must equal the sum of the valid values."""
    dtype = np.float64
    data = gen_rand(dtype, nelem)

    mask = utils.random_bitmask(nelem)
    bitmask = utils.expand_bits_to_bytes(mask)[:nelem]
    sr = Series.from_masked_array(data, mask, utils.count_zero(bitmask))

    got = sr.sum()

    # Host-side reference: sum only the entries whose mask bit is set.
    valid = np.asarray(bitmask, dtype=np.bool_)[: data.size]
    expect = data[valid].sum()

    if dtype == np.float32:
        significant = 4
    else:
        significant = 6
    np.testing.assert_approx_equal(expect, got, significant=significant)
def test_null_dataframe(ncols):
    """Check that the repr of a masked cudf DataFrame matches pandas.

    One masked column is created per entry of ``dtype_categories`` (cast to
    that dtype and stored under that name). Both reprs are rendered with the
    pandas ``display.max_columns`` option set to ``int(ncols)`` and compared
    whitespace-insensitively after pandas null markers are rewritten to
    ``<NA>``.
    """
    size = 20
    gdf = cudf.DataFrame()
    for dtype in dtype_categories:
        mask = utils.random_bitmask(size)
        data = cudf.Series(np.random.randint(0, 128, size))
        column = data.set_mask(mask)
        gdf[dtype] = cudf.Series(column).astype(dtype)
    pdf = gdf.to_pandas()

    # The context manager restores the previous option value even when
    # rendering raises, so a failing test cannot leak the setting into later
    # tests.
    with pd.option_context("display.max_columns", int(ncols)):
        pdfrepr = repr(pdf)
        gdfrepr = repr(gdf)

    for marker in ("NaN", "NaT", "None"):
        pdfrepr = pdfrepr.replace(marker, "<NA>")

    assert pdfrepr.split() == gdfrepr.split()
def test_onehot_masked():
    """One-hot encoding of a masked column: null rows match no category.

    Masked positions are set to ``-1`` in the host array, so the host-side
    comparison ``arr == cat`` is false for them in every category.
    """
    np.random.seed(0)
    high = 5
    size = 100
    arr = np.random.randint(low=0, high=high, size=size)

    bitmask = utils.random_bitmask(size)
    valid = np.asarray(
        utils.expand_bits_to_bytes(bitmask)[:size], dtype=np.bool_
    )
    arr[~valid] = -1

    df = DataFrame()
    df["a"] = Series(arr).set_mask(bitmask)

    out = df.one_hot_encoding(
        "a", cats=list(range(high)), prefix="a", dtype=np.int32
    )

    assert tuple(out.columns) == ("a", "a_0", "a_1", "a_2", "a_3", "a_4")
    # Each indicator column must be set exactly where the host value equals
    # that category.
    for cat in range(high):
        np.testing.assert_array_equal(
            (out[f"a_{cat}"] == 1).to_array(), arr == cat
        )
def test_validity_ceil(nelem):
    """``Series.ceil`` on masked data: valid rows are ceiled, nulls stay null.

    Nulls in the result are filled with a sentinel so the output can be
    compared element-wise with ``np.ceil`` of the host data.

    NOTE(review): another ``test_validity_ceil`` appears earlier in this
    view; if both definitions live in the same module, this later one
    replaces it.
    """
    sentinel = -100000

    # Data
    data = np.random.random(nelem) * 100
    mask = utils.random_bitmask(nelem)
    valid = np.asarray(
        utils.expand_bits_to_bytes(mask)[:nelem], dtype=np.bool_
    )[:data.size]
    sr = Series.from_masked_array(data, mask)

    # Result
    got = sr.ceil().fillna(sentinel).to_array()

    # Reference: ceil everything on the host, then overwrite the null slots.
    expect = np.ceil(data)
    expect[~valid] = sentinel

    # Check
    print("expect")
    print(expect)
    print("got")
    print(got)
    np.testing.assert_array_equal(expect, got)