def test_validity_add(nelem, lhs_nulls, rhs_nulls):
    """Check that ``lhs + rhs`` combines the validity masks of both operands.

    Each operand is built either with a random validity bitmask (when the
    corresponding ``*_nulls`` argument is ``"some"``) or as a plain Series
    without a mask (any other value).  The expected result is the NumPy sum
    of the raw data, with ``na_value`` written wherever the expected result
    mask is unset; when both operands are masked the expected mask is the
    bitwise AND of the two bitmasks.

    Parameters
    ----------
    nelem : int
        Number of elements in each operand.
    lhs_nulls, rhs_nulls : str
        ``"some"`` to attach a random bitmask to that operand.
    """
    np.random.seed(0)

    # LHS
    lhs_data = np.random.random(nelem)
    lhs_mask = None
    if lhs_nulls == "some":
        lhs_mask = utils.random_bitmask(nelem)
        lhs_bitmask = utils.expand_bits_to_bytes(lhs_mask)[:nelem]
        lhs_null_count = utils.count_zero(lhs_bitmask)
        assert lhs_null_count >= 0
        lhs = Series.from_masked_array(lhs_data, lhs_mask)
        assert lhs.null_count == lhs_null_count
    else:
        lhs = Series(lhs_data)

    # RHS
    rhs_data = np.random.random(nelem)
    rhs_mask = None
    if rhs_nulls == "some":
        rhs_mask = utils.random_bitmask(nelem)
        rhs_bitmask = utils.expand_bits_to_bytes(rhs_mask)[:nelem]
        rhs_null_count = utils.count_zero(rhs_bitmask)
        assert rhs_null_count >= 0
        rhs = Series.from_masked_array(rhs_data, rhs_mask)
        assert rhs.null_count == rhs_null_count
    else:
        rhs = Series(rhs_data)

    # Result
    res = lhs + rhs

    # Pick the packed bitmask the result is expected to carry.  Keying on the
    # masks themselves (rather than re-testing the strings) keeps this in
    # step with how the operands were actually constructed above.
    if lhs_mask is not None and rhs_mask is not None:
        res_bits = lhs_mask & rhs_mask
    elif lhs_mask is not None:
        res_bits = lhs_mask
    else:
        res_bits = rhs_mask

    # Fill NA values
    na_value = -10000
    got = res.fillna(na_value).to_array()
    expect = lhs_data + rhs_data
    if res_bits is not None:
        # ``np.bool_`` is the NumPy boolean scalar type; the bare ``np.bool``
        # alias was deprecated and later removed from NumPy.
        res_mask = np.asarray(
            utils.expand_bits_to_bytes(res_bits), dtype=np.bool_
        )[:nelem]
        expect[~res_mask] = na_value

    np.testing.assert_array_equal(expect, got)
def test_to_dense_array():
    """Check ``Series.to_array`` with and without ``fillna`` on a masked Series.

    An 8-element Series is built with a one-byte mask whose bit pattern
    ``0b11010110`` contains three zero bits, matching ``null_count=3``.
    The test asserts that ``to_array()`` yields fewer elements than
    ``to_array(fillna="pandas")``, and that the filled array has one entry
    per Series element.
    """
    data = np.random.random(8)
    # 0b11010110 == 214 does not fit in a signed byte, and converting an
    # out-of-range Python int straight to ``np.byte`` is rejected by newer
    # NumPy releases.  Build the byte as unsigned and reinterpret the same
    # bits as ``np.byte`` so the mask dtype stays what it was.
    mask = np.asarray([0b11010110], dtype=np.uint8).view(np.byte)
    sr = Series.from_masked_array(data=data, mask=mask, null_count=3)

    # Some, but not all, entries are null.
    assert sr.null_count > 0
    assert sr.null_count != len(sr)

    filled = sr.to_array(fillna="pandas")
    dense = sr.to_array()
    assert dense.size < filled.size
    assert filled.size == len(sr)
def test_fillna():
    """Check that ``fillna`` on a masked column removes every null.

    Column 8 of the data returned by ``read_data`` (accessed through
    ``GpuArrowReader``) is wrapped in a masked Series, filled with a
    constant, and then compared against that constant.
    """
    fill_value = 123

    _, schema, darr = read_data()
    reader = GpuArrowReader(schema, darr)
    column = reader[8]
    # The column must contain nulls for the test to be meaningful.
    assert column.null_count

    masked = Series.from_masked_array(
        data=column.data,
        mask=column.null,
        null_count=column.null_count,
    )
    filled = masked.fillna(fill_value)
    filled_values = filled.to_array()

    np.testing.assert_equal(fill_value, filled_values)
    assert len(filled) == len(masked)
    assert filled.null_count == 0
def test_sum_masked(nelem):
    """Compare ``Series.sum`` on a masked Series with a NumPy reference.

    The reference is the sum of the data elements selected by the expanded
    validity bitmask.

    Parameters
    ----------
    nelem : int
        Number of elements to generate.
    """
    dtype = np.float64
    values = gen_rand(dtype, nelem)

    packed_mask = utils.random_bitmask(nelem)
    expanded = utils.expand_bits_to_bytes(packed_mask)[:nelem]
    n_nulls = utils.count_zero(expanded)

    series = Series.from_masked_array(values, packed_mask, n_nulls)
    got = series.sum()

    # Reference: sum only the entries selected by the expanded mask.
    keep = np.asarray(expanded, dtype=np.bool_)[: values.size]
    expect = values[keep].sum()

    # Single precision gets a looser tolerance.
    if dtype == np.float32:
        significant = 4
    else:
        significant = 6
    np.testing.assert_approx_equal(expect, got, significant=significant)
def test_series_reductions(method, dtype):
    """Compare a reduction on a masked Series with the same one on NumPy data.

    The NumPy reference is computed on only those elements selected by the
    boolean mask; the Series receives the full array plus that mask.

    Parameters
    ----------
    method : str
        Name of the reduction method to call on both objects.
    dtype : numpy dtype
        Element type the random data is cast to.
    """
    np.random.seed(0)
    values = np.random.random(100)
    if np.issubdtype(dtype, np.integer):
        # Scale up so the integer cast does not collapse everything to zero.
        values *= 100
        keep = values > 10
    else:
        keep = values > 0.5

    values = values.astype(dtype)
    reference = values[keep]
    series = Series.from_masked_array(values, Series(keep).as_mask())

    def reduce(obj):
        # std/var are called with ddof=1 on both sides; everything else
        # is called without arguments.
        reducer = getattr(obj, method)
        if method not in ["std", "var"]:
            return reducer()
        return reducer(ddof=1)

    expect = reduce(reference)
    got = reduce(series)
    print(expect, got)
    np.testing.assert_approx_equal(expect, got)
def test_validity_ceil(nelem):
    """Check that ``Series.ceil`` keeps the validity mask of its input.

    The expected array is ``np.ceil`` of the raw data with ``na_value``
    written wherever the expanded bitmask is unset.

    Parameters
    ----------
    nelem : int
        Number of elements to generate.
    """
    # Input data and its random validity mask
    values = np.random.random(nelem) * 100
    packed_mask = utils.random_bitmask(nelem)
    expanded = utils.expand_bits_to_bytes(packed_mask)[:nelem]
    series = Series.from_masked_array(values, packed_mask)

    # Actual result, with nulls replaced by a sentinel
    na_value = -100000
    ceiled = series.ceil()
    got = ceiled.fillna(na_value).to_array()

    # Expected result: same sentinel at every masked-out position
    valid = np.asarray(expanded, dtype=np.bool_)[: values.size]
    expect = np.ceil(values)
    expect[~valid] = na_value

    # Compare
    print("expect")
    print(expect)
    print("got")
    print(got)
    np.testing.assert_array_equal(expect, got)
def test_series_median(dtype, num_na):
    """Compare ``Series.median`` against pandas.

    The first ``num_na`` positions are masked out.  With ``skipna=True`` the
    reference is a pandas Series built from the unmasked elements only; with
    ``skipna=False`` (floating dtypes only) the reference is the Series
    converted via ``to_pandas``.

    Parameters
    ----------
    dtype : numpy dtype
        Element type the random data is cast to.
    num_na : int
        Number of leading elements that are masked out.
    """
    np.random.seed(0)
    values = np.random.random(100)
    if np.issubdtype(dtype, np.integer):
        values *= 100
    valid = np.arange(100) >= num_na
    values = values.astype(dtype)

    series = Series.from_masked_array(values, Series(valid).as_mask())
    reference = pd.Series(values[valid], dtype=dtype)

    actual = series.median(skipna=True)
    desired = reference.median(skipna=True)
    print(actual, desired)
    np.testing.assert_approx_equal(actual, desired)

    # The skipna=False comparison goes through to_pandas, so it is limited
    # to floats until integer nulls (e.g. pd.Int64Dtype) are supported by
    # the conversion to pandas in cudf.
    if not np.issubdtype(dtype, np.floating):
        return

    reference = series.to_pandas()
    actual = series.median(skipna=False)
    desired = reference.median(skipna=False)
    np.testing.assert_approx_equal(actual, desired)