def test_operator_func_between_series(dtype, func, has_nulls, fill_value):
    """Check a named binary-operator method of Series against pandas.

    ``func`` is the method name looked up with ``getattr`` on both the cudf
    and the pandas Series; it is called with the other Series and
    ``fill_value``.  When ``has_nulls == "some"`` both operands are built
    from random bitmasks, otherwise from the plain arrays.
    """
    nelem = 1000
    arr1 = utils.gen_rand(dtype, nelem) * 10000
    # Keeping a low value because CUDA 'pow' has 2 full range error
    arr2 = utils.gen_rand(dtype, nelem) * 100

    if has_nulls == "some":
        nulls1 = utils.random_bitmask(nelem)
        nulls2 = utils.random_bitmask(nelem)
        sr1 = Series.from_masked_array(arr1, nulls1)
        sr2 = Series.from_masked_array(arr2, nulls2)
    else:
        sr1 = Series(arr1)
        sr2 = Series(arr2)

    psr1 = sr1.to_pandas()
    psr2 = sr2.to_pandas()

    expect = getattr(psr1, func)(psr2, fill_value=fill_value)
    got = getattr(sr1, func)(sr2, fill_value=fill_value)

    # This is being done because of the various gymnastics required to support
    # equality for null values. cudf.Series().to_pandas() replaces nulls with
    # None and so a bool Series becomes object Series. Which does not match the
    # output of equality op in pandas which remains a bool. Furthermore, NaN
    # values are treated as not comparable and always return False in a bool op
    # except in not-equal op where bool(Nan != Nan) gives True.
    #
    # ``np.bool_`` is used instead of the ``np.bool`` alias, which was
    # deprecated in NumPy 1.20 and removed in NumPy 1.24.
    if got.dtype == np.bool_:
        got = got.fillna(True) if func == "ne" else got.fillna(False)

    utils.assert_eq(expect, got)
def test_searchsorted(side, obj_class, vals_class):
    """Compare ``searchsorted`` on a sorted, masked cudf object with pandas.

    ``obj_class`` picks the reference container ("index" or "column";
    anything else leaves it a Series) and ``vals_class`` picks the container
    of the searched values ("index", otherwise Series).
    """
    size = 1000

    haystack_data = gen_rand("float64", size)
    haystack_mask = random_bitmask(size)
    needle_data = gen_rand("float64", size)
    needle_mask = random_bitmask(size)

    haystack = cudf.Series.from_masked_array(haystack_data, haystack_mask)
    needles = cudf.Series.from_masked_array(needle_data, needle_mask)
    haystack = haystack.sort_values()

    # Reference object can be Series, Index, or Column
    if obj_class == "column":
        haystack = haystack._column
    elif obj_class == "index":
        haystack = cudf.Series.as_index(haystack)

    # Values can be Series or Index
    if vals_class == "index":
        needles = cudf.Series.as_index(needles)

    host_haystack = haystack.to_pandas()
    host_needles = needles.to_pandas()

    expect = host_haystack.searchsorted(host_needles, side)
    got = haystack.searchsorted(needles, side)

    # The device result is copied to host memory before comparing.
    assert_eq(expect, cupy.asnumpy(got))
def test_reflected_ops_cudf_scalar(funcs, dtype, obj_class):
    """Check a reflected op on a cudf Series/Index against the host result.

    ``funcs`` unpacks into ``(cpu_func, gpu_func)``: the first is applied to
    the raw random array, the second to the cudf object built from it.
    """
    host_op, device_op = funcs

    # Seed the generator so the operand is the same on every run.
    np.random.seed(12)
    host_values = utils.gen_rand(dtype, 100, low=10)

    device_obj = Series(host_values)
    use_index = obj_class == "Index"
    if use_index:
        device_obj = as_index(device_obj)

    device_result = device_op(device_obj)

    # NOTE(review): this rebinds the input, not the result, and the new
    # value is not read afterwards -- confirm whether the result was meant.
    if use_index:
        device_obj = Series(device_obj)

    host_result = host_op(host_values)

    np.testing.assert_allclose(host_result, device_result.to_array())
def test_sum_decimal(dtype, nelem):
    """Sum of a cudf Series cast to ``dtype`` must match a Decimal sum in pandas."""
    scaled = gen_rand("int64", nelem) / 100
    # Values travel as strings so both sides parse the same literals.
    data = list(map(str, scaled))

    host_series = pd.Series(list(map(Decimal, data)))
    expected = host_series.sum()

    device_series = cudf.Series(data).astype(dtype)
    got = device_series.sum()

    assert_eq(expected, got)
def test_product_decimal(dtype):
    """Product of a 3-element cudf Series cast to ``dtype`` must match pandas."""
    scaled = gen_rand("int8", 3) / 10
    # Values travel as strings so both sides parse the same literals.
    data = list(map(str, scaled))

    host_series = pd.Series(list(map(Decimal, data)))
    expected = host_series.product()

    device_series = cudf.Series(data).astype(dtype)
    got = device_series.product()

    assert_eq(expected, got)
def test_sum_of_squares_decimal(dtype):
    """``sum_of_squares`` on a cudf Series cast to ``dtype`` must match pandas.

    The reference value is computed in pandas as ``pow(2).sum()`` over
    ``Decimal`` values.
    """
    scaled = gen_rand("int8", 3) / 10
    # Values travel as strings so both sides parse the same literals.
    data = list(map(str, scaled))

    host_series = pd.Series(list(map(Decimal, data)))
    expected = host_series.pow(2).sum()

    device_series = cudf.Series(data).astype(dtype)
    got = device_series.sum_of_squares()

    assert_eq(expected, got)
def test_max(dtype, nelem):
    """``Series.max`` must equal the NumPy maximum of the same data.

    ``dtype`` may be anything ``np.dtype`` accepts.  It is normalised to the
    scalar type first so that it can be called as a constructor below; a
    string or ``np.dtype`` instance would otherwise raise ``TypeError``.
    """
    dtype = np.dtype(dtype).type
    data = gen_rand(dtype, nelem)
    sr = Series(data)

    got = sr.max()
    # Cast the host result to the requested scalar type before comparing.
    expect = dtype(data.max())

    assert expect == got
def test_sum(dtype, nelem):
    """``Series.sum`` must approximately equal the NumPy sum of the same data.

    ``dtype`` may be anything ``np.dtype`` accepts.  It is normalised to the
    scalar type first so that it can be called as a constructor and so that
    the float32 tolerance check also matches a value such as ``"float32"``.
    """
    dtype = np.dtype(dtype).type
    data = gen_rand(dtype, nelem)
    sr = Series(data)

    got = sr.sum()
    expect = dtype(data.sum())

    # float32 results are compared to fewer significant digits.
    significant = 4 if dtype == np.float32 else 6
    np.testing.assert_approx_equal(expect, got, significant=significant)
def test_min(dtype, nelem):
    """``Series.min`` must equal the NumPy minimum of the same data."""
    # Normalise to the scalar type so it can be called as a constructor.
    scalar_type = np.dtype(dtype).type
    values = gen_rand(scalar_type, nelem)

    result = Series(values).min()
    reference = scalar_type(values.min())

    assert reference == result
def test_series_binop(binop, obj_class):
    """Apply ``binop`` to two cudf Series (or Indexes) and compare with pandas."""
    count = 1000
    lhs_values = utils.gen_rand("float64", count) * 10000
    # Keeping a low value because CUDA 'pow' has 2 full range error
    rhs_values = utils.gen_rand("float64", count) * 10

    lhs = Series(lhs_values)
    rhs = Series(rhs_values)

    as_idx = obj_class == "Index"
    if as_idx:
        lhs = as_index(lhs)
        rhs = as_index(rhs)

    result = binop(lhs, rhs)
    expect = binop(pd.Series(lhs_values), pd.Series(rhs_values))

    # Index results are wrapped in a Series before the comparison.
    if as_idx:
        result = Series(result)

    utils.assert_eq(result, expect)
def test_cummin(dtype, nelem):
    """``cummin`` on a cudf Series and on a DataFrame column must match pandas."""
    # to keep data in range
    bounds = {"low": -2, "high": 2} if dtype == np.int8 else {}
    data = gen_rand(dtype, nelem, **bounds)

    if dtype == np.float32:
        decimal = 4
    else:
        decimal = 6

    # series
    device_series = Series(data)
    host_series = pd.Series(data)
    np.testing.assert_array_almost_equal(
        device_series.cummin(), host_series.cummin(), decimal=decimal
    )

    # dataframe series (named series)
    device_frame = DataFrame()
    device_frame['a'] = Series(data)
    host_frame = pd.DataFrame()
    host_frame['a'] = pd.Series(data)
    np.testing.assert_array_almost_equal(
        device_frame.a.cummin(), host_frame.a.cummin(), decimal=decimal
    )
def test_searchsorted(side, obj_class):
    """Compare ``searchsorted`` on a sorted, masked cudf object with pandas.

    The reference object is converted to an Index when ``obj_class`` is
    "index"; otherwise it stays a Series.  The searched values are always a
    Series.
    """
    nelem = 1000
    column_data = gen_rand("float64", nelem)
    column_mask = random_bitmask(nelem)

    values_data = gen_rand("float64", nelem)
    values_mask = random_bitmask(nelem)

    sr = cudf.Series.from_masked_array(column_data, column_mask)
    vals = cudf.Series.from_masked_array(values_data, values_mask)

    sr = sr.sort_values()

    # Convert only when an Index was requested.  The condition previously
    # tested for "series", so the label and the object under test disagreed.
    if obj_class == "index":
        sr = cudf.Series.as_index(sr)

    psr = sr.to_pandas()
    pvals = vals.to_pandas()

    expect = psr.searchsorted(pvals, side)
    got = sr.searchsorted(vals, side)

    assert_eq(expect, got.to_array())
def gen_df():
    """Build a pandas DataFrame of random float64 columns with letter names.

    Reads ``num_cols``, ``num_rows`` and ``nulls`` from the enclosing scope.
    Column names are lowercase letters picked without replacement; when
    ``nulls == 'some'`` half of each column's rows are set to NaN.
    """
    from string import ascii_lowercase

    frame = pd.DataFrame()
    letter_positions = np.random.choice(num_cols + 5, num_cols, replace=False)

    for position in letter_positions:
        name = ascii_lowercase[position]
        column = utils.gen_rand('float64', num_rows) * 10000
        if nulls == 'some':
            # Blank out a random half of the rows.
            holes = np.random.choice(
                num_rows, size=int(num_rows/2), replace=False
            )
            column[holes] = np.nan
        frame[name] = column

    return frame
def test_sum_masked(nelem):
    """``Series.sum`` on a masked float64 Series must skip masked-out entries.

    The expected value is the NumPy sum over the elements whose expanded
    mask byte is truthy.
    """
    dtype = np.float64
    values = gen_rand(dtype, nelem)
    packed_mask = utils.random_bitmask(nelem)

    # One entry per element, trimmed to the number of elements.
    expanded = utils.expand_bits_to_bytes(packed_mask)[:nelem]
    n_null = utils.count_zero(expanded)

    got = Series.from_masked_array(values, packed_mask, n_null).sum()

    valid = np.asarray(expanded, dtype=np.bool_)[: values.size]
    expect = values[valid].sum()

    if dtype == np.float32:
        significant = 4
    else:
        significant = 6
    np.testing.assert_approx_equal(expect, got, significant=significant)
def test_sum_of_squares(dtype, nelem):
    """``Series.sum_of_squares`` must match ``(data ** 2).sum()`` from NumPy.

    For signed-integer dtypes the comparison is made only when the expected
    value lies in ``[0, iinfo(dtype).max]``; otherwise a message is printed
    and the test passes.
    """
    values = gen_rand(dtype, nelem)
    got = Series(values).sum_of_squares()
    expect = (values ** 2).sum()

    if np.dtype(dtype).kind != "i":
        np.testing.assert_approx_equal(
            expect, got, significant=accuracy_for_dtype[dtype]
        )
        return

    in_range = 0 <= expect <= np.iinfo(dtype).max
    if not in_range:
        print("overflow, passing")
        return

    np.testing.assert_array_almost_equal(expect, got)
def test_product(dtype, nelem):
    """``Series.product`` must approximately equal the NumPy product.

    Signed-integer dtypes use an array of ones with up to 30 randomly chosen
    positions overwritten, so the product stays small; other dtypes use
    ``gen_rand`` data.
    """
    if np.dtype(dtype).kind == "i":
        data = np.ones(nelem, dtype=dtype)
        # Set at most 30 items to [0..2) to keep the value within 2^32
        for _ in range(30):
            data[random.randrange(nelem)] = random.random() * 2
    else:
        data = gen_rand(dtype, nelem)

    sr = Series(data)

    got = sr.product()
    # ``np.prod`` replaces the ``np.product`` alias, which was deprecated in
    # NumPy 1.25 and removed in NumPy 2.0.
    expect = np.prod(data)

    # float32 results are compared to fewer significant digits.
    significant = 4 if dtype == np.float32 else 6
    np.testing.assert_approx_equal(expect, got, significant=significant)
def test_product(dtype, nelem):
    """``Series.product`` must approximately equal the NumPy product.

    Integer dtypes (signed or unsigned) use an array of ones with up to 30
    randomly chosen positions overwritten, so the product stays small; other
    dtypes use ``gen_rand`` data.  The NumPy generator is seeded first.
    """
    np.random.seed(0)
    dtype = np.dtype(dtype).type
    if np.dtype(dtype).kind in {"u", "i"}:
        data = np.ones(nelem, dtype=dtype)
        # Set at most 30 items to [0..2) to keep the value within 2^32
        for _ in range(30):
            data[np.random.randint(low=0, high=nelem, size=1)] = (
                np.random.uniform() * 2
            )
    else:
        data = gen_rand(dtype, nelem)

    sr = Series(data)

    got = sr.product()
    # ``np.prod`` replaces the ``np.product`` alias, which was deprecated in
    # NumPy 1.25 and removed in NumPy 2.0.
    expect = np.prod(data)

    # float32 results are compared to fewer significant digits.
    significant = 4 if dtype == np.float32 else 6
    np.testing.assert_approx_equal(expect, got, significant=significant)
def math_op_test(dtype, fn, nelem=128, test_df=False, positive_only=False):
    """Apply ``fn`` to matching pandas and cudf inputs and compare the results.

    The inputs are Series built from the same random values; with
    ``test_df`` set, each Series is placed in column ``0`` of a fresh
    DataFrame and ``fn`` receives the DataFrame instead.  Both results are
    printed before the comparison.
    """
    raw = gen_rand(dtype, nelem, positive_only=positive_only)
    host_input = pd.Series(raw.astype(dtype))
    device_input = cudf.Series(host_input)

    if test_df:
        host_column, device_column = host_input, device_input
        device_input = cudf.DataFrame()
        device_input[0] = device_column
        host_input = pd.DataFrame()
        host_input[0] = host_column

    expect = fn(host_input)
    got = fn(device_input)

    for label, value in (("got", got), ("expect", expect)):
        print(label)
        print(value)

    assert_eq(expect, got)
def test_reflected_ops_scalar(func, dtype, obj_class):
    """Check a reflected op on a cudf Series/Index against the host result.

    ``func`` is applied both to the cudf object and to the raw random array
    it was built from, and the two results are compared.
    """
    # Seed the generator so the operand is the same on every run.
    np.random.seed(12)
    host_values = utils.gen_rand(dtype, 100, low=10)

    device_obj = Series(host_values)
    use_index = obj_class == 'Index'
    if use_index:
        device_obj = as_index(device_obj)

    device_result = func(device_obj)

    # NOTE(review): this rebinds the input, not the result, and the new
    # value is not read afterwards -- confirm whether the result was meant.
    if use_index:
        device_obj = Series(device_obj)

    host_result = func(host_values)

    np.testing.assert_allclose(host_result, device_result)
def math_op_test(dtype, fn, nelem=128, test_df=False, positive_only=False, check_dtype=True):
    """Apply ``fn`` to matching pandas and cudf inputs and compare the results.

    The inputs are Series built from the same seeded random values; with
    ``test_df`` set, each Series is placed in column ``0`` of a fresh
    DataFrame and ``fn`` receives the DataFrame instead.  ``check_dtype`` is
    forwarded to ``assert_eq``.
    """
    np.random.seed(0)
    raw = gen_rand(dtype, nelem, positive_only=positive_only)
    host_input = pd.Series(raw.astype(dtype))
    device_input = cudf.Series(host_input)

    if test_df:
        host_column, device_column = host_input, device_input
        device_input = cudf.DataFrame()
        device_input[0] = device_column
        host_input = pd.DataFrame()
        host_input[0] = host_column

    expect = fn(host_input)
    got = fn(device_input)

    assert_eq(expect, got, check_dtype=check_dtype)