def test_validity_add(nelem):
    """Check null propagation when adding two masked series.

    Two series of ``nelem`` random values are built with independent
    bitmasks.  The expected validity of ``lhs + rhs`` is the bitwise AND of
    the two input masks; rows outside that mask are replaced by a sentinel
    on both the expected and the actual side before comparing.
    """
    # LHS
    lhs_data = np.random.random(nelem)
    lhs_mask = utils.random_bitmask(nelem)
    lhs_bitmask = utils.expand_bits_to_bytes(lhs_mask)
    lhs_null_count = utils.count_zero(lhs_bitmask)
    lhs = Series.from_masked_array(lhs_data, lhs_mask, lhs_null_count)
    # RHS
    rhs_data = np.random.random(nelem)
    rhs_mask = utils.random_bitmask(nelem)
    rhs_bitmask = utils.expand_bits_to_bytes(rhs_mask)
    rhs_null_count = utils.count_zero(rhs_bitmask)
    rhs = Series.from_masked_array(rhs_data, rhs_mask, rhs_null_count)
    # Result
    res = lhs + rhs
    # ``np.bool`` was removed from NumPy; ``np.bool_`` is the scalar type.
    # The expanded mask is truncated to ``nelem`` entries.
    res_mask = np.asarray(
        utils.expand_bits_to_bytes(lhs_mask & rhs_mask),
        dtype=np.bool_,
    )[:nelem]
    # Fill NA values
    na_value = -10000
    got = res.fillna(na_value).to_array()
    expect = lhs_data + rhs_data
    expect[~res_mask] = na_value
    # Check
    print('expect')
    print(expect)
    print('got')
    print(got)
    np.testing.assert_array_equal(expect, got)
def test_series_scale():
    """Compare ``Series.scale`` against a NumPy min-max normalisation."""
    values = np.random.randint(low=-10, high=10, size=100)
    series = Series(values)
    lowest, highest = values.min(), values.max()
    # Reference: shift by the minimum and divide by the value range.
    reference = (values - lowest) / (highest - lowest)
    assert reference.min() == 0
    assert reference.max() == 1
    np.testing.assert_equal(series.scale().to_array(), reference)
def test_series_unique():
    """Exercise ``Series.unique_k`` on masked data and on overflow.

    For several sizes, random integers are masked wherever they equal -1 and
    the unmasked values must match ``unique_k(k=10)`` as a set.  Afterwards a
    series holding ten distinct values is asked for ``k=7`` and is expected
    to raise ``ValueError``.
    """
    sizes = [10 ** exponent for exponent in range(5)]
    for size in sizes:
        arr = np.random.randint(low=-1, high=10, size=size)
        valid = arr != -1
        masked = Series.from_masked_array(arr, Series(valid).as_mask())
        found = set(masked.unique_k(k=10).to_array())
        assert set(arr[valid]) == found

    # test out of space
    crowded = Series(np.arange(10))
    with pytest.raises(ValueError) as excinfo:
        crowded.unique_k(k=7)
    excinfo.match('too many unique value')
def test_categorical_unary_ceil():
    """``ceil`` must be rejected for categorical data.

    The pandas series is expected to raise ``AttributeError`` and the
    ``Series`` under test is expected to raise ``TypeError``.
    """
    labels = ['a', 'a', 'b', 'c', 'a']
    categorical = pd.Categorical(labels, categories=['a', 'b', 'c'])
    pandas_series = pd.Series(categorical)
    gpu_series = Series(categorical)

    with pytest.raises(AttributeError) as excinfo:
        pandas_series.ceil()
    excinfo.match(r'''no attribute ['"]ceil['"]''')

    with pytest.raises(TypeError) as excinfo:
        gpu_series.ceil()
    excinfo.match('Categorical cannot perform the operation: ceil')
def test_categorical_empty():
    """An empty categorical must yield matching codes, dtype and attributes."""
    empty = pd.Categorical([])
    pandas_series = pd.Series(empty)
    gpu_series = Series(empty)

    np.testing.assert_array_equal(empty.codes, gpu_series.to_array())
    assert gpu_series.dtype == pandas_series.dtype

    # Test attributes
    assert tuple(pandas_series.cat.categories) == tuple(gpu_series.cat.categories)
    assert pandas_series.cat.ordered == gpu_series.cat.ordered
    np.testing.assert_array_equal(
        pandas_series.cat.codes.data,
        gpu_series.cat.codes.to_array(),
    )
    np.testing.assert_array_equal(
        pandas_series.cat.codes.dtype,
        gpu_series.cat.codes.dtype,
    )
def test_series_unique():
    """Check ``unique``, ``unique_count`` and ``value_counts`` on masked data.

    For several sizes, random integers are masked wherever they equal -1.
    The unmasked values are compared with the series results, and the value
    counts are compared with pandas after sorting both by index.
    """
    for exponent in range(5):
        size = 10 ** exponent
        arr = np.random.randint(low=-1, high=10, size=size)
        valid = arr != -1
        masked = Series.from_masked_array(arr, Series(valid).as_mask())

        assert set(arr[valid]) == set(masked.unique().to_array())
        assert len(set(arr[valid])) == masked.unique_count()

        frame = pd.DataFrame(data=arr[valid], columns=['col'])
        expect = frame.col.value_counts().sort_index()
        got = masked.value_counts().to_pandas().sort_index()
        print(expect.head())
        print(got.head())
        assert got.equals(expect)
def test_series_cmpop_mixed_dtype(cmpop, lhs_dtype, rhs_dtype):
    """Compare a comparison operator on ``Series`` with the NumPy result.

    Parameters
    ----------
    cmpop : callable
        Binary comparison applied both to the NumPy arrays and to the
        ``Series`` wrapping them.
    lhs_dtype, rhs_dtype
        Dtypes the left and right operands are cast to before comparing.
    """
    print(cmpop, lhs_dtype, rhs_dtype)
    nelem = 5
    lhs = (np.random.random(nelem) * nelem).astype(lhs_dtype)
    rhs = (np.random.random(nelem) * nelem).astype(rhs_dtype)
    print(lhs)
    print(rhs)
    exp = cmpop(lhs, rhs)
    got = cmpop(Series(lhs), Series(rhs)).to_array()
    print('exp', exp)
    print('got', got)
    # Assert on the values that were printed instead of recomputing them,
    # so the diagnostic output always matches what is being checked.
    np.testing.assert_array_equal(got, exp)
def test_series(data):
    """A ``Series`` built from a pandas series converts to the same array."""
    expected = pd.Series(data.copy())
    actual = Series(expected)
    np.testing.assert_equal(np.array(expected), np.array(actual))
def test_categorical_compare_unordered(data):
    """Equality works and ordering fails on an unordered categorical.

    The checks run against both the pandas series and the series returned
    by ``dgd.from_pygdf``.
    """
    categorical = data.copy()
    pandas_series = pd.Series(categorical)
    gpu_series = Series(categorical)
    dask_series = dgd.from_pygdf(gpu_series, npartitions=2)

    # Test equality
    equal = dask_series == dask_series
    assert equal.dtype == np.bool_
    assert np.all(equal.compute())
    assert np.all(pandas_series == pandas_series)

    # Test inequality
    different = dask_series != dask_series
    assert not np.any(different.compute())
    assert not np.any(pandas_series != pandas_series)

    assert not dask_series.cat.ordered.compute()
    assert not pandas_series.cat.ordered

    # Ordering comparisons must be rejected by both implementations.
    with pytest.raises((TypeError, ValueError)) as excinfo:
        pandas_series < pandas_series
    excinfo.match("Unordered Categoricals can only compare equality or not")

    with pytest.raises((TypeError, ValueError)) as excinfo:
        dask_series < dask_series
    excinfo.match("Unordered Categoricals can only compare equality or not")
def test_categorical_value_counts(num_elements):
    """``value_counts`` on a categorical column must agree with pandas.

    ``num_elements`` characters are sampled from ASCII letters and digits
    with a fixed seed, and the per-category counts are compared as dicts.
    """
    from string import ascii_letters, digits

    # create categorical series
    np.random.seed(12)
    alphabet = list(ascii_letters + digits)
    sampled = np.random.choice(alphabet, num_elements)
    pd_cat = pd.Categorical(pd.Series(sampled, dtype='category'))

    # gdf
    gdf = DataFrame()
    gdf['a'] = Series.from_categorical(pd_cat)
    gdf_counts = gdf['a'].value_counts()

    # pandas
    pdf = pd.DataFrame()
    pdf['a'] = pd_cat
    pdf_counts = pdf['a'].value_counts()

    # verify
    expected = pdf_counts.to_dict()
    actual = gdf_counts.to_pandas().to_dict()
    assert expected == actual
def test_series_std(dtype):
    """Compare ``Series.std`` with ``numpy.ndarray.std`` for ``dtype``.

    Parameters
    ----------
    dtype
        Dtype the random input is cast to before computing the deviation.
    """
    arr = np.random.random(100)
    # Decide on the *target* dtype: ``arr`` itself is always float64 here,
    # so checking ``arr.dtype`` would never scale, and the cast below would
    # turn values from [0, 1) into all zeros for integer dtypes.
    if np.issubdtype(dtype, np.integer):
        arr *= 100
    arr = arr.astype(dtype)
    sr = Series.from_any(arr)
    np.testing.assert_almost_equal(arr.std(), sr.std())
def test_categorical_masking():
    """
    Test the common operation of selecting all rows that match a certain
    category: compare against a scalar, then index with the boolean result.
    """
    categorical = pd.Categorical(['a', 'a', 'b', 'c', 'a'],
                                 categories=['a', 'b', 'c'])
    pandas_series = pd.Series(categorical)
    gpu_series = Series(categorical)

    # check scalar comparison
    expect_matches = pandas_series == 'a'
    got_matches = gpu_series == 'a'

    print('---expect_matches---')
    print(expect_matches)
    print('---got_matches---')
    print(got_matches)
    np.testing.assert_array_equal(expect_matches.values,
                                  got_matches.to_array())

    # mask series
    expect_masked = pandas_series[expect_matches]
    got_masked = gpu_series[got_matches]

    print('---expect_masked---')
    print(expect_masked)
    print('---got_masked---')
    print(got_masked)

    expected_length = len(expect_masked)
    assert expected_length == len(got_masked)
    assert expected_length == got_masked.valid_count
    assert list(expect_masked) == list(got_masked)
def test_validity_ceil(nelem):
    """Check ``ceil`` on a masked series against ``np.ceil``.

    Null rows are filled with a sentinel on the actual side and overwritten
    with the same sentinel on the expected side before comparing.
    """
    # Data
    data = np.random.random(nelem) * 100
    mask = utils.random_bitmask(nelem)
    bitmask = utils.expand_bits_to_bytes(mask)
    series = Series.from_masked_array(data, mask, utils.count_zero(bitmask))

    # Result
    na_value = -100000
    got = series.ceil().fillna(na_value).to_array()

    # The expanded mask is truncated to the number of data elements.
    valid = np.asarray(bitmask, dtype=np.bool_)[:data.size]
    expect = np.where(valid, np.ceil(data), na_value)

    # Check
    print('expect')
    print(expect)
    print('got')
    print(got)
    np.testing.assert_array_equal(expect, got)
def test_categorical_compare_unordered():
    """Equality works and ordering raises on an unordered categorical."""
    categorical = pd.Categorical(['a', 'a', 'b', 'c', 'a'],
                                 categories=['a', 'b', 'c'])
    pandas_series = pd.Series(categorical)
    gpu_series = Series.from_any(categorical)

    # test equal
    equal = gpu_series == gpu_series
    assert equal.dtype == np.bool_
    assert type(equal[0]) == np.bool_
    assert np.all(equal)
    assert np.all(pandas_series == pandas_series)

    # test inequal
    different = gpu_series != gpu_series
    assert not np.any(different)
    assert not np.any(pandas_series != pandas_series)

    assert not pandas_series.cat.ordered
    assert not gpu_series.cat.ordered

    # test using ordered operators
    with pytest.raises(TypeError) as excinfo:
        pandas_series < pandas_series
    excinfo.match("Unordered Categoricals can only compare equality or not")

    with pytest.raises(TypeError) as excinfo:
        gpu_series < gpu_series
    excinfo.match("Unordered Categoricals can only compare equality or not")
def test_validity_ceil(nelem):
    """Verify that ``ceil`` keeps nulls and rounds the valid rows up.

    Expected values come from ``np.ceil``; rows that the bitmask marks as
    null carry the same sentinel on both sides of the comparison.
    """
    # Data
    values = np.random.random(nelem) * 100
    packed_mask = utils.random_bitmask(nelem)
    expanded_mask = utils.expand_bits_to_bytes(packed_mask)
    nulls = utils.count_zero(expanded_mask)
    masked = Series.from_masked_array(values, packed_mask, nulls)

    # Result
    sentinel = -100000
    rounded = masked.ceil()
    got = rounded.fillna(sentinel).to_array()

    # Only the first ``values.size`` entries of the expanded mask are used.
    is_valid = np.asarray(expanded_mask, dtype=np.bool_)[:values.size]
    expect = np.where(is_valid, np.ceil(values), sentinel)

    # Check
    print('expect')
    print(expect)
    print('got')
    print(got)
    np.testing.assert_array_equal(expect, got)
def test_categorical_basic(data):
    """Check codes, dtype, ordering and printing of a partitioned categorical.

    The series is passed through ``dgd.from_pygdf`` with two partitions and
    computed back before its contents are compared with pandas.
    """
    categorical = data.copy()
    pandas_series = pd.Series(categorical)
    gpu_series = Series(categorical)
    dask_series = dgd.from_pygdf(gpu_series, npartitions=2)
    computed = dask_series.compute()

    np.testing.assert_array_equal(categorical.codes, computed.to_array())
    assert dask_series.dtype == pandas_series.dtype

    # Test attributes
    assert pandas_series.cat.ordered == dask_series.cat.ordered.compute()
    # TODO: Investigate dsr.cat.categories: It raises
    # ValueError: Expected iterable of tuples of (name, dtype),
    # got ('a', 'b', 'c')
    # assert(tuple(pdsr.cat.categories) == tuple(dsr.cat.categories))

    np.testing.assert_array_equal(pandas_series.cat.codes.data,
                                  computed.to_array())
    np.testing.assert_array_equal(pandas_series.cat.codes.dtype,
                                  dask_series.cat.codes.dtype)

    rendered = str(computed)
    expect_str = """
0 a
1 a
2 b
3 c
4 a
"""
    # Token-wise comparison; ``zip`` stops at the shorter token list.
    token_pairs = zip(rendered.split(), expect_str.split())
    assert all(seen == wanted for seen, wanted in token_pairs)
def test_dt_series(data, field):
    """Compare the ``.dt.<field>`` accessor with pandas after partitioning.

    Parameters
    ----------
    data
        Input copied into a pandas series.
    field : str
        Name of the attribute read from the ``.dt`` accessor.
    """
    pandas_series = pd.Series(data.copy())
    partitioned = dgd.from_pygdf(Series(pandas_series), npartitions=5)

    base = getattr(pandas_series.dt, field)
    computed = getattr(partitioned.dt, field).compute()
    # Cast to int64 before comparing with the pandas accessor result.
    test = computed.to_pandas().astype('int64')
    assert_series_equal(base, test)
def test_dt_series(data, field):
    """The datetime accessor ``field`` must match pandas.

    The series is split into five partitions via ``dgd.from_pygdf``; the
    accessor result is computed, converted to pandas and cast to int64.
    """
    reference = pd.Series(data.copy())
    gpu_series = Series(reference)
    dask_series = dgd.from_pygdf(gpu_series, npartitions=5)

    expected = getattr(reference.dt, field)
    accessor_result = getattr(dask_series.dt, field)
    actual = accessor_result.compute().to_pandas().astype('int64')
    assert_series_equal(expected, actual)
def test_dataframe_to_string():
    """Check ``DataFrame`` string rendering under row and column limits.

    With formatting options ``nrows=5`` and ``ncols=8`` the test covers row
    truncation, column truncation via ``ncols=3``, and a masked column
    rendered next to two dense columns.
    """
    with set_options(formatting={'nrows': 5, 'ncols': 8}):
        # Test basic
        df = DataFrame([('a', [1, 2, 3, 4, 5, 6]),
                        ('b', [11, 12, 13, 14, 15, 16])])
        rendered = str(df)
        print(rendered)
        assert rendered.splitlines()[-1] == '[1 more rows]'

        # Test skipped columns
        df = DataFrame([('a', [1, 2, 3, 4, 5, 6]),
                        ('b', [11, 12, 13, 14, 15, 16]),
                        ('c', [11, 12, 13, 14, 15, 16]),
                        ('d', [11, 12, 13, 14, 15, 16])])
        rendered = df.to_string(ncols=3)
        print(rendered)
        lines = rendered.splitlines()
        assert lines[-2] == '[1 more rows]'
        assert lines[-1] == '[1 more columns]'

        # Test masked
        df = DataFrame([('a', [1, 2, 3, 4, 5, 6]),
                        ('b', [11, 12, 13, 14, 15, 16])])

        data = np.arange(6)
        mask = np.zeros(1, dtype=np.uint8)
        mask[0] = 0b00101101

        masked = Series.from_masked_array(data, mask)
        assert masked.null_count == 2
        df['c'] = masked

        # check data
        values = list(masked)
        validids = [0, 2, 3, 5]
        np.testing.assert_equal(data[validids], masked.to_array())
        # valid position is correct
        for position in validids:
            assert data[position] == values[position]
        # null position is correct
        for position, value in enumerate(values):
            if position not in validids:
                assert value is None

        got = df.to_string(nrows=None)
        print(got)
        expect = '''
  a b  c
0 1 11 0
1 2 12
2 3 13 2
3 4 14 3
4 5 15
5 6 16 5
'''
        # values should match despite whitespace difference
        assert got.split() == expect.split()
def test_series(data):
    """Values must survive partitioning with ``dgd.from_pygdf`` and ``compute``."""
    expected = pd.Series(data.copy())
    partitioned = dgd.from_pygdf(Series(expected), npartitions=5)
    actual = partitioned.compute()
    np.testing.assert_equal(np.array(expected), np.array(actual))
def test_series(data):
    """Compare a computed five-partition series with the pandas original."""
    reference = pd.Series(data.copy())
    gpu_series = Series(reference)
    dask_series = dgd.from_pygdf(gpu_series, npartitions=5)

    reference_array = np.array(reference)
    computed_array = np.array(dask_series.compute())
    np.testing.assert_equal(reference_array, computed_array)
def test_series_scale():
    """``Series.scale`` must equal ``(x - min) / (max - min)`` from NumPy."""
    samples = np.random.randint(low=-10, high=10, size=100)
    series = Series.from_any(samples)

    minimum = samples.min()
    span = samples.max() - minimum
    normalised = (samples - minimum) / span

    # Sanity-check the reference before using it.
    assert normalised.min() == 0
    assert normalised.max() == 1
    np.testing.assert_equal(series.scale().to_array(), normalised)
def test_categorical_element_indexing():
    """
    Iterating a categorical column must yield the underlying objects, not
    the numerical codes; the codes are compared separately via ``.cat.codes``.
    """
    categorical = pd.Categorical(['a', 'a', 'b', 'c', 'a'],
                                 categories=['a', 'b', 'c'])
    pandas_series = pd.Series(categorical)
    gpu_series = Series(categorical)

    expected_items = list(pandas_series)
    assert expected_items == list(gpu_series)

    expected_codes = list(pandas_series.cat.codes)
    assert expected_codes == list(gpu_series.cat.codes)
def test_to_dense_array():
    """``to_array`` drops nulls unless ``fillna='pandas'`` is requested.

    A series of eight values with three nulls is built; the dense array must
    be shorter than the filled one, and the filled one as long as the series.
    """
    data = np.random.random(8)
    # 0b11010110 (214) does not fit in a signed byte: newer NumPy rejects
    # the out-of-range Python int.  Build the bit pattern as uint8 and
    # reinterpret it as int8, which gives the same bits on every version.
    mask = np.asarray([0b11010110], dtype=np.uint8).view(np.byte)
    sr = Series.from_masked_array(data=data, mask=mask, null_count=3)
    assert sr.null_count > 0
    assert sr.null_count != len(sr)
    filled = sr.to_array(fillna='pandas')
    dense = sr.to_array()
    assert dense.size < filled.size
    assert filled.size == len(sr)
def test_fillna():
    """``fillna`` must return a same-length series without a null mask.

    The input is the ninth column returned by ``GpuArrowReader``, which is
    asserted to contain nulls before the fill.
    """
    schema, darr = read_data()
    reader = GpuArrowReader(schema, darr)
    column = reader[8]
    assert column.null_count

    masked = Series.from_masked_array(
        data=column.data,
        mask=column.null,
        null_count=column.null_count,
    )
    filled = masked.fillna(123)

    np.testing.assert_equal(123, filled.to_array())
    assert len(filled) == len(masked)
    assert not filled.has_null_mask
def test_series_reductions(method, dtype):
    """Compare a reduction on a masked ``Series`` with NumPy.

    Parameters
    ----------
    method : str
        Name of the reduction, looked up on both the array and the series.
    dtype
        Dtype of the test data; integer dtypes use scaled values and a
        different mask threshold.
    """
    np.random.seed(0)
    arr = np.random.random(100)
    if np.issubdtype(dtype, np.integer):
        arr *= 100
        threshold = 10
    else:
        threshold = 0.5
    mask = arr > threshold

    arr = arr.astype(dtype)
    # NumPy reference only sees the rows the mask keeps.
    kept = arr[mask]
    sr = Series.from_masked_array(arr, Series(mask).as_mask())

    expect = getattr(kept, method)()
    got = getattr(sr, method)()
    print(expect, got)
    np.testing.assert_approx_equal(expect, got)
def test_to_dense_array():
    """Dense conversion omits nulls; ``fillna='pandas'`` keeps full length.

    The series has eight values and a one-byte mask with three cleared
    bits, so ``to_array()`` must be shorter than the filled conversion.
    """
    data = np.random.random(8)
    # The bit pattern 0b11010110 is 214, outside the int8 range, and newer
    # NumPy refuses to convert such Python ints.  Create it as uint8 and
    # view it as int8 so the stored byte is unchanged.
    mask = np.asarray([0b11010110], dtype=np.uint8).view(np.byte)
    sr = Series.from_masked_array(data=data, mask=mask, null_count=3)
    assert sr.null_count > 0
    assert sr.null_count != len(sr)
    filled = sr.to_array(fillna='pandas')
    dense = sr.to_array()
    assert dense.size < filled.size
    assert filled.size == len(sr)
def test_categorical_integer():
    """Values outside the category list must show up as nulls.

    Two of the five inputs (``'_'``) are not among the categories; the test
    expects ``null_count == 2`` and codes matching pandas once filled with -1.
    """
    categorical = pd.Categorical(['a', '_', '_', 'c', 'a'],
                                 categories=['a', 'b', 'c'])
    pandas_series = pd.Series(categorical)
    gpu_series = Series(categorical)

    np.testing.assert_array_equal(categorical.codes,
                                  gpu_series.to_array(fillna='pandas'))
    assert gpu_series.null_count == 2

    filled_codes = gpu_series.cat.codes.fillna(-1).to_array()
    np.testing.assert_array_equal(pandas_series.cat.codes.data, filled_codes)
    np.testing.assert_equal(pandas_series.cat.codes.dtype,
                            gpu_series.cat.codes.dtype)

    rendered = str(gpu_series)
    expect_str = """
0 a
1
2
3 c
4 a
"""
    # Compare token by token so whitespace differences are ignored.
    assert rendered.split() == expect_str.split()
def test_fillna():
    """Filling nulls with 123 must leave no null mask and keep the length."""
    schema, darr = read_data()
    masked_col = GpuArrowReader(schema, darr)[8]
    # The column must contain nulls for the test to be meaningful.
    assert masked_col.null_count

    source = Series.from_masked_array(data=masked_col.data,
                                      mask=masked_col.null,
                                      null_count=masked_col.null_count)
    fill_value = 123
    result = source.fillna(fill_value)

    np.testing.assert_equal(fill_value, result.to_array())
    assert len(result) == len(source)
    assert not result.has_null_mask
def test_categorical_missing():
    """Missing categorical entries must come back as nulls.

    Arrays obtained with ``fillna='pandas'`` have their NaN entries replaced
    by -1 before being compared with the pandas codes.
    """
    def nan_to_minus_one(array):
        # Map NaN entries to -1 and leave everything else untouched.
        return np.where(np.isnan(array), -1, array)

    categorical = pd.Categorical(['a', '_', '_', 'c', 'a'],
                                 categories=['a', 'b', 'c'])
    pandas_series = pd.Series(categorical)
    gpu_series = Series(categorical)

    values = gpu_series.to_array(fillna='pandas')
    np.testing.assert_array_equal(categorical.codes, nan_to_minus_one(values))
    assert gpu_series.null_count == 2

    codes = gpu_series.cat.codes.to_array(fillna='pandas')
    np.testing.assert_array_equal(pandas_series.cat.codes.data,
                                  nan_to_minus_one(codes))
    np.testing.assert_equal(pandas_series.cat.codes.dtype,
                            gpu_series.cat.codes.dtype)

    rendered = str(gpu_series)
    expect_str = """
0 a
1
2
3 c
4 a
"""
    assert rendered.split() == expect_str.split()
def test_categorical_binary_add():
    """Adding categorical series must raise ``TypeError``.

    Both the pandas series and the ``Series`` under test are added to
    themselves, and the error message of each is matched.
    """
    cat = pd.Categorical(['a', 'a', 'b', 'c', 'a'], categories=['a', 'b', 'c'])
    pdsr = pd.Series(cat)
    sr = Series.from_any(cat)

    with pytest.raises(TypeError) as raises:
        pdsr + pdsr
    # Raw string: ``\+`` is a regex escape for a literal plus sign, not a
    # Python string escape, so it must not be written in a plain literal.
    raises.match(r'Categorical cannot perform the operation \+')

    with pytest.raises(TypeError) as raises:
        sr + sr
    raises.match('Categorical cannot perform the operation: add')
def test_nonmatching_index_setitem(nrows):
    """Assign a plain array as a column of a frame that has a custom index.

    The new column must have ``nrows`` entries and equal a ``Series`` of the
    same values re-indexed with the frame's index.
    """
    np.random.seed(0)
    upper_bound = 2147483647

    gdf = DataFrame()
    for name in ('a', 'b'):
        gdf[name] = np.random.randint(upper_bound, size=nrows)
    gdf = gdf.set_index('b')

    test_values = np.random.randint(upper_bound, size=nrows)
    gdf['c'] = test_values
    assert len(test_values) == len(gdf['c'])

    actual = gdf['c'].to_pandas()
    expected = Series(test_values).set_index(gdf._index).to_pandas()
    assert actual.equals(expected)
def test_categorical_basic():
    """Check codes, dtype, accessor attributes and printing of a categorical."""
    categorical = pd.Categorical(['a', 'a', 'b', 'c', 'a'],
                                 categories=['a', 'b', 'c'])
    pandas_series = pd.Series(categorical)
    gpu_series = Series(categorical)

    np.testing.assert_array_equal(categorical.codes, gpu_series.to_array())
    assert gpu_series.dtype == pandas_series.dtype

    # Test attributes
    assert tuple(pandas_series.cat.categories) == tuple(gpu_series.cat.categories)
    assert pandas_series.cat.ordered == gpu_series.cat.ordered

    np.testing.assert_array_equal(pandas_series.cat.codes.data,
                                  gpu_series.cat.codes.to_array())
    np.testing.assert_array_equal(pandas_series.cat.codes.dtype,
                                  gpu_series.cat.codes.dtype)

    rendered = str(gpu_series)
    expect_str = """
0 a
1 a
2 b
3 c
4 a
"""
    # Token-wise comparison; ``zip`` stops at the shorter token list.
    token_pairs = zip(rendered.split(), expect_str.split())
    assert all(seen == wanted for seen, wanted in token_pairs)
def test_series_indexing():
    """Slicing a ``Series`` repeatedly must mirror NumPy slicing.

    Each slice is checked to have ``null_count == 0`` and to hold the same
    values as the corresponding slice of the source array.
    """
    source = np.arange(20)
    series = Series(source)

    # Indexing
    head = series[:12]
    assert head.null_count == 0
    np.testing.assert_equal(head.to_array(), source[:12])

    middle = head[3:]
    assert middle.null_count == 0
    np.testing.assert_equal(middle.to_array(), source[3:12])

    # Index with stride
    strided = middle[::2]
    assert strided.null_count == 0
    np.testing.assert_equal(strided.to_array(), source[3:12:2])
def test_series_indexing():
    """Chained slices must match NumPy and never carry a null mask."""
    base = np.arange(20)
    series = Series.from_any(base)

    # Indexing
    first_twelve = series[:12]
    assert not first_twelve.has_null_mask
    np.testing.assert_equal(first_twelve.to_array(), base[:12])

    from_third = first_twelve[3:]
    assert not from_third.has_null_mask
    np.testing.assert_equal(from_third.to_array(), base[3:12])

    # Index with stride
    every_other = from_third[::2]
    assert not every_other.has_null_mask
    np.testing.assert_equal(every_other.to_array(), base[3:12:2])
def test_onehot_generic_index():
    """One-hot encode a column whose series carries a ``GenericIndex``.

    The encoded frame must contain the source column plus one column per
    category, each equal to the matching boolean comparison on the values.
    """
    np.random.seed(0)
    size = 33
    indices = np.random.randint(low=0, high=100, size=size)
    df = DataFrame()
    values = np.random.randint(low=0, high=4, size=size)
    df['fo'] = Series(values, index=GenericIndex(indices))

    out = df.one_hot_encoding('fo', cats=df.fo.unique(), prefix='fo',
                              dtype=np.int32)
    assert set(out.columns) == {'fo', 'fo_0', 'fo_1', 'fo_2', 'fo_3'}

    # Encoded column number ``code`` must flag the rows equal to ``code``.
    encoded_columns = [out.fo_0, out.fo_1, out.fo_2, out.fo_3]
    for code, column in enumerate(encoded_columns):
        np.testing.assert_array_equal(values == code, column.to_array())
def test_series_floor():
    """``Series.floor`` must agree with ``np.floor`` on random floats."""
    samples = np.random.random(100) * 100
    floored = Series(samples).floor()
    np.testing.assert_equal(floored.to_array(), np.floor(samples))