# Imports assumed by the tests in this section (pygdf / dask_gdf era;
# `pandas.util.testing` was pandas' public testing module at the time).
from functools import partial

import numpy as np
import pandas as pd
import pytest
from pandas.util.testing import assert_frame_equal, assert_series_equal

import dask
from numba import cuda

import pygdf
import pygdf as gd
from pygdf.dataframe import Series
import dask_gdf as dgd


def test_categorical_compare_ordered(data):
    cat1 = data[0]
    cat2 = data[1]
    pdsr1 = pd.Series(cat1)
    pdsr2 = pd.Series(cat2)
    sr1 = Series(cat1)
    sr2 = Series(cat2)
    dsr1 = dgd.from_pygdf(sr1, npartitions=2)
    dsr2 = dgd.from_pygdf(sr2, npartitions=2)

    # Test equality
    out = dsr1 == dsr1
    assert out.dtype == np.bool_
    assert np.all(out.compute())
    assert np.all(pdsr1 == pdsr1)

    # Test inequality
    out = dsr1 != dsr1
    assert not np.any(out.compute())
    assert not np.any(pdsr1 != pdsr1)

    assert dsr1.cat.ordered
    assert pdsr1.cat.ordered

    # Test ordered operators
    np.testing.assert_array_equal(pdsr1 < pdsr2, (dsr1 < dsr2).compute())
    np.testing.assert_array_equal(pdsr1 > pdsr2, (dsr1 > dsr2).compute())

def test_join_left(left_nrows, right_nrows, left_nkeys, right_nkeys, how):
    chunksize = 50

    np.random.seed(0)

    # PyGDF
    left = gd.DataFrame({'x': np.random.randint(0, left_nkeys,
                                                size=left_nrows),
                         'a': np.arange(left_nrows,
                                        dtype=np.float64)}.items())
    right = gd.DataFrame({'x': np.random.randint(0, right_nkeys,
                                                 size=right_nrows),
                          'a': 1000 * np.arange(right_nrows,
                                                dtype=np.float64)}.items())

    expect = left.set_index('x').join(right.set_index('x'), how=how,
                                      sort=True, lsuffix='l', rsuffix='r')
    expect = expect.to_pandas()

    # Dask GDF
    left = dgd.from_pygdf(left, chunksize=chunksize)
    right = dgd.from_pygdf(right, chunksize=chunksize)

    joined = left.set_index('x').join(right.set_index('x'), how=how,
                                      lsuffix='l', rsuffix='r')
    got = joined.compute().to_pandas()

    # Check index
    np.testing.assert_array_equal(expect.index.values, got.index.values)

    # Check the rows in each group
    expect_rows = {}
    got_rows = {}

    def gather(df, grows):
        cola = np.sort(np.asarray(df.al))
        colb = np.sort(np.asarray(df.ar))
        grows[df['index'].values[0]] = (cola, colb)

    expect.reset_index().groupby('index')\
        .apply(partial(gather, grows=expect_rows))

    got.reset_index().groupby('index')\
        .apply(partial(gather, grows=got_rows))

    for k in expect_rows:
        np.testing.assert_array_equal(expect_rows[k][0], got_rows[k][0])
        np.testing.assert_array_equal(expect_rows[k][1], got_rows[k][1])

def test_assign():
    np.random.seed(0)
    df = pd.DataFrame({'x': np.random.randint(0, 5, size=20),
                       'y': np.random.normal(size=20)})

    dgf = dgd.from_pygdf(gd.DataFrame.from_pandas(df), npartitions=2)
    pdcol = pd.Series(np.arange(20) + 1000)
    newcol = dgd.from_pygdf(gd.Series(pdcol), npartitions=dgf.npartitions)
    out = dgf.assign(z=newcol)

    got = out.compute().to_pandas()
    assert_frame_equal(got.loc[:, ['x', 'y']], df)
    np.testing.assert_array_equal(got['z'], pdcol)

def test_categorical_compare_unordered(data):
    cat = data.copy()
    pdsr = pd.Series(cat)
    sr = Series(cat)
    dsr = dgd.from_pygdf(sr, npartitions=2)

    # Test equality
    out = dsr == dsr
    assert out.dtype == np.bool_
    assert np.all(out.compute())
    assert np.all(pdsr == pdsr)

    # Test inequality
    out = dsr != dsr
    assert not np.any(out.compute())
    assert not np.any(pdsr != pdsr)

    assert not dsr.cat.ordered.compute()
    assert not pdsr.cat.ordered

    # Unordered categoricals only support (in)equality comparison
    with pytest.raises((TypeError, ValueError)) as raises:
        pdsr < pdsr
    raises.match("Unordered Categoricals can only compare equality or not")

    with pytest.raises((TypeError, ValueError)) as raises:
        dsr < dsr
    raises.match("Unordered Categoricals can only compare equality or not")

def test_series_to_delayed():
    nelem = 100

    sr = gd.Series(np.random.randint(nelem, size=nelem))

    dsr = dgd.from_pygdf(sr, npartitions=5)

    delays = dsr.to_delayed()

    assert len(delays) == 5

    # Concat the delayed partitions
    got = gd.concat([d.compute() for d in delays])
    assert isinstance(got, gd.Series)
    np.testing.assert_array_equal(got.to_pandas(), sr.to_pandas())

    # Check individual partitions
    divs = dsr.divisions
    assert len(divs) == len(delays) + 1
    for i, part in enumerate(delays):
        s = divs[i]
        # The last entry in `divisions` is the final index value, so the
        # last partition's slice is left open-ended
        e = None if i + 1 == len(delays) else divs[i + 1]
        expect = sr[s:e].to_pandas()
        got = part.compute().to_pandas()
        np.testing.assert_array_equal(got, expect)

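# Several tests below call `_gen_uniform_keys` (directly, or via the `keygen`
# parametrization), but its definition is not part of this excerpt. A minimal
# sketch, assuming it draws uniformly distributed integer keys; the key range
# [0, 20) is an assumption, not taken from the original:
def _gen_uniform_keys(nelem):
    # Hypothetical reconstruction: nelem uniform random integer keys.
    return np.random.randint(0, 20, size=nelem)
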
def test_repeated_groupby():
    np.random.seed(0)
    nelem = 100
    df = pd.DataFrame()
    df['a'] = _gen_uniform_keys(nelem)
    df['b'] = _gen_uniform_keys(nelem)
    ref_df = gd.DataFrame.from_pandas(df)
    df = dgd.from_pygdf(ref_df, npartitions=3)
    assert df.known_divisions

    df2 = df.groupby('a').apply(lambda x: x)
    assert not df2.known_divisions

    got = df2.groupby('a').apply(lambda x: x).compute().to_pandas()
    expect = ref_df.groupby('a').apply(lambda x: x).to_pandas()

    def sort_content(df):
        return sorted(list(df.b))

    got = got.groupby('a').apply(sort_content)
    expect = expect.groupby('a').apply(sort_content)

    pd.util.testing.assert_series_equal(got, expect)

def test_groupby_multi_keys(keygen):
    np.random.seed(0)
    nelem = 500
    npartitions = 10

    # Generate the keys
    xs = keygen(nelem)
    ys = keygen(nelem)

    assert xs.size == nelem
    assert ys.size == nelem
    df = pd.DataFrame({'x': xs,
                       'y': ys,
                       'z': np.random.normal(size=nelem) + 1})

    gdf = gd.DataFrame.from_pandas(df)
    dgf = dgd.from_pygdf(gdf, npartitions=npartitions)

    groups = dgf.groupby(by=['x', 'y']).count()
    got = groups.compute().to_pandas()

    # Check against expectation
    expect = df.groupby(by=['x', 'y'], as_index=False).count()
    # Check keys
    np.testing.assert_array_equal(got.x, expect.x)
    np.testing.assert_array_equal(got.y, expect.y)
    # Check values
    np.testing.assert_array_equal(got.z, expect.z)

def test_dataframe_to_delayed():
    nelem = 100

    df = gd.DataFrame()
    df['x'] = np.arange(nelem)
    df['y'] = np.random.randint(nelem, size=nelem)

    ddf = dgd.from_pygdf(df, npartitions=5)

    delays = ddf.to_delayed()

    assert len(delays) == 5

    # Concat the delayed partitions
    got = gd.concat([d.compute() for d in delays])
    assert_frame_equal(got.to_pandas(), df.to_pandas())

    # Check individual partitions
    divs = ddf.divisions
    assert len(divs) == len(delays) + 1
    for i, part in enumerate(delays):
        s = divs[i]
        # The last entry in `divisions` is the final index value, so the
        # last partition's slice is left open-ended
        e = None if i + 1 == len(delays) else divs[i + 1]
        expect = df[s:e].to_pandas()
        got = part.compute().to_pandas()
        assert_frame_equal(got, expect)

def test_categorical_basic(data):
    cat = data.copy()
    pdsr = pd.Series(cat)
    sr = Series(cat)
    dsr = dgd.from_pygdf(sr, npartitions=2)
    result = dsr.compute()
    np.testing.assert_array_equal(cat.codes, result.to_array())
    assert dsr.dtype == pdsr.dtype

    # Test attributes
    assert pdsr.cat.ordered == dsr.cat.ordered.compute()
    # TODO: Investigate dsr.cat.categories: It raises
    #     ValueError: Expected iterable of tuples of (name, dtype),
    #     got ('a', 'b', 'c')
    # assert tuple(pdsr.cat.categories) == tuple(dsr.cat.categories)

    np.testing.assert_array_equal(pdsr.cat.codes.data, result.to_array())
    assert pdsr.cat.codes.dtype == dsr.cat.codes.dtype

    string = str(result)
    expect_str = """
    0 a
    1 a
    2 b
    3 c
    4 a
    """
    assert all(x == y for x, y in zip(string.split(), expect_str.split()))

def test_dt_series(data, field):
    pdsr = pd.Series(data.copy())
    sr = Series(pdsr)
    dsr = dgd.from_pygdf(sr, npartitions=5)
    base = getattr(pdsr.dt, field)
    test = getattr(dsr.dt, field).compute()\
        .to_pandas().astype('int64')
    assert_series_equal(base, test)

def test_head():
    np.random.seed(0)
    df = pd.DataFrame({'x': np.random.randint(0, 5, size=100),
                       'y': np.random.normal(size=100)})
    gdf = gd.DataFrame.from_pandas(df)
    dgf = dgd.from_pygdf(gdf, npartitions=2)

    assert_frame_equal(dgf.head().to_pandas(), df.head())

def _make_random_frame(nelem, npartitions=2):
    df = pd.DataFrame({'x': np.random.randint(0, 5, size=nelem),
                       'y': np.random.normal(size=nelem) + 1})
    gdf = gd.DataFrame.from_pandas(df)
    dgf = dgd.from_pygdf(gdf, npartitions=npartitions)
    return df, dgf

def test_dt_series(data, field):
    pd_data = pd.Series(data.copy())
    gdf_data = Series(pd_data)
    dask_gdf_data = dgd.from_pygdf(gdf_data, npartitions=5)
    base = getattr(pd_data.dt, field)
    test = getattr(dask_gdf_data.dt, field).compute()\
        .to_pandas().astype('int64')
    assert_series_equal(base, test)

def test_series(data):
    pd_data = pd.Series(data.copy())
    gdf_data = Series(pd_data)
    dask_gdf_data = dgd.from_pygdf(gdf_data, npartitions=5)
    np.testing.assert_equal(
        np.array(pd_data),
        np.array(dask_gdf_data.compute()),
    )

def test_sort_values(nelem, nparts, by):
    df = pygdf.DataFrame()
    df['a'] = np.ascontiguousarray(np.arange(nelem)[::-1])
    df['b'] = np.arange(100, nelem + 100)
    ddf = dgd.from_pygdf(df, npartitions=nparts)

    got = ddf.sort_values(by=by).compute().to_pandas()
    expect = df.sort_values(by=by).to_pandas().reset_index(drop=True)
    pd.util.testing.assert_frame_equal(got, expect)

def test_frame_extra_columns_error():
    nelem = 20

    df = gd.DataFrame()
    df['x'] = np.arange(nelem)
    df['y'] = np.random.randint(nelem, size=nelem)
    ddf1 = dgd.from_pygdf(df, npartitions=5)

    df['z'] = np.arange(nelem)
    ddf2 = dgd.from_pygdf(df, npartitions=5)

    combined = dgd.from_delayed(ddf1.to_delayed() + ddf2.to_delayed())

    with pytest.raises(ValueError) as raises:
        combined.compute()

    raises.match(r"^Metadata mismatch found in `from_delayed`.")
    raises.match(r"extra columns")

def test_series(data):
    pdsr = pd.Series(data.copy())
    sr = Series(pdsr)
    dsr = dgd.from_pygdf(sr, npartitions=5)
    np.testing.assert_equal(
        np.array(pdsr),
        np.array(dsr.compute()),
    )

def test_join_inner(left_nrows, right_nrows, left_nkeys, right_nkeys):
    chunksize = 50

    np.random.seed(0)

    # PyGDF
    left = gd.DataFrame({'x': np.random.randint(0, left_nkeys,
                                                size=left_nrows),
                         'a': np.arange(left_nrows)}.items())
    right = gd.DataFrame({'x': np.random.randint(0, right_nkeys,
                                                 size=right_nrows),
                          'a': 1000 * np.arange(right_nrows)}.items())

    expect = left.set_index('x').join(right.set_index('x'), how='inner',
                                      sort=True, lsuffix='l', rsuffix='r')
    expect = expect.to_pandas()

    # Dask GDF
    left = dgd.from_pygdf(left, chunksize=chunksize)
    right = dgd.from_pygdf(right, chunksize=chunksize)

    joined = left.set_index('x').join(right.set_index('x'), how='inner',
                                      lsuffix='l', rsuffix='r')
    got = joined.compute().to_pandas()

    # Check index
    np.testing.assert_array_equal(expect.index.values, got.index.values)

    # Check the rows in each group
    expect_rows = {}
    got_rows = {}

    def gather(df, grows):
        grows[df['index'].values[0]] = (set(df.al), set(df.ar))

    expect.reset_index().groupby('index')\
        .apply(partial(gather, grows=expect_rows))

    got.reset_index().groupby('index')\
        .apply(partial(gather, grows=got_rows))

    assert got_rows == expect_rows

def test_merge_left(left_nrows, right_nrows, left_nkeys, right_nkeys,
                    how='left'):
    chunksize = 3

    np.random.seed(0)

    # PyGDF
    left = gd.DataFrame({'x': np.random.randint(0, left_nkeys,
                                                size=left_nrows),
                         'y': np.random.randint(0, left_nkeys,
                                                size=left_nrows),
                         'a': np.arange(left_nrows,
                                        dtype=np.float64)}.items())
    right = gd.DataFrame({'x': np.random.randint(0, right_nkeys,
                                                 size=right_nrows),
                          'y': np.random.randint(0, right_nkeys,
                                                 size=right_nrows),
                          'a': 1000 * np.arange(right_nrows,
                                                dtype=np.float64)}.items())

    expect = left.merge(right, on=('x', 'y'), how=how)
    expect = expect.to_pandas().sort_values(['x', 'y', 'a_x', 'a_y'])\
        .reset_index(drop=True)

    # Dask GDF
    left = dgd.from_pygdf(left, chunksize=chunksize)
    right = dgd.from_pygdf(right, chunksize=chunksize)

    joined = left.merge(right, on=('x', 'y'), how=how)

    got = joined.compute().to_pandas()
    got = got.sort_values(['x', 'y', 'a_x', 'a_y']).reset_index(drop=True)

    pd.util.testing.assert_frame_equal(expect, got)

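# `assert_frame_equal_by_index_group` (used by `test_set_index_2` below) is
# not defined in this excerpt. A minimal sketch, assuming the intent is that
# after a distributed set_index, rows sharing an index key may arrive in any
# order, so values are compared sorted within each key; the exact semantics
# are an assumption:
def assert_frame_equal_by_index_group(expect, got):
    assert sorted(expect.columns) == sorted(got.columns)
    assert set(expect.index) == set(got.index)
    for k in set(expect.index):
        for col in expect.columns:
            # Rows with the same index key may be permuted; compare sorted.
            np.testing.assert_array_equal(
                np.sort(expect.loc[[k], col].values),
                np.sort(got.loc[[k], col].values))
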
def test_set_index_2(nelem):
    np.random.seed(0)
    df = pd.DataFrame({'x': 100 + np.random.randint(0, nelem // 2,
                                                    size=nelem),
                       'y': np.random.normal(size=nelem)})
    expect = df.set_index('x').sort_index()

    dgf = dgd.from_pygdf(gd.DataFrame.from_pandas(df), npartitions=4)
    res = dgf.set_index('x')  # sorts by default
    got = res.compute().to_pandas()

    assert_frame_equal_by_index_group(expect, got)

def test_setitem_scalar_float(data_type):
    np.random.seed(0)
    scalar = np.random.randn(1).astype(data_type)[0]
    df = pd.DataFrame({'x': np.random.randint(0, 5, size=20),
                       'y': np.random.normal(size=20)})
    dgf = dgd.from_pygdf(gd.DataFrame.from_pandas(df), npartitions=2)

    df['z'] = scalar
    dgf['z'] = scalar

    got = dgf.compute().to_pandas()
    np.testing.assert_array_equal(got['z'], df['z'])

def test_setitem_scalar_datetime():
    np.random.seed(0)
    scalar = np.int64(np.random.randint(0, 100)).astype('datetime64[ms]')
    df = pd.DataFrame({'x': np.random.randint(0, 5, size=20),
                       'y': np.random.normal(size=20)})
    dgf = dgd.from_pygdf(gd.DataFrame.from_pandas(df), npartitions=2)

    df['z'] = scalar
    dgf['z'] = scalar

    got = dgf.compute().to_pandas()
    np.testing.assert_array_equal(got['z'], df['z'])

def test_frame_dtype_error():
    nelem = 20

    df1 = gd.DataFrame()
    df1['bad'] = np.arange(nelem)
    df1['bad'] = np.arange(nelem, dtype=np.float64)  # overwrite as float64

    df2 = gd.DataFrame()
    df2['bad'] = np.arange(nelem)
    df2['bad'] = np.arange(nelem, dtype=np.float32)  # overwrite as float32

    ddf1 = dgd.from_pygdf(df1, npartitions=5)
    ddf2 = dgd.from_pygdf(df2, npartitions=5)

    combined = dgd.from_delayed(ddf1.to_delayed() + ddf2.to_delayed())

    with pytest.raises(ValueError) as raises:
        combined.compute()

    raises.match(r"^Metadata mismatch found in `from_delayed`.")
    raises.match(r"\s+\|\s+".join(['bad', 'float32', 'float64']))

def test_from_pygdf():
    np.random.seed(0)

    df = pd.DataFrame({'x': np.random.randint(0, 5, size=10000),
                       'y': np.random.normal(size=10000)})

    gdf = gd.DataFrame.from_pandas(df)

    # Test a simple round trip to/from dask
    ingested = dgd.from_pygdf(gdf, npartitions=2)
    assert_frame_equal(ingested.compute().to_pandas(), df)

    # Test conversion to dask.dataframe
    ddf = ingested.to_dask_dataframe()
    assert_frame_equal(ddf.compute(), df)

def test_groupby_apply_grouped():
    np.random.seed(0)
    nelem = 100
    xs = _gen_uniform_keys(nelem)
    ys = _gen_uniform_keys(nelem)
    df = pd.DataFrame({'x': xs,
                       'y': ys,
                       'idx': np.arange(nelem),
                       'v1': np.random.normal(size=nelem),
                       'v2': np.random.normal(size=nelem)})

    gdf = gd.DataFrame.from_pandas(df)
    dgf = dgd.from_pygdf(gdf, npartitions=2)

    def transform(y, v1, v2, out1):
        for i in range(cuda.threadIdx.x, y.size, cuda.blockDim.x):
            out1[i] = y[i] * (v1[i] + v2[i])

    grouped = dgf.groupby(by=['x', 'y']).apply_grouped(
        transform,
        incols=['y', 'v1', 'v2'],
        outcols={'out1': np.float64},
    )

    # Compute with dask
    dgd_grouped = grouped.compute().to_pandas()
    binning = {}
    for _, row in dgd_grouped.iterrows():
        binning[row.idx] = row

    # Emulate the operation with pandas
    def emulate(df):
        df['out1'] = df.y * (df.v1 + df.v2)
        return df

    pd_groupby = df.groupby(by=['x', 'y'], sort=True,
                            as_index=True).apply(emulate)

    # Check the result
    for _, expect in pd_groupby.iterrows():
        got = binning[expect.idx]
        attrs = ['x', 'y', 'v1', 'v2', 'out1']
        for a in attrs:
            np.testing.assert_equal(getattr(got, a), getattr(expect, a))

def test_query():
    np.random.seed(0)

    df = pd.DataFrame({'x': np.random.randint(0, 5, size=10),
                       'y': np.random.normal(size=10)})
    gdf = gd.DataFrame.from_pandas(df)
    expr = 'x > 2'

    assert_frame_equal(gdf.query(expr).to_pandas(), df.query(expr))

    queried = dgd.from_pygdf(gdf, npartitions=2).query(expr)

    got = queried.compute().to_pandas()
    expect = gdf.query(expr).to_pandas()

    assert_frame_equal(got, expect)

def test_mixing_series_frame_error():
    nelem = 20

    df = gd.DataFrame()
    df['x'] = np.arange(nelem)
    df['y'] = np.random.randint(nelem, size=nelem)

    ddf = dgd.from_pygdf(df, npartitions=5)

    delay_frame = ddf.to_delayed()
    delay_series = ddf.x.to_delayed()
    combined = dgd.from_delayed(delay_frame + delay_series)

    with pytest.raises(ValueError) as raises:
        combined.compute()

    raises.match(r"^Metadata mismatch found in `from_delayed`.")
    raises.match(r"Expected partition of type `DataFrame` but got `Series`")

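# `_fragmented_gdf` (used by the two append tests below) is not defined in
# this excerpt. A minimal sketch, assuming it slices a pygdf DataFrame into
# `nsplit` contiguous, roughly equal row ranges:
def _fragmented_gdf(df, nsplit):
    n = len(df)
    # Split the rows [0, n) into nsplit chunks; the last chunk absorbs
    # any remainder.
    subdivsize = n // nsplit
    starts = [i * subdivsize for i in range(nsplit)]
    ends = starts[1:] + [None]
    return [df[s:e] for s, e in zip(starts, ends)]
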
def test_append():
    np.random.seed(0)

    n = 1000
    df = pd.DataFrame({'x': np.random.randint(0, 5, size=n),
                       'y': np.random.normal(size=n)})

    gdf = gd.DataFrame.from_pandas(df)
    frags = _fragmented_gdf(gdf, nsplit=13)

    # Combine with .append
    head = frags[0]
    tail = frags[1:]

    appended = dgd.from_pygdf(head, npartitions=1)
    for each in tail:
        appended = appended.append(each)

    assert_frame_equal(df, appended.compute().to_pandas())

def test_series_append():
    np.random.seed(0)

    n = 1000
    df = pd.DataFrame({'x': np.random.randint(0, 5, size=n),
                       'y': np.random.normal(size=n)})

    gdf = gd.DataFrame.from_pandas(df)
    frags = _fragmented_gdf(gdf, nsplit=13)
    frags = [df.x for df in frags]

    appending = dgd.from_pygdf(frags[0], npartitions=1)
    for frag in frags[1:]:
        appending = appending.append(frag)

    appended = appending.compute().to_pandas()
    assert isinstance(appended, pd.Series)
    np.testing.assert_array_equal(appended, df.x)

def test_sort_values_binned():
    np.random.seed(43)
    nelem = 100
    nparts = 5
    by = 'a'
    df = pygdf.DataFrame()
    df['a'] = np.random.randint(1, 5, nelem)

    ddf = dgd.from_pygdf(df, npartitions=nparts)

    parts = ddf.sort_values_binned(by=by).to_delayed()
    part_uniques = []
    for p in parts:
        part = dask.compute(p)[0]
        part_uniques.append(set(part.a.unique()))

    # Partitions must not have intersecting keys
    for i in range(len(part_uniques)):
        for j in range(i + 1, len(part_uniques)):
            assert not (part_uniques[i] & part_uniques[j]), \
                "should have empty intersection"