def test_compare_frame(seed, nelem):
    np.random.seed(seed)
    max_part_size = nelem

    # Make LHS
    lhs = pygdf.DataFrame()
    lhs['a'] = lhs_a = np.random.random(nelem)
    lhs['b'] = lhs_b = np.random.random(nelem)

    # Make RHS
    rhs = pygdf.DataFrame()
    rhs['a'] = rhs_a = np.random.random(nelem)
    rhs['b'] = rhs_b = np.random.random(nelem)

    # Sort by column "a"
    got_a = batcher_sortnet._compare_frame(lhs, rhs, max_part_size, by='a')
    # Check
    expect_a = np.hstack([lhs_a, rhs_a])
    expect_a.sort()
    np.testing.assert_array_equal(got_a[0].a.to_array(), expect_a[:nelem])
    np.testing.assert_array_equal(got_a[1].a.to_array(), expect_a[nelem:])

    # Sort by column "b"
    got_b = batcher_sortnet._compare_frame(lhs, rhs, max_part_size, by='b')
    # Check
    expect_b = np.hstack([lhs_b, rhs_b])
    expect_b.sort()
    np.testing.assert_array_equal(got_b[0].b.to_array(), expect_b[:nelem])
    np.testing.assert_array_equal(got_b[1].b.to_array(), expect_b[nelem:])

def test_join_left(left_nrows, right_nrows, left_nkeys, right_nkeys, how):
    chunksize = 50

    np.random.seed(0)

    # PyGDF
    left = gd.DataFrame({'x': np.random.randint(0, left_nkeys,
                                                size=left_nrows),
                         'a': np.arange(left_nrows,
                                        dtype=np.float64)}.items())
    right = gd.DataFrame({'x': np.random.randint(0, right_nkeys,
                                                 size=right_nrows),
                          'a': 1000 * np.arange(right_nrows,
                                                dtype=np.float64)}.items())

    expect = left.set_index('x').join(right.set_index('x'), how=how,
                                      sort=True, lsuffix='l', rsuffix='r')
    expect = expect.to_pandas()

    # Dask GDF
    left = dgd.from_pygdf(left, chunksize=chunksize)
    right = dgd.from_pygdf(right, chunksize=chunksize)

    joined = left.set_index('x').join(right.set_index('x'), how=how,
                                      lsuffix='l', rsuffix='r')
    got = joined.compute().to_pandas()

    # Check index
    np.testing.assert_array_equal(expect.index.values, got.index.values)

    # Check rows in each group
    expect_rows = {}
    got_rows = {}

    def gather(df, grows):
        cola = np.sort(np.asarray(df.al))
        colb = np.sort(np.asarray(df.ar))
        grows[df['index'].values[0]] = (cola, colb)

    expect.reset_index().groupby('index')\
        .apply(partial(gather, grows=expect_rows))
    got.reset_index().groupby('index')\
        .apply(partial(gather, grows=got_rows))

    for k in expect_rows:
        np.testing.assert_array_equal(expect_rows[k][0], got_rows[k][0])
        np.testing.assert_array_equal(expect_rows[k][1], got_rows[k][1])

def pd2pygdf(df):
    # Convert a pandas DataFrame (or a numpy array) into a pygdf DataFrame,
    # re-keying columns by position.
    if isinstance(df, np.ndarray):
        return np2pygdf(df)
    pdf = pygdf.DataFrame()
    for c, column in enumerate(df):
        pdf[c] = df[column]
    return pdf

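# Hypothetical usage sketch for `pd2pygdf` (illustrative only, not part of
# the original file); note that the resulting columns are keyed by position,
# not by the original pandas column names:
def _example_pd2pygdf():
    pdf = pd.DataFrame({'x': [1, 2, 3], 'y': [0.1, 0.2, 0.3]})
    gdf = pd2pygdf(pdf)
    np.testing.assert_array_equal(gdf[0].to_array(), pdf['x'].values)
    np.testing.assert_array_equal(gdf[1].to_array(), pdf['y'].values)
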
def test_dataframe_to_delayed():
    nelem = 100

    df = gd.DataFrame()
    df['x'] = np.arange(nelem)
    df['y'] = np.random.randint(nelem, size=nelem)

    ddf = dgd.from_pygdf(df, npartitions=5)

    delays = ddf.to_delayed()

    assert len(delays) == 5

    # Concat the delayed partitions
    got = gd.concat([d.compute() for d in delays])
    assert_frame_equal(got.to_pandas(), df.to_pandas())

    # Check individual partitions
    divs = ddf.divisions
    assert len(divs) == len(delays) + 1
    for i, part in enumerate(delays):
        s = divs[i]
        # The last division is the last index, so slice the final
        # partition open-ended
        e = None if i + 1 == len(delays) else divs[i + 1]
        expect = df[s:e].to_pandas()
        got = part.compute().to_pandas()
        assert_frame_equal(got, expect)

def test_serialize_dataframe():
    df = pygdf.DataFrame()
    df['a'] = np.arange(100)
    df['b'] = np.arange(100, dtype=np.float32)
    df['c'] = pd.Categorical(['a', 'b', 'c', '_', '_'] * 20,
                             categories=['a', 'b', 'c'])
    outdf = deserialize(*serialize(df))
    pd.util.testing.assert_frame_equal(df.to_pandas(), outdf.to_pandas())

def make_empty():
    # Build an empty frame with the same schema as the join result.
    df = gd.DataFrame()
    for k in on:
        df[k] = np.asarray([], dtype=dtypes[k])
    for k in left_val_names:
        df[fix_name(k, lsuffix)] = np.asarray([], dtype=dtypes[k])
    for k in right_val_names:
        df[fix_name(k, rsuffix)] = np.asarray([], dtype=dtypes[k])
    return df

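# Note: `make_empty` closes over names defined in its enclosing join routine
# (`on`, `left_val_names`, `right_val_names`, `dtypes`, `fix_name`, `lsuffix`,
# `rsuffix`). A minimal, hypothetical sketch of that assumed context, for
# illustration only (the real definitions live in the enclosing function):
#
#     on = ['x']                                # join key columns
#     left_val_names = ['a']                    # value columns on the left
#     right_val_names = ['a']                   # value columns on the right
#     dtypes = {'x': np.int64, 'a': np.float64}
#     lsuffix, rsuffix = '_x', '_y'
#
#     def fix_name(name, suffix):
#         # Assumed behavior: suffix only names shared by both sides.
#         common = set(left_val_names) & set(right_val_names)
#         return name + suffix if name in common else name
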
def query(df, expr, callenv):
    # Evaluate the boolean expression over the frame, then gather the
    # selected rows into a new frame column by column.
    boolmask = gd.queryutils.query_execute(df, expr, callenv)

    selected = gd.Series.from_array(boolmask)
    newdf = gd.DataFrame()
    for col in df.columns:
        newseries = df[col][selected]
        newdf[col] = newseries
    return newdf

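# Hypothetical usage sketch for `query` above (illustrative only, not part of
# the original file). The exact layout of `callenv` is an assumption here:
# pygdf's queryutils expects the caller's environment, commonly a dict of
# locals and globals for resolving names referenced in the expression.
def _example_query():
    df = gd.DataFrame()
    df['a'] = np.arange(10)
    callenv = {'locals': {}, 'globals': {}}  # assumed layout
    out = query(df, 'a > 5', callenv)
    assert len(out) == 4  # rows with a in {6, 7, 8, 9}
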
def test_sort_values(nelem, nparts, by):
    df = pygdf.DataFrame()
    df['a'] = np.ascontiguousarray(np.arange(nelem)[::-1])
    df['b'] = np.arange(100, nelem + 100)
    ddf = dgd.from_pygdf(df, npartitions=nparts)

    got = ddf.sort_values(by=by).compute().to_pandas()
    expect = df.sort_values(by=by).to_pandas().reset_index(drop=True)
    pd.util.testing.assert_frame_equal(got, expect)

def test_serialize_dataframe_with_index():
    df = pygdf.DataFrame()
    df['a'] = np.arange(100)
    df['b'] = np.random.random(100)
    df['c'] = pd.Categorical(['a', 'b', 'c', '_', '_'] * 20,
                             categories=['a', 'b', 'c'])
    df = df.sort_values('b')
    outdf = deserialize(*serialize(df))
    pd.util.testing.assert_frame_equal(df.to_pandas(), outdf.to_pandas())

def test_compare_frame_with_none():
    df = pygdf.DataFrame()
    max_part_size = 1
    df['a'] = [0]
    res = batcher_sortnet._compare_frame(df, None, max_part_size, by='a')
    assert res[0] is not None and res[1] is None
    res = batcher_sortnet._compare_frame(None, df, max_part_size, by='a')
    assert res[0] is not None and res[1] is None
    res = batcher_sortnet._compare_frame(None, None, max_part_size, by='a')
    assert res == (None, None)

def test_merge_left(left_nrows, right_nrows, left_nkeys, right_nkeys,
                    how='left'):
    print(left_nrows, right_nrows, left_nkeys, right_nkeys)
    chunksize = 3

    np.random.seed(0)

    # PyGDF
    left = gd.DataFrame({'x': np.random.randint(0, left_nkeys,
                                                size=left_nrows),
                         'y': np.random.randint(0, left_nkeys,
                                                size=left_nrows),
                         'a': np.arange(left_nrows,
                                        dtype=np.float64)}.items())
    right = gd.DataFrame({'x': np.random.randint(0, right_nkeys,
                                                 size=right_nrows),
                          'y': np.random.randint(0, right_nkeys,
                                                 size=right_nrows),
                          'a': 1000 * np.arange(right_nrows,
                                                dtype=np.float64)}.items())

    print(left.to_pandas())
    print(right.to_pandas())

    expect = left.merge(right, on=('x', 'y'), how=how)
    expect = expect.to_pandas().sort_values(['x', 'y', 'a_x', 'a_y'])\
        .reset_index(drop=True)

    print("Expect".center(80, '='))
    print(expect)

    # Dask GDF
    left = dgd.from_pygdf(left, chunksize=chunksize)
    right = dgd.from_pygdf(right, chunksize=chunksize)

    joined = left.merge(right, on=('x', 'y'), how=how)

    print("Got".center(80, '='))
    got = joined.compute().to_pandas()
    got = got.sort_values(['x', 'y', 'a_x', 'a_y']).reset_index(drop=True)
    print(got)

    pd.util.testing.assert_frame_equal(expect, got)

def test_serialize_groupby():
    df = pygdf.DataFrame()
    df['key'] = np.random.randint(0, 20, 100)
    df['val'] = np.arange(100, dtype=np.float32)
    gb = df.groupby('key')
    outgb = deserialize(*serialize(gb))

    # The deserialized groupby should produce the same result as the original
    expect = gb.mean()
    got = outgb.mean()
    pd.util.testing.assert_frame_equal(got.to_pandas(), expect.to_pandas())

def test_join_inner(left_nrows, right_nrows, left_nkeys, right_nkeys):
    chunksize = 50

    np.random.seed(0)

    # PyGDF
    left = gd.DataFrame({'x': np.random.randint(0, left_nkeys,
                                                size=left_nrows),
                         'a': np.arange(left_nrows)}.items())
    right = gd.DataFrame({'x': np.random.randint(0, right_nkeys,
                                                 size=right_nrows),
                          'a': 1000 * np.arange(right_nrows)}.items())

    expect = left.set_index('x').join(right.set_index('x'), how='inner',
                                      sort=True, lsuffix='l', rsuffix='r')
    expect = expect.to_pandas()

    # Dask GDF
    left = dgd.from_pygdf(left, chunksize=chunksize)
    right = dgd.from_pygdf(right, chunksize=chunksize)

    joined = left.set_index('x').join(right.set_index('x'), how='inner',
                                      lsuffix='l', rsuffix='r')
    got = joined.compute().to_pandas()

    # Check index
    np.testing.assert_array_equal(expect.index.values, got.index.values)

    # Check rows in each group
    expect_rows = {}
    got_rows = {}

    def gather(df, grows):
        grows[df['index'].values[0]] = (set(df.al), set(df.ar))

    expect.reset_index().groupby('index')\
        .apply(partial(gather, grows=expect_rows))
    got.reset_index().groupby('index')\
        .apply(partial(gather, grows=got_rows))

    assert got_rows == expect_rows

def fix_left(df):
    # Keep the left frame's key and value columns, and pad the right
    # frame's value columns with nulls.
    newdf = gd.DataFrame()
    df = df.reset_index()
    for k in on:
        newdf[k] = df[k]
    for k in left_val_names:
        newdf[fix_name(k, lsuffix)] = df[k]
    for k in right_val_names:
        newdf[fix_name(k, rsuffix)] = nullcolumn(len(df), dtypes[k])
    return newdf

def test_from_scalar_typing(data_type):
    if data_type == 'datetime64[ms]':
        scalar = np.dtype('int64').type(np.random.randint(0, 5))\
            .astype('datetime64[ms]')
    else:
        scalar = np.dtype(data_type).type(np.random.randint(0, 5))

    gdf = gd.DataFrame()
    gdf['a'] = [1, 2, 3, 4, 5]
    gdf['b'] = scalar
    assert gdf['b'].dtype == np.dtype(data_type)
    assert len(gdf['b']) == len(gdf['a'])

def test_tsvd_inverse_transform(datatype):
    # Renamed from test_pca_inverse_transform: this test exercises truncated
    # SVD (cuTSVD), and the PCA variant below already uses that name.
    gdf = pygdf.DataFrame()
    gdf['0'] = np.asarray([-1, -2, -3, 1, 2, 3], dtype=datatype)
    gdf['1'] = np.asarray([-1, -1, -2, 1, 1, 2], dtype=datatype)
    cutsvd = cuTSVD(n_components=1)
    Xcutsvd = cutsvd.fit_transform(gdf)

    print("Calling inverse_transform")
    input_gdf = cutsvd.inverse_transform(Xcutsvd)
    print(input_gdf)

    assert array_equal(input_gdf, gdf, 0.4, with_sign=True)

def test_pca_inverse_transform(datatype):
    gdf = pygdf.DataFrame()
    gdf['0'] = np.asarray([-1, -2, -3, 1, 2, 3], dtype=datatype)
    gdf['1'] = np.asarray([-1, -1, -2, 1, 1, 2], dtype=datatype)
    cupca = cuPCA(n_components=2)
    Xcupca = cupca.fit_transform(gdf)

    print("Calling inverse_transform")
    input_gdf = cupca.inverse_transform(Xcupca)

    assert array_equal(input_gdf, gdf, 1e-3, with_sign=True)

def test_frame_dtype_error():
    nelem = 20

    df1 = gd.DataFrame()
    df1['bad'] = np.arange(nelem)
    df1['bad'] = np.arange(nelem, dtype=np.float64)

    df2 = gd.DataFrame()
    df2['bad'] = np.arange(nelem)
    df2['bad'] = np.arange(nelem, dtype=np.float32)

    ddf1 = dgd.from_pygdf(df1, npartitions=5)
    ddf2 = dgd.from_pygdf(df2, npartitions=5)

    combined = dgd.from_delayed(ddf1.to_delayed() + ddf2.to_delayed())

    with pytest.raises(ValueError) as raises:
        combined.compute()

    raises.match(r"^Metadata mismatch found in `from_delayed`.")
    raises.match(r"\s+\|\s+".join(['bad', 'float32', 'float64']))

def test_pca_fit_transform(datatype):
    gdf = pygdf.DataFrame()
    gdf['0'] = np.asarray([-1, -2, -3, 1, 2, 3], dtype=datatype)
    gdf['1'] = np.asarray([-1, -1, -2, 1, 1, 2], dtype=datatype)
    X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]],
                 dtype=datatype)

    print("Calling fit_transform")
    cupca = cuPCA(n_components=2)
    Xcupca = cupca.fit_transform(gdf)
    skpca = skPCA(n_components=2)
    Xskpca = skpca.fit_transform(X)

    assert array_equal(Xcupca, Xskpca, 1e-3, with_sign=False)

def test_dbscan_predict(datatype):
    gdf = pygdf.DataFrame()
    gdf['0'] = np.asarray([1, 2, 2, 8, 8, 25], dtype=datatype)
    gdf['1'] = np.asarray([2, 2, 3, 7, 8, 80], dtype=datatype)
    X = np.array([[1, 2], [2, 2], [2, 3], [8, 7], [8, 8], [25, 80]],
                 dtype=datatype)

    print("Calling fit_predict")
    cudbscan = cuDBSCAN(eps=3, min_samples=2)
    cu_labels = cudbscan.fit_predict(gdf)
    skdbscan = skDBSCAN(eps=3, min_samples=2)
    sk_labels = skdbscan.fit_predict(X)

    print(X.shape[0])
    for i in range(X.shape[0]):
        assert cu_labels[i] == sk_labels[i]

def fix_column(lhs):
    # Suffix the left frame's columns, and append all-null columns matching
    # the right frame's schema.
    df = gd.DataFrame()
    for k in lhs.columns:
        df[k + lsuffix] = lhs[k]
    for k, dtype in rhs_dtypes:
        data = np.zeros(len(lhs), dtype=dtype)
        mask_size = gd.utils.calc_chunk_size(data.size,
                                             gd.utils.mask_bitsize)
        mask = np.zeros(mask_size, dtype=gd.utils.mask_dtype)
        sr = gd.Series.from_masked_array(data=data, mask=mask,
                                         null_count=data.size)
        df[k + rsuffix] = sr.set_index(df.index)
    return df

def test_pca_fit(datatype):
    gdf = pygdf.DataFrame()
    gdf['0'] = np.asarray([-1, -2, -3, 1, 2, 3], dtype=datatype)
    gdf['1'] = np.asarray([-1, -1, -2, 1, 1, 2], dtype=datatype)
    X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]],
                 dtype=datatype)

    print("Calling fit")
    cupca = cuPCA(n_components=2)
    cupca.fit(gdf)
    skpca = skPCA(n_components=2)
    skpca.fit(X)

    for attr in ['singular_values_', 'components_', 'explained_variance_',
                 'explained_variance_ratio_', 'noise_variance_']:
        with_sign = False if attr in ['components_'] else True
        assert array_equal(getattr(cupca, attr), getattr(skpca, attr),
                           1e-3, with_sign=with_sign)

def test_frame_extra_columns_error():
    nelem = 20

    df = gd.DataFrame()
    df['x'] = np.arange(nelem)
    df['y'] = np.random.randint(nelem, size=nelem)
    ddf1 = dgd.from_pygdf(df, npartitions=5)

    df['z'] = np.arange(nelem)
    ddf2 = dgd.from_pygdf(df, npartitions=5)

    combined = dgd.from_delayed(ddf1.to_delayed() + ddf2.to_delayed())

    with pytest.raises(ValueError) as raises:
        combined.compute()

    raises.match(r"^Metadata mismatch found in `from_delayed`.")
    raises.match(r"extra columns")

def test_mixing_series_frame_error():
    nelem = 20

    df = gd.DataFrame()
    df['x'] = np.arange(nelem)
    df['y'] = np.random.randint(nelem, size=nelem)

    ddf = dgd.from_pygdf(df, npartitions=5)

    delay_frame = ddf.to_delayed()
    delay_series = ddf.x.to_delayed()
    combined = dgd.from_delayed(delay_frame + delay_series)

    with pytest.raises(ValueError) as raises:
        combined.compute()

    raises.match(r"^Metadata mismatch found in `from_delayed`.")
    raises.match(r"Expected partition of type `DataFrame` but got `Series`")

def test_sort_values_binned():
    np.random.seed(43)
    nelem = 100
    nparts = 5
    by = 'a'
    df = pygdf.DataFrame()
    df['a'] = np.random.randint(1, 5, nelem)
    ddf = dgd.from_pygdf(df, npartitions=nparts)

    parts = ddf.sort_values_binned(by=by).to_delayed()
    part_uniques = []
    for p in parts:
        part = dask.compute(p)[0]
        part_uniques.append(set(part.a.unique()))

    # Partitions do not have intersecting keys
    for i in range(len(part_uniques)):
        for j in range(i + 1, len(part_uniques)):
            assert not (part_uniques[i] & part_uniques[j]), \
                "should have empty intersection"

def test_tsvd_fit(datatype):
    gdf = pygdf.DataFrame()
    gdf['0'] = np.asarray([-1, -2, -3, 1, 2, 3], dtype=datatype)
    gdf['1'] = np.asarray([-1, -1, -2, 1, 1, 2], dtype=datatype)
    X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]],
                 dtype=datatype)

    print("Calling fit")
    cutsvd = cuTSVD(n_components=1)
    cutsvd.fit(gdf)
    sktsvd = skTSVD(n_components=1)
    sktsvd.fit(X)

    for attr in ['singular_values_', 'components_',
                 'explained_variance_ratio_']:
        with_sign = False if attr in ['components_'] else True
        assert array_equal(getattr(cutsvd, attr), getattr(sktsvd, attr),
                           0.4, with_sign=with_sign)

def test_index_in_dataframe_constructor():
    a = pd.DataFrame({'x': [1, 2, 3]}, index=[4., 5., 6.])
    b = gd.DataFrame({'x': [1, 2, 3]}, index=[4., 5., 6.])

    pd.testing.assert_frame_equal(a, b.to_pandas())
    # assert_frame_equal returns None, so it must not be wrapped in `assert`
    pd.testing.assert_frame_equal(a.loc[4:], b.loc[4:].to_pandas())

def np2pygdf(df):
    # Convert a 2-D numpy array into a pygdf DataFrame, one column per
    # array column, keyed by position.
    pdf = pygdf.DataFrame()
    for c in range(df.shape[1]):
        pdf[c] = df[:, c]
    return pdf

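# Hypothetical usage sketch for `np2pygdf` (illustrative only, not part of
# the original file):
def _example_np2pygdf():
    arr = np.arange(6, dtype=np.float64).reshape(3, 2)
    gdf = np2pygdf(arr)
    assert len(gdf.columns) == 2
    np.testing.assert_array_equal(gdf[0].to_array(), arr[:, 0])
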
def load_data(nelem, ident):
    # Build a small frame tagged with a constant identifier column.
    df = gd.DataFrame()
    df['x'] = np.arange(nelem)
    df['ident'] = np.asarray([ident] * nelem)
    return df

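# Hypothetical usage sketch for `load_data` (illustrative only, not part of
# the original file): building tagged per-partition frames and concatenating
# them, as the tests above do with gd.concat.
def _example_load_data():
    frames = [load_data(10, ident) for ident in ('a', 'b')]
    combined = gd.concat(frames)
    assert len(combined) == 20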