def test_index_with_dataframe(method):
    """Shuffling by a one-column frame, a list of names, or a name agrees.

    The three results are compared as sorted lists of rows, so row order
    within the shuffled output does not matter.
    """
    index_specs = (d[['b']], ['b'], 'b')
    by_frame, by_list, by_name = (
        sorted(shuffle(d, spec, shuffle=method).compute().values.tolist())
        for spec in index_specs
    )

    assert by_frame == by_list
    assert by_frame == by_name
def test_shuffle():
    """Check the basic contract of ``shuffle`` with two output partitions.

    The result must be a dask DataFrame with two partitions whose ``b``
    values do not overlap, and repeating the same call must yield the same
    collection name.
    """
    shuffled = shuffle(d, d.b, npartitions=2)
    assert isinstance(shuffled, dd.DataFrame)
    assert shuffled.npartitions == 2

    # Materialize each output partition individually.
    first = get_sync(shuffled.dask, (shuffled._name, 0))
    second = get_sync(shuffled.dask, (shuffled._name, 1))
    assert set(first.b).isdisjoint(set(second.b))  # disjoint

    # Two identical calls must produce identical names.
    name_a = shuffle(d, d.b, npartitions=2)._name
    name_b = shuffle(d, d.b, npartitions=2)._name
    assert name_a == name_b
def test_shuffle_empty_partitions(method):
    """Every partition of a shuffled frame keeps the collection's columns.

    Thirty rows holding only three distinct ``x`` values are shuffled into
    six partitions; each computed partition must carry the same column
    labels as the shuffled collection.
    """
    df = pd.DataFrame({'x': [1, 2, 3] * 10})
    ddf = dd.from_pandas(df, npartitions=3)
    s = shuffle(ddf, ddf.x, npartitions=6, shuffle=method)
    parts = s._get(s.dask, s._keys())

    # Compare labels as plain lists: ``==`` between column indexes is
    # element-wise, and asserting on the resulting array only works by
    # accident for a single column.
    expected_columns = list(s.columns)
    for p in parts:
        assert list(p.columns) == expected_columns
def test_shuffle_from_one_partition_to_one_other(method):
    """Shuffling a single-partition frame into 1 or 2 partitions keeps its length."""
    source = dd.from_pandas(pd.DataFrame({'x': [1, 2, 3]}), 1)

    for target_npartitions in (1, 2):
        shuffled = shuffle(source, 'x', npartitions=target_npartitions,
                           shuffle=method)
        rows_before = len(source.compute(get=get_sync))
        rows_after = len(shuffled.compute(get=get_sync))
        assert rows_before == rows_after
def test_shuffle_empty_partitions(method):
    """Every partition of a shuffled frame keeps the collection's columns.

    Thirty rows holding only three distinct ``x`` values are shuffled into
    six partitions; each computed partition must carry the same column
    labels as the shuffled collection.
    """
    df = pd.DataFrame({'x': [1, 2, 3] * 10})
    ddf = dd.from_pandas(df, npartitions=3)
    s = shuffle(ddf, ddf.x, npartitions=6, shuffle=method)
    parts = compute_as_if_collection(dd.DataFrame, s.dask, s.__dask_keys__())

    # Compare labels as plain lists: ``==`` between column indexes is
    # element-wise, and asserting on the resulting array only works by
    # accident for a single column.
    expected_columns = list(s.columns)
    for p in parts:
        assert list(p.columns) == expected_columns
def test_shuffle_npartitions_task():
    """Task-based shuffle from 10 to 17 partitions preserves the data.

    Checks the partition count, that the shuffled graph contains every key
    of the input graph, and that length, columns and the set of rows are
    unchanged.
    """
    frame = pd.DataFrame({'x': np.random.random(100)})
    ddf = dd.from_pandas(frame, npartitions=10)
    s = shuffle(ddf, ddf.x, shuffle='tasks', npartitions=17, max_branch=4)
    computed = s.compute(get=get_sync)

    assert s.npartitions == 17
    # The shuffled graph must build on top of the input graph.
    assert set(s.dask) >= set(ddf.dask)

    assert len(computed) == len(frame)
    assert list(s.columns) == list(frame.columns)

    # Row order is not preserved by a shuffle, so compare as sets of rows.
    rows_after = {tuple(row) for row in computed.values.tolist()}
    rows_before = {tuple(row) for row in frame.values.tolist()}
    assert rows_after == rows_before
def test_index_with_non_series(method):
    """Shuffling by the ``b`` series and by the name ``'b'`` give matching results.

    The comparison is delegated to the ``list_eq`` helper from
    ``dask.dataframe.tests.test_multi``.
    """
    from dask.dataframe.tests.test_multi import list_eq

    by_series = shuffle(d, d.b, shuffle=method)
    by_name = shuffle(d, 'b', shuffle=method)
    list_eq(by_series, by_name)
def test_default_partitions():
    """Without ``npartitions``, shuffle keeps the input's partition count."""
    shuffled = shuffle(d, d.b)
    assert shuffled.npartitions == d.npartitions
def test_index_with_non_series(method):
    """Shuffling by the ``b`` series and by the name ``'b'`` compute equal frames."""
    by_series = shuffle(d, d.b, shuffle=method).compute()
    by_name = shuffle(d, 'b', shuffle=method).compute()
    tm.assert_frame_equal(by_series, by_name)
def test_index_with_dataframe():
    """Shuffling by a one-column frame, a list of names, or a name agrees.

    Results are compared as sorted lists of rows, so row order within the
    shuffled output does not matter.
    """
    by_frame = sorted(shuffle(d, d[['b']]).compute().values.tolist())
    by_list = sorted(shuffle(d, ['b']).compute().values.tolist())
    assert by_frame == by_list

    # Only computed once the first comparison holds, as in a chained ``==``.
    by_name = sorted(shuffle(d, 'b').compute().values.tolist())
    assert by_list == by_name
def test_index_with_non_series():
    """Shuffling by the ``b`` series and by the name ``'b'`` compute equal frames."""
    by_series = shuffle(d, d.b).compute()
    by_name = shuffle(d, 'b').compute()
    tm.assert_frame_equal(by_series, by_name)
def test_index_with_dataframe(method):
    """Shuffling by a one-column frame, a list of names, or a name agrees.

    Results are compared as sorted lists of rows, so row order within the
    shuffled output does not matter.
    """
    by_frame = sorted(
        shuffle(d, d[['b']], shuffle=method).compute().values.tolist())
    by_list = sorted(
        shuffle(d, ['b'], shuffle=method).compute().values.tolist())
    assert by_frame == by_list

    # Only computed once the first comparison holds, as in a chained ``==``.
    by_name = sorted(
        shuffle(d, 'b', shuffle=method).compute().values.tolist())
    assert by_list == by_name
def test_index_with_non_series(shuffle_method):
    """Shuffling by the ``b`` series and by the name ``"b"`` give matching results.

    The comparison is delegated to the ``list_eq`` helper from
    ``dask.dataframe.tests.test_multi``.
    """
    from dask.dataframe.tests.test_multi import list_eq

    by_series = shuffle(d, d.b, shuffle=shuffle_method)
    by_name = shuffle(d, "b", shuffle=shuffle_method)
    list_eq(by_series, by_name)