def test_to_hdf_link_optimizations(): """testing dask link levels is correct by calculating the depth of the dask graph""" pytest.importorskip('tables') df16 = pd.DataFrame({'x': ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p'], 'y': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]}, index=[1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16.]) a = dd.from_pandas(df16, 16) # saving to multiple hdf files, no links are needed # expected layers: from_pandas, to_hdf, list = depth of 3 with tmpdir() as dn: fn = os.path.join(dn, 'data*') d = a.to_hdf(fn, '/data', compute=False) assert dependency_depth(d.dask) == 3 # saving to a single hdf file with multiple nodes # all subsequent nodes depend on the first # expected layers: from_pandas, first to_hdf(creates file+node), subsequent to_hdfs, list = 4 with tmpfile() as fn: d = a.to_hdf(fn, '/data*', compute=False) assert dependency_depth(d.dask) == 4 # saving to a single hdf file with a single node # every node depends on the previous node # expected layers: from_pandas, to_hdf times npartitions(15), list = 2 + npartitions = 17 with tmpfile() as fn: d = a.to_hdf(fn, '/data', compute=False) assert dependency_depth(d.dask) == 2 + a.npartitions
def test_to_hdf_link_optimizations(): """testing dask link levels is correct by calculating the depth of the dask graph""" pytest.importorskip('tables') df16 = pd.DataFrame({'x': ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p'], 'y': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]}, index=[1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16.]) a = dd.from_pandas(df16, 16) # saving to multiple hdf files, no links are needed # expected layers: from_pandas, to_hdf, list = depth of 3 with tmpdir() as dn: fn = os.path.join(dn, 'data*') d = a.to_hdf(fn, '/data', compute=False) assert dependency_depth(d.dask) == 3 # saving to a single hdf file with multiple nodes # all subsequent nodes depend on the first # expected layers: from_pandas, first to_hdf(creates file+node), subsequent to_hdfs, list = 4 with tmpfile() as fn: d = a.to_hdf(fn, '/data*', compute=False) assert dependency_depth(d.dask) == 4 # saving to a single hdf file with a single node # every node depends on the previous node # expected layers: from_pandas, to_hdf times npartitions(15), list = 2 + npartitions = 17 with tmpfile() as fn: d = a.to_hdf(fn, '/data', compute=False) assert dependency_depth(d.dask) == 2 + a.npartitions
def test_to_hdf_link_optimizations(): """testing dask link levels is correct by calculating the depth of the dask graph""" pytest.importorskip("tables") df16 = pd.DataFrame( { "x": [ "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o", "p", ], "y": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16], }, index=[ 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, ], ) a = dd.from_pandas(df16, 16) # saving to multiple hdf files, no links are needed # expected layers: from_pandas, to_hdf, list = depth of 3 with tmpdir() as dn: fn = os.path.join(dn, "data*") d = a.to_hdf(fn, "/data", compute=False) assert dependency_depth(d.dask) == 3 # saving to a single hdf file with multiple nodes # all subsequent nodes depend on the first # expected layers: from_pandas, first to_hdf(creates file+node), subsequent to_hdfs, list = 4 with tmpfile() as fn: d = a.to_hdf(fn, "/data*", compute=False) assert dependency_depth(d.dask) == 4 # saving to a single hdf file with a single node # every node depends on the previous node # expected layers: from_pandas, to_hdf times npartitions(15), list = 2 + npartitions = 17 with tmpfile() as fn: d = a.to_hdf(fn, "/data", compute=False) assert dependency_depth(d.dask) == 2 + a.npartitions