def test_to_hdf_link_optimizations():
    """Check that dask link levels are correct by calculating the depth
    of the dask graph."""
    pytest.importorskip('tables')
    df16 = pd.DataFrame({'x': ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h',
                               'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p'],
                         'y': [1, 2, 3, 4, 5, 6, 7, 8,
                               9, 10, 11, 12, 13, 14, 15, 16]},
                        index=[1., 2., 3., 4., 5., 6., 7., 8.,
                               9., 10., 11., 12., 13., 14., 15., 16.])
    a = dd.from_pandas(df16, 16)

    # saving to multiple hdf files, no links are needed
    # expected layers: from_pandas, to_hdf, list = depth of 3
    with tmpdir() as dn:
        fn = os.path.join(dn, 'data*')
        d = a.to_hdf(fn, '/data', compute=False)
        assert dependency_depth(d.dask) == 3

    # saving to a single hdf file with multiple nodes
    # all subsequent nodes depend on the first
    # expected layers: from_pandas, first to_hdf (creates file+node),
    # subsequent to_hdfs, list = 4
    with tmpfile() as fn:
        d = a.to_hdf(fn, '/data*', compute=False)
        assert dependency_depth(d.dask) == 4

    # saving to a single hdf file with a single node
    # every node depends on the previous node
    # expected layers: from_pandas, one to_hdf per partition, list
    # = 2 + npartitions
    with tmpfile() as fn:
        d = a.to_hdf(fn, '/data', compute=False)
        assert dependency_depth(d.dask) == 2 + a.npartitions

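# The two test_to_hdf_link_optimizations tests in this file use a
# `dependency_depth` helper that is not shown in this excerpt. A minimal
# sketch of such a helper, assuming dask.core.get_deps and toolz.memoize
# (both real APIs); the helper actually used may differ in detail:
from dask.core import get_deps
from toolz import memoize


def dependency_depth(dsk):
    # length of the longest dependency chain in the graph
    deps, _ = get_deps(dsk)

    @memoize
    def max_depth_by_deps(key):
        if not deps[key]:
            return 1
        return 1 + max(max_depth_by_deps(dep_key) for dep_key in deps[key])

    return max(max_depth_by_deps(key) for key in deps)
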
def test_read_hdf():
    pytest.importorskip('tables')
    df = pd.DataFrame({'x': ['a', 'b', 'c', 'd'],
                       'y': [1, 2, 3, 4]}, index=[1., 2., 3., 4.])
    with tmpfile('h5') as fn:
        df.to_hdf(fn, '/data')
        try:
            dd.read_hdf(fn, 'data', chunksize=2)
            assert False
        except TypeError as e:
            assert "format='table'" in str(e)

    with tmpfile('h5') as fn:
        df.to_hdf(fn, '/data', format='table')
        a = dd.read_hdf(fn, '/data', chunksize=2)
        assert a.npartitions == 2
        assert a._known_dtype
        tm.assert_frame_equal(a.compute(), df)

        tm.assert_frame_equal(
            dd.read_hdf(fn, '/data', chunksize=2, start=1, stop=3).compute(),
            pd.read_hdf(fn, '/data', start=1, stop=3))

        assert (sorted(dd.read_hdf(fn, '/data').dask) ==
                sorted(dd.read_hdf(fn, '/data').dask))

def test_to_hdf_thread():
    pytest.importorskip('tables')
    df = pd.DataFrame({'x': ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h',
                             'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p'],
                       'y': [1, 2, 3, 4, 5, 6, 7, 8,
                             9, 10, 11, 12, 13, 14, 15, 16]},
                      index=[1., 2., 3., 4., 5., 6., 7., 8.,
                             9., 10., 11., 12., 13., 14., 15., 16.])
    a = dd.from_pandas(df, 16)

    # test single file single node
    with tmpfile('h5') as fn:
        a.to_hdf(fn, '/data', get=dask.threaded.get)
        out = pd.read_hdf(fn, '/data')
        eq(df, out)

    # test multiple files single node
    with tmpdir() as dn:
        fn = os.path.join(dn, 'data_*.h5')
        a.to_hdf(fn, '/data', get=dask.threaded.get)
        out = dd.read_hdf(fn, '/data')
        eq(df, out)

    # test single file multiple nodes
    with tmpfile('h5') as fn:
        a.to_hdf(fn, '/data*', get=dask.threaded.get)
        out = dd.read_hdf(fn, '/data*')
        eq(df, out)

def test_tokenize_numpy_memmap():
    with tmpfile('.npy') as fn:
        x = np.arange(5)
        np.save(fn, x)
        y = tokenize(np.load(fn, mmap_mode='r'))

    with tmpfile('.npy') as fn:
        x = np.arange(5)
        np.save(fn, x)
        z = tokenize(np.load(fn, mmap_mode='r'))
    assert y != z

    with tmpfile('.npy') as fn:
        x = np.random.normal(size=(10, 10))
        np.save(fn, x)
        mm = np.load(fn, mmap_mode='r')
        mm2 = np.load(fn, mmap_mode='r')
        a = tokenize(mm[0, :])
        b = tokenize(mm[1, :])
        c = tokenize(mm[0:3, :])
        d = tokenize(mm[:, 0])
        assert len(set([a, b, c, d])) == 4
        assert tokenize(mm) == tokenize(mm2)
        assert tokenize(mm[1, :]) == tokenize(mm2[1, :])

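# Most tests in this file use the tmpfile/tmpdir context managers from
# dask.utils. A minimal sketch of the pattern they implement, using only
# the standard library (the real helpers may differ in detail): yield a
# fresh path and remove whatever was created there on exit.
import os
import shutil
import tempfile
from contextlib import contextmanager


@contextmanager
def _tmpfile_sketch(extension=''):
    extension = '.' + extension.lstrip('.') if extension else ''
    handle, filename = tempfile.mkstemp(extension)
    os.close(handle)
    os.remove(filename)  # hand out a path that does not exist yet
    try:
        yield filename
    finally:
        if os.path.isdir(filename):
            shutil.rmtree(filename)
        elif os.path.exists(filename):
            os.remove(filename)
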
def test_read_hdf(data, compare):
    pytest.importorskip('tables')
    with tmpfile('h5') as fn:
        data.to_hdf(fn, '/data')
        try:
            dd.read_hdf(fn, 'data', chunksize=2, mode='r')
            assert False
        except TypeError as e:
            assert "format='table'" in str(e)

    with tmpfile('h5') as fn:
        data.to_hdf(fn, '/data', format='table')
        a = dd.read_hdf(fn, '/data', chunksize=2, mode='r')
        assert a.npartitions == 2

        compare(a.compute(), data)

        compare(
            dd.read_hdf(fn, '/data', chunksize=2, start=1, stop=3,
                        mode='r').compute(),
            pd.read_hdf(fn, '/data', start=1, stop=3))

        assert (sorted(dd.read_hdf(fn, '/data', mode='r').dask) ==
                sorted(dd.read_hdf(fn, '/data', mode='r').dask))

    with tmpfile('h5') as fn:
        sorted_data = data.sort_index()
        sorted_data.to_hdf(fn, '/data', format='table')
        a = dd.read_hdf(fn, '/data', chunksize=2, sorted_index=True,
                        mode='r')
        assert a.npartitions == 2

        compare(a.compute(), sorted_data)

def test_to_hdf():
    pytest.importorskip('tables')
    df = pd.DataFrame({'x': ['a', 'b', 'c', 'd'],
                       'y': [1, 2, 3, 4]}, index=[1., 2., 3., 4.])
    a = dd.from_pandas(df, 2)

    with tmpfile('h5') as fn:
        a.to_hdf(fn, '/data')
        out = pd.read_hdf(fn, '/data')
        tm.assert_frame_equal(df, out[:])

    with tmpfile('h5') as fn:
        a.x.to_hdf(fn, '/data')
        out = pd.read_hdf(fn, '/data')
        tm.assert_series_equal(df.x, out[:])

    a = dd.from_pandas(df, 1)
    with tmpfile('h5') as fn:
        a.to_hdf(fn, '/data')
        out = pd.read_hdf(fn, '/data')
        tm.assert_frame_equal(df, out[:])

    # test compute=False
    with tmpfile('h5') as fn:
        r = a.to_hdf(fn, '/data', compute=False)
        r.compute()
        out = pd.read_hdf(fn, '/data')
        tm.assert_frame_equal(df, out[:])

def test_tokenize_numpy_memmap_no_filename():
    # GH 1562:
    with tmpfile('.npy') as fn1, tmpfile('.npy') as fn2:
        x = np.arange(5)
        np.save(fn1, x)
        np.save(fn2, x)

        a = np.load(fn1, mmap_mode='r')
        b = a + a
        assert tokenize(b) == tokenize(b)

def test_to_csv_series():
    s = pd.Series([1, 2, 3], index=[10, 20, 30], name="foo")
    a = dd.from_pandas(s, 2)
    with tmpfile("csv") as fn:
        with tmpfile("csv") as fn2:
            a.to_csv(fn)
            s.to_csv(fn2)
            with open(fn) as f:
                adata = f.read()
            with open(fn2) as f:
                sdata = f.read()
            assert adata == sdata

def test_to_textfiles_inputs():
    B = db.from_sequence(['abc', '123', 'xyz'], npartitions=2)
    with tmpfile() as a:
        with tmpfile() as b:
            B.to_textfiles([a, b]).compute()
            assert os.path.exists(a)
            assert os.path.exists(b)
    with tmpfile() as dirname:
        B.to_textfiles(dirname).compute()
        assert os.path.exists(dirname)
        assert os.path.exists(os.path.join(dirname, '0.part'))
    assert raises(ValueError, lambda: B.to_textfiles(5))

def test_memmap():
    with tmpfile('npy') as fn_1:
        with tmpfile('npy') as fn_2:
            x = da.arange(100, chunks=15)
            target = np.memmap(fn_1, shape=x.shape, mode='w+',
                               dtype=x.dtype)

            x.store(target)

            assert eq(target, x)

            np.save(fn_2, target)
            assert eq(np.load(fn_2, mmap_mode='r'), x)

def test_ensure_file_directory(mkdir):
    a = {'x': 1, 'y': {'a': 1}}
    with tmpfile(extension='yaml') as source:
        with tmpfile() as destination:
            if mkdir:
                os.mkdir(destination)
            with open(source, 'w') as f:
                yaml.dump(a, f)

            ensure_file(source=source, destination=destination)

            assert os.path.isdir(destination)
            [fn] = os.listdir(destination)
            assert os.path.split(fn)[1] == os.path.split(source)[1]

def test_to_hdf():
    pytest.importorskip("tables")
    df = pd.DataFrame(
        {"x": ["a", "b", "c", "d"], "y": [1, 2, 3, 4]},
        index=[1.0, 2.0, 3.0, 4.0],
    )
    a = dd.from_pandas(df, 2)

    with tmpfile("h5") as fn:
        a.to_hdf(fn, "/data")
        out = pd.read_hdf(fn, "/data")
        tm.assert_frame_equal(df, out[:])

    with tmpfile("h5") as fn:
        a.x.to_hdf(fn, "/data")
        out = pd.read_hdf(fn, "/data")
        tm.assert_series_equal(df.x, out[:])

def test_to_hdf_kwargs():
    pytest.importorskip('tables')
    df = pd.DataFrame({'A': ['a', 'aaaa']})
    ddf = dd.from_pandas(df, npartitions=2)
    with tmpfile('h5') as fn:
        ddf.to_hdf(fn, 'foo4', format='table', min_itemsize=4)
        df2 = pd.read_hdf(fn, 'foo4')
        tm.assert_frame_equal(df, df2)

    # test shorthand 't' for table
    with tmpfile('h5') as fn:
        ddf.to_hdf(fn, 'foo4', format='t', min_itemsize=4)
        df2 = pd.read_hdf(fn, 'foo4')
        tm.assert_frame_equal(df, df2)

def test_to_textfiles_inputs():
    B = db.from_sequence(['abc', '123', 'xyz'], npartitions=2)
    with tmpfile() as a:
        with tmpfile() as b:
            B.to_textfiles([a, b])
            assert os.path.exists(a)
            assert os.path.exists(b)
    with tmpdir() as dirname:
        B.to_textfiles(dirname)
        assert os.path.exists(dirname)
        assert os.path.exists(os.path.join(dirname, '0.part'))
    with pytest.raises(TypeError):
        B.to_textfiles(5)

def test_read_json_error():
    with tmpfile('json') as f:
        with pytest.raises(ValueError):
            df.to_json(f, orient='split', lines=True)
        df.to_json(f, orient='split', lines=False)
        with pytest.raises(ValueError):
            dd.read_json(f, orient='split', blocksize=1)

def test_gh715():
    bin_data = u'\u20ac'.encode('utf-8')
    with tmpfile() as fn:
        with open(fn, 'wb') as f:
            f.write(bin_data)
        a = db.from_filenames(fn)
        assert a.compute()[0] == bin_data.decode('utf-8')

def test_to_textfiles_endlines():
    b = db.from_sequence(['a', 'b', 'c'], npartitions=1)
    with tmpfile() as fn:
        b.to_textfiles([fn])
        with open(fn, 'r') as f:
            result = f.readlines()
        assert result == ['a\n', 'b\n', 'c']

def test_to_hdf_lock_delays():
    pytest.importorskip('tables')
    df16 = pd.DataFrame({'x': ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h',
                               'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p'],
                         'y': [1, 2, 3, 4, 5, 6, 7, 8,
                               9, 10, 11, 12, 13, 14, 15, 16]},
                        index=[1., 2., 3., 4., 5., 6., 7., 8.,
                               9., 10., 11., 12., 13., 14., 15., 16.])
    a = dd.from_pandas(df16, 16)

    # Add artificial delays so that early partitions finish last; this
    # exercises the locking when later tasks complete first.
    def delayed_nop(i):
        if i[1] < 10:
            sleep(0.1 * (10 - i[1]))
        return i

    # saving to multiple hdf nodes
    with tmpfile() as fn:
        a = a.apply(delayed_nop, axis=1, columns=a.columns)
        a.to_hdf(fn, '/data*')
        out = dd.read_hdf(fn, '/data*')
        eq(df16, out)

    # saving to multiple hdf files
    with tmpdir() as dn:
        fn = os.path.join(dn, 'data*')
        a = a.apply(delayed_nop, axis=1, columns=a.columns)
        a.to_hdf(fn, '/data')
        out = dd.read_hdf(fn, '/data')
        eq(df16, out)

def test_gh606():
    encoding = 'utf-16-le'
    euro = u'\u20ac'
    yen = u'\u00a5'
    linesep = os.linesep

    bin_euro = u'\u20ac'.encode(encoding)
    bin_yen = u'\u00a5'.encode(encoding)
    bin_linesep = linesep.encode(encoding)

    data = (euro * 10) + linesep + (yen * 10) + linesep + (euro * 10)
    bin_data = data.encode(encoding)

    with tmpfile() as fn:
        with open(fn, 'w+b') as f:
            f.write(bin_data)
            f.seek(0)

            stop = len(bin_euro) * 10 + len(bin_linesep)
            res = textblock(f, 1, stop, encoding=encoding)
            assert res == ((yen * 10) + linesep).encode(encoding)

            stop = len(bin_euro) * 10 + len(bin_linesep)
            res = textblock(f, 0, stop, encoding=encoding)
            assert res == ((euro * 10) + linesep + (yen * 10) +
                           linesep).encode(encoding)

def test_nextlinesep():
    lineseps = ('\r', '\n', '\r\n')
    encodings = ('utf-16-le', 'utf-8')

    for sep, encoding in product(lineseps, encodings):
        euro = u'\u20ac'
        yen = u'\u00a5'

        bin_euro = u'\u20ac'.encode(encoding)
        bin_yen = u'\u00a5'.encode(encoding)
        bin_sep = sep.encode(encoding)

        data = (euro * 10) + sep + (yen * 10) + sep + (euro * 10)
        bin_data = data.encode(encoding)

        with tmpfile() as fn:
            with open(fn, 'w+b') as f:
                f.write(bin_data)
                f.seek(0)

                start, stop = next_linesep(f, 5, encoding, sep)
                assert start == len(bin_euro) * 10
                assert stop == len(bin_euro) * 10 + len(bin_sep)

                seek = len(bin_euro) * 10 + len(bin_sep) + len(bin_yen)
                start, stop = next_linesep(f, seek, encoding, sep)

                exp_start = (len(bin_euro) * 10 + len(bin_sep) +
                             len(bin_yen) * 10)
                exp_stop = exp_start + len(bin_sep)
                assert start == exp_start
                assert stop == exp_stop

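# A small companion check of the byte offsets the two tests above rely
# on: in utf-16-le the euro sign, the yen sign, and each separator
# character all encode to 2 bytes, so ten euro signs occupy 20 bytes,
# which is where the first line separator begins. This test is an
# illustrative addition, not part of the original suite.
def test_utf16le_byte_widths():
    enc = 'utf-16-le'
    assert len(u'\u20ac'.encode(enc)) == 2
    assert len(u'\u00a5'.encode(enc)) == 2
    assert len((u'\u20ac' * 10).encode(enc)) == 20
    assert len(u'\r\n'.encode(enc)) == 4
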
def test_linecount_gzip():
    with tmpfile('gz') as fn:
        f = gzip.open(fn, 'wb')
        for line in text.split('\n'):
            f.write(line.encode('ascii'))
            f.write(b'\n')
        f.close()
        assert linecount(fn) == 7

def test_to_textfiles_endlines():
    b = db.from_sequence(['a', 'b', 'c'], npartitions=1)
    with tmpfile() as fn:
        for last_endline in False, True:
            b.to_textfiles([fn], last_endline=last_endline)
            with open(fn, 'r') as f:
                result = f.readlines()
            assert result == ['a\n', 'b\n', 'c\n' if last_endline else 'c']

def test_to_csv():
    df = pd.DataFrame({'x': ['a', 'b', 'c', 'd'],
                       'y': [1, 2, 3, 4]}, index=[1., 2., 3., 4.])

    for npartitions in [1, 2]:
        a = dd.from_pandas(df, npartitions)
        with tmpfile('csv') as fn:
            a.to_csv(fn)
            result = pd.read_csv(fn, index_col=0)
            tm.assert_frame_equal(result, df)

        a = dd.from_pandas(df, npartitions)
        with tmpfile('csv') as fn:
            r = a.to_csv(fn, compute=False)
            r.compute()
            result = pd.read_csv(fn, index_col=0)
            tm.assert_frame_equal(result, df)

def test_linecount_bz2():
    with tmpfile('bz2') as fn:
        f = bz2.BZ2File(fn, 'wb')
        for line in text.split('\n'):
            f.write(line.encode('ascii'))
            f.write(b'\n')
        f.close()
        assert linecount(fn) == 7

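# `linecount` is a helper from the bag tests that is not shown in this
# excerpt. A minimal sketch under the assumption that it simply counts
# newline-delimited lines, picking a decompressor from the file
# extension (gzip.open and bz2.BZ2File are the real stdlib APIs;
# _linecount_sketch is a hypothetical name):
import bz2
import gzip


def _linecount_sketch(fn):
    openers = {'gz': gzip.open, 'bz2': bz2.BZ2File}
    opener = openers.get(fn.rsplit('.', 1)[-1], open)
    with opener(fn, 'rb') as f:
        return sum(1 for _ in f)
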
def test_visualize_order():
    pytest.importorskip('matplotlib')
    x = da.arange(5, chunks=2)
    with tmpfile(extension='dot') as fn:
        x.visualize(color='order', filename=fn, cmap='RdBu')
        with open(fn) as f:
            text = f.read()
        assert 'color="#' in text

def test_to_hdf_modes_multiple_nodes():
    pytest.importorskip('tables')
    df = pd.DataFrame({'x': ['a', 'b', 'c', 'd'],
                       'y': [1, 2, 3, 4]}, index=[1., 2., 3., 4.])

    # appending a single partition to existing data
    a = dd.from_pandas(df, 1)
    with tmpfile('h5') as fn:
        a.to_hdf(fn, '/data2')
        a.to_hdf(fn, '/data*', mode='a')
        out = dd.read_hdf(fn, '/data*')
        eq(df.append(df), out)

    # overwriting a file with a single partition
    a = dd.from_pandas(df, 1)
    with tmpfile('h5') as fn:
        a.to_hdf(fn, '/data2')
        a.to_hdf(fn, '/data*', mode='w')
        out = dd.read_hdf(fn, '/data*')
        eq(df, out)

    # appending two partitions to existing data
    a = dd.from_pandas(df, 2)
    with tmpfile('h5') as fn:
        a.to_hdf(fn, '/data2')
        a.to_hdf(fn, '/data*', mode='a')
        out = dd.read_hdf(fn, '/data*')
        eq(df.append(df), out)

    # overwriting a file with two partitions
    a = dd.from_pandas(df, 2)
    with tmpfile('h5') as fn:
        a.to_hdf(fn, '/data2')
        a.to_hdf(fn, '/data*', mode='w')
        out = dd.read_hdf(fn, '/data*')
        eq(df, out)

    # overwriting a single partition, keeping other partitions
    a = dd.from_pandas(df, 2)
    with tmpfile('h5') as fn:
        a.to_hdf(fn, '/data1')
        a.to_hdf(fn, '/data2')
        a.to_hdf(fn, '/data*', mode='a', append=False)
        out = dd.read_hdf(fn, '/data*')
        eq(df.append(df), out)

def test_read_hdf_doesnt_segfault():
    with tmpfile("h5") as fn:
        N = 40
        df = pd.DataFrame(np.random.randn(N, 3))
        with pd.HDFStore(fn, mode="w") as store:
            store.append("/x", df)

        ddf = dd.read_hdf(fn, "/x", chunksize=2)
        assert len(ddf) == N

def test_to_csv_gzip():
    df = pd.DataFrame(
        {"x": ["a", "b", "c", "d"], "y": [1, 2, 3, 4]},
        index=[1.0, 2.0, 3.0, 4.0],
    )

    for npartitions in [1, 2]:
        a = dd.from_pandas(df, npartitions)
        with tmpfile("csv") as fn:
            a.to_csv(fn, compression="gzip")
            result = pd.read_csv(fn, index_col=0, compression="gzip")
            tm.assert_frame_equal(result, df)

def test_from_bcolz_filename():
    bcolz = pytest.importorskip("bcolz")

    with tmpfile(".bcolz") as fn:
        t = bcolz.ctable(
            [[1, 2, 3], [1.0, 2.0, 3.0], ["a", "b", "a"]],
            names=["x", "y", "a"],
            rootdir=fn,
        )
        t.flush()

        d = dd.from_bcolz(fn, chunksize=2)
        assert list(d.x.compute()) == [1, 2, 3]

def test_to_hdf_multiple_datasets():
    pytest.importorskip('tables')
    df = pd.DataFrame({'x': ['a', 'b', 'c', 'd'],
                       'y': [1, 2, 3, 4]}, index=[1., 2., 3., 4.])
    a = dd.from_pandas(df, 2)
    df16 = pd.DataFrame({'x': ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h',
                               'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p'],
                         'y': [1, 2, 3, 4, 5, 6, 7, 8,
                               9, 10, 11, 12, 13, 14, 15, 16]},
                        index=[1., 2., 3., 4., 5., 6., 7., 8.,
                               9., 10., 11., 12., 13., 14., 15., 16.])
    b = dd.from_pandas(df16, 16)

    # saving to multiple datasets making sure order is kept
    with tmpfile('h5') as fn:
        b.to_hdf(fn, '/data*')
        out = dd.read_hdf(fn, '/data*')
        eq(df16, out)

    # saving to multiple datasets
    with tmpfile('h5') as fn:
        a.to_hdf(fn, '/data*')
        out = dd.read_hdf(fn, '/data*')
        eq(df, out)

    # saving to multiple files
    with tmpdir() as dn:
        fn = os.path.join(dn, 'data_*.h5')
        a.to_hdf(fn, '/data')
        out = dd.read_hdf(fn, '/data')
        eq(df, out)

    # saving to multiple datasets with custom name_function
    with tmpfile('h5') as fn:
        a.to_hdf(fn, '/data_*', name_function=lambda i: 'a' * (i + 1))
        out = dd.read_hdf(fn, '/data_*')
        eq(df, out)

        out = pd.read_hdf(fn, '/data_a')
        tm.assert_frame_equal(out, df.iloc[:2])
        out = pd.read_hdf(fn, '/data_aa')
        tm.assert_frame_equal(out, df.iloc[2:])

    # saving to multiple files with custom name_function
    with tmpdir() as dn:
        fn = os.path.join(dn, 'data_*.h5')
        a.to_hdf(fn, '/data', name_function=lambda i: 'a' * (i + 1))
        out = dd.read_hdf(fn, '/data')
        eq(df, out)

        out = pd.read_hdf(os.path.join(dn, 'data_a.h5'), '/data')
        tm.assert_frame_equal(out, df.iloc[:2])
        out = pd.read_hdf(os.path.join(dn, 'data_aa.h5'), '/data')
        tm.assert_frame_equal(out, df.iloc[2:])

    # triggering too many asterisks error
    with tmpdir() as dn:
        with pytest.raises(ValueError):
            fn = os.path.join(dn, 'data_*.h5')
            a.to_hdf(fn, '/data_*')

    # triggering too many asterisks error
    with tmpfile() as fn:
        with pd.HDFStore(fn) as hdf:
            with pytest.raises(ValueError):
                a.to_hdf(hdf, '/data_*_*')

    # test hdf object
    with tmpfile('h5') as fn:
        with pd.HDFStore(fn) as hdf:
            a.to_hdf(hdf, '/data*')
            out = dd.read_hdf(fn, '/data*')
            eq(df, out)

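# The name_function tests above pin down how to_hdf expands a '*' in the
# file path or HDF key: each partition gets a name, by default its
# number, or name_function(i) when given. A tiny standalone sketch of
# that substitution (_format_target is a hypothetical name used for
# illustration, not dask API):
def _format_target(pattern, i, name_function=None):
    name = name_function(i) if name_function is not None else str(i)
    return pattern.replace('*', name)


def test_format_target_sketch():
    # matches the '/data_a' and '/data_aa' keys asserted above
    assert _format_target('/data_*', 0, lambda i: 'a' * (i + 1)) == '/data_a'
    assert _format_target('/data_*', 1, lambda i: 'a' * (i + 1)) == '/data_aa'
    assert _format_target('data_*.h5', 3) == 'data_3.h5'
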
def test_to_hdf_lock_delays():
    pytest.importorskip("tables")
    df16 = pd.DataFrame(
        {
            "x": ["a", "b", "c", "d", "e", "f", "g", "h",
                  "i", "j", "k", "l", "m", "n", "o", "p"],
            "y": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16],
        },
        index=[1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0,
               9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0],
    )
    a = dd.from_pandas(df16, 16)

    # Add artificial delays so that early partitions finish last; this
    # exercises the locking when later tasks complete first.
    def delayed_nop(i):
        if i[1] < 10:
            sleep(0.1 * (10 - i[1]))
        return i

    # saving to multiple hdf nodes
    with tmpfile() as fn:
        a = a.apply(delayed_nop, axis=1, meta=a)
        a.to_hdf(fn, "/data*")
        out = dd.read_hdf(fn, "/data*")
        assert_eq(df16, out)

    # saving to multiple hdf files
    with tmpdir() as dn:
        fn = os.path.join(dn, "data*")
        a = a.apply(delayed_nop, axis=1, meta=a)
        a.to_hdf(fn, "/data")
        out = dd.read_hdf(fn, "/data")
        assert_eq(df16, out)

def test_to_hdf_schedulers(scheduler, npartitions):
    pytest.importorskip("tables")
    df = pd.DataFrame(
        {
            "x": ["a", "b", "c", "d", "e", "f", "g", "h",
                  "i", "j", "k", "l", "m", "n", "o", "p"],
            "y": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16],
        },
        index=[1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0,
               9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0],
    )
    a = dd.from_pandas(df, npartitions=npartitions)

    # test single file single node
    with tmpfile("h5") as fn:
        a.to_hdf(fn, "/data", scheduler=scheduler)
        out = pd.read_hdf(fn, "/data")
        assert_eq(df, out)

    # test multiple files single node
    with tmpdir() as dn:
        fn = os.path.join(dn, "data_*.h5")
        a.to_hdf(fn, "/data", scheduler=scheduler)
        out = dd.read_hdf(fn, "/data")
        assert_eq(df, out)

    # test single file multiple nodes
    with tmpfile("h5") as fn:
        a.to_hdf(fn, "/data*", scheduler=scheduler)
        out = dd.read_hdf(fn, "/data*")
        assert_eq(df, out)

def test_to_hdf_multiple_nodes():
    pytest.importorskip("tables")
    df = pd.DataFrame(
        {"x": ["a", "b", "c", "d"], "y": [1, 2, 3, 4]},
        index=[1.0, 2.0, 3.0, 4.0],
    )
    a = dd.from_pandas(df, 2)
    df16 = pd.DataFrame(
        {
            "x": ["a", "b", "c", "d", "e", "f", "g", "h",
                  "i", "j", "k", "l", "m", "n", "o", "p"],
            "y": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16],
        },
        index=[1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0,
               9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0],
    )
    b = dd.from_pandas(df16, 16)

    # saving to multiple nodes
    with tmpfile("h5") as fn:
        a.to_hdf(fn, "/data*")
        out = dd.read_hdf(fn, "/data*")
        assert_eq(df, out)

    # saving to multiple nodes making sure order is kept
    with tmpfile("h5") as fn:
        b.to_hdf(fn, "/data*")
        out = dd.read_hdf(fn, "/data*")
        assert_eq(df16, out)

    # saving to multiple datasets with custom name_function
    with tmpfile("h5") as fn:
        a.to_hdf(fn, "/data_*", name_function=lambda i: "a" * (i + 1))
        out = dd.read_hdf(fn, "/data_*")
        assert_eq(df, out)

        out = pd.read_hdf(fn, "/data_a")
        tm.assert_frame_equal(out, df.iloc[:2])
        out = pd.read_hdf(fn, "/data_aa")
        tm.assert_frame_equal(out, df.iloc[2:])

    # test multiple nodes with hdf object
    with tmpfile("h5") as fn:
        with pd.HDFStore(fn) as hdf:
            b.to_hdf(hdf, "/data*")
        out = dd.read_hdf(fn, "/data*")
        assert_eq(df16, out)

def db():
    with tmpfile() as f:
        uri = "sqlite:///%s" % f
        df.to_sql("test", uri, index=True, if_exists="replace")
        yield uri

def tmp_db_uri():
    with tmpfile() as f:
        yield "sqlite:///%s" % f

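# db() and tmp_db_uri() above are generator-style pytest fixtures; their
# @pytest.fixture decorators are not shown in this excerpt. A
# hypothetical sketch of how a test would consume the db fixture,
# assuming the real dask API dd.read_sql_table(table, uri, index_col)
# and the 'index' column that df.to_sql(index=True) writes by default:
def test_read_back_sketch(db):
    ddf = dd.read_sql_table('test', db, index_col='index')
    assert len(ddf.compute()) > 0
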
def test_roundtrip_from_pandas(engine):
    with tmpfile() as fn:
        df = pd.DataFrame({'x': [1, 2, 3]})
        fastparquet.write(fn, df)
        ddf = dd.io.parquet.read_parquet(fn, index=False, engine=engine)
        assert_eq(df, ddf)

def test_to_hdf():
    pytest.importorskip('tables')
    df = pd.DataFrame({'x': ['a', 'b', 'c', 'd'],
                       'y': [1, 2, 3, 4]}, index=[1., 2., 3., 4.])
    a = dd.from_pandas(df, 2)

    with tmpfile('h5') as fn:
        a.to_hdf(fn, '/data')
        out = pd.read_hdf(fn, '/data')
        tm.assert_frame_equal(df, out[:])

    with tmpfile('h5') as fn:
        a.x.to_hdf(fn, '/data')
        out = pd.read_hdf(fn, '/data')
        tm.assert_series_equal(df.x, out[:])

    a = dd.from_pandas(df, 1)
    with tmpfile('h5') as fn:
        a.to_hdf(fn, '/data')
        out = pd.read_hdf(fn, '/data')
        tm.assert_frame_equal(df, out[:])

    # saving to multiple datasets
    a = dd.from_pandas(df, 2)
    with tmpfile('h5') as fn:
        a.to_hdf(fn, '/data*')
        out = dd.read_hdf(fn, '/data*')
        tm.assert_frame_equal(df, out.compute())

    # saving to multiple files
    a = dd.from_pandas(df, 2)
    with tmpdir() as dn:
        fn = os.path.join(dn, 'data_*.h5')
        a.to_hdf(fn, '/data')
        out = dd.read_hdf(fn, '/data')
        tm.assert_frame_equal(df, out.compute())

    # saving to multiple datasets with custom name_function
    a = dd.from_pandas(df, 2)
    with tmpfile('h5') as fn:
        a.to_hdf(fn, '/data_*', name_function=lambda i: 'a' * (i + 1))
        out = dd.read_hdf(fn, '/data_*')
        tm.assert_frame_equal(df, out.compute())

        out = pd.read_hdf(fn, '/data_a')
        tm.assert_frame_equal(out, df.iloc[:2])
        out = pd.read_hdf(fn, '/data_aa')
        tm.assert_frame_equal(out, df.iloc[2:])

    # saving to multiple files with custom name_function
    a = dd.from_pandas(df, 2)
    with tmpdir() as dn:
        fn = os.path.join(dn, 'data_*.h5')
        a.to_hdf(fn, '/data', name_function=lambda i: 'a' * (i + 1))
        out = dd.read_hdf(fn, '/data')
        tm.assert_frame_equal(df, out.compute())

        out = pd.read_hdf(os.path.join(dn, 'data_a.h5'), '/data')
        tm.assert_frame_equal(out, df.iloc[:2])
        out = pd.read_hdf(os.path.join(dn, 'data_aa.h5'), '/data')
        tm.assert_frame_equal(out, df.iloc[2:])

    # saving to different datasets in multiple files with custom
    # name_function
    a = dd.from_pandas(df, 2)
    with tmpdir() as dn:
        with pytest.raises(ValueError):
            fn = os.path.join(dn, 'data_*.h5')
            a.to_hdf(fn, '/data_*', name_function=lambda i: 'a' * (i + 1))

def test_to_hdf_link_optimizations():
    """Check that dask link levels are correct by calculating the depth
    of the dask graph."""
    pytest.importorskip("tables")
    df16 = pd.DataFrame(
        {
            "x": ["a", "b", "c", "d", "e", "f", "g", "h",
                  "i", "j", "k", "l", "m", "n", "o", "p"],
            "y": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16],
        },
        index=[1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0,
               9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0],
    )
    a = dd.from_pandas(df16, 16)

    # saving to multiple hdf files, no links are needed
    # expected layers: from_pandas, to_hdf, list = depth of 3
    with tmpdir() as dn:
        fn = os.path.join(dn, "data*")
        d = a.to_hdf(fn, "/data", compute=False)
        assert dependency_depth(d.dask) == 3

    # saving to a single hdf file with multiple nodes
    # all subsequent nodes depend on the first
    # expected layers: from_pandas, first to_hdf (creates file+node),
    # subsequent to_hdfs, list = 4
    with tmpfile() as fn:
        d = a.to_hdf(fn, "/data*", compute=False)
        assert dependency_depth(d.dask) == 4

    # saving to a single hdf file with a single node
    # every node depends on the previous node
    # expected layers: from_pandas, one to_hdf per partition, list
    # = 2 + npartitions
    with tmpfile() as fn:
        d = a.to_hdf(fn, "/data", compute=False)
        assert dependency_depth(d.dask) == 2 + a.npartitions