def test_to_hdf_lock_delays():
    pytest.importorskip('tables')
    df16 = pd.DataFrame({'x': ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h',
                               'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p'],
                         'y': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10,
                               11, 12, 13, 14, 15, 16]},
                        index=[1., 2., 3., 4., 5., 6., 7., 8.,
                               9., 10., 11., 12., 13., 14., 15., 16.])
    a = dd.from_pandas(df16, 16)

    # add artificial delays so that the last tasks finish first,
    # i.e. simulate tasks completing out of order
    def delayed_nop(i):
        if i[1] < 10:
            sleep(0.1 * (10 - i[1]))
        return i

    # saving to multiple hdf nodes
    with tmpfile() as fn:
        a = a.apply(delayed_nop, axis=1, columns=a.columns)
        a.to_hdf(fn, '/data*')
        out = dd.read_hdf(fn, '/data*')
        eq(df16, out)

    # saving to multiple hdf files
    # again with artificial delays so the last tasks finish first
    with tmpdir() as dn:
        fn = os.path.join(dn, 'data*')
        a = a.apply(delayed_nop, axis=1, columns=a.columns)
        a.to_hdf(fn, '/data')
        out = dd.read_hdf(fn, '/data')
        eq(df16, out)
def test_to_hdf_thread():
    pytest.importorskip('tables')
    df = pd.DataFrame({'x': ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h',
                             'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p'],
                       'y': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10,
                             11, 12, 13, 14, 15, 16]},
                      index=[1., 2., 3., 4., 5., 6., 7., 8.,
                             9., 10., 11., 12., 13., 14., 15., 16.])
    a = dd.from_pandas(df, 16)

    # test single file single node
    with tmpfile('h5') as fn:
        a.to_hdf(fn, '/data', get=dask.threaded.get)
        out = pd.read_hdf(fn, '/data')
        eq(df, out)

    # test multiple files single node
    with tmpdir() as dn:
        fn = os.path.join(dn, 'data_*.h5')
        a.to_hdf(fn, '/data', get=dask.threaded.get)
        out = dd.read_hdf(fn, '/data')
        eq(df, out)

    # test single file multiple nodes
    with tmpfile('h5') as fn:
        a.to_hdf(fn, '/data*', get=dask.threaded.get)
        out = dd.read_hdf(fn, '/data*')
        eq(df, out)
def test_to_hdf_link_optimizations():
    """Check that dask link levels are correct by measuring the depth of the dask graph."""
    pytest.importorskip('tables')
    df16 = pd.DataFrame({'x': ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h',
                               'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p'],
                         'y': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10,
                               11, 12, 13, 14, 15, 16]},
                        index=[1., 2., 3., 4., 5., 6., 7., 8.,
                               9., 10., 11., 12., 13., 14., 15., 16.])
    a = dd.from_pandas(df16, 16)

    # saving to multiple hdf files, no links are needed
    # expected layers: from_pandas, to_hdf, list = depth of 3
    with tmpdir() as dn:
        fn = os.path.join(dn, 'data*')
        d = a.to_hdf(fn, '/data', compute=False)
        assert dependency_depth(d.dask) == 3

    # saving to a single hdf file with multiple nodes
    # all subsequent nodes depend on the first
    # expected layers: from_pandas, first to_hdf (creates file + node),
    # subsequent to_hdfs, list = 4
    with tmpfile() as fn:
        d = a.to_hdf(fn, '/data*', compute=False)
        assert dependency_depth(d.dask) == 4

    # saving to a single hdf file with a single node
    # every node depends on the previous node
    # expected layers: from_pandas, to_hdf times npartitions, list = 2 + npartitions
    with tmpfile() as fn:
        d = a.to_hdf(fn, '/data', compute=False)
        assert dependency_depth(d.dask) == 2 + a.npartitions
def test_read_chunked(block):
    with tmpdir() as path:
        fn = os.path.join(path, '1.json')
        df.to_json(fn, orient='records', lines=True)
        d = dd.read_json(fn, blocksize=block, sample=10)
        assert (d.npartitions > 1) or (block > 50)
        assert_eq(d, df, check_index=False)
def test_hdf_globbing():
    pytest.importorskip('tables')
    df = pd.DataFrame({'x': ['a', 'b', 'c', 'd'], 'y': [1, 2, 3, 4]},
                      index=[1., 2., 3., 4.])

    with tmpdir() as tdir:
        df.to_hdf(os.path.join(tdir, 'one.h5'), '/foo/data', format='table')
        df.to_hdf(os.path.join(tdir, 'two.h5'), '/bar/data', format='table')
        df.to_hdf(os.path.join(tdir, 'two.h5'), '/foo/data', format='table')

        with dask.set_options(get=dask.get):
            res = dd.read_hdf(os.path.join(tdir, 'one.h5'), '/*/data',
                              chunksize=2)
            assert res.npartitions == 2
            tm.assert_frame_equal(res.compute(), df)

            res = dd.read_hdf(os.path.join(tdir, 'one.h5'), '/*/data',
                              chunksize=2, start=1, stop=3)
            expected = pd.read_hdf(os.path.join(tdir, 'one.h5'), '/foo/data',
                                   start=1, stop=3)
            tm.assert_frame_equal(res.compute(), expected)

            res = dd.read_hdf(os.path.join(tdir, 'two.h5'), '/*/data',
                              chunksize=2)
            assert res.npartitions == 2 + 2
            tm.assert_frame_equal(res.compute(), pd.concat([df] * 2))

            res = dd.read_hdf(os.path.join(tdir, '*.h5'), '/foo/data',
                              chunksize=2)
            assert res.npartitions == 2 + 2
            tm.assert_frame_equal(res.compute(), pd.concat([df] * 2))

            res = dd.read_hdf(os.path.join(tdir, '*.h5'), '/*/data',
                              chunksize=2)
            assert res.npartitions == 2 + 2 + 2
            tm.assert_frame_equal(res.compute(), pd.concat([df] * 3))
def test_to_textfiles_name_function_preserves_order():
    seq = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h',
           'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p']
    b = db.from_sequence(seq, npartitions=16)
    with tmpdir() as dn:
        b.to_textfiles(dn)
        out = (db.read_text(os.path.join(dn, "*"), encoding='ascii')
                 .map(str).map(str.strip).compute())
        assert seq == out
def random_images(n, shape):
    with tmpdir() as dirname:
        for i in range(n):
            fn = os.path.join(dirname, 'image.%d.png' % i)
            x = np.random.randint(0, 255, size=shape).astype('i1')
            imsave(fn, x)
        yield os.path.join(dirname, '*.png')
def test_to_hdf_multiple_files():
    pytest.importorskip('tables')
    df = pd.DataFrame({'x': ['a', 'b', 'c', 'd'], 'y': [1, 2, 3, 4]},
                      index=[1., 2., 3., 4.])
    a = dd.from_pandas(df, 2)
    df16 = pd.DataFrame({'x': ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h',
                               'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p'],
                         'y': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10,
                               11, 12, 13, 14, 15, 16]},
                        index=[1., 2., 3., 4., 5., 6., 7., 8.,
                               9., 10., 11., 12., 13., 14., 15., 16.])
    b = dd.from_pandas(df16, 16)

    # saving to multiple files
    with tmpdir() as dn:
        fn = os.path.join(dn, 'data_*.h5')
        a.to_hdf(fn, '/data')
        out = dd.read_hdf(fn, '/data')
        assert_eq(df, out)

    # saving to multiple files making sure order is kept
    with tmpdir() as dn:
        fn = os.path.join(dn, 'data_*.h5')
        b.to_hdf(fn, '/data')
        out = dd.read_hdf(fn, '/data')
        assert_eq(df16, out)

    # saving to multiple files with custom name_function
    with tmpdir() as dn:
        fn = os.path.join(dn, 'data_*.h5')
        a.to_hdf(fn, '/data', name_function=lambda i: 'a' * (i + 1))
        out = dd.read_hdf(fn, '/data')
        assert_eq(df, out)

        out = pd.read_hdf(os.path.join(dn, 'data_a.h5'), '/data')
        tm.assert_frame_equal(out, df.iloc[:2])
        out = pd.read_hdf(os.path.join(dn, 'data_aa.h5'), '/data')
        tm.assert_frame_equal(out, df.iloc[2:])

    # test hdf object
    with tmpfile('h5') as fn:
        with pd.HDFStore(fn) as hdf:
            a.to_hdf(hdf, '/data*')
        out = dd.read_hdf(fn, '/data*')
        assert_eq(df, out)
def test_write_json_basic(orient):
    with tmpdir() as path:
        fn = os.path.join(path, '1.json')
        df.to_json(fn, orient=orient, lines=False)
        actual = dd.read_json(fn, orient=orient, lines=False)
        out = actual.compute()
        if orient == 'values':
            out.columns = list(df.columns)
        assert_eq(out, df)
def test_to_csv_multiple_files_cornercases():
    df = pd.DataFrame({'x': ['a', 'b', 'c', 'd'], 'y': [1, 2, 3, 4]})
    a = dd.from_pandas(df, 2)
    with tmpdir() as dn:
        with pytest.raises(ValueError):
            fn = os.path.join(dn, "data_*_*.csv")
            a.to_csv(fn)

    df16 = pd.DataFrame({'x': ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h',
                               'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p'],
                         'y': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10,
                               11, 12, 13, 14, 15, 16]})
    a = dd.from_pandas(df16, 16)
    with tmpdir() as dn:
        fn = os.path.join(dn, 'data_*.csv')
        a.to_csv(fn, index=False)
        result = dd.read_csv(fn).compute().reset_index(drop=True)
        eq(result, df16)

    # test handling existing files when links are optimized out
    a = dd.from_pandas(df, 2)
    with tmpdir() as dn:
        fn = os.path.join(dn, 'data_1.csv')
        a.to_csv(fn, index=False)
        fn = os.path.join(dn, 'data_*.csv')
        a.to_csv(fn, mode='w', index=False)
        result = dd.read_csv(fn).compute().reset_index(drop=True)
        eq(result, df)

    # test handling existing files when links are optimized out
    a = dd.from_pandas(df16, 16)
    with tmpdir() as dn:
        fn = os.path.join(dn, 'data_01.csv')
        a.to_csv(fn, index=False)
        fn = os.path.join(dn, 'data_*.csv')
        a.to_csv(fn, mode='w', index=False)
        result = dd.read_csv(fn).compute().reset_index(drop=True)
        eq(result, df16)

    # test handling existing files when mode isn't 'w'
    a = dd.from_pandas(df, 2)
    with tmpdir() as dn:
        fn = os.path.join(dn, 'data_*.csv')
        with pytest.raises(ValueError):
            a.to_csv(fn, mode='a')
def test_to_csv_simple():
    df0 = pd.DataFrame({'x': ['a', 'b', 'c', 'd'], 'y': [1, 2, 3, 4]},
                       index=[1., 2., 3., 4.])
    df = dd.from_pandas(df0, npartitions=2)
    with tmpdir() as dir:
        dir = str(dir)
        df.to_csv(dir)
        assert os.listdir(dir)
        result = dd.read_csv(os.path.join(dir, '*')).compute()
    assert (result.x.values == df0.x.values).all()
def test_roundtrip(df, write_kwargs, read_kwargs):
    with tmpdir() as tmp:
        tmp = str(tmp)
        if df.index.name is None:
            df.index.name = 'index'
        ddf = dd.from_pandas(df, npartitions=2)
        to_parquet(tmp, ddf, **write_kwargs)
        ddf2 = read_parquet(tmp, index=df.index.name, **read_kwargs)
        assert_eq(ddf, ddf2)
def test_to_csv_series():
    df0 = pd.Series(['a', 'b', 'c', 'd'], index=[1., 2., 3., 4.])
    df = dd.from_pandas(df0, npartitions=2)
    with tmpdir() as dir:
        dir = str(dir)
        df.to_csv(dir, header=False)
        assert os.listdir(dir)
        result = dd.read_csv(os.path.join(dir, '*'), header=None,
                             names=['x']).compute()
    assert (result.x == df0).all()
def test_json_compressed(compression):
    if compression == 'xz' and lzma is None:
        pytest.skip(
            "LZMA not available. Please install backports.lzma on Python 2."
        )
    with tmpdir() as path:
        dd.to_json(ddf, path, compression=compression)
        actual = dd.read_json(os.path.join(path, '*'), compression=compression)
        assert_eq(df, actual.compute(), check_index=False)
def test_zarr_distributed_roundtrip(loop):
    da = pytest.importorskip('dask.array')
    zarr = pytest.importorskip('zarr')
    assert_eq = da.utils.assert_eq

    with cluster() as (s, [a, b]):
        with Client(s['address'], loop=loop) as c:
            with tmpdir() as d:
                a = da.zeros((3, 3), chunks=(1, 1))
                a.to_zarr(d)
                a2 = da.from_zarr(d)
                assert_eq(a, a2)
                assert a2.chunks == a.chunks
def test_to_hdf_modes_multiple_files():
    pytest.importorskip('tables')
    df = pd.DataFrame({'x': ['a', 'b', 'c', 'd'], 'y': [1, 2, 3, 4]},
                      index=[1., 2., 3., 4.])

    # appending a single partition to existing data
    a = dd.from_pandas(df, 1)
    with tmpdir() as dn:
        fn = os.path.join(dn, 'data*')
        a.to_hdf(os.path.join(dn, 'data2'), '/data')
        a.to_hdf(fn, '/data', mode='a')
        out = dd.read_hdf(fn, '/data*')
        eq(df.append(df), out)

    # appending two partitions to existing data
    a = dd.from_pandas(df, 2)
    with tmpdir() as dn:
        fn = os.path.join(dn, 'data*')
        a.to_hdf(os.path.join(dn, 'data2'), '/data')
        a.to_hdf(fn, '/data', mode='a')
        out = dd.read_hdf(fn, '/data')
        eq(df.append(df), out)

    # overwriting a file with two partitions
    a = dd.from_pandas(df, 2)
    with tmpdir() as dn:
        fn = os.path.join(dn, 'data*')
        a.to_hdf(os.path.join(dn, 'data1'), '/data')
        a.to_hdf(fn, '/data', mode='w')
        out = dd.read_hdf(fn, '/data')
        eq(df, out)

    # overwriting a single partition, keeping other partitions
    a = dd.from_pandas(df, 2)
    with tmpdir() as dn:
        fn = os.path.join(dn, 'data*')
        a.to_hdf(os.path.join(dn, 'data1'), '/data')
        a.to_hdf(fn, '/data', mode='a', append=False)
        out = dd.read_hdf(fn, '/data')
        eq(df.append(df), out)
def test_hdf_file_list():
    pytest.importorskip('tables')
    df = pd.DataFrame({'x': ['a', 'b', 'c', 'd'], 'y': [1, 2, 3, 4]},
                      index=[1., 2., 3., 4.])
    with tmpdir() as tdir:
        df.iloc[:2].to_hdf(os.path.join(tdir, 'one.h5'), 'dataframe',
                           format='table')
        df.iloc[2:].to_hdf(os.path.join(tdir, 'two.h5'), 'dataframe',
                           format='table')

        with dask.set_options(get=dask.get):
            input_files = [os.path.join(tdir, 'one.h5'),
                           os.path.join(tdir, 'two.h5')]
            res = dd.read_hdf(input_files, 'dataframe')
            tm.assert_frame_equal(res.compute(), df)
def test_categorical():
    with tmpdir() as tmp:
        df = pd.DataFrame({'x': ['a', 'b', 'c'] * 100}, dtype='category')
        ddf = dd.from_pandas(df, npartitions=3)
        to_parquet(tmp, ddf)

        ddf2 = read_parquet(tmp, categories=['x'])
        assert ddf2.x.cat.categories.tolist() == ['a', 'b', 'c']

        ddf2.loc[:1000].compute()
        df.index.name = 'index'  # defaults to 'index' in this case
        assert assert_eq(df, ddf2)
def test_to_csv_header_empty_dataframe(header, expected):
    dfe = pd.DataFrame({'x': [], 'y': []})
    ddfe = dd.from_pandas(dfe, npartitions=1)
    with tmpdir() as dn:
        ddfe.to_csv(os.path.join(dn, "fooe*.csv"), index=False, header=header)
        assert not os.path.exists(os.path.join(dn, "fooe1.csv"))
        filename = os.path.join(dn, 'fooe0.csv')
        with open(filename, 'r') as fp:
            line = fp.readline()
        assert line == expected
        os.remove(filename)
def test_visualize():
    pytest.importorskip('graphviz')
    with tmpdir() as d:
        x = da.arange(5, chunks=2)
        x.visualize(filename=os.path.join(d, 'mydask'))
        assert os.path.exists(os.path.join(d, 'mydask.png'))

        x.visualize(filename=os.path.join(d, 'mydask.pdf'))
        assert os.path.exists(os.path.join(d, 'mydask.pdf'))

        visualize(x, 1, 2, filename=os.path.join(d, 'mydask.png'))
        assert os.path.exists(os.path.join(d, 'mydask.png'))

        dsk = {'a': 1, 'b': (add, 'a', 2), 'c': (mul, 'a', 1)}
        visualize(x, dsk, filename=os.path.join(d, 'mydask.png'))
        assert os.path.exists(os.path.join(d, 'mydask.png'))
def test_to_csv():
    df = pd.DataFrame({'x': ['a', 'b', 'c', 'd'], 'y': [1, 2, 3, 4]})

    for npartitions in [1, 2]:
        a = dd.from_pandas(df, npartitions)
        with tmpdir() as dn:
            a.to_csv(dn, index=False)
            result = dd.read_csv(os.path.join(dn, '*')).compute().reset_index(drop=True)
            assert_eq(result, df)

        with tmpdir() as dn:
            r = a.to_csv(dn, index=False, compute=False)
            dask.compute(*r, scheduler='sync')
            result = dd.read_csv(os.path.join(dn, '*')).compute().reset_index(drop=True)
            assert_eq(result, df)

        with tmpdir() as dn:
            fn = os.path.join(dn, 'data_*.csv')
            a.to_csv(fn, index=False)
            result = dd.read_csv(fn).compute().reset_index(drop=True)
            assert_eq(result, df)
def test_to_textfiles_inputs():
    B = db.from_sequence(['abc', '123', 'xyz'], npartitions=2)
    with tmpfile() as a:
        with tmpfile() as b:
            B.to_textfiles([a, b])
            assert os.path.exists(a)
            assert os.path.exists(b)
    with tmpdir() as dirname:
        B.to_textfiles(dirname)
        assert os.path.exists(dirname)
        assert os.path.exists(os.path.join(dirname, '0.part'))
    assert raises(ValueError, lambda: B.to_textfiles(5))
def test_to_textfiles(ext, myopen):
    b = db.from_sequence(['abc', '123', 'xyz'], npartitions=2)
    with tmpdir() as dir:
        c = b.to_textfiles(os.path.join(dir, '*.' + ext), compute=False)
        dask.compute(*c, get=dask.get)
        assert os.path.exists(os.path.join(dir, '1.' + ext))

        f = myopen(os.path.join(dir, '1.' + ext), 'rb')
        text = f.read()
        if hasattr(text, 'decode'):
            text = text.decode()
        assert 'xyz' in text
        f.close()
def test_compression_multiple_files():
    with tmpdir() as tdir:
        f = gzip.open(os.path.join(tdir, 'a.csv.gz'), 'wb')
        f.write(text.encode())
        f.close()

        f = gzip.open(os.path.join(tdir, 'b.csv.gz'), 'wb')
        f.write(text.encode())
        f.close()

        df = dd.read_csv(os.path.join(tdir, '*.csv.gz'), compression='gzip')
        assert len(df.compute()) == (len(text.split('\n')) - 1) * 2
def test_append_wo_index():
    """Test append with write_index=False."""
    with tmpdir() as tmp:
        df = pd.DataFrame({'i32': np.arange(1000, dtype=np.int32),
                           'i64': np.arange(1000, dtype=np.int64),
                           'f': np.arange(1000, dtype=np.float64),
                           'bhello': np.random.choice(['hello', 'you', 'people'],
                                                      size=1000).astype("O")})
        half = len(df) // 2
        ddf1 = dd.from_pandas(df.iloc[:half], chunksize=100)
        ddf2 = dd.from_pandas(df.iloc[half:], chunksize=100)
        ddf1.to_parquet(tmp)
        with pytest.raises(ValueError) as excinfo:
            ddf2.to_parquet(tmp, write_index=False, append=True)
        assert 'Appended columns' in str(excinfo.value)

    with tmpdir() as tmp:
        ddf1.to_parquet(tmp, write_index=False)
        ddf2.to_parquet(tmp, write_index=False, append=True)
        ddf3 = read_parquet(tmp, index='f')
        assert_eq(df.set_index('f'), ddf3)
def test_to_textfiles_encoding():
    b = db.from_sequence([u'汽车', u'苹果', u'天气'], npartitions=2)
    for ext, myopen in [('gz', GzipFile), ('bz2', BZ2File), ('', open)]:
        with tmpdir() as dir:
            c = b.to_textfiles(os.path.join(dir, '*.' + ext),
                               encoding='gb18030', compute=False)
            c.compute(get=dask.get)
            assert os.path.exists(os.path.join(dir, '1.' + ext))

            f = myopen(os.path.join(dir, '1.' + ext), 'rb')
            text = f.read()
            if hasattr(text, 'decode'):
                text = text.decode('gb18030')
            assert u'天气' in text
            f.close()
def test_to_textfiles():
    b = db.from_sequence(['abc', '123', 'xyz'], npartitions=2)
    for ext, myopen in [('gz', GzipFile), ('bz2', BZ2File), ('', open)]:
        with tmpdir() as dir:
            c = b.to_textfiles(os.path.join(dir, '*.' + ext), compute=False)
            c.compute(get=dask.get)
            assert os.path.exists(os.path.join(dir, '1.' + ext))

            f = myopen(os.path.join(dir, '1.' + ext), 'rb')
            text = f.read()
            if hasattr(text, 'decode'):
                text = text.decode()
            assert 'xyz' in text
            f.close()
def test_ordering():
    with tmpdir() as tmp:
        tmp = str(tmp)
        df = pd.DataFrame({'a': [1, 2, 3], 'b': [10, 20, 30], 'c': [100, 200, 300]},
                          index=pd.Index([-1, -2, -3], name='myindex'),
                          columns=['c', 'a', 'b'])
        ddf = dd.from_pandas(df, npartitions=2)
        to_parquet(tmp, ddf)

        pf = fastparquet.ParquetFile(tmp)
        assert pf.columns == ['myindex', 'c', 'a', 'b']

        ddf2 = read_parquet(tmp, index='myindex')
        assert_eq(ddf, ddf2)
def test_to_textfiles_encoding():
    b = db.from_sequence([u'汽车', u'苹果', u'天气'], npartitions=2)
    for ext, myopen in [('gz', GzipFile), ('bz2', BZ2File), ('', open)]:
        with tmpdir() as dir:
            c = b.to_textfiles(os.path.join(dir, '*.' + ext),
                               encoding='gb18030', compute=False)
            assert c.npartitions == b.npartitions
            c.compute(get=dask.get)
            assert os.path.exists(os.path.join(dir, '1.' + ext))

            f = myopen(os.path.join(dir, '1.' + ext), 'rb')
            text = f.read()
            if hasattr(text, 'decode'):
                text = text.decode('gb18030')
            assert u'天气' in text
            f.close()
def test_to_csv_with_get():
    from dask.multiprocessing import get as mp_get
    flag = [False]

    def my_get(*args, **kwargs):
        flag[0] = True
        return mp_get(*args, **kwargs)

    df = pd.DataFrame({'x': ['a', 'b', 'c', 'd'], 'y': [1, 2, 3, 4]})
    ddf = dd.from_pandas(df, npartitions=2)

    with tmpdir() as dn:
        ddf.to_csv(dn, index=False, scheduler=my_get)
        assert flag[0]
        result = dd.read_csv(os.path.join(dn, '*')).compute().reset_index(drop=True)
        assert_eq(result, df)
def test_to_csv_line_ending():
    df = pd.DataFrame({"x": [0]})
    ddf = dd.from_pandas(df, npartitions=1)
    expected = {b"0\r\n", b"0\n"}  # either/or
    # For comparison...
    # unexpected = {b'0\r\r\n'}
    # This test addresses GH4809, and checks that only (at most) one
    # '\r' character is written per line when writing to csv.
    # In case it's correct (on UNIX) to have no '\r' at all, this test
    # considers either '\r\n' or '\n' as appropriate line endings,
    # but not '\r\r\n'.
    with tmpdir() as dn:
        ddf.to_csv(os.path.join(dn, "foo*.csv"), header=False, index=False)
        filename = os.path.join(dn, "foo0.csv")
        with open(filename, "rb") as f:
            raw = f.read()
    assert raw in expected
def test_to_json_with_get():
    from dask.multiprocessing import get as mp_get
    flag = [False]

    def my_get(*args, **kwargs):
        flag[0] = True
        return mp_get(*args, **kwargs)

    df = pd.DataFrame({"x": ["a", "b", "c", "d"], "y": [1, 2, 3, 4]})
    ddf = dd.from_pandas(df, npartitions=2)

    with tmpdir() as dn:
        ddf.to_json(dn, compute_kwargs={"scheduler": my_get})
        assert flag[0]
        result = dd.read_json(os.path.join(dn, "*"))
        assert_eq(result, df, check_index=False)
def test_to_hdf_exceptions():
    pytest.importorskip('tables')
    df = pd.DataFrame({'x': ['a', 'b', 'c', 'd'], 'y': [1, 2, 3, 4]},
                      index=[1., 2., 3., 4.])
    a = dd.from_pandas(df, 1)

    # triggering too many asterisks error
    with tmpdir() as dn:
        with pytest.raises(ValueError):
            fn = os.path.join(dn, 'data_*.h5')
            a.to_hdf(fn, '/data_*')

    # triggering too many asterisks error
    with tmpfile() as fn:
        with pd.HDFStore(fn) as hdf:
            with pytest.raises(ValueError):
                a.to_hdf(hdf, '/data_*_*')
def test_to_fmt_warns():
    pytest.importorskip('tables')
    df16 = pd.DataFrame({'x': ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h',
                               'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p'],
                         'y': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10,
                               11, 12, 13, 14, 15, 16]},
                        index=[1., 2., 3., 4., 5., 6., 7., 8.,
                               9., 10., 11., 12., 13., 14., 15., 16.])
    a = dd.from_pandas(df16, 16)

    # testing warning when breaking order
    with tmpfile('h5') as fn:
        with pytest.warns(None):
            a.to_hdf(fn, '/data*', name_function=str)

    # testing warning when breaking order
    with tmpdir() as dn:
        with pytest.warns(None):
            fn = os.path.join(dn, "data_*.csv")
            a.to_csv(fn, name_function=str)
def test_to_csv_with_get(): from dask.multiprocessing import get as mp_get flag = [False] def my_get(*args, **kwargs): flag[0] = True return mp_get(*args, **kwargs) df = pd.DataFrame({"x": ["a", "b", "c", "d"], "y": [1, 2, 3, 4]}) ddf = dd.from_pandas(df, npartitions=2) with tmpdir() as dn: ddf.to_csv(dn, index=False, scheduler=my_get) assert flag[0] result = dd.read_csv(os.path.join(dn, "*")).compute().reset_index(drop=True) assert_eq(result, df)
def test_append():
    """Test that the appended parquet data equals the original."""
    with tmpdir() as tmp:
        df = pd.DataFrame({'i32': np.arange(1000, dtype=np.int32),
                           'i64': np.arange(1000, dtype=np.int64),
                           'f': np.arange(1000, dtype=np.float64),
                           'bhello': np.random.choice(['hello', 'yo', 'people'],
                                                      size=1000).astype("O")})
        df.index.name = 'index'

        half = len(df) // 2
        ddf1 = dd.from_pandas(df.iloc[:half], chunksize=100)
        ddf2 = dd.from_pandas(df.iloc[half:], chunksize=100)
        ddf1.to_parquet(tmp)
        ddf2.to_parquet(tmp, append=True)

        ddf3 = read_parquet(tmp)
        assert_eq(df, ddf3)
def test_visualize(): pytest.importorskip("graphviz") X, y = make_classification(n_samples=100, n_classes=2, flip_y=0.2, random_state=0) clf = SVC(random_state=0, gamma="auto") grid = {"C": [0.1, 0.5, 0.9]} gs = dcv.GridSearchCV(clf, param_grid=grid).fit(X, y) assert hasattr(gs, "dask_graph_") with tmpdir() as d: gs.visualize(filename=os.path.join(d, "mydask")) assert os.path.exists(os.path.join(d, "mydask.png")) # Doesn't work if not fitted gs = dcv.GridSearchCV(clf, param_grid=grid) with pytest.raises(NotFittedError): gs.visualize()
def test_to_hdf_exceptions(): pytest.importorskip("tables") df = pd.DataFrame( {"x": ["a", "b", "c", "d"], "y": [1, 2, 3, 4]}, index=[1.0, 2.0, 3.0, 4.0] ) a = dd.from_pandas(df, 1) # triggering too many asterisks error with tmpdir() as dn: with pytest.raises(ValueError): fn = os.path.join(dn, "data_*.h5") a.to_hdf(fn, "/data_*") # triggering too many asterisks error with tmpfile() as fn: with pd.HDFStore(fn) as hdf: with pytest.raises(ValueError): a.to_hdf(hdf, "/data_*_*")
def test_to_textfiles_encoding(): b = db.from_sequence([u"汽车", u"苹果", u"天气"], npartitions=2) for ext, myopen in [("gz", GzipFile), ("bz2", BZ2File), ("", open)]: if ext == "bz2" and PY2: continue with tmpdir() as dir: c = b.to_textfiles( os.path.join(dir, "*." + ext), encoding="gb18030", compute=False ) dask.compute(*c) assert os.path.exists(os.path.join(dir, "1." + ext)) f = myopen(os.path.join(dir, "1." + ext), "rb") text = f.read() if hasattr(text, "decode"): text = text.decode("gb18030") assert u"天气" in text f.close()
def test_visualize():
    pytest.importorskip('graphviz')
    X, y = make_classification(n_samples=100, n_classes=2, flip_y=.2, random_state=0)
    clf = SVC(random_state=0)
    grid = {'C': [.1, .5, .9]}
    gs = dcv.GridSearchCV(clf, grid).fit(X, y)
    assert hasattr(gs, 'dask_graph_')

    with tmpdir() as d:
        gs.visualize(filename=os.path.join(d, 'mydask'))
        assert os.path.exists(os.path.join(d, 'mydask.png'))

    # Doesn't work if not fitted
    gs = dcv.GridSearchCV(clf, grid)
    with pytest.raises(NotFittedError):
        gs.visualize()
def test_ordering():
    with tmpdir() as tmp:
        tmp = str(tmp)
        df = pd.DataFrame({'a': [1, 2, 3], 'b': [10, 20, 30], 'c': [100, 200, 300]},
                          index=pd.Index([-1, -2, -3], name='myindex'),
                          columns=['c', 'a', 'b'])
        ddf = dd.from_pandas(df, npartitions=2)
        to_parquet(tmp, ddf)

        pf = fastparquet.ParquetFile(tmp)
        assert pf.columns == ['myindex', 'c', 'a', 'b']

        ddf2 = read_parquet(tmp, index='myindex')
        assert_eq(ddf, ddf2)
def test_append_with_partition():
    with tmpdir() as tmp:
        df0 = pd.DataFrame({'lat': np.arange(0, 10),
                            'lon': np.arange(10, 20),
                            'value': np.arange(100, 110)})
        df0.index.name = 'index'
        df1 = pd.DataFrame({'lat': np.arange(10, 20),
                            'lon': np.arange(10, 20),
                            'value': np.arange(120, 130)})
        df1.index.name = 'index'

        dd_df0 = dd.from_pandas(df0, npartitions=1)
        dd_df1 = dd.from_pandas(df1, npartitions=1)
        dd.to_parquet(tmp, dd_df0, partition_on=['lon'])
        dd.to_parquet(tmp, dd_df1, partition_on=['lon'], append=True,
                      ignore_divisions=True)

        out = dd.read_parquet(tmp).compute()
        out['lon'] = out.lon.astype('int64')  # just to pass assert
        # sort required since partitioning breaks index order
        assert_eq(out.sort_values('value'), pd.concat([df0, df1])[out.columns],
                  check_index=False)
def test_visualize():
    pytest.importorskip('graphviz')
    with tmpdir() as d:
        x = da.arange(5, chunks=2)
        x.visualize(filename=os.path.join(d, 'mydask'))
        assert os.path.exists(os.path.join(d, 'mydask.png'))

        x.visualize(filename=os.path.join(d, 'mydask.pdf'))
        assert os.path.exists(os.path.join(d, 'mydask.pdf'))

        visualize(x, 1, 2, filename=os.path.join(d, 'mydask.png'))
        assert os.path.exists(os.path.join(d, 'mydask.png'))

        dsk = {'a': 1, 'b': (add, 'a', 2), 'c': (mul, 'a', 1)}
        visualize(x, dsk, filename=os.path.join(d, 'mydask.png'))
        assert os.path.exists(os.path.join(d, 'mydask.png'))

        x = Tuple(dsk, ['a', 'b', 'c'])
        visualize(x, filename=os.path.join(d, 'mydask.png'))
        assert os.path.exists(os.path.join(d, 'mydask.png'))
def test_visualize(): pytest.importorskip("graphviz") with tmpdir() as d: x = da.arange(5, chunks=2) x.visualize(filename=os.path.join(d, "mydask")) assert os.path.exists(os.path.join(d, "mydask.png")) x.visualize(filename=os.path.join(d, "mydask.pdf")) assert os.path.exists(os.path.join(d, "mydask.pdf")) visualize(x, 1, 2, filename=os.path.join(d, "mydask.png")) assert os.path.exists(os.path.join(d, "mydask.png")) dsk = {"a": 1, "b": (add, "a", 2), "c": (mul, "a", 1)} visualize(x, dsk, filename=os.path.join(d, "mydask.png")) assert os.path.exists(os.path.join(d, "mydask.png")) x = Tuple(dsk, ["a", "b", "c"]) visualize(x, filename=os.path.join(d, "mydask.png")) assert os.path.exists(os.path.join(d, "mydask.png"))
def test_append_different_columns():
    """Test that an error is raised when appended columns do not match."""
    with tmpdir() as tmp:
        df1 = pd.DataFrame({'i32': np.arange(100, dtype=np.int32)})
        df2 = pd.DataFrame({'i64': np.arange(100, dtype=np.int64)})
        df3 = pd.DataFrame({'i32': np.arange(100, dtype=np.int64)})

        ddf1 = dd.from_pandas(df1, chunksize=2)
        ddf2 = dd.from_pandas(df2, chunksize=2)
        ddf3 = dd.from_pandas(df3, chunksize=2)
        ddf1.to_parquet(tmp)

        with pytest.raises(ValueError) as excinfo:
            ddf2.to_parquet(tmp, append=True)
        assert 'Appended columns' in str(excinfo.value)

        with pytest.raises(ValueError) as excinfo:
            ddf3.to_parquet(tmp, append=True)
        assert 'Appended dtypes' in str(excinfo.value)
def dir_server(): with tmpdir() as d: for fn in files: with open(os.path.join(d, fn), "wb") as f: f.write(b"a" * 10000) cmd = [sys.executable, "-m", "http.server", "8999"] p = subprocess.Popen(cmd, cwd=d) timeout = 10 while True: try: requests.get("http://localhost:8999") break except requests.exceptions.ConnectionError as e: time.sleep(0.1) timeout -= 0.1 if timeout < 0: raise RuntimeError("Server did not appear") from e yield d p.terminate()
def test_append_overlapping_divisions():
    """Test that an error is raised when appended divisions overlap."""
    with tmpdir() as tmp:
        df = pd.DataFrame({'i32': np.arange(1000, dtype=np.int32),
                           'i64': np.arange(1000, dtype=np.int64),
                           'f': np.arange(1000, dtype=np.float64),
                           'bhello': np.random.choice(['hello', 'yo', 'people'],
                                                      size=1000).astype("O")})
        half = len(df) // 2
        ddf1 = dd.from_pandas(df.iloc[:half], chunksize=100)
        ddf2 = dd.from_pandas(df.iloc[half - 10:], chunksize=100)
        ddf1.to_parquet(tmp)

        with pytest.raises(ValueError) as excinfo:
            ddf2.to_parquet(tmp, append=True)
        assert 'Appended divisions' in str(excinfo.value)

        ddf2.to_parquet(tmp, append=True, ignore_divisions=True)
def test_visualize(): pytest.importorskip("graphviz") pytest.importorskip("ipycytoscape") with tmpdir() as d: x = da.arange(5, chunks=2) x.visualize(filename=os.path.join(d, "mydask")) assert os.path.exists(os.path.join(d, "mydask.png")) x.visualize(filename=os.path.join(d, "mydask.pdf")) assert os.path.exists(os.path.join(d, "mydask.pdf")) visualize(x, 1, 2, filename=os.path.join(d, "mydask.png")) assert os.path.exists(os.path.join(d, "mydask.png")) dsk = {"a": 1, "b": (add, "a", 2), "c": (mul, "a", 1)} visualize(x, dsk, filename=os.path.join(d, "mydask.png")) assert os.path.exists(os.path.join(d, "mydask.png")) x = Tuple(dsk, ["a", "b", "c"]) visualize(x, filename=os.path.join(d, "mydask.png")) assert os.path.exists(os.path.join(d, "mydask.png")) x = Tuple(dsk, ["a", "b", "c"]) visualize(x, filename=os.path.join(d, "cyt"), engine="cytoscape") assert os.path.exists(os.path.join(d, "cyt.html")) visualize(x, filename=os.path.join(d, "cyt2.html"), engine="ipycytoscape") assert os.path.exists(os.path.join(d, "cyt2.html")) with dask.config.set(visualization__engine="cytoscape"): visualize(x, filename=os.path.join(d, "cyt3.html")) assert os.path.exists(os.path.join(d, "cyt3.html")) with pytest.raises(ValueError, match="not-real"): visualize(x, engine="not-real") # To see if visualize() works when the filename parameter is set to None # If the function raises an error, the test will fail x.visualize(filename=None)
def test_to_csv():
    df = pd.DataFrame({'x': ['a', 'b', 'c', 'd'], 'y': [1, 2, 3, 4]})

    for npartitions in [1, 2]:
        a = dd.from_pandas(df, npartitions)
        with tmpfile('csv') as fn:
            a.to_csv(fn, index=False)
            result = dd.read_csv(fn).compute().reset_index(drop=True)
            eq(result, df)

        with tmpfile('csv') as fn:
            r = a.to_csv(fn, index=False, compute=False)
            r.compute()
            result = dd.read_csv(fn).compute().reset_index(drop=True)
            eq(result, df)

        with tmpdir() as dn:
            fn = os.path.join(dn, 'data_*.csv')
            a.to_csv(fn, index=False)
            result = dd.read_csv(fn).compute().reset_index(drop=True)
            eq(result, df)
def test_reading_empty_csv_files_with_path():
    with tmpdir() as tdir:
        for k, content in enumerate(["0, 1, 2", "", "6, 7, 8"]):
            with open(os.path.join(tdir, str(k) + ".csv"), "w") as file:
                file.write(content)
        result = dd.read_csv(
            os.path.join(tdir, "*.csv"),
            include_path_column=True,
            converters={"path": parse_filename},
            names=["A", "B", "C"],
        ).compute()
        df = pd.DataFrame(
            {
                "A": [0, 6],
                "B": [1, 7],
                "C": [2, 8],
                "path": ["0.csv", "2.csv"],
            }
        )
        df["path"] = df["path"].astype("category")
        assert_eq(result, df, check_index=False)
def test_to_csv_header(header, header_first_partition_only,
                       expected_first, expected_next):
    partition_count = 2
    df = pd.DataFrame({'x': ['a', 'b', 'c', 'd', 'e', 'f'],
                       'y': [1, 2, 3, 4, 5, 6]})
    ddf = dd.from_pandas(df, npartitions=partition_count)

    with tmpdir() as dn:
        # Test the no-header case
        # (header=False, header_first_partition_only not passed)
        ddf.to_csv(os.path.join(dn, "fooa*.csv"), index=False, header=header,
                   header_first_partition_only=header_first_partition_only)

        filename = os.path.join(dn, 'fooa0.csv')
        with open(filename, 'r') as fp:
            line = fp.readline()
        assert line == expected_first
        os.remove(filename)

        filename = os.path.join(dn, 'fooa1.csv')
        with open(filename, 'r') as fp:
            line = fp.readline()
        assert line == expected_next
        os.remove(filename)
def test_categorical():
    with tmpdir() as tmp:
        df = pd.DataFrame({'x': ['a', 'b', 'c'] * 100}, dtype='category')
        ddf = dd.from_pandas(df, npartitions=3)
        to_parquet(tmp, ddf)

        ddf2 = read_parquet(tmp, categories=['x'])
        assert ddf2.compute().x.cat.categories.tolist() == ['a', 'b', 'c']

        # autocat
        ddf2 = read_parquet(tmp)
        assert ddf2.compute().x.cat.categories.tolist() == ['a', 'b', 'c']

        ddf2.loc[:1000].compute()
        df.index.name = 'index'  # defaults to 'index' in this case
        assert assert_eq(df, ddf2)

        # dereference cats
        ddf2 = read_parquet(tmp, categories=[])
        ddf2.loc[:1000].compute()
        assert (df.x == ddf2.x).all()
def test_to_hdf_lock_delays():
    pytest.importorskip('tables')
    df16 = pd.DataFrame({'x': ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h',
                               'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p'],
                         'y': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10,
                               11, 12, 13, 14, 15, 16]},
                        index=[1., 2., 3., 4., 5., 6., 7., 8.,
                               9., 10., 11., 12., 13., 14., 15., 16.])
    a = dd.from_pandas(df16, 16)

    # add artificial delays so that the last tasks finish first,
    # i.e. simulate tasks completing out of order
    def delayed_nop(i):
        if i[1] < 10:
            sleep(0.1 * (10 - i[1]))
        return i

    # saving to multiple hdf nodes
    with tmpfile() as fn:
        a = a.apply(delayed_nop, axis=1, columns=a.columns)
        a.to_hdf(fn, '/data*')
        out = dd.read_hdf(fn, '/data*')
        assert_eq(df16, out)

    # saving to multiple hdf files
    # again with artificial delays so the last tasks finish first
    with tmpdir() as dn:
        fn = os.path.join(dn, 'data*')
        a = a.apply(delayed_nop, axis=1, columns=a.columns)
        a.to_hdf(fn, '/data')
        out = dd.read_hdf(fn, '/data')
        assert_eq(df16, out)
def dir_server():
    with tmpdir() as d:
        for fn in files:
            with open(os.path.join(d, fn), 'wb') as f:
                f.write(b'a' * 10000)

        if PY2:
            cmd = [sys.executable, '-m', 'SimpleHTTPServer', '8999']
        else:
            cmd = [sys.executable, '-m', 'http.server', '8999']
        p = subprocess.Popen(cmd, cwd=d)
        timeout = 10
        while True:
            try:
                requests.get('http://localhost:8999')
                break
            except requests.exceptions.ConnectionError:
                time.sleep(0.1)
                timeout -= 0.1
                if timeout < 0:
                    raise RuntimeError('Server did not appear')
        yield d
        p.terminate()
def test_local():
    with tmpdir() as tmp:
        tmp = str(tmp)
        data = pd.DataFrame({'i32': np.arange(1000, dtype=np.int32),
                             'i64': np.arange(1000, dtype=np.int64),
                             'f': np.arange(1000, dtype=np.float64),
                             'bhello': np.random.choice(['hello', 'you', 'people'],
                                                        size=1000).astype("O")})
        df = dd.from_pandas(data, chunksize=500)

        to_parquet(tmp, df, write_index=False, object_encoding='utf8')

        files = os.listdir(tmp)
        assert '_metadata' in files
        assert 'part.0.parquet' in files

        df2 = read_parquet(tmp, index=False)
        assert len(df2.divisions) > 1

        out = df2.compute(get=dask.get).reset_index()
        for column in df.columns:
            assert (data[column] == out[column]).all()
def test_hdf_file_list(): pytest.importorskip("tables") df = pd.DataFrame({ "x": ["a", "b", "c", "d"], "y": [1, 2, 3, 4] }, index=[1.0, 2.0, 3.0, 4.0]) with tmpdir() as tdir: df.iloc[:2].to_hdf(os.path.join(tdir, "one.h5"), "dataframe", format="table") df.iloc[2:].to_hdf(os.path.join(tdir, "two.h5"), "dataframe", format="table") with dask.config.set(scheduler="sync"): input_files = [ os.path.join(tdir, "one.h5"), os.path.join(tdir, "two.h5") ] res = dd.read_hdf(input_files, "dataframe") tm.assert_frame_equal(res.compute(), df)
def test_hdf_file_list():
    pytest.importorskip('tables')
    df = pd.DataFrame({'x': ['a', 'b', 'c', 'd'], 'y': [1, 2, 3, 4]},
                      index=[1., 2., 3., 4.])
    with tmpdir() as tdir:
        df.iloc[:2].to_hdf(os.path.join(tdir, 'one.h5'), 'dataframe',
                           format='table')
        df.iloc[2:].to_hdf(os.path.join(tdir, 'two.h5'), 'dataframe',
                           format='table')

        with dask.set_options(get=dask.get):
            input_files = [os.path.join(tdir, 'one.h5'),
                           os.path.join(tdir, 'two.h5')]
            res = dd.read_hdf(input_files, 'dataframe')
            tm.assert_frame_equal(res.compute(), df)
def test_to_textfiles_name_function_warn():
    seq = ["a", "b", "c", "d", "e", "f", "g", "h",
           "i", "j", "k", "l", "m", "n", "o", "p"]
    a = db.from_sequence(seq, npartitions=16)
    with tmpdir() as dn:
        with pytest.warns(None):
            a.to_textfiles(dn, name_function=str)
def test_to_hdf_link_optimizations():
    """Check that dask link levels are correct by measuring the depth of the dask graph."""
    pytest.importorskip('tables')
    df16 = pd.DataFrame({'x': ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h',
                               'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p'],
                         'y': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10,
                               11, 12, 13, 14, 15, 16]},
                        index=[1., 2., 3., 4., 5., 6., 7., 8.,
                               9., 10., 11., 12., 13., 14., 15., 16.])
    a = dd.from_pandas(df16, 16)

    # saving to multiple hdf files, no links are needed
    # expected layers: from_pandas, to_hdf, list = depth of 3
    with tmpdir() as dn:
        fn = os.path.join(dn, 'data*')
        d = a.to_hdf(fn, '/data', compute=False)
        assert dependency_depth(d.dask) == 3

    # saving to a single hdf file with multiple nodes
    # all subsequent nodes depend on the first
    # expected layers: from_pandas, first to_hdf (creates file + node),
    # subsequent to_hdfs, list = 4
    with tmpfile() as fn:
        d = a.to_hdf(fn, '/data*', compute=False)
        assert dependency_depth(d.dask) == 4

    # saving to a single hdf file with a single node
    # every node depends on the previous node
    # expected layers: from_pandas, to_hdf times npartitions, list = 2 + npartitions
    with tmpfile() as fn:
        d = a.to_hdf(fn, '/data', compute=False)
        assert dependency_depth(d.dask) == 2 + a.npartitions