Example #1
def test_to_hdf_lock_delays():
    pytest.importorskip('tables')
    df16 = pd.DataFrame({'x': ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p'],
                       'y': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]},
                            index=[1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16.])
    a = dd.from_pandas(df16, 16)

    # add artificial delays so the last tasks finish first
    # (i.e. simulate the first tasks finishing last)
    def delayed_nop(i):
        if i[1] < 10:
            sleep(0.1*(10-i[1]))
        return i

    # saving to multiple hdf nodes
    with tmpfile() as fn:
        a = a.apply(delayed_nop, axis=1, columns=a.columns)
        a.to_hdf(fn, '/data*')
        out = dd.read_hdf(fn, '/data*')
        eq(df16, out)

    # saving to multiple hdf files
    # adding artificial delays to make sure the last tasks finish first
    with tmpdir() as dn:
        fn = os.path.join(dn, 'data*')
        a = a.apply(delayed_nop, axis=1, columns=a.columns)
        a.to_hdf(fn, '/data')
        out = dd.read_hdf(fn, '/data')
        eq(df16, out)
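These tests lean on dask's `tmpfile` and `tmpdir` test helpers. A minimal sketch of what the examples assume (the real context managers in dask.utils handle cleanup more carefully):

import os
import shutil
import tempfile
from contextlib import contextmanager

@contextmanager
def tmpfile(extension=''):
    # yield a path to a temporary file that does not exist yet,
    # removing whatever the test wrote there on exit
    extension = '.' + extension.lstrip('.') if extension else ''
    handle, filename = tempfile.mkstemp(extension)
    os.close(handle)
    os.remove(filename)
    try:
        yield filename
    finally:
        if os.path.exists(filename):
            os.remove(filename)

@contextmanager
def tmpdir():
    # yield a fresh temporary directory, removing it on exit
    dirname = tempfile.mkdtemp()
    try:
        yield dirname
    finally:
        shutil.rmtree(dirname, ignore_errors=True)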
Example #2
def test_to_hdf_thread():
    pytest.importorskip('tables')
    df = pd.DataFrame({'x': ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p'],
                       'y': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]},
                            index=[1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16.])
    a = dd.from_pandas(df, 16)

    # test single file single node
    with tmpfile('h5') as fn:
        a.to_hdf(fn, '/data', get=dask.threaded.get)
        out = pd.read_hdf(fn, '/data')
        eq(df, out)

    # test multiple files single node
    with tmpdir() as dn:
        fn = os.path.join(dn, 'data_*.h5')
        a.to_hdf(fn, '/data', get=dask.threaded.get)
        out = dd.read_hdf(fn, '/data')
        eq(df, out)

    # test single file multiple nodes
    with tmpfile('h5') as fn:
        a.to_hdf(fn, '/data*', get=dask.threaded.get)
        out = dd.read_hdf(fn, '/data*')
        eq(df, out)
Example #3
def test_to_hdf_link_optimizations():
    """testing dask link levels is correct by calculating the depth of the dask graph"""
    pytest.importorskip('tables')
    df16 = pd.DataFrame({'x': ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p'],
                       'y': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]},
                            index=[1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16.])
    a = dd.from_pandas(df16, 16)

    # saving to multiple hdf files, no links are needed
    # expected layers: from_pandas, to_hdf, list = depth of 3
    with tmpdir() as dn:
        fn = os.path.join(dn, 'data*')
        d = a.to_hdf(fn, '/data', compute=False)
        assert dependency_depth(d.dask) == 3

    # saving to a single hdf file with multiple nodes
    # all subsequent nodes depend on the first
    # expected layers: from_pandas, first to_hdf(creates file+node), subsequent to_hdfs, list = 4
    with tmpfile() as fn:
        d = a.to_hdf(fn, '/data*', compute=False)
        assert dependency_depth(d.dask) == 4

    # saving to a single hdf file with a single node
    # every node depends on the previous node
    # expected layers: from_pandas, to_hdf times npartitions (16), list = 2 + npartitions = 18
    with tmpfile() as fn:
        d = a.to_hdf(fn, '/data', compute=False)
        assert dependency_depth(d.dask) == 2 + a.npartitions
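`dependency_depth` is a helper defined in dask's HDF test module, not a public API. A sketch of the idea, assuming dask.core.get_deps to extract each task's dependencies:

from dask.core import get_deps

def dependency_depth(dsk):
    # length of the longest dependency chain in a dask graph
    deps, _ = get_deps(dsk)
    depths = {}

    def depth(key):
        if key not in depths:
            depths[key] = 1 + max((depth(d) for d in deps[key]), default=0)
        return depths[key]

    return max(depth(key) for key in dsk)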
Example #4
def test_read_chunked(block):
    with tmpdir() as path:
        fn = os.path.join(path, '1.json')
        df.to_json(fn, orient='records', lines=True)
        d = dd.read_json(fn, blocksize=block, sample=10)
        assert (d.npartitions > 1) or (block > 50)
        assert_eq(d, df, check_index=False)
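The `df` used here (and the `block` argument, via pytest parametrization) are module-level definitions of the JSON test file that the snippet does not show. A hypothetical stand-in for the shared frame:

import pandas as pd
import dask.dataframe as dd

# assumed module-level test data; the exact values in the real test
# module may differ
df = pd.DataFrame({'x': ['a', 'b', 'c', 'd'], 'y': [1, 2, 3, 4]})
ddf = dd.from_pandas(df, npartitions=2)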
Example #5
def test_hdf_globbing():
    pytest.importorskip('tables')
    df = pd.DataFrame({'x': ['a', 'b', 'c', 'd'],
                       'y': [1, 2, 3, 4]}, index=[1., 2., 3., 4.])

    with tmpdir() as tdir:
        df.to_hdf(os.path.join(tdir, 'one.h5'), '/foo/data', format='table')
        df.to_hdf(os.path.join(tdir, 'two.h5'), '/bar/data', format='table')
        df.to_hdf(os.path.join(tdir, 'two.h5'), '/foo/data', format='table')

        with dask.set_options(get=dask.get):
            res = dd.read_hdf(os.path.join(tdir, 'one.h5'), '/*/data',
                              chunksize=2)
            assert res.npartitions == 2
            tm.assert_frame_equal(res.compute(), df)

            res = dd.read_hdf(os.path.join(tdir, 'one.h5'), '/*/data',
                              chunksize=2, start=1, stop=3)
            expected = pd.read_hdf(os.path.join(tdir, 'one.h5'), '/foo/data',
                                   start=1, stop=3)
            tm.assert_frame_equal(res.compute(), expected)

            res = dd.read_hdf(os.path.join(tdir, 'two.h5'), '/*/data', chunksize=2)
            assert res.npartitions == 2 + 2
            tm.assert_frame_equal(res.compute(), pd.concat([df] * 2))

            res = dd.read_hdf(os.path.join(tdir, '*.h5'), '/foo/data', chunksize=2)
            assert res.npartitions == 2 + 2
            tm.assert_frame_equal(res.compute(), pd.concat([df] * 2))

            res = dd.read_hdf(os.path.join(tdir, '*.h5'), '/*/data', chunksize=2)
            assert res.npartitions == 2 + 2 + 2
            tm.assert_frame_equal(res.compute(), pd.concat([df] * 3))
Example #6
def test_to_textfiles_name_function_preserves_order():
    seq = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p']
    b = db.from_sequence(seq, npartitions=16)
    with tmpdir() as dn:
        b.to_textfiles(dn)

        out = db.read_text(os.path.join(dn, "*"), encoding='ascii').map(str).map(str.strip).compute()
        assert seq == out
Example #7
def random_images(n, shape):
    with tmpdir() as dirname:
        for i in range(n):
            fn = os.path.join(dirname, 'image.%d.png' % i)
            x = np.random.randint(0, 255, size=shape).astype('i1')
            imsave(fn, x)

        yield os.path.join(dirname, '*.png')
Example #8
def test_to_hdf_multiple_files():
    pytest.importorskip('tables')
    df = pd.DataFrame({'x': ['a', 'b', 'c', 'd'],
                       'y': [1, 2, 3, 4]}, index=[1., 2., 3., 4.])
    a = dd.from_pandas(df, 2)
    df16 = pd.DataFrame({'x': ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i',
                               'j', 'k', 'l', 'm', 'n', 'o', 'p'],
                         'y': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
                               15, 16]},
                        index=[1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11.,
                               12., 13., 14., 15., 16.])
    b = dd.from_pandas(df16, 16)

    # saving to multiple files
    with tmpdir() as dn:
        fn = os.path.join(dn, 'data_*.h5')
        a.to_hdf(fn, '/data')
        out = dd.read_hdf(fn, '/data')
        assert_eq(df, out)

    # saving to multiple files making sure order is kept
    with tmpdir() as dn:
        fn = os.path.join(dn, 'data_*.h5')
        b.to_hdf(fn, '/data')
        out = dd.read_hdf(fn, '/data')
        assert_eq(df16, out)

    # saving to multiple files with custom name_function
    with tmpdir() as dn:
        fn = os.path.join(dn, 'data_*.h5')
        a.to_hdf(fn, '/data', name_function=lambda i: 'a' * (i + 1))
        out = dd.read_hdf(fn, '/data')
        assert_eq(df, out)

        out = pd.read_hdf(os.path.join(dn, 'data_a.h5'), '/data')
        tm.assert_frame_equal(out, df.iloc[:2])
        out = pd.read_hdf(os.path.join(dn, 'data_aa.h5'), '/data')
        tm.assert_frame_equal(out, df.iloc[2:])

    # test hdf object
    with tmpfile('h5') as fn:
        with pd.HDFStore(fn) as hdf:
            a.to_hdf(hdf, '/data*')
            out = dd.read_hdf(fn, '/data*')
            assert_eq(df, out)
Example #9
def test_write_json_basic(orient):
    with tmpdir() as path:
        fn = os.path.join(path, '1.json')
        df.to_json(fn, orient=orient, lines=False)
        actual = dd.read_json(fn, orient=orient, lines=False)
        out = actual.compute()
        if orient == 'values':
            out.columns = list(df.columns)
        assert_eq(out, df)
Example #10
def test_to_csv_multiple_files_cornercases():
    df = pd.DataFrame({'x': ['a', 'b', 'c', 'd'],
                       'y': [1, 2, 3, 4]})
    a = dd.from_pandas(df, 2)
    with tmpdir() as dn:
        with pytest.raises(ValueError):
            fn = os.path.join(dn, "data_*_*.csv")
            a.to_csv(fn)

    df16 = pd.DataFrame({'x': ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p'],
                       'y': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]})
    a = dd.from_pandas(df16, 16)
    with tmpdir() as dn:
        fn = os.path.join(dn, 'data_*.csv')
        a.to_csv(fn, index=False)
        result = dd.read_csv(fn).compute().reset_index(drop=True)
        eq(result, df16)

    # test handling existing files when links are optimized out
    a = dd.from_pandas(df, 2)
    with tmpdir() as dn:
        fn = os.path.join(dn, 'data_1.csv')
        a.to_csv(fn, index=False)
        fn = os.path.join(dn, 'data_*.csv')
        a.to_csv(fn, mode='w', index=False)
        result = dd.read_csv(fn).compute().reset_index(drop=True)
        eq(result, df)

    # test handling existing files when links are optimized out
    a = dd.from_pandas(df16, 16)
    with tmpdir() as dn:
        fn = os.path.join(dn, 'data_01.csv')
        a.to_csv(fn, index=False)
        fn = os.path.join(dn, 'data_*.csv')
        a.to_csv(fn, mode='w', index=False)
        result = dd.read_csv(fn).compute().reset_index(drop=True)
        eq(result, df16)

    # test handling existing files when mode isn't 'w'
    a = dd.from_pandas(df, 2)
    with tmpdir() as dn:
        fn = os.path.join(dn, 'data_*.csv')
        with pytest.raises(ValueError):
            a.to_csv(fn, mode='a')
Example #11
def test_to_csv_simple():
    df0 = pd.DataFrame({'x': ['a', 'b', 'c', 'd'],
                       'y': [1, 2, 3, 4]}, index=[1., 2., 3., 4.])
    df = dd.from_pandas(df0, npartitions=2)
    with tmpdir() as dir:
        dir = str(dir)
        df.to_csv(dir)
        assert os.listdir(dir)
        result = dd.read_csv(os.path.join(dir, '*')).compute()
    assert (result.x.values == df0.x.values).all()
Example #12
def test_roundtrip(df, write_kwargs, read_kwargs):
    with tmpdir() as tmp:
        tmp = str(tmp)
        if df.index.name is None:
            df.index.name = 'index'
        ddf = dd.from_pandas(df, npartitions=2)

        to_parquet(tmp, ddf, **write_kwargs)
        ddf2 = read_parquet(tmp, index=df.index.name, **read_kwargs)
        assert_eq(ddf, ddf2)
Example #13
def test_to_csv_series():
    df0 = pd.Series(['a', 'b', 'c', 'd'], index=[1., 2., 3., 4.])
    df = dd.from_pandas(df0, npartitions=2)
    with tmpdir() as dir:
        dir = str(dir)
        df.to_csv(dir, header=False)
        assert os.listdir(dir)
        result = dd.read_csv(os.path.join(dir, '*'), header=None,
                             names=['x']).compute()
    assert (result.x == df0).all()
Example #14
def test_json_compressed(compression):
    if compression == 'xz' and lzma is None:
        pytest.skip(
            "LZMA not available. Please install backports.lzma on Python 2."
        )

    with tmpdir() as path:
        dd.to_json(ddf, path, compression=compression)
        actual = dd.read_json(os.path.join(path, '*'),
                              compression=compression)
        assert_eq(df, actual.compute(), check_index=False)
Example #15
def test_zarr_distributed_roundtrip(loop):
    da = pytest.importorskip('dask.array')
    zarr = pytest.importorskip('zarr')
    assert_eq = da.utils.assert_eq
    with cluster() as (s, [a, b]):
        with Client(s['address'], loop=loop) as c:
            with tmpdir() as d:
                a = da.zeros((3, 3), chunks=(1, 1))
                a.to_zarr(d)
                a2 = da.from_zarr(d)
                assert_eq(a, a2)
                assert a2.chunks == a.chunks
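`cluster`, `Client`, and the `loop` fixture in this example come from the distributed package; a likely set of imports, stated here as an assumption:

# assumed imports for the distributed round-trip test
from distributed import Client
from distributed.utils_test import cluster, loop  # noqa: F401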
Example #16
def test_to_hdf_modes_multiple_files():
    pytest.importorskip('tables')
    df = pd.DataFrame({'x': ['a', 'b', 'c', 'd'],
                       'y': [1, 2, 3, 4]}, index=[1., 2., 3., 4.])

    # appending a single partition to existing data
    a = dd.from_pandas(df, 1)
    with tmpdir() as dn:
        fn = os.path.join(dn, 'data*')
        a.to_hdf(os.path.join(dn, 'data2'), '/data')
        a.to_hdf(fn, '/data', mode='a')
        out = dd.read_hdf(fn, '/data*')
        eq(df.append(df), out)

    # appending two partitions to existing data
    a = dd.from_pandas(df, 2)
    with tmpdir() as dn:
        fn = os.path.join(dn, 'data*')
        a.to_hdf(os.path.join(dn, 'data2'), '/data')
        a.to_hdf(fn, '/data', mode='a')
        out = dd.read_hdf(fn, '/data')
        eq(df.append(df), out)

    # overwriting a file with two partitions
    a = dd.from_pandas(df, 2)
    with tmpdir() as dn:
        fn = os.path.join(dn, 'data*')
        a.to_hdf(os.path.join(dn, 'data1'), '/data')
        a.to_hdf(fn, '/data', mode='w')
        out = dd.read_hdf(fn, '/data')
        eq(df, out)

    # overwriting a single partition, keeping other partitions
    a = dd.from_pandas(df, 2)
    with tmpdir() as dn:
        fn = os.path.join(dn, 'data*')
        a.to_hdf(os.path.join(dn, 'data1'), '/data')
        a.to_hdf(fn, '/data', mode='a', append=False)
        out = dd.read_hdf(fn, '/data')
        eq(df.append(df), out)
Example #17
def test_hdf_file_list():
    pytest.importorskip('tables')
    df = pd.DataFrame({'x': ['a', 'b', 'c', 'd'],
                       'y': [1, 2, 3, 4]}, index=[1., 2., 3., 4.])

    with tmpdir() as tdir:
        df.iloc[:2].to_hdf(os.path.join(tdir, 'one.h5'), 'dataframe', format='table')
        df.iloc[2:].to_hdf(os.path.join(tdir, 'two.h5'), 'dataframe', format='table')

        with dask.set_options(get=dask.get):
            input_files = [os.path.join(tdir, 'one.h5'), os.path.join(tdir, 'two.h5')]
            res = dd.read_hdf(input_files, 'dataframe')
            tm.assert_frame_equal(res.compute(), df)
Example #18
def test_categorical():
    with tmpdir() as tmp:
        df = pd.DataFrame({'x': ['a', 'b', 'c'] * 100},
                          dtype='category')
        ddf = dd.from_pandas(df, npartitions=3)
        to_parquet(tmp, ddf)

        ddf2 = read_parquet(tmp, categories=['x'])

        assert ddf2.x.cat.categories.tolist() == ['a', 'b', 'c']
        ddf2.loc[:1000].compute()
        df.index.name = 'index'  # defaults to 'index' in this case
        assert_eq(df, ddf2)
Example #19
def test_to_csv_header_empty_dataframe(header, expected):
    dfe = pd.DataFrame({'x': [],
                       'y': []})
    ddfe = dd.from_pandas(dfe, npartitions=1)

    with tmpdir() as dn:
        ddfe.to_csv(os.path.join(dn, "fooe*.csv"), index=False, header=header)
        assert not os.path.exists(os.path.join(dn, "fooe1.csv"))
        filename = os.path.join(dn, 'fooe0.csv')
        with open(filename, 'r') as fp:
            line = fp.readline()
            assert line == expected
        os.remove(filename)
Example #20
def test_visualize():
    pytest.importorskip('graphviz')
    with tmpdir() as d:
        x = da.arange(5, chunks=2)
        x.visualize(filename=os.path.join(d, 'mydask'))
        assert os.path.exists(os.path.join(d, 'mydask.png'))
        x.visualize(filename=os.path.join(d, 'mydask.pdf'))
        assert os.path.exists(os.path.join(d, 'mydask.pdf'))
        visualize(x, 1, 2, filename=os.path.join(d, 'mydask.png'))
        assert os.path.exists(os.path.join(d, 'mydask.png'))
        dsk = {'a': 1, 'b': (add, 'a', 2), 'c': (mul, 'a', 1)}
        visualize(x, dsk, filename=os.path.join(d, 'mydask.png'))
        assert os.path.exists(os.path.join(d, 'mydask.png'))
Example #21
def test_to_csv():
    df = pd.DataFrame({'x': ['a', 'b', 'c', 'd'],
                       'y': [1, 2, 3, 4]})

    for npartitions in [1, 2]:
        a = dd.from_pandas(df, npartitions)
        with tmpdir() as dn:
            a.to_csv(dn, index=False)
            result = dd.read_csv(os.path.join(dn, '*')).compute().reset_index(drop=True)
            assert_eq(result, df)

        with tmpdir() as dn:
            r = a.to_csv(dn, index=False, compute=False)
            dask.compute(*r, scheduler='sync')
            result = dd.read_csv(os.path.join(dn, '*')).compute().reset_index(drop=True)
            assert_eq(result, df)

        with tmpdir() as dn:
            fn = os.path.join(dn, 'data_*.csv')
            a.to_csv(fn, index=False)
            result = dd.read_csv(fn).compute().reset_index(drop=True)
            assert_eq(result, df)
Example #22
def test_to_textfiles_inputs():
    B = db.from_sequence(['abc', '123', 'xyz'], npartitions=2)
    with tmpfile() as a:
        with tmpfile() as b:
            B.to_textfiles([a, b])
            assert os.path.exists(a)
            assert os.path.exists(b)

    with tmpdir() as dirname:
        B.to_textfiles(dirname)
        assert os.path.exists(dirname)
        assert os.path.exists(os.path.join(dirname, '0.part'))
    assert raises(ValueError, lambda: B.to_textfiles(5))
Example #23
def test_to_textfiles(ext, myopen):
    b = db.from_sequence(['abc', '123', 'xyz'], npartitions=2)
    with tmpdir() as dir:
        c = b.to_textfiles(os.path.join(dir, '*.' + ext), compute=False)
        dask.compute(*c, get=dask.get)
        assert os.path.exists(os.path.join(dir, '1.' + ext))

        f = myopen(os.path.join(dir, '1.' + ext), 'rb')
        text = f.read()
        if hasattr(text, 'decode'):
            text = text.decode()
        assert 'xyz' in text
        f.close()
Example #24
def test_compression_multiple_files():
    with tmpdir() as tdir:
        f = gzip.open(os.path.join(tdir, 'a.csv.gz'), 'wb')
        f.write(text.encode())
        f.close()

        f = gzip.open(os.path.join(tdir, 'b.csv.gz'), 'wb')
        f.write(text.encode())
        f.close()

        df = dd.read_csv(os.path.join(tdir, '*.csv.gz'), compression='gzip')

        assert len(df.compute()) == (len(text.split('\n')) - 1) * 2
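`text` is a module-level CSV string this test assumes: a header line plus data rows with no trailing newline, so each gzipped file contributes len(text.split('\n')) - 1 rows. A hypothetical stand-in:

# assumed module-level fixture: one header line, three data rows,
# no trailing newline
text = ('name,amount\n'
        'Alice,100\n'
        'Bob,-200\n'
        'Charlie,300')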
Example #25
def test_append_wo_index():
    """Test append with write_index=False."""
    with tmpdir() as tmp:
        df = pd.DataFrame({'i32': np.arange(1000, dtype=np.int32),
                           'i64': np.arange(1000, dtype=np.int64),
                           'f': np.arange(1000, dtype=np.float64),
                           'bhello': np.random.choice(['hello', 'you', 'people'],
                                                      size=1000).astype("O")})
        half = len(df) // 2
        ddf1 = dd.from_pandas(df.iloc[:half], chunksize=100)
        ddf2 = dd.from_pandas(df.iloc[half:], chunksize=100)
        ddf1.to_parquet(tmp)
        with pytest.raises(ValueError) as excinfo:
            ddf2.to_parquet(tmp, write_index=False, append=True)

        assert 'Appended columns' in str(excinfo.value)

    with tmpdir() as tmp:
        ddf1.to_parquet(tmp, write_index=False)
        ddf2.to_parquet(tmp, write_index=False, append=True)

        ddf3 = read_parquet(tmp, index='f')
        assert_eq(df.set_index('f'), ddf3)
Example #26
def test_to_textfiles_encoding():
    b = db.from_sequence([u'汽车', u'苹果', u'天气'], npartitions=2)
    for ext, myopen in [('gz', GzipFile), ('bz2', BZ2File), ('', open)]:
        with tmpdir() as dir:
            c = b.to_textfiles(os.path.join(dir, '*.' + ext), encoding='gb18030', compute=False)
            c.compute(get=dask.get)
            assert os.path.exists(os.path.join(dir, '1.' + ext))

            f = myopen(os.path.join(dir, '1.' + ext), 'rb')
            text = f.read()
            if hasattr(text, 'decode'):
                text = text.decode('gb18030')
            assert u'天气' in text
            f.close()
Example #27
def test_to_textfiles():
    b = db.from_sequence(['abc', '123', 'xyz'], npartitions=2)
    for ext, myopen in [('gz', GzipFile), ('bz2', BZ2File), ('', open)]:
        with tmpdir() as dir:
            c = b.to_textfiles(os.path.join(dir, '*.' + ext), compute=False)
            c.compute(get=dask.get)
            assert os.path.exists(os.path.join(dir, '1.' + ext))

            f = myopen(os.path.join(dir, '1.' + ext), 'rb')
            text = f.read()
            if hasattr(text, 'decode'):
                text = text.decode()
            assert 'xyz' in text
            f.close()
Example #28
def test_ordering():
    with tmpdir() as tmp:
        tmp = str(tmp)
        df = pd.DataFrame({'a': [1, 2, 3],
                           'b': [10, 20, 30],
                           'c': [100, 200, 300]},
                          index=pd.Index([-1, -2, -3], name='myindex'),
                          columns=['c', 'a', 'b'])
        ddf = dd.from_pandas(df, npartitions=2)
        to_parquet(tmp, ddf)

        pf = fastparquet.ParquetFile(tmp)
        assert pf.columns == ['myindex', 'c', 'a', 'b']

        ddf2 = read_parquet(tmp, index='myindex')
        assert_eq(ddf, ddf2)
Example #29
def test_to_textfiles_encoding():
    b = db.from_sequence([u'汽车', u'苹果', u'天气'], npartitions=2)
    for ext, myopen in [('gz', GzipFile), ('bz2', BZ2File), ('', open)]:
        with tmpdir() as dir:
            c = b.to_textfiles(os.path.join(dir, '*.' + ext),
                               encoding='gb18030',
                               compute=False)
            assert c.npartitions == b.npartitions
            c.compute(get=dask.get)
            assert os.path.exists(os.path.join(dir, '1.' + ext))

            f = myopen(os.path.join(dir, '1.' + ext), 'rb')
            text = f.read()
            if hasattr(text, 'decode'):
                text = text.decode('gb18030')
            assert u'天气' in text
            f.close()
Example #30
def test_to_csv_with_get():
    from dask.multiprocessing import get as mp_get
    flag = [False]

    def my_get(*args, **kwargs):
        flag[0] = True
        return mp_get(*args, **kwargs)

    df = pd.DataFrame({'x': ['a', 'b', 'c', 'd'],
                       'y': [1, 2, 3, 4]})
    ddf = dd.from_pandas(df, npartitions=2)

    with tmpdir() as dn:
        ddf.to_csv(dn, index=False, scheduler=my_get)
        assert flag[0]
        result = dd.read_csv(os.path.join(dn, '*')).compute().reset_index(drop=True)
        assert_eq(result, df)
Example #31
def test_to_csv_line_ending():
    df = pd.DataFrame({"x": [0]})
    ddf = dd.from_pandas(df, npartitions=1)
    expected = {b"0\r\n", b"0\n"}  # either/or
    # For comparison...
    # unexpected = {b'0\r\r\n'}
    # This test addresses GH4809, and checks that only (at most) one
    #  '\r' character is written per line when writing to csv.
    #  In case it's correct (on UNIX) to have no '\r' at all, this test
    #  considers either '\r\n' or '\n' as appropriate line endings,
    #  but not '\r\r\n'.
    with tmpdir() as dn:
        ddf.to_csv(os.path.join(dn, "foo*.csv"), header=False, index=False)
        filename = os.path.join(dn, "foo0.csv")
        with open(filename, "rb") as f:
            raw = f.read()
    assert raw in expected
Example #32
def test_to_json_with_get():
    from dask.multiprocessing import get as mp_get

    flag = [False]

    def my_get(*args, **kwargs):
        flag[0] = True
        return mp_get(*args, **kwargs)

    df = pd.DataFrame({"x": ["a", "b", "c", "d"], "y": [1, 2, 3, 4]})
    ddf = dd.from_pandas(df, npartitions=2)

    with tmpdir() as dn:
        ddf.to_json(dn, compute_kwargs={"scheduler": my_get})
        assert flag[0]
        result = dd.read_json(os.path.join(dn, "*"))
        assert_eq(result, df, check_index=False)
Example #33
def test_to_hdf_exceptions():
    pytest.importorskip('tables')
    df = pd.DataFrame({'x': ['a', 'b', 'c', 'd'],
                       'y': [1, 2, 3, 4]}, index=[1., 2., 3., 4.])
    a = dd.from_pandas(df, 1)

    # triggering too many asterisks error
    with tmpdir() as dn:
        with pytest.raises(ValueError):
            fn = os.path.join(dn, 'data_*.h5')
            a.to_hdf(fn, '/data_*')

    # triggering too many asterisks error
    with tmpfile() as fn:
        with pd.HDFStore(fn) as hdf:
            with pytest.raises(ValueError):
                a.to_hdf(hdf, '/data_*_*')
Example #34
def test_to_fmt_warns():
    pytest.importorskip('tables')
    df16 = pd.DataFrame({'x': ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p'],
                       'y': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]},
                            index=[1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16.])
    a = dd.from_pandas(df16, 16)

    # testing warning when breaking order
    with tmpfile('h5') as fn:
        with pytest.warns(None):
            a.to_hdf(fn, '/data*', name_function=str)

    # testing warning when breaking order
    with tmpdir() as dn:
        with pytest.warns(None):
            fn = os.path.join(dn, "data_*.csv")
            a.to_csv(fn, name_function=str)
Example #35
def test_to_hdf_exceptions():
    pytest.importorskip('tables')
    df = pd.DataFrame({'x': ['a', 'b', 'c', 'd'],
                       'y': [1, 2, 3, 4]}, index=[1., 2., 3., 4.])
    a = dd.from_pandas(df, 1)

    # triggering too many asterisks error
    with tmpdir() as dn:
        with pytest.raises(ValueError):
            fn = os.path.join(dn, 'data_*.h5')
            a.to_hdf(fn, '/data_*')

    # triggering too many asterisks error
    with tmpfile() as fn:
        with pd.HDFStore(fn) as hdf:
            with pytest.raises(ValueError):
                a.to_hdf(hdf, '/data_*_*')
Example #36
def test_to_csv_with_get():
    from dask.multiprocessing import get as mp_get

    flag = [False]

    def my_get(*args, **kwargs):
        flag[0] = True
        return mp_get(*args, **kwargs)

    df = pd.DataFrame({"x": ["a", "b", "c", "d"], "y": [1, 2, 3, 4]})
    ddf = dd.from_pandas(df, npartitions=2)

    with tmpdir() as dn:
        ddf.to_csv(dn, index=False, scheduler=my_get)
        assert flag[0]
        result = dd.read_csv(os.path.join(dn, "*")).compute().reset_index(drop=True)
        assert_eq(result, df)
Example #37
def test_append():
    """Test that appended parquet equal to the original one."""
    with tmpdir() as tmp:
        df = pd.DataFrame({'i32': np.arange(1000, dtype=np.int32),
                           'i64': np.arange(1000, dtype=np.int64),
                           'f': np.arange(1000, dtype=np.float64),
                           'bhello': np.random.choice(['hello', 'yo', 'people'],
                                                      size=1000).astype("O")})
        df.index.name = 'index'

        half = len(df) // 2
        ddf1 = dd.from_pandas(df.iloc[:half], chunksize=100)
        ddf2 = dd.from_pandas(df.iloc[half:], chunksize=100)
        ddf1.to_parquet(tmp)
        ddf2.to_parquet(tmp, append=True)

        ddf3 = read_parquet(tmp)
        assert_eq(df, ddf3)
Example #38
def test_visualize():
    pytest.importorskip("graphviz")

    X, y = make_classification(n_samples=100, n_classes=2, flip_y=0.2, random_state=0)
    clf = SVC(random_state=0, gamma="auto")
    grid = {"C": [0.1, 0.5, 0.9]}
    gs = dcv.GridSearchCV(clf, param_grid=grid).fit(X, y)

    assert hasattr(gs, "dask_graph_")

    with tmpdir() as d:
        gs.visualize(filename=os.path.join(d, "mydask"))
        assert os.path.exists(os.path.join(d, "mydask.png"))

    # Doesn't work if not fitted
    gs = dcv.GridSearchCV(clf, param_grid=grid)
    with pytest.raises(NotFittedError):
        gs.visualize()
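`dcv` here is dask-ml's drop-in model-selection module (historically the separate dask-searchcv project). A plausible import block for this test, given as an assumption:

# assumed imports; dcv provides the dask-backed GridSearchCV
import dask_ml.model_selection as dcv
from sklearn.datasets import make_classification
from sklearn.exceptions import NotFittedError
from sklearn.svm import SVC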
Example #39
def test_to_hdf_exceptions():
    pytest.importorskip("tables")
    df = pd.DataFrame(
        {"x": ["a", "b", "c", "d"], "y": [1, 2, 3, 4]}, index=[1.0, 2.0, 3.0, 4.0]
    )
    a = dd.from_pandas(df, 1)

    # triggering too many asterisks error
    with tmpdir() as dn:
        with pytest.raises(ValueError):
            fn = os.path.join(dn, "data_*.h5")
            a.to_hdf(fn, "/data_*")

    # triggering too many asterisks error
    with tmpfile() as fn:
        with pd.HDFStore(fn) as hdf:
            with pytest.raises(ValueError):
                a.to_hdf(hdf, "/data_*_*")
Example #40
def test_to_textfiles_encoding():
    b = db.from_sequence([u"汽车", u"苹果", u"天气"], npartitions=2)
    for ext, myopen in [("gz", GzipFile), ("bz2", BZ2File), ("", open)]:
        if ext == "bz2" and PY2:
            continue
        with tmpdir() as dir:
            c = b.to_textfiles(
                os.path.join(dir, "*." + ext), encoding="gb18030", compute=False
            )
            dask.compute(*c)
            assert os.path.exists(os.path.join(dir, "1." + ext))

            f = myopen(os.path.join(dir, "1." + ext), "rb")
            text = f.read()
            if hasattr(text, "decode"):
                text = text.decode("gb18030")
            assert u"天气" in text
            f.close()
Example #41
def test_visualize():
    pytest.importorskip('graphviz')

    X, y = make_classification(n_samples=100, n_classes=2, flip_y=.2,
                               random_state=0)
    clf = SVC(random_state=0)
    grid = {'C': [.1, .5, .9]}
    gs = dcv.GridSearchCV(clf, grid).fit(X, y)

    assert hasattr(gs, 'dask_graph_')

    with tmpdir() as d:
        gs.visualize(filename=os.path.join(d, 'mydask'))
        assert os.path.exists(os.path.join(d, 'mydask.png'))

    # Doesn't work if not fitted
    gs = dcv.GridSearchCV(clf, grid)
    with pytest.raises(NotFittedError):
        gs.visualize()
Example #42
def test_ordering():
    with tmpdir() as tmp:
        tmp = str(tmp)
        df = pd.DataFrame(
            {
                'a': [1, 2, 3],
                'b': [10, 20, 30],
                'c': [100, 200, 300]
            },
            index=pd.Index([-1, -2, -3], name='myindex'),
            columns=['c', 'a', 'b'])
        ddf = dd.from_pandas(df, npartitions=2)
        to_parquet(tmp, ddf)

        pf = fastparquet.ParquetFile(tmp)
        assert pf.columns == ['myindex', 'c', 'a', 'b']

        ddf2 = read_parquet(tmp, index='myindex')
        assert_eq(ddf, ddf2)
Example #43
def test_append_with_partition():
    with tmpdir() as tmp:
        df0 = pd.DataFrame({'lat': np.arange(0, 10), 'lon': np.arange(10, 20),
                            'value': np.arange(100, 110)})
        df0.index.name = 'index'
        df1 = pd.DataFrame({'lat': np.arange(10, 20), 'lon': np.arange(10, 20),
                            'value': np.arange(120, 130)})
        df1.index.name = 'index'
        dd_df0 = dd.from_pandas(df0, npartitions=1)
        dd_df1 = dd.from_pandas(df1, npartitions=1)
        dd.to_parquet(tmp, dd_df0, partition_on=['lon'])
        dd.to_parquet(tmp, dd_df1, partition_on=['lon'], append=True,
                      ignore_divisions=True)

        out = dd.read_parquet(tmp).compute()
    out['lon'] = out.lon.astype('int64')  # partition column is read back with a different dtype
    # sort required since partitioning breaks index order
    assert_eq(out.sort_values('value'), pd.concat([df0, df1])[out.columns],
              check_index=False)
Example #44
def test_visualize():
    pytest.importorskip('graphviz')
    with tmpdir() as d:
        x = da.arange(5, chunks=2)
        x.visualize(filename=os.path.join(d, 'mydask'))
        assert os.path.exists(os.path.join(d, 'mydask.png'))

        x.visualize(filename=os.path.join(d, 'mydask.pdf'))
        assert os.path.exists(os.path.join(d, 'mydask.pdf'))

        visualize(x, 1, 2, filename=os.path.join(d, 'mydask.png'))
        assert os.path.exists(os.path.join(d, 'mydask.png'))

        dsk = {'a': 1, 'b': (add, 'a', 2), 'c': (mul, 'a', 1)}
        visualize(x, dsk, filename=os.path.join(d, 'mydask.png'))
        assert os.path.exists(os.path.join(d, 'mydask.png'))

        x = Tuple(dsk, ['a', 'b', 'c'])
        visualize(x, filename=os.path.join(d, 'mydask.png'))
        assert os.path.exists(os.path.join(d, 'mydask.png'))
Example #45
def test_visualize():
    pytest.importorskip("graphviz")
    with tmpdir() as d:
        x = da.arange(5, chunks=2)
        x.visualize(filename=os.path.join(d, "mydask"))
        assert os.path.exists(os.path.join(d, "mydask.png"))

        x.visualize(filename=os.path.join(d, "mydask.pdf"))
        assert os.path.exists(os.path.join(d, "mydask.pdf"))

        visualize(x, 1, 2, filename=os.path.join(d, "mydask.png"))
        assert os.path.exists(os.path.join(d, "mydask.png"))

        dsk = {"a": 1, "b": (add, "a", 2), "c": (mul, "a", 1)}
        visualize(x, dsk, filename=os.path.join(d, "mydask.png"))
        assert os.path.exists(os.path.join(d, "mydask.png"))

        x = Tuple(dsk, ["a", "b", "c"])
        visualize(x, filename=os.path.join(d, "mydask.png"))
        assert os.path.exists(os.path.join(d, "mydask.png"))
Example #46
def test_append_different_columns():
    """Test raising of error when non equal columns."""
    with tmpdir() as tmp:
        df1 = pd.DataFrame({'i32': np.arange(100, dtype=np.int32)})
        df2 = pd.DataFrame({'i64': np.arange(100, dtype=np.int64)})
        df3 = pd.DataFrame({'i32': np.arange(100, dtype=np.int64)})

        ddf1 = dd.from_pandas(df1, chunksize=2)
        ddf2 = dd.from_pandas(df2, chunksize=2)
        ddf3 = dd.from_pandas(df3, chunksize=2)

        ddf1.to_parquet(tmp)

        with pytest.raises(ValueError) as excinfo:
            ddf2.to_parquet(tmp, append=True)
        assert 'Appended columns' in str(excinfo.value)

        with pytest.raises(ValueError) as excinfo:
            ddf3.to_parquet(tmp, append=True)
        assert 'Appended dtypes' in str(excinfo.value)
Example #47
def dir_server():
    with tmpdir() as d:
        for fn in files:
            with open(os.path.join(d, fn), "wb") as f:
                f.write(b"a" * 10000)

        cmd = [sys.executable, "-m", "http.server", "8999"]
        p = subprocess.Popen(cmd, cwd=d)
        timeout = 10
        while True:
            try:
                requests.get("http://localhost:8999")
                break
            except requests.exceptions.ConnectionError as e:
                time.sleep(0.1)
                timeout -= 0.1
                if timeout < 0:
                    raise RuntimeError("Server did not appear") from e
        yield d
        p.terminate()
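The fixture iterates over a module-level `files` list that the snippet does not define; hypothetically:

# assumed module-level list of file names to serve over HTTP
files = ['a', 'b', 'c', 'd']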
Example #48
def test_append_overlapping_divisions():
    """Test raising of error when divisions overlapping."""
    with tmpdir() as tmp:
        df = pd.DataFrame({'i32': np.arange(1000, dtype=np.int32),
                           'i64': np.arange(1000, dtype=np.int64),
                           'f': np.arange(1000, dtype=np.float64),
                           'bhello': np.random.choice(
                               ['hello', 'yo', 'people'],
                               size=1000).astype("O")})
        half = len(df) // 2
        ddf1 = dd.from_pandas(df.iloc[:half], chunksize=100)
        ddf2 = dd.from_pandas(df.iloc[half - 10:], chunksize=100)
        ddf1.to_parquet(tmp)

        with pytest.raises(ValueError) as excinfo:
            ddf2.to_parquet(tmp, append=True)

        assert 'Appended divisions' in str(excinfo.value)

        ddf2.to_parquet(tmp, append=True, ignore_divisions=True)
Example #49
def test_visualize():
    pytest.importorskip("graphviz")
    pytest.importorskip("ipycytoscape")
    with tmpdir() as d:
        x = da.arange(5, chunks=2)
        x.visualize(filename=os.path.join(d, "mydask"))
        assert os.path.exists(os.path.join(d, "mydask.png"))

        x.visualize(filename=os.path.join(d, "mydask.pdf"))
        assert os.path.exists(os.path.join(d, "mydask.pdf"))

        visualize(x, 1, 2, filename=os.path.join(d, "mydask.png"))
        assert os.path.exists(os.path.join(d, "mydask.png"))

        dsk = {"a": 1, "b": (add, "a", 2), "c": (mul, "a", 1)}
        visualize(x, dsk, filename=os.path.join(d, "mydask.png"))
        assert os.path.exists(os.path.join(d, "mydask.png"))

        x = Tuple(dsk, ["a", "b", "c"])
        visualize(x, filename=os.path.join(d, "mydask.png"))
        assert os.path.exists(os.path.join(d, "mydask.png"))

        x = Tuple(dsk, ["a", "b", "c"])
        visualize(x, filename=os.path.join(d, "cyt"), engine="cytoscape")
        assert os.path.exists(os.path.join(d, "cyt.html"))

        visualize(x,
                  filename=os.path.join(d, "cyt2.html"),
                  engine="ipycytoscape")
        assert os.path.exists(os.path.join(d, "cyt2.html"))

        with dask.config.set(visualization__engine="cytoscape"):
            visualize(x, filename=os.path.join(d, "cyt3.html"))
            assert os.path.exists(os.path.join(d, "cyt3.html"))

        with pytest.raises(ValueError, match="not-real"):
            visualize(x, engine="not-real")

        # visualize() with filename=None should simply not raise
        x.visualize(filename=None)
Example #50
def test_to_csv():
    df = pd.DataFrame({'x': ['a', 'b', 'c', 'd'], 'y': [1, 2, 3, 4]})

    for npartitions in [1, 2]:
        a = dd.from_pandas(df, npartitions)
        with tmpfile('csv') as fn:
            a.to_csv(fn, index=False)
            result = dd.read_csv(fn).compute().reset_index(drop=True)
            eq(result, df)

        with tmpfile('csv') as fn:
            r = a.to_csv(fn, index=False, compute=False)
            r.compute()
            result = dd.read_csv(fn).compute().reset_index(drop=True)
            eq(result, df)

        with tmpdir() as dn:
            fn = os.path.join(dn, 'data_*.csv')
            a.to_csv(fn, index=False)
            result = dd.read_csv(fn).compute().reset_index(drop=True)
            eq(result, df)
Example #51
def test_reading_empty_csv_files_with_path():
    with tmpdir() as tdir:
        for k, content in enumerate(["0, 1, 2", "", "6, 7, 8"]):
            with open(os.path.join(tdir, str(k) + ".csv"), "w") as file:
                file.write(content)
        result = dd.read_csv(
            os.path.join(tdir, "*.csv"),
            include_path_column=True,
            converters={"path": parse_filename},
            names=["A", "B", "C"],
        ).compute()
        df = pd.DataFrame(
            {
                "A": [0, 6],
                "B": [1, 7],
                "C": [2, 8],
                "path": ["0.csv", "2.csv"],
            }
        )
        df["path"] = df["path"].astype("category")
        assert_eq(result, df, check_index=False)
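`parse_filename` is the converter applied to the path column; since the expected frame contains base names only, the assumed helper is essentially:

import os

def parse_filename(path):
    # reduce a full path in the include_path_column output to its base name
    return os.path.basename(path)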
Example #52
def test_to_csv_header(header, header_first_partition_only, expected_first, expected_next):
    partition_count = 2
    df = pd.DataFrame({'x': ['a', 'b', 'c', 'd', 'e', 'f'],
                       'y': [1, 2, 3, 4, 5, 6]})
    ddf = dd.from_pandas(df, npartitions=partition_count)

    with tmpdir() as dn:
        # write with the parametrized header / header_first_partition_only
        # settings, then check the first line written for each partition
        ddf.to_csv(os.path.join(dn, "fooa*.csv"), index=False, header=header,
                   header_first_partition_only=header_first_partition_only)
        filename = os.path.join(dn, 'fooa0.csv')
        with open(filename, 'r') as fp:
            line = fp.readline()
            assert line == expected_first
        os.remove(filename)

        filename = os.path.join(dn, 'fooa1.csv')
        with open(filename, 'r') as fp:
            line = fp.readline()
            assert line == expected_next
        os.remove(filename)
Example #53
def test_categorical():
    with tmpdir() as tmp:
        df = pd.DataFrame({'x': ['a', 'b', 'c'] * 100}, dtype='category')
        ddf = dd.from_pandas(df, npartitions=3)
        to_parquet(tmp, ddf)

        ddf2 = read_parquet(tmp, categories=['x'])
        assert ddf2.compute().x.cat.categories.tolist() == ['a', 'b', 'c']

        # autocat
        ddf2 = read_parquet(tmp)
        assert ddf2.compute().x.cat.categories.tolist() == ['a', 'b', 'c']

        ddf2.loc[:1000].compute()
        df.index.name = 'index'  # defaults to 'index' in this case
        assert_eq(df, ddf2)

        # dereference cats
        ddf2 = read_parquet(tmp, categories=[])

        ddf2.loc[:1000].compute()
        assert (df.x == ddf2.x).all()
Example #54
File: test_hdf.py Project: jni/dask
def test_to_hdf_lock_delays():
    pytest.importorskip('tables')
    df16 = pd.DataFrame(
        {
            'x': [
                'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l',
                'm', 'n', 'o', 'p'
            ],
            'y': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]
        },
        index=[
            1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15.,
            16.
        ])
    a = dd.from_pandas(df16, 16)

    # add artificial delays so the last tasks finish first
    # (i.e. simulate the first tasks finishing last)
    def delayed_nop(i):
        if i[1] < 10:
            sleep(0.1 * (10 - i[1]))
        return i

    # saving to multiple hdf nodes
    with tmpfile() as fn:
        a = a.apply(delayed_nop, axis=1, columns=a.columns)
        a.to_hdf(fn, '/data*')
        out = dd.read_hdf(fn, '/data*')
        assert_eq(df16, out)

    # saving to multiple hdf files
    # adding artificial delays to make sure the last tasks finish first
    with tmpdir() as dn:
        fn = os.path.join(dn, 'data*')
        a = a.apply(delayed_nop, axis=1, columns=a.columns)
        a.to_hdf(fn, '/data')
        out = dd.read_hdf(fn, '/data')
        assert_eq(df16, out)
Example #55
def dir_server():
    with tmpdir() as d:
        for fn in files:
            with open(os.path.join(d, fn), 'wb') as f:
                f.write(b'a' * 10000)

        if PY2:
            cmd = [sys.executable, '-m', 'SimpleHTTPServer', '8999']
        else:
            cmd = [sys.executable, '-m', 'http.server', '8999']
        p = subprocess.Popen(cmd, cwd=d)
        timeout = 10
        while True:
            try:
                requests.get('http://localhost:8999')
                break
            except requests.exceptions.ConnectionError:
                time.sleep(0.1)
                timeout -= 0.1
                if timeout < 0:
                    raise RuntimeError('Server did not appear')
        yield d
        p.terminate()
Example #56
def test_local():
    with tmpdir() as tmp:
        tmp = str(tmp)
        data = pd.DataFrame({'i32': np.arange(1000, dtype=np.int32),
                             'i64': np.arange(1000, dtype=np.int64),
                             'f': np.arange(1000, dtype=np.float64),
                             'bhello': np.random.choice(['hello', 'you', 'people'], size=1000).astype("O")})
        df = dd.from_pandas(data, chunksize=500)

        to_parquet(tmp, df, write_index=False, object_encoding='utf8')

        files = os.listdir(tmp)
        assert '_metadata' in files
        assert 'part.0.parquet' in files

        df2 = read_parquet(tmp, index=False)

        assert len(df2.divisions) > 1

        out = df2.compute(get=dask.get).reset_index()

        for column in df.columns:
            assert (data[column] == out[column]).all()
Example #57
def test_hdf_file_list():
    pytest.importorskip("tables")
    df = pd.DataFrame({
        "x": ["a", "b", "c", "d"],
        "y": [1, 2, 3, 4]
    },
                      index=[1.0, 2.0, 3.0, 4.0])

    with tmpdir() as tdir:
        df.iloc[:2].to_hdf(os.path.join(tdir, "one.h5"),
                           "dataframe",
                           format="table")
        df.iloc[2:].to_hdf(os.path.join(tdir, "two.h5"),
                           "dataframe",
                           format="table")

        with dask.config.set(scheduler="sync"):
            input_files = [
                os.path.join(tdir, "one.h5"),
                os.path.join(tdir, "two.h5")
            ]
            res = dd.read_hdf(input_files, "dataframe")
            tm.assert_frame_equal(res.compute(), df)
Example #58
def test_hdf_file_list():
    pytest.importorskip('tables')
    df = pd.DataFrame({
        'x': ['a', 'b', 'c', 'd'],
        'y': [1, 2, 3, 4]
    },
                      index=[1., 2., 3., 4.])

    with tmpdir() as tdir:
        df.iloc[:2].to_hdf(os.path.join(tdir, 'one.h5'),
                           'dataframe',
                           format='table')
        df.iloc[2:].to_hdf(os.path.join(tdir, 'two.h5'),
                           'dataframe',
                           format='table')

        with dask.set_options(get=dask.get):
            input_files = [
                os.path.join(tdir, 'one.h5'),
                os.path.join(tdir, 'two.h5')
            ]
            res = dd.read_hdf(input_files, 'dataframe')
            tm.assert_frame_equal(res.compute(), df)
Example #59
def test_to_textfiles_name_function_warn():
    seq = [
        "a",
        "b",
        "c",
        "d",
        "e",
        "f",
        "g",
        "h",
        "i",
        "j",
        "k",
        "l",
        "m",
        "n",
        "o",
        "p",
    ]
    a = db.from_sequence(seq, npartitions=16)
    with tmpdir() as dn:
        with pytest.warns(None):
            a.to_textfiles(dn, name_function=str)
Example #60
File: test_hdf.py Project: jni/dask
def test_to_hdf_link_optimizations():
    """testing dask link levels is correct by calculating the depth of the dask graph"""
    pytest.importorskip('tables')
    df16 = pd.DataFrame(
        {
            'x': [
                'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l',
                'm', 'n', 'o', 'p'
            ],
            'y': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]
        },
        index=[
            1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15.,
            16.
        ])
    a = dd.from_pandas(df16, 16)

    # saving to multiple hdf files, no links are needed
    # expected layers: from_pandas, to_hdf, list = depth of 3
    with tmpdir() as dn:
        fn = os.path.join(dn, 'data*')
        d = a.to_hdf(fn, '/data', compute=False)
        assert dependency_depth(d.dask) == 3

    # saving to a single hdf file with multiple nodes
    # all subsequent nodes depend on the first
    # expected layers: from_pandas, first to_hdf(creates file+node), subsequent to_hdfs, list = 4
    with tmpfile() as fn:
        d = a.to_hdf(fn, '/data*', compute=False)
        assert dependency_depth(d.dask) == 4

    # saving to a single hdf file with a single node
    # every node depends on the previous node
    # expected layers: from_pandas, to_hdf times npartitions (16), list = 2 + npartitions = 18
    with tmpfile() as fn:
        d = a.to_hdf(fn, '/data', compute=False)
        assert dependency_depth(d.dask) == 2 + a.npartitions