Example #1
def test_to_hdf_link_optimizations():
    """testing dask link levels is correct by calculating the depth of the dask graph"""
    pytest.importorskip('tables')
    df16 = pd.DataFrame({'x': ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p'],
                         'y': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]},
                        index=[1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16.])
    a = dd.from_pandas(df16, 16)

    # saving to multiple hdf files, no links are needed
    # expected layers: from_pandas, to_hdf, list = depth of 3
    with tmpdir() as dn:
        fn = os.path.join(dn, 'data*')
        d = a.to_hdf(fn, '/data', compute=False)
        assert dependency_depth(d.dask) == 3

    # saving to a single hdf file with multiple nodes
    # all subsequent nodes depend on the first
    # expected layers: from_pandas, first to_hdf (creates file and node), subsequent to_hdfs, list = 4
    with tmpfile() as fn:
        d = a.to_hdf(fn, '/data*', compute=False)
        assert dependency_depth(d.dask) == 4

    # saving to a single hdf file with a single node
    # every node depends on the previous node
    # expected layers: from_pandas, to_hdf once per partition, list = 2 + npartitions
    with tmpfile() as fn:
        d = a.to_hdf(fn, '/data', compute=False)
        assert dependency_depth(d.dask) == 2 + a.npartitions
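
The dependency_depth helper used above (and in Example #39) is not defined in these snippets. A minimal sketch of what it might look like, inferred from how the asserts use it rather than copied from the test suite:

from dask.core import get_deps

def dependency_depth(dsk):
    # length of the longest dependency chain in the graph
    deps, _ = get_deps(dsk)

    def max_depth_by_deps(key):
        if not deps[key]:
            return 1
        return 1 + max(max_depth_by_deps(dep_key) for dep_key in deps[key])

    return max(max_depth_by_deps(key) for key in deps)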
Example #2
def test_read_hdf():
    pytest.importorskip('tables')
    df = pd.DataFrame({'x': ['a', 'b', 'c', 'd'],
                       'y': [1, 2, 3, 4]}, index=[1., 2., 3., 4.])
    with tmpfile('h5') as fn:
        df.to_hdf(fn, '/data')
        try:
            dd.read_hdf(fn, 'data', chunksize=2)
            assert False
        except TypeError as e:
            assert "format='table'" in str(e)

    with tmpfile('h5') as fn:
        df.to_hdf(fn, '/data', format='table')
        a = dd.read_hdf(fn, '/data', chunksize=2)
        assert a.npartitions == 2
        assert a._known_dtype

        tm.assert_frame_equal(a.compute(), df)

        tm.assert_frame_equal(
              dd.read_hdf(fn, '/data', chunksize=2, start=1, stop=3).compute(),
              pd.read_hdf(fn, '/data', start=1, stop=3))

        assert sorted(dd.read_hdf(fn, '/data').dask) == \
               sorted(dd.read_hdf(fn, '/data').dask)
Example #3
def test_to_hdf_thread():
    pytest.importorskip('tables')
    df = pd.DataFrame({'x': ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p'],
                       'y': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]},
                            index=[1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16.])
    a = dd.from_pandas(df, 16)

    # test single file single node
    with tmpfile('h5') as fn:
        a.to_hdf(fn, '/data', get=dask.threaded.get)
        out = pd.read_hdf(fn, '/data')
        eq(df, out)

    # test multiple files single node
    with tmpdir() as dn:
        fn = os.path.join(dn, 'data_*.h5')
        a.to_hdf(fn, '/data', get=dask.threaded.get)
        out = dd.read_hdf(fn, '/data')
        eq(df, out)

    # test single file multiple nodes
    with tmpfile('h5') as fn:
        a.to_hdf(fn, '/data*', get=dask.threaded.get)
        out = dd.read_hdf(fn, '/data*')
        eq(df, out)
Example #4
def test_tokenize_numpy_memmap():
    with tmpfile('.npy') as fn:
        x = np.arange(5)
        np.save(fn, x)
        y = tokenize(np.load(fn, mmap_mode='r'))

    with tmpfile('.npy') as fn:
        x = np.arange(5)
        np.save(fn, x)
        z = tokenize(np.load(fn, mmap_mode='r'))

    # the same data memory-mapped from two different files tokenizes differently
    assert y != z

    with tmpfile('.npy') as fn:
        x = np.random.normal(size=(10, 10))
        np.save(fn, x)
        mm = np.load(fn, mmap_mode='r')
        mm2 = np.load(fn, mmap_mode='r')
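        # distinct slices of one memmap should tokenize differently, while
        # identical slices from two separate loads should tokenize the same: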
        a = tokenize(mm[0, :])
        b = tokenize(mm[1, :])
        c = tokenize(mm[0:3, :])
        d = tokenize(mm[:, 0])
        assert len(set([a, b, c, d])) == 4
        assert tokenize(mm) == tokenize(mm2)
        assert tokenize(mm[1, :]) == tokenize(mm2[1, :])
Example #5
def test_read_hdf(data, compare):
    pytest.importorskip('tables')
    with tmpfile('h5') as fn:
        data.to_hdf(fn, '/data')
        try:
            dd.read_hdf(fn, 'data', chunksize=2, mode='r')
            assert False
        except TypeError as e:
            assert "format='table'" in str(e)

    with tmpfile('h5') as fn:
        data.to_hdf(fn, '/data', format='table')
        a = dd.read_hdf(fn, '/data', chunksize=2, mode='r')
        assert a.npartitions == 2

        compare(a.compute(), data)

        compare(dd.read_hdf(fn, '/data', chunksize=2, start=1, stop=3,
                            mode='r').compute(),
                pd.read_hdf(fn, '/data', start=1, stop=3))

        assert (sorted(dd.read_hdf(fn, '/data', mode='r').dask) ==
                sorted(dd.read_hdf(fn, '/data', mode='r').dask))

    with tmpfile('h5') as fn:
        sorted_data = data.sort_index()
        sorted_data.to_hdf(fn, '/data', format='table')
        a = dd.read_hdf(fn, '/data', chunksize=2, sorted_index=True, mode='r')
        assert a.npartitions == 2

        compare(a.compute(), sorted_data)
Example #6
def test_to_hdf():
    pytest.importorskip('tables')
    df = pd.DataFrame({'x': ['a', 'b', 'c', 'd'],
                       'y': [1, 2, 3, 4]}, index=[1., 2., 3., 4.])
    a = dd.from_pandas(df, 2)

    with tmpfile('h5') as fn:
        a.to_hdf(fn, '/data')
        out = pd.read_hdf(fn, '/data')
        tm.assert_frame_equal(df, out[:])

    with tmpfile('h5') as fn:
        a.x.to_hdf(fn, '/data')
        out = pd.read_hdf(fn, '/data')
        tm.assert_series_equal(df.x, out[:])

    a = dd.from_pandas(df, 1)
    with tmpfile('h5') as fn:
        a.to_hdf(fn, '/data')
        out = pd.read_hdf(fn, '/data')
        tm.assert_frame_equal(df, out[:])

    # test compute = False
    with tmpfile('h5') as fn:
        r = a.to_hdf(fn, '/data', compute=False)
        r.compute()
        out = pd.read_hdf(fn, '/data')
        tm.assert_frame_equal(df, out[:])
Example #7
def test_tokenize_numpy_memmap_no_filename():
    # GH 1562:
    with tmpfile('.npy') as fn1, tmpfile('.npy') as fn2:
        x = np.arange(5)
        np.save(fn1, x)
        np.save(fn2, x)

        a = np.load(fn1, mmap_mode='r')
        b = a + a
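        # b is derived from the memmap but has no backing filename
        # (dask GH 1562); tokenize must still work and be deterministic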
        assert tokenize(b) == tokenize(b)
Example #8
def test_tokenize_numpy_memmap():
    with tmpfile('.npy') as fn:
        x = np.arange(5)
        np.save(fn, x)
        y = tokenize(np.load(fn, mmap_mode='r'))

    with tmpfile('.npy') as fn:
        x = np.arange(5)
        np.save(fn, x)
        z = tokenize(np.load(fn, mmap_mode='r'))

    # the same data memory-mapped from two different files tokenizes differently
    assert y != z
Example #9
def test_to_csv_series():
    s = pd.Series([1, 2, 3], index=[10, 20, 30], name="foo")
    a = dd.from_pandas(s, 2)
    with tmpfile("csv") as fn:
        with tmpfile("csv") as fn2:
            a.to_csv(fn)
            s.to_csv(fn2)
            with open(fn) as f:
                adata = f.read()
            with open(fn2) as f:
                sdata = f.read()

            assert adata == sdata
Example #10
def test_to_textfiles_inputs():
    B = db.from_sequence(['abc', '123', 'xyz'], npartitions=2)
    with tmpfile() as a:
        with tmpfile() as b:
            B.to_textfiles([a, b]).compute()
            assert os.path.exists(a)
            assert os.path.exists(b)

    with tmpfile() as dirname:
        B.to_textfiles(dirname).compute()
        assert os.path.exists(dirname)
        assert os.path.exists(os.path.join(dirname, '0.part'))
    assert raises(ValueError, lambda: B.to_textfiles(5))
Example #11
def test_memmap():
    with tmpfile('npy') as fn_1:
        with tmpfile('npy') as fn_2:
            x = da.arange(100, chunks=15)
            target = np.memmap(fn_1, shape=x.shape, mode='w+', dtype=x.dtype)

            x.store(target)

            assert eq(target, x)

            np.save(fn_2, target)

            assert eq(np.load(fn_2, mmap_mode='r'), x)
Example #12
def test_ensure_file_directory(mkdir):
    a = {'x': 1, 'y': {'a': 1}}
    with tmpfile(extension='yaml') as source:
        with tmpfile() as destination:
            if mkdir:
                os.mkdir(destination)
            with open(source, 'w') as f:
                yaml.dump(a, f)

            ensure_file(source=source, destination=destination)
            assert os.path.isdir(destination)
            [fn] = os.listdir(destination)
            assert os.path.split(fn)[1] == os.path.split(source)[1]
Example #13
def test_to_hdf():
    pytest.importorskip("tables")
    df = pd.DataFrame({"x": ["a", "b", "c", "d"], "y": [1, 2, 3, 4]}, index=[1.0, 2.0, 3.0, 4.0])
    a = dd.from_pandas(df, 2)

    with tmpfile("h5") as fn:
        a.to_hdf(fn, "/data")
        out = pd.read_hdf(fn, "/data")
        tm.assert_frame_equal(df, out[:])

    with tmpfile("h5") as fn:
        a.x.to_hdf(fn, "/data")
        out = pd.read_hdf(fn, "/data")
        tm.assert_series_equal(df.x, out[:])
Example #14
def test_to_hdf_kwargs():
    pytest.importorskip('tables')
    df = pd.DataFrame({'A': ['a', 'aaaa']})
    ddf = dd.from_pandas(df, npartitions=2)
    with tmpfile('h5') as fn:
        ddf.to_hdf(fn, 'foo4', format='table', min_itemsize=4)
        df2 = pd.read_hdf(fn, 'foo4')
        tm.assert_frame_equal(df, df2)

    # test shorthand 't' for table
    with tmpfile('h5') as fn:
        ddf.to_hdf(fn, 'foo4', format='t', min_itemsize=4)
        df2 = pd.read_hdf(fn, 'foo4')
        tm.assert_frame_equal(df, df2)
Example #15
def test_to_textfiles_inputs():
    B = db.from_sequence(['abc', '123', 'xyz'], npartitions=2)
    with tmpfile() as a:
        with tmpfile() as b:
            B.to_textfiles([a, b])
            assert os.path.exists(a)
            assert os.path.exists(b)

    with tmpdir() as dirname:
        B.to_textfiles(dirname)
        assert os.path.exists(dirname)
        assert os.path.exists(os.path.join(dirname, '0.part'))

    with pytest.raises(TypeError):
        B.to_textfiles(5)
Example #16
def test_read_json_error():
    with tmpfile('json') as f:
        with pytest.raises(ValueError):
            df.to_json(f, orient='split', lines=True)
        df.to_json(f, orient='split', lines=False)
        with pytest.raises(ValueError):
            dd.read_json(f, orient='split', blocksize=1)
Example #17
def test_gh715():
    bin_data = u'\u20ac'.encode('utf-8')
    with tmpfile() as fn:
        with open(fn, 'wb') as f:
            f.write(bin_data)
        a = db.from_filenames(fn)
        assert a.compute()[0] == bin_data.decode('utf-8')
Example #18
def test_to_textfiles_endlines():
    b = db.from_sequence(['a', 'b', 'c'], npartitions=1)
    with tmpfile() as fn:
        b.to_textfiles([fn])
        with open(fn, 'r') as f:
            result = f.readlines()
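        # the final element is written without a trailing newline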
        assert result == ['a\n', 'b\n', 'c']
Example #19
def test_to_hdf_lock_delays():
    pytest.importorskip('tables')
    df16 = pd.DataFrame({'x': ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p'],
                         'y': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]},
                        index=[1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16.])
    a = dd.from_pandas(df16, 16)

    # add artificial delays so that the last tasks finish first,
    # simulating tasks completing out of order
    def delayed_nop(i):
        if i[1] < 10:
            sleep(0.1*(10-i[1]))
        return i

    # saving to multiple hdf nodes
    with tmpfile() as fn:
        a = a.apply(delayed_nop, axis=1, columns=a.columns)
        a.to_hdf(fn, '/data*')
        out = dd.read_hdf(fn, '/data*')
        eq(df16, out)

    # saving to multiple hdf files
    # adding artificial delays to make sure last tasks finish first
    with tmpdir() as dn:
        fn = os.path.join(dn, 'data*')
        a = a.apply(delayed_nop, axis=1, columns=a.columns)
        a.to_hdf(fn, '/data')
        out = dd.read_hdf(fn, '/data')
        eq(df16, out)
Example #20
def test_gh606():
    encoding = 'utf-16-le'
    euro = u'\u20ac'
    yen = u'\u00a5'
    linesep = os.linesep

    bin_euro = u'\u20ac'.encode(encoding)
    bin_yen = u'\u00a5'.encode(encoding)
    bin_linesep = linesep.encode(encoding)

    data = (euro * 10) + linesep + (yen * 10) + linesep + (euro * 10)
    bin_data = data.encode(encoding)

    with tmpfile() as fn:
        with open(fn, 'w+b') as f:
            f.write(bin_data)
            f.seek(0)

            stop = len(bin_euro) * 10 + len(bin_linesep)
            res = textblock(f, 1, stop, encoding=encoding)
            assert res == ((yen * 10) + linesep).encode(encoding)

            stop = len(bin_euro) * 10 + len(bin_linesep)
            res = textblock(f, 0, stop, encoding=encoding)
            assert res == ((euro * 10) + linesep + (yen * 10) + linesep).encode(encoding)
Example #21
def test_nextlinesep():
    lineseps = ('\r', '\n', '\r\n')
    encodings = ('utf-16-le', 'utf-8')
    for sep, encoding in product(lineseps, encodings):
        euro = u'\u20ac'
        yen = u'\u00a5'

        bin_euro = u'\u20ac'.encode(encoding)
        bin_yen = u'\u00a5'.encode(encoding)
        bin_sep = sep.encode(encoding)

        data = (euro * 10) + sep + (yen * 10) + sep + (euro * 10)
        bin_data = data.encode(encoding)

        with tmpfile() as fn:
            with open(fn, 'w+b') as f:
                f.write(bin_data)
                f.seek(0)

                start, stop = next_linesep(f, 5, encoding, sep)
                assert start == len(bin_euro) * 10
                assert stop == len(bin_euro) * 10 + len(sep.encode(encoding))

                seek = len(bin_euro) * 10 + len(bin_sep) + len(bin_yen)
                start, stop = next_linesep(f, seek, encoding, sep)

                exp_start = len(bin_euro) * 10 + len(bin_sep) + len(bin_yen) * 10
                exp_stop = exp_start + len(bin_sep)
                assert start == exp_start
                assert stop == exp_stop
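
As a sanity check on the byte offsets asserted in Examples #20 and #21: the euro and yen signs each encode to two bytes in utf-16-le, so a run of ten characters spans twenty bytes. A standalone illustration (not part of the original tests):

bin_euro = u'\u20ac'.encode('utf-16-le')
bin_yen = u'\u00a5'.encode('utf-16-le')
assert len(bin_euro) == len(bin_yen) == 2
# ten euros occupy bytes [0, 20), so the first line separator begins at
# len(bin_euro) * 10 == 20, matching the expected start offsets above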
Example #22
def test_linecount_gzip():
    with tmpfile('gz') as fn:
        f = gzip.open(fn, 'wb')
        for line in text.split('\n'):
            f.write(line.encode('ascii'))
            f.write(b'\n')
        f.close()
        assert linecount(fn) == 7
Example #23
def test_to_textfiles_endlines():
    b = db.from_sequence(['a', 'b', 'c'], npartitions=1)
    with tmpfile() as fn:
        for last_endline in False, True:
            b.to_textfiles([fn], last_endline=last_endline)
            with open(fn, 'r') as f:
                result = f.readlines()
            assert result == ['a\n', 'b\n', 'c\n' if last_endline else 'c']
Example #24
def test_to_csv():
    df = pd.DataFrame({'x': ['a', 'b', 'c', 'd'],
                       'y': [1, 2, 3, 4]}, index=[1., 2., 3., 4.])

    for npartitions in [1, 2]:
        a = dd.from_pandas(df, npartitions)
        with tmpfile('csv') as fn:
            a.to_csv(fn)
            result = pd.read_csv(fn, index_col=0)
            tm.assert_frame_equal(result, df)

    # test compute=False
    a = dd.from_pandas(df, npartitions)
    with tmpfile('csv') as fn:
        r = a.to_csv(fn, compute=False)
        r.compute()
        result = pd.read_csv(fn, index_col=0)
        tm.assert_frame_equal(result, df)
Example #25
def test_linecount_bz2():
    with tmpfile('bz2') as fn:
        f = bz2.BZ2File(fn, 'wb')
        for line in text.split('\n'):
            f.write(line.encode('ascii'))
            f.write(b'\n')
        f.close()
        assert linecount(fn) == 7
Example #26
def test_visualize_order():
    pytest.importorskip('matplotlib')
    x = da.arange(5, chunks=2)
    with tmpfile(extension='dot') as fn:
        x.visualize(color='order', filename=fn, cmap='RdBu')
        with open(fn) as f:
            text = f.read()
        assert 'color="#' in text
Example #27
def test_to_hdf_modes_multiple_nodes():
    pytest.importorskip('tables')
    df = pd.DataFrame({'x': ['a', 'b', 'c', 'd'],
                       'y': [1, 2, 3, 4]}, index=[1., 2., 3., 4.])

    # appending a single partition to existing data
    a = dd.from_pandas(df, 1)
    with tmpfile('h5') as fn:
        a.to_hdf(fn, '/data2')
        a.to_hdf(fn, '/data*', mode='a')
        out = dd.read_hdf(fn, '/data*')
        eq(df.append(df), out)

    # overwriting a file with a single partition
    a = dd.from_pandas(df, 1)
    with tmpfile('h5') as fn:
        a.to_hdf(fn, '/data2')
        a.to_hdf(fn, '/data*', mode='w')
        out = dd.read_hdf(fn, '/data*')
        eq(df, out)

    # appending two partitions to existing data
    a = dd.from_pandas(df, 2)
    with tmpfile('h5') as fn:
        a.to_hdf(fn, '/data2')
        a.to_hdf(fn, '/data*', mode='a')
        out = dd.read_hdf(fn, '/data*')
        eq(df.append(df), out)

    # overwriting a file with two partitions
    a = dd.from_pandas(df, 2)
    with tmpfile('h5') as fn:
        a.to_hdf(fn, '/data2')
        a.to_hdf(fn, '/data*', mode='w')
        out = dd.read_hdf(fn, '/data*')
        eq(df, out)

    # overwriting a single partition, keeping other partitions
    a = dd.from_pandas(df, 2)
    with tmpfile('h5') as fn:
        a.to_hdf(fn, '/data1')
        a.to_hdf(fn, '/data2')
        a.to_hdf(fn, '/data*', mode='a', append=False)
        out = dd.read_hdf(fn, '/data*')
        eq(df.append(df), out)
Example #28
def test_read_hdf_doesnt_segfault():
    with tmpfile("h5") as fn:
        N = 40
        df = pd.DataFrame(np.random.randn(N, 3))
        with pd.HDFStore(fn, mode="w") as store:
            store.append("/x", df)

        ddf = dd.read_hdf(fn, "/x", chunksize=2)
        assert len(ddf) == N
Example #29
def test_to_csv_gzip():
    df = pd.DataFrame({"x": ["a", "b", "c", "d"], "y": [1, 2, 3, 4]}, index=[1.0, 2.0, 3.0, 4.0])

    for npartitions in [1, 2]:
        a = dd.from_pandas(df, npartitions)
        with tmpfile("csv") as fn:
            a.to_csv(fn, compression="gzip")
            result = pd.read_csv(fn, index_col=0, compression="gzip")
            tm.assert_frame_equal(result, df)
Example #30
def test_from_bcolz_filename():
    bcolz = pytest.importorskip("bcolz")

    with tmpfile(".bcolz") as fn:
        t = bcolz.ctable([[1, 2, 3], [1.0, 2.0, 3.0], ["a", "b", "a"]], names=["x", "y", "a"], rootdir=fn)
        t.flush()

        d = dd.from_bcolz(fn, chunksize=2)
        assert list(d.x.compute()) == [1, 2, 3]
Example #31
def test_to_hdf_multiple_datasets():
    pytest.importorskip('tables')
    df = pd.DataFrame({'x': ['a', 'b', 'c', 'd'],
                       'y': [1, 2, 3, 4]}, index=[1., 2., 3., 4.])
    a = dd.from_pandas(df, 2)
    df16 = pd.DataFrame({'x': ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p'],
                         'y': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]},
                        index=[1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16.])
    b = dd.from_pandas(df16, 16)

    # saving to multiple datasets making sure order is kept
    with tmpfile('h5') as fn:
        b.to_hdf(fn, '/data*')
        out = dd.read_hdf(fn, '/data*')
        eq(df16, out)

    # saving to multiple datasets
    with tmpfile('h5') as fn:
        a.to_hdf(fn, '/data*')
        out = dd.read_hdf(fn, '/data*')
        eq(df, out)

    # saving to multiple files
    with tmpdir() as dn:
        fn = os.path.join(dn, 'data_*.h5')
        a.to_hdf(fn, '/data')
        out = dd.read_hdf(fn, '/data')
        eq(df, out)

    # saving to multiple datasets with custom name_function
    with tmpfile('h5') as fn:
        a.to_hdf(fn, '/data_*', name_function=lambda i: 'a' * (i + 1))
        out = dd.read_hdf(fn, '/data_*')
        eq(df, out)

        out = pd.read_hdf(fn, '/data_a')
        tm.assert_frame_equal(out, df.iloc[:2])
        out = pd.read_hdf(fn, '/data_aa')
        tm.assert_frame_equal(out, df.iloc[2:])

    # saving to multiple files with custom name_function
    with tmpdir() as dn:
        fn = os.path.join(dn, 'data_*.h5')
        a.to_hdf(fn, '/data', name_function=lambda i: 'a' * (i + 1))
        out = dd.read_hdf(fn, '/data')
        eq(df, out)

        out = pd.read_hdf(os.path.join(dn, 'data_a.h5'), '/data')
        tm.assert_frame_equal(out, df.iloc[:2])
        out = pd.read_hdf(os.path.join(dn, 'data_aa.h5'), '/data')
        tm.assert_frame_equal(out, df.iloc[2:])

    # triggering too many asterisks error
    with tmpdir() as dn:
        with pytest.raises(ValueError):
            fn = os.path.join(dn, 'data_*.h5')
            a.to_hdf(fn, '/data_*')

    # triggering too many asterisks error
    with tmpfile() as fn:
        with pd.HDFStore(fn) as hdf:
            with pytest.raises(ValueError):
                a.to_hdf(hdf, '/data_*_*')

    # test hdf object
    with tmpfile('h5') as fn:
        with pd.HDFStore(fn) as hdf:
            a.to_hdf(hdf, '/data*')
            out = dd.read_hdf(fn, '/data*')
            eq(df, out)
Example #32
def test_to_hdf_lock_delays():
    pytest.importorskip("tables")
    df16 = pd.DataFrame(
        {
            "x": [
                "a",
                "b",
                "c",
                "d",
                "e",
                "f",
                "g",
                "h",
                "i",
                "j",
                "k",
                "l",
                "m",
                "n",
                "o",
                "p",
            ],
            "y": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16],
        },
        index=[
            1.0,
            2.0,
            3.0,
            4.0,
            5.0,
            6.0,
            7.0,
            8.0,
            9.0,
            10.0,
            11.0,
            12.0,
            13.0,
            14.0,
            15.0,
            16.0,
        ],
    )
    a = dd.from_pandas(df16, 16)

    # add artificial delays so that the last tasks finish first,
    # simulating tasks completing out of order
    def delayed_nop(i):
        if i[1] < 10:
            sleep(0.1 * (10 - i[1]))
        return i

    # saving to multiple hdf nodes
    with tmpfile() as fn:
        a = a.apply(delayed_nop, axis=1, meta=a)
        a.to_hdf(fn, "/data*")
        out = dd.read_hdf(fn, "/data*")
        assert_eq(df16, out)

    # saving to multiple hdf files
    # adding artificial delays to make sure last tasks finish first
    with tmpdir() as dn:
        fn = os.path.join(dn, "data*")
        a = a.apply(delayed_nop, axis=1, meta=a)
        a.to_hdf(fn, "/data")
        out = dd.read_hdf(fn, "/data")
        assert_eq(df16, out)
Example #33
def test_to_hdf_schedulers(scheduler, npartitions):
    pytest.importorskip("tables")
    df = pd.DataFrame(
        {
            "x": [
                "a",
                "b",
                "c",
                "d",
                "e",
                "f",
                "g",
                "h",
                "i",
                "j",
                "k",
                "l",
                "m",
                "n",
                "o",
                "p",
            ],
            "y": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16],
        },
        index=[
            1.0,
            2.0,
            3.0,
            4.0,
            5.0,
            6.0,
            7.0,
            8.0,
            9.0,
            10.0,
            11.0,
            12.0,
            13.0,
            14.0,
            15.0,
            16.0,
        ],
    )
    a = dd.from_pandas(df, npartitions=npartitions)

    # test single file single node
    with tmpfile("h5") as fn:
        a.to_hdf(fn, "/data", scheduler=scheduler)
        out = pd.read_hdf(fn, "/data")
        assert_eq(df, out)

    # test multiple files single node
    with tmpdir() as dn:
        fn = os.path.join(dn, "data_*.h5")
        a.to_hdf(fn, "/data", scheduler=scheduler)
        out = dd.read_hdf(fn, "/data")
        assert_eq(df, out)

    # test single file multiple nodes
    with tmpfile("h5") as fn:
        a.to_hdf(fn, "/data*", scheduler=scheduler)
        out = dd.read_hdf(fn, "/data*")
        assert_eq(df, out)
Example #34
def test_to_hdf_multiple_nodes():
    pytest.importorskip("tables")
    df = pd.DataFrame({"x": ["a", "b", "c", "d"],
                       "y": [1, 2, 3, 4]},
                      index=[1.0, 2.0, 3.0, 4.0])
    a = dd.from_pandas(df, 2)
    df16 = pd.DataFrame(
        {
            "x": [
                "a",
                "b",
                "c",
                "d",
                "e",
                "f",
                "g",
                "h",
                "i",
                "j",
                "k",
                "l",
                "m",
                "n",
                "o",
                "p",
            ],
            "y": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16],
        },
        index=[
            1.0,
            2.0,
            3.0,
            4.0,
            5.0,
            6.0,
            7.0,
            8.0,
            9.0,
            10.0,
            11.0,
            12.0,
            13.0,
            14.0,
            15.0,
            16.0,
        ],
    )
    b = dd.from_pandas(df16, 16)

    # saving to multiple nodes
    with tmpfile("h5") as fn:
        a.to_hdf(fn, "/data*")
        out = dd.read_hdf(fn, "/data*")
        assert_eq(df, out)

    # saving to multiple nodes making sure order is kept
    with tmpfile("h5") as fn:
        b.to_hdf(fn, "/data*")
        out = dd.read_hdf(fn, "/data*")
        assert_eq(df16, out)

    # saving to multiple datasets with custom name_function
    with tmpfile("h5") as fn:
        a.to_hdf(fn, "/data_*", name_function=lambda i: "a" * (i + 1))
        out = dd.read_hdf(fn, "/data_*")
        assert_eq(df, out)

        out = pd.read_hdf(fn, "/data_a")
        tm.assert_frame_equal(out, df.iloc[:2])
        out = pd.read_hdf(fn, "/data_aa")
        tm.assert_frame_equal(out, df.iloc[2:])

    # test multiple nodes with hdf object
    with tmpfile("h5") as fn:
        with pd.HDFStore(fn) as hdf:
            b.to_hdf(hdf, "/data*")
            out = dd.read_hdf(fn, "/data*")
            assert_eq(df16, out)
Example #35
def db():
    with tmpfile() as f:
        uri = "sqlite:///%s" % f
        df.to_sql("test", uri, index=True, if_exists="replace")
        yield uri
Example #36
def tmp_db_uri():
    with tmpfile() as f:
        yield "sqlite:///%s" % f
Example #37
def test_roundtrip_from_pandas(engine):
    with tmpfile() as fn:
        df = pd.DataFrame({'x': [1, 2, 3]})
        fastparquet.write(fn, df)
        ddf = dd.io.parquet.read_parquet(fn, index=False, engine=engine)
        assert_eq(df, ddf)
Example #38
def test_to_hdf():
    pytest.importorskip('tables')
    df = pd.DataFrame({'x': ['a', 'b', 'c', 'd'],
                       'y': [1, 2, 3, 4]},
                      index=[1., 2., 3., 4.])
    a = dd.from_pandas(df, 2)

    with tmpfile('h5') as fn:
        a.to_hdf(fn, '/data')
        out = pd.read_hdf(fn, '/data')
        tm.assert_frame_equal(df, out[:])

    with tmpfile('h5') as fn:
        a.x.to_hdf(fn, '/data')
        out = pd.read_hdf(fn, '/data')
        tm.assert_series_equal(df.x, out[:])

    a = dd.from_pandas(df, 1)
    with tmpfile('h5') as fn:
        a.to_hdf(fn, '/data')
        out = pd.read_hdf(fn, '/data')
        tm.assert_frame_equal(df, out[:])

    # saving to multiple datasets
    a = dd.from_pandas(df, 2)
    with tmpfile('h5') as fn:
        a.to_hdf(fn, '/data*')
        out = dd.read_hdf(fn, '/data*')
        tm.assert_frame_equal(df, out.compute())

    # saving to multiple files
    a = dd.from_pandas(df, 2)
    with tmpdir() as dn:
        fn = os.path.join(dn, 'data_*.h5')
        a.to_hdf(fn, '/data')
        out = dd.read_hdf(fn, '/data')
        tm.assert_frame_equal(df, out.compute())

    # saving to multiple datasets with custom name_function
    a = dd.from_pandas(df, 2)
    with tmpfile('h5') as fn:
        a.to_hdf(fn, '/data_*', name_function=lambda i: 'a' * (i + 1))
        out = dd.read_hdf(fn, '/data_*')
        tm.assert_frame_equal(df, out.compute())

        out = pd.read_hdf(fn, '/data_a')
        tm.assert_frame_equal(out, df.iloc[:2])
        out = pd.read_hdf(fn, '/data_aa')
        tm.assert_frame_equal(out, df.iloc[2:])

    # saving to multiple files with custom name_function
    a = dd.from_pandas(df, 2)
    with tmpdir() as dn:
        fn = os.path.join(dn, 'data_*.h5')
        a.to_hdf(fn, '/data', name_function=lambda i: 'a' * (i + 1))
        out = dd.read_hdf(fn, '/data')
        tm.assert_frame_equal(df, out.compute())

        out = pd.read_hdf(os.path.join(dn, 'data_a.h5'), '/data')
        tm.assert_frame_equal(out, df.iloc[:2])
        out = pd.read_hdf(os.path.join(dn, 'data_aa.h5'), '/data')
        tm.assert_frame_equal(out, df.iloc[2:])

    # an asterisk in both the file path and the dataset key is ambiguous and raises ValueError
    a = dd.from_pandas(df, 2)
    with tmpdir() as dn:
        with pytest.raises(ValueError):
            fn = os.path.join(dn, 'data_*.h5')
            a.to_hdf(fn, '/data_*', name_function=lambda i: 'a' * (i + 1))
Example #39
def test_to_hdf_link_optimizations():
    """testing dask link levels is correct by calculating the depth of the dask graph"""
    pytest.importorskip("tables")
    df16 = pd.DataFrame(
        {
            "x": [
                "a",
                "b",
                "c",
                "d",
                "e",
                "f",
                "g",
                "h",
                "i",
                "j",
                "k",
                "l",
                "m",
                "n",
                "o",
                "p",
            ],
            "y": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16],
        },
        index=[
            1.0,
            2.0,
            3.0,
            4.0,
            5.0,
            6.0,
            7.0,
            8.0,
            9.0,
            10.0,
            11.0,
            12.0,
            13.0,
            14.0,
            15.0,
            16.0,
        ],
    )
    a = dd.from_pandas(df16, 16)

    # saving to multiple hdf files, no links are needed
    # expected layers: from_pandas, to_hdf, list = depth of 3
    with tmpdir() as dn:
        fn = os.path.join(dn, "data*")
        d = a.to_hdf(fn, "/data", compute=False)
        assert dependency_depth(d.dask) == 3

    # saving to a single hdf file with multiple nodes
    # all subsequent nodes depend on the first
    # expected layers: from_pandas, first to_hdf (creates file and node), subsequent to_hdfs, list = 4
    with tmpfile() as fn:
        d = a.to_hdf(fn, "/data*", compute=False)
        assert dependency_depth(d.dask) == 4

    # saving to a single hdf file with a single node
    # every node depends on the previous node
    # expected layers: from_pandas, to_hdf once per partition, list = 2 + npartitions
    with tmpfile() as fn:
        d = a.to_hdf(fn, "/data", compute=False)
        assert dependency_depth(d.dask) == 2 + a.npartitions
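
All of these examples lean on dask's tmpfile and tmpdir context managers, which yield a temporary path and clean it up on exit. A rough behavioral sketch, assuming simplified semantics rather than dask's actual implementation:

import os
import shutil
import tempfile
from contextlib import contextmanager

@contextmanager
def tmpfile(extension=''):
    # yield a fresh path (with optional extension) that does not exist yet,
    # then remove whatever the caller created there
    suffix = '.' + extension.lstrip('.') if extension else ''
    handle, path = tempfile.mkstemp(suffix=suffix)
    os.close(handle)
    os.remove(path)
    try:
        yield path
    finally:
        if os.path.isdir(path):
            shutil.rmtree(path, ignore_errors=True)
        elif os.path.exists(path):
            os.remove(path)

@contextmanager
def tmpdir():
    # yield an existing temporary directory, removed recursively on exit
    path = tempfile.mkdtemp()
    try:
        yield path
    finally:
        shutil.rmtree(path, ignore_errors=True)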