Esempio n. 1
0
def test_from_bcolz():
    """Load a small in-memory ctable with ``dd.from_bcolz`` and check it.

    Covers the partition count for ``chunksize=2``, the dtype reported for
    the string column, the computed column values, and the index obtained
    when column ``x`` is requested as the index.
    """
    bcolz = pytest.importorskip('bcolz')

    columns = [[1, 2, 3], [1., 2., 3.], ['a', 'b', 'a']]
    table = bcolz.ctable(columns, names=['x', 'y', 'a'])

    frame = dd.from_bcolz(table, chunksize=2)
    assert frame.npartitions == 2
    assert str(frame.dtypes['a']) == 'category'
    assert list(frame.x.compute(get=get_sync)) == [1, 2, 3]
    assert list(frame.a.compute(get=get_sync)) == ['a', 'b', 'a']

    indexed = dd.from_bcolz(table, chunksize=2, index='x')
    index_values = list(indexed.index.compute(get=get_sync))
    # Either ordering of the last two index values is accepted.
    assert index_values in ([1, 2, 3], [1, 3, 2])
Esempio n. 2
0
def test_from_bcolz_no_lock():
    """Check that ``lock=False`` keeps lock objects out of the graph.

    Three frames are built from the same ctable (default ``lock``,
    ``lock=True`` and ``lock=False``). They must compare equal, and no
    element of any value in the ``lock=False`` graph may be an instance of
    the type returned by ``Lock()``.
    """
    bcolz = pytest.importorskip("bcolz")
    lock_type = type(Lock())

    data = [[1, 2, 3], [1.0, 2.0, 3.0], ["a", "b", "a"]]
    table = bcolz.ctable(data, names=["x", "y", "a"], chunklen=2)

    default_lock = dd.from_bcolz(table, chunksize=2)
    with_lock = dd.from_bcolz(table, chunksize=2, lock=True)
    without_lock = dd.from_bcolz(table, chunksize=2, lock=False)
    assert_eq(default_lock, with_lock)
    assert_eq(default_lock, without_lock)

    # Walk every graph value and every element inside it.
    for task in without_lock.dask.values():
        for item in task:
            assert not isinstance(item, lock_type)
Esempio n. 3
0
def test_from_bcolz():
    """Exercise ``dd.from_bcolz`` on an in-memory ctable.

    Checks the known-dtype flag, partition count, column dtype and values,
    the default and the explicit index, and finally how the graph keys
    react to the chunksize and to rows appended to the table.
    """
    bcolz = pytest.importorskip('bcolz')

    def graph_keys(source, chunksize):
        # Sorted keys of a freshly built graph for `source`.
        return sorted(dd.from_bcolz(source, chunksize=chunksize).dask)

    columns = [[1, 2, 3], [1., 2., 3.], ['a', 'b', 'a']]
    table = bcolz.ctable(columns, names=['x', 'y', 'a'])

    frame = dd.from_bcolz(table, chunksize=2)
    assert frame._known_dtype
    assert frame.npartitions == 2
    assert str(frame.dtypes['a']) == 'category'
    assert list(frame.x.compute(get=get_sync)) == [1, 2, 3]
    assert list(frame.a.compute(get=get_sync)) == ['a', 'b', 'a']
    default_index = list(frame.index.compute(get=get_sync))
    assert default_index == [0, 1, 2]

    indexed = dd.from_bcolz(table, chunksize=2, index='x')
    explicit_index = list(indexed.index.compute(get=get_sync))
    assert explicit_index in ([1, 2, 3], [1, 3, 2])

    # Names: identical arguments give identical keys, a different
    # chunksize gives different keys.
    assert graph_keys(table, 2) == graph_keys(table, 2)
    assert graph_keys(table, 2) != graph_keys(table, 3)

    graph_before_append = dd.from_bcolz(table, chunksize=3).dask

    table.append((4, 4., 'b'))
    table.flush()

    # After appending a row the keys must differ from the earlier graph.
    assert graph_keys(table, 2) != sorted(graph_before_append)
Esempio n. 4
0
def test_from_bcolz():
    """Exercise ``dd.from_bcolz`` on an in-memory ctable.

    Checks partition count, the categorical dtype of column ``a``, column
    values, the default and the explicit index, and the behaviour of the
    graph keys under a changed chunksize and after an append.
    """
    bcolz = pytest.importorskip("bcolz")

    data = [[1, 2, 3], [1.0, 2.0, 3.0], ["a", "b", "a"]]
    table = bcolz.ctable(data, names=["x", "y", "a"])

    frame = dd.from_bcolz(table, chunksize=2)
    assert frame.npartitions == 2
    assert is_categorical_dtype(frame.dtypes["a"])
    assert list(frame.x.compute(scheduler="sync")) == [1, 2, 3]
    assert list(frame.a.compute(scheduler="sync")) == ["a", "b", "a"]
    default_index = list(frame.index.compute(scheduler="sync"))
    assert default_index == [0, 1, 2]

    indexed = dd.from_bcolz(table, chunksize=2, index="x")
    explicit_index = list(indexed.index.compute(scheduler="sync"))
    assert explicit_index in ([1, 2, 3], [1, 3, 2])

    # Names: the same arguments must yield the same graph keys ...
    keys_first = sorted(dd.from_bcolz(table, chunksize=2).dask)
    keys_repeat = sorted(dd.from_bcolz(table, chunksize=2).dask)
    assert keys_first == keys_repeat
    # ... while a different chunksize must yield different ones.
    keys_chunk2 = sorted(dd.from_bcolz(table, chunksize=2).dask)
    keys_chunk3 = sorted(dd.from_bcolz(table, chunksize=3).dask)
    assert keys_chunk2 != keys_chunk3

    graph_before_append = dd.from_bcolz(table, chunksize=3).dask

    table.append((4, 4.0, "b"))
    table.flush()

    # Keys built after the append must differ from the earlier graph.
    keys_after_append = sorted(dd.from_bcolz(table, chunksize=2).dask)
    assert keys_after_append != sorted(graph_before_append)
Esempio n. 5
0
def test_from_bcolz():
    """Exercise ``dd.from_bcolz`` on an in-memory ctable.

    Checks partition count, the dtype string of column ``a``, column
    values, the index built from column ``x``, and how graph keys respond
    to the chunksize and to an appended row.
    """
    bcolz = pytest.importorskip('bcolz')

    def graph_keys(chunksize):
        # Sorted keys of a freshly built graph over `table`.
        return sorted(dd.from_bcolz(table, chunksize=chunksize).dask)

    columns = [[1, 2, 3], [1., 2., 3.], ['a', 'b', 'a']]
    table = bcolz.ctable(columns, names=['x', 'y', 'a'])

    frame = dd.from_bcolz(table, chunksize=2)
    assert frame.npartitions == 2
    assert str(frame.dtypes['a']) == 'category'
    assert list(frame.x.compute(get=get_sync)) == [1, 2, 3]
    assert list(frame.a.compute(get=get_sync)) == ['a', 'b', 'a']

    indexed = dd.from_bcolz(table, chunksize=2, index='x')
    index_values = list(indexed.index.compute(get=get_sync))
    assert index_values in ([1, 2, 3], [1, 3, 2])

    # Names: stable for equal arguments, different across chunksizes.
    assert graph_keys(2) == graph_keys(2)
    assert graph_keys(2) != graph_keys(3)

    graph_before_append = dd.from_bcolz(table, chunksize=3).dask

    table.append((4, 4., 'b'))
    table.flush()

    # The appended row must be reflected in the graph keys.
    assert graph_keys(2) != sorted(graph_before_append)
Esempio n. 6
0
def test_from_bcolz():
    """Exercise ``dd.from_bcolz`` on an in-memory ctable.

    Checks partition count, the categorical dtype of column ``a``, column
    values, the default and the explicit index, and the behaviour of the
    graph keys under a changed chunksize and after an append.
    """
    bcolz = pytest.importorskip('bcolz')

    columns = [[1, 2, 3], [1., 2., 3.], ['a', 'b', 'a']]
    table = bcolz.ctable(columns, names=['x', 'y', 'a'])

    frame = dd.from_bcolz(table, chunksize=2)
    assert frame.npartitions == 2
    assert is_categorical_dtype(frame.dtypes['a'])
    assert list(frame.x.compute(scheduler='sync')) == [1, 2, 3]
    assert list(frame.a.compute(scheduler='sync')) == ['a', 'b', 'a']
    default_index = list(frame.index.compute(scheduler='sync'))
    assert default_index == [0, 1, 2]

    indexed = dd.from_bcolz(table, chunksize=2, index='x')
    explicit_index = list(indexed.index.compute(scheduler='sync'))
    assert explicit_index in ([1, 2, 3], [1, 3, 2])

    # Names: equal arguments -> equal keys.
    keys_first = sorted(dd.from_bcolz(table, chunksize=2).dask)
    keys_repeat = sorted(dd.from_bcolz(table, chunksize=2).dask)
    assert keys_first == keys_repeat
    # Names: different chunksize -> different keys.
    keys_chunk2 = sorted(dd.from_bcolz(table, chunksize=2).dask)
    keys_chunk3 = sorted(dd.from_bcolz(table, chunksize=3).dask)
    assert keys_chunk2 != keys_chunk3

    graph_before_append = dd.from_bcolz(table, chunksize=3).dask

    table.append((4, 4., 'b'))
    table.flush()

    # Names: appended data -> keys differ from the earlier graph.
    keys_after_append = sorted(dd.from_bcolz(table, chunksize=2).dask)
    assert keys_after_append != sorted(graph_before_append)
Esempio n. 7
0
def test_from_bcolz_no_lock():
    """Check that ``lock=False`` keeps lock objects out of the graph.

    Frames built with the default lock, ``lock=True`` and ``lock=False``
    must compare equal, and the ``lock=False`` graph must not contain any
    element whose type is the type returned by ``Lock()``.
    """
    bcolz = pytest.importorskip('bcolz')
    lock_type = type(Lock())

    columns = [[1, 2, 3], [1., 2., 3.], ['a', 'b', 'a']]
    table = bcolz.ctable(columns, names=['x', 'y', 'a'], chunklen=2)

    default_lock = dd.from_bcolz(table, chunksize=2)
    with_lock = dd.from_bcolz(table, chunksize=2, lock=True)
    without_lock = dd.from_bcolz(table, chunksize=2, lock=False)
    assert_eq(default_lock, with_lock)
    assert_eq(default_lock, without_lock)

    # Inspect each element of each graph value.
    for task in without_lock.dask.values():
        for item in task:
            assert not isinstance(item, lock_type)
Esempio n. 8
0
def test_from_bcolz_no_lock():
    """Check that ``lock=False`` keeps lock objects out of the graph.

    Frames built with the default lock, ``lock=True`` and ``lock=False``
    are compared with ``eq``; afterwards every element of every value in
    the ``lock=False`` graph is checked against the ``Lock()`` type.
    """
    bcolz = pytest.importorskip('bcolz')
    lock_type = type(Lock())

    columns = [[1, 2, 3], [1., 2., 3.], ['a', 'b', 'a']]
    table = bcolz.ctable(columns, names=['x', 'y', 'a'], chunklen=2)

    default_lock = dd.from_bcolz(table, chunksize=2)
    with_lock = dd.from_bcolz(table, chunksize=2, lock=True)
    without_lock = dd.from_bcolz(table, chunksize=2, lock=False)
    eq(default_lock, with_lock)
    eq(default_lock, without_lock)

    # Inspect each element of each graph value.
    for task in without_lock.dask.values():
        for item in task:
            assert not isinstance(item, lock_type)
Esempio n. 9
0
def test_from_bcolz_column_order():
    """Columns of a frame from ``dd.from_bcolz`` keep the ctable's order."""
    bcolz = pytest.importorskip('bcolz')

    columns = [[1, 2, 3], [1., 2., 3.], ['a', 'b', 'a']]
    table = bcolz.ctable(columns, names=['x', 'y', 'a'])
    frame = dd.from_bcolz(table, chunksize=2)

    # Select label 0 and look at the column labels of the computed result.
    first_row = frame.loc[0].compute()
    assert list(first_row.columns) == ['x', 'y', 'a']
Esempio n. 10
0
def test_from_bcolz_column_order():
    """Columns of a frame from ``dd.from_bcolz`` keep the ctable's order."""
    bcolz = pytest.importorskip("bcolz")

    data = [[1, 2, 3], [1.0, 2.0, 3.0], ["a", "b", "a"]]
    table = bcolz.ctable(data, names=["x", "y", "a"])
    frame = dd.from_bcolz(table, chunksize=2)

    # Select label 0 and look at the column labels of the computed result.
    first_row = frame.loc[0].compute()
    assert list(first_row.columns) == ["x", "y", "a"]
Esempio n. 11
0
def test_from_bcolz():
    """Load an in-memory bcolz ctable with ``dd.from_bcolz`` and verify it.

    Checks the partition count for ``chunksize=2``, the dtype string of
    column ``a``, the computed values of columns ``x`` and ``a``, and the
    index obtained when column ``x`` is requested as the index.

    When bcolz is not installed the test is reported as skipped instead of
    returning early and being counted as a pass.
    """
    # Local import keeps this block self-contained; it is harmless if the
    # module already imports pytest at the top.
    import pytest

    # importorskip skips the test when bcolz is missing, so the report shows
    # that nothing was verified.
    bcolz = pytest.importorskip('bcolz')

    t = bcolz.ctable([[1, 2, 3], [1., 2., 3.], ['a', 'b', 'a']],
                     names=['x', 'y', 'a'])
    d = dd.from_bcolz(t, chunksize=2)
    assert d.npartitions == 2
    assert str(d.dtypes['a']) == 'category'
    assert list(d.x.compute(get=dask.get)) == [1, 2, 3]
    assert list(d.a.compute(get=dask.get)) == ['a', 'b', 'a']

    # Use column 'x' as the index and check the computed index values.
    d = dd.from_bcolz(t, chunksize=2, index='x')
    assert list(d.index.compute()) == [1, 2, 3]
Esempio n. 12
0
def test_from_bcolz():
    """Load an in-memory bcolz ctable with ``dd.from_bcolz`` and verify it.

    Checks the partition count for ``chunksize=2``, the dtype string of
    column ``a``, the computed values of columns ``x`` and ``a``, and the
    index obtained when column ``x`` is requested as the index.

    A missing bcolz installation results in a skipped test rather than an
    early return that would be counted as a pass.
    """
    # Function-scope import so the block does not depend on the module's
    # import section; redundant but harmless if pytest is imported there.
    import pytest

    # Skip (not pass) when the optional dependency is unavailable.
    bcolz = pytest.importorskip('bcolz')

    t = bcolz.ctable([[1, 2, 3], [1., 2., 3.], ['a', 'b', 'a']],
                     names=['x', 'y', 'a'])
    d = dd.from_bcolz(t, chunksize=2)
    assert d.npartitions == 2
    assert str(d.dtypes['a']) == 'category'
    assert list(d.x.compute(get=dask.get)) == [1, 2, 3]
    assert list(d.a.compute(get=dask.get)) == ['a', 'b', 'a']

    # Use column 'x' as the index and check the computed index values.
    d = dd.from_bcolz(t, chunksize=2, index='x')
    assert list(d.index.compute()) == [1, 2, 3]
Esempio n. 13
0
def test_from_bcolz_column_order():
    """The frame from ``dd.from_bcolz`` lists columns as ``x``, ``y``, ``a``."""
    bcolz = pytest.importorskip('bcolz')

    values = [[1, 2, 3], [1., 2., 3.], ['a', 'b', 'a']]
    ctable = bcolz.ctable(values, names=['x', 'y', 'a'])
    ddf = dd.from_bcolz(ctable, chunksize=2)

    # Compute the selection for label 0, then read its column labels.
    selected = ddf.loc[0].compute()
    assert list(selected.columns) == ['x', 'y', 'a']
Esempio n. 14
0
def test_from_bcolz_filename():
    """``dd.from_bcolz`` accepts the path of an on-disk ctable."""
    bcolz = pytest.importorskip("bcolz")

    data = [[1, 2, 3], [1.0, 2.0, 3.0], ["a", "b", "a"]]
    with tmpfile(".bcolz") as path:
        # Create the ctable rooted at the temporary path and flush it.
        table = bcolz.ctable(data, names=["x", "y", "a"], rootdir=path)
        table.flush()

        # Pass the path itself, not the ctable object.
        frame = dd.from_bcolz(path, chunksize=2)
        assert list(frame.x.compute()) == [1, 2, 3]
Esempio n. 15
0
def test_from_bcolz_filename():
    """``dd.from_bcolz`` accepts the path of an on-disk ctable."""
    bcolz = pytest.importorskip('bcolz')

    columns = [[1, 2, 3], [1., 2., 3.], ['a', 'b', 'a']]
    with tmpfile('.bcolz') as path:
        # Create the ctable rooted at the temporary path and flush it.
        table = bcolz.ctable(columns, names=['x', 'y', 'a'], rootdir=path)
        table.flush()

        # Pass the path itself, not the ctable object.
        frame = dd.from_bcolz(path, chunksize=2)
        assert list(frame.x.compute()) == [1, 2, 3]
Esempio n. 16
0
def test_from_bcolz_filename():
    """Read a ctable back through ``dd.from_bcolz`` using only its path."""
    bcolz = pytest.importorskip('bcolz')

    values = [[1, 2, 3], [1., 2., 3.], ['a', 'b', 'a']]
    column_names = ['x', 'y', 'a']
    with tmpfile('.bcolz') as rootdir:
        # Build the ctable at `rootdir` and flush before reading it back.
        ctable = bcolz.ctable(values, names=column_names, rootdir=rootdir)
        ctable.flush()

        ddf = dd.from_bcolz(rootdir, chunksize=2)
        x_values = list(ddf.x.compute())
        assert x_values == [1, 2, 3]
Esempio n. 17
0
def test_from_bcolz_filename():
    """``dd.from_bcolz`` accepts the path of an on-disk ctable.

    A ctable is created under a temporary ``.bcolz`` path, flushed, and then
    loaded by passing that path to ``dd.from_bcolz``.

    When bcolz is not installed the test is reported as skipped instead of
    returning early and being counted as a pass.
    """
    # Local import keeps this block self-contained; it is harmless if the
    # module already imports pytest at the top.
    import pytest

    # importorskip skips the test when bcolz is missing, so the report shows
    # that nothing was verified.
    bcolz = pytest.importorskip('bcolz')

    with tmpfile('.bcolz') as fn:
        t = bcolz.ctable([[1, 2, 3], [1., 2., 3.], ['a', 'b', 'a']],
                         names=['x', 'y', 'a'],
                         rootdir=fn)
        t.flush()

        # Pass the path itself, not the ctable object.
        d = dd.from_bcolz(fn, chunksize=2)
        assert list(d.x.compute()) == [1, 2, 3]
Esempio n. 18
0
def test_from_bcolz_filename():
    """Read a ctable back through ``dd.from_bcolz`` using only its path.

    A ctable is created under a temporary ``.bcolz`` path, flushed, and then
    loaded by passing that path to ``dd.from_bcolz``.

    A missing bcolz installation results in a skipped test rather than an
    early return that would be counted as a pass.
    """
    # Function-scope import so the block does not depend on the module's
    # import section; redundant but harmless if pytest is imported there.
    import pytest

    # Skip (not pass) when the optional dependency is unavailable.
    bcolz = pytest.importorskip('bcolz')

    with tmpfile('.bcolz') as fn:
        t = bcolz.ctable([[1, 2, 3], [1., 2., 3.], ['a', 'b', 'a']],
                         names=['x', 'y', 'a'],
                         rootdir=fn)
        t.flush()

        # Pass the path itself, not the ctable object.
        d = dd.from_bcolz(fn, chunksize=2)
        assert list(d.x.compute()) == [1, 2, 3]
Esempio n. 19
0
def test_from_bcolz_filename():
    """Read a ctable back through ``dd.from_bcolz`` using only its path."""
    bcolz = pytest.importorskip("bcolz")

    data = [[1, 2, 3], [1.0, 2.0, 3.0], ["a", "b", "a"]]
    column_names = ["x", "y", "a"]
    with tmpfile(".bcolz") as rootdir:
        # Build the ctable at `rootdir` and flush before reading it back.
        ctable = bcolz.ctable(data, names=column_names, rootdir=rootdir)
        ctable.flush()

        ddf = dd.from_bcolz(rootdir, chunksize=2)
        x_values = list(ddf.x.compute())
        assert x_values == [1, 2, 3]
Esempio n. 20
0
def test_from_bcolz():
    """Exercise ``dd.from_bcolz`` on an in-memory ctable.

    Checks partition count, the dtype string of column ``a``, column
    values, the index built from column ``x``, and how graph keys respond
    to the chunksize and to an appended row.
    """
    bcolz = pytest.importorskip("bcolz")

    def graph_keys(chunksize):
        # Sorted keys of a freshly built graph over `table`.
        return sorted(dd.from_bcolz(table, chunksize=chunksize).dask)

    data = [[1, 2, 3], [1.0, 2.0, 3.0], ["a", "b", "a"]]
    table = bcolz.ctable(data, names=["x", "y", "a"])

    frame = dd.from_bcolz(table, chunksize=2)
    assert frame.npartitions == 2
    assert str(frame.dtypes["a"]) == "category"
    assert list(frame.x.compute(get=get_sync)) == [1, 2, 3]
    assert list(frame.a.compute(get=get_sync)) == ["a", "b", "a"]

    indexed = dd.from_bcolz(table, chunksize=2, index="x")
    index_values = list(indexed.index.compute(get=get_sync))
    assert index_values in ([1, 2, 3], [1, 3, 2])

    # Names: stable for equal arguments, different across chunksizes.
    assert graph_keys(2) == graph_keys(2)
    assert graph_keys(2) != graph_keys(3)

    graph_before_append = dd.from_bcolz(table, chunksize=3).dask

    table.append((4, 4.0, "b"))
    table.flush()

    # The appended row must be reflected in the graph keys.
    assert graph_keys(2) != sorted(graph_before_append)
Esempio n. 21
0
    def check(i):
        """Run the ``dd.from_bcolz`` assertions once; ``i`` is not used."""
        data = [[1, 2, 3], [1.0, 2.0, 3.0], ["a", "b", "a"]]
        table = bcolz.ctable(data, names=["x", "y", "a"])

        frame = dd.from_bcolz(table, chunksize=2)
        assert frame.npartitions == 2
        assert is_categorical_dtype(frame.dtypes["a"])
        assert list(frame.x.compute(scheduler="sync")) == [1, 2, 3]
        assert list(frame.a.compute(scheduler="sync")) == ["a", "b", "a"]

        indexed = dd.from_bcolz(table, chunksize=2, index="x")
        index_values = list(indexed.index.compute(scheduler="sync"))
        assert index_values in ([1, 2, 3], [1, 3, 2])

        # Names: equal arguments -> equal keys.
        keys_first = sorted(dd.from_bcolz(table, chunksize=2).dask)
        keys_repeat = sorted(dd.from_bcolz(table, chunksize=2).dask)
        assert keys_first == keys_repeat
        # Names: different chunksize -> different keys.
        keys_chunk2 = sorted(dd.from_bcolz(table, chunksize=2).dask)
        keys_chunk3 = sorted(dd.from_bcolz(table, chunksize=3).dask)
        assert keys_chunk2 != keys_chunk3
Esempio n. 22
0
    def check():
        """Run the ``dd.from_bcolz`` assertions once on a fresh ctable."""
        columns = [[1, 2, 3], [1., 2., 3.], ['a', 'b', 'a']]
        table = bcolz.ctable(columns, names=['x', 'y', 'a'])

        frame = dd.from_bcolz(table, chunksize=2)
        assert frame.npartitions == 2
        assert str(frame.dtypes['a']) == 'category'
        assert list(frame.x.compute(get=get_sync)) == [1, 2, 3]
        assert list(frame.a.compute(get=get_sync)) == ['a', 'b', 'a']

        indexed = dd.from_bcolz(table, chunksize=2, index='x')
        index_values = list(indexed.index.compute(get=get_sync))
        assert index_values in ([1, 2, 3], [1, 3, 2])

        # Names: equal arguments -> equal keys.
        keys_first = sorted(dd.from_bcolz(table, chunksize=2).dask)
        keys_repeat = sorted(dd.from_bcolz(table, chunksize=2).dask)
        assert keys_first == keys_repeat
        # Names: different chunksize -> different keys.
        keys_chunk2 = sorted(dd.from_bcolz(table, chunksize=2).dask)
        keys_chunk3 = sorted(dd.from_bcolz(table, chunksize=3).dask)
        assert keys_chunk2 != keys_chunk3
Esempio n. 23
0
    def check():
        """Build a ctable and assert on the frame ``dd.from_bcolz`` returns."""
        def graph_keys(chunksize):
            # Sorted keys of a freshly built graph over `ctable`.
            return sorted(dd.from_bcolz(ctable, chunksize=chunksize).dask)

        values = [[1, 2, 3], [1., 2., 3.], ['a', 'b', 'a']]
        ctable = bcolz.ctable(values, names=['x', 'y', 'a'])

        ddf = dd.from_bcolz(ctable, chunksize=2)
        assert ddf.npartitions == 2
        assert str(ddf.dtypes['a']) == 'category'
        assert list(ddf.x.compute(get=get_sync)) == [1, 2, 3]
        assert list(ddf.a.compute(get=get_sync)) == ['a', 'b', 'a']

        by_x = dd.from_bcolz(ctable, chunksize=2, index='x')
        index_values = list(by_x.index.compute(get=get_sync))
        assert index_values in ([1, 2, 3], [1, 3, 2])

        # Names: stable for equal arguments, different across chunksizes.
        assert graph_keys(2) == graph_keys(2)
        assert graph_keys(2) != graph_keys(3)
Esempio n. 24
0
    def check(i):
        """Build a ctable and assert on its dask frame; ``i`` is not used."""
        def graph_keys(chunksize):
            # Sorted keys of a freshly built graph over `ctable`.
            return sorted(dd.from_bcolz(ctable, chunksize=chunksize).dask)

        values = [[1, 2, 3], [1., 2., 3.], ['a', 'b', 'a']]
        ctable = bcolz.ctable(values, names=['x', 'y', 'a'])

        ddf = dd.from_bcolz(ctable, chunksize=2)
        assert ddf.npartitions == 2
        assert is_categorical_dtype(ddf.dtypes['a'])
        assert list(ddf.x.compute(scheduler='sync')) == [1, 2, 3]
        assert list(ddf.a.compute(scheduler='sync')) == ['a', 'b', 'a']

        by_x = dd.from_bcolz(ctable, chunksize=2, index='x')
        index_values = list(by_x.index.compute(scheduler='sync'))
        assert index_values in ([1, 2, 3], [1, 3, 2])

        # Names: stable for equal arguments, different across chunksizes.
        assert graph_keys(2) == graph_keys(2)
        assert graph_keys(2) != graph_keys(3)
Esempio n. 25
0
    if len(sys.argv)>5: p.y           = sys.argv[5]
    if len(sys.argv)>6: p.categories  = sys.argv[6:]

# Build a dask Cache from p.cachesize and register it.
# NOTE(review): `p` is defined outside this excerpt; registering presumably
# enables dask's opportunistic caching for later computations — confirm
# against the dask.cache documentation.
from dask.cache import Cache
Cache(p.cachesize).register()


# File-type keys singled out as storing categories.
# NOTE(review): how this set is consumed is not visible in this excerpt —
# confirm against its users.
filetypes_storing_categories = {'parq','castra'}


# Reader dispatch table: read[filetype][library] -> callable(filepath, p).
# Every file type starts with an empty inner mapping; the outer keys are
# created in the order listed here.
read = odict([(f,odict()) for f in ["parq","bcolz","feather","castra","h5","csv"]])
               
# dask readers. The castra and bcolz entries do not use `p`; the bcolz entry
# passes a fixed chunksize of 1000000 rather than p.chunksize.
read["csv"]     ["dask"]   = lambda filepath,p:  dd.read_csv(filepath, usecols=p.columns)
read["h5"]      ["dask"]   = lambda filepath,p:  dd.read_hdf(filepath, p.base, chunksize=p.chunksize, columns=p.columns)
read["castra"]  ["dask"]   = lambda filepath,p:  dd.from_castra(filepath)
read["bcolz"]   ["dask"]   = lambda filepath,p:  dd.from_bcolz(filepath, chunksize=1000000)
read["parq"]    ["dask"]   = lambda filepath,p:  dd.io.parquet.read_parquet(filepath,index=False, categories=p.categories, columns=p.columns)

# pandas readers. The feather and parq entries do not use `p`, so no column
# selection is applied to them here.
read["csv"]     ["pandas"] = lambda filepath,p:  pd.read_csv(filepath, usecols=p.columns)
read["h5"]      ["pandas"] = lambda filepath,p:  pd.read_hdf(filepath, p.base, columns=p.columns)
read["feather"] ["pandas"] = lambda filepath,p:  feather.read_dataframe(filepath)
read["parq"]    ["pandas"] = lambda filepath,p:  fp.ParquetFile(filepath).to_pandas()


# Writer dispatch table: write[filetype][library] -> callable(df, filepath, p).
# Every file type starts with an empty inner mapping; the outer keys are
# created in the order listed here.
write = odict([(f,odict()) for f in ["parq","snappy.parq","gz.parq","bcolz","feather","castra","h5","csv"]])

# dask writers. The csv entry rewrites ".csv" in the path to "*.csv" before
# calling to_csv (presumably so dask can write one file per partition —
# confirm against the dask to_csv documentation).
write["csv"]          ["dask"]   = lambda df,filepath,p:  df.to_csv(filepath.replace(".csv","*.csv"))
write["h5"]           ["dask"]   = lambda df,filepath,p:  df.to_hdf(filepath, p.base)
write["castra"]       ["dask"]   = lambda df,filepath,p:  df.to_castra(filepath,categories=p.categories)
# The two parquet entries differ only in the compression argument.
write["parq"]         ["dask"]   = lambda df,filepath,p:  dd.io.parquet.to_parquet(filepath, df) ## **p.parq_opts
write["snappy.parq"]  ["dask"]   = lambda df,filepath,p:  dd.io.parquet.to_parquet(filepath, df, compression='SNAPPY') ## **p.parq_opts