Example #1
def test_avro(s, a, b):
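    # Write the same Avro payload to two HDFS files, then read them back
    # eagerly (a list of Futures) and lazily (a list of Values).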
    e = Executor((s.ip, s.port), start=False)
    yield e._start()

    avro_files = {
        '/tmp/test/1.avro': avro_bytes,
        '/tmp/test/2.avro': avro_bytes
    }

    with make_hdfs() as hdfs:
        for k, v in avro_files.items():
            with hdfs.open(k, 'wb') as f:
                f.write(v)

            assert hdfs.info(k)['size'] > 0

        L = yield _read_avro('/tmp/test/*.avro', lazy=False)
        assert isinstance(L, list)
        assert all(isinstance(x, Future) for x in L)

        results = yield e._gather(L)
        assert all(isinstance(r, list) for r in results)
        assert results[0][:5] == data[:5]
        assert results[-1][-5:] == data[-5:]

        L = yield _read_avro('/tmp/test/*.avro', lazy=True)
        assert isinstance(L, list)
        assert all(isinstance(x, Value) for x in L)

    yield e._shutdown()
Example #2
def dont_test_dataframes(s, a):  # slow
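    # The dont_ prefix keeps pytest from collecting this slow test by default.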
    pytest.importorskip('pandas')
    n = 3000000
    fn = '/tmp/test/file.csv'
    with make_hdfs() as hdfs:
        data = (b'name,amount,id\r\n' + b'Alice,100,1\r\nBob,200,2\r\n' * n)
        with hdfs.open(fn, 'wb') as f:
            f.write(data)

        e = Executor((s.ip, s.port), start=False)
        yield e._start()

        futures = read_bytes(fn, hdfs=hdfs, delimiter=b'\r\n', lazy=False)
        assert len(futures) > 1

        def load(b, **kwargs):
            assert b
            from io import BytesIO
            import pandas as pd
            bio = BytesIO(b)
            return pd.read_csv(bio, **kwargs)

        dfs = e.map(load, futures, names=['name', 'amount', 'id'], skiprows=1)
        dfs2 = yield e._gather(dfs)
        assert sum(map(len, dfs2)) == n * 2 - 1

        yield e._shutdown()
Example #3
def test_read_csv_sync(loop):
    import dask.dataframe as dd
    import pandas as pd
    with cluster(nworkers=3) as (s, [a, b, c]):
        with make_hdfs() as hdfs:
            with hdfs.open('/tmp/test/1.csv', 'wb') as f:
                f.write(b'name,amount,id\nAlice,100,1\nBob,200,2')

            with hdfs.open('/tmp/test/2.csv', 'wb') as f:
                f.write(b'name,amount,id\nCharlie,300,3\nDennis,400,4')

            with Executor(('127.0.0.1', s['port']), loop=loop) as e:
                futures = read_csv('/tmp/test/*.csv',
                                   lineterminator='\n',
                                   header=True,
                                   collection=False,
                                   lazy=False)
                assert all(isinstance(f, Future) for f in futures)
                L = e.gather(futures)
                assert isinstance(L[0], pd.DataFrame)
                assert list(L[0].columns) == ['name', 'amount', 'id']

                df = read_csv('/tmp/test/*.csv',
                              lineterminator='\n',
                              header=True,
                              collection=True,
                              lazy=False)
                assert isinstance(df, dd.DataFrame)
                assert list(df.head().iloc[0]) == ['Alice', 100, 1]
Example #4
def test_lazy_values(s, a, b):
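    # Lazy reads should register locality restrictions with the scheduler
    # without creating any tasks until compute() is called.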
    with make_hdfs() as hdfs:
        data = b'a'

        for i in range(3):
            hdfs.mkdir('/tmp/test/data-%d' % i)
            for j in range(2):
                fn = '/tmp/test/data-%d/file-%d.csv' % (i, j)
                with hdfs.open(fn, 'wb', replication=1) as f:
                    f.write(data)

        e = Executor((s.ip, s.port), start=False)
        yield e._start()

        values = read_bytes('/tmp/test/', hdfs=hdfs, lazy=True)
        assert all(isinstance(v, Value) for v in values)

        while not s.restrictions:
            yield gen.sleep(0.01)
        assert not s.tasks

        results = e.compute(values, sync=False)
        results = yield e._gather(results)
        assert len(results) == 6
        assert all(x == b'a' for x in results)

        yield e._shutdown()
Example #5
def test_read_bytes(c, s, a, b):
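    # A 100 MB file spans multiple HDFS blocks; read_bytes yields one value
    # per block, each restricted (loosely) to the hosts holding that block.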
    with make_hdfs() as (hdfs, basedir):
        data = b'a' * int(1e8)
        fn = '%s/file' % basedir

        with hdfs.open(fn, 'wb', replication=1) as f:
            f.write(data)

        blocks = hdfs.get_block_locations(fn)
        assert len(blocks) > 1

        sample, values = read_bytes('hdfs://' + fn)
        assert sample[:5] == b'aaaaa'
        assert len(values[0]) == len(blocks)

        while not s.host_restrictions:
            yield gen.sleep(0.01)
        assert not s.tasks

        assert {v.key for v in values[0]} == set(s.host_restrictions)
        assert {v.key for v in values[0]} == set(s.loose_restrictions)

        futures = c.compute(values[0])
        results = yield c._gather(futures)
        assert b''.join(results) == data
        assert s.host_restrictions
Example #6
def test_read_csv_sync(loop):
    import dask.dataframe as dd
    import pandas as pd
    with cluster(nworkers=3) as (s, [a, b, c]):
        with make_hdfs() as (hdfs, basedir):
            with hdfs.open('%s/1.csv' % basedir, 'wb') as f:
                f.write(b'name,amount,id\nAlice,100,1\nBob,200,2')

            with hdfs.open('%s/2.csv' % basedir, 'wb') as f:
                f.write(b'name,amount,id\nCharlie,300,3\nDennis,400,4')

            with Client(('127.0.0.1', s['port']), loop=loop) as e:
                values = dd.read_csv('hdfs://%s/*.csv' % basedir,
                                     lineterminator='\n',
                                     collection=False, header=0)
                futures = e.compute(values)
                assert all(isinstance(f, Future) for f in futures)
                L = e.gather(futures)
                assert isinstance(L[0], pd.DataFrame)
                assert list(L[0].columns) == ['name', 'amount', 'id']

                df = dd.read_csv('hdfs://%s/*.csv' % basedir,
                                 lineterminator='\n',
                                 collection=True, header=0)
                assert isinstance(df, dd.DataFrame)
                assert list(df.head().iloc[0]) == ['Alice', 100, 1]
Example #7
def test_read_csv_sync(loop):
    import dask.dataframe as dd
    import pandas as pd
    with cluster(nworkers=3) as (s, [a, b, c]):
        with make_hdfs() as (hdfs, basedir):
            with hdfs.open('%s/1.csv' % basedir, 'wb') as f:
                f.write(b'name,amount,id\nAlice,100,1\nBob,200,2')

            with hdfs.open('%s/2.csv' % basedir, 'wb') as f:
                f.write(b'name,amount,id\nCharlie,300,3\nDennis,400,4')

            with Client(s['address'], loop=loop) as e:
                values = dd.read_csv('hdfs://%s/*.csv' % basedir,
                                     lineterminator='\n',
                                     collection=False,
                                     header=0)
                futures = e.compute(values)
                assert all(isinstance(f, Future) for f in futures)
                L = e.gather(futures)
                assert isinstance(L[0], pd.DataFrame)
                assert list(L[0].columns) == ['name', 'amount', 'id']

                df = dd.read_csv('hdfs://%s/*.csv' % basedir,
                                 lineterminator='\n',
                                 collection=True,
                                 header=0)
                assert isinstance(df, dd.DataFrame)
                assert list(df.head().iloc[0]) == ['Alice', 100, 1]
Example #8
def test__read_text(e, s, a, b):
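    # Exercise read_text across the lazy/eager and collection/plain-list variants.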
    with make_hdfs() as hdfs:
        with hdfs.open('/tmp/test/text.1.txt', 'wb') as f:
            f.write('Alice 100\nBob 200\nCharlie 300'.encode())

        with hdfs.open('/tmp/test/text.2.txt', 'wb') as f:
            f.write('Dan 400\nEdith 500\nFrank 600'.encode())

        with hdfs.open('/tmp/test/other.txt', 'wb') as f:
            f.write('a b\nc d'.encode())

        b = read_text('/tmp/test/text.*.txt',
                      collection=True, lazy=True)
        yield gen.sleep(0.5)
        assert not s.tasks

        future = e.compute(b.str.strip().str.split().map(len))
        result = yield future._result()
        assert result == [2, 2, 2, 2, 2, 2]

        b = read_text('/tmp/test/other.txt',
                      collection=True, lazy=False)
        future = e.compute(b.str.split().concat())
        result = yield future._result()
        assert result == ['a', 'b', 'c', 'd']

        L = read_text('/tmp/test/text.*.txt',
                      collection=False, lazy=False)
        assert all(isinstance(x, Future) for x in L)

        L = read_text('/tmp/test/text.*.txt',
                      collection=False, lazy=True)
        assert all(isinstance(x, Value) for x in L)
Example #9
def test_avro(s, a, b):
    e = Executor((s.ip, s.port), start=False)
    yield e._start()

    avro_files = {'/tmp/test/1.avro': avro_bytes,
                  '/tmp/test/2.avro': avro_bytes}

    with make_hdfs() as hdfs:
        for k, v in avro_files.items():
            with hdfs.open(k, 'wb') as f:
                f.write(v)

            assert hdfs.info(k)['size'] > 0

        L = yield _read_avro('/tmp/test/*.avro', lazy=False)
        assert isinstance(L, list)
        assert all(isinstance(x, Future) for x in L)

        results = yield e._gather(L)
        assert all(isinstance(r, list) for r in results)
        assert results[0][:5] == data[:5]
        assert results[-1][-5:] == data[-5:]

        L = yield _read_avro('/tmp/test/*.avro', lazy=True)
        assert isinstance(L, list)
        assert all(isinstance(x, Value) for x in L)
Example #10
def test__read_text(c, s, a, b):
    with make_hdfs() as (hdfs, basedir):
        with hdfs.open('%s/text.1.txt' % basedir, 'wb') as f:
            f.write('Alice 100\nBob 200\nCharlie 300'.encode())

        with hdfs.open('%s/text.2.txt' % basedir, 'wb') as f:
            f.write('Dan 400\nEdith 500\nFrank 600'.encode())

        with hdfs.open('%s/other.txt' % basedir, 'wb') as f:
            f.write('a b\nc d'.encode())

        b = db.read_text('hdfs://%s/text.*.txt' % basedir)
        yield gen.sleep(0.5)
        assert not s.tasks

        import dask
        b.compute(get=dask.get)

        coll = b.str.strip().str.split().map(len)

        future = c.compute(coll)
        yield gen.sleep(0.5)
        result = yield future
        assert result == [2, 2, 2, 2, 2, 2]

        b = db.read_text('hdfs://%s/other.txt' % basedir)
        b = c.persist(b)
        future = c.compute(b.str.split().flatten())
        result = yield future
        assert result == ['a', 'b', 'c', 'd']
Example #11
def test_lazy_values(s, a, b):
    with make_hdfs() as hdfs:
        data = b'a'

        for i in range(3):
            hdfs.mkdir('/tmp/test/data-%d' % i)
            for j in range(2):
                fn = '/tmp/test/data-%d/file-%d.csv' % (i, j)
                with hdfs.open(fn, 'wb', replication=1) as f:
                    f.write(data)

        e = Executor((s.ip, s.port), start=False)
        yield e._start()

        values = read_bytes('/tmp/test/', hdfs=hdfs, lazy=True)
        assert all(isinstance(v, Value) for v in values)

        while not s.restrictions:
            yield gen.sleep(0.01)
        assert not s.dask

        results = e.compute(*values, sync=False)
        results = yield e._gather(results)
        assert len(results) == 6
        assert all(x == b'a' for x in results)
Example #12
def test_read_bytes(e, s, a, b):
    with make_hdfs() as hdfs:
        data = b'a' * int(1e8)
        fn = '/tmp/test/file'

        with hdfs.open(fn, 'wb', replication=1) as f:
            f.write(data)

        blocks = hdfs.get_block_locations(fn)
        assert len(blocks) > 1

        sample, values = read_bytes(fn, hdfs=hdfs)
        assert sample[:5] == b'aaaaa'
        assert len(values) == len(blocks)

        while not s.restrictions:
            yield gen.sleep(0.01)
        assert not s.tasks

        assert {v.key for v in values} == set(s.restrictions)
        assert {v.key for v in values} == set(s.loose_restrictions)

        futures = e.compute(values)
        results = yield e._gather(futures)
        assert b''.join(results) == data
        assert s.restrictions
Example #13
def dont_test_dataframes(s, a):  # slow
    pytest.importorskip('pandas')
    n = 3000000
    fn = '/tmp/test/file.csv'
    with make_hdfs() as hdfs:
        data = (b'name,amount,id\r\n' +
                b'Alice,100,1\r\nBob,200,2\r\n' * n)
        with hdfs.open(fn, 'wb') as f:
            f.write(data)

        e = Executor((s.ip, s.port), start=False)
        yield e._start()

        futures = read_bytes(fn, hdfs=hdfs, delimiter=b'\r\n')
        assert len(futures) > 1

        def load(b, **kwargs):
            assert b
            from io import BytesIO
            import pandas as pd
            bio = BytesIO(b)
            return pd.read_csv(bio, **kwargs)

        dfs = e.map(load, futures, names=['name', 'amount', 'id'], skiprows=1)
        dfs2 = yield e._gather(dfs)
        assert sum(map(len, dfs2)) == n * 2 - 1
Example #14
def test__read_text(c, s, a, b):
    with make_hdfs() as (hdfs, basedir):
        with hdfs.open('%s/text.1.txt' % basedir, 'wb') as f:
            f.write('Alice 100\nBob 200\nCharlie 300'.encode())

        with hdfs.open('%s/text.2.txt' % basedir, 'wb') as f:
            f.write('Dan 400\nEdith 500\nFrank 600'.encode())

        with hdfs.open('%s/other.txt' % basedir, 'wb') as f:
            f.write('a b\nc d'.encode())

        b = db.read_text('hdfs://%s/text.*.txt' % basedir)
        yield gen.sleep(0.5)
        assert not s.tasks

        import dask
        b.compute(get=dask.get)

        coll = b.str.strip().str.split().map(len)

        future = c.compute(coll)
        yield gen.sleep(0.5)
        result = yield future._result()
        assert result == [2, 2, 2, 2, 2, 2]

        b = db.read_text('hdfs://%s/other.txt' % basedir)
        b = c.persist(b)
        future = c.compute(b.str.split().concat())
        result = yield future._result()
        assert result == ['a', 'b', 'c', 'd']
Example #15
def test_read_csv_with_names(e, s, a, b):
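    # names= replaces the inferred header, so the frame exposes exactly the
    # listed columns.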
    with make_hdfs() as hdfs:
        with hdfs.open('/tmp/test/1.csv', 'wb') as f:
            f.write(b'name,amount,id\nAlice,100,1\nBob,200,2')

        df = read_csv('/tmp/test/*.csv', names=['amount', 'name'],
                      lineterminator='\n', lazy=False)
        assert list(df.columns) == ['amount', 'name']
Example #16
def test_read_csv_with_names(e, s, a, b):
    with make_hdfs() as hdfs:
        with hdfs.open('/tmp/test/1.csv', 'wb') as f:
            f.write(b'name,amount,id\nAlice,100,1\nBob,200,2')

        df = dd.read_csv('hdfs:///tmp/test/*.csv', names=['amount', 'name'],
                         lineterminator='\n')
        assert list(df.columns) == ['amount', 'name']
Example #17
def test_read_text_sync(loop):
    with make_hdfs() as hdfs:
        with hdfs.open('/tmp/test/data.txt', 'wb') as f:
            f.write(b'hello\nworld')

        with cluster(nworkers=3) as (s, [a, b, c]):
            with Executor(('127.0.0.1', s['port']), loop=loop) as e:
                b = read_text('/tmp/test/*.txt', lazy=False)
                assert list(b.str.upper()) == ['HELLO', 'WORLD']
Example #18
def test_read_text_sync(loop):
    with make_hdfs() as hdfs:
        with hdfs.open('/tmp/test/data.txt', 'wb') as f:
            f.write(b'hello\nworld')

        with cluster(nworkers=3) as (s, [a, b, c]):
            with Client(('127.0.0.1', s['port']), loop=loop):
                b = db.read_text('hdfs:///tmp/test/*.txt')
                assert list(b.str.strip().str.upper()) == ['HELLO', 'WORLD']
Example #19
def test_read_csv_with_names(e, s, a, b):
    with make_hdfs() as (hdfs, basedir):
        with hdfs.open('%s/1.csv' % basedir, 'wb') as f:
            f.write(b'name,amount,id\nAlice,100,1\nBob,200,2')

        df = dd.read_csv('hdfs://%s/*.csv' % basedir,
                         names=['amount', 'name'],
                         lineterminator='\n')
        assert list(df.columns) == ['amount', 'name']
Example #20
def test_read_text_sync(loop):
    with make_hdfs() as (hdfs, basedir):
        with hdfs.open('%s/data.txt' % basedir, 'wb') as f:
            f.write(b'hello\nworld')

        with cluster(nworkers=3) as (s, [a, b, c]):
            with Client(s['address'], loop=loop) as e:
                b = db.read_text('hdfs://%s/*.txt' % basedir)
                assert list(b.str.strip().str.upper()) == ['HELLO', 'WORLD']
Example #21
def test_read_text_sync(loop):
    with make_hdfs() as (hdfs, basedir):
        with hdfs.open('%s/data.txt' % basedir, 'wb') as f:
            f.write(b'hello\nworld')

        with cluster(nworkers=3) as (s, [a, b, c]):
            with Client(('127.0.0.1', s['port']), loop=loop):
                b = db.read_text('hdfs://%s/*.txt' % basedir)
                assert list(b.str.strip().str.upper()) == ['HELLO', 'WORLD']
Example #22
def test__read_text_json_endline(e, s, a):
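    # The trailing newline at end of file must not produce an empty extra record.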
    import json
    with make_hdfs() as hdfs:
        with hdfs.open('/tmp/test/text.1.txt', 'wb') as f:
            f.write(b'{"x": 1}\n{"x": 2}\n')

        b = read_text('/tmp/test/text.1.txt').map(json.loads)
        result = yield e.compute(b)._result()

        assert result == [{"x": 1}, {"x": 2}]
Example #23
def test__read_text_json_endline(e, s, a):
    import json
    with make_hdfs() as hdfs:
        with hdfs.open('/tmp/test/text.1.txt', 'wb') as f:
            f.write(b'{"x": 1}\n{"x": 2}\n')

        b = db.read_text('hdfs:///tmp/test/text.1.txt').map(json.loads)
        result = yield e.compute(b)._result()

        assert result == [{"x": 1}, {"x": 2}]
Example #24
def test__read_text_json_endline(e, s, a):
    import json
    with make_hdfs() as (hdfs, basedir):
        with hdfs.open('%s/text.1.txt' % basedir, 'wb') as f:
            f.write(b'{"x": 1}\n{"x": 2}\n')

        b = db.read_text('hdfs://%s/text.1.txt' % basedir).map(json.loads)
        result = yield e.compute(b)

        assert result == [{"x": 1}, {"x": 2}]
Example #25
def test__read_text_json_endline(e, s, a):
    import json
    with make_hdfs() as (hdfs, basedir):
        with hdfs.open('%s/text.1.txt' % basedir, 'wb') as f:
            f.write(b'{"x": 1}\n{"x": 2}\n')

        b = db.read_text('hdfs://%s/text.1.txt' % basedir).map(json.loads)
        result = yield e.compute(b)._result()

        assert result == [{"x": 1}, {"x": 2}]
Example #26
def test_read_csv_sync_compute(loop):
    with cluster(nworkers=1) as (s, [a]):
        with make_hdfs() as (hdfs, basedir):
            with hdfs.open('%s/1.csv' % basedir, 'wb') as f:
                f.write(b'name,amount,id\nAlice,100,1\nBob,200,2')

            with hdfs.open('%s/2.csv' % basedir, 'wb') as f:
                f.write(b'name,amount,id\nCharlie,300,3\nDennis,400,4')

            with Client(('127.0.0.1', s['port']), loop=loop) as e:
                df = dd.read_csv('hdfs://%s/*.csv' % basedir, collection=True)
                assert df.amount.sum().compute(get=e.get) == 1000
Example #27
def test_read_csv(e, s, a, b):
    with make_hdfs() as (hdfs, basedir):
        with hdfs.open('%s/1.csv' % basedir, 'wb') as f:
            f.write(b'name,amount,id\nAlice,100,1\nBob,200,2')

        with hdfs.open('%s/2.csv' % basedir, 'wb') as f:
            f.write(b'name,amount,id\nCharlie,300,3\nDennis,400,4')

        df = dd.read_csv('hdfs://%s/*.csv' % basedir, lineterminator='\n')
        result = e.compute(df.id.sum(), sync=False)
        result = yield result._result()
        assert result == 1 + 2 + 3 + 4
Example #28
def test__read_text_unicode(e, s, a, b):
    data = b'abcd\xc3\xa9'
    with make_hdfs() as (hdfs, basedir):
        fn = '%s/data.txt' % basedir
        with hdfs.open(fn, 'wb') as f:
            f.write(b'\n'.join([data, data]))

        f = db.read_text('hdfs://' + fn, collection=False)
        result = yield e.compute(f[0])._result()
        assert len(result) == 2
        assert list(map(unicode.strip, result)) == [data.decode('utf-8')] * 2
        assert len(result[0].strip()) == 5
Example #29
def test__read_text_unicode(e, s, a, b):
    fn = '/tmp/test/data.txt'
    data = b'abcd\xc3\xa9'
    with make_hdfs() as hdfs:
        with hdfs.open(fn, 'wb') as f:
            f.write(b'\n'.join([data, data]))

        f = read_text(fn, collection=False, lazy=False)
        result = yield f[0]._result()
        assert len(result) == 2
        assert list(map(unicode.strip, result)) == [data.decode('utf-8')] * 2
        assert len(result[0]) == 5
Example #30
def test__read_text_unicode(e, s, a, b):
    data = b'abcd\xc3\xa9'
    with make_hdfs() as (hdfs, basedir):
        fn = '%s/data.txt' % basedir
        with hdfs.open(fn, 'wb') as f:
            f.write(b'\n'.join([data, data]))

        f = db.read_text('hdfs://' + fn, collection=False)
        result = yield e.compute(f[0])
        assert len(result) == 2
        assert list(map(unicode.strip, result)) == [data.decode('utf-8')] * 2
        assert len(result[0].strip()) == 5
Example #31
def test_read_csv(e, s, a, b):
    with make_hdfs() as hdfs:
        with hdfs.open('/tmp/test/1.csv', 'wb') as f:
            f.write(b'name,amount,id\nAlice,100,1\nBob,200,2')

        with hdfs.open('/tmp/test/2.csv', 'wb') as f:
            f.write(b'name,amount,id\nCharlie,300,3\nDennis,400,4')

        df = dd.read_csv('hdfs:///tmp/test/*.csv', lineterminator='\n')
        result = e.compute(df.id.sum(), sync=False)
        result = yield result._result()
        assert result == 1 + 2 + 3 + 4
Example #32
def test_read_csv_sync_compute(loop):
    with cluster(nworkers=1) as (s, [a]):
        with make_hdfs() as (hdfs, basedir):
            with hdfs.open('%s/1.csv' % basedir, 'wb') as f:
                f.write(b'name,amount,id\nAlice,100,1\nBob,200,2')

            with hdfs.open('%s/2.csv' % basedir, 'wb') as f:
                f.write(b'name,amount,id\nCharlie,300,3\nDennis,400,4')

            with Client(s['address'], loop=loop) as e:
                df = dd.read_csv('hdfs://%s/*.csv' % basedir, collection=True)
                assert df.amount.sum().compute(get=e.get) == 1000
Example #33
def test_read_csv_sync_compute(loop):
    with cluster(nworkers=1) as (s, [a]):
        with make_hdfs() as hdfs:
            with hdfs.open('/tmp/test/1.csv', 'wb') as f:
                f.write(b'name,amount,id\nAlice,100,1\nBob,200,2')

            with hdfs.open('/tmp/test/2.csv', 'wb') as f:
                f.write(b'name,amount,id\nCharlie,300,3\nDennis,400,4')

            with Executor(('127.0.0.1', s['port']), loop=loop) as e:
                df = dd.read_csv('hdfs:///tmp/test/*.csv', collection=True)
                assert df.amount.sum().compute(get=e.get) == 1000
Example #34
def test_read_bytes_sync(loop, nworkers):
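    # One hundred small files matched by a glob; delayed(values).compute()
    # returns, for each file, the list of byte strings read from its blocks.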
    with cluster(nworkers=nworkers) as (s, workers):
        with make_hdfs() as (hdfs, basedir):
            data = b'a' * int(1e3)

            for fn in ['%s/file.%d' % (basedir, i) for i in range(100)]:
                with hdfs.open(fn, 'wb', replication=1) as f:
                    f.write(data)

            with Client(s['address'], loop=loop) as e:
                sample, values = read_bytes('hdfs://%s/file.*' % basedir)
                results = delayed(values).compute()
                assert [b''.join(r) for r in results] == 100 * [data]
Example #35
def test_read_csv_with_names(s, a, b):
    with make_hdfs() as hdfs:
        e = Executor((s.ip, s.port), start=False)
        yield e._start()

        with hdfs.open('/tmp/test/1.csv', 'wb') as f:
            f.write(b'name,amount,id\nAlice,100,1\nBob,200,2')

        df = yield _read_csv('/tmp/test/*.csv', names=['amount', 'name'],
                             lineterminator='\n', lazy=False)
        assert list(df.columns) == ['amount', 'name']

        yield e._shutdown()
Example #36
def test_get_block_locations_nested():
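    # Six one-byte files nested in three directories should yield six block
    # locations.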
    with make_hdfs() as hdfs:
        data = b'a'

        for i in range(3):
            hdfs.mkdir('/tmp/test/data-%d' % i)
            for j in range(2):
                fn = '/tmp/test/data-%d/file-%d.csv' % (i, j)
                with hdfs.open(fn, 'wb', replication=1) as f:
                    f.write(data)

        L = get_block_locations(hdfs, '/tmp/test/')
        assert len(L) == 6
Example #37
def test_read_bytes_sync(loop):
    with make_hdfs() as hdfs:
        data = b'a' * int(1e3)

        for fn in ['/tmp/test/file.%d' % i for i in range(100)]:
            with hdfs.open(fn, 'wb', replication=1) as f:
                f.write(data)

        with cluster(nworkers=3) as (s, [a, b, c]):
            with Executor(('127.0.0.1', s['port']), loop=loop) as e:
                futures = read_bytes('/tmp/test/file.*')
                results = e.gather(futures)
                assert b''.join(results) == 100 * data
Example #38
def test_get_block_locations_nested():
    with make_hdfs() as hdfs:
        data = b'a'

        for i in range(3):
            hdfs.mkdir('/tmp/test/data-%d' % i)
            for j in range(2):
                fn = '/tmp/test/data-%d/file-%d.csv' % (i, j)
                with hdfs.open(fn, 'wb', replication=1) as f:
                    f.write(data)

        L = get_block_locations(hdfs, '/tmp/test/')
        assert len(L) == 6
Example #39
def test_read_bytes_sync(loop, nworkers):
    with cluster(nworkers=nworkers) as (s, workers):
        with make_hdfs() as (hdfs, basedir):
            data = b'a' * int(1e3)

            for fn in ['%s/file.%d' % (basedir, i) for i in range(100)]:
                with hdfs.open(fn, 'wb', replication=1) as f:
                    f.write(data)

            with Client(('127.0.0.1', s['port']), loop=loop) as e:
                sample, values = read_bytes('hdfs://%s/file.*' % basedir)
                results = delayed(values).compute()
                assert [b''.join(r) for r in results] == 100 * [data]
Example #40
def test_read_bytes_sync(loop):
    with cluster(nworkers=3) as (s, [a, b, c]):
        with make_hdfs() as hdfs:
            data = b'a' * int(1e3)

            for fn in ['/tmp/test/file.%d' % i for i in range(100)]:
                with hdfs.open(fn, 'wb', replication=1) as f:
                    f.write(data)

            with Executor(('127.0.0.1', s['port']), loop=loop) as e:
                futures = read_bytes('/tmp/test/file.*', lazy=False)
                results = e.gather(futures)
                assert b''.join(results) == 100 * data
Example #41
def test_read_csv(e, s, a, b):
    with make_hdfs() as hdfs:
        with hdfs.open('/tmp/test/1.csv', 'wb') as f:
            f.write(b'name,amount,id\nAlice,100,1\nBob,200,2')

        with hdfs.open('/tmp/test/2.csv', 'wb') as f:
            f.write(b'name,amount,id\nCharlie,300,3\nDennis,400,4')

        df = read_csv('/tmp/test/*.csv', lineterminator='\n', lazy=False)
        assert df._known_dtype
        result = e.compute(df.id.sum(), sync=False)
        result = yield result._result()
        assert result == 1 + 2 + 3 + 4
Example #42
def test_read_bytes_sync(loop, nworkers):
    with cluster(nworkers=nworkers) as (s, workers):
        with make_hdfs() as hdfs:
            data = b'a' * int(1e3)

            for fn in ['/tmp/test/file.%d' % i for i in range(100)]:
                with hdfs.open(fn, 'wb', replication=1) as f:
                    f.write(data)

            with Client(('127.0.0.1', s['port']), loop=loop) as e:
                sample, values = read_bytes('hdfs:///tmp/test/file.*')
                results = delayed(values).compute()
                assert [b''.join(r) for r in results] == 100 * [data]
Example #43
def test_write_bytes_2(c, s, a, b):
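    # Round trip: write five delayed payloads to HDFS as .part files, then
    # read them back and compare.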
    with make_hdfs() as (hdfs, basedir):
        path = 'hdfs://%s/' % basedir
        data = [b'test data %i' % i for i in range(5)]
        values = [delayed(d) for d in data]
        out = write_bytes(values, path)
        futures = c.compute(out)
        results = yield c._gather(futures)
        assert len(hdfs.ls(basedir)) == 5

        sample, vals = read_bytes('hdfs://%s/*.part' % basedir)
        futures = c.compute(list(concat(vals)))
        results = yield c._gather(futures)
        assert data == results
Example #44
def test_deterministic_key_names(e, s, a, b):
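    # The same path and delimiter must produce identical task keys;
    # a different delimiter must produce different ones.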
    with make_hdfs() as (hdfs, basedir):
        data = b'abc\n' * int(1e3)
        fn = '%s/file' % basedir

        with hdfs.open(fn, 'wb', replication=1) as f:
            f.write(data)

        _, x = read_bytes('hdfs://%s/*' % basedir, delimiter=b'\n')
        _, y = read_bytes('hdfs://%s/*' % basedir, delimiter=b'\n')
        _, z = read_bytes('hdfs://%s/*' % basedir, delimiter=b'c')

        assert [f.key for f in concat(x)] == [f.key for f in concat(y)]
        assert [f.key for f in concat(x)] != [f.key for f in concat(z)]
Example #45
def test_deterministic_key_names(e, s, a, b):
    with make_hdfs() as hdfs:
        data = b'abc\n' * int(1e3)
        fn = '/tmp/test/file'

        with hdfs.open(fn, 'wb', replication=1) as f:
            f.write(data)

        x = read_bytes('/tmp/test/', hdfs=hdfs, lazy=True, delimiter=b'\n')
        y = read_bytes('/tmp/test/', hdfs=hdfs, lazy=True, delimiter=b'\n')
        z = read_bytes('/tmp/test/', hdfs=hdfs, lazy=True, delimiter=b'c')

        assert [f.key for f in x] == [f.key for f in y]
        assert [f.key for f in x] != [f.key for f in z]
Example #46
def test_read_csv_lazy(e, s, a, b):
    with make_hdfs() as (hdfs, basedir):
        with hdfs.open('%s/1.csv' % basedir, 'wb') as f:
            f.write(b'name,amount,id\nAlice,100,1\nBob,200,2')

        with hdfs.open('%s/2.csv' % basedir, 'wb') as f:
            f.write(b'name,amount,id\nCharlie,300,3\nDennis,400,4')

        df = dd.read_csv('hdfs://%s/*.csv' % basedir, lineterminator='\n')
        yield gen.sleep(0.5)
        assert not s.tasks

        result = yield e.compute(df.id.sum(), sync=False)
        assert result == 1 + 2 + 3 + 4
Example #47
def test_read_csv_lazy(e, s, a, b):
    with make_hdfs() as hdfs:
        with hdfs.open('/tmp/test/1.csv', 'wb') as f:
            f.write(b'name,amount,id\nAlice,100,1\nBob,200,2')

        with hdfs.open('/tmp/test/2.csv', 'wb') as f:
            f.write(b'name,amount,id\nCharlie,300,3\nDennis,400,4')

        df = dd.read_csv('hdfs:///tmp/test/*.csv', lineterminator='\n')
        yield gen.sleep(0.5)
        assert not s.tasks

        result = yield e.compute(df.id.sum(), sync=False)._result()
        assert result == 1 + 2 + 3 + 4
Example #48
def test_avro_sync(loop):
    avro_files = {'/tmp/test/1.avro': avro_bytes,
                  '/tmp/test/2.avro': avro_bytes}
    with make_hdfs() as hdfs:
        for k, v in avro_files.items():
            with hdfs.open(k, 'wb') as f:
                f.write(v)

        with cluster(nworkers=1) as (s, [a]):
            with Executor(('127.0.0.1', s['port']), loop=loop) as e:
                futures = read_avro('/tmp/test/*.avro')
                assert all(isinstance(f, Future) for f in futures)
                L = e.gather(futures)
                assert L[0][:5] == data[:5]
Example #49
def test_read_csv_lazy(e, s, a, b):
    with make_hdfs() as hdfs:
        with hdfs.open('/tmp/test/1.csv', 'wb') as f:
            f.write(b'name,amount,id\nAlice,100,1\nBob,200,2')

        with hdfs.open('/tmp/test/2.csv', 'wb') as f:
            f.write(b'name,amount,id\nCharlie,300,3\nDennis,400,4')

        df = read_csv('/tmp/test/*.csv', lazy=True, lineterminator='\n')
        assert df._known_dtype
        yield gen.sleep(0.5)
        assert not s.tasks

        result = yield e.compute(df.id.sum(), sync=False)._result()
        assert result == 1 + 2 + 3 + 4
Example #50
def test_get_block_locations_nested():
    with make_hdfs() as (hdfs, basedir):
        data = b'a'

        for i in range(3):
            hdfs.mkdir('%s/data-%d' % (basedir, i))
            for j in range(2):
                fn = '%s/data-%d/file-%d.csv' % (basedir, i, j)
                with hdfs.open(fn, 'wb', replication=1) as f:
                    f.write(data)

        L = [hdfs.get_block_locations(fn)
             for fn in hdfs.glob('%s/*/*.csv' % basedir)]
        L = list(concat(L))
        assert len(L) == 6
Example #51
def test_get_block_locations():
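    # With the test cluster's block size, each 100 MB file spans two blocks,
    # so the combined listing holds four locations in file order.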
    with make_hdfs() as hdfs:
        data = b'a' * int(1e8)  # todo: reduce block size to speed up test
        fn_1 = '/tmp/test/file1'
        fn_2 = '/tmp/test/file2'

        with hdfs.open(fn_1, 'wb', replication=1) as f:
            f.write(data)
        with hdfs.open(fn_2, 'wb', replication=1) as f:
            f.write(data)

        L = get_block_locations(hdfs, '/tmp/test/')
        assert L == get_block_locations(hdfs, fn_1) + get_block_locations(hdfs, fn_2)
        assert L[0]['filename'] == L[1]['filename'] == fn_1
        assert L[2]['filename'] == L[3]['filename'] == fn_2
Example #52
def test_read_csv(s, a, b):
    with make_hdfs() as hdfs:
        e = Executor((s.ip, s.port), start=False)
        yield e._start()

        with hdfs.open('/tmp/test/1.csv', 'wb') as f:
            f.write(b'name,amount,id\nAlice,100,1\nBob,200,2')

        with hdfs.open('/tmp/test/2.csv', 'wb') as f:
            f.write(b'name,amount,id\nCharlie,300,3\nDennis,400,4')

        df = yield _read_csv('/tmp/test/*.csv', header=True, lineterminator='\n')
        result, = e.compute(df.id.sum(), sync=False)
        result = yield result._result()
        assert result == 1 + 2 + 3 + 4
Example #53
def test_write_bytes(e, s, a, b):
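    # Same round trip as test_write_bytes_2, via dask.bytes.core with an
    # explicit hdfs= handle.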
    from dask.bytes.core import write_bytes, read_bytes
    with make_hdfs() as hdfs:
        path = 'hdfs:///tmp/test/'
        data = [b'test data %i' % i for i in range(5)]
        values = [delayed(d) for d in data]
        out = write_bytes(values, path, hdfs=hdfs)
        futures = e.compute(out)
        results = yield e._gather(futures)
        assert len(hdfs.ls('/tmp/test/')) == 5

        sample, vals = read_bytes('hdfs:///tmp/test/*.part',
                                    hdfs=hdfs, lazy=True)
        futures = e.compute(vals)
        results = yield e._gather(futures)
        assert data == results
Example #54
def test_avro_sync(loop):
    avro_files = {
        '/tmp/test/1.avro': avro_bytes,
        '/tmp/test/2.avro': avro_bytes
    }
    with make_hdfs() as hdfs:
        for k, v in avro_files.items():
            with hdfs.open(k, 'wb') as f:
                f.write(v)

        with cluster(nworkers=1) as (s, [a]):
            with Executor(('127.0.0.1', s['port']), loop=loop) as e:
                futures = read_avro('/tmp/test/*.avro', lazy=False)
                assert all(isinstance(f, Future) for f in futures)
                L = e.gather(futures)
                assert L[0][:5] == data[:5]