Esempio n. 1
0
def test_read_csv_sync(loop):
    import dask.dataframe as dd
    import pandas as pd
    with cluster(nworkers=3) as (s, [a, b, c]):
        with make_hdfs() as hdfs:
            with hdfs.open('/tmp/test/1.csv', 'wb') as f:
                f.write(b'name,amount,id\nAlice,100,1\nBob,200,2')

            with hdfs.open('/tmp/test/2.csv', 'wb') as f:
                f.write(b'name,amount,id\nCharlie,300,3\nDennis,400,4')

            with Executor(('127.0.0.1', s['port']), loop=loop) as e:
                futures = read_csv('/tmp/test/*.csv',
                                   lineterminator='\n',
                                   collection=False,
                                   lazy=False,
                                   header=0)
                assert all(isinstance(f, Future) for f in futures)
                L = e.gather(futures)
                assert isinstance(L[0], pd.DataFrame)
                assert list(L[0].columns) == ['name', 'amount', 'id']

                df = read_csv('/tmp/test/*.csv',
                              lineterminator='\n',
                              collection=True,
                              lazy=False,
                              header=0)
                assert isinstance(df, dd.DataFrame)
                assert list(df.head().iloc[0]) == ['Alice', 100, 1]
Esempio n. 2
0
def test_read_csv_with_names(e, s, a, b):
    with make_hdfs() as hdfs:
        with hdfs.open('/tmp/test/1.csv', 'wb') as f:
            f.write(b'name,amount,id\nAlice,100,1\nBob,200,2')

        df = read_csv('/tmp/test/*.csv', names=['amount', 'name'],
                             lineterminator='\n', lazy=False)
        assert list(df.columns) == ['amount', 'name']
Esempio n. 3
0
def test_read_csv_with_names(e, s, a, b):
    with make_hdfs() as hdfs:
        with hdfs.open('/tmp/test/1.csv', 'wb') as f:
            f.write(b'name,amount,id\nAlice,100,1\nBob,200,2')

        df = read_csv('/tmp/test/*.csv',
                      names=['amount', 'name'],
                      lineterminator='\n',
                      lazy=False)
        assert list(df.columns) == ['amount', 'name']
Esempio n. 4
0
def test_read_csv(e, s, a, b):
    with make_hdfs() as hdfs:
        with hdfs.open('/tmp/test/1.csv', 'wb') as f:
            f.write(b'name,amount,id\nAlice,100,1\nBob,200,2')

        with hdfs.open('/tmp/test/2.csv', 'wb') as f:
            f.write(b'name,amount,id\nCharlie,300,3\nDennis,400,4')

        df = read_csv('/tmp/test/*.csv', lineterminator='\n', lazy=False)
        assert df._known_dtype
        result = e.compute(df.id.sum(), sync=False)
        result = yield result._result()
        assert result == 1 + 2 + 3 + 4
Esempio n. 5
0
def test_read_csv(e, s, a, b):
    with make_hdfs() as hdfs:
        with hdfs.open('/tmp/test/1.csv', 'wb') as f:
            f.write(b'name,amount,id\nAlice,100,1\nBob,200,2')

        with hdfs.open('/tmp/test/2.csv', 'wb') as f:
            f.write(b'name,amount,id\nCharlie,300,3\nDennis,400,4')

        df = read_csv('/tmp/test/*.csv', lineterminator='\n', lazy=False)
        assert df._known_dtype
        result = e.compute(df.id.sum(), sync=False)
        result = yield result._result()
        assert result == 1 + 2 + 3 + 4
Esempio n. 6
0
def test_read_csv_sync(loop):
    import dask.dataframe as dd
    import pandas as pd
    with make_hdfs() as hdfs:
        with hdfs.open('/tmp/test/1.csv', 'w') as f:
            f.write(b'name,amount,id\nAlice,100,1\nBob,200,2')

        with hdfs.open('/tmp/test/2.csv', 'w') as f:
            f.write(b'name,amount,id\nCharlie,300,3\nDennis,400,4')

        with cluster(nworkers=3) as (s, [a, b, c]):
            with Executor(('127.0.0.1', s['port']), loop=loop) as e:
                futures = read_csv('/tmp/test/*.csv', lineterminator='\n',
                                   header=True, collection=False)
                assert all(isinstance(f, Future) for f in futures)
                L = e.gather(futures)
                assert isinstance(L[0], pd.DataFrame)
                assert list(L[0].columns) == ['name', 'amount', 'id']

                df = read_csv('/tmp/test/*.csv', lineterminator='\n',
                              header=True, collection=True)
                assert isinstance(df, dd.DataFrame)
                assert list(df.head().iloc[0]) == ['Alice', 100, 1]
Esempio n. 7
0
def test_read_csv_sync_compute(loop):
    import dask.dataframe as dd
    import pandas as pd
    with cluster(nworkers=1) as (s, [a]):
        with make_hdfs() as hdfs:
            with hdfs.open('/tmp/test/1.csv', 'wb') as f:
                f.write(b'name,amount,id\nAlice,100,1\nBob,200,2')

            with hdfs.open('/tmp/test/2.csv', 'wb') as f:
                f.write(b'name,amount,id\nCharlie,300,3\nDennis,400,4')

            with Executor(('127.0.0.1', s['port']), loop=loop) as e:
                for lazy in [True, False]:
                    df = read_csv('/tmp/test/*.csv', collection=True, lazy=lazy)
                    assert df.amount.sum().compute(get=e.get) == 1000
Esempio n. 8
0
def test_read_csv_sync_compute(loop):
    import dask.dataframe as dd
    import pandas as pd
    with cluster(nworkers=1) as (s, [a]):
        with make_hdfs() as hdfs:
            with hdfs.open('/tmp/test/1.csv', 'wb') as f:
                f.write(b'name,amount,id\nAlice,100,1\nBob,200,2')

            with hdfs.open('/tmp/test/2.csv', 'wb') as f:
                f.write(b'name,amount,id\nCharlie,300,3\nDennis,400,4')

            with Executor(('127.0.0.1', s['port']), loop=loop) as e:
                for lazy in [True, False]:
                    df = read_csv('/tmp/test/*.csv',
                                  collection=True,
                                  lazy=lazy)
                    assert df.amount.sum().compute(get=e.get) == 1000