Exemple #1
0
def test_read_csv_of_modified_file_has_different_name():
    """Appending a row to the file must change the keys of the read_csv graph."""
    with filetext(text) as fn:
        a = read_csv(fn)
        with open(fn, 'a') as f:
            f.write('\nGeorge,700')
            # Hand Python's buffered data to the OS first; fsync only syncs
            # what has already been written to the file descriptor.
            f.flush()
            os.fsync(f.fileno())
        b = read_csv(fn)

        assert sorted(a.dask) != sorted(b.dask)
Exemple #2
0
def test_read_csv_has_deterministic_name():
    """Identical read_csv calls share a name and keys; other kwargs change the name."""
    with filetext(text) as fn:
        first = read_csv(fn)
        second = read_csv(fn)

        assert first._name == second._name
        first_keys = sorted(first.dask.keys())
        second_keys = sorted(second.dask.keys())
        assert first_keys == second_keys
        assert isinstance(first._name, str)

        # Same file, read with additional keyword arguments.
        with_options = read_csv(fn, skiprows=1, na_values=[0])
        assert first._name != with_options._name
Exemple #3
0
def test_read_csv_has_deterministic_name():
    """Reading one file twice gives equal names/keys; changed kwargs give a new name."""
    with filetext(text) as fn:
        original = read_csv(fn)
        repeat = read_csv(fn)

        name = original._name
        assert name == repeat._name
        assert sorted(original.dask.keys()) == sorted(repeat.dask.keys())
        assert isinstance(name, str)

        # A call with different keyword arguments must not reuse the name.
        variant = read_csv(fn, skiprows=1, na_values=[0])
        assert original._name != variant._name
Exemple #4
0
def test_read_csv_of_modified_file_has_different_name():
    """Appending a row to the file must change the keys of the read_csv graph."""
    with filetext(text) as fn:
        # NOTE(review): the pauses presumably let the file's modification
        # time advance on filesystems with coarse timestamps -- confirm.
        sleep(1)
        a = read_csv(fn)
        sleep(1)
        with open(fn, "a") as f:
            f.write("\nGeorge,700")
            # Hand Python's buffered data to the OS first; fsync only syncs
            # what has already been written to the file descriptor.
            f.flush()
            os.fsync(f.fileno())
        b = read_csv(fn)

        assert sorted(a.dask) != sorted(b.dask)
Exemple #5
0
def test_read_csv_of_modified_file_has_different_name():
    """Appending a row to the file must change the keys of the read_csv graph."""
    with filetext(text) as fn:
        # NOTE(review): the pauses presumably let the file's modification
        # time advance on filesystems with coarse timestamps -- confirm.
        sleep(1)
        a = read_csv(fn)
        sleep(1)
        with open(fn, 'a') as f:
            f.write('\nGeorge,700')
            # Hand Python's buffered data to the OS first; fsync only syncs
            # what has already been written to the file descriptor.
            f.flush()
            os.fsync(f.fileno())
        b = read_csv(fn)

        assert sorted(a.dask) != sorted(b.dask)
Exemple #6
0
def test_read_multiple_csv():
    """A glob over two identical CSV files yields twice the rows of one file."""
    try:
        for path in ("_foo.1.csv", "_foo.2.csv"):
            with open(path, "w") as out:
                out.write(text)
        # The result is not inspected; the call only has to succeed.
        df = dd.read_csv("_foo.*.csv")

        rows_from_glob = len(read_csv("_foo.*.csv").compute())
        rows_from_one = len(read_csv("_foo.1.csv").compute())
        assert rows_from_glob == rows_from_one * 2
    finally:
        os.remove("_foo.1.csv")
        os.remove("_foo.2.csv")
Exemple #7
0
def test_multiple_read_csv_has_deterministic_name():
    """Reading the same glob twice produces the same sorted graph keys."""
    try:
        for path in ("_foo.1.csv", "_foo.2.csv"):
            with open(path, "w") as out:
                out.write(text)
        first = read_csv("_foo.*.csv")
        second = read_csv("_foo.*.csv")

        first_keys = sorted(first.dask.keys())
        second_keys = sorted(second.dask.keys())
        assert first_keys == second_keys
    finally:
        os.remove("_foo.1.csv")
        os.remove("_foo.2.csv")
Exemple #8
0
def test_multiple_read_csv_has_deterministic_name():
    """Two read_csv calls on one glob pattern must agree on their graph keys."""
    try:
        for path in ('_foo.1.csv', '_foo.2.csv'):
            with open(path, 'w') as out:
                out.write(text)
        original = read_csv('_foo.*.csv')
        repeat = read_csv('_foo.*.csv')

        original_keys = sorted(original.dask.keys())
        repeat_keys = sorted(repeat.dask.keys())
        assert original_keys == repeat_keys
    finally:
        os.remove('_foo.1.csv')
        os.remove('_foo.2.csv')
Exemple #9
0
def test_read_multiple_csv():
    """A glob over two identical CSV files yields twice the rows of one file."""
    try:
        for path in ('_foo.1.csv', '_foo.2.csv'):
            with open(path, 'w') as out:
                out.write(text)
        # The result is not inspected; the call only has to succeed.
        df = read_csv('_foo.*.csv')

        rows_from_glob = len(read_csv('_foo.*.csv').compute())
        rows_from_one = len(read_csv('_foo.1.csv').compute())
        assert rows_from_glob == rows_from_one * 2
    finally:
        os.remove('_foo.1.csv')
        os.remove('_foo.2.csv')
Exemple #10
0
def test_read_multiple_csv():
    """Row count for the two-file glob must be double the single-file count."""
    try:
        for path in ('_foo.1.csv', '_foo.2.csv'):
            with open(path, 'w') as out:
                out.write(text)
        # Bound but never read afterwards; only the call itself matters here.
        df = read_csv('_foo.*.csv')

        total = len(read_csv('_foo.*.csv').compute())
        single = len(read_csv('_foo.1.csv').compute())
        assert total == single * 2
    finally:
        os.remove('_foo.1.csv')
        os.remove('_foo.2.csv')
Exemple #11
0
def test_csv_expands_dtypes():
    """Compare the 'dtype' entry of the kwargs stored in the read_csv graph.

    The kwargs are taken as the last element of the first value in the
    graph. An empty ``dtype`` must give the same entry as omitting it, and
    an explicit column dtype must be kept.
    """
    with filetext(text) as fn:
        with_empty = read_csv(fn, chunkbytes=30, dtype={})
        first_task = list(with_empty.dask.values())[0]
        empty_kwargs = first_task[-1]

        without = read_csv(fn, chunkbytes=30)
        first_task = list(without.dask.values())[0]
        default_kwargs = first_task[-1]

        assert empty_kwargs['dtype'] == default_kwargs['dtype']

        explicit = read_csv(fn, chunkbytes=30, dtype={'amount': float})
        first_task = list(explicit.dask.values())[0]
        explicit_kwargs = first_task[-1]

        assert explicit_kwargs['dtype']['amount'] == float
Exemple #12
0
def test_read_gzip_csv():
    """A gzip CSV read with chunkbytes=30 matches pandas once sorted by name."""
    with filetext(text.encode(), open=gzip.open) as fn:
        frame = read_csv(fn, chunkbytes=30, compression='gzip')
        assert list(frame.columns) == ['name', 'amount']
        assert frame.npartitions > 1

        computed = frame.compute(get=dask.get).sort('name')
        reference = pd.read_csv(fn, compression='gzip').sort('name')
        assert (computed.values == reference.values).all()
Exemple #13
0
def test_index_col():
    """Passing index_col must raise a ValueError whose message mentions set_index."""
    with filetext(text) as fn:
        try:
            f = read_csv(fn, chunkbytes=30, index_col="name")
        except ValueError as err:
            assert "set_index" in str(err)
        else:
            # Reaching this point means no exception was raised.
            assert False
Exemple #14
0
def test_read_csv():
    """A chunked read has the expected columns and matches pandas sorted by name."""
    with filetext(text) as fn:
        frame = read_csv(fn, chunkbytes=30)
        assert list(frame.columns) == ['name', 'amount']
        assert frame.npartitions > 1

        computed = frame.compute(get=dask.get).sort('name')
        reference = pd.read_csv(fn).sort('name')
        assert (computed.values == reference.values).all()
Exemple #15
0
def test_read_multiple_csv():
    """Check dtype/partition flags of a globbed read and its doubled row count."""
    try:
        for path in ('_foo.1.csv', '_foo.2.csv'):
            with open(path, 'w') as out:
                out.write(text)

        df = dd.read_csv('_foo.*.csv', chunkbytes=30)
        assert df._known_dtype
        assert df.npartitions > 2

        rows_from_glob = len(read_csv('_foo.*.csv').compute())
        rows_from_one = len(read_csv('_foo.1.csv').compute())
        assert rows_from_glob == rows_from_one * 2

    finally:
        os.remove('_foo.1.csv')
        os.remove('_foo.2.csv')
Exemple #16
0
def test_read_multiple_csv():
    """A chunked glob read knows its dtypes, has >2 partitions and 2x the rows."""
    try:
        for path in ('_foo.1.csv', '_foo.2.csv'):
            with open(path, 'w') as out:
                out.write(text)

        df = dd.read_csv('_foo.*.csv', chunkbytes=30)
        assert df._known_dtype
        assert df.npartitions > 2

        total = len(read_csv('_foo.*.csv').compute())
        single = len(read_csv('_foo.1.csv').compute())
        assert total == single * 2

    finally:
        os.remove('_foo.1.csv')
        os.remove('_foo.2.csv')
Exemple #17
0
def test_index_col():
    """read_csv with index_col must raise ValueError mentioning 'set_index'."""
    with filetext(text) as fn:
        try:
            f = read_csv(fn, chunkbytes=30, index_col='name')
        except ValueError as err:
            message = str(err)
            assert 'set_index' in message
        else:
            # No exception was raised, which this test treats as a failure.
            assert False
Exemple #18
0
def test_read_csv():
    """Columns, partition count and name-sorted values of a chunked read."""
    with filetext(text) as fn:
        chunked = read_csv(fn, chunkbytes=30)
        column_names = list(chunked.columns)
        assert column_names == ['name', 'amount']
        assert chunked.npartitions > 1

        got = chunked.compute(get=dask.get).sort('name')
        want = pd.read_csv(fn).sort('name')
        assert (got.values == want.values).all()
Exemple #19
0
def test_read_csv_categorize_and_index():
    """Reading with index='amount' equals pandas with that index and a category name."""
    with filetext(text) as fn:
        indexed = read_csv(fn, chunkbytes=20, index='amount')
        index_name = indexed.index.compute().name
        assert index_name == 'amount'

        # Build the pandas reference: same index, 'name' as a categorical.
        expected = pd.read_csv(fn).set_index('amount')
        expected['name'] = expected.name.astype('category')
        assert eq(indexed, expected)
Exemple #20
0
def test_read_csv_categorize_and_index():
    """index='amount' names the index and matches the pandas reference frame."""
    with filetext(text) as fn:
        frame = read_csv(fn, chunkbytes=20, index='amount')
        computed_index = frame.index.compute()
        assert computed_index.name == 'amount'

        # pandas reference: indexed on 'amount' with 'name' as a categorical.
        reference = pd.read_csv(fn).set_index('amount')
        reference['name'] = reference.name.astype('category')
        assert eq(frame, reference)
Exemple #21
0
def test_read_csv_categorize():
    """categorize=True gives category/i8 dtypes matching pandas, with 6 categories."""
    with filetext(text) as fn:
        frame = read_csv(fn, chunkbytes=30, categorize=True)
        dtype_list = list(frame.dtypes)
        assert dtype_list == ["category", "i8"]

        reference = pd.read_csv(fn)
        reference["name"] = reference.name.astype("category")

        dtypes_match = frame.dtypes == reference.dtypes
        assert dtypes_match.all()
        categories = frame.compute().name.cat.categories
        assert len(categories) == 6
Exemple #22
0
def test_read_csv_categorize():
    """categorize=True gives category/i8 dtypes matching pandas, with 6 categories."""
    with filetext(text) as fn:
        frame = read_csv(fn, chunkbytes=30, categorize=True)
        dtype_list = list(frame.dtypes)
        assert dtype_list == ['category', 'i8']

        reference = pd.read_csv(fn)
        reference['name'] = reference.name.astype('category')

        dtypes_match = frame.dtypes == reference.dtypes
        assert dtypes_match.all()
        categories = frame.compute().name.cat.categories
        assert len(categories) == 6
Exemple #23
0
def test_read_csv_categorize():
    """The categorized read agrees with pandas on dtypes and has six name categories."""
    with filetext(text) as fn:
        categorized = read_csv(fn, chunkbytes=30, categorize=True)
        assert list(categorized.dtypes) == ['category', 'i8']

        # pandas reference with the 'name' column converted to a categorical.
        want = pd.read_csv(fn)
        want['name'] = want.name.astype('category')

        same_dtypes = (categorized.dtypes == want.dtypes)
        assert same_dtypes.all()
        name_categories = categorized.compute().name.cat.categories
        assert len(name_categories) == 6
Exemple #24
0
def test_consistent_dtypes():
    """Summing a column that mixes decimal and whole numbers yields a float."""
    text = """
    name,amount
    Alice,100.5
    Bob,-200.5
    Charlie,300
    Dennis,400
    Edith,-500
    Frank,600
    """.strip()

    with filetext(text) as fn:
        frame = read_csv(fn, chunkbytes=30)
        total = frame.amount.sum().compute()
        assert isinstance(total, float)
Exemple #25
0
def test_consistent_dtypes():
    """The computed sum of 'amount' must be a float for this mixed-number data."""
    text = """
    name,amount
    Alice,100.5
    Bob,-200.5
    Charlie,300
    Dennis,400
    Edith,-500
    Frank,600
    """.strip()

    with filetext(text) as fn:
        chunked = read_csv(fn, chunkbytes=30)
        amount_sum = chunked.amount.sum()
        assert isinstance(amount_sum.compute(), float)
Exemple #26
0
def test_read_csv_categorize_and_index():
    """Check the index name, per-block division bounds and the sorted result.

    The read uses index='amount'. Each computed block is compared against
    ``f.divisions``, and the sorted result against a sorted pandas frame
    indexed on 'amount' with 'name' as a categorical.
    """
    with filetext(text) as fn:
        f = read_csv(fn, chunkbytes=20, index='amount')
        computed = f.compute(get=get_sync)
        assert computed.index.name == 'amount'

        partitions = dd.core.get(f.dask, f._keys(), get=get_sync)
        for position, part in enumerate(partitions):
            # NOTE(review): the guard compares against len(f.divisions) but
            # the access is divisions[position + 1]; whether that index is
            # always in range depends on the divisions layout -- confirm.
            if position < len(f.divisions):
                upper = f.divisions[position + 1]
                assert (part.index <= upper).all()
            if position > 0:
                lower = f.divisions[position]
                assert (part.index > lower).all()

        reference = pd.read_csv(fn).set_index('amount')
        reference['name'] = reference.name.astype('category')

        computed = computed.sort()
        reference = reference.sort()

        assert eq(computed, reference)
Exemple #27
0
def test_read_csv_categorize_with_parse_dates():
    """categorize=True with parse_dates=['when'] gives category, i8, M8[ns] dtypes."""
    with filetext(datetime_csv_file) as fn:
        frame = read_csv(
            fn, chunkbytes=30, categorize=True, parse_dates=['when'])
        dtype_list = list(frame.dtypes)
        assert dtype_list == ['category', 'i8', 'M8[ns]']
Exemple #28
0
def test_read_csv_with_nrows():
    """nrows=3 gives one partition, the expected columns and the pandas result."""
    with filetext(text) as fn:
        limited = read_csv(fn, nrows=3)
        assert list(limited.columns) == ['name', 'amount']
        assert limited.npartitions == 1

        # A fresh read is compared against pandas, as two separate calls.
        ours = read_csv(fn, nrows=3)
        theirs = pd.read_csv(fn, nrows=3)
        assert eq(ours, theirs)
Exemple #29
0
def test_read_csv_categorize_with_parse_dates():
    """Dtypes of a categorized read with parsed 'when' dates: category, i8, M8[ns]."""
    with filetext(datetime_csv_file) as fn:
        categorized = read_csv(
            fn, chunkbytes=30, categorize=True, parse_dates=["when"]
        )
        observed = list(categorized.dtypes)
        assert observed == ["category", "i8", "M8[ns]"]
Exemple #30
0
def test_usecols():
    """usecols=['when', 'amount'] yields the same values as pandas with the same option."""
    with filetext(datetime_csv_file) as fn:
        chunked = read_csv(fn, chunkbytes=30, usecols=['when', 'amount'])
        reference = pd.read_csv(fn, usecols=['when', 'amount'])

        computed_values = chunked.compute().values
        assert (computed_values == reference.values).all()
Exemple #31
0
def test_read_csv_with_nrows():
    """With nrows=3 the read has one partition and equals pandas' nrows=3 read."""
    with filetext(text) as fn:
        limited = read_csv(fn, nrows=3)
        column_names = list(limited.columns)
        assert column_names == ["name", "amount"]
        assert limited.npartitions == 1

        # Read again rather than reusing `limited`, then compare to pandas.
        ours = read_csv(fn, nrows=3)
        theirs = pd.read_csv(fn, nrows=3)
        assert eq(ours, theirs)
Exemple #32
0
def test_read_csv_categorize_with_parse_dates():
    """A categorized read that parses 'when' reports category, i8 and M8[ns]."""
    with filetext(datetime_csv_file) as fn:
        read_options = dict(
            chunkbytes=30, categorize=True, parse_dates=['when'])
        result = read_csv(fn, **read_options)
        assert list(result.dtypes) == ['category', 'i8', 'M8[ns]']
Exemple #33
0
def test_usecols():
    """Values read with usecols must equal pandas' values for the same columns."""
    with filetext(datetime_csv_file) as fn:
        ours = read_csv(fn, chunkbytes=30, usecols=['when', 'amount'])
        theirs = pd.read_csv(fn, usecols=['when', 'amount'])

        values_equal = ours.compute().values == theirs.values
        assert values_equal.all()