コード例 #1
0
ファイル: test_local.py プロジェクト: ankravch/dask
def test_modification_time_open_files(open_files):
    """Open-file keys track file modification.

    Two globs over identical, unchanged files must share keys; after the
    files are rewritten with new content (and new mtimes), keys differ.
    """
    with filetexts(files, mode='b'):
        a = open_files('.test.accounts.*')
        b = open_files('.test.accounts.*')

        assert [aa._key for aa in a] == [bb._key for bb in b]

    # Let coarse-grained filesystem mtimes tick over before rewriting.
    sleep(1)

    # PEP 8 (E731): use a def instead of assigning a lambda to a name.
    def double(x):
        return x + x

    with filetexts(valmap(double, files), mode='b'):
        c = open_files('.test.accounts.*')

    assert [aa._key for aa in a] != [cc._key for cc in c]
コード例 #2
0
ファイル: test_local.py プロジェクト: ankravch/dask
def test_registered_open_files():
    """open_files from dask.bytes.core matches the glob and reads all files."""
    from dask.bytes.core import open_files
    with filetexts(files, mode='b'):
        handles = open_files('.test.accounts.*')
        assert len(handles) == len(files)
        data = compute(*(fh.read() for fh in handles))
        assert list(data) == [files[name] for name in sorted(files)]
コード例 #3
0
ファイル: test_local.py プロジェクト: ankravch/dask
def test_compression_binary(fmt):
    """open_files with compression= yields the original (uncompressed) bytes."""
    from dask.bytes.core import open_files
    compressed = valmap(compression.compress[fmt], files)
    with filetexts(compressed, mode='b'):
        handles = open_files('.test.accounts.*', compression=fmt)
        data = compute(*(fh.read() for fh in handles))
        assert list(data) == [files[name] for name in sorted(files)]
コード例 #4
0
ファイル: test_local.py プロジェクト: ankravch/dask
def test_bad_compression():
    """An unknown compression scheme raises ValueError in every reader."""
    from dask.bytes.core import read_bytes, open_files, open_text_files
    with filetexts(files, mode='b'):
        for reader in (read_bytes, open_files, open_text_files):
            with pytest.raises(ValueError):
                reader('.test.accounts.*', compression='not-found')
コード例 #5
0
ファイル: test_local.py プロジェクト: ankravch/dask
def test_read_bytes_delimited():
    """read_bytes splits blocks only on the requested delimiter.

    Exercised over several blocksizes with a line delimiter (b'\\n'), a
    delimiter that never occurs (b'foo'), and one not at end of data (b'}').
    """
    with filetexts(files, mode='b'):
        for bs in [5, 15, 45, 1500]:
            _, values = read_bytes('.test.accounts*',
                                    blocksize=bs, delimiter=b'\n')
            _, values2 = read_bytes('.test.accounts*',
                                    blocksize=bs, delimiter=b'foo')
            # Different delimiters must produce differently-keyed tasks.
            assert ([a.key for a in concat(values)] !=
                    [b.key for b in concat(values2)])

            results = compute(*concat(values))
            res = [r for r in results if r]
            assert all(r.endswith(b'\n') for r in res)
            ourlines = b''.join(res).split(b'\n')
            testlines = b"".join(files[k] for k in sorted(files)).split(b'\n')
            assert ourlines == testlines

            # delimiter not at the end
            d = b'}'
            _, values = read_bytes('.test.accounts*', blocksize=bs, delimiter=d)
            results = compute(*concat(values))
            res = [r for r in results if r]
            # All should end in } except EOF
            # NOTE(review): "- 2" presumably means two fixture files end
            # without the delimiter -- verify against the `files` fixture.
            assert sum(r.endswith(b'}') for r in res) == len(res) - 2
            ours = b"".join(res)
            test = b"".join(files[v] for v in sorted(files))
            assert ours == test
コード例 #6
0
ファイル: test_local.py プロジェクト: caseyclements/dask
def test_read_bytes_blocksize_types(blocksize):
    """read_bytes accepts the given blocksize type and reads every line."""
    with filetexts(files, mode='b'):
        _, chunks = read_bytes('.test.account*', blocksize=blocksize)
        pieces = compute(*concat(chunks))
        got = b"".join(pieces).split(b'\n')
        want = b"".join(files.values()).split(b'\n')
        assert set(got) == set(want)
コード例 #7
0
ファイル: test_local.py プロジェクト: ankravch/dask
def test_registered_read_bytes():
    """read_bytes from dask.bytes.core reads each file matched by the glob."""
    from dask.bytes.core import read_bytes
    with filetexts(files, mode='b'):
        _, values = read_bytes('.test.accounts.*')
        results = compute(*concat(values))
        assert set(results) == set(files.values())
コード例 #8
0
ファイル: test_csv.py プロジェクト: caseyclements/dask
def test_skiprows(dd_read, pd_read, files):
    """skiprows drops the injected comment header, matching pandas."""
    files = {name: comment_header + b'\n' + body
             for name, body in files.items()}
    nskip = len(comment_header.splitlines())
    with filetexts(files, mode='b'):
        result = dd_read('2014-01-*.csv', skiprows=nskip)
        expected_df = pd.concat([pd_read(fn, skiprows=nskip)
                                 for fn in sorted(files)])
        assert_eq(result, expected_df, check_dtype=False)
コード例 #9
0
ファイル: test_csv.py プロジェクト: caseyclements/dask
def test_categorical_dtypes():
    """dtype={'fruit': 'category'} gives unknown categories until computed."""
    text1 = normalize_text("""
    fruit,count
    apple,10
    apple,25
    pear,100
    orange,15
    """)

    text2 = normalize_text("""
    fruit,count
    apple,200
    banana,300
    orange,400
    banana,10
    """)

    with filetexts({'foo.1.csv': text1, 'foo.2.csv': text2}):
        df = dd.read_csv('foo.*.csv', dtype={'fruit': 'category'}, blocksize=25)
        assert df.fruit.dtype == 'category'
        # Lazily the categories are unknown; compute() materializes them.
        assert not has_known_categories(df.fruit)
        res = df.compute()
        assert res.fruit.dtype == 'category'
        assert (sorted(res.fruit.cat.categories) ==
                ['apple', 'banana', 'orange', 'pear'])
コード例 #10
0
ファイル: test_csv.py プロジェクト: caseyclements/dask
def test_header_None():
    """header=None concatenates rows across files, tolerating an empty file."""
    texts = {'.tmp.1.csv': '1,2',
             '.tmp.2.csv': '',
             '.tmp.3.csv': '3,4'}
    with filetexts(texts):
        df = dd.read_csv('.tmp.*.csv', header=None)
        expected = pd.DataFrame({0: [1, 3], 1: [2, 4]})
        assert_eq(df.compute().reset_index(drop=True), expected)
コード例 #11
0
ファイル: test_local.py プロジェクト: gameduell/dask
def test_open_files():
    """Each lazily-opened handle yields exactly its on-disk byte content."""
    with filetexts(files, mode='b'):
        handles = open_files('.test.accounts.*')
        assert len(handles) == len(files)
        for handle, name in zip(handles, sorted(files)):
            with handle as f:
                assert f.read() == files[name]
コード例 #12
0
ファイル: test_text.py プロジェクト: mmngreco/dask
def test_errors():
    """Bad bytes raise UnicodeDecodeError unless errors='ignore' is given."""
    with filetexts({'.test.foo': b'Jos\xe9\nAlice'}, mode='b'):
        with pytest.raises(UnicodeDecodeError):
            read_text('.test.foo', encoding='ascii').compute()

        result = read_text('.test.foo', encoding='ascii', errors='ignore')
        # NOTE(review): `get=` is the legacy scheduler kwarg -- confirm this
        # file targets a dask version that still accepts it.
        result = result.compute(get=get)
        assert result == ['Jos\n', 'Alice']
コード例 #13
0
ファイル: test_csv.py プロジェクト: caseyclements/dask
def test_skiprows_as_list(dd_read, pd_read, files, units):
    """A list of row indices for skiprows behaves the same as in pandas."""
    files = {name: (comment_header + b'\n' +
                    body.replace(b'\n', b'\n' + units, 1))
             for name, body in files.items()}
    rows_to_skip = [0, 1, 2, 3, 5]
    with filetexts(files, mode='b'):
        result = dd_read('2014-01-*.csv', skiprows=rows_to_skip)
        expected_df = pd.concat([pd_read(fn, skiprows=rows_to_skip)
                                 for fn in sorted(files)])
        assert_eq(result, expected_df, check_dtype=False)
コード例 #14
0
ファイル: test_csv.py プロジェクト: caseyclements/dask
def test_read_csv_include_path_column_as_str(dd_read, files):
    """include_path_column accepts a custom column name as a string."""
    with filetexts(files, mode='b'):
        df = dd_read('2014-01-*.csv', include_path_column='filename',
                     converters={'filename': parse_filename})
        seen = df.filename.compute().unique()
        assert '2014-01-01.csv' in seen
        assert '2014-01-02.csv' not in seen
        assert '2014-01-03.csv' in seen
コード例 #15
0
ファイル: test_bag.py プロジェクト: gameduell/dask
def test_read_text():
    """db.read_text accepts an explicit filename list and a glob pattern."""
    with filetexts({'a1.log': 'A\nB', 'a2.log': 'C\nD'}) as fns:
        for source in (fns, 'a*.log'):
            lines = set(line.strip() for line in db.read_text(source))
            assert lines == set('ABCD')

    pytest.raises(ValueError, lambda: db.read_text('non-existent-*-path'))
コード例 #16
0
ファイル: test_local.py プロジェクト: caseyclements/dask
def test_urlpath_expand_read():
    """Make sure * is expanded in file paths when reading."""
    # Globs should match both csv fixtures, whether given bare or in a list.
    with filetexts(csv_files, mode='b'):
        for urlpath in ('.*.csv', ['.*.csv']):
            _, _, paths = get_fs_token_paths(urlpath)
            assert len(paths) == 2
コード例 #17
0
ファイル: test_bag.py プロジェクト: BabeNovelty/dask
def test_from_filenames():
    """db.from_filenames accepts an explicit list and a glob pattern."""
    with filetexts({'a1.log': 'A\nB', 'a2.log': 'C\nD'}) as fns:
        for source in (fns, 'a*.log'):
            lines = set(line.strip() for line in db.from_filenames(source))
            assert lines == set('ABCD')

    assert raises(ValueError, lambda: db.from_filenames('non-existent-*-path'))
コード例 #18
0
ファイル: test_csv.py プロジェクト: jseabold/dask
def test_read_csv_files():
    """read_csv handles a glob of files as well as a single filename."""
    with filetexts(files, mode='b'):
        eq(read_csv('2014-01-*.csv'), expected, check_dtype=False)

        single = '2014-01-01.csv'
        expected2 = pd.read_csv(BytesIO(files[single]))
        eq(read_csv(single), expected2, check_dtype=False)
コード例 #19
0
ファイル: test_csv.py プロジェクト: caseyclements/dask
def test_read_csv_files(dd_read, pd_read, files):
    """The dask reader matches the pandas reader on globs and single files."""
    with filetexts(files, mode='b'):
        assert_eq(dd_read('2014-01-*.csv'), expected, check_dtype=False)

        single = '2014-01-01.csv'
        expected2 = pd_read(BytesIO(files[single]))
        assert_eq(dd_read(single), expected2, check_dtype=False)
コード例 #20
0
ファイル: test_csv.py プロジェクト: caseyclements/dask
def test_read_csv_files_list(dd_read, pd_read, files):
    """Reading an explicit list of paths works; an empty list raises."""
    with filetexts(files, mode='b'):
        chosen = sorted(files)[:2]  # Just first 2
        sol = pd.concat([pd_read(BytesIO(files[name])) for name in chosen])
        assert_eq(dd_read(chosen), sol, check_dtype=False)

        with pytest.raises(ValueError):
            dd_read([])
コード例 #21
0
ファイル: test_csv.py プロジェクト: caseyclements/dask
def test_robust_column_mismatch():
    """Reading files whose headers differ only by letter case still works."""
    files = csv_files.copy()
    k = sorted(files)[-1]
    # Change the header case in the last file only.
    files[k] = files[k].replace(b'name', b'Name')
    with filetexts(files, mode='b'):
        ddf = dd.read_csv('2014-01-*.csv')
        df = pd.read_csv('2014-01-01.csv')
        assert (df.columns == ddf.columns).all()
        # Comparing ddf against itself forces a full compute; the point is
        # that it succeeds despite the mismatched header in the last file.
        assert_eq(ddf, ddf)
コード例 #22
0
ファイル: test_local.py プロジェクト: gameduell/dask
def test_registered_open_files():
    """open_files returns one handle per file, in sorted-name order."""
    with filetexts(files, mode='b'):
        handles = open_files('.test.accounts.*')
        assert len(handles) == len(files)
        data = []
        for handle in handles:
            with handle as f:
                data.append(f.read())
        assert data == [files[name] for name in sorted(files)]
コード例 #23
0
ファイル: test_local.py プロジェクト: gameduell/dask
def test_compression_text(fmt):
    """open_text_files decompresses and decodes each file's content."""
    compressed = valmap(compression.compress[fmt], files)
    with filetexts(compressed, mode='b'):
        handles = open_text_files('.test.accounts.*', compression=fmt)
        data = []
        for handle in handles:
            with handle as f:
                data.append(f.read())
        assert data == [files[name].decode() for name in sorted(files)]
コード例 #24
0
ファイル: test_text.py プロジェクト: mrocklin/dask
def test_files_per_partition():
    """Grouping 10 files per partition yields 10 lines in one partition."""
    # Bug fix: the value template was never formatted, so every file held
    # the literal text "line from {:02}"; apply .format(n) as intended.
    # (Counts are unchanged: each file still contains exactly one line.)
    files3 = {'{:02}.txt'.format(n): 'line from {:02}'.format(n)
              for n in range(20)}
    with filetexts(files3):
        b = read_text('*.txt', files_per_partition=10)

        # PEP 8 (E741): renamed ambiguous `l` to a descriptive name.
        nlines = len(b.take(100, npartitions=1))
        assert nlines == 10, "10 files should be grouped into one partition"

        assert b.count().compute() == 20, "All 20 lines should be read"
コード例 #25
0
ファイル: test_local.py プロジェクト: fortizc/dask
def test_read_bytes_blocksize_float():
    """A whole-valued float blocksize works; a fractional one raises."""
    with filetexts(files, mode='b'):
        _, chunks = read_bytes('.test.account*', blocksize=5.0)
        pieces = compute(*concat(chunks))
        got = b"".join(pieces).split(b'\n')
        want = b"".join(files.values()).split(b'\n')
        assert set(got) == set(want)

        with pytest.raises(TypeError):
            read_bytes('.test.account*', blocksize=5.5)
コード例 #26
0
ファイル: test_local.py プロジェクト: fortizc/dask
def test_with_paths():
    """read_bytes accepts pathlib.Path globs, but not relative file:// URIs."""
    pathlib = pytest.importorskip('pathlib')
    with filetexts(files, mode='b'):
        url = pathlib.Path('./.test.accounts.*')
        sample, values = read_bytes(url, blocksize=None)
        # blocksize=None -> one value per matched file.
        assert sum(map(len, values)) == len(files)
    with pytest.raises(OSError):
        # relative path doesn't work
        url = pathlib.Path('file://.test.accounts.*')
        read_bytes(url, blocksize=None)
コード例 #27
0
ファイル: test_csv.py プロジェクト: caseyclements/dask
def test_read_csv_include_path_column_is_dtype_category(dd_read, files):
    """The injected path column is categorical with known categories."""
    with filetexts(files, mode='b'):
        df = dd_read('2014-01-*.csv', include_path_column=True)
        assert df.path.dtype == 'category'
        assert has_known_categories(df.path)

        # Same invariant when the reader returns raw frames (no collection).
        frames = dd_read('2014-01-*.csv', include_path_column=True,
                         collection=False)
        first = frames[0].compute()
        assert first.path.dtype == 'category'
        assert has_known_categories(first.path)
コード例 #28
0
ファイル: test_local.py プロジェクト: caseyclements/dask
def test_open_files_text_mode(encoding):
    """mode='rt' decodes each file with the requested encoding."""
    with filetexts(files, mode='b'):
        handles = open_files('.test.accounts.*', mode='rt', encoding=encoding)
        assert len(handles) == len(files)
        data = []
        for handle in handles:
            with handle as f:
                data.append(f.read())
        assert data == [files[name].decode(encoding)
                        for name in sorted(files)]
コード例 #29
0
ファイル: test_local.py プロジェクト: ankravch/dask
def test_compression(fmt, blocksize):
    """Compressed files round-trip through read_bytes with a delimiter."""
    compress = compression.compress[fmt]
    files2 = valmap(compress, files)
    with filetexts(files2, mode='b'):
        sample, values = read_bytes('.test.accounts.*.json',
                blocksize=blocksize, delimiter=b'\n', compression=fmt)
        # The sample is decompressed: compare against the raw fixture bytes.
        assert sample[:5] == files[sorted(files)[0]][:5]

        results = compute(*concat(values))
        assert (b''.join(results) ==
                b''.join([files[k] for k in sorted(files)]))
コード例 #30
0
ファイル: test_local.py プロジェクト: chrislaing/dask
def test_read_bytes_sample_delimiter():
    """The returned sample is always truncated at a delimiter boundary."""
    with filetexts(files, mode='b'):
        for pattern, nbytes in [('.test.accounts.*', 80),
                                ('.test.accounts.1.json', 80),
                                ('.test.accounts.1.json', 2)]:
            sample, values = read_bytes(pattern, sample=nbytes,
                                        delimiter=b'\n')
            assert sample.endswith(b'\n')
コード例 #31
0
ファイル: test_local.py プロジェクト: m-rossi/dask
def test_read_bytes_block():
    """Block counts and total bytes are consistent across blocksizes."""
    with filetexts(files, mode="b"):
        for bs in [5, 15, 45, 1500]:
            sample, vals = read_bytes(".test.account*", blocksize=bs)
            # Each file yields len(v)//bs blocks, but always at least one.
            assert list(
                map(len,
                    vals)) == [max((len(v) // bs), 1) for v in files.values()]

            results = compute(*concat(vals))
            assert sum(len(r) for r in results) == sum(
                len(v) for v in files.values())

            ourlines = b"".join(results).split(b"\n")
            testlines = b"".join(files.values()).split(b"\n")
            assert set(ourlines) == set(testlines)
コード例 #32
0
def test_compression(fmt, blocksize):
    """Compressed files round-trip; the sample ends on a delimiter."""
    compress = compression.compress[fmt]
    files2 = valmap(compress, files)
    with filetexts(files2, mode="b"):
        sample, values = read_bytes(
            ".test.accounts.*.json",
            blocksize=blocksize,
            delimiter=b"\n",
            compression=fmt,
        )
        # Sample is decompressed and truncated at the delimiter.
        assert sample[:5] == files[sorted(files)[0]][:5]
        assert sample.endswith(b"\n")

        results = compute(*concat(values))
        assert b"".join(results) == b"".join([files[k] for k in sorted(files)])
コード例 #33
0
ファイル: test_csv.py プロジェクト: pitrou/dask
def test_warn_non_seekable_files(capsys):
    """Non-seekable (gzip) input warns on stderr unless blocksize=None."""
    files2 = valmap(compress['gzip'], files)
    with filetexts(files2, mode='b'):
        df = read_csv('2014-01-*.csv', compression='gzip')
        # Falls back to whole-file partitions: one per input file.
        assert df.npartitions == 3
        out, err = capsys.readouterr()
        assert 'gzip' in err
        assert 'blocksize=None' in err

        # An explicit blocksize=None silences the warning entirely.
        df = read_csv('2014-01-*.csv', compression='gzip', blocksize=None)
        out, err = capsys.readouterr()
        assert not err and not out

        with pytest.raises(NotImplementedError):
            df = read_csv('2014-01-*.csv', compression='foo')
コード例 #34
0
ファイル: test_csv.py プロジェクト: LvdKnaap/BinPacking
def test_read_csv_compression(fmt, blocksize):
    """Explicit compression= reads correctly, warning when chunked."""
    if fmt not in compress:
        pytest.skip("compress function not provided for %s" % fmt)
    files2 = valmap(compress[fmt], csv_files)
    with filetexts(files2, mode="b"):
        # A concrete blocksize with compression triggers a UserWarning.
        if fmt and blocksize:
            with pytest.warns(UserWarning):
                df = dd.read_csv("2014-01-*.csv", compression=fmt, blocksize=blocksize)
        else:
            df = dd.read_csv("2014-01-*.csv", compression=fmt, blocksize=blocksize)
        assert_eq(
            df.compute(scheduler="sync").reset_index(drop=True),
            expected.reset_index(drop=True),
            check_dtype=False,
        )
コード例 #35
0
ファイル: test_text.py プロジェクト: trentwatt/dask
def test_complex_delimiter():
    """A multi-character linedelimiter works with and without blocking."""
    longstr = "abc\ndef\n123\n$$$$\ndog\ncat\nfish\n\n\r\n$$$$hello"
    with filetexts({".test.delim.txt": longstr}):
        assert read_text(".test.delim.txt", linedelimiter="$$$$").count().compute() == 3
        assert (
            read_text(".test.delim.txt", linedelimiter="$$$$", blocksize=2)
            .count()
            .compute()
            == 3
        )
        # The delimiter stays attached to the preceding chunk; the final
        # chunk carries whatever trails the last delimiter.
        vals = read_text(".test.delim.txt", linedelimiter="$$$$").compute()
        assert vals[-1] == "hello"
        assert vals[0].endswith("$$$$")
        vals = read_text(".test.delim.txt", linedelimiter="$$$$", blocksize=2).compute()
        assert vals[-1] == "hello"
        assert vals[0].endswith("$$$$")
コード例 #36
0
ファイル: test_local.py プロジェクト: xvr-hlt/dask
def test_open_files_compression(mode, fmt):
    """open_files round-trips compressed data in binary and text modes."""
    if fmt == "zip" and sys.version_info.minor == 5:
        pytest.skip("zipfile is read-only on py35")
    if fmt not in compress:
        pytest.skip("compression function not provided")
    files2 = valmap(compress[fmt], files)
    with filetexts(files2, mode="b"):
        myfiles = open_files(".test.accounts.*", mode=mode, compression=fmt)
        data = []
        for file in myfiles:
            with file as f:
                data.append(f.read())
        sol = [files[k] for k in sorted(files)]
        if mode == "rt":
            # Text mode yields str, so decode the expected bytes.
            sol = [b.decode() for b in sol]
        assert list(data) == sol
コード例 #37
0
ファイル: test_csv.py プロジェクト: zmyer/dask
def test_auto_blocksize_csv(monkeypatch):
    """read_csv forwards the auto-computed blocksize down to read_bytes."""
    psutil = pytest.importorskip('psutil')
    try:
        from unittest import mock
    except ImportError:
        # Python 2 fallback: the external `mock` backport.
        mock = pytest.importorskip('mock')
    total_memory = psutil.virtual_memory().total
    cpu_count = psutil.cpu_count()
    # Wrap read_bytes so its call arguments can be inspected afterwards.
    mock_read_bytes = mock.Mock(wraps=read_bytes)
    monkeypatch.setattr(dask.dataframe.io.csv, 'read_bytes', mock_read_bytes)

    expected_block_size = auto_blocksize(total_memory, cpu_count)
    with filetexts(csv_files, mode='b'):
        dd.read_csv('2014-01-01.csv')
        assert mock_read_bytes.called
        assert mock_read_bytes.call_args[1]['blocksize'] == expected_block_size
コード例 #38
0
ファイル: test_csv.py プロジェクト: zhuomingliang/dask
def test_head_partial_line_fix():
    """Sampling that cuts mid-quote or mid-line must not break inference."""
    files = {
        ".overflow1.csv": (
            "a,b\n0,'abcdefghijklmnopqrstuvwxyz'\n1,'abcdefghijklmnopqrstuvwxyz'"
        ),
        ".overflow2.csv": ("a,b\n111111,-11111\n222222,-22222\n333333,-33333\n"),
    }
    with filetexts(files):
        # 64 byte file, 52 characters is mid-quote; this should not cause exception in head-handling code.
        dd.read_csv(".overflow1.csv", sample=52)

        # 35 characters is cuts off before the second number on the last line
        # Should sample to end of line, otherwise pandas will infer `b` to be
        # a float dtype
        df = dd.read_csv(".overflow2.csv", sample=35)
        assert (df.dtypes == "i8").all()
コード例 #39
0
ファイル: test_local.py プロジェクト: bigmpc/dask
def test_names():
    """Task keys are stable for identical reads and change after edits."""
    with filetexts(files, mode="b"):
        _, a = read_bytes(".test.accounts.*")
        _, b = read_bytes(".test.accounts.*")
        a = list(concat(a))
        b = list(concat(b))

        # Identical, unmodified inputs must tokenize to the same keys.
        assert [aa._key for aa in a] == [bb._key for bb in b]

        # Let coarse filesystem mtimes tick before appending to the files.
        sleep(1)
        for fn in files:
            with open(fn, "ab") as f:
                f.write(b"x")

        _, c = read_bytes(".test.accounts.*")
        c = list(concat(c))
        assert [aa._key for aa in a] != [cc._key for cc in c]
コード例 #40
0
def test_head_partial_line_fix():
    """Sampling that cuts mid-quote or mid-line must not break inference."""
    files = {'.overflow1.csv': ('a,b\n'
                                '0,"abcdefghijklmnopqrstuvwxyz"\n'
                                '1,"abcdefghijklmnopqrstuvwxyz"'),
             '.overflow2.csv': ('a,b\n'
                                '111111,-11111\n'
                                '222222,-22222\n'
                                '333333,-33333\n')}
    with filetexts(files):
        # 64 byte file, 52 characters is mid-quote; this should not cause exception in head-handling code.
        dd.read_csv('.overflow1.csv', sample=52)

        # 35 characters is cuts off before the second number on the last line
        # Should sample to end of line, otherwise pandas will infer `b` to be
        # a float dtype
        df = dd.read_csv('.overflow2.csv', sample=35)
        assert (df.dtypes == 'i8').all()
コード例 #41
0
ファイル: test_text.py プロジェクト: drmaize/compvision
def test_read_text(fmt, bs, encoding):
    """read_text decompresses/decodes correctly, as a bag or as raw blocks."""
    compress = compression.compress[fmt]
    files2 = dict((k, compress(v.encode(encoding))) for k, v in files.items())
    with filetexts(files2, mode='b'):
        b = read_text('.test.accounts.*.json',
                      compression=fmt,
                      blocksize=bs,
                      encoding=encoding)
        L, = compute(b)
        assert ''.join(L) == expected

        # collection=False returns one delayed value per block instead.
        blocks = read_text('.test.accounts.*.json',
                           compression=fmt,
                           blocksize=bs,
                           encoding=encoding,
                           collection=False)
        L = compute(*blocks)
        assert ''.join(line for block in L for line in block) == expected
コード例 #42
0
def test_read_csv_compression(fmt, blocksize):
    """compression='infer' (default) picks the codec from the file suffix."""
    if fmt and fmt not in compress:
        pytest.skip("compress function not provided for %s" % fmt)
    suffix = {"gzip": ".gz", "bz2": ".bz2", "zip": ".zip", "xz": ".xz"}.get(fmt, "")
    files2 = valmap(compress[fmt], csv_files) if fmt else csv_files
    renamed_files = {k + suffix: v for k, v in files2.items()}
    with filetexts(renamed_files, mode="b"):
        # This test is using `compression="infer"` (the default) for
        # read_csv.  The paths must have the appropriate extension.
        if fmt and blocksize:
            with pytest.warns(UserWarning):
                df = dd.read_csv("2014-01-*.csv" + suffix, blocksize=blocksize)
        else:
            df = dd.read_csv("2014-01-*.csv" + suffix, blocksize=blocksize)
        assert_eq(
            df.compute(scheduler="sync").reset_index(drop=True),
            expected.reset_index(drop=True),
            check_dtype=False,
        )
コード例 #43
0
ファイル: test_csv.py プロジェクト: zmyer/dask
def test_consistent_dtypes_2():
    """A column that is numeric in one block stays object across blocks."""
    text1 = normalize_text("""
    name,amount
    Alice,100
    Bob,-200
    Charlie,300
    """)

    text2 = normalize_text("""
    name,amount
    1,400
    2,-500
    Frank,600
    """)

    with filetexts({'foo.1.csv': text1, 'foo.2.csv': text2}):
        # Small blocksize forces multiple blocks with differing inferred types.
        df = dd.read_csv('foo.*.csv', blocksize=25)
        assert df.name.dtype == object
        assert df.name.compute().dtype == object
コード例 #44
0
ファイル: test_csv.py プロジェクト: zhuomingliang/dask
def test_warn_non_seekable_files():
    """Non-seekable (gzip) input warns unless blocksize=None is given."""
    files2 = valmap(compress["gzip"], csv_files)
    with filetexts(files2, mode="b"):

        with pytest.warns(UserWarning) as w:
            df = dd.read_csv("2014-01-*.csv", compression="gzip")
            assert df.npartitions == 3

        assert len(w) == 1
        msg = str(w[0].message)
        assert "gzip" in msg
        assert "blocksize=None" in msg

        # NOTE(review): pytest.warns(None) is deprecated in pytest >= 7 --
        # confirm the pinned pytest version still supports it.
        with pytest.warns(None) as w:
            df = dd.read_csv("2014-01-*.csv", compression="gzip", blocksize=None)
        assert len(w) == 0

        with pytest.raises(NotImplementedError):
            with pytest.warns(UserWarning):  # needed for pytest
                df = dd.read_csv("2014-01-*.csv", compression="foo")
コード例 #45
0
def test_warn_non_seekable_files():
    """Non-seekable (gzip) input warns unless blocksize=None is given."""
    files2 = valmap(compress['gzip'], csv_files)
    with filetexts(files2, mode='b'):

        with pytest.warns(UserWarning) as w:
            df = dd.read_csv('2014-01-*.csv', compression='gzip')
            assert df.npartitions == 3

        assert len(w) == 1
        msg = str(w[0].message)
        assert 'gzip' in msg
        assert 'blocksize=None' in msg

        # NOTE(review): pytest.warns(None) is deprecated in pytest >= 7 --
        # confirm the pinned pytest version still supports it.
        with pytest.warns(None) as w:
            df = dd.read_csv('2014-01-*.csv', compression='gzip',
                             blocksize=None)
        assert len(w) == 0

        with pytest.raises(NotImplementedError):
            with pytest.warns(UserWarning):  # needed for pytest
                df = dd.read_csv('2014-01-*.csv', compression='foo')
コード例 #46
0
def test_read_text(fmt, bs, encoding, include_path):
    """read_text handles compression, encoding, path inclusion and blocks."""
    if fmt not in utils.compress:
        pytest.skip("compress function not provided for %s" % fmt)
    compress = utils.compress[fmt]
    files2 = {k: compress(v.encode(encoding)) for k, v in files.items()}
    with filetexts(files2, mode="b"):
        b = read_text(".test.accounts.*.json",
                      compression=fmt,
                      blocksize=bs,
                      encoding=encoding)
        (L, ) = compute(b)
        assert "".join(L) == expected

        # With include_path the bag holds (line, path) tuples.
        o = read_text(
            sorted(files),
            compression=fmt,
            blocksize=bs,
            encoding=encoding,
            include_path=include_path,
        )
        b = o.pluck(0) if include_path else o
        (L, ) = compute(b)
        assert "".join(L) == expected
        if include_path:
            (paths, ) = compute(o.pluck(1))
            # One path entry per newline in each source file.
            expected_paths = list(
                concat([[k] * v.count("\n") for k, v in files.items()]))
            assert len(paths) == len(expected_paths)
            for path, expected_path in zip(paths, expected_paths):
                assert path.endswith(expected_path)

        # collection=False returns one delayed value per block instead.
        blocks = read_text(
            ".test.accounts.*.json",
            compression=fmt,
            blocksize=bs,
            encoding=encoding,
            collection=False,
        )
        L = compute(*blocks)
        assert "".join(line for block in L for line in block) == expected
コード例 #47
0
ファイル: test_local.py プロジェクト: bigmpc/dask
def test_read_bytes_delimited():
    """read_bytes splits blocks only on the requested delimiter.

    Exercised over several blocksizes (including a string size) with a line
    delimiter, a delimiter that never occurs, and one not at end of data.
    """
    with filetexts(files, mode="b"):
        for bs in [5, 15, 45, "1.5 kB"]:
            _, values = read_bytes(".test.accounts*", blocksize=bs, delimiter=b"\n")
            _, values2 = read_bytes(".test.accounts*", blocksize=bs, delimiter=b"foo")
            # Different delimiters must produce differently-keyed tasks.
            assert [a.key for a in concat(values)] != [b.key for b in concat(values2)]

            results = compute(*concat(values))
            res = [r for r in results if r]
            assert all(r.endswith(b"\n") for r in res)
            ourlines = b"".join(res).split(b"\n")
            testlines = b"".join(files[k] for k in sorted(files)).split(b"\n")
            assert ourlines == testlines

            # delimiter not at the end
            d = b"}"
            _, values = read_bytes(".test.accounts*", blocksize=bs, delimiter=d)
            results = compute(*concat(values))
            res = [r for r in results if r]
            # All should end in } except EOF
            # NOTE(review): "- 2" presumably means two fixture files end
            # without the delimiter -- verify against the `files` fixture.
            assert sum(r.endswith(b"}") for r in res) == len(res) - 2
            ours = b"".join(res)
            test = b"".join(files[v] for v in sorted(files))
            assert ours == test
コード例 #48
0
ファイル: test_csv.py プロジェクト: zmyer/dask
def test_read_csv_sensitive_to_enforce():
    """Changing enforce= must change the collection's token name."""
    with filetexts(csv_files, mode='b'):
        enforced = dd.read_csv('2014-01-*.csv', enforce=True)
        relaxed = dd.read_csv('2014-01-*.csv', enforce=False)
        assert enforced._name != relaxed._name
コード例 #49
0
def test_read_csv_compression(fmt, blocksize):
    """Compressed CSVs round-trip to the expected frame."""
    compressed = valmap(compress[fmt], csv_files)
    with filetexts(compressed, mode='b'):
        df = dd.read_csv('2014-01-*.csv', compression=fmt, blocksize=blocksize)
        result = df.compute(scheduler='sync').reset_index(drop=True)
        assert_eq(result, expected.reset_index(drop=True), check_dtype=False)
コード例 #50
0
def test_read_csv_include_path_column_with_duplicate_name(dd_read, files):
    """A path column name that collides with a data column raises."""
    with filetexts(files, mode='b'):
        with pytest.raises(ValueError):
            dd_read('2014-01-*.csv', include_path_column='name')
コード例 #51
0
ファイル: test_local.py プロジェクト: bigmpc/dask
def test_read_bytes_blocksize_float_errs():
    """A fractional float blocksize is rejected with TypeError."""
    with filetexts(files, mode="b"):
        with pytest.raises(TypeError):
            read_bytes(".test.account*", blocksize=5.5)
コード例 #52
0
ファイル: test_local.py プロジェクト: bigmpc/dask
def test_read_bytes_no_sample():
    """sample=False disables sampling and is returned unchanged."""
    with filetexts(files, mode="b"):
        sample, _ = read_bytes(".test.accounts.1.json", sample=False)
        assert sample is False
コード例 #53
0
ファイル: test_local.py プロジェクト: bigmpc/dask
def test_parse_sample_bytes():
    """A human-readable byte string ('40 B') is parsed as the sample size."""
    with filetexts(files, mode="b"):
        sample, values = read_bytes(".test.accounts.*", sample="40 B")
        assert len(sample) == 40
コード例 #54
0
ファイル: test_csv.py プロジェクト: manishvishnoi2/DM3
def test_read_csv_include_path_column_with_duplicate_name(dd_read, files):
    """A path column name that collides with a data column raises."""
    with filetexts(files, mode="b"):
        with pytest.raises(ValueError):
            dd_read("2014-01-*.csv", include_path_column="name")
コード例 #55
0
ファイル: test_local.py プロジェクト: bigmpc/dask
def test_bad_compression():
    """An unknown compression scheme raises ValueError in every reader."""
    with filetexts(files, mode="b"):
        for func in [read_bytes, open_files]:
            with pytest.raises(ValueError):
                sample, values = func(".test.accounts.*", compression="not-found")
コード例 #56
0
ファイル: test_local.py プロジェクト: bigmpc/dask
def test_with_urls():
    """read_bytes accepts a file:// URI containing a glob."""
    with filetexts(files, mode="b"):
        # OS-independent file:// URI with glob *
        url = to_uri(".test.accounts.") + "*"
        sample, values = read_bytes(url, blocksize=None)
        # blocksize=None -> one value per matched file.
        assert sum(map(len, values)) == len(files)
コード例 #57
0
ファイル: test_local.py プロジェクト: bigmpc/dask
def test_read_bytes_include_path():
    """include_path=True returns the matched paths as a third element."""
    with filetexts(files, mode="b"):
        _, _, paths = read_bytes(".test.accounts.*", include_path=True)
        assert {os.path.split(path)[1] for path in paths} == set(files.keys())
コード例 #58
0
ファイル: test_local.py プロジェクト: bigmpc/dask
def test_read_bytes_blocksize_none():
    """blocksize=None yields exactly one value per matched file."""
    with filetexts(files, mode="b"):
        sample, values = read_bytes(".test.accounts.*", blocksize=None)
        assert sum(map(len, values)) == len(files)
コード例 #59
0
ファイル: test_csv.py プロジェクト: zmyer/dask
def test_multiple_read_csv_has_deterministic_name():
    """Two identical read_csv calls produce identical graph keys."""
    with filetexts({'_foo.1.csv': csv_text, '_foo.2.csv': csv_text}):
        a = dd.read_csv('_foo.*.csv')
        b = dd.read_csv('_foo.*.csv')

        assert sorted(a.dask.keys(), key=str) == sorted(b.dask.keys(), key=str)
コード例 #60
0
ファイル: test_csv.py プロジェクト: manishvishnoi2/DM3
def test_read_csv_no_sample():
    """sample=False still yields the correct columns."""
    with filetexts(csv_files, mode="b") as fn:
        df = dd.read_csv(fn, sample=False)
        assert list(df.columns) == ["name", "amount", "id"]