Ejemplo n.º 1
0
def test_error_if_sample_is_too_small():
    """Check that ``read_csv`` rejects a sample that ends inside the header.

    The scenario is exercised twice: on the plain file, and on the same file
    with three comment lines prepended that are skipped via ``skiprows=3``.
    In both cases passing ``header=None`` makes the short sample acceptable.
    """
    csv_body = ('AAAAA,BBBBB,CCCCC,DDDDD,EEEEE\n'
                '1,2,3,4,5\n'
                '6,7,8,9,10\n'
                '11,12,13,14,15')
    with filetext(csv_body) as fn:
        # 20 bytes ends in the middle of the 29-character header row.
        short_sample = 20
        with pytest.raises(ValueError):
            dd.read_csv(fn, sample=short_sample)

        # Without a header row there is nothing to truncate.
        actual = dd.read_csv(fn, sample=short_sample, header=None)
        expected = pd.read_csv(fn, header=None)
        assert_eq(actual, expected)

    preamble = ('# skip\n'
                '# these\n'
                '# lines\n')

    with filetext(preamble + csv_body) as fn:
        # Same cut-off point, shifted by the length of the skipped preamble.
        short_sample = 20 + len(preamble)
        with pytest.raises(ValueError):
            dd.read_csv(fn, sample=short_sample, skiprows=3)

        # Again, declaring "no header" makes the short sample fine.
        actual = dd.read_csv(fn, sample=short_sample, header=None, skiprows=3)
        expected = pd.read_csv(fn, header=None, skiprows=3)
        assert_eq(actual, expected)
Ejemplo n.º 2
0
def test_assume_missing():
    """Exercise the ``assume_missing`` keyword of ``dd.read_csv``.

    The files contain 1000 all-integer rows followed by one row with floats,
    and are read with a small ``sample`` so that only integer rows are seen
    during dtype inference.
    """
    content = ('numbers,names,more_numbers,integers\n'
               + '1,foo,2,3\n' * 1000
               + '1.5,bar,2.5,3\n')
    with filetext(content) as fn:
        expected = pd.read_csv(fn)

        # assume_missing affects all columns
        ddf = dd.read_csv(fn, sample=50, assume_missing=True)
        assert_eq(ddf, expected.astype({'integers': float}))

        # assume_missing doesn't override specified dtypes
        ddf = dd.read_csv(fn, sample=50, assume_missing=True,
                          dtype={'integers': 'int64'})
        assert_eq(ddf, expected)

        # assume_missing works with dtype=None
        ddf = dd.read_csv(fn, sample=50, assume_missing=True, dtype=None)
        assert_eq(ddf, expected.astype({'integers': float}))

    content = 'numbers,integers\n' + '1,2\n' * 1000 + '1.5,2\n'

    with filetext(content) as fn:
        # The pandas result is not compared against; the read is kept as-is.
        pd.read_csv(fn)

        # assume_missing is ignored when a dtype is specified for all columns
        ddf = dd.read_csv(fn, sample=30, dtype='int64', assume_missing=True)
        assert ddf.numbers.dtype == 'int64'
Ejemplo n.º 3
0
def test_read_csv():
    """Read the module-level ``text`` fixture using row-based ``chunksize``."""
    with filetext(text) as fn:
        ddf = dd.read_csv(fn, chunksize=3)
        column_names = list(ddf.columns)
        assert column_names == ['name', 'amount']
        assert ddf.npartitions == 2
        assert eq(ddf, pd.read_csv(fn))

    with filetext(text) as fn:
        ddf = dd.read_csv(fn, chunksize=4)
        assert ddf.npartitions == 2

        # Reading with the default chunking is only required not to raise.
        dd.read_csv(fn)
Ejemplo n.º 4
0
def test_late_dtypes():
    """Check the error raised when dtypes inferred from the sample are wrong.

    The file has 1000 integer rows followed by a single float row; with
    ``sample=50`` only integer rows are sampled, so computing must fail with
    the exact message below unless the dtypes are supplied explicitly.
    """
    expected_message = ("Mismatched dtypes found.\n"
                        "Expected integers, but found floats for columns:\n"
                        "- 'more_numbers'\n"
                        "- 'numbers'\n"
                        "\n"
                        "To fix, specify dtypes manually by adding:\n"
                        "\n"
                        "dtype={'more_numbers': float,\n"
                        "       'numbers': float}\n"
                        "\n"
                        "to the call to `read_csv`/`read_table`.\n"
                        "\n"
                        "Alternatively, provide `assume_missing=True` to interpret "
                        "all unspecified integer columns as floats.")

    content = ('numbers,names,more_numbers,integers\n'
               + '1,foo,2,3\n' * 1000
               + '1.5,bar,2.5,3\n')
    with filetext(content) as fn:
        expected = pd.read_csv(fn)
        with pytest.raises(ValueError) as excinfo:
            dd.read_csv(fn, sample=50).compute(get=get_sync)

        assert str(excinfo.value) == expected_message

        # Supplying the two float columns explicitly makes the read succeed.
        float_columns = {'more_numbers': float, 'numbers': float}
        ddf = dd.read_csv(fn, sample=50, dtype=float_columns)
        assert_eq(ddf, expected)
Ejemplo n.º 5
0
def test_index_col():
    """``index_col`` must be rejected with an error mentioning ``set_index``."""
    with filetext(text) as fn:
        try:
            dd.read_csv(fn, chunkbytes=30, index_col='name')
        except ValueError as err:
            assert 'set_index' in str(err)
        else:
            # No ValueError was raised: the test must fail.
            assert False
Ejemplo n.º 6
0
def test_parse_dates_multi_column():
    """Combining two columns via ``parse_dates`` matches pandas' shape."""
    with filetext(pdmc_text) as fn:
        dask_frame = dd.read_csv(fn, parse_dates=[['date', 'time']])
        pandas_frame = pd.read_csv(fn, parse_dates=[['date', 'time']])

        same_columns = pandas_frame.columns == dask_frame.columns
        assert same_columns.all()
        assert len(pandas_frame) == len(dask_frame)
Ejemplo n.º 7
0
def test_read_csv_sep():
    """A multi-character separator yields the same columns and length as pandas."""
    with filetext(sep_text) as fn:
        dask_frame = dd.read_csv(fn, sep="###")
        pandas_frame = pd.read_csv(fn, sep="###")

        same_columns = pandas_frame.columns == dask_frame.columns
        assert same_columns.all()
        assert len(pandas_frame) == len(dask_frame)
Ejemplo n.º 8
0
def test_read_csv():
    """Read ``text`` in 30-byte chunks and compare the sorted values to pandas."""
    with filetext(text) as fn:
        ddf = dd.read_csv(fn, chunkbytes=30)
        assert list(ddf.columns) == ["name", "amount"]
        assert ddf.npartitions > 1
        # Row order across partitions is not relied upon: sort both sides.
        actual = ddf.compute(get=dask.get).sort("name")
        expected = pd.read_csv(fn).sort("name")
        assert (actual.values == expected.values).all()
Ejemplo n.º 9
0
def test_read_gzip_csv():
    """Read a gzip-compressed copy of ``text`` in 30-byte chunks."""
    with filetext(text.encode(), open=gzip.open) as fn:
        ddf = dd.read_csv(fn, chunkbytes=30, compression="gzip")
        assert list(ddf.columns) == ["name", "amount"]
        assert ddf.npartitions > 1
        # Sort both sides by name before comparing raw values.
        actual = ddf.compute(get=dask.get).sort("name")
        expected = pd.read_csv(fn, compression="gzip").sort("name")
        assert (actual.values == expected.values).all()
Ejemplo n.º 10
0
def test_index_col():
    """``index_col`` must be rejected with an error mentioning ``set_index``."""
    with filetext(text) as fn:
        try:
            read_csv(fn, chunkbytes=30, index_col="name")
        except ValueError as err:
            assert "set_index" in str(err)
        else:
            # Reaching this branch means no ValueError was raised.
            assert False
Ejemplo n.º 11
0
def test_read_csv(dd_read, pd_read, text, sep):
    """Compare a dask reader with its pandas counterpart on ``text``.

    ``dd_read`` / ``pd_read`` are the reader callables under test, ``text`` the
    file content and ``sep`` the separator passed to both readers.
    """
    with filetext(text) as fn:
        ddf = dd_read(fn, blocksize=30, lineterminator=os.linesep, sep=sep)
        assert list(ddf.columns) == ['name', 'amount']
        # The computed index may differ from pandas', so it is dropped.
        computed = ddf.compute(scheduler='sync')
        actual = computed.reset_index(drop=True)
        assert_eq(actual, pd_read(fn, sep=sep))
Ejemplo n.º 12
0
def test_index_col():
    """``index_col`` must be rejected with an error mentioning ``set_index``."""
    with filetext(csv_text) as fn:
        try:
            dd.read_csv(fn, blocksize=30, index_col='name')
        except ValueError as err:
            assert 'set_index' in str(err)
        else:
            # The call returned normally, which this test treats as a failure.
            assert False
Ejemplo n.º 13
0
def test_read_gzip_csv():
    """Read a gzip-compressed copy of ``text`` and compare it with pandas."""
    with filetext(text.encode(), open=gzip.open) as fn:
        ddf = dd.read_csv(fn, chunkbytes=30, compression='gzip')
        assert list(ddf.columns) == ['name', 'amount']
        assert ddf.npartitions > 1
        computed = ddf.compute(get=dask.get)
        # Drop the computed index before comparing with the pandas result.
        actual = computed.reset_index(drop=True)
        expected = pd.read_csv(fn, compression='gzip')
        assert eq(actual, expected)
Ejemplo n.º 14
0
def test_read_csv_header_issue_823():
    """Tab-separated input reads the same via ``sep`` and via ``delimiter``."""
    content = '''a b c-d\n1 2 3\n4 5 6'''.replace(' ', '\t')
    with filetext(content) as fn:
        # Both keyword spellings must agree with pandas.
        for keyword in ('sep', 'delimiter'):
            options = {keyword: '\t'}
            ddf = dd.read_csv(fn, **options)
            assert_eq(ddf, pd.read_csv(fn, **options))
Ejemplo n.º 15
0
def test_read_csv_categorize():
    """``categorize=True`` yields a categorical ``name`` column."""
    with filetext(text) as fn:
        ddf = dd.read_csv(fn, chunksize=3, categorize=True)
        assert list(ddf.dtypes) == ['category', 'i8']

        # Build the pandas equivalent by casting the column by hand.
        reference = pd.read_csv(fn)
        reference['name'] = reference.name.astype('category')
        assert eq(ddf, reference)
Ejemplo n.º 16
0
def test_read_csv():
    """Read ``text`` in 30-byte chunks with the platform line terminator."""
    with filetext(text) as fn:
        ddf = dd.read_csv(fn, chunkbytes=30, lineterminator=os.linesep)
        assert list(ddf.columns) == ['name', 'amount']
        computed = ddf.compute(get=dask.get)
        # The computed index may differ from pandas', so it is dropped.
        actual = computed.reset_index(drop=True)
        assert eq(actual, pd.read_csv(fn))
Ejemplo n.º 17
0
def test_read_csv_categorize_and_index():
    """Reading with ``index='amount'`` names the index accordingly.

    NOTE(review): despite the test name, ``categorize`` is not passed to
    ``read_csv`` here while the expected frame is cast to category; confirm
    whether that is intended.
    """
    with filetext(text) as fn:
        ddf = read_csv(fn, chunkbytes=20, index='amount')
        index_name = ddf.index.compute().name
        assert index_name == 'amount'

        reference = pd.read_csv(fn).set_index('amount')
        reference['name'] = reference.name.astype('category')
        assert eq(ddf, reference)
Ejemplo n.º 18
0
def test_read_csv_with_datetime_index_partitions_n():
    """Read ``timeseries`` with a parsed ``Date`` index using 400-byte chunks."""
    with filetext(timeseries) as fn:
        pandas_frame = pd.read_csv(
            fn,
            index_col=0,
            header=0,
            usecols=[0, 4],
            parse_dates=['Date'],
        )
        # The file is small, so a small chunkbytes is passed explicitly.
        dask_frame = dd.read_csv(
            fn,
            index='Date',
            header=0,
            usecols=[0, 4],
            parse_dates=['Date'],
            chunkbytes=400,
        )
        # NOTE(review): the result of eq is not asserted; presumably eq raises
        # on mismatch - confirm.
        eq(pandas_frame, dask_frame)
Ejemplo n.º 19
0
def test_from_pandas_with_datetime_index():
    """``dd.from_pandas`` accepts a datetime-indexed frame.

    The frame is converted twice: with a positional ``2`` and with
    ``chunksize=2``.
    """
    with filetext(timeseries) as fn:
        pandas_frame = pd.read_csv(
            fn,
            index_col=0,
            header=0,
            usecols=[0, 4],
            parse_dates=['Date'],
        )
        # NOTE(review): eq's return value is not asserted in either call;
        # presumably eq raises on mismatch - confirm.
        by_position = dd.from_pandas(pandas_frame, 2)
        eq(pandas_frame, by_position)
        by_chunksize = dd.from_pandas(pandas_frame, chunksize=2)
        eq(pandas_frame, by_chunksize)
Ejemplo n.º 20
0
def test_string_blocksize():
    """String block sizes are accepted with or without a unit suffix."""
    with filetext(timeseries) as fn:
        with_unit = dd.read_csv(fn, blocksize='30B')
        without_unit = dd.read_csv(fn, blocksize='30')
        assert with_unit.npartitions == without_unit.npartitions

        # A 64MiB block is expected to hold the whole fixture in one partition.
        one_block = dd.read_csv(fn, blocksize='64MiB')
        assert one_block.npartitions == 1
Ejemplo n.º 21
0
def test_read_csv_of_modified_file_has_different_name():
    """Appending to the file changes the keys of the resulting dask graph."""
    with filetext(text) as fn:
        before = read_csv(fn)
        # Append one more record and fsync it before reading again.
        with open(fn, 'a') as handle:
            handle.write('\nGeorge,700')
            os.fsync(handle)
        after = read_csv(fn)

        keys_before = sorted(before.dask)
        keys_after = sorted(after.dask)
        assert keys_before != keys_after
Ejemplo n.º 22
0
def test_read_csv():
    """Read ``text`` in 30-byte chunks and compare the result with pandas."""
    with filetext(text) as fn:
        ddf = dd.read_csv(fn, chunkbytes=30)
        assert list(ddf.columns) == ['name', 'amount']
        assert ddf.npartitions > 1
        computed = ddf.compute(get=dask.get)
        # The computed index may differ from pandas', so it is dropped.
        actual = computed.reset_index(drop=True)
        assert eq(actual, pd.read_csv(fn))
Ejemplo n.º 23
0
def test_report_dtype_correction_on_csvs():
    """A dtype mismatch found at compute time must suggest the right dtype.

    The file holds 1000 integer rows followed by one float row, and computing
    it is expected to raise a ``ValueError`` whose message contains the
    suggested ``'numbers': 'float64'`` entry.
    """
    text = 'numbers,names\n' + '1,foo\n' * 1000 + '1.5,bar\n'
    with filetext(text) as fn:
        with pytest.raises(ValueError) as e:
            dd.read_csv(fn).compute(get=get_sync)
        # Inspect the exception itself. ``str(e)`` stringifies pytest's
        # ExceptionInfo wrapper, whose text is not guaranteed to contain the
        # exception message (it no longer does on pytest >= 5).
        assert "'numbers': 'float64'" in str(e.value)
Ejemplo n.º 24
0
def test_read_csv_has_deterministic_name():
    """Identical ``read_csv`` calls share a name; different options do not."""
    with filetext(csv_text) as fn:
        first = dd.read_csv(fn)
        second = dd.read_csv(fn)
        assert first._name == second._name
        first_keys = sorted(first.dask.keys(), key=str)
        second_keys = sorted(second.dask.keys(), key=str)
        assert first_keys == second_keys
        assert isinstance(first._name, str)

        # Changing the keyword arguments must change the collection name.
        other = dd.read_csv(fn, skiprows=1, na_values=[0])
        assert first._name != other._name
Ejemplo n.º 25
0
def test_textblock(myopen, compression):
    """``textblock`` yields the lines selected by the given byte positions.

    ``myopen`` is the opener used to write the fixture and ``compression`` is
    forwarded to ``textblock``.
    """
    payload = b'123 456 789 abc def ghi'.replace(b' ', os.linesep.encode())
    with filetext(payload, open=myopen, mode='wb') as fn:
        middle = ''.join(textblock(fn, 1, 11, compression)).encode()
        assert middle == ('456 789 '.replace(' ', os.linesep)).encode()
        # Every token in the returned block is exactly three bytes long.
        assert {len(token) for token in middle.split()} == {3}

        # Length of the first line including its terminator.
        first_line_len = 3 + len(os.linesep)
        head = ''.join(textblock(fn, 0, first_line_len, compression))
        assert head.encode() == ('123' + os.linesep).encode()
        nothing = ''.join(
            textblock(fn, first_line_len, first_line_len, compression))
        assert nothing.encode() == b''
Ejemplo n.º 26
0
def test_read_csv_with_datetime_index_partitions_one():
    """Read ``timeseries`` with a parsed ``Date`` index as a single chunk."""
    with filetext(timeseries) as fn:
        pandas_frame = pd.read_csv(
            fn,
            index_col=0,
            header=0,
            usecols=[0, 4],
            parse_dates=["Date"],
        )
        # chunkbytes is set large explicitly to get a single chunk.
        # NOTE(review): eq's return value is not asserted below; presumably eq
        # raises on mismatch - confirm.
        explicit = dd.read_csv(
            fn,
            index="Date",
            header=0,
            usecols=[0, 4],
            parse_dates=["Date"],
            chunkbytes=10000000,
        )
        eq(pandas_frame, explicit)

        # The file is so small that the default also gives a single chunk.
        default = dd.read_csv(
            fn,
            index="Date",
            header=0,
            usecols=[0, 4],
            parse_dates=["Date"],
        )
        eq(pandas_frame, default)
Ejemplo n.º 27
0
def test_textblock(myopen, compression):
    """``textblock`` yields the lines selected by the given byte positions.

    ``myopen`` is the opener used to write the fixture and ``compression`` is
    forwarded to ``textblock``.
    """

    def read_block(path, start, stop):
        # Join the yielded pieces and return them as bytes.
        return "".join(textblock(path, start, stop, compression)).encode()

    raw = b"123 456 789 abc def ghi".replace(b" ", os.linesep.encode())
    with filetext(raw, open=myopen, mode="wb") as fn:
        block = read_block(fn, 1, 11)
        assert block == ("456 789 ".replace(" ", os.linesep)).encode()
        # All tokens in the block have length three.
        assert set(len(part) for part in block.split()) == {3}

        k = 3 + len(os.linesep)
        assert read_block(fn, 0, k) == ("123" + os.linesep).encode()
        assert read_block(fn, k, k) == b""
Ejemplo n.º 28
0
def test_read_csv_categorize():
    """``categorize=True`` gives a categorical ``name`` column with 6 categories."""
    with filetext(text) as fn:
        ddf = read_csv(fn, chunkbytes=30, categorize=True)
        assert list(ddf.dtypes) == ['category', 'i8']

        # pandas reference with the column cast to category by hand.
        reference = pd.read_csv(fn)
        reference['name'] = reference.name.astype('category')

        dtypes_match = ddf.dtypes == reference.dtypes
        assert dtypes_match.all()
        categories = ddf.compute().name.cat.categories
        assert len(categories) == 6
Ejemplo n.º 29
0
def test_read_csv_categorize():
    """``categorize=True`` gives a categorical ``name`` column with 6 categories."""
    with filetext(text) as fn:
        categorized = read_csv(fn, chunkbytes=30, categorize=True)
        assert list(categorized.dtypes) == ["category", "i8"]

        # Build the equivalent frame with pandas for the dtype comparison.
        pandas_frame = pd.read_csv(fn)
        pandas_frame["name"] = pandas_frame.name.astype("category")

        assert (categorized.dtypes == pandas_frame.dtypes).all()
        name_column = categorized.compute().name
        assert len(name_column.cat.categories) == 6
Ejemplo n.º 30
0
def test_read_csv_of_modified_file_has_different_name():
    """Appending to the file changes the keys of the resulting dask graph.

    The one-second sleeps are kept from the original test.
    NOTE(review): presumably they ensure the file's modification time changes
    between reads - confirm.
    """
    with filetext(csv_text) as fn:
        sleep(1)
        before = dd.read_csv(fn)
        sleep(1)
        # Append one more record and fsync it before reading again.
        with open(fn, 'a') as handle:
            handle.write('\nGeorge,700')
            os.fsync(handle)
        after = dd.read_csv(fn)

        keys_before = sorted(before.dask, key=str)
        keys_after = sorted(after.dask, key=str)
        assert keys_before != keys_after
Ejemplo n.º 31
0
def test_read_csv_skiprows_range():
    """``skiprows`` given as a ``range`` behaves the same as in pandas."""
    with filetext(csv_text) as fn:
        actual = dd.read_csv(fn, skiprows=range(5))
        reference = pd.read_csv(fn, skiprows=range(5))
        assert_eq(actual, reference)
Ejemplo n.º 32
0
def test_usecols():
    """``usecols`` selects the same columns and values as pandas."""
    with filetext(timeseries) as fn:
        ddf = dd.read_csv(fn, blocksize=30, usecols=["High", "Low"])
        reference = pd.read_csv(fn, usecols=["High", "Low"])
        values_equal = ddf.compute().values == reference.values
        assert values_equal.all()
Ejemplo n.º 33
0
def test_empty_csv_file():
    """A header-only file yields an empty frame with the header's columns."""
    with filetext("a,b") as fn:
        ddf = dd.read_csv(fn, header=0)
        row_count = len(ddf.compute())
        assert row_count == 0
        assert list(ddf.columns) == ["a", "b"]
Ejemplo n.º 34
0
def test_file_size():
    """``file_size`` reports the text length for plain and gzip files.

    Two sizes are accepted: the raw length, and the length plus one extra
    byte per newline.
    NOTE(review): presumably the latter covers '\\n' being written as two
    bytes on some platforms - confirm.
    """
    plain_length = len(text)
    acceptable = (plain_length, plain_length + text.count('\n'))
    with filetext(text) as fn:
        assert file_size(fn) in acceptable
    with filetext(text.encode(), open=GzipFile) as fn:
        assert file_size(fn, 'gzip') in acceptable
Ejemplo n.º 35
0
def test_read_csv_with_nrows():
    """``nrows`` limits the read to one partition and matches pandas."""
    with filetext(text) as fn:
        limited = dd.read_csv(fn, nrows=3)
        assert list(limited.columns) == ['name', 'amount']
        assert limited.npartitions == 1
        # Read again for the comparison, exactly as the columns check did.
        actual = dd.read_csv(fn, nrows=3)
        reference = pd.read_csv(fn, nrows=3)
        assert eq(actual, reference)
Ejemplo n.º 36
0
def test_read_csv_categorize_with_parse_dates():
    """``categorize`` and ``parse_dates`` together give the expected dtypes."""
    with filetext(datetime_csv_file) as fn:
        ddf = read_csv(
            fn,
            chunkbytes=30,
            categorize=True,
            parse_dates=['when'],
        )
        column_dtypes = list(ddf.dtypes)
        assert column_dtypes == ['category', 'i8', 'M8[ns]']
Ejemplo n.º 37
0
def test_empty_csv_file():
    """A header-only file yields an empty frame with the header's columns."""
    with filetext('a,b') as fn:
        header_only = dd.read_csv(fn, header=0)
        computed = header_only.compute()
        assert len(computed) == 0
        assert list(header_only.columns) == ['a', 'b']
Ejemplo n.º 38
0
def test_read_csv_large_skiprows(dd_read, pd_read, text, skip):
    """Compare the dask and pandas readers when ``skiprows=skip`` is passed.

    Column names are supplied explicitly via ``names``.
    """
    column_names = ["name", "amount"]
    with filetext(text) as fn:
        from_dask = dd_read(fn, skiprows=skip, names=column_names)
        from_pandas = pd_read(fn, skiprows=skip, names=column_names)
        assert_eq(from_dask, from_pandas)
Ejemplo n.º 39
0
def test_usecols():
    """``usecols`` selects the same columns and values as pandas."""
    with filetext(datetime_csv_file) as fn:
        ddf = dd.read_csv(fn, chunkbytes=30, usecols=['when', 'amount'])
        reference = pd.read_csv(fn, usecols=['when', 'amount'])
        values_equal = ddf.compute().values == reference.values
        assert values_equal.all()
Ejemplo n.º 40
0
def test_csv_with_integer_names():
    """With ``header=None`` the columns are labelled with integers 0 and 1."""
    with filetext('alice,1\nbob,2') as fn:
        headerless = dd.read_csv(fn, header=None)
        column_labels = list(headerless.columns)
        assert column_labels == [0, 1]
Ejemplo n.º 41
0
def test_none_usecols():
    """``usecols=None`` is accepted and compared against pandas.

    NOTE(review): the result of ``eq`` is not asserted; presumably ``eq``
    raises on mismatch - confirm.
    """
    with filetext(text) as fn:
        from_dask = dd.read_csv(fn, usecols=None)
        from_pandas = pd.read_csv(fn, usecols=None)
        eq(from_dask, from_pandas)
Ejemplo n.º 42
0
def test_read_csv_singleton_dtype():
    """A single ``dtype`` (not a mapping) applies to every column."""
    content = b"a,b\n1,2\n3,4\n5,6"
    with filetext(content, mode="wb") as fn:
        from_pandas = pd.read_csv(fn, dtype=float)
        from_dask = dd.read_csv(fn, dtype=float)
        assert_eq(from_pandas, from_dask)
Ejemplo n.º 43
0
def test_late_dtypes():
    """Check the errors raised when sampled dtypes turn out to be wrong.

    The file has 1000 rows of integers (with an empty ``names`` field)
    followed by one row holding floats, a string and a far-future date. It is
    always read with ``sample=50``. Four failure variants are checked, then
    the successful read with fully specified dtypes.

    NOTE: the first two checks use ``e.match``, which treats its argument as
    a regular expression rather than comparing for equality.
    """
    text = (
        "numbers,names,more_numbers,integers,dates\n"
        + "1,,2,3,2017-10-31 00:00:00\n" * 1000
        + "1.5,bar,2.5,3,4998-01-01 00:00:00\n"
    )

    # Suffix appended to the dtype message when date parsing also failed.
    date_suffix = (
        "\n"
        "\n"
        "-------------------------------------------------------------\n"
        "\n"
        "The following columns also failed to properly parse as dates:\n"
        "\n"
        "- dates\n"
        "\n"
        "This is usually due to an invalid value in that column. To\n"
        "diagnose and fix it's recommended to drop these columns from the\n"
        "`parse_dates` keyword, and manually convert them to dates later\n"
        "using `dd.to_datetime`."
    )

    # Message when no dtypes are given: three columns are mismatched.
    all_columns_msg = (
        "Mismatched dtypes found in `pd.read_csv`/`pd.read_table`.\n"
        "\n"
        "+--------------+---------+----------+\n"
        "| Column       | Found   | Expected |\n"
        "+--------------+---------+----------+\n"
        "| more_numbers | float64 | int64    |\n"
        "| names        | object  | float64  |\n"
        "| numbers      | float64 | int64    |\n"
        "+--------------+---------+----------+\n"
        "\n"
        "- names\n"
        "  ValueError(.*)\n"
        "\n"
        "Usually this is due to dask's dtype inference failing, and\n"
        "*may* be fixed by specifying dtypes manually by adding:\n"
        "\n"
        "dtype={'more_numbers': 'float64',\n"
        "       'names': 'object',\n"
        "       'numbers': 'float64'}\n"
        "\n"
        "to the call to `read_csv`/`read_table`."
    )

    # Message when ``names`` is given: only the numeric columns are mismatched.
    numeric_columns_msg = (
        "Mismatched dtypes found in `pd.read_csv`/`pd.read_table`.\n"
        "\n"
        "+--------------+---------+----------+\n"
        "| Column       | Found   | Expected |\n"
        "+--------------+---------+----------+\n"
        "| more_numbers | float64 | int64    |\n"
        "| numbers      | float64 | int64    |\n"
        "+--------------+---------+----------+\n"
        "\n"
        "Usually this is due to dask's dtype inference failing, and\n"
        "*may* be fixed by specifying dtypes manually by adding:\n"
        "\n"
        "dtype={'more_numbers': 'float64',\n"
        "       'numbers': 'float64'}\n"
        "\n"
        "to the call to `read_csv`/`read_table`.\n"
        "\n"
        "Alternatively, provide `assume_missing=True` to interpret\n"
        "all unspecified integer columns as floats."
    )

    # Message when all dtypes are given and only date parsing fails.
    dates_only_msg = (
        "Mismatched dtypes found in `pd.read_csv`/`pd.read_table`.\n"
        "\n"
        "The following columns failed to properly parse as dates:\n"
        "\n"
        "- dates\n"
        "\n"
        "This is usually due to an invalid value in that column. To\n"
        "diagnose and fix it's recommended to drop these columns from the\n"
        "`parse_dates` keyword, and manually convert them to dates later\n"
        "using `dd.to_datetime`."
    )

    with filetext(text) as fn:
        expected = pd.read_csv(fn)

        # No dtypes given: pattern match, with and without date parsing.
        with pytest.raises(ValueError) as e:
            ddf = dd.read_csv(fn, sample=50, parse_dates=["dates"])
            ddf.compute(scheduler="sync")
        assert e.match(all_columns_msg + date_suffix)

        with pytest.raises(ValueError) as e:
            ddf = dd.read_csv(fn, sample=50)
            ddf.compute(scheduler="sync")
        assert e.match(all_columns_msg)

        # ``names`` given as object: exact message comparison.
        with pytest.raises(ValueError) as e:
            ddf = dd.read_csv(fn, sample=50, dtype={"names": "O"})
            ddf.compute(scheduler="sync")
        assert str(e.value) == numeric_columns_msg

        with pytest.raises(ValueError) as e:
            ddf = dd.read_csv(
                fn, sample=50, parse_dates=["dates"], dtype={"names": "O"}
            )
            ddf.compute(scheduler="sync")
        assert str(e.value) == numeric_columns_msg + date_suffix

        # All dtypes given: only the date column is reported.
        with pytest.raises(ValueError) as e:
            ddf = dd.read_csv(
                fn,
                sample=50,
                parse_dates=["dates"],
                dtype={"more_numbers": float, "names": object, "numbers": float},
            )
            ddf.compute(scheduler="sync")
        assert str(e.value) == dates_only_msg

        # All dtypes given and no date parsing: the read succeeds.
        ddf = dd.read_csv(
            fn,
            sample=50,
            dtype={"more_numbers": float, "names": object, "numbers": float},
        )
        assert_eq(ddf, expected)
Ejemplo n.º 44
0
def test_windows_line_terminator():
    """``\\r\\n``-terminated rows are read correctly with a 5-byte blocksize."""
    content = "a,b\r\n1,2\r\n2,3\r\n3,4\r\n4,5\r\n5,6\r\n6,7"
    with filetext(content) as fn:
        ddf = dd.read_csv(fn, blocksize=5, lineterminator="\r\n")
        # Column b holds 2..7 and column a holds 1..6.
        total_b = ddf.b.sum().compute()
        assert total_b == sum(range(2, 8))
        total_a = ddf.a.sum().compute()
        assert total_a == sum(range(1, 7))
Ejemplo n.º 45
0
def test_filesize(myopen, compression):
    """``file_size`` equals the length of the bytes written to the file.

    ``myopen`` is the opener used to write the fixture and ``compression`` is
    forwarded to ``file_size``.
    """
    separator = os.linesep.encode()
    payload = b'123 456 789 abc def ghi'.replace(b' ', separator)
    with filetext(payload, open=myopen, mode='wb') as fn:
        reported = file_size(fn, compression)
        assert reported == len(payload)
Ejemplo n.º 46
0
def test_read_csv_slash_r():
    """A bare ``\\r`` inside a field does not break reading with ``\\n`` rows.

    The test only requires that the computation completes without raising.
    """
    repeated_rows = b'0,my\n1,data\n' * 1000
    payload = repeated_rows + b'2,foo\rbar'
    with filetext(payload, mode='wb') as fn:
        ddf = dd.read_csv(
            fn,
            header=None,
            sep=',',
            lineterminator='\n',
            names=['a', 'b'],
            blocksize=200,
        )
        ddf.compute(get=dask.get)
Ejemplo n.º 47
0
def test_infer_header():
    """``infer_header`` reports whether the first row is a header."""
    cases = (
        ('name,val\nAlice,100\nNA,200', True),
        ('Alice,100\nNA,200', False),
    )
    for content, has_header in cases:
        with filetext(content) as fn:
            assert infer_header(fn) == has_header
Ejemplo n.º 48
0
def test_textblock_multibyte_linesep():
    """``textblock`` handles a two-character line separator.

    The buffer size (2) is deliberately tiny relative to the data.
    """
    payload = b'12 34 56 78'.replace(b' ', b'\r\n')
    with filetext(payload, mode='wb') as fn:
        lines = textblock(fn, 5, 13, linesep='\r\n', buffersize=2)
        encoded = [line.encode() for line in lines]
        assert encoded == [b'56\r\n', b'78']
Ejemplo n.º 49
0
def test_read_csv_singleton_dtype():
    """A single ``dtype`` (not a mapping) is accepted by ``dd.read_csv``.

    NOTE(review): the result of ``eq`` is not asserted; presumably ``eq``
    raises on mismatch - confirm.
    """
    content = b'a,b\n1,2\n3,4\n5,6'
    with filetext(content, mode='wb') as fn:
        from_pandas = pd.read_csv(fn, dtype=float)
        from_dask = dd.read_csv(fn, dtype=float)
        eq(from_pandas, from_dask)