def test_error_if_sample_is_too_small():
    """A sample ending inside the header row raises unless ``header=None``."""
    csv_body = ('AAAAA,BBBBB,CCCCC,DDDDD,EEEEE\n'
                '1,2,3,4,5\n'
                '6,7,8,9,10\n'
                '11,12,13,14,15')
    with filetext(csv_body) as fn:
        # 20 bytes ends before the 30-byte header line is complete
        nbytes = 20
        with pytest.raises(ValueError):
            dd.read_csv(fn, sample=nbytes)

        # With header=None there is no header row to truncate
        got = dd.read_csv(fn, sample=nbytes, header=None)
        want = pd.read_csv(fn, header=None)
        assert_eq(got, want)

    # Same checks again, with three comment lines in front that are skipped
    preamble = ('# skip\n'
                '# these\n'
                '# lines\n')
    with filetext(preamble + csv_body) as fn:
        # The sample still ends inside the header row once the preamble is skipped
        nbytes = 20 + len(preamble)
        with pytest.raises(ValueError):
            dd.read_csv(fn, sample=nbytes, skiprows=3)

        got = dd.read_csv(fn, sample=nbytes, header=None, skiprows=3)
        want = pd.read_csv(fn, header=None, skiprows=3)
        assert_eq(got, want)
def test_assume_missing():
    """Check how ``assume_missing=True`` interacts with the ``dtype`` keyword.

    The data has 1000 integer-looking rows followed by one row with floats,
    and each read uses a small ``sample`` (50 or 30 bytes).
    """
    text = ('numbers,names,more_numbers,integers\n'
            + '1,foo,2,3\n' * 1000
            + '1.5,bar,2.5,3\n')
    with filetext(text) as fn:
        sol = pd.read_csv(fn)

        # assume_missing affects all columns
        res = dd.read_csv(fn, sample=50, assume_missing=True)
        assert_eq(res, sol.astype({'integers': float}))

        # assume_missing doesn't override specified dtypes
        res = dd.read_csv(fn, sample=50, assume_missing=True,
                          dtype={'integers': 'int64'})
        assert_eq(res, sol)

        # assume_missing works with dtype=None
        res = dd.read_csv(fn, sample=50, assume_missing=True, dtype=None)
        assert_eq(res, sol.astype({'integers': float}))

    text = 'numbers,integers\n' + '1,2\n' * 1000 + '1.5,2\n'
    with filetext(text) as fn:
        # assume_missing ignored when all dtypes specified
        df = dd.read_csv(fn, sample=30, dtype='int64', assume_missing=True)
        assert df.numbers.dtype == 'int64'
def test_read_csv():
    """Read the sample CSV with row-based ``chunksize`` and check the layout."""
    with filetext(text) as fn:
        frame = dd.read_csv(fn, chunksize=3)
        assert list(frame.columns) == ['name', 'amount']
        assert frame.npartitions == 2
        expected = pd.read_csv(fn)
        assert eq(frame, expected)

    with filetext(text) as fn:
        frame = dd.read_csv(fn, chunksize=4)
        assert frame.npartitions == 2

        # Reading with default arguments only has to succeed
        frame = dd.read_csv(fn)
def test_late_dtypes():
    """A float row after the sampled rows gives a helpful dtype error."""
    text = ('numbers,names,more_numbers,integers\n'
            + '1,foo,2,3\n' * 1000
            + '1.5,bar,2.5,3\n')

    with filetext(text) as fn:
        sol = pd.read_csv(fn)

        with pytest.raises(ValueError) as excinfo:
            dd.read_csv(fn, sample=50).compute(get=get_sync)

        # The whole message is compared verbatim
        expected_msg = ("Mismatched dtypes found.\n"
                        "Expected integers, but found floats for columns:\n"
                        "- 'more_numbers'\n"
                        "- 'numbers'\n"
                        "\n"
                        "To fix, specify dtypes manually by adding:\n"
                        "\n"
                        "dtype={'more_numbers': float,\n"
                        " 'numbers': float}\n"
                        "\n"
                        "to the call to `read_csv`/`read_table`.\n"
                        "\n"
                        "Alternatively, provide `assume_missing=True` to interpret "
                        "all unspecified integer columns as floats.")
        assert str(excinfo.value) == expected_msg

        # Passing the suggested dtypes makes the read succeed
        fixed_dtypes = {'more_numbers': float, 'numbers': float}
        res = dd.read_csv(fn, sample=50, dtype=fixed_dtypes)
        assert_eq(res, sol)
def test_index_col():
    """``index_col`` is rejected with a ValueError that mentions ``set_index``."""
    with filetext(text) as fn:
        try:
            dd.read_csv(fn, chunkbytes=30, index_col='name')
        except ValueError as err:
            assert 'set_index' in str(err)
        else:
            # No exception at all means the test has failed
            assert False
def test_parse_dates_multi_column():
    """Combining two columns via ``parse_dates`` matches pandas in columns and length."""
    date_spec = [['date', 'time']]
    with filetext(pdmc_text) as fn:
        dask_frame = dd.read_csv(fn, parse_dates=date_spec)
        pandas_frame = pd.read_csv(fn, parse_dates=date_spec)

        assert (pandas_frame.columns == dask_frame.columns).all()
        assert len(pandas_frame) == len(dask_frame)
def test_read_csv_sep():
    """A multi-character separator gives the same columns and length as pandas."""
    delimiter = "###"
    with filetext(sep_text) as fn:
        dask_frame = dd.read_csv(fn, sep=delimiter)
        pandas_frame = pd.read_csv(fn, sep=delimiter)

        assert (pandas_frame.columns == dask_frame.columns).all()
        assert len(pandas_frame) == len(dask_frame)
def test_read_csv():
    """Reading in 30-byte chunks gives the same values as pandas after sorting."""
    with filetext(text) as fn:
        frame = dd.read_csv(fn, chunkbytes=30)
        assert list(frame.columns) == ["name", "amount"]
        assert frame.npartitions > 1

        # Sort both sides by name before comparing raw values
        computed = frame.compute(get=dask.get).sort("name")
        reference = pd.read_csv(fn).sort("name")
        assert (computed.values == reference.values).all()
def test_read_gzip_csv():
    """A gzip-compressed CSV read in 30-byte chunks matches pandas after sorting."""
    with filetext(text.encode(), open=gzip.open) as fn:
        frame = dd.read_csv(fn, chunkbytes=30, compression="gzip")
        assert list(frame.columns) == ["name", "amount"]
        assert frame.npartitions > 1

        # Sort both sides by name before comparing raw values
        computed = frame.compute(get=dask.get).sort("name")
        reference = pd.read_csv(fn, compression="gzip").sort("name")
        assert (computed.values == reference.values).all()
def test_index_col():
    """``index_col`` is rejected with a ValueError that mentions ``set_index``."""
    with filetext(text) as fn:
        try:
            read_csv(fn, chunkbytes=30, index_col="name")
        except ValueError as err:
            assert "set_index" in str(err)
        else:
            # No exception at all means the test has failed
            assert False
def test_read_csv(dd_read, pd_read, text, sep):
    """The dask reader matches the pandas reader for the given text and separator.

    ``dd_read``, ``pd_read``, ``text`` and ``sep`` are supplied by the caller
    (presumably a pytest parametrization defined outside this block).
    """
    with filetext(text) as fn:
        frame = dd_read(fn, blocksize=30, lineterminator=os.linesep, sep=sep)
        assert list(frame.columns) == ['name', 'amount']

        # Drop the index before comparing, since it may differ
        computed = frame.compute(scheduler='sync')
        computed = computed.reset_index(drop=True)
        assert_eq(computed, pd_read(fn, sep=sep))
def test_index_col():
    """``index_col`` is rejected with a ValueError that mentions ``set_index``."""
    with filetext(csv_text) as fn:
        try:
            dd.read_csv(fn, blocksize=30, index_col='name')
        except ValueError as err:
            assert 'set_index' in str(err)
        else:
            # No exception at all means the test has failed
            assert False
def test_read_gzip_csv():
    """A gzip-compressed CSV read in 30-byte chunks equals the pandas result."""
    with filetext(text.encode(), open=gzip.open) as fn:
        frame = dd.read_csv(fn, chunkbytes=30, compression='gzip')
        assert list(frame.columns) == ['name', 'amount']
        assert frame.npartitions > 1

        # Drop the index before comparing with pandas
        computed = frame.compute(get=dask.get).reset_index(drop=True)
        reference = pd.read_csv(fn, compression='gzip')
        assert eq(computed, reference)
def test_read_csv_header_issue_823():
    """Tab-separated data with a ``c-d`` header parses via ``sep`` and ``delimiter``."""
    tsv = '''a b c-d\n1 2 3\n4 5 6'''.replace(' ', '\t')
    with filetext(tsv) as fn:
        via_sep = dd.read_csv(fn, sep='\t')
        assert_eq(via_sep, pd.read_csv(fn, sep='\t'))

        via_delimiter = dd.read_csv(fn, delimiter='\t')
        assert_eq(via_delimiter, pd.read_csv(fn, delimiter='\t'))
def test_read_csv_categorize():
    """``categorize=True`` gives a categorical ``name`` column equal to pandas."""
    with filetext(text) as fn:
        frame = dd.read_csv(fn, chunksize=3, categorize=True)
        assert list(frame.dtypes) == ['category', 'i8']

        # Build the pandas reference with the same categorical column
        reference = pd.read_csv(fn)
        reference['name'] = reference.name.astype('category')
        assert eq(frame, reference)
def test_read_csv():
    """Reading with the platform line terminator reproduces the pandas frame."""
    with filetext(text) as fn:
        frame = dd.read_csv(fn, chunkbytes=30, lineterminator=os.linesep)
        assert list(frame.columns) == ['name', 'amount']

        # Drop the index before comparing, since it may differ
        computed = frame.compute(get=dask.get).reset_index(drop=True)
        assert eq(computed, pd.read_csv(fn))
def test_read_csv_categorize_and_index():
    """Reading with ``index='amount'`` gives an indexed frame with categorical names."""
    with filetext(text) as fn:
        frame = read_csv(fn, chunkbytes=20, index='amount')
        index_name = frame.index.compute().name
        assert index_name == 'amount'

        # NOTE(review): categorize=True is not passed here although the
        # reference casts 'name' to category -- presumably categorization is
        # implied when an index is requested; confirm against read_csv.
        reference = pd.read_csv(fn).set_index('amount')
        reference['name'] = reference.name.astype('category')
        assert eq(frame, reference)
def test_read_csv_with_datetime_index_partitions_n():
    """A date-indexed read with a small ``chunkbytes`` matches pandas.

    The result of ``eq`` is asserted, as the other tests in this file do,
    so the comparison is an explicit test condition.
    """
    with filetext(timeseries) as fn:
        df = pd.read_csv(fn, index_col=0, header=0, usecols=[0, 4],
                         parse_dates=['Date'])
        # The file is small, so a small chunkbytes is passed explicitly
        # (presumably to get more than one partition).
        ddf = dd.read_csv(fn, index='Date', header=0, usecols=[0, 4],
                          parse_dates=['Date'], chunkbytes=400)
        assert eq(df, ddf)
def test_from_pandas_with_datetime_index():
    """``dd.from_pandas`` round-trips a date-indexed frame.

    Both the positional ``2`` and the ``chunksize=2`` keyword forms are
    checked. The result of ``eq`` is asserted, as the other tests in this
    file do, so each comparison is an explicit test condition.
    """
    with filetext(timeseries) as fn:
        df = pd.read_csv(fn, index_col=0, header=0, usecols=[0, 4],
                         parse_dates=['Date'])
        ddf = dd.from_pandas(df, 2)
        assert eq(df, ddf)
        ddf = dd.from_pandas(df, chunksize=2)
        assert eq(df, ddf)
def test_string_blocksize():
    """String blocksizes are accepted, with or without a unit suffix."""
    with filetext(timeseries) as fn:
        with_unit = dd.read_csv(fn, blocksize='30B')
        without_unit = dd.read_csv(fn, blocksize='30')
        # '30B' and '30' must partition the file identically
        assert with_unit.npartitions == without_unit.npartitions

        # A 64MiB block holds the whole file in a single partition
        one_block = dd.read_csv(fn, blocksize='64MiB')
        assert one_block.npartitions == 1
def test_read_csv_of_modified_file_has_different_name():
    """Appending to the file changes the graph keys produced by ``read_csv``."""
    with filetext(text) as fn:
        a = read_csv(fn)
        with open(fn, 'a') as f:
            f.write('\nGeorge,700')
            # fsync only pushes what the OS already holds, so Python's own
            # buffer has to be flushed first for the sync to cover the write.
            f.flush()
            os.fsync(f.fileno())
        b = read_csv(fn)

        assert sorted(a.dask) != sorted(b.dask)
def test_read_csv():
    """Reading in 30-byte chunks gives several partitions and the pandas frame."""
    with filetext(text) as fn:
        frame = dd.read_csv(fn, chunkbytes=30)
        assert list(frame.columns) == ['name', 'amount']
        assert frame.npartitions > 1

        # Drop the index before comparing, since it may differ
        computed = frame.compute(get=dask.get).reset_index(drop=True)
        assert eq(computed, pd.read_csv(fn))
def test_report_dtype_correction_on_csvs():
    """The dtype-mismatch error names the column and the dtype it should have.

    The message is read from ``e.value`` (the raised exception). ``str(e)``
    would stringify pytest's ``ExceptionInfo`` wrapper, which is not
    guaranteed to contain the exception message.
    """
    text = 'numbers,names\n' + '1,foo\n' * 1000 + '1.5,bar\n'
    with filetext(text) as fn:
        with pytest.raises(ValueError) as e:
            dd.read_csv(fn).compute(get=get_sync)
        assert "'numbers': 'float64'" in str(e.value)
def test_read_csv_has_deterministic_name():
    """Identical reads share a name and keys; different options change the name."""
    with filetext(csv_text) as fn:
        first = dd.read_csv(fn)
        second = dd.read_csv(fn)

        assert first._name == second._name
        first_keys = sorted(first.dask.keys(), key=str)
        second_keys = sorted(second.dask.keys(), key=str)
        assert first_keys == second_keys
        assert isinstance(first._name, str)

        # Different keyword arguments must give a different name
        other = dd.read_csv(fn, skiprows=1, na_values=[0])
        assert first._name != other._name
def test_textblock(myopen, compression):
    """``textblock`` yields whole lines for the requested byte range."""
    eol = os.linesep
    payload = b'123 456 789 abc def ghi'.replace(b' ', eol.encode())
    with filetext(payload, open=myopen, mode='wb') as fn:
        chunk = ''.join(textblock(fn, 1, 11, compression)).encode()
        assert chunk == ('456 789 '.replace(' ', eol)).encode()
        # Every token in the chunk is three characters long
        assert {len(token) for token in chunk.split()} == {3}

        # k is the length of the first line including its terminator
        k = 3 + len(eol)
        first_line = ''.join(textblock(fn, 0, k, compression)).encode()
        assert first_line == ('123' + eol).encode()
        empty = ''.join(textblock(fn, k, k, compression)).encode()
        assert empty == b''
def test_read_csv_with_datetime_index_partitions_one():
    """A date-indexed read matches pandas with a huge and a default ``chunkbytes``.

    The result of ``eq`` is asserted, as the other tests in this file do,
    so each comparison is an explicit test condition.
    """
    with filetext(timeseries) as fn:
        df = pd.read_csv(fn, index_col=0, header=0, usecols=[0, 4],
                         parse_dates=["Date"])
        # chunkbytes is set explicitly, large enough for a single chunk
        ddf = dd.read_csv(fn, index="Date", header=0, usecols=[0, 4],
                          parse_dates=["Date"], chunkbytes=10000000)
        assert eq(df, ddf)

        # because fn is so small, by default, this will only be one chunk
        ddf = dd.read_csv(fn, index="Date", header=0, usecols=[0, 4],
                          parse_dates=["Date"])
        assert eq(df, ddf)
def test_textblock(myopen, compression):
    """``textblock`` yields whole lines for the requested byte range."""
    newline = os.linesep
    raw = b"123 456 789 abc def ghi".replace(b" ", newline.encode())

    def read_block(start, stop):
        # Join the yielded pieces and return them as bytes
        return "".join(textblock(fn, start, stop, compression)).encode()

    with filetext(raw, open=myopen, mode="wb") as fn:
        middle = read_block(1, 11)
        assert middle == ("456 789 ".replace(" ", newline)).encode()
        # Every token in the block is three characters long
        assert set(len(part) for part in middle.split()) == {3}

        # k is the length of the first line including its terminator
        k = 3 + len(newline)
        assert read_block(0, k) == ("123" + newline).encode()
        assert read_block(k, k) == b""
def test_read_csv_categorize():
    """``categorize=True`` yields a categorical ``name`` column with six categories."""
    with filetext(text) as fn:
        frame = read_csv(fn, chunkbytes=30, categorize=True)
        assert list(frame.dtypes) == ['category', 'i8']

        # pandas reference with the same categorical column
        reference = pd.read_csv(fn)
        reference['name'] = reference.name.astype('category')
        assert (frame.dtypes == reference.dtypes).all()

        categories = frame.compute().name.cat.categories
        assert len(categories) == 6
def test_read_csv_categorize():
    """``categorize=True`` yields a categorical ``name`` column with six categories."""
    with filetext(text) as fn:
        ddf = read_csv(fn, chunkbytes=30, categorize=True)
        assert list(ddf.dtypes) == ["category", "i8"]

        # pandas reference with the same categorical column
        pdf = pd.read_csv(fn)
        pdf["name"] = pdf.name.astype("category")
        dtypes_match = ddf.dtypes == pdf.dtypes
        assert dtypes_match.all()

        computed = ddf.compute()
        assert len(computed.name.cat.categories) == 6
def test_read_csv_of_modified_file_has_different_name():
    """Appending to the file changes the graph keys produced by ``read_csv``."""
    with filetext(csv_text) as fn:
        # Pause before and after the first read (presumably so that
        # timestamp-based file tokens can differ -- confirm).
        sleep(1)
        a = dd.read_csv(fn)
        sleep(1)
        with open(fn, 'a') as f:
            f.write('\nGeorge,700')
            # fsync only pushes what the OS already holds, so Python's own
            # buffer has to be flushed first for the sync to cover the write.
            f.flush()
            os.fsync(f.fileno())
        b = dd.read_csv(fn)

        assert sorted(a.dask, key=str) != sorted(b.dask, key=str)
def test_read_csv_skiprows_range():
    """``skiprows`` given as a ``range`` behaves the same as in pandas."""
    with filetext(csv_text) as fn:
        actual = dd.read_csv(fn, skiprows=range(5))
        reference = pd.read_csv(fn, skiprows=range(5))
        assert_eq(actual, reference)
def test_usecols():
    """Selecting the High and Low columns gives the same values as pandas."""
    wanted = ["High", "Low"]
    with filetext(timeseries) as fn:
        ddf = dd.read_csv(fn, blocksize=30, usecols=wanted)
        reference = pd.read_csv(fn, usecols=wanted)
        matches = ddf.compute().values == reference.values
        assert matches.all()
def test_empty_csv_file():
    """A header-only file yields zero rows and keeps its column names."""
    with filetext("a,b") as fn:
        ddf = dd.read_csv(fn, header=0)
        computed = ddf.compute()
        assert len(computed) == 0
        assert list(ddf.columns) == ["a", "b"]
def test_file_size():
    """``file_size`` reports the text length for plain and gzip files."""
    nchars = len(text)
    # Either the raw length, or one extra byte per newline
    # (presumably for platforms that write '\r\n' -- confirm).
    acceptable = (nchars, nchars + text.count('\n'))

    with filetext(text) as fn:
        assert file_size(fn) in acceptable

    with filetext(text.encode(), open=GzipFile) as fn:
        assert file_size(fn, 'gzip') in acceptable
def test_read_csv_with_nrows():
    """``nrows=3`` gives one partition whose content matches pandas."""
    with filetext(text) as fn:
        limited = dd.read_csv(fn, nrows=3)
        assert list(limited.columns) == ['name', 'amount']
        assert limited.npartitions == 1

        # A fresh read is compared against pandas
        fresh = dd.read_csv(fn, nrows=3)
        reference = pd.read_csv(fn, nrows=3)
        assert eq(fresh, reference)
def test_read_csv_categorize_with_parse_dates():
    """Categorizing and date parsing together give the expected dtypes."""
    expected_dtypes = ['category', 'i8', 'M8[ns]']
    with filetext(datetime_csv_file) as fn:
        frame = read_csv(fn, chunkbytes=30, categorize=True,
                         parse_dates=['when'])
        assert list(frame.dtypes) == expected_dtypes
def test_empty_csv_file():
    """A header-only file yields zero rows and keeps its column names."""
    with filetext('a,b') as fn:
        frame = dd.read_csv(fn, header=0)
        nrows = len(frame.compute())
        assert nrows == 0
        assert list(frame.columns) == ['a', 'b']
def test_read_csv_large_skiprows(dd_read, pd_read, text, skip):
    """Skipping ``skip`` rows with explicit column names matches the pandas reader.

    ``dd_read``, ``pd_read``, ``text`` and ``skip`` are supplied by the caller
    (presumably a pytest parametrization defined outside this block).
    """
    columns = ["name", "amount"]
    with filetext(text) as fn:
        actual = dd_read(fn, skiprows=skip, names=columns)
        reference = pd_read(fn, skiprows=skip, names=columns)
        assert_eq(actual, reference)
def test_usecols():
    """Selecting the when and amount columns gives the same values as pandas."""
    wanted = ['when', 'amount']
    with filetext(datetime_csv_file) as fn:
        ddf = dd.read_csv(fn, chunkbytes=30, usecols=wanted)
        reference = pd.read_csv(fn, usecols=wanted)
        matches = ddf.compute().values == reference.values
        assert matches.all()
def test_csv_with_integer_names():
    """With ``header=None`` the columns are labelled with the integers 0 and 1."""
    with filetext('alice,1\nbob,2') as fn:
        frame = dd.read_csv(fn, header=None)
        column_labels = list(frame.columns)
        assert column_labels == [0, 1]
def test_none_usecols():
    """``usecols=None`` is accepted and matches pandas.

    The result of ``eq`` is asserted, as the other tests in this file do,
    so the comparison is an explicit test condition.
    """
    with filetext(text) as fn:
        df = dd.read_csv(fn, usecols=None)
        assert eq(df, pd.read_csv(fn, usecols=None))
def test_read_csv_singleton_dtype():
    """A single ``dtype`` (not a dict) applies to every column, as in pandas."""
    payload = b"a,b\n1,2\n3,4\n5,6"
    with filetext(payload, mode="wb") as fn:
        reference = pd.read_csv(fn, dtype=float)
        actual = dd.read_csv(fn, dtype=float)
        assert_eq(reference, actual)
def test_late_dtypes():
    """Check the errors raised when dtypes inferred from the sample are wrong.

    The data has 1000 rows with integer-looking numbers, an empty ``names``
    field and a 2017 date, followed by one row with floats, a string name
    and a date in the year 4998. Each read uses ``sample=50``, and the
    expected error message is checked for several combinations of
    ``parse_dates`` and ``dtype``.
    """
    text = "numbers,names,more_numbers,integers,dates\n"
    for i in range(1000):
        text += "1,,2,3,2017-10-31 00:00:00\n"
    text += "1.5,bar,2.5,3,4998-01-01 00:00:00\n"

    # Section appended to the dtype message when the date column also fails.
    date_msg = (
        "\n"
        "\n"
        "-------------------------------------------------------------\n"
        "\n"
        "The following columns also failed to properly parse as dates:\n"
        "\n"
        "- dates\n"
        "\n"
        "This is usually due to an invalid value in that column. To\n"
        "diagnose and fix it's recommended to drop these columns from the\n"
        "`parse_dates` keyword, and manually convert them to dates later\n"
        "using `dd.to_datetime`."
    )

    with filetext(text) as fn:
        sol = pd.read_csv(fn)
        # This message is used as a regex via ``e.match`` below, which is why
        # it can contain the ``ValueError(.*)`` wildcard.
        # NOTE(review): the table rows are narrower than the border lines and
        # the dtype continuation lines are indented by one space; the padding
        # may have been lost -- confirm against the message dask emits.
        msg = (
            "Mismatched dtypes found in `pd.read_csv`/`pd.read_table`.\n"
            "\n"
            "+--------------+---------+----------+\n"
            "| Column | Found | Expected |\n"
            "+--------------+---------+----------+\n"
            "| more_numbers | float64 | int64 |\n"
            "| names | object | float64 |\n"
            "| numbers | float64 | int64 |\n"
            "+--------------+---------+----------+\n"
            "\n"
            "- names\n"
            " ValueError(.*)\n"
            "\n"
            "Usually this is due to dask's dtype inference failing, and\n"
            "*may* be fixed by specifying dtypes manually by adding:\n"
            "\n"
            "dtype={'more_numbers': 'float64',\n"
            " 'names': 'object',\n"
            " 'numbers': 'float64'}\n"
            "\n"
            "to the call to `read_csv`/`read_table`."
        )

        # No dtypes given, dates parsed: dtype section plus date section.
        with pytest.raises(ValueError) as e:
            dd.read_csv(fn, sample=50, parse_dates=["dates"]).compute(scheduler="sync")
        assert e.match(msg + date_msg)

        # No dtypes given, no date parsing: dtype section only.
        with pytest.raises(ValueError) as e:
            dd.read_csv(fn, sample=50).compute(scheduler="sync")
        assert e.match(msg)

        # With ``names`` given as object, only the numeric columns are
        # reported. From here on the messages are compared exactly.
        msg = (
            "Mismatched dtypes found in `pd.read_csv`/`pd.read_table`.\n"
            "\n"
            "+--------------+---------+----------+\n"
            "| Column | Found | Expected |\n"
            "+--------------+---------+----------+\n"
            "| more_numbers | float64 | int64 |\n"
            "| numbers | float64 | int64 |\n"
            "+--------------+---------+----------+\n"
            "\n"
            "Usually this is due to dask's dtype inference failing, and\n"
            "*may* be fixed by specifying dtypes manually by adding:\n"
            "\n"
            "dtype={'more_numbers': 'float64',\n"
            " 'numbers': 'float64'}\n"
            "\n"
            "to the call to `read_csv`/`read_table`.\n"
            "\n"
            "Alternatively, provide `assume_missing=True` to interpret\n"
            "all unspecified integer columns as floats."
        )

        with pytest.raises(ValueError) as e:
            dd.read_csv(fn, sample=50, dtype={"names": "O"}).compute(scheduler="sync")
        assert str(e.value) == msg

        with pytest.raises(ValueError) as e:
            dd.read_csv(
                fn, sample=50, parse_dates=["dates"], dtype={"names": "O"}
            ).compute(scheduler="sync")
        assert str(e.value) == msg + date_msg

        # With every mismatched dtype specified, only the date failure remains.
        msg = (
            "Mismatched dtypes found in `pd.read_csv`/`pd.read_table`.\n"
            "\n"
            "The following columns failed to properly parse as dates:\n"
            "\n"
            "- dates\n"
            "\n"
            "This is usually due to an invalid value in that column. To\n"
            "diagnose and fix it's recommended to drop these columns from the\n"
            "`parse_dates` keyword, and manually convert them to dates later\n"
            "using `dd.to_datetime`."
        )

        with pytest.raises(ValueError) as e:
            dd.read_csv(
                fn,
                sample=50,
                parse_dates=["dates"],
                dtype={"more_numbers": float, "names": object, "numbers": float},
            ).compute(scheduler="sync")
        assert str(e.value) == msg

        # Specifying dtypes works
        res = dd.read_csv(
            fn,
            sample=50,
            dtype={"more_numbers": float, "names": object, "numbers": float},
        )
        assert_eq(res, sol)
def test_windows_line_terminator():
    """CRLF-terminated rows read in 5-byte blocks still give correct column sums."""
    crlf_csv = "a,b\r\n1,2\r\n2,3\r\n3,4\r\n4,5\r\n5,6\r\n6,7"
    with filetext(crlf_csv) as fn:
        frame = dd.read_csv(fn, blocksize=5, lineterminator="\r\n")
        # Column b holds 2..7 and column a holds 1..6
        total_b = frame.b.sum().compute()
        assert total_b == sum(range(2, 8))
        total_a = frame.a.sum().compute()
        assert total_a == sum(range(1, 7))
def test_filesize(myopen, compression):
    """``file_size`` returns the uncompressed byte length of the written payload."""
    payload = b'123 456 789 abc def ghi'.replace(b' ', os.linesep.encode())
    with filetext(payload, open=myopen, mode='wb') as fn:
        reported = file_size(fn, compression)
        assert reported == len(payload)
def test_read_csv_slash_r():
    """A bare ``\\r`` inside a field is read without error when lines end in ``\\n``."""
    payload = b'0,my\n1,data\n' * 1000 + b'2,foo\rbar'
    read_options = dict(header=None, sep=',', lineterminator='\n',
                        names=['a', 'b'], blocksize=200)
    with filetext(payload, mode='wb') as fn:
        # Only successful completion is checked; the result is discarded
        frame = dd.read_csv(fn, **read_options)
        frame.compute(get=dask.get)
def test_infer_header():
    """``infer_header`` reports True with a header row and False without one."""
    with_header = 'name,val\nAlice,100\nNA,200'
    without_header = 'Alice,100\nNA,200'

    with filetext(with_header) as fn:
        detected = infer_header(fn)
        assert detected == True

    with filetext(without_header) as fn:
        detected = infer_header(fn)
        assert detected == False
def test_textblock_multibyte_linesep():
    """``textblock`` handles a two-byte separator with a two-byte buffer."""
    payload = b'12 34 56 78'.replace(b' ', b'\r\n')
    with filetext(payload, mode='wb') as fn:
        pieces = textblock(fn, 5, 13, linesep='\r\n', buffersize=2)
        encoded = [piece.encode() for piece in pieces]
        # The range 5..13 is expected to give the last two lines
        expected = [line.encode() for line in ('56\r\n', '78')]
        assert encoded == expected
def test_read_csv_singleton_dtype():
    """A single ``dtype`` (not a dict) applies to every column, as in pandas.

    The result of ``eq`` is asserted, as the other tests in this file do,
    so the comparison is an explicit test condition.
    """
    data = b'a,b\n1,2\n3,4\n5,6'
    with filetext(data, mode='wb') as fn:
        assert eq(pd.read_csv(fn, dtype=float), dd.read_csv(fn, dtype=float))