def test_read_csv_of_modified_file_has_different_name():
    """Appending a row to the file must change the graph keys of ``read_csv``."""
    with filetext(text) as fn:
        before = read_csv(fn)

        with open(fn, 'a') as f:
            f.write('\nGeorge,700')
            # os.fsync only acts on data the OS already holds, so Python's own
            # write buffer has to be flushed first for the new row to be synced.
            f.flush()
            os.fsync(f.fileno())

        after = read_csv(fn)

        assert sorted(before.dask) != sorted(after.dask)
def test_read_csv_has_deterministic_name():
    """Reading the same file twice gives equal names and equal graph keys."""
    with filetext(text) as path:
        first = read_csv(path)
        second = read_csv(path)

        assert first._name == second._name

        first_keys = sorted(first.dask.keys())
        second_keys = sorted(second.dask.keys())
        assert first_keys == second_keys

        assert isinstance(first._name, str)

        # Different parsing keywords must lead to a different name.
        with_options = read_csv(path, skiprows=1, na_values=[0])
        assert first._name != with_options._name
def test_read_csv_of_modified_file_has_different_name():
    """Appending a row to the file must change the graph keys of ``read_csv``."""
    with filetext(text) as fn:
        # NOTE(review): the one-second pauses presumably keep the file's
        # timestamps apart at whole-second resolution -- confirm.
        sleep(1)
        before = read_csv(fn)
        sleep(1)

        with open(fn, "a") as f:
            f.write("\nGeorge,700")
            # os.fsync only acts on data the OS already holds, so Python's own
            # write buffer has to be flushed first for the new row to be synced.
            f.flush()
            os.fsync(f.fileno())

        after = read_csv(fn)

        assert sorted(before.dask) != sorted(after.dask)
def test_read_csv_of_modified_file_has_different_name():
    """Graph keys must differ between reads taken before and after an append."""
    with filetext(text) as fn:
        # NOTE(review): the one-second pauses presumably keep the file's
        # timestamps apart at whole-second resolution -- confirm.
        sleep(1)
        original = read_csv(fn)
        sleep(1)

        with open(fn, 'a') as f:
            f.write('\nGeorge,700')
            # Flush Python's buffer before fsync; otherwise the appended row is
            # still in user space and fsync has nothing new to push to disk.
            f.flush()
            os.fsync(f.fileno())

        modified = read_csv(fn)

        assert sorted(original.dask) != sorted(modified.dask)
def test_read_multiple_csv():
    """A glob over two identical files yields twice the rows of a single file."""
    paths = ("_foo.1.csv", "_foo.2.csv")
    pattern = "_foo.*.csv"
    try:
        for path in paths:
            with open(path, "w") as handle:
                handle.write(text)

        # Built through the ``dd`` namespace; the result is not inspected.
        df = dd.read_csv(pattern)

        combined_rows = len(read_csv(pattern).compute())
        single_rows = len(read_csv(paths[0]).compute())
        assert combined_rows == single_rows * 2
    finally:
        for path in paths:
            os.remove(path)
def test_multiple_read_csv_has_deterministic_name():
    """Two reads of the same glob pattern produce identical graph keys."""
    paths = ("_foo.1.csv", "_foo.2.csv")
    pattern = "_foo.*.csv"
    try:
        for path in paths:
            with open(path, "w") as handle:
                handle.write(text)

        first = read_csv(pattern)
        second = read_csv(pattern)
        assert sorted(first.dask.keys()) == sorted(second.dask.keys())
    finally:
        for path in paths:
            os.remove(path)
def test_multiple_read_csv_has_deterministic_name():
    """Graph keys are stable across repeated reads of a multi-file glob."""
    file_one, file_two = '_foo.1.csv', '_foo.2.csv'
    try:
        with open(file_one, 'w') as out:
            out.write(text)
        with open(file_two, 'w') as out:
            out.write(text)

        first = read_csv('_foo.*.csv')
        second = read_csv('_foo.*.csv')

        first_keys = sorted(first.dask.keys())
        second_keys = sorted(second.dask.keys())
        assert first_keys == second_keys
    finally:
        os.remove(file_one)
        os.remove(file_two)
def test_read_multiple_csv():
    """A glob over two identical files yields twice the rows of a single file."""
    paths = ('_foo.1.csv', '_foo.2.csv')
    pattern = '_foo.*.csv'
    try:
        for path in paths:
            with open(path, 'w') as handle:
                handle.write(text)

        # Constructed once up front; the result itself is not inspected.
        df = read_csv(pattern)

        combined_rows = len(read_csv(pattern).compute())
        single_rows = len(read_csv(paths[0]).compute())
        assert combined_rows == single_rows * 2
    finally:
        for path in paths:
            os.remove(path)
def test_read_multiple_csv():
    """Row count for the glob equals twice the row count of one input file."""
    file_one, file_two = '_foo.1.csv', '_foo.2.csv'
    try:
        with open(file_one, 'w') as out:
            out.write(text)
        with open(file_two, 'w') as out:
            out.write(text)

        # Constructed once up front; the result itself is not inspected.
        df = read_csv('_foo.*.csv')

        total = len(read_csv('_foo.*.csv').compute())
        per_file = len(read_csv(file_one).compute())
        assert total == per_file * 2
    finally:
        os.remove(file_one)
        os.remove(file_two)
def test_csv_expands_dtypes():
    """The ``dtype`` keyword stored in the graph is filled in by ``read_csv``."""

    def first_task_kwargs(frame):
        # The last element of the first graph value is indexed below as a
        # mapping with a 'dtype' entry.
        return list(frame.dask.values())[0][-1]

    with filetext(text) as path:
        # An empty dtype mapping must end up the same as passing no dtype.
        with_empty = read_csv(path, chunkbytes=30, dtype={})
        empty_kwargs = first_task_kwargs(with_empty)

        with_default = read_csv(path, chunkbytes=30)
        default_kwargs = first_task_kwargs(with_default)

        assert empty_kwargs['dtype'] == default_kwargs['dtype']

        # An explicitly requested dtype must survive.
        with_float = read_csv(path, chunkbytes=30, dtype={'amount': float})
        float_kwargs = first_task_kwargs(with_float)
        assert float_kwargs['dtype']['amount'] == float
def test_read_gzip_csv():
    """A gzip-compressed CSV read in several partitions matches pandas."""
    with filetext(text.encode(), open=gzip.open) as path:
        frame = read_csv(path, chunkbytes=30, compression='gzip')

        assert list(frame.columns) == ['name', 'amount']
        assert frame.npartitions > 1

        computed = frame.compute(get=dask.get)
        result = computed.sort('name')

        expected = pd.read_csv(path, compression='gzip').sort('name')
        assert (result.values == expected.values).all()
def test_index_col():
    """Passing ``index_col`` raises a ValueError that mentions ``set_index``."""
    with filetext(text) as path:
        try:
            frame = read_csv(path, chunkbytes=30, index_col="name")
        except ValueError as err:
            message = str(err)
            assert "set_index" in message
        else:
            # Reaching this point means no ValueError was raised.
            assert False
def test_read_csv():
    """A CSV read in several partitions matches what pandas reads."""
    with filetext(text) as path:
        frame = read_csv(path, chunkbytes=30)

        assert list(frame.columns) == ['name', 'amount']
        assert frame.npartitions > 1

        computed = frame.compute(get=dask.get)
        result = computed.sort('name')

        expected = pd.read_csv(path).sort('name')
        assert (result.values == expected.values).all()
def test_read_multiple_csv():
    """A chunked glob read knows its dtypes and doubles the single-file rows."""
    paths = ('_foo.1.csv', '_foo.2.csv')
    pattern = '_foo.*.csv'
    try:
        for path in paths:
            with open(path, 'w') as handle:
                handle.write(text)

        chunked = dd.read_csv(pattern, chunkbytes=30)
        assert chunked._known_dtype
        assert chunked.npartitions > 2

        combined_rows = len(read_csv(pattern).compute())
        single_rows = len(read_csv(paths[0]).compute())
        assert combined_rows == single_rows * 2
    finally:
        for path in paths:
            os.remove(path)
def test_read_multiple_csv():
    """Two-file glob: dtypes known, more than two partitions, doubled rows."""
    file_one, file_two = '_foo.1.csv', '_foo.2.csv'
    try:
        with open(file_one, 'w') as out:
            out.write(text)
        with open(file_two, 'w') as out:
            out.write(text)

        chunked = dd.read_csv('_foo.*.csv', chunkbytes=30)
        assert chunked._known_dtype
        assert chunked.npartitions > 2

        total = len(read_csv('_foo.*.csv').compute())
        per_file = len(read_csv(file_one).compute())
        assert total == per_file * 2
    finally:
        os.remove(file_one)
        os.remove(file_two)
def test_index_col():
    """``index_col`` is rejected with a ValueError pointing at ``set_index``."""
    with filetext(text) as path:
        try:
            frame = read_csv(path, chunkbytes=30, index_col='name')
        except ValueError as err:
            assert 'set_index' in str(err)
        else:
            # No ValueError was raised, so the test must fail.
            assert False
def test_read_csv_categorize_and_index():
    """Reading with ``index='amount'`` matches a pandas frame indexed the same way."""
    with filetext(text) as path:
        frame = read_csv(path, chunkbytes=20, index='amount')

        index = frame.index.compute()
        assert index.name == 'amount'

        # Build the reference frame: same index, ``name`` as a categorical.
        expected = pd.read_csv(path).set_index('amount')
        categorical_names = expected.name.astype('category')
        expected['name'] = categorical_names

        assert eq(frame, expected)
def test_read_csv_categorize():
    """``categorize=True`` gives a categorical ``name`` column with six categories."""
    with filetext(text) as path:
        frame = read_csv(path, chunkbytes=30, categorize=True)
        assert list(frame.dtypes) == ["category", "i8"]

        # Reference frame from pandas with the same column made categorical.
        expected = pd.read_csv(path)
        expected["name"] = expected.name.astype("category")

        dtypes_match = frame.dtypes == expected.dtypes
        assert dtypes_match.all()

        categories = frame.compute().name.cat.categories
        assert len(categories) == 6
def test_read_csv_categorize():
    """With ``categorize=True`` the dtypes match a pandas frame with categorical names."""
    with filetext(text) as path:
        categorized = read_csv(path, chunkbytes=30, categorize=True)
        assert list(categorized.dtypes) == ['category', 'i8']

        reference = pd.read_csv(path)
        reference['name'] = reference.name.astype('category')
        assert (categorized.dtypes == reference.dtypes).all()

        # All six names in the fixture must show up as categories.
        computed = categorized.compute()
        assert len(computed.name.cat.categories) == 6
def test_consistent_dtypes():
    """Summing ``amount`` yields a float even when some values lack a decimal point."""
    # One record per line: without line breaks the whole fixture is a single
    # header row and there is no ``amount`` data to sum. Some amounts are
    # written as integers on purpose, others as floats.
    text = "\n".join([
        "name,amount",
        "Alice,100.5",
        "Bob,-200.5",
        "Charlie,300",
        "Dennis,400",
        "Edith,-500",
        "Frank,600",
    ])

    with filetext(text) as fn:
        df = read_csv(fn, chunkbytes=30)
        assert isinstance(df.amount.sum().compute(), float)
def test_read_csv_categorize_and_index():
    """Indexing on ``amount`` gives blocks bounded by divisions and pandas-equal data."""
    with filetext(text) as path:
        frame = read_csv(path, chunkbytes=20, index='amount')

        result = frame.compute(get=get_sync)
        assert result.index.name == 'amount'

        # Each block's index must lie between its neighbouring divisions.
        blocks = dd.core.get(frame.dask, frame._keys(), get=get_sync)
        for position, block in enumerate(blocks):
            # NOTE(review): the guard compares ``position`` with the length but
            # the lookup uses ``position + 1`` -- confirm this cannot overrun.
            if position < len(frame.divisions):
                upper = frame.divisions[position + 1]
                assert (block.index <= upper).all()
            if position > 0:
                lower = frame.divisions[position]
                assert (block.index > lower).all()

        expected = pd.read_csv(path).set_index('amount')
        expected['name'] = expected.name.astype('category')

        # Sort both sides so the comparison does not depend on row order.
        result = result.sort()
        expected = expected.sort()

        assert eq(result, expected)
def test_read_csv_categorize_with_parse_dates():
    """``categorize`` and ``parse_dates`` together give category, int and datetime dtypes."""
    with filetext(datetime_csv_file) as path:
        frame = read_csv(
            path,
            chunkbytes=30,
            categorize=True,
            parse_dates=['when'],
        )
        observed = list(frame.dtypes)
        assert observed == ['category', 'i8', 'M8[ns]']
def test_read_csv_with_nrows():
    """``nrows=3`` gives a single partition that equals the pandas result."""
    with filetext(text) as path:
        frame = read_csv(path, nrows=3)
        assert list(frame.columns) == ['name', 'amount']
        assert frame.npartitions == 1

        # Read again for the comparison, as a fresh frame.
        actual = read_csv(path, nrows=3)
        expected = pd.read_csv(path, nrows=3)
        assert eq(actual, expected)
def test_read_csv_categorize_with_parse_dates():
    """Dtypes are category, int and datetime when dates are parsed and names categorized."""
    with filetext(datetime_csv_file) as path:
        options = dict(chunkbytes=30, categorize=True, parse_dates=["when"])
        frame = read_csv(path, **options)
        assert list(frame.dtypes) == ["category", "i8", "M8[ns]"]
def test_usecols():
    """``usecols`` selects the same values as the pandas reader does."""
    wanted = ('when', 'amount')
    with filetext(datetime_csv_file) as path:
        # Each reader gets its own list so neither can affect the other's argument.
        frame = read_csv(path, chunkbytes=30, usecols=list(wanted))
        expected = pd.read_csv(path, usecols=list(wanted))

        computed = frame.compute()
        assert (computed.values == expected.values).all()
def test_read_csv_with_nrows():
    """Limiting to three rows yields one partition equal to the pandas read."""
    with filetext(text) as path:
        limited = read_csv(path, nrows=3)

        columns = list(limited.columns)
        assert columns == ["name", "amount"]
        assert limited.npartitions == 1

        # A second, fresh read is what gets compared with pandas.
        assert eq(read_csv(path, nrows=3), pd.read_csv(path, nrows=3))