def test_ufunc_numpy_scalar_comparison(pandas, scalar):
    # Regression test for issue #3392
    dask_compare = scalar >= dd.from_pandas(pandas, npartitions=3)
    pandas_compare = scalar >= pandas
    assert_eq(dask_compare, pandas_compare)

def test_dumps_serialize_numpy(df):
    header, frames = serialize(df)
    if 'compression' in header:
        frames = decompress(header, frames)
    df2 = deserialize(header, frames)
    assert_eq(df, df2)

def test_to_hdf_lock_delays():
    pytest.importorskip('tables')
    df16 = pd.DataFrame({'x': ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h',
                               'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p'],
                         'y': [1, 2, 3, 4, 5, 6, 7, 8,
                               9, 10, 11, 12, 13, 14, 15, 16]},
                        index=[1., 2., 3., 4., 5., 6., 7., 8.,
                               9., 10., 11., 12., 13., 14., 15., 16.])
    a = dd.from_pandas(df16, 16)

    # add artificial delays so that the last tasks finish first
    # (equivalently, the first tasks finish last)
    def delayed_nop(i):
        if i[1] < 10:
            sleep(0.1 * (10 - i[1]))
        return i

    # saving to multiple hdf nodes
    with tmpfile() as fn:
        a = a.apply(delayed_nop, axis=1, meta=a)
        a.to_hdf(fn, '/data*')
        out = dd.read_hdf(fn, '/data*')
        assert_eq(df16, out)

    # saving to multiple hdf files
    # the artificial delays again make the last tasks finish first
    with tmpdir() as dn:
        fn = os.path.join(dn, 'data*')
        a = a.apply(delayed_nop, axis=1, meta=a)
        a.to_hdf(fn, '/data')
        out = dd.read_hdf(fn, '/data')
        assert_eq(df16, out)

def test_groupby_unaligned_index():
    df = pd.DataFrame({'a': np.random.randint(0, 10, 50),
                       'b': np.random.randn(50),
                       'c': np.random.randn(50)})
    ddf = dd.from_pandas(df, npartitions=5)
    filtered = df[df.b < 0.5]
    dfiltered = ddf[ddf.b < 0.5]

    ddf_group = dfiltered.groupby(ddf.a)
    ds_group = dfiltered.b.groupby(ddf.a)

    bad = [ddf_group.mean(),
           ddf_group.var(),
           ddf_group.b.nunique(),
           ddf_group.get_group(0),
           ds_group.mean(),
           ds_group.var(),
           ds_group.nunique(),
           ds_group.get_group(0)]

    for obj in bad:
        with pytest.raises(ValueError):
            obj.compute()

    def add1(x):
        return x + 1

    df_group = filtered.groupby(df.a)
    good = [(ddf_group.apply(add1, meta=ddf), df_group.apply(add1)),
            (ddf_group.b.apply(add1, meta=ddf.b), df_group.b.apply(add1))]

    for (res, sol) in good:
        assert_eq(res, sol)

def test_groupby_agg_custom__mode():
    # A custom mode aggregation that passes its intermediates around as pure
    # Python objects. To protect the results from pandas' handling inside
    # apply, intermediates are returned as single-item lists.
    def agg_mode(s):
        def impl(s):
            res, = s.iloc[0]

            for i, in s.iloc[1:]:
                res = res.add(i, fill_value=0)

            return [res]

        return s.apply(impl)

    agg_func = dd.Aggregation(
        'custom_mode',
        lambda s: s.apply(lambda s: [s.value_counts()]),
        agg_mode,
        lambda s: s.map(lambda i: i[0].argmax()),
    )

    d = pd.DataFrame({
        'g0': [0, 0, 0, 1, 1] * 3,
        'g1': [0, 0, 0, 1, 1] * 3,
        'cc': [4, 5, 4, 6, 6] * 3,
    })
    a = dd.from_pandas(d, npartitions=5)

    actual = a['cc'].groupby([a['g0'], a['g1']]).agg(agg_func)

    # cheat to get the correct index
    expected = pd.DataFrame({'g0': [0, 1], 'g1': [0, 1], 'cc': [4, 6]})
    expected = expected['cc'].groupby([expected['g0'], expected['g1']]).agg('sum')

    assert_eq(actual, expected)

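# For reference, a minimal sketch of the dd.Aggregation contract exercised
# above (the `custom_sum` name is illustrative, not part of this suite):
# `chunk` runs on each partition's groupby, `agg` combines the per-partition
# results, and an optional third callable finalizes the combined result, as
# the `argmax` step does for `custom_mode` above.
#
#   custom_sum = dd.Aggregation(
#       'custom_sum',
#       chunk=lambda grouped: grouped.sum(),  # per-partition reduction
#       agg=lambda chunks: chunks.sum(),      # combine partial results
#   )
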
def test_set_index_divisions_sorted():
    p1 = pd.DataFrame({'x': [10, 11, 12], 'y': ['a', 'a', 'a']})
    p2 = pd.DataFrame({'x': [13, 14, 15], 'y': ['b', 'b', 'c']})
    p3 = pd.DataFrame({'x': [16, 17, 18], 'y': ['d', 'e', 'e']})

    ddf = dd.DataFrame({('x', 0): p1, ('x', 1): p2, ('x', 2): p3},
                       'x', p1, [None, None, None, None])
    df = ddf.compute()

    def throw(*args, **kwargs):
        raise Exception("Shouldn't have computed")

    with dask.set_options(get=throw):
        res = ddf.set_index('x', divisions=[10, 13, 16, 18], sorted=True)
    assert_eq(res, df.set_index('x'))

    with dask.set_options(get=throw):
        res = ddf.set_index('y', divisions=['a', 'b', 'd', 'e'], sorted=True)
    assert_eq(res, df.set_index('y'))

    # with sorted=True, divisions must be the same length as df.divisions
    with pytest.raises(ValueError):
        ddf.set_index('y', divisions=['a', 'b', 'c', 'd', 'e'], sorted=True)

    # Divisions must be sorted
    with pytest.raises(ValueError):
        ddf.set_index('y', divisions=['a', 'b', 'd', 'c'], sorted=True)

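# Interpretation of the `throw` guard above (a note, not an extra assertion):
# when sorted=True and divisions are supplied, set_index can use those
# divisions directly instead of computing quantiles of the new index, so
# constructing the result triggers no computation and the forbidden
# scheduler is never invoked.
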
def test_hash_pandas_object(obj):
    a = hash_pandas_object(obj)
    b = hash_pandas_object(obj)
    if isinstance(a, np.ndarray):
        np.testing.assert_equal(a, b)
    else:
        assert_eq(a, b)

def test_filters(tmpdir, write_engine, read_engine):
    fn = str(tmpdir)

    df = pd.DataFrame({'at': ['ab', 'aa', 'ba', 'da', 'bb']})
    ddf = dd.from_pandas(df, npartitions=1)

    # Ok with 1 partition and filters
    ddf.repartition(npartitions=1, force=True).to_parquet(
        fn, write_index=False, engine=write_engine)
    ddf2 = dd.read_parquet(fn, index=False, engine=read_engine,
                           filters=[('at', '==', 'aa')]).compute()
    assert_eq(ddf2, ddf)

    # with >1 partition and no filters
    ddf.repartition(npartitions=2, force=True).to_parquet(fn, engine=write_engine)
    dd.read_parquet(fn, engine=read_engine).compute()
    assert_eq(ddf2, ddf)

    # with >1 partition and filters using base fastparquet
    if read_engine == 'fastparquet':
        ddf.repartition(npartitions=2, force=True).to_parquet(fn, engine=write_engine)
        df2 = fastparquet.ParquetFile(fn).to_pandas(filters=[('at', '==', 'aa')])
        assert len(df2) > 0

    # with >1 partition and filters
    ddf.repartition(npartitions=2, force=True).to_parquet(fn, engine=write_engine)
    dd.read_parquet(fn, engine=read_engine, filters=[('at', '==', 'aa')]).compute()
    assert len(ddf2) > 0

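# Note on the behaviour under test (an interpretation, not an assertion):
# `filters` in dd.read_parquet prunes whole row-groups/partitions using
# their min/max statistics rather than dropping individual rows, which is
# why the single-partition case above still round-trips the full frame:
#
#   dd.read_parquet(fn, engine=read_engine, filters=[('at', '==', 'aa')])
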
def test_no_index(tmpdir, write_engine, read_engine):
    fn = str(tmpdir)
    df = pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6]})
    ddf = dd.from_pandas(df, npartitions=2)
    ddf.to_parquet(fn, write_index=False, engine=write_engine)
    ddf2 = dd.read_parquet(fn, engine=read_engine)
    assert_eq(df, ddf2, check_index=False)

def test_skiprows(dd_read, pd_read, files):
    files = {name: comment_header + b'\n' + content
             for name, content in files.items()}
    skip = len(comment_header.splitlines())
    with filetexts(files, mode='b'):
        df = dd_read('2014-01-*.csv', skiprows=skip)
        expected_df = pd.concat([pd_read(n, skiprows=skip)
                                 for n in sorted(files)])
        assert_eq(df, expected_df, check_dtype=False)

def test_error_if_sample_is_too_small():
    text = ('AAAAA,BBBBB,CCCCC,DDDDD,EEEEE\n'
            '1,2,3,4,5\n'
            '6,7,8,9,10\n'
            '11,12,13,14,15')
    with filetext(text) as fn:
        # Sample size stops mid header row
        sample = 20
        with pytest.raises(ValueError):
            dd.read_csv(fn, sample=sample)

        # Saying no header means this is fine
        assert_eq(dd.read_csv(fn, sample=sample, header=None),
                  pd.read_csv(fn, header=None))

    skiptext = ('# skip\n'
                '# these\n'
                '# lines\n')

    text = skiptext + text
    with filetext(text) as fn:
        # Sample size stops mid header row
        sample = 20 + len(skiptext)
        with pytest.raises(ValueError):
            dd.read_csv(fn, sample=sample, skiprows=3)

        # Saying no header means this is fine
        assert_eq(dd.read_csv(fn, sample=sample, header=None, skiprows=3),
                  pd.read_csv(fn, header=None, skiprows=3))

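# Background on the mechanism under test (stated as an assumption about
# read_csv's internals): dd.read_csv inspects only the first `sample` bytes
# to infer dtypes and to find a clean newline on which to split blocks; if
# the sample ends before the first complete row (here, mid-header), there is
# no safe boundary and a ValueError is raised. With header=None every line
# is data, so the same small sample suffices.
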
def test_read_csv(dd_read, pd_read, text, sep):
    with filetext(text) as fn:
        f = dd_read(fn, blocksize=30, lineterminator=os.linesep, sep=sep)
        assert list(f.columns) == ['name', 'amount']
        # index may be different
        result = f.compute(scheduler='sync').reset_index(drop=True)
        assert_eq(result, pd_read(fn, sep=sep))

def test_header_None():
    with filetexts({'.tmp.1.csv': '1,2',
                    '.tmp.2.csv': '',
                    '.tmp.3.csv': '3,4'}):
        df = dd.read_csv('.tmp.*.csv', header=None)
        expected = pd.DataFrame({0: [1, 3], 1: [2, 4]})
        assert_eq(df.compute().reset_index(drop=True), expected)

def test_pivot_table(aggfunc):
    df = pd.DataFrame({'A': np.random.choice(list('XYZ'), size=100),
                       'B': np.random.randn(100),
                       'C': pd.Categorical(np.random.choice(list('abc'), size=100))})
    ddf = dd.from_pandas(df, 5)

    res = dd.pivot_table(ddf, index='A', columns='C', values='B',
                         aggfunc=aggfunc)
    exp = pd.pivot_table(df, index='A', columns='C', values='B',
                         aggfunc=aggfunc)
    if aggfunc == 'count':
        # dask result cannot be int64 dtype depending on divisions because of NaN
        exp = exp.astype(np.float64)
    assert_eq(res, exp)

    # method
    res = ddf.pivot_table(index='A', columns='C', values='B',
                          aggfunc=aggfunc)
    exp = df.pivot_table(index='A', columns='C', values='B',
                         aggfunc=aggfunc)
    if aggfunc == 'count':
        # dask result cannot be int64 dtype depending on divisions because of NaN
        exp = exp.astype(np.float64)
    assert_eq(res, exp)

def test_frame_2ufunc_out():
    input_matrix = np.random.randint(1, 100, size=(20, 2))

    df = pd.DataFrame(input_matrix, columns=['A', 'B'])
    ddf = dd.from_pandas(df, 3)

    # column number mismatch
    df_out = pd.DataFrame(np.random.randint(1, 100, size=(20, 3)),
                          columns=['X', 'Y', 'Z'])
    ddf_out = dd.from_pandas(df_out, 3)

    with pytest.raises(ValueError):
        np.sin(ddf, out=ddf_out)

    # types mismatch
    ddf_out = dd.from_pandas(pd.Series([0]), 1)

    with pytest.raises(TypeError):
        np.sin(ddf, out=ddf_out)

    df_out = pd.DataFrame(np.random.randint(1, 100, size=(20, 2)),
                          columns=['X', 'Y'])
    ddf_out = dd.from_pandas(df_out, 3)

    np.sin(ddf, out=ddf_out)
    np.add(ddf_out, 10, out=ddf_out)

    expected = pd.DataFrame(np.sin(input_matrix) + 10, columns=['A', 'B'])

    assert_eq(ddf_out, expected)

def test_writing_parquet_with_kwargs(tmpdir, engine):
    fn = str(tmpdir)
    path1 = os.path.join(fn, 'normal')
    path2 = os.path.join(fn, 'partitioned')
    pytest.importorskip("snappy")

    df = pd.DataFrame({'a': np.random.choice(['A', 'B', 'C'], size=100),
                       'b': np.random.random(size=100),
                       'c': np.random.randint(1, 5, size=100)})
    ddf = dd.from_pandas(df, npartitions=3)

    engine_kwargs = {
        'pyarrow': {
            'compression': 'snappy',
            'coerce_timestamps': None,
            'use_dictionary': True
        },
        'fastparquet': {
            'compression': 'snappy',
            'times': 'int64',
            'fixed_text': None
        }
    }

    ddf.to_parquet(path1, engine=engine, **engine_kwargs[engine])
    out = dd.read_parquet(path1, engine=engine,
                          infer_divisions=should_check_divs(engine))
    assert_eq(out, ddf,
              check_index=(engine != 'fastparquet'),
              check_divisions=should_check_divs(engine))

    # Avoid race condition in pyarrow 0.8.0 on writing partitioned datasets
    with dask.config.set(scheduler='sync'):
        ddf.to_parquet(path2, engine=engine, partition_on=['a'],
                       **engine_kwargs[engine])
    out = dd.read_parquet(path2, engine=engine).compute()
    for val in df.a.unique():
        assert set(df.b[df.a == val]) == set(out.b[out.a == val])

def test_concat(join):
    pdf1 = pd.DataFrame({'x': [1, 2, 3, 4, 6, 7],
                         'y': list('abcdef')},
                        index=[1, 2, 3, 4, 6, 7])
    ddf1 = dd.from_pandas(pdf1, 2)
    pdf2 = pd.DataFrame({'x': [1, 2, 3, 4, 6, 7],
                         'y': list('abcdef')},
                        index=[8, 9, 10, 11, 12, 13])
    ddf2 = dd.from_pandas(pdf2, 2)

    # different columns
    pdf3 = pd.DataFrame({'x': [1, 2, 3, 4, 6, 7],
                         'z': list('abcdef')},
                        index=[8, 9, 10, 11, 12, 13])
    ddf3 = dd.from_pandas(pdf3, 2)

    for (dd1, dd2, pd1, pd2) in [(ddf1, ddf2, pdf1, pdf2),
                                 (ddf1, ddf3, pdf1, pdf3)]:
        result = dd.concat([dd1, dd2], join=join)
        expected = pd.concat([pd1, pd2], join=join)
        assert_eq(result, expected)

    # test outer only, inner has a problem on pandas side
    for (dd1, dd2, pd1, pd2) in [(ddf1, ddf2, pdf1, pdf2),
                                 (ddf1, ddf3, pdf1, pdf3),
                                 (ddf1.x, ddf2.x, pdf1.x, pdf2.x),
                                 (ddf1.x, ddf3.z, pdf1.x, pdf3.z),
                                 (ddf1.x, ddf2.x, pdf1.x, pdf2.x),
                                 (ddf1.x, ddf3.z, pdf1.x, pdf3.z)]:
        result = dd.concat([dd1, dd2])
        expected = pd.concat([pd1, pd2])
        assert_eq(result, expected)

def test_from_dask_array_index(as_frame):
    s = dd.from_pandas(pd.Series(range(10), index=list('abcdefghij')),
                       npartitions=3)
    if as_frame:
        s = s.to_frame()
    result = dd.from_dask_array(s.values, index=s.index)
    assert_eq(s, result)

def test_simple(db):
    # single chunk
    data = read_sql_table('test', db, npartitions=2,
                          index_col='number').compute()
    assert (data.name == df.name).all()
    assert data.index.name == 'number'
    assert_eq(data, df)

def test_get_dummies(data):
    exp = pd.get_dummies(data)

    ddata = dd.from_pandas(data, 2)
    res = dd.get_dummies(ddata)
    assert_eq(res, exp)
    tm.assert_index_equal(res.columns, exp.columns)

def test_loc_on_pandas_datetimes():
    df = pd.DataFrame({'x': [1, 2, 3]},
                      index=list(map(pd.Timestamp, ['2014', '2015', '2016'])))
    a = dd.from_pandas(df, 2)
    a.divisions = list(map(pd.Timestamp, a.divisions))

    # compare against the pandas frame; comparing a to itself would be vacuous
    assert_eq(a.loc['2014': '2015'], df.loc['2014': '2015'])

def test_read_chunked(block):
    with tmpdir() as path:
        fn = os.path.join(path, '1.json')
        df.to_json(fn, orient='records', lines=True)
        d = dd.read_json(fn, blocksize=block, sample=10)
        assert (d.npartitions > 1) or (block > 50)
        assert_eq(d, df, check_index=False)

def test_time_rolling(before, after):
    window = before
    before = pd.Timedelta(before)
    after = pd.Timedelta(after)
    result = dts.map_overlap(lambda x: x.rolling(window).count(), before, after)
    expected = dts.compute().rolling(window).count()
    assert_eq(result, expected)

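# For context (a sketch of the mechanism, not part of the test): map_overlap
# copies a margin of `before`/`after` rows -- or, as here, Timedelta-sized
# spans -- from the neighbouring partitions into each block before applying
# the function, so the time-based rolling count sees the data it needs
# across partition boundaries:
#
#   dts.map_overlap(lambda x: x.rolling(window).count(), before, after)
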
def test_dataframe_groupby_nunique_across_group_same_value():
    strings = list('aaabbccccdddeee')
    data = list(map(int, '123111223323412'))
    ps = pd.DataFrame(dict(strings=strings, data=data))
    s = dd.from_pandas(ps, npartitions=3)
    expected = ps.groupby('strings')['data'].nunique()
    assert_eq(s.groupby('strings')['data'].nunique(), expected)

def test_groupby_column_and_index_apply(group_args, apply_func):
    df = pd.DataFrame({'idx': [1, 1, 1, 2, 2, 2],
                       'a': [1, 2, 1, 2, 1, 2],
                       'b': np.arange(6)}).set_index('idx')

    ddf = dd.from_pandas(df, npartitions=df.index.nunique())
    ddf_no_divs = dd.from_pandas(df, npartitions=df.index.nunique(), sort=False)

    # Expected result
    expected = df.groupby(group_args).apply(apply_func)

    # Compute on dask DataFrame with divisions (no shuffling)
    result = ddf.groupby(group_args).apply(apply_func)
    assert_eq(expected, result, check_divisions=False)

    # Check that partitioning is preserved
    assert ddf.divisions == result.divisions

    # Check that no shuffling occurred.
    # The groupby operation should add only 1 task per partition
    assert len(result.dask) == (len(ddf.dask) + ddf.npartitions)

    # Compute on dask DataFrame without divisions (requires shuffling)
    result = ddf_no_divs.groupby(group_args).apply(apply_func)
    assert_eq(expected, result, check_divisions=False)

    # Check that divisions were preserved (all None in this case)
    assert ddf_no_divs.divisions == result.divisions

    # Crude check to see if shuffling was performed.
    # The groupby operation should add more than 1 task per partition
    assert len(result.dask) > (len(ddf_no_divs.dask) + ddf_no_divs.npartitions)

def test_dataframe_groupby_nunique():
    strings = list('aaabbccccdddeee')
    data = np.random.randn(len(strings))
    ps = pd.DataFrame(dict(strings=strings, data=data))
    s = dd.from_pandas(ps, npartitions=3)
    expected = ps.groupby('strings')['data'].nunique()
    assert_eq(s.groupby('strings')['data'].nunique(), expected)

def test_concat4_interleave_partitions():
    pdf1 = pd.DataFrame(np.random.randn(10, 5),
                        columns=list('ABCDE'), index=list('abcdefghij'))
    pdf2 = pd.DataFrame(np.random.randn(13, 5),
                        columns=list('ABCDE'), index=list('fghijklmnopqr'))
    pdf3 = pd.DataFrame(np.random.randn(13, 6),
                        columns=list('CDEXYZ'), index=list('fghijklmnopqr'))

    ddf1 = dd.from_pandas(pdf1, 2)
    ddf2 = dd.from_pandas(pdf2, 3)
    ddf3 = dd.from_pandas(pdf3, 2)

    msg = ('All inputs have known divisions which cannot be '
           'concatenated in order. Specify '
           'interleave_partitions=True to ignore order')

    cases = [[ddf1, ddf1], [ddf1, ddf2], [ddf1, ddf3],
             [ddf2, ddf1], [ddf2, ddf3], [ddf3, ddf1], [ddf3, ddf2]]
    for case in cases:
        pdcase = [c.compute() for c in case]

        with pytest.raises(ValueError) as err:
            dd.concat(case)
        assert msg in str(err.value)

        assert_eq(dd.concat(case, interleave_partitions=True),
                  pd.concat(pdcase))

        assert_eq(dd.concat(case, join='inner', interleave_partitions=True),
                  pd.concat(pdcase, join='inner'))

    msg = "'join' must be 'inner' or 'outer'"
    with pytest.raises(ValueError) as err:
        dd.concat([ddf1, ddf1], join='invalid', interleave_partitions=True)
    assert msg in str(err.value)

def test_late_dtypes():
    text = 'numbers,names,more_numbers,integers\n'
    for i in range(1000):
        text += '1,foo,2,3\n'
    text += '1.5,bar,2.5,3\n'
    with filetext(text) as fn:
        sol = pd.read_csv(fn)
        with pytest.raises(ValueError) as e:
            dd.read_csv(fn, sample=50).compute(get=get_sync)

        msg = ("Mismatched dtypes found.\n"
               "Expected integers, but found floats for columns:\n"
               "- 'more_numbers'\n"
               "- 'numbers'\n"
               "\n"
               "To fix, specify dtypes manually by adding:\n"
               "\n"
               "dtype={'more_numbers': float,\n"
               "       'numbers': float}\n"
               "\n"
               "to the call to `read_csv`/`read_table`.\n"
               "\n"
               "Alternatively, provide `assume_missing=True` to interpret "
               "all unspecified integer columns as floats.")
        assert str(e.value) == msg

        # Specifying dtypes works
        res = dd.read_csv(fn, sample=50,
                          dtype={'more_numbers': float, 'numbers': float})
        assert_eq(res, sol)

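# As the expected error message itself suggests (a usage sketch, not
# asserted above), the mismatch can also be avoided by letting dask treat
# all unspecified integer columns as floats during sample-based inference:
#
#   dd.read_csv(fn, sample=50, assume_missing=True)
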
def test_read_csv_header_issue_823():
    text = '''a b c-d\n1 2 3\n4 5 6'''.replace(' ', '\t')
    with filetext(text) as fn:
        df = dd.read_csv(fn, sep='\t')
        assert_eq(df, pd.read_csv(fn, sep='\t'))

        df = dd.read_csv(fn, delimiter='\t')
        assert_eq(df, pd.read_csv(fn, delimiter='\t'))

def test_skiprows_as_list(dd_read, pd_read, files, units):
    files = {name: (comment_header + b'\n' +
                    content.replace(b'\n', b'\n' + units, 1))
             for name, content in files.items()}
    skip = [0, 1, 2, 3, 5]
    with filetexts(files, mode='b'):
        df = dd_read('2014-01-*.csv', skiprows=skip)
        expected_df = pd.concat([pd_read(n, skiprows=skip)
                                 for n in sorted(files)])
        assert_eq(df, expected_df, check_dtype=False)

def test_columns_index_with_multi_index(tmpdir, engine):
    fn = os.path.join(str(tmpdir), 'test.parquet')
    index = pd.MultiIndex.from_arrays(
        [np.arange(10), np.arange(10) + 1], names=['x0', 'x1'])
    df = pd.DataFrame(np.random.randn(10, 2), columns=['a', 'b'], index=index)
    df2 = df.reset_index(drop=False)

    if engine == 'fastparquet':
        fastparquet.write(fn, df, write_index=True)

        # fastparquet doesn't support multi-index
        with pytest.raises(ValueError):
            ddf = dd.read_parquet(fn, engine=engine)
    else:
        import pyarrow as pa
        pq.write_table(pa.Table.from_pandas(df), fn)

        # Pyarrow supports multi-index reads
        ddf = dd.read_parquet(fn, engine=engine)
        assert_eq(ddf, df)

    d = dd.read_parquet(fn, columns='a', engine=engine)
    assert_eq(d, df['a'])

    d = dd.read_parquet(fn, index=['a', 'b'], columns=['x0', 'x1'],
                        engine=engine)
    assert_eq(d, df2.set_index(['a', 'b'])[['x0', 'x1']])

    # Just index
    d = dd.read_parquet(fn, index=False, engine=engine)
    assert_eq(d, df2)

    d = dd.read_parquet(fn, index=['a'], engine=engine)
    assert_eq(d, df2.set_index('a')[['b']])

    d = dd.read_parquet(fn, index=['x0'], engine=engine)
    assert_eq(d, df2.set_index('x0')[['a', 'b']])

    # Just columns
    d = dd.read_parquet(fn, columns=['x0', 'a'], engine=engine)
    assert_eq(d, df2.set_index('x1')[['x0', 'a']])

    # Both index and columns
    d = dd.read_parquet(fn, index=False, columns=['x0', 'b'], engine=engine)
    assert_eq(d, df2[['x0', 'b']])

    for index in ['x1', 'b']:
        d = dd.read_parquet(fn, index=index, columns=['x0', 'a'], engine=engine)
        assert_eq(d, df2.set_index(index)[['x0', 'a']])

    # Columns and index intersect
    for index in ['a', 'x0']:
        with pytest.raises(ValueError):
            d = dd.read_parquet(fn, index=index, columns=['x0', 'a'],
                                engine=engine)

    # Series output
    for ind, col, sol_df in [(None, 'x0', df2.set_index('x1')),
                             (False, 'b', df2),
                             (False, 'x0', df2),
                             ('a', 'x0', df2.set_index('a')),
                             ('a', 'b', df2.set_index('a'))]:
        d = dd.read_parquet(fn, index=ind, columns=col, engine=engine)
        assert_eq(d, sol_df[col])

def rolling_functions_tests(p, d):
    # Old-fashioned rolling API
    with pytest.warns(FutureWarning):
        assert_eq(pd.rolling_count(p, 3), dd.rolling_count(d, 3))
        assert_eq(pd.rolling_sum(p, 3), dd.rolling_sum(d, 3))
        assert_eq(pd.rolling_mean(p, 3), dd.rolling_mean(d, 3))
        assert_eq(pd.rolling_median(p, 3), dd.rolling_median(d, 3))
        assert_eq(pd.rolling_min(p, 3), dd.rolling_min(d, 3))
        assert_eq(pd.rolling_max(p, 3), dd.rolling_max(d, 3))
        assert_eq(pd.rolling_std(p, 3), dd.rolling_std(d, 3))
        assert_eq(pd.rolling_var(p, 3), dd.rolling_var(d, 3))
        # see note around test_rolling_dataframe for logic concerning precision
        assert_eq(pd.rolling_skew(p, 3), dd.rolling_skew(d, 3),
                  check_less_precise=True)
        assert_eq(pd.rolling_kurt(p, 3), dd.rolling_kurt(d, 3),
                  check_less_precise=True)
        assert_eq(pd.rolling_quantile(p, 3, 0.5), dd.rolling_quantile(d, 3, 0.5))
        assert_eq(pd.rolling_apply(p, 3, mad), dd.rolling_apply(d, 3, mad))
        assert_eq(pd.rolling_window(p, 3, win_type='boxcar'),
                  dd.rolling_window(d, 3, win_type='boxcar'))

        # Test with edge-case window sizes
        assert_eq(pd.rolling_sum(p, 0), dd.rolling_sum(d, 0))
        assert_eq(pd.rolling_sum(p, 1), dd.rolling_sum(d, 1))

        # Test with kwargs
        assert_eq(pd.rolling_sum(p, 3, min_periods=3),
                  dd.rolling_sum(d, 3, min_periods=3))

def test_window_sum_dataframe(stream):
    df = cudf.DataFrame({'x': [1, 2, 3], 'y': [4, 5, 6]})
    sdf = DataFrame(example=df, stream=stream)
    L = sdf.window(n=4).sum().stream.gather().sink_to_list()

    sdf.emit(df)
    assert_eq(L[0], cudf.Series([6, 15], index=['x', 'y']))
    sdf.emit(df)
    assert_eq(L[0], cudf.Series([6, 15], index=['x', 'y']))
    assert_eq(L[1], cudf.Series([9, 21], index=['x', 'y']))
    sdf.emit(df)
    assert_eq(L[0], cudf.Series([6, 15], index=['x', 'y']))
    assert_eq(L[1], cudf.Series([9, 21], index=['x', 'y']))
    assert_eq(L[2], cudf.Series([9, 21], index=['x', 'y']))

def test_set_index_overlap():
    A = pd.DataFrame({"key": [1, 2, 3, 4, 4, 5, 6, 7], "value": list("abcd" * 2)})
    a = dd.from_pandas(A, npartitions=2)
    a = a.set_index("key", sorted=True)
    b = a.repartition(divisions=a.divisions)
    assert_eq(a, b)

def test_set_index_tasks(npartitions):
    df = pd.DataFrame({'x': np.random.random(100),
                       'y': np.random.random(100) // 0.2},
                      index=np.random.random(100))
    ddf = dd.from_pandas(df, npartitions=npartitions)

    assert_eq(df.set_index('x'),
              ddf.set_index('x', shuffle='tasks'))

    assert_eq(df.set_index('y'),
              ddf.set_index('y', shuffle='tasks'))

    assert_eq(df.set_index(df.x),
              ddf.set_index(ddf.x, shuffle='tasks'))

    assert_eq(df.set_index(df.x + df.y),
              ddf.set_index(ddf.x + ddf.y, shuffle='tasks'))

    assert_eq(df.set_index(df.x + 1),
              ddf.set_index(ddf.x + 1, shuffle='tasks'))

    assert_eq(df.set_index(df.index),
              ddf.set_index(ddf.index, shuffle='tasks'))

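# Context on shuffle='tasks' (a note, not an extra assertion): the 'tasks'
# shuffle re-partitions the data entirely within the task graph via staged
# hash partitioning, whereas the 'disk' shuffle stages intermediate data
# through temporary on-disk storage; either way the result must match
# pandas' set_index exactly, which is what the assertions above verify.
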
def test_loc2d():
    # index indexer is always regarded as slice for duplicated values
    assert_eq(d.loc[5, "a"], full.loc[5:5, "a"])
    # assert_eq(d.loc[[5], 'a'], full.loc[[5], 'a'])
    assert_eq(d.loc[5, ["a"]], full.loc[5:5, ["a"]])
    # assert_eq(d.loc[[5], ['a']], full.loc[[5], ['a']])

    assert_eq(d.loc[3:8, "a"], full.loc[3:8, "a"])
    assert_eq(d.loc[:8, "a"], full.loc[:8, "a"])
    assert_eq(d.loc[3:, "a"], full.loc[3:, "a"])
    assert_eq(d.loc[[8], "a"], full.loc[[8], "a"])

    assert_eq(d.loc[3:8, ["a"]], full.loc[3:8, ["a"]])
    assert_eq(d.loc[:8, ["a"]], full.loc[:8, ["a"]])
    assert_eq(d.loc[3:, ["a"]], full.loc[3:, ["a"]])

    # 3d
    with pytest.raises(pd.core.indexing.IndexingError):
        d.loc[3, 3, 3]

    # Series should raise
    with pytest.raises(pd.core.indexing.IndexingError):
        d.a.loc[3, 3]

    with pytest.raises(pd.core.indexing.IndexingError):
        d.a.loc[3:, 3]

    with pytest.raises(pd.core.indexing.IndexingError):
        d.a.loc[d.a % 2 == 0, 3]

def test_to_sql(npartitions, parallel):
    df_by_age = df.set_index("age")
    df_appended = pd.concat([df, df])

    ddf = dd.from_pandas(df, npartitions)
    ddf_by_age = ddf.set_index("age")

    # Simple round trip test: use existing "number" index_col
    with tmp_db_uri() as uri:
        ddf.to_sql("test", uri, parallel=parallel)
        result = read_sql_table("test", uri, "number")
        assert_eq(df, result)

    # Test writing no index, and reading back in with one of the other columns
    # as index (`read_sql_table` requires an index_col)
    with tmp_db_uri() as uri:
        ddf.to_sql("test", uri, parallel=parallel, index=False)

        result = read_sql_table("test", uri, "negish")
        assert_eq(df.set_index("negish"), result)

        result = read_sql_table("test", uri, "age")
        assert_eq(df_by_age, result)

    # Index by "age" instead
    with tmp_db_uri() as uri:
        ddf_by_age.to_sql("test", uri, parallel=parallel)
        result = read_sql_table("test", uri, "age")
        assert_eq(df_by_age, result)

    # Index column can't have "object" dtype if no partitions are provided
    with tmp_db_uri() as uri:
        ddf.set_index("name").to_sql("test", uri)
        with pytest.raises(
            TypeError,
            match='Provided index column is of type "object". If divisions is not provided the index column type must be numeric or datetime.',  # noqa: E501
        ):
            read_sql_table("test", uri, "name")

    # Test various "if_exists" values
    with tmp_db_uri() as uri:
        ddf.to_sql("test", uri)

        # Writing a table that already exists fails
        with pytest.raises(ValueError, match="Table 'test' already exists"):
            ddf.to_sql("test", uri)

        ddf.to_sql("test", uri, parallel=parallel, if_exists="append")
        result = read_sql_table("test", uri, "number")
        assert_eq(df_appended, result)

        ddf_by_age.to_sql("test", uri, parallel=parallel, if_exists="replace")
        result = read_sql_table("test", uri, "age")
        assert_eq(df_by_age, result)

    # Verify number of partitions returned, when compute=False
    with tmp_db_uri() as uri:
        result = ddf.to_sql("test", uri, parallel=parallel, compute=False)

        # the first result is from the "meta" insert
        actual = len(result.compute())

        assert actual == npartitions

def test_set_index_drop(drop):
    pdf = pd.DataFrame({'A': list('ABAABBABAA'),
                        'B': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
                        'C': [1, 2, 3, 2, 1, 3, 2, 4, 2, 3]})
    ddf = dd.from_pandas(pdf, 3)

    assert_eq(ddf.set_index('A', drop=drop),
              pdf.set_index('A', drop=drop))
    assert_eq(ddf.set_index('B', drop=drop),
              pdf.set_index('B', drop=drop))
    assert_eq(ddf.set_index('C', drop=drop),
              pdf.set_index('C', drop=drop))
    assert_eq(ddf.set_index(ddf.A, drop=drop),
              pdf.set_index(pdf.A, drop=drop))
    assert_eq(ddf.set_index(ddf.B, drop=drop),
              pdf.set_index(pdf.B, drop=drop))
    assert_eq(ddf.set_index(ddf.C, drop=drop),
              pdf.set_index(pdf.C, drop=drop))

    # numeric columns
    pdf = pd.DataFrame({0: list('ABAABBABAA'),
                        1: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
                        2: [1, 2, 3, 2, 1, 3, 2, 4, 2, 3]})
    ddf = dd.from_pandas(pdf, 3)
    assert_eq(ddf.set_index(0, drop=drop),
              pdf.set_index(0, drop=drop))
    assert_eq(ddf.set_index(2, drop=drop),
              pdf.set_index(2, drop=drop))

def test_to_hdf_multiple_files():
    pytest.importorskip("tables")
    df = pd.DataFrame(
        {"x": ["a", "b", "c", "d"], "y": [1, 2, 3, 4]}, index=[1.0, 2.0, 3.0, 4.0]
    )
    a = dd.from_pandas(df, 2)
    df16 = pd.DataFrame(
        {
            "x": ["a", "b", "c", "d", "e", "f", "g", "h",
                  "i", "j", "k", "l", "m", "n", "o", "p"],
            "y": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16],
        },
        index=[1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0,
               9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0],
    )
    b = dd.from_pandas(df16, 16)

    # saving to multiple files
    with tmpdir() as dn:
        fn = os.path.join(dn, "data_*.h5")
        a.to_hdf(fn, "/data")
        out = dd.read_hdf(fn, "/data")
        assert_eq(df, out)

    # saving to multiple files making sure order is kept
    with tmpdir() as dn:
        fn = os.path.join(dn, "data_*.h5")
        b.to_hdf(fn, "/data")
        out = dd.read_hdf(fn, "/data")
        assert_eq(df16, out)

    # saving to multiple files with custom name_function
    with tmpdir() as dn:
        fn = os.path.join(dn, "data_*.h5")
        a.to_hdf(fn, "/data", name_function=lambda i: "a" * (i + 1))
        out = dd.read_hdf(fn, "/data")
        assert_eq(df, out)

        out = pd.read_hdf(os.path.join(dn, "data_a.h5"), "/data")
        tm.assert_frame_equal(out, df.iloc[:2])
        out = pd.read_hdf(os.path.join(dn, "data_aa.h5"), "/data")
        tm.assert_frame_equal(out, df.iloc[2:])

    # test hdf object
    with tmpfile("h5") as fn:
        with pd.HDFStore(fn) as hdf:
            a.to_hdf(hdf, "/data*")
        out = dd.read_hdf(fn, "/data*")
        assert_eq(df, out)

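# Note on name_function (the behaviour relied on above): to_hdf substitutes
# name_function(i) for the '*' in the path for each partition i (the default
# is str(i)), so partition 0 of `a` lands in 'data_a.h5' and partition 1 in
# 'data_aa.h5'.
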
def test_set_index_divisions_compute():
    d2 = d.set_index("b", divisions=[0, 2, 9], compute=False)
    d3 = d.set_index("b", divisions=[0, 2, 9], compute=True)

    assert_eq(d2, d3)
    assert_eq(d2, full.set_index("b"))
    assert_eq(d3, full.set_index("b"))
    assert len(d2.dask) > len(d3.dask)

    d4 = d.set_index(d.b, divisions=[0, 2, 9], compute=False)
    d5 = d.set_index(d.b, divisions=[0, 2, 9], compute=True)
    exp = full.copy()
    exp.index = exp.b
    assert_eq(d4, d5)
    assert_eq(d4, exp)
    assert_eq(d5, exp)
    assert len(d4.dask) > len(d5.dask)

def test_getitem_period_str():
    df = pd.DataFrame(
        {"A": np.random.randn(100), "B": np.random.randn(100)},
        index=pd.period_range("2011-01-01", freq="H", periods=100),
    )
    ddf = dd.from_pandas(df, 10)

    # partial string slice
    # TODO(pandas) starting with pandas 1.2, __getitem__ with an implicit slice
    # is deprecated -> should we deprecate this in dask as well?
    if not PANDAS_GT_120:
        assert_eq(df["2011-01-02"], ddf["2011-01-02"])
    assert_eq(df["2011-01-02":"2011-01-10"], ddf["2011-01-02":"2011-01-10"])

    # same reso, dask result is always DataFrame
    df = pd.DataFrame(
        {"A": np.random.randn(100), "B": np.random.randn(100)},
        index=pd.period_range("2011-01-01", freq="D", periods=100),
    )
    ddf = dd.from_pandas(df, 50)
    if not PANDAS_GT_120:
        assert_eq(df["2011-01"], ddf["2011-01"])
        assert_eq(df["2011"], ddf["2011"])

    assert_eq(df["2011-01":"2012-05"], ddf["2011-01":"2012-05"])
    assert_eq(df["2011":"2015"], ddf["2011":"2015"])

def test_loc2d_some_missing():
    with pytest.warns(FutureWarning):
        assert_eq(d.loc[[3, 4, 3], ["a"]], full.loc[[3, 4, 3], ["a"]])

def test_loc_period_str():
    # .loc with PeriodIndex doesn't support partial string indexing
    # https://github.com/pydata/pandas/issues/13429
    # -> this started working in pandas 1.1
    df = pd.DataFrame(
        {"A": np.random.randn(100), "B": np.random.randn(100)},
        index=pd.period_range("2011-01-01", freq="H", periods=100),
    )
    ddf = dd.from_pandas(df, 10)

    # partial string slice
    assert_eq(df.loc["2011-01-02"], ddf.loc["2011-01-02"])
    assert_eq(df.loc["2011-01-02":"2011-01-10"], ddf.loc["2011-01-02":"2011-01-10"])

    # same reso, dask result is always DataFrame
    df = pd.DataFrame(
        {"A": np.random.randn(100), "B": np.random.randn(100)},
        index=pd.period_range("2011-01-01", freq="D", periods=100),
    )
    ddf = dd.from_pandas(df, 50)
    assert_eq(df.loc["2011-01"], ddf.loc["2011-01"])
    assert_eq(df.loc["2011"], ddf.loc["2011"])

    assert_eq(df.loc["2011-01":"2012-05"], ddf.loc["2011-01":"2012-05"])
    assert_eq(df.loc["2011":"2015"], ddf.loc["2011":"2015"])

def test_loc_with_series():
    assert_eq(d.loc[d.a % 2 == 0], full.loc[full.a % 2 == 0])

    # identical expressions must tokenize to the same graph keys
    assert sorted(d.loc[d.a % 2].dask) == sorted(d.loc[d.a % 2].dask)
    assert sorted(d.loc[d.a % 2].dask) != sorted(d.loc[d.a % 3].dask)

def test_getitem_timestamp_str():
    df = pd.DataFrame(
        {"A": np.random.randn(100), "B": np.random.randn(100)},
        index=pd.date_range("2011-01-01", freq="H", periods=100),
    )
    ddf = dd.from_pandas(df, 10)

    # partial string slice
    # TODO(pandas) starting with pandas 1.2, __getitem__ with an implicit slice
    # is deprecated -> should we deprecate this in dask as well?
    assert_eq(df.loc["2011-01-02"], ddf["2011-01-02"])
    assert_eq(df["2011-01-02":"2011-01-10"], ddf["2011-01-02":"2011-01-10"])

    df = pd.DataFrame(
        {"A": np.random.randn(100), "B": np.random.randn(100)},
        index=pd.date_range("2011-01-01", freq="D", periods=100),
    )
    ddf = dd.from_pandas(df, 50)
    assert_eq(df.loc["2011-01"], ddf["2011-01"])
    assert_eq(df.loc["2011"], ddf["2011"])

    assert_eq(df["2011-01":"2012-05"], ddf["2011-01":"2012-05"])
    assert_eq(df["2011":"2015"], ddf["2011":"2015"])

def test_set_index_nan_partition():
    d[d.a > 3].set_index("a")  # Set index with 1 null partition
    d[d.a > 1].set_index("a", sorted=True)  # Set sorted index with 0 null partitions
    a = d[d.a > 3].set_index("a", sorted=True)  # Set sorted index with 1 null partition
    # assert_eq computes both sides, so this verifies the null-partition
    # result can actually be computed
    assert_eq(a, a)

def test_loc_timestamp_str():
    df = pd.DataFrame(
        {"A": np.random.randn(100), "B": np.random.randn(100)},
        index=pd.date_range("2011-01-01", freq="H", periods=100),
    )
    ddf = dd.from_pandas(df, 10)

    # partial string slice
    assert_eq(df.loc["2011-01-02"], ddf.loc["2011-01-02"])
    assert_eq(df.loc["2011-01-02":"2011-01-10"], ddf.loc["2011-01-02":"2011-01-10"])

    # same reso, dask result is always DataFrame
    assert_eq(
        df.loc["2011-01-02 10:00"].to_frame().T,
        ddf.loc["2011-01-02 10:00"],
        **CHECK_FREQ,
    )

    # series
    assert_eq(df.A.loc["2011-01-02"], ddf.A.loc["2011-01-02"], **CHECK_FREQ)
    assert_eq(
        df.A.loc["2011-01-02":"2011-01-10"],
        ddf.A.loc["2011-01-02":"2011-01-10"],
        **CHECK_FREQ,
    )

    # slice with timestamp (dask result must be DataFrame)
    assert_eq(
        df.loc[pd.Timestamp("2011-01-02")].to_frame().T,
        ddf.loc[pd.Timestamp("2011-01-02")],
        **CHECK_FREQ,
    )
    assert_eq(
        df.loc[pd.Timestamp("2011-01-02"):pd.Timestamp("2011-01-10")],
        ddf.loc[pd.Timestamp("2011-01-02"):pd.Timestamp("2011-01-10")],
        **CHECK_FREQ,
    )
    assert_eq(
        df.loc[pd.Timestamp("2011-01-02 10:00")].to_frame().T,
        ddf.loc[pd.Timestamp("2011-01-02 10:00")],
        **CHECK_FREQ,
    )

    df = pd.DataFrame(
        {"A": np.random.randn(100), "B": np.random.randn(100)},
        index=pd.date_range("2011-01-01", freq="M", periods=100),
    )
    ddf = dd.from_pandas(df, 50)
    assert_eq(df.loc["2011-01"], ddf.loc["2011-01"])
    assert_eq(df.loc["2011"], ddf.loc["2011"])

    assert_eq(df.loc["2011-01":"2012-05"], ddf.loc["2011-01":"2012-05"])
    assert_eq(df.loc["2011":"2015"], ddf.loc["2011":"2015"])

    # series
    assert_eq(df.B.loc["2011-01"], ddf.B.loc["2011-01"])
    assert_eq(df.B.loc["2011"], ddf.B.loc["2011"])

    assert_eq(df.B.loc["2011-01":"2012-05"], ddf.B.loc["2011-01":"2012-05"])
    assert_eq(df.B.loc["2011":"2015"], ddf.B.loc["2011":"2015"])

def test_rolling_axis():
    df = pd.DataFrame(np.random.randn(20, 16))
    ddf = dd.from_pandas(df, npartitions=3)

    assert_eq(df.rolling(3, axis=0).mean(), ddf.rolling(3, axis=0).mean())
    assert_eq(df.rolling(3, axis=1).mean(), ddf.rolling(3, axis=1).mean())
    assert_eq(df.rolling(3, min_periods=1, axis=1).mean(),
              ddf.rolling(3, min_periods=1, axis=1).mean())
    assert_eq(df.rolling(3, axis='columns').mean(),
              ddf.rolling(3, axis='columns').mean())
    assert_eq(df.rolling(3, axis='rows').mean(),
              ddf.rolling(3, axis='rows').mean())

    s = df[3]
    ds = ddf[3]
    assert_eq(s.rolling(5, axis=0).std(), ds.rolling(5, axis=0).std())

def test_loc():
    assert d.loc[3:8].divisions[0] == 3
    assert d.loc[3:8].divisions[-1] == 8

    assert d.loc[5].divisions == (5, 5)

    assert_eq(d.loc[5], full.loc[5:5])
    assert_eq(d.loc[3:8], full.loc[3:8])
    assert_eq(d.loc[:8], full.loc[:8])
    assert_eq(d.loc[3:], full.loc[3:])
    assert_eq(d.loc[[5]], full.loc[[5]])

    expected_warning = FutureWarning

    if not PANDAS_GT_100:
        # removed in pandas 1.0
        with pytest.warns(expected_warning):
            assert_eq(d.loc[[3, 4, 1, 8]], full.loc[[3, 4, 1, 8]])
        with pytest.warns(expected_warning):
            assert_eq(d.loc[[3, 4, 1, 9]], full.loc[[3, 4, 1, 9]])
        with pytest.warns(expected_warning):
            assert_eq(d.loc[np.array([3, 4, 1, 9])],
                      full.loc[np.array([3, 4, 1, 9])])

    assert_eq(d.a.loc[5], full.a.loc[5:5])
    assert_eq(d.a.loc[3:8], full.a.loc[3:8])
    assert_eq(d.a.loc[:8], full.a.loc[:8])
    assert_eq(d.a.loc[3:], full.a.loc[3:])
    assert_eq(d.a.loc[[5]], full.a.loc[[5]])
    if not PANDAS_GT_100:
        # removed in pandas 1.0
        with pytest.warns(expected_warning):
            assert_eq(d.a.loc[[3, 4, 1, 8]], full.a.loc[[3, 4, 1, 8]])
        with pytest.warns(expected_warning):
            assert_eq(d.a.loc[[3, 4, 1, 9]], full.a.loc[[3, 4, 1, 9]])
        with pytest.warns(expected_warning):
            assert_eq(d.a.loc[np.array([3, 4, 1, 9])],
                      full.a.loc[np.array([3, 4, 1, 9])])
    assert_eq(d.a.loc[[]], full.a.loc[[]])
    assert_eq(d.a.loc[np.array([])], full.a.loc[np.array([])])

    pytest.raises(KeyError, lambda: d.loc[1000])
    assert_eq(d.loc[1000:], full.loc[1000:])
    assert_eq(d.loc[-2000:-1000], full.loc[-2000:-1000])

    assert sorted(d.loc[5].dask) == sorted(d.loc[5].dask)
    assert sorted(d.loc[5].dask) != sorted(d.loc[6].dask)

def test_roundtrip_from_dask(tmpdir):
    tmpdir = str(tmpdir)
    ddf.to_parquet(tmpdir, engine="pyarrow")
    files = sorted(
        [
            os.path.join(tmpdir, f)
            for f in os.listdir(tmpdir)
            if not f.endswith("_metadata")
        ],
        key=natural_sort_key,
    )

    # Read list of parquet files
    ddf2 = dask_cudf.read_parquet(files, gather_statistics=True)
    assert_eq(ddf, ddf2)

    # Specify columns=['x']
    ddf2 = dask_cudf.read_parquet(files, columns=["x"], gather_statistics=True)
    assert_eq(ddf[["x"]], ddf2)

    # Specify columns='y'
    ddf2 = dask_cudf.read_parquet(files, columns="y", gather_statistics=True)
    assert_eq(ddf[["y"]], ddf2)

    # Now include metadata; gather_statistics is True by default
    # Read list of parquet files
    ddf2 = dask_cudf.read_parquet(tmpdir)
    assert_eq(ddf, ddf2)

    # Specify columns=['x']
    ddf2 = dask_cudf.read_parquet(tmpdir, columns=["x"])
    assert_eq(ddf[["x"]], ddf2)

    # Specify columns='y'
    ddf2 = dask_cudf.read_parquet(tmpdir, columns="y")
    assert_eq(ddf[["y"]], ddf2)

def test_getitem():
    df = pd.DataFrame(
        {
            "A": [1, 2, 3, 4, 5, 6, 7, 8, 9],
            "B": [9, 8, 7, 6, 5, 4, 3, 2, 1],
            "C": [True, False, True] * 3,
        },
        columns=list("ABC"),
    )
    ddf = dd.from_pandas(df, 2)
    assert_eq(ddf["A"], df["A"])
    # check cache consistency
    tm.assert_series_equal(ddf["A"]._meta, ddf._meta["A"])

    assert_eq(ddf[["A", "B"]], df[["A", "B"]])
    tm.assert_frame_equal(ddf[["A", "B"]]._meta, ddf._meta[["A", "B"]])

    assert_eq(ddf[ddf.C], df[df.C])
    tm.assert_series_equal(ddf.C._meta, ddf._meta.C)

    assert_eq(ddf[ddf.C.repartition([0, 2, 5, 8])], df[df.C])

    pytest.raises(KeyError, lambda: df["X"])
    pytest.raises(KeyError, lambda: df[["A", "X"]])
    pytest.raises(AttributeError, lambda: df.X)

    # not str/unicode
    df = pd.DataFrame(np.random.randn(10, 5))
    ddf = dd.from_pandas(df, 2)
    assert_eq(ddf[0], df[0])
    assert_eq(ddf[[1, 2]], df[[1, 2]])

    pytest.raises(KeyError, lambda: df[8])
    pytest.raises(KeyError, lambda: df[[1, 8]])

def test_from_pandas_single_row():
    df = pd.DataFrame({"x": [1]}, index=[1])
    ddf = dd.from_pandas(df, npartitions=1)
    assert ddf.divisions == (1, 1)
    assert_eq(ddf, df)

def test_loc2d_duplicated_columns():
    df = pd.DataFrame(
        np.random.randn(20, 5),
        index=list("abcdefghijklmnopqrst"),
        columns=list("AABCD"),
    )
    ddf = dd.from_pandas(df, 3)

    assert_eq(ddf.loc["a", "A"], df.loc[["a"], "A"])
    assert_eq(ddf.loc["a", ["A"]], df.loc[["a"], ["A"]])
    assert_eq(ddf.loc["j", "B"], df.loc[["j"], "B"])
    assert_eq(ddf.loc["j", ["B"]], df.loc[["j"], ["B"]])

    assert_eq(ddf.loc["a":"o", "A"], df.loc["a":"o", "A"])
    assert_eq(ddf.loc["a":"o", ["A"]], df.loc["a":"o", ["A"]])
    assert_eq(ddf.loc["j":"q", "B"], df.loc["j":"q", "B"])
    assert_eq(ddf.loc["j":"q", ["B"]], df.loc["j":"q", ["B"]])

    assert_eq(ddf.loc["a":"o", "B":"D"], df.loc["a":"o", "B":"D"])
    assert_eq(ddf.loc["a":"o", "B":"D"], df.loc["a":"o", "B":"D"])
    assert_eq(ddf.loc["j":"q", "B":"A"], df.loc["j":"q", "B":"A"])
    assert_eq(ddf.loc["j":"q", "B":"A"], df.loc["j":"q", "B":"A"])

    assert_eq(ddf.loc[ddf.B > 0, "B"], df.loc[df.B > 0, "B"])
    assert_eq(ddf.loc[ddf.B > 0, ["A", "C"]], df.loc[df.B > 0, ["A", "C"]])

def test_roundtrip_from_pandas(tmpdir, write_engine, read_engine):
    fn = str(tmpdir.join('test.parquet'))
    df = pd.DataFrame({'x': [1, 2, 3]})
    df.to_parquet(fn, engine=write_engine)
    ddf = dd.read_parquet(fn, engine=read_engine)
    assert_eq(df, ddf)

def test_loc2d_with_known_divisions():
    df = pd.DataFrame(
        np.random.randn(20, 5),
        index=list("abcdefghijklmnopqrst"),
        columns=list("ABCDE"),
    )
    ddf = dd.from_pandas(df, 3)

    assert_eq(ddf.loc["a", "A"], df.loc[["a"], "A"])
    assert_eq(ddf.loc["a", ["A"]], df.loc[["a"], ["A"]])
    assert_eq(ddf.loc["a":"o", "A"], df.loc["a":"o", "A"])
    assert_eq(ddf.loc["a":"o", ["A"]], df.loc["a":"o", ["A"]])
    assert_eq(ddf.loc[["n"], ["A"]], df.loc[["n"], ["A"]])
    assert_eq(ddf.loc[["a", "c", "n"], ["A"]], df.loc[["a", "c", "n"], ["A"]])
    assert_eq(ddf.loc[["t", "b"], ["A"]], df.loc[["t", "b"], ["A"]])
    assert_eq(
        ddf.loc[["r", "r", "c", "g", "h"], ["A"]],
        df.loc[["r", "r", "c", "g", "h"], ["A"]],
    )

def test_set_index_sorted_single_partition():
    df = pd.DataFrame({"x": [1, 2, 3, 4], "y": [1, 0, 1, 0]})
    ddf = dd.from_pandas(df, npartitions=1)
    assert_eq(ddf.set_index("x", sorted=True), df.set_index("x"))
