Example #1
def test_ufunc_numpy_scalar_comparison(pandas, scalar):
    # Regression test for issue #3392

    dask_compare = scalar >= dd.from_pandas(pandas, npartitions=3)
    pandas_compare = scalar >= pandas

    assert_eq(dask_compare, pandas_compare)
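These snippets appear to come mostly from the dask.dataframe test suites and rely on a shared preamble that the listing omits. A minimal sketch of the imports the examples seem to assume (fixture definitions vary per test module):

import os
from time import sleep

import numpy as np
import pandas as pd
import pytest

import dask
import dask.dataframe as dd
from dask.dataframe.utils import assert_eq  # compares a dask result against its pandas counterpart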
Example #2
def test_dumps_serialize_numpy(df):
    header, frames = serialize(df)
    if 'compression' in header:
        frames = decompress(header, frames)
    df2 = deserialize(header, frames)

    assert_eq(df, df2)
Example #3
def test_to_hdf_lock_delays():
    pytest.importorskip('tables')
    df16 = pd.DataFrame({'x': ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i',
                               'j', 'k', 'l', 'm', 'n', 'o', 'p'],
                         'y': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
                               15, 16]},
                        index=[1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11.,
                               12., 13., 14., 15., 16.])
    a = dd.from_pandas(df16, 16)

    # adding artificial delays to make sure the last tasks finish first
    # (i.e. simulating tasks completing out of order)
    def delayed_nop(i):
        if i[1] < 10:
            sleep(0.1 * (10 - i[1]))
        return i

    # saving to multiple hdf nodes
    with tmpfile() as fn:
        a = a.apply(delayed_nop, axis=1, meta=a)
        a.to_hdf(fn, '/data*')
        out = dd.read_hdf(fn, '/data*')
        assert_eq(df16, out)

    # saving to multiple hdf files
    # adding artificial delays to make sure last tasks finish first
    with tmpdir() as dn:
        fn = os.path.join(dn, 'data*')
        a = a.apply(delayed_nop, axis=1, meta=a)
        a.to_hdf(fn, '/data')
        out = dd.read_hdf(fn, '/data')
        assert_eq(df16, out)
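The `tmpfile` and `tmpdir` context managers used above are test helpers (dask ships similar ones in dask.utils). A rough stand-in, assuming only that they yield a temporary path and clean up afterwards:

import contextlib
import os
import shutil
import tempfile

@contextlib.contextmanager
def tmpfile(extension=''):
    # yield a temporary file name; remove the file on exit if it was created
    suffix = '.' + extension.lstrip('.') if extension else ''
    name = tempfile.mktemp(suffix=suffix)
    try:
        yield name
    finally:
        if os.path.exists(name):
            os.remove(name)

@contextlib.contextmanager
def tmpdir():
    # yield a temporary directory; remove it (and its contents) on exit
    name = tempfile.mkdtemp()
    try:
        yield name
    finally:
        shutil.rmtree(name, ignore_errors=True)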
Example #4
def test_groupby_unaligned_index():
    df = pd.DataFrame({'a': np.random.randint(0, 10, 50),
                       'b': np.random.randn(50),
                       'c': np.random.randn(50)})
    ddf = dd.from_pandas(df, npartitions=5)
    filtered = df[df.b < 0.5]
    dfiltered = ddf[ddf.b < 0.5]

    ddf_group = dfiltered.groupby(ddf.a)
    ds_group = dfiltered.b.groupby(ddf.a)

    bad = [ddf_group.mean(),
           ddf_group.var(),
           ddf_group.b.nunique(),
           ddf_group.get_group(0),
           ds_group.mean(),
           ds_group.var(),
           ds_group.nunique(),
           ds_group.get_group(0)]

    for obj in bad:
        with pytest.raises(ValueError):
            obj.compute()

    def add1(x):
        return x + 1

    df_group = filtered.groupby(df.a)
    good = [(ddf_group.apply(add1, meta=ddf), df_group.apply(add1)),
            (ddf_group.b.apply(add1, meta=ddf.b), df_group.b.apply(add1))]

    for (res, sol) in good:
        assert_eq(res, sol)
Example #5
def test_groupby_agg_custom__mode():
    # The mode function passes its intermediates around as pure python objects.
    # To keep pandas from unpacking the results inside apply, they are
    # returned as single-item lists.
    def agg_mode(s):
        def impl(s):
            res, = s.iloc[0]

            for i, in s.iloc[1:]:
                res = res.add(i, fill_value=0)

            return [res]

        return s.apply(impl)

    agg_func = dd.Aggregation(
        'custom_mode',
        lambda s: s.apply(lambda s: [s.value_counts()]),
        agg_mode,
        lambda s: s.map(lambda i: i[0].argmax()),
    )

    d = pd.DataFrame({
        'g0': [0, 0, 0, 1, 1] * 3,
        'g1': [0, 0, 0, 1, 1] * 3,
        'cc': [4, 5, 4, 6, 6] * 3,
    })
    a = dd.from_pandas(d, npartitions=5)

    actual = a['cc'].groupby([a['g0'], a['g1']]).agg(agg_func)

    # cheat to get the correct index
    expected = pd.DataFrame({'g0': [0, 1], 'g1': [0, 1], 'cc': [4, 6]})
    expected = expected['cc'].groupby([expected['g0'], expected['g1']]).agg('sum')

    assert_eq(actual, expected)
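dd.Aggregation wires three callbacks into dask's split-apply-combine groupby: chunk runs on each partition's grouped series, agg combines the per-partition results, and the optional finalize post-processes the combined value (here it extracts the mode from the summed value counts). A much simpler custom aggregation, sketched along the lines of the dask documentation:

import pandas as pd
import dask.dataframe as dd

# a custom "sum" aggregation: ``chunk`` reduces each partition,
# ``agg`` reduces the concatenated per-partition results
custom_sum = dd.Aggregation(
    name='custom_sum',
    chunk=lambda s: s.sum(),
    agg=lambda s0: s0.sum(),
)

pdf = pd.DataFrame({'g': [0, 0, 1, 1], 'x': [1, 2, 3, 4]})
ddf = dd.from_pandas(pdf, npartitions=2)

result = ddf.groupby('g')['x'].agg(custom_sum).compute()
assert (result == pdf.groupby('g')['x'].sum()).all()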
Example #6
def test_set_index_divisions_sorted():
    p1 = pd.DataFrame({'x': [10, 11, 12], 'y': ['a', 'a', 'a']})
    p2 = pd.DataFrame({'x': [13, 14, 15], 'y': ['b', 'b', 'c']})
    p3 = pd.DataFrame({'x': [16, 17, 18], 'y': ['d', 'e', 'e']})

    ddf = dd.DataFrame({('x', 0): p1, ('x', 1): p2, ('x', 2): p3},
                       'x', p1, [None, None, None, None])
    df = ddf.compute()

    def throw(*args, **kwargs):
        raise Exception("Shouldn't have computed")

    with dask.set_options(get=throw):
        res = ddf.set_index('x', divisions=[10, 13, 16, 18], sorted=True)
    assert_eq(res, df.set_index('x'))

    with dask.set_options(get=throw):
        res = ddf.set_index('y', divisions=['a', 'b', 'd', 'e'], sorted=True)
    assert_eq(res, df.set_index('y'))

    # with sorted=True, divisions must be same length as df.divisions
    with pytest.raises(ValueError):
        ddf.set_index('y', divisions=['a', 'b', 'c', 'd', 'e'], sorted=True)

    # Divisions must be sorted
    with pytest.raises(ValueError):
        ddf.set_index('y', divisions=['a', 'b', 'd', 'c'], sorted=True)
Example #7
def test_hash_pandas_object(obj):
    a = hash_pandas_object(obj)
    b = hash_pandas_object(obj)
    if isinstance(a, np.ndarray):
        np.testing.assert_equal(a, b)
    else:
        assert_eq(a, b)
Example #8
def test_filters(tmpdir, write_engine, read_engine):
    fn = str(tmpdir)

    df = pd.DataFrame({'at': ['ab', 'aa', 'ba', 'da', 'bb']})
    ddf = dd.from_pandas(df, npartitions=1)

    # Ok with 1 partition and filters
    ddf.repartition(npartitions=1, force=True).to_parquet(fn, write_index=False,
                                                          engine=write_engine)
    ddf2 = dd.read_parquet(fn, index=False, engine=read_engine,
                           filters=[('at', '==', 'aa')]).compute()
    assert_eq(ddf2, ddf)

    # with >1 partition and no filters
    ddf.repartition(npartitions=2, force=True).to_parquet(fn, engine=write_engine)
    dd.read_parquet(fn, engine=read_engine).compute()
    assert_eq(ddf2, ddf)

    # with >1 partition and filters using base fastparquet
    if read_engine == 'fastparquet':
        ddf.repartition(npartitions=2, force=True).to_parquet(fn, engine=write_engine)
        df2 = fastparquet.ParquetFile(fn).to_pandas(filters=[('at', '==', 'aa')])
        assert len(df2) > 0

    # with >1 partition and filters
    ddf.repartition(npartitions=2, force=True).to_parquet(fn, engine=write_engine)
    ddf3 = dd.read_parquet(fn, engine=read_engine,
                           filters=[('at', '==', 'aa')]).compute()
    assert len(ddf3) > 0
Example #9
def test_no_index(tmpdir, write_engine, read_engine):
    fn = str(tmpdir)
    df = pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6]})
    ddf = dd.from_pandas(df, npartitions=2)
    ddf.to_parquet(fn, write_index=False, engine=write_engine)
    ddf2 = dd.read_parquet(fn, engine=read_engine)
    assert_eq(df, ddf2, check_index=False)
Example #10
def test_skiprows(dd_read, pd_read, files):
    files = {name: comment_header + b'\n' + content for name, content in files.items()}
    skip = len(comment_header.splitlines())
    with filetexts(files, mode='b'):
        df = dd_read('2014-01-*.csv', skiprows=skip)
        expected_df = pd.concat([pd_read(n, skiprows=skip) for n in sorted(files)])
        assert_eq(df, expected_df, check_dtype=False)
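filetexts (and filetext in later examples) are test helpers that write the given contents to disk for the duration of the with block; comment_header and the files mapping come from fixtures not shown here. A rough, self-contained stand-in for the helper, under the assumption that it behaves as described:

import contextlib
import os

@contextlib.contextmanager
def filetexts(d, mode='t'):
    # write each {filename: contents} pair to disk, then remove the files on exit
    for fn, text in d.items():
        with open(fn, 'wb' if mode == 'b' else 'w') as f:
            f.write(text)
    try:
        yield list(d)
    finally:
        for fn in d:
            if os.path.exists(fn):
                os.remove(fn)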
Example #11
def test_error_if_sample_is_too_small():
    text = ('AAAAA,BBBBB,CCCCC,DDDDD,EEEEE\n'
            '1,2,3,4,5\n'
            '6,7,8,9,10\n'
            '11,12,13,14,15')
    with filetext(text) as fn:
        # Sample size stops mid header row
        sample = 20
        with pytest.raises(ValueError):
            dd.read_csv(fn, sample=sample)

        # Saying no header means this is fine
        assert_eq(dd.read_csv(fn, sample=sample, header=None),
                  pd.read_csv(fn, header=None))

    skiptext = ('# skip\n'
                '# these\n'
                '# lines\n')

    text = skiptext + text
    with filetext(text) as fn:
        # Sample size stops mid header row
        sample = 20 + len(skiptext)
        with pytest.raises(ValueError):
            dd.read_csv(fn, sample=sample, skiprows=3)

        # Saying no header means this is fine
        assert_eq(dd.read_csv(fn, sample=sample, header=None, skiprows=3),
                  pd.read_csv(fn, header=None, skiprows=3))
Example #12
def test_read_csv(dd_read, pd_read, text, sep):
    with filetext(text) as fn:
        f = dd_read(fn, blocksize=30, lineterminator=os.linesep, sep=sep)
        assert list(f.columns) == ['name', 'amount']
        # index may be different
        result = f.compute(scheduler='sync').reset_index(drop=True)
        assert_eq(result, pd_read(fn, sep=sep))
Example #13
def test_header_None():
    with filetexts({'.tmp.1.csv': '1,2',
                    '.tmp.2.csv': '',
                    '.tmp.3.csv': '3,4'}):
        df = dd.read_csv('.tmp.*.csv', header=None)
        expected = pd.DataFrame({0: [1, 3], 1: [2, 4]})
        assert_eq(df.compute().reset_index(drop=True), expected)
Example #14
def test_pivot_table(aggfunc):
    df = pd.DataFrame({'A': np.random.choice(list('XYZ'), size=100),
                       'B': np.random.randn(100),
                       'C': pd.Categorical(np.random.choice(list('abc'), size=100))})
    ddf = dd.from_pandas(df, 5)

    res = dd.pivot_table(ddf, index='A', columns='C', values='B',
                         aggfunc=aggfunc)
    exp = pd.pivot_table(df, index='A', columns='C', values='B',
                         aggfunc=aggfunc)
    if aggfunc == 'count':
        # depending on the divisions, the dask result may contain NaN and therefore cannot keep int64 dtype
        exp = exp.astype(np.float64)

    assert_eq(res, exp)

    # method
    res = ddf.pivot_table(index='A', columns='C', values='B',
                          aggfunc=aggfunc)
    exp = df.pivot_table(index='A', columns='C', values='B',
                         aggfunc=aggfunc)
    if aggfunc == 'count':
        # depending on the divisions, the dask result may contain NaN and therefore cannot keep int64 dtype
        exp = exp.astype(np.float64)
    assert_eq(res, exp)
Example #15
def test_frame_2ufunc_out():
    input_matrix = np.random.randint(1, 100, size=(20, 2))

    df = pd.DataFrame(input_matrix, columns=['A', 'B'])
    ddf = dd.from_pandas(df, 3)

    # column number mismatch
    df_out = pd.DataFrame(np.random.randint(1, 100, size=(20, 3)),
                          columns=['X', 'Y', 'Z'])
    ddf_out = dd.from_pandas(df_out, 3)

    with pytest.raises(ValueError):
        np.sin(ddf, out=ddf_out)

    # type mismatch: a Series passed as ``out`` for a DataFrame input
    ddf_out = dd.from_pandas(pd.Series([0]), 1)
    with pytest.raises(TypeError):
        np.sin(ddf, out=ddf_out)

    df_out = pd.DataFrame(np.random.randint(1, 100, size=(20, 2)),
                          columns=['X', 'Y'])
    ddf_out = dd.from_pandas(df_out, 3)

    np.sin(ddf, out=ddf_out)
    np.add(ddf_out, 10, out=ddf_out)

    expected = pd.DataFrame(np.sin(input_matrix) + 10, columns=['A', 'B'])

    assert_eq(ddf_out, expected)
Example #16
def test_writing_parquet_with_kwargs(tmpdir, engine):
    fn = str(tmpdir)
    path1 = os.path.join(fn, 'normal')
    path2 = os.path.join(fn, 'partitioned')
    pytest.importorskip("snappy")

    df = pd.DataFrame({'a': np.random.choice(['A', 'B', 'C'], size=100),
                       'b': np.random.random(size=100),
                       'c': np.random.randint(1, 5, size=100)})
    ddf = dd.from_pandas(df, npartitions=3)

    engine_kwargs = {
        'pyarrow': {
            'compression': 'snappy',
            'coerce_timestamps': None,
            'use_dictionary': True
        },
        'fastparquet': {
            'compression': 'snappy',
            'times': 'int64',
            'fixed_text': None
        }
    }

    ddf.to_parquet(path1, engine=engine, **engine_kwargs[engine])
    out = dd.read_parquet(path1, engine=engine, infer_divisions=should_check_divs(engine))
    assert_eq(out, ddf, check_index=(engine != 'fastparquet'), check_divisions=should_check_divs(engine))

    # Avoid race condition in pyarrow 0.8.0 on writing partitioned datasets
    with dask.config.set(scheduler='sync'):
        ddf.to_parquet(path2, engine=engine, partition_on=['a'],
                       **engine_kwargs[engine])
    out = dd.read_parquet(path2, engine=engine).compute()
    for val in df.a.unique():
        assert set(df.b[df.a == val]) == set(out.b[out.a == val])
Example #17
def test_concat(join):
    pdf1 = pd.DataFrame({'x': [1, 2, 3, 4, 6, 7],
                         'y': list('abcdef')},
                        index=[1, 2, 3, 4, 6, 7])
    ddf1 = dd.from_pandas(pdf1, 2)
    pdf2 = pd.DataFrame({'x': [1, 2, 3, 4, 6, 7],
                         'y': list('abcdef')},
                        index=[8, 9, 10, 11, 12, 13])
    ddf2 = dd.from_pandas(pdf2, 2)

    # different columns
    pdf3 = pd.DataFrame({'x': [1, 2, 3, 4, 6, 7],
                         'z': list('abcdef')},
                        index=[8, 9, 10, 11, 12, 13])
    ddf3 = dd.from_pandas(pdf3, 2)

    for (dd1, dd2, pd1, pd2) in [(ddf1, ddf2, pdf1, pdf2),
                                 (ddf1, ddf3, pdf1, pdf3)]:
        result = dd.concat([dd1, dd2], join=join)
        expected = pd.concat([pd1, pd2], join=join)
        assert_eq(result, expected)

    # test outer only, inner has a problem on pandas side
    for (dd1, dd2, pd1, pd2) in [(ddf1, ddf2, pdf1, pdf2),
                                 (ddf1, ddf3, pdf1, pdf3),
                                 (ddf1.x, ddf2.x, pdf1.x, pdf2.x),
                                 (ddf1.x, ddf3.z, pdf1.x, pdf3.z),
                                 (ddf1.x, ddf2.x, pdf1.x, pdf2.x),
                                 (ddf1.x, ddf3.z, pdf1.x, pdf3.z)]:
        result = dd.concat([dd1, dd2])
        expected = pd.concat([pd1, pd2])
        assert_eq(result, expected)
Example #18
def test_from_dask_array_index(as_frame):
    s = dd.from_pandas(pd.Series(range(10), index=list('abcdefghij')),
                       npartitions=3)
    if as_frame:
        s = s.to_frame()
    result = dd.from_dask_array(s.values, index=s.index)
    assert_eq(s, result)
Example #19
def test_simple(db):
    # single chunk
    data = read_sql_table('test', db, npartitions=2, index_col='number'
                          ).compute()
    assert (data.name == df.name).all()
    assert data.index.name == 'number'
    assert_eq(data, df)
Example #20
def test_get_dummies(data):
    exp = pd.get_dummies(data)

    ddata = dd.from_pandas(data, 2)
    res = dd.get_dummies(ddata)
    assert_eq(res, exp)
    tm.assert_index_equal(res.columns, exp.columns)
Example #21
def test_loc_on_pandas_datetimes():
    df = pd.DataFrame({'x': [1, 2, 3]},
                      index=list(map(pd.Timestamp, ['2014', '2015', '2016'])))
    a = dd.from_pandas(df, 2)
    a.divisions = list(map(pd.Timestamp, a.divisions))

    assert_eq(a.loc['2014': '2015'], a.loc['2014': '2015'])
Example #22
def test_read_chunked(block):
    with tmpdir() as path:
        fn = os.path.join(path, '1.json')
        df.to_json(fn, orient='records', lines=True)
        d = dd.read_json(fn, blocksize=block, sample=10)
        assert (d.npartitions > 1) or (block > 50)
        assert_eq(d, df, check_index=False)
Example #23
def test_time_rolling(before, after):
    window = before
    before = pd.Timedelta(before)
    after = pd.Timedelta(after)
    result = dts.map_overlap(lambda x: x.rolling(window).count(), before, after)
    expected = dts.compute().rolling(window).count()
    assert_eq(result, expected)
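The dts fixture is not shown in the listing; judging from the calls made on it, it is a dask Series or DataFrame with a sorted DatetimeIndex, since map_overlap(func, before, after) with pd.Timedelta bounds needs time-based divisions. A plausible stand-in and a self-contained version of the same pattern:

import pandas as pd
import dask.dataframe as dd

# a guess at the ``dts`` fixture: values on a regular DatetimeIndex
pser = pd.Series(range(120),
                 index=pd.date_range('2000-01-01', periods=120, freq='1min'))
dts = dd.from_pandas(pser, npartitions=4)

# time-based rolling count via map_overlap, mirroring the test above
before, after = pd.Timedelta('5min'), pd.Timedelta('1min')
result = dts.map_overlap(lambda x: x.rolling('5min').count(), before, after)

computed = result.compute()
expected = pser.rolling('5min').count()
assert len(computed) == len(expected)
assert (computed.values == expected.values).all()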
Example #24
def test_dataframe_groupby_nunique_across_group_same_value():
    strings = list('aaabbccccdddeee')
    data = list(map(int, '123111223323412'))
    ps = pd.DataFrame(dict(strings=strings, data=data))
    s = dd.from_pandas(ps, npartitions=3)
    expected = ps.groupby('strings')['data'].nunique()
    assert_eq(s.groupby('strings')['data'].nunique(), expected)
Example #25
def test_groupby_column_and_index_apply(group_args, apply_func):
    df = pd.DataFrame({'idx': [1, 1, 1, 2, 2, 2],
                       'a': [1, 2, 1, 2, 1, 2],
                       'b': np.arange(6)}
                      ).set_index('idx')

    ddf = dd.from_pandas(df, npartitions=df.index.nunique())
    ddf_no_divs = dd.from_pandas(df, npartitions=df.index.nunique(), sort=False)

    # Expected result
    expected = df.groupby(group_args).apply(apply_func)

    # Compute on dask DataFrame with divisions (no shuffling)
    result = ddf.groupby(group_args).apply(apply_func)
    assert_eq(expected, result, check_divisions=False)

    # Check that partitioning is preserved
    assert ddf.divisions == result.divisions

    # Check that no shuffling occurred.
    # The groupby operation should add only 1 task per partition
    assert len(result.dask) == (len(ddf.dask) + ddf.npartitions)

    # Compute on dask DataFrame without divisions (requires shuffling)
    result = ddf_no_divs.groupby(group_args).apply(apply_func)
    assert_eq(expected, result, check_divisions=False)

    # Check that divisions were preserved (all None in this case)
    assert ddf_no_divs.divisions == result.divisions

    # Crude check to see if shuffling was performed.
    # The groupby operation should add more than 1 task per partition
    assert len(result.dask) > (len(ddf_no_divs.dask) + ddf_no_divs.npartitions)
Example #26
def test_dataframe_groupby_nunique():
    strings = list('aaabbccccdddeee')
    data = np.random.randn(len(strings))
    ps = pd.DataFrame(dict(strings=strings, data=data))
    s = dd.from_pandas(ps, npartitions=3)
    expected = ps.groupby('strings')['data'].nunique()
    assert_eq(s.groupby('strings')['data'].nunique(), expected)
Example #27
def test_concat4_interleave_partitions():
    pdf1 = pd.DataFrame(np.random.randn(10, 5),
                        columns=list('ABCDE'), index=list('abcdefghij'))
    pdf2 = pd.DataFrame(np.random.randn(13, 5),
                        columns=list('ABCDE'), index=list('fghijklmnopqr'))
    pdf3 = pd.DataFrame(np.random.randn(13, 6),
                        columns=list('CDEXYZ'), index=list('fghijklmnopqr'))

    ddf1 = dd.from_pandas(pdf1, 2)
    ddf2 = dd.from_pandas(pdf2, 3)
    ddf3 = dd.from_pandas(pdf3, 2)

    msg = ('All inputs have known divisions which cannot be '
           'concatenated in order. Specify '
           'interleave_partitions=True to ignore order')

    cases = [[ddf1, ddf1], [ddf1, ddf2], [ddf1, ddf3], [ddf2, ddf1],
             [ddf2, ddf3], [ddf3, ddf1], [ddf3, ddf2]]
    for case in cases:
        pdcase = [c.compute() for c in case]

        with pytest.raises(ValueError) as err:
            dd.concat(case)
        assert msg in str(err.value)

        assert_eq(dd.concat(case, interleave_partitions=True),
                  pd.concat(pdcase))
        assert_eq(dd.concat(case, join='inner', interleave_partitions=True),
                  pd.concat(pdcase, join='inner'))

    msg = "'join' must be 'inner' or 'outer'"
    with pytest.raises(ValueError) as err:
        dd.concat([ddf1, ddf1], join='invalid', interleave_partitions=True)
    assert msg in str(err.value)
Example #28
def test_late_dtypes():
    text = 'numbers,names,more_numbers,integers\n'
    for i in range(1000):
        text += '1,foo,2,3\n'
    text += '1.5,bar,2.5,3\n'
    with filetext(text) as fn:
        sol = pd.read_csv(fn)
        with pytest.raises(ValueError) as e:
            dd.read_csv(fn, sample=50).compute(get=get_sync)

        msg = ("Mismatched dtypes found.\n"
               "Expected integers, but found floats for columns:\n"
               "- 'more_numbers'\n"
               "- 'numbers'\n"
               "\n"
               "To fix, specify dtypes manually by adding:\n"
               "\n"
               "dtype={'more_numbers': float,\n"
               "       'numbers': float}\n"
               "\n"
               "to the call to `read_csv`/`read_table`.\n"
               "\n"
               "Alternatively, provide `assume_missing=True` to interpret "
               "all unspecified integer columns as floats.")

        assert str(e.value) == msg

        # Specifying dtypes works
        res = dd.read_csv(fn, sample=50,
                          dtype={'more_numbers': float, 'numbers': float})
        assert_eq(res, sol)
Example #29
def test_read_csv_header_issue_823():
    text = '''a b c-d\n1 2 3\n4 5 6'''.replace(' ', '\t')
    with filetext(text) as fn:
        df = dd.read_csv(fn, sep='\t')
        assert_eq(df, pd.read_csv(fn, sep='\t'))

        df = dd.read_csv(fn, delimiter='\t')
        assert_eq(df, pd.read_csv(fn, delimiter='\t'))
Example #30
def test_skiprows_as_list(dd_read, pd_read, files, units):
    files = {name: (comment_header + b'\n' +
                    content.replace(b'\n', b'\n' + units, 1)) for name, content in files.items()}
    skip = [0, 1, 2, 3, 5]
    with filetexts(files, mode='b'):
        df = dd_read('2014-01-*.csv', skiprows=skip)
        expected_df = pd.concat([pd_read(n, skiprows=skip) for n in sorted(files)])
        assert_eq(df, expected_df, check_dtype=False)
Example #31
def test_columns_index_with_multi_index(tmpdir, engine):
    fn = os.path.join(str(tmpdir), 'test.parquet')
    index = pd.MultiIndex.from_arrays(
        [np.arange(10), np.arange(10) + 1], names=['x0', 'x1'])
    df = pd.DataFrame(np.random.randn(10, 2), columns=['a', 'b'], index=index)
    df2 = df.reset_index(drop=False)

    if engine == 'fastparquet':
        fastparquet.write(fn, df, write_index=True)

        # fastparquet doesn't support multi-index
        with pytest.raises(ValueError):
            ddf = dd.read_parquet(fn, engine=engine)
    else:
        import pyarrow as pa
        pq.write_table(pa.Table.from_pandas(df), fn)

        # Pyarrow supports multi-index reads
        ddf = dd.read_parquet(fn, engine=engine)
        assert_eq(ddf, df)

        d = dd.read_parquet(fn, columns='a', engine=engine)
        assert_eq(d, df['a'])

        d = dd.read_parquet(fn,
                            index=['a', 'b'],
                            columns=['x0', 'x1'],
                            engine=engine)
        assert_eq(d, df2.set_index(['a', 'b'])[['x0', 'x1']])

    # Just index
    d = dd.read_parquet(fn, index=False, engine=engine)
    assert_eq(d, df2)

    d = dd.read_parquet(fn, index=['a'], engine=engine)
    assert_eq(d, df2.set_index('a')[['b']])

    d = dd.read_parquet(fn, index=['x0'], engine=engine)
    assert_eq(d, df2.set_index('x0')[['a', 'b']])

    # Just columns
    d = dd.read_parquet(fn, columns=['x0', 'a'], engine=engine)
    assert_eq(d, df2.set_index('x1')[['x0', 'a']])

    # Both index and columns
    d = dd.read_parquet(fn, index=False, columns=['x0', 'b'], engine=engine)
    assert_eq(d, df2[['x0', 'b']])

    for index in ['x1', 'b']:
        d = dd.read_parquet(fn,
                            index=index,
                            columns=['x0', 'a'],
                            engine=engine)
        assert_eq(d, df2.set_index(index)[['x0', 'a']])

    # Columns and index intersect
    for index in ['a', 'x0']:
        with pytest.raises(ValueError):
            d = dd.read_parquet(fn,
                                index=index,
                                columns=['x0', 'a'],
                                engine=engine)

    # Series output
    for ind, col, sol_df in [(None, 'x0', df2.set_index('x1')),
                             (False, 'b', df2), (False, 'x0', df2),
                             ('a', 'x0', df2.set_index('a')),
                             ('a', 'b', df2.set_index('a'))]:
        d = dd.read_parquet(fn, index=ind, columns=col, engine=engine)
        assert_eq(d, sol_df[col])
Example #32
def rolling_functions_tests(p, d):
    # Old-fashioned rolling API
    with pytest.warns(FutureWarning):
        assert_eq(pd.rolling_count(p, 3), dd.rolling_count(d, 3))
        assert_eq(pd.rolling_sum(p, 3), dd.rolling_sum(d, 3))
        assert_eq(pd.rolling_mean(p, 3), dd.rolling_mean(d, 3))
        assert_eq(pd.rolling_median(p, 3), dd.rolling_median(d, 3))
        assert_eq(pd.rolling_min(p, 3), dd.rolling_min(d, 3))
        assert_eq(pd.rolling_max(p, 3), dd.rolling_max(d, 3))
        assert_eq(pd.rolling_std(p, 3), dd.rolling_std(d, 3))
        assert_eq(pd.rolling_var(p, 3), dd.rolling_var(d, 3))
        # see note around test_rolling_dataframe for logic concerning precision
        assert_eq(pd.rolling_skew(p, 3),
                  dd.rolling_skew(d, 3),
                  check_less_precise=True)
        assert_eq(pd.rolling_kurt(p, 3),
                  dd.rolling_kurt(d, 3),
                  check_less_precise=True)
        assert_eq(pd.rolling_quantile(p, 3, 0.5),
                  dd.rolling_quantile(d, 3, 0.5))
        assert_eq(pd.rolling_apply(p, 3, mad), dd.rolling_apply(d, 3, mad))
        assert_eq(pd.rolling_window(p, 3, win_type='boxcar'),
                  dd.rolling_window(d, 3, win_type='boxcar'))
        # Test with edge-case window sizes
        assert_eq(pd.rolling_sum(p, 0), dd.rolling_sum(d, 0))
        assert_eq(pd.rolling_sum(p, 1), dd.rolling_sum(d, 1))
        # Test with kwargs
        assert_eq(pd.rolling_sum(p, 3, min_periods=3),
                  dd.rolling_sum(d, 3, min_periods=3))
Example #33
def test_from_dask_array_index(as_frame):
    s = dd.from_pandas(pd.Series(range(10), index=list("abcdefghij")), npartitions=3)
    if as_frame:
        s = s.to_frame()
    result = dd.from_dask_array(s.values, index=s.index)
    assert_eq(s, result)
Example #34
def test_window_sum_dataframe(stream):
    df = cudf.DataFrame({'x': [1, 2, 3], 'y': [4, 5, 6]})
    sdf = DataFrame(example=df, stream=stream)
    L = sdf.window(n=4).sum().stream.gather().sink_to_list()

    sdf.emit(df)
    assert_eq(L[0], cudf.Series([6, 15], index=['x', 'y']))
    sdf.emit(df)
    assert_eq(L[0], cudf.Series([6, 15], index=['x', 'y']))
    assert_eq(L[1], cudf.Series([9, 21], index=['x', 'y']))
    sdf.emit(df)
    assert_eq(L[0], cudf.Series([6, 15], index=['x', 'y']))
    assert_eq(L[1], cudf.Series([9, 21], index=['x', 'y']))
    assert_eq(L[2], cudf.Series([9, 21], index=['x', 'y']))
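This example exercises streaming dataframes rather than dask.dataframe: DataFrame is presumably streamz.dataframe.DataFrame (or its cudf-backed variant), stream is a streamz source fixture, and assert_eq here is a cudf-aware comparison helper. A guess at the preamble it assumes (exact import paths differ between streamz, custreamz and cudf versions):

import cudf
from streamz import Stream
from streamz.dataframe import DataFrame  # streaming dataframe wrapper

stream = Stream()  # the ``stream`` fixture: a plain streamz source
# ``assert_eq`` compares cudf objects; its import location varies by cudf release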
Example #35
def test_set_index_overlap():
    A = pd.DataFrame({"key": [1, 2, 3, 4, 4, 5, 6, 7], "value": list("abcd" * 2)})
    a = dd.from_pandas(A, npartitions=2)
    a = a.set_index("key", sorted=True)
    b = a.repartition(divisions=a.divisions)
    assert_eq(a, b)
Example #36
def test_set_index_tasks(npartitions):
    df = pd.DataFrame({'x': np.random.random(100),
                       'y': np.random.random(100) // 0.2},
                      index=np.random.random(100))

    ddf = dd.from_pandas(df, npartitions=npartitions)

    assert_eq(df.set_index('x'),
              ddf.set_index('x', shuffle='tasks'))

    assert_eq(df.set_index('y'),
              ddf.set_index('y', shuffle='tasks'))

    assert_eq(df.set_index(df.x),
              ddf.set_index(ddf.x, shuffle='tasks'))

    assert_eq(df.set_index(df.x + df.y),
              ddf.set_index(ddf.x + ddf.y, shuffle='tasks'))

    assert_eq(df.set_index(df.x + 1),
              ddf.set_index(ddf.x + 1, shuffle='tasks'))

    assert_eq(df.set_index(df.index),
              ddf.set_index(ddf.index, shuffle='tasks'))
Example #37
def test_loc2d():
    # index indexer is always regarded as slice for duplicated values
    assert_eq(d.loc[5, "a"], full.loc[5:5, "a"])
    # assert_eq(d.loc[[5], 'a'], full.loc[[5], 'a'])
    assert_eq(d.loc[5, ["a"]], full.loc[5:5, ["a"]])
    # assert_eq(d.loc[[5], ['a']], full.loc[[5], ['a']])

    assert_eq(d.loc[3:8, "a"], full.loc[3:8, "a"])
    assert_eq(d.loc[:8, "a"], full.loc[:8, "a"])
    assert_eq(d.loc[3:, "a"], full.loc[3:, "a"])
    assert_eq(d.loc[[8], "a"], full.loc[[8], "a"])

    assert_eq(d.loc[3:8, ["a"]], full.loc[3:8, ["a"]])
    assert_eq(d.loc[:8, ["a"]], full.loc[:8, ["a"]])
    assert_eq(d.loc[3:, ["a"]], full.loc[3:, ["a"]])

    # 3d
    with pytest.raises(pd.core.indexing.IndexingError):
        d.loc[3, 3, 3]

    # Series should raise
    with pytest.raises(pd.core.indexing.IndexingError):
        d.a.loc[3, 3]

    with pytest.raises(pd.core.indexing.IndexingError):
        d.a.loc[3:, 3]

    with pytest.raises(pd.core.indexing.IndexingError):
        d.a.loc[d.a % 2 == 0, 3]
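The .loc examples use module-level fixtures d (a dask DataFrame) and full (its pandas counterpart), which the listing does not show. A minimal stand-in, assuming roughly the shape used in dask's own indexing tests:

import pandas as pd
import dask.dataframe as dd
from dask.dataframe.utils import assert_eq

# a guess at the ``full``/``d`` fixtures: a small frame with a sorted,
# partly duplicated integer index, split into three partitions
full = pd.DataFrame({'a': [1, 2, 3, 4, 5, 6, 7, 8, 9],
                     'b': [4, 5, 6, 3, 2, 1, 0, 0, 0]},
                    index=[0, 1, 3, 5, 6, 8, 9, 9, 9])
d = dd.from_pandas(full, npartitions=3)

assert_eq(d.loc[3:8, 'a'], full.loc[3:8, 'a'])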
Example #38
def test_to_sql(npartitions, parallel):
    df_by_age = df.set_index("age")
    df_appended = pd.concat([
        df,
        df,
    ])

    ddf = dd.from_pandas(df, npartitions)
    ddf_by_age = ddf.set_index("age")

    # Simple round trip test: use existing "number" index_col
    with tmp_db_uri() as uri:
        ddf.to_sql("test", uri, parallel=parallel)
        result = read_sql_table("test", uri, "number")
        assert_eq(df, result)

    # Test writing no index, and reading back in with one of the other columns as index (`read_sql_table` requires
    # an index_col)
    with tmp_db_uri() as uri:
        ddf.to_sql("test", uri, parallel=parallel, index=False)

        result = read_sql_table("test", uri, "negish")
        assert_eq(df.set_index("negish"), result)

        result = read_sql_table("test", uri, "age")
        assert_eq(df_by_age, result)

    # Index by "age" instead
    with tmp_db_uri() as uri:
        ddf_by_age.to_sql("test", uri, parallel=parallel)
        result = read_sql_table("test", uri, "age")
        assert_eq(df_by_age, result)

    # Index column can't have "object" dtype if no partitions are provided
    with tmp_db_uri() as uri:
        ddf.set_index("name").to_sql("test", uri)
        with pytest.raises(
                TypeError,
                match=
                'Provided index column is of type "object".  If divisions is not provided the index column type must be numeric or datetime.',  # noqa: E501
        ):
            read_sql_table("test", uri, "name")

    # Test various "if_exists" values
    with tmp_db_uri() as uri:
        ddf.to_sql("test", uri)

        # Writing a table that already exists fails
        with pytest.raises(ValueError, match="Table 'test' already exists"):
            ddf.to_sql("test", uri)

        ddf.to_sql("test", uri, parallel=parallel, if_exists="append")
        result = read_sql_table("test", uri, "number")

        assert_eq(df_appended, result)

        ddf_by_age.to_sql("test", uri, parallel=parallel, if_exists="replace")
        result = read_sql_table("test", uri, "age")
        assert_eq(df_by_age, result)

    # Verify number of partitions returned, when compute=False
    with tmp_db_uri() as uri:
        result = ddf.to_sql("test", uri, parallel=parallel, compute=False)

        # the first result is from the "meta" insert
        actual = len(result.compute())

        assert actual == npartitions
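tmp_db_uri is a helper that yields a throwaway database URI (the dask SQL tests use SQLite), df is a module-level pandas frame with number, name, age and negish columns, and read_sql_table is dask.dataframe.read_sql_table. A hypothetical sketch of the URI helper:

import contextlib
import os
import tempfile

@contextlib.contextmanager
def tmp_db_uri():
    # hypothetical helper: a fresh on-disk SQLite database per ``with`` block
    with tempfile.TemporaryDirectory() as d:
        yield 'sqlite:///' + os.path.join(d, 'test.db')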
Example #39
def test_set_index_drop(drop):
    pdf = pd.DataFrame({'A': list('ABAABBABAA'),
                        'B': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
                        'C': [1, 2, 3, 2, 1, 3, 2, 4, 2, 3]})
    ddf = dd.from_pandas(pdf, 3)

    assert_eq(ddf.set_index('A', drop=drop),
              pdf.set_index('A', drop=drop))
    assert_eq(ddf.set_index('B', drop=drop),
              pdf.set_index('B', drop=drop))
    assert_eq(ddf.set_index('C', drop=drop),
              pdf.set_index('C', drop=drop))
    assert_eq(ddf.set_index(ddf.A, drop=drop),
              pdf.set_index(pdf.A, drop=drop))
    assert_eq(ddf.set_index(ddf.B, drop=drop),
              pdf.set_index(pdf.B, drop=drop))
    assert_eq(ddf.set_index(ddf.C, drop=drop),
              pdf.set_index(pdf.C, drop=drop))

    # numeric columns
    pdf = pd.DataFrame({0: list('ABAABBABAA'),
                        1: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
                        2: [1, 2, 3, 2, 1, 3, 2, 4, 2, 3]})
    ddf = dd.from_pandas(pdf, 3)
    assert_eq(ddf.set_index(0, drop=drop),
              pdf.set_index(0, drop=drop))
    assert_eq(ddf.set_index(2, drop=drop),
              pdf.set_index(2, drop=drop))
Example #40
def test_set_index_tasks(npartitions):
    df = pd.DataFrame(
        {"x": np.random.random(100), "y": np.random.random(100) // 0.2},
        index=np.random.random(100),
    )

    ddf = dd.from_pandas(df, npartitions=npartitions)

    assert_eq(df.set_index("x"), ddf.set_index("x", shuffle="tasks"))

    assert_eq(df.set_index("y"), ddf.set_index("y", shuffle="tasks"))

    assert_eq(df.set_index(df.x), ddf.set_index(ddf.x, shuffle="tasks"))

    assert_eq(df.set_index(df.x + df.y), ddf.set_index(ddf.x + ddf.y, shuffle="tasks"))

    assert_eq(df.set_index(df.x + 1), ddf.set_index(ddf.x + 1, shuffle="tasks"))

    assert_eq(df.set_index(df.index), ddf.set_index(ddf.index, shuffle="tasks"))
Example #41
def test_to_hdf_lock_delays():
    pytest.importorskip("tables")
    df16 = pd.DataFrame(
        {
            "x": [
                "a",
                "b",
                "c",
                "d",
                "e",
                "f",
                "g",
                "h",
                "i",
                "j",
                "k",
                "l",
                "m",
                "n",
                "o",
                "p",
            ],
            "y": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16],
        },
        index=[
            1.0,
            2.0,
            3.0,
            4.0,
            5.0,
            6.0,
            7.0,
            8.0,
            9.0,
            10.0,
            11.0,
            12.0,
            13.0,
            14.0,
            15.0,
            16.0,
        ],
    )
    a = dd.from_pandas(df16, 16)

    # adding artificial delays to make sure the last tasks finish first
    # (i.e. simulating tasks completing out of order)
    def delayed_nop(i):
        if i[1] < 10:
            sleep(0.1 * (10 - i[1]))
        return i

    # saving to multiple hdf nodes
    with tmpfile() as fn:
        a = a.apply(delayed_nop, axis=1, meta=a)
        a.to_hdf(fn, "/data*")
        out = dd.read_hdf(fn, "/data*")
        assert_eq(df16, out)

    # saving to multiple hdf files
    # adding artificial delays to make sure last tasks finish first
    with tmpdir() as dn:
        fn = os.path.join(dn, "data*")
        a = a.apply(delayed_nop, axis=1, meta=a)
        a.to_hdf(fn, "/data")
        out = dd.read_hdf(fn, "/data")
        assert_eq(df16, out)
Example #42
def test_to_hdf_multiple_files():
    pytest.importorskip("tables")
    df = pd.DataFrame(
        {"x": ["a", "b", "c", "d"], "y": [1, 2, 3, 4]}, index=[1.0, 2.0, 3.0, 4.0]
    )
    a = dd.from_pandas(df, 2)
    df16 = pd.DataFrame(
        {
            "x": [
                "a",
                "b",
                "c",
                "d",
                "e",
                "f",
                "g",
                "h",
                "i",
                "j",
                "k",
                "l",
                "m",
                "n",
                "o",
                "p",
            ],
            "y": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16],
        },
        index=[
            1.0,
            2.0,
            3.0,
            4.0,
            5.0,
            6.0,
            7.0,
            8.0,
            9.0,
            10.0,
            11.0,
            12.0,
            13.0,
            14.0,
            15.0,
            16.0,
        ],
    )
    b = dd.from_pandas(df16, 16)

    # saving to multiple files
    with tmpdir() as dn:
        fn = os.path.join(dn, "data_*.h5")
        a.to_hdf(fn, "/data")
        out = dd.read_hdf(fn, "/data")
        assert_eq(df, out)

    # saving to multiple files making sure order is kept
    with tmpdir() as dn:
        fn = os.path.join(dn, "data_*.h5")
        b.to_hdf(fn, "/data")
        out = dd.read_hdf(fn, "/data")
        assert_eq(df16, out)

    # saving to multiple files with custom name_function
    with tmpdir() as dn:
        fn = os.path.join(dn, "data_*.h5")
        a.to_hdf(fn, "/data", name_function=lambda i: "a" * (i + 1))
        out = dd.read_hdf(fn, "/data")
        assert_eq(df, out)

        out = pd.read_hdf(os.path.join(dn, "data_a.h5"), "/data")
        tm.assert_frame_equal(out, df.iloc[:2])
        out = pd.read_hdf(os.path.join(dn, "data_aa.h5"), "/data")
        tm.assert_frame_equal(out, df.iloc[2:])

    # test hdf object
    with tmpfile("h5") as fn:
        with pd.HDFStore(fn) as hdf:
            a.to_hdf(hdf, "/data*")
        out = dd.read_hdf(fn, "/data*")
        assert_eq(df, out)
Example #43
def test_set_index_divisions_compute():
    d2 = d.set_index("b", divisions=[0, 2, 9], compute=False)
    d3 = d.set_index("b", divisions=[0, 2, 9], compute=True)

    assert_eq(d2, d3)
    assert_eq(d2, full.set_index("b"))
    assert_eq(d3, full.set_index("b"))
    assert len(d2.dask) > len(d3.dask)

    d4 = d.set_index(d.b, divisions=[0, 2, 9], compute=False)
    d5 = d.set_index(d.b, divisions=[0, 2, 9], compute=True)
    exp = full.copy()
    exp.index = exp.b
    assert_eq(d4, d5)
    assert_eq(d4, exp)
    assert_eq(d5, exp)
    assert len(d4.dask) > len(d5.dask)
Example #44
def test_getitem_period_str():

    df = pd.DataFrame(
        {
            "A": np.random.randn(100),
            "B": np.random.randn(100)
        },
        index=pd.period_range("2011-01-01", freq="H", periods=100),
    )
    ddf = dd.from_pandas(df, 10)

    # partial string slice
    # TODO(pandas) starting with pandas 1.2, __getitem__ with an implicit slice
    # is deprecated -> should we deprecate this in dask as well?
    if not PANDAS_GT_120:
        assert_eq(df["2011-01-02"], ddf["2011-01-02"])
    assert_eq(df["2011-01-02":"2011-01-10"], ddf["2011-01-02":"2011-01-10"])
    # same reso, dask result is always DataFrame

    df = pd.DataFrame(
        {
            "A": np.random.randn(100),
            "B": np.random.randn(100)
        },
        index=pd.period_range("2011-01-01", freq="D", periods=100),
    )
    ddf = dd.from_pandas(df, 50)
    if not PANDAS_GT_120:
        assert_eq(df["2011-01"], ddf["2011-01"])
        assert_eq(df["2011"], ddf["2011"])

    assert_eq(df["2011-01":"2012-05"], ddf["2011-01":"2012-05"])
    assert_eq(df["2011":"2015"], ddf["2011":"2015"])
Example #45
def test_loc2d_some_missing():
    with pytest.warns(FutureWarning):
        assert_eq(d.loc[[3, 4, 3], ["a"]], full.loc[[3, 4, 3], ["a"]])
Example #46
def test_loc_period_str():
    # .loc with PeriodIndex doesn't support partial string indexing
    # https://github.com/pydata/pandas/issues/13429
    # -> this started working in pandas 1.1
    df = pd.DataFrame(
        {
            "A": np.random.randn(100),
            "B": np.random.randn(100)
        },
        index=pd.period_range("2011-01-01", freq="H", periods=100),
    )
    ddf = dd.from_pandas(df, 10)

    # partial string slice
    assert_eq(df.loc["2011-01-02"], ddf.loc["2011-01-02"])
    assert_eq(df.loc["2011-01-02":"2011-01-10"],
              ddf.loc["2011-01-02":"2011-01-10"])
    # same reso, dask result is always DataFrame

    df = pd.DataFrame(
        {
            "A": np.random.randn(100),
            "B": np.random.randn(100)
        },
        index=pd.period_range("2011-01-01", freq="D", periods=100),
    )
    ddf = dd.from_pandas(df, 50)
    assert_eq(df.loc["2011-01"], ddf.loc["2011-01"])
    assert_eq(df.loc["2011"], ddf.loc["2011"])

    assert_eq(df.loc["2011-01":"2012-05"], ddf.loc["2011-01":"2012-05"])
    assert_eq(df.loc["2011":"2015"], ddf.loc["2011":"2015"])
Example #47
def test_loc_with_series():
    assert_eq(d.loc[d.a % 2 == 0], full.loc[full.a % 2 == 0])

    assert sorted(d.loc[d.a % 2].dask) == sorted(d.loc[d.a % 2].dask)
    assert sorted(d.loc[d.a % 2].dask) != sorted(d.loc[d.a % 3].dask)
Example #48
def test_getitem_timestamp_str():

    df = pd.DataFrame(
        {
            "A": np.random.randn(100),
            "B": np.random.randn(100)
        },
        index=pd.date_range("2011-01-01", freq="H", periods=100),
    )
    ddf = dd.from_pandas(df, 10)

    # partial string slice
    # TODO(pandas) starting with pandas 1.2, __getitem__ with an implicit slice
    # is deprecated -> should we deprecate this in dask as well?
    assert_eq(df.loc["2011-01-02"], ddf["2011-01-02"])
    assert_eq(df["2011-01-02":"2011-01-10"], ddf["2011-01-02":"2011-01-10"])

    df = pd.DataFrame(
        {
            "A": np.random.randn(100),
            "B": np.random.randn(100)
        },
        index=pd.date_range("2011-01-01", freq="D", periods=100),
    )
    ddf = dd.from_pandas(df, 50)
    assert_eq(df.loc["2011-01"], ddf["2011-01"])
    assert_eq(df.loc["2011"], ddf["2011"])

    assert_eq(df["2011-01":"2012-05"], ddf["2011-01":"2012-05"])
    assert_eq(df["2011":"2015"], ddf["2011":"2015"])
Example #49
def test_set_index_nan_partition():
    d[d.a > 3].set_index("a")  # Set index with 1 null partition
    d[d.a > 1].set_index("a", sorted=True)  # Set sorted index with 0 null partitions
    a = d[d.a > 3].set_index("a", sorted=True)  # Set sorted index with 1 null partition
    assert_eq(a, a)
Example #50
def test_loc_timestamp_str():

    df = pd.DataFrame(
        {
            "A": np.random.randn(100),
            "B": np.random.randn(100)
        },
        index=pd.date_range("2011-01-01", freq="H", periods=100),
    )
    ddf = dd.from_pandas(df, 10)

    # partial string slice
    assert_eq(df.loc["2011-01-02"], ddf.loc["2011-01-02"])
    assert_eq(df.loc["2011-01-02":"2011-01-10"],
              ddf.loc["2011-01-02":"2011-01-10"])
    # at the same resolution as the index, the dask result is always a DataFrame (pandas returns a Series)
    assert_eq(df.loc["2011-01-02 10:00"].to_frame().T,
              ddf.loc["2011-01-02 10:00"], **CHECK_FREQ)

    # series
    assert_eq(df.A.loc["2011-01-02"], ddf.A.loc["2011-01-02"], **CHECK_FREQ)
    assert_eq(df.A.loc["2011-01-02":"2011-01-10"],
              ddf.A.loc["2011-01-02":"2011-01-10"], **CHECK_FREQ)

    # slice with timestamp (dask result must be DataFrame)
    assert_eq(df.loc[pd.Timestamp("2011-01-02")].to_frame().T,
              ddf.loc[pd.Timestamp("2011-01-02")], **CHECK_FREQ)
    assert_eq(df.loc[pd.Timestamp("2011-01-02"):pd.Timestamp("2011-01-10")],
              ddf.loc[pd.Timestamp("2011-01-02"):pd.Timestamp("2011-01-10")],
              **CHECK_FREQ)
    assert_eq(df.loc[pd.Timestamp("2011-01-02 10:00")].to_frame().T,
              ddf.loc[pd.Timestamp("2011-01-02 10:00")], **CHECK_FREQ)

    df = pd.DataFrame(
        {
            "A": np.random.randn(100),
            "B": np.random.randn(100)
        },
        index=pd.date_range("2011-01-01", freq="M", periods=100),
    )
    ddf = dd.from_pandas(df, 50)
    assert_eq(df.loc["2011-01"], ddf.loc["2011-01"])
    assert_eq(df.loc["2011"], ddf.loc["2011"])

    assert_eq(df.loc["2011-01":"2012-05"], ddf.loc["2011-01":"2012-05"])
    assert_eq(df.loc["2011":"2015"], ddf.loc["2011":"2015"])

    # series
    assert_eq(df.B.loc["2011-01"], ddf.B.loc["2011-01"])
    assert_eq(df.B.loc["2011"], ddf.B.loc["2011"])

    assert_eq(df.B.loc["2011-01":"2012-05"], ddf.B.loc["2011-01":"2012-05"])
    assert_eq(df.B.loc["2011":"2015"], ddf.B.loc["2011":"2015"])
Example #51
def test_rolling_axis():
    df = pd.DataFrame(np.random.randn(20, 16))
    ddf = dd.from_pandas(df, npartitions=3)

    assert_eq(df.rolling(3, axis=0).mean(), ddf.rolling(3, axis=0).mean())
    assert_eq(df.rolling(3, axis=1).mean(), ddf.rolling(3, axis=1).mean())
    assert_eq(
        df.rolling(3, min_periods=1, axis=1).mean(),
        ddf.rolling(3, min_periods=1, axis=1).mean())
    assert_eq(
        df.rolling(3, axis='columns').mean(),
        ddf.rolling(3, axis='columns').mean())
    assert_eq(
        df.rolling(3, axis='rows').mean(),
        ddf.rolling(3, axis='rows').mean())

    s = df[3]
    ds = ddf[3]
    assert_eq(s.rolling(5, axis=0).std(), ds.rolling(5, axis=0).std())
Example #52
def test_loc():
    assert d.loc[3:8].divisions[0] == 3
    assert d.loc[3:8].divisions[-1] == 8

    assert d.loc[5].divisions == (5, 5)

    assert_eq(d.loc[5], full.loc[5:5])
    assert_eq(d.loc[3:8], full.loc[3:8])
    assert_eq(d.loc[:8], full.loc[:8])
    assert_eq(d.loc[3:], full.loc[3:])
    assert_eq(d.loc[[5]], full.loc[[5]])

    expected_warning = FutureWarning

    if not PANDAS_GT_100:
        # removed in pandas 1.0
        with pytest.warns(expected_warning):
            assert_eq(d.loc[[3, 4, 1, 8]], full.loc[[3, 4, 1, 8]])
        with pytest.warns(expected_warning):
            assert_eq(d.loc[[3, 4, 1, 9]], full.loc[[3, 4, 1, 9]])
        with pytest.warns(expected_warning):
            assert_eq(d.loc[np.array([3, 4, 1, 9])],
                      full.loc[np.array([3, 4, 1, 9])])

    assert_eq(d.a.loc[5], full.a.loc[5:5])
    assert_eq(d.a.loc[3:8], full.a.loc[3:8])
    assert_eq(d.a.loc[:8], full.a.loc[:8])
    assert_eq(d.a.loc[3:], full.a.loc[3:])
    assert_eq(d.a.loc[[5]], full.a.loc[[5]])
    if not PANDAS_GT_100:
        # removed in pandas 1.0
        with pytest.warns(expected_warning):
            assert_eq(d.a.loc[[3, 4, 1, 8]], full.a.loc[[3, 4, 1, 8]])
        with pytest.warns(expected_warning):
            assert_eq(d.a.loc[[3, 4, 1, 9]], full.a.loc[[3, 4, 1, 9]])
        with pytest.warns(expected_warning):
            assert_eq(d.a.loc[np.array([3, 4, 1, 9])],
                      full.a.loc[np.array([3, 4, 1, 9])])
    assert_eq(d.a.loc[[]], full.a.loc[[]])
    assert_eq(d.a.loc[np.array([])], full.a.loc[np.array([])])

    pytest.raises(KeyError, lambda: d.loc[1000])
    assert_eq(d.loc[1000:], full.loc[1000:])
    assert_eq(d.loc[-2000:-1000], full.loc[-2000:-1000])

    assert sorted(d.loc[5].dask) == sorted(d.loc[5].dask)
    assert sorted(d.loc[5].dask) != sorted(d.loc[6].dask)
Example #53
def test_roundtrip_from_dask(tmpdir):
    tmpdir = str(tmpdir)
    ddf.to_parquet(tmpdir, engine="pyarrow")
    files = sorted(
        [
            os.path.join(tmpdir, f)
            for f in os.listdir(tmpdir) if not f.endswith("_metadata")
        ],
        key=natural_sort_key,
    )

    # Read list of parquet files
    ddf2 = dask_cudf.read_parquet(files, gather_statistics=True)
    assert_eq(ddf, ddf2)

    # Specify columns=['x']
    ddf2 = dask_cudf.read_parquet(files, columns=["x"], gather_statistics=True)
    assert_eq(ddf[["x"]], ddf2)

    # Specify columns='y'
    ddf2 = dask_cudf.read_parquet(files, columns="y", gather_statistics=True)
    assert_eq(ddf[["y"]], ddf2)

    # Now include metadata; gather_statistics is True by default
    # Read list of parquet files
    ddf2 = dask_cudf.read_parquet(tmpdir)
    assert_eq(ddf, ddf2)

    # Specify columns=['x']
    ddf2 = dask_cudf.read_parquet(tmpdir, columns=["x"])
    assert_eq(ddf[["x"]], ddf2)

    # Specify columns='y'
    ddf2 = dask_cudf.read_parquet(tmpdir, columns="y")
    assert_eq(ddf[["y"]], ddf2)
Example #54
def test_getitem():
    df = pd.DataFrame(
        {
            "A": [1, 2, 3, 4, 5, 6, 7, 8, 9],
            "B": [9, 8, 7, 6, 5, 4, 3, 2, 1],
            "C": [True, False, True] * 3,
        },
        columns=list("ABC"),
    )
    ddf = dd.from_pandas(df, 2)
    assert_eq(ddf["A"], df["A"])
    # check cache consistency
    tm.assert_series_equal(ddf["A"]._meta, ddf._meta["A"])

    assert_eq(ddf[["A", "B"]], df[["A", "B"]])
    tm.assert_frame_equal(ddf[["A", "B"]]._meta, ddf._meta[["A", "B"]])

    assert_eq(ddf[ddf.C], df[df.C])
    tm.assert_series_equal(ddf.C._meta, ddf._meta.C)

    assert_eq(ddf[ddf.C.repartition([0, 2, 5, 8])], df[df.C])

    pytest.raises(KeyError, lambda: df["X"])
    pytest.raises(KeyError, lambda: df[["A", "X"]])
    pytest.raises(AttributeError, lambda: df.X)

    # not str/unicode
    df = pd.DataFrame(np.random.randn(10, 5))
    ddf = dd.from_pandas(df, 2)
    assert_eq(ddf[0], df[0])
    assert_eq(ddf[[1, 2]], df[[1, 2]])

    pytest.raises(KeyError, lambda: df[8])
    pytest.raises(KeyError, lambda: df[[1, 8]])
Example #55
def test_from_pandas_single_row():
    df = pd.DataFrame({"x": [1]}, index=[1])
    ddf = dd.from_pandas(df, npartitions=1)
    assert ddf.divisions == (1, 1)
    assert_eq(ddf, df)
Example #56
def test_loc2d_duplicated_columns():
    df = pd.DataFrame(
        np.random.randn(20, 5),
        index=list("abcdefghijklmnopqrst"),
        columns=list("AABCD"),
    )
    ddf = dd.from_pandas(df, 3)

    assert_eq(ddf.loc["a", "A"], df.loc[["a"], "A"])
    assert_eq(ddf.loc["a", ["A"]], df.loc[["a"], ["A"]])
    assert_eq(ddf.loc["j", "B"], df.loc[["j"], "B"])
    assert_eq(ddf.loc["j", ["B"]], df.loc[["j"], ["B"]])

    assert_eq(ddf.loc["a":"o", "A"], df.loc["a":"o", "A"])
    assert_eq(ddf.loc["a":"o", ["A"]], df.loc["a":"o", ["A"]])
    assert_eq(ddf.loc["j":"q", "B"], df.loc["j":"q", "B"])
    assert_eq(ddf.loc["j":"q", ["B"]], df.loc["j":"q", ["B"]])

    assert_eq(ddf.loc["a":"o", "B":"D"], df.loc["a":"o", "B":"D"])
    assert_eq(ddf.loc["a":"o", "B":"D"], df.loc["a":"o", "B":"D"])
    assert_eq(ddf.loc["j":"q", "B":"A"], df.loc["j":"q", "B":"A"])
    assert_eq(ddf.loc["j":"q", "B":"A"], df.loc["j":"q", "B":"A"])

    assert_eq(ddf.loc[ddf.B > 0, "B"], df.loc[df.B > 0, "B"])
    assert_eq(ddf.loc[ddf.B > 0, ["A", "C"]], df.loc[df.B > 0, ["A", "C"]])
Example #57
def test_roundtrip_from_pandas(tmpdir, write_engine, read_engine):
    fn = str(tmpdir.join('test.parquet'))
    df = pd.DataFrame({'x': [1, 2, 3]})
    df.to_parquet(fn, engine=write_engine)
    ddf = dd.read_parquet(fn, engine=read_engine)
    assert_eq(df, ddf)
Example #58
def test_loc2d_with_known_divisions():
    df = pd.DataFrame(
        np.random.randn(20, 5),
        index=list("abcdefghijklmnopqrst"),
        columns=list("ABCDE"),
    )
    ddf = dd.from_pandas(df, 3)

    assert_eq(ddf.loc["a", "A"], df.loc[["a"], "A"])
    assert_eq(ddf.loc["a", ["A"]], df.loc[["a"], ["A"]])
    assert_eq(ddf.loc["a":"o", "A"], df.loc["a":"o", "A"])
    assert_eq(ddf.loc["a":"o", ["A"]], df.loc["a":"o", ["A"]])
    assert_eq(ddf.loc[["n"], ["A"]], df.loc[["n"], ["A"]])
    assert_eq(ddf.loc[["a", "c", "n"], ["A"]], df.loc[["a", "c", "n"], ["A"]])
    assert_eq(ddf.loc[["t", "b"], ["A"]], df.loc[["t", "b"], ["A"]])
    assert_eq(
        ddf.loc[["r", "r", "c", "g", "h"], ["A"]],
        df.loc[["r", "r", "c", "g", "h"], ["A"]],
    )
Example #59
def test_set_index_sorted_single_partition():
    df = pd.DataFrame({"x": [1, 2, 3, 4], "y": [1, 0, 1, 0]})
    ddf = dd.from_pandas(df, npartitions=1)
    assert_eq(ddf.set_index("x", sorted=True), df.set_index("x"))
Example #60
def test_set_index_drop(drop):
    pdf = pd.DataFrame(
        {
            "A": list("ABAABBABAA"),
            "B": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
            "C": [1, 2, 3, 2, 1, 3, 2, 4, 2, 3],
        }
    )
    ddf = dd.from_pandas(pdf, 3)

    assert_eq(ddf.set_index("A", drop=drop), pdf.set_index("A", drop=drop))
    assert_eq(ddf.set_index("B", drop=drop), pdf.set_index("B", drop=drop))
    assert_eq(ddf.set_index("C", drop=drop), pdf.set_index("C", drop=drop))
    assert_eq(ddf.set_index(ddf.A, drop=drop), pdf.set_index(pdf.A, drop=drop))
    assert_eq(ddf.set_index(ddf.B, drop=drop), pdf.set_index(pdf.B, drop=drop))
    assert_eq(ddf.set_index(ddf.C, drop=drop), pdf.set_index(pdf.C, drop=drop))

    # numeric columns
    pdf = pd.DataFrame(
        {
            0: list("ABAABBABAA"),
            1: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
            2: [1, 2, 3, 2, 1, 3, 2, 4, 2, 3],
        }
    )
    ddf = dd.from_pandas(pdf, 3)
    assert_eq(ddf.set_index(0, drop=drop), pdf.set_index(0, drop=drop))
    assert_eq(ddf.set_index(2, drop=drop), pdf.set_index(2, drop=drop))