Example 1
def test_categorical_compare_ordered(data):
    cat1 = data[0]
    cat2 = data[1]
    pdsr1 = pd.Series(cat1)
    pdsr2 = pd.Series(cat2)
    sr1 = Series(cat1)
    sr2 = Series(cat2)
    dsr1 = dgd.from_pygdf(sr1, npartitions=2)
    dsr2 = dgd.from_pygdf(sr2, npartitions=2)

    # Test equality
    out = dsr1 == dsr1
    assert out.dtype == np.bool_
    assert np.all(out.compute())
    assert np.all(pdsr1 == pdsr1)

    # Test inequality
    out = dsr1 != dsr1
    assert not np.any(out.compute())
    assert not np.any(pdsr1 != pdsr1)

    assert dsr1.cat.ordered
    assert pdsr1.cat.ordered

    # Test ordered operators
    np.testing.assert_array_equal(pdsr1 < pdsr2, (dsr1 < dsr2).compute())
    np.testing.assert_array_equal(pdsr1 > pdsr2, (dsr1 > dsr2).compute())
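Note: the examples on this page are excerpted from the dask_gdf test suite, so their module-level imports are omitted. A plausible shared header, reconstructed from the names the snippets use (an assumption, not the verbatim original), is:

from functools import partial

import numpy as np
import pandas as pd
import pytest
import dask
from numba import cuda
from pandas.util.testing import assert_frame_equal, assert_series_equal

import pygdf
import pygdf as gd  # some of the source files alias pygdf as gd
from pygdf import Series

import dask_gdf as dgd

Helpers such as _gen_uniform_keys, _fragmented_gdf and assert_frame_equal_by_index_group are local utilities defined elsewhere in the same test files.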
Example 2
def test_join_left(left_nrows, right_nrows, left_nkeys, right_nkeys, how):
    chunksize = 50

    np.random.seed(0)

    # PyGDF
    left = gd.DataFrame({'x': np.random.randint(0, left_nkeys,
                                                size=left_nrows),
                         'a': np.arange(left_nrows, dtype=np.float64)}.items())
    right = gd.DataFrame({'x': np.random.randint(0, right_nkeys,
                                                 size=right_nrows),
                          'a': 1000 * np.arange(right_nrows,
                                                dtype=np.float64)}.items())

    expect = left.set_index('x').join(right.set_index('x'), how=how,
                                      sort=True, lsuffix='l', rsuffix='r')
    expect = expect.to_pandas()

    # Dask GDF
    left = dgd.from_pygdf(left, chunksize=chunksize)
    right = dgd.from_pygdf(right, chunksize=chunksize)

    joined = left.set_index('x').join(right.set_index('x'), how=how,
                                      lsuffix='l', rsuffix='r')
    got = joined.compute().to_pandas()

    # Check index
    np.testing.assert_array_equal(expect.index.values,
                                  got.index.values)

    # Check rows in each group
    expect_rows = {}
    got_rows = {}

    def gather(df, grows):
        cola = np.sort(np.asarray(df.al))
        colb = np.sort(np.asarray(df.ar))

        grows[df['index'].values[0]] = (cola, colb)

    expect.reset_index().groupby('index')\
        .apply(partial(gather, grows=expect_rows))

    got.reset_index().groupby('index')\
        .apply(partial(gather, grows=got_rows))

    for k in expect_rows:
        np.testing.assert_array_equal(expect_rows[k][0],
                                      got_rows[k][0])
        np.testing.assert_array_equal(expect_rows[k][1],
                                      got_rows[k][1])
Example 3
def test_assign():
    np.random.seed(0)
    df = pd.DataFrame({'x': np.random.randint(0, 5, size=20),
                       'y': np.random.normal(size=20)})

    dgf = dgd.from_pygdf(gd.DataFrame.from_pandas(df), npartitions=2)
    pdcol = pd.Series(np.arange(20) + 1000)
    newcol = dgd.from_pygdf(gd.Series(pdcol),
                            npartitions=dgf.npartitions)
    out = dgf.assign(z=newcol)

    got = out.compute().to_pandas()
    assert_frame_equal(got.loc[:, ['x', 'y']], df)
    np.testing.assert_array_equal(got['z'], pdcol)
Example 4
def test_categorical_compare_unordered(data):
    cat = data.copy()
    pdsr = pd.Series(cat)
    sr = Series(cat)
    dsr = dgd.from_pygdf(sr, npartitions=2)

    # Test equality
    out = dsr == dsr
    assert out.dtype == np.bool_
    assert np.all(out.compute())
    assert np.all(pdsr == pdsr)

    # Test inequality
    out = dsr != dsr
    assert not np.any(out.compute())
    assert not np.any(pdsr != pdsr)

    assert not dsr.cat.ordered.compute()
    assert not pdsr.cat.ordered

    with pytest.raises((TypeError, ValueError)) as raises:
        pdsr < pdsr

    raises.match("Unordered Categoricals can only compare equality or not")

    with pytest.raises((TypeError, ValueError)) as raises:
        dsr < dsr

    raises.match("Unordered Categoricals can only compare equality or not")
Example 5
def test_series_to_delayed():
    nelem = 100

    sr = gd.Series(np.random.randint(nelem, size=nelem))

    dsr = dgd.from_pygdf(sr, npartitions=5)

    delays = dsr.to_delayed()

    assert len(delays) == 5

    # Concat the delayed partitions
    got = gd.concat([d.compute() for d in delays])
    assert isinstance(got, gd.Series)
    np.testing.assert_array_equal(got.to_pandas(), sr.to_pandas())

    # Check individual partitions
    divs = dsr.divisions
    assert len(divs) == len(delays) + 1

    for i, part in enumerate(delays):
        s = divs[i]
        # The last partition extends to the end of the index
        e = None if i + 1 == len(delays) else divs[i + 1]
        expect = sr[s:e].to_pandas()
        got = part.compute().to_pandas()
        np.testing.assert_array_equal(got, expect)
Example 6
def test_repeated_groupby():
    np.random.seed(0)

    nelem = 100
    df = pd.DataFrame()
    df['a'] = _gen_uniform_keys(nelem)
    df['b'] = _gen_uniform_keys(nelem)

    ref_df = gd.DataFrame.from_pandas(df)
    df = dgd.from_pygdf(ref_df, npartitions=3)
    assert df.known_divisions

    df2 = df.groupby('a').apply(lambda x: x)
    assert not df2.known_divisions

    got = df2.groupby('a').apply(lambda x: x).compute().to_pandas()
    expect = ref_df.groupby('a').apply(lambda x: x).to_pandas()

    def sort_content(df):
        return sorted(list(df.b))

    got = got.groupby('a').apply(sort_content)
    expect = expect.groupby('a').apply(sort_content)

    pd.util.testing.assert_series_equal(got, expect)
Example 7
def test_groupby_multi_keys(keygen):
    np.random.seed(0)

    nelem = 500
    npartitions = 10

    # Generate the keys
    xs = keygen(nelem)
    ys = keygen(nelem)

    assert xs.size == nelem
    assert ys.size == nelem
    df = pd.DataFrame({
        'x': xs,
        'y': ys,
        'z': np.random.normal(size=nelem) + 1
    })

    gdf = gd.DataFrame.from_pandas(df)
    dgf = dgd.from_pygdf(gdf, npartitions=npartitions)

    groups = dgf.groupby(by=['x', 'y']).count()
    got = groups.compute().to_pandas()

    # Check against expectation
    expect = df.groupby(by=['x', 'y'], as_index=False).count()
    # Check keys
    np.testing.assert_array_equal(got.x, expect.x)
    np.testing.assert_array_equal(got.y, expect.y)
    # Check values
    np.testing.assert_array_equal(got.z, expect.z)
Example 8
def test_dataframe_to_delayed():
    nelem = 100

    df = gd.DataFrame()
    df['x'] = np.arange(nelem)
    df['y'] = np.random.randint(nelem, size=nelem)

    ddf = dgd.from_pygdf(df, npartitions=5)

    delays = ddf.to_delayed()

    assert len(delays) == 5

    # Concat the delayed partitions
    got = gd.concat([d.compute() for d in delays])
    assert_frame_equal(got.to_pandas(), df.to_pandas())

    # Check individual partitions
    divs = ddf.divisions
    assert len(divs) == len(delays) + 1

    for i, part in enumerate(delays):
        s = divs[i]
        # The last partition extends to the end of the index
        e = None if i + 1 == len(delays) else divs[i + 1]
        expect = df[s:e].to_pandas()
        got = part.compute().to_pandas()
        assert_frame_equal(got, expect)
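dgd.from_delayed (exercised by the metadata-mismatch examples further down) is the inverse of to_delayed. A minimal round-trip sketch reusing ddf and df from the example above, illustrative rather than part of the original test:

# Hypothetical round-trip: rebuild the collection from its delayed
# partitions and check that nothing was lost along the way.
rebuilt = dgd.from_delayed(ddf.to_delayed())
assert_frame_equal(rebuilt.compute().to_pandas(), df.to_pandas())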
Example 9
def test_categorical_basic(data):
    cat = data.copy()
    pdsr = pd.Series(cat)
    sr = Series(cat)
    dsr = dgd.from_pygdf(sr, npartitions=2)
    result = dsr.compute()
    np.testing.assert_array_equal(cat.codes, result.to_array())
    assert dsr.dtype == pdsr.dtype

    # Test attributes
    assert pdsr.cat.ordered == dsr.cat.ordered.compute()
    # TODO: Investigate dsr.cat.categories: It raises
    # ValueError: Expected iterable of tuples of (name, dtype),
    # got ('a', 'b', 'c')
    # assert(tuple(pdsr.cat.categories) == tuple(dsr.cat.categories))

    np.testing.assert_array_equal(pdsr.cat.codes.values, result.to_array())
    assert pdsr.cat.codes.dtype == dsr.cat.codes.dtype

    string = str(result)
    expect_str = """
0 a
1 a
2 b
3 c
4 a
"""
    assert all(x == y for x, y in zip(string.split(), expect_str.split()))
Example 10
def test_dt_series(data, field):
    pdsr = pd.Series(data.copy())
    sr = Series(pdsr)
    dsr = dgd.from_pygdf(sr, npartitions=5)
    base = getattr(pdsr.dt, field)
    test = getattr(dsr.dt, field).compute()\
                                 .to_pandas().astype('int64')
    assert_series_equal(base, test)
Example 11
def test_head():
    np.random.seed(0)
    df = pd.DataFrame({'x': np.random.randint(0, 5, size=100),
                       'y': np.random.normal(size=100)})
    gdf = gd.DataFrame.from_pandas(df)
    dgf = dgd.from_pygdf(gdf, npartitions=2)

    assert_frame_equal(dgf.head().to_pandas(), df.head())
Example 12
def _make_random_frame(nelem, npartitions=2):
    df = pd.DataFrame({
        'x': np.random.randint(0, 5, size=nelem),
        'y': np.random.normal(size=nelem) + 1
    })
    gdf = gd.DataFrame.from_pandas(df)
    dgf = dgd.from_pygdf(gdf, npartitions=npartitions)
    return df, dgf
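A hypothetical caller of this helper, mirroring the head() comparison from the previous example (the test name and body are illustrative, not from the original suite):

def test_head_matches_pandas():
    # Build matched pandas and dask_gdf frames, then compare a cheap
    # operation on both, as test_head does above.
    df, dgf = _make_random_frame(nelem=100)
    assert_frame_equal(dgf.head().to_pandas(), df.head())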
Example 13
def test_dt_series(data, field):
    pd_data = pd.Series(data.copy())
    gdf_data = Series(pd_data)
    dask_gdf_data = dgd.from_pygdf(gdf_data, npartitions=5)
    base = getattr(pd_data.dt, field)
    test = getattr(dask_gdf_data.dt, field).compute()\
                                           .to_pandas().astype('int64')
    assert_series_equal(base, test)
Example 14
def test_series(data):
    pd_data = pd.Series(data.copy())
    gdf_data = Series(pd_data)
    dask_gdf_data = dgd.from_pygdf(gdf_data, npartitions=5)

    np.testing.assert_equal(
        np.array(pd_data),
        np.array(dask_gdf_data.compute()),
    )
Example 15
def test_sort_values(nelem, nparts, by):
    df = pygdf.DataFrame()
    df['a'] = np.ascontiguousarray(np.arange(nelem)[::-1])
    df['b'] = np.arange(100, nelem + 100)
    ddf = dgd.from_pygdf(df, npartitions=nparts)

    got = ddf.sort_values(by=by).compute().to_pandas()
    expect = df.sort_values(by=by).to_pandas().reset_index(drop=True)
    pd.util.testing.assert_frame_equal(got, expect)
Example 16
def test_frame_extra_columns_error():
    nelem = 20

    df = gd.DataFrame()
    df['x'] = np.arange(nelem)
    df['y'] = np.random.randint(nelem, size=nelem)
    ddf1 = dgd.from_pygdf(df, npartitions=5)

    df['z'] = np.arange(nelem)
    ddf2 = dgd.from_pygdf(df, npartitions=5)

    combined = dgd.from_delayed(ddf1.to_delayed() + ddf2.to_delayed())

    with pytest.raises(ValueError) as raises:
        combined.compute()

    raises.match(r"^Metadata mismatch found in `from_delayed`.")
    raises.match(r"extra columns")
Example 17
def test_series(data):
    pdsr = pd.Series(data.copy())
    sr = Series(pdsr)
    dsr = dgd.from_pygdf(sr, npartitions=5)

    np.testing.assert_equal(
        np.array(pdsr),
        np.array(dsr.compute()),
    )
Example 18
def test_join_inner(left_nrows, right_nrows, left_nkeys, right_nkeys):
    chunksize = 50

    np.random.seed(0)

    # PyGDF
    left = gd.DataFrame({'x': np.random.randint(0, left_nkeys,
                                                size=left_nrows),
                         'a': np.arange(left_nrows)}.items())
    right = gd.DataFrame({'x': np.random.randint(0, right_nkeys,
                                                 size=right_nrows),
                          'a': 1000 * np.arange(right_nrows)}.items())

    expect = left.set_index('x').join(right.set_index('x'), how='inner',
                                      sort=True, lsuffix='l', rsuffix='r')
    expect = expect.to_pandas()

    # Dask GDF
    left = dgd.from_pygdf(left, chunksize=chunksize)
    right = dgd.from_pygdf(right, chunksize=chunksize)

    joined = left.set_index('x').join(right.set_index('x'), how='inner',
                                      lsuffix='l', rsuffix='r')
    got = joined.compute().to_pandas()

    # Check index
    np.testing.assert_array_equal(expect.index.values,
                                  got.index.values)

    # Check rows in each group
    expect_rows = {}
    got_rows = {}

    def gather(df, grows):
        grows[df['index'].values[0]] = (set(df.al), set(df.ar))

    expect.reset_index().groupby('index')\
        .apply(partial(gather, grows=expect_rows))

    got.reset_index().groupby('index')\
        .apply(partial(gather, grows=got_rows))

    assert got_rows == expect_rows
Example 19
def test_merge_left(left_nrows, right_nrows, left_nkeys, right_nkeys,
                    how='left'):
    chunksize = 3

    np.random.seed(0)

    # PyGDF
    left = gd.DataFrame({'x': np.random.randint(0, left_nkeys,
                                                size=left_nrows),
                         'y': np.random.randint(0, left_nkeys,
                                                size=left_nrows),
                         'a': np.arange(left_nrows, dtype=np.float64)}.items())
    right = gd.DataFrame({'x': np.random.randint(0, right_nkeys,
                                                 size=right_nrows),
                          'y': np.random.randint(0, right_nkeys,
                                                 size=right_nrows),
                          'a': 1000 * np.arange(right_nrows,
                                                dtype=np.float64)}.items())


    expect = left.merge(right, on=('x', 'y'), how=how)
    expect = expect.to_pandas().sort_values(['x', 'y', 'a_x', 'a_y'])\
        .reset_index(drop=True)

    print("Expect".center(80, '='))
    print(expect)

    # Dask GDF
    left = dgd.from_pygdf(left, chunksize=chunksize)
    right = dgd.from_pygdf(right, chunksize=chunksize)

    joined = left.merge(right, on=('x', 'y'), how=how)

    print("Got".center(80, '='))
    got = joined.compute().to_pandas()

    got = got.sort_values(['x', 'y', 'a_x', 'a_y']).reset_index(drop=True)

    pd.util.testing.assert_frame_equal(expect, got)
Example 20
def test_set_index_2(nelem):
    np.random.seed(0)
    df = pd.DataFrame({'x': 100 + np.random.randint(0, nelem//2, size=nelem),
                       'y': np.random.normal(size=nelem)})
    expect = df.set_index('x').sort_index()

    dgf = dgd.from_pygdf(gd.DataFrame.from_pandas(df), npartitions=4)
    res = dgf.set_index('x')  # sort by default
    got = res.compute().to_pandas()

    assert_frame_equal_by_index_group(expect, got)
Example 21
def test_setitem_scalar_float(data_type):
    np.random.seed(0)
    scalar = np.random.randn(1).astype(data_type)[0]
    df = pd.DataFrame({'x': np.random.randint(0, 5, size=20),
                       'y': np.random.normal(size=20)})
    dgf = dgd.from_pygdf(gd.DataFrame.from_pandas(df), npartitions=2)

    df['z'] = scalar
    dgf['z'] = scalar

    got = dgf.compute().to_pandas()
    np.testing.assert_array_equal(got['z'], df['z'])
Example 22
def test_setitem_scalar_datetime():
    np.random.seed(0)
    scalar = np.int64(np.random.randint(0, 100)).astype('datetime64[ms]')
    df = pd.DataFrame({'x': np.random.randint(0, 5, size=20),
                       'y': np.random.normal(size=20)})
    dgf = dgd.from_pygdf(gd.DataFrame.from_pandas(df), npartitions=2)

    df['z'] = scalar
    dgf['z'] = scalar

    got = dgf.compute().to_pandas()
    np.testing.assert_array_equal(got['z'], df['z'])
Example 23
def test_frame_dtype_error():
    nelem = 20

    df1 = gd.DataFrame()
    df1['bad'] = np.arange(nelem)
    df1['bad'] = np.arange(nelem, dtype=np.float64)

    df2 = gd.DataFrame()
    df2['bad'] = np.arange(nelem)
    df2['bad'] = np.arange(nelem, dtype=np.float32)

    ddf1 = dgd.from_pygdf(df1, npartitions=5)
    ddf2 = dgd.from_pygdf(df2, npartitions=5)

    combined = dgd.from_delayed(ddf1.to_delayed() + ddf2.to_delayed())

    with pytest.raises(ValueError) as raises:
        combined.compute()

    print("out")
    raises.match(r"^Metadata mismatch found in `from_delayed`.")
    raises.match(r"\s+\|\s+".join(['bad', 'float32', 'float64']))
Example 24
def test_from_pygdf():
    np.random.seed(0)

    df = pd.DataFrame({'x': np.random.randint(0, 5, size=10000),
                       'y': np.random.normal(size=10000)})

    gdf = gd.DataFrame.from_pandas(df)

    # Test a simple round trip to/from dask
    ingested = dgd.from_pygdf(gdf, npartitions=2)
    assert_frame_equal(ingested.compute().to_pandas(), df)

    # Test conversion to dask.dataframe
    ddf = ingested.to_dask_dataframe()
    assert_frame_equal(ddf.compute(), df)
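As the join and merge examples above show, from_pygdf also accepts chunksize (rows per partition) in place of npartitions. A short sketch of the two spellings, reusing the 10000-row gdf from this example:

# Two ways to partition the same pygdf DataFrame: by partition count,
# or by rows per partition. Both forms appear in these tests.
by_count = dgd.from_pygdf(gdf, npartitions=2)
by_chunk = dgd.from_pygdf(gdf, chunksize=5000)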
Example 25
def test_groupby_apply_grouped():
    np.random.seed(0)

    nelem = 100
    xs = _gen_uniform_keys(nelem)
    ys = _gen_uniform_keys(nelem)
    df = pd.DataFrame({
        'x': xs,
        'y': ys,
        'idx': np.arange(nelem),
        'v1': np.random.normal(size=nelem),
        'v2': np.random.normal(size=nelem)
    })

    gdf = gd.DataFrame.from_pandas(df)
    dgf = dgd.from_pygdf(gdf, npartitions=2)

    def transform(y, v1, v2, out1):
        for i in range(cuda.threadIdx.x, y.size, cuda.blockDim.x):
            out1[i] = y[i] * (v1[i] + v2[i])

    grouped = dgf.groupby(by=['x', 'y']).apply_grouped(
        transform,
        incols=['y', 'v1', 'v2'],
        outcols={'out1': np.float64},
    )

    # Compute with dask
    dgd_grouped = grouped.compute().to_pandas()
    binning = {}
    for _, row in dgd_grouped.iterrows():
        binning[row.idx] = row

    # Emulate the operation with pandas
    def emulate(df):
        df['out1'] = df.y * (df.v1 + df.v2)
        return df

    pd_groupby = df.groupby(by=['x', 'y'], sort=True,
                            as_index=True).apply(emulate)

    # Check the result
    for _, expect in pd_groupby.iterrows():
        got = binning[expect.idx]

        attrs = ['x', 'y', 'v1', 'v2', 'out1']
        for a in attrs:
            np.testing.assert_equal(getattr(got, a), getattr(expect, a))
Example 26
def test_query():
    np.random.seed(0)

    df = pd.DataFrame({'x': np.random.randint(0, 5, size=10),
                       'y': np.random.normal(size=10)})
    gdf = gd.DataFrame.from_pandas(df)
    expr = 'x > 2'

    assert_frame_equal(gdf.query(expr).to_pandas(), df.query(expr))

    queried = dgd.from_pygdf(gdf, npartitions=2).query(expr)

    got = queried.compute().to_pandas()
    expect = gdf.query(expr).to_pandas()

    assert_frame_equal(got, expect)
Example 27
def test_mixing_series_frame_error():
    nelem = 20

    df = gd.DataFrame()
    df['x'] = np.arange(nelem)
    df['y'] = np.random.randint(nelem, size=nelem)

    ddf = dgd.from_pygdf(df, npartitions=5)

    delay_frame = ddf.to_delayed()
    delay_series = ddf.x.to_delayed()
    combined = dgd.from_delayed(delay_frame + delay_series)

    with pytest.raises(ValueError) as raises:
        combined.compute()

    raises.match(r"^Metadata mismatch found in `from_delayed`.")
    raises.match(r"Expected partition of type `DataFrame` but got `Series`")
Example 28
def test_append():
    np.random.seed(0)

    n = 1000
    df = pd.DataFrame({'x': np.random.randint(0, 5, size=n),
                       'y': np.random.normal(size=n)})

    gdf = gd.DataFrame.from_pandas(df)
    frags = _fragmented_gdf(gdf, nsplit=13)

    # Combine with .append
    head = frags[0]
    tail = frags[1:]

    appended = dgd.from_pygdf(head, npartitions=1)
    for each in tail:
        appended = appended.append(each)

    assert_frame_equal(df, appended.compute().to_pandas())
Example 29
def test_series_append():
    np.random.seed(0)

    n = 1000
    df = pd.DataFrame({'x': np.random.randint(0, 5, size=n),
                       'y': np.random.normal(size=n)})

    gdf = gd.DataFrame.from_pandas(df)
    frags = _fragmented_gdf(gdf, nsplit=13)

    frags = [frag.x for frag in frags]

    appending = dgd.from_pygdf(frags[0], npartitions=1)
    for frag in frags[1:]:
        appending = appending.append(frag)

    appended = appending.compute().to_pandas()
    assert isinstance(appended, pd.Series)
    np.testing.assert_array_equal(appended, df.x)
Example 30
def test_sort_values_binned():
    np.random.seed(43)
    nelem = 100
    nparts = 5
    by = 'a'
    df = pygdf.DataFrame()
    df['a'] = np.random.randint(1, 5, nelem)
    ddf = dgd.from_pygdf(df, npartitions=nparts)

    parts = ddf.sort_values_binned(by=by).to_delayed()
    part_uniques = []
    for p in parts:
        part = dask.compute(p)[0]
        part_uniques.append(set(part.a.unique()))

    # Partitions do not have intersecting keys
    for i in range(len(part_uniques)):
        for j in range(i + 1, len(part_uniques)):
            assert not (part_uniques[i] & part_uniques[j]), \
                    "should have empty intersection"