Ejemplo n.º 1
0
def test_dataframe_from_delayed():
    """Check that ``from_delayed`` turns delayed frames into one DataFrame.

    The computed collection must be a ``gd.DataFrame`` and must equal the
    direct concatenation of the individually computed partitions.
    """
    delays = [load_data(10 * k, k) for k in (1, 2)]
    computed = dgd.from_delayed(delays).compute()
    assert isinstance(computed, gd.DataFrame)

    # Reference result: materialize every partition and concatenate directly.
    reference = gd.concat([part.compute() for part in delays])
    assert_frame_equal(computed.to_pandas(), reference.to_pandas())
Ejemplo n.º 2
0
def test_series_from_delayed():
    """Check that ``from_delayed`` turns delayed columns into one Series.

    The computed collection must be a ``gd.Series`` whose values equal the
    direct concatenation of the individually computed partitions.
    """
    delays = [get_combined_column(load_data(10 * k, k)) for k in (1, 2)]
    computed = dgd.from_delayed(delays).compute()
    assert isinstance(computed, gd.Series)

    # Reference result: materialize every partition and concatenate directly.
    reference = gd.concat([part.compute() for part in delays])
    np.testing.assert_array_equal(computed.to_pandas(), reference.to_pandas())
Ejemplo n.º 3
0
def test_frame_extra_columns_error():
    """Partitions with differing column sets must fail at compute time.

    Two dask frames are built from the same pygdf frame, the second after a
    ``z`` column has been added.  Their delayed partitions are combined with
    ``from_delayed`` and ``compute`` is expected to raise ``ValueError`` whose
    message reports the metadata mismatch and the extra columns.
    """
    nelem = 20

    df = gd.DataFrame()
    df['x'] = np.arange(nelem)
    df['y'] = np.random.randint(nelem, size=nelem)
    ddf1 = dgd.from_pygdf(df, npartitions=5)

    # 'z' is added only after ddf1 was created, so ddf2 is intended to carry
    # one more column than ddf1.
    df['z'] = np.arange(nelem)
    ddf2 = dgd.from_pygdf(df, npartitions=5)

    combined = dgd.from_delayed(ddf1.to_delayed() + ddf2.to_delayed())

    # Only the raised error matters; the result is deliberately not bound.
    with pytest.raises(ValueError) as raises:
        combined.compute()

    raises.match(r"^Metadata mismatch found in `from_delayed`.")
    raises.match(r"extra columns")
Ejemplo n.º 4
0
def test_mixing_series_frame_error():
    """Mixing DataFrame and Series partitions must fail at compute time.

    Delayed partitions of a dask frame and of its ``x`` column are combined
    with ``from_delayed``; ``compute`` is expected to raise ``ValueError``
    reporting that a ``DataFrame`` partition was expected but a ``Series``
    was found.
    """
    nelem = 20

    df = gd.DataFrame()
    df['x'] = np.arange(nelem)
    df['y'] = np.random.randint(nelem, size=nelem)

    ddf = dgd.from_pygdf(df, npartitions=5)

    # Frame partitions come first, followed by the Series partitions.
    delay_frame = ddf.to_delayed()
    delay_series = ddf.x.to_delayed()
    combined = dgd.from_delayed(delay_frame + delay_series)

    # Only the raised error matters; the result is deliberately not bound.
    with pytest.raises(ValueError) as raises:
        combined.compute()

    raises.match(r"^Metadata mismatch found in `from_delayed`.")
    raises.match(r"Expected partition of type `DataFrame` but got `Series`")
Ejemplo n.º 5
0
def test_frame_dtype_error():
    """Partitions whose column dtypes differ must fail at compute time.

    Two frames share a column named ``bad`` but with ``float64`` and
    ``float32`` data respectively.  Their delayed partitions are combined
    with ``from_delayed`` and ``compute`` is expected to raise ``ValueError``
    whose message lists the column together with both dtypes.
    """
    nelem = 20

    # NOTE(review): in both frames 'bad' is first assigned with the default
    # dtype and then immediately reassigned with an explicit float dtype.
    # This looks redundant but may be deliberate (exercising column
    # replacement) -- kept as-is, confirm before removing.
    df1 = gd.DataFrame()
    df1['bad'] = np.arange(nelem)
    df1['bad'] = np.arange(nelem, dtype=np.float64)

    df2 = gd.DataFrame()
    df2['bad'] = np.arange(nelem)
    df2['bad'] = np.arange(nelem, dtype=np.float32)

    ddf1 = dgd.from_pygdf(df1, npartitions=5)
    ddf2 = dgd.from_pygdf(df2, npartitions=5)

    combined = dgd.from_delayed(ddf1.to_delayed() + ddf2.to_delayed())

    with pytest.raises(ValueError) as raises:
        combined.compute()

    raises.match(r"^Metadata mismatch found in `from_delayed`.")
    # The message is expected to contain "bad | float32 | float64" with
    # arbitrary whitespace around the separators.
    raises.match(r"\s+\|\s+".join(['bad', 'float32', 'float64']))
Ejemplo n.º 6
0
def join_frames(left, right, on, how, lsuffix, rsuffix):
    """Join two frames on 1 or more columns.

    Both inputs are regrouped into the same number of partitions with
    ``group_frame`` / ``fanout_subgroups``, the matching subgroups are
    concatenated, and each left/right pair is merged in its own delayed
    task.  The delayed results are assembled with ``dask_gdf.from_delayed``.

    Parameters
    ----------
    left, right : dask_gdf.DataFrame
    on : tuple[str]
        key column(s)
    how : str
        Join method.  Only ``'left'`` is supported.
    lsuffix, rsuffix : str
        Suffixes handed to ``_fix_name`` for value columns whose names occur
        in both frames.

    Returns
    -------
    The collection returned by ``dask_gdf.from_delayed`` for the merged
    partitions, using an empty frame with the output columns as ``meta``.

    Raises
    ------
    AssertionError
        If ``how`` is not ``'left'``.
    ValueError
        If the frames share value-column names and neither ``lsuffix`` nor
        ``rsuffix`` is given.
    """
    # Explicit check rather than a bare ``assert``: asserts are stripped
    # under ``python -O``, which would let unsupported join methods run
    # through the left-join-only handling below.  AssertionError is kept so
    # callers observe the same exception type as before.
    if how != 'left':
        raise AssertionError(
            "join_frames only supports how='left', got {!r}".format(how)
        )

    def fix_left(df):
        # Build the left-join output for a left partition that has no
        # matching right partition: keys and left values are copied, right
        # value columns are filled with nulls.
        newdf = gd.DataFrame()
        df = df.reset_index()
        for k in on:
            newdf[k] = df[k]
        for k in left_val_names:
            newdf[fix_name(k, lsuffix)] = df[k]
        for k in right_val_names:
            newdf[fix_name(k, rsuffix)] = nullcolumn(len(df), dtypes[k])
        return newdf

    def nullcolumn(nelem, dtype):
        # Series of ``nelem`` entries of ``dtype`` with an all-zero mask and
        # ``null_count`` equal to its length.
        data = np.zeros(nelem, dtype=dtype)
        mask_size = gd.utils.calc_chunk_size(
            data.size,
            gd.utils.mask_bitsize,
        )
        mask = np.zeros(mask_size, dtype=gd.utils.mask_dtype)
        sr = gd.Series.from_masked_array(
            data=data,
            mask=mask,
            null_count=data.size,
        )
        return sr

    def make_empty():
        # Zero-row frame carrying the output column names and dtypes.
        df = gd.DataFrame()
        for k in on:
            df[k] = np.asarray([], dtype=dtypes[k])
        for k in left_val_names:
            df[fix_name(k, lsuffix)] = np.asarray([], dtype=dtypes[k])
        for k in right_val_names:
            df[fix_name(k, rsuffix)] = np.asarray([], dtype=dtypes[k])
        return df

    def merge(left, right):
        # ``None`` stands for an empty side of the pair.
        if left is None:
            # FIXME: this should go inside pygdf so it can merge empty left
            #        frames (including the case where both frames are empty)
            return empty_frame
        if right is None:
            # FIXME: this should go inside pygdf so it can merge empty frames
            #        right frames
            return fix_left(left)
        return left.merge(right, on=on, how=how)

    left_val_names = [k for k in left.columns if k not in on]
    right_val_names = [k for k in right.columns if k not in on]
    same_names = set(left_val_names) & set(right_val_names)
    fix_name = partial(_fix_name, same_names=same_names)
    if same_names and not (lsuffix or rsuffix):
        raise ValueError('there are overlapping columns but '
                         'lsuffix and rsuffix are not defined')

    # Right-hand dtypes take precedence for names present in both frames.
    dtypes = {k: left[k].dtype for k in left.columns}
    dtypes.update({k: right[k].dtype for k in right.columns})

    empty_frame = make_empty()
    left_parts = left.to_delayed()
    right_parts = right.to_delayed()

    # Add column w/ hash(v) % nparts
    nparts = max(len(left_parts), len(right_parts))

    left_hashed = group_frame(left_parts, nparts, on)
    right_hashed = group_frame(right_parts, nparts, on)

    # Fanout each partition into nparts subgroups
    left_subgroups = fanout_subgroups(left_hashed, nparts)
    right_subgroups = fanout_subgroups(right_hashed, nparts)

    # Internal invariant: both sides must be split into the same number of
    # subgroups so they can be paired positionally below.
    assert len(left_subgroups) == len(right_subgroups)

    # Concat
    left_cats = [concat(*it) for it in left_subgroups]
    right_cats = [concat(*it) for it in right_subgroups]

    # Combine
    merged = [
        delayed(merge)(left_cats[i], right_cats[i]) for i in range(nparts)
    ]

    return dask_gdf.from_delayed(merged,
                                 prefix='join_result',
                                 meta=empty_frame)