Ejemplo n.º 1
0
def _shuffle_group(df, columns, stage, k, npartitions, ignore_index):
    """Split ``df`` into at most ``k`` groups for one stage of a
    staged hash shuffle.

    Rows are bucketed by hashing ``df[columns]`` (index excluded); for
    stages past the first, the bucket id is narrowed by ``k**stage`` so
    each stage refines the previous one's assignment.
    """
    buckets = hash_object_dispatch(df[columns], index=False)
    # Smallest integer dtype that can represent every partition id.
    dtype = np.min_scalar_type(npartitions * 2)
    buckets = np.mod(buckets, npartitions).astype(dtype, copy=False)
    if stage > 0:
        # Discard the low digits already consumed by earlier stages.
        np.floor_divide(buckets, k**stage, out=buckets)
    if k < int(npartitions / (k**stage)):
        # More buckets remain than this stage emits: keep only k of them.
        np.mod(buckets, k, out=buckets)
    return group_split_dispatch(
        df, buckets.astype(np.int32), k, ignore_index=ignore_index
    )
Ejemplo n.º 2
0
def _hash_series(s):
    """Row-wise Series hash.

    Dispatches on the series type: pandas series are hashed with
    ``hash_object_dispatch``; other (cudf-like) series use their own
    ``hash_values`` method, applied to the list leaves for list dtypes.
    """
    if not isinstance(s, pd.Series):
        # Non-pandas (GPU) path.
        if _is_list_dtype(s):
            return s.list.leaves.hash_values()
        return s.hash_values()
    # Using pandas hashing, which does not produce the same result as
    # cudf.Series.hash_values().  Do not expect hash-based data
    # transformations to be the same on CPU and GPU.  TODO: Fix this
    # (maybe use murmurhash3 manually on CPU).
    return hash_object_dispatch(s).values
Ejemplo n.º 3
0
def _shuffle_group_2(df, cols, ignore_index, nparts):
    """Hash-split ``df`` into up to ``nparts`` groups.

    Returns a ``(groups, remainder)`` pair, where ``groups`` maps group
    ids to sub-frames and ``remainder`` is an empty frame with the same
    schema as ``df``.
    """
    # Empty input: no groups, and the frame itself (already empty)
    # serves as the remainder.
    if not len(df):
        return {}, df

    key = df[cols] if cols else df
    ind = hash_object_dispatch(key, index=False)
    ind = (ind % int(nparts)).astype(np.int32)

    # Only as many output groups as the largest observed bucket id.
    n = ind.max() + 1

    groups = group_split_dispatch(
        df, ind.values, n, ignore_index=ignore_index
    )
    return groups, df.iloc[:0]
Ejemplo n.º 4
0
def _shuffle_group(df, columns, stage, k, npartitions, ignore_index, nfinal):
    """Split ``df`` into at most ``k`` groups for one stage of a
    staged hash shuffle.

    Like the plain staged shuffle, but when ``nfinal`` differs from
    ``npartitions`` the hash is first reduced modulo ``nfinal`` so the
    staged assignment starts from the final output mapping.
    """
    hashed = hash_object_dispatch(df[columns], index=False)
    if nfinal and nfinal != npartitions:
        # Start from the final output mapping here.
        hashed = hashed % int(nfinal)

    buckets = hashed.values
    # Smallest integer dtype that can represent every partition id.
    dtype = np.min_scalar_type(npartitions * 2)
    buckets = np.mod(buckets, npartitions).astype(dtype, copy=False)
    if stage > 0:
        # Discard the low digits already consumed by earlier stages.
        np.floor_divide(buckets, k**stage, out=buckets)
    if k < int(npartitions / (k**stage)):
        # More buckets remain than this stage emits: keep only k of them.
        np.mod(buckets, k, out=buckets)
    return group_split_dispatch(
        df, buckets.astype(np.int32), k, ignore_index=ignore_index
    )
Ejemplo n.º 5
0
def set_partitions_hash(df, columns, npartitions):
    """Return the target partition id for each row of ``df``.

    Rows are assigned by hashing ``df[columns]`` (index excluded) and
    taking the result modulo ``npartitions``.
    """
    hashed = hash_object_dispatch(df[columns], index=False)
    return np.mod(hashed, npartitions)