Esempio n. 1
0
def pack_payload_pandas(partition: pd.DataFrame, group_key: List[str]) -> pd.DataFrame:
    """
    Collapse every group of ``partition`` into one row with a serialized payload.

    Parameters
    ----------
    partition
        The frame to pack.
    group_key
        Names of the columns to group by. They end up as regular columns of
        the result.

    Returns
    -------
    pd.DataFrame
        A frame with the ``group_key`` columns plus ``_PAYLOAD_COL``. Each
        payload value is the ``serialize_bytes`` output for the sub-frame that
        ``groupby(...).apply`` hands over for that group. For an empty
        ``partition`` the payload column is filled with ``b""``. If
        ``distributed`` cannot be imported, a warning is logged and
        ``partition`` is returned unchanged.
    """
    try:
        # Technically distributed is an optional dependency
        from distributed.protocol import serialize_bytes
    except ImportError:
        _logger.warning(
            "Shuffle payload columns cannot be compressed since distributed is not installed."
        )
        return partition

    if partition.empty:
        # Take an explicit copy before adding the payload column. Assigning
        # directly into the result of ``partition[group_key]`` can trigger
        # pandas' SettingWithCopyWarning (or an error when
        # ``mode.chained_assignment`` is set to "raise").
        res = partition[group_key].copy()
        res[_PAYLOAD_COL] = b""
    else:
        res = partition.groupby(
            group_key,
            sort=False,
            observed=True,
            # Keep the as_index s.t. the group values are not dropped. With this
            # the behaviour seems to be consistent along pandas versions
            as_index=True,
        ).apply(lambda x: pd.Series({_PAYLOAD_COL: serialize_bytes(x)}))

        # Turn the group keys (currently the index) back into ordinary columns.
        res = res.reset_index()
    return res
Esempio n. 2
0
def test_serialize_bytes(kwargs):
    """Round-trip small and very large objects through (de)serialize_bytes.

    ``kwargs`` is forwarded verbatim to ``serialize_bytes``.
    """
    samples = (
        1,
        "abc",
        np.arange(5),
        b"ab" * int(40e6),
        int(2**26) * b"ab",
        (int(2**25) * b"ab", int(2**25) * b"ab"),
    )
    for original in samples:
        blob = serialize_bytes(original, **kwargs)
        assert isinstance(blob, bytes)
        restored = deserialize_bytes(blob)
        # String forms are compared rather than the objects themselves
        # (presumably because ``==`` on a NumPy array is elementwise).
        assert str(original) == str(restored)
Esempio n. 3
0
def test_serialize_bytes():
    """Check that serialize_bytes yields bytes that deserialize_bytes can restore."""
    samples = (1, 'abc', np.arange(5))
    for original in samples:
        encoded = serialize_bytes(original)
        assert isinstance(encoded, bytes)
        decoded = deserialize_bytes(encoded)
        # Compare the string forms of the value before and after the round trip.
        assert str(original) == str(decoded)
Esempio n. 4
0
def test_serialize_bytes():
    """Round-trip a few values, including an 80 MB bytes object, through serialize_bytes."""
    samples = (
        1,
        "abc",
        np.arange(5),
        b"ab" * int(40e6),
    )
    for value in samples:
        payload = serialize_bytes(value)
        assert isinstance(payload, bytes)
        roundtripped = deserialize_bytes(payload)
        # Equality is checked on the string forms of both objects.
        assert str(value) == str(roundtripped)
Esempio n. 5
0
def _serialize_if_device(obj):
    """Return ``obj`` serialized when it is a device object, otherwise ``obj`` itself."""
    # Anything that is_device_object() does not recognise is passed through untouched.
    if not is_device_object(obj):
        return obj
    return serialize_bytes(obj, on_error="raise")
Esempio n. 6
0
def test_serialize_bytes():
    """serialize_bytes must return bytes that deserialize back to an equal-looking value."""
    inputs = [1, 'abc', np.arange(5)]
    for item in inputs:
        raw = serialize_bytes(item)
        assert isinstance(raw, bytes)
        recovered = deserialize_bytes(raw)
        # The round trip is verified through the string forms of the two objects.
        assert str(item) == str(recovered)