def test_set_pickle():
    """Check that ``set_pickle`` replaces the pickling hooks of a context.

    A locally defined class is registered with ``pickle=True`` so that the
    context routes it through whatever dumps/loads pair is currently set.
    Two different pairs are installed in turn, and the round-tripped value
    must reflect the pair that was active at the time.
    """
    # Use a custom type to trigger pickling.
    class Foo(object):
        pass

    ctx = pa.SerializationContext()
    ctx.register_type(Foo, 'Foo', pickle=True)
    instance = Foo()

    def roundtrip():
        # Serialize to a buffer, then deserialize from its raw bytes.
        buf = pa.serialize(instance, context=ctx).to_buffer()
        return pa.deserialize(buf.to_pybytes(), context=ctx)

    # First custom dumps/loads pair used in place of pickle.
    def first_dumps(obj):
        return b'custom'

    def first_loads(serialized_obj):
        return serialized_obj + b' serialization 1'

    # Installing a custom pickler changes the behavior.
    ctx.set_pickle(first_dumps, first_loads)
    assert roundtrip() == b'custom serialization 1'

    # Second custom dumps/loads pair.
    def second_dumps(obj):
        return b'custom'

    def second_loads(serialized_obj):
        return serialized_obj + b' serialization 2'

    # Installing another custom pickler changes the behavior again.
    ctx.set_pickle(second_dumps, second_loads)
    assert roundtrip() == b'custom serialization 2'
import struct

import lz4.frame
import pyarrow
from scipy.sparse import csc_matrix

# The magic initial bytes which tell us that a given binary chunk is LZ4
# compressed data (the constant packed as a little-endian unsigned int)
LZ4_MAGIC_NUMBER = struct.pack("<I", 0x184D2204)

# Module-level serialization context.
context = pyarrow.SerializationContext()


def serialize_csc(matrix):
    """
    Decompose a matrix in Compressed Sparse Column format into more basic data
    types (tuples and numpy arrays) which PyArrow knows how to serialize

    Args:
        matrix: object exposing ``data``, ``indices``, ``indptr`` and ``shape``
            attributes (presumably a ``scipy.sparse.csc_matrix``).

    Returns:
        A tuple ``((data, indices, indptr), shape)``.
    """
    return ((matrix.data, matrix.indices, matrix.indptr), matrix.shape)


def deserialize_csc(args):
    """
    Reconstruct a Compressed Sparse Column matrix from its decomposed parts

    Args:
        args: a ``((data, indices, indptr), shape)`` tuple, i.e. the layout
            produced by ``serialize_csc``.
    """
    # We construct a `csc_matrix` instance by directly assigning its members,
    # rather than using `__init__` which runs additional checks that
    # significantly slow down deserialization. Because we know these values
    # came from properly constructed matrices we can skip these checks
    (data, indices, indptr), shape = args
    # Allocate the instance without running `__init__`.
    # NOTE(review): the member assignments and return described above are not
    # visible in this excerpt - confirm against the full file.
    matrix = csc_matrix.__new__(csc_matrix)
def test_serialization_deprecated_toplevel():
    """Each deprecated top-level serialization class emits a FutureWarning.

    ``SerializedPyObject`` and ``SerializationContext`` are looked up on the
    ``pa`` module and instantiated inside their own ``pytest.warns`` block,
    so each one must produce a warning by itself.
    """
    for deprecated_name in ("SerializedPyObject", "SerializationContext"):
        with pytest.warns(FutureWarning):
            # Lookup and call both happen inside the block.
            getattr(pa, deprecated_name)()
# Build the masked array from the data, mask, fill value and hard-mask flag.
    return np.ma.MaskedArray(data, mask=mask, fill_value=fill_value, hard_mask=hardmask)


def _serialize_numpy_masked_constant(obj):
    """Serialize a masked constant as ``None``; ``obj`` itself is not used."""
    # Workaround for "Changing the dtype of a 0d array is only supported if the itemsize is unchanged" error
    return None


def _deserialize_numpy_masked_constant(obj):
    """Return ``np.ma.masked`` regardless of the serialized payload ``obj``."""
    return np.ma.masked


# Module-level context: start from pyarrow's default handlers, then add
# custom (de)serializers for the two numpy masked types below.
serialization_context = pa.SerializationContext()
pa.register_default_serialization_handlers(serialization_context)
serialization_context.register_type(
    np.ma.MaskedArray,
    "numpy.ma.core.MaskedArray",
    custom_serializer=_serialize_numpy_masked_array,
    custom_deserializer=_deserialize_numpy_masked_array,
)
serialization_context.register_type(
    np.ma.core.MaskedConstant,
    "numpy.ma.core.MaskedConstant",
    custom_serializer=_serialize_numpy_masked_constant,
    custom_deserializer=_deserialize_numpy_masked_constant,
)
# Copy the "result", "id", "fields" and "data" entries of `data` onto `msg`.
    msg.result = data["result"]
    msg.id = data["id"]
    msg.fields = data["fields"]
    msg.data = data["data"]
    return msg


def _serialize_StructDict(struct):
    """Return the state of ``struct`` as given by its ``__getstate__()``."""
    return struct.__getstate__()


def _deserialize_StructDict(data):
    """Build a ``StructDict`` from the serialized ``data``."""
    return StructDict(data)


# Module-level context with custom (de)serializers for Message and StructDict.
pyarrow_context = pa.SerializationContext()
pyarrow_context.register_type(Message, 'Message',
                              custom_serializer=_serialize_Message,
                              custom_deserializer=_deserialize_Message)
pyarrow_context.register_type(StructDict, 'StructDict',
                              custom_serializer=_serialize_StructDict,
                              custom_deserializer=_deserialize_StructDict)


class RtMessagingClient:
    """Messaging client for connecting to a server and sending messages"""
    def __init__(self, serverAddr, serverPort):
        # Remember the server address and port on the instance.
        self.addr = serverAddr
        self.port = serverPort
def make_serialization_context():
    """Build a ``pa.SerializationContext`` with the test types registered.

    Registers, each under its own 20-byte type id:

    * ``np.ndarray`` with a list/dtype-string based custom serializer,
    * the helper classes ``Foo``, ``Bar``, ``Baz``, ``Qux``, ``SubQux``,
      ``SubQuxPickle`` (pickled), ``Exception``, ``CustomError``, ``Point``
      and ``NamedTupleExample``,
    * ``int`` (and ``long`` on Python 2) via a string round trip,
    * ``OrderedDict`` and ``defaultdict`` via key/value lists,
    * the function type of ``lambda`` (pickled).

    Returns:
        The populated ``pa.SerializationContext``.
    """
    def array_custom_serializer(obj):
        return obj.tolist(), obj.dtype.str

    def array_custom_deserializer(serialized_obj):
        return np.array(serialized_obj[0], dtype=np.dtype(serialized_obj[1]))

    context = pa.SerializationContext()

    # This is for numpy arrays of "object" only; primitive types are handled
    # efficiently with Arrow's Tensor facilities (see python_to_arrow.cc)
    context.register_type(np.ndarray, 20 * b"\x00",
                          custom_serializer=array_custom_serializer,
                          custom_deserializer=array_custom_deserializer)

    context.register_type(Foo, 20 * b"\x01")
    context.register_type(Bar, 20 * b"\x02")
    context.register_type(Baz, 20 * b"\x03")
    context.register_type(Qux, 20 * b"\x04")
    context.register_type(SubQux, 20 * b"\x05")
    # SubQuxPickle gets its own type id: reusing SubQux's id would make the
    # two registrations collide, since the id is what identifies a type.
    context.register_type(SubQuxPickle, 20 * b"\x15", pickle=True)
    context.register_type(Exception, 20 * b"\x06")
    context.register_type(CustomError, 20 * b"\x07")
    context.register_type(Point, 20 * b"\x08")
    context.register_type(NamedTupleExample, 20 * b"\x09")

    # TODO(pcm): This is currently a workaround until arrow supports
    # arbitrary precision integers. This is only called on long integers,
    # see the associated case in the append method in python_to_arrow.cc
    context.register_type(int, 20 * b"\x10", pickle=False,
                          custom_serializer=str,
                          custom_deserializer=int)

    if sys.version_info < (3, 0):
        # Python 2 only: ``long`` is a separate type and needs its own entry.
        context.register_type(long, 20 * b"\x11", pickle=False,  # noqa: F821
                              custom_serializer=str,
                              custom_deserializer=long)  # noqa: F821

    def ordered_dict_custom_serializer(obj):
        return list(obj.keys()), list(obj.values())

    def ordered_dict_custom_deserializer(obj):
        return OrderedDict(zip(obj[0], obj[1]))

    context.register_type(OrderedDict, 20 * b"\x12", pickle=False,
                          custom_serializer=ordered_dict_custom_serializer,
                          custom_deserializer=ordered_dict_custom_deserializer)

    def default_dict_custom_serializer(obj):
        return list(obj.keys()), list(obj.values()), obj.default_factory

    def default_dict_custom_deserializer(obj):
        return defaultdict(obj[2], zip(obj[0], obj[1]))

    context.register_type(defaultdict, 20 * b"\x13", pickle=False,
                          custom_serializer=default_dict_custom_serializer,
                          custom_deserializer=default_dict_custom_deserializer)

    # Plain Python functions (the type of a lambda) are pickled.
    context.register_type(type(lambda: 0), 20 * b"\x14", pickle=True)

    return context