def test_serialize_pandas_no_preserve_index():
    """Round-trip a frame with ``preserve_index`` off, then on.

    With ``preserve_index=False`` the custom index is expected to be
    replaced by a default one; with ``preserve_index=True`` the frame is
    expected to come back unchanged.
    """
    indexed = pd.DataFrame({'a': [1, 2, 3]}, index=[1, 2, 3])
    default_indexed = pd.DataFrame({'a': [1, 2, 3]})

    # Each case pairs a ``preserve_index`` flag with the frame expected back.
    cases = [(False, default_indexed), (True, indexed)]
    for keep_index, expected in cases:
        payload = pa.serialize_pandas(indexed, preserve_index=keep_index)
        restored = pa.deserialize_pandas(payload)
        assert_frame_equal(restored, expected)
def serialize(obj):
    """Serialize *obj* to bytes.

    Dispatch rules, checked in order:

    * ``bytes`` input is returned as-is.
    * A ``pd.DataFrame`` with more than 30000 cells (rows * columns) is
      serialized with ``pa.serialize_pandas`` and converted via
      ``to_pybytes()``.
    * Anything else is pickled with ``pickle.HIGHEST_PROTOCOL``.

    Parameters
    ----------
    obj : object
        The value to serialize.

    Returns
    -------
    bytes
        The serialized payload. NOTE(review): the payload carries no marker
        of which path produced it; the reader must tell them apart.
    """
    if isinstance(obj, bytes):
        return obj

    # ``np.product`` was a deprecated alias removed in NumPy 2.0; ``np.prod``
    # computes the same cell count and works on every NumPy version.
    if isinstance(obj, pd.DataFrame) and np.prod(obj.shape) > 30000:
        pa_buffer = pa.serialize_pandas(obj)
        return pa_buffer.to_pybytes()

    return pickle.dumps(obj, pickle.HIGHEST_PROTOCOL)
def test_pandas_serialize_round_trip_nthreads():
    """Serialize a named-index frame and deserialize it with ``nthreads=2``."""
    column_order = ['foo', 'bar']
    named_index = pd.Index([1, 2, 3], name='my_index')
    data = {'foo': [1.5, 1.6, 1.7], 'bar': list('abc')}
    frame = pd.DataFrame(data, index=named_index, columns=column_order)

    payload = pa.serialize_pandas(frame)
    restored = pa.deserialize_pandas(payload, nthreads=2)
    assert_frame_equal(restored, frame)
def test_pandas_serialize_round_trip_multi_index():
    """Round-trip a frame whose index is a two-level ``MultiIndex``.

    One level is named (``'level_1'``) and the other has ``name=None``.
    """
    levels = [
        pd.Index([1, 2, 3], name='level_1'),
        pd.Index(list('def'), name=None),
    ]
    multi_index = pd.MultiIndex.from_arrays(levels)
    data = {'foo': [1.5, 1.6, 1.7], 'bar': list('abc')}
    frame = pd.DataFrame(data, index=multi_index, columns=['foo', 'bar'])

    payload = pa.serialize_pandas(frame)
    assert_frame_equal(pa.deserialize_pandas(payload), frame)
def time_serialize_pandas(self):
    """Call ``pa.serialize_pandas`` on ``self.df``; the result is discarded."""
    frame = self.df
    pa.serialize_pandas(frame)
def test_pandas_serialize_round_trip_not_string_columns():
    """Round-trip a frame built without explicit column names.

    The frame is constructed from a list of tuples, so pandas assigns the
    column labels itself instead of receiving strings.
    """
    numbers = [1.5, 1.6, 1.7]
    letters = 'abc'
    rows = list(zip(numbers, letters))
    frame = pd.DataFrame(rows)

    payload = pa.serialize_pandas(frame)
    restored = pa.deserialize_pandas(payload)
    assert_frame_equal(restored, frame)
def _check_serialize_pandas_round_trip(df, nthreads=1):
    """Assert that *df* survives a serialize/deserialize round trip.

    The same *nthreads* value is forwarded to both ``pa.serialize_pandas``
    and ``pa.deserialize_pandas``.
    """
    thread_kwargs = {'nthreads': nthreads}
    payload = pa.serialize_pandas(df, **thread_kwargs)
    restored = pa.deserialize_pandas(payload, **thread_kwargs)
    assert_frame_equal(restored, df)
def _check_serialize_pandas_round_trip(df, use_threads=False):
    """Assert that *df* survives a serialize/deserialize round trip.

    Serialization runs with ``nthreads=2`` when *use_threads* is truthy and
    ``nthreads=1`` otherwise; *use_threads* itself is passed to
    ``pa.deserialize_pandas``.
    """
    if use_threads:
        writer_threads = 2
    else:
        writer_threads = 1

    payload = pa.serialize_pandas(df, nthreads=writer_threads)
    restored = pa.deserialize_pandas(payload, use_threads=use_threads)
    assert_frame_equal(restored, df)
def _serialize_pandas_series(obj):
    """Serialize *obj* by wrapping it in a one-column DataFrame.

    The column is keyed by ``obj.name`` and the resulting frame is handed to
    ``serialize_pandas``, whose return value is passed through unchanged.
    """
    single_column = pd.DataFrame({obj.name: obj})
    return serialize_pandas(single_column)
def _serialize_pandas_dataframe(obj):
    """Serialize *obj* with ``serialize_pandas`` and return ``to_pybytes()``."""
    serialized = serialize_pandas(obj)
    return serialized.to_pybytes()
def _serialize_pandas_series(obj):
    """Wrap *obj* in a DataFrame keyed by ``obj.name`` and serialize it.

    Returns whatever ``serialize_pandas`` returns for that frame.
    """
    columns = {obj.name: obj}
    as_frame = pd.DataFrame(columns)
    return serialize_pandas(as_frame)
def _serialize_pandas_dataframe(obj):
    """Serialize *obj* by delegating to ``serialize_pandas``.

    The value returned by ``serialize_pandas`` is passed back unchanged.
    """
    serialized = serialize_pandas(obj)
    return serialized
def _check_serialize_pandas_round_trip(df, use_threads=False):
    """Serialize *df*, deserialize it, and assert the frames are equal.

    ``pa.serialize_pandas`` receives ``nthreads=2`` when *use_threads* is
    truthy, else ``nthreads=1``; ``pa.deserialize_pandas`` receives
    *use_threads* directly.
    """
    thread_count = 1
    if use_threads:
        thread_count = 2

    serialized = pa.serialize_pandas(df, nthreads=thread_count)
    round_tripped = pa.deserialize_pandas(serialized, use_threads=use_threads)
    assert_frame_equal(round_tripped, df)
def setup(self):
    """Build a one-column random frame and cache its serialized form.

    Stores the frame on ``self.df`` and the output of
    ``pa.serialize_pandas`` on ``self.serialized``.
    """
    # 10 million length
    row_count = 10 * 1000 * 1000
    frame = pd.DataFrame({'data': np.random.randn(row_count)})
    self.df = frame
    self.serialized = pa.serialize_pandas(frame)
def _serialize_pandas_dataframe(obj):
    """Hand *obj* to ``serialize_pandas`` and return its result as-is."""
    result = serialize_pandas(obj)
    return result
def test_pandas_serialize_round_trip_not_string_columns():
    """Round-trip a frame whose column labels were not given as strings.

    The frame is built from zipped float/character pairs with no
    ``columns`` argument, so pandas chooses the labels.
    """
    pairs = zip([1.5, 1.6, 1.7], 'abc')
    original = pd.DataFrame(list(pairs))

    round_tripped = pa.deserialize_pandas(pa.serialize_pandas(original))
    assert_frame_equal(round_tripped, original)
def _serialize_pandas_series(obj):
    """Serialize *obj* via a one-column DataFrame and return ``to_pybytes()``.

    The column is keyed by ``obj.name``; the frame is passed to
    ``serialize_pandas`` and the result converted with ``to_pybytes()``.
    """
    # TODO: serializing Series without extra copy
    wrapped = pd.DataFrame({obj.name: obj})
    serialized = serialize_pandas(wrapped)
    return serialized.to_pybytes()