def test_python_file_closing():
    """Destroying a PythonFile must not close the wrapped object; close() must."""
    raw = BytesIO()
    wrapped = pa.PythonFile(raw)
    ref = weakref.ref(wrapped)
    del wrapped
    # The wrapper itself was garbage-collected...
    assert ref() is None
    # ...but its destruction did not close the underlying stream.
    assert not raw.closed
    # An explicit close() does close the underlying stream.
    wrapped = pa.PythonFile(raw)
    wrapped.close()
    assert raw.closed
def test_python_file_read():
    """read(), seek() and tell() on a PythonFile over an in-memory buffer."""
    payload = b'some sample data'
    stream = pa.PythonFile(BytesIO(payload), mode='r')
    assert stream.size() == len(payload)
    assert stream.tell() == 0
    assert stream.read(4) == b'some'
    assert stream.tell() == 4
    stream.seek(0)
    assert stream.tell() == 0
    stream.seek(5)
    assert stream.tell() == 5
    # Asking for more bytes than remain returns only what is left.
    chunk = stream.read(50)
    assert chunk == b'sample data'
    assert len(chunk) == 11
    # size() is unaffected by the current read position.
    assert stream.size() == len(payload)
    stream.close()
def test_python_file_read_buffer():
    """read_buffer() should return a zero-copy view of the source memory."""
    length = 10
    data = b'0123456798'
    dst_buf = bytearray(data)

    class DuckReader:
        # Minimal duck-typed reader exposing only what PythonFile needs.
        def close(self):
            pass

        @property
        def closed(self):
            return False

        def read_buffer(self, nbytes):
            assert nbytes == length
            return memoryview(dst_buf)[:nbytes]

    with pa.PythonFile(DuckReader(), mode='r') as f:
        buf = f.read_buffer(length)
        assert len(buf) == length
        assert memoryview(buf).tobytes() == dst_buf[:length]
        # buf should point at the same memory, so modifying it...
        memoryview(buf)[0] = ord(b'x')
        # ...must be visible through the original buffer.
        assert dst_buf[0] == ord(b'x')
def test_write(self):
    """Round-trip: write via Writer, read back with pyarrow.parquet, compare."""
    # Serialize the test table into an in-memory buffer.
    with UncloseableBytesIO() as write_buffer:
        with Writer(write_buffer, self.table) as writer:
            writer.write_row_group(self.data)
        file_bytes = write_buffer.getvalue()

    # Read the serialized bytes back in.
    read_buffer = BytesIO(file_bytes)
    with pa.PythonFile(read_buffer, mode='r') as infile:
        # The data must round-trip unchanged.
        parq_table = pq.read_table(infile)
        written_data = list(parq_table.to_pydict().values())
        self.assertEqual(self.data, written_data)

        # The parquet schema must match the expected datatypes.
        for i, field in enumerate(parq_table.schema):
            self.assertEqual(field.type.id, self.expected_datatypes[i].id)

        # Ensure the timestamp column was written with int96; right now the
        # only observable evidence is that the timestamp unit is 'ns'.
        ts_col = parq_table.schema.field_by_name('timestamp_col')
        self.assertEqual(ts_col.type.unit, 'ns')
def test_python_file_writelines(tmpdir):
    """writelines() on a writable PythonFile, then read the data back.

    Fix: the original list was ``[b'line1\\n', b'line2\\n' b'line3']`` — a
    missing comma turned the last two items into one implicitly-concatenated
    literal. The joined output is byte-identical either way, so restoring the
    comma preserves the test's observable behavior while matching its intent
    (three lines passed to writelines()).
    """
    lines = [b'line1\n', b'line2\n', b'line3']
    path = os.path.join(str(tmpdir), 'foo.txt')

    with open(path, 'wb') as raw:
        f = pa.PythonFile(raw, mode='w')
        try:
            assert f.writable()
            f.writelines(lines)
        finally:
            f.close()

    with open(path, 'rb') as raw:
        f = pa.PythonFile(raw, mode='r')
        try:
            assert f.readable()
            assert f.read() == b''.join(lines)
        finally:
            f.close()
def test_python_file_readinto():
    """readinto() fills the caller's buffer and reports the byte count."""
    length = 10
    data = b'some sample data longer than 10'
    dst_buf = bytearray(length)
    src_buf = BytesIO(data)
    with pa.PythonFile(src_buf, mode='r') as f:
        n_read = f.readinto(dst_buf)
        assert n_read == 10
        assert dst_buf[:length] == data[:length]
        # The destination buffer itself was not resized.
        assert len(dst_buf) == length
def test_python_file_iterable():
    """Iterating a PythonFile yields the same lines as the raw stream."""
    data = b'''line1
line2
line3
'''
    wrapped = BytesIO(data)
    reference = BytesIO(data)
    with pa.PythonFile(wrapped, mode='r') as f:
        for got, want in zip(f, reference):
            assert got == want
def _write_parquet(df, f):
    """Serialize DataFrame *df* to parquet, writing into file-like object *f*."""
    import pyarrow as pa
    import pyarrow.parquet as pq

    # pyarrow can't handle these inputs (and fails with a cryptic error),
    # so reject them up front.
    _warn_if_any_all_nan_columns(df)
    _raise_if_any_mixed_type_columns(df)

    # Coerce timestamps to millisecond resolution: nanosecond timestamps
    # are not yet supported by Arrow/Parquet.
    table = pa.Table.from_pandas(df, timestamps_to_ms=True)
    pq.write_table(table, pa.PythonFile(f))
def test_parquet_read_from_buffer(tempdir, use_legacy_dataset):
    """read_table() accepts both a raw open() file object and a PythonFile."""
    table = pa.table({"a": [1, 2, 3]})
    path = str(tempdir / "data.parquet")
    pq.write_table(table, path)

    # Plain binary file object straight from open().
    with open(path, "rb") as f:
        result = pq.read_table(f, use_legacy_dataset=use_legacy_dataset)
        assert result.equals(table)

    # Same file wrapped in pa.PythonFile.
    with open(path, "rb") as f:
        result = pq.read_table(pa.PythonFile(f),
                               use_legacy_dataset=use_legacy_dataset)
        assert result.equals(table)
def test_python_file_implicit_mode(tmpdir):
    """PythonFile infers readable/writable/seekable from the wrapped object."""
    path = os.path.join(str(tmpdir), 'foo.txt')

    with open(path, 'wb') as sink:
        pf = pa.PythonFile(sink)
        assert pf.writable()
        assert not pf.readable()
        assert not pf.seekable()  # PyOutputStream isn't seekable
        # Write through the raw file object, not the wrapper.
        sink.write(b'foobar\n')

    with open(path, 'rb') as source:
        pf = pa.PythonFile(source)
        assert pf.readable()
        assert not pf.writable()
        assert pf.seekable()
        assert pf.read() == b'foobar\n'

    bio = BytesIO()
    pf = pa.PythonFile(bio)
    assert pf.writable()
    assert not pf.readable()
    assert not pf.seekable()
    pf.write(b'foobar\n')
    assert bio.getvalue() == b'foobar\n'
def test_python_file_read_at():
    """read_at() reads nbytes starting at offset, clamped to the file size."""
    data = b'some sample data'
    f = pa.PythonFile(BytesIO(data), mode='r')

    # Simple positioned read.
    chunk = f.read_at(nbytes=5, offset=3)
    assert chunk == b'e sam'
    assert len(chunk) == 5

    # nbytes larger than the file yields the whole file.
    whole = f.read_at(nbytes=50, offset=0)
    assert whole == data
    assert len(whole) == 16
def test_python_file_write():
    """Writes pass through to the wrapped buffer and advance tell()."""
    sink = BytesIO()
    f = pa.PythonFile(sink)
    assert f.tell() == 0
    first = b'enga\xc3\xb1ado'
    second = b'foobar'
    f.write(first)
    assert f.tell() == len(first)
    f.write(second)
    # Both writes landed in the underlying buffer, in order.
    assert sink.getvalue() == first + second
    f.close()
def factory(filename):
    """Open *filename* for binary reading, wrapped in a pa.PythonFile."""
    return pa.PythonFile(open(filename, 'rb'))
def test_python_file_correct_abc():
    """A readable PythonFile registers as a buffered binary file object."""
    stream = pa.PythonFile(BytesIO(b''), mode='r')
    with stream as f:
        for abc in (BufferedIOBase, IOBase):
            assert isinstance(f, abc)
def test_python_file_readall():
    """readall() returns the entire remaining contents of the stream."""
    payload = b'some sample data'
    with pa.PythonFile(BytesIO(payload), mode='r') as f:
        assert f.readall() == payload
from io import BytesIO

import pyarrow as pa
import pyarrow.parquet as pq
import pandas as pd

# Serialize an arrow table to parquet format entirely in memory.
table = pd.DataFrame({'a': [x for x in range(10000)]})
buf = BytesIO()
f = pa.PythonFile(buf)
arrow_table = pa.Table.from_pandas(table)
pq.write_table(arrow_table, f, use_dictionary=False, compression=None)
result = buf.getvalue()
# The parquet format contains no newlines or control characters, so these
# bytes can be fed as-is into MapReduce and the like.
print(result)

# Given the raw bytes, deserialize them back into pyarrow form in memory.
arrow = pq.read_table(pa.BufferReader(result), nthreads=16)
df = arrow.to_pandas()
print(df.head())