コード例 #1
0
ファイル: test_io.py プロジェクト: joshuawinter/beamer
def test_python_file_closing():
    bio = BytesIO()
    pf = pa.PythonFile(bio)
    wr = weakref.ref(pf)
    del pf
    assert wr() is None  # object was destroyed
    assert not bio.closed
    pf = pa.PythonFile(bio)
    pf.close()
    assert bio.closed
コード例 #2
0
def test_python_file_read():
    data = b'some sample data'

    buf = BytesIO(data)
    f = pa.PythonFile(buf, mode='r')

    assert f.size() == len(data)

    assert f.tell() == 0

    assert f.read(4) == b'some'
    assert f.tell() == 4

    f.seek(0)
    assert f.tell() == 0

    f.seek(5)
    assert f.tell() == 5

    v = f.read(50)
    assert v == b'sample data'
    assert len(v) == 11

    assert f.size() == len(data)

    f.close()
コード例 #3
0
def test_python_file_read_buffer():
    length = 10
    data = b'0123456798'
    dst_buf = bytearray(data)

    class DuckReader:
        def close(self):
            pass

        @property
        def closed(self):
            return False

        def read_buffer(self, nbytes):
            assert nbytes == length
            return memoryview(dst_buf)[:nbytes]

    duck_reader = DuckReader()
    with pa.PythonFile(duck_reader, mode='r') as f:
        buf = f.read_buffer(length)
        assert len(buf) == length
        assert memoryview(buf).tobytes() == dst_buf[:length]
        # buf should point to the same memory, so modyfing it
        memoryview(buf)[0] = ord(b'x')
        # should modify the original
        assert dst_buf[0] == ord(b'x')
コード例 #4
0
    def test_write(self):
        # Write out test file
        with UncloseableBytesIO() as write_buffer:
            with Writer(write_buffer, self.table) as writer:
                writer.write_row_group(self.data)
            file_bytes = write_buffer.getvalue()

        # Read in test file
        read_buffer = BytesIO(file_bytes)
        with pa.PythonFile(read_buffer, mode='r') as infile:

            # Verify data
            parq_table = pq.read_table(infile)
            written_data = list(parq_table.to_pydict().values())
            self.assertEqual(self.data, written_data)

            # Verify parquet file schema
            for i, field in enumerate(parq_table.schema):
                self.assertEqual(field.type.id, self.expected_datatypes[i].id)

            # Ensure timestamp column was written with int96; right now
            # there is no way to see except to check that the unit on
            # the timestamp type is 'ns'
            ts_col = parq_table.schema.field_by_name('timestamp_col')
            self.assertEqual(ts_col.type.unit, 'ns')
コード例 #5
0
def test_python_file_writelines(tmpdir):
    lines = [b'line1\n', b'line2\n' b'line3']
    path = os.path.join(str(tmpdir), 'foo.txt')
    with open(path, 'wb') as f:
        try:
            f = pa.PythonFile(f, mode='w')
            assert f.writable()
            f.writelines(lines)
        finally:
            f.close()

    with open(path, 'rb') as f:
        try:
            f = pa.PythonFile(f, mode='r')
            assert f.readable()
            assert f.read() == b''.join(lines)
        finally:
            f.close()
コード例 #6
0
def test_python_file_readinto():
    length = 10
    data = b'some sample data longer than 10'
    dst_buf = bytearray(length)
    src_buf = BytesIO(data)

    with pa.PythonFile(src_buf, mode='r') as f:
        assert f.readinto(dst_buf) == 10

        assert dst_buf[:length] == data[:length]
        assert len(dst_buf) == length
コード例 #7
0
def test_python_file_iterable():
    data = b'''line1
    line2
    line3
    '''

    buf = BytesIO(data)
    buf2 = BytesIO(data)

    with pa.PythonFile(buf, mode='r') as f:
        for read, expected in zip(f, buf2):
            assert read == expected
コード例 #8
0
def _write_parquet(df, f):
    import pyarrow.parquet as pq
    import pyarrow as pa

    # Because pyarrow can't handle these (with a cryptic error)
    _warn_if_any_all_nan_columns(df)
    _raise_if_any_mixed_type_columns(df)

    # we have to coerce timestamps to millisecond resolution as
    # nanoseconds are not yet supported by Arrow/Parquet
    tbl = pa.Table.from_pandas(df, timestamps_to_ms=True)
    pq.write_table(tbl, pa.PythonFile(f))
コード例 #9
0
def test_parquet_read_from_buffer(tempdir, use_legacy_dataset):
    # reading from a buffer from python's open()
    table = pa.table({"a": [1, 2, 3]})
    pq.write_table(table, str(tempdir / "data.parquet"))

    with open(str(tempdir / "data.parquet"), "rb") as f:
        result = pq.read_table(f, use_legacy_dataset=use_legacy_dataset)
    assert result.equals(table)

    with open(str(tempdir / "data.parquet"), "rb") as f:
        result = pq.read_table(pa.PythonFile(f),
                               use_legacy_dataset=use_legacy_dataset)
    assert result.equals(table)
コード例 #10
0
def test_python_file_implicit_mode(tmpdir):
    path = os.path.join(str(tmpdir), 'foo.txt')
    with open(path, 'wb') as f:
        pf = pa.PythonFile(f)
        assert pf.writable()
        assert not pf.readable()
        assert not pf.seekable()  # PyOutputStream isn't seekable
        f.write(b'foobar\n')

    with open(path, 'rb') as f:
        pf = pa.PythonFile(f)
        assert pf.readable()
        assert not pf.writable()
        assert pf.seekable()
        assert pf.read() == b'foobar\n'

    bio = BytesIO()
    pf = pa.PythonFile(bio)
    assert pf.writable()
    assert not pf.readable()
    assert not pf.seekable()
    pf.write(b'foobar\n')
    assert bio.getvalue() == b'foobar\n'
コード例 #11
0
ファイル: test_io.py プロジェクト: zhu2856061/arrow
def test_python_file_read_at():
    data = b'some sample data'

    buf = BytesIO(data)
    f = pa.PythonFile(buf, mode='r')

    # test simple read at
    v = f.read_at(nbytes=5, offset=3)
    assert v == b'e sam'
    assert len(v) == 5

    # test reading entire file when nbytes > len(file)
    w = f.read_at(nbytes=50, offset=0)
    assert w == data
    assert len(w) == 16
コード例 #12
0
def test_python_file_write():
    buf = BytesIO()

    f = pa.PythonFile(buf)

    assert f.tell() == 0

    s1 = b'enga\xc3\xb1ado'
    s2 = b'foobar'

    f.write(s1)
    assert f.tell() == len(s1)

    f.write(s2)

    expected = s1 + s2

    result = buf.getvalue()
    assert result == expected

    f.close()
コード例 #13
0
 def factory(filename):
     return pa.PythonFile(open(filename, 'rb'))
コード例 #14
0
def test_python_file_correct_abc():
    with pa.PythonFile(BytesIO(b''), mode='r') as f:
        assert isinstance(f, BufferedIOBase)
        assert isinstance(f, IOBase)
コード例 #15
0
def test_python_file_readall():
    data = b'some sample data'

    buf = BytesIO(data)
    with pa.PythonFile(buf, mode='r') as f:
        assert f.readall() == data
from io import BytesIO
import pyarrow as pa
import pyarrow.parquet as pq
import pandas as pd

# オンメモリでarrow_tableをperquet形式にシリアライズする
table = pd.DataFrame({'a': [x for x in range(10000)]})
buf = BytesIO()
f = pa.PythonFile(buf)
arrow_table = pa.Table.from_pandas(table)
pq.write_table(arrow_table, f, use_dictionary=False, compression=None)
result = buf.getvalue()
# parquest方式は改行や制御文字を含まないので、このまま、MapReduceなどにかけることができる
print(result)

# オンメモリでbytes型でデータを受け取ったら、pyarrow形式に解釈する

arrow = pq.read_table(pa.BufferReader(result), nthreads=16)
df = arrow.to_pandas()
print(df.head())