Code Example #1
File: test_io.py Project: NonVolatileComputing/arrow
def test_native_file_modes(tmpdir):
    path = os.path.join(str(tmpdir), guid())
    with open(path, 'wb') as f:
        f.write(b'foooo')

    with pa.OSFile(path, mode='r') as f:
        assert f.mode == 'rb'

    with pa.OSFile(path, mode='rb') as f:
        assert f.mode == 'rb'

    with pa.OSFile(path, mode='w') as f:
        assert f.mode == 'wb'

    with pa.OSFile(path, mode='wb') as f:
        assert f.mode == 'wb'

    with open(path, 'wb') as f:
        f.write(b'foooo')

    with pa.memory_map(path, 'r') as f:
        assert f.mode == 'rb'

    with pa.memory_map(path, 'r+') as f:
        assert f.mode == 'rb+'

    with pa.memory_map(path, 'r+b') as f:
        assert f.mode == 'rb+'
Code Example #2
def test_datetime_serialization(large_memory_map):
    data = [
        #  Principia Mathematica published
        datetime.datetime(year=1687, month=7, day=5),

        # Some random date
        datetime.datetime(year=1911, month=6, day=3, hour=4,
                          minute=55, second=44),
        # End of WWI
        datetime.datetime(year=1918, month=11, day=11),

        # Beginning of UNIX time
        datetime.datetime(year=1970, month=1, day=1),

        # The Berlin wall falls
        datetime.datetime(year=1989, month=11, day=9),

        # Another random date
        datetime.datetime(year=2011, month=6, day=3, hour=4,
                          minute=0, second=3),
        # Another random date
        datetime.datetime(year=1970, month=1, day=3, hour=4,
                          minute=0, second=0)
    ]
    with pa.memory_map(large_memory_map, mode="r+") as mmap:
        for d in data:
            serialization_roundtrip(d, mmap)
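The serialization tests on this page call a serialization_roundtrip helper that is not reproduced here. Below is a minimal sketch of what such a helper might look like, assuming the legacy pa.serialize_to / pa.deserialize_from API shown in Code Example #5 (an API that was deprecated and later removed from pyarrow); the comparison logic is an assumption, not the original helper.

import numpy as np
import pyarrow as pa


def serialization_roundtrip(value, mmap, context=None):
    # Hypothetical helper: serialize into the memory map, rewind, read the
    # value back, and compare it with the original object.
    mmap.seek(0)
    pa.serialize_to(value, mmap, context)
    mmap.seek(0)
    result = pa.deserialize_from(mmap, None, context)
    if isinstance(value, np.ndarray):
        assert np.array_equal(result, value)
    else:
        assert result == value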
Code Example #3
File: test_io.py Project: sunchao/arrow
def test_native_file_raises_ValueError_after_close(tmpdir):
    path = os.path.join(str(tmpdir), guid())
    with open(path, 'wb') as f:
        f.write(b'foooo')

    with pa.OSFile(path, mode='rb') as os_file:
        assert not os_file.closed
    assert os_file.closed

    with pa.memory_map(path, mode='rb') as mmap_file:
        assert not mmap_file.closed
    assert mmap_file.closed

    files = [os_file,
             mmap_file]

    methods = [('tell', ()),
               ('seek', (0,)),
               ('size', ()),
               ('flush', ()),
               ('readable', ()),
               ('writable', ()),
               ('seekable', ())]

    for f in files:
        for method, args in methods:
            with pytest.raises(ValueError):
                getattr(f, method)(*args)
Code Example #4
def test_arrow_limits(self):
    def huge_memory_map(temp_dir):
        return large_memory_map(temp_dir, 100 * 1024 * 1024 * 1024)

    with pa.memory_map(huge_memory_map, mode="r+") as mmap:
        # Test that objects that are too large for Arrow throw a Python
        # exception. These tests give out of memory errors on Travis and need
        # to be run on a machine with lots of RAM.
        x = 2 ** 29 * [1.0]
        serialization_roundtrip(x, mmap)
        del x
        x = 2 ** 29 * ["s"]
        serialization_roundtrip(x, mmap)
        del x
        x = 2 ** 29 * [["1"], 2, 3, [{"s": 4}]]
        serialization_roundtrip(x, mmap)
        del x
        x = 2 ** 29 * [{"s": 1}] + 2 ** 29 * [1.0]
        serialization_roundtrip(x, mmap)
        del x
        x = np.zeros(2 ** 25)
        serialization_roundtrip(x, mmap)
        del x
        x = [np.zeros(2 ** 18) for _ in range(2 ** 7)]
        serialization_roundtrip(x, mmap)
        del x
Code Example #5
def test_numpy_immutable(large_memory_map):
    with pa.memory_map(large_memory_map, mode="r+") as mmap:
        obj = np.zeros([10])
        mmap.seek(0)
        pa.serialize_to(obj, mmap, serialization_context)
        mmap.seek(0)
        result = pa.deserialize_from(mmap, None, serialization_context)
        with pytest.raises(ValueError):
            result[0] = 1.0
Code Example #6
def test_torch_serialization(large_memory_map):
    pytest.importorskip("torch")
    import torch
    with pa.memory_map(large_memory_map, mode="r+") as mmap:
        # These are the only types that are supported for the
        # PyTorch to NumPy conversion
        for t in ["float32", "float64",
                  "uint8", "int16", "int32", "int64"]:
            obj = torch.from_numpy(np.random.randn(1000).astype(t))
            serialization_roundtrip(obj, mmap)
Code Example #7
File: test_io.py Project: StevenMPhillips/arrow
def test_memory_map_writer():
    SIZE = 4096
    arr = np.random.randint(0, 256, size=SIZE).astype('u1')
    data = arr.tobytes()[:SIZE]

    path = guid()
    try:
        with open(path, 'wb') as f:
            f.write(data)

        f = pa.memory_map(path, mode='r+b')

        f.seek(10)
        f.write(b'peekaboo')
        assert f.tell() == 18

        f.seek(10)
        assert f.read(8) == b'peekaboo'

        f2 = pa.memory_map(path, mode='r+b')

        f2.seek(10)
        f2.write(b'booapeak')
        f2.seek(10)

        f.seek(10)
        assert f.read(8) == b'booapeak'

        # Does not truncate file
        f3 = pa.memory_map(path, mode='w')
        f3.write(b'foo')

        with pa.memory_map(path) as f4:
            assert f4.size() == SIZE

        with pytest.raises(IOError):
            f3.read(5)

        f.seek(0)
        assert f.read(3) == b'foo'
    finally:
        _try_delete(path)
Code Example #8
def test_read_tensor(tmpdir):
    # Create and write a tensor
    data = np.random.randn(10, 4)
    tensor = pa.Tensor.from_numpy(data)
    data_size = pa.get_tensor_size(tensor)
    path = os.path.join(str(tmpdir), 'pyarrow-tensor-ipc-read-tensor')
    write_mmap = pa.create_memory_map(path, data_size)
    pa.write_tensor(tensor, write_mmap)
    # Try to read tensor
    read_mmap = pa.memory_map(path, mode='r')
    array = pa.read_tensor(read_mmap).to_numpy()
    np.testing.assert_equal(data, array)
Code Example #9
File: arrow_reader.py Project: recitalAI/nlp
 def _get_dataset_from_filename(self, filename_skip_take):
     """Returns a Dataset instance from given (filename, skip, take)."""
     filename, skip, take = (
         filename_skip_take["filename"],
         filename_skip_take["skip"] if "skip" in filename_skip_take else None,
         filename_skip_take["take"] if "take" in filename_skip_take else None,
     )
     mmap = pa.memory_map(filename)
     f = pa.ipc.open_stream(mmap)
     pa_table = f.read_all()
     if skip is not None and take is not None:
         pa_table = pa_table.slice(skip, take)
     return pa_table
Code Example #10
 def gen_by_batch():
     import numpy as np
     import math
     if 'data_mmap_file_ref' not in file_ref:
         file_ref['data_mmap_file_ref'] = pa.memory_map(
             context_id + "/__input__.dat")
     reader = pa.ipc.open_file(file_ref['data_mmap_file_ref'])
     num_record_batches = reader.num_record_batches
     for i in range(num_record_batches):
         df = reader.get_batch(i).to_pandas()
         for small_batch in np.array_split(
                 df, math.floor(df.shape[0] / batch_size)):
             yield small_batch
Code Example #11
def test_memory_map_writer(tmpdir):
    SIZE = 4096
    arr = np.random.randint(0, 256, size=SIZE).astype('u1')
    data = arr.tobytes()[:SIZE]

    path = os.path.join(str(tmpdir), guid())
    with open(path, 'wb') as f:
        f.write(data)

    f = pa.memory_map(path, mode='r+b')

    f.seek(10)
    f.write(b'peekaboo')
    assert f.tell() == 18

    f.seek(10)
    assert f.read(8) == b'peekaboo'

    f2 = pa.memory_map(path, mode='r+b')

    f2.seek(10)
    f2.write(b'booapeak')
    f2.seek(10)

    f.seek(10)
    assert f.read(8) == b'booapeak'

    # Does not truncate file
    f3 = pa.memory_map(path, mode='w')
    f3.write(b'foo')

    with pa.memory_map(path) as f4:
        assert f4.size() == SIZE

    with pytest.raises(IOError):
        f3.read(5)

    f.seek(0)
    assert f.read(3) == b'foo'
Code Example #12
File: arrow_dataset.py Project: zedauna/nlp-1
 def from_file(cls,
               filename: str,
               info: Optional[Any] = None,
               split: Optional[Any] = None):
     """ Instantiate a Dataset backed by an Arrow table at filename """
     mmap = pa.memory_map(filename)
     f = pa.ipc.open_stream(mmap)
     pa_table = f.read_all()
     return cls(arrow_table=pa_table,
                data_files=[{
                    "filename": filename
                }],
                info=info,
                split=split)
Code Example #13
 def _get_dataset_from_filename(self, filename_skip_take):
     """Returns a Dataset instance from given (filename, skip, take)."""
     filename, skip, take = (
         filename_skip_take["filename"],
         filename_skip_take["skip"] if "skip" in filename_skip_take else None,
         filename_skip_take["take"] if "take" in filename_skip_take else None,
     )
     mmap = pa.memory_map(filename)
     f = pa.ipc.open_stream(mmap)
     pa_table = f.read_all()
     # here we don't want to slice an empty table, or it may segfault
     if skip is not None and take is not None and not (skip == 0 and take == len(pa_table)):
         pa_table = pa_table.slice(skip, take)
     return pa_table
Code Example #14
def test_memory_map_retain_buffer_reference(sample_disk_data):
    path, data = sample_disk_data

    cases = []
    with pa.memory_map(path, 'rb') as f:
        cases.append((f.read_buffer(100), data[:100]))
        cases.append((f.read_buffer(100), data[100:200]))
        cases.append((f.read_buffer(100), data[200:300]))

    # Call gc.collect() for good measure
    gc.collect()

    for buf, expected in cases:
        assert buf.to_pybytes() == expected
Code Example #15
File: test_io.py Project: sunchao/arrow
def test_memory_map_retain_buffer_reference(sample_disk_data):
    path, data = sample_disk_data

    cases = []
    with pa.memory_map(path, 'rb') as f:
        cases.append((f.read_buffer(100), data[:100]))
        cases.append((f.read_buffer(100), data[100:200]))
        cases.append((f.read_buffer(100), data[200:300]))

    # Call gc.collect() for good measure
    gc.collect()

    for buf, expected in cases:
        assert buf.to_pybytes() == expected
Code Example #16
def test_native_file_permissions(tmpdir):
    # ARROW-10124: permissions of created files should follow umask
    cur_umask = os.umask(0o002)
    os.umask(cur_umask)

    path = os.path.join(str(tmpdir), guid())
    with pa.OSFile(path, mode='w'):
        pass
    assert os.stat(path).st_mode & 0o777 == 0o666 & ~cur_umask

    path = os.path.join(str(tmpdir), guid())
    with pa.memory_map(path, 'w'):
        pass
    assert os.stat(path).st_mode & 0o777 == 0o666 & ~cur_umask
Code Example #17
File: transfer.py Project: swipswaps/orchest
def _deserialize_output_disk(full_path: str, serialization: str) -> Any:
    """Gets data from disk.

    Raises:
        ValueError: If the serialization argument is unsupported.
    """
    file_path = f"{full_path}.{serialization}"
    if serialization == Serialization.ARROW_TABLE.name:
        # pa.memory_map is for reading (zero-copy)
        with pa.memory_map(file_path, "rb") as input_file:
            # read all batches as a table
            stream = pa.ipc.open_stream(input_file)
            return stream.read_all()
    elif serialization == Serialization.ARROW_BATCH.name:
        with pa.memory_map(file_path, "rb") as input_file:
            # return the first batch (the only one)
            stream = pa.ipc.open_stream(input_file)
            return [b for b in stream][0]
    elif serialization == Serialization.PICKLE.name:
        # https://docs.python.org/3/library/pickle.html
        # The argument file must have three methods:
        # * ``read()`` that takes an integer argument,
        # * ``readinto()`` that takes a buffer argument,
        # * ``readline()`` that requires no arguments, similar to the
        #   ``io.BufferedIOBase`` interface.

        # https://arrow.apache.org/docs/python/generated/pyarrow.MemoryMappedFile.html#pyarrow.MemoryMappedFile
        # While ``memory_map`` does not support readline, given the
        # docs, using ``pickle.load`` on a memory mapped file would
        # work, however, it was safer to not take the risk and use the
        # normal python file.
        with open(file_path, "rb") as input_file:
            return pickle.load(input_file)
    else:
        raise ValueError(
            f"The specified serialization of '{serialization}' is unsupported."
        )
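The reader above implies a matching write side. The following is a rough sketch of what that writer could look like; the function name is hypothetical, and the use of pa.OSFile with pa.ipc.new_stream is an assumption rather than code from the orchest project.

import pickle

import pyarrow as pa


def _serialize_output_disk_sketch(data, full_path: str, serialization: str) -> None:
    # Hypothetical counterpart of _deserialize_output_disk: write an Arrow
    # Table or RecordBatch as an IPC stream, or fall back to pickle.
    file_path = f"{full_path}.{serialization}"
    if serialization in ("ARROW_TABLE", "ARROW_BATCH"):
        with pa.OSFile(file_path, "wb") as sink:
            with pa.ipc.new_stream(sink, data.schema) as writer:
                writer.write(data)  # write() accepts both Table and RecordBatch
    else:
        with open(file_path, "wb") as out:
            pickle.dump(data, out)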
Code Example #18
File: __init__.py Project: yaotianzhang/vaex
def open(path,
         mode='rb',
         fs_options={},
         fs=None,
         for_arrow=False,
         mmap=False,
         encoding="utf8"):
    if is_file_object(path):
        return path
    fs, path = parse(path, fs_options=fs_options, fs=fs, for_arrow=for_arrow)
    if fs is None:
        path = stringyfy(path)
        if for_arrow:
            if fs_options:
                raise ValueError(
                    f'fs_options not supported for local files. You passed: {repr(fs_options)}.'
                )
            if mmap:
                return pa.memory_map(path, mode)
            else:
                return pa.OSFile(path, mode)
        else:
            if 'b' not in mode:
                return normal_open(path, mode, encoding=encoding)
            else:
                return normal_open(path, mode)
    if mode == 'rb':

        def create():
            return fs.open_input_file(path)
    elif mode == "r":

        def create():
            fa = fs.open_input_file(path)
            fp = FileProxy(fa, path, lambda: fs.open_input_file(path))
            return io.TextIOWrapper(fp, encoding=encoding)
    elif mode == 'wb':

        def create():
            return fs.open_output_stream(path)
    elif mode == "w":

        def create():
            fa = fs.open_output_stream(path)
            fp = FileProxy(fa, path, lambda: fs.open_output_stream(path))
            return io.TextIOWrapper(fp, encoding=encoding)
    else:
        raise ValueError(f'Only mode=rb/wb/r/w are supported, not {mode}')
    return FileProxy(create(), path, create)
Code Example #19
File: dataset.py Project: ramm8469/vaex
def open(filename, as_numpy=False):
    source = pa.memory_map(filename)
    try:
        # first we try if it opens as stream
        reader = pa.ipc.open_stream(source)
    except pa.lib.ArrowInvalid:
        # if not, we open as file
        reader = pa.ipc.open_file(source)
        # for some reason this reader is not iterable
        batches = [reader.get_batch(i) for i in range(reader.num_record_batches)]
    else:
        # if a stream, we're good
        batches = reader  # this reader is iterable
    table = pa.Table.from_batches(batches)
    return from_table(table, as_numpy=as_numpy)
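The try/except above distinguishes Arrow's streaming IPC format from its random-access file format. For reference, here is a small sketch of how each variant can be written; the file names and column values are illustrative.

import pyarrow as pa

batch = pa.record_batch([pa.array([1, 2, 3])], names=["x"])

# Streaming format: readable with pa.ipc.open_stream()
with pa.OSFile("example_stream.arrow", "wb") as sink:
    with pa.ipc.new_stream(sink, batch.schema) as writer:
        writer.write_batch(batch)

# File (random access) format: readable with pa.ipc.open_file()
with pa.OSFile("example_file.arrow", "wb") as sink:
    with pa.ipc.new_file(sink, batch.schema) as writer:
        writer.write_batch(batch)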
Code Example #20
 def _encrypt(self, aTableName):
     readPath = (self._path + "/" + aTableName + ".pq")
     writePath = (self._path + "/" + aTableName + ".enc")
     with open(writePath, 'wb') as out_file:
         recipient_key = RSA.import_key(open(self._dbPub).read())
         session_key = get_random_bytes(16)
         cipher_rsa = PKCS1_OAEP.new(recipient_key)
         out_file.write(cipher_rsa.encrypt(session_key))
         cipher_aes = AES.new(session_key, AES.MODE_EAX)
         ciphertext, tag = \
             cipher_aes.encrypt_and_digest(pa.memory_map(readPath).read())
         out_file.write(cipher_aes.nonce)
         out_file.write(tag)
         out_file.write(ciphertext)
     os.remove(readPath)
Code Example #21
File: inmemory.py Project: SurrealAI/caraml
 def __init__(self, filename, debug=False):
     """
         Initializer
     Args:
         filename: location to store the temporary file
         debug: print additional debug information
     """
     self.debug = debug
     if self.debug:
         print('Shared memory with name {} created'.format(filename))
         memory_usage[filename] = True
     self.filename = filename
     self.file = pa.memory_map(filename)
     self.buffer = self.file.read_buffer()
     self.data = pa.deserialize(self.buffer)
     self.deleted = False
Code Example #22
    def __init__(self, arrow_file_name: Path) -> None:
        self._arrow_file_name = str(arrow_file_name)

        LOGGER.debug(f"init with arrow file: {self._arrow_file_name}")
        timer = PerfTimer()

        source = pa.memory_map(self._arrow_file_name, "r")
        et_open_ms = timer.lap_ms()

        reader = pa.ipc.RecordBatchFileReader(source)
        et_create_reader_ms = timer.lap_ms()

        # Discover columns and realizations that are present in the file
        column_names_on_file = reader.schema.names
        self._vector_names: List[str] = [
            colname
            for colname in column_names_on_file
            if colname not in ["DATE", "REAL", "ENSEMBLE"]
        ]
        et_find_vec_names_ms = timer.lap_ms()

        unique_realizations_on_file = reader.read_all().column("REAL").unique()
        self._realizations: List[int] = unique_realizations_on_file.to_pylist()
        et_find_real_ms = timer.lap_ms()

        # We'll try and keep the file open for the life-span of the provider.
        # Done to try and stop blobfuse from throwing the file out of its cache.
        self._cached_reader = reader

        # For testing, uncomment code below and we will be more aggressive
        # and keep the "raw" table in memory
        self._cached_full_table = None
        # self._cached_full_table = reader.read_all()

        LOGGER.debug(
            f"init took: {timer.elapsed_s():.2f}s, "
            f"(open={et_open_ms}ms, create_reader={et_create_reader_ms}ms, "
            f"find_vec_names={et_find_vec_names_ms}ms, find_real={et_find_real_ms}ms), "
            f"#vector_names={len(self._vector_names)}, "
            f"#realization={len(self._realizations)}"
        )

        if not self._realizations:
            raise ValueError("Init from backing store failed NO realizations")
        if not self._vector_names:
            raise ValueError("Init from backing store failed NO vector_names")
Code Example #23
File: dataset.py Project: ankitskvmdam/vaex-arrow
 def _load(self):
     source = pa.memory_map(self.path)
     try:
         # first we try if it opens as stream
         reader = pa.ipc.open_stream(source)
     except pa.lib.ArrowInvalid:
         # if not, we open as file
         reader = pa.ipc.open_file(source)
         # for some reason this reader is not iterable
         batches = [
             reader.get_batch(i) for i in range(reader.num_record_batches)
         ]
     else:
         # if a stream, we're good
         batches = reader  # this reader is iterable
     table = pa.Table.from_batches(batches)
     self._load_table(table)
Code Example #24
File: serialization.py Project: m-wiesner/lhotse
 def _init_table_from_path(self):
     if '.jsonl' in self.path.suffixes:
         # Can read ".jsonl" or ".jsonl.gz"
         import pyarrow.json as paj
         self.table = paj.read_json(
             str(self.path),
             read_options=paj.ReadOptions(
                 # magic constants:
                 # 894 - estimated average number of bytes per JSON item manifest
                 # 10000 - how many items we want to have in a chunk (Arrow's "batch")
                 block_size=894 * 10000))
     elif '.arrow' == self.path.suffixes[-1]:
         # Can read ".arrow"
         import pyarrow as pa
         mmap = pa.memory_map(str(self.path))
         stream = pa.ipc.open_file(mmap)
         self.table = stream.read_all()
     else:
         raise ValueError(f"Unknown LazyDict file format : '{self.path}'")
Code Example #25
File: __init__.py Project: smt2009/vaex
def open_for_arrow(path, mode='rb', fs_options={}, mmap=False):
    '''When the file will be passed to arrow, we want a file object that arrow likes.

    This might avoid performance issues with the GIL, or call overhead.
    '''
    import pyarrow as pa
    if is_file_object(path):
        return path
    path = stringyfy(path)
    scheme, _ = split_scheme(path)
    if scheme is None:
        if fs_options:
            raise ValueError(f'fs_options not supported for local files. You passed: {repr(fs_options)}.')
        if mmap:
            return pa.memory_map(path, mode)
        else:
            return pa.OSFile(path, mode)
    else:
        return open(path, mode=mode, fs_options=fs_options).file
Code Example #26
def test_native_file_raises_ValueError_after_close(tmpdir):
    path = os.path.join(str(tmpdir), guid())
    with open(path, 'wb') as f:
        f.write(b'foooo')

    with pa.OSFile(path, mode='rb') as os_file:
        assert not os_file.closed
    assert os_file.closed

    with pa.memory_map(path, mode='rb') as mmap_file:
        assert not mmap_file.closed
    assert mmap_file.closed

    files = [os_file, mmap_file]

    methods = [('tell', ()), ('seek', (0, )), ('size', ()), ('flush', ()),
               ('readable', ()), ('writable', ()), ('seekable', ())]

    for f in files:
        for method, args in methods:
            with pytest.raises(ValueError):
                getattr(f, method)(*args)
Code Example #27
def test_datetime_serialization(large_memory_map):
    data = [
        #  Principia Mathematica published
        datetime.datetime(year=1687, month=7, day=5),

        # Some random date
        datetime.datetime(year=1911,
                          month=6,
                          day=3,
                          hour=4,
                          minute=55,
                          second=44),
        # End of WWI
        datetime.datetime(year=1918, month=11, day=11),

        # Beginning of UNIX time
        datetime.datetime(year=1970, month=1, day=1),

        # The Berlin wall falls
        datetime.datetime(year=1989, month=11, day=9),

        # Another random date
        datetime.datetime(year=2011,
                          month=6,
                          day=3,
                          hour=4,
                          minute=0,
                          second=3),
        # Another random date
        datetime.datetime(year=1970,
                          month=1,
                          day=3,
                          hour=4,
                          minute=0,
                          second=0)
    ]
    with pa.memory_map(large_memory_map, mode="r+") as mmap:
        for d in data:
            serialization_roundtrip(d, mmap)
Code Example #28
File: __init__.py Project: kPsarakis/geovaex
def open(path):
    """Opens an arrow spatial file.
    Parameters:
        path (string): The file's full path.
    Returns:
        (object) A GeoDataFrame object.
    """
    source = pa.memory_map(path)
    try:
        # first we try if it opens as stream
        reader = pa.ipc.open_stream(source)
    except pa.lib.ArrowInvalid:
        # if not, we open as file
        reader = pa.ipc.open_file(source)
        # for some reason this reader is not iterable
        batches = [
            reader.get_batch(i) for i in range(reader.num_record_batches)
        ]
    else:
        # if a stream, we're good
        batches = reader  # this reader is iterable
    table = pa.Table.from_batches(batches)
    if table.schema.metadata is not None and b'geovaex version' in table.schema.metadata:
        metadata = table.schema.metadata
        print(f"Opened file {os.path.basename(path)}, "
              f"created by geovaex v{metadata[b'geovaex version'].decode()} "
              f"using {metadata[b'driver'].decode()} driver.")
        df = from_arrow_spatial_table(table)
        has_geometry = df.geometry.get_raw_geometry().null_count != len(
            df.geometry)
        if has_geometry:
            return df
        table = table.drop(['geometry'])

    warnings.warn('Not a spatial arrow file. Returning a Vaex DataFrame.')
    df = from_arrow_table(table).copy()
    return df
Code Example #29
File: scdata.py Project: das-projects/singlecell
 def apply(
         self,
         function,
         axis=None,
         in_place: bool = True
 ) -> Optional[Union[pa.RecordBatchFileReader, "SCData"]]:
     scdata = self.to_pandas().apply(function,
                                     axis=axis,
                                     result_type='broadcast')
     if in_place:
         scdata = self.ensure_scdata_format(pa.Table.from_pandas(scdata),
                                            self.obs, self.var, self.uns,
                                            self.obsm, self.varm)
         self.filetype = "arrow"
         with open(self.arrow_file, 'bw') as f:
             writer = pa.RecordBatchFileWriter(f, scdata.schema)
             writer.write(scdata)
             writer.close()
         self.memory_mapped_dataset = pa.memory_map(self.arrow_file, 'r')
         return self.to_memory()
     else:
         return SCData(scdata, self.obs, self.var, self.uns, self.obsm,
                       self.varm)
Code Example #30
def memory_and_io_interfaces_example():
	# pyarrow.Buffer.

	data = b"abcdefghijklmnopqrstuvwxyz"

	# Creating a Buffer in this way does not allocate any memory; it is a zero-copy view on the memory exported from the data bytes object.
	buf = pa.py_buffer(data)
	# External memory, under the form of a raw pointer and size, can also be referenced using the foreign_buffer() function.
	#buf = pa.foreign_buffer(data)

	print("buf = {}.".format(buf))
	print("buf.size = {}.".format(buf.size))

	print("memoryview(buf) = {}.".format(memoryview(buf)))
	print("buf.to_pybytes() = {}.".format(buf.to_pybytes()))

	#--------------------
	# Memory pools.

	print("pa.total_allocated_bytes() = {}.".format(pa.total_allocated_bytes()))

	buf = pa.allocate_buffer(1024, resizable=True)
	print("pa.total_allocated_bytes() = {}.".format(pa.total_allocated_bytes()))

	buf.resize(2048)
	print("pa.total_allocated_bytes() = {}.".format(pa.total_allocated_bytes()))

	buf = None
	print("pa.total_allocated_bytes() = {}.".format(pa.total_allocated_bytes()))

	print("pa.default_memory_pool().backend_name = {}.".format(pa.default_memory_pool().backend_name))

	#--------------------
	# Input and output streams.

	buf = memoryview(b"some data")
	stream = pa.input_stream(buf)

	print("stream.read(4) = {}.".format(stream.read(4)))

	import gzip
	with gzip.open("./example.gz", "wb") as f:
		f.write(b"some data\n" * 3)

	stream = pa.input_stream("./example.gz")
	print("stream.read() = {}.".format(stream.read()))

	with pa.output_stream("./example1.dat") as stream:
		stream.write(b"some data")

	f = open("./example1.dat", "rb")
	print("f.read() = {}.".format(f.read()))

	#--------------------
	# On-disk and memory mapped files.

	# Using regular Python.
	with open("./example2.dat", "wb") as f:
		f.write(b"some example data")

	file_obj = pa.OSFile("./example2.dat")
	print("file_obj.read(4) = {}.".format(file_obj.read(4)))

	# Using pyarrow's OSFile class.
	with pa.OSFile("./example3.dat", "wb") as f:
		f.write(b"some example data")

	mmap = pa.memory_map("./example3.dat")
	print("mmap.read(4) = {}.".format(mmap.read(4)))

	mmap.seek(0)
	buf = mmap.read_buffer(4)
	print("buf = {}.".format(buf))
	print("buf.to_pybytes() = {}.".format(buf.to_pybytes()))

	#--------------------
	# In-memory reading and writing.

	writer = pa.BufferOutputStream()
	writer.write(b"hello, friends")
	buf = writer.getvalue()
	print("buf = {}.".format(buf))
	print("buf.size = {}.".format(buf.size))

	reader = pa.BufferReader(buf)
	reader.seek(7)
	print("reader.read(7) = {}.".format(reader.read(7)))
Code Example #31
File: scdata.py Project: das-projects/singlecell
 def update_scdata(self, scdata):  # TODO: Only accepts one file for now
     ds.write_dataset(scdata, self.data_interim, format="arrow")
     dataset = ds.dataset(self.data_interim, format="arrow")
     self.filetype = "arrow"
     self.memory_mapped_dataset = pa.memory_map(dataset.files[0], 'r')
Code Example #32
def test_primitive_serialization(large_memory_map):
    with pa.memory_map(large_memory_map, mode="r+") as mmap:
        for obj in PRIMITIVE_OBJECTS:
            serialization_roundtrip(obj, mmap)
Code Example #33
File: test_io.py Project: zhu2856061/arrow
def test_native_file_open_error():
    with assert_file_not_found():
        pa.OSFile('non_existent_file', 'rb')
    with assert_file_not_found():
        pa.memory_map('non_existent_file', 'rb')
Code Example #34
File: test_io.py Project: sunchao/arrow
def test_memory_zero_length(tmpdir):
    path = os.path.join(str(tmpdir), guid())
    f = open(path, 'wb')
    f.close()
    with pa.memory_map(path, mode='r+b') as memory_map:
        assert memory_map.size() == 0
Code Example #35
File: mmap.py Project: clarkfitzg/phd_research
"""
Fri Jul 14 11:36:58 PDT 2017

Goal:
Share a memory mapped file between two Python processes

Following
https://arrow.apache.org/docs/python/memory.html#on-disk-and-memory-mapped-files
"""

import pyarrow as pa


# For a more realistic use case this would be a Parquet file
fname = '../example.dat'
with open(fname, 'wb') as f:
    f.write(b'some example data')


mmap = pa.memory_map(fname, 'r')

mmap.read()
mmap.seek(0)

# Modify the actual file
with open(fname, 'wb') as f:
    f.write(b'SOME EXAMPLE DATA')

# Now we see the modified contents, even from a different process.
mmap.read()
Code Example #36
File: serialize.py Project: overshiki/datasets
def readBuf(name):
	mmap = pa.memory_map(name)
	buf = mmap.read_buffer()
	return buf
Code Example #37
 def from_file(cls, filename: str):
     """ Instantiate a Dataset backed by an Arrow table at filename """
     mmap = pa.memory_map(filename)
     f = pa.ipc.open_stream(mmap)
     pa_table = f.read_all()
     return cls(arrow_table=pa_table, data_files=[{"filename": filename}])
Code Example #38
 def _check_output(self, output):
     mmap = pa.BufferReader(output) if isinstance(output, pa.Buffer) else pa.memory_map(output)
     f = pa.ipc.open_stream(mmap)
     pa_table: pa.Table = f.read_all()
     self.assertDictEqual(pa_table.to_pydict(), {"col_1": ["foo", "bar"], "col_2": [1, 2]})
     del pa_table
Code Example #39
def test_custom_serialization(large_memory_map):
    with pa.memory_map(large_memory_map, mode="r+") as mmap:
        for obj in CUSTOM_OBJECTS:
            serialization_roundtrip(obj, mmap)
Code Example #40
def test_default_dict_serialization(large_memory_map):
    pytest.importorskip("cloudpickle")
    with pa.memory_map(large_memory_map, mode="r+") as mmap:
        obj = defaultdict(lambda: 0, [("hello", 1), ("world", 2)])
        serialization_roundtrip(obj, mmap)
Code Example #41
def test_numpy_serialization(large_memory_map):
    with pa.memory_map(large_memory_map, mode="r+") as mmap:
        for t in ["bool", "int8", "uint8", "int16", "uint16", "int32",
                  "uint32", "float16", "float32", "float64"]:
            obj = np.random.randint(0, 10, size=(100, 100)).astype(t)
            serialization_roundtrip(obj, mmap)
Code Example #42
File: mmap2.py Project: clarkfitzg/phd_research
import pyarrow as pa

fname = 'example.dat'

mmap = pa.memory_map(fname)

mmap.read()

mmap.seek(0)

mmap.read()
Code Example #43
File: write_vqa.py Project: Jxu-Thu/virtex
def make_arrow(root, dataset_root):
    with open(f"{root}/v2_OpenEnded_mscoco_train2014_questions.json",
              "r") as fp:
        questions_train2014 = json.load(fp)["questions"]
    with open(f"{root}/v2_OpenEnded_mscoco_val2014_questions.json", "r") as fp:
        questions_val2014 = json.load(fp)["questions"]
    with open(f"{root}/v2_OpenEnded_mscoco_test2015_questions.json",
              "r") as fp:
        questions_test2015 = json.load(fp)["questions"]
    with open(f"{root}/v2_OpenEnded_mscoco_test-dev2015_questions.json",
              "r") as fp:
        questions_test_dev2015 = json.load(fp)["questions"]

    with open(f"{root}/v2_mscoco_train2014_annotations.json", "r") as fp:
        annotations_train2014 = json.load(fp)["annotations"]
    with open(f"{root}/v2_mscoco_val2014_annotations.json", "r") as fp:
        annotations_val2014 = json.load(fp)["annotations"]

    annotations = dict()

    # One image may have multiple questions; aggregate them per image here
    for split, questions in zip(
        ["train", "val", "test", "test-dev"],
        [
            questions_train2014,
            questions_val2014,
            questions_test2015,
            questions_test_dev2015,
        ],
    ):
        _annot = defaultdict(dict)
        for q in tqdm(questions):
            _annot[q["image_id"]][q["question_id"]] = [q["question"]]

        annotations[split] = _annot

    all_major_answers = list()
    # Collect all the answers
    for split, annots in zip(
        ["train", "val"],
        [annotations_train2014, annotations_val2014],
    ):
        _annot = annotations[split]
        for q in tqdm(annots):
            all_major_answers.append(q["multiple_choice_answer"])

    all_major_answers = [
        normalize_word(word) for word in tqdm(all_major_answers)
    ]
    counter = {k: v for k, v in Counter(all_major_answers).items() if v >= 9}
    ans2label = {k: i for i, k in enumerate(counter.keys())}
    label2ans = list(counter.keys())

    for split, annots in zip(
        ["train", "val"],
        [annotations_train2014, annotations_val2014],
    ):
        _annot = annotations[split]
        for q in tqdm(annots):
            answers = q["answers"]
            answer_count = {}
            for answer in answers:
                answer_ = answer["answer"]
                answer_count[answer_] = answer_count.get(answer_, 0) + 1

            labels = []
            scores = []
            for answer in answer_count:
                if answer not in ans2label:
                    continue
                labels.append(ans2label[answer])
                score = get_score(answer_count[answer])
                scores.append(score)

            _annot[q["image_id"]][q["question_id"]].append({
                "labels": labels,
                "scores": scores,
            })

    # _annot[q["image_id"]][q["question_id"]] = ['What is this photo taken looking through?', {'labels': [0], 'scores': [1.0]}]
    # Drop questions that have no labels
    for split in ["train", "val"]:
        filtered_annot = dict()
        for ik, iv in annotations[split].items():
            # ik image_id
            # iv : {458752000: ['What is this photo taken looking through?', {'labels': [0], 'scores': [1.0]}],
            # 458752001: ['What position is this man playing?', {'labels': [1, 67],
            # 'scores': [1.0, 0.3]}], 458752002: ['What color is the players shirt?',
            # {'labels': [2], 'scores': [1.0]}], 458752003: ['Is this man a professional baseball player?',
            # {'labels': [3, 9], 'scores': [1.0, 0.3]}]}
            new_q = dict()
            for qk, qv in iv.items():
                # qv : ['What is this photo taken looking through?', {'labels': [0], 'scores': [1.0]}]
                if len(qv[1]["labels"]) != 0:
                    new_q[qk] = qv
            if len(new_q) != 0:
                filtered_annot[ik] = new_q
        annotations[split] = filtered_annot

    for split in [
            "train",
            "val",
            "test",
            "test-dev",
    ]:
        annot = annotations[split]
        split_name = {
            "train": "train2014",
            "val": "val2014",
            "test": "test2015",
            "test-dev": "test2015",
        }[split]
        paths = list(glob(f"{root}/{split_name}/*.jpg"))
        random.shuffle(paths)
        annot_paths = [
            path for path in paths
            if int(path.split("/")[-1].split("_")[-1][:-4]) in annot
        ]

        if len(paths) == len(annot_paths):
            print("all images have caption annotations")
        else:
            print("not all images have caption annotations")
        print(
            len(paths),
            len(annot_paths),
            len(annot),
        )

        bs = [
            path2rest(path, split, annotations, label2ans)
            for path in tqdm(annot_paths)
        ]

        dataframe = pd.DataFrame(
            bs,
            columns=[
                "image",
                "questions",
                "answers",
                "answer_labels",
                "answer_scores",
                "image_id",
                "question_id",
                "split",
            ],
        )

        table = pa.Table.from_pandas(dataframe)

        os.makedirs(dataset_root, exist_ok=True)
        with pa.OSFile(f"{dataset_root}/vqav2_{split}.arrow", "wb") as sink:
            with pa.RecordBatchFileWriter(sink, table.schema) as writer:
                writer.write_table(table)

    table = pa.ipc.RecordBatchFileReader(
        pa.memory_map(f"{dataset_root}/vqav2_val.arrow", "r")).read_all()

    pdtable = table.to_pandas()

    df1 = pdtable[:-1000]
    df2 = pdtable[-1000:]

    df1 = pa.Table.from_pandas(df1)
    df2 = pa.Table.from_pandas(df2)

    with pa.OSFile(f"{dataset_root}/vqav2_trainable_val.arrow", "wb") as sink:
        with pa.RecordBatchFileWriter(sink, df1.schema) as writer:
            writer.write_table(df1)

    with pa.OSFile(f"{dataset_root}/vqav2_rest_val.arrow", "wb") as sink:
        with pa.RecordBatchFileWriter(sink, df2.schema) as writer:
            writer.write_table(df2)
Code Example #44
tables_files = {
    "mediacloud": [
        pjoin(usenews_arrows19, "mediacloud2019.arrow"),
        pjoin(usenews_arrows20, "mediacloud2020.arrow"),
    ],
    "crowdtangle": [
        pjoin(usenews_arrows19, "crowdtangle2019.arrow"),
        pjoin(usenews_arrows20, "crowdtangle2020.arrow"),
    ]
}
tables = {}

for name, files in tables_files.items():
    sub_tables = []
    for file in files:
        source = pyarrow.memory_map(file, 'r')
        sub_tables.append(pyarrow.ipc.RecordBatchFileReader(source).read_all())
    tables[name] = pyarrow.concat_tables(sub_tables)

tables["crowdtangle"]["link"].map(urlnorm)
tables["mediacloud"]["guid"].map(urlnorm)

joined = pandas.merge(
    tables["crowdtangle"].groupby(["link"]).sum(),
    tables["mediacloud"],
    left_on="link",
    right_on="guid",
)

with pyarrow.OSFile(sys.argv[3], 'wb') as sink:
    with pyarrow.RecordBatchFileWriter(sink, joined.schema) as writer:
        writer.write_table(joined)
Code Example #45
def test_complex_serialization(large_memory_map):
    with pa.memory_map(large_memory_map, mode="r+") as mmap:
        for obj in COMPLEX_OBJECTS:
            serialization_roundtrip(obj, mmap)
Code Example #46
def test_memory_zero_length(tmpdir):
    path = os.path.join(str(tmpdir), guid())
    f = open(path, 'wb')
    f.close()
    with pa.memory_map(path, mode='r+b') as memory_map:
        assert memory_map.size() == 0
Code Example #47
File: py_arrow.py Project: starmap0312/python
with open('example2.dat', 'wb') as fout:
    fout.write(b'some example data')
# b) using pyarrow's OSFile class:
with pa.OSFile('example3.dat', 'wb') as fout:
    fout.write(b'some example data')
# For reading files, you can use OSFile or MemoryMappedFile:
#   for OSFile, it allocates new memory on each read, like Python file objects
#   for memory maps, the library constructs a buffer referencing the mapped memory without any memory allocation or copying
# a) OSFile([filepath]):
file_obj = pa.OSFile('example2.dat')
print(file_obj.read(4))  # b'some': this allocates new memory when read
# note:
#   using OSFile for read() & write() is more efficient than using standard operating system-level file APIs

# b) memory_map([filepath]): this opens the memory map at file path
mmap = pa.memory_map('example3.dat')
print(
    mmap.read(4)
)  # b'some': this DOES NOT allocate new memory, as it references the mapped memory when read

# read() vs. read_buffer()
# read(): this implements the standard Python file read API
# read_buffer(): this reads into an Arrow Buffer object
print(mmap.seek(0))  # 0
buf = mmap.read_buffer(
    4
)  # <pyarrow.lib.Buffer object at 0x10cfc3960>: this does not allocate any memory
print(buf.to_pybytes())  # b'some': this allocates new memory

# 4) In-Memory Reading and Writing
#   for serialization and deserialization of in-memory data (arrow is more efficient than pickle)
Code Example #48
File: table.py Project: merveenoyan/datasets
def _memory_mapped_arrow_table_from_file(filename: str) -> pa.Table:
    memory_mapped_stream = pa.memory_map(filename)
    opened_stream = pa.ipc.open_stream(memory_mapped_stream)
    pa_table = opened_stream.read_all()
    return pa_table
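A possible usage sketch for the helper above, assuming a file previously written in the streaming IPC format that pa.ipc.open_stream expects; the file name and table contents are illustrative.

import pyarrow as pa

table_in = pa.table({"col_1": ["foo", "bar"], "col_2": [1, 2]})
with pa.OSFile("dataset.arrow", "wb") as sink:
    with pa.ipc.new_stream(sink, table_in.schema) as writer:
        writer.write_table(table_in)

table_out = _memory_mapped_arrow_table_from_file("dataset.arrow")
assert table_out.equals(table_in)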
Code Example #49
File: dataset.py Project: maartenbreddels/vaex
 def _load(self):
     source = pa.memory_map(self.path)
     reader = pa.ipc.open_stream(source)
     table = pa.Table.from_batches([b for b in reader])
     self._load_table(table)