Example No. 1
def test_native_file_modes(tmpdir):
    path = os.path.join(str(tmpdir), guid())
    with open(path, 'wb') as f:
        f.write(b'foooo')

    with pa.OSFile(path, mode='r') as f:
        assert f.mode == 'rb'

    with pa.OSFile(path, mode='rb') as f:
        assert f.mode == 'rb'

    with pa.OSFile(path, mode='w') as f:
        assert f.mode == 'wb'

    with pa.OSFile(path, mode='wb') as f:
        assert f.mode == 'wb'

    with open(path, 'wb') as f:
        f.write(b'foooo')

    with pa.memory_map(path, 'r') as f:
        assert f.mode == 'rb'

    with pa.memory_map(path, 'r+') as f:
        assert f.mode == 'rb+'

    with pa.memory_map(path, 'r+b') as f:
        assert f.mode == 'rb+'
Example No. 2
def test_datetime_serialization(large_memory_map):
    data = [
        #  Principia Mathematica published
        datetime.datetime(year=1687, month=7, day=5),

        # Some random date
        datetime.datetime(year=1911, month=6, day=3, hour=4,
                          minute=55, second=44),
        # End of WWI
        datetime.datetime(year=1918, month=11, day=11),

        # Beginning of UNIX time
        datetime.datetime(year=1970, month=1, day=1),

        # The Berlin wall falls
        datetime.datetime(year=1989, month=11, day=9),

        # Another random date
        datetime.datetime(year=2011, month=6, day=3, hour=4,
                          minute=0, second=3),
        # Another random date
        datetime.datetime(year=1970, month=1, day=3, hour=4,
                          minute=0, second=0)
    ]
    with pa.memory_map(large_memory_map, mode="r+") as mmap:
        for d in data:
            serialization_roundtrip(d, mmap)
Example No. 3
def test_native_file_raises_ValueError_after_close(tmpdir):
    path = os.path.join(str(tmpdir), guid())
    with open(path, 'wb') as f:
        f.write(b'foooo')

    with pa.OSFile(path, mode='rb') as os_file:
        assert not os_file.closed
    assert os_file.closed

    with pa.memory_map(path, mode='rb') as mmap_file:
        assert not mmap_file.closed
    assert mmap_file.closed

    files = [os_file,
             mmap_file]

    methods = [('tell', ()),
               ('seek', (0,)),
               ('size', ()),
               ('flush', ()),
               ('readable', ()),
               ('writable', ()),
               ('seekable', ())]

    for f in files:
        for method, args in methods:
            with pytest.raises(ValueError):
                getattr(f, method)(*args)
Example No. 4
def test_arrow_limits(self):
    def huge_memory_map(temp_dir):
        return large_memory_map(temp_dir, 100 * 1024 * 1024 * 1024)

    with pa.memory_map(huge_memory_map, mode="r+") as mmap:
        # Test that objects that are too large for Arrow throw a Python
        # exception. These tests give out of memory errors on Travis and need
        # to be run on a machine with lots of RAM.
        x = 2 ** 29 * [1.0]
        serialization_roundtrip(x, mmap)
        del x
        x = 2 ** 29 * ["s"]
        serialization_roundtrip(x, mmap)
        del x
        x = 2 ** 29 * [["1"], 2, 3, [{"s": 4}]]
        serialization_roundtrip(x, mmap)
        del x
        x = 2 ** 29 * [{"s": 1}] + 2 ** 29 * [1.0]
        serialization_roundtrip(x, mmap)
        del x
        x = np.zeros(2 ** 25)
        serialization_roundtrip(x, mmap)
        del x
        x = [np.zeros(2 ** 18) for _ in range(2 ** 7)]
        serialization_roundtrip(x, mmap)
        del x
Example No. 5
def test_numpy_immutable(large_memory_map):
    with pa.memory_map(large_memory_map, mode="r+") as mmap:
        obj = np.zeros([10])
        mmap.seek(0)
        pa.serialize_to(obj, mmap, serialization_context)
        mmap.seek(0)
        result = pa.deserialize_from(mmap, None, serialization_context)
        with pytest.raises(ValueError):
            result[0] = 1.0
Example No. 6
def test_torch_serialization(large_memory_map):
    pytest.importorskip("torch")
    import torch
    with pa.memory_map(large_memory_map, mode="r+") as mmap:
        # These are the only types that are supported for the
        # PyTorch to NumPy conversion
        for t in ["float32", "float64",
                  "uint8", "int16", "int32", "int64"]:
            obj = torch.from_numpy(np.random.randn(1000).astype(t))
            serialization_roundtrip(obj, mmap)
Example No. 7
def test_memory_map_writer():
    SIZE = 4096
    arr = np.random.randint(0, 256, size=SIZE).astype('u1')
    data = arr.tobytes()[:SIZE]

    path = guid()
    try:
        with open(path, 'wb') as f:
            f.write(data)

        f = pa.memory_map(path, mode='r+b')

        f.seek(10)
        f.write(b'peekaboo')
        assert f.tell() == 18

        f.seek(10)
        assert f.read(8) == b'peekaboo'

        f2 = pa.memory_map(path, mode='r+b')

        f2.seek(10)
        f2.write(b'booapeak')
        f2.seek(10)

        f.seek(10)
        assert f.read(8) == b'booapeak'

        # Does not truncate file
        f3 = pa.memory_map(path, mode='w')
        f3.write(b'foo')

        with pa.memory_map(path) as f4:
            assert f4.size() == SIZE

        with pytest.raises(IOError):
            f3.read(5)

        f.seek(0)
        assert f.read(3) == b'foo'
    finally:
        _try_delete(path)
Example No. 8
def test_read_tensor(tmpdir):
    # Create and write a tensor
    data = np.random.randn(10, 4)
    tensor = pa.Tensor.from_numpy(data)
    data_size = pa.get_tensor_size(tensor)
    path = os.path.join(str(tmpdir), 'pyarrow-tensor-ipc-read-tensor')
    write_mmap = pa.create_memory_map(path, data_size)
    pa.write_tensor(tensor, write_mmap)
    # Try to read tensor
    read_mmap = pa.memory_map(path, mode='r')
    array = pa.read_tensor(read_mmap).to_numpy()
    np.testing.assert_equal(data, array)
Example No. 9
 def _get_dataset_from_filename(self, filename_skip_take):
     """Returns a Dataset instance from given (filename, skip, take)."""
     filename, skip, take = (
         filename_skip_take["filename"],
         filename_skip_take["skip"] if "skip" in filename_skip_take else None,
         filename_skip_take["take"] if "take" in filename_skip_take else None,
     )
     mmap = pa.memory_map(filename)
     f = pa.ipc.open_stream(mmap)
     pa_table = f.read_all()
     if skip is not None and take is not None:
         pa_table = pa_table.slice(skip, take)
     return pa_table
Example No. 10
 def gen_by_batch():
     import numpy as np
     import math
     if 'data_mmap_file_ref' not in file_ref:
         file_ref['data_mmap_file_ref'] = pa.memory_map(
             context_id + "/__input__.dat")
     reader = pa.ipc.open_file(file_ref['data_mmap_file_ref'])
     num_record_batches = reader.num_record_batches
     for i in range(num_record_batches):
         df = reader.get_batch(i).to_pandas()
         for small_batch in np.array_split(
                 df, math.floor(df.shape[0] / batch_size)):
             yield small_batch
Example No. 11
def test_memory_map_writer(tmpdir):
    SIZE = 4096
    arr = np.random.randint(0, 256, size=SIZE).astype('u1')
    data = arr.tobytes()[:SIZE]

    path = os.path.join(str(tmpdir), guid())
    with open(path, 'wb') as f:
        f.write(data)

    f = pa.memory_map(path, mode='r+b')

    f.seek(10)
    f.write(b'peekaboo')
    assert f.tell() == 18

    f.seek(10)
    assert f.read(8) == b'peekaboo'

    f2 = pa.memory_map(path, mode='r+b')

    f2.seek(10)
    f2.write(b'booapeak')
    f2.seek(10)

    f.seek(10)
    assert f.read(8) == b'booapeak'

    # Does not truncate file
    f3 = pa.memory_map(path, mode='w')
    f3.write(b'foo')

    with pa.memory_map(path) as f4:
        assert f4.size() == SIZE

    with pytest.raises(IOError):
        f3.read(5)

    f.seek(0)
    assert f.read(3) == b'foo'
Example No. 12
 def from_file(cls,
               filename: str,
               info: Optional[Any] = None,
               split: Optional[Any] = None):
     """ Instantiate a Dataset backed by an Arrow table at filename """
     mmap = pa.memory_map(filename)
     f = pa.ipc.open_stream(mmap)
     pa_table = f.read_all()
     return cls(arrow_table=pa_table,
                data_files=[{
                    "filename": filename
                }],
                info=info,
                split=split)
Example No. 13
 def _get_dataset_from_filename(self, filename_skip_take):
     """Returns a Dataset instance from given (filename, skip, take)."""
     filename, skip, take = (
         filename_skip_take["filename"],
         filename_skip_take["skip"] if "skip" in filename_skip_take else None,
         filename_skip_take["take"] if "take" in filename_skip_take else None,
     )
     mmap = pa.memory_map(filename)
     f = pa.ipc.open_stream(mmap)
     pa_table = f.read_all()
     # here we don't want to slice an empty table, or it may segfault
     if skip is not None and take is not None and not (skip == 0 and take == len(pa_table)):
         pa_table = pa_table.slice(skip, take)
     return pa_table
Example No. 14
def test_memory_map_retain_buffer_reference(sample_disk_data):
    path, data = sample_disk_data

    cases = []
    with pa.memory_map(path, 'rb') as f:
        cases.append((f.read_buffer(100), data[:100]))
        cases.append((f.read_buffer(100), data[100:200]))
        cases.append((f.read_buffer(100), data[200:300]))

    # Call gc.collect() for good measure
    gc.collect()

    for buf, expected in cases:
        assert buf.to_pybytes() == expected
Example No. 16
def test_native_file_permissions(tmpdir):
    # ARROW-10124: permissions of created files should follow umask
    cur_umask = os.umask(0o002)
    os.umask(cur_umask)

    path = os.path.join(str(tmpdir), guid())
    with pa.OSFile(path, mode='w'):
        pass
    assert os.stat(path).st_mode & 0o777 == 0o666 & ~cur_umask

    path = os.path.join(str(tmpdir), guid())
    with pa.memory_map(path, 'w'):
        pass
    assert os.stat(path).st_mode & 0o777 == 0o666 & ~cur_umask
Example No. 17
def _deserialize_output_disk(full_path: str, serialization: str) -> Any:
    """Gets data from disk.

    Raises:
        ValueError: If the serialization argument is unsupported.
    """
    file_path = f"{full_path}.{serialization}"
    if serialization == Serialization.ARROW_TABLE.name:
        # pa.memory_map is for reading (zero-copy)
        with pa.memory_map(file_path, "rb") as input_file:
            # read all batches as a table
            stream = pa.ipc.open_stream(input_file)
            return stream.read_all()
    elif serialization == Serialization.ARROW_BATCH.name:
        with pa.memory_map(file_path, "rb") as input_file:
            # return the first batch (the only one)
            stream = pa.ipc.open_stream(input_file)
            return [b for b in stream][0]
    elif serialization == Serialization.PICKLE.name:
        # https://docs.python.org/3/library/pickle.html
        # The argument file must have three methods:
        # * ``read()`` that takes an integer argument,
        # * ``readinto()`` that takes a buffer argument,
        # * ``readline()`` that requires no arguments, similar to the
        #   ``io.BufferedIOBase`` interface.

        # https://arrow.apache.org/docs/python/generated/pyarrow.MemoryMappedFile.html#pyarrow.MemoryMappedFile
        # While ``memory_map`` does not support readline, given the
        # docs, using ``pickle.load`` on a memory mapped file would
        # work, however, it was safer to not take the risk and use the
        # normal python file.
        with open(file_path, "rb") as input_file:
            return pickle.load(input_file)
    else:
        raise ValueError(
            f"The specified serialization of '{serialization}' is unsupported."
        )
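The PICKLE branch above falls back to a regular Python file because MemoryMappedFile lacks readline(). If one wanted to stay on the memory map anyway, a minimal sketch (an assumption, not part of the original module) could read the mapped bytes into a zero-copy Arrow Buffer and hand them to pickle.loads, which accepts any bytes-like object:

import pickle

import pyarrow as pa


def _deserialize_pickle_via_mmap(file_path: str):
    # Hypothetical alternative to the open()-based branch above: map the file,
    # grab its contents as a zero-copy Arrow Buffer, and let pickle.loads
    # consume it through a memoryview (pickle copies only what it needs).
    with pa.memory_map(file_path, "rb") as mapped:
        buf = mapped.read_buffer()
        return pickle.loads(memoryview(buf))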
Example No. 18
def open(path,
         mode='rb',
         fs_options={},
         fs=None,
         for_arrow=False,
         mmap=False,
         encoding="utf8"):
    if is_file_object(path):
        return path
    fs, path = parse(path, fs_options=fs_options, fs=fs, for_arrow=for_arrow)
    if fs is None:
        path = stringyfy(path)
        if for_arrow:
            if fs_options:
                raise ValueError(
                    f'fs_options not supported for local files. You passed: {repr(fs_options)}.'
                )
            if mmap:
                return pa.memory_map(path, mode)
            else:
                return pa.OSFile(path, mode)
        else:
            if 'b' not in mode:
                return normal_open(path, mode, encoding=encoding)
            else:
                return normal_open(path, mode)
    if mode == 'rb':

        def create():
            return fs.open_input_file(path)
    elif mode == "r":

        def create():
            fa = fs.open_input_file(path)
            fp = FileProxy(fa, path, lambda: fs.open_input_file(path))
            return io.TextIOWrapper(fp, encoding=encoding)
    elif mode == 'wb':

        def create():
            return fs.open_output_stream(path)
    elif mode == "w":

        def create():
            fa = fs.open_output_stream(path)
            fp = FileProxy(fa, path, lambda: fs.open_output_stream(path))
            return io.TextIOWrapper(fp, encoding=encoding)
    else:
        raise ValueError(f'Only mode=rb/wb/r/w are supported, not {mode}')
    return FileProxy(create(), path, create)
Example No. 19
def open(filename, as_numpy=False):
    source = pa.memory_map(filename)
    try:
        # first, try to open it as a stream
        reader = pa.ipc.open_stream(source)
    except pa.lib.ArrowInvalid:
        # if not, we open as file
        reader = pa.ipc.open_file(source)
        # for some reason this reader is not iterable
        batches = [reader.get_batch(i) for i in range(reader.num_record_batches)]
    else:
        # if a stream, we're good
        batches = reader  # this reader is iterable
    table = pa.Table.from_batches(batches)
    return from_table(table, as_numpy=as_numpy)
Example No. 20
 def _encrypt(self, aTableName):
     readPath = (self._path + "/" + aTableName + ".pq")
     writePath = (self._path + "/" + aTableName + ".enc")
     with open(writePath, 'wb') as out_file:
         recipient_key = RSA.import_key(open(self._dbPub).read())
         session_key = get_random_bytes(16)
         cipher_rsa = PKCS1_OAEP.new(recipient_key)
         out_file.write(cipher_rsa.encrypt(session_key))
         cipher_aes = AES.new(session_key, AES.MODE_EAX)
         ciphertext, tag = \
             cipher_aes.encrypt_and_digest(pa.memory_map(readPath).read())
         out_file.write(cipher_aes.nonce)
         out_file.write(tag)
         out_file.write(ciphertext)
     os.remove(readPath)
Example No. 21
 def __init__(self, filename, debug=False):
     """
         Initializer
     Args:
         filename: location to store the temporary file
         debug: print additional debug information
     """
     self.debug = debug
     if self.debug:
         print('Shared memory with name {} created'.format(filename))
         memory_usage[filename] = True
     self.filename = filename
     self.file = pa.memory_map(filename)
     self.buffer = self.file.read_buffer()
     self.data = pa.deserialize(self.buffer)
     self.deleted = False
Example No. 22
    def __init__(self, arrow_file_name: Path) -> None:
        self._arrow_file_name = str(arrow_file_name)

        LOGGER.debug(f"init with arrow file: {self._arrow_file_name}")
        timer = PerfTimer()

        source = pa.memory_map(self._arrow_file_name, "r")
        et_open_ms = timer.lap_ms()

        reader = pa.ipc.RecordBatchFileReader(source)
        et_create_reader_ms = timer.lap_ms()

        # Discover columns and realizations that are present in the file
        column_names_on_file = reader.schema.names
        self._vector_names: List[str] = [
            colname
            for colname in column_names_on_file
            if colname not in ["DATE", "REAL", "ENSEMBLE"]
        ]
        et_find_vec_names_ms = timer.lap_ms()

        unique_realizations_on_file = reader.read_all().column("REAL").unique()
        self._realizations: List[int] = unique_realizations_on_file.to_pylist()
        et_find_real_ms = timer.lap_ms()

        # We'll try and keep the file open for the life-span of the provider.
        # Done to try and stop blobfuse from throwing the file out of its cache.
        self._cached_reader = reader

        # For testing, uncomment code below and we will be more aggressive
        # and keep the "raw" table in memory
        self._cached_full_table = None
        # self._cached_full_table = reader.read_all()

        LOGGER.debug(
            f"init took: {timer.elapsed_s():.2f}s, "
            f"(open={et_open_ms}ms, create_reader={et_create_reader_ms}ms, "
            f"find_vec_names={et_find_vec_names_ms}ms, find_real={et_find_real_ms}ms), "
            f"#vector_names={len(self._vector_names)}, "
            f"#realization={len(self._realizations)}"
        )

        if not self._realizations:
            raise ValueError("Init from backing store failed NO realizations")
        if not self._vector_names:
            raise ValueError("Init from backing store failed NO vector_names")
Example No. 23
 def _load(self):
     source = pa.memory_map(self.path)
     try:
         # first, try to open it as a stream
         reader = pa.ipc.open_stream(source)
     except pa.lib.ArrowInvalid:
         # if not, we open as file
         reader = pa.ipc.open_file(source)
         # for some reason this reader is not iterable
         batches = [
             reader.get_batch(i) for i in range(reader.num_record_batches)
         ]
     else:
         # if a stream, we're good
         batches = reader  # this reader is iterable
     table = pa.Table.from_batches(batches)
     self._load_table(table)
Example No. 24
 def _init_table_from_path(self):
     if '.jsonl' in self.path.suffixes:
         # Can read ".jsonl" or ".jsonl.gz"
         import pyarrow.json as paj
         self.table = paj.read_json(
             str(self.path),
             read_options=paj.ReadOptions(
                 # magic constants:
                 # 894 - estimated average number of bytes per JSON item manifest
                 # 10000 - how many items we want to have in a chunk (Arrow's "batch")
                 block_size=894 * 10000))
     elif '.arrow' == self.path.suffixes[-1]:
         # Can read ".arrow"
         import pyarrow as pa
         mmap = pa.memory_map(str(self.path))
         stream = pa.ipc.open_file(mmap)
         self.table = stream.read_all()
     else:
         raise ValueError(f"Unknown LazyDict file format : '{self.path}'")
Example No. 25
def open_for_arrow(path, mode='rb', fs_options={}, mmap=False):
    '''When the file will be passed to Arrow, we want a file object that Arrow likes.

    This might avoid performance issues with the GIL, or call overhead.
    '''
    import pyarrow as pa
    if is_file_object(path):
        return path
    path = stringyfy(path)
    scheme, _ = split_scheme(path)
    if scheme is None:
        if fs_options:
            raise ValueError(f'fs_options not supported for local files. You passed: {repr(fs_options)}.')
        if mmap:
            return pa.memory_map(path, mode)
        else:
            return pa.OSFile(path, mode)
    else:
        return open(path, mode=mode, fs_options=fs_options).file
Example No. 26
def test_native_file_raises_ValueError_after_close(tmpdir):
    path = os.path.join(str(tmpdir), guid())
    with open(path, 'wb') as f:
        f.write(b'foooo')

    with pa.OSFile(path, mode='rb') as os_file:
        assert not os_file.closed
    assert os_file.closed

    with pa.memory_map(path, mode='rb') as mmap_file:
        assert not mmap_file.closed
    assert mmap_file.closed

    files = [os_file, mmap_file]

    methods = [('tell', ()), ('seek', (0, )), ('size', ()), ('flush', ()),
               ('readable', ()), ('writable', ()), ('seekable', ())]

    for f in files:
        for method, args in methods:
            with pytest.raises(ValueError):
                getattr(f, method)(*args)
Example No. 27
def test_datetime_serialization(large_memory_map):
    data = [
        #  Principia Mathematica published
        datetime.datetime(year=1687, month=7, day=5),

        # Some random date
        datetime.datetime(year=1911,
                          month=6,
                          day=3,
                          hour=4,
                          minute=55,
                          second=44),
        # End of WWI
        datetime.datetime(year=1918, month=11, day=11),

        # Beginning of UNIX time
        datetime.datetime(year=1970, month=1, day=1),

        # The Berlin wall falls
        datetime.datetime(year=1989, month=11, day=9),

        # Another random date
        datetime.datetime(year=2011,
                          month=6,
                          day=3,
                          hour=4,
                          minute=0,
                          second=3),
        # Another random date
        datetime.datetime(year=1970,
                          month=1,
                          day=3,
                          hour=4,
                          minute=0,
                          second=0)
    ]
    with pa.memory_map(large_memory_map, mode="r+") as mmap:
        for d in data:
            serialization_roundtrip(d, mmap)
Example No. 28
def open(path):
    """Opens an arrow spatial file.
    Parameters:
        path (string): The file's full path.
    Returns:
        (object) A GeoDataFrame object.
    """
    source = pa.memory_map(path)
    try:
        # first, try to open it as a stream
        reader = pa.ipc.open_stream(source)
    except pa.lib.ArrowInvalid:
        # if not, we open as file
        reader = pa.ipc.open_file(source)
        # for some reason this reader is not iterable
        batches = [
            reader.get_batch(i) for i in range(reader.num_record_batches)
        ]
    else:
        # if a stream, we're good
        batches = reader  # this reader is iterable
    table = pa.Table.from_batches(batches)
    if table.schema.metadata is not None and b'geovaex version' in table.schema.metadata.keys():
        metadata = table.schema.metadata
        print(f"Opened file {os.path.basename(path)}, "
              f"created by geovaex v{metadata[b'geovaex version'].decode()} "
              f"using {metadata[b'driver'].decode()} driver.")
        df = from_arrow_spatial_table(table)
        has_geometry = df.geometry.get_raw_geometry().null_count != len(
            df.geometry)
        if has_geometry:
            return df
        table = table.drop(['geometry'])

    warnings.warn('Not a spatial arrow file. Returning a Vaex DataFrame.')
    df = from_arrow_table(table).copy()
    return df
Example No. 29
 def apply(
         self,
         function,
         axis=None,
         in_place: bool = True
 ) -> Optional[Union[pa.RecordBatchFileReader, "SCData"]]:
     scdata = self.to_pandas().apply(function,
                                     axis=axis,
                                     result_type='broadcast')
     if in_place:
         scdata = self.ensure_scdata_format(pa.Table.from_pandas(scdata),
                                            self.obs, self.var, self.uns,
                                            self.obsm, self.varm)
         self.filetype = "arrow"
         with open(self.arrow_file, 'bw') as f:
             writer = pa.RecordBatchFileWriter(f, scdata.schema)
             writer.write(scdata)
             writer.close()
         self.memory_mapped_dataset = pa.memory_map(self.arrow_file, 'r')
         return self.to_memory()
     else:
         return SCData(scdata, self.obs, self.var, self.uns, self.obsm,
                       self.varm)
Example No. 30
def memory_and_io_interfaces_example():
	# pyarrow.Buffer.

	data = b"abcdefghijklmnopqrstuvwxyz"

	# Creating a Buffer in this way does not allocate any memory; it is a zero-copy view on the memory exported from the data bytes object.
	buf = pa.py_buffer(data)
	# External memory, in the form of a raw pointer and size, can also be referenced using the foreign_buffer() function.
	#buf = pa.foreign_buffer(address, size, base)

	print("buf = {}.".format(buf))
	print("buf.size = {}.".format(buf.size))

	print("memoryview(buf) = {}.".format(memoryview(buf)))
	print("buf.to_pybytes() = {}.".format(buf.to_pybytes()))

	#--------------------
	# Memory pools.

	print("pa.total_allocated_bytes() = {}.".format(pa.total_allocated_bytes()))

	buf = pa.allocate_buffer(1024, resizable=True)
	print("pa.total_allocated_bytes() = {}.".format(pa.total_allocated_bytes()))

	buf.resize(2048)
	print("pa.total_allocated_bytes() = {}.".format(pa.total_allocated_bytes()))

	buf = None
	print("pa.total_allocated_bytes() = {}.".format(pa.total_allocated_bytes()))

	print("pa.default_memory_pool().backend_name = {}.".format(pa.default_memory_pool().backend_name))

	#--------------------
	# Input and output streams.

	buf = memoryview(b"some data")
	stream = pa.input_stream(buf)

	print("stream.read(4) = {}.".format(stream.read(4)))

	import gzip
	with gzip.open("./example.gz", "wb") as f:
		f.write(b"some data\n" * 3)

	stream = pa.input_stream("./example.gz")
	print("stream.read() = {}.".format(stream.read()))

	with pa.output_stream("./example1.dat") as stream:
		stream.write(b"some data")

	f = open("./example1.dat", "rb")
	print("f.read() = {}.".format(f.read()))

	#--------------------
	# On-disk and memory mapped files.

	# Using regular Python.
	with open("./example2.dat", "wb") as f:
		f.write(b"some example data")

	file_obj = pa.OSFile("./example2.dat")
	print("file_obj.read(4) = {}.".format(file_obj.read(4)))

	# Using pyarrow's OSFile class.
	with pa.OSFile("./example3.dat", "wb") as f:
		f.write(b"some example data")

	mmap = pa.memory_map("./example3.dat")
	print("mmap.read(4) = {}.".format(mmap.read(4)))

	mmap.seek(0)
	buf = mmap.read_buffer(4)
	print("buf = {}.".format(buf))
	print("buf.to_pybytes() = {}.".format(buf.to_pybytes()))

	#--------------------
	# In-memory reading and writing.

	writer = pa.BufferOutputStream()
	writer.write(b"hello, friends")
	buf = writer.getvalue()
	print("buf = {}.".format(buf))
	print("buf.size = {}.".format(buf.size))

	reader = pa.BufferReader(buf)
	reader.seek(7)
	print("reader.read(7) = {}.".format(reader.read(7)))
Example No. 31
 def update_scdata(self, scdata):  # TODO: Only accepts one file for now
     ds.write_dataset(scdata, self.data_interim, format="arrow")
     dataset = ds.dataset(self.data_interim, format="arrow")
     self.filetype = "arrow"
     self.memory_mapped_dataset = pa.memory_map(dataset.files[0], 'r')
Example No. 32
def test_primitive_serialization(large_memory_map):
    with pa.memory_map(large_memory_map, mode="r+") as mmap:
        for obj in PRIMITIVE_OBJECTS:
            serialization_roundtrip(obj, mmap)
Example No. 33
def test_native_file_open_error():
    with assert_file_not_found():
        pa.OSFile('non_existent_file', 'rb')
    with assert_file_not_found():
        pa.memory_map('non_existent_file', 'rb')
Example No. 34
def test_memory_zero_length(tmpdir):
    path = os.path.join(str(tmpdir), guid())
    f = open(path, 'wb')
    f.close()
    with pa.memory_map(path, mode='r+b') as memory_map:
        assert memory_map.size() == 0
Example No. 35
"""
Fri Jul 14 11:36:58 PDT 2017

Goal:
Share a memory mapped file between two Python processes

Following
https://arrow.apache.org/docs/python/memory.html#on-disk-and-memory-mapped-files
"""

import pyarrow as pa


# For a more realistic use case this would be a Parquet file
fname = '../example.dat'
with open(fname, 'wb') as f:
    f.write(b'some example data')


mmap = pa.memory_map(fname, 'r')

mmap.read()
mmap.seek(0)

# Modify the actual file
with open(fname, 'wb') as f:
    f.write(b'SOME EXAMPLE DATA')

# Now we see the modified contents, even from a different process.
mmap.read()
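The script above plays the writer's role only. A sketch of the second, reader-side process the docstring has in mind (assuming it runs while ../example.dat exists) could be as simple as:

# reader.py -- hypothetical companion process for the snippet above
import pyarrow as pa

fname = '../example.dat'

# Map the same file; nothing is copied into this process's heap.
mmap = pa.memory_map(fname, 'r')
print(mmap.read())   # shows whatever the writer last put in the file

# After the writer rewrites the file, re-reading shows the new contents.
mmap.seek(0)
print(mmap.read())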
Example No. 36
def readBuf(name):
	mmap = pa.memory_map(name)
	buf = mmap.read_buffer()
	return buf
Example No. 37
 def from_file(cls, filename: str):
     """ Instantiate a Dataset backed by an Arrow table at filename """
     mmap = pa.memory_map(filename)
     f = pa.ipc.open_stream(mmap)
     pa_table = f.read_all()
     return cls(arrow_table=pa_table, data_files=[{"filename": filename}])
Example No. 38
 def _check_output(self, output):
     mmap = pa.BufferReader(output) if isinstance(output, pa.Buffer) else pa.memory_map(output)
     f = pa.ipc.open_stream(mmap)
     pa_table: pa.Table = f.read_all()
     self.assertDictEqual(pa_table.to_pydict(), {"col_1": ["foo", "bar"], "col_2": [1, 2]})
     del pa_table
Example No. 39
def test_custom_serialization(large_memory_map):
    with pa.memory_map(large_memory_map, mode="r+") as mmap:
        for obj in CUSTOM_OBJECTS:
            serialization_roundtrip(obj, mmap)
Example No. 40
def test_default_dict_serialization(large_memory_map):
    pytest.importorskip("cloudpickle")
    with pa.memory_map(large_memory_map, mode="r+") as mmap:
        obj = defaultdict(lambda: 0, [("hello", 1), ("world", 2)])
        serialization_roundtrip(obj, mmap)
Example No. 41
def test_numpy_serialization(large_memory_map):
    with pa.memory_map(large_memory_map, mode="r+") as mmap:
        for t in ["bool", "int8", "uint8", "int16", "uint16", "int32",
                  "uint32", "float16", "float32", "float64"]:
            obj = np.random.randint(0, 10, size=(100, 100)).astype(t)
            serialization_roundtrip(obj, mmap)
Example No. 42
import pyarrow as pa

fname = 'example.dat'

mmap = pa.memory_map(fname)

mmap.read()

mmap.seek(0)

mmap.read()
Example No. 43
def make_arrow(root, dataset_root):
    with open(f"{root}/v2_OpenEnded_mscoco_train2014_questions.json",
              "r") as fp:
        questions_train2014 = json.load(fp)["questions"]
    with open(f"{root}/v2_OpenEnded_mscoco_val2014_questions.json", "r") as fp:
        questions_val2014 = json.load(fp)["questions"]
    with open(f"{root}/v2_OpenEnded_mscoco_test2015_questions.json",
              "r") as fp:
        questions_test2015 = json.load(fp)["questions"]
    with open(f"{root}/v2_OpenEnded_mscoco_test-dev2015_questions.json",
              "r") as fp:
        questions_test_dev2015 = json.load(fp)["questions"]

    with open(f"{root}/v2_mscoco_train2014_annotations.json", "r") as fp:
        annotations_train2014 = json.load(fp)["annotations"]
    with open(f"{root}/v2_mscoco_val2014_annotations.json", "r") as fp:
        annotations_val2014 = json.load(fp)["annotations"]

    annotations = dict()

    # One image can have multiple questions; aggregate them here
    for split, questions in zip(
        ["train", "val", "test", "test-dev"],
        [
            questions_train2014,
            questions_val2014,
            questions_test2015,
            questions_test_dev2015,
        ],
    ):
        _annot = defaultdict(dict)
        for q in tqdm(questions):
            _annot[q["image_id"]][q["question_id"]] = [q["question"]]

        annotations[split] = _annot

    all_major_answers = list()
    # Collect all the answers
    for split, annots in zip(
        ["train", "val"],
        [annotations_train2014, annotations_val2014],
    ):
        _annot = annotations[split]
        for q in tqdm(annots):
            all_major_answers.append(q["multiple_choice_answer"])

    all_major_answers = [
        normalize_word(word) for word in tqdm(all_major_answers)
    ]
    counter = {k: v for k, v in Counter(all_major_answers).items() if v >= 9}
    ans2label = {k: i for i, k in enumerate(counter.keys())}
    label2ans = list(counter.keys())

    for split, annots in zip(
        ["train", "val"],
        [annotations_train2014, annotations_val2014],
    ):
        _annot = annotations[split]
        for q in tqdm(annots):
            answers = q["answers"]
            answer_count = {}
            for answer in answers:
                answer_ = answer["answer"]
                answer_count[answer_] = answer_count.get(answer_, 0) + 1

            labels = []
            scores = []
            for answer in answer_count:
                if answer not in ans2label:
                    continue
                labels.append(ans2label[answer])
                score = get_score(answer_count[answer])
                scores.append(score)

            _annot[q["image_id"]][q["question_id"]].append({
                "labels": labels,
                "scores": scores,
            })

    # _annot[q["image_id"]][q["question_id"]] = ['What is this photo taken looking through?', {'labels': [0], 'scores': [1.0]}]
    # Drop questions whose label list is empty
    for split in ["train", "val"]:
        filtered_annot = dict()
        for ik, iv in annotations[split].items():
            # ik image_id
            # iv : {458752000: ['What is this photo taken looking through?', {'labels': [0], 'scores': [1.0]}],
            # 458752001: ['What position is this man playing?', {'labels': [1, 67],
            # 'scores': [1.0, 0.3]}], 458752002: ['What color is the players shirt?',
            # {'labels': [2], 'scores': [1.0]}], 458752003: ['Is this man a professional baseball player?',
            # {'labels': [3, 9], 'scores': [1.0, 0.3]}]}
            new_q = dict()
            for qk, qv in iv.items():
                # qv : ['What is this photo taken looking through?', {'labels': [0], 'scores': [1.0]}]
                if len(qv[1]["labels"]) != 0:
                    new_q[qk] = qv
            if len(new_q) != 0:
                filtered_annot[ik] = new_q
        annotations[split] = filtered_annot

    for split in [
            "train",
            "val",
            "test",
            "test-dev",
    ]:
        annot = annotations[split]
        split_name = {
            "train": "train2014",
            "val": "val2014",
            "test": "test2015",
            "test-dev": "test2015",
        }[split]
        paths = list(glob(f"{root}/{split_name}/*.jpg"))
        random.shuffle(paths)
        annot_paths = [
            path for path in paths
            if int(path.split("/")[-1].split("_")[-1][:-4]) in annot
        ]

        if len(paths) == len(annot_paths):
            print("all images have caption annotations")
        else:
            print("not all images have caption annotations")
        print(
            len(paths),
            len(annot_paths),
            len(annot),
        )

        bs = [
            path2rest(path, split, annotations, label2ans)
            for path in tqdm(annot_paths)
        ]

        dataframe = pd.DataFrame(
            bs,
            columns=[
                "image",
                "questions",
                "answers",
                "answer_labels",
                "answer_scores",
                "image_id",
                "question_id",
                "split",
            ],
        )

        table = pa.Table.from_pandas(dataframe)

        os.makedirs(dataset_root, exist_ok=True)
        with pa.OSFile(f"{dataset_root}/vqav2_{split}.arrow", "wb") as sink:
            with pa.RecordBatchFileWriter(sink, table.schema) as writer:
                writer.write_table(table)

    table = pa.ipc.RecordBatchFileReader(
        pa.memory_map(f"{dataset_root}/vqav2_val.arrow", "r")).read_all()

    pdtable = table.to_pandas()

    df1 = pdtable[:-1000]
    df2 = pdtable[-1000:]

    df1 = pa.Table.from_pandas(df1)
    df2 = pa.Table.from_pandas(df2)

    with pa.OSFile(f"{dataset_root}/vqav2_trainable_val.arrow", "wb") as sink:
        with pa.RecordBatchFileWriter(sink, df1.schema) as writer:
            writer.write_table(df1)

    with pa.OSFile(f"{dataset_root}/vqav2_rest_val.arrow", "wb") as sink:
        with pa.RecordBatchFileWriter(sink, df2.schema) as writer:
            writer.write_table(df2)
Example No. 44
tables_files = {
    "mediacloud": [
        pjoin(usenews_arrows19, "mediacloud2019.arrow"),
        pjoin(usenews_arrows20, "mediacloud2020.arrow"),
    ],
    "crowdtangle": [
        pjoin(usenews_arrows19, "crowdtangle2019.arrow"),
        pjoin(usenews_arrows20, "crowdtangle2020.arrow"),
    ]
}
tables = {}

for name, files in tables_files.items():
    sub_tables = []
    for file in files:
        source = pyarrow.memory_map(file, 'r')
        sub_tables.append(pyarrow.ipc.RecordBatchFileReader(source).read_all())
    tables[name] = pyarrow.concat_tables(sub_tables)

tables["crowdtangle"]["link"].map(urlnorm)
tables["mediacloud"]["guid"].map(urlnorm)

joined = pandas.merge(
    tables["crowdtangle"].groupby(["link"]).sum(),
    tables["mediacloud"],
    left_on="link",
    right_on="guid",
)

with pyarrow.OSFile(sys.argv[3], 'wb') as sink:
    with pyarrow.RecordBatchFileWriter(sink, joined.schema) as writer:
Example No. 45
def test_complex_serialization(large_memory_map):
    with pa.memory_map(large_memory_map, mode="r+") as mmap:
        for obj in COMPLEX_OBJECTS:
            serialization_roundtrip(obj, mmap)
Example No. 47
# a) using regular Python file objects:
with open('example2.dat', 'wb') as fout:
    fout.write(b'some example data')
# b) using pyarrow's OSFile class:
with pa.OSFile('example3.dat', 'wb') as fout:
    fout.write(b'some example data')
# For reading files, you can use OSFile or MemoryMappedFile:
#   for OSFile, it allocates new memory on each read, like Python file objects
#   for memory maps, the library constructs a buffer referencing the mapped memory without any memory allocation or copying
# a) OSFile([filepath]):
file_obj = pa.OSFile('example2.dat')
print(file_obj.read(4))  # b'some': this allocates new memory when read
# note:
#   using OSFile for read() & write() can be more efficient than going through regular Python file objects

# b) memory_map([filepath]): this opens the memory map at file path
mmap = pa.memory_map('example3.dat')
print(mmap.read(4))  # b'some': this DOES NOT allocate new memory, as it references the mapped memory when read

# read() vs. read_buffer()
# read(): this implements the standard Python file read API
# read_buffer(): this reads into an Arrow Buffer object
print(mmap.seek(0))  # 0
buf = mmap.read_buffer(4)  # <pyarrow.lib.Buffer object at 0x10cfc3960>: this does not allocate any memory
print(buf.to_pybytes())  # b'some': this allocates new memory

# 4) In-Memory Reading and Writing
#   for serialization and deserialization of in-memory data (Arrow's zero-copy buffers can be more efficient than pickle for large tabular data)
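The comment above announces the in-memory case but the snippet stops there; a minimal sketch of that step, using pa.BufferOutputStream and pa.BufferReader as in the PyArrow docs, could look like this:

# 4) In-Memory Reading and Writing (sketch)
writer = pa.BufferOutputStream()
writer.write(b'hello, friends')
buf = writer.getvalue()        # an Arrow Buffer holding the written bytes
print(buf.size)                # 14

reader = pa.BufferReader(buf)  # zero-copy reads against that buffer
reader.seek(7)
print(reader.read(7))          # b'friends'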
Example No. 48
def _memory_mapped_arrow_table_from_file(filename: str) -> pa.Table:
    memory_mapped_stream = pa.memory_map(filename)
    opened_stream = pa.ipc.open_stream(memory_mapped_stream)
    pa_table = opened_stream.read_all()
    return pa_table
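A small usage sketch for the helper above (the file name and table contents are made up): write a table as an Arrow IPC stream with pa.ipc.new_stream, then map it back.

import pyarrow as pa

table = pa.table({"col_1": ["foo", "bar"], "col_2": [1, 2]})

# Write the table as an Arrow IPC stream file (hypothetical path).
with pa.OSFile("example_stream.arrow", "wb") as sink:
    with pa.ipc.new_stream(sink, table.schema) as writer:
        writer.write_table(table)

# Re-open it via the memory-mapped helper; buffers reference the mapping.
assert _memory_mapped_arrow_table_from_file("example_stream.arrow").equals(table)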
Example No. 49
 def _load(self):
     source = pa.memory_map(self.path)
     reader = pa.ipc.open_stream(source)
     table = pa.Table.from_batches([b for b in reader])
     self._load_table(table)