def arrayfile(data_file, shape, descr, fortran=False):
    '''
    Return an array that is memory-mapped to an NPY (v1.0) file.

    The NPY header is written to ``data_file`` at its current position and
    the returned array is mapped onto the bytes that follow it.

    Arguments
    ---------
    data_file :
        a file-like object opened with a write mode compatible with NumPy's
        memory-mapped array types (see `numpy.memmap`).  It must expose
        ``mode`` and ``tell()``.  It is the responsibility of the caller to
        close the file.
    shape : tuple
        shape of the ndarray.
    descr : str
        a typecode str (see `array` of `numpy.dtype`).  Will be converted to
        a NumPy dtype.
    fortran : bool
        optional; if True, the array uses Fortran data order.  Default: use
        C order.

    Returns
    -------
    numpy.memmap
        array of the requested shape/dtype backed by ``data_file``.
    '''
    from numpy.lib import format as npy_format

    header = {'descr': descr, 'fortran_order': fortran, 'shape': shape}
    npy_format.write_array_header_1_0(data_file, header)

    # The data starts wherever the header ended.  Asking the file directly
    # avoids writing the header a second time into a scratch buffer and does
    # not depend on the binary header containing exactly one newline.
    offset = data_file.tell()

    # File objects report binary modes such as 'rb+' or 'w+b', whereas
    # np.memmap only understands the spellings without the 'b'.
    mode = data_file.mode.replace('b', '')

    # The in-memory layout must agree with the 'fortran_order' flag that was
    # just recorded in the header, otherwise readers see scrambled data.
    order = 'F' if fortran else 'C'

    return np.memmap(data_file, dtype=np.dtype(descr), mode=mode,
                     shape=shape, offset=offset, order=order)
def test_bad_header():
    """Check that ``read_array_header_1_0`` raises ValueError on bad input."""
    read_header = format.read_array_header_1_0

    # Streams holding fewer than two bytes must be rejected.
    assert_raises(ValueError, read_header, BytesIO())
    assert_raises(ValueError, read_header, BytesIO(asbytes('1')))

    # Two length bytes with nothing after them: the header is shorter than
    # the size it announces.
    assert_raises(ValueError, read_header, BytesIO(asbytes('\x01\x00')))

    # Header dicts that do not carry exactly the required keys must be
    # rejected: the first lacks a key, the second has one too many.
    missing_key = {"shape": (1, 2), "descr": "x"}
    extra_key = {"shape": (1, 2), "fortran_order": False, "descr": "x",
                 "extrakey": -1}
    for header in (missing_key, extra_key):
        stream = BytesIO()
        format.write_array_header_1_0(stream, header)
        # NOTE(review): the stream is not rewound after the write, so the
        # reader starts at the end of the buffer -- confirm this is intended.
        assert_raises(ValueError, read_header, stream)
def test_large_header():
    """A small header is written fine; an oversized one raises ValueError."""
    small_header = {'a': 1, 'b': 2}
    format.write_array_header_1_0(BytesIO(), small_header)

    # Same keys plus a 256*256-character value.
    oversized_header = dict(small_header, c='x' * (256 * 256))
    assert_raises(ValueError, format.write_array_header_1_0,
                  BytesIO(), oversized_header)
def test_large_header():
    """A small header is written fine; an oversized one raises ValueError.

    The header writer is given a binary stream, matching the other header
    tests in this file that use ``BytesIO``; a text stream cannot accept the
    encoded header on Python 3.
    """
    # Local import so this test does not depend on a module-level name.
    from io import BytesIO

    s = BytesIO()
    d = {"a": 1, "b": 2}
    format.write_array_header_1_0(s, d)

    # A 256*256-character value makes the header too large to be written.
    s = BytesIO()
    d = {"a": 1, "b": 2, "c": "x" * 256 * 256}
    assert_raises(ValueError, format.write_array_header_1_0, s, d)
def test_large_header():
    """Writing a two-key header succeeds; adding a huge value must fail."""
    write_header = format.write_array_header_1_0

    header = {"a": 1, "b": 2}
    write_header(BytesIO(), header)

    # Grow the same header with a 256*256-character entry and retry on a
    # fresh stream; this time the writer has to raise.
    header["c"] = "x" * 256 * 256
    assert_raises(ValueError, write_header, BytesIO(), header)
def _npy_size(ary): assert not ary.dtype.hasobject magic_len = npy.MAGIC_LEN # TODO: could calculate this directly with closing(StringIO()) as sio: npy.write_array_header_1_0(sio, npy.header_data_from_array_1_0(ary)) header_len = sio.tell() data_len = ary.dtype.itemsize * ary.size return magic_len + header_len + data_len
def write_localarray(fp, arr, version=(1, 0)):
    """
    Serialize a LocalArray, preceded by a header, into a .dap file.

    The magic string for ``version`` is written first.  The ``__version__``
    and ``dim_data`` entries of the array's ``__distarray__()`` export are
    then written as a header, and finally ``numpy.save`` writes the export's
    ``buffer`` entry.

    Parameters
    ----------
    fp : file_like object
        An open, writable file object, or similar object with a
        ``.write()`` method.
    arr : LocalArray
        The array to write to disk.
    version : (int, int), optional
        The version number of the file format.  Default: (1, 0)

    Raises
    ------
    ValueError
        If ``version`` is anything other than (1, 0).
    Various other errors
        If the underlying numpy array contains Python objects as part of its
        dtype, the process of pickling them may raise various errors if the
        objects are not picklable.
    """
    supported_version = (1, 0)
    if version != supported_version:
        raise ValueError(
            "Only version (1, 0) is supported, not %s." % (version,))

    fp.write(magic(*version))

    exported = arr.__distarray__()
    # Only these two protocol entries go into the header; the data buffer is
    # stored separately below.
    header = {key: exported[key] for key in ('__version__', 'dim_data')}
    write_array_header_1_0(fp, header)

    np.save(fp, exported['buffer'])
def save(file_name, array, axis, full_shape=None, mpi_comm=MPI.COMM_WORLD):
    """
    Save a numpy array from parallel jobs in the MPI communicator.

    The root process writes the NPY header; every process then writes its
    own slice of the array after the header through MPI file I/O.

    Parameters
    ----------
    file_name : str
        The numpy array file to write.
    array : numpy.ndarray
        The distributed array (the local part held by this process).
    axis : int
        The axis on which the array is distributed.
    full_shape : tuple(int), optional
        The size of the full array, by default None, in which case it is
        obtained with ``gather_full_shape``.
    mpi_comm : mpi4py.MPI.Comm, optional
        The MPI communicator used to distribute, by default MPI.COMM_WORLD.
    """
    if full_shape is None:
        full_shape = gather_full_shape(array, axis, mpi_comm)

    axis = utils.positive_index(axis, len(full_shape))

    # Only the root process writes the header; everyone else learns where the
    # data section starts through the broadcast below.
    header_offset = None
    if is_root_process(mpi_comm):
        header_dict = {
            'shape': full_shape,
            'fortran_order': False,
            'descr': npformat.dtype_to_descr(array.dtype),
        }
        with open(file_name, 'wb') as fp:
            # Fall back to the 2.0 header when the 1.0 writer rejects it.
            try:
                npformat.write_array_header_1_0(fp, header_dict)
            except ValueError:
                npformat.write_array_header_2_0(fp, header_dict)
            header_offset = fp.tell()
    header_offset = mpi_comm.bcast(header_offset, root=0)

    i_start, bin_size = distribute_mpi(full_shape[axis], mpi_comm)

    slice_type = create_slice_view(axis, bin_size, shape=full_shape,
                                   dtype=array.dtype)
    slice_type.Commit()

    single_slice_extent = slice_type.extent
    if bin_size != 0:
        # Floor division keeps the byte displacement an exact integer; true
        # division would turn it into a float on Python 3.
        single_slice_extent //= bin_size

    displacement = header_offset + i_start * single_slice_extent
    base_type = to_mpi_datatype(array.dtype)

    fh = MPI.File.Open(mpi_comm, file_name,
                       MPI.MODE_WRONLY | MPI.MODE_APPEND)
    fh.Set_view(displacement, filetype=slice_type)
    fh.Write_all([array, array.size, base_type])
    fh.Close()
    slice_type.Free()
def open_memmap(filename, mode='r+', dtype=None, shape=None,
                fortran_order=False, version=(1, 0), offset=0):
    """
    Open a .npy file as a memory-mapped array, with offset argument.

    This may be used to read an existing file or create a new one.

    :param str filename: The name of the file on disk. This may not be a
        file-like object.
    :param str mode: The mode to open the file with. In addition to the
        standard file modes, 'c' is also accepted to mean "copy on write".
        See `numpy.memmap` for the available mode strings.
    :param dtype dtype: The data type of the array if we are creating a new
        file in "write" mode.
    :param tuple shape: The shape of the array if we are creating a new file
        in "write" mode. When opening an existing file, the length of the
        slice along the first dimension only (an int or a one-element
        sequence); the remaining dimensions are taken from the file.
    :param bool fortran_order: Whether the array should be Fortran-contiguous
        (True) or C-contiguous (False) if we are creating a new file in
        "write" mode.
    :param tuple version: If the mode is a "write" mode, then this is the
        version (major, minor) of the file format used to create the file.
    :param int offset: Number of elements to skip along the first dimension.
        Must be 0 when creating a file.
    :return numpy.memmap: The memory-mapped array.

    Raises:

    * :exc:`ValueError` if the data or the mode is invalid
    * :exc:`AssertionError` if ``offset`` is given when creating a file, or
      if ``shape`` names more than the first dimension of an existing file
    * :exc:`IOError` if the file is not found or cannot be opened correctly.

    .. seealso:: :func:`numpy.memmap`
    """
    from io import BytesIO

    # ``basestring`` only exists on Python 2; fall back to ``str`` elsewhere.
    try:
        string_types = basestring
    except NameError:
        string_types = str

    if not isinstance(filename, string_types):
        raise ValueError("Filename must be a string. Memmap cannot use"
                         " existing file handles.")

    if 'w' in mode:
        assert offset == 0, "Cannot specify offset when creating memmap"
        # We are creating the file, not reading it.
        # Check if we ought to create the file.
        if version != (1, 0):
            msg = "only support version (1,0) of file format, not %r"
            raise ValueError(msg % (version,))
        # Ensure that the given dtype is an authentic dtype object rather
        # than just something that can be interpreted as a dtype object.
        dtype = np.dtype(dtype)
        if dtype.hasobject:
            msg = "Array can't be memory-mapped: Python objects in dtype."
            raise ValueError(msg)
        d = dict(
            descr=dtype_to_descr(dtype),
            fortran_order=fortran_order,
            shape=shape,
        )
        # Build the complete header in memory before touching the file, so a
        # header that cannot be written does not leave a truncated file.
        # The header writer may or may not emit the magic string itself;
        # prepend it only when it is missing so it is written exactly once.
        prefix = magic(*version)
        staged = BytesIO()
        write_array_header_1_0(staged, d)
        header_bytes = staged.getvalue()
        if not header_bytes.startswith(prefix):
            header_bytes = prefix + header_bytes
        # If we got here, then it should be safe to create the file.
        with open(filename, mode + 'b') as fp:
            fp.write(header_bytes)
            # The array data begins right after the header.
            offset_bytes = fp.tell()
    else:
        # Read the header of the file first.
        with open(filename, 'rb') as fp:
            version = read_magic(fp)
            if version != (1, 0):
                msg = "only support version (1,0) of file format, not %r"
                raise ValueError(msg % (version,))
            fullshape, fortran_order, dtype = read_array_header_1_0(fp)
            if shape:
                requested = np.atleast_1d(shape)
                msg = "Specify shape along first dimension only"
                assert requested.ndim == 1 and requested.size == 1, msg
                # Keep a plain int so the final shape is a tuple of ints.
                length = int(requested[0])
            else:
                length = fullshape[0] - offset
            shape = (length,) + fullshape[1:]
            if dtype.hasobject:
                msg = "Array can't be memory-mapped: Python objects in dtype."
                raise ValueError(msg)
            # Skip ``offset`` entries along the first dimension, each made of
            # prod(fullshape[1:]) items.
            # NOTE(review): this treats leading-dimension entries as
            # contiguous, which looks C-order specific -- confirm the
            # intended behavior for fortran_order files.
            offset_items = offset * np.prod(fullshape[1:], dtype=int)
            offset_bytes = int(fp.tell() + offset_items * dtype.itemsize)

    order = 'F' if fortran_order else 'C'

    # We need to change a write-only mode to a read-write mode since we've
    # already written data to the file.
    if mode == 'w+':
        mode = 'r+'

    marray = np.memmap(filename, dtype=dtype, shape=shape, order=order,
                       mode=mode, offset=offset_bytes)
    return marray