def write_array(fp, array, version=None):
    """
    Write an array to an NPY file, including a header.

    If the array is neither C-contiguous nor Fortran-contiguous AND the
    file_like object is not a real file object, this function will have to
    copy data in memory.

    Parameters
    ----------
    fp : file_like object
        An open, writable file object, or similar object with a
        ``.write()`` method.
    array : ndarray
        The array to write to disk.
    version : (int, int) or None, optional
        The version number of the format. None means use the oldest
        supported version that is able to store the data. Default: None

    Raises
    ------
    ValueError
        If the array cannot be persisted.
    Various other errors
        If the array contains Python objects as part of its dtype, the
        process of pickling them may raise various errors if the objects
        are not picklable.

    """
    _check_version(version)
    used_ver = _write_array_header(fp, header_data_from_array_1_0(array),
                                   version)
    # this warning can be removed when 1.9 has aged enough
    if version != (2, 0) and used_ver == (2, 0):
        warnings.warn("Stored array in format 2.0. It can only be "
                      "read by NumPy >= 1.9", UserWarning)

    # Set buffer size to 16 MiB to hide the Python loop overhead.
    buffersize = max(16 * 1024 ** 2 // array.itemsize, 1)

    if array.dtype.hasobject:
        # We contain Python objects so we cannot write out the data
        # directly. Instead, we will pickle it out with version 2 of the
        # pickle protocol.
        pickle.dump(array, fp, protocol=2)
    elif array.flags.f_contiguous and not array.flags.c_contiguous:
        if isfileobj(fp):
            array.T.tofile(fp)
        else:
            for chunk in numpy.nditer(
                    array, flags=['external_loop', 'buffered', 'zerosize_ok'],
                    buffersize=buffersize, order='F'):
                fp.write(chunk.tobytes('C'))
    else:
        if isfileobj(fp):
            array.tofile(fp)
        else:
            for chunk in numpy.nditer(
                    array, flags=['external_loop', 'buffered', 'zerosize_ok'],
                    buffersize=buffersize, order='C'):
                fp.write(chunk.tobytes('C'))

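# A minimal round-trip sketch for the write_array variant above, assuming the
# module-level imports and helpers of numpy/lib/format.py (numpy, pickle,
# warnings, _check_version, _write_array_header, header_data_from_array_1_0)
# are in scope, and using one of the read_array definitions that appear later
# in this collection. A BytesIO buffer is not a real file object, so this
# exercises the nditer/.write() branch rather than ndarray.tofile().
def _demo_write_array_roundtrip():
    import io

    import numpy

    a = numpy.arange(12.0).reshape(3, 4)
    buf = io.BytesIO()
    write_array(buf, a)                 # header, then buffered raw chunks
    buf.seek(0)
    assert numpy.array_equal(read_array(buf), a)
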
def write_array(fp, array, version=(1, 0)):
    """
    Write an array to an NPY file, including a header.

    If the array is neither C-contiguous nor Fortran-contiguous AND the
    file_like object is not a real file object, this function will have to
    copy data in memory.

    Parameters
    ----------
    fp : file_like object
        An open, writable file object, or similar object with a
        ``.write()`` method.
    array : ndarray
        The array to write to disk.
    version : (int, int), optional
        The version number of the format. Default: (1, 0)

    Raises
    ------
    ValueError
        If the array cannot be persisted.
    Various other errors
        If the array contains Python objects as part of its dtype, the
        process of pickling them may raise various errors if the objects
        are not picklable.

    """
    if version != (1, 0):
        msg = "we only support format version (1,0), not %s"
        raise ValueError(msg % (version,))
    fp.write(magic(*version))
    write_array_header_1_0(fp, header_data_from_array_1_0(array))

    # Set buffer size to 16 MiB to hide the Python loop overhead.
    buffersize = max(16 * 1024 ** 2 // array.itemsize, 1)

    if array.dtype.hasobject:
        # We contain Python objects so we cannot write out the data
        # directly. Instead, we will pickle it out with version 2 of the
        # pickle protocol.
        pickle.dump(array, fp, protocol=2)
    elif array.flags.f_contiguous and not array.flags.c_contiguous:
        if isfileobj(fp):
            array.T.tofile(fp)
        else:
            for chunk in numpy.nditer(
                    array, flags=['external_loop', 'buffered', 'zerosize_ok'],
                    buffersize=buffersize, order='F'):
                fp.write(chunk.tostring('C'))
    else:
        if isfileobj(fp):
            array.tofile(fp)
        else:
            for chunk in numpy.nditer(
                    array, flags=['external_loop', 'buffered', 'zerosize_ok'],
                    buffersize=buffersize, order='C'):
                fp.write(chunk.tostring('C'))

def test_isfileobj():
    with tempdir(prefix="numpy_test_compat_") as folder:
        filename = join(folder, "a.bin")

        with open(filename, "wb") as f:
            assert_(isfileobj(f))

        with open(filename, "ab") as f:
            assert_(isfileobj(f))

        with open(filename, "rb") as f:
            assert_(isfileobj(f))

def test_isfileobj():
    with tempdir(prefix="numpy_test_compat_") as folder:
        filename = join(folder, 'a.bin')

        with open(filename, 'wb') as f:
            assert_(isfileobj(f))

        with open(filename, 'ab') as f:
            assert_(isfileobj(f))

        with open(filename, 'rb') as f:
            assert_(isfileobj(f))

def write_array(fp, array, version=(1, 0)):
    """
    Write an array to an NPY file, including a header.

    If the array is neither C-contiguous nor Fortran-contiguous AND the
    file_like object is not a real file object, this function will have to
    copy data in memory.

    Parameters
    ----------
    fp : file_like object
        An open, writable file object, or similar object with a
        ``.write()`` method.
    array : ndarray
        The array to write to disk.
    version : (int, int), optional
        The version number of the format. Default: (1, 0)

    Raises
    ------
    ValueError
        If the array cannot be persisted.
    Various other errors
        If the array contains Python objects as part of its dtype, the
        process of pickling them may raise various errors if the objects
        are not picklable.

    """
    if version != (1, 0):
        msg = "we only support format version (1,0), not %s"
        raise ValueError(msg % (version,))
    fp.write(magic(*version))
    write_array_header_1_0(fp, header_data_from_array_1_0(array))
    if array.dtype.hasobject:
        # We contain Python objects so we cannot write out the data
        # directly. Instead, we will pickle it out with version 2 of the
        # pickle protocol.
        pickle.dump(array, fp, protocol=2)
    elif array.flags.f_contiguous and not array.flags.c_contiguous:
        if isfileobj(fp):
            array.T.tofile(fp)
        else:
            fp.write(array.T.tostring('C'))
    else:
        if isfileobj(fp):
            array.tofile(fp)
        else:
            # XXX: We could probably chunk this using something like
            # arrayterator.
            fp.write(array.tostring('C'))

def write_array(fp, array, version=(1, 0)):
    """
    Write an array to an NPY file, including a header.

    If the array is neither C-contiguous nor Fortran-contiguous AND if the
    filelike object is not a real file object, then this function will have
    to copy data in memory.

    Parameters
    ----------
    fp : filelike object
        An open, writable file object or similar object with a `.write()`
        method.
    array : numpy.ndarray
        The array to write to disk.
    version : (int, int), optional
        The version number of the format.

    Raises
    ------
    ValueError
        If the array cannot be persisted.
    Various other errors
        If the array contains Python objects as part of its dtype, the
        process of pickling them may raise arbitrary errors if the objects
        are not picklable.

    """
    if version != (1, 0):
        msg = "we only support format version (1,0), not %s"
        raise ValueError(msg % (version,))
    fp.write(magic(*version))
    write_array_header_1_0(fp, header_data_from_array_1_0(array))
    if array.dtype.hasobject:
        # We contain Python objects so we cannot write out the data
        # directly. Instead, we will pickle it out with version 2 of the
        # pickle protocol.
        cPickle.dump(array, fp, protocol=2)
    elif array.flags.f_contiguous and not array.flags.c_contiguous:
        if isfileobj(fp):
            array.T.tofile(fp)
        else:
            fp.write(array.T.tostring('C'))
    else:
        if isfileobj(fp):
            array.tofile(fp)
        else:
            # XXX: We could probably chunk this using something like
            # arrayterator.
            fp.write(array.tostring('C'))

def read_array(fp):
    """
    Read an array from an NPY file.

    Parameters
    ----------
    fp : file_like object
        If this is not a real file object, then this may take extra memory
        and time.

    Returns
    -------
    array : ndarray
        The array from the data on disk.

    Raises
    ------
    ValueError
        If the data is invalid.

    """
    version = read_magic(fp)
    if version != (1, 0):
        msg = "only support version (1,0) of file format, not %r"
        raise ValueError(msg % (version,))
    shape, fortran_order, dtype = read_array_header_1_0(fp)
    if len(shape) == 0:
        count = 1
    else:
        count = numpy.multiply.reduce(shape)

    # Now read the actual data.
    if dtype.hasobject:
        # The array contained Python objects. We need to unpickle the data.
        array = pickle.load(fp)
    else:
        if isfileobj(fp):
            # We can use the fast fromfile() function.
            array = numpy.fromfile(fp, dtype=dtype, count=count)
        else:
            # This is not a real file. We have to read it the
            # memory-intensive way.
            # XXX: we can probably chunk this to avoid the memory hit.
            data = fp.read(int(count * dtype.itemsize))
            array = numpy.fromstring(data, dtype=dtype, count=count)

        if fortran_order:
            array.shape = shape[::-1]
            array = array.transpose()
        else:
            array.shape = shape

    return array

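# Sketch of how this reader lines up with numpy's public API: for a plain
# .npy file written by numpy.save, read_array should recover the same array.
# Assumes numpy is importable and that the helpers used above (read_magic,
# read_array_header_1_0) are in scope, as in numpy/lib/format.py.
def _demo_read_array_matches_save():
    import os
    import tempfile

    import numpy

    a = numpy.arange(6, dtype=numpy.int64).reshape(2, 3)
    path = os.path.join(tempfile.mkdtemp(), 'a.npy')
    numpy.save(path, a)
    with open(path, 'rb') as fp:
        b = read_array(fp)          # real file -> fast numpy.fromfile path
    assert numpy.array_equal(a, b)
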
def read_array(fp):
    """
    Read an array from an NPY file.

    Parameters
    ----------
    fp : filelike object
        If this is not a real file object, then this may take extra memory
        and time.

    Returns
    -------
    array : numpy.ndarray
        The array from the data on disk.

    Raises
    ------
    ValueError
        If the data is invalid.

    """
    version = read_magic(fp)
    if version != (1, 0):
        msg = "only support version (1,0) of file format, not %r"
        raise ValueError(msg % (version,))
    shape, fortran_order, dtype = read_array_header_1_0(fp)
    if len(shape) == 0:
        count = 1
    else:
        count = numpy.multiply.reduce(shape)

    # Now read the actual data.
    if dtype.hasobject:
        # The array contained Python objects. We need to unpickle the data.
        array = cPickle.load(fp)
    else:
        if isfileobj(fp):
            # We can use the fast fromfile() function.
            array = numpy.fromfile(fp, dtype=dtype, count=count)
        else:
            # This is not a real file. We have to read it the
            # memory-intensive way.
            # XXX: we can probably chunk this to avoid the memory hit.
            data = fp.read(int(count * dtype.itemsize))
            array = numpy.fromstring(data, dtype=dtype, count=count)

        if fortran_order:
            array.shape = shape[::-1]
            array = array.transpose()
        else:
            array.shape = shape

    return array

def read_fasta_dataset(fd):
    if isfileobj(fd):
        # file already opened; wrap it so the ``with`` block below does not
        # close the caller's handle (requires contextlib, Python 3.7+)
        ctx = contextlib.nullcontext(fd)
    else:
        # open file
        ctx = open(os.fspath(fd), 'r')

    with ctx as f:
        fasta_dataset = {}
        k = ''
        pattern = re.compile(r'^>\w+')
        for line in f:
            m = pattern.match(line)
            if m:
                # new FASTA ID
                k = m.group(0)
                fasta_dataset[k] = ''
            else:
                fasta_dataset[k] += line.strip()

    return fasta_dataset

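# Hedged usage sketch for read_fasta_dataset: parse a two-record FASTA file
# from a real path (a StringIO would not work here, since the non-file branch
# calls os.fspath on the argument). Assumes os, re, contextlib, and an
# isfileobj helper that recognizes real file handles are available above.
def _demo_read_fasta_dataset():
    import os
    import tempfile

    path = os.path.join(tempfile.mkdtemp(), 'toy.fasta')
    with open(path, 'w') as f:
        f.write(">seq1\nACGT\nACGT\n>seq2\nTTTT\n")
    data = read_fasta_dataset(path)
    assert data == {'>seq1': 'ACGTACGT', '>seq2': 'TTTT'}
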
def open_memmap(filename, mode='r+', dtype=None, shape=None,
                fortran_order=False, version=None):
    """
    Open a .npy file as a memory-mapped array.

    This may be used to read an existing file or create a new one.

    Parameters
    ----------
    filename : str or path-like
        The name of the file on disk. This may *not* be a file-like
        object.
    mode : str, optional
        The mode in which to open the file; the default is 'r+'. In
        addition to the standard file modes, 'c' is also accepted to mean
        "copy on write." See `memmap` for the available mode strings.
    dtype : data-type, optional
        The data type of the array if we are creating a new file in "write"
        mode, if not, `dtype` is ignored. The default value is None, which
        results in a data-type of `float64`.
    shape : tuple of int
        The shape of the array if we are creating a new file in "write"
        mode, in which case this parameter is required. Otherwise, this
        parameter is ignored and is thus optional.
    fortran_order : bool, optional
        Whether the array should be Fortran-contiguous (True) or
        C-contiguous (False, the default) if we are creating a new file in
        "write" mode.
    version : tuple of int (major, minor) or None
        If the mode is a "write" mode, then this is the version of the file
        format used to create the file. None means use the oldest
        supported version that is able to store the data. Default: None

    Returns
    -------
    marray : memmap
        The memory-mapped array.

    Raises
    ------
    ValueError
        If the data or the mode is invalid.
    IOError
        If the file is not found or cannot be opened correctly.

    See Also
    --------
    memmap

    """
    if isfileobj(filename):
        raise ValueError("Filename must be a string or a path-like object."
                         " Memmap cannot use existing file handles.")

    if 'w' in mode:
        # We are creating the file, not reading it.
        # Check if we ought to create the file.
        _check_version(version)
        # Ensure that the given dtype is an authentic dtype object rather
        # than just something that can be interpreted as a dtype object.
        dtype = numpy.dtype(dtype)
        if dtype.hasobject:
            msg = "Array can't be memory-mapped: Python objects in dtype."
            raise ValueError(msg)
        d = dict(
            descr=dtype_to_descr(dtype),
            fortran_order=fortran_order,
            shape=shape,
        )
        # If we got here, then it should be safe to create the file.
        with open(os_fspath(filename), mode + 'b') as fp:
            _write_array_header(fp, d, version)
            offset = fp.tell()
    else:
        # Read the header of the file first.
        with open(os_fspath(filename), 'rb') as fp:
            version = read_magic(fp)
            _check_version(version)

            shape, fortran_order, dtype = _read_array_header(fp, version)
            if dtype.hasobject:
                msg = "Array can't be memory-mapped: Python objects in dtype."
                raise ValueError(msg)
            offset = fp.tell()

    if fortran_order:
        order = 'F'
    else:
        order = 'C'

    # We need to change a write-only mode to a read-write mode since we've
    # already written data to the file.
    if mode == 'w+':
        mode = 'r+'

    marray = numpy.memmap(filename, dtype=dtype, shape=shape, order=order,
                          mode=mode, offset=offset)

    return marray

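# Minimal usage sketch for open_memmap, assuming the numpy/lib/format.py
# helpers referenced above are in scope: create a fresh .npy file in 'w+'
# mode (shape is required there), fill it through the memmap, then reopen it
# read-only and check the contents.
def _demo_open_memmap():
    import os
    import tempfile

    import numpy

    path = os.path.join(tempfile.mkdtemp(), 'mm.npy')
    m = open_memmap(path, mode='w+', dtype=numpy.float32, shape=(4, 4))
    m[:] = numpy.eye(4, dtype=numpy.float32)
    m.flush()
    r = open_memmap(path, mode='r')
    assert numpy.array_equal(r, numpy.eye(4, dtype=numpy.float32))
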
def read_array(fp, allow_pickle=False, pickle_kwargs=None):
    """
    Read an array from an NPY file.

    Parameters
    ----------
    fp : file_like object
        If this is not a real file object, then this may take extra memory
        and time.
    allow_pickle : bool, optional
        Whether to allow reading pickled data. Default: False

        .. versionchanged:: 1.16.3
            Made default False in response to CVE-2019-6446.

    pickle_kwargs : dict
        Additional keyword arguments to pass to pickle.load. These are only
        useful when loading object arrays saved on Python 2 when using
        Python 3.

    Returns
    -------
    array : ndarray
        The array from the data on disk.

    Raises
    ------
    ValueError
        If the data is invalid, or allow_pickle=False and the file contains
        an object array.

    """
    version = read_magic(fp)
    _check_version(version)
    shape, fortran_order, dtype = _read_array_header(fp, version)
    if len(shape) == 0:
        count = 1
    else:
        count = numpy.multiply.reduce(shape, dtype=numpy.int64)

    # Now read the actual data.
    if dtype.hasobject:
        # The array contained Python objects. We need to unpickle the data.
        if not allow_pickle:
            raise ValueError("Object arrays cannot be loaded when "
                             "allow_pickle=False")
        if pickle_kwargs is None:
            pickle_kwargs = {}
        try:
            array = pickle.load(fp, **pickle_kwargs)
        except UnicodeError as err:
            if sys.version_info[0] >= 3:
                # Friendlier error message
                raise UnicodeError("Unpickling a python object failed: %r\n"
                                   "You may need to pass the encoding= option "
                                   "to numpy.load" % (err,))
            raise
    else:
        if isfileobj(fp):
            # We can use the fast fromfile() function.
            array = numpy.fromfile(fp, dtype=dtype, count=count)
        else:
            # This is not a real file. We have to read it the
            # memory-intensive way.
            # crc32 module fails on reads greater than 2 ** 32 bytes,
            # breaking large reads from gzip streams. Chunk reads to
            # BUFFER_SIZE bytes to avoid issue and reduce memory overhead
            # of the read. In non-chunked case count < max_read_count, so
            # only one read is performed.

            # Use np.ndarray instead of np.empty since the latter does
            # not correctly instantiate zero-width string dtypes; see
            # https://github.com/numpy/numpy/pull/6430
            array = numpy.ndarray(count, dtype=dtype)

            if dtype.itemsize > 0:
                # If dtype.itemsize == 0 then there's nothing more to read
                max_read_count = BUFFER_SIZE // min(BUFFER_SIZE,
                                                    dtype.itemsize)

                for i in range(0, count, max_read_count):
                    read_count = min(max_read_count, count - i)
                    read_size = int(read_count * dtype.itemsize)
                    data = _read_bytes(fp, read_size, "array data")
                    array[i:i + read_count] = numpy.frombuffer(
                        data, dtype=dtype, count=read_count)

        if fortran_order:
            array.shape = shape[::-1]
            array = array.transpose()
        else:
            array.shape = shape

    return array

def write_array(fp, array, version=None, allow_pickle=True,
                pickle_kwargs=None):
    """
    Write an array to an NPY file, including a header.

    If the array is neither C-contiguous nor Fortran-contiguous AND the
    file_like object is not a real file object, this function will have to
    copy data in memory.

    Parameters
    ----------
    fp : file_like object
        An open, writable file object, or similar object with a
        ``.write()`` method.
    array : ndarray
        The array to write to disk.
    version : (int, int) or None, optional
        The version number of the format. None means use the oldest
        supported version that is able to store the data. Default: None
    allow_pickle : bool, optional
        Whether to allow writing pickled data. Default: True
    pickle_kwargs : dict, optional
        Additional keyword arguments to pass to pickle.dump, excluding
        'protocol'. These are only useful when pickling objects in object
        arrays on Python 3 to Python 2 compatible format.

    Raises
    ------
    ValueError
        If the array cannot be persisted. This includes the case of
        allow_pickle=False and array being an object array.
    Various other errors
        If the array contains Python objects as part of its dtype, the
        process of pickling them may raise various errors if the objects
        are not picklable.

    """
    _check_version(version)
    _write_array_header(fp, header_data_from_array_1_0(array), version)

    if array.itemsize == 0:
        buffersize = 0
    else:
        # Set buffer size to 16 MiB to hide the Python loop overhead.
        buffersize = max(16 * 1024**2 // array.itemsize, 1)

    if array.dtype.hasobject:
        # We contain Python objects so we cannot write out the data
        # directly. Instead, we will pickle it out
        if not allow_pickle:
            raise ValueError("Object arrays cannot be saved when "
                             "allow_pickle=False")
        if pickle_kwargs is None:
            pickle_kwargs = {}
        pickle.dump(array, fp, protocol=3, **pickle_kwargs)
    elif array.flags.f_contiguous and not array.flags.c_contiguous:
        if isfileobj(fp):
            array.T.tofile(fp)
        else:
            for chunk in numpy.nditer(
                    array, flags=['external_loop', 'buffered', 'zerosize_ok'],
                    buffersize=buffersize, order='F'):
                fp.write(chunk.tobytes('C'))
    else:
        if isfileobj(fp):
            array.tofile(fp)
        else:
            for chunk in numpy.nditer(
                    array, flags=['external_loop', 'buffered', 'zerosize_ok'],
                    buffersize=buffersize, order='C'):
                fp.write(chunk.tobytes('C'))

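# Sketch of the allow_pickle gate in the variant above: object dtypes can
# only be serialized via pickle, so passing allow_pickle=False must raise.
# Assumes the numpy/lib/format.py helpers are in scope as before.
def _demo_allow_pickle_gate():
    import io

    import numpy

    obj = numpy.array([{'a': 1}, None], dtype=object)
    buf = io.BytesIO()
    try:
        write_array(buf, obj, allow_pickle=False)
    except ValueError as exc:
        assert 'allow_pickle=False' in str(exc)
    else:
        raise AssertionError("expected ValueError for an object array")
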
def open_memmap(filename, mode='r+', dtype=None, shape=None,
                fortran_order=False, version=None):
    """
    Open a .npy file as a memory-mapped array.

    This may be used to read an existing file or create a new one.

    Parameters
    ----------
    filename : str or path-like
        The name of the file on disk. This may *not* be a file-like
        object.
    mode : str, optional
        The mode in which to open the file; the default is 'r+'. In
        addition to the standard file modes, 'c' is also accepted to mean
        "copy on write." See `memmap` for the available mode strings.
    dtype : data-type, optional
        The data type of the array if we are creating a new file in "write"
        mode, if not, `dtype` is ignored. The default value is None, which
        results in a data-type of `float64`.
    shape : tuple of int
        The shape of the array if we are creating a new file in "write"
        mode, in which case this parameter is required. Otherwise, this
        parameter is ignored and is thus optional.
    fortran_order : bool, optional
        Whether the array should be Fortran-contiguous (True) or
        C-contiguous (False, the default) if we are creating a new file in
        "write" mode.
    version : tuple of int (major, minor) or None
        If the mode is a "write" mode, then this is the version of the file
        format used to create the file. None means use the oldest
        supported version that is able to store the data. Default: None

    Returns
    -------
    marray : memmap
        The memory-mapped array.

    Raises
    ------
    ValueError
        If the data or the mode is invalid.
    IOError
        If the file is not found or cannot be opened correctly.

    See Also
    --------
    memmap

    """
    if isfileobj(filename):
        raise ValueError("Filename must be a string or a path-like object."
                         " Memmap cannot use existing file handles.")

    if 'w' in mode:
        # We are creating the file, not reading it.
        # Check if we ought to create the file.
        _check_version(version)
        # Ensure that the given dtype is an authentic dtype object rather
        # than just something that can be interpreted as a dtype object.
        dtype = numpy.dtype(dtype)
        if dtype.hasobject:
            msg = "Array can't be memory-mapped: Python objects in dtype."
            raise ValueError(msg)
        d = dict(
            descr=dtype_to_descr(dtype),
            fortran_order=fortran_order,
            shape=shape,
        )
        # If we got here, then it should be safe to create the file.
        fp = open(os_fspath(filename), mode + 'b')
        try:
            used_ver = _write_array_header(fp, d, version)
            # this warning can be removed when 1.9 has aged enough
            if version != (2, 0) and used_ver == (2, 0):
                warnings.warn("Stored array in format 2.0. It can only be "
                              "read by NumPy >= 1.9", UserWarning,
                              stacklevel=2)
            offset = fp.tell()
        finally:
            fp.close()
    else:
        # Read the header of the file first.
        fp = open(os_fspath(filename), 'rb')
        try:
            version = read_magic(fp)
            _check_version(version)

            shape, fortran_order, dtype = _read_array_header(fp, version)
            if dtype.hasobject:
                msg = "Array can't be memory-mapped: Python objects in dtype."
                raise ValueError(msg)
            offset = fp.tell()
        finally:
            fp.close()

    if fortran_order:
        order = 'F'
    else:
        order = 'C'

    # We need to change a write-only mode to a read-write mode since we've
    # already written data to the file.
    if mode == 'w+':
        mode = 'r+'

    marray = numpy.memmap(filename, dtype=dtype, shape=shape, order=order,
                          mode=mode, offset=offset)

    return marray

def read_array(fp, allow_pickle=True, pickle_kwargs=None):
    """
    Read an array from an NPY file.

    Parameters
    ----------
    fp : file_like object
        If this is not a real file object, then this may take extra memory
        and time.
    allow_pickle : bool, optional
        Whether to allow reading pickled data. Default: True
    pickle_kwargs : dict
        Additional keyword arguments to pass to pickle.load. These are only
        useful when loading object arrays saved on Python 2 when using
        Python 3.

    Returns
    -------
    array : ndarray
        The array from the data on disk.

    Raises
    ------
    ValueError
        If the data is invalid, or allow_pickle=False and the file contains
        an object array.

    """
    version = read_magic(fp)
    _check_version(version)
    shape, fortran_order, dtype = _read_array_header(fp, version)
    if len(shape) == 0:
        count = 1
    else:
        count = numpy.multiply.reduce(shape, dtype=numpy.int64)

    # Now read the actual data.
    if dtype.hasobject:
        # The array contained Python objects. We need to unpickle the data.
        if not allow_pickle:
            raise ValueError("Object arrays cannot be loaded when "
                             "allow_pickle=False")
        if pickle_kwargs is None:
            pickle_kwargs = {}
        try:
            array = pickle.load(fp, **pickle_kwargs)
        except UnicodeError as err:
            if sys.version_info[0] >= 3:
                # Friendlier error message
                raise UnicodeError("Unpickling a python object failed: %r\n"
                                   "You may need to pass the encoding= option "
                                   "to numpy.load" % (err,))
            raise
    else:
        if isfileobj(fp):
            # We can use the fast fromfile() function.
            array = numpy.fromfile(fp, dtype=dtype, count=count)
        else:
            # This is not a real file. We have to read it the
            # memory-intensive way.
            # crc32 module fails on reads greater than 2 ** 32 bytes,
            # breaking large reads from gzip streams. Chunk reads to
            # BUFFER_SIZE bytes to avoid issue and reduce memory overhead
            # of the read. In non-chunked case count < max_read_count, so
            # only one read is performed.

            # Use np.ndarray instead of np.empty since the latter does
            # not correctly instantiate zero-width string dtypes; see
            # https://github.com/numpy/numpy/pull/6430
            array = numpy.ndarray(count, dtype=dtype)

            if dtype.itemsize > 0:
                # If dtype.itemsize == 0 then there's nothing more to read
                max_read_count = BUFFER_SIZE // min(BUFFER_SIZE,
                                                    dtype.itemsize)

                for i in range(0, count, max_read_count):
                    read_count = min(max_read_count, count - i)
                    read_size = int(read_count * dtype.itemsize)
                    data = _read_bytes(fp, read_size, "array data")
                    array[i:i+read_count] = numpy.frombuffer(data, dtype=dtype,
                                                             count=read_count)

        if fortran_order:
            array.shape = shape[::-1]
            array = array.transpose()
        else:
            array.shape = shape

    return array

def write_array(fp, array, version=None, allow_pickle=True,
                pickle_kwargs=None):
    """
    Write an array to an NPY file, including a header.

    If the array is neither C-contiguous nor Fortran-contiguous AND the
    file_like object is not a real file object, this function will have to
    copy data in memory.

    Parameters
    ----------
    fp : file_like object
        An open, writable file object, or similar object with a
        ``.write()`` method.
    array : ndarray
        The array to write to disk.
    version : (int, int) or None, optional
        The version number of the format. None means use the oldest
        supported version that is able to store the data. Default: None
    allow_pickle : bool, optional
        Whether to allow writing pickled data. Default: True
    pickle_kwargs : dict, optional
        Additional keyword arguments to pass to pickle.dump, excluding
        'protocol'. These are only useful when pickling objects in object
        arrays on Python 3 to Python 2 compatible format.

    Raises
    ------
    ValueError
        If the array cannot be persisted. This includes the case of
        allow_pickle=False and array being an object array.
    Various other errors
        If the array contains Python objects as part of its dtype, the
        process of pickling them may raise various errors if the objects
        are not picklable.

    """
    _check_version(version)
    used_ver = _write_array_header(fp, header_data_from_array_1_0(array),
                                   version)
    # this warning can be removed when 1.9 has aged enough
    if version != (2, 0) and used_ver == (2, 0):
        warnings.warn("Stored array in format 2.0. It can only be "
                      "read by NumPy >= 1.9", UserWarning, stacklevel=2)

    if array.itemsize == 0:
        buffersize = 0
    else:
        # Set buffer size to 16 MiB to hide the Python loop overhead.
        buffersize = max(16 * 1024 ** 2 // array.itemsize, 1)

    if array.dtype.hasobject:
        # We contain Python objects so we cannot write out the data
        # directly. Instead, we will pickle it out with version 2 of the
        # pickle protocol.
        if not allow_pickle:
            raise ValueError("Object arrays cannot be saved when "
                             "allow_pickle=False")
        if pickle_kwargs is None:
            pickle_kwargs = {}
        pickle.dump(array, fp, protocol=2, **pickle_kwargs)
    elif array.flags.f_contiguous and not array.flags.c_contiguous:
        if isfileobj(fp):
            array.T.tofile(fp)
        else:
            for chunk in numpy.nditer(
                    array, flags=['external_loop', 'buffered', 'zerosize_ok'],
                    buffersize=buffersize, order='F'):
                fp.write(chunk.tobytes('C'))
    else:
        if isfileobj(fp):
            array.tofile(fp)
        else:
            for chunk in numpy.nditer(
                    array, flags=['external_loop', 'buffered', 'zerosize_ok'],
                    buffersize=buffersize, order='C'):
                fp.write(chunk.tobytes('C'))

def fromfile(fd, dtype=None, shape=None, offset=0, formats=None,
             names=None, titles=None, aligned=False, byteorder=None):
    """Create an array from binary file data

    If file is a string or a path-like object then that file is opened,
    else it is assumed to be a file object. The file object must support
    random access (i.e. it must have tell and seek methods).

    >>> from tempfile import TemporaryFile
    >>> a = np.empty(10,dtype='f8,i4,a5')
    >>> a[5] = (0.5,10,'abcde')
    >>>
    >>> fd=TemporaryFile()
    >>> a = a.newbyteorder('<')
    >>> a.tofile(fd)
    >>>
    >>> fd.seek(0)
    >>> r=np.core.records.fromfile(fd, formats='f8,i4,a5', shape=10,
    ... byteorder='<')
    >>> print(r[5])
    (0.5, 10, 'abcde')
    >>> r.shape
    (10,)
    """

    if dtype is None and formats is None:
        raise TypeError("fromfile() needs a 'dtype' or 'formats' argument")

    if (shape is None or shape == 0):
        shape = (-1,)
    elif isinstance(shape, (int, long)):
        shape = (shape,)

    if isfileobj(fd):
        # file already opened
        name = 0
    else:
        # open file
        fd = open(os_fspath(fd), 'rb')
        name = 1

    if (offset > 0):
        fd.seek(offset, 1)
    size = get_remaining_size(fd)

    if dtype is not None:
        descr = sb.dtype(dtype)
    else:
        descr = format_parser(formats, names, titles,
                              aligned, byteorder)._descr

    itemsize = descr.itemsize

    shapeprod = sb.array(shape).prod(dtype=nt.intp)
    shapesize = shapeprod * itemsize
    if shapesize < 0:
        shape = list(shape)
        shape[shape.index(-1)] = size // -shapesize
        shape = tuple(shape)
        shapeprod = sb.array(shape).prod(dtype=nt.intp)

    nbytes = shapeprod * itemsize

    if nbytes > size:
        raise ValueError(
            "Not enough bytes left in file for specified shape and type")

    # create the array
    _array = recarray(shape, descr)
    nbytesread = fd.readinto(_array.data)
    if nbytesread != nbytes:
        raise IOError("Didn't read as many bytes as expected")
    if name:
        fd.close()

    return _array

def read_array(fp):
    """
    Read an array from an NPY file.

    Parameters
    ----------
    fp : file_like object
        If this is not a real file object, then this may take extra memory
        and time.

    Returns
    -------
    array : ndarray
        The array from the data on disk.

    Raises
    ------
    ValueError
        If the data is invalid.

    """
    version = read_magic(fp)
    if version != (1, 0):
        msg = "only support version (1,0) of file format, not %r"
        raise ValueError(msg % (version,))
    shape, fortran_order, dtype = read_array_header_1_0(fp)
    if len(shape) == 0:
        count = 1
    else:
        count = numpy.multiply.reduce(shape)

    # Now read the actual data.
    if dtype.hasobject:
        # The array contained Python objects. We need to unpickle the data.
        array = pickle.load(fp)
    else:
        if isfileobj(fp):
            # We can use the fast fromfile() function.
            array = numpy.fromfile(fp, dtype=dtype, count=count)
        else:
            # This is not a real file. We have to read it the
            # memory-intensive way.
            # crc32 module fails on reads greater than 2 ** 32 bytes,
            # breaking large reads from gzip streams. Chunk reads to
            # BUFFER_SIZE bytes to avoid issue and reduce memory overhead
            # of the read. In non-chunked case count < max_read_count, so
            # only one read is performed.
            max_read_count = BUFFER_SIZE // dtype.itemsize

            array = numpy.empty(count, dtype=dtype)

            for i in range(0, count, max_read_count):
                read_count = min(max_read_count, count - i)

                data = fp.read(int(read_count * dtype.itemsize))
                array[i:i+read_count] = numpy.frombuffer(data, dtype=dtype,
                                                         count=read_count)

        if fortran_order:
            array.shape = shape[::-1]
            array = array.transpose()
        else:
            array.shape = shape

    return array

def fromfile(fd, dtype=None, shape=None, offset=0, formats=None,
             names=None, titles=None, aligned=False, byteorder=None):
    """Create an array from binary file data

    Parameters
    ----------
    fd : str or file type
        If file is a string or a path-like object then that file is opened,
        else it is assumed to be a file object. The file object must
        support random access (i.e. it must have tell and seek methods).
    dtype : data-type, optional
        valid dtype for all arrays
    shape : int or tuple of ints, optional
        shape of each array.
    offset : int, optional
        Position in the file to start reading from.
    formats, names, titles, aligned, byteorder :
        If `dtype` is ``None``, these arguments are passed to
        `numpy.format_parser` to construct a dtype. See that function for
        detailed documentation

    Returns
    -------
    np.recarray
        record array consisting of data enclosed in file.

    Examples
    --------
    >>> from tempfile import TemporaryFile
    >>> a = np.empty(10,dtype='f8,i4,a5')
    >>> a[5] = (0.5,10,'abcde')
    >>>
    >>> fd=TemporaryFile()
    >>> a = a.newbyteorder('<')
    >>> a.tofile(fd)
    >>>
    >>> _ = fd.seek(0)
    >>> r=np.core.records.fromfile(fd, formats='f8,i4,a5', shape=10,
    ... byteorder='<')
    >>> print(r[5])
    (0.5, 10, 'abcde')
    >>> r.shape
    (10,)
    """

    if dtype is None and formats is None:
        raise TypeError("fromfile() needs a 'dtype' or 'formats' argument")

    # NumPy 1.19.0, 2020-01-01
    shape = _deprecate_shape_0_as_None(shape)

    if shape is None:
        shape = (-1,)
    elif isinstance(shape, int):
        shape = (shape,)

    if isfileobj(fd):
        # file already opened
        ctx = contextlib_nullcontext(fd)
    else:
        # open file
        ctx = open(os_fspath(fd), 'rb')

    with ctx as fd:
        if offset > 0:
            fd.seek(offset, 1)
        size = get_remaining_size(fd)

        if dtype is not None:
            descr = sb.dtype(dtype)
        else:
            descr = format_parser(formats, names, titles,
                                  aligned, byteorder).dtype

        itemsize = descr.itemsize

        shapeprod = sb.array(shape).prod(dtype=nt.intp)
        shapesize = shapeprod * itemsize
        if shapesize < 0:
            shape = list(shape)
            shape[shape.index(-1)] = size // -shapesize
            shape = tuple(shape)
            shapeprod = sb.array(shape).prod(dtype=nt.intp)

        nbytes = shapeprod * itemsize

        if nbytes > size:
            raise ValueError(
                "Not enough bytes left in file for specified shape and type")

        # create the array
        _array = recarray(shape, descr)
        nbytesread = fd.readinto(_array.data)
        if nbytesread != nbytes:
            raise IOError("Didn't read as many bytes as expected")

    return _array

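# Hedged usage sketch for the fromfile variant above, assuming
# numpy.core.records semantics (default field names f0, f1, ... when only
# formats are given) and native byte order on both write and read; the
# helpers it uses (os_fspath, get_remaining_size, contextlib_nullcontext,
# format_parser, recarray) must be in scope as in numpy/core/records.py.
def _demo_records_fromfile():
    import os
    import tempfile

    import numpy as np

    a = np.zeros(10, dtype='f8,i4,a5')
    a[5] = (0.5, 10, 'abcde')
    path = os.path.join(tempfile.mkdtemp(), 'rec.bin')
    a.tofile(path)
    r = fromfile(path, formats='f8,i4,a5', shape=10)
    assert r.shape == (10,)
    assert r[5].f0 == 0.5
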
def write_array(fp, array, version=None, pickle_kwargs=None):
    """
    Write an array to an NPY file, including a header.

    If the array is neither C-contiguous nor Fortran-contiguous AND the
    file_like object is not a real file object, this function will have to
    copy data in memory.

    Parameters
    ----------
    fp : file_like object
        An open, writable file object, or similar object with a
        ``.write()`` method.
    array : ndarray
        The array to write to disk.
    version : (int, int) or None, optional
        The version number of the format. None means use the oldest
        supported version that is able to store the data. Default: None
    pickle_kwargs : dict, optional
        Additional keyword arguments to pass to pickle.dump, excluding
        'protocol'. These are only useful when pickling objects in object
        arrays on Python 3 to Python 2 compatible format.

    Raises
    ------
    ValueError
        If the array cannot be persisted.
    Various other errors
        If the array contains Python objects as part of its dtype, the
        process of pickling them may raise various errors if the objects
        are not picklable.

    """
    _check_version(version)
    used_ver = _write_array_header(fp, header_data_from_array_1_0(array),
                                   version)
    # this warning can be removed when 1.9 has aged enough
    if version != (2, 0) and used_ver == (2, 0):
        warnings.warn("Stored array in format 2.0. It can only be "
                      "read by NumPy >= 1.9", UserWarning)

    # Set buffer size to 16 MiB to hide the Python loop overhead.
    buffersize = max(16 * 1024**2 // array.itemsize, 1)

    if array.dtype.hasobject:
        # We contain Python objects so we cannot write out the data
        # directly. Instead, we will pickle it out with version 2 of the
        # pickle protocol.
        if pickle_kwargs is None:
            pickle_kwargs = {}
        pickle.dump(array, fp, protocol=2, **pickle_kwargs)
    elif array.flags.f_contiguous and not array.flags.c_contiguous:
        if isfileobj(fp):
            array.T.tofile(fp)
        else:
            for chunk in numpy.nditer(
                    array, flags=['external_loop', 'buffered', 'zerosize_ok'],
                    buffersize=buffersize, order='F'):
                fp.write(chunk.tobytes('C'))
    else:
        if isfileobj(fp):
            array.tofile(fp)
        else:
            for chunk in numpy.nditer(
                    array, flags=['external_loop', 'buffered', 'zerosize_ok'],
                    buffersize=buffersize, order='C'):
                fp.write(chunk.tobytes('C'))

def read_array(fp, pickle_kwargs=None):
    """
    Read an array from an NPY file.

    Parameters
    ----------
    fp : file_like object
        If this is not a real file object, then this may take extra memory
        and time.
    pickle_kwargs : dict
        Additional keyword arguments to pass to pickle.load. These are only
        useful when loading object arrays saved on Python 2 when using
        Python 3.

    Returns
    -------
    array : ndarray
        The array from the data on disk.

    Raises
    ------
    ValueError
        If the data is invalid.

    """
    version = read_magic(fp)
    _check_version(version)
    shape, fortran_order, dtype = _read_array_header(fp, version)
    if len(shape) == 0:
        count = 1
    else:
        count = numpy.multiply.reduce(shape)

    # Now read the actual data.
    if dtype.hasobject:
        # The array contained Python objects. We need to unpickle the data.
        if pickle_kwargs is None:
            pickle_kwargs = {}
        try:
            array = pickle.load(fp, **pickle_kwargs)
        except UnicodeError as err:
            if sys.version_info[0] >= 3:
                # Friendlier error message
                raise UnicodeError("Unpickling a python object failed: %r\n"
                                   "You may need to pass the encoding= option "
                                   "to numpy.load" % (err,))
            raise
    else:
        if isfileobj(fp):
            # We can use the fast fromfile() function.
            array = numpy.fromfile(fp, dtype=dtype, count=count)
        else:
            # This is not a real file. We have to read it the
            # memory-intensive way.
            # crc32 module fails on reads greater than 2 ** 32 bytes,
            # breaking large reads from gzip streams. Chunk reads to
            # BUFFER_SIZE bytes to avoid issue and reduce memory overhead
            # of the read. In non-chunked case count < max_read_count, so
            # only one read is performed.
            max_read_count = BUFFER_SIZE // min(BUFFER_SIZE, dtype.itemsize)

            array = numpy.empty(count, dtype=dtype)
            for i in range(0, count, max_read_count):
                read_count = min(max_read_count, count - i)
                read_size = int(read_count * dtype.itemsize)
                data = _read_bytes(fp, read_size, "array data")
                array[i:i + read_count] = numpy.frombuffer(data, dtype=dtype,
                                                           count=read_count)

        if fortran_order:
            array.shape = shape[::-1]
            array = array.transpose()
        else:
            array.shape = shape

    return array

def array(obj, dtype=None, shape=None, offset=0, strides=None, formats=None,
          names=None, titles=None, aligned=False, byteorder=None, copy=True):
    """Construct a record array from a wide-variety of objects.
    """

    if ((isinstance(obj, (type(None), str)) or isfileobj(obj)) and
            (formats is None) and (dtype is None)):
        raise ValueError("Must define formats (or dtype) if object is "
                         "None, string, or an open file")

    kwds = {}
    if dtype is not None:
        dtype = sb.dtype(dtype)
    elif formats is not None:
        dtype = format_parser(formats, names, titles,
                              aligned, byteorder)._descr
    else:
        kwds = {'formats': formats,
                'names': names,
                'titles': titles,
                'aligned': aligned,
                'byteorder': byteorder}

    if obj is None:
        if shape is None:
            raise ValueError("Must define a shape if obj is None")
        return recarray(shape, dtype, buf=obj, offset=offset, strides=strides)

    elif isinstance(obj, bytes):
        return fromstring(obj, dtype, shape=shape, offset=offset, **kwds)

    elif isinstance(obj, (list, tuple)):
        if isinstance(obj[0], (tuple, list)):
            return fromrecords(obj, dtype=dtype, shape=shape, **kwds)
        else:
            return fromarrays(obj, dtype=dtype, shape=shape, **kwds)

    elif isinstance(obj, recarray):
        if dtype is not None and (obj.dtype != dtype):
            new = obj.view(dtype)
        else:
            new = obj
        if copy:
            new = new.copy()
        return new

    elif isfileobj(obj):
        return fromfile(obj, dtype=dtype, shape=shape, offset=offset)

    elif isinstance(obj, ndarray):
        if dtype is not None and (obj.dtype != dtype):
            new = obj.view(dtype)
        else:
            new = obj
        if copy:
            new = new.copy()
        res = new.view(recarray)
        if issubclass(res.dtype.type, nt.void):
            res.dtype = sb.dtype((record, res.dtype))
        return res

    else:
        interface = getattr(obj, "__array_interface__", None)
        if interface is None or not isinstance(interface, dict):
            raise ValueError("Unknown input type")
        obj = sb.array(obj)
        if dtype is not None and (obj.dtype != dtype):
            obj = obj.view(dtype)
        res = obj.view(recarray)
        if issubclass(res.dtype.type, nt.void):
            res.dtype = sb.dtype((record, res.dtype))
        return res

def fromfile(fd, dtype=None, shape=None, offset=0, formats=None,
             names=None, titles=None, aligned=False, byteorder=None):
    """Create an array from binary file data

    If file is a string or a path-like object then that file is opened,
    else it is assumed to be a file object. The file object must support
    random access (i.e. it must have tell and seek methods).

    >>> from tempfile import TemporaryFile
    >>> a = np.empty(10,dtype='f8,i4,a5')
    >>> a[5] = (0.5,10,'abcde')
    >>>
    >>> fd=TemporaryFile()
    >>> a = a.newbyteorder('<')
    >>> a.tofile(fd)
    >>>
    >>> _ = fd.seek(0)
    >>> r=np.core.records.fromfile(fd, formats='f8,i4,a5', shape=10,
    ... byteorder='<')
    >>> print(r[5])
    (0.5, 10, 'abcde')
    >>> r.shape
    (10,)
    """

    if dtype is None and formats is None:
        raise TypeError("fromfile() needs a 'dtype' or 'formats' argument")

    if (shape is None or shape == 0):
        shape = (-1,)
    elif isinstance(shape, (int, long)):
        shape = (shape,)

    if isfileobj(fd):
        # file already opened
        name = 0
    else:
        # open file
        fd = open(os_fspath(fd), 'rb')
        name = 1

    if (offset > 0):
        fd.seek(offset, 1)
    size = get_remaining_size(fd)

    if dtype is not None:
        descr = sb.dtype(dtype)
    else:
        descr = format_parser(formats, names, titles,
                              aligned, byteorder)._descr

    itemsize = descr.itemsize

    shapeprod = sb.array(shape).prod(dtype=nt.intp)
    shapesize = shapeprod * itemsize
    if shapesize < 0:
        shape = list(shape)
        shape[shape.index(-1)] = size // -shapesize
        shape = tuple(shape)
        shapeprod = sb.array(shape).prod(dtype=nt.intp)

    nbytes = shapeprod * itemsize

    if nbytes > size:
        raise ValueError(
            "Not enough bytes left in file for specified shape and type")

    # create the array
    _array = recarray(shape, descr)
    nbytesread = fd.readinto(_array.data)
    if nbytesread != nbytes:
        raise IOError("Didn't read as many bytes as expected")
    if name:
        fd.close()

    return _array

def array(obj, dtype=None, shape=None, offset=0, strides=None, formats=None,
          names=None, titles=None, aligned=True, byteorder=None, copy=True):
    """Construct a record array from a wide-variety of objects.
    """

    if ((isinstance(obj, (type(None), str)) or isfileobj(obj)) and
            (formats is None) and (dtype is None)):
        raise ValueError("Must define formats (or dtype) if object is "
                         "None, string, or an open file")

    kwds = {}
    if dtype is not None:
        dtype = sb.dtype(dtype)
    elif formats is not None:
        dtype = format_parser(formats, names, titles,
                              aligned, byteorder)._descr
    else:
        kwds = {'formats': formats,
                'names': names,
                'titles': titles,
                'aligned': aligned,
                'byteorder': byteorder}

    if obj is None:
        if shape is None:
            raise ValueError("Must define a shape if obj is None")
        return recarray(shape, dtype, buf=obj, offset=offset, strides=strides)

    elif isinstance(obj, bytes):
        return fromstring(obj, dtype, shape=shape, offset=offset, **kwds)

    elif isinstance(obj, (list, tuple)):
        if isinstance(obj[0], (tuple, list)):
            return fromrecords(obj, dtype=dtype, shape=shape, **kwds)
        else:
            return fromarrays(obj, dtype=dtype, shape=shape, **kwds)

    elif isinstance(obj, recarray):
        if dtype is not None and (obj.dtype != dtype):
            new = obj.view(dtype)
        else:
            new = obj
        if copy:
            new = new.copy()
        return new

    elif isfileobj(obj):
        return fromfile(obj, dtype=dtype, shape=shape, offset=offset)

    elif isinstance(obj, ndarray):
        if dtype is not None and (obj.dtype != dtype):
            new = obj.view(dtype)
        else:
            new = obj
        if copy:
            new = new.copy()
        return new.view(recarray)

    else:
        interface = getattr(obj, "__array_interface__", None)
        if interface is None or not isinstance(interface, dict):
            raise ValueError("Unknown input type")
        obj = sb.array(obj)
        if dtype is not None and (obj.dtype != dtype):
            obj = obj.view(dtype)
        return obj.view(recarray)

def array(obj, dtype=None, shape=None, offset=0, strides=None, formats=None,
          names=None, titles=None, aligned=False, byteorder=None, copy=True):
    """
    Construct a record array from a wide-variety of objects.

    A general-purpose record array constructor that dispatches to the
    appropriate `recarray` creation function based on the inputs (see Notes).

    Parameters
    ----------
    obj: any
        Input object. See Notes for details on how various input types are
        treated.
    dtype: data-type, optional
        Valid dtype for array.
    shape: int or tuple of ints, optional
        Shape of each array.
    offset: int, optional
        Position in the file or buffer to start reading from.
    strides: tuple of ints, optional
        Buffer (`buf`) is interpreted according to these strides (strides
        define how many bytes each array element, row, column, etc.
        occupy in memory).
    formats, names, titles, aligned, byteorder :
        If `dtype` is ``None``, these arguments are passed to
        `numpy.format_parser` to construct a dtype. See that function for
        detailed documentation.
    copy: bool, optional
        Whether to copy the input object (True), or to use a reference
        instead. This option only applies when the input is an ndarray or
        recarray. Defaults to True.

    Returns
    -------
    np.recarray
        Record array created from the specified object.

    Notes
    -----
    If `obj` is ``None``, then call the `~numpy.recarray` constructor. If
    `obj` is a string, then call the `fromstring` constructor. If `obj` is a
    list or a tuple, then if the first object is an `~numpy.ndarray`, call
    `fromarrays`, otherwise call `fromrecords`. If `obj` is a
    `~numpy.recarray`, then make a copy of the data in the recarray
    (if ``copy=True``) and use the new formats, names, and titles. If `obj`
    is a file, then call `fromfile`. Finally, if obj is an `ndarray`, then
    return ``obj.view(recarray)``, making a copy of the data if
    ``copy=True``.

    Examples
    --------
    >>> a = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
    >>> a
    array([[1, 2, 3],
           [4, 5, 6],
           [7, 8, 9]])

    >>> np.core.records.array(a)
    rec.array([[1, 2, 3],
               [4, 5, 6],
               [7, 8, 9]],
        dtype=int32)

    >>> b = [(1, 1), (2, 4), (3, 9)]
    >>> c = np.core.records.array(b, formats = ['i2', 'f2'], names = ('x', 'y'))
    >>> c
    rec.array([(1, 1.0), (2, 4.0), (3, 9.0)],
              dtype=[('x', '<i2'), ('y', '<f2')])

    >>> c.x
    rec.array([1, 2, 3], dtype=int16)

    >>> c.y
    rec.array([ 1.0, 4.0, 9.0], dtype=float16)

    >>> r = np.rec.array(['abc','def'], names=['col1','col2'])
    >>> print(r.col1)
    abc

    >>> r.col1
    array('abc', dtype='<U3')

    >>> r.col2
    array('def', dtype='<U3')
    """

    if ((isinstance(obj, (type(None), str)) or isfileobj(obj)) and
            formats is None and dtype is None):
        raise ValueError("Must define formats (or dtype) if object is "
                         "None, string, or an open file")

    kwds = {}
    if dtype is not None:
        dtype = sb.dtype(dtype)
    elif formats is not None:
        dtype = format_parser(formats, names, titles,
                              aligned, byteorder).dtype
    else:
        kwds = {'formats': formats,
                'names': names,
                'titles': titles,
                'aligned': aligned,
                'byteorder': byteorder}

    if obj is None:
        if shape is None:
            raise ValueError("Must define a shape if obj is None")
        return recarray(shape, dtype, buf=obj, offset=offset, strides=strides)

    elif isinstance(obj, bytes):
        return fromstring(obj, dtype, shape=shape, offset=offset, **kwds)

    elif isinstance(obj, (list, tuple)):
        if isinstance(obj[0], (tuple, list)):
            return fromrecords(obj, dtype=dtype, shape=shape, **kwds)
        else:
            return fromarrays(obj, dtype=dtype, shape=shape, **kwds)

    elif isinstance(obj, recarray):
        if dtype is not None and (obj.dtype != dtype):
            new = obj.view(dtype)
        else:
            new = obj
        if copy:
            new = new.copy()
        return new

    elif isfileobj(obj):
        return fromfile(obj, dtype=dtype, shape=shape, offset=offset)

    elif isinstance(obj, ndarray):
        if dtype is not None and (obj.dtype != dtype):
            new = obj.view(dtype)
        else:
            new = obj
        if copy:
            new = new.copy()
        return new.view(recarray)

    else:
        interface = getattr(obj, "__array_interface__", None)
        if interface is None or not isinstance(interface, dict):
            raise ValueError("Unknown input type")
        obj = sb.array(obj)
        if dtype is not None and (obj.dtype != dtype):
            obj = obj.view(dtype)
        return obj.view(recarray)