def read_npy(fp, prn=False):
    """Read the array stored in a ``.npy`` file.

    The data section is read in chunks of at most ``BUFFER_SIZE`` bytes and
    copied into a preallocated flat array, which is then given the shape
    (and memory order) recorded in the file header.

    Parameters
    ----------
    fp : str
        Path of the file to read, e.g. "c:/temp/a01.npy".
    prn : bool, optional
        If True, print the magic string, shape, C-contiguity and dtype.

    Returns
    -------
    numpy.ndarray
        Array with the shape and dtype recorded in the header.

    Raises
    ------
    ValueError
        If the file format major version is neither 1 nor 2.
    """
    frmt = """
    Magic {}
    Shape {}, C-contig {}, dtype {}
    """
    from numpy.lib import format as format_

    with open(fp, 'rb') as f:
        major, minor = format_.read_magic(f)
        mag = format_.magic(major, minor)
        # The header length field is 2 bytes in format 1.x and 4 bytes in
        # format 2.x, so the matching reader has to be chosen.
        if major == 1:
            shp, is_fortran, dt = format_.read_array_header_1_0(f)
        elif major == 2:
            shp, is_fortran, dt = format_.read_array_header_2_0(f)
        else:
            raise ValueError(
                "unsupported npy format version {}.{}".format(major, minor))
        count = int(np.multiply.reduce(shp, dtype=np.int64))
        BUFFER_SIZE = 2**18
        array = np.ndarray(count, dtype=dt)
        # A zero-sized item type carries no bytes; skipping the loop also
        # avoids dividing by zero when computing the chunk length.
        if dt.itemsize > 0:
            max_read_count = BUFFER_SIZE // min(BUFFER_SIZE, dt.itemsize)
            for i in range(0, count, max_read_count):
                read_count = min(max_read_count, count - i)
                read_size = int(read_count * dt.itemsize)
                data = format_._read_bytes(f, read_size, "array data")
                array[i:i + read_count] = np.frombuffer(data, dtype=dt,
                                                        count=read_count)
    if is_fortran:
        # Column-major data: shape the flat buffer with reversed axes and
        # transpose, otherwise the elements end up in the wrong positions.
        array.shape = shp[::-1]
        array = array.transpose()
    else:
        array.shape = shp
    if prn:
        print(dedent(frmt).format(mag, shp, (not is_fortran), dt))
    return array
def read_npy(fp, prn=False):
    """Read the array stored in a ``.npy`` file.

    The data section is read in chunks of at most ``BUFFER_SIZE`` bytes and
    copied into a preallocated flat array, which is then given the shape
    (and memory order) recorded in the file header.

    Parameters
    ----------
    fp : str
        Path of the file to read, e.g. "c:/temp/a01.npy".
    prn : bool, optional
        If True, print the magic string, shape, C-contiguity and dtype.

    Returns
    -------
    numpy.ndarray
        Array with the shape and dtype recorded in the header.

    Raises
    ------
    ValueError
        If the file format major version is neither 1 nor 2.
    """
    frmt = """
    Magic {}
    Shape {}, C-contig {}, dtype {}
    """
    from numpy.lib import format as format_

    with open(fp, 'rb') as f:
        major, minor = format_.read_magic(f)
        mag = format_.magic(major, minor)
        # Format 1.x stores the header length in 2 bytes, format 2.x in 4,
        # so each needs its own header reader.
        if major == 1:
            header = format_.read_array_header_1_0(f)
        elif major == 2:
            header = format_.read_array_header_2_0(f)
        else:
            raise ValueError(
                "unsupported npy format version {}.{}".format(major, minor))
        shp, is_fortran, dt = header
        count = int(np.multiply.reduce(shp, dtype=np.int64))
        BUFFER_SIZE = 2**18
        array = np.ndarray(count, dtype=dt)
        # Nothing to read for zero-sized items; this also keeps the chunk
        # computation below from dividing by zero.
        if dt.itemsize > 0:
            max_read_count = BUFFER_SIZE // min(BUFFER_SIZE, dt.itemsize)
            for i in range(0, count, max_read_count):
                read_count = min(max_read_count, count - i)
                read_size = int(read_count * dt.itemsize)
                data = format_._read_bytes(f, read_size, "array data")
                array[i:i + read_count] = np.frombuffer(data, dtype=dt,
                                                        count=read_count)
    if is_fortran:
        # The bytes on disk are column-major: reshape with reversed axes,
        # then transpose to recover the logical layout.
        array.shape = shp[::-1]
        array = array.transpose()
    else:
        array.shape = shp
    if prn:
        print(dedent(frmt).format(mag, shp, (not is_fortran), dt))
    return array
def load_shape(n):
    """Return the shape recorded in the header of the ``.npy`` file *n*.

    Only the magic prefix and the (format 1.0) header are read; the array
    data itself is not loaded.

    Raises
    ------
    TypeError
        If the stored array is not 4-dimensional.
    """
    with open(n, 'rb') as handle:
        # The magic prefix must be consumed before the header can be parsed.
        read_magic(handle)
        header = read_array_header_1_0(handle)
    dims = header[0]
    if len(dims) == 4:
        return dims
    raise TypeError('Errr! Single image... %s' % n)
def _parse_npy(bio):
    """Build an array from the npy stream ``bio``.

    When ``bio.raw.fileobj`` is an ``mmap.mmap`` the header is parsed from
    ``bio`` and a ``np.memmap`` view over the mapped buffer is returned;
    otherwise the whole stream is read and handed to ``np.load``.

    Raises
    ------
    ValueError
        If the mapped file's dtype contains Python objects.
    """
    backing = bio.raw.fileobj
    if not isinstance(backing, mmap.mmap):
        # No memory map available: fall back to a full in-memory load.
        return np.load(BytesIO(bio.read()))

    version = read_magic(bio)
    _check_version(version)
    shape, fortran_order, dtype = _read_array_header(bio, version)
    if dtype.hasobject:
        raise ValueError(
            "Array can't be memory-mapped: Python objects in dtype.")

    # Position of the data inside the stream, plus the offset from the
    # Wrapper file.
    start = bio.tell() + bio.raw.offset
    view = np.ndarray.__new__(
        np.memmap,
        shape,
        dtype=dtype,
        buffer=backing,
        offset=start,
        order="F" if fortran_order else "C",
    )
    # Fill in the attributes np.memmap instances normally carry.
    view._mmap = backing
    view.offset = start
    view.mode = "r+"
    return view
def _read_header(self):
    """Parse the npy header of ``self.path``.

    Stores the result on the instance as ``self.shape``,
    ``self.fortran_order`` and ``self.dtype``.

    Raises
    ------
    ValueError
        With the message "Invalid file format." when the version found in
        the magic prefix is rejected by ``format._check_version``.
    """
    with open(self.path, "rb") as stream:
        found_version = format.read_magic(stream)
        try:
            format._check_version(found_version)
        except ValueError:
            raise ValueError("Invalid file format.")
        shape, fortran_order, dtype = format._read_array_header(
            stream, found_version)
    self.shape = shape
    self.fortran_order = fortran_order
    self.dtype = dtype
def read_header_data(fname):
    """Read the header of a format 1.0 ``.npy`` file.

    Parameters
    ----------
    fname : str
        Path of the ``.npy`` file.

    Returns
    -------
    tuple
        ``(shape, fortran_order, dtype, header_length)`` where
        ``header_length`` is the file position right after the header,
        i.e. the byte offset at which the array data starts.

    Raises
    ------
    ValueError
        If the file is not in format version (1, 0).
    """
    # The magic prefix and header are binary, so the file must be opened in
    # binary mode; the context manager guarantees the handle is closed.
    with open(fname, 'rb') as fp:
        version = npfor.read_magic(fp)
        if version != (1, 0):
            msg = "only support version (1,0) of file format, not %r"
            raise ValueError(msg % (version,))
        shape, fortran_order, dtype = npfor.read_array_header_1_0(fp)
        header_length = fp.tell()
    return shape, fortran_order, dtype, header_length
def read_header_data(fname):
    """Return the header fields of a format 1.0 ``.npy`` file.

    Parameters
    ----------
    fname : str
        Path of the ``.npy`` file.

    Returns
    -------
    tuple
        ``(shape, fortran_order, dtype, header_length)``; ``header_length``
        is the file position reached once the header has been consumed,
        which is where the raw array bytes begin.

    Raises
    ------
    ValueError
        If the magic prefix announces a version other than (1, 0).
    """
    # Binary mode is required to read the magic bytes, and ``with`` makes
    # sure the file is closed even when the version check raises.
    with open(fname, 'rb') as fp:
        version = npfor.read_magic(fp)
        if version != (1, 0):
            msg = "only support version (1,0) of file format, not %r"
            raise ValueError(msg % (version, ))
        shape, fortran_order, dtype = npfor.read_array_header_1_0(fp)
        header_length = fp.tell()
    return shape, fortran_order, dtype, header_length
def _get_info(self):
    """Read the npy header from ``self.f`` and cache it on the instance.

    ``self.f`` is used as a context manager while the header is parsed.
    Sets ``self.shape``, ``self.dtype``, ``self.order`` ('F' or 'C') and
    ``self.offset`` (stream position right after the header).
    """
    from numpy.lib import format as npy_format

    with self.f as stream:
        found_version = npy_format.read_magic(stream)
        npy_format._check_version(found_version)
        dims, is_fortran, item_type = npy_format._read_array_header(
            stream, found_version)
        self.shape = dims
        self.dtype = item_type
        if is_fortran:
            self.order = 'F'
        else:
            self.order = 'C'
        # Position just past the header.
        self.offset = stream.tell()
def test_read_magic():
    """``read_magic`` returns the written version and stops after the prefix."""
    arr = np.ones((3, 6), dtype=float)
    versions = [(1, 0), (2, 0)]

    # Serialise the same array once per format version.
    streams = []
    for version in versions:
        stream = BytesIO()
        format.write_array(stream, arr, version=version)
        streams.append(stream)
    for stream in streams:
        stream.seek(0)

    found = [format.read_magic(stream) for stream in streams]

    for got, wanted in zip(found, versions):
        assert_(got == wanted)
    # Only the magic prefix may have been consumed.
    for stream in streams:
        assert_(stream.tell() == format.MAGIC_LEN)
def load_npy_file(path, block_size):
    """ Loads a file in npy format (must be 2-dimensional).

    The header is validated first; the data is then read one row-block at
    a time and each chunk of bytes is handed to ``_read_from_buffer``
    together with the placeholder objects for that row of blocks.

    Parameters
    ----------
    path : str
        Path to the npy file.
    block_size : tuple (int, int)
        Block size of the resulting ds-array.

    Returns
    -------
    x : ds-array

    Raises
    ------
    OSError
        If the file cannot be opened.
    ValueError
        If the array is Fortran-ordered, is not 2-dimensional, or is
        smaller than ``block_size`` along either axis.
    """
    # ``with`` replaces the former try/finally: when ``open`` itself failed,
    # the ``finally`` clause touched an unbound ``fid`` and masked the real
    # error with an UnboundLocalError.
    with open(path, "rb") as fid:
        version = format.read_magic(fid)
        format._check_version(version)
        shape, fortran_order, dtype = format._read_array_header(fid, version)

        if fortran_order:
            raise ValueError("Fortran order not supported for npy files")

        if len(shape) != 2:
            raise ValueError("Array is not 2-dimensional")

        if block_size[0] > shape[0] or block_size[1] > shape[1]:
            raise ValueError("Block size is larger than the array")

        blocks = []

        # Number of column blocks in every row of blocks.
        n_blocks = int(ceil(shape[1] / block_size[1]))

        for i in range(0, shape[0], block_size[0]):
            # The last row-block may hold fewer rows than block_size[0].
            read_count = min(block_size[0], shape[0] - i)
            read_size = int(read_count * shape[1] * dtype.itemsize)
            data = fid.read(read_size)
            out_blocks = [object() for _ in range(n_blocks)]
            _read_from_buffer(data, dtype, shape[1], block_size[1],
                              out_blocks)
            blocks.append(out_blocks)

        return Array(blocks=blocks, top_left_shape=block_size,
                     reg_shape=block_size, shape=shape, sparse=False)
def __init__(self, filename):
    """Memory-map every array stored in the archive ``filename``.

    For each key listed by ``np.load(filename).files`` the npy header of
    the member ``<key>.npy`` is parsed and a read-only ``np.memmap`` over
    the archive's file object is stored in ``self._data[key]``.
    """
    self._filename = filename
    self._data = {}
    npz = np.load(filename)
    # File object underlying the archive; all reads below go through it.
    file = npz.zip.fp
    for key in npz.files:
        filename = '{}.npy'.format(key)
        # The returned member handle is discarded.
        # NOTE(review): this call is presumably relied on to leave ``file``
        # positioned at the start of the member's data - confirm against the
        # zipfile implementation in use.
        npz.zip.open(filename)
        version = nlf.read_magic(file)
        # Pick the header reader matching the version found in the magic.
        shape, fortran_order, dtype = nlf.read_array_header_1_0(file) if version == (1, 0) \
            else nlf.read_array_header_2_0(file)
        # ``file.tell()`` is the position right after the header.
        # NOTE(review): mapping raw bytes assumes the member is stored
        # uncompressed - confirm.
        self._data[key] = np.memmap(file, dtype=dtype, mode='r', shape=shape,
                                    order='F' if fortran_order else 'C',
                                    offset=file.tell())
def read_npy(fp, prn=False):
    """ Read an npy file quickly

    The data section is read in chunks of at most ``BUFFER_SIZE`` bytes
    into a preallocated flat array, which is then given the shape and
    memory order recorded in the file header.

    fp : string
        The file path: "c:/temp/a01.npy"
    prn : boolean
        obtain full information if True

    Returns:
    --------
    The array, with the shape and dtype recorded in the header.

    Raises:
    -------
    ValueError if the file format major version is neither 1 nor 2.

    Requires:
    ---------
    from numpy.lib import format

    Notes:
    -------
    shortcut ... np.load("c:/temp/a01.npy")
    """
    frmt = """
    ---- npy reader ---------------------------------------------------------
    File {}
    Shape {}, C-contig {}, dtype {}
    Magic {}
    -------------------------------------------------------------------------
    """
    with open(fp, 'rb') as f:
        major, minor = format.read_magic(f)
        mag = format.magic(major, minor)
        # Format 1.x uses a 2-byte header length, format 2.x a 4-byte one,
        # so the header reader has to match the version.
        if major == 1:
            shp, is_fortran, dt = format.read_array_header_1_0(f)
        elif major == 2:
            shp, is_fortran, dt = format.read_array_header_2_0(f)
        else:
            raise ValueError(
                "unsupported npy format version {}.{}".format(major, minor))
        count = int(np.multiply.reduce(shp, dtype=np.int64))
        BUFFER_SIZE = 2**18
        array = np.ndarray(count, dtype=dt)
        # Zero-sized items have no bytes to read; skipping the loop also
        # avoids a division by zero in the chunk computation.
        if dt.itemsize > 0:
            max_read_count = BUFFER_SIZE // min(BUFFER_SIZE, dt.itemsize)
            for i in range(0, count, max_read_count):
                cnt = min(max_read_count, count - i)
                read_size = int(cnt * dt.itemsize)
                data = format._read_bytes(f, read_size, "array data")
                array[i:i + cnt] = np.frombuffer(data, dtype=dt, count=cnt)
    if is_fortran:
        # Column-major bytes: reshape with reversed axes, then transpose,
        # so elements land where the header says they belong.
        array.shape = shp[::-1]
        array = array.transpose()
    else:
        array.shape = shp
    if prn:
        print(dedent(frmt).format(fp, shp, (not is_fortran), dt, mag))
    return array
def load(file_name, axis, mpi_comm=MPI.COMM_WORLD):
    """
    Load a numpy array across parallel jobs in the MPI communicator.
    The array is sliced along the chosen dimension, with minimal bandwidth.

    The root process parses the npy header and broadcasts it; every process
    then reads its own slice of the data section through an MPI file view.

    Parameters
    ----------
    file_name : str
        The numpy array file to load.
    axis : int
        The axis on which to distribute the array.
    mpi_comm : mpi4py.MPI.Comm, optional
        The MPI communicator used to distribute, by default MPI.COMM_WORLD.

    Returns
    -------
    (numpy.ndarray, tuple(int))
        The distributed array, and the size of the full array.

    Raises
    ------
    ValueError
        If the numpy version used to save the file is not supported.
    NotImplementedError
        If the array is saved in Fortran order.
    """
    header = None
    if is_root_process(mpi_comm):
        # NOTE(review): an exception raised in this branch occurs before the
        # broadcast below, so the other ranks would presumably keep waiting
        # in ``bcast`` - confirm how callers expect this to be handled.
        with open(file_name, 'rb') as fp:
            version, _ = npformat.read_magic(fp)
            if version == 1:
                header = npformat.read_array_header_1_0(fp)
            elif version == 2:
                header = npformat.read_array_header_2_0(fp)
            else:
                raise ValueError(
                    "Invalid numpy format version: {}".format(version))
            # Append the position where the array data starts.
            header = (*header, fp.tell())
    header = mpi_comm.bcast(header, root=0)
    full_shape, fortran, dtype, header_offset = header
    if fortran:
        raise NotImplementedError(
            "Fortran-ordered (column-major) arrays are not supported")

    ndims = len(full_shape)
    axis = utils.positive_index(axis, ndims)

    # Start index and length of this process's share along ``axis``.
    i_start, bin_size = distribute_mpi(full_shape[axis], mpi_comm)

    l_shape = list(full_shape)
    l_shape[axis] = bin_size
    l_array = np.empty(l_shape, dtype=dtype)

    slice_type = create_slice_view(axis, bin_size,
                                   shape=full_shape, dtype=dtype)
    slice_type.Commit()

    single_slice_extent = slice_type.extent
    if bin_size != 0:
        # Floor division keeps the extent an integer: the displacement
        # passed to ``Set_view`` is a byte offset, and true division would
        # turn it into a float.
        single_slice_extent //= bin_size
    displacement = header_offset + i_start * single_slice_extent
    base_type = to_mpi_datatype(l_array.dtype)

    fh = MPI.File.Open(mpi_comm, file_name, MPI.MODE_RDONLY)
    fh.Set_view(displacement, filetype=slice_type)

    fh.Read_all([l_array, l_array.size, base_type])
    fh.Close()
    slice_type.Free()

    return l_array, full_shape
def _init(self):
    """Open ``self._path`` for binary reading and record the array dtype.

    The open handle is kept in ``self._file`` (it is not closed here) and
    the dtype parsed from the npy header is stored in ``self._dtype``.
    """
    # TODO: Can speed this up with PAG's regex header parser
    handle = self._path.open('rb')
    self._file = handle
    found_version = npformat.read_magic(handle)
    header = npformat._read_array_header(handle, found_version)
    # The header is (shape, fortran_order, dtype); only the dtype is kept.
    self._dtype = header[2]
def open_memmap(filename, mode='r+', dtype=None, shape=None, fortran_order=False, version=(1,0), offset=0): """ Open a .npy file as a memory-mapped array, with offset argument. This may be used to read an existing file or create a new one. :param str filename: The name of the file on disk. This may not be a file-like object. :param str mode: The mode to open the file with. In addition to the standard file modes, 'c' is also accepted to mean "copy on write". See `numpy.memmap` for the available mode strings. :param dtype dtype: The data type of the array if we are creating a new file in "write" mode. :param tuple shape: The shape of the array if we are creating a new file in "write" mode. Shape of (contiguous) slice if opening an existing file. :param bool fortran_order: Whether the array should be Fortran-contiguous (True) or C-contiguous (False) if we are creating a new file in "write" mode. :param tuple version: If the mode is a "write" mode, then this is the version (major, minor) of the file format used to create the file. :param int offset: Number of elements to skip along the first dimension. :return numpy.memmap: The memory-mapped array. Raises: * :exc:`ValueError` if the data or the mode is invalid * :exc:`IOError` if the file is not found or cannot be opened correctly. .. seealso:: :func:`numpy.memmap` """ if not isinstance(filename, basestring): raise ValueError("Filename must be a string. Memmap cannot use" \ " existing file handles.") if 'w' in mode: assert offset == 0, "Cannot specify offset when creating memmap" # We are creating the file, not reading it. # Check if we ought to create the file. if version != (1, 0): msg = "only support version (1,0) of file format, not %r" raise ValueError(msg % (version,)) # Ensure that the given dtype is an authentic dtype object rather than # just something that can be interpreted as a dtype object. dtype = np.dtype(dtype) if dtype.hasobject: msg = "Array can't be memory-mapped: Python objects in dtype." 
raise ValueError(msg) d = dict( descr=dtype_to_descr(dtype), fortran_order=fortran_order, shape=shape, ) # If we got here, then it should be safe to create the file. fp = open(filename, mode+'b') try: fp.write(magic(*version)) write_array_header_1_0(fp, d) offset = fp.tell() finally: fp.close() else: # Read the header of the file first. fp = open(filename, 'rb') try: version = read_magic(fp) if version != (1, 0): msg = "only support version (1,0) of file format, not %r" raise ValueError(msg % (version,)) fullshape, fortran_order, dtype = read_array_header_1_0(fp) if shape: length = np.atleast_1d(shape) msg = "Specify shape along first dimension only" assert length.ndim == 1, msg else: length = fullshape[0] - offset shape = (length,) + fullshape[1:] if dtype.hasobject: msg = "Array can't be memory-mapped: Python objects in dtype." raise ValueError(msg) offset_items = offset * np.prod(fullshape[1:], dtype=int) offset_bytes = fp.tell() + offset_items * dtype.itemsize finally: fp.close() if fortran_order: order = 'F' else: order = 'C' # We need to change a write-only mode to a read-write mode since we've # already written data to the file. if mode == 'w+': mode = 'r+' marray = np.memmap(filename, dtype=dtype, shape=shape, order=order, mode=mode, offset=offset_bytes) return marray
def load_hstack_npy_files(path, cols_per_block=None):
    """ Builds a ds-array from the .npy files of a folder, placed side by
    side in alphanumeric order of their names, like (A|B|C).

    The folder must contain at least one .npy file. All arrays must share
    the same dtype, memory order and number of rows; otherwise an
    AssertionError is raised.

    Every block of the result spans all the rows and cols_per_block
    columns (the last block may be narrower).

    Parameters
    ----------
    path : string
        Folder path.
    cols_per_block : int
        Number of columns of the blocks for the output ds-array. If None,
        the number of columns of the first array is used.

    Returns
    -------
    x : ds-array
        A distributed representation (ds-array) of the stacked arrays.
    """
    def _header(npy_path):
        # Parse only the header: (shape, fortran_order, dtype).
        with open(npy_path, "rb") as handle:
            found_version = format.read_magic(handle)
            format._check_version(found_version)
            return format._read_array_header(handle, found_version)

    # Full paths of the .npy files in the folder, sorted by name.
    candidates = (os.path.join(path, entry)
                  for entry in sorted(os.listdir(path)))
    files = [
        candidate for candidate in candidates
        if os.path.isfile(candidate) and candidate[-4:] == '.npy'
    ]

    # The first file is the reference for rows, order and dtype.
    shape0, order0, dtype0 = _header(files[0])
    rows = shape0[0]
    if cols_per_block is None:
        cols_per_block = shape0[1]

    # Validate the remaining files and collect every file's column count.
    files_cols = [shape0[1]]
    for filename in files[1:]:
        shape, order, dtype = _header(filename)
        if shape[0] != shape0[0] or order0 != order or dtype0 != dtype:
            raise AssertionError()
        files_cols.append(shape[1])

    # Walk over the files, cutting them into blocks of cols_per_block
    # columns; a block may start in the middle of one file and end in the
    # middle of a later one.
    blocks = []
    n_files = len(files)
    idx = 0
    first_col = 0
    while idx < n_files:
        members = [files[idx]]
        width = files_cols[idx] - first_col
        # Keep adding files until the block is full or the files run out.
        while width < cols_per_block and idx + 1 != n_files:
            idx += 1
            members.append(files[idx])
            width += files_cols[idx]

        # Columns of the last member that do not fit stay for the next
        # block (the last block may be smaller).
        overflow = max(0, (width - cols_per_block))
        last_col = files_cols[idx] - overflow

        blocks.append(_load_hstack_npy_block(members, first_col, last_col))

        if last_col == files_cols[idx]:
            # The current file is used up: continue with the next one.
            idx += 1
            first_col = 0
        else:
            # Resume inside the same file.
            first_col = last_col

    return Array(blocks=[blocks], top_left_shape=(rows, cols_per_block),
                 reg_shape=(rows, cols_per_block),
                 shape=(rows, sum(files_cols)), sparse=False)