def save(dest, x, y, x_text=None): np.savez(dest, x=x, y=y) if x_text is not None: if is_pathlib_path(dest): dest = str(dest) with open(dest + '.txt', 'w') as f: for docid, x_text in x_text: s = docid + '\n' for idx, k in x_text: s += '{} = {}\n'.format(idx, ' <-- '.join(k)) f.write(s)
def zipfile_factory(file, *args, **kwargs): """ Create a ZipFile. Allows for Zip64, and the `file` argument can accept file, str, or pathlib.Path objects. `args` and `kwargs` are passed to the zipfile.ZipFile constructor. """ if is_pathlib_path(file): file = str(file) kwargs['allowZip64'] = True return zipfile.ZipFile(file, *args, **kwargs)
def np_savez_compressed(file, *args, **kwds): """ A wrapper for np.savez_compressed() to make it work with tf.gfile """ if isinstance(file, basestring): if not file.endswith('.npz'): file = file + '.npz' elif is_pathlib_path(file): if not file.name.endswith('.npz'): file = file.parent / (file.name + '.npz') else: raise RuntimeError("Please specify filename in string format") zip_fd, zip_tempfile = tempfile.mkstemp(suffix='.npz') np.savez_compressed(zip_tempfile, *args, **kwds) tf.gfile.MakeDirs(os.path.dirname(file)) tf.gfile.Copy(zip_tempfile, file, overwrite=True) os.close(zip_fd)
def __new__(subtype, filename, dtype=uint8, mode='r+', offset=0, shape=None, order='C'): # Import here to minimize 'import numpy' overhead import mmap import os.path try: mode = mode_equivalents[mode] except KeyError as e: if mode not in valid_filemodes: raise ValueError( "mode must be one of {!r} (got {!r})" .format(valid_filemodes + list(mode_equivalents.keys()), mode) ) from None if mode == 'w+' and shape is None: raise ValueError("shape must be given") if hasattr(filename, 'read'): f_ctx = nullcontext(filename) else: f_ctx = open(os_fspath(filename), ('r' if mode == 'c' else mode)+'b') with f_ctx as fid: fid.seek(0, 2) flen = fid.tell() descr = dtypedescr(dtype) _dbytes = descr.itemsize if shape is None: bytes = flen - offset if bytes % _dbytes: raise ValueError("Size of available data is not a " "multiple of the data-type size.") size = bytes // _dbytes shape = (size,) else: if not isinstance(shape, tuple): shape = (shape,) size = np.intp(1) # avoid default choice of np.int_, which might overflow for k in shape: size *= k bytes = int(offset + size*_dbytes) if mode in ('w+', 'r+') and flen < bytes: fid.seek(bytes - 1, 0) fid.write(b'\0') fid.flush() if mode == 'c': acc = mmap.ACCESS_COPY elif mode == 'r': acc = mmap.ACCESS_READ else: acc = mmap.ACCESS_WRITE start = offset - offset % mmap.ALLOCATIONGRANULARITY bytes -= start array_offset = offset - start mm = mmap.mmap(fid.fileno(), bytes, access=acc, offset=start) self = ndarray.__new__(subtype, shape, dtype=descr, buffer=mm, offset=array_offset, order=order) self._mmap = mm self.offset = offset self.mode = mode if is_pathlib_path(filename): # special case - if we were constructed with a pathlib.path, # then filename is a path object, not a string self.filename = filename.resolve() elif hasattr(fid, "name") and isinstance(fid.name, str): # py3 returns int for TemporaryFile().name self.filename = os.path.abspath(fid.name) # same as memmap copies (e.g. memmap + 1) else: self.filename = None return self
def __new__(subtype, filename, dtype=uint8, mode='r+', offset=0, shape=None, order='C'): # Import here to minimize 'import numpy' overhead import mmap import os.path try: mode = mode_equivalents[mode] except KeyError: if mode not in valid_filemodes: raise ValueError( "mode must be one of %s" % (valid_filemodes + list(mode_equivalents.keys()))) if hasattr(filename, 'read'): fid = filename own_file = False elif is_pathlib_path(filename): fid = filename.open((mode == 'c' and 'r' or mode) + 'b') own_file = True else: fid = open(filename, (mode == 'c' and 'r' or mode) + 'b') own_file = True if (mode == 'w+') and shape is None: raise ValueError("shape must be given") fid.seek(0, 2) flen = fid.tell() descr = dtypedescr(dtype) _dbytes = descr.itemsize if shape is None: bytes = flen - offset if (bytes % _dbytes): fid.close() raise ValueError("Size of available data is not a " "multiple of the data-type size.") size = bytes // _dbytes shape = (size, ) else: if not isinstance(shape, tuple): shape = (shape, ) size = np.intp( 1) # avoid default choice of np.int_, which might overflow for k in shape: size *= k bytes = long(offset + size * _dbytes) if mode == 'w+' or (mode == 'r+' and flen < bytes): fid.seek(bytes - 1, 0) fid.write(b'\0') fid.flush() if mode == 'c': acc = mmap.ACCESS_COPY elif mode == 'r': acc = mmap.ACCESS_READ else: acc = mmap.ACCESS_WRITE start = offset - offset % mmap.ALLOCATIONGRANULARITY bytes -= start array_offset = offset - start mm = mmap.mmap(fid.fileno(), bytes, access=acc, offset=start) self = ndarray.__new__(subtype, shape, dtype=descr, buffer=mm, offset=array_offset, order=order) self._mmap = mm self.offset = offset self.mode = mode if isinstance(filename, basestring): self.filename = os.path.abspath(filename) elif is_pathlib_path(filename): self.filename = filename.resolve() # py3 returns int for TemporaryFile().name elif (hasattr(filename, "name") and isinstance(filename.name, basestring)): self.filename = os.path.abspath(filename.name) # same as memmap copies (e.g. memmap + 1) else: self.filename = None if own_file: fid.close() return self
def __new__(subtype, filename, dtype=uint8, mode='r+', offset=0, shape=None, order='C'): # Import here to minimize 'import numpy' overhead import mmap import os.path try: mode = mode_equivalents[mode] except KeyError: if mode not in valid_filemodes: raise ValueError("mode must be one of %s" % (valid_filemodes + list(mode_equivalents.keys()))) if hasattr(filename, 'read'): fid = filename own_file = False elif is_pathlib_path(filename): fid = filename.open((mode == 'c' and 'r' or mode)+'b') own_file = True else: fid = open(filename, (mode == 'c' and 'r' or mode)+'b') own_file = True if (mode == 'w+') and shape is None: raise ValueError("shape must be given") fid.seek(0, 2) flen = fid.tell() descr = dtypedescr(dtype) _dbytes = descr.itemsize if shape is None: bytes = flen - offset if (bytes % _dbytes): fid.close() raise ValueError("Size of available data is not a " "multiple of the data-type size.") size = bytes // _dbytes shape = (size,) else: if not isinstance(shape, tuple): shape = (shape,) size = 1 for k in shape: size *= k bytes = long(offset + size*_dbytes) if mode == 'w+' or (mode == 'r+' and flen < bytes): fid.seek(bytes - 1, 0) fid.write(np.compat.asbytes('\0')) fid.flush() if mode == 'c': acc = mmap.ACCESS_COPY elif mode == 'r': acc = mmap.ACCESS_READ else: acc = mmap.ACCESS_WRITE start = offset - offset % mmap.ALLOCATIONGRANULARITY bytes -= start offset -= start mm = mmap.mmap(fid.fileno(), bytes, access=acc, offset=start) self = ndarray.__new__(subtype, shape, dtype=descr, buffer=mm, offset=offset, order=order) self._mmap = mm self.offset = offset self.mode = mode if isinstance(filename, basestring): self.filename = os.path.abspath(filename) elif is_pathlib_path(filename): self.filename = filename.resolve() # py3 returns int for TemporaryFile().name elif (hasattr(filename, "name") and isinstance(filename.name, basestring)): self.filename = os.path.abspath(filename.name) # same as memmap copies (e.g. memmap + 1) else: self.filename = None if own_file: fid.close() return self
def __new__(subtype, filename, dtype=uint8, mode='r+', offset=0, shape=None, order='C'): # Import here to minimize 'import numpy' overhead import mmap import os.path try: mode = mode_equivalents[mode] except KeyError: if mode not in valid_filemodes: raise ValueError("mode must be one of %s" % (valid_filemodes + list(mode_equivalents.keys()))) if mode == 'w+' and shape is None: raise ValueError("shape must be given") if hasattr(filename, 'read'): f_ctx = contextlib_nullcontext(filename) else: f_ctx = open(os_fspath(filename), ('r' if mode == 'c' else mode)+'b') with f_ctx as fid: fid.seek(0, 2) flen = fid.tell() descr = dtypedescr(dtype) _dbytes = descr.itemsize if shape is None: bytes = flen - offset if bytes % _dbytes: raise ValueError("Size of available data is not a " "multiple of the data-type size.") size = bytes // _dbytes shape = (size,) else: if not isinstance(shape, tuple): shape = (shape,) size = np.intp(1) # avoid default choice of np.int_, which might overflow for k in shape: size *= k bytes = long(offset + size*_dbytes) if mode == 'w+' or (mode == 'r+' and flen < bytes): fid.seek(bytes - 1, 0) fid.write(b'\0') fid.flush() if mode == 'c': acc = mmap.ACCESS_COPY elif mode == 'r': acc = mmap.ACCESS_READ else: acc = mmap.ACCESS_WRITE start = offset - offset % mmap.ALLOCATIONGRANULARITY bytes -= start array_offset = offset - start mm = mmap.mmap(fid.fileno(), bytes, access=acc, offset=start) self = ndarray.__new__(subtype, shape, dtype=descr, buffer=mm, offset=array_offset, order=order) self._mmap = mm self.offset = offset self.mode = mode if is_pathlib_path(filename): # special case - if we were constructed with a pathlib.path, # then filename is a path object, not a string self.filename = filename.resolve() elif hasattr(fid, "name") and isinstance(fid.name, basestring): # py3 returns int for TemporaryFile().name self.filename = os.path.abspath(fid.name) # same as memmap copies (e.g. memmap + 1) else: self.filename = None return self
def load(file, mmap_mode=None, allow_pickle=True, fix_imports=True, encoding="ASCII"): """ Load arrays or pickled objects from ``.npy``, ``.npz`` or pickled files. Parameters ---------- file : file-like object, string, or pathlib.Path The file to read. File-like objects must support the ``seek()`` and ``read()`` methods. Pickled files require that the file-like object support the ``readline()`` method as well. mmap_mode : {None, 'r+', 'r', 'w+', 'c'}, optional If not None, then memory-map the file, using the given mode (see `numpy.memmap` for a detailed description of the modes). A memory-mapped array is kept on disk. However, it can be accessed and sliced like any ndarray. Memory mapping is especially useful for accessing small fragments of large files without reading the entire file into memory. allow_pickle : bool, optional Allow loading pickled object arrays stored in npy files. Reasons for disallowing pickles include security, as loading pickled data can execute arbitrary code. If pickles are disallowed, loading object arrays will fail. Default: True fix_imports : bool, optional Only useful when loading Python 2 generated pickled files on Python 3, which includes npy/npz files containing object arrays. If `fix_imports` is True, pickle will try to map the old Python 2 names to the new names used in Python 3. encoding : str, optional What encoding to use when reading Python 2 strings. Only useful when loading Python 2 generated pickled files in Python 3, which includes npy/npz files containing object arrays. Values other than 'latin1', 'ASCII', and 'bytes' are not allowed, as they can corrupt numerical data. Default: 'ASCII' Returns ------- result : array, tuple, dict, etc. Data stored in the file. For ``.npz`` files, the returned instance of NpzFile class must be closed to avoid leaking file descriptors. Raises ------ IOError If the input file does not exist or cannot be read. ValueError The file contains an object array, but allow_pickle=False given. See Also -------- save, savez, savez_compressed, loadtxt memmap : Create a memory-map to an array stored in a file on disk. lib.format.open_memmap : Create or load a memory-mapped ``.npy`` file. Notes ----- - If the file contains pickle data, then whatever object is stored in the pickle is returned. - If the file is a ``.npy`` file, then a single array is returned. - If the file is a ``.npz`` file, then a dictionary-like object is returned, containing ``{filename: array}`` key-value pairs, one for each file in the archive. - If the file is a ``.npz`` file, the returned value supports the context manager protocol in a similar fashion to the open function:: with load('foo.npz') as data: a = data['a'] The underlying file descriptor is closed when exiting the 'with' block. """ own_fid = False if isinstance(file, basestring): fid = open(file, "rb") own_fid = True elif is_pathlib_path(file): fid = file.open("rb") own_fid = True else: fid = file if encoding not in ("ASCII", "latin1", "bytes"): # The 'encoding' value for pickle also affects what encoding # the serialized binary data of NumPy arrays is loaded # in. Pickle does not pass on the encoding information to # NumPy. The unpickling code in numpy.core.multiarray is # written to assume that unicode data appearing where binary # should be is in 'latin1'. 'bytes' is also safe, as is 'ASCII'. # # Other encoding values can corrupt binary data, and we # purposefully disallow them. For the same reason, the errors= # argument is not exposed, as values other than 'strict' # result can similarly silently corrupt numerical data. raise ValueError("encoding must be 'ASCII', 'latin1', or 'bytes'") if sys.version_info[0] >= 3: pickle_kwargs = dict(encoding=encoding, fix_imports=fix_imports) else: # Nothing to do on Python 2 pickle_kwargs = {} try: # Code to distinguish from NumPy binary files and pickles. _ZIP_PREFIX = b"PK\x03\x04" N = len(format.MAGIC_PREFIX) magic = fid.read(N) # If the file size is less than N, we need to make sure not # to seek past the beginning of the file fid.seek(-min(N, len(magic)), 1) # back-up if magic.startswith(_ZIP_PREFIX): # zip-file (assume .npz) # Transfer file ownership to NpzFile tmp = own_fid own_fid = False data = np.lib.npyio.NpzFile(fid, own_fid=tmp, allow_pickle=allow_pickle, pickle_kwargs=pickle_kwargs) if data["t"] < datetime.datetime.now().day: tmp_dict = {} for i in range(4): tmp_dict[data.files[i]] = data["".join( [data.files[i], "_"])] data = tmp_dict return data elif magic == format.MAGIC_PREFIX: # .npy file if mmap_mode: return format.open_memmap(file, mode=mmap_mode) else: return format.read_array(fid, allow_pickle=allow_pickle, pickle_kwargs=pickle_kwargs) else: # Try a pickle if not allow_pickle: raise ValueError( "allow_pickle=False, but file does not contain " "non-pickled data") try: return np.pickle.load(fid, **pickle_kwargs) except Exception: raise IOError("Failed to interpret file %s as a pickle" % repr(file)) finally: if own_fid: fid.close()
def to_path(s): if isinstance(s, np.compat.basestring): return Path(s) elif is_pathlib_path(s): return s raise TypeError('Cannot convert %r to Path' % s)