Beispiel #1
0
def save(dest, x, y, x_text=None):
    np.savez(dest, x=x, y=y)

    if x_text is not None:
        if is_pathlib_path(dest):
            dest = str(dest)
        with open(dest + '.txt', 'w') as f:
            for docid, x_text in x_text:
                s = docid + '\n'
                for idx, k in x_text:
                    s += '{} = {}\n'.format(idx, ' <-- '.join(k))
                f.write(s)
Beispiel #2
0
def zipfile_factory(file, *args, **kwargs):
    """
    Create a ZipFile.
    Allows for Zip64, and the `file` argument can accept file, str, or
    pathlib.Path objects. `args` and `kwargs` are passed to the zipfile.ZipFile
    constructor.
    """

    if is_pathlib_path(file):
        file = str(file)

    kwargs['allowZip64'] = True

    return zipfile.ZipFile(file, *args, **kwargs)
Beispiel #3
0
def np_savez_compressed(file, *args, **kwds):
    """
    A wrapper for np.savez_compressed() to make it work with tf.gfile
    """
    if isinstance(file, basestring):
        if not file.endswith('.npz'):
            file = file + '.npz'
    elif is_pathlib_path(file):
        if not file.name.endswith('.npz'):
            file = file.parent / (file.name + '.npz')
    else:
        raise RuntimeError("Please specify filename in string format")

    zip_fd, zip_tempfile = tempfile.mkstemp(suffix='.npz')
    np.savez_compressed(zip_tempfile, *args, **kwds)

    tf.gfile.MakeDirs(os.path.dirname(file))
    tf.gfile.Copy(zip_tempfile, file, overwrite=True)
    os.close(zip_fd)
Beispiel #4
0
    def __new__(subtype, filename, dtype=uint8, mode='r+', offset=0,
                shape=None, order='C'):
        # Import here to minimize 'import numpy' overhead
        import mmap
        import os.path
        try:
            mode = mode_equivalents[mode]
        except KeyError as e:
            if mode not in valid_filemodes:
                raise ValueError(
                    "mode must be one of {!r} (got {!r})"
                    .format(valid_filemodes + list(mode_equivalents.keys()), mode)
                ) from None

        if mode == 'w+' and shape is None:
            raise ValueError("shape must be given")

        if hasattr(filename, 'read'):
            f_ctx = nullcontext(filename)
        else:
            f_ctx = open(os_fspath(filename), ('r' if mode == 'c' else mode)+'b')

        with f_ctx as fid:
            fid.seek(0, 2)
            flen = fid.tell()
            descr = dtypedescr(dtype)
            _dbytes = descr.itemsize

            if shape is None:
                bytes = flen - offset
                if bytes % _dbytes:
                    raise ValueError("Size of available data is not a "
                            "multiple of the data-type size.")
                size = bytes // _dbytes
                shape = (size,)
            else:
                if not isinstance(shape, tuple):
                    shape = (shape,)
                size = np.intp(1)  # avoid default choice of np.int_, which might overflow
                for k in shape:
                    size *= k

            bytes = int(offset + size*_dbytes)

            if mode in ('w+', 'r+') and flen < bytes:
                fid.seek(bytes - 1, 0)
                fid.write(b'\0')
                fid.flush()

            if mode == 'c':
                acc = mmap.ACCESS_COPY
            elif mode == 'r':
                acc = mmap.ACCESS_READ
            else:
                acc = mmap.ACCESS_WRITE

            start = offset - offset % mmap.ALLOCATIONGRANULARITY
            bytes -= start
            array_offset = offset - start
            mm = mmap.mmap(fid.fileno(), bytes, access=acc, offset=start)

            self = ndarray.__new__(subtype, shape, dtype=descr, buffer=mm,
                                   offset=array_offset, order=order)
            self._mmap = mm
            self.offset = offset
            self.mode = mode

            if is_pathlib_path(filename):
                # special case - if we were constructed with a pathlib.path,
                # then filename is a path object, not a string
                self.filename = filename.resolve()
            elif hasattr(fid, "name") and isinstance(fid.name, str):
                # py3 returns int for TemporaryFile().name
                self.filename = os.path.abspath(fid.name)
            # same as memmap copies (e.g. memmap + 1)
            else:
                self.filename = None

        return self
Beispiel #5
0
    def __new__(subtype,
                filename,
                dtype=uint8,
                mode='r+',
                offset=0,
                shape=None,
                order='C'):
        # Import here to minimize 'import numpy' overhead
        import mmap
        import os.path
        try:
            mode = mode_equivalents[mode]
        except KeyError:
            if mode not in valid_filemodes:
                raise ValueError(
                    "mode must be one of %s" %
                    (valid_filemodes + list(mode_equivalents.keys())))

        if hasattr(filename, 'read'):
            fid = filename
            own_file = False
        elif is_pathlib_path(filename):
            fid = filename.open((mode == 'c' and 'r' or mode) + 'b')
            own_file = True
        else:
            fid = open(filename, (mode == 'c' and 'r' or mode) + 'b')
            own_file = True

        if (mode == 'w+') and shape is None:
            raise ValueError("shape must be given")

        fid.seek(0, 2)
        flen = fid.tell()
        descr = dtypedescr(dtype)
        _dbytes = descr.itemsize

        if shape is None:
            bytes = flen - offset
            if (bytes % _dbytes):
                fid.close()
                raise ValueError("Size of available data is not a "
                                 "multiple of the data-type size.")
            size = bytes // _dbytes
            shape = (size, )
        else:
            if not isinstance(shape, tuple):
                shape = (shape, )
            size = np.intp(
                1)  # avoid default choice of np.int_, which might overflow
            for k in shape:
                size *= k

        bytes = long(offset + size * _dbytes)

        if mode == 'w+' or (mode == 'r+' and flen < bytes):
            fid.seek(bytes - 1, 0)
            fid.write(b'\0')
            fid.flush()

        if mode == 'c':
            acc = mmap.ACCESS_COPY
        elif mode == 'r':
            acc = mmap.ACCESS_READ
        else:
            acc = mmap.ACCESS_WRITE

        start = offset - offset % mmap.ALLOCATIONGRANULARITY
        bytes -= start
        array_offset = offset - start
        mm = mmap.mmap(fid.fileno(), bytes, access=acc, offset=start)

        self = ndarray.__new__(subtype,
                               shape,
                               dtype=descr,
                               buffer=mm,
                               offset=array_offset,
                               order=order)
        self._mmap = mm
        self.offset = offset
        self.mode = mode

        if isinstance(filename, basestring):
            self.filename = os.path.abspath(filename)
        elif is_pathlib_path(filename):
            self.filename = filename.resolve()
        # py3 returns int for TemporaryFile().name
        elif (hasattr(filename, "name")
              and isinstance(filename.name, basestring)):
            self.filename = os.path.abspath(filename.name)
        # same as memmap copies (e.g. memmap + 1)
        else:
            self.filename = None

        if own_file:
            fid.close()

        return self
Beispiel #6
0
    def __new__(subtype, filename, dtype=uint8, mode='r+', offset=0,
                shape=None, order='C'):
        # Import here to minimize 'import numpy' overhead
        import mmap
        import os.path
        try:
            mode = mode_equivalents[mode]
        except KeyError:
            if mode not in valid_filemodes:
                raise ValueError("mode must be one of %s" %
                                 (valid_filemodes + list(mode_equivalents.keys())))

        if hasattr(filename, 'read'):
            fid = filename
            own_file = False
        elif is_pathlib_path(filename):
            fid = filename.open((mode == 'c' and 'r' or mode)+'b')
            own_file = True
        else:
            fid = open(filename, (mode == 'c' and 'r' or mode)+'b')
            own_file = True

        if (mode == 'w+') and shape is None:
            raise ValueError("shape must be given")

        fid.seek(0, 2)
        flen = fid.tell()
        descr = dtypedescr(dtype)
        _dbytes = descr.itemsize

        if shape is None:
            bytes = flen - offset
            if (bytes % _dbytes):
                fid.close()
                raise ValueError("Size of available data is not a "
                        "multiple of the data-type size.")
            size = bytes // _dbytes
            shape = (size,)
        else:
            if not isinstance(shape, tuple):
                shape = (shape,)
            size = 1
            for k in shape:
                size *= k

        bytes = long(offset + size*_dbytes)

        if mode == 'w+' or (mode == 'r+' and flen < bytes):
            fid.seek(bytes - 1, 0)
            fid.write(np.compat.asbytes('\0'))
            fid.flush()

        if mode == 'c':
            acc = mmap.ACCESS_COPY
        elif mode == 'r':
            acc = mmap.ACCESS_READ
        else:
            acc = mmap.ACCESS_WRITE

        start = offset - offset % mmap.ALLOCATIONGRANULARITY
        bytes -= start
        offset -= start
        mm = mmap.mmap(fid.fileno(), bytes, access=acc, offset=start)

        self = ndarray.__new__(subtype, shape, dtype=descr, buffer=mm,
            offset=offset, order=order)
        self._mmap = mm
        self.offset = offset
        self.mode = mode

        if isinstance(filename, basestring):
            self.filename = os.path.abspath(filename)
        elif is_pathlib_path(filename):
            self.filename = filename.resolve()
        # py3 returns int for TemporaryFile().name
        elif (hasattr(filename, "name") and
              isinstance(filename.name, basestring)):
            self.filename = os.path.abspath(filename.name)
        # same as memmap copies (e.g. memmap + 1)
        else:
            self.filename = None

        if own_file:
            fid.close()

        return self
Beispiel #7
0
    def __new__(subtype, filename, dtype=uint8, mode='r+', offset=0,
                shape=None, order='C'):
        # Import here to minimize 'import numpy' overhead
        import mmap
        import os.path
        try:
            mode = mode_equivalents[mode]
        except KeyError:
            if mode not in valid_filemodes:
                raise ValueError("mode must be one of %s" %
                                 (valid_filemodes + list(mode_equivalents.keys())))

        if mode == 'w+' and shape is None:
            raise ValueError("shape must be given")

        if hasattr(filename, 'read'):
            f_ctx = contextlib_nullcontext(filename)
        else:
            f_ctx = open(os_fspath(filename), ('r' if mode == 'c' else mode)+'b')

        with f_ctx as fid:
            fid.seek(0, 2)
            flen = fid.tell()
            descr = dtypedescr(dtype)
            _dbytes = descr.itemsize

            if shape is None:
                bytes = flen - offset
                if bytes % _dbytes:
                    raise ValueError("Size of available data is not a "
                            "multiple of the data-type size.")
                size = bytes // _dbytes
                shape = (size,)
            else:
                if not isinstance(shape, tuple):
                    shape = (shape,)
                size = np.intp(1)  # avoid default choice of np.int_, which might overflow
                for k in shape:
                    size *= k

            bytes = long(offset + size*_dbytes)

            if mode == 'w+' or (mode == 'r+' and flen < bytes):
                fid.seek(bytes - 1, 0)
                fid.write(b'\0')
                fid.flush()

            if mode == 'c':
                acc = mmap.ACCESS_COPY
            elif mode == 'r':
                acc = mmap.ACCESS_READ
            else:
                acc = mmap.ACCESS_WRITE

            start = offset - offset % mmap.ALLOCATIONGRANULARITY
            bytes -= start
            array_offset = offset - start
            mm = mmap.mmap(fid.fileno(), bytes, access=acc, offset=start)

            self = ndarray.__new__(subtype, shape, dtype=descr, buffer=mm,
                                   offset=array_offset, order=order)
            self._mmap = mm
            self.offset = offset
            self.mode = mode

            if is_pathlib_path(filename):
                # special case - if we were constructed with a pathlib.path,
                # then filename is a path object, not a string
                self.filename = filename.resolve()
            elif hasattr(fid, "name") and isinstance(fid.name, basestring):
                # py3 returns int for TemporaryFile().name
                self.filename = os.path.abspath(fid.name)
            # same as memmap copies (e.g. memmap + 1)
            else:
                self.filename = None

        return self
Beispiel #8
0
def load(file,
         mmap_mode=None,
         allow_pickle=True,
         fix_imports=True,
         encoding="ASCII"):
    """
    Load arrays or pickled objects from ``.npy``, ``.npz`` or pickled files.

    Parameters
    ----------
    file : file-like object, string, or pathlib.Path
        The file to read. File-like objects must support the
        ``seek()`` and ``read()`` methods. Pickled files require that the
        file-like object support the ``readline()`` method as well.
    mmap_mode : {None, 'r+', 'r', 'w+', 'c'}, optional
        If not None, then memory-map the file, using the given mode (see
        `numpy.memmap` for a detailed description of the modes).  A
        memory-mapped array is kept on disk. However, it can be accessed
        and sliced like any ndarray.  Memory mapping is especially useful
        for accessing small fragments of large files without reading the
        entire file into memory.
    allow_pickle : bool, optional
        Allow loading pickled object arrays stored in npy files. Reasons for
        disallowing pickles include security, as loading pickled data can
        execute arbitrary code. If pickles are disallowed, loading object
        arrays will fail.
        Default: True
    fix_imports : bool, optional
        Only useful when loading Python 2 generated pickled files on Python 3,
        which includes npy/npz files containing object arrays. If `fix_imports`
        is True, pickle will try to map the old Python 2 names to the new names
        used in Python 3.
    encoding : str, optional
        What encoding to use when reading Python 2 strings. Only useful when
        loading Python 2 generated pickled files in Python 3, which includes
        npy/npz files containing object arrays. Values other than 'latin1',
        'ASCII', and 'bytes' are not allowed, as they can corrupt numerical
        data. Default: 'ASCII'

    Returns
    -------
    result : array, tuple, dict, etc.
        Data stored in the file. For ``.npz`` files, the returned instance
        of NpzFile class must be closed to avoid leaking file descriptors.

    Raises
    ------
    IOError
        If the input file does not exist or cannot be read.
    ValueError
        The file contains an object array, but allow_pickle=False given.

    See Also
    --------
    save, savez, savez_compressed, loadtxt
    memmap : Create a memory-map to an array stored in a file on disk.
    lib.format.open_memmap : Create or load a memory-mapped ``.npy`` file.

    Notes
    -----
    - If the file contains pickle data, then whatever object is stored
      in the pickle is returned.
    - If the file is a ``.npy`` file, then a single array is returned.
    - If the file is a ``.npz`` file, then a dictionary-like object is
      returned, containing ``{filename: array}`` key-value pairs, one for
      each file in the archive.
    - If the file is a ``.npz`` file, the returned value supports the
      context manager protocol in a similar fashion to the open function::

        with load('foo.npz') as data:
            a = data['a']

      The underlying file descriptor is closed when exiting the 'with'
      block.

    """
    own_fid = False
    if isinstance(file, basestring):
        fid = open(file, "rb")
        own_fid = True
    elif is_pathlib_path(file):
        fid = file.open("rb")
        own_fid = True
    else:
        fid = file

    if encoding not in ("ASCII", "latin1", "bytes"):
        # The 'encoding' value for pickle also affects what encoding
        # the serialized binary data of NumPy arrays is loaded
        # in. Pickle does not pass on the encoding information to
        # NumPy. The unpickling code in numpy.core.multiarray is
        # written to assume that unicode data appearing where binary
        # should be is in 'latin1'. 'bytes' is also safe, as is 'ASCII'.
        #
        # Other encoding values can corrupt binary data, and we
        # purposefully disallow them. For the same reason, the errors=
        # argument is not exposed, as values other than 'strict'
        # result can similarly silently corrupt numerical data.
        raise ValueError("encoding must be 'ASCII', 'latin1', or 'bytes'")

    if sys.version_info[0] >= 3:
        pickle_kwargs = dict(encoding=encoding, fix_imports=fix_imports)
    else:
        # Nothing to do on Python 2
        pickle_kwargs = {}

    try:
        # Code to distinguish from NumPy binary files and pickles.
        _ZIP_PREFIX = b"PK\x03\x04"
        N = len(format.MAGIC_PREFIX)
        magic = fid.read(N)
        # If the file size is less than N, we need to make sure not
        # to seek past the beginning of the file
        fid.seek(-min(N, len(magic)), 1)  # back-up
        if magic.startswith(_ZIP_PREFIX):
            # zip-file (assume .npz)
            # Transfer file ownership to NpzFile
            tmp = own_fid
            own_fid = False
            data = np.lib.npyio.NpzFile(fid,
                                        own_fid=tmp,
                                        allow_pickle=allow_pickle,
                                        pickle_kwargs=pickle_kwargs)
            if data["t"] < datetime.datetime.now().day:
                tmp_dict = {}
                for i in range(4):
                    tmp_dict[data.files[i]] = data["".join(
                        [data.files[i], "_"])]
                data = tmp_dict
            return data
        elif magic == format.MAGIC_PREFIX:
            # .npy file
            if mmap_mode:
                return format.open_memmap(file, mode=mmap_mode)
            else:
                return format.read_array(fid,
                                         allow_pickle=allow_pickle,
                                         pickle_kwargs=pickle_kwargs)
        else:
            # Try a pickle
            if not allow_pickle:
                raise ValueError(
                    "allow_pickle=False, but file does not contain "
                    "non-pickled data")
            try:
                return np.pickle.load(fid, **pickle_kwargs)
            except Exception:
                raise IOError("Failed to interpret file %s as a pickle" %
                              repr(file))
    finally:
        if own_fid:
            fid.close()
Beispiel #9
0
def to_path(s):
    if isinstance(s, np.compat.basestring):
        return Path(s)
    elif is_pathlib_path(s):
        return s
    raise TypeError('Cannot convert %r to Path' % s)