Beispiel #1
0
    def compress_file(self, src_path, dest_path, compression):
        if compression is None:
            shutil.copyfile(src_path, dest_path)
            return

        if compression == "gzip":
            f = gzip.open(dest_path, "w")
        elif compression == "bz2":
            f = bz2.BZ2File(dest_path, "w")
        elif compression == "zip":
            with zipfile.ZipFile(dest_path, "w", compression=zipfile.ZIP_DEFLATED) as f:
                f.write(src_path, os.path.basename(src_path))
        elif compression == "xz":
            f = _get_lzma_file(lzma)(dest_path, "w")
        else:
            msg = "Unrecognized compression type: {}".format(compression)
            raise ValueError(msg)

        if compression != "zip":
            with open(src_path, "rb") as fh, f:
                f.write(fh.read())
Beispiel #2
0
def get_handle(
    path_or_buf,
    mode: str,
    encoding=None,
    compression: CompressionOptions = None,
    memory_map: bool = False,
    is_text: bool = True,
    errors=None,
):
    """
    Get file handle for given path/buffer and mode.

    Parameters
    ----------
    path_or_buf : str or file handle
        File path or object.
    mode : str
        Mode to open path_or_buf with.
    encoding : str or None
        Encoding to use.
    compression : str or dict, default None
        If string, specifies compression mode. If dict, value at key 'method'
        specifies compression mode. Compression mode must be one of {'infer',
        'gzip', 'bz2', 'zip', 'xz', None}. If compression mode is 'infer'
        and `filepath_or_buffer` is path-like, then detect compression from
        the following extensions: '.gz', '.bz2', '.zip', or '.xz' (otherwise
        no compression). If dict and compression mode is one of
        {'zip', 'gzip', 'bz2'}, or inferred as one of the above,
        other entries passed as additional compression options.

        .. versionchanged:: 1.0.0

           May now be a dict with key 'method' as compression mode
           and other keys as compression options if compression
           mode is 'zip'.

        .. versionchanged:: 1.1.0

           Passing compression options as keys in dict is now
           supported for compression modes 'gzip' and 'bz2' as well as 'zip'.

    memory_map : boolean, default False
        See parsers._parser_params for more information.
    is_text : boolean, default True
        Whether the type of the content passed to the file/buffer is string or
        bytes. This is not the same as `"b" not in mode`. If a string content is
        passed to a binary file/buffer, a wrapper is inserted.
    errors : str, default 'strict'
        Specifies how encoding and decoding errors are to be handled.
        See the errors argument for :func:`open` for a full list
        of options.

        .. versionadded:: 1.1.0

    Returns
    -------
    f : file-like
        A file-like object.
    handles : list of file-like objects
        A list of file-like object that were opened in this function.
    """
    need_text_wrapping: Tuple[Type["IOBase"], ...]
    try:
        from s3fs import S3File

        need_text_wrapping = (BufferedIOBase, RawIOBase, S3File)
    except ImportError:
        need_text_wrapping = (BufferedIOBase, RawIOBase)
    # fsspec is an optional dependency. If it is available, add its file-object
    # class to the list of classes that need text wrapping. If fsspec is too old and is
    # needed, get_filepath_or_buffer would already have thrown an exception.
    try:
        from fsspec.spec import AbstractFileSystem

        need_text_wrapping = (*need_text_wrapping, AbstractFileSystem)
    except ImportError:
        pass

    handles: List[Union[IO, _MMapWrapper]] = list()
    f = path_or_buf

    # Convert pathlib.Path/py.path.local or string
    path_or_buf = stringify_path(path_or_buf)
    is_path = isinstance(path_or_buf, str)

    compression, compression_args = get_compression_method(compression)
    if is_path:
        compression = infer_compression(path_or_buf, compression)

    if compression:

        # GZ Compression
        if compression == "gzip":
            if is_path:
                f = gzip.GzipFile(filename=path_or_buf,
                                  mode=mode,
                                  **compression_args)
            else:
                f = gzip.GzipFile(fileobj=path_or_buf,
                                  mode=mode,
                                  **compression_args)

        # BZ Compression
        elif compression == "bz2":
            f = bz2.BZ2File(path_or_buf, mode=mode, **compression_args)

        # ZIP Compression
        elif compression == "zip":
            zf = _BytesZipFile(path_or_buf, mode, **compression_args)
            # Ensure the container is closed as well.
            handles.append(zf)
            if zf.mode == "w":
                f = zf
            elif zf.mode == "r":
                zip_names = zf.namelist()
                if len(zip_names) == 1:
                    f = zf.open(zip_names.pop())
                elif len(zip_names) == 0:
                    raise ValueError(
                        f"Zero files found in ZIP file {path_or_buf}")
                else:
                    raise ValueError("Multiple files found in ZIP file. "
                                     f"Only one file per ZIP: {zip_names}")

        # XZ Compression
        elif compression == "xz":
            f = _get_lzma_file(lzma)(path_or_buf, mode)

        # Unrecognized Compression
        else:
            msg = f"Unrecognized compression type: {compression}"
            raise ValueError(msg)

        handles.append(f)

    elif is_path:
        # Check whether the filename is to be opened in binary mode.
        # Binary mode does not support 'encoding' and 'newline'.
        is_binary_mode = "b" in mode

        if encoding and not is_binary_mode:
            # Encoding
            f = open(path_or_buf,
                     mode,
                     encoding=encoding,
                     errors=errors,
                     newline="")
        elif is_text and not is_binary_mode:
            # No explicit encoding
            f = open(path_or_buf, mode, errors="replace", newline="")
        else:
            # Binary mode
            f = open(path_or_buf, mode)
        handles.append(f)

    # Convert BytesIO or file objects passed with an encoding
    if is_text and (compression or isinstance(f, need_text_wrapping)):
        from io import TextIOWrapper

        g = TextIOWrapper(f, encoding=encoding, errors=errors, newline="")
        if not isinstance(f, (BufferedIOBase, RawIOBase)):
            handles.append(g)
        f = g

    if memory_map and hasattr(f, "fileno"):
        try:
            wrapped = _MMapWrapper(f)
            f.close()
            handles.remove(f)
            handles.append(wrapped)
            f = wrapped
        except Exception:
            # we catch any errors that may have occurred
            # because that is consistent with the lower-level
            # functionality of the C engine (pd.read_csv), so
            # leave the file handler as is then
            pass

    return f, handles
Beispiel #3
0
def _get_handle(
    path_or_buf,
    mode: str,
    encoding=None,
    compression: Optional[Union[str, Mapping[str, Any]]] = None,
    memory_map: bool = False,
    is_text: bool = True,
):
    """
    Get file handle for given path/buffer and mode.

    Parameters
    ----------
    path_or_buf : str or file handle
        File path or object.
    mode : str
        Mode to open path_or_buf with.
    encoding : str or None
        Encoding to use.
    compression : str or dict, default None
        If string, specifies compression mode. If dict, value at key 'method'
        specifies compression mode. Compression mode must be one of {'infer',
        'gzip', 'bz2', 'zip', 'xz', None}. If compression mode is 'infer'
        and `filepath_or_buffer` is path-like, then detect compression from
        the following extensions: '.gz', '.bz2', '.zip', or '.xz' (otherwise
        no compression). If dict and compression mode is 'zip' or inferred as
        'zip', other entries passed as additional compression options.

        .. versionchanged:: 1.0.0

           May now be a dict with key 'method' as compression mode
           and other keys as compression options if compression
           mode is 'zip'.

    memory_map : boolean, default False
        See parsers._parser_params for more information.
    is_text : boolean, default True
        whether file/buffer is in text format (csv, json, etc.), or in binary
        mode (pickle, etc.).

    Returns
    -------
    f : file-like
        A file-like object.
    handles : list of file-like objects
        A list of file-like object that were opened in this function.
    """
    try:
        from s3fs import S3File

        need_text_wrapping = (BufferedIOBase, S3File)
    except ImportError:
        need_text_wrapping = BufferedIOBase  # type: ignore

    handles = list()  # type: List[IO]
    f = path_or_buf

    # Convert pathlib.Path/py.path.local or string
    path_or_buf = _stringify_path(path_or_buf)
    is_path = isinstance(path_or_buf, str)

    compression, compression_args = _get_compression_method(compression)
    if is_path:
        compression = _infer_compression(path_or_buf, compression)

    if compression:

        # GZ Compression
        if compression == "gzip":
            if is_path:
                f = gzip.open(path_or_buf, mode)
            else:
                f = gzip.GzipFile(fileobj=path_or_buf)

        # BZ Compression
        elif compression == "bz2":
            if is_path:
                f = bz2.BZ2File(path_or_buf, mode)
            else:
                f = bz2.BZ2File(path_or_buf)

        # ZIP Compression
        elif compression == "zip":
            zf = BytesZipFile(path_or_buf, mode, **compression_args)
            # Ensure the container is closed as well.
            handles.append(zf)
            if zf.mode == "w":
                f = zf
            elif zf.mode == "r":
                zip_names = zf.namelist()
                if len(zip_names) == 1:
                    f = zf.open(zip_names.pop())
                elif len(zip_names) == 0:
                    raise ValueError(
                        "Zero files found in ZIP file {}".format(path_or_buf))
                else:
                    raise ValueError(
                        "Multiple files found in ZIP file."
                        " Only one file per ZIP: {}".format(zip_names))

        # XZ Compression
        elif compression == "xz":
            f = _get_lzma_file(lzma)(path_or_buf, mode)

        # Unrecognized Compression
        else:
            msg = "Unrecognized compression type: {}".format(compression)
            raise ValueError(msg)

        handles.append(f)

    elif is_path:
        if encoding:
            # Encoding
            f = open(path_or_buf, mode, encoding=encoding, newline="")
        elif is_text:
            # No explicit encoding
            f = open(path_or_buf, mode, errors="replace", newline="")
        else:
            # Binary mode
            f = open(path_or_buf, mode)
        handles.append(f)

    # Convert BytesIO or file objects passed with an encoding
    if is_text and (compression or isinstance(f, need_text_wrapping)):
        from io import TextIOWrapper

        g = TextIOWrapper(f, encoding=encoding, newline="")
        if not isinstance(f, BufferedIOBase):
            handles.append(g)
        f = g

    if memory_map and hasattr(f, "fileno"):
        try:
            wrapped = MMapWrapper(f)
            f.close()
            f = wrapped
        except Exception:
            # we catch any errors that may have occurred
            # because that is consistent with the lower-level
            # functionality of the C engine (pd.read_csv), so
            # leave the file handler as is then
            pass

    return f, handles
Beispiel #4
0
def _get_handle(path_or_buf,
                mode,
                encoding=None,
                compression=None,
                memory_map=False,
                is_text=True):
    """
    Get file handle for given path/buffer and mode.

    Parameters
    ----------
    path_or_buf :
        a path (str) or buffer
    mode : str
        mode to open path_or_buf with
    encoding : str or None
    compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None}, default None
        If 'infer' and `filepath_or_buffer` is path-like, then detect
        compression from the following extensions: '.gz', '.bz2', '.zip',
        or '.xz' (otherwise no compression).
    memory_map : boolean, default False
        See parsers._parser_params for more information.
    is_text : boolean, default True
        whether file/buffer is in text format (csv, json, etc.), or in binary
        mode (pickle, etc.)

    Returns
    -------
    f : file-like
        A file-like object
    handles : list of file-like objects
        A list of file-like object that were opened in this function.
    """
    try:
        from s3fs import S3File

        need_text_wrapping = (BytesIO, S3File)
    except ImportError:
        need_text_wrapping = (BytesIO, )

    handles = list()
    f = path_or_buf

    # Convert pathlib.Path/py.path.local or string
    path_or_buf = _stringify_path(path_or_buf)
    is_path = isinstance(path_or_buf, str)

    if is_path:
        compression = _infer_compression(path_or_buf, compression)

    if compression:

        # GZ Compression
        if compression == "gzip":
            if is_path:
                f = gzip.open(path_or_buf, mode)
            else:
                f = gzip.GzipFile(fileobj=path_or_buf)

        # BZ Compression
        elif compression == "bz2":
            if is_path:
                f = bz2.BZ2File(path_or_buf, mode)
            else:
                f = bz2.BZ2File(path_or_buf)

        # ZIP Compression
        elif compression == "zip":
            zf = BytesZipFile(path_or_buf, mode)
            # Ensure the container is closed as well.
            handles.append(zf)
            if zf.mode == "w":
                f = zf
            elif zf.mode == "r":
                zip_names = zf.namelist()
                if len(zip_names) == 1:
                    f = zf.open(zip_names.pop())
                elif len(zip_names) == 0:
                    raise ValueError(
                        "Zero files found in ZIP file {}".format(path_or_buf))
                else:
                    raise ValueError(
                        "Multiple files found in ZIP file."
                        " Only one file per ZIP: {}".format(zip_names))

        # XZ Compression
        elif compression == "xz":
            f = _get_lzma_file(lzma)(path_or_buf, mode)

        # Unrecognized Compression
        else:
            msg = "Unrecognized compression type: {}".format(compression)
            raise ValueError(msg)

        handles.append(f)

    elif is_path:
        if encoding:
            # Encoding
            f = open(path_or_buf, mode, encoding=encoding, newline="")
        elif is_text:
            # No explicit encoding
            f = open(path_or_buf, mode, errors="replace", newline="")
        else:
            # Binary mode
            f = open(path_or_buf, mode)
        handles.append(f)

    # Convert BytesIO or file objects passed with an encoding
    if is_text and (compression or isinstance(f, need_text_wrapping)):
        from io import TextIOWrapper

        f = TextIOWrapper(f, encoding=encoding, newline="")
        handles.append(f)

    if memory_map and hasattr(f, "fileno"):
        try:
            g = MMapWrapper(f)
            f.close()
            f = g
        except Exception:
            # we catch any errors that may have occurred
            # because that is consistent with the lower-level
            # functionality of the C engine (pd.read_csv), so
            # leave the file handler as is then
            pass

    return f, handles