def get_filepath_or_buffer( filepath_or_buffer: FilePathOrBuffer, encoding: Optional[str] = None, compression: Optional[str] = None, mode: Optional[str] = None, ): """ If the filepath_or_buffer is a url, translate and return the buffer. Otherwise passthrough. Parameters ---------- filepath_or_buffer : a url, filepath (str, py.path.local or pathlib.Path), or buffer compression : {{'gzip', 'bz2', 'zip', 'xz', None}}, optional encoding : the encoding to use to decode bytes, default is 'utf-8' mode : str, optional Returns ------- Tuple[FilePathOrBuffer, str, str, bool] Tuple containing the filepath or buffer, the encoding, the compression and should_close. """ filepath_or_buffer = stringify_path(filepath_or_buffer) if isinstance(filepath_or_buffer, str) and is_url(filepath_or_buffer): req = urlopen(filepath_or_buffer) content_encoding = req.headers.get("Content-Encoding", None) if content_encoding == "gzip": # Override compression based on Content-Encoding header compression = "gzip" reader = BytesIO(req.read()) req.close() return reader, encoding, compression, True if is_s3_url(filepath_or_buffer): from pandas.io import s3 return s3.get_filepath_or_buffer(filepath_or_buffer, encoding=encoding, compression=compression, mode=mode) if is_gcs_url(filepath_or_buffer): from pandas.io import gcs return gcs.get_filepath_or_buffer(filepath_or_buffer, encoding=encoding, compression=compression, mode=mode) if isinstance(filepath_or_buffer, (str, bytes, mmap.mmap)): return _expand_user(filepath_or_buffer), None, compression, False if not is_file_like(filepath_or_buffer): msg = f"Invalid file path or buffer object type: {type(filepath_or_buffer)}" raise ValueError(msg) return filepath_or_buffer, None, compression, False
def stringify_path( filepath_or_buffer: FilePath | BaseBufferT, convert_file_like: bool = False, ) -> str | BaseBufferT: """ Attempt to convert a path-like object to a string. Parameters ---------- filepath_or_buffer : object to be converted Returns ------- str_filepath_or_buffer : maybe a string version of the object Notes ----- Objects supporting the fspath protocol (python 3.6+) are coerced according to its __fspath__ method. Any other object is passed through unchanged, which includes bytes, strings, buffers, or anything else that's not even path-like. """ if not convert_file_like and is_file_like(filepath_or_buffer): # GH 38125: some fsspec objects implement os.PathLike but have already opened a # file. This prevents opening the file a second time. infer_compression calls # this function with convert_file_like=True to infer the compression. return cast(BaseBufferT, filepath_or_buffer) if isinstance(filepath_or_buffer, os.PathLike): filepath_or_buffer = filepath_or_buffer.__fspath__() return _expand_user(filepath_or_buffer)
def get_filepath_or_buffer(filepath_or_buffer, encoding=None, compression=None, mode=None): """ If the filepath_or_buffer is a url, translate and return the buffer. Otherwise passthrough. Parameters ---------- filepath_or_buffer : a url, filepath (str, py.path.local or pathlib.Path), or buffer compression : {{'gzip', 'bz2', 'zip', 'xz', None}}, optional encoding : the encoding to use to decode bytes, default is 'utf-8' mode : str, optional Returns ------- tuple of ({a filepath_ or buffer or S3File instance}, encoding, str, compression, str, should_close, bool) """ filepath_or_buffer = _stringify_path(filepath_or_buffer) if _is_url(filepath_or_buffer): req = urlopen(filepath_or_buffer) content_encoding = req.headers.get('Content-Encoding', None) if content_encoding == 'gzip': # Override compression based on Content-Encoding header compression = 'gzip' reader = BytesIO(req.read()) req.close() return reader, encoding, compression, True if is_s3_url(filepath_or_buffer): from pandas.io import s3 return s3.get_filepath_or_buffer(filepath_or_buffer, encoding=encoding, compression=compression, mode=mode) if is_gcs_url(filepath_or_buffer): from pandas.io import gcs return gcs.get_filepath_or_buffer(filepath_or_buffer, encoding=encoding, compression=compression, mode=mode) if isinstance(filepath_or_buffer, (compat.string_types, compat.binary_type, mmap.mmap)): return _expand_user(filepath_or_buffer), None, compression, False if not is_file_like(filepath_or_buffer): msg = "Invalid file path or buffer object type: {_type}" raise ValueError(msg.format(_type=type(filepath_or_buffer))) return filepath_or_buffer, None, compression, False
def _check_file_or_buffer(self, f, engine): # see gh-16530 if is_file_like(f) and engine != "c" and not hasattr(f, "__next__"): # The C engine doesn't need the file-like to have the "__next__" # attribute. However, the Python engine explicitly calls # "__next__(...)" when iterating through such an object, meaning it # needs to have that attribute raise ValueError( "The 'python' engine cannot iterate through this file buffer.")
def get_filepath_or_buffer(filepath_or_buffer, encoding=None, compression=None, mode=None): """ If the filepath_or_buffer is a url, translate and return the buffer. Otherwise passthrough. Parameters ---------- filepath_or_buffer : a url, filepath (str, py.path.local or pathlib.Path), or buffer encoding : the encoding to use to decode py3 bytes, default is 'utf-8' mode : str, optional Returns ------- tuple of ({a filepath_ or buffer or S3File instance}, encoding, str, compression, str, should_close, bool) """ filepath_or_buffer = _stringify_path(filepath_or_buffer) if _is_url(filepath_or_buffer): req = _urlopen(filepath_or_buffer) content_encoding = req.headers.get('Content-Encoding', None) if content_encoding == 'gzip': # Override compression based on Content-Encoding header compression = 'gzip' reader = BytesIO(req.read()) req.close() return reader, encoding, compression, True if is_s3_url(filepath_or_buffer): from pandas.io import s3 return s3.get_filepath_or_buffer(filepath_or_buffer, encoding=encoding, compression=compression, mode=mode) if is_gcs_url(filepath_or_buffer): from pandas.io import gcs return gcs.get_filepath_or_buffer(filepath_or_buffer, encoding=encoding, compression=compression, mode=mode) if isinstance(filepath_or_buffer, (compat.string_types, compat.binary_type, mmap.mmap)): return _expand_user(filepath_or_buffer), None, compression, False if not is_file_like(filepath_or_buffer): msg = "Invalid file path or buffer object type: {_type}" raise ValueError(msg.format(_type=type(filepath_or_buffer))) return filepath_or_buffer, None, compression, False
def get_filepath_or_buffer(filepath_or_buffer, encoding=None, compression=None): """ If the filepath_or_buffer is a url, translate and return the buffer. Otherwise passthrough. Parameters ---------- filepath_or_buffer : a url, filepath (str, py.path.local or pathlib.Path), or buffer encoding : the encoding to use to decode py3 bytes, default is 'utf-8' Returns ------- a filepath_or_buffer, the encoding, the compression """ if _is_url(filepath_or_buffer): url = str(filepath_or_buffer) req = _urlopen(url) content_encoding = req.headers.get('Content-Encoding', None) if content_encoding == 'gzip': # Override compression based on Content-Encoding header compression = 'gzip' reader = BytesIO(req.read()) return reader, encoding, compression if _is_s3_url(filepath_or_buffer): from pandas.io import s3 return s3.get_filepath_or_buffer(filepath_or_buffer, encoding=encoding, compression=compression) # Convert pathlib.Path/py.path.local or string filepath_or_buffer = _stringify_path(filepath_or_buffer) if isinstance(filepath_or_buffer, (compat.string_types, compat.binary_type, mmap.mmap)): return _expand_user(filepath_or_buffer), None, compression if not is_file_like(filepath_or_buffer): msg = "Invalid file path or buffer object type: {_type}" raise ValueError(msg.format(_type=type(filepath_or_buffer))) return filepath_or_buffer, None, compression
def get_filepath_or_buffer(filepath_or_buffer, encoding=None, compression=None): """ If the filepath_or_buffer is a url, translate and return the buffer. Otherwise passthrough. Parameters ---------- filepath_or_buffer : a url, filepath (str, py.path.local or pathlib.Path), or buffer encoding : the encoding to use to decode py3 bytes, default is 'utf-8' Returns ------- a filepath_or_buffer, the encoding, the compression """ if _is_url(filepath_or_buffer): url = str(filepath_or_buffer) req = _urlopen(url) content_encoding = req.headers.get('Content-Encoding', None) if content_encoding == 'gzip': # Override compression based on Content-Encoding header compression = 'gzip' reader = BytesIO(req.read()) return reader, encoding, compression if _is_s3_url(filepath_or_buffer): from pandas.io import s3 return s3.get_filepath_or_buffer(filepath_or_buffer, encoding=encoding, compression=compression) # Convert pathlib.Path/py.path.local or string filepath_or_buffer = _stringify_path(filepath_or_buffer) if isinstance(filepath_or_buffer, (compat.string_types, compat.binary_type, mmap.mmap)): return _expand_user(filepath_or_buffer), None, compression if not is_file_like(filepath_or_buffer): msg = "Invalid file path or buffer object type: {_type}" raise ValueError(msg.format(_type=type(filepath_or_buffer))) return filepath_or_buffer, None, compression
def get_filepath_or_buffer( filepath_or_buffer: FilePathOrBuffer, encoding: EncodingVar = None, # type: ignore[assignment] compression: CompressionOptions = None, mode: ModeVar = None, # type: ignore[assignment] storage_options: StorageOptions = None, ) -> IOargs[ModeVar, EncodingVar]: """ If the filepath_or_buffer is a url, translate and return the buffer. Otherwise passthrough. Parameters ---------- filepath_or_buffer : a url, filepath (str, py.path.local or pathlib.Path), or buffer compression : {{'gzip', 'bz2', 'zip', 'xz', None}}, optional encoding : the encoding to use to decode bytes, default is 'utf-8' mode : str, optional storage_options : dict, optional Extra options that make sense for a particular storage connection, e.g. host, port, username, password, etc., if using a URL that will be parsed by ``fsspec``, e.g., starting "s3://", "gcs://". An error will be raised if providing this argument with a local path or a file-like buffer. See the fsspec and backend storage implementation docs for the set of allowed keys and values .. versionadded:: 1.2.0 ..versionchange:: 1.2.0 Returns the dataclass IOargs. """ filepath_or_buffer = stringify_path(filepath_or_buffer) # bz2 and xz do not write the byte order mark for utf-16 and utf-32 # print a warning when writing such files compression_method = infer_compression( filepath_or_buffer, get_compression_method(compression)[0]) if (mode and "w" in mode and compression_method in ["bz2", "xz"] and encoding in ["utf-16", "utf-32"]): warnings.warn( f"{compression} will not write the byte order mark for {encoding}", UnicodeWarning, ) # Use binary mode when converting path-like objects to file-like objects (fsspec) # except when text mode is explicitly requested. The original mode is returned if # fsspec is not used. fsspec_mode = mode or "rb" if "t" not in fsspec_mode and "b" not in fsspec_mode: fsspec_mode += "b" if isinstance(filepath_or_buffer, str) and is_url(filepath_or_buffer): # TODO: fsspec can also handle HTTP via requests, but leaving this unchanged if storage_options: raise ValueError( "storage_options passed with file object or non-fsspec file path" ) req = urlopen(filepath_or_buffer) content_encoding = req.headers.get("Content-Encoding", None) if content_encoding == "gzip": # Override compression based on Content-Encoding header compression = "gzip" reader = BytesIO(req.read()) req.close() return IOargs( filepath_or_buffer=reader, encoding=encoding, compression=compression, should_close=True, mode=fsspec_mode, ) if is_fsspec_url(filepath_or_buffer): assert isinstance(filepath_or_buffer, str) # just to appease mypy for this branch # two special-case s3-like protocols; these have special meaning in Hadoop, # but are equivalent to just "s3" from fsspec's point of view # cc #11071 if filepath_or_buffer.startswith("s3a://"): filepath_or_buffer = filepath_or_buffer.replace("s3a://", "s3://") if filepath_or_buffer.startswith("s3n://"): filepath_or_buffer = filepath_or_buffer.replace("s3n://", "s3://") fsspec = import_optional_dependency("fsspec") # If botocore is installed we fallback to reading with anon=True # to allow reads from public buckets err_types_to_retry_with_anon: List[Any] = [] try: import_optional_dependency("botocore") from botocore.exceptions import ClientError, NoCredentialsError err_types_to_retry_with_anon = [ ClientError, NoCredentialsError, PermissionError, ] except ImportError: pass try: file_obj = fsspec.open(filepath_or_buffer, mode=fsspec_mode, **(storage_options or {})).open() # GH 34626 Reads from Public Buckets without Credentials needs anon=True except tuple(err_types_to_retry_with_anon): if storage_options is None: storage_options = {"anon": True} else: # don't mutate user input. storage_options = dict(storage_options) storage_options["anon"] = True file_obj = fsspec.open(filepath_or_buffer, mode=fsspec_mode, **(storage_options or {})).open() return IOargs( filepath_or_buffer=file_obj, encoding=encoding, compression=compression, should_close=True, mode=fsspec_mode, ) elif storage_options: raise ValueError( "storage_options passed with file object or non-fsspec file path") if isinstance(filepath_or_buffer, (str, bytes, mmap.mmap)): return IOargs( filepath_or_buffer=_expand_user(filepath_or_buffer), encoding=encoding, compression=compression, should_close=False, mode=mode, ) if not is_file_like(filepath_or_buffer): msg = f"Invalid file path or buffer object type: {type(filepath_or_buffer)}" raise ValueError(msg) return IOargs( filepath_or_buffer=filepath_or_buffer, encoding=encoding, compression=compression, should_close=False, mode=mode, )
def get_filepath_or_buffer( filepath_or_buffer: FilePathOrBuffer, encoding: Optional[str] = None, compression: Optional[str] = None, mode: Optional[str] = None, storage_options: Optional[Dict[str, Any]] = None, ): """ If the filepath_or_buffer is a url, translate and return the buffer. Otherwise passthrough. Parameters ---------- filepath_or_buffer : a url, filepath (str, py.path.local or pathlib.Path), or buffer compression : {{'gzip', 'bz2', 'zip', 'xz', None}}, optional encoding : the encoding to use to decode bytes, default is 'utf-8' mode : str, optional storage_options: dict, optional passed on to fsspec, if using it; this is not yet accessed by the public API Returns ------- Tuple[FilePathOrBuffer, str, str, bool] Tuple containing the filepath or buffer, the encoding, the compression and should_close. """ filepath_or_buffer = stringify_path(filepath_or_buffer) if isinstance(filepath_or_buffer, str) and is_url(filepath_or_buffer): # TODO: fsspec can also handle HTTP via requests, but leaving this unchanged req = urlopen(filepath_or_buffer) content_encoding = req.headers.get("Content-Encoding", None) if content_encoding == "gzip": # Override compression based on Content-Encoding header compression = "gzip" reader = BytesIO(req.read()) req.close() return reader, encoding, compression, True if is_fsspec_url(filepath_or_buffer): assert isinstance(filepath_or_buffer, str) # just to appease mypy for this branch # two special-case s3-like protocols; these have special meaning in Hadoop, # but are equivalent to just "s3" from fsspec's point of view # cc #11071 if filepath_or_buffer.startswith("s3a://"): filepath_or_buffer = filepath_or_buffer.replace("s3a://", "s3://") if filepath_or_buffer.startswith("s3n://"): filepath_or_buffer = filepath_or_buffer.replace("s3n://", "s3://") fsspec = import_optional_dependency("fsspec") # If botocore is installed we fallback to reading with anon=True # to allow reads from public buckets err_types_to_retry_with_anon: List[Any] = [] try: import_optional_dependency("botocore") from botocore.exceptions import ClientError, NoCredentialsError err_types_to_retry_with_anon = [ ClientError, NoCredentialsError, PermissionError, ] except ImportError: pass try: file_obj = fsspec.open(filepath_or_buffer, mode=mode or "rb", **(storage_options or {})).open() # GH 34626 Reads from Public Buckets without Credentials needs anon=True except tuple(err_types_to_retry_with_anon): if storage_options is None: storage_options = {"anon": True} else: # don't mutate user input. storage_options = dict(storage_options) storage_options["anon"] = True file_obj = fsspec.open(filepath_or_buffer, mode=mode or "rb", **(storage_options or {})).open() return file_obj, encoding, compression, True if isinstance(filepath_or_buffer, (str, bytes, mmap.mmap)): return _expand_user(filepath_or_buffer), None, compression, False if not is_file_like(filepath_or_buffer): msg = f"Invalid file path or buffer object type: {type(filepath_or_buffer)}" raise ValueError(msg) return filepath_or_buffer, None, compression, False
def _get_filepath_or_buffer( filepath_or_buffer: FilePathOrBuffer, encoding: str = "utf-8", compression: CompressionOptions = None, mode: str = "r", storage_options: StorageOptions = None, ) -> IOArgs: """ If the filepath_or_buffer is a url, translate and return the buffer. Otherwise passthrough. Parameters ---------- filepath_or_buffer : a url, filepath (str, py.path.local or pathlib.Path), or buffer compression : {{'gzip', 'bz2', 'zip', 'xz', None}}, optional encoding : the encoding to use to decode bytes, default is 'utf-8' mode : str, optional storage_options : dict, optional Extra options that make sense for a particular storage connection, e.g. host, port, username, password, etc., if using a URL that will be parsed by ``fsspec``, e.g., starting "s3://", "gcs://". An error will be raised if providing this argument with a local path or a file-like buffer. See the fsspec and backend storage implementation docs for the set of allowed keys and values .. versionadded:: 1.2.0 ..versionchange:: 1.2.0 Returns the dataclass IOArgs. """ filepath_or_buffer = stringify_path(filepath_or_buffer) # handle compression dict compression_method, compression = get_compression_method(compression) compression_method = infer_compression(filepath_or_buffer, compression_method) # GH21227 internal compression is not used for non-binary handles. if compression_method and hasattr(filepath_or_buffer, "write") and "b" not in mode: warnings.warn( "compression has no effect when passing a non-binary object as input.", RuntimeWarning, stacklevel=2, ) compression_method = None compression = dict(compression, method=compression_method) # uniform encoding names if encoding is not None: encoding = encoding.replace("_", "-").lower() # bz2 and xz do not write the byte order mark for utf-16 and utf-32 # print a warning when writing such files if ( "w" in mode and compression_method in ["bz2", "xz"] and encoding in ["utf-16", "utf-32"] ): warnings.warn( f"{compression} will not write the byte order mark for {encoding}", UnicodeWarning, ) # Use binary mode when converting path-like objects to file-like objects (fsspec) # except when text mode is explicitly requested. The original mode is returned if # fsspec is not used. fsspec_mode = mode if "t" not in fsspec_mode and "b" not in fsspec_mode: fsspec_mode += "b" if isinstance(filepath_or_buffer, str) and is_url(filepath_or_buffer): # TODO: fsspec can also handle HTTP via requests, but leaving this # unchanged. using fsspec appears to break the ability to infer if the # server responded with gzipped data storage_options = storage_options or {} # waiting until now for importing to match intended lazy logic of # urlopen function defined elsewhere in this module import urllib.request # assuming storage_options is to be interpreted as headers req_info = urllib.request.Request(filepath_or_buffer, headers=storage_options) with urlopen(req_info) as req: content_encoding = req.headers.get("Content-Encoding", None) if content_encoding == "gzip": # Override compression based on Content-Encoding header compression = {"method": "gzip"} reader = BytesIO(req.read()) return IOArgs( filepath_or_buffer=reader, encoding=encoding, compression=compression, should_close=True, mode=fsspec_mode, ) if is_fsspec_url(filepath_or_buffer): assert isinstance( filepath_or_buffer, str ) # just to appease mypy for this branch # two special-case s3-like protocols; these have special meaning in Hadoop, # but are equivalent to just "s3" from fsspec's point of view # cc #11071 if filepath_or_buffer.startswith("s3a://"): filepath_or_buffer = filepath_or_buffer.replace("s3a://", "s3://") if filepath_or_buffer.startswith("s3n://"): filepath_or_buffer = filepath_or_buffer.replace("s3n://", "s3://") fsspec = import_optional_dependency("fsspec") # If botocore is installed we fallback to reading with anon=True # to allow reads from public buckets err_types_to_retry_with_anon: list[Any] = [] try: import_optional_dependency("botocore") from botocore.exceptions import ( ClientError, NoCredentialsError, ) err_types_to_retry_with_anon = [ ClientError, NoCredentialsError, PermissionError, ] except ImportError: pass try: file_obj = fsspec.open( filepath_or_buffer, mode=fsspec_mode, **(storage_options or {}) ).open() # GH 34626 Reads from Public Buckets without Credentials needs anon=True except tuple(err_types_to_retry_with_anon): if storage_options is None: storage_options = {"anon": True} else: # don't mutate user input. storage_options = dict(storage_options) storage_options["anon"] = True file_obj = fsspec.open( filepath_or_buffer, mode=fsspec_mode, **(storage_options or {}) ).open() return IOArgs( filepath_or_buffer=file_obj, encoding=encoding, compression=compression, should_close=True, mode=fsspec_mode, ) elif storage_options: raise ValueError( "storage_options passed with file object or non-fsspec file path" ) if isinstance(filepath_or_buffer, (str, bytes, mmap.mmap)): return IOArgs( filepath_or_buffer=_expand_user(filepath_or_buffer), encoding=encoding, compression=compression, should_close=False, mode=mode, ) if not is_file_like(filepath_or_buffer): msg = f"Invalid file path or buffer object type: {type(filepath_or_buffer)}" raise ValueError(msg) return IOArgs( filepath_or_buffer=filepath_or_buffer, encoding=encoding, compression=compression, should_close=False, mode=mode, )
def get_filepath_or_buffer( filepath_or_buffer: FilePathOrBuffer, encoding: Optional[str] = None, compression: Optional[str] = None, mode: Optional[str] = None, storage_options: Optional[Dict[str, Any]] = None, ): """ If the filepath_or_buffer is a url, translate and return the buffer. Otherwise passthrough. Parameters ---------- filepath_or_buffer : a url, filepath (str, py.path.local or pathlib.Path), or buffer compression : {{'gzip', 'bz2', 'zip', 'xz', None}}, optional encoding : the encoding to use to decode bytes, default is 'utf-8' mode : str, optional storage_options: dict, optional passed on to fsspec, if using it; this is not yet accessed by the public API Returns ------- Tuple[FilePathOrBuffer, str, str, bool] Tuple containing the filepath or buffer, the encoding, the compression and should_close. """ filepath_or_buffer = stringify_path(filepath_or_buffer) if isinstance(filepath_or_buffer, str) and is_url(filepath_or_buffer): # TODO: fsspec can also handle HTTP via requests, but leaving this unchanged req = urlopen(filepath_or_buffer) content_encoding = req.headers.get("Content-Encoding", None) if content_encoding == "gzip": # Override compression based on Content-Encoding header compression = "gzip" reader = BytesIO(req.read()) req.close() return reader, encoding, compression, True if is_fsspec_url(filepath_or_buffer): assert isinstance(filepath_or_buffer, str) # just to appease mypy for this branch # two special-case s3-like protocols; these have special meaning in Hadoop, # but are equivalent to just "s3" from fsspec's point of view # cc #11071 if filepath_or_buffer.startswith("s3a://"): filepath_or_buffer = filepath_or_buffer.replace("s3a://", "s3://") if filepath_or_buffer.startswith("s3n://"): filepath_or_buffer = filepath_or_buffer.replace("s3n://", "s3://") fsspec = import_optional_dependency("fsspec") file_obj = fsspec.open(filepath_or_buffer, mode=mode or "rb", **(storage_options or {})).open() return file_obj, encoding, compression, True if isinstance(filepath_or_buffer, (str, bytes, mmap.mmap)): return _expand_user(filepath_or_buffer), None, compression, False if not is_file_like(filepath_or_buffer): msg = f"Invalid file path or buffer object type: {type(filepath_or_buffer)}" raise ValueError(msg) return filepath_or_buffer, None, compression, False