def is_fsspec_url(url: FilePathOrBuffer) -> bool:
    """
    Return True if the given URL looks like something fsspec can handle.

    A value qualifies when it is a ``str`` that *starts* with a URL scheme
    (optionally a ``::``-chained sequence of protocols, e.g.
    ``"simplecache::s3://bucket/key"``) followed by ``"://"``, and the
    scheme is not plain ``http://`` / ``https://``.

    Parameters
    ----------
    url : str, path object or buffer
        Candidate location. Anything that is not a ``str`` yields False.

    Returns
    -------
    bool
    """
    # Local import keeps this block self-contained; ``re`` caches the
    # compiled pattern, so repeated calls stay cheap.
    import re

    if not isinstance(url, str):
        return False
    if url.startswith(("http://", "https://")):
        return False
    # Anchor on a leading scheme instead of searching for "://" anywhere:
    # inline data (e.g. a JSON document that merely *contains* a URL) must
    # not be mistaken for a remote path.
    scheme_prefix = r"[A-Za-z][A-Za-z0-9+\-.]*(::[A-Za-z0-9+\-.]+)*://"
    return re.match(scheme_prefix, url) is not None
def infer_compression(
    filepath_or_buffer: FilePathOrBuffer, compression: str | None
) -> str | None:
    """
    Resolve the compression method to use for ``filepath_or_buffer``.

    With ``compression='infer'`` the method is derived from the file
    extension. Any other value is validated against the known methods and
    returned unchanged.

    Parameters
    ----------
    filepath_or_buffer : str or file handle
        File path or object.
    compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None}
        If 'infer' and `filepath_or_buffer` is path-like, then detect
        compression from the following extensions: '.gz', '.bz2', '.zip',
        or '.xz' (otherwise no compression).

    Returns
    -------
    string or None

    Raises
    ------
    ValueError on invalid compression specified.
    """
    if compression is None:
        return None

    if compression != "infer":
        # An explicit method was requested; it only needs validating.
        if compression in _compression_to_extension:
            return compression

        # https://github.com/python/mypy/issues/5492
        # Unsupported operand types for + ("List[Optional[str]]" and "List[str]")
        valid = ["infer", None] + sorted(
            _compression_to_extension
        )  # type: ignore[operator]
        msg = (
            f"Unrecognized compression type: {compression}\n"
            f"Valid compression types are {valid}"
        )
        raise ValueError(msg)

    # Convert all path types (e.g. pathlib.Path) to strings
    path = stringify_path(filepath_or_buffer, convert_file_like=True)
    if not isinstance(path, str):
        # A buffer carries no name to inspect; assume no compression
        return None

    # Match the filename/URL extension case-insensitively
    lowered = path.lower()
    return next(
        (
            method
            for method, extension in _compression_to_extension.items()
            if lowered.endswith(extension)
        ),
        None,
    )
def _infer_compression( filepath_or_buffer: FilePathOrBuffer, compression: Optional[str] ) -> Optional[str]: """ Get the compression method for filepath_or_buffer. If compression='infer', the inferred compression method is returned. Otherwise, the input compression method is returned unchanged, unless it's invalid, in which case an error is raised. Parameters ---------- filepath_or_buffer : str or file handle File path or object. compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None} If 'infer' and `filepath_or_buffer` is path-like, then detect compression from the following extensions: '.gz', '.bz2', '.zip', or '.xz' (otherwise no compression). Returns ------- string or None Raises ------ ValueError on invalid compression specified. """ # No compression has been explicitly specified if compression is None: return None # Infer compression if compression == "infer": # Convert all path types (e.g. pathlib.Path) to strings filepath_or_buffer = _stringify_path(filepath_or_buffer) if not isinstance(filepath_or_buffer, str): # Cannot infer compression of a buffer, assume no compression return None # Infer compression from the filename/URL extension for compression, extension in _compression_to_extension.items(): if filepath_or_buffer.endswith(extension): return compression return None # Compression has been specified. Check that it's valid if compression in _compression_to_extension: return compression msg = "Unrecognized compression type: {}".format(compression) valid = ["infer", None] + sorted(_compression_to_extension) msg += "\nValid compression types are {}".format(valid) raise ValueError(msg)
def read_sas(
    filepath_or_buffer: FilePathOrBuffer,
    format: Optional[str] = None,
    index: Optional[Label] = None,
    encoding: Optional[str] = None,
    chunksize: Optional[int] = None,
    iterator: bool = False,
) -> Union["DataFrame", ReaderBase]:
    """
    Read SAS files stored as either XPORT or SAS7BDAT format files.

    Parameters
    ----------
    filepath_or_buffer : str, path object or file-like object
        Any valid string path is acceptable. The string could be a URL. Valid
        URL schemes include http, ftp, s3, and file. For file URLs, a host is
        expected. A local file could be:
        ``file://localhost/path/to/table.sas``.

        If you want to pass in a path object, pandas accepts any
        ``os.PathLike``.

        By file-like object, we refer to objects with a ``read()`` method,
        such as a file handle (e.g. via builtin ``open`` function)
        or ``StringIO``.
    format : str {'xport', 'sas7bdat'} or None
        If None, file format is inferred from file extension ('.xpt' or
        '.sas7bdat', compared case-insensitively). If 'xport' or 'sas7bdat',
        uses the corresponding format.
    index : identifier of index column, defaults to None
        Identifier of column that should be used as index of the DataFrame.
    encoding : str, default is None
        Encoding for text data.  If None, text data are stored as raw bytes.
    chunksize : int
        Read file `chunksize` lines at a time, returns iterator.
    iterator : bool, defaults to False
        If True, returns an iterator for reading the file incrementally.

    Returns
    -------
    DataFrame if iterator=False and chunksize=None, else SAS7BDATReader
    or XportReader

    Raises
    ------
    ValueError
        If ``format`` is None and the input is not a string path, if the
        format cannot be inferred from the extension, or if ``format`` names
        an unknown SAS format.
    """
    if format is None:
        # Without an explicit format, the only hint is the file extension,
        # which requires a string path.
        filepath_or_buffer = stringify_path(filepath_or_buffer)
        if not isinstance(filepath_or_buffer, str):
            raise ValueError(
                "If this is a buffer object rather "
                "than a string name, you must specify a format string"
            )

        lowered_name = filepath_or_buffer.lower()
        if lowered_name.endswith(".xpt"):
            format = "xport"
        elif lowered_name.endswith(".sas7bdat"):
            format = "sas7bdat"
        else:
            raise ValueError("unable to infer format of SAS file")

    # Pick the reader class lazily so only the needed backend is imported.
    requested = format.lower()
    if requested == "xport":
        from pandas.io.sas.sas_xport import XportReader as reader_cls
    elif requested == "sas7bdat":
        from pandas.io.sas.sas7bdat import SAS7BDATReader as reader_cls
    else:
        raise ValueError("unknown SAS format")

    reader: ReaderBase = reader_cls(
        filepath_or_buffer,
        index=index,
        encoding=encoding,
        chunksize=chunksize,
    )

    # Incremental consumers get the reader itself; everyone else gets the
    # fully materialised result.
    if iterator or chunksize:
        return reader
    return reader.read()
def get_filepath_or_buffer(
    filepath_or_buffer: FilePathOrBuffer,
    encoding: EncodingVar = None,  # type: ignore[assignment]
    compression: CompressionOptions = None,
    mode: ModeVar = None,  # type: ignore[assignment]
    storage_options: StorageOptions = None,
) -> IOargs[ModeVar, EncodingVar]:
    """
    If the filepath_or_buffer is a url, translate and return the buffer.
    Otherwise passthrough.

    Parameters
    ----------
    filepath_or_buffer : a url, filepath (str, py.path.local or pathlib.Path),
                         or buffer
    compression : {{'gzip', 'bz2', 'zip', 'xz', None}}, optional
    encoding : the encoding to use to decode bytes, default is 'utf-8'
    mode : str, optional
    storage_options : dict, optional
        Extra options that make sense for a particular storage connection, e.g.
        host, port, username, password, etc., if using a URL that will
        be parsed by ``fsspec``, e.g., starting "s3://", "gcs://". An error
        will be raised if providing this argument with a local path or
        a file-like buffer. See the fsspec and backend storage implementation
        docs for the set of allowed keys and values

        .. versionadded:: 1.2.0

    .. versionchanged:: 1.2.0

      Returns the dataclass IOargs.

    Returns
    -------
    IOargs
        Carries ``filepath_or_buffer``, ``encoding``, ``compression``,
        ``should_close`` and ``mode``. ``should_close`` is True only when
        this function opened the returned object itself (URL or fsspec).

    Raises
    ------
    ValueError
        If ``storage_options`` is given for anything other than an fsspec
        URL, or if the input is neither a path nor file-like.
    """
    filepath_or_buffer = stringify_path(filepath_or_buffer)

    # bz2 and xz do not write the byte order mark for utf-16 and utf-32
    # print a warning when writing such files
    compression_method = infer_compression(
        filepath_or_buffer, get_compression_method(compression)[0]
    )
    if (
        mode
        and "w" in mode
        and compression_method in ["bz2", "xz"]
        and encoding in ["utf-16", "utf-32"]
    ):
        warnings.warn(
            f"{compression} will not write the byte order mark for {encoding}",
            UnicodeWarning,
        )

    # Use binary mode when converting path-like objects to file-like objects
    # (fsspec) except when text mode is explicitly requested. The original
    # mode is returned if fsspec is not used.
    fsspec_mode = mode or "rb"
    if "t" not in fsspec_mode and "b" not in fsspec_mode:
        fsspec_mode += "b"

    if isinstance(filepath_or_buffer, str) and is_url(filepath_or_buffer):
        # TODO: fsspec can also handle HTTP via requests, but leaving this
        # unchanged
        if storage_options:
            raise ValueError(
                "storage_options passed with file object or non-fsspec file path"
            )
        req = urlopen(filepath_or_buffer)
        try:
            content_encoding = req.headers.get("Content-Encoding", None)
            if content_encoding == "gzip":
                # Override compression based on Content-Encoding header
                compression = "gzip"
            reader = BytesIO(req.read())
        finally:
            # Release the connection even if reading the body fails.
            req.close()
        return IOargs(
            filepath_or_buffer=reader,
            encoding=encoding,
            compression=compression,
            should_close=True,
            mode=fsspec_mode,
        )

    if is_fsspec_url(filepath_or_buffer):
        assert isinstance(
            filepath_or_buffer, str
        )  # just to appease mypy for this branch
        # two special-case s3-like protocols; these have special meaning in
        # Hadoop, but are equivalent to just "s3" from fsspec's point of view
        # cc #11071
        if filepath_or_buffer.startswith("s3a://"):
            filepath_or_buffer = filepath_or_buffer.replace("s3a://", "s3://")
        if filepath_or_buffer.startswith("s3n://"):
            filepath_or_buffer = filepath_or_buffer.replace("s3n://", "s3://")
        fsspec = import_optional_dependency("fsspec")

        # If botocore is installed we fallback to reading with anon=True
        # to allow reads from public buckets
        err_types_to_retry_with_anon: List[Any] = []
        try:
            import_optional_dependency("botocore")
            from botocore.exceptions import ClientError, NoCredentialsError

            err_types_to_retry_with_anon = [
                ClientError,
                NoCredentialsError,
                PermissionError,
            ]
        except ImportError:
            pass

        try:
            file_obj = fsspec.open(
                filepath_or_buffer, mode=fsspec_mode, **(storage_options or {})
            ).open()
        # GH 34626 Reads from Public Buckets without Credentials needs anon=True
        except tuple(err_types_to_retry_with_anon):
            if storage_options is None:
                storage_options = {"anon": True}
            else:
                # don't mutate user input.
                storage_options = dict(storage_options)
                storage_options["anon"] = True
            file_obj = fsspec.open(
                filepath_or_buffer, mode=fsspec_mode, **(storage_options or {})
            ).open()

        return IOargs(
            filepath_or_buffer=file_obj,
            encoding=encoding,
            compression=compression,
            should_close=True,
            mode=fsspec_mode,
        )
    elif storage_options:
        raise ValueError(
            "storage_options passed with file object or non-fsspec file path"
        )

    if isinstance(filepath_or_buffer, (str, bytes, mmap.mmap)):
        return IOargs(
            filepath_or_buffer=_expand_user(filepath_or_buffer),
            encoding=encoding,
            compression=compression,
            should_close=False,
            mode=mode,
        )

    if not is_file_like(filepath_or_buffer):
        msg = f"Invalid file path or buffer object type: {type(filepath_or_buffer)}"
        raise ValueError(msg)

    return IOargs(
        filepath_or_buffer=filepath_or_buffer,
        encoding=encoding,
        compression=compression,
        should_close=False,
        mode=mode,
    )
def get_filepath_or_buffer(
    filepath_or_buffer: FilePathOrBuffer,
    encoding: Optional[str] = None,
    compression: Optional[str] = None,
    mode: Optional[str] = None,
    storage_options: Optional[Dict[str, Any]] = None,
):
    """
    If the filepath_or_buffer is a url, translate and return the buffer.
    Otherwise passthrough.

    Parameters
    ----------
    filepath_or_buffer : a url, filepath (str, py.path.local or pathlib.Path),
                         or buffer
    compression : {{'gzip', 'bz2', 'zip', 'xz', None}}, optional
    encoding : the encoding to use to decode bytes, default is 'utf-8'
    mode : str, optional
    storage_options: dict, optional
        passed on to fsspec, if using it; this is not yet accessed by the public API

    Returns
    -------
    Tuple[FilePathOrBuffer, str, str, bool]
        Tuple containing the filepath or buffer, the encoding, the compression
        and should_close. For local paths and already-open buffers the
        encoding slot is None and should_close is False.

    Raises
    ------
    ValueError
        If the input is neither a path nor a file-like object.
    """
    filepath_or_buffer = stringify_path(filepath_or_buffer)

    if isinstance(filepath_or_buffer, str) and is_url(filepath_or_buffer):
        # TODO: fsspec can also handle HTTP via requests, but leaving this
        # unchanged
        req = urlopen(filepath_or_buffer)
        try:
            content_encoding = req.headers.get("Content-Encoding", None)
            if content_encoding == "gzip":
                # Override compression based on Content-Encoding header
                compression = "gzip"
            reader = BytesIO(req.read())
        finally:
            # Release the connection even if reading the body fails.
            req.close()
        return reader, encoding, compression, True

    if is_fsspec_url(filepath_or_buffer):
        assert isinstance(
            filepath_or_buffer, str
        )  # just to appease mypy for this branch
        # two special-case s3-like protocols; these have special meaning in
        # Hadoop, but are equivalent to just "s3" from fsspec's point of view
        # cc #11071
        if filepath_or_buffer.startswith("s3a://"):
            filepath_or_buffer = filepath_or_buffer.replace("s3a://", "s3://")
        if filepath_or_buffer.startswith("s3n://"):
            filepath_or_buffer = filepath_or_buffer.replace("s3n://", "s3://")
        fsspec = import_optional_dependency("fsspec")

        # If botocore is installed we fallback to reading with anon=True
        # to allow reads from public buckets
        err_types_to_retry_with_anon: List[Any] = []
        try:
            import_optional_dependency("botocore")
            from botocore.exceptions import ClientError, NoCredentialsError

            err_types_to_retry_with_anon = [
                ClientError,
                NoCredentialsError,
                PermissionError,
            ]
        except ImportError:
            pass

        try:
            file_obj = fsspec.open(
                filepath_or_buffer, mode=mode or "rb", **(storage_options or {})
            ).open()
        # GH 34626 Reads from Public Buckets without Credentials needs anon=True
        except tuple(err_types_to_retry_with_anon):
            if storage_options is None:
                storage_options = {"anon": True}
            else:
                # don't mutate user input.
                storage_options = dict(storage_options)
                storage_options["anon"] = True
            file_obj = fsspec.open(
                filepath_or_buffer, mode=mode or "rb", **(storage_options or {})
            ).open()

        return file_obj, encoding, compression, True

    if isinstance(filepath_or_buffer, (str, bytes, mmap.mmap)):
        return _expand_user(filepath_or_buffer), None, compression, False

    if not is_file_like(filepath_or_buffer):
        msg = f"Invalid file path or buffer object type: {type(filepath_or_buffer)}"
        raise ValueError(msg)

    return filepath_or_buffer, None, compression, False
def _get_filepath_or_buffer(
    filepath_or_buffer: FilePathOrBuffer,
    encoding: str = "utf-8",
    compression: CompressionOptions = None,
    mode: str = "r",
    storage_options: StorageOptions = None,
) -> IOArgs:
    """
    Turn a URL into an in-memory or fsspec-backed buffer; pass anything
    else through unchanged.

    Parameters
    ----------
    filepath_or_buffer : a url, filepath (str, py.path.local or pathlib.Path),
                         or buffer
    compression : {{'gzip', 'bz2', 'zip', 'xz', None}}, optional
    encoding : the encoding to use to decode bytes, default is 'utf-8'
    mode : str, optional
    storage_options : dict, optional
        Extra options that make sense for a particular storage connection, e.g.
        host, port, username, password, etc., if using a URL that will
        be parsed by ``fsspec``, e.g., starting "s3://", "gcs://". For plain
        URLs handled by ``urlopen`` the options are sent as request headers.
        An error will be raised if providing this argument with a local path
        or a file-like buffer. See the fsspec and backend storage
        implementation docs for the set of allowed keys and values

        .. versionadded:: 1.2.0

    .. versionchanged:: 1.2.0

      Returns the dataclass IOArgs.

    Returns
    -------
    IOArgs
        Carries ``filepath_or_buffer``, ``encoding``, ``compression`` (a dict
        with a ``"method"`` key), ``should_close`` and ``mode``.

    Raises
    ------
    ValueError
        If ``storage_options`` is given with a local path or buffer, or if
        the input is neither a path nor file-like.
    """
    filepath_or_buffer = stringify_path(filepath_or_buffer)

    # handle compression dict
    method, compression = get_compression_method(compression)
    method = infer_compression(filepath_or_buffer, method)

    # GH21227 internal compression is not used for non-binary handles.
    is_text_handle = hasattr(filepath_or_buffer, "write") and "b" not in mode
    if method and is_text_handle:
        warnings.warn(
            "compression has no effect when passing a non-binary object as input.",
            RuntimeWarning,
            stacklevel=2,
        )
        method = None

    compression = dict(compression, method=method)

    # uniform encoding names
    if encoding is not None:
        encoding = encoding.replace("_", "-").lower()

    # bz2 and xz do not write the byte order mark for utf-16 and utf-32
    # print a warning when writing such files
    writes_without_bom = (
        "w" in mode and method in ["bz2", "xz"] and encoding in ["utf-16", "utf-32"]
    )
    if writes_without_bom:
        warnings.warn(
            f"{compression} will not write the byte order mark for {encoding}",
            UnicodeWarning,
        )

    # Use binary mode when converting path-like objects to file-like objects
    # (fsspec) except when text mode is explicitly requested. The original
    # mode is returned if fsspec is not used.
    if "t" in mode or "b" in mode:
        fsspec_mode = mode
    else:
        fsspec_mode = mode + "b"

    if isinstance(filepath_or_buffer, str) and is_url(filepath_or_buffer):
        # TODO: fsspec can also handle HTTP via requests, but leaving this
        # unchanged. using fsspec appears to break the ability to infer if the
        # server responded with gzipped data
        headers = storage_options or {}

        # waiting until now for importing to match intended lazy logic of
        # urlopen function defined elsewhere in this module
        import urllib.request

        # assuming storage_options is to be interpreted as headers
        request = urllib.request.Request(filepath_or_buffer, headers=headers)
        with urlopen(request) as response:
            if response.headers.get("Content-Encoding", None) == "gzip":
                # Override compression based on Content-Encoding header
                compression = {"method": "gzip"}
            payload = BytesIO(response.read())
        return IOArgs(
            filepath_or_buffer=payload,
            encoding=encoding,
            compression=compression,
            should_close=True,
            mode=fsspec_mode,
        )

    if is_fsspec_url(filepath_or_buffer):
        assert isinstance(
            filepath_or_buffer, str
        )  # just to appease mypy for this branch
        # two special-case s3-like protocols; these have special meaning in
        # Hadoop, but are equivalent to just "s3" from fsspec's point of view
        # cc #11071
        if filepath_or_buffer.startswith("s3a://"):
            filepath_or_buffer = filepath_or_buffer.replace("s3a://", "s3://")
        elif filepath_or_buffer.startswith("s3n://"):
            filepath_or_buffer = filepath_or_buffer.replace("s3n://", "s3://")
        fsspec = import_optional_dependency("fsspec")

        # If botocore is installed we fallback to reading with anon=True
        # to allow reads from public buckets
        retryable_errors: tuple = ()
        try:
            import_optional_dependency("botocore")
            from botocore.exceptions import (
                ClientError,
                NoCredentialsError,
            )

            retryable_errors = (ClientError, NoCredentialsError, PermissionError)
        except ImportError:
            pass

        try:
            handle = fsspec.open(
                filepath_or_buffer, mode=fsspec_mode, **(storage_options or {})
            ).open()
        # GH 34626 Reads from Public Buckets without Credentials needs anon=True
        except retryable_errors:
            # Build a fresh dict so the caller's options are never mutated.
            storage_options = dict(storage_options or {}, anon=True)
            handle = fsspec.open(
                filepath_or_buffer, mode=fsspec_mode, **storage_options
            ).open()

        return IOArgs(
            filepath_or_buffer=handle,
            encoding=encoding,
            compression=compression,
            should_close=True,
            mode=fsspec_mode,
        )

    if storage_options:
        raise ValueError(
            "storage_options passed with file object or non-fsspec file path"
        )

    if isinstance(filepath_or_buffer, (str, bytes, mmap.mmap)):
        return IOArgs(
            filepath_or_buffer=_expand_user(filepath_or_buffer),
            encoding=encoding,
            compression=compression,
            should_close=False,
            mode=mode,
        )

    if is_file_like(filepath_or_buffer):
        return IOArgs(
            filepath_or_buffer=filepath_or_buffer,
            encoding=encoding,
            compression=compression,
            should_close=False,
            mode=mode,
        )

    msg = f"Invalid file path or buffer object type: {type(filepath_or_buffer)}"
    raise ValueError(msg)
def get_filepath_or_buffer(
    filepath_or_buffer: FilePathOrBuffer,
    encoding: Optional[str] = None,
    compression: Optional[str] = None,
    mode: Optional[str] = None,
    storage_options: Optional[Dict[str, Any]] = None,
):
    """
    If the filepath_or_buffer is a url, translate and return the buffer.
    Otherwise passthrough.

    Parameters
    ----------
    filepath_or_buffer : a url, filepath (str, py.path.local or pathlib.Path),
                         or buffer
    compression : {{'gzip', 'bz2', 'zip', 'xz', None}}, optional
    encoding : the encoding to use to decode bytes, default is 'utf-8'
    mode : str, optional
    storage_options: dict, optional
        passed on to fsspec, if using it; this is not yet accessed by the public API

    Returns
    -------
    Tuple[FilePathOrBuffer, str, str, bool]
        Tuple containing the filepath or buffer, the encoding, the compression
        and should_close. For local paths and already-open buffers the
        encoding slot is None and should_close is False.

    Raises
    ------
    ValueError
        If the input is neither a path nor a file-like object.
    """
    filepath_or_buffer = stringify_path(filepath_or_buffer)

    if isinstance(filepath_or_buffer, str) and is_url(filepath_or_buffer):
        # TODO: fsspec can also handle HTTP via requests, but leaving this
        # unchanged
        req = urlopen(filepath_or_buffer)
        try:
            content_encoding = req.headers.get("Content-Encoding", None)
            if content_encoding == "gzip":
                # Override compression based on Content-Encoding header
                compression = "gzip"
            reader = BytesIO(req.read())
        finally:
            # Release the connection even if reading the body fails.
            req.close()
        return reader, encoding, compression, True

    if is_fsspec_url(filepath_or_buffer):
        assert isinstance(
            filepath_or_buffer, str
        )  # just to appease mypy for this branch
        # two special-case s3-like protocols; these have special meaning in
        # Hadoop, but are equivalent to just "s3" from fsspec's point of view
        # cc #11071
        if filepath_or_buffer.startswith("s3a://"):
            filepath_or_buffer = filepath_or_buffer.replace("s3a://", "s3://")
        if filepath_or_buffer.startswith("s3n://"):
            filepath_or_buffer = filepath_or_buffer.replace("s3n://", "s3://")
        fsspec = import_optional_dependency("fsspec")

        file_obj = fsspec.open(
            filepath_or_buffer, mode=mode or "rb", **(storage_options or {})
        ).open()
        return file_obj, encoding, compression, True

    if isinstance(filepath_or_buffer, (str, bytes, mmap.mmap)):
        return _expand_user(filepath_or_buffer), None, compression, False

    if not is_file_like(filepath_or_buffer):
        msg = f"Invalid file path or buffer object type: {type(filepath_or_buffer)}"
        raise ValueError(msg)

    return filepath_or_buffer, None, compression, False