def test_stringify_path_localpath(self): path = os.path.join("foo", "bar") abs_path = os.path.abspath(path) lpath = LocalPath(path) assert icom.stringify_path(lpath) == abs_path
def test_stringify_path_fspath(self): p = CustomFSPath("foo/bar.csv") result = icom.stringify_path(p) assert result == "foo/bar.csv"
def __fspath__(self): return stringify_path(self.path)
def test_stringify_path_pathlib(self): rel_path = icom.stringify_path(Path(".")) assert rel_path == "." redundant_path = icom.stringify_path(Path("foo//bar")) assert redundant_path == os.path.join("foo", "bar")
def __init__(self, path_or_buffer, engine=None, storage_options: StorageOptions = None): if engine is not None and engine not in self._engines: raise ValueError(f"Unknown engine: {engine}") # Could be a str, ExcelFile, Book, etc. self.io = path_or_buffer # Always a string self._io = stringify_path(path_or_buffer) # Determine xlrd version if installed if import_optional_dependency("xlrd", errors="ignore") is None: xlrd_version = None else: import xlrd xlrd_version = LooseVersion(get_version(xlrd)) if xlrd_version is not None and isinstance(path_or_buffer, xlrd.Book): ext = "xls" else: ext = inspect_excel_format(content_or_path=path_or_buffer, storage_options=storage_options) if engine is None: # ext will always be valid, otherwise inspect_excel_format would raise engine = config.get_option(f"io.excel.{ext}.reader", silent=True) if engine == "auto": engine = get_default_engine(ext, mode="reader") if engine == "xlrd" and ext != "xls" and xlrd_version is not None: if xlrd_version >= "2": raise ValueError( f"Your version of xlrd is {xlrd_version}. In xlrd >= 2.0, " f"only the xls format is supported. Install openpyxl instead." ) else: caller = inspect.stack()[1] if (caller.filename.endswith( os.path.join("pandas", "io", "excel", "_base.py")) and caller.function == "read_excel"): stacklevel = 4 else: stacklevel = 2 warnings.warn( f"Your version of xlrd is {xlrd_version}. In xlrd >= 2.0, " f"only the xls format is supported. As a result, the " f"openpyxl engine will be used if it is installed and the " f"engine argument is not specified. Install " f"openpyxl instead.", FutureWarning, stacklevel=stacklevel, ) self.engine = engine self.storage_options = storage_options self._reader = self._engines[engine](self._io, storage_options=storage_options)
def read_html( io: FilePathOrBuffer, match: str | Pattern = ".+", flavor: str | None = None, header: int | Sequence[int] | None = None, index_col: int | Sequence[int] | None = None, skiprows: int | Sequence[int] | slice | None = None, attrs: dict[str, str] | None = None, parse_dates: bool = False, thousands: str | None = ",", encoding: str | None = None, decimal: str = ".", converters: dict | None = None, na_values=None, keep_default_na: bool = True, displayed_only: bool = True, ) -> list[DataFrame]: r""" Read HTML tables into a ``list`` of ``DataFrame`` objects. Parameters ---------- io : str, path object or file-like object A URL, a file-like object, or a raw string containing HTML. Note that lxml only accepts the http, ftp and file url protocols. If you have a URL that starts with ``'https'`` you might try removing the ``'s'``. match : str or compiled regular expression, optional The set of tables containing text matching this regex or string will be returned. Unless the HTML is extremely simple you will probably need to pass a non-empty string here. Defaults to '.+' (match any non-empty string). The default value will return all tables contained on a page. This value is converted to a regular expression so that there is consistent behavior between Beautiful Soup and lxml. flavor : str, optional The parsing engine to use. 'bs4' and 'html5lib' are synonymous with each other, they are both there for backwards compatibility. The default of ``None`` tries to use ``lxml`` to parse and if that fails it falls back on ``bs4`` + ``html5lib``. header : int or list-like, optional The row (or list of rows for a :class:`~pandas.MultiIndex`) to use to make the columns headers. index_col : int or list-like, optional The column (or list of columns) to use to create the index. skiprows : int, list-like or slice, optional Number of rows to skip after parsing the column integer. 0-based. If a sequence of integers or a slice is given, will skip the rows indexed by that sequence. Note that a single element sequence means 'skip the nth row' whereas an integer means 'skip n rows'. attrs : dict, optional This is a dictionary of attributes that you can pass to use to identify the table in the HTML. These are not checked for validity before being passed to lxml or Beautiful Soup. However, these attributes must be valid HTML table attributes to work correctly. For example, :: attrs = {'id': 'table'} is a valid attribute dictionary because the 'id' HTML tag attribute is a valid HTML attribute for *any* HTML tag as per `this document <https://html.spec.whatwg.org/multipage/dom.html#global-attributes>`__. :: attrs = {'asdf': 'table'} is *not* a valid attribute dictionary because 'asdf' is not a valid HTML attribute even if it is a valid XML attribute. Valid HTML 4.01 table attributes can be found `here <http://www.w3.org/TR/REC-html40/struct/tables.html#h-11.2>`__. A working draft of the HTML 5 spec can be found `here <https://html.spec.whatwg.org/multipage/tables.html>`__. It contains the latest information on table attributes for the modern web. parse_dates : bool, optional See :func:`~read_csv` for more details. thousands : str, optional Separator to use to parse thousands. Defaults to ``','``. encoding : str, optional The encoding used to decode the web page. Defaults to ``None``.``None`` preserves the previous encoding behavior, which depends on the underlying parser library (e.g., the parser library will try to use the encoding provided by the document). decimal : str, default '.' Character to recognize as decimal point (e.g. use ',' for European data). converters : dict, default None Dict of functions for converting values in certain columns. Keys can either be integers or column labels, values are functions that take one input argument, the cell (not column) content, and return the transformed content. na_values : iterable, default None Custom NA values. keep_default_na : bool, default True If na_values are specified and keep_default_na is False the default NaN values are overridden, otherwise they're appended to. displayed_only : bool, default True Whether elements with "display: none" should be parsed. Returns ------- dfs A list of DataFrames. See Also -------- read_csv : Read a comma-separated values (csv) file into DataFrame. Notes ----- Before using this function you should read the :ref:`gotchas about the HTML parsing libraries <io.html.gotchas>`. Expect to do some cleanup after you call this function. For example, you might need to manually assign column names if the column names are converted to NaN when you pass the `header=0` argument. We try to assume as little as possible about the structure of the table and push the idiosyncrasies of the HTML contained in the table to the user. This function searches for ``<table>`` elements and only for ``<tr>`` and ``<th>`` rows and ``<td>`` elements within each ``<tr>`` or ``<th>`` element in the table. ``<td>`` stands for "table data". This function attempts to properly handle ``colspan`` and ``rowspan`` attributes. If the function has a ``<thead>`` argument, it is used to construct the header, otherwise the function attempts to find the header within the body (by putting rows with only ``<th>`` elements into the header). Similar to :func:`~read_csv` the `header` argument is applied **after** `skiprows` is applied. This function will *always* return a list of :class:`DataFrame` *or* it will fail, e.g., it will *not* return an empty list. Examples -------- See the :ref:`read_html documentation in the IO section of the docs <io.read_html>` for some examples of reading in HTML tables. """ _importers() # Type check here. We don't want to parse only to fail because of an # invalid value of an integer skiprows. if isinstance(skiprows, numbers.Integral) and skiprows < 0: raise ValueError("cannot skip rows starting from the end of the " "data (you passed a negative value)") validate_header_arg(header) io = stringify_path(io) return _parse( flavor=flavor, io=io, match=match, header=header, index_col=index_col, skiprows=skiprows, parse_dates=parse_dates, thousands=thousands, attrs=attrs, encoding=encoding, decimal=decimal, converters=converters, na_values=na_values, keep_default_na=keep_default_na, displayed_only=displayed_only, )
def __init__(self, path_or_buffer, engine=None, storage_options: StorageOptions = None): if engine is not None and engine not in self._engines: raise ValueError(f"Unknown engine: {engine}") # Could be a str, ExcelFile, Book, etc. self.io = path_or_buffer # Always a string self._io = stringify_path(path_or_buffer) # Determine xlrd version if installed if (import_optional_dependency( "xlrd", raise_on_missing=False, on_version="ignore") is None): xlrd_version = None else: import xlrd xlrd_version = LooseVersion(xlrd.__version__) if isinstance(path_or_buffer, (BufferedIOBase, RawIOBase, bytes)): ext = inspect_excel_format(content=path_or_buffer, storage_options=storage_options) elif xlrd_version is not None and isinstance(path_or_buffer, xlrd.Book): ext = "xls" else: # path_or_buffer is path-like, use stringified path ext = inspect_excel_format(path=str(self._io), storage_options=storage_options) if engine is None: if ext == "ods": engine = "odf" elif ext == "xls": engine = "xlrd" else: # GH 35029 - Prefer openpyxl except for xls files if (import_optional_dependency("openpyxl", raise_on_missing=False, on_version="ignore") is not None): engine = "openpyxl" else: engine = "xlrd" if engine == "xlrd" and ext != "xls" and xlrd_version is not None: if xlrd_version >= "2": raise ValueError( f"Your version of xlrd is {xlrd_version}. In xlrd >= 2.0, " f"only the xls format is supported. Install openpyxl instead." ) else: caller = inspect.stack()[1] if (caller.filename.endswith( os.path.join("pandas", "io", "excel", "_base.py")) and caller.function == "read_excel"): stacklevel = 4 else: stacklevel = 2 warnings.warn( f"Your version of xlrd is {xlrd_version}. In xlrd >= 2.0, " f"only the xls format is supported. As a result, the " f"openpyxl engine will be used if it is installed and the " f"engine argument is not specified. Install " f"openpyxl instead.", FutureWarning, stacklevel=stacklevel, ) assert engine in self._engines, f"Engine {engine} not recognized" self.engine = engine self.storage_options = storage_options self._reader = self._engines[engine](self._io, storage_options=storage_options)
def __fspath__(self): # pandas\io\excel\_base.py:744: error: Argument 1 to "stringify_path" # has incompatible type "Optional[Any]"; expected "Union[str, Path, # IO[Any], IOBase]" [arg-type] return stringify_path(self.path) # type: ignore[arg-type]
def to_string( self, buf=None, na_rep="NaN", float_format=None, header=True, index=True, length=False, dtype=False, name=False, max_rows=None, min_rows=None, ) -> Optional[str]: """ Render a string representation of the Series. Follows pandas implementation except when ``max_rows=None``. In this scenario, we set ``max_rows={0}`` to avoid accidentally dumping an entire index. This can be overridden by explicitly setting ``max_rows``. See Also -------- :pandas_api_docs:`pandas.Series.to_string` for argument details. """ # In pandas calling 'to_string' without max_rows set, will dump ALL rows - we avoid this # by limiting rows by default. num_rows = len(self) # avoid multiple calls if num_rows <= DEFAULT_NUM_ROWS_DISPLAYED: if max_rows is None: max_rows = num_rows else: max_rows = min(num_rows, max_rows) elif max_rows is None: warnings.warn( f"Series.to_string called without max_rows set " f"- this will return entire index results. " f"Setting max_rows={DEFAULT_NUM_ROWS_DISPLAYED}" f" overwrite if different behaviour is required.", UserWarning, ) max_rows = DEFAULT_NUM_ROWS_DISPLAYED # because of the way pandas handles max_rows=0, not having this throws an error # see eland issue #56 if max_rows == 0: max_rows = 1 # Create a slightly bigger dataframe than display temp_series = self._build_repr(max_rows + 1) if buf is not None: _buf = _expand_user(stringify_path(buf)) else: _buf = StringIO() # Create repr of fake series without name, length, dtype summary temp_series.to_string( buf=_buf, na_rep=na_rep, float_format=float_format, header=header, index=index, length=False, dtype=False, name=False, max_rows=max_rows, ) # Create the summary footer = [] if name and self.name is not None: footer.append(f"Name: {self.name}") if length and len(self) > max_rows: footer.append(f"Length: {len(self.index)}") if dtype: footer.append(f"dtype: {temp_series.dtype}") if footer: _buf.write(f"\n{', '.join(footer)}") if buf is None: result = _buf.getvalue() return result
def test_stringify_file_and_path_like(self): # GH 38125: do not stringify file objects that are also path-like fsspec = pytest.importorskip("fsspec") with tm.ensure_clean() as path: with fsspec.open(f"file://{path}", mode="wb") as fsspec_obj: assert fsspec_obj == icom.stringify_path(fsspec_obj)
def __init__(self, path_or_buffer, engine=None, storage_options: StorageOptions = None): if engine is None: # Determine ext and use odf for ods stream/file if isinstance(path_or_buffer, (BufferedIOBase, RawIOBase)): ext = None if _is_ods_stream(path_or_buffer): engine = "odf" else: ext = os.path.splitext(str(path_or_buffer))[-1] if ext == ".ods": engine = "odf" if (import_optional_dependency("xlrd", raise_on_missing=False, on_version="ignore") is not None): from xlrd import Book if isinstance(path_or_buffer, Book): engine = "xlrd" # GH 35029 - Prefer openpyxl except for xls files if engine is None: if ext is None or isinstance(path_or_buffer, bytes) or ext == ".xls": engine = "xlrd" elif (import_optional_dependency("openpyxl", raise_on_missing=False, on_version="ignore") is not None): engine = "openpyxl" else: caller = inspect.stack()[1] if (caller.filename.endswith("pandas/io/excel/_base.py") and caller.function == "read_excel"): stacklevel = 4 else: stacklevel = 2 warnings.warn( "The xlrd engine is no longer maintained and is not " "supported when using pandas with python >= 3.9. However, " "the engine xlrd will continue to be allowed for the " "indefinite future. Beginning with pandas 1.2.0, the " "openpyxl engine will be used if it is installed and the " "engine argument is not specified. Either install openpyxl " "or specify engine='xlrd' to silence this warning.", FutureWarning, stacklevel=stacklevel, ) engine = "xlrd" if engine not in self._engines: raise ValueError(f"Unknown engine: {engine}") self.engine = engine self.storage_options = storage_options # Could be a str, ExcelFile, Book, etc. self.io = path_or_buffer # Always a string self._io = stringify_path(path_or_buffer) self._reader = self._engines[engine](self._io, storage_options=storage_options)
def to_pickle(obj, path, compression="infer", protocol=pickle.HIGHEST_PROTOCOL): """ Pickle (serialize) object to file. Parameters ---------- obj : any object Any python object. path : str File path where the pickled object will be stored. compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None}, default 'infer' A string representing the compression to use in the output file. By default, infers from the file extension in specified path. protocol : int Int which indicates which protocol should be used by the pickler, default HIGHEST_PROTOCOL (see [1], paragraph 12.1.2). The possible values for this parameter depend on the version of Python. For Python 2.x, possible values are 0, 1, 2. For Python>=3.0, 3 is a valid value. For Python >= 3.4, 4 is a valid value. A negative value for the protocol parameter is equivalent to setting its value to HIGHEST_PROTOCOL. .. [1] https://docs.python.org/3/library/pickle.html .. versionadded:: 0.21.0 See Also -------- read_pickle : Load pickled pandas object (or any object) from file. DataFrame.to_hdf : Write DataFrame to an HDF5 file. DataFrame.to_sql : Write DataFrame to a SQL database. DataFrame.to_parquet : Write a DataFrame to the binary parquet format. Examples -------- >>> original_df = pd.DataFrame({"foo": range(5), "bar": range(5, 10)}) >>> original_df foo bar 0 0 5 1 1 6 2 2 7 3 3 8 4 4 9 >>> pd.to_pickle(original_df, "./dummy.pkl") >>> unpickled_df = pd.read_pickle("./dummy.pkl") >>> unpickled_df foo bar 0 0 5 1 1 6 2 2 7 3 3 8 4 4 9 >>> import os >>> os.remove("./dummy.pkl") """ path = stringify_path(path) f, fh = get_handle(path, "wb", compression=compression, is_text=False) if protocol < 0: protocol = pickle.HIGHEST_PROTOCOL try: f.write(pickle.dumps(obj, protocol=protocol)) finally: f.close() for _f in fh: _f.close()
def read_pickle(path, compression="infer"): """ Load pickled pandas object (or any object) from file. .. warning:: Loading pickled data received from untrusted sources can be unsafe. See `here <https://docs.python.org/3/library/pickle.html>`__. Parameters ---------- path : str File path where the pickled object will be loaded. compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None}, default 'infer' For on-the-fly decompression of on-disk data. If 'infer', then use gzip, bz2, xz or zip if path ends in '.gz', '.bz2', '.xz', or '.zip' respectively, and no decompression otherwise. Set to None for no decompression. Returns ------- unpickled : same type as object stored in file See Also -------- DataFrame.to_pickle : Pickle (serialize) DataFrame object to file. Series.to_pickle : Pickle (serialize) Series object to file. read_hdf : Read HDF5 file into a DataFrame. read_sql : Read SQL query or database table into a DataFrame. read_parquet : Load a parquet object, returning a DataFrame. Notes ----- read_pickle is only guaranteed to be backwards compatible to pandas 0.20.3. Examples -------- >>> original_df = pd.DataFrame({"foo": range(5), "bar": range(5, 10)}) >>> original_df foo bar 0 0 5 1 1 6 2 2 7 3 3 8 4 4 9 >>> pd.to_pickle(original_df, "./dummy.pkl") >>> unpickled_df = pd.read_pickle("./dummy.pkl") >>> unpickled_df foo bar 0 0 5 1 1 6 2 2 7 3 3 8 4 4 9 >>> import os >>> os.remove("./dummy.pkl") """ path = stringify_path(path) f, fh = get_handle(path, "rb", compression=compression, is_text=False) # 1) try standard library Pickle # 2) try pickle_compat (older pandas version) to handle subclass changes excs_to_catch = (AttributeError, ImportError, ModuleNotFoundError) try: with warnings.catch_warnings(record=True): # We want to silence any warnings about, e.g. moved modules. warnings.simplefilter("ignore", Warning) return pickle.load(f) except excs_to_catch: # e.g. # "No module named 'pandas.core.sparse.series'" # "Can't get attribute '__nat_unpickle' on <module 'pandas._libs.tslib" return pc.load(f, encoding=None) except UnicodeDecodeError: # e.g. can occur for files written in py27; see GH#28645 return pc.load(f, encoding="latin-1") finally: f.close() for _f in fh: _f.close()
def read_sas( filepath_or_buffer, format=None, index=None, encoding=None, chunksize=None, iterator=False, ): """ Read SAS files stored as either XPORT or SAS7BDAT format files. Parameters ---------- filepath_or_buffer : str, path object or file-like object Any valid string path is acceptable. The string could be a URL. Valid URL schemes include http, ftp, s3, and file. For file URLs, a host is expected. A local file could be: ``file://localhost/path/to/table.sas``. If you want to pass in a path object, pandas accepts any ``os.PathLike``. By file-like object, we refer to objects with a ``read()`` method, such as a file handler (e.g. via builtin ``open`` function) or ``StringIO``. format : str {'xport', 'sas7bdat'} or None If None, file format is inferred from file extension. If 'xport' or 'sas7bdat', uses the corresponding format. index : identifier of index column, defaults to None Identifier of column that should be used as index of the DataFrame. encoding : str, default is None Encoding for text data. If None, text data are stored as raw bytes. chunksize : int Read file `chunksize` lines at a time, returns iterator. iterator : bool, defaults to False If True, returns an iterator for reading the file incrementally. Returns ------- DataFrame if iterator=False and chunksize=None, else SAS7BDATReader or XportReader """ if format is None: buffer_error_msg = ( "If this is a buffer object rather " "than a string name, you must specify a format string" ) filepath_or_buffer = stringify_path(filepath_or_buffer) if not isinstance(filepath_or_buffer, str): raise ValueError(buffer_error_msg) fname = filepath_or_buffer.lower() if fname.endswith(".xpt"): format = "xport" elif fname.endswith(".sas7bdat"): format = "sas7bdat" else: raise ValueError("unable to infer format of SAS file") reader: ReaderBase if format.lower() == "xport": from pandas.io.sas.sas_xport import XportReader reader = XportReader( filepath_or_buffer, index=index, encoding=encoding, chunksize=chunksize ) elif format.lower() == "sas7bdat": from pandas.io.sas.sas7bdat import SAS7BDATReader reader = SAS7BDATReader( filepath_or_buffer, index=index, encoding=encoding, chunksize=chunksize ) else: raise ValueError("unknown SAS format") if iterator or chunksize: return reader data = reader.read() reader.close() return data
def __init__(self, path_or_buffer, engine=None, storage_options: StorageOptions = None): if engine is not None and engine not in self._engines: raise ValueError(f"Unknown engine: {engine}") # First argument can also be bytes, so create a buffer if isinstance(path_or_buffer, bytes): path_or_buffer = BytesIO(path_or_buffer) # Could be a str, ExcelFile, Book, etc. self.io = path_or_buffer # Always a string self._io = stringify_path(path_or_buffer) # Determine xlrd version if installed if import_optional_dependency("xlrd", errors="ignore") is None: xlrd_version = None else: import xlrd xlrd_version = Version(get_version(xlrd)) ext = None if engine is None: # Only determine ext if it is needed if xlrd_version is not None and isinstance(path_or_buffer, xlrd.Book): ext = "xls" else: ext = inspect_excel_format(content_or_path=path_or_buffer, storage_options=storage_options) if ext is None: raise ValueError( "Excel file format cannot be determined, you must specify " "an engine manually.") engine = config.get_option(f"io.excel.{ext}.reader", silent=True) if engine == "auto": engine = get_default_engine(ext, mode="reader") if engine == "xlrd" and xlrd_version is not None: if ext is None: # Need ext to determine ext in order to raise/warn if isinstance(path_or_buffer, xlrd.Book): ext = "xls" else: ext = inspect_excel_format(path_or_buffer, storage_options=storage_options) # Pass through if ext is None, otherwise check if ext valid for xlrd if ext and ext != "xls" and xlrd_version >= Version("2"): raise ValueError( f"Your version of xlrd is {xlrd_version}. In xlrd >= 2.0, " f"only the xls format is supported. Install openpyxl instead." ) elif ext and ext != "xls": stacklevel = find_stack_level() warnings.warn( f"Your version of xlrd is {xlrd_version}. In xlrd >= 2.0, " f"only the xls format is supported. Install " f"openpyxl instead.", FutureWarning, stacklevel=stacklevel, ) self.engine = engine self.storage_options = storage_options self._reader = self._engines[engine](self._io, storage_options=storage_options)
def to_json( path_or_buf, obj, orient: Optional[str] = None, date_format: str = "epoch", double_precision: int = 10, force_ascii: bool = True, date_unit: str = "ms", default_handler: Optional[Callable[[Any], JSONSerializable]] = None, lines: bool = False, compression: Optional[str] = "infer", index: bool = True, indent: int = 0, ): if not index and orient not in ["split", "table"]: raise ValueError( "'index=False' is only valid when 'orient' is 'split' or 'table'") path_or_buf = stringify_path(path_or_buf) if lines and orient != "records": raise ValueError("'lines' keyword only valid when 'orient' is records") if orient == "table" and isinstance(obj, Series): obj = obj.to_frame(name=obj.name or "values") writer: Type["Writer"] if orient == "table" and isinstance(obj, DataFrame): writer = JSONTableWriter elif isinstance(obj, Series): writer = SeriesWriter elif isinstance(obj, DataFrame): writer = FrameWriter else: raise NotImplementedError("'obj' should be a Series or a DataFrame") s = writer( obj, orient=orient, date_format=date_format, double_precision=double_precision, ensure_ascii=force_ascii, date_unit=date_unit, default_handler=default_handler, index=index, indent=indent, ).write() if lines: s = convert_to_line_delimits(s) if isinstance(path_or_buf, str): fh, handles = get_handle(path_or_buf, "w", compression=compression) try: fh.write(s) finally: fh.close() elif path_or_buf is None: return s else: path_or_buf.write(s)
def read_sas( filepath_or_buffer: FilePath | ReadBuffer[bytes], format: str | None = None, index: Hashable | None = None, encoding: str | None = None, chunksize: int | None = None, iterator: bool = False, ) -> DataFrame | ReaderBase: """ Read SAS files stored as either XPORT or SAS7BDAT format files. Parameters ---------- filepath_or_buffer : str, path object, or file-like object String, path object (implementing ``os.PathLike[str]``), or file-like object implementing a binary ``read()`` function. The string could be a URL. Valid URL schemes include http, ftp, s3, and file. For file URLs, a host is expected. A local file could be: ``file://localhost/path/to/table.sas``. format : str {'xport', 'sas7bdat'} or None If None, file format is inferred from file extension. If 'xport' or 'sas7bdat', uses the corresponding format. index : identifier of index column, defaults to None Identifier of column that should be used as index of the DataFrame. encoding : str, default is None Encoding for text data. If None, text data are stored as raw bytes. chunksize : int Read file `chunksize` lines at a time, returns iterator. .. versionchanged:: 1.2 ``TextFileReader`` is a context manager. iterator : bool, defaults to False If True, returns an iterator for reading the file incrementally. .. versionchanged:: 1.2 ``TextFileReader`` is a context manager. Returns ------- DataFrame if iterator=False and chunksize=None, else SAS7BDATReader or XportReader """ if format is None: buffer_error_msg = ( "If this is a buffer object rather " "than a string name, you must specify a format string" ) filepath_or_buffer = stringify_path(filepath_or_buffer) if not isinstance(filepath_or_buffer, str): raise ValueError(buffer_error_msg) fname = filepath_or_buffer.lower() if fname.endswith(".xpt"): format = "xport" elif fname.endswith(".sas7bdat"): format = "sas7bdat" else: raise ValueError("unable to infer format of SAS file") reader: ReaderBase if format.lower() == "xport": from pandas.io.sas.sas_xport import XportReader reader = XportReader( filepath_or_buffer, index=index, encoding=encoding, chunksize=chunksize, ) elif format.lower() == "sas7bdat": from pandas.io.sas.sas7bdat import SAS7BDATReader reader = SAS7BDATReader( filepath_or_buffer, index=index, encoding=encoding, chunksize=chunksize, ) else: raise ValueError("unknown SAS format") if iterator or chunksize: return reader with reader: return reader.read()