def read_sas(filepath_or_buffer, format=None, index=None, encoding=None,
             chunksize=None, iterator=False):
    """
    Read SAS files stored as either XPORT or SAS7BDAT format files.

    Parameters
    ----------
    filepath_or_buffer : string or file-like object
        Path to the SAS file.
    format : string {'xport', 'sas7bdat'} or None
        If None, file format is inferred from file extension. If 'xport' or
        'sas7bdat', uses the corresponding format.
    index : identifier of index column, defaults to None
        Identifier of column that should be used as index of the DataFrame.
    encoding : string, default is None
        Encoding for text data.  If None, text data are stored as raw bytes.
    chunksize : int
        Read file `chunksize` lines at a time, returns iterator.
    iterator : bool, defaults to False
        If True, returns an iterator for reading the file incrementally.

    Returns
    -------
    DataFrame if iterator=False and chunksize=None, else SAS7BDATReader
    or XportReader

    Raises
    ------
    ValueError
        If `format` is None and `filepath_or_buffer` is not a string path,
        if the format cannot be inferred from the file extension, or if
        `format` names an unknown SAS format.

    Notes
    -----
    When a reader is returned (``iterator=True`` or a truthy ``chunksize``)
    it is handed back open; this function does not close it.
    """
    if format is None:
        buffer_error_msg = ("If this is a buffer object rather "
                            "than a string name, you must specify "
                            "a format string")
        filepath_or_buffer = _stringify_path(filepath_or_buffer)
        if not isinstance(filepath_or_buffer, compat.string_types):
            raise ValueError(buffer_error_msg)
        fname = filepath_or_buffer.lower()
        if fname.endswith(".xpt"):
            format = "xport"
        elif fname.endswith(".sas7bdat"):
            format = "sas7bdat"
        else:
            raise ValueError("unable to infer format of SAS file")

    # Normalise once so both comparisons see the same value.
    fmt = format.lower()
    if fmt == 'xport':
        from pandas.io.sas.sas_xport import XportReader
        reader = XportReader(filepath_or_buffer, index=index,
                             encoding=encoding,
                             chunksize=chunksize)
    elif fmt == 'sas7bdat':
        from pandas.io.sas.sas7bdat import SAS7BDATReader
        reader = SAS7BDATReader(filepath_or_buffer, index=index,
                                encoding=encoding,
                                chunksize=chunksize)
    else:
        raise ValueError('unknown SAS format')

    if iterator or chunksize:
        return reader

    # Close the reader even when read() raises, so a failed parse does not
    # leave the underlying file handle open.
    try:
        return reader.read()
    finally:
        reader.close()
def test_encoding_options(datapath):
    """Check that ``encoding='utf-8'`` matches manually decoding raw bytes.

    Also checks that column names read with ``convert_header_text=False``
    decode to the names produced by the default read.
    """
    sas_path = datapath("io", "sas", "data", "test1.sas7bdat")
    raw_frame = pd.read_sas(sas_path)
    decoded_frame = pd.read_sas(sas_path, encoding='utf-8')

    # Decode the byte columns by hand; columns whose .str.decode access
    # raises AttributeError are left untouched.
    for name in raw_frame.columns:
        try:
            raw_frame[name] = raw_frame[name].str.decode('utf-8')
        except AttributeError:
            continue
    tm.assert_frame_equal(raw_frame, decoded_frame)

    from pandas.io.sas.sas7bdat import SAS7BDATReader
    header_reader = SAS7BDATReader(sas_path, convert_header_text=False)
    bytes_header_frame = header_reader.read()
    header_reader.close()

    name_pairs = zip(raw_frame.columns, bytes_header_frame.columns)
    for text_name, bytes_name in name_pairs:
        assert text_name == bytes_name.decode()
def load(filepath, cols):
    """Read a SAS7BDAT file and return only the requested columns.

    Parameters
    ----------
    filepath
        Location of the SAS file; passed straight to ``SAS7BDATReader``.
    cols
        Column selector applied to the loaded frame as ``data[cols]``.

    Returns
    -------
    The result of indexing the fully-read DataFrame with ``cols``.
    """
    # Irritatingly, have to use SAS7BDATReader instead of more natural
    # pd.read_sas()...
    # ...because of an issue with converting SAS dates that causes the read
    # to fail.
    # pd.read_sas() is just a thin wrapper that checks SAS datafile type from
    # file extension. For '.sas7bdat' files, it then creates a SAS7BDATReader
    # anyway...
    # ...but pd.read_sas() doesn't expose SAS7BDATReader's
    # "convert_dates=False"
    # ...so only way out is to just directly create the SAS7BDATReader!
    print("load " + str(filepath))
    in_reader = SAS7BDATReader(filepath, convert_dates=False, convert_text=False)
    try:
        # This creates a pd.DataFrame from the SAS7BDATReader object.
        # Can specify a number of rows inside read() - empty () means read
        # all the rows!
        data = in_reader.read()
    finally:
        # Release the reader's file handle whether or not the read succeeded.
        in_reader.close()
    return data[cols]
def read_sas(
    filepath_or_buffer: FilePathOrBuffer,
    format: Optional[str] = None,
    index: Optional[Label] = None,
    encoding: Optional[str] = None,
    chunksize: Optional[int] = None,
    iterator: bool = False,
) -> Union["DataFrame", ReaderBase]:
    """
    Read SAS files stored as either XPORT or SAS7BDAT format files.

    Parameters
    ----------
    filepath_or_buffer : str, path object or file-like object
        Any valid string path is acceptable. The string could be a URL. Valid
        URL schemes include http, ftp, s3, and file. For file URLs, a host is
        expected. A local file could be:
        ``file://localhost/path/to/table.sas``.

        If you want to pass in a path object, pandas accepts any
        ``os.PathLike``.

        By file-like object, we refer to objects with a ``read()`` method,
        such as a file handle (e.g. via builtin ``open`` function)
        or ``StringIO``.
    format : str {'xport', 'sas7bdat'} or None
        If None, file format is inferred from file extension. If 'xport' or
        'sas7bdat', uses the corresponding format.
    index : identifier of index column, defaults to None
        Identifier of column that should be used as index of the DataFrame.
    encoding : str, default is None
        Encoding for text data.  If None, text data are stored as raw bytes.
    chunksize : int
        Read file `chunksize` lines at a time, returns iterator.
    iterator : bool, defaults to False
        If True, returns an iterator for reading the file incrementally.

    Returns
    -------
    DataFrame if iterator=False and chunksize=None, else SAS7BDATReader
    or XportReader

    Raises
    ------
    ValueError
        If `format` is None and `filepath_or_buffer` is not a string path,
        if the format cannot be inferred from the file extension, or if
        `format` names an unknown SAS format.

    Notes
    -----
    When a reader is returned (``iterator=True`` or a truthy ``chunksize``)
    it is handed back open; this function does not close it.
    """
    if format is None:
        buffer_error_msg = (
            "If this is a buffer object rather "
            "than a string name, you must specify a format string"
        )
        filepath_or_buffer = stringify_path(filepath_or_buffer)
        if not isinstance(filepath_or_buffer, str):
            raise ValueError(buffer_error_msg)
        fname = filepath_or_buffer.lower()
        if fname.endswith(".xpt"):
            format = "xport"
        elif fname.endswith(".sas7bdat"):
            format = "sas7bdat"
        else:
            raise ValueError("unable to infer format of SAS file")

    # Normalise once so both comparisons see the same value.
    fmt = format.lower()
    reader: ReaderBase
    if fmt == "xport":
        from pandas.io.sas.sas_xport import XportReader

        reader = XportReader(
            filepath_or_buffer,
            index=index,
            encoding=encoding,
            chunksize=chunksize,
        )
    elif fmt == "sas7bdat":
        from pandas.io.sas.sas7bdat import SAS7BDATReader

        reader = SAS7BDATReader(
            filepath_or_buffer,
            index=index,
            encoding=encoding,
            chunksize=chunksize,
        )
    else:
        raise ValueError("unknown SAS format")

    if iterator or chunksize:
        return reader

    # The reader is not returned on this path, so close it here; ``finally``
    # releases it even when read() raises.
    try:
        return reader.read()
    finally:
        reader.close()
def read_sas(
    filepath_or_buffer: FilePath | ReadBuffer[bytes],
    format: str | None = None,
    index: Hashable | None = None,
    encoding: str | None = None,
    chunksize: int | None = None,
    iterator: bool = False,
    compression: CompressionOptions = "infer",
) -> DataFrame | ReaderBase:
    """
    Read SAS files stored as either XPORT or SAS7BDAT format files.

    Parameters
    ----------
    filepath_or_buffer : str, path object, or file-like object
        String, path object (implementing ``os.PathLike[str]``), or file-like
        object implementing a binary ``read()`` function. The string could be
        a URL. Valid URL schemes include http, ftp, s3, and file. For file
        URLs, a host is expected. A local file could be:
        ``file://localhost/path/to/table.sas``.
    format : str {{'xport', 'sas7bdat'}} or None
        If None, file format is inferred from file extension. If 'xport' or
        'sas7bdat', uses the corresponding format.
    index : identifier of index column, defaults to None
        Identifier of column that should be used as index of the DataFrame.
    encoding : str, default is None
        Encoding for text data.  If None, text data are stored as raw bytes.
    chunksize : int
        Read file `chunksize` lines at a time, returns iterator.

        .. versionchanged:: 1.2

            ``TextFileReader`` is a context manager.
    iterator : bool, defaults to False
        If True, returns an iterator for reading the file incrementally.

        .. versionchanged:: 1.2

            ``TextFileReader`` is a context manager.
    {decompression_options}

    Returns
    -------
    DataFrame if iterator=False and chunksize=None, else SAS7BDATReader
    or XportReader
    """
    if format is None:
        # Without an explicit format the only hint available is the file name,
        # so anything that is not a string path is rejected.
        filepath_or_buffer = stringify_path(filepath_or_buffer)
        if not isinstance(filepath_or_buffer, str):
            raise ValueError(
                "If this is a buffer object rather "
                "than a string name, you must specify a format string"
            )

        lowered_name = filepath_or_buffer.lower()
        # Substring (not suffix) match; ".xpt" is tested first.
        if ".xpt" in lowered_name:
            format = "xport"
        elif ".sas7bdat" in lowered_name:
            format = "sas7bdat"
        else:
            raise ValueError(
                f"unable to infer format of SAS file from filename: {repr(lowered_name)}"
            )

    requested = format.lower()
    # Keyword arguments shared by both reader constructors.
    shared_options = dict(
        index=index,
        encoding=encoding,
        chunksize=chunksize,
        compression=compression,
    )

    reader: ReaderBase
    if requested == "xport":
        from pandas.io.sas.sas_xport import XportReader

        reader = XportReader(filepath_or_buffer, **shared_options)
    elif requested == "sas7bdat":
        from pandas.io.sas.sas7bdat import SAS7BDATReader

        reader = SAS7BDATReader(filepath_or_buffer, **shared_options)
    else:
        raise ValueError("unknown SAS format")

    incremental = iterator or chunksize
    if not incremental:
        # Read the whole file inside the reader's context manager.
        with reader:
            return reader.read()
    return reader