def __init__(
    self,
    filepath_or_buffer: FilePath | ReadBuffer[bytes],
    index=None,
    encoding: str | None = "ISO-8859-1",
    chunksize=None,
    compression: CompressionOptions = "infer",
) -> None:
    self._encoding = encoding
    self._lines_read = 0
    self._index = index
    self._chunksize = chunksize

    self.handles = get_handle(
        filepath_or_buffer,
        "rb",
        encoding=encoding,
        is_text=False,
        compression=compression,
    )
    self.filepath_or_buffer = self.handles.handle

    try:
        self._read_header()
    except Exception:
        self.close()
        raise

def _get_path_or_handle(
    path: FilePathOrBuffer,
    fs: Any,
    storage_options: StorageOptions = None,
    mode: str = "rb",
    is_dir: bool = False,
) -> Tuple[FilePathOrBuffer, Optional[IOHandles], Any]:
    """File handling for PyArrow."""
    path_or_handle = stringify_path(path)
    if is_fsspec_url(path_or_handle) and fs is None:
        fsspec = import_optional_dependency("fsspec")
        fs, path_or_handle = fsspec.core.url_to_fs(
            path_or_handle, **(storage_options or {})
        )
    elif storage_options:
        raise ValueError("storage_options passed with buffer or non-fsspec filepath")

    handles = None
    if (
        not fs
        and not is_dir
        and isinstance(path_or_handle, str)
        and not os.path.isdir(path_or_handle)
    ):
        # use get_handle only when we are very certain that it is not a directory
        # fsspec resources can also point to directories
        # this branch is used for example when reading from non-fsspec URLs
        handles = get_handle(path_or_handle, mode, is_text=False)
        fs = None
        path_or_handle = handles.handle
    return path_or_handle, handles, fs

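# A minimal caller sketch, not taken from pandas itself: it assumes the same
# pandas-internal imports used by _get_path_or_handle above and shows how the
# returned (path_or_handle, handles, fs) triple is typically consumed and how
# the optional handle is released afterwards.  _read_parquet_sketch is a
# hypothetical name.
import pyarrow.parquet as pq


def _read_parquet_sketch(path, columns=None, storage_options=None):
    path_or_handle, handles, fs = _get_path_or_handle(
        path, fs=None, storage_options=storage_options, mode="rb"
    )
    try:
        table = pq.read_table(path_or_handle, columns=columns, filesystem=fs)
        return table.to_pandas()
    finally:
        if handles is not None:
            # handles is only set when get_handle opened a local, non-fsspec file
            handles.close()
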
def test_to_csv_compression(self, df, encoding, compression):
    with tm.ensure_clean() as filename:
        df.to_csv(filename, compression=compression, encoding=encoding)
        # test the round trip - to_csv -> read_csv
        result = read_csv(
            filename, compression=compression, index_col=0, encoding=encoding
        )
        tm.assert_frame_equal(df, result)

        # test the round trip using file handle - to_csv -> read_csv
        with get_handle(
            filename, "w", compression=compression, encoding=encoding
        ) as handles:
            df.to_csv(handles.handle, encoding=encoding)
            assert not handles.handle.closed
        result = read_csv(
            filename,
            compression=compression,
            encoding=encoding,
            index_col=0,
        ).squeeze("columns")
        tm.assert_frame_equal(df, result)

        # explicitly make sure file is compressed
        with tm.decompress_file(filename, compression) as fh:
            text = fh.read().decode(encoding or "utf8")
            for col in df.columns:
                assert col in text

        with tm.decompress_file(filename, compression) as fh:
            tm.assert_frame_equal(df, read_csv(fh, index_col=0, encoding=encoding))

def test_compression_size_fh(obj, method, compression_only):
    with tm.ensure_clean() as path:
        with icom.get_handle(
            path,
            "w:gz" if compression_only == "tar" else "w",
            compression=compression_only,
        ) as handles:
            getattr(obj, method)(handles.handle)
            assert not handles.handle.closed
        compressed_size = os.path.getsize(path)

    with tm.ensure_clean() as path:
        with icom.get_handle(path, "w", compression=None) as handles:
            getattr(obj, method)(handles.handle)
            assert not handles.handle.closed
        uncompressed_size = os.path.getsize(path)

    assert uncompressed_size > compressed_size

def _get_data_from_filepath(self, filepath_or_buffer):
    """
    The function read_json accepts three input types:
        1. filepath (string-like)
        2. file-like object (e.g. open file object, StringIO)
        3. JSON string

    This method turns (1) into (2) to simplify the rest of the processing.
    It returns input types (2) and (3) unchanged.
    """
    # if it is a string but the file does not exist, it might be a JSON string
    exists = False
    if isinstance(filepath_or_buffer, str):
        try:
            exists = os.path.exists(filepath_or_buffer)
        # gh-5874: if the filepath is too long will raise here
        except (TypeError, ValueError):
            pass

    if exists or not isinstance(filepath_or_buffer, str):
        self.handles = get_handle(
            filepath_or_buffer,
            "r",
            encoding=self.encoding,
            compression=self.compression,
        )
        filepath_or_buffer = self.handles.handle

    return filepath_or_buffer

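# Hedged illustration, not part of the pandas source: how the three accepted
# input kinds map onto the branches above.  "example.json" is a hypothetical
# file name, and the raw-JSON-string form reflects the pandas version this
# method comes from (newer versions prefer wrapping the string in StringIO).
import io

import pandas as pd

frame = pd.DataFrame({"a": [1, 2], "b": [3, 4]})
frame.to_json("example.json")

pd.read_json("example.json")                # (1) filepath -> opened via get_handle
with open("example.json") as fh:
    pd.read_json(fh)                        # (2) file-like object, used as-is
pd.read_json(io.StringIO(frame.to_json())) # (2) StringIO also counts as file-like
pd.read_json(frame.to_json())               # (3) JSON string, returned unchanged
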
def read(cls, path_or_buff, *, mode: str = "r", **kwargs) -> str:
    """
    Reads using pandas's ``get_handle``.

    By default (unless ``compression=`` is set), infers the compression type
    from the filename suffix (e.g. ``.csv.gz``).
    """
    kwargs = {**dict(compression="infer"), **kwargs}
    with get_handle(path_or_buff, mode, **kwargs) as f:
        return f.handle.read()

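# Hedged illustration of the "infer" behaviour relied on above: opening
# "data.csv.gz" for writing through get_handle produces a gzip-compressed file
# without compression= being passed explicitly.  The file name is hypothetical.
import gzip

from pandas.io.common import get_handle

with get_handle("data.csv.gz", "w", compression="infer") as handles:
    handles.handle.write("a,b\n1,2\n")

with gzip.open("data.csv.gz", "rt") as fh:
    assert fh.read() == "a,b\n1,2\n"
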
def test_compression_warning(compression_only):
    # Assert that passing a file object to to_csv while explicitly specifying a
    # compression protocol triggers a RuntimeWarning, as per GH21227.
    df = pd.DataFrame(
        100 * [[0.123456, 0.234567, 0.567567], [12.32112, 123123.2, 321321.2]],
        columns=["X", "Y", "Z"],
    )
    with tm.ensure_clean() as path:
        with icom.get_handle(path, "w", compression=compression_only) as handles:
            with tm.assert_produces_warning(RuntimeWarning, check_stacklevel=False):
                df.to_csv(handles.handle, compression=compression_only)

def from_json(cls, path_or_buf=None):
    """
    Returns an ACN-Sim object loaded from in_registry.

    Note URLs have not been tested as path_or_buf input.

    Args:
        path_or_buf (Union[str, FilePathOrBuffer]): a valid JSON str,
            path object or file-like object. Any valid string path is
            acceptable.
    """
    # The code here is from pandas 1.0.1, io.json.from_json(), with
    # modifications.
    filepath_or_buffer, _, _, should_close = get_filepath_or_buffer(path_or_buf)

    exists = False
    if isinstance(filepath_or_buffer, str):
        try:
            exists = os.path.exists(filepath_or_buffer)
        except (TypeError, ValueError):
            pass

    if exists:
        filepath_or_buffer, _ = get_handle(filepath_or_buffer, "r")
        should_close = True

    if isinstance(filepath_or_buffer, str):
        should_close = False
        out_registry = json.loads(filepath_or_buffer)
    else:
        out_registry = json.load(filepath_or_buffer)

    if should_close:
        filepath_or_buffer.close()

    if out_registry["version"] is None:
        warnings.warn(
            "Missing a recorded version of acnportal in the "
            "loaded registry. Object may have been dumped with a "
            "different version of acnportal.",
            UserWarning,
        )

    if out_registry["dependency_versions"] is None:
        warnings.warn(
            "Missing recorded dependency versions of acnportal in "
            "the loaded registry. Object may have been dumped "
            "with different dependency versions of acnportal.",
            UserWarning,
        )

    out_obj = cls._from_registry(out_registry)[0]
    return out_obj

def read(
    self, path, columns=None, storage_options: StorageOptions = None, **kwargs
):
    path = stringify_path(path)
    handles = None
    fs = kwargs.pop("filesystem", None)
    if is_fsspec_url(path) and fs is None:
        fsspec = import_optional_dependency("fsspec")
        fs, path = fsspec.core.url_to_fs(path, **(storage_options or {}))
    elif storage_options:
        raise ValueError("storage_options passed with buffer or non-fsspec filepath")

    if not fs and isinstance(path, str) and not os.path.isdir(path):
        # use get_handle only when we are very certain that it is not a directory
        # fsspec resources can also point to directories
        # this branch is used for example when reading from non-fsspec URLs
        handles = get_handle(path, "rb", is_text=False)
        path = handles.handle

    kwargs["use_pandas_metadata"] = True
    result = self.api.parquet.read_table(
        path, columns=columns, filesystem=fs, **kwargs
    ).to_pandas()

    if handles is not None:
        handles.close()
    return result

def save(self) -> None:
    """
    Create the writer & save.
    """
    # GH21227 internal compression is not used for non-binary handles.
    if (
        self.compression
        and hasattr(self.path_or_buf, "write")
        and "b" not in self.mode
    ):
        warnings.warn(
            "compression has no effect when passing a non-binary object as input.",
            RuntimeWarning,
            stacklevel=2,
        )
        self.compression = None

    # get a handle or wrap an existing handle to take care of 1) compression and
    # 2) text -> byte conversion
    f, handles = get_handle(
        self.path_or_buf,
        self.mode,
        encoding=self.encoding,
        errors=self.errors,
        compression=dict(self.compression_args, method=self.compression),
    )

    try:
        # Note: self.encoding is irrelevant here
        self.writer = csvlib.writer(
            f,
            lineterminator=self.line_terminator,
            delimiter=self.sep,
            quoting=self.quoting,
            doublequote=self.doublequote,
            escapechar=self.escapechar,
            quotechar=self.quotechar,
        )

        self._save()

    finally:
        if self.should_close:
            f.close()
        elif (
            isinstance(f, TextIOWrapper)
            and not f.closed
            and f != self.path_or_buf
            and hasattr(self.path_or_buf, "write")
        ):
            # get_handle uses TextIOWrapper for non-binary handles. TextIOWrapper
            # closes the wrapped handle if it is not detached.
            f.flush()  # make sure everything is written
            f.detach()  # makes f unusable
            del f
        elif f != self.path_or_buf:
            f.close()
        for _fh in handles:
            _fh.close()

def read_feather(
    path,
    columns=None,
    use_threads: bool = True,
    storage_options: StorageOptions = None,
):
    """
    Load a feather-format object from the file path.

    Parameters
    ----------
    path : str, path object or file-like object
        Any valid string path is acceptable. The string could be a URL. Valid
        URL schemes include http, ftp, s3, and file. For file URLs, a host is
        expected. A local file could be:
        ``file://localhost/path/to/table.feather``.

        If you want to pass in a path object, pandas accepts any
        ``os.PathLike``.

        By file-like object, we refer to objects with a ``read()`` method,
        such as a file handle (e.g. via builtin ``open`` function)
        or ``StringIO``.
    columns : sequence, default None
        If not provided, all columns are read.

        .. versionadded:: 0.24.0
    use_threads : bool, default True
        Whether to parallelize reading using multiple threads.

        .. versionadded:: 0.24.0
    storage_options : dict, optional
        Extra options that make sense for a particular storage connection, e.g.
        host, port, username, password, etc., if using a URL that will
        be parsed by ``fsspec``, e.g., starting "s3://", "gcs://". An error
        will be raised if providing this argument with a local path or
        a file-like buffer. See the fsspec and backend storage implementation
        docs for the set of allowed keys and values.

        .. versionadded:: 1.2.0

    Returns
    -------
    type of object stored in file
    """
    import_optional_dependency("pyarrow")
    from pyarrow import feather

    with get_handle(
        path, "rb", storage_options=storage_options, is_text=False
    ) as handles:
        return feather.read_feather(
            handles.handle, columns=columns, use_threads=bool(use_threads)
        )

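# Hedged usage example (the file name is hypothetical): round-trip a frame
# through the feather format and read a column subset back with read_feather.
# Writing feather files requires pyarrow to be installed.
import pandas as pd

frame = pd.DataFrame({"a": [1, 2, 3], "b": ["x", "y", "z"]})
frame.to_feather("example.feather")

subset = pd.read_feather("example.feather", columns=["a"])
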