Example #1
    def __init__(
        self,
        filepath_or_buffer: FilePath | ReadBuffer[bytes],
        index=None,
        encoding: str | None = "ISO-8859-1",
        chunksize=None,
        compression: CompressionOptions = "infer",
    ) -> None:

        self._encoding = encoding
        self._lines_read = 0
        self._index = index
        self._chunksize = chunksize

        self.handles = get_handle(
            filepath_or_buffer,
            "rb",
            encoding=encoding,
            is_text=False,
            compression=compression,
        )
        self.filepath_or_buffer = self.handles.handle

        try:
            self._read_header()
        except Exception:
            self.close()
            raise
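
This constructor delegates all file opening to get_handle, so compression and buffers are handled uniformly. A minimal sketch of the same pattern, assuming a local file whose name is made up for illustration:

from pandas.io.common import get_handle

# Open a (possibly compressed) file as a raw binary handle; compression
# is inferred from the ".gz" suffix. handles.close() also closes anything
# get_handle opened on our behalf.
handles = get_handle("data.xpt.gz", "rb", is_text=False, compression="infer")
try:
    header = handles.handle.read(80)  # first 80 bytes, as raw bytes
finally:
    handles.close()
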
Example #2
def _get_path_or_handle(
    path: FilePathOrBuffer,
    fs: Any,
    storage_options: StorageOptions = None,
    mode: str = "rb",
    is_dir: bool = False,
) -> Tuple[FilePathOrBuffer, Optional[IOHandles], Any]:
    """File handling for PyArrow."""
    path_or_handle = stringify_path(path)
    if is_fsspec_url(path_or_handle) and fs is None:
        fsspec = import_optional_dependency("fsspec")

        fs, path_or_handle = fsspec.core.url_to_fs(path_or_handle,
                                                   **(storage_options or {}))
    elif storage_options:
        raise ValueError(
            "storage_options passed with buffer or non-fsspec filepath")

    handles = None
    if (not fs and not is_dir and isinstance(path_or_handle, str)
            and not os.path.isdir(path_or_handle)):
        # use get_handle only when we are very certain that it is not a directory
        # fsspec resources can also point to directories
        # this branch is used for example when reading from non-fsspec URLs
        handles = get_handle(path_or_handle, mode, is_text=False)
        fs = None
        path_or_handle = handles.handle
    return path_or_handle, handles, fs
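
The fsspec branch above turns a URL into a filesystem object plus an in-filesystem path. A small illustration of what fsspec.core.url_to_fs returns, assuming fsspec is installed (memory:// is fsspec's built-in in-memory filesystem):

import fsspec

# Split a URL into (filesystem, path-within-filesystem). Local paths
# yield a LocalFileSystem; "s3://..." would yield an S3 filesystem.
fs, path = fsspec.core.url_to_fs("memory://bucket/data.parquet")
with fs.open(path, "wb") as f:
    f.write(b"payload")
assert fs.cat(path) == b"payload"
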
Example #3
    def test_to_csv_compression(self, df, encoding, compression):

        with tm.ensure_clean() as filename:

            df.to_csv(filename, compression=compression, encoding=encoding)
            # test the round trip - to_csv -> read_csv
            result = read_csv(
                filename, compression=compression, index_col=0, encoding=encoding
            )
            tm.assert_frame_equal(df, result)

            # test the round trip using file handle - to_csv -> read_csv
            with get_handle(
                filename, "w", compression=compression, encoding=encoding
            ) as handles:
                df.to_csv(handles.handle, encoding=encoding)
                assert not handles.handle.closed

            result = read_csv(
                filename,
                compression=compression,
                encoding=encoding,
                index_col=0,
            ).squeeze("columns")
            tm.assert_frame_equal(df, result)

            # explicitly make sure file is compressed
            with tm.decompress_file(filename, compression) as fh:
                text = fh.read().decode(encoding or "utf8")
                for col in df.columns:
                    assert col in text

            with tm.decompress_file(filename, compression) as fh:
                tm.assert_frame_equal(df, read_csv(fh, index_col=0, encoding=encoding))
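
Stripped of the test scaffolding, the round trip above reduces to a few lines. A minimal sketch, assuming gzip compression and a throwaway frame:

import pandas as pd

df = pd.DataFrame({"a": [1, 2], "b": [3.0, 4.0]})
df.to_csv("roundtrip.csv.gz", compression="gzip", encoding="utf-8")
result = pd.read_csv("roundtrip.csv.gz", compression="gzip",
                     index_col=0, encoding="utf-8")
pd.testing.assert_frame_equal(df, result)
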
Example #4
def test_compression_size_fh(obj, method, compression_only):
    with tm.ensure_clean() as path:
        with icom.get_handle(
            path,
            "w:gz" if compression_only == "tar" else "w",
            compression=compression_only,
        ) as handles:
            getattr(obj, method)(handles.handle)
            assert not handles.handle.closed
        compressed_size = os.path.getsize(path)
    with tm.ensure_clean() as path:
        with icom.get_handle(path, "w", compression=None) as handles:
            getattr(obj, method)(handles.handle)
            assert not handles.handle.closed
        uncompressed_size = os.path.getsize(path)
        assert uncompressed_size > compressed_size
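
The "w:gz" special case in the tar branch mirrors tarfile's convention, where the archive mode and its inner compression are combined in one mode string. A standard-library illustration (file names are hypothetical, and data.csv is assumed to exist):

import tarfile

# tarfile mode strings combine archive mode with compression:
# "w" -> uncompressed tar, "w:gz" -> gzip-compressed tar
with tarfile.open("bundle.tar.gz", "w:gz") as tar:
    tar.add("data.csv", arcname="data.csv")
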
Example #5
    def _get_data_from_filepath(self, filepath_or_buffer):
        """
        The function read_json accepts three input types:
            1. filepath (string-like)
            2. file-like object (e.g. open file object, StringIO)
            3. JSON string

        This method turns (1) into (2) to simplify the rest of the processing.
        It returns input types (2) and (3) unchanged.
        """
        # if it is a string but the file does not exist, it might be a JSON string
        exists = False
        if isinstance(filepath_or_buffer, str):
            try:
                exists = os.path.exists(filepath_or_buffer)
            # gh-5874: if the filepath is too long will raise here
            except (TypeError, ValueError):
                pass

        if exists or not isinstance(filepath_or_buffer, str):
            self.handles = get_handle(
                filepath_or_buffer,
                "r",
                encoding=self.encoding,
                compression=self.compression,
            )
            filepath_or_buffer = self.handles.handle

        return filepath_or_buffer
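
The three input types in the docstring map directly onto pd.read_json. A quick sketch, assuming a recent pandas (which deprecates passing literal JSON strings in favor of wrapping them in StringIO):

from io import StringIO
import pandas as pd

payload = '{"a": {"0": 1, "1": 2}}'
with open("data.json", "w") as f:
    f.write(payload)

df_from_path = pd.read_json("data.json")        # (1) filepath
df_from_file = pd.read_json(StringIO(payload))  # (2) file-like object
# (3) a raw JSON string also works on older pandas; newer versions emit
# a FutureWarning and ask for the StringIO wrapping shown above.
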
Example #6
    @classmethod
    def read(cls, path_or_buff, *, mode: str = "r", **kwargs) -> str:
        """
        Read using pandas's ``get_handle``.

        By default (unless ``compression=`` is set), the compression
        type is inferred from the filename suffix (e.g. ``.csv.gz``).
        """
        kwargs = {"compression": "infer", **kwargs}
        with get_handle(path_or_buff, mode, **kwargs) as f:
            return f.handle.read()
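
The helper above infers compression from the suffix unless the caller overrides it. A minimal end-to-end check of that behavior using get_handle directly (the file name is made up):

import gzip
from pandas.io.common import get_handle

with gzip.open("notes.txt.gz", "wt", encoding="utf-8") as f:
    f.write("hello world")

# compression="infer" picks gzip from the ".gz" suffix; in text mode,
# handle.read() yields decoded str.
with get_handle("notes.txt.gz", "r", compression="infer") as handles:
    assert handles.handle.read() == "hello world"
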
Example #7
def test_compression_warning(compression_only):
    # Assert that passing a file object to to_csv while explicitly specifying a
    # compression protocol triggers a RuntimeWarning, as per GH21227.
    df = pd.DataFrame(
        100 * [[0.123456, 0.234567, 0.567567], [12.32112, 123123.2, 321321.2]],
        columns=["X", "Y", "Z"],
    )
    with tm.ensure_clean() as path:
        with icom.get_handle(path, "w", compression=compression_only) as handles:
            with tm.assert_produces_warning(RuntimeWarning, check_stacklevel=False):
                df.to_csv(handles.handle, compression=compression_only)
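
The same RuntimeWarning fires outside the test harness whenever a compression method is combined with an already-open text-mode handle, since compression can only be applied to binary output. A minimal reproduction, assuming a scratch path:

import warnings
import pandas as pd

df = pd.DataFrame({"x": [1, 2, 3]})
with open("out.csv", "w") as f:                   # text-mode handle
    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always")
        df.to_csv(f, compression="gzip")          # compression is ignored
assert any(issubclass(w.category, RuntimeWarning) for w in caught)
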
Example #8
    def from_json(cls, path_or_buf=None):
        """ Returns an ACN-Sim object loaded from in_registry.
        Note URLs have not been tested as path_or_buf input.

        Args:
            path_or_buf (Union[str, FilePathOrBuffer]): a valid JSON
                str, path object or file-like object. Any valid string
                path is acceptable.
        """
        # The code here is from pandas 1.0.1, io.json.from_json(), with
        # modifications.
        filepath_or_buffer, _, _, should_close = get_filepath_or_buffer(
            path_or_buf)

        exists = False
        if isinstance(filepath_or_buffer, str):
            try:
                exists = os.path.exists(filepath_or_buffer)
            except (TypeError, ValueError):
                pass

        if exists:
            filepath_or_buffer, _ = get_handle(filepath_or_buffer, "r")
            should_close = True

        if isinstance(filepath_or_buffer, str):
            should_close = False
            out_registry = json.loads(filepath_or_buffer)
        else:
            out_registry = json.load(filepath_or_buffer)
        if should_close:
            filepath_or_buffer.close()

        if out_registry["version"] is None:
            warnings.warn(
                "Missing a recorded version of acnportal in the "
                "loaded registry. Object may have been dumped with a "
                "different version of acnportal.",
                UserWarning,
            )
        if out_registry["dependency_versions"] is None:
            warnings.warn(
                "Missing recorded dependency versions of acnportal in "
                "the loaded registry. Object may have been dumped "
                "with different dependency versions of acnportal.",
                UserWarning,
            )

        out_obj = cls._from_registry(out_registry)[0]
        return out_obj
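
The final str branch is what lets a raw JSON string through: json.loads parses a string, while json.load reads from an open handle. A standard-library illustration:

import io
import json

registry = {"version": "1.0", "dependency_versions": {"numpy": "1.24"}}
text = json.dumps(registry)

assert json.loads(text) == registry               # parse from a str
assert json.load(io.StringIO(text)) == registry   # parse from a handle
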
Example #9
    def read(self,
             path,
             columns=None,
             storage_options: StorageOptions = None,
             **kwargs):
        path = stringify_path(path)
        handles = None
        fs = kwargs.pop("filesystem", None)
        if is_fsspec_url(path) and fs is None:
            fsspec = import_optional_dependency("fsspec")

            fs, path = fsspec.core.url_to_fs(path, **(storage_options or {}))
        elif storage_options:
            raise ValueError(
                "storage_options passed with buffer or non-fsspec filepath")
        if not fs and isinstance(path, str) and not os.path.isdir(path):
            # use get_handle only when we are very certain that it is not a directory
            # fsspec resources can also point to directories
            # this branch is used for example when reading from non-fsspec URLs
            handles = get_handle(path, "rb", is_text=False)
            path = handles.handle

        kwargs["use_pandas_metadata"] = True
        result = self.api.parquet.read_table(path,
                                             columns=columns,
                                             filesystem=fs,
                                             **kwargs).to_pandas()

        if handles is not None:
            handles.close()

        return result
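
This read() ultimately wraps pyarrow.parquet.read_table, which itself accepts either a path or an open binary handle. A minimal sketch of both call styles, assuming pyarrow is installed and the file name is made up:

import pandas as pd
import pyarrow.parquet as pq

pd.DataFrame({"a": [1, 2]}).to_parquet("data.parquet")

# Path input: pyarrow opens and closes the file itself.
df = pq.read_table("data.parquet", columns=["a"],
                   use_pandas_metadata=True).to_pandas()

# File-like input: the caller owns the handle, as in the method above.
with open("data.parquet", "rb") as f:
    df2 = pq.read_table(f).to_pandas()
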
Example #10
    def save(self) -> None:
        """
        Create the writer & save.
        """
        # GH21227 internal compression is not used for non-binary handles.
        if (
            self.compression
            and hasattr(self.path_or_buf, "write")
            and "b" not in self.mode
        ):
            warnings.warn(
                "compression has no effect when passing a non-binary object as input.",
                RuntimeWarning,
                stacklevel=2,
            )
            self.compression = None

        # get a handle or wrap an existing handle to take care of 1) compression and
        # 2) text -> byte conversion
        f, handles = get_handle(
            self.path_or_buf,
            self.mode,
            encoding=self.encoding,
            errors=self.errors,
            compression=dict(self.compression_args, method=self.compression),
        )

        try:
            # Note: self.encoding is irrelevant here
            self.writer = csvlib.writer(
                f,
                lineterminator=self.line_terminator,
                delimiter=self.sep,
                quoting=self.quoting,
                doublequote=self.doublequote,
                escapechar=self.escapechar,
                quotechar=self.quotechar,
            )

            self._save()

        finally:
            if self.should_close:
                f.close()
            elif (
                isinstance(f, TextIOWrapper)
                and not f.closed
                and f != self.path_or_buf
                and hasattr(self.path_or_buf, "write")
            ):
                # get_handle uses TextIOWrapper for non-binary handles. TextIOWrapper
                # closes the wrapped handle if it is not detached.
                f.flush()  # make sure everything is written
                f.detach()  # makes f unusable
                del f
            elif f != self.path_or_buf:
                f.close()
            for _fh in handles:
                _fh.close()
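
The detach() branch works because TextIOWrapper closes whatever it wraps when it is closed or garbage collected; detaching first hands the caller-owned binary handle back intact. A standard-library sketch:

import io

buf = io.BytesIO()                     # stands in for the caller's handle
wrapper = io.TextIOWrapper(buf, encoding="utf-8")
wrapper.write("x,y\n1,2\n")
wrapper.flush()    # make sure everything reaches the underlying buffer
wrapper.detach()   # wrapper is now unusable, but buf survives
del wrapper
assert not buf.closed
assert buf.getvalue() == b"x,y\n1,2\n"
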
Example #11
def read_feather(path,
                 columns=None,
                 use_threads: bool = True,
                 storage_options: StorageOptions = None):
    """
    Load a feather-format object from the file path.

    Parameters
    ----------
    path : str, path object or file-like object
        Any valid string path is acceptable. The string could be a URL. Valid
        URL schemes include http, ftp, s3, and file. For file URLs, a host is
        expected. A local file could be:
        ``file://localhost/path/to/table.feather``.

        If you want to pass in a path object, pandas accepts any
        ``os.PathLike``.

        By file-like object, we refer to objects with a ``read()`` method,
        such as a file handle (e.g. via builtin ``open`` function)
        or ``StringIO``.
    columns : sequence, default None
        If not provided, all columns are read.

        .. versionadded:: 0.24.0
    use_threads : bool, default True
        Whether to parallelize reading using multiple threads.

        .. versionadded:: 0.24.0
    storage_options : dict, optional
        Extra options that make sense for a particular storage connection, e.g.
        host, port, username, password, etc., if using a URL that will
        be parsed by ``fsspec``, e.g., starting "s3://", "gcs://". An error
        will be raised if providing this argument with a local path or
        a file-like buffer. See the fsspec and backend storage implementation
        docs for the set of allowed keys and values.

        .. versionadded:: 1.2.0

    Returns
    -------
    type of object stored in file
    """
    import_optional_dependency("pyarrow")
    from pyarrow import feather

    with get_handle(path, "rb", storage_options=storage_options,
                    is_text=False) as handles:

        return feather.read_feather(handles.handle,
                                    columns=columns,
                                    use_threads=bool(use_threads))
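
In practice, storage_options only applies to fsspec URLs; with a local path or open handle it must be omitted. A hypothetical call against a public S3 bucket (bucket and key are made up; "anon" is an s3fs option for unauthenticated access):

import pandas as pd

df = pd.read_feather(
    "s3://example-bucket/table.feather",
    columns=["a", "b"],
    storage_options={"anon": True},
)

# Local read: passing storage_options here would raise ValueError.
df_local = pd.read_feather("table.feather")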