Beispiel #1
0
    def _build_doc(self):
        """
        Build an lxml element tree from ``self.io``.

        Raises
        ------
        ValueError
            * If a URL that lxml cannot parse is passed.

        Exception
            * Any other ``Exception`` thrown. For example, trying to parse a
              URL that is syntactically correct on a machine with no internet
              connection will fail.

        See Also
        --------
        pandas.io.html._HtmlFrameParser._build_doc
        """
        from lxml.etree import XMLSyntaxError
        from lxml.html import (
            HTMLParser,
            fromstring,
            parse,
        )

        # recover=True lets lxml tolerate malformed markup instead of raising.
        parser = HTMLParser(recover=True, encoding=self.encoding)

        try:
            if is_url(self.io):
                with urlopen(self.io) as f:
                    r = parse(f, parser=parser)
            else:
                # try to parse the input in the simplest way
                r = parse(self.io, parser=parser)
            try:
                # parse() returns an ElementTree; unwrap to the root element.
                r = r.getroot()
            except AttributeError:
                pass
        except (UnicodeDecodeError, OSError) as e:
            # if the input is a blob of html goop
            if not is_url(self.io):
                r = fromstring(self.io, parser=parser)

                try:
                    r = r.getroot()
                except AttributeError:
                    pass
            else:
                # A URL that failed to open/decode is a genuine error.
                raise e
        else:
            # Parsing succeeded but yielded nothing usable as an HTML element.
            if not hasattr(r, "text_content"):
                raise XMLSyntaxError("no text parsed from document", 0, 0, 0)

        # Preserve <br> line breaks as newlines in later text extraction.
        for br in r.xpath("*//br"):
            br.tail = "\n" + (br.tail or "")

        return r
Beispiel #2
0
    def _get_data_from_filepath(self, filepath_or_buffer):
        """
        The function read_json accepts three input types:
            1. filepath (string-like)
            2. file-like object (e.g. open file object, StringIO)
            3. JSON string

        This method turns (1) into (2) to simplify the rest of the processing.
        It returns input types (2) and (3) unchanged.
        """
        # if it is a string but the file does not exist, it might be a JSON string
        filepath_or_buffer = stringify_path(filepath_or_buffer)
        if (not isinstance(filepath_or_buffer, str)
                or is_url(filepath_or_buffer)
                or is_fsspec_url(filepath_or_buffer)
                or file_exists(filepath_or_buffer)):
            self.handles = get_handle(
                filepath_or_buffer,
                "r",
                encoding=self.encoding,
                compression=self.compression,
                storage_options=self.storage_options,
                errors=self.encoding_errors,
            )
            filepath_or_buffer = self.handles.handle

        return filepath_or_buffer
Beispiel #3
0
def _get_path_or_handle(
    path: FilePathOrBuffer,
    fs: Any,
    storage_options: StorageOptions = None,
    mode: str = "rb",
    is_dir: bool = False,
) -> tuple[FilePathOrBuffer, IOHandles | None, Any]:
    """File handling for PyArrow."""
    path_or_handle = stringify_path(path)
    if fs is None and is_fsspec_url(path_or_handle):
        # Resolve the fsspec URL into a filesystem object plus a plain path.
        fsspec = import_optional_dependency("fsspec")

        fs, path_or_handle = fsspec.core.url_to_fs(
            path_or_handle, **(storage_options or {})
        )
    elif storage_options and (mode != "rb" or not is_url(path_or_handle)):
        # can't write to a remote url
        # without making use of fsspec at the moment
        raise ValueError(
            "storage_options passed with buffer, or non-supported URL")

    handles = None
    needs_handle = (
        not fs
        and not is_dir
        and isinstance(path_or_handle, str)
        and not os.path.isdir(path_or_handle)
    )
    if needs_handle:
        # use get_handle only when we are very certain that it is not a directory
        # fsspec resources can also point to directories
        # this branch is used for example when reading from non-fsspec URLs
        handles = get_handle(
            path_or_handle, mode, is_text=False, storage_options=storage_options
        )
        fs = None
        path_or_handle = handles.handle
    return path_or_handle, handles, fs
Beispiel #4
0
def _read(obj):
    """
    Try to read from a url, file or string.

    Parameters
    ----------
    obj : str, unicode, or file-like

    Returns
    -------
    raw_text : str
    """
    if is_url(obj):
        with urlopen(obj) as url:
            text = url.read()
    elif hasattr(obj, "read"):
        text = obj.read()
    elif isinstance(obj, (str, bytes)):
        text = obj
        try:
            if os.path.isfile(text):
                with open(text, "rb") as f:
                    return f.read()
        except (TypeError, ValueError):
            pass
    else:
        raise TypeError(f"Cannot read object of type '{type(obj).__name__}'")
    return text
Beispiel #5
0
    def __init__(self,
                 filepath_or_buffer,
                 storage_options: StorageOptions = None):
        """Resolve *filepath_or_buffer* and load it as a workbook."""
        # URLs are downloaded eagerly into an in-memory buffer.
        if is_url(filepath_or_buffer):
            filepath_or_buffer = BytesIO(urlopen(filepath_or_buffer).read())
        elif not isinstance(filepath_or_buffer,
                            (ExcelFile, self._workbook_class)):
            ioargs = get_filepath_or_buffer(
                filepath_or_buffer, storage_options=storage_options)
            filepath_or_buffer = ioargs.filepath_or_buffer

        if isinstance(filepath_or_buffer, self._workbook_class):
            # Already a parsed workbook: adopt it unchanged.
            self.book = filepath_or_buffer
        elif hasattr(filepath_or_buffer, "read"):
            # N.B. xlrd.Book has a read attribute too
            filepath_or_buffer.seek(0)
            self.book = self.load_workbook(filepath_or_buffer)
        elif isinstance(filepath_or_buffer, (str, bytes)):
            # Paths load directly; raw bytes are wrapped in a buffer first.
            source = (BytesIO(filepath_or_buffer)
                      if isinstance(filepath_or_buffer, bytes)
                      else filepath_or_buffer)
            self.book = self.load_workbook(source)
        else:
            raise ValueError(
                "Must explicitly set engine if not passing in buffer or path for io."
            )
Beispiel #6
0
    def _write_cell(
        self, s: Any, kind: str = "td", indent: int = 0, tags: Optional[str] = None
    ) -> None:
        if tags is not None:
            start_tag = "<{kind} {tags}>".format(kind=kind, tags=tags)
        else:
            start_tag = "<{kind}>".format(kind=kind)

        if self.escape:
            # escape & first to prevent double escaping of &
            esc = {"&": r"&amp;", "<": r"&lt;", ">": r"&gt;"}
        else:
            esc = {}

        rs = pprint_thing(s, escape_chars=esc).strip()

        if self.render_links and is_url(rs):
            rs_unescaped = pprint_thing(s, escape_chars={}).strip()
            start_tag += '<a href="{url}" target="_blank">'.format(url=rs_unescaped)
            end_a = "</a>"
        else:
            end_a = ""

        self.write(
            "{start}{rs}{end_a}</{kind}>".format(
                start=start_tag, rs=rs, end_a=end_a, kind=kind
            ),
            indent,
        )
Beispiel #7
0
    def _write_cell(self,
                    s: Any,
                    kind: str = "td",
                    indent: int = 0,
                    tags: str | None = None) -> None:
        if tags is not None:
            start_tag = f"<{kind} {tags}>"
        else:
            start_tag = f"<{kind}>"

        if self.escape:
            # escape & first to prevent double escaping of &
            esc = {"&": r"&amp;", "<": r"&lt;", ">": r"&gt;"}
        else:
            esc = {}

        rs = pprint_thing(s, escape_chars=esc).strip()

        if self.render_links and is_url(rs):
            rs_unescaped = pprint_thing(s, escape_chars={}).strip()
            start_tag += f'<a href="{rs_unescaped}" target="_blank">'
            end_a = "</a>"
        else:
            end_a = ""

        self.write(f"{start_tag}{rs}{end_a}</{kind}>", indent)
Beispiel #8
0
def _read(
    obj: bytes | FilePath | ReadBuffer[str] | ReadBuffer[bytes], encoding: str | None
) -> str | bytes:
    """
    Try to read from a url, file or string.

    Parameters
    ----------
    obj : str, unicode, path object, or file-like object

    Returns
    -------
    raw_text : str
    """
    text: str | bytes
    if (
        is_url(obj)
        or hasattr(obj, "read")
        or (isinstance(obj, str) and file_exists(obj))
    ):
        # error: Argument 1 to "get_handle" has incompatible type "Union[str, bytes,
        # Union[IO[Any], RawIOBase, BufferedIOBase, TextIOBase, TextIOWrapper, mmap]]";
        # expected "Union[PathLike[str], Union[str, Union[IO[Any], RawIOBase,
        # BufferedIOBase, TextIOBase, TextIOWrapper, mmap]]]"
        with get_handle(
            obj, "r", encoding=encoding  # type: ignore[arg-type]
        ) as handles:
            text = handles.handle.read()
    elif isinstance(obj, (str, bytes)):
        text = obj
    else:
        raise TypeError(f"Cannot read object of type '{type(obj).__name__}'")
    return text
Beispiel #9
0
 def _check_source_format(self, src):
     src = self.source
     if is_url(src):
         fmt = 'url'
     elif is_file_like(src):
         fmt = 'filelike'
     elif is_fsspec_url(src):
         fmt = 's3'
     else:
         fmt = 'invalid'
     return fmt
Beispiel #10
0
def get_sbml_model(
    filepath_or_buffer
) -> Tuple[libsbml.SBMLReader, libsbml.SBMLDocument, libsbml.Model]:
    """Get an SBML model from file or URL or file handle

    :param filepath_or_buffer:
        File or URL or file handle to read the model from
    :return: The SBML document, model and reader
    """

    from pandas.io.common import get_filepath_or_buffer, is_url, is_file_like

    if not (is_file_like(filepath_or_buffer) or is_url(filepath_or_buffer)):
        # A plain filesystem path: let libsbml read it directly.
        return load_sbml_from_file(filepath_or_buffer)

    # URL or already opened file, we will load the model from a string
    buffer = get_filepath_or_buffer(filepath_or_buffer, mode='r')[0]
    if is_url(filepath_or_buffer):
        # urlopen yields byte chunks which must be decoded first.
        content = ''.join(chunk.decode('utf-8') for chunk in buffer)
    else:
        content = ''.join(buffer)

    return load_sbml_from_string(content)
Beispiel #11
0
def get_sbml_model(
        filepath_or_buffer
) -> Tuple[libsbml.SBMLReader, libsbml.SBMLDocument, libsbml.Model]:
    """Get an SBML model from file or URL or file handle

    :param filepath_or_buffer:
        File or URL or file handle to read the model from
    :return: The SBML document, model and reader
    """
    if not (is_file_like(filepath_or_buffer) or is_url(filepath_or_buffer)):
        # Plain path on disk: libsbml can open it itself.
        return load_sbml_from_file(filepath_or_buffer)

    # URL or already opened file: slurp the content and parse from a string.
    with get_handle(filepath_or_buffer, mode='r') as io_handle:
        return load_sbml_from_string(''.join(io_handle.handle))
Beispiel #12
0
def get_data_from_filepath(
    filepath_or_buffer: FilePath | bytes | ReadBuffer[bytes] | ReadBuffer[str],
    encoding,
    compression: CompressionOptions,
    storage_options: StorageOptions,
) -> str | bytes | ReadBuffer[bytes] | ReadBuffer[str]:
    """
    Extract raw XML data.

    The method accepts three input types:
        1. filepath (string-like)
        2. file-like object (e.g. open file object, StringIO)
        3. XML string or bytes

    This method turns (1) into (2) to simplify the rest of the processing.
    It returns input types (2) and (3) unchanged.
    """
    if not isinstance(filepath_or_buffer, bytes):
        filepath_or_buffer = stringify_path(filepath_or_buffer)

    if (
        isinstance(filepath_or_buffer, str)
        and not filepath_or_buffer.startswith(("<?xml", "<"))
    ) and (
        not isinstance(filepath_or_buffer, str)
        or is_url(filepath_or_buffer)
        or is_fsspec_url(filepath_or_buffer)
        or file_exists(filepath_or_buffer)
    ):
        with get_handle(
            filepath_or_buffer,
            "r",
            encoding=encoding,
            compression=compression,
            storage_options=storage_options,
        ) as handle_obj:
            filepath_or_buffer = (
                # error: Incompatible types in assignment (expression has type
                # "Union[str, IO[str]]", variable has type "Union[Union[str,
                # PathLike[str]], bytes, ReadBuffer[bytes], ReadBuffer[str]]")
                handle_obj.handle.read()  # type: ignore[assignment]
                if hasattr(handle_obj.handle, "read")
                else handle_obj.handle
            )

    return filepath_or_buffer
Beispiel #13
0
    def _get_data_from_filepath(self, filepath_or_buffer):
        """
        The function read_json accepts three input types:
            1. filepath (string-like)
            2. file-like object (e.g. open file object, StringIO)
            3. JSON string

        This method turns (1) into (2) to simplify the rest of the processing.
        It returns input types (2) and (3) unchanged.

        It raises FileNotFoundError if the input is a string ending in
        one of .json, .json.gz, .json.bz2, etc. but no such file exists.
        """
        # if it is a string but the file does not exist, it might be a JSON string
        filepath_or_buffer = stringify_path(filepath_or_buffer)
        if (
            not isinstance(filepath_or_buffer, str)
            or is_url(filepath_or_buffer)
            or is_fsspec_url(filepath_or_buffer)
            or file_exists(filepath_or_buffer)
        ):
            self.handles = get_handle(
                filepath_or_buffer,
                "r",
                encoding=self.encoding,
                compression=self.compression,
                storage_options=self.storage_options,
                errors=self.encoding_errors,
            )
            filepath_or_buffer = self.handles.handle
        elif (
            isinstance(filepath_or_buffer, str)
            and filepath_or_buffer.lower().endswith(
                (".json",) + tuple(f".json{c}" for c in _extension_to_compression)
            )
            and not file_exists(filepath_or_buffer)
        ):
            raise FileNotFoundError(f"File {filepath_or_buffer} does not exist")

        return filepath_or_buffer
Beispiel #14
0
    def __init__(self,
                 filepath_or_buffer,
                 storage_options: StorageOptions = None):
        """Resolve *filepath_or_buffer* into IOArgs and load the workbook."""
        if is_url(filepath_or_buffer):
            # Download the whole resource into memory; we own that buffer,
            # so it must be closed later (should_close=True).
            self.ioargs = IOArgs(
                filepath_or_buffer=BytesIO(urlopen(filepath_or_buffer).read()),
                should_close=True,
                encoding=None,
                mode=None,
                compression={"method": None},
            )
        elif isinstance(filepath_or_buffer,
                        (ExcelFile, self._workbook_class)):
            # Already an Excel object: wrap it unchanged.
            self.ioargs = IOArgs(
                filepath_or_buffer=filepath_or_buffer,
                encoding=None,
                mode=None,
                compression={"method": None},
            )
        else:
            self.ioargs = get_filepath_or_buffer(
                filepath_or_buffer, storage_options=storage_options)

        contents = self.ioargs.filepath_or_buffer
        if isinstance(contents, self._workbook_class):
            self.book = contents
        elif hasattr(contents, "read"):
            # N.B. xlrd.Book has a read attribute too
            assert not isinstance(contents, str)
            contents.seek(0)
            self.book = self.load_workbook(contents)
        elif isinstance(contents, str):
            self.book = self.load_workbook(contents)
        elif isinstance(contents, bytes):
            self.book = self.load_workbook(BytesIO(contents))
        else:
            raise ValueError(
                "Must explicitly set engine if not passing in buffer or path for io."
            )
Beispiel #15
0
def filepath_to_buffer(
    filepath: Any,
    encoding: Optional[str] = None,
    compression: Optional[str] = None,
    timeout: Optional[float] = None,
    start_byte: int = 0,
) -> Tuple[io.IOBase, Optional[str], Optional[str], int]:
    """Resolve *filepath* into (stream, encoding, compression, size)."""
    if not is_str(filepath):
        # Already a buffer-like object; report its own size.
        return cast(io.IOBase, filepath), encoding, compression, filepath.size()

    if is_url(filepath):
        # Honor resumed downloads via an HTTP Range header.
        headers = {"Range": "bytes={}-".format(start_byte)} if start_byte else None
        req = requests.get(filepath, stream=True, headers=headers, timeout=timeout)
        if req.headers.get("Content-Encoding", None) == "gzip":
            compression = "gzip"
        size = req.headers.get("Content-Length", 0)
        return cast(io.IOBase, req.raw), encoding, compression, int(size)

    if is_s3_url(filepath):
        reader, encoding, compression = s3_get_filepath_or_buffer(
            filepath, encoding=encoding, compression=compression
        )
        return cast(io.IOBase, reader), encoding, compression, reader.size

    if _is_buffer_url(filepath):
        buf = _url_to_buffer(filepath)
        return cast(io.IOBase, buf), encoding, compression, buf.size()

    # Local file path.
    filepath = os.path.expanduser(filepath)
    if not os.path.exists(filepath):
        raise ValueError("wrong filepath: {}".format(filepath))
    size = os.stat(filepath).st_size
    stream = io.FileIO(filepath)
    if start_byte:
        stream.seek(start_byte)
    return stream, encoding, compression, size
Beispiel #16
0
def get_data_from_filepath(
    filepath_or_buffer,
    encoding,
    compression,
    storage_options,
) -> Union[str, bytes, Buffer]:
    """
    Extract raw XML data.

    The method accepts three input types:
        1. filepath (string-like)
        2. file-like object (e.g. open file object, StringIO)
        3. XML string or bytes

    This method turns (1) into (2) to simplify the rest of the processing.
    It returns input types (2) and (3) unchanged.
    """
    filepath_or_buffer = stringify_path(filepath_or_buffer)

    # Open only string inputs that are not literal XML and that resolve to
    # a URL, an fsspec resource, or an existing local file.
    should_open = (
        isinstance(filepath_or_buffer, str)
        and not filepath_or_buffer.startswith(("<?xml", "<"))
        and (
            is_url(filepath_or_buffer)
            or is_fsspec_url(filepath_or_buffer)
            or file_exists(filepath_or_buffer)
        )
    )
    if should_open:
        with get_handle(
            filepath_or_buffer,
            "r",
            encoding=encoding,
            compression=compression,
            storage_options=storage_options,
        ) as handle_obj:
            handle = handle_obj.handle
            filepath_or_buffer = (
                handle.read() if hasattr(handle, "read") else handle
            )

    return filepath_or_buffer
Beispiel #17
0
    def _iterparse_nodes(self, iterparse: Callable) -> list[dict[str, str | None]]:
        """
        Iterparse xml nodes.

        This method will read in local disk, decompressed XML files for elements
        and underlying descendants using iterparse, a method to iterate through
        an XML tree without holding entire XML tree in memory.

        Raises
        ------
        TypeError
            * If `iterparse` is not a dict or its dict value is not list-like.
        ParserError
            * If `path_or_buffer` is not a physical, decompressed file on disk.
            * If no data is returned from selected items in `iterparse`.

        Notes
        -----
        Namespace URIs will be removed from return node values. Also,
        elements with missing children or attributes in submitted list
        will have optional keys filled with None values.
        """

        dicts: list[dict[str, str | None]] = []
        row: dict[str, str | None] | None = None

        # Validate the user-supplied iterparse spec: {row_node: [fields...]}.
        if not isinstance(self.iterparse, dict):
            raise TypeError(
                f"{type(self.iterparse).__name__} is not a valid type for iterparse"
            )

        row_node = next(iter(self.iterparse.keys())) if self.iterparse else ""
        if not is_list_like(self.iterparse[row_node]):
            raise TypeError(
                f"{type(self.iterparse[row_node])} is not a valid type "
                "for value in iterparse"
            )

        # iterparse only works on plain, uncompressed files on local disk:
        # reject non-strings, URLs, fsspec paths, literal XML and compressed files.
        if (
            not isinstance(self.path_or_buffer, str)
            or is_url(self.path_or_buffer)
            or is_fsspec_url(self.path_or_buffer)
            or self.path_or_buffer.startswith(("<?xml", "<"))
            or infer_compression(self.path_or_buffer, "infer") is not None
        ):
            raise ParserError(
                "iterparse is designed for large XML files that are fully extracted on "
                "local disk and not as compressed files or online sources."
            )

        for event, elem in iterparse(self.path_or_buffer, events=("start", "end")):
            # Strip any namespace URI prefix ("{uri}tag" -> "tag").
            curr_elem = elem.tag.split("}")[1] if "}" in elem.tag else elem.tag

            if event == "start":
                # A new record begins when the designated row node opens.
                if curr_elem == row_node:
                    row = {}

            if row is not None:
                if self.names:
                    # Collect values under the user-supplied column names.
                    for col, nm in zip(self.iterparse[row_node], self.names):
                        if curr_elem == col:
                            elem_val = elem.text.strip() if elem.text else None
                            if row.get(nm) != elem_val and nm not in row:
                                row[nm] = elem_val
                        if col in elem.attrib:
                            if elem.attrib[col] not in row.values() and nm not in row:
                                row[nm] = elem.attrib[col]
                else:
                    # Collect values under the original element/attribute names.
                    for col in self.iterparse[row_node]:
                        if curr_elem == col:
                            row[col] = elem.text.strip() if elem.text else None
                        if col in elem.attrib:
                            row[col] = elem.attrib[col]

            if event == "end":
                # The row node closed: flush the finished record.
                if curr_elem == row_node and row is not None:
                    dicts.append(row)
                    row = None

                # Free memory for the processed element; for lxml (which has
                # getprevious) also drop siblings kept alive by the parent.
                elem.clear()
                if hasattr(elem, "getprevious"):
                    while (
                        elem.getprevious() is not None and elem.getparent() is not None
                    ):
                        del elem.getparent()[0]

        if dicts == []:
            raise ParserError("No result from selected items in iterparse.")

        # Normalize rows: give every dict the full key set, filling gaps with None.
        keys = list(dict.fromkeys([k for d in dicts for k in d.keys()]))
        dicts = [{k: d[k] if k in d.keys() else None for k in keys} for d in dicts]

        if self.names:
            dicts = [{nm: v for nm, v in zip(self.names, d.values())} for d in dicts]

        return dicts
Beispiel #18
0
    def _read_csv_check_support(
        cls,
        read_csv_kwargs: ReadCsvKwargsType,
    ) -> Tuple[bool, str]:
        """
        Check if passed parameters are supported by current ``modin.pandas.read_csv`` implementation.

        Parameters
        ----------
        read_csv_kwargs : dict
                Parameters of read_csv function.

        Returns
        -------
        bool
            Whether passed parameters are supported or not.
        str
            Error message that should be raised if user explicitly set `engine="arrow"`;
            ``None`` when the parameters are fully supported.
        """
        filepath_or_buffer = read_csv_kwargs.get("filepath_or_buffer", None)
        header = read_csv_kwargs.get("header", "infer")
        names = read_csv_kwargs.get("names", None)
        engine = read_csv_kwargs.get("engine", None)
        skiprows = read_csv_kwargs.get("skiprows", None)
        delimiter = read_csv_kwargs.get("delimiter", None)
        parse_dates = read_csv_kwargs.get("parse_dates", False)

        if read_csv_kwargs.get("compression", "infer") != "infer":
            return (
                False,
                "read_csv with 'arrow' engine doesn't support explicit compression parameter, compression"
                " must be inferred automatically (supported compression types are gzip and bz2)",
            )

        # Only local, existing files are supported by the arrow engine.
        if isinstance(filepath_or_buffer, str):
            if not os.path.exists(filepath_or_buffer):
                if cls.file_exists(filepath_or_buffer) or is_url(filepath_or_buffer):
                    return (
                        False,
                        "read_csv with 'arrow' engine supports only local files",
                    )
                else:
                    raise FileNotFoundError("No such file or directory")
        elif not cls.pathlib_or_pypath(filepath_or_buffer):
            if hasattr(filepath_or_buffer, "read"):
                return (
                    False,
                    "read_csv with 'arrow' engine doesn't support file-like objects",
                )
            else:
                raise ValueError(
                    f"Invalid file path or buffer object type: {type(filepath_or_buffer)}"
                )

        # Any parameter that deviates from its unsupported-default is a bail-out.
        for arg, def_value in cls.read_csv_unsup_defaults.items():
            if read_csv_kwargs[arg] != def_value:
                return (
                    False,
                    f"read_csv with 'arrow' engine doesn't support {arg} parameter",
                )
        if delimiter is not None and read_csv_kwargs.get("delim_whitespace", False):
            raise ValueError(
                "Specified a delimiter with both sep and delim_whitespace=True; you can only specify one."
            )

        # BUG FIX: guard against an empty ``parse_dates`` list before indexing
        # parse_dates[0]; the original raised IndexError for parse_dates=[].
        parse_dates_unsupported = isinstance(parse_dates, dict) or (
            isinstance(parse_dates, list)
            and len(parse_dates) > 0
            and isinstance(parse_dates[0], list)
        )
        if parse_dates_unsupported:
            return (
                False,
                "read_csv with 'arrow' engine supports only bool and "
                "flattened lists 'parse_dates' parameter",
            )
        if names and names != lib.no_default:
            if header not in [None, 0, "infer"]:
                return (
                    False,
                    "read_csv with 'arrow' engine and provided 'names' parameter supports only 0, None and "
                    "'infer' header values",
                )
            if isinstance(parse_dates, list) and not set(parse_dates).issubset(names):
                raise ValueError("Missing column provided to 'parse_dates'")

            # Probe the header with a zero-row read to learn the real column count.
            empty_pandas_df = pandas.read_csv(
                **dict(
                    read_csv_kwargs,
                    nrows=0,
                    skiprows=None,
                    skipfooter=0,
                    usecols=None,
                    index_col=None,
                    names=None,
                    parse_dates=None,
                    engine=None if engine == "arrow" else engine,
                ),
            )
            columns_number = len(empty_pandas_df.columns)
            if columns_number != len(names):
                return (
                    False,
                    "read_csv with 'arrow' engine doesn't support names parameter, which length doesn't match "
                    "with actual number of columns",
                )
        else:
            if header not in [0, "infer"]:
                return (
                    False,
                    "read_csv with 'arrow' engine without 'names' parameter provided supports only 0 and 'infer' "
                    "header values",
                )
            if isinstance(parse_dates, list):
                # Validate parse_dates columns against the actual header.
                empty_pandas_df = pandas.read_csv(
                    **dict(
                        read_csv_kwargs,
                        nrows=0,
                        skiprows=None,
                        skipfooter=0,
                        usecols=None,
                        index_col=None,
                        engine=None if engine == "arrow" else engine,
                    ),
                )
                if not set(parse_dates).issubset(empty_pandas_df.columns):
                    raise ValueError("Missing column provided to 'parse_dates'")

        if not read_csv_kwargs.get("skip_blank_lines", True):
            # in some corner cases empty lines are handled as '',
            # while pandas handles it as NaNs - issue #3084
            return (
                False,
                "read_csv with 'arrow' engine doesn't support skip_blank_lines = False parameter",
            )

        if skiprows is not None and not isinstance(skiprows, int):
            return (
                False,
                "read_csv with 'arrow' engine doesn't support non-integer skiprows parameter",
            )

        return True, None