Ejemplo n.º 1
0
    def test_column_types_dict(self):
        """Check dictionary-encoded types requested via ``column_types``."""
        # Every column is requested as a dictionary type with int32 indices
        requested = [
            ('a', pa.dictionary(pa.int32(), pa.utf8())),
            ('b', pa.dictionary(pa.int32(), pa.int64())),
            ('c', pa.dictionary(pa.int32(), pa.decimal128(11, 2))),
            ('d', pa.dictionary(pa.int32(), pa.large_utf8())),
        ]
        supported = ConvertOptions(column_types=dict(requested))
        data = b"".join([
            b"a,b,c,d\n",
            b"abc,123456,1.0,zz\n",
            b"defg,123456,0.5,xx\n",
            b"abc,N/A,1.0,xx\n",
        ])

        result = self.read_bytes(data, convert_options=supported)

        assert result.schema == pa.schema(requested)
        assert result.to_pydict() == {
            'a': ["abc", "defg", "abc"],
            'b': [123456, 123456, None],
            'c': [Decimal("1.00"), Decimal("0.50"), Decimal("1.00")],
            'd': ["zz", "xx", "xx"],
        }

        # Swapping in int8 indices for column 'a' must be rejected
        requested[0] = ('a', pa.dictionary(pa.int8(), pa.utf8()))
        unsupported = ConvertOptions(column_types=dict(requested))
        with pytest.raises(NotImplementedError):
            self.read_bytes(data, convert_options=unsupported)
Ejemplo n.º 2
0
    def test_include_columns(self):
        """``include_columns`` selects and orders the resulting columns."""
        data = b"ab,cd\nef,gh\nij,kl\nmn,op\n"
        opts = ConvertOptions()

        # Keep a single column out of the two present in the CSV
        opts.include_columns = ['ab']
        result = self.read_bytes(data, convert_options=opts)
        self.check_names(result, ["ab"])
        assert result.to_pydict() == {"ab": ["ef", "ij", "mn"]}

        # The result follows the order of include_columns, not the CSV order
        opts.include_columns = ['cd', 'ab']
        result = self.read_bytes(data, convert_options=opts)
        assert result.schema == pa.schema([
            ('cd', pa.string()),
            ('ab', pa.string()),
        ])
        assert result.to_pydict() == {
            "cd": ["gh", "kl", "op"],
            "ab": ["ef", "ij", "mn"],
        }

        # Naming a column that is absent from the CSV file raises by default
        opts.include_columns = ['xx', 'ab', 'yy']
        expected_message = ("Column 'xx' in include_columns "
                            "does not exist in CSV file")
        with pytest.raises(KeyError, match=expected_message):
            self.read_bytes(data, convert_options=opts)
Ejemplo n.º 3
0
    def test_custom_nulls(self):
        """Null inference honours user-supplied ``null_values``."""
        null_spellings = ['Xxx', 'Zzz']
        data = b"a,b,c,d\nZzz,Xxx,1,2\nXxx,#N/A,,Zzz\n"

        # Default behaviour: string columns keep the custom spellings as text
        result = self.read_bytes(
            data, convert_options=ConvertOptions(null_values=null_spellings))
        assert result.schema == pa.schema([
            ('a', pa.null()),
            ('b', pa.string()),
            ('c', pa.string()),
            ('d', pa.int64()),
        ])
        assert result.to_pydict() == {
            'a': [None, None],
            'b': [u"Xxx", u"#N/A"],
            'c': [u"1", u""],
            'd': [2, None],
        }

        # With strings_can_be_null, the spellings also become nulls in strings
        nullable_strings = ConvertOptions(null_values=null_spellings,
                                          strings_can_be_null=True)
        result = self.read_bytes(data, convert_options=nullable_strings)
        assert result.to_pydict() == {
            'a': [None, None],
            'b': [None, u"#N/A"],
            'c': [u"1", u""],
            'd': [2, None],
        }

        # With an empty null_values list, every field is kept as a string
        data = b"a,b\n#N/A,\n"
        result = self.read_bytes(
            data, convert_options=ConvertOptions(null_values=[]))
        assert result.schema == pa.schema([('a', pa.string()),
                                           ('b', pa.string())])
        assert result.to_pydict() == {
            'a': [u"#N/A"],
            'b': [u""],
        }
Ejemplo n.º 4
0
    def test_timestamp_parsers(self):
        """Timestamp inference follows the configured ``timestamp_parsers``."""
        data = b"a,b\n1970/01/01,1980-01-01\n1970/01/02,1980-01-02\n"
        a_text = ['1970/01/01', '1970/01/02']
        b_text = ['1980-01-01', '1980-01-02']
        a_stamps = [datetime(1970, 1, 1), datetime(1970, 1, 2)]
        b_stamps = [datetime(1980, 1, 1), datetime(1980, 1, 2)]
        opts = ConvertOptions()

        # Default options: only column 'b' is recognised as timestamps
        result = self.read_bytes(data, convert_options=opts)
        assert result.schema == pa.schema([('a', pa.string()),
                                           ('b', pa.timestamp('s'))])
        assert result.to_pydict() == {'a': a_text, 'b': b_stamps}

        # A single custom format: only column 'a' matches it
        opts.timestamp_parsers = ['%Y/%m/%d']
        result = self.read_bytes(data, convert_options=opts)
        assert result.schema == pa.schema([('a', pa.timestamp('s')),
                                           ('b', pa.string())])
        assert result.to_pydict() == {'a': a_stamps, 'b': b_text}

        # Custom format plus ISO8601: both columns become timestamps
        opts.timestamp_parsers = ['%Y/%m/%d', ISO8601]
        result = self.read_bytes(data, convert_options=opts)
        assert result.schema == pa.schema([('a', pa.timestamp('s')),
                                           ('b', pa.timestamp('s'))])
        assert result.to_pydict() == {'a': a_stamps, 'b': b_stamps}
Ejemplo n.º 5
0
def parse_green_taxi_csv(fobj):
    """
    Turn cleaned "green taxi" CSV data into a PyArrow table.

    `fobj` is a binary file object such as the one returned by the
    "read_green_taxi_csv" function.  Column names and types come from
    the module-level SCHEMA, and the text encoding from ENCODING.
    """
    # The options are built inline, in the same order as keyword evaluation:
    # conversion first, then parsing, then reading.
    return read_csv(
        fobj,
        convert_options=ConvertOptions(
            column_types=SCHEMA,
            false_values=['N'],
            null_values=[''],
            timestamp_parsers=['%Y-%m-%d %H:%M:%S'],
            true_values=['Y'],
        ),
        parse_options=ParseOptions(quote_char=False),
        read_options=ReadOptions(
            column_names=SCHEMA.names,
            encoding=ENCODING,
        ),
    )
Ejemplo n.º 6
0
 def test_column_types(self):
     """Explicit ``column_types`` are applied to the named columns."""
     data = b"a,b,c,d,e\n1,2,3,true,1.0\n4,-5,6,false,0\n"
     expected_schema = pa.schema([
         ('a', pa.int64()),
         ('b', pa.float32()),
         ('c', pa.string()),
         ('d', pa.bool_()),
         ('e', pa.decimal128(11, 2)),
     ])
     expected_data = {
         'a': [1, 4],
         'b': [2.0, -5.0],
         'c': ["3", "6"],
         'd': [True, False],
         'e': [Decimal("1.00"), Decimal("0.00")],
     }

     # Types given as a mapping; 'zz' names a column absent from the data
     as_mapping = ConvertOptions(column_types={
         'b': 'float32',
         'c': 'string',
         'd': 'boolean',
         'e': pa.decimal128(11, 2),
         'zz': 'null',
     })
     result = self.read_bytes(data, convert_options=as_mapping)
     assert result.schema == expected_schema
     assert result.to_pydict() == expected_data

     # The same request expressed as a schema object
     as_schema = ConvertOptions(column_types=pa.schema([
         ('b', pa.float32()),
         ('c', pa.string()),
         ('d', pa.bool_()),
         ('e', pa.decimal128(11, 2)),
         ('zz', pa.bool_()),
     ]))
     result = self.read_bytes(data, convert_options=as_schema)
     assert result.schema == expected_schema
     assert result.to_pydict() == expected_data

     # A value that cannot be converted to the requested type is an error
     bad_data = b"a,b,c,d,e\n1,XXX,3,true,5\n4,-5,6,false,7\n"
     with pytest.raises(pa.ArrowInvalid) as exc_info:
         self.read_bytes(bad_data, convert_options=as_schema)
     message = str(exc_info.value)
     assert "In CSV column #1: " in message
     assert "CSV conversion error to float: invalid value 'XXX'" in message
Ejemplo n.º 7
0
    def test_dates(self):
        """Date columns: default inference and explicitly requested types."""
        data = b"a,b\n1970-01-01,1970-01-02\n1971-01-01,1971-01-02\n"
        as_dates = {
            'a': [date(1970, 1, 1), date(1971, 1, 1)],
            'b': [date(1970, 1, 2), date(1971, 1, 2)],
        }
        as_datetimes = {
            'a': [datetime(1970, 1, 1), datetime(1971, 1, 1)],
            'b': [datetime(1970, 1, 2), datetime(1971, 1, 2)],
        }

        # Without any options, both columns are inferred as date32
        result = self.read_bytes(data)
        assert result.schema == pa.schema([('a', pa.date32()),
                                           ('b', pa.date32())])
        assert result.to_pydict() == as_dates

        # Date types can be requested explicitly
        date_fields = [('a', pa.date32()), ('b', pa.date64())]
        opts = ConvertOptions()
        opts.column_types = dict(date_fields)
        result = self.read_bytes(data, convert_options=opts)
        assert result.schema == pa.schema(date_fields)
        assert result.to_pydict() == as_dates

        # Timestamp types can be requested explicitly as well
        timestamp_fields = [('a', pa.timestamp('s')),
                            ('b', pa.timestamp('ms'))]
        opts = ConvertOptions()
        opts.column_types = dict(timestamp_fields)
        result = self.read_bytes(data, convert_options=opts)
        assert result.schema == pa.schema(timestamp_fields)
        assert result.to_pydict() == as_datetimes
Ejemplo n.º 8
0
 def test_column_types_with_column_names(self):
     """Keys of ``column_types`` refer to the names from ``column_names``."""
     # With explicit column names, the first CSV line is read as data, so
     # 'x' and 'y' (not 'a' and 'b') are the names that types apply to.
     data = b"a,b\nc,d\ne,f\n"
     result = self.read_bytes(
         data,
         read_options=ReadOptions(column_names=['x', 'y']),
         convert_options=ConvertOptions(column_types={'x': pa.binary()}),
     )
     assert result.schema == pa.schema([('x', pa.binary()),
                                        ('y', pa.string())])
     assert result.to_pydict() == {
         'x': [b'a', b'c', b'e'],
         'y': ['b', 'd', 'f'],
     }
Ejemplo n.º 9
0
    def test_auto_dict_encode(self):
        """Exercise ``auto_dict_encode`` with cardinality and UTF8 settings."""
        opts = ConvertOptions(auto_dict_encode=True)
        data = "a,b\nab,1\ncdé,2\ncdé,3\nab,4".encode()
        dict_schema = pa.schema([
            ('a', pa.dictionary(pa.int32(), pa.string())),
            ('b', pa.int64()),
        ])
        plain_schema = pa.schema([('a', pa.string()), ('b', pa.int64())])
        values = {
            'a': ["ab", "cdé", "cdé", "ab"],
            'b': [1, 2, 3, 4],
        }

        # Default cardinality limit: the string column is dict-encoded
        result = self.read_bytes(data, convert_options=opts)
        assert result.schema == dict_schema
        assert result.to_pydict() == values

        # A limit equal to the number of distinct strings still dict-encodes
        opts.auto_dict_max_cardinality = 2
        result = self.read_bytes(data, convert_options=opts)
        assert result.schema == dict_schema
        assert result.to_pydict() == values

        # A limit below the cardinality falls back to plain encoding
        opts.auto_dict_max_cardinality = 1
        result = self.read_bytes(data, convert_options=opts)
        assert result.schema == plain_schema
        assert result.to_pydict() == values

        # Invalid UTF8 with checking disabled: still a string dictionary
        opts.auto_dict_max_cardinality = 50
        opts.check_utf8 = False
        data = b"a,b\nab,1\ncd\xff,2\nab,3"
        result = self.read_bytes(data,
                                 convert_options=opts,
                                 validate_full=False)
        assert result.schema == dict_schema
        dictionary = result['a'].chunk(0).dictionary
        assert len(dictionary) == 2
        assert dictionary[0].as_py() == "ab"
        assert dictionary[1].as_buffer() == b"cd\xff"

        # Invalid UTF8 with checking enabled: the dictionary holds binary
        opts.check_utf8 = True
        result = self.read_bytes(data, convert_options=opts)
        assert result.schema == pa.schema([
            ('a', pa.dictionary(pa.int32(), pa.binary())),
            ('b', pa.int64()),
        ])
        assert result.to_pydict() == {
            'a': [b"ab", b"cd\xff", b"ab"],
            'b': [1, 2, 3],
        }
Ejemplo n.º 10
0
 def test_custom_bools(self):
     """Boolean inference honours custom true/false spellings."""
     data = b"".join([
         b"a,b,c\n",
         b"True,T,t\n",
         b"False,F,f\n",
         b"True,yes,yes\n",
         b"False,no,no\n",
         b"N/A,N/A,N/A\n",
     ])
     custom_bools = ConvertOptions(true_values=['T', 'yes'],
                                   false_values=['F', 'no'])

     result = self.read_bytes(data, convert_options=custom_bools)

     # Only column 'b' is expected to come out as booleans
     assert result.schema == pa.schema([
         ('a', pa.string()),
         ('b', pa.bool_()),
         ('c', pa.string()),
     ])
     assert result.to_pydict() == {
         'a': ["True", "False", "True", "False", "N/A"],
         'b': [True, False, True, False, None],
         'c': ["t", "f", "yes", "no", "N/A"],
     }
Ejemplo n.º 11
0
 def test_stress_convert_options_blowup(self):
     """ARROW-6481: huge ``column_types`` must not blow up memory or CPU."""
     # Use the per-thread CPU clock where available, wall-clock otherwise
     clock = getattr(time, "thread_time", time.time)

     num_columns = 10000
     col_names = list(map("K{0}".format, range(num_columns)))
     csv = make_empty_csv(col_names)

     started = clock()
     # Request an explicit type for every other column
     every_other = {name: pa.string() for name in col_names[::2]}
     opts = ConvertOptions(column_types=every_other)
     table = self.read_bytes(csv, convert_options=opts)
     elapsed = clock() - started

     # Deliberately generous bound: the original author measured less
     # than 300 ms in debug mode on a local machine.
     assert elapsed <= 10.0

     # The result has all the columns, in order, and no rows
     assert table.num_columns == num_columns
     assert table.num_rows == 0
     assert table.column_names == col_names
Ejemplo n.º 12
0
    def test_include_missing_columns(self):
        """``include_missing_columns`` fills unknown columns with nulls."""
        data = b"ab,cd\nef,gh\nij,kl\nmn,op\n"
        read_opts = ReadOptions()
        convert_opts = ConvertOptions()

        # Columns absent from the CSV come back as null-typed columns
        convert_opts.include_columns = ['xx', 'ab', 'yy']
        convert_opts.include_missing_columns = True
        result = self.read_bytes(data, read_options=read_opts,
                                 convert_options=convert_opts)
        assert result.schema == pa.schema([
            ('xx', pa.null()),
            ('ab', pa.string()),
            ('yy', pa.null()),
        ])
        assert result.to_pydict() == {
            "xx": [None] * 3,
            "ab": ["ef", "ij", "mn"],
            "yy": [None] * 3,
        }

        # With `column_names`, the header line is data and 'cd' is missing
        read_opts.column_names = ["xx", "yy"]
        convert_opts.include_columns = ["yy", "cd"]
        result = self.read_bytes(data, read_options=read_opts,
                                 convert_options=convert_opts)
        assert result.schema == pa.schema([
            ('yy', pa.string()),
            ('cd', pa.null()),
        ])
        assert result.to_pydict() == {
            "yy": ["cd", "gh", "kl", "op"],
            "cd": [None] * 4,
        }

        # `column_types` also applies to the missing column
        convert_opts.column_types = {"yy": pa.binary(), "cd": pa.int32()}
        result = self.read_bytes(data, read_options=read_opts,
                                 convert_options=convert_opts)
        assert result.schema == pa.schema([
            ('yy', pa.binary()),
            ('cd', pa.int32()),
        ])
        assert result.to_pydict() == {
            "yy": [b"cd", b"gh", b"kl", b"op"],
            "cd": [None] * 4,
        }
Ejemplo n.º 13
0
    def read(self,
             env: CylonEnv,
             table,
             relevant_cols=None,
             **kwargs) -> DataFrame:
        """
        Load `table` from its '|'-delimited CSV file into a DataFrame.

        The file path comes from `self.table_path_mapping`, with the
        '$TABLE' placeholder replaced by the table name.  Column names come
        from `get_schema(table)`; `relevant_cols`, when given, restricts the
        columns that are read.  For tables listed in REFRESH_TABLES, the
        matching file under '/data_refresh/' is read too and appended.
        `kwargs` is accepted but not used here.
        """
        data_path = self.table_path_mapping[table].replace('$TABLE', table)
        column_names, _ = get_schema(table)

        # Options shared by every CSV read below
        csv_options = dict(
            read_options=ReadOptions(column_names=column_names,
                                     block_size=(1 << 30)),
            parse_options=ParseOptions(delimiter='|'),
            convert_options=ConvertOptions(include_columns=relevant_cols),
        )

        arrow_table = pa_read_csv(data_path, **csv_options)

        # NOTE: refresh tables have the same parallelism as their data tables
        if table in REFRESH_TABLES:
            refresh_path = data_path.replace('/data/', '/data_refresh/')
            refresh_table = pa_read_csv(refresh_path, **csv_options)
            arrow_table = pa_concat_tables([arrow_table, refresh_table])

        return DataFrame(Table.from_arrow(env.context, arrow_table))
Ejemplo n.º 14
0
    def read_csv(
        cls,
        filepath_or_buffer,
        sep=",",
        delimiter=None,
        header="infer",
        names=None,
        index_col=None,
        usecols=None,
        squeeze=False,
        prefix=None,
        mangle_dupe_cols=True,
        dtype=None,
        engine=None,
        converters=None,
        true_values=None,
        false_values=None,
        skipinitialspace=False,
        skiprows=None,
        nrows=None,
        na_values=None,
        keep_default_na=True,
        na_filter=True,
        verbose=False,
        skip_blank_lines=True,
        parse_dates=False,
        infer_datetime_format=False,
        keep_date_col=False,
        date_parser=None,
        dayfirst=False,
        cache_dates=True,
        iterator=False,
        chunksize=None,
        compression="infer",
        thousands=None,
        decimal=b".",
        lineterminator=None,
        quotechar='"',
        quoting=0,
        escapechar=None,
        comment=None,
        encoding=None,
        dialect=None,
        error_bad_lines=True,
        warn_bad_lines=True,
        skipfooter=0,
        doublequote=True,
        delim_whitespace=False,
        low_memory=True,
        memory_map=False,
        float_precision=None,
        storage_options=None,
    ):
        """
        Read a CSV file, preferring the Arrow-based reader.

        The parameters mirror a pandas-style ``read_csv`` signature.  With
        ``engine`` equal to "pandas" or "c" (case-insensitive) the call is
        delegated to ``cls._read``.  Otherwise the arguments are translated
        into ``ParseOptions``/``ConvertOptions``/``ReadOptions``, the file is
        read with the module-level ``read_csv`` (presumably
        ``pyarrow.csv.read_csv`` -- confirm against the imports) and the
        result is converted with ``cls.from_arrow``.

        If the Arrow path raises ``NotImplementedError`` or
        ``pa.ArrowNotImplementedError``, the error is re-raised when
        ``engine`` is "arrow"; for any other engine
        ``ErrorMessage.default_to_pandas`` is called and the result of
        ``cls._read`` is returned instead.

        Raises
        ------
        ValueError
            If ``delim_whitespace`` is true and the delimiter is not ",".
        """
        # Snapshot of the arguments; must stay the first statement so that
        # no other local variable leaks into it.
        items = locals().copy()
        # Only the arguments listed in `cls.arg_keys` are forwarded to `_read`.
        mykwargs = {k: items[k] for k in items if k in cls.arg_keys}
        eng = str(engine).lower().strip()
        try:
            if eng in ["pandas", "c"]:
                return cls._read(**mykwargs)

            # Translate `dtype` into Arrow types, per column or as a whole.
            if isinstance(dtype, dict):
                column_types = {
                    c: cls._dtype_to_arrow(t)
                    for c, t in dtype.items()
                }
            else:
                column_types = cls._dtype_to_arrow(dtype)

            # Columns listed in `parse_dates` are read as second-resolution
            # timestamps; this only happens when `column_types` is a dict.
            if (type(parse_dates) is list) and type(column_types) is dict:
                for c in parse_dates:
                    column_types[c] = pa.timestamp("s")

            if names:
                if header == 0:
                    # Explicit names replace the header row, so skip it.
                    skiprows = skiprows + 1 if skiprows is not None else 1
                elif header is None or header == "infer":
                    pass
                else:
                    raise NotImplementedError(
                        "read_csv with 'arrow' engine and provided 'names' parameter supports only 0, None and 'infer' header values"
                    )
            else:
                if header == 0 or header == "infer":
                    pass
                else:
                    raise NotImplementedError(
                        "read_csv with 'arrow' engine without 'names' parameter provided supports only 0 and 'infer' header values"
                    )

            # `delimiter` takes precedence over `sep` when both are given.
            if delimiter is None:
                delimiter = sep

            # A delimiter equal to "," is treated as "not specified" here.
            if delim_whitespace and delimiter != ",":
                raise ValueError(
                    "Specified a delimiter and delim_whitespace=True; you can only specify one."
                )

            usecols_md = cls._prepare_pyarrow_usecols(mykwargs)

            # NOTE(review): confirm that ParseOptions accepts the multi-character
            # "\\s+" pattern as `delimiter` -- TODO verify against pyarrow docs.
            po = ParseOptions(
                delimiter="\\s+" if delim_whitespace else delimiter,
                quote_char=quotechar,
                double_quote=doublequote,
                escape_char=escapechar,
                newlines_in_values=False,
                ignore_empty_lines=skip_blank_lines,
            )
            co = ConvertOptions(
                check_utf8=None,
                column_types=column_types,
                null_values=None,
                true_values=None,
                false_values=None,
                # Timestamp fields should be handled as strings unless
                # parse_dates was passed explicitly as an array or a dict.
                timestamp_parsers=[""]
                if isinstance(parse_dates, bool) else None,
                strings_can_be_null=None,
                include_columns=usecols_md,
                include_missing_columns=None,
                auto_dict_encode=None,
                auto_dict_max_cardinality=None,
            )
            ro = ReadOptions(
                use_threads=True,
                block_size=None,
                skip_rows=skiprows,
                column_names=names,
                autogenerate_column_names=None,
            )

            at = read_csv(
                filepath_or_buffer,
                read_options=ro,
                parse_options=po,
                convert_options=co,
            )

            return cls.from_arrow(at)
        except (pa.ArrowNotImplementedError, NotImplementedError):
            # An explicitly requested Arrow engine must not fall back silently.
            if eng in ["arrow"]:
                raise

            ErrorMessage.default_to_pandas("`read_csv`")
            return cls._read(**mykwargs)
Ejemplo n.º 15
0
    def test_column_options(self):
        """Column-related read/convert options work with the reader API."""
        data = b"1,2,3\n4,5,6"
        d_values, e_values, f_values = [1, 4], [2, 5], [3, 6]
        e_strings = ["2", "5"]
        missing = [None, None]

        # With column_names
        read_opts = ReadOptions()
        read_opts.column_names = ['d', 'e', 'f']
        reader = self.open_bytes(data, read_options=read_opts)
        schema = pa.schema([('d', pa.int64()),
                            ('e', pa.int64()),
                            ('f', pa.int64())])
        self.check_reader(
            reader, schema,
            [{'d': d_values, 'e': e_values, 'f': f_values}])

        # With include_columns
        convert_opts = ConvertOptions()
        convert_opts.include_columns = ['f', 'e']
        reader = self.open_bytes(data,
                                 read_options=read_opts,
                                 convert_options=convert_opts)
        schema = pa.schema([('f', pa.int64()), ('e', pa.int64())])
        self.check_reader(reader, schema,
                          [{'e': e_values, 'f': f_values}])

        # With column_types
        convert_opts.column_types = {'e': pa.string()}
        reader = self.open_bytes(data,
                                 read_options=read_opts,
                                 convert_options=convert_opts)
        schema = pa.schema([('f', pa.int64()), ('e', pa.string())])
        self.check_reader(reader, schema,
                          [{'e': e_strings, 'f': f_values}])

        # A column missing from the data is rejected by default...
        convert_opts.include_columns = ['g', 'f', 'e']
        missing_message = "Column 'g' in include_columns does not exist"
        with pytest.raises(KeyError, match=missing_message):
            self.open_bytes(data,
                            read_options=read_opts,
                            convert_options=convert_opts)

        # ...and comes back as a null column with include_missing_columns
        convert_opts.include_missing_columns = True
        reader = self.open_bytes(data,
                                 read_options=read_opts,
                                 convert_options=convert_opts)
        schema = pa.schema([('g', pa.null()),
                            ('f', pa.int64()),
                            ('e', pa.string())])
        self.check_reader(
            reader, schema,
            [{'g': missing, 'e': e_strings, 'f': f_values}])

        # column_types also sets the type of the missing column
        convert_opts.column_types = {'e': pa.string(), 'g': pa.float64()}
        reader = self.open_bytes(data,
                                 read_options=read_opts,
                                 convert_options=convert_opts)
        schema = pa.schema([('g', pa.float64()),
                            ('f', pa.int64()),
                            ('e', pa.string())])
        self.check_reader(
            reader, schema,
            [{'g': missing, 'e': e_strings, 'f': f_values}])
Ejemplo n.º 16
0
    def read_csv(
        cls,
        filepath_or_buffer,
        sep=",",
        delimiter=None,
        header="infer",
        names=lib.no_default,
        index_col=None,
        usecols=None,
        squeeze=False,
        prefix=lib.no_default,
        mangle_dupe_cols=True,
        dtype=None,
        engine=None,
        converters=None,
        true_values=None,
        false_values=None,
        skipinitialspace=False,
        skiprows=None,
        nrows=None,
        na_values=None,
        keep_default_na=True,
        na_filter=True,
        verbose=False,
        skip_blank_lines=True,
        parse_dates=False,
        infer_datetime_format=False,
        keep_date_col=False,
        date_parser=None,
        dayfirst=False,
        cache_dates=True,
        iterator=False,
        chunksize=None,
        compression="infer",
        thousands=None,
        decimal=".",
        lineterminator=None,
        quotechar='"',
        quoting=0,
        escapechar=None,
        comment=None,
        encoding=None,
        encoding_errors="strict",
        dialect=None,
        error_bad_lines=None,
        warn_bad_lines=None,
        on_bad_lines=None,
        skipfooter=0,
        doublequote=True,
        delim_whitespace=False,
        low_memory=True,
        memory_map=False,
        float_precision=None,
        storage_options=None,
    ):  # noqa: PR01
        """
        Read data from `filepath_or_buffer` according to the passed `kwargs` parameters.

        For parameters description please refer to pandas API.

        Returns
        -------
        BaseQueryCompiler
            Query compiler with imported data for further processing.

        Notes
        -----
        Reading is performed using the `pyarrow.read_csv` function.

        With ``engine`` equal to "pandas" or "c" the call is delegated to
        ``cls._read``.  If the Arrow path raises one of the handled
        exceptions, the error is re-raised when ``engine`` is "arrow";
        otherwise ``cls._read`` is used as a fallback.
        """
        # Snapshot of the arguments; must stay the first statement so that
        # no other local variable leaks into it.
        items = locals().copy()
        mykwargs = {k: items[k] for k in items if k in cls.arg_keys}
        eng = str(engine).lower().strip()
        try:
            if eng in ["pandas", "c"]:
                return cls._read(**mykwargs)

            cls._validate_read_csv_kwargs(mykwargs)
            use_modin_impl, error_message = cls._read_csv_check_support(
                mykwargs,
            )
            if not use_modin_impl:
                raise ArrowEngineException(error_message)
            if isinstance(dtype, dict):
                column_types = {c: cls._dtype_to_arrow(t) for c, t in dtype.items()}
            else:
                column_types = cls._dtype_to_arrow(dtype)

            # Columns listed in `parse_dates` are read as second-resolution
            # timestamps; this only happens when `column_types` is a dict.
            if (type(parse_dates) is list) and type(column_types) is dict:
                for c in parse_dates:
                    column_types[c] = pa.timestamp("s")

            # Identity checks rather than `names not in [...]`: a membership
            # test falls back to `==`, whose result is ambiguous (and raises
            # ValueError) when `names` is array-like, e.g. a NumPy array.
            names_provided = names is not lib.no_default and names is not None
            if names_provided and header == 0:
                # Explicit names replace the header row, so skip it.
                skiprows = skiprows + 1 if skiprows is not None else 1

            if delimiter is None and sep is not lib.no_default:
                delimiter = sep

            usecols_md = cls._prepare_pyarrow_usecols(mykwargs)

            po = ParseOptions(
                delimiter="\\s+" if delim_whitespace else delimiter,
                quote_char=quotechar,
                double_quote=doublequote,
                escape_char=escapechar,
                newlines_in_values=False,
                ignore_empty_lines=skip_blank_lines,
            )
            co = ConvertOptions(
                check_utf8=None,
                column_types=column_types,
                null_values=None,
                true_values=None,
                false_values=None,
                # Timestamp fields should be handled as strings unless
                # parse_dates was passed explicitly as an array or a dict.
                timestamp_parsers=[""] if isinstance(parse_dates, bool) else None,
                strings_can_be_null=None,
                include_columns=usecols_md,
                include_missing_columns=None,
                auto_dict_encode=None,
                auto_dict_max_cardinality=None,
            )
            ro = ReadOptions(
                use_threads=True,
                block_size=None,
                skip_rows=skiprows,
                column_names=names if names is not lib.no_default else None,
                autogenerate_column_names=None,
            )

            at = read_csv(
                filepath_or_buffer,
                read_options=ro,
                parse_options=po,
                convert_options=co,
            )

            return cls.from_arrow(at)
        except (
            pa.ArrowNotImplementedError,
            pa.ArrowInvalid,
            NotImplementedError,
            ArrowEngineException,
        ):
            # An explicitly requested Arrow engine must not fall back silently.
            if eng in ["arrow"]:
                raise

            ErrorMessage.default_to_pandas("`read_csv`")
            return cls._read(**mykwargs)