Example #1
    def test_batch_lifetime(self):
        gc.collect()
        old_allocated = pa.total_allocated_bytes()

        # Memory occupation should not grow with CSV file size
        def check_one_batch(reader, expected):
            batch = reader.read_next_batch()
            assert batch.to_pydict() == expected

        rows = b"10,11\n12,13\n14,15\n16,17\n"
        read_options = ReadOptions()
        read_options.column_names = ['a', 'b']
        read_options.block_size = 6
        reader = self.open_bytes(rows, read_options=read_options)
        check_one_batch(reader, {'a': [10], 'b': [11]})
        allocated_after_first_batch = pa.total_allocated_bytes()
        check_one_batch(reader, {'a': [12], 'b': [13]})
        assert pa.total_allocated_bytes() == allocated_after_first_batch
        check_one_batch(reader, {'a': [14], 'b': [15]})
        assert pa.total_allocated_bytes() == allocated_after_first_batch
        check_one_batch(reader, {'a': [16], 'b': [17]})
        assert pa.total_allocated_bytes() == allocated_after_first_batch
        with pytest.raises(StopIteration):
            reader.read_next_batch()
        assert pa.total_allocated_bytes() == old_allocated
        reader = None
        assert pa.total_allocated_bytes() == old_allocated
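
All of these snippets assume a common set of imports from pyarrow, plus test-harness helpers (self.open_bytes, self.read_bytes, self.check_reader, self.check_names) that are defined elsewhere in the original test suites and are not reproduced here. A minimal sketch of the assumed imports:

import gc
import io

import pytest

import pyarrow as pa
from pyarrow.csv import (
    ConvertOptions,
    ParseOptions,
    ReadOptions,
    WriteOptions,
    open_csv,
    read_csv,
    write_csv,
)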
Example #2
    def test_encoding(self):
        # latin-1 (invalid utf-8)
        rows = b"a,b\nun,\xe9l\xe9phant"
        read_options = ReadOptions()
        reader = self.open_bytes(rows, read_options=read_options)
        expected_schema = pa.schema([('a', pa.string()), ('b', pa.binary())])
        self.check_reader(reader, expected_schema, [{
            'a': ["un"],
            'b': [b"\xe9l\xe9phant"]
        }])

        read_options.encoding = 'latin1'
        reader = self.open_bytes(rows, read_options=read_options)
        expected_schema = pa.schema([('a', pa.string()), ('b', pa.string())])
        self.check_reader(reader, expected_schema, [{
            'a': ["un"],
            'b': ["éléphant"]
        }])

        # utf-16
        rows = (b'\xff\xfea\x00,\x00b\x00\n\x00u\x00n\x00,'
                b'\x00\xe9\x00l\x00\xe9\x00p\x00h\x00a\x00n\x00t\x00')
        read_options.encoding = 'utf16'
        reader = self.open_bytes(rows, read_options=read_options)
        expected_schema = pa.schema([('a', pa.string()), ('b', pa.string())])
        self.check_reader(reader, expected_schema, [{
            'a': ["un"],
            'b': ["éléphant"]
        }])
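
The same encoding behaviour can be exercised directly with the public read_csv API, outside any test harness. A self-contained sketch, assuming only pyarrow:

import io
from pyarrow.csv import ReadOptions, read_csv

# latin-1 bytes are decoded before parsing when ReadOptions.encoding is set
data = io.BytesIO(b"a,b\nun,\xe9l\xe9phant")
table = read_csv(data, read_options=ReadOptions(encoding='latin1'))
assert table.to_pydict() == {'a': ['un'], 'b': ['éléphant']}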
Example #3
    def test_inference_failure(self):
        # Inference on first block, then conversion failure on second block
        rows = b"a,b\n123,456\nabc,de\xff\ngh,ij\n"
        read_options = ReadOptions()
        read_options.block_size = len(rows) - 7
        reader = self.open_bytes(rows, read_options=read_options)
        expected_schema = pa.schema([('a', pa.int64()), ('b', pa.int64())])
        assert reader.schema == expected_schema
        assert reader.read_next_batch().to_pydict() == {'a': [123], 'b': [456]}
        # Second block
        with pytest.raises(ValueError, match="CSV conversion error to int64"):
            reader.read_next_batch()
        # EOF
        with pytest.raises(StopIteration):
            reader.read_next_batch()

        # Inference on first block, then conversion failure on second block,
        # then success on third block
        rows = b"a,b\n1,2\nabc,def\n45,67\n"
        read_options.block_size = 8
        reader = self.open_bytes(rows, read_options=read_options)
        expected_schema = pa.schema([('a', pa.int64()), ('b', pa.int64())])
        assert reader.schema == expected_schema
        assert reader.read_next_batch().to_pydict() == {'a': [1], 'b': [2]}
        # Second block
        with pytest.raises(ValueError, match="CSV conversion error to int64"):
            reader.read_next_batch()
        # Third block
        assert reader.read_next_batch().to_pydict() == {'a': [45], 'b': [67]}
        # EOF
        with pytest.raises(StopIteration):
            reader.read_next_batch()
Example #4
    def test_header_column_names(self):
        rows = b"ab,cd\nef,gh\nij,kl\nmn,op\n"

        opts = ReadOptions()
        opts.column_names = ["x", "y"]
        table = self.read_bytes(rows, read_options=opts)
        self.check_names(table, ["x", "y"])
        assert table.to_pydict() == {
            "x": ["ab", "ef", "ij", "mn"],
            "y": ["cd", "gh", "kl", "op"],
            }

        opts.skip_rows = 3
        table = self.read_bytes(rows, read_options=opts)
        self.check_names(table, ["x", "y"])
        assert table.to_pydict() == {
            "x": ["mn"],
            "y": ["op"],
            }

        opts.skip_rows = 4
        table = self.read_bytes(rows, read_options=opts)
        self.check_names(table, ["x", "y"])
        assert table.to_pydict() == {
            "x": [],
            "y": [],
            }

        opts.skip_rows = 5
        with pytest.raises(pa.ArrowInvalid):
            # Not enough rows
            table = self.read_bytes(rows, read_options=opts)

        # Unexpected number of columns
        opts.skip_rows = 0
        opts.column_names = ["x", "y", "z"]
        with pytest.raises(pa.ArrowInvalid,
                           match="Expected 3 columns, got 2"):
            table = self.read_bytes(rows, read_options=opts)

        # Can skip rows with a different number of columns
        rows = b"abcd\n,,,,,\nij,kl\nmn,op\n"
        opts.skip_rows = 2
        opts.column_names = ["x", "y"]
        table = self.read_bytes(rows, read_options=opts)
        self.check_names(table, ["x", "y"])
        assert table.to_pydict() == {
            "x": ["ij", "mn"],
            "y": ["kl", "op"],
            }
Example #5
def parse_green_taxi_csv(fobj):
    """
    Parse a binary file object of cleaned "green taxi" CSV data as
    returned by the "read_green_taxi_csv" function, and return a PyArrow
    table.
    """

    convert_options = ConvertOptions(
        column_types=SCHEMA,
        false_values=['N'],
        null_values=[''],
        timestamp_parsers=['%Y-%m-%d %H:%M:%S'],
        true_values=['Y'],
    )
    parse_options = ParseOptions(quote_char=False)
    read_options = ReadOptions(
        column_names=SCHEMA.names,
        encoding=ENCODING,
    )

    return read_csv(
        fobj,
        convert_options=convert_options,
        parse_options=parse_options,
        read_options=read_options,
    )
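
The function above relies on module-level SCHEMA and ENCODING constants that are not shown. A hypothetical sketch of their shape (the column names and types are illustrative assumptions, not the project's actual schema):

import pyarrow as pa

ENCODING = 'utf-8'  # assumption: the real encoding is project-specific
SCHEMA = pa.schema([  # hypothetical columns for illustration only
    ('vendor_id', pa.string()),
    ('pickup_datetime', pa.timestamp('s')),
    ('store_and_fwd_flag', pa.bool_()),
    ('total_amount', pa.float64()),
])

ConvertOptions accepts a Schema for column_types, and SCHEMA.names supplies the column order for ReadOptions.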
Example #6
    def test_invalid_csv(self):
        # CSV errors on first block
        rows = b"a,b\n1,2,3\n4,5\n6,7\n"
        read_options = ReadOptions()
        read_options.block_size = 10
        with pytest.raises(pa.ArrowInvalid, match="Expected 2 columns, got 3"):
            reader = self.open_bytes(rows, read_options=read_options)

        # CSV errors on second block
        rows = b"a,b\n1,2\n3,4,5\n6,7\n"
        read_options.block_size = 8
        reader = self.open_bytes(rows, read_options=read_options)
        assert reader.read_next_batch().to_pydict() == {'a': [1], 'b': [2]}
        with pytest.raises(pa.ArrowInvalid, match="Expected 2 columns, got 3"):
            reader.read_next_batch()
        # Cannot continue after a parse error
        with pytest.raises(StopIteration):
            reader.read_next_batch()
Example #7
    def test_include_missing_columns(self):
        rows = b"ab,cd\nef,gh\nij,kl\nmn,op\n"

        read_options = ReadOptions()
        convert_options = ConvertOptions()
        convert_options.include_columns = ['xx', 'ab', 'yy']
        convert_options.include_missing_columns = True
        table = self.read_bytes(rows, read_options=read_options,
                                convert_options=convert_options)
        schema = pa.schema([('xx', pa.null()),
                            ('ab', pa.string()),
                            ('yy', pa.null())])
        assert table.schema == schema
        assert table.to_pydict() == {
            "xx": [None, None, None],
            "ab": ["ef", "ij", "mn"],
            "yy": [None, None, None],
            }

        # Combining with `column_names`
        read_options.column_names = ["xx", "yy"]
        convert_options.include_columns = ["yy", "cd"]
        table = self.read_bytes(rows, read_options=read_options,
                                convert_options=convert_options)
        schema = pa.schema([('yy', pa.string()),
                            ('cd', pa.null())])
        assert table.schema == schema
        assert table.to_pydict() == {
            "yy": ["cd", "gh", "kl", "op"],
            "cd": [None, None, None, None],
            }

        # And with `column_types` as well
        convert_options.column_types = {"yy": pa.binary(),
                                        "cd": pa.int32()}
        table = self.read_bytes(rows, read_options=read_options,
                                convert_options=convert_options)
        schema = pa.schema([('yy', pa.binary()),
                            ('cd', pa.int32())])
        assert table.schema == schema
        assert table.to_pydict() == {
            "yy": [b"cd", b"gh", b"kl", b"op"],
            "cd": [None, None, None, None],
            }
Example #8
    def test_inference(self):
        # Inference is done on first block
        rows = b"a,b\n123,456\nabc,de\xff\ngh,ij\n"
        expected_schema = pa.schema([('a', pa.string()),
                                     ('b', pa.binary())])

        read_options = ReadOptions()
        read_options.block_size = len(rows)
        reader = self.open_bytes(rows, read_options=read_options)
        self.check_reader(reader, expected_schema,
                          [{'a': ['123', 'abc', 'gh'],
                            'b': [b'456', b'de\xff', b'ij']}])

        read_options.block_size = len(rows) - 1
        reader = self.open_bytes(rows, read_options=read_options)
        self.check_reader(reader, expected_schema,
                          [{'a': ['123', 'abc'],
                            'b': [b'456', b'de\xff']},
                           {'a': ['gh'],
                            'b': [b'ij']}])
Example #9
    def test_header_skip_rows(self):
        rows = b"ab,cd\nef,gh\nij,kl\nmn,op\n"

        opts = ReadOptions()
        opts.skip_rows = 1
        table = self.read_bytes(rows, read_options=opts)
        self.check_names(table, ["ef", "gh"])
        assert table.to_pydict() == {
            "ef": ["ij", "mn"],
            "gh": ["kl", "op"],
            }

        opts.skip_rows = 3
        table = self.read_bytes(rows, read_options=opts)
        self.check_names(table, ["mn", "op"])
        assert table.to_pydict() == {
            "mn": [],
            "op": [],
            }

        opts.skip_rows = 4
        with pytest.raises(pa.ArrowInvalid):
            # Not enough rows
            table = self.read_bytes(rows, read_options=opts)

        # Can skip rows with a different number of columns
        rows = b"abcd\n,,,,,\nij,kl\nmn,op\n"
        opts.skip_rows = 2
        table = self.read_bytes(rows, read_options=opts)
        self.check_names(table, ["ij", "kl"])
        assert table.to_pydict() == {
            "ij": ["mn"],
            "kl": ["op"],
            }
Example #10
 def test_stress_block_sizes(self):
     # Test a number of small block sizes to stress block stitching
     csv_base, expected = make_random_csv(num_cols=2, num_rows=500)
     block_sizes = [11, 12, 13, 17, 37, 111]
     csvs = [csv_base, csv_base.rstrip(b'\r\n')]
     for csv in csvs:
         for block_size in block_sizes:
             read_options = ReadOptions(block_size=block_size)
             table = self.read_bytes(csv, read_options=read_options)
             assert table.schema == expected.schema
             if not table.equals(expected):
                 # Better error output
                 assert table.to_pydict() == expected.to_pydict()
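
make_random_csv is a helper from the original test module. A minimal, hypothetical stand-in matching how it is used here (it returns a CSV payload with \r\n line endings, plus the pyarrow.Table that payload should parse to):

import random

import pyarrow as pa


def make_random_csv(num_cols=2, num_rows=10):
    # Hypothetical stand-in, not the original helper.
    names = ['f{}'.format(i) for i in range(num_cols)]
    cols = [[random.randrange(1000) for _ in range(num_rows)]
            for _ in range(num_cols)]
    lines = [','.join(names)]
    lines += [','.join(str(col[r]) for col in cols) for r in range(num_rows)]
    csv = ('\r\n'.join(lines) + '\r\n').encode()
    expected = pa.table(cols, names=names)
    return csv, expected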
Example #11
 def test_column_types_with_column_names(self):
     # When both `column_names` and `column_types` are given, names
     # in `column_types` should refer to names in `column_names`
     rows = b"a,b\nc,d\ne,f\n"
     read_options = ReadOptions(column_names=['x', 'y'])
     convert_options = ConvertOptions(column_types={'x': pa.binary()})
     table = self.read_bytes(rows, read_options=read_options,
                             convert_options=convert_options)
     schema = pa.schema([('x', pa.binary()),
                         ('y', pa.string())])
     assert table.schema == schema
     assert table.to_pydict() == {
         'x': [b'a', b'c', b'e'],
         'y': ['b', 'd', 'f'],
         }
Example #12
    def test_header_autogenerate_column_names(self):
        rows = b"ab,cd\nef,gh\nij,kl\nmn,op\n"

        opts = ReadOptions()
        opts.autogenerate_column_names = True
        table = self.read_bytes(rows, read_options=opts)
        self.check_names(table, ["f0", "f1"])
        assert table.to_pydict() == {
            "f0": ["ab", "ef", "ij", "mn"],
            "f1": ["cd", "gh", "kl", "op"],
            }

        opts.skip_rows = 3
        table = self.read_bytes(rows, read_options=opts)
        self.check_names(table, ["f0", "f1"])
        assert table.to_pydict() == {
            "f0": ["mn"],
            "f1": ["op"],
            }

        # Not enough rows, impossible to infer number of columns
        opts.skip_rows = 4
        with pytest.raises(pa.ArrowInvalid):
            table = self.read_bytes(rows, read_options=opts)
Example #13
 def test_stress_block_sizes(self):
     # Test a number of small block sizes to stress block stitching
     csv_base, expected = make_random_csv(num_cols=2, num_rows=500)
     block_sizes = [19, 21, 23, 26, 37, 111]
     csvs = [csv_base, csv_base.rstrip(b'\r\n')]
     for csv in csvs:
         for block_size in block_sizes:
             # Need at least two lines for type inference
             assert csv[:block_size].count(b'\n') >= 2
             read_options = ReadOptions(block_size=block_size)
             reader = self.open_bytes(csv, read_options=read_options)
             table = reader.read_all()
             assert table.schema == expected.schema
             if not table.equals(expected):
                 # Better error output
                 assert table.to_pydict() == expected.to_pydict()
Example #14
def test_write_read_round_trip():
    t = pa.Table.from_arrays([[1, 2, 3], ["a", "b", "c"]], ["c1", "c2"])
    record_batch = t.to_batches(max_chunksize=4)[0]
    for data in [t, record_batch]:
        # Test with header
        buf = io.BytesIO()
        write_csv(data, buf, WriteOptions(include_header=True))
        buf.seek(0)
        assert t == read_csv(buf)

        # Test without header
        buf = io.BytesIO()
        write_csv(data, buf, WriteOptions(include_header=False))
        buf.seek(0)

        read_options = ReadOptions(column_names=t.column_names)
        assert t == read_csv(buf, read_options=read_options)
Example #15
 def test_empty_lines(self):
     rows = b"a,b\n\r1,2\r\n\r\n3,4\r\n"
     table = self.read_bytes(rows)
     assert table.to_pydict() == {
         'a': [1, 3],
         'b': [2, 4],
         }
     parse_options = ParseOptions(ignore_empty_lines=False)
     table = self.read_bytes(rows, parse_options=parse_options)
     assert table.to_pydict() == {
         'a': [None, 1, None, 3],
         'b': [None, 2, None, 4],
         }
     read_options = ReadOptions(skip_rows=2)
     table = self.read_bytes(rows, parse_options=parse_options,
                             read_options=read_options)
     assert table.to_pydict() == {
         '1': [None, 3],
         '2': [None, 4],
         }
Example #16
    def read(self,
             env: CylonEnv,
             table,
             relevant_cols=None,
             **kwargs) -> DataFrame:
        filepath = self.table_path_mapping[table].replace('$TABLE', table)

        names, _ = get_schema(table)
        # csv_read_options = CSVReadOptions().use_threads(True).block_size(1 << 30)
        # .with_delimiter('|')
        read_opts = ReadOptions(column_names=names, block_size=(1 << 30))
        parse_opts = ParseOptions(delimiter='|')
        convert_opts = ConvertOptions(include_columns=relevant_cols)

        # if table is in refresh_tables list, read that table and concat
        # NOTE: refresh tables have the same parallelism as its data tables
        if table in REFRESH_TABLES:
            data_table = pa_read_csv(filepath,
                                     read_options=read_opts,
                                     parse_options=parse_opts,
                                     convert_options=convert_opts)
            refresh_path = filepath.replace('/data/', '/data_refresh/')

            refresh_table = pa_read_csv(refresh_path,
                                        read_options=read_opts,
                                        parse_options=parse_opts,
                                        convert_options=convert_opts)

            pa_table = pa_concat_tables([data_table, refresh_table])
        else:
            pa_table = pa_read_csv(filepath,
                                   read_options=read_opts,
                                   parse_options=parse_opts,
                                   convert_options=convert_opts)

        return DataFrame(Table.from_arrow(env.context, pa_table))
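
The pa_read_csv and pa_concat_tables names used above are aliases; an assumption about the underlying imports (CylonEnv, DataFrame, and Table appear to come from the PyCylon API):

from pyarrow import concat_tables as pa_concat_tables
from pyarrow.csv import read_csv as pa_read_csv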
Example #17
    def read_csv(
        cls,
        filepath_or_buffer,
        sep=",",
        delimiter=None,
        header="infer",
        names=None,
        index_col=None,
        usecols=None,
        squeeze=False,
        prefix=None,
        mangle_dupe_cols=True,
        dtype=None,
        engine=None,
        converters=None,
        true_values=None,
        false_values=None,
        skipinitialspace=False,
        skiprows=None,
        nrows=None,
        na_values=None,
        keep_default_na=True,
        na_filter=True,
        verbose=False,
        skip_blank_lines=True,
        parse_dates=False,
        infer_datetime_format=False,
        keep_date_col=False,
        date_parser=None,
        dayfirst=False,
        cache_dates=True,
        iterator=False,
        chunksize=None,
        compression="infer",
        thousands=None,
        decimal=b".",
        lineterminator=None,
        quotechar='"',
        quoting=0,
        escapechar=None,
        comment=None,
        encoding=None,
        dialect=None,
        error_bad_lines=True,
        warn_bad_lines=True,
        skipfooter=0,
        doublequote=True,
        delim_whitespace=False,
        low_memory=True,
        memory_map=False,
        float_precision=None,
        storage_options=None,
    ):
        items = locals().copy()
        mykwargs = {k: items[k] for k in items if k in cls.arg_keys}
        eng = str(engine).lower().strip()
        try:
            if eng in ["pandas", "c"]:
                return cls._read(**mykwargs)

            if isinstance(dtype, dict):
                column_types = {
                    c: cls._dtype_to_arrow(t)
                    for c, t in dtype.items()
                }
            else:
                column_types = cls._dtype_to_arrow(dtype)

            if (type(parse_dates) is list) and type(column_types) is dict:
                for c in parse_dates:
                    column_types[c] = pa.timestamp("s")

            if names:
                if header == 0:
                    skiprows = skiprows + 1 if skiprows is not None else 1
                elif header is None or header == "infer":
                    pass
                else:
                    raise NotImplementedError(
                        "read_csv with 'arrow' engine and provided 'names' parameter supports only 0, None and 'infer' header values"
                    )
            else:
                if header == 0 or header == "infer":
                    pass
                else:
                    raise NotImplementedError(
                        "read_csv with 'arrow' engine without 'names' parameter provided supports only 0 and 'infer' header values"
                    )

            if delimiter is None:
                delimiter = sep

            if delim_whitespace and delimiter != ",":
                raise ValueError(
                    "Specified a delimiter and delim_whitespace=True; you can only specify one."
                )

            usecols_md = cls._prepare_pyarrow_usecols(mykwargs)

            po = ParseOptions(
                delimiter="\\s+" if delim_whitespace else delimiter,
                quote_char=quotechar,
                double_quote=doublequote,
                escape_char=escapechar,
                newlines_in_values=False,
                ignore_empty_lines=skip_blank_lines,
            )
            co = ConvertOptions(
                check_utf8=None,
                column_types=column_types,
                null_values=None,
                true_values=None,
                false_values=None,
                # timestamp fields should be handled as strings if
                # `parse_dates` wasn't passed explicitly as a list or a dict
                timestamp_parsers=[""]
                if isinstance(parse_dates, bool) else None,
                strings_can_be_null=None,
                include_columns=usecols_md,
                include_missing_columns=None,
                auto_dict_encode=None,
                auto_dict_max_cardinality=None,
            )
            ro = ReadOptions(
                use_threads=True,
                block_size=None,
                skip_rows=skiprows,
                column_names=names,
                autogenerate_column_names=None,
            )

            at = read_csv(
                filepath_or_buffer,
                read_options=ro,
                parse_options=po,
                convert_options=co,
            )

            return cls.from_arrow(at)
        except (pa.ArrowNotImplementedError, NotImplementedError):
            if eng in ["arrow"]:
                raise

            ErrorMessage.default_to_pandas("`read_csv`")
            return cls._read(**mykwargs)
Example #18
 def read_csv(self, *args, **kwargs):
     read_options = kwargs.setdefault('read_options', ReadOptions())
     read_options.use_threads = True
     table = read_csv(*args, **kwargs)
     table.validate(full=True)
     return table
Example #19
 def open_csv(self, *args, **kwargs):
     read_options = kwargs.setdefault('read_options', ReadOptions())
     read_options.use_threads = False
     return open_csv(*args, **kwargs)
Example #20
    def read_csv(
        cls,
        filepath_or_buffer,
        sep=",",
        delimiter=None,
        header="infer",
        names=lib.no_default,
        index_col=None,
        usecols=None,
        squeeze=False,
        prefix=lib.no_default,
        mangle_dupe_cols=True,
        dtype=None,
        engine=None,
        converters=None,
        true_values=None,
        false_values=None,
        skipinitialspace=False,
        skiprows=None,
        nrows=None,
        na_values=None,
        keep_default_na=True,
        na_filter=True,
        verbose=False,
        skip_blank_lines=True,
        parse_dates=False,
        infer_datetime_format=False,
        keep_date_col=False,
        date_parser=None,
        dayfirst=False,
        cache_dates=True,
        iterator=False,
        chunksize=None,
        compression="infer",
        thousands=None,
        decimal=".",
        lineterminator=None,
        quotechar='"',
        quoting=0,
        escapechar=None,
        comment=None,
        encoding=None,
        encoding_errors="strict",
        dialect=None,
        error_bad_lines=None,
        warn_bad_lines=None,
        on_bad_lines=None,
        skipfooter=0,
        doublequote=True,
        delim_whitespace=False,
        low_memory=True,
        memory_map=False,
        float_precision=None,
        storage_options=None,
    ):  # noqa: PR01
        """
        Read data from `filepath_or_buffer` according to the passed `kwargs` parameters.

        For parameters description please refer to pandas API.

        Returns
        -------
        BaseQueryCompiler
            Query compiler with imported data for further processing.

        Notes
        -----
        Reading is performed using the `pyarrow.csv.read_csv` function.
        """
        items = locals().copy()
        mykwargs = {k: items[k] for k in items if k in cls.arg_keys}
        eng = str(engine).lower().strip()
        try:
            if eng in ["pandas", "c"]:
                return cls._read(**mykwargs)

            cls._validate_read_csv_kwargs(mykwargs)
            use_modin_impl, error_message = cls._read_csv_check_support(
                mykwargs,
            )
            if not use_modin_impl:
                raise ArrowEngineException(error_message)
            if isinstance(dtype, dict):
                column_types = {c: cls._dtype_to_arrow(t) for c, t in dtype.items()}
            else:
                column_types = cls._dtype_to_arrow(dtype)

            if (type(parse_dates) is list) and type(column_types) is dict:
                for c in parse_dates:
                    column_types[c] = pa.timestamp("s")

            if names not in [lib.no_default, None] and header == 0:
                skiprows = skiprows + 1 if skiprows is not None else 1

            if delimiter is None and sep is not lib.no_default:
                delimiter = sep

            usecols_md = cls._prepare_pyarrow_usecols(mykwargs)

            po = ParseOptions(
                delimiter="\\s+" if delim_whitespace else delimiter,
                quote_char=quotechar,
                double_quote=doublequote,
                escape_char=escapechar,
                newlines_in_values=False,
                ignore_empty_lines=skip_blank_lines,
            )
            co = ConvertOptions(
                check_utf8=None,
                column_types=column_types,
                null_values=None,
                true_values=None,
                false_values=None,
                # timestamp fields should be handled as strings if
                # `parse_dates` wasn't passed explicitly as a list or a dict
                timestamp_parsers=[""] if isinstance(parse_dates, bool) else None,
                strings_can_be_null=None,
                include_columns=usecols_md,
                include_missing_columns=None,
                auto_dict_encode=None,
                auto_dict_max_cardinality=None,
            )
            ro = ReadOptions(
                use_threads=True,
                block_size=None,
                skip_rows=skiprows,
                column_names=names if names is not lib.no_default else None,
                autogenerate_column_names=None,
            )

            at = read_csv(
                filepath_or_buffer,
                read_options=ro,
                parse_options=po,
                convert_options=co,
            )

            return cls.from_arrow(at)
        except (
            pa.ArrowNotImplementedError,
            pa.ArrowInvalid,
            NotImplementedError,
            ArrowEngineException,
        ):
            if eng in ["arrow"]:
                raise

            ErrorMessage.default_to_pandas("`read_csv`")
            return cls._read(**mykwargs)
Example #21
    def test_column_options(self):
        # With column_names
        rows = b"1,2,3\n4,5,6"
        read_options = ReadOptions()
        read_options.column_names = ['d', 'e', 'f']
        reader = self.open_bytes(rows, read_options=read_options)
        expected_schema = pa.schema([('d', pa.int64()), ('e', pa.int64()),
                                     ('f', pa.int64())])
        self.check_reader(reader, expected_schema, [{
            'd': [1, 4],
            'e': [2, 5],
            'f': [3, 6]
        }])

        # With include_columns
        convert_options = ConvertOptions()
        convert_options.include_columns = ['f', 'e']
        reader = self.open_bytes(rows,
                                 read_options=read_options,
                                 convert_options=convert_options)
        expected_schema = pa.schema([('f', pa.int64()), ('e', pa.int64())])
        self.check_reader(reader, expected_schema, [{
            'e': [2, 5],
            'f': [3, 6]
        }])

        # With column_types
        convert_options.column_types = {'e': pa.string()}
        reader = self.open_bytes(rows,
                                 read_options=read_options,
                                 convert_options=convert_options)
        expected_schema = pa.schema([('f', pa.int64()), ('e', pa.string())])
        self.check_reader(reader, expected_schema, [{
            'e': ["2", "5"],
            'f': [3, 6]
        }])

        # Missing columns in include_columns
        convert_options.include_columns = ['g', 'f', 'e']
        with pytest.raises(
                KeyError,
                match="Column 'g' in include_columns does not exist"):
            reader = self.open_bytes(rows,
                                     read_options=read_options,
                                     convert_options=convert_options)

        convert_options.include_missing_columns = True
        reader = self.open_bytes(rows,
                                 read_options=read_options,
                                 convert_options=convert_options)
        expected_schema = pa.schema([('g', pa.null()), ('f', pa.int64()),
                                     ('e', pa.string())])
        self.check_reader(reader, expected_schema, [{
            'g': [None, None],
            'e': ["2", "5"],
            'f': [3, 6]
        }])

        convert_options.column_types = {'e': pa.string(), 'g': pa.float64()}
        reader = self.open_bytes(rows,
                                 read_options=read_options,
                                 convert_options=convert_options)
        expected_schema = pa.schema([('g', pa.float64()), ('f', pa.int64()),
                                     ('e', pa.string())])
        self.check_reader(reader, expected_schema, [{
            'g': [None, None],
            'e': ["2", "5"],
            'f': [3, 6]
        }])