def test_column_types_dict(self):
    """Request dictionary-encoded column types through ``ConvertOptions``.

    Reads a four-column CSV with int32-indexed dictionary types and checks
    both the resulting schema and the decoded values, then verifies that an
    int8 index type is rejected with ``NotImplementedError``.
    """
    fields = [
        ('a', pa.dictionary(pa.int32(), pa.utf8())),
        ('b', pa.dictionary(pa.int32(), pa.int64())),
        ('c', pa.dictionary(pa.int32(), pa.decimal128(11, 2))),
        ('d', pa.dictionary(pa.int32(), pa.large_utf8())),
    ]
    convert_opts = ConvertOptions(column_types=dict(fields))
    csv_data = (b"a,b,c,d\n"
                b"abc,123456,1.0,zz\n"
                b"defg,123456,0.5,xx\n"
                b"abc,N/A,1.0,xx\n")

    result = self.read_bytes(csv_data, convert_options=convert_opts)

    assert result.schema == pa.schema(fields)
    assert result.to_pydict() == {
        'a': ["abc", "defg", "abc"],
        'b': [123456, 123456, None],
        'c': [Decimal("1.00"), Decimal("0.50"), Decimal("1.00")],
        'd': ["zz", "xx", "xx"],
    }

    # Swap column 'a' to an int8-indexed dictionary: the read must fail.
    fields[0] = ('a', pa.dictionary(pa.int8(), pa.utf8()))
    convert_opts = ConvertOptions(column_types=dict(fields))
    with pytest.raises(NotImplementedError):
        self.read_bytes(csv_data, convert_options=convert_opts)
def test_include_columns(self):
    """Exercise ``ConvertOptions.include_columns``.

    Covers selecting a single column, selecting columns in an order that
    differs from the CSV header, and the ``KeyError`` raised when a
    requested column is not present in the file.
    """
    csv_data = b"ab,cd\nef,gh\nij,kl\nmn,op\n"
    opts = ConvertOptions()

    # Single column selection.
    opts.include_columns = ['ab']
    result = self.read_bytes(csv_data, convert_options=opts)
    self.check_names(result, ["ab"])
    assert result.to_pydict() == {"ab": ["ef", "ij", "mn"]}

    # The output follows the order given in include_columns, not the CSV's.
    opts.include_columns = ['cd', 'ab']
    result = self.read_bytes(csv_data, convert_options=opts)
    expected_schema = pa.schema([('cd', pa.string()),
                                 ('ab', pa.string())])
    assert result.schema == expected_schema
    assert result.to_pydict() == {
        "cd": ["gh", "kl", "op"],
        "ab": ["ef", "ij", "mn"],
    }

    # Asking for a column the CSV does not have raises by default.
    opts.include_columns = ['xx', 'ab', 'yy']
    expected_message = ("Column 'xx' in include_columns "
                        "does not exist in CSV file")
    with pytest.raises(KeyError, match=expected_message):
        self.read_bytes(csv_data, convert_options=opts)
def test_custom_nulls(self):
    """Check type inference and values with user-supplied null spellings.

    Three scenarios are asserted: custom ``null_values`` alone, the same
    values combined with ``strings_can_be_null=True``, and an empty
    ``null_values`` list.
    """
    csv_data = b"a,b,c,d\nZzz,Xxx,1,2\nXxx,#N/A,,Zzz\n"

    # Custom null spellings only.
    result = self.read_bytes(
        csv_data,
        convert_options=ConvertOptions(null_values=['Xxx', 'Zzz']))
    expected_schema = pa.schema([('a', pa.null()),
                                 ('b', pa.string()),
                                 ('c', pa.string()),
                                 ('d', pa.int64())])
    assert result.schema == expected_schema
    assert result.to_pydict() == {
        'a': [None, None],
        'b': ["Xxx", "#N/A"],
        'c': ["1", ""],
        'd': [2, None],
    }

    # Same spellings, but string columns may now hold nulls too.
    result = self.read_bytes(
        csv_data,
        convert_options=ConvertOptions(null_values=['Xxx', 'Zzz'],
                                       strings_can_be_null=True))
    assert result.to_pydict() == {
        'a': [None, None],
        'b': [None, "#N/A"],
        'c': ["1", ""],
        'd': [2, None],
    }

    # With no null spellings at all, both cells come back as strings.
    no_null_opts = ConvertOptions(null_values=[])
    csv_data = b"a,b\n#N/A,\n"
    result = self.read_bytes(csv_data, convert_options=no_null_opts)
    expected_schema = pa.schema([('a', pa.string()),
                                 ('b', pa.string())])
    assert result.schema == expected_schema
    assert result.to_pydict() == {
        'a': ["#N/A"],
        'b': [""],
    }
def test_timestamp_parsers(self):
    """Check timestamp inference under different ``timestamp_parsers``.

    The same two-column CSV is read with default options, with a single
    custom ``%Y/%m/%d`` parser, and with that parser plus ``ISO8601``;
    each time the inferred schema and values are asserted.
    """
    csv_data = b"a,b\n1970/01/01,1980-01-01\n1970/01/02,1980-01-02\n"
    slash_strings = ['1970/01/01', '1970/01/02']
    dash_strings = ['1980-01-01', '1980-01-02']
    slash_stamps = [datetime(1970, 1, 1), datetime(1970, 1, 2)]
    dash_stamps = [datetime(1980, 1, 1), datetime(1980, 1, 2)]

    # Defaults: only column 'b' is inferred as a timestamp.
    opts = ConvertOptions()
    result = self.read_bytes(csv_data, convert_options=opts)
    assert result.schema == pa.schema([('a', pa.string()),
                                       ('b', pa.timestamp('s'))])
    assert result.to_pydict() == {'a': slash_strings, 'b': dash_stamps}

    # Only the slash format: the roles of the two columns flip.
    opts.timestamp_parsers = ['%Y/%m/%d']
    result = self.read_bytes(csv_data, convert_options=opts)
    assert result.schema == pa.schema([('a', pa.timestamp('s')),
                                       ('b', pa.string())])
    assert result.to_pydict() == {'a': slash_stamps, 'b': dash_strings}

    # Both parsers: both columns become timestamps.
    opts.timestamp_parsers = ['%Y/%m/%d', ISO8601]
    result = self.read_bytes(csv_data, convert_options=opts)
    assert result.schema == pa.schema([('a', pa.timestamp('s')),
                                       ('b', pa.timestamp('s'))])
    assert result.to_pydict() == {'a': slash_stamps, 'b': dash_stamps}
def parse_green_taxi_csv(fobj):
    """
    Turn cleaned "green taxi" CSV data into a PyArrow table.

    ``fobj`` is a binary file object of cleaned data, as returned by the
    "read_green_taxi_csv" function.  Column names and types come from the
    module-level ``SCHEMA``; quoting is disabled, ``Y``/``N`` are read as
    booleans, empty cells as nulls, and timestamps are parsed with the
    ``%Y-%m-%d %H:%M:%S`` format.
    """
    return read_csv(
        fobj,
        convert_options=ConvertOptions(
            column_types=SCHEMA,
            false_values=['N'],
            null_values=[''],
            timestamp_parsers=['%Y-%m-%d %H:%M:%S'],
            true_values=['Y'],
        ),
        parse_options=ParseOptions(quote_char=False),
        read_options=ReadOptions(
            column_names=SCHEMA.names,
            encoding=ENCODING,
        ),
    )
def test_column_types(self):
    """Request explicit column types through ``ConvertOptions``.

    Types are supplied first as a dict (mixing type names and type
    objects, including an entry for a column absent from the CSV) and then
    as a ``pa.schema``.  Finally a value that cannot be converted is fed in
    and the error message is checked.
    """
    csv_data = b"a,b,c,d,e\n1,2,3,true,1.0\n4,-5,6,false,0\n"
    expected_schema = pa.schema([('a', pa.int64()),
                                 ('b', pa.float32()),
                                 ('c', pa.string()),
                                 ('d', pa.bool_()),
                                 ('e', pa.decimal128(11, 2))])
    expected_values = {
        'a': [1, 4],
        'b': [2.0, -5.0],
        'c': ["3", "6"],
        'd': [True, False],
        'e': [Decimal("1.00"), Decimal("0.00")]
    }

    # Column types given as a mapping.
    dict_opts = ConvertOptions(column_types={'b': 'float32',
                                             'c': 'string',
                                             'd': 'boolean',
                                             'e': pa.decimal128(11, 2),
                                             'zz': 'null'})
    result = self.read_bytes(csv_data, convert_options=dict_opts)
    assert result.schema == expected_schema
    assert result.to_pydict() == expected_values

    # Column types given as a schema.
    schema_opts = ConvertOptions(
        column_types=pa.schema([('b', pa.float32()),
                                ('c', pa.string()),
                                ('d', pa.bool_()),
                                ('e', pa.decimal128(11, 2)),
                                ('zz', pa.bool_())]))
    result = self.read_bytes(csv_data, convert_options=schema_opts)
    assert result.schema == expected_schema
    assert result.to_pydict() == expected_values

    # A cell that cannot be converted to the requested type is reported.
    bad_data = b"a,b,c,d,e\n1,XXX,3,true,5\n4,-5,6,false,7\n"
    with pytest.raises(pa.ArrowInvalid) as exc_info:
        self.read_bytes(bad_data, convert_options=schema_opts)
    message = str(exc_info.value)
    assert "In CSV column #1: " in message
    assert "CSV conversion error to float: invalid value 'XXX'" in message
def test_dates(self):
    """Check date inference and explicit date/timestamp column types.

    Without options both columns are expected to be ``date32``; the test
    then requests ``date32``/``date64`` and ``timestamp('s')``/
    ``timestamp('ms')`` explicitly and checks schema and values each time.
    """
    csv_data = b"a,b\n1970-01-01,1970-01-02\n1971-01-01,1971-01-02\n"
    as_dates = {
        'a': [date(1970, 1, 1), date(1971, 1, 1)],
        'b': [date(1970, 1, 2), date(1971, 1, 2)],
    }
    as_datetimes = {
        'a': [datetime(1970, 1, 1), datetime(1971, 1, 1)],
        'b': [datetime(1970, 1, 2), datetime(1971, 1, 2)],
    }

    # Default inference.
    result = self.read_bytes(csv_data)
    assert result.schema == pa.schema([('a', pa.date32()),
                                       ('b', pa.date32())])
    assert result.to_pydict() == as_dates

    # Explicit date types.
    date_opts = ConvertOptions()
    date_opts.column_types = {'a': pa.date32(), 'b': pa.date64()}
    result = self.read_bytes(csv_data, convert_options=date_opts)
    assert result.schema == pa.schema([('a', pa.date32()),
                                       ('b', pa.date64())])
    assert result.to_pydict() == as_dates

    # Explicit timestamp types.
    stamp_opts = ConvertOptions()
    stamp_opts.column_types = {'a': pa.timestamp('s'),
                               'b': pa.timestamp('ms')}
    result = self.read_bytes(csv_data, convert_options=stamp_opts)
    assert result.schema == pa.schema([('a', pa.timestamp('s')),
                                       ('b', pa.timestamp('ms'))])
    assert result.to_pydict() == as_datetimes
def test_column_types_with_column_names(self):
    """Check that ``column_types`` keys refer to names from ``column_names``.

    The CSV is read with ``column_names=['x', 'y']`` and a binary type
    requested for ``'x'``; the test asserts that the first column is
    binary and that the first CSV line is returned as data.
    """
    csv_data = b"a,b\nc,d\ne,f\n"
    result = self.read_bytes(
        csv_data,
        read_options=ReadOptions(column_names=['x', 'y']),
        convert_options=ConvertOptions(column_types={'x': pa.binary()}),
    )
    expected_schema = pa.schema([('x', pa.binary()),
                                 ('y', pa.string())])
    expected_values = {
        'x': [b'a', b'c', b'e'],
        'y': ['b', 'd', 'f'],
    }
    assert result.schema == expected_schema
    assert result.to_pydict() == expected_values
def test_auto_dict_encode(self):
    """Exercise ``auto_dict_encode`` and ``auto_dict_max_cardinality``.

    Asserts dictionary encoding of a string column, the fallback to a plain
    string column once the cardinality limit is exceeded, and the results
    for invalid UTF-8 input with ``check_utf8`` off and on.
    """
    opts = ConvertOptions(auto_dict_encode=True)
    csv_data = "a,b\nab,1\ncdé,2\ncdé,3\nab,4".encode()
    dict_schema = pa.schema([('a', pa.dictionary(pa.int32(), pa.string())),
                             ('b', pa.int64())])
    values = {
        'a': ["ab", "cdé", "cdé", "ab"],
        'b': [1, 2, 3, 4],
    }

    # No explicit cardinality limit set.
    result = self.read_bytes(csv_data, convert_options=opts)
    assert result.schema == dict_schema
    assert result.to_pydict() == values

    # Limit equal to the number of distinct values: still dict-encoded.
    opts.auto_dict_max_cardinality = 2
    result = self.read_bytes(csv_data, convert_options=opts)
    assert result.schema == dict_schema
    assert result.to_pydict() == values

    # Limit below the number of distinct values: plain string column.
    opts.auto_dict_max_cardinality = 1
    result = self.read_bytes(csv_data, convert_options=opts)
    assert result.schema == pa.schema([('a', pa.string()),
                                       ('b', pa.int64())])
    assert result.to_pydict() == values

    # Invalid UTF-8 with checking disabled: the string dictionary type is
    # kept and the raw bytes are stored as-is.
    opts.auto_dict_max_cardinality = 50
    opts.check_utf8 = False
    csv_data = b"a,b\nab,1\ncd\xff,2\nab,3"
    result = self.read_bytes(csv_data, convert_options=opts,
                             validate_full=False)
    assert result.schema == dict_schema
    dictionary = result['a'].chunk(0).dictionary
    assert len(dictionary) == 2
    assert dictionary[0].as_py() == "ab"
    assert dictionary[1].as_buffer() == b"cd\xff"

    # Invalid UTF-8 with checking enabled: dictionary values are binary.
    opts.check_utf8 = True
    result = self.read_bytes(csv_data, convert_options=opts)
    binary_schema = pa.schema(
        [('a', pa.dictionary(pa.int32(), pa.binary())),
         ('b', pa.int64())])
    assert result.schema == binary_schema
    assert result.to_pydict() == {
        'a': [b"ab", b"cd\xff", b"ab"],
        'b': [1, 2, 3],
    }
def test_custom_bools(self):
    """Check boolean inference with user-supplied true/false spellings.

    With ``true_values=['T', 'yes']`` and ``false_values=['F', 'no']`` only
    column ``b`` (whose cells are all custom spellings or ``N/A``) is
    expected to be inferred as boolean; the other columns stay strings.
    """
    convert_opts = ConvertOptions(true_values=['T', 'yes'],
                                  false_values=['F', 'no'])
    csv_data = (b"a,b,c\n"
                b"True,T,t\n"
                b"False,F,f\n"
                b"True,yes,yes\n"
                b"False,no,no\n"
                b"N/A,N/A,N/A\n")

    result = self.read_bytes(csv_data, convert_options=convert_opts)

    expected_schema = pa.schema([('a', pa.string()),
                                 ('b', pa.bool_()),
                                 ('c', pa.string())])
    expected_values = {
        'a': ["True", "False", "True", "False", "N/A"],
        'b': [True, False, True, False, None],
        'c': ["t", "f", "yes", "no", "N/A"],
    }
    assert result.schema == expected_schema
    assert result.to_pydict() == expected_values
def test_stress_convert_options_blowup(self):
    """Regression test for ARROW-6481.

    Building ``ConvertOptions`` with column types for half of 10000
    columns and reading an empty CSV with them must stay within a generous
    time budget and yield a table with all columns and no rows.
    """
    # Use time.thread_time when this Python has it, else time.time.
    clock = getattr(time, "thread_time", time.time)

    num_columns = 10000
    col_names = list(map("K{0}".format, range(num_columns)))
    csv = make_empty_csv(col_names)

    started = clock()
    typed_columns = {name: pa.string() for name in col_names[::2]}
    convert_options = ConvertOptions(column_types=typed_columns)
    table = self.read_bytes(csv, convert_options=convert_options)
    elapsed = clock() - started

    # Check that processing time didn't blow up.
    # This is a conservative check (it takes less than 300 ms
    # in debug mode on my local machine).
    assert elapsed <= 10.0

    # The result has every column and zero rows.
    assert table.num_columns == num_columns
    assert table.num_rows == 0
    assert table.column_names == col_names
def test_include_missing_columns(self):
    """Exercise ``include_missing_columns``.

    Columns named in ``include_columns`` but absent from the data are
    expected to appear as all-null columns.  The test also combines the
    option with ``ReadOptions.column_names`` and with ``column_types``.
    """
    csv_data = b"ab,cd\nef,gh\nij,kl\nmn,op\n"
    read_opts = ReadOptions()
    convert_opts = ConvertOptions()

    # Missing columns 'xx' and 'yy' come back as null-typed columns.
    convert_opts.include_columns = ['xx', 'ab', 'yy']
    convert_opts.include_missing_columns = True
    result = self.read_bytes(csv_data, read_options=read_opts,
                             convert_options=convert_opts)
    assert result.schema == pa.schema([('xx', pa.null()),
                                       ('ab', pa.string()),
                                       ('yy', pa.null())])
    assert result.to_pydict() == {
        "xx": [None, None, None],
        "ab": ["ef", "ij", "mn"],
        "yy": [None, None, None],
    }

    # Combining with `column_names`: 'yy' now maps to the second CSV
    # column, while 'cd' is no longer a known name and is filled with nulls.
    read_opts.column_names = ["xx", "yy"]
    convert_opts.include_columns = ["yy", "cd"]
    result = self.read_bytes(csv_data, read_options=read_opts,
                             convert_options=convert_opts)
    assert result.schema == pa.schema([('yy', pa.string()),
                                       ('cd', pa.null())])
    assert result.to_pydict() == {
        "yy": ["cd", "gh", "kl", "op"],
        "cd": [None, None, None, None],
    }

    # And with `column_types` as well: the missing column takes the
    # requested type.
    convert_opts.column_types = {"yy": pa.binary(), "cd": pa.int32()}
    result = self.read_bytes(csv_data, read_options=read_opts,
                             convert_options=convert_opts)
    assert result.schema == pa.schema([('yy', pa.binary()),
                                       ('cd', pa.int32())])
    assert result.to_pydict() == {
        "yy": [b"cd", b"gh", b"kl", b"op"],
        "cd": [None, None, None, None],
    }
def read(self, env: CylonEnv, table, relevant_cols=None, **kwargs) -> DataFrame:
    """Load one pipe-delimited table into a ``DataFrame``.

    The file path is looked up in ``self.table_path_mapping`` (with
    ``$TABLE`` replaced by the table name) and column names come from
    ``get_schema(table)``.  Only ``relevant_cols`` are kept when given.

    For tables listed in ``REFRESH_TABLES`` the matching file under
    ``/data_refresh/`` is read as well and appended to the main data.
    NOTE: refresh tables have the same parallelism as their data tables.
    """
    filepath = self.table_path_mapping[table].replace('$TABLE', table)
    names, _ = get_schema(table)

    csv_options = {
        'read_options': ReadOptions(column_names=names,
                                    block_size=(1 << 30)),
        'parse_options': ParseOptions(delimiter='|'),
        'convert_options': ConvertOptions(include_columns=relevant_cols),
    }

    # The main data file is always read; refresh tables add a second file.
    sources = [filepath]
    if table in REFRESH_TABLES:
        sources.append(filepath.replace('/data/', '/data_refresh/'))

    parts = [pa_read_csv(source, **csv_options) for source in sources]
    if len(parts) == 1:
        pa_table = parts[0]
    else:
        pa_table = pa_concat_tables(parts)

    return DataFrame(Table.from_arrow(env.context, pa_table))
def read_csv(
    cls,
    filepath_or_buffer,
    sep=",",
    delimiter=None,
    header="infer",
    names=None,
    index_col=None,
    usecols=None,
    squeeze=False,
    prefix=None,
    mangle_dupe_cols=True,
    dtype=None,
    engine=None,
    converters=None,
    true_values=None,
    false_values=None,
    skipinitialspace=False,
    skiprows=None,
    nrows=None,
    na_values=None,
    keep_default_na=True,
    na_filter=True,
    verbose=False,
    skip_blank_lines=True,
    parse_dates=False,
    infer_datetime_format=False,
    keep_date_col=False,
    date_parser=None,
    dayfirst=False,
    cache_dates=True,
    iterator=False,
    chunksize=None,
    compression="infer",
    thousands=None,
    decimal=b".",
    lineterminator=None,
    quotechar='"',
    quoting=0,
    escapechar=None,
    comment=None,
    encoding=None,
    dialect=None,
    error_bad_lines=True,
    warn_bad_lines=True,
    skipfooter=0,
    doublequote=True,
    delim_whitespace=False,
    low_memory=True,
    memory_map=False,
    float_precision=None,
    storage_options=None,
):
    """
    Read a CSV file through the Arrow-based reader, else ``cls._read``.

    The parameters mirror the pandas ``read_csv`` signature.  Only those
    whose names appear in ``cls.arg_keys`` are forwarded to ``cls._read``.

    Returns
    -------
    object
        ``cls.from_arrow(...)`` of the table produced by the module-level
        ``read_csv``, or the result of ``cls._read`` when the engine is
        "pandas"/"c" or the Arrow path reports an unsupported feature.

    Raises
    ------
    NotImplementedError, pa.ArrowNotImplementedError
        Re-raised only when ``engine`` is "arrow"; for any other engine the
        method warns through ``ErrorMessage.default_to_pandas`` and falls
        back to ``cls._read``.
    ValueError
        If ``delim_whitespace`` is true and a non-default delimiter is
        also given.
    """
    # Must stay the first statement: snapshot of the call arguments before
    # any other local is created.
    items = locals().copy()
    mykwargs = {k: items[k] for k in items if k in cls.arg_keys}
    eng = str(engine).lower().strip()
    try:
        if eng in ["pandas", "c"]:
            return cls._read(**mykwargs)

        if isinstance(dtype, dict):
            column_types = {
                c: cls._dtype_to_arrow(t) for c, t in dtype.items()
            }
        else:
            column_types = cls._dtype_to_arrow(dtype)

        # isinstance (not an exact type comparison) so that list/dict
        # subclasses are honoured as well.
        if isinstance(parse_dates, list) and isinstance(column_types, dict):
            for c in parse_dates:
                column_types[c] = pa.timestamp("s")

        if names:
            if header == 0:
                # The header row in the file is replaced by `names`, so it
                # has to be skipped as data.
                skiprows = skiprows + 1 if skiprows is not None else 1
            elif header is None or header == "infer":
                pass
            else:
                raise NotImplementedError(
                    "read_csv with 'arrow' engine and provided 'names' parameter supports only 0, None and 'infer' header values"
                )
        else:
            if header == 0 or header == "infer":
                pass
            else:
                raise NotImplementedError(
                    "read_csv with 'arrow' engine without 'names' parameter provided supports only 0 and 'infer' header values"
                )

        if delimiter is None:
            delimiter = sep

        if delim_whitespace and delimiter != ",":
            raise ValueError(
                "Specified a delimiter and delim_whitespace=True; you can only specify one."
            )

        usecols_md = cls._prepare_pyarrow_usecols(mykwargs)

        parse_opts = ParseOptions(
            delimiter="\\s+" if delim_whitespace else delimiter,
            quote_char=quotechar,
            double_quote=doublequote,
            escape_char=escapechar,
            newlines_in_values=False,
            ignore_empty_lines=skip_blank_lines,
        )
        convert_opts = ConvertOptions(
            check_utf8=None,
            column_types=column_types,
            null_values=None,
            true_values=None,
            false_values=None,
            # Timestamp fields should be handled as strings if parse_dates
            # wasn't passed explicitly as an array or a dict.
            timestamp_parsers=[""] if isinstance(parse_dates, bool) else None,
            strings_can_be_null=None,
            include_columns=usecols_md,
            include_missing_columns=None,
            auto_dict_encode=None,
            auto_dict_max_cardinality=None,
        )
        read_opts = ReadOptions(
            use_threads=True,
            block_size=None,
            skip_rows=skiprows,
            column_names=names,
            autogenerate_column_names=None,
        )

        arrow_table = read_csv(
            filepath_or_buffer,
            read_options=read_opts,
            parse_options=parse_opts,
            convert_options=convert_opts,
        )

        return cls.from_arrow(arrow_table)
    except (pa.ArrowNotImplementedError, NotImplementedError):
        # An explicit request for the arrow engine must not be silently
        # replaced by another implementation.
        if eng in ["arrow"]:
            raise

        ErrorMessage.default_to_pandas("`read_csv`")
        return cls._read(**mykwargs)
def test_column_options(self):
    """Exercise column-related options on the reader from ``open_bytes``.

    Covers ``column_names``, ``include_columns``, ``column_types``, the
    ``KeyError`` for a missing included column, and
    ``include_missing_columns`` with and without an explicit type for the
    missing column.
    """
    csv_data = b"1,2,3\n4,5,6"

    # With column_names
    read_opts = ReadOptions()
    read_opts.column_names = ['d', 'e', 'f']
    reader = self.open_bytes(csv_data, read_options=read_opts)
    schema = pa.schema([('d', pa.int64()),
                        ('e', pa.int64()),
                        ('f', pa.int64())])
    self.check_reader(reader, schema,
                      [{'d': [1, 4], 'e': [2, 5], 'f': [3, 6]}])

    # With include_columns
    convert_opts = ConvertOptions()
    convert_opts.include_columns = ['f', 'e']
    reader = self.open_bytes(csv_data, read_options=read_opts,
                             convert_options=convert_opts)
    schema = pa.schema([('f', pa.int64()),
                        ('e', pa.int64())])
    self.check_reader(reader, schema,
                      [{'e': [2, 5], 'f': [3, 6]}])

    # With column_types
    convert_opts.column_types = {'e': pa.string()}
    reader = self.open_bytes(csv_data, read_options=read_opts,
                             convert_options=convert_opts)
    schema = pa.schema([('f', pa.int64()),
                        ('e', pa.string())])
    self.check_reader(reader, schema,
                      [{'e': ["2", "5"], 'f': [3, 6]}])

    # Missing columns in include_columns: opening the reader fails...
    convert_opts.include_columns = ['g', 'f', 'e']
    with pytest.raises(
            KeyError,
            match="Column 'g' in include_columns does not exist"):
        reader = self.open_bytes(csv_data, read_options=read_opts,
                                 convert_options=convert_opts)

    # ...unless missing columns are explicitly allowed.
    convert_opts.include_missing_columns = True
    reader = self.open_bytes(csv_data, read_options=read_opts,
                             convert_options=convert_opts)
    schema = pa.schema([('g', pa.null()),
                        ('f', pa.int64()),
                        ('e', pa.string())])
    self.check_reader(reader, schema,
                      [{'g': [None, None], 'e': ["2", "5"], 'f': [3, 6]}])

    # The missing column takes the type requested in column_types.
    convert_opts.column_types = {'e': pa.string(), 'g': pa.float64()}
    reader = self.open_bytes(csv_data, read_options=read_opts,
                             convert_options=convert_opts)
    schema = pa.schema([('g', pa.float64()),
                        ('f', pa.int64()),
                        ('e', pa.string())])
    self.check_reader(reader, schema,
                      [{'g': [None, None], 'e': ["2", "5"], 'f': [3, 6]}])
def read_csv(
    cls,
    filepath_or_buffer,
    sep=",",
    delimiter=None,
    header="infer",
    names=lib.no_default,
    index_col=None,
    usecols=None,
    squeeze=False,
    prefix=lib.no_default,
    mangle_dupe_cols=True,
    dtype=None,
    engine=None,
    converters=None,
    true_values=None,
    false_values=None,
    skipinitialspace=False,
    skiprows=None,
    nrows=None,
    na_values=None,
    keep_default_na=True,
    na_filter=True,
    verbose=False,
    skip_blank_lines=True,
    parse_dates=False,
    infer_datetime_format=False,
    keep_date_col=False,
    date_parser=None,
    dayfirst=False,
    cache_dates=True,
    iterator=False,
    chunksize=None,
    compression="infer",
    thousands=None,
    decimal=".",
    lineterminator=None,
    quotechar='"',
    quoting=0,
    escapechar=None,
    comment=None,
    encoding=None,
    encoding_errors="strict",
    dialect=None,
    error_bad_lines=None,
    warn_bad_lines=None,
    on_bad_lines=None,
    skipfooter=0,
    doublequote=True,
    delim_whitespace=False,
    low_memory=True,
    memory_map=False,
    float_precision=None,
    storage_options=None,
):  # noqa: PR01
    """
    Read data from `filepath_or_buffer` according to the passed `kwargs` parameters.

    For parameters description please refer to pandas API.

    Returns
    -------
    BaseQueryCompiler
        Query compiler with imported data for further processing.

    Raises
    ------
    pa.ArrowNotImplementedError, pa.ArrowInvalid, NotImplementedError, ArrowEngineException
        Re-raised only when `engine` is "arrow"; for any other engine the
        method warns through ``ErrorMessage.default_to_pandas`` and falls
        back to ``cls._read``.

    Notes
    -----
    Reading is performed with the `pyarrow.read_csv` function.
    """
    # Must stay the first statement: snapshot of the call arguments before
    # any other local is created.
    items = locals().copy()
    mykwargs = {k: items[k] for k in items if k in cls.arg_keys}
    eng = str(engine).lower().strip()
    try:
        if eng in ["pandas", "c"]:
            return cls._read(**mykwargs)

        cls._validate_read_csv_kwargs(mykwargs)
        use_modin_impl, error_message = cls._read_csv_check_support(
            mykwargs,
        )
        if not use_modin_impl:
            raise ArrowEngineException(error_message)

        if isinstance(dtype, dict):
            column_types = {c: cls._dtype_to_arrow(t) for c, t in dtype.items()}
        else:
            column_types = cls._dtype_to_arrow(dtype)

        # isinstance (not an exact type comparison) so that list/dict
        # subclasses are honoured as well.
        if isinstance(parse_dates, list) and isinstance(column_types, dict):
            for c in parse_dates:
                column_types[c] = pa.timestamp("s")

        # Identity checks against the sentinels: a membership test would
        # compare with `==`, which is ambiguous for array-like `names`.
        names_given = names is not lib.no_default and names is not None
        if names_given and header == 0:
            # The header row in the file is replaced by `names`, so it has
            # to be skipped as data.
            skiprows = skiprows + 1 if skiprows is not None else 1

        if delimiter is None and sep is not lib.no_default:
            delimiter = sep

        usecols_md = cls._prepare_pyarrow_usecols(mykwargs)

        parse_opts = ParseOptions(
            delimiter="\\s+" if delim_whitespace else delimiter,
            quote_char=quotechar,
            double_quote=doublequote,
            escape_char=escapechar,
            newlines_in_values=False,
            ignore_empty_lines=skip_blank_lines,
        )
        convert_opts = ConvertOptions(
            check_utf8=None,
            column_types=column_types,
            null_values=None,
            true_values=None,
            false_values=None,
            # Timestamp fields should be handled as strings if parse_dates
            # wasn't passed explicitly as an array or a dict.
            timestamp_parsers=[""] if isinstance(parse_dates, bool) else None,
            strings_can_be_null=None,
            include_columns=usecols_md,
            include_missing_columns=None,
            auto_dict_encode=None,
            auto_dict_max_cardinality=None,
        )
        read_opts = ReadOptions(
            use_threads=True,
            block_size=None,
            skip_rows=skiprows,
            column_names=names if names is not lib.no_default else None,
            autogenerate_column_names=None,
        )

        arrow_table = read_csv(
            filepath_or_buffer,
            read_options=read_opts,
            parse_options=parse_opts,
            convert_options=convert_opts,
        )

        return cls.from_arrow(arrow_table)
    except (
        pa.ArrowNotImplementedError,
        pa.ArrowInvalid,
        NotImplementedError,
        ArrowEngineException,
    ):
        # An explicit request for the arrow engine must not be silently
        # replaced by another implementation.
        if eng in ["arrow"]:
            raise

        ErrorMessage.default_to_pandas("`read_csv`")
        return cls._read(**mykwargs)