def test_duration():
    """Check that ``pa.duration`` accepts s/ms/us/ns and rejects other units."""
    accepted_units = ('s', 'ms', 'us', 'ns')
    for unit in accepted_units:
        duration_type = pa.duration(unit)
        assert duration_type.unit == unit

    rejected_units = ('m', 'arbit', 'rary')
    for bad_unit in rejected_units:
        # Each unsupported unit must be refused with a ValueError.
        with pytest.raises(ValueError, match='Invalid time unit'):
            pa.duration(bad_unit)
def test_duration_null(self, duckdb_cursor):
    """Pass single-null duration columns of every unit through DuckDB and back."""
    if not can_run:
        return

    # Column name -> duration unit, in table order.
    unit_by_column = {'a': 'ns', 'b': 'us', 'c': 'ms', 'd': 's'}
    null_columns = [
        pa.array([None], type=pa.duration(unit))
        for unit in unit_by_column.values()
    ]
    arrow_table = pa.Table.from_arrays(null_columns, list(unit_by_column))

    rel = duckdb.from_arrow_table(arrow_table).arrow()

    # NOTE(review): every result column is compared against input column 'c'
    # (the 'ms' one), not against its own input column -- confirm intended.
    for column_name in unit_by_column:
        assert (rel[column_name] == arrow_table['c'])
def test_duration_nanos_pandas():
    """Check that nanosecond duration scalars convert to ``pd.Timedelta``."""
    import pandas as pd

    durations = pa.array([0, 3600000000000], type=pa.duration('ns'))
    one_hour = pd.Timedelta('1 hour')

    converted = durations[1].as_py()
    assert isinstance(converted, pd.Timedelta)
    assert durations[1].as_py() == one_hour
    assert durations[1].value == one_hour.value

    # Non-zero nanos work fine
    durations = pa.array([946684800000000001], type=pa.duration('ns'))
    first = durations[0].as_py()
    assert first == pd.Timedelta(946684800000000001, unit='ns')
def test_duration_nanos_nopandas():
    """Check conversion of nanosecond durations to ``datetime.timedelta``."""
    durations = pa.array([0, 3600000000000], pa.duration('ns'))
    one_hour = datetime.timedelta(seconds=60 * 60)

    converted = durations[1].as_py()
    assert isinstance(converted, datetime.timedelta)
    assert durations[1].as_py() == one_hour
    assert durations[1].value == one_hour.total_seconds() * 1e9

    # Non-zero nanos yields ValueError
    durations = pa.array([946684800000000001], type=pa.duration('ns'))
    with pytest.raises(ValueError):
        durations[0].as_py()
def test_sequence_duration_nested_lists():
    """Build list-of-duration arrays from nested lists of timedeltas."""
    first = datetime.timedelta(1, 1, 1000)
    second = datetime.timedelta(1, 100)
    nested = [[first, None], [first, second]]

    def verify(array, expected_type):
        # Shared checks: length, resulting type and lossless round trip.
        assert len(array) == 2
        assert array.type == expected_type
        assert array.to_pylist() == nested

    # Without an explicit type the unit is inferred.
    verify(pa.array(nested), pa.list_(pa.duration('us')))

    # With an explicit millisecond list type.
    verify(pa.array(nested, type=pa.list_(pa.duration('ms'))),
           pa.list_(pa.duration('ms')))
def decode(encoding, type_spec):
    """Turn a serialized type specification into a ``DataType``.

    ``type_spec`` is either a dict with a ``'type'`` key ('duration',
    'timestamp', 'list' or 'dict') or a plain value such as ``'string'``,
    ``'large_string'``, ``'timestamp[ms]'`` or anything ``np.dtype`` accepts.
    Nested types are decoded through ``encoding.decode('dtype', ...)``.

    Raises ``ValueError`` for a dict spec with an unrecognized ``'type'``.
    """
    if isinstance(type_spec, dict):
        kind = type_spec['type']
        if kind == 'duration':
            return DataType(pa.duration(type_spec['unit']))
        if kind == 'timestamp':
            return DataType(pa.timestamp(type_spec['unit']))
        if kind == 'list':
            element_type = encoding.decode('dtype', type_spec['value_type']).arrow
            return DataType(pa.list_(element_type))
        if kind == 'dict':
            value_type = encoding.decode('dtype', type_spec["value_type"]).arrow
            index_type = encoding.decode('dtype', type_spec["index_type"]).arrow
            is_ordered = type_spec["ordered"]
            return DataType(pa.dictionary(index_type, value_type, is_ordered))
        raise ValueError(f'Do not understand type {type_spec}')

    if type_spec == 'string':
        return DataType(pa.string())
    if type_spec == 'large_string':
        return DataType(pa.large_string())
    # TODO: find a proper way to support all arrow types
    if type_spec == 'timestamp[ms]':
        return DataType(pa.timestamp('ms'))
    # Everything else is handed to numpy.
    return DataType(np.dtype(type_spec))
def to_arrow_type(dt: DataType) -> "pa.DataType":
    """Convert Spark data type to pyarrow type.

    Parameters
    ----------
    dt : DataType
        The Spark type to convert. Matching is done on the exact class of
        ``dt`` (``type(dt)``), not with ``isinstance``.

    Returns
    -------
    pa.DataType
        The corresponding pyarrow type.

    Raises
    ------
    TypeError
        If ``dt`` has no mapping, if an array/map contains a struct or
        timestamp, if a struct contains a nested struct, or if a map type is
        requested with pyarrow older than 2.0.0.
    """
    import pyarrow as pa

    unsupported_message = "Unsupported type in conversion to Arrow: "
    # Element/key/value types that may not appear inside arrays and maps.
    not_nestable = [StructType, TimestampType]

    # Spark types whose pyarrow counterpart takes no arguments.
    no_argument_factories = {
        BooleanType: pa.bool_,
        ByteType: pa.int8,
        ShortType: pa.int16,
        IntegerType: pa.int32,
        LongType: pa.int64,
        FloatType: pa.float32,
        DoubleType: pa.float64,
        StringType: pa.string,
        BinaryType: pa.binary,
        DateType: pa.date32,
        NullType: pa.null,
    }

    spark_type = type(dt)
    if spark_type in no_argument_factories:
        return no_argument_factories[spark_type]()

    if spark_type == DecimalType:
        return pa.decimal128(dt.precision, dt.scale)
    if spark_type == TimestampType:
        # Timestamps should be in UTC, JVM Arrow timestamps require a timezone to be read
        return pa.timestamp("us", tz="UTC")
    if spark_type == TimestampNTZType:
        return pa.timestamp("us", tz=None)
    if spark_type == DayTimeIntervalType:
        return pa.duration("us")

    if spark_type == ArrayType:
        if type(dt.elementType) in not_nestable:
            raise TypeError(unsupported_message + str(dt))
        return pa.list_(to_arrow_type(dt.elementType))

    if spark_type == MapType:
        # Compare only the major version component. This avoids
        # distutils.version.LooseVersion, which is unavailable on Python 3.12+
        # (distutils was removed from the standard library).
        pyarrow_major = int(pa.__version__.split(".")[0])
        if pyarrow_major < 2:
            raise TypeError("MapType is only supported with pyarrow 2.0.0 and above")
        if type(dt.keyType) in not_nestable or type(dt.valueType) in not_nestable:
            raise TypeError(unsupported_message + str(dt))
        return pa.map_(to_arrow_type(dt.keyType), to_arrow_type(dt.valueType))

    if spark_type == StructType:
        if any(type(field.dataType) == StructType for field in dt):
            raise TypeError("Nested StructType not supported in conversion to Arrow")
        fields = [
            pa.field(field.name, to_arrow_type(field.dataType), nullable=field.nullable)
            for field in dt
        ]
        return pa.struct(fields)

    raise TypeError(unsupported_message + str(dt))
def get_many_types():
    """Return a tuple with a broad sample of pyarrow data types.

    The types are created on every call instead of being stored at module
    level: the ``pa.dictionary`` type holds a pyarrow array, and a test in
    test_array.py checks that the default memory pool has zero allocated
    bytes.
    """
    def binary_or_string_fields(first_nullable=True):
        # Field pair shared by the union types below.
        if first_nullable:
            first = pa.field('a', pa.binary(10))
        else:
            first = pa.field('a', pa.binary(10), nullable=False)
        return [first, pa.field('b', pa.string())]

    scalar_types = (
        pa.null(),
        pa.bool_(),
        pa.int32(),
        pa.time32('s'),
        pa.time64('us'),
        pa.date32(),
        pa.timestamp('us'),
        pa.timestamp('us', tz='UTC'),
        pa.timestamp('us', tz='Europe/Paris'),
        pa.duration('s'),
        pa.float16(),
        pa.float32(),
        pa.float64(),
        pa.decimal128(19, 4),
        pa.string(),
        pa.binary(),
        pa.binary(10),
        pa.large_string(),
        pa.large_binary(),
    )

    nullable_struct = pa.struct([
        pa.field('a', pa.int32()),
        pa.field('b', pa.int8()),
        pa.field('c', pa.string()),
    ])
    partly_required_struct = pa.struct([
        pa.field('a', pa.int32(), nullable=False),
        pa.field('b', pa.int8(), nullable=False),
        pa.field('c', pa.string()),
    ])

    nested_types = (
        pa.list_(pa.int32()),
        pa.large_list(pa.uint16()),
        nullable_struct,
        partly_required_struct,
        pa.union(binary_or_string_fields(), mode=pa.lib.UnionMode_DENSE),
        pa.union(binary_or_string_fields(), mode=pa.lib.UnionMode_SPARSE),
        pa.union(binary_or_string_fields(first_nullable=False),
                 mode=pa.lib.UnionMode_SPARSE),
        pa.dictionary(pa.int32(), pa.string()),
    )

    return scalar_types + nested_types
def test_is_temporal_date_time_timestamp():
    """Check the temporal type predicates against each temporal type family.

    Every sample type must be temporal, must satisfy its own family predicate
    and must fail the predicates of the other three families.
    """
    # (family predicate, sample types of that family), in check order.
    families = [
        (types.is_date, [pa.date32(), pa.date64()]),
        (types.is_time, [pa.time32('s'), pa.time64('ns')]),
        (types.is_timestamp, [pa.timestamp('ms')]),
        (types.is_duration, [pa.duration('ms')]),
    ]
    all_predicates = [predicate for predicate, _ in families]

    for _, samples in families:
        for case in samples:
            assert types.is_temporal(case)

    for own_predicate, samples in families:
        for case in samples:
            assert own_predicate(case)
            for other_predicate in all_predicates:
                if other_predicate is not own_predicate:
                    assert not other_predicate(case)

    assert not types.is_temporal(pa.int32())
def test_sequence_duration_from_int_with_unit(unit):
    """Build a duration array of the given unit from a plain integer."""
    duration_type = pa.duration(unit)
    result = pa.array([5], type=duration_type)

    assert len(result) == 1
    assert result.type == duration_type
    # The integer is stored unchanged as the raw value.
    assert result[0].value == 5
def data_timedelta(f):
    """Return a three-element duration array with the second entry masked.

    ``f`` is the time unit passed to ``pa.duration``.
    """
    values = [
        datetime.timedelta(days=100),
        datetime.timedelta(days=1),
        datetime.timedelta(seconds=1),
    ]
    duration_type = pa.duration(f)
    mask = np.array([False, True, False])
    return pa.array(values, type=duration_type, mask=mask)
def np_to_pa_dtype(dtype):
    """Util to convert numpy dtype to PyArrow dtype.

    datetime64 ('M') and timedelta64 ('m') dtypes are mapped by their time
    unit; every other dtype is looked up in ``_np_pa_dtypes``.
    """
    arrow_units = ("s", "ms", "us", "ns")

    if dtype.kind == "M":
        # special case when dtype is np.datetime64
        unit = np.datetime_data(dtype)[0]
        if unit not in arrow_units:
            # default is int64_t UNIX ms
            return pa.date64()
        # return a pa.Timestamp of the appropriate unit
        return pa.timestamp(unit)

    if dtype.kind == "m":
        unit = np.datetime_data(dtype)[0]
        if unit not in arrow_units:
            # default fallback unit is ns
            return pa.duration("ns")
        # return a pa.Duration of the appropriate unit
        return pa.duration(unit)

    return _np_pa_dtypes[cudf.dtype(dtype).type]
def test_duration_type():
    """Round-trip a table holding one duration column per time unit."""
    # ARROW-6780
    units = ["s", "ms", "us", "ns"]
    columns = [pa.array([0, 1, 2, 3], type=pa.duration(unit)) for unit in units]
    column_names = ["d[s]", "d[ms]", "d[us]", "d[ns]"]

    table = pa.Table.from_arrays(columns, column_names)
    _check_roundtrip(table)
def test_type_for_alias():
    """Check that ``pa.type_for_alias`` resolves each string alias."""
    expected_by_alias = {
        # signed integers
        'i1': pa.int8(),
        'int8': pa.int8(),
        'i2': pa.int16(),
        'int16': pa.int16(),
        'i4': pa.int32(),
        'int32': pa.int32(),
        'i8': pa.int64(),
        'int64': pa.int64(),
        # unsigned integers
        'u1': pa.uint8(),
        'uint8': pa.uint8(),
        'u2': pa.uint16(),
        'uint16': pa.uint16(),
        'u4': pa.uint32(),
        'uint32': pa.uint32(),
        'u8': pa.uint64(),
        'uint64': pa.uint64(),
        # floating point
        'f4': pa.float32(),
        'float32': pa.float32(),
        'f8': pa.float64(),
        'float64': pa.float64(),
        # dates, strings, binary
        'date32': pa.date32(),
        'date64': pa.date64(),
        'string': pa.string(),
        'str': pa.string(),
        'binary': pa.binary(),
        # unit-parameterized temporal types
        'time32[s]': pa.time32('s'),
        'time32[ms]': pa.time32('ms'),
        'time64[us]': pa.time64('us'),
        'time64[ns]': pa.time64('ns'),
        'timestamp[s]': pa.timestamp('s'),
        'timestamp[ms]': pa.timestamp('ms'),
        'timestamp[us]': pa.timestamp('us'),
        'timestamp[ns]': pa.timestamp('ns'),
        'duration[s]': pa.duration('s'),
        'duration[ms]': pa.duration('ms'),
        'duration[us]': pa.duration('us'),
        'duration[ns]': pa.duration('ns'),
        'month_day_nano_interval': pa.month_day_nano_interval(),
    }

    for alias, expected_type in expected_by_alias.items():
        assert pa.type_for_alias(alias) == expected_type
def test_sequence_duration_nested_lists_numpy():
    """Build list-of-duration arrays from nested ``np.timedelta64`` values."""
    td1 = datetime.timedelta(1, 1, 1000)
    td2 = datetime.timedelta(1, 100)
    expected_pylist = [[td1, None], [td1, td2]]

    def verify(nested_input):
        # Both input shapes must give the same microsecond list array.
        result = pa.array(nested_input)
        assert len(result) == 2
        assert result.type == pa.list_(pa.duration('us'))
        assert result.to_pylist() == expected_pylist

    # Plain Python lists holding numpy scalars.
    verify([
        [np.timedelta64(td1), None],
        [np.timedelta64(td1), np.timedelta64(td2)],
    ])

    # A list of numpy arrays.
    verify([
        np.array([np.timedelta64(td1), None], dtype='timedelta64[us]'),
        np.array([np.timedelta64(td1), np.timedelta64(td2)]),
    ])
def test_duration_overflow(self, duckdb_cursor):
    """Expect an exception when an int64-max seconds duration goes through DuckDB."""
    if not can_run:
        return

    # Only seconds can overflow
    largest_int64 = 9223372036854775807
    seconds_column = pa.array([largest_int64], pa.duration('s'))
    arrow_table = pa.Table.from_arrays([seconds_column], ['a'])

    with pytest.raises(Exception):
        duckdb.from_arrow(arrow_table).arrow()
def test_from_numpy_dtype():
    """Check ``pa.from_numpy_dtype`` for numpy dtypes and dtype-like inputs.

    Covers the dtype-to-Arrow mapping table, objects that numpy can convert to
    a dtype, and the error cases: ``NotImplementedError`` for the object dtype
    and ``TypeError`` for input that is not convertible to a dtype.
    """
    cases = [
        (np.dtype('bool'), pa.bool_()),
        (np.dtype('int8'), pa.int8()),
        (np.dtype('int16'), pa.int16()),
        (np.dtype('int32'), pa.int32()),
        (np.dtype('int64'), pa.int64()),
        (np.dtype('uint8'), pa.uint8()),
        (np.dtype('uint16'), pa.uint16()),
        (np.dtype('uint32'), pa.uint32()),
        (np.dtype('float16'), pa.float16()),
        (np.dtype('float32'), pa.float32()),
        (np.dtype('float64'), pa.float64()),
        (np.dtype('U'), pa.string()),
        (np.dtype('S'), pa.binary()),
        (np.dtype('datetime64[s]'), pa.timestamp('s')),
        (np.dtype('datetime64[ms]'), pa.timestamp('ms')),
        (np.dtype('datetime64[us]'), pa.timestamp('us')),
        (np.dtype('datetime64[ns]'), pa.timestamp('ns')),
        (np.dtype('timedelta64[s]'), pa.duration('s')),
        (np.dtype('timedelta64[ms]'), pa.duration('ms')),
        (np.dtype('timedelta64[us]'), pa.duration('us')),
        (np.dtype('timedelta64[ns]'), pa.duration('ns')),
    ]

    for dt, pt in cases:
        result = pa.from_numpy_dtype(dt)
        assert result == pt

    # Things convertible to numpy dtypes work
    assert pa.from_numpy_dtype('U') == pa.string()
    # np.str_ is used instead of the np.unicode alias, which was deprecated
    # and later removed from NumPy.
    assert pa.from_numpy_dtype(np.str_) == pa.string()
    assert pa.from_numpy_dtype('int32') == pa.int32()
    assert pa.from_numpy_dtype(bool) == pa.bool_()

    with pytest.raises(NotImplementedError):
        pa.from_numpy_dtype(np.dtype('O'))

    with pytest.raises(TypeError):
        pa.from_numpy_dtype('not_convertible_to_dtype')
def test_sequence_duration(np_scalar):
    """Build a duration array from timedeltas or ``np.timedelta64`` scalars.

    ``np_scalar`` selects whether the inputs are wrapped in
    ``np.timedelta64`` first.
    """
    td1 = datetime.timedelta(2, 3601, 1)
    td2 = datetime.timedelta(1, 100, 1000)

    data = (
        [np.timedelta64(td1), None, np.timedelta64(td2)]
        if np_scalar
        else [td1, None, td2]
    )
    result = pa.array(data)

    assert len(result) == 3
    assert result.type == pa.duration('us')
    assert result.null_count == 1

    first, middle, last = (result[i].as_py() for i in range(3))
    assert first == td1
    assert middle is None
    assert last == td2
def test_sequence_duration_with_unit(unit):
    """Convert a timedelta to a duration array with an explicit unit.

    The expected value per unit reflects the precision kept by that unit.
    """
    source = [datetime.timedelta(3, 22, 1001)]
    expected_by_unit = {
        's': datetime.timedelta(3, 22),
        'ms': datetime.timedelta(3, 22, 1000),
        'us': datetime.timedelta(3, 22, 1001),
        'ns': datetime.timedelta(3, 22, 1001),
    }

    duration_type = pa.duration(unit)
    result = pa.array(source, type=duration_type)

    assert len(result) == 1
    assert result.type == duration_type
    assert result[0].as_py() == expected_by_unit[unit]
def test_string_to_arrow_bijection_for_primitive_types(self):
    """Check the dtype-string <-> pyarrow type conversion in both directions."""
    # pyarrow type -> string -> pyarrow type must be the identity.
    round_trippable_arrow_types = [
        pa.time32("s"),
        pa.time64("us"),
        pa.timestamp("s"),
        pa.timestamp("ns", tz="America/New_York"),
        pa.date32(),
        pa.date64(),
        pa.duration("s"),
        pa.decimal128(10, 2),
        pa.decimal256(40, -3),
        pa.string(),
        pa.int32(),
        pa.float64(),
    ]
    for arrow_type in round_trippable_arrow_types:
        self.assertEqual(arrow_type, string_to_arrow(_arrow_to_datasets_dtype(arrow_type)))

    # These pyarrow types must be rejected on the way back.
    rejected_arrow_types = [pa.list_(pa.float64())]
    for arrow_type in rejected_arrow_types:
        with self.assertRaises(ValueError):
            string_to_arrow(_arrow_to_datasets_dtype(arrow_type))

    # string -> pyarrow type -> string must be the identity.
    round_trippable_strings = [
        "time32[s]",
        "timestamp[ns]",
        "timestamp[ns, tz=+07:30]",
        "duration[us]",
        "decimal128(30, -4)",
        "int32",
        "float64",
    ]
    for dtype_string in round_trippable_strings:
        self.assertEqual(dtype_string, _arrow_to_datasets_dtype(string_to_arrow(dtype_string)))

    # Malformed or unknown strings must be rejected.
    rejected_strings = [
        "time32[ns]",
        "timestamp[blob]",
        "timestamp[[ns]]",
        "timestamp[ns, tz=[ns]]",
        "duration[[us]]",
        "decimal20(30, -4)",
        "int",
    ]
    for dtype_string in rejected_strings:
        with self.assertRaises(ValueError):
            string_to_arrow(dtype_string)
pyarrow.binary(), pyarrow.large_binary(), ) _pyarrow_to_numpy_dtype = { pyarrow.date32(): (True, np.dtype("M8[D]")), pyarrow.date64(): (False, np.dtype("M8[ms]")), pyarrow.time32("s"): (True, np.dtype("M8[s]")), pyarrow.time32("ms"): (True, np.dtype("M8[ms]")), pyarrow.time64("us"): (False, np.dtype("M8[us]")), pyarrow.time64("ns"): (False, np.dtype("M8[ns]")), pyarrow.timestamp("s"): (False, np.dtype("M8[s]")), pyarrow.timestamp("ms"): (False, np.dtype("M8[ms]")), pyarrow.timestamp("us"): (False, np.dtype("M8[us]")), pyarrow.timestamp("ns"): (False, np.dtype("M8[ns]")), pyarrow.duration("s"): (False, np.dtype("m8[s]")), pyarrow.duration("ms"): (False, np.dtype("m8[ms]")), pyarrow.duration("us"): (False, np.dtype("m8[us]")), pyarrow.duration("ns"): (False, np.dtype("m8[ns]")), } if not ak._v2._util.numpy_at_least("1.17.0"): def packbits(bytearray, lsb_order=True): if lsb_order: if len(bytearray) % 8 == 0: ready_to_pack = bytearray else: ready_to_pack = numpy.empty( int(numpy.ceil(len(bytearray) / 8.0)) * 8, dtype=bytearray.dtype,
# Array classes accepted as column data: numpy arrays plus the arrow ones.
supported_array_types = (np.ndarray, ) + supported_arrow_array_types

string_types = [pa.string(), pa.large_string()]

# Type names shared by numpy and pyarrow (pa.<name>() / np.dtype(<name>)).
_type_names_int = [
    f"{prefix}{bits}" for prefix in ("int", "uint") for bits in (8, 16, 32, 64)
]
_type_names = ["float64", "float32"] + _type_names_int

# Arrow type -> numpy dtype.
map_arrow_to_numpy = {
    getattr(pa, name)(): np.dtype(name) for name in _type_names
}
map_arrow_to_numpy[pa.bool_()] = np.dtype("?")
for unit in ('s', 'ms', 'us', 'ns'):
    map_arrow_to_numpy[pa.timestamp(unit)] = np.dtype(f"datetime64[{unit}]")
for unit in ('s', 'ms', 'us', 'ns'):
    map_arrow_to_numpy[pa.duration(unit)] = np.dtype(f"timedelta64[{unit}]")


def full(n, value, dtype):
    """Return an array of length ``n`` filled with ``value``.

    The result is a pyarrow array when ``DataType(dtype).is_arrow`` is true,
    otherwise a numpy array.
    """
    from .datatype import DataType

    wrapped = DataType(dtype)
    filled = np.full(n, value, dtype=wrapped.numpy)
    return pa.array(filled) if wrapped.is_arrow else filled


def is_arrow_array(ar):
    """Tell whether ``ar`` is an instance of a supported arrow array type."""
    return isinstance(ar, supported_arrow_array_types)
("Q" if sys.platform == "win32" else "L"): UInt64, "f": Float32, "d": Float64, "?": Boolean, } if _PYARROW_AVAILABLE: _PY_TYPE_TO_ARROW_TYPE: dict[type, pa.lib.DataType] = { float: pa.float64(), int: pa.int64(), str: pa.large_utf8(), bool: pa.bool_(), date: pa.date32(), time: pa.time64("us"), datetime: pa.timestamp("us"), timedelta: pa.duration("us"), } _DTYPE_TO_ARROW_TYPE = { Int8: pa.int8(), Int16: pa.int16(), Int32: pa.int32(), Int64: pa.int64(), UInt8: pa.uint8(), UInt16: pa.uint16(), UInt32: pa.uint32(), UInt64: pa.uint64(), Float32: pa.float32(), Float64: pa.float64(), Boolean: pa.bool_(), Utf8: pa.large_utf8(),
def from_ibis_interval(dtype):
    """Map an interval dtype to a pyarrow duration with the same unit.

    Raises ``com.IbisTypeError`` when ``pa.duration`` rejects the unit with a
    ``ValueError``.
    """
    try:
        arrow_type = pa.duration(dtype.unit)
    except ValueError:
        raise com.IbisTypeError(f"Unsupported interval unit: {dtype.unit}")
    else:
        return arrow_type
pa.date32(), pa.date64() ]) time_types = st.sampled_from([ pa.time32('s'), pa.time32('ms'), pa.time64('us'), pa.time64('ns') ]) timestamp_types = st.builds( pa.timestamp, unit=st.sampled_from(['s', 'ms', 'us', 'ns']), tz=tzst.timezones() ) duration_types = st.sampled_from([ pa.duration(unit) for unit in ['s', 'ms', 'us', 'ns']]) temporal_types = st.one_of( date_types, time_types, timestamp_types, duration_types) primitive_types = st.one_of( null_type, bool_type, binary_type, string_type, large_binary_type, large_string_type, numeric_types, temporal_types ) metadata = st.dictionaries(st.text(), st.text())
def test_datetime_subclassing():
    """Convert date/datetime/timedelta subclass instances to arrow arrays.

    For each requested arrow type the single-element result must have the
    expected type and convert back to the expected standard-library value.
    """
    def verify(values, requested_type, expected_type, expected_value):
        # requested_type=None lets pyarrow infer the type.
        result = pa.array(values, type=requested_type)
        assert len(result) == 1
        assert result.type == expected_type
        assert result[0].as_py() == expected_value

    # date subclass
    dates = [MyDate(2007, 7, 13)]
    date_type = pa.date32()
    verify(dates, date_type, date_type, datetime.date(2007, 7, 13))

    # datetime subclass, at decreasing truncation
    datetimes = [MyDatetime(2007, 7, 13, 1, 23, 34, 123456)]
    timestamp_cases = [
        (pa.timestamp('s'), datetime.datetime(2007, 7, 13, 1, 23, 34, 0)),
        (pa.timestamp('ms'), datetime.datetime(2007, 7, 13, 1, 23, 34, 123000)),
        (pa.timestamp('us'), datetime.datetime(2007, 7, 13, 1, 23, 34, 123456)),
    ]
    for timestamp_type, expected in timestamp_cases:
        verify(datetimes, timestamp_type, timestamp_type, expected)

    # timedelta subclass: inferred type first, then explicit units
    timedeltas = [MyTimedelta(123, 456, 1002)]
    s = pa.duration('s')
    ms = pa.duration('ms')
    us = pa.duration('us')
    duration_cases = [
        (None, us, datetime.timedelta(123, 456, 1002)),
        (s, s, datetime.timedelta(123, 456)),
        (ms, ms, datetime.timedelta(123, 456, 1000)),
        (us, us, datetime.timedelta(123, 456, 1002)),
    ]
    for requested, expected_type, expected in duration_cases:
        verify(timedeltas, requested, expected_type, expected)
pa.field("c", pa.string()), ] ), pa.struct( [ pa.field("a", pa.int32(), nullable=False), pa.field("b", pa.int8(), nullable=False), pa.field("c", pa.string()), ] ), pa.dictionary(pa.int8(), pa.string()), ] _unsupported_pyarrow_types = [ pa.decimal256(76, 38), pa.duration("s"), pa.map_(pa.string(), pa.int32()), pa.union( [pa.field("a", pa.binary(10)), pa.field("b", pa.string())], mode=pa.lib.UnionMode_DENSE, ), pa.union( [pa.field("a", pa.binary(10)), pa.field("b", pa.string())], mode=pa.lib.UnionMode_DENSE, type_codes=[4, 8], ), pa.union( [pa.field("a", pa.binary(10)), pa.field("b", pa.string())], mode=pa.lib.UnionMode_SPARSE, ), pa.union(
def test_basics(fletcher_array):
    """Check dask meta and compute results for a frame of fletcher columns.

    Every column holds ``[None, <value>]`` for one arrow type. The frame is
    split into two partitions; both ``_meta_nonempty`` and the computed
    result are compared with the original frame.
    """
    def column(values, arrow_type):
        # Wrap the values as a fletcher extension array of the given type.
        return fletcher_array(pa.array(values, type=arrow_type))

    columns = {
        "null": column([None, None], pa.null()),
        "bool": column([None, True], pa.bool_()),
    }

    # Signed/unsigned integer pairs, in increasing width.
    for bits in (8, 16, 32, 64):
        signed, unsigned = f"int{bits}", f"uint{bits}"
        columns[signed] = column([None, -1], getattr(pa, signed)())
        columns[unsigned] = column([None, 1], getattr(pa, unsigned)())

    columns["float16"] = column([None, np.float16(-0.1)], pa.float16())
    columns["float32"] = column([None, -0.1], pa.float32())
    columns["float64"] = column([None, -0.1], pa.float64())
    columns["date32"] = column([None, datetime.date(2010, 9, 8)], pa.date32())
    columns["date64"] = column([None, datetime.date(2010, 9, 8)], pa.date64())

    # timestamp[s], timestamp[ms] and timestamp[us] columns are disabled:
    # https://github.com/pandas-dev/pandas/issues/34986
    # FIXME: assert_extension_array_equal casts to numpy object thus cannot
    # handle nanoseconds, so timestamp[ns] is disabled as well.

    columns["binary"] = column([None, b"122"], pa.binary())
    columns["string"] = column([None, "🤔"], pa.string())
    columns["duration[s]"] = column(
        [None, datetime.timedelta(seconds=9)], pa.duration("s")
    )
    columns["duration[ms]"] = column(
        [None, datetime.timedelta(milliseconds=8)], pa.duration("ms")
    )
    columns["duration[us]"] = column(
        [None, datetime.timedelta(microseconds=7)], pa.duration("us")
    )
    # FIXME: assert_extension_array_equal casts to numpy object thus cannot
    # handle nanoseconds, so duration[ns] is disabled.
    columns["list[string]"] = column([None, [None, "🤔"]], pa.list_(pa.string()))

    df = pd.DataFrame(columns)

    ddf = dd.from_pandas(df, npartitions=2)

    pdt.assert_frame_equal(ddf._meta_nonempty, df)

    computed = ddf.compute()
    pdt.assert_frame_equal(computed, df)
def generate_test_parquet():
    """Build a table covering many Arrow types and write it as Parquet.

    Each column has five rows. The table gets a "geo" schema metadata entry
    describing the WKB "geometry" column and is written, uncompressed and
    with a row group size of 3, to ``ogr/data/parquet/test.parquet`` relative
    to this file's directory.
    """
    import pyarrow as pa
    import datetime
    import decimal
    import json
    import pandas as pd
    import pathlib
    import pyarrow.parquet as pq
    import struct

    # Column name -> array. Only the names listed in `names` below end up in
    # the written table.
    columns = {}

    def null_at_index_2(base, step, arrow_type):
        # Five values base + i * step, with the third one replaced by null.
        return pa.array(
            [None if i == 2 else base + i * step for i in range(5)],
            type=arrow_type)

    def nested_lists(element):
        # Row i is a list of i elements whose first entry is null; row 2 is
        # null as a whole.
        return [
            None if i == 2 else
            [None if j == 0 else element(i, j) for j in range(i)]
            for i in range(5)
        ]

    def string_keyed_map(x_value, z_value, value_type):
        # Map rows: {x: value, y: null}, {z: value}, null, {}, {}.
        return pa.array(
            [[('x', x_value), ('y', None)], [('z', z_value)], None, [], []],
            type=pa.map_(pa.string(), value_type))

    # --- scalars -----------------------------------------------------------
    columns["boolean"] = pa.array([True, False, None, False, True],
                                  type=pa.bool_())
    columns["uint8"] = null_at_index_2(1, 1, pa.uint8())
    columns["int8"] = null_at_index_2(-2, 1, pa.int8())
    columns["uint16"] = null_at_index_2(1, 10000, pa.uint16())
    columns["int16"] = null_at_index_2(-20000, 10000, pa.int16())
    columns["uint32"] = null_at_index_2(1, 1000000000, pa.uint32())
    columns["int32"] = null_at_index_2(-2000000000, 1000000000, pa.int32())
    columns["uint64"] = null_at_index_2(1, 100000000000, pa.uint64())
    columns["int64"] = null_at_index_2(-200000000000, 100000000000,
                                       pa.int64())
    columns["float32"] = null_at_index_2(1.5, 1, pa.float32())
    columns["float64"] = null_at_index_2(1.5, 1, pa.float64())
    columns["string"] = pa.array(["abcd", "", None, "c", "d"],
                                 type=pa.string())
    columns["large_string"] = pa.array(["abcd", "", None, "c", "d"],
                                       type=pa.large_string())

    # --- timestamps --------------------------------------------------------
    def timestamps_with_tz(tz):
        stamp = pd.Timestamp(year=2019, month=1, day=1, hour=14,
                             nanosecond=500 * 1e6, tz=tz)
        return pa.array([stamp] * 5, type=pa.timestamp('ms', tz=tz))

    gmt_plus_2 = datetime.timezone(datetime.timedelta(hours=2))
    columns["timestamp_ms_gmt_plus_2"] = timestamps_with_tz(gmt_plus_2)

    gmt = datetime.timezone(datetime.timedelta(hours=0))
    columns["timestamp_ms_gmt"] = timestamps_with_tz(gmt)

    gmt_minus_0215 = datetime.timezone(datetime.timedelta(hours=-2.25))
    columns["timestamp_ms_gmt_minus_0215"] = timestamps_with_tz(gmt_minus_0215)

    columns["timestamp_s_no_tz"] = pa.array([
        pd.Timestamp(year=2019, month=1, day=1, hour=14, nanosecond=500 * 1e6)
    ] * 5, type=pa.timestamp('s'))

    # --- times, dates, durations -------------------------------------------
    columns["time32_s"] = pa.array([3600 + 120 + 3, None, 3, 4, 5],
                                   type=pa.time32('s'))
    columns["time32_ms"] = pa.array(
        [(3600 + 120 + 3) * 1000 + 456, 2, 3, 4, 5], type=pa.time32('ms'))
    columns["time64_us"] = pa.array([(3600 + 120 + 3) * 1e6, None, 3, 4, 5],
                                    type=pa.time64('us'))
    columns["time64_ns"] = pa.array(
        [(3600 + 120 + 3) * 1e9 + 456, 2, 3, 4, 5], type=pa.time64('ns'))
    columns["date32"] = pa.array([1, 2, 3, 4, 5], type=pa.date32())
    columns["date64"] = pa.array([86400 * 1000, 2, 3, 4, 5],
                                 type=pa.date64())
    columns["duration_s"] = pa.array([1, 2, 3, 4, 5], type=pa.duration('s'))
    columns["duration_ms"] = pa.array([1, 2, 3, 4, 5],
                                      type=pa.duration('ms'))

    # --- binary and decimal --------------------------------------------------
    columns["binary"] = pa.array([b'\x00\x01'] * 5, type=pa.binary())
    columns["large_binary"] = pa.array([b'\x00\x01'] * 5,
                                       type=pa.large_binary())
    columns["fixed_size_binary"] = pa.array([b'\x00\x01'] * 5,
                                            type=pa.binary(2))

    decimal_values = [
        decimal.Decimal('1234.567'),
        decimal.Decimal('-1234.567'),
        None,
        decimal.Decimal('1234.567'),
        decimal.Decimal('-1234.567'),
    ]
    columns["decimal128"] = pa.array(decimal_values,
                                     type=pa.decimal128(7, 3))
    columns["decimal256"] = pa.array(decimal_values,
                                     type=pa.decimal256(7, 3))

    # --- variable-size lists -----------------------------------------------
    integer_type_names = ("uint8", "int8", "uint16", "int16",
                          "uint32", "int32", "uint64", "int64")
    float_type_names = ("float32", "float64")

    columns["list_boolean"] = pa.array(
        nested_lists(lambda i, j: True if (j % 2) == 0 else False),
        type=pa.list_(pa.bool_()))
    for type_name in integer_type_names:
        columns["list_" + type_name] = pa.array(
            nested_lists(lambda i, j: j + i * (i - 1) // 2),
            type=pa.list_(getattr(pa, type_name)()))
    for type_name in float_type_names:
        columns["list_" + type_name] = pa.array(
            nested_lists(lambda i, j: 0.5 + j + i * (i - 1) // 2),
            type=pa.list_(getattr(pa, type_name)()))
    columns["list_string"] = pa.array([
        None if i == 2 else [
            "".join(["%c" % (65 + j + k) for k in range(1 + j)])
            for j in range(i)
        ] for i in range(5)
    ])

    # --- fixed-size lists --------------------------------------------------
    columns["fixed_size_list_boolean"] = pa.array(
        [[True, False], [False, True], [True, False], [False, True],
         [True, False]],
        type=pa.list_(pa.bool_(), 2))
    for type_name in integer_type_names:
        columns["fixed_size_list_" + type_name] = pa.array(
            [[0, 1], [2, 3], [4, 5], [6, 7], [8, 9]],
            type=pa.list_(getattr(pa, type_name)(), 2))
    for type_name in float_type_names:
        columns["fixed_size_list_" + type_name] = pa.array(
            [[0, None], [2, 3], [4, 5], [6, 7], [8, 9]],
            type=pa.list_(getattr(pa, type_name)(), 2))
    columns["fixed_size_list_string"] = pa.array(
        [["a", "b"], ["c", "d"], ["e", "f"], ["g", "h"], ["i", "j"]],
        type=pa.list_(pa.string(), 2))

    # --- struct ------------------------------------------------------------
    columns["struct_field"] = pa.array([{
        "a": 1,
        "b": 2.5,
        "c": {
            "d": "e",
            "f": "g"
        },
        "h": [5, 6],
        "i": 3
    }] * 5)
    # Disabled alternative (deeply nested struct):
    #struct_val = { "a": 5 }
    #for i in range(123):
    #    struct_val = { "a": struct_val }
    #struct_field = pa.array([struct_val] * 5)

    # --- maps --------------------------------------------------------------
    columns["map_boolean"] = pa.array(
        [[('x', None), ('y', True)], [('z', True)], None, [], []],
        type=pa.map_(pa.string(), pa.bool_()))
    map_x_values = (
        ("uint8", 1),
        ("int8", 1),
        ("uint16", 1),
        ("int16", 1),
        ("uint32", 4 * 1000 * 1000 * 1000),
        ("int32", 2 * 1000 * 1000 * 1000),
        ("uint64", 4 * 1000 * 1000 * 1000 * 1000),
        ("int64", -2 * 1000 * 1000 * 1000 * 1000),
        ("float32", 1.5),
        ("float64", 1.5),
    )
    for type_name, x_value in map_x_values:
        columns["map_" + type_name] = string_keyed_map(
            x_value, 3, getattr(pa, type_name)())
    columns["map_string"] = string_keyed_map('x_val', 'z_val', pa.string())

    # --- dictionary, map of lists, geometry --------------------------------
    indices = pa.array([0, 1, 2, None, 2])
    dictionary = pa.array(['foo', 'bar', 'baz'])
    columns["dict"] = pa.DictionaryArray.from_arrays(indices, dictionary)

    columns["map_list"] = pa.array(
        [[('x', []), ('y', [])], [('z', [])], None, [], []],
        type=pa.map_(pa.string(), pa.list_(pa.uint32())))

    # 5-byte header followed by two little-endian doubles; row 1 is null.
    columns["geometry"] = pa.array([
        None if i == 1 else
        (b'\x01\x01\x00\x00\x00' + struct.pack('<dd', i, 2))
        for i in range(5)
    ], type=pa.binary())

    # Columns written to the file, in order. Commented-out names are built
    # above but left out of the table.
    names = [
        "boolean",
        "uint8",
        "int8",
        "uint16",
        "int16",
        "uint32",
        "int32",
        "uint64",
        "int64",
        "float32",
        "float64",
        "string",
        "large_string",
        "timestamp_ms_gmt",
        "timestamp_ms_gmt_plus_2",
        "timestamp_ms_gmt_minus_0215",
        "timestamp_s_no_tz",
        "time32_s",
        "time32_ms",
        "time64_us",
        "time64_ns",
        "date32",
        "date64",
        # "duration_s",
        # "duration_ms",
        "binary",
        "large_binary",
        "fixed_size_binary",
        "decimal128",
        "decimal256",
        "list_boolean",
        "list_uint8",
        "list_int8",
        "list_uint16",
        "list_int16",
        "list_uint32",
        "list_int32",
        "list_uint64",
        "list_int64",
        "list_float32",
        "list_float64",
        "list_string",
        "fixed_size_list_boolean",
        "fixed_size_list_uint8",
        "fixed_size_list_int8",
        "fixed_size_list_uint16",
        "fixed_size_list_int16",
        "fixed_size_list_uint32",
        "fixed_size_list_int32",
        "fixed_size_list_uint64",
        "fixed_size_list_int64",
        "fixed_size_list_float32",
        "fixed_size_list_float64",
        "fixed_size_list_string",
        "struct_field",
        "map_boolean",
        "map_uint8",
        "map_int8",
        "map_uint16",
        "map_int16",
        "map_uint32",
        "map_int32",
        "map_uint64",
        "map_int64",
        "map_float32",
        "map_float64",
        "map_string",
        # "map_list",
        "dict",
        "geometry",
    ]

    table = pa.table([columns[name] for name in names], names=names)

    geo_metadata = {
        "version": "0.1.0",
        "primary_column": "geometry",
        "columns": {
            "geometry": {
                'crs': wkt_epsg_4326,
                'bbox': [0, 2, 4, 2],
                'encoding': 'WKB'
            }
        }
    }
    my_schema = table.schema.with_metadata({"geo": json.dumps(geo_metadata)})
    table = table.cast(my_schema)

    HERE = pathlib.Path(__file__).parent
    pq.write_table(table,
                   HERE / "ogr/data/parquet/test.parquet",
                   compression='NONE',
                   row_group_size=3)
# Groups of pyarrow data types, by family.
FLOAT_PYARROW_DTYPES = [pa.float32(), pa.float64()]
STRING_PYARROW_DTYPES = [pa.string(), pa.utf8()]
TIME_PYARROW_DTYPES = [
    time_factory(unit)
    for time_factory, unit in (
        (pa.time32, "s"),
        (pa.time32, "ms"),
        (pa.time64, "us"),
        (pa.time64, "ns"),
    )
]
DATE_PYARROW_DTYPES = [pa.date32(), pa.date64()]
# Every unit combined with every timezone (None means timezone-naive).
DATETIME_PYARROW_DTYPES = [
    pa.timestamp(unit=unit, tz=tz)
    for unit in ["s", "ms", "us", "ns"]
    for tz in [None, "UTC", "US/Pacific", "US/Eastern"]
]
TIMEDELTA_PYARROW_DTYPES = [
    pa.duration(unit) for unit in ["s", "ms", "us", "ns"]
]
BOOL_PYARROW_DTYPES = [pa.bool_()]

# TODO: Add container like pyarrow types:
#  https://arrow.apache.org/docs/python/api/datatypes.html#factory-functions
# The string group is not part of this combined list.
ALL_PYARROW_DTYPES = (
    ALL_INT_PYARROW_DTYPES
    + FLOAT_PYARROW_DTYPES
    + TIME_PYARROW_DTYPES
    + DATE_PYARROW_DTYPES
    + DATETIME_PYARROW_DTYPES
    + TIMEDELTA_PYARROW_DTYPES
    + BOOL_PYARROW_DTYPES
)

# Matches only the empty string.
EMPTY_STRING_PATTERN = re.compile("^$")

# set testing_mode
_testing_mode_warnings = (DeprecationWarning, ResourceWarning)